aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDmitry Torokhov <dmitry.torokhov@gmail.com>2011-05-24 03:06:26 -0400
committerDmitry Torokhov <dmitry.torokhov@gmail.com>2011-05-24 03:06:26 -0400
commitb73077eb03f510a84b102fb97640e595a958403c (patch)
tree8b639000418e2756bf6baece4e00e07d2534bccc /fs
parent28350e330cfab46b60a1dbf763b678d859f9f3d9 (diff)
parent9d2e173644bb5c42ff1b280fbdda3f195a7cf1f7 (diff)
Merge branch 'next' into for-linus
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c34
-rw-r--r--fs/9p/cache.c204
-rw-r--r--fs/9p/cache.h64
-rw-r--r--fs/9p/fid.c116
-rw-r--r--fs/9p/fid.h5
-rw-r--r--fs/9p/v9fs.c108
-rw-r--r--fs/9p/v9fs.h58
-rw-r--r--fs/9p/v9fs_vfs.h26
-rw-r--r--fs/9p/vfs_addr.c194
-rw-r--r--fs/9p/vfs_dentry.c49
-rw-r--r--fs/9p/vfs_dir.c1
-rw-r--r--fs/9p/vfs_file.c323
-rw-r--r--fs/9p/vfs_inode.c322
-rw-r--r--fs/9p/vfs_inode_dotl.c208
-rw-r--r--fs/9p/vfs_super.c81
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/Makefile3
-rw-r--r--fs/adfs/Kconfig1
-rw-r--r--fs/adfs/adfs.h25
-rw-r--r--fs/adfs/dir.c6
-rw-r--r--fs/adfs/dir_f.c23
-rw-r--r--fs/adfs/dir_fplus.c119
-rw-r--r--fs/adfs/inode.c69
-rw-r--r--fs/adfs/map.c2
-rw-r--r--fs/adfs/super.c36
-rw-r--r--fs/affs/Makefile2
-rw-r--r--fs/affs/file.c2
-rw-r--r--fs/afs/cache.c12
-rw-r--r--fs/afs/cell.c2
-rw-r--r--fs/afs/write.c1
-rw-r--r--fs/aio.c137
-rw-r--r--fs/attr.c6
-rw-r--r--fs/autofs4/autofs_i.h2
-rw-r--r--fs/autofs4/dev-ioctl.c4
-rw-r--r--fs/autofs4/expire.c84
-rw-r--r--fs/autofs4/root.c70
-rw-r--r--fs/autofs4/waitq.c6
-rw-r--r--fs/befs/ChangeLog10
-rw-r--r--fs/befs/befs_fs_types.h2
-rw-r--r--fs/befs/btree.c2
-rw-r--r--fs/befs/linuxvfs.c3
-rw-r--r--fs/bfs/dir.c2
-rw-r--r--fs/bfs/file.c1
-rw-r--r--fs/binfmt_elf.c10
-rw-r--r--fs/binfmt_flat.c2
-rw-r--r--fs/bio-integrity.c3
-rw-r--r--fs/bio.c14
-rw-r--r--fs/block_dev.c65
-rw-r--r--fs/btrfs/acl.c17
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/compression.c44
-rw-r--r--fs/btrfs/ctree.c159
-rw-r--r--fs/btrfs/ctree.h44
-rw-r--r--fs/btrfs/delayed-ref.c6
-rw-r--r--fs/btrfs/dir-item.c45
-rw-r--r--fs/btrfs/disk-io.c234
-rw-r--r--fs/btrfs/export.c10
-rw-r--r--fs/btrfs/extent-tree.c480
-rw-r--r--fs/btrfs/extent_io.c306
-rw-r--r--fs/btrfs/extent_io.h5
-rw-r--r--fs/btrfs/extent_map.c6
-rw-r--r--fs/btrfs/file-item.c10
-rw-r--r--fs/btrfs/file.c500
-rw-r--r--fs/btrfs/free-space-cache.c863
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/inode-map.c3
-rw-r--r--fs/btrfs/inode.c739
-rw-r--r--fs/btrfs/ioctl.c146
-rw-r--r--fs/btrfs/lzo.c21
-rw-r--r--fs/btrfs/ordered-data.c10
-rw-r--r--fs/btrfs/print-tree.c1
-rw-r--r--fs/btrfs/relocation.c53
-rw-r--r--fs/btrfs/root-tree.c24
-rw-r--r--fs/btrfs/super.c80
-rw-r--r--fs/btrfs/transaction.c69
-rw-r--r--fs/btrfs/transaction.h4
-rw-r--r--fs/btrfs/tree-log.c92
-rw-r--r--fs/btrfs/volumes.c269
-rw-r--r--fs/btrfs/volumes.h12
-rw-r--r--fs/btrfs/xattr.c41
-rw-r--r--fs/btrfs/xattr.h3
-rw-r--r--fs/btrfs/zlib.c3
-rw-r--r--fs/buffer.c53
-rw-r--r--fs/cachefiles/interface.c2
-rw-r--r--fs/cachefiles/namei.c52
-rw-r--r--fs/ceph/addr.c4
-rw-r--r--fs/ceph/caps.c2
-rw-r--r--fs/ceph/debugfs.c6
-rw-r--r--fs/ceph/dir.c49
-rw-r--r--fs/ceph/file.c10
-rw-r--r--fs/ceph/inode.c27
-rw-r--r--fs/ceph/mds_client.c6
-rw-r--r--fs/ceph/snap.c20
-rw-r--r--fs/ceph/super.c11
-rw-r--r--fs/ceph/super.h66
-rw-r--r--fs/cifs/AUTHORS2
-rw-r--r--fs/cifs/Kconfig1
-rw-r--r--fs/cifs/README16
-rw-r--r--fs/cifs/cache.c2
-rw-r--r--fs/cifs/cifs_debug.c43
-rw-r--r--fs/cifs/cifs_dfs_ref.c11
-rw-r--r--fs/cifs/cifs_spnego.c4
-rw-r--r--fs/cifs/cifs_unicode.c35
-rw-r--r--fs/cifs/cifs_unicode.h2
-rw-r--r--fs/cifs/cifsacl.c4
-rw-r--r--fs/cifs/cifsencrypt.c26
-rw-r--r--fs/cifs/cifsfs.c6
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h17
-rw-r--r--fs/cifs/cifssmb.c24
-rw-r--r--fs/cifs/connect.c119
-rw-r--r--fs/cifs/dir.c2
-rw-r--r--fs/cifs/file.c115
-rw-r--r--fs/cifs/link.c7
-rw-r--r--fs/cifs/misc.c119
-rw-r--r--fs/cifs/netmisc.c8
-rw-r--r--fs/cifs/readdir.c3
-rw-r--r--fs/cifs/sess.c31
-rw-r--r--fs/cifs/smbencrypt.c3
-rw-r--r--fs/cifs/transport.c69
-rw-r--r--fs/coda/Makefile2
-rw-r--r--fs/coda/sysctl.c17
-rw-r--r--fs/compat.c72
-rw-r--r--fs/configfs/dir.c2
-rw-r--r--fs/dcache.c130
-rw-r--r--fs/debugfs/inode.c26
-rw-r--r--fs/devpts/inode.c21
-rw-r--r--fs/direct-io.c13
-rw-r--r--fs/dlm/ast.c257
-rw-r--r--fs/dlm/ast.h7
-rw-r--r--fs/dlm/config.c4
-rw-r--r--fs/dlm/debug_fs.c4
-rw-r--r--fs/dlm/dlm_internal.h35
-rw-r--r--fs/dlm/lock.c40
-rw-r--r--fs/dlm/lowcomms.c10
-rw-r--r--fs/dlm/rcom.c4
-rw-r--r--fs/dlm/recover.c2
-rw-r--r--fs/dlm/user.c185
-rw-r--r--fs/dlm/user.h3
-rw-r--r--fs/drop_caches.c24
-rw-r--r--fs/ecryptfs/crypto.c2
-rw-r--r--fs/ecryptfs/dentry.c22
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h33
-rw-r--r--fs/ecryptfs/file.c10
-rw-r--r--fs/ecryptfs/inode.c162
-rw-r--r--fs/ecryptfs/keystore.c272
-rw-r--r--fs/ecryptfs/main.c14
-rw-r--r--fs/ecryptfs/mmap.c61
-rw-r--r--fs/ecryptfs/read_write.c12
-rw-r--r--fs/ecryptfs/super.c3
-rw-r--r--fs/efs/inode.c1
-rw-r--r--fs/eventfd.c12
-rw-r--r--fs/eventpoll.c183
-rw-r--r--fs/exec.c20
-rw-r--r--fs/exofs/common.h22
-rw-r--r--fs/exofs/dir.c33
-rw-r--r--fs/exofs/exofs.h6
-rw-r--r--fs/exofs/file.c16
-rw-r--r--fs/exofs/inode.c50
-rw-r--r--fs/exofs/namei.c8
-rw-r--r--fs/exofs/super.c190
-rw-r--r--fs/exportfs/expfs.c11
-rw-r--r--fs/ext2/acl.c2
-rw-r--r--fs/ext2/balloc.c6
-rw-r--r--fs/ext2/ext2.h8
-rw-r--r--fs/ext2/ialloc.c5
-rw-r--r--fs/ext2/inode.c10
-rw-r--r--fs/ext2/ioctl.c6
-rw-r--r--fs/ext2/namei.c17
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext2/xattr.c2
-rw-r--r--fs/ext2/xattr.h6
-rw-r--r--fs/ext2/xattr_security.c5
-rw-r--r--fs/ext3/acl.c2
-rw-r--r--fs/ext3/balloc.c31
-rw-r--r--fs/ext3/ialloc.c5
-rw-r--r--fs/ext3/inode.c11
-rw-r--r--fs/ext3/ioctl.c6
-rw-r--r--fs/ext3/namei.c17
-rw-r--r--fs/ext3/resize.c2
-rw-r--r--fs/ext3/super.c10
-rw-r--r--fs/ext3/xattr.h4
-rw-r--r--fs/ext3/xattr_security.c5
-rw-r--r--fs/ext4/acl.c2
-rw-r--r--fs/ext4/balloc.c5
-rw-r--r--fs/ext4/ext4.h22
-rw-r--r--fs/ext4/ext4_jbd2.h11
-rw-r--r--fs/ext4/extents.c237
-rw-r--r--fs/ext4/file.c60
-rw-r--r--fs/ext4/fsync.c33
-rw-r--r--fs/ext4/ialloc.c10
-rw-r--r--fs/ext4/inode.c467
-rw-r--r--fs/ext4/ioctl.c15
-rw-r--r--fs/ext4/mballoc.c136
-rw-r--r--fs/ext4/mballoc.h2
-rw-r--r--fs/ext4/migrate.c12
-rw-r--r--fs/ext4/namei.c20
-rw-r--r--fs/ext4/page-io.c52
-rw-r--r--fs/ext4/resize.c12
-rw-r--r--fs/ext4/super.c199
-rw-r--r--fs/ext4/xattr.c4
-rw-r--r--fs/ext4/xattr.h4
-rw-r--r--fs/ext4/xattr_security.c5
-rw-r--r--fs/fat/inode.c5
-rw-r--r--fs/fat/namei_vfat.c4
-rw-r--r--fs/fcntl.c39
-rw-r--r--fs/fhandle.c266
-rw-r--r--fs/fifo.c3
-rw-r--r--fs/file_table.c66
-rw-r--r--fs/filesystems.c3
-rw-r--r--fs/freevxfs/vxfs_fshead.c2
-rw-r--r--fs/freevxfs/vxfs_lookup.c2
-rw-r--r--fs/freevxfs/vxfs_olt.h2
-rw-r--r--fs/freevxfs/vxfs_subr.c1
-rw-r--r--fs/fs-writeback.c143
-rw-r--r--fs/fuse/cuse.c14
-rw-r--r--fs/fuse/dev.c27
-rw-r--r--fs/fuse/dir.c45
-rw-r--r--fs/fuse/file.c56
-rw-r--r--fs/fuse/fuse_i.h7
-rw-r--r--fs/fuse/inode.c5
-rw-r--r--fs/generic_acl.c2
-rw-r--r--fs/gfs2/Makefile2
-rw-r--r--fs/gfs2/acl.c7
-rw-r--r--fs/gfs2/aops.c4
-rw-r--r--fs/gfs2/bmap.c22
-rw-r--r--fs/gfs2/dentry.c2
-rw-r--r--fs/gfs2/export.c8
-rw-r--r--fs/gfs2/file.c79
-rw-r--r--fs/gfs2/glock.c416
-rw-r--r--fs/gfs2/glock.h39
-rw-r--r--fs/gfs2/glops.c33
-rw-r--r--fs/gfs2/incore.h7
-rw-r--r--fs/gfs2/inode.c7
-rw-r--r--fs/gfs2/lock_dlm.c14
-rw-r--r--fs/gfs2/log.c36
-rw-r--r--fs/gfs2/lops.c22
-rw-r--r--fs/gfs2/main.c17
-rw-r--r--fs/gfs2/meta_io.c5
-rw-r--r--fs/gfs2/ops_fstype.c11
-rw-r--r--fs/gfs2/ops_inode.c10
-rw-r--r--fs/gfs2/quota.c14
-rw-r--r--fs/gfs2/rgrp.c34
-rw-r--r--fs/gfs2/rgrp.h2
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/hfs/dir.c50
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/extents.c4
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hfsplus/ioctl.c2
-rw-r--r--fs/hfsplus/part_tbl.c4
-rw-r--r--fs/hfsplus/super.c106
-rw-r--r--fs/hfsplus/wrapper.c4
-rw-r--r--fs/hpfs/Kconfig2
-rw-r--r--fs/hpfs/dir.c23
-rw-r--r--fs/hpfs/file.c10
-rw-r--r--fs/hpfs/hpfs_fn.h22
-rw-r--r--fs/hpfs/inode.c9
-rw-r--r--fs/hpfs/namei.c49
-rw-r--r--fs/hpfs/super.c23
-rw-r--r--fs/hugetlbfs/inode.c3
-rw-r--r--fs/inode.c740
-rw-r--r--fs/internal.h28
-rw-r--r--fs/ioctl.c28
-rw-r--r--fs/isofs/export.c8
-rw-r--r--fs/isofs/inode.c1
-rw-r--r--fs/jbd/commit.c24
-rw-r--r--fs/jbd/journal.c6
-rw-r--r--fs/jbd/revoke.c2
-rw-r--r--fs/jbd/transaction.c2
-rw-r--r--fs/jbd2/commit.c28
-rw-r--r--fs/jbd2/journal.c18
-rw-r--r--fs/jbd2/revoke.c2
-rw-r--r--fs/jbd2/transaction.c23
-rw-r--r--fs/jffs2/TODO2
-rw-r--r--fs/jffs2/acl.c2
-rw-r--r--fs/jffs2/compr_zlib.c7
-rw-r--r--fs/jffs2/dir.c9
-rw-r--r--fs/jffs2/nodelist.h2
-rw-r--r--fs/jffs2/readinode.c2
-rw-r--r--fs/jffs2/security.c5
-rw-r--r--fs/jffs2/summary.c4
-rw-r--r--fs/jffs2/wbuf.c2
-rw-r--r--fs/jffs2/write.c18
-rw-r--r--fs/jffs2/xattr.c2
-rw-r--r--fs/jffs2/xattr.h5
-rw-r--r--fs/jfs/Makefile2
-rw-r--r--fs/jfs/inode.c1
-rw-r--r--fs/jfs/ioctl.c2
-rw-r--r--fs/jfs/jfs_dmap.c4
-rw-r--r--fs/jfs/jfs_extent.c6
-rw-r--r--fs/jfs/jfs_imap.c14
-rw-r--r--fs/jfs/jfs_logmgr.h2
-rw-r--r--fs/jfs/jfs_metapage.c1
-rw-r--r--fs/jfs/jfs_metapage.h2
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/jfs/jfs_xattr.h5
-rw-r--r--fs/jfs/namei.c13
-rw-r--r--fs/jfs/resize.c4
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/jfs/xattr.c8
-rw-r--r--fs/locks.c13
-rw-r--r--fs/logfs/compr.c2
-rw-r--r--fs/logfs/dev_bdev.c2
-rw-r--r--fs/logfs/dev_mtd.c2
-rw-r--r--fs/logfs/dir.c2
-rw-r--r--fs/logfs/file.c2
-rw-r--r--fs/logfs/inode.c2
-rw-r--r--fs/logfs/readwrite.c2
-rw-r--r--fs/mbcache.c2
-rw-r--r--fs/minix/Kconfig8
-rw-r--r--fs/minix/inode.c1
-rw-r--r--fs/minix/minix.h74
-rw-r--r--fs/minix/namei.c8
-rw-r--r--fs/mpage.c8
-rw-r--r--fs/namei.c1601
-rw-r--r--fs/namespace.c339
-rw-r--r--fs/ncpfs/Makefile2
-rw-r--r--fs/ncpfs/inode.c2
-rw-r--r--fs/nfs/callback_proc.c2
-rw-r--r--fs/nfs/callback_xdr.c2
-rw-r--r--fs/nfs/client.c131
-rw-r--r--fs/nfs/dir.c102
-rw-r--r--fs/nfs/direct.c8
-rw-r--r--fs/nfs/file.c9
-rw-r--r--fs/nfs/getroot.c46
-rw-r--r--fs/nfs/idmap.c90
-rw-r--r--fs/nfs/inode.c19
-rw-r--r--fs/nfs/internal.h70
-rw-r--r--fs/nfs/namespace.c173
-rw-r--r--fs/nfs/nfs3proc.c3
-rw-r--r--fs/nfs/nfs4_fs.h43
-rw-r--r--fs/nfs/nfs4filelayout.c695
-rw-r--r--fs/nfs/nfs4filelayout.h23
-rw-r--r--fs/nfs/nfs4filelayoutdev.c434
-rw-r--r--fs/nfs/nfs4namespace.c41
-rw-r--r--fs/nfs/nfs4proc.c554
-rw-r--r--fs/nfs/nfs4renewd.c6
-rw-r--r--fs/nfs/nfs4state.c38
-rw-r--r--fs/nfs/nfs4xdr.c355
-rw-r--r--fs/nfs/nfsroot.c29
-rw-r--r--fs/nfs/pagelist.c34
-rw-r--r--fs/nfs/pnfs.c462
-rw-r--r--fs/nfs/pnfs.h201
-rw-r--r--fs/nfs/proc.c3
-rw-r--r--fs/nfs/read.c127
-rw-r--r--fs/nfs/super.c478
-rw-r--r--fs/nfs/unlink.c22
-rw-r--r--fs/nfs/write.c384
-rw-r--r--fs/nfs_common/nfsacl.c3
-rw-r--r--fs/nfsctl.c21
-rw-r--r--fs/nfsd/export.c1
-rw-r--r--fs/nfsd/lockd.c1
-rw-r--r--fs/nfsd/nfs3xdr.c2
-rw-r--r--fs/nfsd/nfs4callback.c8
-rw-r--r--fs/nfsd/nfs4idmap.c1
-rw-r--r--fs/nfsd/nfs4proc.c4
-rw-r--r--fs/nfsd/nfs4state.c356
-rw-r--r--fs/nfsd/nfs4xdr.c11
-rw-r--r--fs/nfsd/nfsctl.c35
-rw-r--r--fs/nfsd/nfsxdr.c2
-rw-r--r--fs/nfsd/state.h17
-rw-r--r--fs/nfsd/vfs.c21
-rw-r--r--fs/nilfs2/alloc.c12
-rw-r--r--fs/nilfs2/alloc.h2
-rw-r--r--fs/nilfs2/bmap.c12
-rw-r--r--fs/nilfs2/bmap.h3
-rw-r--r--fs/nilfs2/btnode.c12
-rw-r--r--fs/nilfs2/btnode.h1
-rw-r--r--fs/nilfs2/btree.c6
-rw-r--r--fs/nilfs2/dir.c5
-rw-r--r--fs/nilfs2/direct.c4
-rw-r--r--fs/nilfs2/file.c15
-rw-r--r--fs/nilfs2/gcinode.c1
-rw-r--r--fs/nilfs2/inode.c84
-rw-r--r--fs/nilfs2/ioctl.c115
-rw-r--r--fs/nilfs2/mdt.c13
-rw-r--r--fs/nilfs2/mdt.h2
-rw-r--r--fs/nilfs2/namei.c10
-rw-r--r--fs/nilfs2/nilfs.h47
-rw-r--r--fs/nilfs2/page.c18
-rw-r--r--fs/nilfs2/page.h4
-rw-r--r--fs/nilfs2/recovery.c32
-rw-r--r--fs/nilfs2/sb.h85
-rw-r--r--fs/nilfs2/segbuf.c2
-rw-r--r--fs/nilfs2/segment.c261
-rw-r--r--fs/nilfs2/segment.h14
-rw-r--r--fs/nilfs2/super.c216
-rw-r--r--fs/nilfs2/the_nilfs.c44
-rw-r--r--fs/nilfs2/the_nilfs.h51
-rw-r--r--fs/notify/fanotify/fanotify_user.c4
-rw-r--r--fs/notify/inode_mark.c42
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c3
-rw-r--r--fs/notify/inotify/inotify_user.c41
-rw-r--r--fs/notify/mark.c3
-rw-r--r--fs/notify/vfsmount_mark.c1
-rw-r--r--fs/ntfs/Makefile19
-rw-r--r--fs/ntfs/aops.c4
-rw-r--r--fs/ntfs/attrib.c4
-rw-r--r--fs/ntfs/compress.c5
-rw-r--r--fs/ntfs/inode.c8
-rw-r--r--fs/ntfs/layout.h12
-rw-r--r--fs/ntfs/logfile.c2
-rw-r--r--fs/ntfs/logfile.h2
-rw-r--r--fs/ntfs/mft.c8
-rw-r--r--fs/ntfs/runlist.c2
-rw-r--r--fs/ntfs/super.c14
-rw-r--r--fs/ocfs2/Makefile4
-rw-r--r--fs/ocfs2/acl.c3
-rw-r--r--fs/ocfs2/alloc.c216
-rw-r--r--fs/ocfs2/aops.c83
-rw-r--r--fs/ocfs2/aops.h2
-rw-r--r--fs/ocfs2/buffer_head_io.c49
-rw-r--r--fs/ocfs2/cluster/heartbeat.c9
-rw-r--r--fs/ocfs2/cluster/masklog.c20
-rw-r--r--fs/ocfs2/cluster/masklog.h105
-rw-r--r--fs/ocfs2/cluster/quorum.c4
-rw-r--r--fs/ocfs2/cluster/tcp.c12
-rw-r--r--fs/ocfs2/dcache.c47
-rw-r--r--fs/ocfs2/dir.c123
-rw-r--r--fs/ocfs2/dlm/Makefile2
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c6
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c36
-rw-r--r--fs/ocfs2/dlm/dlmlock.c10
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c10
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c9
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c4
-rw-r--r--fs/ocfs2/dlmfs/Makefile2
-rw-r--r--fs/ocfs2/dlmglue.c246
-rw-r--r--fs/ocfs2/export.c55
-rw-r--r--fs/ocfs2/extent_map.c10
-rw-r--r--fs/ocfs2/file.c220
-rw-r--r--fs/ocfs2/heartbeat.c4
-rw-r--r--fs/ocfs2/inode.c138
-rw-r--r--fs/ocfs2/ioctl.c43
-rw-r--r--fs/ocfs2/journal.c170
-rw-r--r--fs/ocfs2/journal.h8
-rw-r--r--fs/ocfs2/localalloc.c109
-rw-r--r--fs/ocfs2/locks.c1
-rw-r--r--fs/ocfs2/mmap.c7
-rw-r--r--fs/ocfs2/namei.c181
-rw-r--r--fs/ocfs2/ocfs2.h33
-rw-r--r--fs/ocfs2/ocfs2_fs.h4
-rw-r--r--fs/ocfs2/ocfs2_trace.h2739
-rw-r--r--fs/ocfs2/quota.h3
-rw-r--r--fs/ocfs2/quota_global.c74
-rw-r--r--fs/ocfs2/quota_local.c16
-rw-r--r--fs/ocfs2/refcounttree.c170
-rw-r--r--fs/ocfs2/reservations.c57
-rw-r--r--fs/ocfs2/reservations.h2
-rw-r--r--fs/ocfs2/resize.c23
-rw-r--r--fs/ocfs2/slot_map.c16
-rw-r--r--fs/ocfs2/stackglue.h2
-rw-r--r--fs/ocfs2/suballoc.c193
-rw-r--r--fs/ocfs2/super.c126
-rw-r--r--fs/ocfs2/symlink.c14
-rw-r--r--fs/ocfs2/sysfile.c1
-rw-r--r--fs/ocfs2/uptodate.c73
-rw-r--r--fs/ocfs2/xattr.c169
-rw-r--r--fs/ocfs2/xattr.h4
-rw-r--r--fs/omfs/dir.c66
-rw-r--r--fs/omfs/file.c1
-rw-r--r--fs/open.c152
-rw-r--r--fs/partitions/check.c7
-rw-r--r--fs/partitions/ldm.c21
-rw-r--r--fs/partitions/mac.c17
-rw-r--r--fs/partitions/osf.c12
-rw-r--r--fs/proc/array.c7
-rw-r--r--fs/proc/base.c219
-rw-r--r--fs/proc/generic.c8
-rw-r--r--fs/proc/inode.c10
-rw-r--r--fs/proc/internal.h1
-rw-r--r--fs/proc/proc_devtree.c2
-rw-r--r--fs/proc/proc_sysctl.c8
-rw-r--r--fs/proc/root.c32
-rw-r--r--fs/proc/task_mmu.c138
-rw-r--r--fs/proc/task_nommu.c6
-rw-r--r--fs/pstore/Kconfig13
-rw-r--r--fs/pstore/Makefile7
-rw-r--r--fs/pstore/inode.c311
-rw-r--r--fs/pstore/internal.h6
-rw-r--r--fs/pstore/platform.c201
-rw-r--r--fs/qnx4/inode.c1
-rw-r--r--fs/quota/dquot.c56
-rw-r--r--fs/quota/quota_v2.c2
-rw-r--r--fs/ramfs/file-nommu.c1
-rw-r--r--fs/reiserfs/Makefile4
-rw-r--r--fs/reiserfs/inode.c8
-rw-r--r--fs/reiserfs/ioctl.c4
-rw-r--r--fs/reiserfs/journal.c6
-rw-r--r--fs/reiserfs/lock.c2
-rw-r--r--fs/reiserfs/namei.c15
-rw-r--r--fs/reiserfs/super.c4
-rw-r--r--fs/reiserfs/xattr.c4
-rw-r--r--fs/reiserfs/xattr_acl.c2
-rw-r--r--fs/reiserfs/xattr_security.c3
-rw-r--r--fs/select.c3
-rw-r--r--fs/squashfs/Kconfig12
-rw-r--r--fs/squashfs/cache.c4
-rw-r--r--fs/squashfs/decompressor.c34
-rw-r--r--fs/squashfs/decompressor.h7
-rw-r--r--fs/squashfs/dir.c9
-rw-r--r--fs/squashfs/lzo_wrapper.c4
-rw-r--r--fs/squashfs/namei.c12
-rw-r--r--fs/squashfs/squashfs.h1
-rw-r--r--fs/squashfs/squashfs_fs.h4
-rw-r--r--fs/squashfs/super.c15
-rw-r--r--fs/squashfs/xz_wrapper.c53
-rw-r--r--fs/squashfs/zlib_wrapper.c10
-rw-r--r--fs/stat.c7
-rw-r--r--fs/statfs.c176
-rw-r--r--fs/super.c166
-rw-r--r--fs/sync.c28
-rw-r--r--fs/sysv/itree.c1
-rw-r--r--fs/sysv/namei.c8
-rw-r--r--fs/ubifs/Kconfig36
-rw-r--r--fs/ubifs/budget.c2
-rw-r--r--fs/ubifs/commit.c60
-rw-r--r--fs/ubifs/debug.c97
-rw-r--r--fs/ubifs/debug.h182
-rw-r--r--fs/ubifs/dir.c18
-rw-r--r--fs/ubifs/file.c14
-rw-r--r--fs/ubifs/io.c201
-rw-r--r--fs/ubifs/ioctl.c2
-rw-r--r--fs/ubifs/journal.c28
-rw-r--r--fs/ubifs/lprops.c26
-rw-r--r--fs/ubifs/lpt.c7
-rw-r--r--fs/ubifs/lpt_commit.c56
-rw-r--r--fs/ubifs/orphan.c10
-rw-r--r--fs/ubifs/recovery.c44
-rw-r--r--fs/ubifs/scan.c2
-rw-r--r--fs/ubifs/super.c58
-rw-r--r--fs/ubifs/tnc.c10
-rw-r--r--fs/ubifs/ubifs.h45
-rw-r--r--fs/ubifs/xattr.c4
-rw-r--r--fs/udf/balloc.c11
-rw-r--r--fs/udf/file.c8
-rw-r--r--fs/udf/inode.c240
-rw-r--r--fs/udf/namei.c18
-rw-r--r--fs/udf/truncate.c146
-rw-r--r--fs/udf/udfdecl.h12
-rw-r--r--fs/ufs/Kconfig1
-rw-r--r--fs/ufs/inode.c81
-rw-r--r--fs/ufs/namei.c44
-rw-r--r--fs/ufs/super.c70
-rw-r--r--fs/ufs/truncate.c6
-rw-r--r--fs/ufs/ufs.h6
-rw-r--r--fs/ufs/util.c2
-rw-r--r--fs/ufs/util.h2
-rw-r--r--fs/utimes.c2
-rw-r--r--fs/xattr.c2
-rw-r--r--fs/xfs/Makefile12
-rw-r--r--fs/xfs/linux-2.6/kmem.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c12
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c397
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h40
-rw-r--r--fs/xfs/linux-2.6/xfs_discard.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c8
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c15
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c11
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h23
-rw-r--r--fs/xfs/linux-2.6/xfs_message.c124
-rw-r--r--fs/xfs/linux-2.6/xfs_message.h40
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c293
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c265
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c2
-rw-r--r--fs/xfs/quota/xfs_dquot.c50
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c5
-rw-r--r--fs/xfs/quota/xfs_qm.c56
-rw-r--r--fs/xfs/quota/xfs_qm.h5
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c5
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c91
-rw-r--r--fs/xfs/quota/xfs_trans_dquot.c5
-rw-r--r--fs/xfs/support/debug.c107
-rw-r--r--fs/xfs/support/debug.h61
-rw-r--r--fs/xfs/xfs_alloc.c188
-rw-r--r--fs/xfs/xfs_bmap.c24
-rw-r--r--fs/xfs/xfs_buf_item.c17
-rw-r--r--fs/xfs/xfs_da_btree.c9
-rw-r--r--fs/xfs/xfs_dfrag.c4
-rw-r--r--fs/xfs/xfs_dir2.c2
-rw-r--r--fs/xfs/xfs_dir2_node.c25
-rw-r--r--fs/xfs/xfs_error.c22
-rw-r--r--fs/xfs/xfs_error.h19
-rw-r--r--fs/xfs/xfs_fsops.c9
-rw-r--r--fs/xfs/xfs_ialloc.c82
-rw-r--r--fs/xfs/xfs_inode.c133
-rw-r--r--fs/xfs/xfs_inode.h27
-rw-r--r--fs/xfs/xfs_inode_item.c73
-rw-r--r--fs/xfs/xfs_iomap.c12
-rw-r--r--fs/xfs/xfs_itable.c2
-rw-r--r--fs/xfs/xfs_log.c162
-rw-r--r--fs/xfs/xfs_log_priv.h7
-rw-r--r--fs/xfs/xfs_log_recover.c227
-rw-r--r--fs/xfs/xfs_mount.c148
-rw-r--r--fs/xfs/xfs_mount.h9
-rw-r--r--fs/xfs/xfs_mru_cache.c2
-rw-r--r--fs/xfs/xfs_quota.h3
-rw-r--r--fs/xfs/xfs_rtalloc.c92
-rw-r--r--fs/xfs/xfs_rtalloc.h2
-rw-r--r--fs/xfs/xfs_rw.c58
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/xfs/xfs_trans_ail.c423
-rw-r--r--fs/xfs/xfs_trans_buf.c9
-rw-r--r--fs/xfs/xfs_trans_inode.c24
-rw-r--r--fs/xfs/xfs_trans_priv.h22
-rw-r--r--fs/xfs/xfs_vnodeops.c81
-rw-r--r--fs/xfs/xfs_vnodeops.h1
610 files changed, 21991 insertions, 13202 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 02a2cf616318..535ab6eccb1a 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -21,8 +21,8 @@
21#include <linux/posix_acl_xattr.h> 21#include <linux/posix_acl_xattr.h>
22#include "xattr.h" 22#include "xattr.h"
23#include "acl.h" 23#include "acl.h"
24#include "v9fs_vfs.h"
25#include "v9fs.h" 24#include "v9fs.h"
25#include "v9fs_vfs.h"
26 26
27static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) 27static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
28{ 28{
@@ -59,7 +59,8 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
59 struct v9fs_session_info *v9ses; 59 struct v9fs_session_info *v9ses;
60 60
61 v9ses = v9fs_inode2v9ses(inode); 61 v9ses = v9fs_inode2v9ses(inode);
62 if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { 62 if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
63 ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
63 set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL); 64 set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
64 set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); 65 set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
65 return 0; 66 return 0;
@@ -71,11 +72,15 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
71 if (!IS_ERR(dacl) && !IS_ERR(pacl)) { 72 if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
72 set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl); 73 set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
73 set_cached_acl(inode, ACL_TYPE_ACCESS, pacl); 74 set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
74 posix_acl_release(dacl);
75 posix_acl_release(pacl);
76 } else 75 } else
77 retval = -EIO; 76 retval = -EIO;
78 77
78 if (!IS_ERR(dacl))
79 posix_acl_release(dacl);
80
81 if (!IS_ERR(pacl))
82 posix_acl_release(pacl);
83
79 return retval; 84 return retval;
80} 85}
81 86
@@ -100,9 +105,10 @@ int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
100 return -ECHILD; 105 return -ECHILD;
101 106
102 v9ses = v9fs_inode2v9ses(inode); 107 v9ses = v9fs_inode2v9ses(inode);
103 if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { 108 if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
109 ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
104 /* 110 /*
105 * On access = client mode get the acl 111 * On access = client and acl = on mode get the acl
106 * values from the server 112 * values from the server
107 */ 113 */
108 return 0; 114 return 0;
@@ -128,6 +134,10 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
128 struct inode *inode = dentry->d_inode; 134 struct inode *inode = dentry->d_inode;
129 135
130 set_cached_acl(inode, type, acl); 136 set_cached_acl(inode, type, acl);
137
138 if (!acl)
139 return 0;
140
131 /* Set a setxattr request to server */ 141 /* Set a setxattr request to server */
132 size = posix_acl_xattr_size(acl->a_count); 142 size = posix_acl_xattr_size(acl->a_count);
133 buffer = kmalloc(size, GFP_KERNEL); 143 buffer = kmalloc(size, GFP_KERNEL);
@@ -177,10 +187,8 @@ int v9fs_acl_chmod(struct dentry *dentry)
177int v9fs_set_create_acl(struct dentry *dentry, 187int v9fs_set_create_acl(struct dentry *dentry,
178 struct posix_acl *dpacl, struct posix_acl *pacl) 188 struct posix_acl *dpacl, struct posix_acl *pacl)
179{ 189{
180 if (dpacl) 190 v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
181 v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl); 191 v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
182 if (pacl)
183 v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
184 posix_acl_release(dpacl); 192 posix_acl_release(dpacl);
185 posix_acl_release(pacl); 193 posix_acl_release(pacl);
186 return 0; 194 return 0;
@@ -254,7 +262,7 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
254 if (strcmp(name, "") != 0) 262 if (strcmp(name, "") != 0)
255 return -EINVAL; 263 return -EINVAL;
256 264
257 v9ses = v9fs_inode2v9ses(dentry->d_inode); 265 v9ses = v9fs_dentry2v9ses(dentry);
258 /* 266 /*
259 * We allow set/get/list of acl when access=client is not specified 267 * We allow set/get/list of acl when access=client is not specified
260 */ 268 */
@@ -304,7 +312,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
304 if (strcmp(name, "") != 0) 312 if (strcmp(name, "") != 0)
305 return -EINVAL; 313 return -EINVAL;
306 314
307 v9ses = v9fs_inode2v9ses(dentry->d_inode); 315 v9ses = v9fs_dentry2v9ses(dentry);
308 /* 316 /*
309 * set the attribute on the remote. Without even looking at the 317 * set the attribute on the remote. Without even looking at the
310 * xattr value. We leave it to the server to validate 318 * xattr value. We leave it to the server to validate
@@ -315,7 +323,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
315 323
316 if (S_ISLNK(inode->i_mode)) 324 if (S_ISLNK(inode->i_mode))
317 return -EOPNOTSUPP; 325 return -EOPNOTSUPP;
318 if (!is_owner_or_cap(inode)) 326 if (!inode_owner_or_capable(inode))
319 return -EPERM; 327 return -EPERM;
320 if (value) { 328 if (value) {
321 /* update the cached acl value */ 329 /* update the cached acl value */
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 0dbe0d139ac2..5b335c5086a1 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -33,67 +33,11 @@
33 33
34#define CACHETAG_LEN 11 34#define CACHETAG_LEN 11
35 35
36struct kmem_cache *vcookie_cache;
37
38struct fscache_netfs v9fs_cache_netfs = { 36struct fscache_netfs v9fs_cache_netfs = {
39 .name = "9p", 37 .name = "9p",
40 .version = 0, 38 .version = 0,
41}; 39};
42 40
43static void init_once(void *foo)
44{
45 struct v9fs_cookie *vcookie = (struct v9fs_cookie *) foo;
46 vcookie->fscache = NULL;
47 vcookie->qid = NULL;
48 inode_init_once(&vcookie->inode);
49}
50
51/**
52 * v9fs_init_vcookiecache - initialize a cache for vcookies to maintain
53 * vcookie to inode mapping
54 *
55 * Returns 0 on success.
56 */
57
58static int v9fs_init_vcookiecache(void)
59{
60 vcookie_cache = kmem_cache_create("vcookie_cache",
61 sizeof(struct v9fs_cookie),
62 0, (SLAB_RECLAIM_ACCOUNT|
63 SLAB_MEM_SPREAD),
64 init_once);
65 if (!vcookie_cache)
66 return -ENOMEM;
67
68 return 0;
69}
70
71/**
72 * v9fs_destroy_vcookiecache - destroy the cache of vcookies
73 *
74 */
75
76static void v9fs_destroy_vcookiecache(void)
77{
78 kmem_cache_destroy(vcookie_cache);
79}
80
81int __v9fs_cache_register(void)
82{
83 int ret;
84 ret = v9fs_init_vcookiecache();
85 if (ret < 0)
86 return ret;
87
88 return fscache_register_netfs(&v9fs_cache_netfs);
89}
90
91void __v9fs_cache_unregister(void)
92{
93 v9fs_destroy_vcookiecache();
94 fscache_unregister_netfs(&v9fs_cache_netfs);
95}
96
97/** 41/**
98 * v9fs_random_cachetag - Generate a random tag to be associated 42 * v9fs_random_cachetag - Generate a random tag to be associated
99 * with a new cache session. 43 * with a new cache session.
@@ -133,9 +77,9 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
133} 77}
134 78
135const struct fscache_cookie_def v9fs_cache_session_index_def = { 79const struct fscache_cookie_def v9fs_cache_session_index_def = {
136 .name = "9P.session", 80 .name = "9P.session",
137 .type = FSCACHE_COOKIE_TYPE_INDEX, 81 .type = FSCACHE_COOKIE_TYPE_INDEX,
138 .get_key = v9fs_cache_session_get_key, 82 .get_key = v9fs_cache_session_get_key,
139}; 83};
140 84
141void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) 85void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
@@ -163,33 +107,33 @@ void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
163static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, 107static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
164 void *buffer, uint16_t bufmax) 108 void *buffer, uint16_t bufmax)
165{ 109{
166 const struct v9fs_cookie *vcookie = cookie_netfs_data; 110 const struct v9fs_inode *v9inode = cookie_netfs_data;
167 memcpy(buffer, &vcookie->qid->path, sizeof(vcookie->qid->path)); 111 memcpy(buffer, &v9inode->fscache_key->path,
168 112 sizeof(v9inode->fscache_key->path));
169 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &vcookie->inode, 113 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode,
170 vcookie->qid->path); 114 v9inode->fscache_key->path);
171 return sizeof(vcookie->qid->path); 115 return sizeof(v9inode->fscache_key->path);
172} 116}
173 117
174static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, 118static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
175 uint64_t *size) 119 uint64_t *size)
176{ 120{
177 const struct v9fs_cookie *vcookie = cookie_netfs_data; 121 const struct v9fs_inode *v9inode = cookie_netfs_data;
178 *size = i_size_read(&vcookie->inode); 122 *size = i_size_read(&v9inode->vfs_inode);
179 123
180 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &vcookie->inode, 124 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode,
181 *size); 125 *size);
182} 126}
183 127
184static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, 128static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
185 void *buffer, uint16_t buflen) 129 void *buffer, uint16_t buflen)
186{ 130{
187 const struct v9fs_cookie *vcookie = cookie_netfs_data; 131 const struct v9fs_inode *v9inode = cookie_netfs_data;
188 memcpy(buffer, &vcookie->qid->version, sizeof(vcookie->qid->version)); 132 memcpy(buffer, &v9inode->fscache_key->version,
189 133 sizeof(v9inode->fscache_key->version));
190 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &vcookie->inode, 134 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode,
191 vcookie->qid->version); 135 v9inode->fscache_key->version);
192 return sizeof(vcookie->qid->version); 136 return sizeof(v9inode->fscache_key->version);
193} 137}
194 138
195static enum 139static enum
@@ -197,13 +141,13 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
197 const void *buffer, 141 const void *buffer,
198 uint16_t buflen) 142 uint16_t buflen)
199{ 143{
200 const struct v9fs_cookie *vcookie = cookie_netfs_data; 144 const struct v9fs_inode *v9inode = cookie_netfs_data;
201 145
202 if (buflen != sizeof(vcookie->qid->version)) 146 if (buflen != sizeof(v9inode->fscache_key->version))
203 return FSCACHE_CHECKAUX_OBSOLETE; 147 return FSCACHE_CHECKAUX_OBSOLETE;
204 148
205 if (memcmp(buffer, &vcookie->qid->version, 149 if (memcmp(buffer, &v9inode->fscache_key->version,
206 sizeof(vcookie->qid->version))) 150 sizeof(v9inode->fscache_key->version)))
207 return FSCACHE_CHECKAUX_OBSOLETE; 151 return FSCACHE_CHECKAUX_OBSOLETE;
208 152
209 return FSCACHE_CHECKAUX_OKAY; 153 return FSCACHE_CHECKAUX_OKAY;
@@ -211,7 +155,7 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
211 155
212static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) 156static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
213{ 157{
214 struct v9fs_cookie *vcookie = cookie_netfs_data; 158 struct v9fs_inode *v9inode = cookie_netfs_data;
215 struct pagevec pvec; 159 struct pagevec pvec;
216 pgoff_t first; 160 pgoff_t first;
217 int loop, nr_pages; 161 int loop, nr_pages;
@@ -220,7 +164,7 @@ static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
220 first = 0; 164 first = 0;
221 165
222 for (;;) { 166 for (;;) {
223 nr_pages = pagevec_lookup(&pvec, vcookie->inode.i_mapping, 167 nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping,
224 first, 168 first,
225 PAGEVEC_SIZE - pagevec_count(&pvec)); 169 PAGEVEC_SIZE - pagevec_count(&pvec));
226 if (!nr_pages) 170 if (!nr_pages)
@@ -249,115 +193,114 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = {
249 193
250void v9fs_cache_inode_get_cookie(struct inode *inode) 194void v9fs_cache_inode_get_cookie(struct inode *inode)
251{ 195{
252 struct v9fs_cookie *vcookie; 196 struct v9fs_inode *v9inode;
253 struct v9fs_session_info *v9ses; 197 struct v9fs_session_info *v9ses;
254 198
255 if (!S_ISREG(inode->i_mode)) 199 if (!S_ISREG(inode->i_mode))
256 return; 200 return;
257 201
258 vcookie = v9fs_inode2cookie(inode); 202 v9inode = V9FS_I(inode);
259 if (vcookie->fscache) 203 if (v9inode->fscache)
260 return; 204 return;
261 205
262 v9ses = v9fs_inode2v9ses(inode); 206 v9ses = v9fs_inode2v9ses(inode);
263 vcookie->fscache = fscache_acquire_cookie(v9ses->fscache, 207 v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
264 &v9fs_cache_inode_index_def, 208 &v9fs_cache_inode_index_def,
265 vcookie); 209 v9inode);
266 210
267 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode, 211 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
268 vcookie->fscache); 212 v9inode->fscache);
269} 213}
270 214
271void v9fs_cache_inode_put_cookie(struct inode *inode) 215void v9fs_cache_inode_put_cookie(struct inode *inode)
272{ 216{
273 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 217 struct v9fs_inode *v9inode = V9FS_I(inode);
274 218
275 if (!vcookie->fscache) 219 if (!v9inode->fscache)
276 return; 220 return;
277 P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode, 221 P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
278 vcookie->fscache); 222 v9inode->fscache);
279 223
280 fscache_relinquish_cookie(vcookie->fscache, 0); 224 fscache_relinquish_cookie(v9inode->fscache, 0);
281 vcookie->fscache = NULL; 225 v9inode->fscache = NULL;
282} 226}
283 227
284void v9fs_cache_inode_flush_cookie(struct inode *inode) 228void v9fs_cache_inode_flush_cookie(struct inode *inode)
285{ 229{
286 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 230 struct v9fs_inode *v9inode = V9FS_I(inode);
287 231
288 if (!vcookie->fscache) 232 if (!v9inode->fscache)
289 return; 233 return;
290 P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode, 234 P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
291 vcookie->fscache); 235 v9inode->fscache);
292 236
293 fscache_relinquish_cookie(vcookie->fscache, 1); 237 fscache_relinquish_cookie(v9inode->fscache, 1);
294 vcookie->fscache = NULL; 238 v9inode->fscache = NULL;
295} 239}
296 240
297void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) 241void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
298{ 242{
299 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 243 struct v9fs_inode *v9inode = V9FS_I(inode);
300 struct p9_fid *fid; 244 struct p9_fid *fid;
301 245
302 if (!vcookie->fscache) 246 if (!v9inode->fscache)
303 return; 247 return;
304 248
305 spin_lock(&vcookie->lock); 249 spin_lock(&v9inode->fscache_lock);
306 fid = filp->private_data; 250 fid = filp->private_data;
307 if ((filp->f_flags & O_ACCMODE) != O_RDONLY) 251 if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
308 v9fs_cache_inode_flush_cookie(inode); 252 v9fs_cache_inode_flush_cookie(inode);
309 else 253 else
310 v9fs_cache_inode_get_cookie(inode); 254 v9fs_cache_inode_get_cookie(inode);
311 255
312 spin_unlock(&vcookie->lock); 256 spin_unlock(&v9inode->fscache_lock);
313} 257}
314 258
315void v9fs_cache_inode_reset_cookie(struct inode *inode) 259void v9fs_cache_inode_reset_cookie(struct inode *inode)
316{ 260{
317 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 261 struct v9fs_inode *v9inode = V9FS_I(inode);
318 struct v9fs_session_info *v9ses; 262 struct v9fs_session_info *v9ses;
319 struct fscache_cookie *old; 263 struct fscache_cookie *old;
320 264
321 if (!vcookie->fscache) 265 if (!v9inode->fscache)
322 return; 266 return;
323 267
324 old = vcookie->fscache; 268 old = v9inode->fscache;
325 269
326 spin_lock(&vcookie->lock); 270 spin_lock(&v9inode->fscache_lock);
327 fscache_relinquish_cookie(vcookie->fscache, 1); 271 fscache_relinquish_cookie(v9inode->fscache, 1);
328 272
329 v9ses = v9fs_inode2v9ses(inode); 273 v9ses = v9fs_inode2v9ses(inode);
330 vcookie->fscache = fscache_acquire_cookie(v9ses->fscache, 274 v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
331 &v9fs_cache_inode_index_def, 275 &v9fs_cache_inode_index_def,
332 vcookie); 276 v9inode);
333
334 P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p", 277 P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
335 inode, old, vcookie->fscache); 278 inode, old, v9inode->fscache);
336 279
337 spin_unlock(&vcookie->lock); 280 spin_unlock(&v9inode->fscache_lock);
338} 281}
339 282
340int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) 283int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
341{ 284{
342 struct inode *inode = page->mapping->host; 285 struct inode *inode = page->mapping->host;
343 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 286 struct v9fs_inode *v9inode = V9FS_I(inode);
344 287
345 BUG_ON(!vcookie->fscache); 288 BUG_ON(!v9inode->fscache);
346 289
347 return fscache_maybe_release_page(vcookie->fscache, page, gfp); 290 return fscache_maybe_release_page(v9inode->fscache, page, gfp);
348} 291}
349 292
350void __v9fs_fscache_invalidate_page(struct page *page) 293void __v9fs_fscache_invalidate_page(struct page *page)
351{ 294{
352 struct inode *inode = page->mapping->host; 295 struct inode *inode = page->mapping->host;
353 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 296 struct v9fs_inode *v9inode = V9FS_I(inode);
354 297
355 BUG_ON(!vcookie->fscache); 298 BUG_ON(!v9inode->fscache);
356 299
357 if (PageFsCache(page)) { 300 if (PageFsCache(page)) {
358 fscache_wait_on_page_write(vcookie->fscache, page); 301 fscache_wait_on_page_write(v9inode->fscache, page);
359 BUG_ON(!PageLocked(page)); 302 BUG_ON(!PageLocked(page));
360 fscache_uncache_page(vcookie->fscache, page); 303 fscache_uncache_page(v9inode->fscache, page);
361 } 304 }
362} 305}
363 306
@@ -380,13 +323,13 @@ static void v9fs_vfs_readpage_complete(struct page *page, void *data,
380int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) 323int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
381{ 324{
382 int ret; 325 int ret;
383 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 326 const struct v9fs_inode *v9inode = V9FS_I(inode);
384 327
385 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); 328 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
386 if (!vcookie->fscache) 329 if (!v9inode->fscache)
387 return -ENOBUFS; 330 return -ENOBUFS;
388 331
389 ret = fscache_read_or_alloc_page(vcookie->fscache, 332 ret = fscache_read_or_alloc_page(v9inode->fscache,
390 page, 333 page,
391 v9fs_vfs_readpage_complete, 334 v9fs_vfs_readpage_complete,
392 NULL, 335 NULL,
@@ -418,13 +361,13 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
418 unsigned *nr_pages) 361 unsigned *nr_pages)
419{ 362{
420 int ret; 363 int ret;
421 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 364 const struct v9fs_inode *v9inode = V9FS_I(inode);
422 365
423 P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages); 366 P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
424 if (!vcookie->fscache) 367 if (!v9inode->fscache)
425 return -ENOBUFS; 368 return -ENOBUFS;
426 369
427 ret = fscache_read_or_alloc_pages(vcookie->fscache, 370 ret = fscache_read_or_alloc_pages(v9inode->fscache,
428 mapping, pages, nr_pages, 371 mapping, pages, nr_pages,
429 v9fs_vfs_readpage_complete, 372 v9fs_vfs_readpage_complete,
430 NULL, 373 NULL,
@@ -453,11 +396,22 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
453void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) 396void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
454{ 397{
455 int ret; 398 int ret;
456 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 399 const struct v9fs_inode *v9inode = V9FS_I(inode);
457 400
458 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); 401 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
459 ret = fscache_write_page(vcookie->fscache, page, GFP_KERNEL); 402 ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL);
460 P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret); 403 P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret);
461 if (ret != 0) 404 if (ret != 0)
462 v9fs_uncache_page(inode, page); 405 v9fs_uncache_page(inode, page);
463} 406}
407
408/*
409 * wait for a page to complete writing to the cache
410 */
411void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
412{
413 const struct v9fs_inode *v9inode = V9FS_I(inode);
414 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
415 if (PageFsCache(page))
416 fscache_wait_on_page_write(v9inode->fscache, page);
417}
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index a94192bfaee8..049507a5b01c 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -25,20 +25,6 @@
25#include <linux/fscache.h> 25#include <linux/fscache.h>
26#include <linux/spinlock.h> 26#include <linux/spinlock.h>
27 27
28extern struct kmem_cache *vcookie_cache;
29
30struct v9fs_cookie {
31 spinlock_t lock;
32 struct inode inode;
33 struct fscache_cookie *fscache;
34 struct p9_qid *qid;
35};
36
37static inline struct v9fs_cookie *v9fs_inode2cookie(const struct inode *inode)
38{
39 return container_of(inode, struct v9fs_cookie, inode);
40}
41
42extern struct fscache_netfs v9fs_cache_netfs; 28extern struct fscache_netfs v9fs_cache_netfs;
43extern const struct fscache_cookie_def v9fs_cache_session_index_def; 29extern const struct fscache_cookie_def v9fs_cache_session_index_def;
44extern const struct fscache_cookie_def v9fs_cache_inode_index_def; 30extern const struct fscache_cookie_def v9fs_cache_inode_index_def;
@@ -64,23 +50,8 @@ extern int __v9fs_readpages_from_fscache(struct inode *inode,
64 struct list_head *pages, 50 struct list_head *pages,
65 unsigned *nr_pages); 51 unsigned *nr_pages);
66extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page); 52extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
67 53extern void __v9fs_fscache_wait_on_page_write(struct inode *inode,
68 54 struct page *page);
69/**
70 * v9fs_cache_register - Register v9fs file system with the cache
71 */
72static inline int v9fs_cache_register(void)
73{
74 return __v9fs_cache_register();
75}
76
77/**
78 * v9fs_cache_unregister - Unregister v9fs from the cache
79 */
80static inline void v9fs_cache_unregister(void)
81{
82 __v9fs_cache_unregister();
83}
84 55
85static inline int v9fs_fscache_release_page(struct page *page, 56static inline int v9fs_fscache_release_page(struct page *page,
86 gfp_t gfp) 57 gfp_t gfp)
@@ -117,28 +88,27 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode,
117 88
118static inline void v9fs_uncache_page(struct inode *inode, struct page *page) 89static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
119{ 90{
120 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 91 struct v9fs_inode *v9inode = V9FS_I(inode);
121 fscache_uncache_page(vcookie->fscache, page); 92 fscache_uncache_page(v9inode->fscache, page);
122 BUG_ON(PageFsCache(page)); 93 BUG_ON(PageFsCache(page));
123} 94}
124 95
125static inline void v9fs_vcookie_set_qid(struct inode *inode, 96static inline void v9fs_fscache_set_key(struct inode *inode,
126 struct p9_qid *qid) 97 struct p9_qid *qid)
127{ 98{
128 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 99 struct v9fs_inode *v9inode = V9FS_I(inode);
129 spin_lock(&vcookie->lock); 100 spin_lock(&v9inode->fscache_lock);
130 vcookie->qid = qid; 101 v9inode->fscache_key = qid;
131 spin_unlock(&vcookie->lock); 102 spin_unlock(&v9inode->fscache_lock);
132} 103}
133 104
134#else /* CONFIG_9P_FSCACHE */ 105static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
135 106 struct page *page)
136static inline int v9fs_cache_register(void)
137{ 107{
138 return 1; 108 return __v9fs_fscache_wait_on_page_write(inode, page);
139} 109}
140 110
141static inline void v9fs_cache_unregister(void) {} 111#else /* CONFIG_9P_FSCACHE */
142 112
143static inline int v9fs_fscache_release_page(struct page *page, 113static inline int v9fs_fscache_release_page(struct page *page,
144 gfp_t gfp) { 114 gfp_t gfp) {
@@ -168,9 +138,11 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode,
168static inline void v9fs_uncache_page(struct inode *inode, struct page *page) 138static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
169{} 139{}
170 140
171static inline void v9fs_vcookie_set_qid(struct inode *inode, 141static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
172 struct p9_qid *qid) 142 struct page *page)
173{} 143{
144 return;
145}
174 146
175#endif /* CONFIG_9P_FSCACHE */ 147#endif /* CONFIG_9P_FSCACHE */
176#endif /* _9P_CACHE_H */ 148#endif /* _9P_CACHE_H */
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index b00223c99d70..85b67ffa2a43 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -125,46 +125,17 @@ err_out:
125 return -ENOMEM; 125 return -ENOMEM;
126} 126}
127 127
128/** 128static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
129 * v9fs_fid_lookup - lookup for a fid, try to walk if not found 129 uid_t uid, int any)
130 * @dentry: dentry to look for fid in
131 *
132 * Look for a fid in the specified dentry for the current user.
133 * If no fid is found, try to create one walking from a fid from the parent
134 * dentry (if it has one), or the root dentry. If the user haven't accessed
135 * the fs yet, attach now and walk from the root.
136 */
137
138struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
139{ 130{
140 int i, n, l, clone, any, access;
141 u32 uid;
142 struct p9_fid *fid, *old_fid = NULL;
143 struct dentry *ds; 131 struct dentry *ds;
144 struct v9fs_session_info *v9ses;
145 char **wnames, *uname; 132 char **wnames, *uname;
133 int i, n, l, clone, access;
134 struct v9fs_session_info *v9ses;
135 struct p9_fid *fid, *old_fid = NULL;
146 136
147 v9ses = v9fs_inode2v9ses(dentry->d_inode); 137 v9ses = v9fs_dentry2v9ses(dentry);
148 access = v9ses->flags & V9FS_ACCESS_MASK; 138 access = v9ses->flags & V9FS_ACCESS_MASK;
149 switch (access) {
150 case V9FS_ACCESS_SINGLE:
151 case V9FS_ACCESS_USER:
152 case V9FS_ACCESS_CLIENT:
153 uid = current_fsuid();
154 any = 0;
155 break;
156
157 case V9FS_ACCESS_ANY:
158 uid = v9ses->uid;
159 any = 1;
160 break;
161
162 default:
163 uid = ~0;
164 any = 0;
165 break;
166 }
167
168 fid = v9fs_fid_find(dentry, uid, any); 139 fid = v9fs_fid_find(dentry, uid, any);
169 if (fid) 140 if (fid)
170 return fid; 141 return fid;
@@ -250,6 +221,45 @@ err_out:
250 return fid; 221 return fid;
251} 222}
252 223
224/**
225 * v9fs_fid_lookup - lookup for a fid, try to walk if not found
226 * @dentry: dentry to look for fid in
227 *
228 * Look for a fid in the specified dentry for the current user.
229 * If no fid is found, try to create one walking from a fid from the parent
230 * dentry (if it has one), or the root dentry. If the user haven't accessed
231 * the fs yet, attach now and walk from the root.
232 */
233
234struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
235{
236 uid_t uid;
237 int any, access;
238 struct v9fs_session_info *v9ses;
239
240 v9ses = v9fs_dentry2v9ses(dentry);
241 access = v9ses->flags & V9FS_ACCESS_MASK;
242 switch (access) {
243 case V9FS_ACCESS_SINGLE:
244 case V9FS_ACCESS_USER:
245 case V9FS_ACCESS_CLIENT:
246 uid = current_fsuid();
247 any = 0;
248 break;
249
250 case V9FS_ACCESS_ANY:
251 uid = v9ses->uid;
252 any = 1;
253 break;
254
255 default:
256 uid = ~0;
257 any = 0;
258 break;
259 }
260 return v9fs_fid_lookup_with_uid(dentry, uid, any);
261}
262
253struct p9_fid *v9fs_fid_clone(struct dentry *dentry) 263struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
254{ 264{
255 struct p9_fid *fid, *ret; 265 struct p9_fid *fid, *ret;
@@ -261,3 +271,39 @@ struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
261 ret = p9_client_walk(fid, 0, NULL, 1); 271 ret = p9_client_walk(fid, 0, NULL, 1);
262 return ret; 272 return ret;
263} 273}
274
275static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid)
276{
277 struct p9_fid *fid, *ret;
278
279 fid = v9fs_fid_lookup_with_uid(dentry, uid, 0);
280 if (IS_ERR(fid))
281 return fid;
282
283 ret = p9_client_walk(fid, 0, NULL, 1);
284 return ret;
285}
286
287struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
288{
289 int err;
290 struct p9_fid *fid;
291
292 fid = v9fs_fid_clone_with_uid(dentry, 0);
293 if (IS_ERR(fid))
294 goto error_out;
295 /*
296 * writeback fid will only be used to write back the
297 * dirty pages. We always request for the open fid in read-write
298 * mode so that a partial page write which result in page
299 * read can work.
300 */
301 err = p9_client_open(fid, O_RDWR);
302 if (err < 0) {
303 p9_client_clunk(fid);
304 fid = ERR_PTR(err);
305 goto error_out;
306 }
307error_out:
308 return fid;
309}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index c3bbd6af996d..bb0b6e7f58fc 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -19,7 +19,8 @@
19 * Boston, MA 02111-1301 USA 19 * Boston, MA 02111-1301 USA
20 * 20 *
21 */ 21 */
22 22#ifndef FS_9P_FID_H
23#define FS_9P_FID_H
23#include <linux/list.h> 24#include <linux/list.h>
24 25
25/** 26/**
@@ -45,3 +46,5 @@ struct v9fs_dentry {
45struct p9_fid *v9fs_fid_lookup(struct dentry *dentry); 46struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
46struct p9_fid *v9fs_fid_clone(struct dentry *dentry); 47struct p9_fid *v9fs_fid_clone(struct dentry *dentry);
47int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid); 48int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
49struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
50#endif
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 2f77cd33ba83..c82b017f51f3 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -39,6 +39,7 @@
39 39
40static DEFINE_SPINLOCK(v9fs_sessionlist_lock); 40static DEFINE_SPINLOCK(v9fs_sessionlist_lock);
41static LIST_HEAD(v9fs_sessionlist); 41static LIST_HEAD(v9fs_sessionlist);
42struct kmem_cache *v9fs_inode_cache;
42 43
43/* 44/*
44 * Option Parsing (code inspired by NFS code) 45 * Option Parsing (code inspired by NFS code)
@@ -55,7 +56,7 @@ enum {
55 /* Cache options */ 56 /* Cache options */
56 Opt_cache_loose, Opt_fscache, 57 Opt_cache_loose, Opt_fscache,
57 /* Access options */ 58 /* Access options */
58 Opt_access, 59 Opt_access, Opt_posixacl,
59 /* Error token */ 60 /* Error token */
60 Opt_err 61 Opt_err
61}; 62};
@@ -73,6 +74,7 @@ static const match_table_t tokens = {
73 {Opt_fscache, "fscache"}, 74 {Opt_fscache, "fscache"},
74 {Opt_cachetag, "cachetag=%s"}, 75 {Opt_cachetag, "cachetag=%s"},
75 {Opt_access, "access=%s"}, 76 {Opt_access, "access=%s"},
77 {Opt_posixacl, "posixacl"},
76 {Opt_err, NULL} 78 {Opt_err, NULL}
77}; 79};
78 80
@@ -194,15 +196,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
194 else if (strcmp(s, "any") == 0) 196 else if (strcmp(s, "any") == 0)
195 v9ses->flags |= V9FS_ACCESS_ANY; 197 v9ses->flags |= V9FS_ACCESS_ANY;
196 else if (strcmp(s, "client") == 0) { 198 else if (strcmp(s, "client") == 0) {
197#ifdef CONFIG_9P_FS_POSIX_ACL
198 v9ses->flags |= V9FS_ACCESS_CLIENT; 199 v9ses->flags |= V9FS_ACCESS_CLIENT;
199#else
200 P9_DPRINTK(P9_DEBUG_ERROR,
201 "access=client option not supported\n");
202 kfree(s);
203 ret = -EINVAL;
204 goto free_and_return;
205#endif
206 } else { 200 } else {
207 v9ses->flags |= V9FS_ACCESS_SINGLE; 201 v9ses->flags |= V9FS_ACCESS_SINGLE;
208 v9ses->uid = simple_strtoul(s, &e, 10); 202 v9ses->uid = simple_strtoul(s, &e, 10);
@@ -212,6 +206,16 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
212 kfree(s); 206 kfree(s);
213 break; 207 break;
214 208
209 case Opt_posixacl:
210#ifdef CONFIG_9P_FS_POSIX_ACL
211 v9ses->flags |= V9FS_POSIX_ACL;
212#else
213 P9_DPRINTK(P9_DEBUG_ERROR,
214 "Not defined CONFIG_9P_FS_POSIX_ACL. "
215 "Ignoring posixacl option\n");
216#endif
217 break;
218
215 default: 219 default:
216 continue; 220 continue;
217 } 221 }
@@ -260,19 +264,12 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
260 list_add(&v9ses->slist, &v9fs_sessionlist); 264 list_add(&v9ses->slist, &v9fs_sessionlist);
261 spin_unlock(&v9fs_sessionlist_lock); 265 spin_unlock(&v9fs_sessionlist_lock);
262 266
263 v9ses->flags = V9FS_ACCESS_USER;
264 strcpy(v9ses->uname, V9FS_DEFUSER); 267 strcpy(v9ses->uname, V9FS_DEFUSER);
265 strcpy(v9ses->aname, V9FS_DEFANAME); 268 strcpy(v9ses->aname, V9FS_DEFANAME);
266 v9ses->uid = ~0; 269 v9ses->uid = ~0;
267 v9ses->dfltuid = V9FS_DEFUID; 270 v9ses->dfltuid = V9FS_DEFUID;
268 v9ses->dfltgid = V9FS_DEFGID; 271 v9ses->dfltgid = V9FS_DEFGID;
269 272
270 rc = v9fs_parse_options(v9ses, data);
271 if (rc < 0) {
272 retval = rc;
273 goto error;
274 }
275
276 v9ses->clnt = p9_client_create(dev_name, data); 273 v9ses->clnt = p9_client_create(dev_name, data);
277 if (IS_ERR(v9ses->clnt)) { 274 if (IS_ERR(v9ses->clnt)) {
278 retval = PTR_ERR(v9ses->clnt); 275 retval = PTR_ERR(v9ses->clnt);
@@ -281,10 +278,20 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
281 goto error; 278 goto error;
282 } 279 }
283 280
284 if (p9_is_proto_dotl(v9ses->clnt)) 281 v9ses->flags = V9FS_ACCESS_USER;
282
283 if (p9_is_proto_dotl(v9ses->clnt)) {
284 v9ses->flags = V9FS_ACCESS_CLIENT;
285 v9ses->flags |= V9FS_PROTO_2000L; 285 v9ses->flags |= V9FS_PROTO_2000L;
286 else if (p9_is_proto_dotu(v9ses->clnt)) 286 } else if (p9_is_proto_dotu(v9ses->clnt)) {
287 v9ses->flags |= V9FS_PROTO_2000U; 287 v9ses->flags |= V9FS_PROTO_2000U;
288 }
289
290 rc = v9fs_parse_options(v9ses, data);
291 if (rc < 0) {
292 retval = rc;
293 goto error;
294 }
288 295
289 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; 296 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
290 297
@@ -306,6 +313,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
306 v9ses->flags |= V9FS_ACCESS_ANY; 313 v9ses->flags |= V9FS_ACCESS_ANY;
307 v9ses->uid = ~0; 314 v9ses->uid = ~0;
308 } 315 }
316 if (!v9fs_proto_dotl(v9ses) ||
317 !((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
318 /*
319 * We support ACL checks on clinet only if the protocol is
320 * 9P2000.L and access is V9FS_ACCESS_CLIENT.
321 */
322 v9ses->flags &= ~V9FS_ACL_MASK;
323 }
309 324
310 fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0, 325 fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0,
311 v9ses->aname); 326 v9ses->aname);
@@ -467,6 +482,63 @@ static void v9fs_sysfs_cleanup(void)
467 kobject_put(v9fs_kobj); 482 kobject_put(v9fs_kobj);
468} 483}
469 484
485static void v9fs_inode_init_once(void *foo)
486{
487 struct v9fs_inode *v9inode = (struct v9fs_inode *)foo;
488#ifdef CONFIG_9P_FSCACHE
489 v9inode->fscache = NULL;
490 v9inode->fscache_key = NULL;
491#endif
492 inode_init_once(&v9inode->vfs_inode);
493}
494
495/**
496 * v9fs_init_inode_cache - initialize a cache for 9P
497 * Returns 0 on success.
498 */
499static int v9fs_init_inode_cache(void)
500{
501 v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
502 sizeof(struct v9fs_inode),
503 0, (SLAB_RECLAIM_ACCOUNT|
504 SLAB_MEM_SPREAD),
505 v9fs_inode_init_once);
506 if (!v9fs_inode_cache)
507 return -ENOMEM;
508
509 return 0;
510}
511
512/**
513 * v9fs_destroy_inode_cache - destroy the cache of 9P inode
514 *
515 */
516static void v9fs_destroy_inode_cache(void)
517{
518 kmem_cache_destroy(v9fs_inode_cache);
519}
520
521static int v9fs_cache_register(void)
522{
523 int ret;
524 ret = v9fs_init_inode_cache();
525 if (ret < 0)
526 return ret;
527#ifdef CONFIG_9P_FSCACHE
528 return fscache_register_netfs(&v9fs_cache_netfs);
529#else
530 return ret;
531#endif
532}
533
534static void v9fs_cache_unregister(void)
535{
536 v9fs_destroy_inode_cache();
537#ifdef CONFIG_9P_FSCACHE
538 fscache_unregister_netfs(&v9fs_cache_netfs);
539#endif
540}
541
470/** 542/**
471 * init_v9fs - Initialize module 543 * init_v9fs - Initialize module
472 * 544 *
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index c4b5d8864f0d..e5ebedfc5ed8 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -20,6 +20,9 @@
20 * Boston, MA 02111-1301 USA 20 * Boston, MA 02111-1301 USA
21 * 21 *
22 */ 22 */
23#ifndef FS_9P_V9FS_H
24#define FS_9P_V9FS_H
25
23#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
24 27
25/** 28/**
@@ -28,8 +31,10 @@
28 * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions 31 * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
29 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy 32 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
30 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) 33 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
34 * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client.
31 * @V9FS_ACCESS_ANY: use a single attach for all users 35 * @V9FS_ACCESS_ANY: use a single attach for all users
32 * @V9FS_ACCESS_MASK: bit mask of different ACCESS options 36 * @V9FS_ACCESS_MASK: bit mask of different ACCESS options
37 * @V9FS_POSIX_ACL: POSIX ACLs are enforced
33 * 38 *
34 * Session flags reflect options selected by users at mount time 39 * Session flags reflect options selected by users at mount time
35 */ 40 */
@@ -37,13 +42,15 @@
37 V9FS_ACCESS_USER | \ 42 V9FS_ACCESS_USER | \
38 V9FS_ACCESS_CLIENT) 43 V9FS_ACCESS_CLIENT)
39#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY 44#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY
45#define V9FS_ACL_MASK V9FS_POSIX_ACL
40 46
41enum p9_session_flags { 47enum p9_session_flags {
42 V9FS_PROTO_2000U = 0x01, 48 V9FS_PROTO_2000U = 0x01,
43 V9FS_PROTO_2000L = 0x02, 49 V9FS_PROTO_2000L = 0x02,
44 V9FS_ACCESS_SINGLE = 0x04, 50 V9FS_ACCESS_SINGLE = 0x04,
45 V9FS_ACCESS_USER = 0x08, 51 V9FS_ACCESS_USER = 0x08,
46 V9FS_ACCESS_CLIENT = 0x10 52 V9FS_ACCESS_CLIENT = 0x10,
53 V9FS_POSIX_ACL = 0x20
47}; 54};
48 55
49/* possible values of ->cache */ 56/* possible values of ->cache */
@@ -111,6 +118,26 @@ struct v9fs_session_info {
111 struct rw_semaphore rename_sem; 118 struct rw_semaphore rename_sem;
112}; 119};
113 120
121/* cache_validity flags */
122#define V9FS_INO_INVALID_ATTR 0x01
123
124struct v9fs_inode {
125#ifdef CONFIG_9P_FSCACHE
126 spinlock_t fscache_lock;
127 struct fscache_cookie *fscache;
128 struct p9_qid *fscache_key;
129#endif
130 unsigned int cache_validity;
131 struct p9_fid *writeback_fid;
132 struct mutex v_mutex;
133 struct inode vfs_inode;
134};
135
136static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
137{
138 return container_of(inode, struct v9fs_inode, vfs_inode);
139}
140
114struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, 141struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
115 char *); 142 char *);
116extern void v9fs_session_close(struct v9fs_session_info *v9ses); 143extern void v9fs_session_close(struct v9fs_session_info *v9ses);
@@ -124,16 +151,15 @@ extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
124 struct inode *new_dir, struct dentry *new_dentry); 151 struct inode *new_dir, struct dentry *new_dentry);
125extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, 152extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
126 void *p); 153 void *p);
127extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses, 154extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
128 struct p9_fid *fid, 155 struct p9_fid *fid,
129 struct super_block *sb); 156 struct super_block *sb);
130
131extern const struct inode_operations v9fs_dir_inode_operations_dotl; 157extern const struct inode_operations v9fs_dir_inode_operations_dotl;
132extern const struct inode_operations v9fs_file_inode_operations_dotl; 158extern const struct inode_operations v9fs_file_inode_operations_dotl;
133extern const struct inode_operations v9fs_symlink_inode_operations_dotl; 159extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
134extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses, 160extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
135 struct p9_fid *fid, 161 struct p9_fid *fid,
136 struct super_block *sb); 162 struct super_block *sb);
137 163
138/* other default globals */ 164/* other default globals */
139#define V9FS_PORT 564 165#define V9FS_PORT 564
@@ -147,6 +173,11 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
147 return (inode->i_sb->s_fs_info); 173 return (inode->i_sb->s_fs_info);
148} 174}
149 175
176static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry)
177{
178 return dentry->d_sb->s_fs_info;
179}
180
150static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses) 181static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
151{ 182{
152 return v9ses->flags & V9FS_PROTO_2000U; 183 return v9ses->flags & V9FS_PROTO_2000U;
@@ -158,7 +189,7 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
158} 189}
159 190
160/** 191/**
161 * v9fs_inode_from_fid - Helper routine to populate an inode by 192 * v9fs_get_inode_from_fid - Helper routine to populate an inode by
162 * issuing a attribute request 193 * issuing a attribute request
163 * @v9ses: session information 194 * @v9ses: session information
164 * @fid: fid to issue attribute request for 195 * @fid: fid to issue attribute request for
@@ -166,11 +197,12 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
166 * 197 *
167 */ 198 */
168static inline struct inode * 199static inline struct inode *
169v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, 200v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
170 struct super_block *sb) 201 struct super_block *sb)
171{ 202{
172 if (v9fs_proto_dotl(v9ses)) 203 if (v9fs_proto_dotl(v9ses))
173 return v9fs_inode_dotl(v9ses, fid, sb); 204 return v9fs_inode_from_fid_dotl(v9ses, fid, sb);
174 else 205 else
175 return v9fs_inode(v9ses, fid, sb); 206 return v9fs_inode_from_fid(v9ses, fid, sb);
176} 207}
208#endif
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index b789f8e597ec..4014160903a9 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -20,6 +20,8 @@
20 * Boston, MA 02111-1301 USA 20 * Boston, MA 02111-1301 USA
21 * 21 *
22 */ 22 */
23#ifndef FS_9P_V9FS_VFS_H
24#define FS_9P_V9FS_VFS_H
23 25
24/* plan9 semantics are that created files are implicitly opened. 26/* plan9 semantics are that created files are implicitly opened.
25 * But linux semantics are that you call create, then open. 27 * But linux semantics are that you call create, then open.
@@ -36,6 +38,7 @@
36 * unlink calls remove, which is an implicit clunk. So we have to track 38 * unlink calls remove, which is an implicit clunk. So we have to track
37 * that kind of thing so that we don't try to clunk a dead fid. 39 * that kind of thing so that we don't try to clunk a dead fid.
38 */ 40 */
41#define P9_LOCK_TIMEOUT (30*HZ)
39 42
40extern struct file_system_type v9fs_fs_type; 43extern struct file_system_type v9fs_fs_type;
41extern const struct address_space_operations v9fs_addr_operations; 44extern const struct address_space_operations v9fs_addr_operations;
@@ -45,13 +48,15 @@ extern const struct file_operations v9fs_dir_operations;
45extern const struct file_operations v9fs_dir_operations_dotl; 48extern const struct file_operations v9fs_dir_operations_dotl;
46extern const struct dentry_operations v9fs_dentry_operations; 49extern const struct dentry_operations v9fs_dentry_operations;
47extern const struct dentry_operations v9fs_cached_dentry_operations; 50extern const struct dentry_operations v9fs_cached_dentry_operations;
51extern const struct file_operations v9fs_cached_file_operations;
52extern const struct file_operations v9fs_cached_file_operations_dotl;
53extern struct kmem_cache *v9fs_inode_cache;
48 54
49#ifdef CONFIG_9P_FSCACHE
50struct inode *v9fs_alloc_inode(struct super_block *sb); 55struct inode *v9fs_alloc_inode(struct super_block *sb);
51void v9fs_destroy_inode(struct inode *inode); 56void v9fs_destroy_inode(struct inode *inode);
52#endif
53
54struct inode *v9fs_get_inode(struct super_block *sb, int mode); 57struct inode *v9fs_get_inode(struct super_block *sb, int mode);
58int v9fs_init_inode(struct v9fs_session_info *v9ses,
59 struct inode *inode, int mode);
55void v9fs_evict_inode(struct inode *inode); 60void v9fs_evict_inode(struct inode *inode);
56ino_t v9fs_qid2ino(struct p9_qid *qid); 61ino_t v9fs_qid2ino(struct p9_qid *qid);
57void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); 62void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
@@ -62,8 +67,19 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
62int v9fs_uflags2omode(int uflags, int extended); 67int v9fs_uflags2omode(int uflags, int extended);
63 68
64ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); 69ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
70ssize_t v9fs_fid_readn(struct p9_fid *, char *, char __user *, u32, u64);
65void v9fs_blank_wstat(struct p9_wstat *wstat); 71void v9fs_blank_wstat(struct p9_wstat *wstat);
66int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *); 72int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
67int v9fs_file_fsync_dotl(struct file *filp, int datasync); 73int v9fs_file_fsync_dotl(struct file *filp, int datasync);
68 74ssize_t v9fs_file_write_internal(struct inode *, struct p9_fid *,
69#define P9_LOCK_TIMEOUT (30*HZ) 75 const char __user *, size_t, loff_t *, int);
76int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode);
77int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode);
78static inline void v9fs_invalidate_inode_attr(struct inode *inode)
79{
80 struct v9fs_inode *v9inode;
81 v9inode = V9FS_I(inode);
82 v9inode->cache_validity |= V9FS_INO_INVALID_ATTR;
83 return;
84}
85#endif
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index b7f2a8e3863e..2524e4cbb8ea 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -39,16 +39,16 @@
39#include "v9fs.h" 39#include "v9fs.h"
40#include "v9fs_vfs.h" 40#include "v9fs_vfs.h"
41#include "cache.h" 41#include "cache.h"
42#include "fid.h"
42 43
43/** 44/**
44 * v9fs_vfs_readpage - read an entire page in from 9P 45 * v9fs_fid_readpage - read an entire page in from 9P
45 * 46 *
46 * @filp: file being read 47 * @fid: fid being read
47 * @page: structure to page 48 * @page: structure to page
48 * 49 *
49 */ 50 */
50 51static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
51static int v9fs_vfs_readpage(struct file *filp, struct page *page)
52{ 52{
53 int retval; 53 int retval;
54 loff_t offset; 54 loff_t offset;
@@ -67,7 +67,7 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
67 buffer = kmap(page); 67 buffer = kmap(page);
68 offset = page_offset(page); 68 offset = page_offset(page);
69 69
70 retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset); 70 retval = v9fs_fid_readn(fid, buffer, NULL, PAGE_CACHE_SIZE, offset);
71 if (retval < 0) { 71 if (retval < 0) {
72 v9fs_uncache_page(inode, page); 72 v9fs_uncache_page(inode, page);
73 goto done; 73 goto done;
@@ -87,6 +87,19 @@ done:
87} 87}
88 88
89/** 89/**
90 * v9fs_vfs_readpage - read an entire page in from 9P
91 *
92 * @filp: file being read
93 * @page: structure to page
94 *
95 */
96
97static int v9fs_vfs_readpage(struct file *filp, struct page *page)
98{
99 return v9fs_fid_readpage(filp->private_data, page);
100}
101
102/**
90 * v9fs_vfs_readpages - read a set of pages from 9P 103 * v9fs_vfs_readpages - read a set of pages from 9P
91 * 104 *
92 * @filp: file being read 105 * @filp: file being read
@@ -124,7 +137,6 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
124{ 137{
125 if (PagePrivate(page)) 138 if (PagePrivate(page))
126 return 0; 139 return 0;
127
128 return v9fs_fscache_release_page(page, gfp); 140 return v9fs_fscache_release_page(page, gfp);
129} 141}
130 142
@@ -137,20 +149,89 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
137 149
138static void v9fs_invalidate_page(struct page *page, unsigned long offset) 150static void v9fs_invalidate_page(struct page *page, unsigned long offset)
139{ 151{
152 /*
153 * If called with zero offset, we should release
154 * the private state assocated with the page
155 */
140 if (offset == 0) 156 if (offset == 0)
141 v9fs_fscache_invalidate_page(page); 157 v9fs_fscache_invalidate_page(page);
142} 158}
143 159
160static int v9fs_vfs_writepage_locked(struct page *page)
161{
162 char *buffer;
163 int retval, len;
164 loff_t offset, size;
165 mm_segment_t old_fs;
166 struct v9fs_inode *v9inode;
167 struct inode *inode = page->mapping->host;
168
169 v9inode = V9FS_I(inode);
170 size = i_size_read(inode);
171 if (page->index == size >> PAGE_CACHE_SHIFT)
172 len = size & ~PAGE_CACHE_MASK;
173 else
174 len = PAGE_CACHE_SIZE;
175
176 set_page_writeback(page);
177
178 buffer = kmap(page);
179 offset = page_offset(page);
180
181 old_fs = get_fs();
182 set_fs(get_ds());
183 /* We should have writeback_fid always set */
184 BUG_ON(!v9inode->writeback_fid);
185
186 retval = v9fs_file_write_internal(inode,
187 v9inode->writeback_fid,
188 (__force const char __user *)buffer,
189 len, &offset, 0);
190 if (retval > 0)
191 retval = 0;
192
193 set_fs(old_fs);
194 kunmap(page);
195 end_page_writeback(page);
196 return retval;
197}
198
199static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
200{
201 int retval;
202
203 retval = v9fs_vfs_writepage_locked(page);
204 if (retval < 0) {
205 if (retval == -EAGAIN) {
206 redirty_page_for_writepage(wbc, page);
207 retval = 0;
208 } else {
209 SetPageError(page);
210 mapping_set_error(page->mapping, retval);
211 }
212 } else
213 retval = 0;
214
215 unlock_page(page);
216 return retval;
217}
218
144/** 219/**
145 * v9fs_launder_page - Writeback a dirty page 220 * v9fs_launder_page - Writeback a dirty page
146 * Since the writes go directly to the server, we simply return a 0
147 * here to indicate success.
148 *
149 * Returns 0 on success. 221 * Returns 0 on success.
150 */ 222 */
151 223
152static int v9fs_launder_page(struct page *page) 224static int v9fs_launder_page(struct page *page)
153{ 225{
226 int retval;
227 struct inode *inode = page->mapping->host;
228
229 v9fs_fscache_wait_on_page_write(inode, page);
230 if (clear_page_dirty_for_io(page)) {
231 retval = v9fs_vfs_writepage_locked(page);
232 if (retval)
233 return retval;
234 }
154 return 0; 235 return 0;
155} 236}
156 237
@@ -173,9 +254,15 @@ static int v9fs_launder_page(struct page *page)
173 * with an error. 254 * with an error.
174 * 255 *
175 */ 256 */
176ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 257static ssize_t
177 loff_t pos, unsigned long nr_segs) 258v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
259 loff_t pos, unsigned long nr_segs)
178{ 260{
261 /*
262 * FIXME
263 * Now that we do caching with cache mode enabled, We need
264 * to support direct IO
265 */
179 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) " 266 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
180 "off/no(%lld/%lu) EINVAL\n", 267 "off/no(%lld/%lu) EINVAL\n",
181 iocb->ki_filp->f_path.dentry->d_name.name, 268 iocb->ki_filp->f_path.dentry->d_name.name,
@@ -183,11 +270,84 @@ ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
183 270
184 return -EINVAL; 271 return -EINVAL;
185} 272}
273
274static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
275 loff_t pos, unsigned len, unsigned flags,
276 struct page **pagep, void **fsdata)
277{
278 int retval = 0;
279 struct page *page;
280 struct v9fs_inode *v9inode;
281 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
282 struct inode *inode = mapping->host;
283
284 v9inode = V9FS_I(inode);
285start:
286 page = grab_cache_page_write_begin(mapping, index, flags);
287 if (!page) {
288 retval = -ENOMEM;
289 goto out;
290 }
291 BUG_ON(!v9inode->writeback_fid);
292 if (PageUptodate(page))
293 goto out;
294
295 if (len == PAGE_CACHE_SIZE)
296 goto out;
297
298 retval = v9fs_fid_readpage(v9inode->writeback_fid, page);
299 page_cache_release(page);
300 if (!retval)
301 goto start;
302out:
303 *pagep = page;
304 return retval;
305}
306
307static int v9fs_write_end(struct file *filp, struct address_space *mapping,
308 loff_t pos, unsigned len, unsigned copied,
309 struct page *page, void *fsdata)
310{
311 loff_t last_pos = pos + copied;
312 struct inode *inode = page->mapping->host;
313
314 if (unlikely(copied < len)) {
315 /*
316 * zero out the rest of the area
317 */
318 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
319
320 zero_user(page, from + copied, len - copied);
321 flush_dcache_page(page);
322 }
323
324 if (!PageUptodate(page))
325 SetPageUptodate(page);
326 /*
327 * No need to use i_size_read() here, the i_size
328 * cannot change under us because we hold the i_mutex.
329 */
330 if (last_pos > inode->i_size) {
331 inode_add_bytes(inode, last_pos - inode->i_size);
332 i_size_write(inode, last_pos);
333 }
334 set_page_dirty(page);
335 unlock_page(page);
336 page_cache_release(page);
337
338 return copied;
339}
340
341
186const struct address_space_operations v9fs_addr_operations = { 342const struct address_space_operations v9fs_addr_operations = {
187 .readpage = v9fs_vfs_readpage, 343 .readpage = v9fs_vfs_readpage,
188 .readpages = v9fs_vfs_readpages, 344 .readpages = v9fs_vfs_readpages,
189 .releasepage = v9fs_release_page, 345 .set_page_dirty = __set_page_dirty_nobuffers,
190 .invalidatepage = v9fs_invalidate_page, 346 .writepage = v9fs_vfs_writepage,
191 .launder_page = v9fs_launder_page, 347 .write_begin = v9fs_write_begin,
192 .direct_IO = v9fs_direct_IO, 348 .write_end = v9fs_write_end,
349 .releasepage = v9fs_release_page,
350 .invalidatepage = v9fs_invalidate_page,
351 .launder_page = v9fs_launder_page,
352 .direct_IO = v9fs_direct_IO,
193}; 353};
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 233b7d4ffe5e..e022890c6f40 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -63,20 +63,15 @@ static int v9fs_dentry_delete(const struct dentry *dentry)
63 * v9fs_cached_dentry_delete - called when dentry refcount equals 0 63 * v9fs_cached_dentry_delete - called when dentry refcount equals 0
64 * @dentry: dentry in question 64 * @dentry: dentry in question
65 * 65 *
66 * Only return 1 if our inode is invalid. Only non-synthetic files
67 * (ones without mtime == 0) should be calling this function.
68 *
69 */ 66 */
70
71static int v9fs_cached_dentry_delete(const struct dentry *dentry) 67static int v9fs_cached_dentry_delete(const struct dentry *dentry)
72{ 68{
73 struct inode *inode = dentry->d_inode; 69 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n",
74 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, 70 dentry->d_name.name, dentry);
75 dentry);
76 71
77 if(!inode) 72 /* Don't cache negative dentries */
73 if (!dentry->d_inode)
78 return 1; 74 return 1;
79
80 return 0; 75 return 0;
81} 76}
82 77
@@ -105,7 +100,43 @@ static void v9fs_dentry_release(struct dentry *dentry)
105 } 100 }
106} 101}
107 102
103static int v9fs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
104{
105 struct p9_fid *fid;
106 struct inode *inode;
107 struct v9fs_inode *v9inode;
108
109 if (nd->flags & LOOKUP_RCU)
110 return -ECHILD;
111
112 inode = dentry->d_inode;
113 if (!inode)
114 goto out_valid;
115
116 v9inode = V9FS_I(inode);
117 if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
118 int retval;
119 struct v9fs_session_info *v9ses;
120 fid = v9fs_fid_lookup(dentry);
121 if (IS_ERR(fid))
122 return PTR_ERR(fid);
123
124 v9ses = v9fs_inode2v9ses(inode);
125 if (v9fs_proto_dotl(v9ses))
126 retval = v9fs_refresh_inode_dotl(fid, inode);
127 else
128 retval = v9fs_refresh_inode(fid, inode);
129 if (retval == -ENOENT)
130 return 0;
131 if (retval < 0)
132 return retval;
133 }
134out_valid:
135 return 1;
136}
137
108const struct dentry_operations v9fs_cached_dentry_operations = { 138const struct dentry_operations v9fs_cached_dentry_operations = {
139 .d_revalidate = v9fs_lookup_revalidate,
109 .d_delete = v9fs_cached_dentry_delete, 140 .d_delete = v9fs_cached_dentry_delete,
110 .d_release = v9fs_dentry_release, 141 .d_release = v9fs_dentry_release,
111}; 142};
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index b84ebe8cefed..9c2bdda5cd9d 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -295,7 +295,6 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
295 P9_DPRINTK(P9_DEBUG_VFS, 295 P9_DPRINTK(P9_DEBUG_VFS,
296 "v9fs_dir_release: inode: %p filp: %p fid: %d\n", 296 "v9fs_dir_release: inode: %p filp: %p fid: %d\n",
297 inode, filp, fid ? fid->fid : -1); 297 inode, filp, fid ? fid->fid : -1);
298 filemap_write_and_wait(inode->i_mapping);
299 if (fid) 298 if (fid)
300 p9_client_clunk(fid); 299 p9_client_clunk(fid);
301 return 0; 300 return 0;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 240c30674396..ffed55817f0c 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -44,8 +44,7 @@
44#include "fid.h" 44#include "fid.h"
45#include "cache.h" 45#include "cache.h"
46 46
47static const struct file_operations v9fs_cached_file_operations; 47static const struct vm_operations_struct v9fs_file_vm_ops;
48static const struct file_operations v9fs_cached_file_operations_dotl;
49 48
50/** 49/**
51 * v9fs_file_open - open a file (or directory) 50 * v9fs_file_open - open a file (or directory)
@@ -57,11 +56,13 @@ static const struct file_operations v9fs_cached_file_operations_dotl;
57int v9fs_file_open(struct inode *inode, struct file *file) 56int v9fs_file_open(struct inode *inode, struct file *file)
58{ 57{
59 int err; 58 int err;
59 struct v9fs_inode *v9inode;
60 struct v9fs_session_info *v9ses; 60 struct v9fs_session_info *v9ses;
61 struct p9_fid *fid; 61 struct p9_fid *fid;
62 int omode; 62 int omode;
63 63
64 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); 64 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
65 v9inode = V9FS_I(inode);
65 v9ses = v9fs_inode2v9ses(inode); 66 v9ses = v9fs_inode2v9ses(inode);
66 if (v9fs_proto_dotl(v9ses)) 67 if (v9fs_proto_dotl(v9ses))
67 omode = file->f_flags; 68 omode = file->f_flags;
@@ -89,20 +90,34 @@ int v9fs_file_open(struct inode *inode, struct file *file)
89 } 90 }
90 91
91 file->private_data = fid; 92 file->private_data = fid;
92 if ((fid->qid.version) && (v9ses->cache)) { 93 mutex_lock(&v9inode->v_mutex);
93 P9_DPRINTK(P9_DEBUG_VFS, "cached"); 94 if (v9ses->cache && !v9inode->writeback_fid &&
94 /* enable cached file options */ 95 ((file->f_flags & O_ACCMODE) != O_RDONLY)) {
95 if(file->f_op == &v9fs_file_operations) 96 /*
96 file->f_op = &v9fs_cached_file_operations; 97 * clone a fid and add it to writeback_fid
97 else if (file->f_op == &v9fs_file_operations_dotl) 98 * we do it during open time instead of
98 file->f_op = &v9fs_cached_file_operations_dotl; 99 * page dirty time via write_begin/page_mkwrite
99 100 * because we want write after unlink usecase
101 * to work.
102 */
103 fid = v9fs_writeback_fid(file->f_path.dentry);
104 if (IS_ERR(fid)) {
105 err = PTR_ERR(fid);
106 mutex_unlock(&v9inode->v_mutex);
107 goto out_error;
108 }
109 v9inode->writeback_fid = (void *) fid;
110 }
111 mutex_unlock(&v9inode->v_mutex);
100#ifdef CONFIG_9P_FSCACHE 112#ifdef CONFIG_9P_FSCACHE
113 if (v9ses->cache)
101 v9fs_cache_inode_set_cookie(inode, file); 114 v9fs_cache_inode_set_cookie(inode, file);
102#endif 115#endif
103 }
104
105 return 0; 116 return 0;
117out_error:
118 p9_client_clunk(file->private_data);
119 file->private_data = NULL;
120 return err;
106} 121}
107 122
108/** 123/**
@@ -335,25 +350,22 @@ out_err:
335} 350}
336 351
337/** 352/**
338 * v9fs_file_readn - read from a file 353 * v9fs_fid_readn - read from a fid
339 * @filp: file pointer to read 354 * @fid: fid to read
340 * @data: data buffer to read data into 355 * @data: data buffer to read data into
341 * @udata: user data buffer to read data into 356 * @udata: user data buffer to read data into
342 * @count: size of buffer 357 * @count: size of buffer
343 * @offset: offset at which to read data 358 * @offset: offset at which to read data
344 * 359 *
345 */ 360 */
346
347ssize_t 361ssize_t
348v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, 362v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count,
349 u64 offset) 363 u64 offset)
350{ 364{
351 int n, total, size; 365 int n, total, size;
352 struct p9_fid *fid = filp->private_data;
353 366
354 P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, 367 P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
355 (long long unsigned) offset, count); 368 (long long unsigned) offset, count);
356
357 n = 0; 369 n = 0;
358 total = 0; 370 total = 0;
359 size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; 371 size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
@@ -379,6 +391,22 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
379} 391}
380 392
381/** 393/**
394 * v9fs_file_readn - read from a file
395 * @filp: file pointer to read
396 * @data: data buffer to read data into
397 * @udata: user data buffer to read data into
398 * @count: size of buffer
399 * @offset: offset at which to read data
400 *
401 */
402ssize_t
403v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
404 u64 offset)
405{
406 return v9fs_fid_readn(filp->private_data, data, udata, count, offset);
407}
408
409/**
382 * v9fs_file_read - read from a file 410 * v9fs_file_read - read from a file
383 * @filp: file pointer to read 411 * @filp: file pointer to read
384 * @udata: user data buffer to read data into 412 * @udata: user data buffer to read data into
@@ -410,45 +438,22 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
410 return ret; 438 return ret;
411} 439}
412 440
413/** 441ssize_t
414 * v9fs_file_write - write to a file 442v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
415 * @filp: file pointer to write 443 const char __user *data, size_t count,
416 * @data: data buffer to write data from 444 loff_t *offset, int invalidate)
417 * @count: size of buffer
418 * @offset: offset at which to write data
419 *
420 */
421
422static ssize_t
423v9fs_file_write(struct file *filp, const char __user * data,
424 size_t count, loff_t * offset)
425{ 445{
426 ssize_t retval;
427 size_t total = 0;
428 int n; 446 int n;
429 struct p9_fid *fid; 447 loff_t i_size;
448 size_t total = 0;
430 struct p9_client *clnt; 449 struct p9_client *clnt;
431 struct inode *inode = filp->f_path.dentry->d_inode;
432 loff_t origin = *offset; 450 loff_t origin = *offset;
433 unsigned long pg_start, pg_end; 451 unsigned long pg_start, pg_end;
434 452
435 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, 453 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
436 (int)count, (int)*offset); 454 (int)count, (int)*offset);
437 455
438 fid = filp->private_data;
439 clnt = fid->clnt; 456 clnt = fid->clnt;
440
441 retval = generic_write_checks(filp, &origin, &count, 0);
442 if (retval)
443 goto out;
444
445 retval = -EINVAL;
446 if ((ssize_t) count < 0)
447 goto out;
448 retval = 0;
449 if (!count)
450 goto out;
451
452 do { 457 do {
453 n = p9_client_write(fid, NULL, data+total, origin+total, count); 458 n = p9_client_write(fid, NULL, data+total, origin+total, count);
454 if (n <= 0) 459 if (n <= 0)
@@ -457,25 +462,63 @@ v9fs_file_write(struct file *filp, const char __user * data,
457 total += n; 462 total += n;
458 } while (count > 0); 463 } while (count > 0);
459 464
460 if (total > 0) { 465 if (invalidate && (total > 0)) {
461 pg_start = origin >> PAGE_CACHE_SHIFT; 466 pg_start = origin >> PAGE_CACHE_SHIFT;
462 pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT; 467 pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
463 if (inode->i_mapping && inode->i_mapping->nrpages) 468 if (inode->i_mapping && inode->i_mapping->nrpages)
464 invalidate_inode_pages2_range(inode->i_mapping, 469 invalidate_inode_pages2_range(inode->i_mapping,
465 pg_start, pg_end); 470 pg_start, pg_end);
466 *offset += total; 471 *offset += total;
467 i_size_write(inode, i_size_read(inode) + total); 472 i_size = i_size_read(inode);
468 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; 473 if (*offset > i_size) {
474 inode_add_bytes(inode, *offset - i_size);
475 i_size_write(inode, *offset);
476 }
469 } 477 }
470
471 if (n < 0) 478 if (n < 0)
472 retval = n; 479 return n;
473 else 480
474 retval = total; 481 return total;
482}
483
484/**
485 * v9fs_file_write - write to a file
486 * @filp: file pointer to write
487 * @data: data buffer to write data from
488 * @count: size of buffer
489 * @offset: offset at which to write data
490 *
491 */
492static ssize_t
493v9fs_file_write(struct file *filp, const char __user * data,
494 size_t count, loff_t *offset)
495{
496 ssize_t retval = 0;
497 loff_t origin = *offset;
498
499
500 retval = generic_write_checks(filp, &origin, &count, 0);
501 if (retval)
502 goto out;
503
504 retval = -EINVAL;
505 if ((ssize_t) count < 0)
506 goto out;
507 retval = 0;
508 if (!count)
509 goto out;
510
511 retval = v9fs_file_write_internal(filp->f_path.dentry->d_inode,
512 filp->private_data,
513 data, count, &origin, 1);
514 /* update offset on successful write */
515 if (retval > 0)
516 *offset = origin;
475out: 517out:
476 return retval; 518 return retval;
477} 519}
478 520
521
479static int v9fs_file_fsync(struct file *filp, int datasync) 522static int v9fs_file_fsync(struct file *filp, int datasync)
480{ 523{
481 struct p9_fid *fid; 524 struct p9_fid *fid;
@@ -505,28 +548,182 @@ int v9fs_file_fsync_dotl(struct file *filp, int datasync)
505 return retval; 548 return retval;
506} 549}
507 550
508static const struct file_operations v9fs_cached_file_operations = { 551static int
552v9fs_file_mmap(struct file *file, struct vm_area_struct *vma)
553{
554 int retval;
555
556 retval = generic_file_mmap(file, vma);
557 if (!retval)
558 vma->vm_ops = &v9fs_file_vm_ops;
559
560 return retval;
561}
562
563static int
564v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
565{
566 struct v9fs_inode *v9inode;
567 struct page *page = vmf->page;
568 struct file *filp = vma->vm_file;
569 struct inode *inode = filp->f_path.dentry->d_inode;
570
571
572 P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n",
573 page, (unsigned long)filp->private_data);
574
575 v9inode = V9FS_I(inode);
576 /* make sure the cache has finished storing the page */
577 v9fs_fscache_wait_on_page_write(inode, page);
578 BUG_ON(!v9inode->writeback_fid);
579 lock_page(page);
580 if (page->mapping != inode->i_mapping)
581 goto out_unlock;
582
583 return VM_FAULT_LOCKED;
584out_unlock:
585 unlock_page(page);
586 return VM_FAULT_NOPAGE;
587}
588
589static ssize_t
590v9fs_direct_read(struct file *filp, char __user *udata, size_t count,
591 loff_t *offsetp)
592{
593 loff_t size, offset;
594 struct inode *inode;
595 struct address_space *mapping;
596
597 offset = *offsetp;
598 mapping = filp->f_mapping;
599 inode = mapping->host;
600 if (!count)
601 return 0;
602 size = i_size_read(inode);
603 if (offset < size)
604 filemap_write_and_wait_range(mapping, offset,
605 offset + count - 1);
606
607 return v9fs_file_read(filp, udata, count, offsetp);
608}
609
610/**
611 * v9fs_cached_file_read - read from a file
612 * @filp: file pointer to read
613 * @udata: user data buffer to read data into
614 * @count: size of buffer
615 * @offset: offset at which to read data
616 *
617 */
618static ssize_t
619v9fs_cached_file_read(struct file *filp, char __user *data, size_t count,
620 loff_t *offset)
621{
622 if (filp->f_flags & O_DIRECT)
623 return v9fs_direct_read(filp, data, count, offset);
624 return do_sync_read(filp, data, count, offset);
625}
626
627static ssize_t
628v9fs_direct_write(struct file *filp, const char __user * data,
629 size_t count, loff_t *offsetp)
630{
631 loff_t offset;
632 ssize_t retval;
633 struct inode *inode;
634 struct address_space *mapping;
635
636 offset = *offsetp;
637 mapping = filp->f_mapping;
638 inode = mapping->host;
639 if (!count)
640 return 0;
641
642 mutex_lock(&inode->i_mutex);
643 retval = filemap_write_and_wait_range(mapping, offset,
644 offset + count - 1);
645 if (retval)
646 goto err_out;
647 /*
648 * After a write we want buffered reads to be sure to go to disk to get
649 * the new data. We invalidate clean cached page from the region we're
650 * about to write. We do this *before* the write so that if we fail
651 * here we fall back to buffered write
652 */
653 if (mapping->nrpages) {
654 pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT;
655 pgoff_t pg_end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
656
657 retval = invalidate_inode_pages2_range(mapping,
658 pg_start, pg_end);
659 /*
660 * If a page can not be invalidated, fall back
661 * to buffered write.
662 */
663 if (retval) {
664 if (retval == -EBUSY)
665 goto buff_write;
666 goto err_out;
667 }
668 }
669 retval = v9fs_file_write(filp, data, count, offsetp);
670err_out:
671 mutex_unlock(&inode->i_mutex);
672 return retval;
673
674buff_write:
675 mutex_unlock(&inode->i_mutex);
676 return do_sync_write(filp, data, count, offsetp);
677}
678
679/**
680 * v9fs_cached_file_write - write to a file
681 * @filp: file pointer to write
682 * @data: data buffer to write data from
683 * @count: size of buffer
684 * @offset: offset at which to write data
685 *
686 */
687static ssize_t
688v9fs_cached_file_write(struct file *filp, const char __user * data,
689 size_t count, loff_t *offset)
690{
691
692 if (filp->f_flags & O_DIRECT)
693 return v9fs_direct_write(filp, data, count, offset);
694 return do_sync_write(filp, data, count, offset);
695}
696
697static const struct vm_operations_struct v9fs_file_vm_ops = {
698 .fault = filemap_fault,
699 .page_mkwrite = v9fs_vm_page_mkwrite,
700};
701
702
703const struct file_operations v9fs_cached_file_operations = {
509 .llseek = generic_file_llseek, 704 .llseek = generic_file_llseek,
510 .read = do_sync_read, 705 .read = v9fs_cached_file_read,
706 .write = v9fs_cached_file_write,
511 .aio_read = generic_file_aio_read, 707 .aio_read = generic_file_aio_read,
512 .write = v9fs_file_write, 708 .aio_write = generic_file_aio_write,
513 .open = v9fs_file_open, 709 .open = v9fs_file_open,
514 .release = v9fs_dir_release, 710 .release = v9fs_dir_release,
515 .lock = v9fs_file_lock, 711 .lock = v9fs_file_lock,
516 .mmap = generic_file_readonly_mmap, 712 .mmap = v9fs_file_mmap,
517 .fsync = v9fs_file_fsync, 713 .fsync = v9fs_file_fsync,
518}; 714};
519 715
520static const struct file_operations v9fs_cached_file_operations_dotl = { 716const struct file_operations v9fs_cached_file_operations_dotl = {
521 .llseek = generic_file_llseek, 717 .llseek = generic_file_llseek,
522 .read = do_sync_read, 718 .read = v9fs_cached_file_read,
719 .write = v9fs_cached_file_write,
523 .aio_read = generic_file_aio_read, 720 .aio_read = generic_file_aio_read,
524 .write = v9fs_file_write, 721 .aio_write = generic_file_aio_write,
525 .open = v9fs_file_open, 722 .open = v9fs_file_open,
526 .release = v9fs_dir_release, 723 .release = v9fs_dir_release,
527 .lock = v9fs_file_lock_dotl, 724 .lock = v9fs_file_lock_dotl,
528 .flock = v9fs_file_flock_dotl, 725 .flock = v9fs_file_flock_dotl,
529 .mmap = generic_file_readonly_mmap, 726 .mmap = v9fs_file_mmap,
530 .fsync = v9fs_file_fsync_dotl, 727 .fsync = v9fs_file_fsync_dotl,
531}; 728};
532 729
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b76a40bdf4c2..7f6c67703195 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -203,26 +203,26 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
203 wstat->extension = NULL; 203 wstat->extension = NULL;
204} 204}
205 205
206#ifdef CONFIG_9P_FSCACHE
207/** 206/**
208 * v9fs_alloc_inode - helper function to allocate an inode 207 * v9fs_alloc_inode - helper function to allocate an inode
209 * This callback is executed before setting up the inode so that we
210 * can associate a vcookie with each inode.
211 * 208 *
212 */ 209 */
213
214struct inode *v9fs_alloc_inode(struct super_block *sb) 210struct inode *v9fs_alloc_inode(struct super_block *sb)
215{ 211{
216 struct v9fs_cookie *vcookie; 212 struct v9fs_inode *v9inode;
217 vcookie = (struct v9fs_cookie *)kmem_cache_alloc(vcookie_cache, 213 v9inode = (struct v9fs_inode *)kmem_cache_alloc(v9fs_inode_cache,
218 GFP_KERNEL); 214 GFP_KERNEL);
219 if (!vcookie) 215 if (!v9inode)
220 return NULL; 216 return NULL;
221 217#ifdef CONFIG_9P_FSCACHE
222 vcookie->fscache = NULL; 218 v9inode->fscache = NULL;
223 vcookie->qid = NULL; 219 v9inode->fscache_key = NULL;
224 spin_lock_init(&vcookie->lock); 220 spin_lock_init(&v9inode->fscache_lock);
225 return &vcookie->inode; 221#endif
222 v9inode->writeback_fid = NULL;
223 v9inode->cache_validity = 0;
224 mutex_init(&v9inode->v_mutex);
225 return &v9inode->vfs_inode;
226} 226}
227 227
228/** 228/**
@@ -234,35 +234,18 @@ static void v9fs_i_callback(struct rcu_head *head)
234{ 234{
235 struct inode *inode = container_of(head, struct inode, i_rcu); 235 struct inode *inode = container_of(head, struct inode, i_rcu);
236 INIT_LIST_HEAD(&inode->i_dentry); 236 INIT_LIST_HEAD(&inode->i_dentry);
237 kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode)); 237 kmem_cache_free(v9fs_inode_cache, V9FS_I(inode));
238} 238}
239 239
240void v9fs_destroy_inode(struct inode *inode) 240void v9fs_destroy_inode(struct inode *inode)
241{ 241{
242 call_rcu(&inode->i_rcu, v9fs_i_callback); 242 call_rcu(&inode->i_rcu, v9fs_i_callback);
243} 243}
244#endif
245
246/**
247 * v9fs_get_inode - helper function to setup an inode
248 * @sb: superblock
249 * @mode: mode to setup inode with
250 *
251 */
252 244
253struct inode *v9fs_get_inode(struct super_block *sb, int mode) 245int v9fs_init_inode(struct v9fs_session_info *v9ses,
246 struct inode *inode, int mode)
254{ 247{
255 int err; 248 int err = 0;
256 struct inode *inode;
257 struct v9fs_session_info *v9ses = sb->s_fs_info;
258
259 P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
260
261 inode = new_inode(sb);
262 if (!inode) {
263 P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
264 return ERR_PTR(-ENOMEM);
265 }
266 249
267 inode_init_owner(inode, NULL, mode); 250 inode_init_owner(inode, NULL, mode);
268 inode->i_blocks = 0; 251 inode->i_blocks = 0;
@@ -292,14 +275,20 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
292 case S_IFREG: 275 case S_IFREG:
293 if (v9fs_proto_dotl(v9ses)) { 276 if (v9fs_proto_dotl(v9ses)) {
294 inode->i_op = &v9fs_file_inode_operations_dotl; 277 inode->i_op = &v9fs_file_inode_operations_dotl;
295 inode->i_fop = &v9fs_file_operations_dotl; 278 if (v9ses->cache)
279 inode->i_fop =
280 &v9fs_cached_file_operations_dotl;
281 else
282 inode->i_fop = &v9fs_file_operations_dotl;
296 } else { 283 } else {
297 inode->i_op = &v9fs_file_inode_operations; 284 inode->i_op = &v9fs_file_inode_operations;
298 inode->i_fop = &v9fs_file_operations; 285 if (v9ses->cache)
286 inode->i_fop = &v9fs_cached_file_operations;
287 else
288 inode->i_fop = &v9fs_file_operations;
299 } 289 }
300 290
301 break; 291 break;
302
303 case S_IFLNK: 292 case S_IFLNK:
304 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) { 293 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
305 P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with " 294 P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
@@ -335,12 +324,37 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
335 err = -EINVAL; 324 err = -EINVAL;
336 goto error; 325 goto error;
337 } 326 }
327error:
328 return err;
338 329
339 return inode; 330}
340 331
341error: 332/**
342 iput(inode); 333 * v9fs_get_inode - helper function to setup an inode
343 return ERR_PTR(err); 334 * @sb: superblock
335 * @mode: mode to setup inode with
336 *
337 */
338
339struct inode *v9fs_get_inode(struct super_block *sb, int mode)
340{
341 int err;
342 struct inode *inode;
343 struct v9fs_session_info *v9ses = sb->s_fs_info;
344
345 P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
346
347 inode = new_inode(sb);
348 if (!inode) {
349 P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
350 return ERR_PTR(-ENOMEM);
351 }
352 err = v9fs_init_inode(v9ses, inode, mode);
353 if (err) {
354 iput(inode);
355 return ERR_PTR(err);
356 }
357 return inode;
344} 358}
345 359
346/* 360/*
@@ -403,6 +417,8 @@ error:
403 */ 417 */
404void v9fs_evict_inode(struct inode *inode) 418void v9fs_evict_inode(struct inode *inode)
405{ 419{
420 struct v9fs_inode *v9inode = V9FS_I(inode);
421
406 truncate_inode_pages(inode->i_mapping, 0); 422 truncate_inode_pages(inode->i_mapping, 0);
407 end_writeback(inode); 423 end_writeback(inode);
408 filemap_fdatawrite(inode->i_mapping); 424 filemap_fdatawrite(inode->i_mapping);
@@ -410,41 +426,67 @@ void v9fs_evict_inode(struct inode *inode)
410#ifdef CONFIG_9P_FSCACHE 426#ifdef CONFIG_9P_FSCACHE
411 v9fs_cache_inode_put_cookie(inode); 427 v9fs_cache_inode_put_cookie(inode);
412#endif 428#endif
429 /* clunk the fid stashed in writeback_fid */
430 if (v9inode->writeback_fid) {
431 p9_client_clunk(v9inode->writeback_fid);
432 v9inode->writeback_fid = NULL;
433 }
413} 434}
414 435
415struct inode * 436static struct inode *v9fs_qid_iget(struct super_block *sb,
416v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid, 437 struct p9_qid *qid,
417 struct super_block *sb) 438 struct p9_wstat *st)
418{ 439{
419 int err, umode; 440 int retval, umode;
420 struct inode *ret = NULL; 441 unsigned long i_ino;
421 struct p9_wstat *st; 442 struct inode *inode;
422 443 struct v9fs_session_info *v9ses = sb->s_fs_info;
423 st = p9_client_stat(fid);
424 if (IS_ERR(st))
425 return ERR_CAST(st);
426 444
445 i_ino = v9fs_qid2ino(qid);
446 inode = iget_locked(sb, i_ino);
447 if (!inode)
448 return ERR_PTR(-ENOMEM);
449 if (!(inode->i_state & I_NEW))
450 return inode;
451 /*
452 * initialize the inode with the stat info
453 * FIXME!! we may need support for stale inodes
454 * later.
455 */
427 umode = p9mode2unixmode(v9ses, st->mode); 456 umode = p9mode2unixmode(v9ses, st->mode);
428 ret = v9fs_get_inode(sb, umode); 457 retval = v9fs_init_inode(v9ses, inode, umode);
429 if (IS_ERR(ret)) { 458 if (retval)
430 err = PTR_ERR(ret);
431 goto error; 459 goto error;
432 }
433
434 v9fs_stat2inode(st, ret, sb);
435 ret->i_ino = v9fs_qid2ino(&st->qid);
436 460
461 v9fs_stat2inode(st, inode, sb);
437#ifdef CONFIG_9P_FSCACHE 462#ifdef CONFIG_9P_FSCACHE
438 v9fs_vcookie_set_qid(ret, &st->qid); 463 v9fs_fscache_set_key(inode, &st->qid);
439 v9fs_cache_inode_get_cookie(ret); 464 v9fs_cache_inode_get_cookie(inode);
440#endif 465#endif
441 p9stat_free(st); 466 unlock_new_inode(inode);
442 kfree(st); 467 return inode;
443 return ret;
444error: 468error:
469 unlock_new_inode(inode);
470 iput(inode);
471 return ERR_PTR(retval);
472
473}
474
475struct inode *
476v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
477 struct super_block *sb)
478{
479 struct p9_wstat *st;
480 struct inode *inode = NULL;
481
482 st = p9_client_stat(fid);
483 if (IS_ERR(st))
484 return ERR_CAST(st);
485
486 inode = v9fs_qid_iget(sb, &st->qid, st);
445 p9stat_free(st); 487 p9stat_free(st);
446 kfree(st); 488 kfree(st);
447 return ERR_PTR(err); 489 return inode;
448} 490}
449 491
450/** 492/**
@@ -458,8 +500,8 @@ error:
458static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) 500static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
459{ 501{
460 int retval; 502 int retval;
461 struct inode *file_inode;
462 struct p9_fid *v9fid; 503 struct p9_fid *v9fid;
504 struct inode *file_inode;
463 505
464 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, 506 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
465 rmdir); 507 rmdir);
@@ -470,8 +512,20 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
470 return PTR_ERR(v9fid); 512 return PTR_ERR(v9fid);
471 513
472 retval = p9_client_remove(v9fid); 514 retval = p9_client_remove(v9fid);
473 if (!retval) 515 if (!retval) {
474 drop_nlink(file_inode); 516 /*
517 * directories on unlink should have zero
518 * link count
519 */
520 if (rmdir) {
521 clear_nlink(file_inode);
522 drop_nlink(dir);
523 } else
524 drop_nlink(file_inode);
525
526 v9fs_invalidate_inode_attr(file_inode);
527 v9fs_invalidate_inode_attr(dir);
528 }
475 return retval; 529 return retval;
476} 530}
477 531
@@ -531,7 +585,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
531 } 585 }
532 586
533 /* instantiate inode and assign the unopened fid to the dentry */ 587 /* instantiate inode and assign the unopened fid to the dentry */
534 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 588 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
535 if (IS_ERR(inode)) { 589 if (IS_ERR(inode)) {
536 err = PTR_ERR(inode); 590 err = PTR_ERR(inode);
537 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); 591 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -570,9 +624,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
570 int err; 624 int err;
571 u32 perm; 625 u32 perm;
572 int flags; 626 int flags;
573 struct v9fs_session_info *v9ses;
574 struct p9_fid *fid;
575 struct file *filp; 627 struct file *filp;
628 struct v9fs_inode *v9inode;
629 struct v9fs_session_info *v9ses;
630 struct p9_fid *fid, *inode_fid;
576 631
577 err = 0; 632 err = 0;
578 fid = NULL; 633 fid = NULL;
@@ -592,8 +647,29 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
592 goto error; 647 goto error;
593 } 648 }
594 649
650 v9fs_invalidate_inode_attr(dir);
595 /* if we are opening a file, assign the open fid to the file */ 651 /* if we are opening a file, assign the open fid to the file */
596 if (nd && nd->flags & LOOKUP_OPEN) { 652 if (nd && nd->flags & LOOKUP_OPEN) {
653 v9inode = V9FS_I(dentry->d_inode);
654 mutex_lock(&v9inode->v_mutex);
655 if (v9ses->cache && !v9inode->writeback_fid &&
656 ((flags & O_ACCMODE) != O_RDONLY)) {
657 /*
658 * clone a fid and add it to writeback_fid
659 * we do it during open time instead of
660 * page dirty time via write_begin/page_mkwrite
661 * because we want write after unlink usecase
662 * to work.
663 */
664 inode_fid = v9fs_writeback_fid(dentry);
665 if (IS_ERR(inode_fid)) {
666 err = PTR_ERR(inode_fid);
667 mutex_unlock(&v9inode->v_mutex);
668 goto error;
669 }
670 v9inode->writeback_fid = (void *) inode_fid;
671 }
672 mutex_unlock(&v9inode->v_mutex);
597 filp = lookup_instantiate_filp(nd, dentry, generic_file_open); 673 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
598 if (IS_ERR(filp)) { 674 if (IS_ERR(filp)) {
599 err = PTR_ERR(filp); 675 err = PTR_ERR(filp);
@@ -601,6 +677,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
601 } 677 }
602 678
603 filp->private_data = fid; 679 filp->private_data = fid;
680#ifdef CONFIG_9P_FSCACHE
681 if (v9ses->cache)
682 v9fs_cache_inode_set_cookie(dentry->d_inode, filp);
683#endif
604 } else 684 } else
605 p9_client_clunk(fid); 685 p9_client_clunk(fid);
606 686
@@ -625,8 +705,8 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
625{ 705{
626 int err; 706 int err;
627 u32 perm; 707 u32 perm;
628 struct v9fs_session_info *v9ses;
629 struct p9_fid *fid; 708 struct p9_fid *fid;
709 struct v9fs_session_info *v9ses;
630 710
631 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); 711 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
632 err = 0; 712 err = 0;
@@ -636,6 +716,9 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
636 if (IS_ERR(fid)) { 716 if (IS_ERR(fid)) {
637 err = PTR_ERR(fid); 717 err = PTR_ERR(fid);
638 fid = NULL; 718 fid = NULL;
719 } else {
720 inc_nlink(dir);
721 v9fs_invalidate_inode_attr(dir);
639 } 722 }
640 723
641 if (fid) 724 if (fid)
@@ -687,7 +770,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
687 return ERR_PTR(result); 770 return ERR_PTR(result);
688 } 771 }
689 772
690 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 773 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
691 if (IS_ERR(inode)) { 774 if (IS_ERR(inode)) {
692 result = PTR_ERR(inode); 775 result = PTR_ERR(inode);
693 inode = NULL; 776 inode = NULL;
@@ -747,17 +830,19 @@ int
747v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, 830v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
748 struct inode *new_dir, struct dentry *new_dentry) 831 struct inode *new_dir, struct dentry *new_dentry)
749{ 832{
833 int retval;
750 struct inode *old_inode; 834 struct inode *old_inode;
835 struct inode *new_inode;
751 struct v9fs_session_info *v9ses; 836 struct v9fs_session_info *v9ses;
752 struct p9_fid *oldfid; 837 struct p9_fid *oldfid;
753 struct p9_fid *olddirfid; 838 struct p9_fid *olddirfid;
754 struct p9_fid *newdirfid; 839 struct p9_fid *newdirfid;
755 struct p9_wstat wstat; 840 struct p9_wstat wstat;
756 int retval;
757 841
758 P9_DPRINTK(P9_DEBUG_VFS, "\n"); 842 P9_DPRINTK(P9_DEBUG_VFS, "\n");
759 retval = 0; 843 retval = 0;
760 old_inode = old_dentry->d_inode; 844 old_inode = old_dentry->d_inode;
845 new_inode = new_dentry->d_inode;
761 v9ses = v9fs_inode2v9ses(old_inode); 846 v9ses = v9fs_inode2v9ses(old_inode);
762 oldfid = v9fs_fid_lookup(old_dentry); 847 oldfid = v9fs_fid_lookup(old_dentry);
763 if (IS_ERR(oldfid)) 848 if (IS_ERR(oldfid))
@@ -798,9 +883,30 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
798 retval = p9_client_wstat(oldfid, &wstat); 883 retval = p9_client_wstat(oldfid, &wstat);
799 884
800clunk_newdir: 885clunk_newdir:
801 if (!retval) 886 if (!retval) {
887 if (new_inode) {
888 if (S_ISDIR(new_inode->i_mode))
889 clear_nlink(new_inode);
890 else
891 drop_nlink(new_inode);
892 /*
893 * Work around vfs rename rehash bug with
894 * FS_RENAME_DOES_D_MOVE
895 */
896 v9fs_invalidate_inode_attr(new_inode);
897 }
898 if (S_ISDIR(old_inode->i_mode)) {
899 if (!new_inode)
900 inc_nlink(new_dir);
901 drop_nlink(old_dir);
902 }
903 v9fs_invalidate_inode_attr(old_inode);
904 v9fs_invalidate_inode_attr(old_dir);
905 v9fs_invalidate_inode_attr(new_dir);
906
802 /* successful rename */ 907 /* successful rename */
803 d_move(old_dentry, new_dentry); 908 d_move(old_dentry, new_dentry);
909 }
804 up_write(&v9ses->rename_sem); 910 up_write(&v9ses->rename_sem);
805 p9_client_clunk(newdirfid); 911 p9_client_clunk(newdirfid);
806 912
@@ -830,10 +936,11 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
830 936
831 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); 937 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
832 err = -EPERM; 938 err = -EPERM;
833 v9ses = v9fs_inode2v9ses(dentry->d_inode); 939 v9ses = v9fs_dentry2v9ses(dentry);
834 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) 940 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
835 return simple_getattr(mnt, dentry, stat); 941 generic_fillattr(dentry->d_inode, stat);
836 942 return 0;
943 }
837 fid = v9fs_fid_lookup(dentry); 944 fid = v9fs_fid_lookup(dentry);
838 if (IS_ERR(fid)) 945 if (IS_ERR(fid))
839 return PTR_ERR(fid); 946 return PTR_ERR(fid);
@@ -865,8 +972,12 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
865 struct p9_wstat wstat; 972 struct p9_wstat wstat;
866 973
867 P9_DPRINTK(P9_DEBUG_VFS, "\n"); 974 P9_DPRINTK(P9_DEBUG_VFS, "\n");
975 retval = inode_change_ok(dentry->d_inode, iattr);
976 if (retval)
977 return retval;
978
868 retval = -EPERM; 979 retval = -EPERM;
869 v9ses = v9fs_inode2v9ses(dentry->d_inode); 980 v9ses = v9fs_dentry2v9ses(dentry);
870 fid = v9fs_fid_lookup(dentry); 981 fid = v9fs_fid_lookup(dentry);
871 if(IS_ERR(fid)) 982 if(IS_ERR(fid))
872 return PTR_ERR(fid); 983 return PTR_ERR(fid);
@@ -892,16 +1003,19 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
892 wstat.n_gid = iattr->ia_gid; 1003 wstat.n_gid = iattr->ia_gid;
893 } 1004 }
894 1005
1006 /* Write all dirty data */
1007 if (S_ISREG(dentry->d_inode->i_mode))
1008 filemap_write_and_wait(dentry->d_inode->i_mapping);
1009
895 retval = p9_client_wstat(fid, &wstat); 1010 retval = p9_client_wstat(fid, &wstat);
896 if (retval < 0) 1011 if (retval < 0)
897 return retval; 1012 return retval;
898 1013
899 if ((iattr->ia_valid & ATTR_SIZE) && 1014 if ((iattr->ia_valid & ATTR_SIZE) &&
900 iattr->ia_size != i_size_read(dentry->d_inode)) { 1015 iattr->ia_size != i_size_read(dentry->d_inode))
901 retval = vmtruncate(dentry->d_inode, iattr->ia_size); 1016 truncate_setsize(dentry->d_inode, iattr->ia_size);
902 if (retval) 1017
903 return retval; 1018 v9fs_invalidate_inode_attr(dentry->d_inode);
904 }
905 1019
906 setattr_copy(dentry->d_inode, iattr); 1020 setattr_copy(dentry->d_inode, iattr);
907 mark_inode_dirty(dentry->d_inode); 1021 mark_inode_dirty(dentry->d_inode);
@@ -924,6 +1038,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
924 char tag_name[14]; 1038 char tag_name[14];
925 unsigned int i_nlink; 1039 unsigned int i_nlink;
926 struct v9fs_session_info *v9ses = sb->s_fs_info; 1040 struct v9fs_session_info *v9ses = sb->s_fs_info;
1041 struct v9fs_inode *v9inode = V9FS_I(inode);
927 1042
928 inode->i_nlink = 1; 1043 inode->i_nlink = 1;
929 1044
@@ -983,6 +1098,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
983 1098
984 /* not real number of blocks, but 512 byte ones ... */ 1099 /* not real number of blocks, but 512 byte ones ... */
985 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; 1100 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
1101 v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
986} 1102}
987 1103
988/** 1104/**
@@ -1023,7 +1139,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
1023 1139
1024 P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); 1140 P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
1025 retval = -EPERM; 1141 retval = -EPERM;
1026 v9ses = v9fs_inode2v9ses(dentry->d_inode); 1142 v9ses = v9fs_dentry2v9ses(dentry);
1027 fid = v9fs_fid_lookup(dentry); 1143 fid = v9fs_fid_lookup(dentry);
1028 if (IS_ERR(fid)) 1144 if (IS_ERR(fid))
1029 return PTR_ERR(fid); 1145 return PTR_ERR(fid);
@@ -1115,8 +1231,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1115 int mode, const char *extension) 1231 int mode, const char *extension)
1116{ 1232{
1117 u32 perm; 1233 u32 perm;
1118 struct v9fs_session_info *v9ses;
1119 struct p9_fid *fid; 1234 struct p9_fid *fid;
1235 struct v9fs_session_info *v9ses;
1120 1236
1121 v9ses = v9fs_inode2v9ses(dir); 1237 v9ses = v9fs_inode2v9ses(dir);
1122 if (!v9fs_proto_dotu(v9ses)) { 1238 if (!v9fs_proto_dotu(v9ses)) {
@@ -1130,6 +1246,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1130 if (IS_ERR(fid)) 1246 if (IS_ERR(fid))
1131 return PTR_ERR(fid); 1247 return PTR_ERR(fid);
1132 1248
1249 v9fs_invalidate_inode_attr(dir);
1133 p9_client_clunk(fid); 1250 p9_client_clunk(fid);
1134 return 0; 1251 return 0;
1135} 1252}
@@ -1166,8 +1283,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
1166 struct dentry *dentry) 1283 struct dentry *dentry)
1167{ 1284{
1168 int retval; 1285 int retval;
1169 struct p9_fid *oldfid;
1170 char *name; 1286 char *name;
1287 struct p9_fid *oldfid;
1171 1288
1172 P9_DPRINTK(P9_DEBUG_VFS, 1289 P9_DPRINTK(P9_DEBUG_VFS,
1173 " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, 1290 " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
@@ -1186,7 +1303,10 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
1186 sprintf(name, "%d\n", oldfid->fid); 1303 sprintf(name, "%d\n", oldfid->fid);
1187 retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name); 1304 retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name);
1188 __putname(name); 1305 __putname(name);
1189 1306 if (!retval) {
1307 v9fs_refresh_inode(oldfid, old_dentry->d_inode);
1308 v9fs_invalidate_inode_attr(dir);
1309 }
1190clunk_fid: 1310clunk_fid:
1191 p9_client_clunk(oldfid); 1311 p9_client_clunk(oldfid);
1192 return retval; 1312 return retval;
@@ -1237,6 +1357,32 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1237 return retval; 1357 return retval;
1238} 1358}
1239 1359
1360int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
1361{
1362 loff_t i_size;
1363 struct p9_wstat *st;
1364 struct v9fs_session_info *v9ses;
1365
1366 v9ses = v9fs_inode2v9ses(inode);
1367 st = p9_client_stat(fid);
1368 if (IS_ERR(st))
1369 return PTR_ERR(st);
1370
1371 spin_lock(&inode->i_lock);
1372 /*
1373 * We don't want to refresh inode->i_size,
1374 * because we may have cached data
1375 */
1376 i_size = inode->i_size;
1377 v9fs_stat2inode(st, inode, inode->i_sb);
1378 if (v9ses->cache)
1379 inode->i_size = i_size;
1380 spin_unlock(&inode->i_lock);
1381 p9stat_free(st);
1382 kfree(st);
1383 return 0;
1384}
1385
1240static const struct inode_operations v9fs_dir_inode_operations_dotu = { 1386static const struct inode_operations v9fs_dir_inode_operations_dotu = {
1241 .create = v9fs_vfs_create, 1387 .create = v9fs_vfs_create,
1242 .lookup = v9fs_vfs_lookup, 1388 .lookup = v9fs_vfs_lookup,
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index fe3ffa9aace4..82a7c38ddad0 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -86,40 +86,63 @@ static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
86 return dentry; 86 return dentry;
87} 87}
88 88
89static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
90 struct p9_qid *qid,
91 struct p9_fid *fid,
92 struct p9_stat_dotl *st)
93{
94 int retval;
95 unsigned long i_ino;
96 struct inode *inode;
97 struct v9fs_session_info *v9ses = sb->s_fs_info;
98
99 i_ino = v9fs_qid2ino(qid);
100 inode = iget_locked(sb, i_ino);
101 if (!inode)
102 return ERR_PTR(-ENOMEM);
103 if (!(inode->i_state & I_NEW))
104 return inode;
105 /*
106 * initialize the inode with the stat info
107 * FIXME!! we may need support for stale inodes
108 * later.
109 */
110 retval = v9fs_init_inode(v9ses, inode, st->st_mode);
111 if (retval)
112 goto error;
113
114 v9fs_stat2inode_dotl(st, inode);
115#ifdef CONFIG_9P_FSCACHE
116 v9fs_fscache_set_key(inode, &st->qid);
117 v9fs_cache_inode_get_cookie(inode);
118#endif
119 retval = v9fs_get_acl(inode, fid);
120 if (retval)
121 goto error;
122
123 unlock_new_inode(inode);
124 return inode;
125error:
126 unlock_new_inode(inode);
127 iput(inode);
128 return ERR_PTR(retval);
129
130}
131
89struct inode * 132struct inode *
90v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, 133v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
91 struct super_block *sb) 134 struct super_block *sb)
92{ 135{
93 struct inode *ret = NULL;
94 int err;
95 struct p9_stat_dotl *st; 136 struct p9_stat_dotl *st;
137 struct inode *inode = NULL;
96 138
97 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); 139 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
98 if (IS_ERR(st)) 140 if (IS_ERR(st))
99 return ERR_CAST(st); 141 return ERR_CAST(st);
100 142
101 ret = v9fs_get_inode(sb, st->st_mode); 143 inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st);
102 if (IS_ERR(ret)) {
103 err = PTR_ERR(ret);
104 goto error;
105 }
106
107 v9fs_stat2inode_dotl(st, ret);
108 ret->i_ino = v9fs_qid2ino(&st->qid);
109#ifdef CONFIG_9P_FSCACHE
110 v9fs_vcookie_set_qid(ret, &st->qid);
111 v9fs_cache_inode_get_cookie(ret);
112#endif
113 err = v9fs_get_acl(ret, fid);
114 if (err) {
115 iput(ret);
116 goto error;
117 }
118 kfree(st); 144 kfree(st);
119 return ret; 145 return inode;
120error:
121 kfree(st);
122 return ERR_PTR(err);
123} 146}
124 147
125/** 148/**
@@ -136,16 +159,17 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
136 struct nameidata *nd) 159 struct nameidata *nd)
137{ 160{
138 int err = 0; 161 int err = 0;
139 char *name = NULL;
140 gid_t gid; 162 gid_t gid;
141 int flags; 163 int flags;
142 mode_t mode; 164 mode_t mode;
143 struct v9fs_session_info *v9ses; 165 char *name = NULL;
144 struct p9_fid *fid = NULL;
145 struct p9_fid *dfid, *ofid;
146 struct file *filp; 166 struct file *filp;
147 struct p9_qid qid; 167 struct p9_qid qid;
148 struct inode *inode; 168 struct inode *inode;
169 struct p9_fid *fid = NULL;
170 struct v9fs_inode *v9inode;
171 struct p9_fid *dfid, *ofid, *inode_fid;
172 struct v9fs_session_info *v9ses;
149 struct posix_acl *pacl = NULL, *dacl = NULL; 173 struct posix_acl *pacl = NULL, *dacl = NULL;
150 174
151 v9ses = v9fs_inode2v9ses(dir); 175 v9ses = v9fs_inode2v9ses(dir);
@@ -196,6 +220,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
196 err); 220 err);
197 goto error; 221 goto error;
198 } 222 }
223 v9fs_invalidate_inode_attr(dir);
199 224
200 /* instantiate inode and assign the unopened fid to the dentry */ 225 /* instantiate inode and assign the unopened fid to the dentry */
201 fid = p9_client_walk(dfid, 1, &name, 1); 226 fid = p9_client_walk(dfid, 1, &name, 1);
@@ -205,7 +230,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
205 fid = NULL; 230 fid = NULL;
206 goto error; 231 goto error;
207 } 232 }
208 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 233 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
209 if (IS_ERR(inode)) { 234 if (IS_ERR(inode)) {
210 err = PTR_ERR(inode); 235 err = PTR_ERR(inode);
211 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); 236 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -219,6 +244,26 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
219 /* Now set the ACL based on the default value */ 244 /* Now set the ACL based on the default value */
220 v9fs_set_create_acl(dentry, dacl, pacl); 245 v9fs_set_create_acl(dentry, dacl, pacl);
221 246
247 v9inode = V9FS_I(inode);
248 mutex_lock(&v9inode->v_mutex);
249 if (v9ses->cache && !v9inode->writeback_fid &&
250 ((flags & O_ACCMODE) != O_RDONLY)) {
251 /*
252 * clone a fid and add it to writeback_fid
253 * we do it during open time instead of
254 * page dirty time via write_begin/page_mkwrite
255 * because we want write after unlink usecase
256 * to work.
257 */
258 inode_fid = v9fs_writeback_fid(dentry);
259 if (IS_ERR(inode_fid)) {
260 err = PTR_ERR(inode_fid);
261 mutex_unlock(&v9inode->v_mutex);
262 goto error;
263 }
264 v9inode->writeback_fid = (void *) inode_fid;
265 }
266 mutex_unlock(&v9inode->v_mutex);
222 /* Since we are opening a file, assign the open fid to the file */ 267 /* Since we are opening a file, assign the open fid to the file */
223 filp = lookup_instantiate_filp(nd, dentry, generic_file_open); 268 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
224 if (IS_ERR(filp)) { 269 if (IS_ERR(filp)) {
@@ -226,6 +271,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
226 return PTR_ERR(filp); 271 return PTR_ERR(filp);
227 } 272 }
228 filp->private_data = ofid; 273 filp->private_data = ofid;
274#ifdef CONFIG_9P_FSCACHE
275 if (v9ses->cache)
276 v9fs_cache_inode_set_cookie(inode, filp);
277#endif
229 return 0; 278 return 0;
230 279
231error: 280error:
@@ -300,7 +349,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
300 goto error; 349 goto error;
301 } 350 }
302 351
303 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 352 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
304 if (IS_ERR(inode)) { 353 if (IS_ERR(inode)) {
305 err = PTR_ERR(inode); 354 err = PTR_ERR(inode);
306 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 355 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -327,7 +376,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
327 } 376 }
328 /* Now set the ACL based on the default value */ 377 /* Now set the ACL based on the default value */
329 v9fs_set_create_acl(dentry, dacl, pacl); 378 v9fs_set_create_acl(dentry, dacl, pacl);
330 379 inc_nlink(dir);
380 v9fs_invalidate_inode_attr(dir);
331error: 381error:
332 if (fid) 382 if (fid)
333 p9_client_clunk(fid); 383 p9_client_clunk(fid);
@@ -345,10 +395,11 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
345 395
346 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); 396 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
347 err = -EPERM; 397 err = -EPERM;
348 v9ses = v9fs_inode2v9ses(dentry->d_inode); 398 v9ses = v9fs_dentry2v9ses(dentry);
349 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) 399 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
350 return simple_getattr(mnt, dentry, stat); 400 generic_fillattr(dentry->d_inode, stat);
351 401 return 0;
402 }
352 fid = v9fs_fid_lookup(dentry); 403 fid = v9fs_fid_lookup(dentry);
353 if (IS_ERR(fid)) 404 if (IS_ERR(fid))
354 return PTR_ERR(fid); 405 return PTR_ERR(fid);
@@ -401,22 +452,24 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
401 p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; 452 p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
402 453
403 retval = -EPERM; 454 retval = -EPERM;
404 v9ses = v9fs_inode2v9ses(dentry->d_inode); 455 v9ses = v9fs_dentry2v9ses(dentry);
405 fid = v9fs_fid_lookup(dentry); 456 fid = v9fs_fid_lookup(dentry);
406 if (IS_ERR(fid)) 457 if (IS_ERR(fid))
407 return PTR_ERR(fid); 458 return PTR_ERR(fid);
408 459
460 /* Write all dirty data */
461 if (S_ISREG(dentry->d_inode->i_mode))
462 filemap_write_and_wait(dentry->d_inode->i_mapping);
463
409 retval = p9_client_setattr(fid, &p9attr); 464 retval = p9_client_setattr(fid, &p9attr);
410 if (retval < 0) 465 if (retval < 0)
411 return retval; 466 return retval;
412 467
413 if ((iattr->ia_valid & ATTR_SIZE) && 468 if ((iattr->ia_valid & ATTR_SIZE) &&
414 iattr->ia_size != i_size_read(dentry->d_inode)) { 469 iattr->ia_size != i_size_read(dentry->d_inode))
415 retval = vmtruncate(dentry->d_inode, iattr->ia_size); 470 truncate_setsize(dentry->d_inode, iattr->ia_size);
416 if (retval)
417 return retval;
418 }
419 471
472 v9fs_invalidate_inode_attr(dentry->d_inode);
420 setattr_copy(dentry->d_inode, iattr); 473 setattr_copy(dentry->d_inode, iattr);
421 mark_inode_dirty(dentry->d_inode); 474 mark_inode_dirty(dentry->d_inode);
422 if (iattr->ia_valid & ATTR_MODE) { 475 if (iattr->ia_valid & ATTR_MODE) {
@@ -439,6 +492,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
439void 492void
440v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) 493v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
441{ 494{
495 struct v9fs_inode *v9inode = V9FS_I(inode);
442 496
443 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { 497 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
444 inode->i_atime.tv_sec = stat->st_atime_sec; 498 inode->i_atime.tv_sec = stat->st_atime_sec;
@@ -497,20 +551,21 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
497 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION 551 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
498 * because the inode structure does not have fields for them. 552 * because the inode structure does not have fields for them.
499 */ 553 */
554 v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
500} 555}
501 556
502static int 557static int
503v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, 558v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
504 const char *symname) 559 const char *symname)
505{ 560{
506 struct v9fs_session_info *v9ses;
507 struct p9_fid *dfid;
508 struct p9_fid *fid = NULL;
509 struct inode *inode;
510 struct p9_qid qid;
511 char *name;
512 int err; 561 int err;
513 gid_t gid; 562 gid_t gid;
563 char *name;
564 struct p9_qid qid;
565 struct inode *inode;
566 struct p9_fid *dfid;
567 struct p9_fid *fid = NULL;
568 struct v9fs_session_info *v9ses;
514 569
515 name = (char *) dentry->d_name.name; 570 name = (char *) dentry->d_name.name;
516 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", 571 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
@@ -534,6 +589,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
534 goto error; 589 goto error;
535 } 590 }
536 591
592 v9fs_invalidate_inode_attr(dir);
537 if (v9ses->cache) { 593 if (v9ses->cache) {
538 /* Now walk from the parent so we can get an unopened fid. */ 594 /* Now walk from the parent so we can get an unopened fid. */
539 fid = p9_client_walk(dfid, 1, &name, 1); 595 fid = p9_client_walk(dfid, 1, &name, 1);
@@ -546,7 +602,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
546 } 602 }
547 603
548 /* instantiate inode and assign the unopened fid to dentry */ 604 /* instantiate inode and assign the unopened fid to dentry */
549 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 605 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
550 if (IS_ERR(inode)) { 606 if (IS_ERR(inode)) {
551 err = PTR_ERR(inode); 607 err = PTR_ERR(inode);
552 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 608 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -588,10 +644,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
588 struct dentry *dentry) 644 struct dentry *dentry)
589{ 645{
590 int err; 646 int err;
591 struct p9_fid *dfid, *oldfid;
592 char *name; 647 char *name;
593 struct v9fs_session_info *v9ses;
594 struct dentry *dir_dentry; 648 struct dentry *dir_dentry;
649 struct p9_fid *dfid, *oldfid;
650 struct v9fs_session_info *v9ses;
595 651
596 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", 652 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
597 dir->i_ino, old_dentry->d_name.name, 653 dir->i_ino, old_dentry->d_name.name,
@@ -616,29 +672,17 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
616 return err; 672 return err;
617 } 673 }
618 674
675 v9fs_invalidate_inode_attr(dir);
619 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { 676 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
620 /* Get the latest stat info from server. */ 677 /* Get the latest stat info from server. */
621 struct p9_fid *fid; 678 struct p9_fid *fid;
622 struct p9_stat_dotl *st;
623
624 fid = v9fs_fid_lookup(old_dentry); 679 fid = v9fs_fid_lookup(old_dentry);
625 if (IS_ERR(fid)) 680 if (IS_ERR(fid))
626 return PTR_ERR(fid); 681 return PTR_ERR(fid);
627 682
628 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); 683 v9fs_refresh_inode_dotl(fid, old_dentry->d_inode);
629 if (IS_ERR(st))
630 return PTR_ERR(st);
631
632 v9fs_stat2inode_dotl(st, old_dentry->d_inode);
633
634 kfree(st);
635 } else {
636 /* Caching disabled. No need to get upto date stat info.
637 * This dentry will be released immediately. So, just hold the
638 * inode
639 */
640 ihold(old_dentry->d_inode);
641 } 684 }
685 ihold(old_dentry->d_inode);
642 d_instantiate(dentry, old_dentry->d_inode); 686 d_instantiate(dentry, old_dentry->d_inode);
643 687
644 return err; 688 return err;
@@ -657,12 +701,12 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
657 dev_t rdev) 701 dev_t rdev)
658{ 702{
659 int err; 703 int err;
704 gid_t gid;
660 char *name; 705 char *name;
661 mode_t mode; 706 mode_t mode;
662 struct v9fs_session_info *v9ses; 707 struct v9fs_session_info *v9ses;
663 struct p9_fid *fid = NULL, *dfid = NULL; 708 struct p9_fid *fid = NULL, *dfid = NULL;
664 struct inode *inode; 709 struct inode *inode;
665 gid_t gid;
666 struct p9_qid qid; 710 struct p9_qid qid;
667 struct dentry *dir_dentry; 711 struct dentry *dir_dentry;
668 struct posix_acl *dacl = NULL, *pacl = NULL; 712 struct posix_acl *dacl = NULL, *pacl = NULL;
@@ -699,6 +743,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
699 if (err < 0) 743 if (err < 0)
700 goto error; 744 goto error;
701 745
746 v9fs_invalidate_inode_attr(dir);
702 /* instantiate inode and assign the unopened fid to the dentry */ 747 /* instantiate inode and assign the unopened fid to the dentry */
703 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { 748 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
704 fid = p9_client_walk(dfid, 1, &name, 1); 749 fid = p9_client_walk(dfid, 1, &name, 1);
@@ -710,7 +755,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
710 goto error; 755 goto error;
711 } 756 }
712 757
713 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 758 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
714 if (IS_ERR(inode)) { 759 if (IS_ERR(inode)) {
715 err = PTR_ERR(inode); 760 err = PTR_ERR(inode);
716 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 761 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -766,7 +811,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
766 fid = v9fs_fid_lookup(dentry); 811 fid = v9fs_fid_lookup(dentry);
767 if (IS_ERR(fid)) { 812 if (IS_ERR(fid)) {
768 __putname(link); 813 __putname(link);
769 link = ERR_PTR(PTR_ERR(fid)); 814 link = ERR_CAST(fid);
770 goto ndset; 815 goto ndset;
771 } 816 }
772 retval = p9_client_readlink(fid, &target); 817 retval = p9_client_readlink(fid, &target);
@@ -782,6 +827,31 @@ ndset:
782 return NULL; 827 return NULL;
783} 828}
784 829
830int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
831{
832 loff_t i_size;
833 struct p9_stat_dotl *st;
834 struct v9fs_session_info *v9ses;
835
836 v9ses = v9fs_inode2v9ses(inode);
837 st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
838 if (IS_ERR(st))
839 return PTR_ERR(st);
840
841 spin_lock(&inode->i_lock);
842 /*
843 * We don't want to refresh inode->i_size,
844 * because we may have cached data
845 */
846 i_size = inode->i_size;
847 v9fs_stat2inode_dotl(st, inode);
848 if (v9ses->cache)
849 inode->i_size = i_size;
850 spin_unlock(&inode->i_lock);
851 kfree(st);
852 return 0;
853}
854
785const struct inode_operations v9fs_dir_inode_operations_dotl = { 855const struct inode_operations v9fs_dir_inode_operations_dotl = {
786 .create = v9fs_vfs_create_dotl, 856 .create = v9fs_vfs_create_dotl,
787 .lookup = v9fs_vfs_lookup, 857 .lookup = v9fs_vfs_lookup,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index dbaabe3b8131..feef6cdc1fd2 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -86,12 +86,15 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
86 } else 86 } else
87 sb->s_op = &v9fs_super_ops; 87 sb->s_op = &v9fs_super_ops;
88 sb->s_bdi = &v9ses->bdi; 88 sb->s_bdi = &v9ses->bdi;
89 if (v9ses->cache)
90 sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE;
89 91
90 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | 92 sb->s_flags = flags | MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
91 MS_NOATIME; 93 if (!v9ses->cache)
94 sb->s_flags |= MS_SYNCHRONOUS;
92 95
93#ifdef CONFIG_9P_FS_POSIX_ACL 96#ifdef CONFIG_9P_FS_POSIX_ACL
94 if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT) 97 if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL)
95 sb->s_flags |= MS_POSIXACL; 98 sb->s_flags |= MS_POSIXACL;
96#endif 99#endif
97 100
@@ -166,7 +169,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
166 retval = PTR_ERR(st); 169 retval = PTR_ERR(st);
167 goto release_sb; 170 goto release_sb;
168 } 171 }
169 172 root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
170 v9fs_stat2inode_dotl(st, root->d_inode); 173 v9fs_stat2inode_dotl(st, root->d_inode);
171 kfree(st); 174 kfree(st);
172 } else { 175 } else {
@@ -253,7 +256,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
253 goto done; 256 goto done;
254 } 257 }
255 258
256 v9ses = v9fs_inode2v9ses(dentry->d_inode); 259 v9ses = v9fs_dentry2v9ses(dentry);
257 if (v9fs_proto_dotl(v9ses)) { 260 if (v9fs_proto_dotl(v9ses)) {
258 res = p9_client_statfs(fid, &rs); 261 res = p9_client_statfs(fid, &rs);
259 if (res == 0) { 262 if (res == 0) {
@@ -276,26 +279,84 @@ done:
276 return res; 279 return res;
277} 280}
278 281
282static int v9fs_drop_inode(struct inode *inode)
283{
284 struct v9fs_session_info *v9ses;
285 v9ses = v9fs_inode2v9ses(inode);
286 if (v9ses->cache)
287 return generic_drop_inode(inode);
288 /*
289 * in case of non cached mode always drop the
290 * the inode because we want the inode attribute
291 * to always match that on the server.
292 */
293 return 1;
294}
295
296static int v9fs_write_inode(struct inode *inode,
297 struct writeback_control *wbc)
298{
299 int ret;
300 struct p9_wstat wstat;
301 struct v9fs_inode *v9inode;
302 /*
303 * send an fsync request to server irrespective of
304 * wbc->sync_mode.
305 */
306 P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
307 v9inode = V9FS_I(inode);
308 if (!v9inode->writeback_fid)
309 return 0;
310 v9fs_blank_wstat(&wstat);
311
312 ret = p9_client_wstat(v9inode->writeback_fid, &wstat);
313 if (ret < 0) {
314 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
315 return ret;
316 }
317 return 0;
318}
319
320static int v9fs_write_inode_dotl(struct inode *inode,
321 struct writeback_control *wbc)
322{
323 int ret;
324 struct v9fs_inode *v9inode;
325 /*
326 * send an fsync request to server irrespective of
327 * wbc->sync_mode.
328 */
329 P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
330 v9inode = V9FS_I(inode);
331 if (!v9inode->writeback_fid)
332 return 0;
333 ret = p9_client_fsync(v9inode->writeback_fid, 0);
334 if (ret < 0) {
335 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
336 return ret;
337 }
338 return 0;
339}
340
279static const struct super_operations v9fs_super_ops = { 341static const struct super_operations v9fs_super_ops = {
280#ifdef CONFIG_9P_FSCACHE
281 .alloc_inode = v9fs_alloc_inode, 342 .alloc_inode = v9fs_alloc_inode,
282 .destroy_inode = v9fs_destroy_inode, 343 .destroy_inode = v9fs_destroy_inode,
283#endif
284 .statfs = simple_statfs, 344 .statfs = simple_statfs,
285 .evict_inode = v9fs_evict_inode, 345 .evict_inode = v9fs_evict_inode,
286 .show_options = generic_show_options, 346 .show_options = generic_show_options,
287 .umount_begin = v9fs_umount_begin, 347 .umount_begin = v9fs_umount_begin,
348 .write_inode = v9fs_write_inode,
288}; 349};
289 350
290static const struct super_operations v9fs_super_ops_dotl = { 351static const struct super_operations v9fs_super_ops_dotl = {
291#ifdef CONFIG_9P_FSCACHE
292 .alloc_inode = v9fs_alloc_inode, 352 .alloc_inode = v9fs_alloc_inode,
293 .destroy_inode = v9fs_destroy_inode, 353 .destroy_inode = v9fs_destroy_inode,
294#endif
295 .statfs = v9fs_statfs, 354 .statfs = v9fs_statfs,
355 .drop_inode = v9fs_drop_inode,
296 .evict_inode = v9fs_evict_inode, 356 .evict_inode = v9fs_evict_inode,
297 .show_options = generic_show_options, 357 .show_options = generic_show_options,
298 .umount_begin = v9fs_umount_begin, 358 .umount_begin = v9fs_umount_begin,
359 .write_inode = v9fs_write_inode_dotl,
299}; 360};
300 361
301struct file_system_type v9fs_fs_type = { 362struct file_system_type v9fs_fs_type = {
@@ -303,5 +364,5 @@ struct file_system_type v9fs_fs_type = {
303 .mount = v9fs_mount, 364 .mount = v9fs_mount,
304 .kill_sb = v9fs_kill_super, 365 .kill_sb = v9fs_kill_super,
305 .owner = THIS_MODULE, 366 .owner = THIS_MODULE,
306 .fs_flags = FS_RENAME_DOES_D_MOVE, 367 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT,
307}; 368};
diff --git a/fs/Kconfig b/fs/Kconfig
index 3db9caa57edc..f3aa9b08b228 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,7 +47,7 @@ config FS_POSIX_ACL
47 def_bool n 47 def_bool n
48 48
49config EXPORTFS 49config EXPORTFS
50 tristate 50 bool
51 51
52config FILE_LOCKING 52config FILE_LOCKING
53 bool "Enable POSIX file locking API" if EXPERT 53 bool "Enable POSIX file locking API" if EXPERT
@@ -187,6 +187,7 @@ source "fs/omfs/Kconfig"
187source "fs/hpfs/Kconfig" 187source "fs/hpfs/Kconfig"
188source "fs/qnx4/Kconfig" 188source "fs/qnx4/Kconfig"
189source "fs/romfs/Kconfig" 189source "fs/romfs/Kconfig"
190source "fs/pstore/Kconfig"
190source "fs/sysv/Kconfig" 191source "fs/sysv/Kconfig"
191source "fs/ufs/Kconfig" 192source "fs/ufs/Kconfig"
192source "fs/exofs/Kconfig" 193source "fs/exofs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index a7f7cef0c0c8..fb68c2b8cf8a 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -48,6 +48,8 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
48obj-$(CONFIG_NFS_COMMON) += nfs_common/ 48obj-$(CONFIG_NFS_COMMON) += nfs_common/
49obj-$(CONFIG_GENERIC_ACL) += generic_acl.o 49obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
50 50
51obj-$(CONFIG_FHANDLE) += fhandle.o
52
51obj-y += quota/ 53obj-y += quota/
52 54
53obj-$(CONFIG_PROC_FS) += proc/ 55obj-$(CONFIG_PROC_FS) += proc/
@@ -121,3 +123,4 @@ obj-$(CONFIG_BTRFS_FS) += btrfs/
121obj-$(CONFIG_GFS2_FS) += gfs2/ 123obj-$(CONFIG_GFS2_FS) += gfs2/
122obj-$(CONFIG_EXOFS_FS) += exofs/ 124obj-$(CONFIG_EXOFS_FS) += exofs/
123obj-$(CONFIG_CEPH_FS) += ceph/ 125obj-$(CONFIG_CEPH_FS) += ceph/
126obj-$(CONFIG_PSTORE) += pstore/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index 1dd5f34b3cf2..e55182a74605 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,7 +1,6 @@
1config ADFS_FS 1config ADFS_FS
2 tristate "ADFS file system support (EXPERIMENTAL)" 2 tristate "ADFS file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL 3 depends on BLOCK && EXPERIMENTAL
4 depends on BKL # need to fix
5 help 4 help
6 The Acorn Disc Filing System is the standard file system of the 5 The Acorn Disc Filing System is the standard file system of the
7 RiscOS operating system which runs on Acorn's ARM-based Risc PC 6 RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 2ff622f6f547..718ac1f440c6 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -50,6 +50,7 @@ struct adfs_sb_info {
50 gid_t s_gid; /* owner gid */ 50 gid_t s_gid; /* owner gid */
51 umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ 51 umode_t s_owner_mask; /* ADFS owner perm -> unix perm */
52 umode_t s_other_mask; /* ADFS other perm -> unix perm */ 52 umode_t s_other_mask; /* ADFS other perm -> unix perm */
53 int s_ftsuffix; /* ,xyz hex filetype suffix option */
53 54
54 __u32 s_ids_per_zone; /* max. no ids in one zone */ 55 __u32 s_ids_per_zone; /* max. no ids in one zone */
55 __u32 s_idlen; /* length of ID in map */ 56 __u32 s_idlen; /* length of ID in map */
@@ -79,6 +80,10 @@ struct adfs_dir {
79 80
80 int nr_buffers; 81 int nr_buffers;
81 struct buffer_head *bh[4]; 82 struct buffer_head *bh[4];
83
84 /* big directories need allocated buffers */
85 struct buffer_head **bh_fplus;
86
82 unsigned int pos; 87 unsigned int pos;
83 unsigned int parent_id; 88 unsigned int parent_id;
84 89
@@ -89,7 +94,7 @@ struct adfs_dir {
89/* 94/*
90 * This is the overall maximum name length 95 * This is the overall maximum name length
91 */ 96 */
92#define ADFS_MAX_NAME_LEN 256 97#define ADFS_MAX_NAME_LEN (256 + 4) /* +4 for ,xyz hex filetype suffix */
93struct object_info { 98struct object_info {
94 __u32 parent_id; /* parent object id */ 99 __u32 parent_id; /* parent object id */
95 __u32 file_id; /* object id */ 100 __u32 file_id; /* object id */
@@ -97,10 +102,26 @@ struct object_info {
97 __u32 execaddr; /* execution address */ 102 __u32 execaddr; /* execution address */
98 __u32 size; /* size */ 103 __u32 size; /* size */
99 __u8 attr; /* RISC OS attributes */ 104 __u8 attr; /* RISC OS attributes */
100 unsigned char name_len; /* name length */ 105 unsigned int name_len; /* name length */
101 char name[ADFS_MAX_NAME_LEN];/* file name */ 106 char name[ADFS_MAX_NAME_LEN];/* file name */
107
108 /* RISC OS file type (12-bit: derived from loadaddr) */
109 __u16 filetype;
102}; 110};
103 111
112/* RISC OS 12-bit filetype converts to ,xyz hex filename suffix */
113static inline int append_filetype_suffix(char *buf, __u16 filetype)
114{
115 if (filetype == 0xffff) /* no explicit 12-bit file type was set */
116 return 0;
117
118 *buf++ = ',';
119 *buf++ = hex_asc_lo(filetype >> 8);
120 *buf++ = hex_asc_lo(filetype >> 4);
121 *buf++ = hex_asc_lo(filetype >> 0);
122 return 4;
123}
124
104struct adfs_dir_ops { 125struct adfs_dir_ops {
105 int (*read)(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir); 126 int (*read)(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir);
106 int (*setpos)(struct adfs_dir *dir, unsigned int fpos); 127 int (*setpos)(struct adfs_dir *dir, unsigned int fpos);
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 3b4a764ed780..3d83075aaa2e 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -9,7 +9,6 @@
9 * 9 *
10 * Common directory handling for ADFS 10 * Common directory handling for ADFS
11 */ 11 */
12#include <linux/smp_lock.h>
13#include "adfs.h" 12#include "adfs.h"
14 13
15/* 14/*
@@ -27,8 +26,6 @@ adfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
27 struct adfs_dir dir; 26 struct adfs_dir dir;
28 int ret = 0; 27 int ret = 0;
29 28
30 lock_kernel();
31
32 if (filp->f_pos >> 32) 29 if (filp->f_pos >> 32)
33 goto out; 30 goto out;
34 31
@@ -70,7 +67,6 @@ free_out:
70 ops->free(&dir); 67 ops->free(&dir);
71 68
72out: 69out:
73 unlock_kernel();
74 return ret; 70 return ret;
75} 71}
76 72
@@ -276,7 +272,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
276 struct object_info obj; 272 struct object_info obj;
277 int error; 273 int error;
278 274
279 lock_kernel();
280 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); 275 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
281 if (error == 0) { 276 if (error == 0) {
282 error = -EACCES; 277 error = -EACCES;
@@ -288,7 +283,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
288 if (inode) 283 if (inode)
289 error = 0; 284 error = 0;
290 } 285 }
291 unlock_kernel();
292 d_add(dentry, inode); 286 d_add(dentry, inode);
293 return ERR_PTR(error); 287 return ERR_PTR(error);
294} 288}
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index bafc71222e25..4bbe853ee50a 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -52,7 +52,6 @@ static inline int adfs_readname(char *buf, char *ptr, int maxlen)
52 *buf++ = *ptr; 52 *buf++ = *ptr;
53 ptr++; 53 ptr++;
54 } 54 }
55 *buf = '\0';
56 55
57 return buf - old_buf; 56 return buf - old_buf;
58} 57}
@@ -208,7 +207,8 @@ release_buffers:
208 * convert a disk-based directory entry to a Linux ADFS directory entry 207 * convert a disk-based directory entry to a Linux ADFS directory entry
209 */ 208 */
210static inline void 209static inline void
211adfs_dir2obj(struct object_info *obj, struct adfs_direntry *de) 210adfs_dir2obj(struct adfs_dir *dir, struct object_info *obj,
211 struct adfs_direntry *de)
212{ 212{
213 obj->name_len = adfs_readname(obj->name, de->dirobname, ADFS_F_NAME_LEN); 213 obj->name_len = adfs_readname(obj->name, de->dirobname, ADFS_F_NAME_LEN);
214 obj->file_id = adfs_readval(de->dirinddiscadd, 3); 214 obj->file_id = adfs_readval(de->dirinddiscadd, 3);
@@ -216,6 +216,23 @@ adfs_dir2obj(struct object_info *obj, struct adfs_direntry *de)
216 obj->execaddr = adfs_readval(de->direxec, 4); 216 obj->execaddr = adfs_readval(de->direxec, 4);
217 obj->size = adfs_readval(de->dirlen, 4); 217 obj->size = adfs_readval(de->dirlen, 4);
218 obj->attr = de->newdiratts; 218 obj->attr = de->newdiratts;
219 obj->filetype = -1;
220
221 /*
222 * object is a file and is filetyped and timestamped?
223 * RISC OS 12-bit filetype is stored in load_address[19:8]
224 */
225 if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) &&
226 (0xfff00000 == (0xfff00000 & obj->loadaddr))) {
227 obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8);
228
229 /* optionally append the ,xyz hex filetype suffix */
230 if (ADFS_SB(dir->sb)->s_ftsuffix)
231 obj->name_len +=
232 append_filetype_suffix(
233 &obj->name[obj->name_len],
234 obj->filetype);
235 }
219} 236}
220 237
221/* 238/*
@@ -260,7 +277,7 @@ __adfs_dir_get(struct adfs_dir *dir, int pos, struct object_info *obj)
260 if (!de.dirobname[0]) 277 if (!de.dirobname[0])
261 return -ENOENT; 278 return -ENOENT;
262 279
263 adfs_dir2obj(obj, &de); 280 adfs_dir2obj(dir, obj, &de);
264 281
265 return 0; 282 return 0;
266} 283}
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 1796bb352d05..d9e3bee4e653 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -8,6 +8,7 @@
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10#include <linux/buffer_head.h> 10#include <linux/buffer_head.h>
11#include <linux/slab.h>
11#include "adfs.h" 12#include "adfs.h"
12#include "dir_fplus.h" 13#include "dir_fplus.h"
13 14
@@ -22,30 +23,53 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
22 23
23 dir->nr_buffers = 0; 24 dir->nr_buffers = 0;
24 25
26 /* start off using fixed bh set - only alloc for big dirs */
27 dir->bh_fplus = &dir->bh[0];
28
25 block = __adfs_block_map(sb, id, 0); 29 block = __adfs_block_map(sb, id, 0);
26 if (!block) { 30 if (!block) {
27 adfs_error(sb, "dir object %X has a hole at offset 0", id); 31 adfs_error(sb, "dir object %X has a hole at offset 0", id);
28 goto out; 32 goto out;
29 } 33 }
30 34
31 dir->bh[0] = sb_bread(sb, block); 35 dir->bh_fplus[0] = sb_bread(sb, block);
32 if (!dir->bh[0]) 36 if (!dir->bh_fplus[0])
33 goto out; 37 goto out;
34 dir->nr_buffers += 1; 38 dir->nr_buffers += 1;
35 39
36 h = (struct adfs_bigdirheader *)dir->bh[0]->b_data; 40 h = (struct adfs_bigdirheader *)dir->bh_fplus[0]->b_data;
37 size = le32_to_cpu(h->bigdirsize); 41 size = le32_to_cpu(h->bigdirsize);
38 if (size != sz) { 42 if (size != sz) {
39 printk(KERN_WARNING "adfs: adfs_fplus_read: directory header size\n" 43 printk(KERN_WARNING "adfs: adfs_fplus_read:"
40 " does not match directory size\n"); 44 " directory header size %X\n"
45 " does not match directory size %X\n",
46 size, sz);
41 } 47 }
42 48
43 if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 || 49 if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 ||
44 h->bigdirversion[2] != 0 || size & 2047 || 50 h->bigdirversion[2] != 0 || size & 2047 ||
45 h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) 51 h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) {
52 printk(KERN_WARNING "adfs: dir object %X has"
53 " malformed dir header\n", id);
46 goto out; 54 goto out;
55 }
47 56
48 size >>= sb->s_blocksize_bits; 57 size >>= sb->s_blocksize_bits;
58 if (size > sizeof(dir->bh)/sizeof(dir->bh[0])) {
59 /* this directory is too big for fixed bh set, must allocate */
60 struct buffer_head **bh_fplus =
61 kzalloc(size * sizeof(struct buffer_head *),
62 GFP_KERNEL);
63 if (!bh_fplus) {
64 adfs_error(sb, "not enough memory for"
65 " dir object %X (%d blocks)", id, size);
66 goto out;
67 }
68 dir->bh_fplus = bh_fplus;
69 /* copy over the pointer to the block that we've already read */
70 dir->bh_fplus[0] = dir->bh[0];
71 }
72
49 for (blk = 1; blk < size; blk++) { 73 for (blk = 1; blk < size; blk++) {
50 block = __adfs_block_map(sb, id, blk); 74 block = __adfs_block_map(sb, id, blk);
51 if (!block) { 75 if (!block) {
@@ -53,25 +77,44 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
53 goto out; 77 goto out;
54 } 78 }
55 79
56 dir->bh[blk] = sb_bread(sb, block); 80 dir->bh_fplus[blk] = sb_bread(sb, block);
57 if (!dir->bh[blk]) 81 if (!dir->bh_fplus[blk]) {
82 adfs_error(sb, "dir object %X failed read for"
83 " offset %d, mapped block %X",
84 id, blk, block);
58 goto out; 85 goto out;
59 dir->nr_buffers = blk; 86 }
87
88 dir->nr_buffers += 1;
60 } 89 }
61 90
62 t = (struct adfs_bigdirtail *)(dir->bh[size - 1]->b_data + (sb->s_blocksize - 8)); 91 t = (struct adfs_bigdirtail *)
92 (dir->bh_fplus[size - 1]->b_data + (sb->s_blocksize - 8));
63 93
64 if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) || 94 if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) ||
65 t->bigdirendmasseq != h->startmasseq || 95 t->bigdirendmasseq != h->startmasseq ||
66 t->reserved[0] != 0 || t->reserved[1] != 0) 96 t->reserved[0] != 0 || t->reserved[1] != 0) {
97 printk(KERN_WARNING "adfs: dir object %X has "
98 "malformed dir end\n", id);
67 goto out; 99 goto out;
100 }
68 101
69 dir->parent_id = le32_to_cpu(h->bigdirparent); 102 dir->parent_id = le32_to_cpu(h->bigdirparent);
70 dir->sb = sb; 103 dir->sb = sb;
71 return 0; 104 return 0;
105
72out: 106out:
73 for (i = 0; i < dir->nr_buffers; i++) 107 if (dir->bh_fplus) {
74 brelse(dir->bh[i]); 108 for (i = 0; i < dir->nr_buffers; i++)
109 brelse(dir->bh_fplus[i]);
110
111 if (&dir->bh[0] != dir->bh_fplus)
112 kfree(dir->bh_fplus);
113
114 dir->bh_fplus = NULL;
115 }
116
117 dir->nr_buffers = 0;
75 dir->sb = NULL; 118 dir->sb = NULL;
76 return ret; 119 return ret;
77} 120}
@@ -79,7 +122,8 @@ out:
79static int 122static int
80adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos) 123adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos)
81{ 124{
82 struct adfs_bigdirheader *h = (struct adfs_bigdirheader *)dir->bh[0]->b_data; 125 struct adfs_bigdirheader *h =
126 (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data;
83 int ret = -ENOENT; 127 int ret = -ENOENT;
84 128
85 if (fpos <= le32_to_cpu(h->bigdirentries)) { 129 if (fpos <= le32_to_cpu(h->bigdirentries)) {
@@ -102,21 +146,27 @@ dir_memcpy(struct adfs_dir *dir, unsigned int offset, void *to, int len)
102 partial = sb->s_blocksize - offset; 146 partial = sb->s_blocksize - offset;
103 147
104 if (partial >= len) 148 if (partial >= len)
105 memcpy(to, dir->bh[buffer]->b_data + offset, len); 149 memcpy(to, dir->bh_fplus[buffer]->b_data + offset, len);
106 else { 150 else {
107 char *c = (char *)to; 151 char *c = (char *)to;
108 152
109 remainder = len - partial; 153 remainder = len - partial;
110 154
111 memcpy(c, dir->bh[buffer]->b_data + offset, partial); 155 memcpy(c,
112 memcpy(c + partial, dir->bh[buffer + 1]->b_data, remainder); 156 dir->bh_fplus[buffer]->b_data + offset,
157 partial);
158
159 memcpy(c + partial,
160 dir->bh_fplus[buffer + 1]->b_data,
161 remainder);
113 } 162 }
114} 163}
115 164
116static int 165static int
117adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj) 166adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj)
118{ 167{
119 struct adfs_bigdirheader *h = (struct adfs_bigdirheader *)dir->bh[0]->b_data; 168 struct adfs_bigdirheader *h =
169 (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data;
120 struct adfs_bigdirentry bde; 170 struct adfs_bigdirentry bde;
121 unsigned int offset; 171 unsigned int offset;
122 int i, ret = -ENOENT; 172 int i, ret = -ENOENT;
@@ -147,6 +197,24 @@ adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj)
147 if (obj->name[i] == '/') 197 if (obj->name[i] == '/')
148 obj->name[i] = '.'; 198 obj->name[i] = '.';
149 199
200 obj->filetype = -1;
201
202 /*
203 * object is a file and is filetyped and timestamped?
204 * RISC OS 12-bit filetype is stored in load_address[19:8]
205 */
206 if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) &&
207 (0xfff00000 == (0xfff00000 & obj->loadaddr))) {
208 obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8);
209
210 /* optionally append the ,xyz hex filetype suffix */
211 if (ADFS_SB(dir->sb)->s_ftsuffix)
212 obj->name_len +=
213 append_filetype_suffix(
214 &obj->name[obj->name_len],
215 obj->filetype);
216 }
217
150 dir->pos += 1; 218 dir->pos += 1;
151 ret = 0; 219 ret = 0;
152out: 220out:
@@ -160,7 +228,7 @@ adfs_fplus_sync(struct adfs_dir *dir)
160 int i; 228 int i;
161 229
162 for (i = dir->nr_buffers - 1; i >= 0; i--) { 230 for (i = dir->nr_buffers - 1; i >= 0; i--) {
163 struct buffer_head *bh = dir->bh[i]; 231 struct buffer_head *bh = dir->bh_fplus[i];
164 sync_dirty_buffer(bh); 232 sync_dirty_buffer(bh);
165 if (buffer_req(bh) && !buffer_uptodate(bh)) 233 if (buffer_req(bh) && !buffer_uptodate(bh))
166 err = -EIO; 234 err = -EIO;
@@ -174,8 +242,17 @@ adfs_fplus_free(struct adfs_dir *dir)
174{ 242{
175 int i; 243 int i;
176 244
177 for (i = 0; i < dir->nr_buffers; i++) 245 if (dir->bh_fplus) {
178 brelse(dir->bh[i]); 246 for (i = 0; i < dir->nr_buffers; i++)
247 brelse(dir->bh_fplus[i]);
248
249 if (&dir->bh[0] != dir->bh_fplus)
250 kfree(dir->bh_fplus);
251
252 dir->bh_fplus = NULL;
253 }
254
255 dir->nr_buffers = 0;
179 dir->sb = NULL; 256 dir->sb = NULL;
180} 257}
181 258
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 65794b8fe79e..d5250c5aae21 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -7,7 +7,6 @@
7 * it under the terms of the GNU General Public License version 2 as 7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10#include <linux/smp_lock.h>
11#include <linux/buffer_head.h> 10#include <linux/buffer_head.h>
12#include <linux/writeback.h> 11#include <linux/writeback.h>
13#include "adfs.h" 12#include "adfs.h"
@@ -73,32 +72,18 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
73static const struct address_space_operations adfs_aops = { 72static const struct address_space_operations adfs_aops = {
74 .readpage = adfs_readpage, 73 .readpage = adfs_readpage,
75 .writepage = adfs_writepage, 74 .writepage = adfs_writepage,
76 .sync_page = block_sync_page,
77 .write_begin = adfs_write_begin, 75 .write_begin = adfs_write_begin,
78 .write_end = generic_write_end, 76 .write_end = generic_write_end,
79 .bmap = _adfs_bmap 77 .bmap = _adfs_bmap
80}; 78};
81 79
82static inline unsigned int
83adfs_filetype(struct inode *inode)
84{
85 unsigned int type;
86
87 if (ADFS_I(inode)->stamped)
88 type = (ADFS_I(inode)->loadaddr >> 8) & 0xfff;
89 else
90 type = (unsigned int) -1;
91
92 return type;
93}
94
95/* 80/*
96 * Convert ADFS attributes and filetype to Linux permission. 81 * Convert ADFS attributes and filetype to Linux permission.
97 */ 82 */
98static umode_t 83static umode_t
99adfs_atts2mode(struct super_block *sb, struct inode *inode) 84adfs_atts2mode(struct super_block *sb, struct inode *inode)
100{ 85{
101 unsigned int filetype, attr = ADFS_I(inode)->attr; 86 unsigned int attr = ADFS_I(inode)->attr;
102 umode_t mode, rmask; 87 umode_t mode, rmask;
103 struct adfs_sb_info *asb = ADFS_SB(sb); 88 struct adfs_sb_info *asb = ADFS_SB(sb);
104 89
@@ -107,9 +92,7 @@ adfs_atts2mode(struct super_block *sb, struct inode *inode)
107 return S_IFDIR | S_IXUGO | mode; 92 return S_IFDIR | S_IXUGO | mode;
108 } 93 }
109 94
110 filetype = adfs_filetype(inode); 95 switch (ADFS_I(inode)->filetype) {
111
112 switch (filetype) {
113 case 0xfc0: /* LinkFS */ 96 case 0xfc0: /* LinkFS */
114 return S_IFLNK|S_IRWXUGO; 97 return S_IFLNK|S_IRWXUGO;
115 98
@@ -175,50 +158,48 @@ adfs_mode2atts(struct super_block *sb, struct inode *inode)
175 158
176/* 159/*
177 * Convert an ADFS time to Unix time. ADFS has a 40-bit centi-second time 160 * Convert an ADFS time to Unix time. ADFS has a 40-bit centi-second time
178 * referenced to 1 Jan 1900 (til 2248) 161 * referenced to 1 Jan 1900 (til 2248) so we need to discard 2208988800 seconds
162 * of time to convert from RISC OS epoch to Unix epoch.
179 */ 163 */
180static void 164static void
181adfs_adfs2unix_time(struct timespec *tv, struct inode *inode) 165adfs_adfs2unix_time(struct timespec *tv, struct inode *inode)
182{ 166{
183 unsigned int high, low; 167 unsigned int high, low;
168 /* 01 Jan 1970 00:00:00 (Unix epoch) as nanoseconds since
169 * 01 Jan 1900 00:00:00 (RISC OS epoch)
170 */
171 static const s64 nsec_unix_epoch_diff_risc_os_epoch =
172 2208988800000000000LL;
173 s64 nsec;
184 174
185 if (ADFS_I(inode)->stamped == 0) 175 if (ADFS_I(inode)->stamped == 0)
186 goto cur_time; 176 goto cur_time;
187 177
188 high = ADFS_I(inode)->loadaddr << 24; 178 high = ADFS_I(inode)->loadaddr & 0xFF; /* top 8 bits of timestamp */
189 low = ADFS_I(inode)->execaddr; 179 low = ADFS_I(inode)->execaddr; /* bottom 32 bits of timestamp */
190 180
191 high |= low >> 8; 181 /* convert 40-bit centi-seconds to 32-bit seconds
192 low &= 255; 182 * going via nanoseconds to retain precision
183 */
184 nsec = (((s64) high << 32) | (s64) low) * 10000000; /* cs to ns */
193 185
194 /* Files dated pre 01 Jan 1970 00:00:00. */ 186 /* Files dated pre 01 Jan 1970 00:00:00. */
195 if (high < 0x336e996a) 187 if (nsec < nsec_unix_epoch_diff_risc_os_epoch)
196 goto too_early; 188 goto too_early;
197 189
198 /* Files dated post 18 Jan 2038 03:14:05. */ 190 /* convert from RISC OS to Unix epoch */
199 if (high >= 0x656e9969) 191 nsec -= nsec_unix_epoch_diff_risc_os_epoch;
200 goto too_late;
201
202 /* discard 2208988800 (0x336e996a00) seconds of time */
203 high -= 0x336e996a;
204 192
205 /* convert 40-bit centi-seconds to 32-bit seconds */ 193 *tv = ns_to_timespec(nsec);
206 tv->tv_sec = (((high % 100) << 8) + low) / 100 + (high / 100 << 8);
207 tv->tv_nsec = 0;
208 return; 194 return;
209 195
210 cur_time: 196 cur_time:
211 *tv = CURRENT_TIME_SEC; 197 *tv = CURRENT_TIME;
212 return; 198 return;
213 199
214 too_early: 200 too_early:
215 tv->tv_sec = tv->tv_nsec = 0; 201 tv->tv_sec = tv->tv_nsec = 0;
216 return; 202 return;
217
218 too_late:
219 tv->tv_sec = 0x7ffffffd;
220 tv->tv_nsec = 0;
221 return;
222} 203}
223 204
224/* 205/*
@@ -280,7 +261,8 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
280 ADFS_I(inode)->loadaddr = obj->loadaddr; 261 ADFS_I(inode)->loadaddr = obj->loadaddr;
281 ADFS_I(inode)->execaddr = obj->execaddr; 262 ADFS_I(inode)->execaddr = obj->execaddr;
282 ADFS_I(inode)->attr = obj->attr; 263 ADFS_I(inode)->attr = obj->attr;
283 ADFS_I(inode)->stamped = ((obj->loadaddr & 0xfff00000) == 0xfff00000); 264 ADFS_I(inode)->filetype = obj->filetype;
265 ADFS_I(inode)->stamped = ((obj->loadaddr & 0xfff00000) == 0xfff00000);
284 266
285 inode->i_mode = adfs_atts2mode(sb, inode); 267 inode->i_mode = adfs_atts2mode(sb, inode);
286 adfs_adfs2unix_time(&inode->i_mtime, inode); 268 adfs_adfs2unix_time(&inode->i_mtime, inode);
@@ -316,8 +298,6 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
316 unsigned int ia_valid = attr->ia_valid; 298 unsigned int ia_valid = attr->ia_valid;
317 int error; 299 int error;
318 300
319 lock_kernel();
320
321 error = inode_change_ok(inode, attr); 301 error = inode_change_ok(inode, attr);
322 302
323 /* 303 /*
@@ -359,7 +339,6 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
359 if (ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MODE)) 339 if (ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MODE))
360 mark_inode_dirty(inode); 340 mark_inode_dirty(inode);
361out: 341out:
362 unlock_kernel();
363 return error; 342 return error;
364} 343}
365 344
@@ -374,7 +353,6 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
374 struct object_info obj; 353 struct object_info obj;
375 int ret; 354 int ret;
376 355
377 lock_kernel();
378 obj.file_id = inode->i_ino; 356 obj.file_id = inode->i_ino;
379 obj.name_len = 0; 357 obj.name_len = 0;
380 obj.parent_id = ADFS_I(inode)->parent_id; 358 obj.parent_id = ADFS_I(inode)->parent_id;
@@ -384,6 +362,5 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
384 obj.size = inode->i_size; 362 obj.size = inode->i_size;
385 363
386 ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); 364 ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
387 unlock_kernel();
388 return ret; 365 return ret;
389} 366}
diff --git a/fs/adfs/map.c b/fs/adfs/map.c
index d1a5932bb0f1..6935f05202ac 100644
--- a/fs/adfs/map.c
+++ b/fs/adfs/map.c
@@ -51,7 +51,7 @@ static DEFINE_RWLOCK(adfs_map_lock);
51 51
52/* 52/*
53 * This is fun. We need to load up to 19 bits from the map at an 53 * This is fun. We need to load up to 19 bits from the map at an
54 * arbitary bit alignment. (We're limited to 19 bits by F+ version 2). 54 * arbitrary bit alignment. (We're limited to 19 bits by F+ version 2).
55 */ 55 */
56#define GET_FRAG_ID(_map,_start,_idmask) \ 56#define GET_FRAG_ID(_map,_start,_idmask) \
57 ({ \ 57 ({ \
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 2d7954049fbe..c8bf36a1996a 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -14,7 +14,6 @@
14#include <linux/mount.h> 14#include <linux/mount.h>
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/smp_lock.h>
18#include <linux/statfs.h> 17#include <linux/statfs.h>
19#include "adfs.h" 18#include "adfs.h"
20#include "dir_f.h" 19#include "dir_f.h"
@@ -120,15 +119,11 @@ static void adfs_put_super(struct super_block *sb)
120 int i; 119 int i;
121 struct adfs_sb_info *asb = ADFS_SB(sb); 120 struct adfs_sb_info *asb = ADFS_SB(sb);
122 121
123 lock_kernel();
124
125 for (i = 0; i < asb->s_map_size; i++) 122 for (i = 0; i < asb->s_map_size; i++)
126 brelse(asb->s_map[i].dm_bh); 123 brelse(asb->s_map[i].dm_bh);
127 kfree(asb->s_map); 124 kfree(asb->s_map);
128 kfree(asb); 125 kfree(asb);
129 sb->s_fs_info = NULL; 126 sb->s_fs_info = NULL;
130
131 unlock_kernel();
132} 127}
133 128
134static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) 129static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
@@ -143,17 +138,20 @@ static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
143 seq_printf(seq, ",ownmask=%o", asb->s_owner_mask); 138 seq_printf(seq, ",ownmask=%o", asb->s_owner_mask);
144 if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK) 139 if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK)
145 seq_printf(seq, ",othmask=%o", asb->s_other_mask); 140 seq_printf(seq, ",othmask=%o", asb->s_other_mask);
141 if (asb->s_ftsuffix != 0)
142 seq_printf(seq, ",ftsuffix=%u", asb->s_ftsuffix);
146 143
147 return 0; 144 return 0;
148} 145}
149 146
150enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_err}; 147enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix, Opt_err};
151 148
152static const match_table_t tokens = { 149static const match_table_t tokens = {
153 {Opt_uid, "uid=%u"}, 150 {Opt_uid, "uid=%u"},
154 {Opt_gid, "gid=%u"}, 151 {Opt_gid, "gid=%u"},
155 {Opt_ownmask, "ownmask=%o"}, 152 {Opt_ownmask, "ownmask=%o"},
156 {Opt_othmask, "othmask=%o"}, 153 {Opt_othmask, "othmask=%o"},
154 {Opt_ftsuffix, "ftsuffix=%u"},
157 {Opt_err, NULL} 155 {Opt_err, NULL}
158}; 156};
159 157
@@ -194,6 +192,11 @@ static int parse_options(struct super_block *sb, char *options)
194 return -EINVAL; 192 return -EINVAL;
195 asb->s_other_mask = option; 193 asb->s_other_mask = option;
196 break; 194 break;
195 case Opt_ftsuffix:
196 if (match_int(args, &option))
197 return -EINVAL;
198 asb->s_ftsuffix = option;
199 break;
197 default: 200 default:
198 printk("ADFS-fs: unrecognised mount option \"%s\" " 201 printk("ADFS-fs: unrecognised mount option \"%s\" "
199 "or missing value\n", p); 202 "or missing value\n", p);
@@ -359,15 +362,11 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
359 struct adfs_sb_info *asb; 362 struct adfs_sb_info *asb;
360 struct inode *root; 363 struct inode *root;
361 364
362 lock_kernel();
363
364 sb->s_flags |= MS_NODIRATIME; 365 sb->s_flags |= MS_NODIRATIME;
365 366
366 asb = kzalloc(sizeof(*asb), GFP_KERNEL); 367 asb = kzalloc(sizeof(*asb), GFP_KERNEL);
367 if (!asb) { 368 if (!asb)
368 unlock_kernel();
369 return -ENOMEM; 369 return -ENOMEM;
370 }
371 sb->s_fs_info = asb; 370 sb->s_fs_info = asb;
372 371
373 /* set default options */ 372 /* set default options */
@@ -375,6 +374,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
375 asb->s_gid = 0; 374 asb->s_gid = 0;
376 asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK; 375 asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
377 asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK; 376 asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
377 asb->s_ftsuffix = 0;
378 378
379 if (parse_options(sb, data)) 379 if (parse_options(sb, data))
380 goto error; 380 goto error;
@@ -454,11 +454,13 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
454 454
455 root_obj.parent_id = root_obj.file_id = le32_to_cpu(dr->root); 455 root_obj.parent_id = root_obj.file_id = le32_to_cpu(dr->root);
456 root_obj.name_len = 0; 456 root_obj.name_len = 0;
457 root_obj.loadaddr = 0; 457 /* Set root object date as 01 Jan 1987 00:00:00 */
458 root_obj.execaddr = 0; 458 root_obj.loadaddr = 0xfff0003f;
459 root_obj.execaddr = 0xec22c000;
459 root_obj.size = ADFS_NEWDIR_SIZE; 460 root_obj.size = ADFS_NEWDIR_SIZE;
460 root_obj.attr = ADFS_NDA_DIRECTORY | ADFS_NDA_OWNER_READ | 461 root_obj.attr = ADFS_NDA_DIRECTORY | ADFS_NDA_OWNER_READ |
461 ADFS_NDA_OWNER_WRITE | ADFS_NDA_PUBLIC_READ; 462 ADFS_NDA_OWNER_WRITE | ADFS_NDA_PUBLIC_READ;
463 root_obj.filetype = -1;
462 464
463 /* 465 /*
464 * If this is a F+ disk with variable length directories, 466 * If this is a F+ disk with variable length directories,
@@ -472,6 +474,12 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
472 asb->s_dir = &adfs_f_dir_ops; 474 asb->s_dir = &adfs_f_dir_ops;
473 asb->s_namelen = ADFS_F_NAME_LEN; 475 asb->s_namelen = ADFS_F_NAME_LEN;
474 } 476 }
477 /*
478 * ,xyz hex filetype suffix may be added by driver
479 * to files that have valid RISC OS filetype
480 */
481 if (asb->s_ftsuffix)
482 asb->s_namelen += 4;
475 483
476 sb->s_d_op = &adfs_dentry_operations; 484 sb->s_d_op = &adfs_dentry_operations;
477 root = adfs_iget(sb, &root_obj); 485 root = adfs_iget(sb, &root_obj);
@@ -485,7 +493,6 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
485 adfs_error(sb, "get root inode failed\n"); 493 adfs_error(sb, "get root inode failed\n");
486 goto error; 494 goto error;
487 } 495 }
488 unlock_kernel();
489 return 0; 496 return 0;
490 497
491error_free_bh: 498error_free_bh:
@@ -493,7 +500,6 @@ error_free_bh:
493error: 500error:
494 sb->s_fs_info = NULL; 501 sb->s_fs_info = NULL;
495 kfree(asb); 502 kfree(asb);
496 unlock_kernel();
497 return -EINVAL; 503 return -EINVAL;
498} 504}
499 505
diff --git a/fs/affs/Makefile b/fs/affs/Makefile
index b2c4f54446f3..3988b4a78339 100644
--- a/fs/affs/Makefile
+++ b/fs/affs/Makefile
@@ -2,7 +2,7 @@
2# Makefile for the Linux affs filesystem routines. 2# Makefile for the Linux affs filesystem routines.
3# 3#
4 4
5#EXTRA_CFLAGS=-DDEBUG=1 5#ccflags-y := -DDEBUG=1
6 6
7obj-$(CONFIG_AFFS_FS) += affs.o 7obj-$(CONFIG_AFFS_FS) += affs.o
8 8
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0a90dcd46de2..acf321b70fcd 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -429,7 +429,6 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
429const struct address_space_operations affs_aops = { 429const struct address_space_operations affs_aops = {
430 .readpage = affs_readpage, 430 .readpage = affs_readpage,
431 .writepage = affs_writepage, 431 .writepage = affs_writepage,
432 .sync_page = block_sync_page,
433 .write_begin = affs_write_begin, 432 .write_begin = affs_write_begin,
434 .write_end = generic_write_end, 433 .write_end = generic_write_end,
435 .bmap = _affs_bmap 434 .bmap = _affs_bmap
@@ -786,7 +785,6 @@ out:
786const struct address_space_operations affs_aops_ofs = { 785const struct address_space_operations affs_aops_ofs = {
787 .readpage = affs_readpage_ofs, 786 .readpage = affs_readpage_ofs,
788 //.writepage = affs_writepage_ofs, 787 //.writepage = affs_writepage_ofs,
789 //.sync_page = affs_sync_page_ofs,
790 .write_begin = affs_write_begin_ofs, 788 .write_begin = affs_write_begin_ofs,
791 .write_end = affs_write_end_ofs 789 .write_end = affs_write_end_ofs
792}; 790};
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index 0fb315dd4d2a..577763c3d88b 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -98,7 +98,7 @@ static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
98} 98}
99 99
100/* 100/*
101 * provide new auxilliary cache data 101 * provide new auxiliary cache data
102 */ 102 */
103static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, 103static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
104 void *buffer, uint16_t bufmax) 104 void *buffer, uint16_t bufmax)
@@ -117,7 +117,7 @@ static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
117} 117}
118 118
119/* 119/*
120 * check that the auxilliary data indicates that the entry is still valid 120 * check that the auxiliary data indicates that the entry is still valid
121 */ 121 */
122static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, 122static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
123 const void *buffer, 123 const void *buffer,
@@ -150,7 +150,7 @@ static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
150} 150}
151 151
152/* 152/*
153 * provide new auxilliary cache data 153 * provide new auxiliary cache data
154 */ 154 */
155static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, 155static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
156 void *buffer, uint16_t bufmax) 156 void *buffer, uint16_t bufmax)
@@ -172,7 +172,7 @@ static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
172} 172}
173 173
174/* 174/*
175 * check that the auxilliary data indicates that the entry is still valid 175 * check that the auxiliary data indicates that the entry is still valid
176 */ 176 */
177static 177static
178enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data, 178enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data,
@@ -283,7 +283,7 @@ static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
283} 283}
284 284
285/* 285/*
286 * provide new auxilliary cache data 286 * provide new auxiliary cache data
287 */ 287 */
288static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, 288static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
289 void *buffer, uint16_t bufmax) 289 void *buffer, uint16_t bufmax)
@@ -309,7 +309,7 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
309} 309}
310 310
311/* 311/*
312 * check that the auxilliary data indicates that the entry is still valid 312 * check that the auxiliary data indicates that the entry is still valid
313 */ 313 */
314static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, 314static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
315 const void *buffer, 315 const void *buffer,
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 0d5eeadf6121..3c090b7555ea 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -293,7 +293,7 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz,
293 if (!cell) { 293 if (!cell) {
294 /* this should not happen unless user tries to mount 294 /* this should not happen unless user tries to mount
295 * when root cell is not set. Return an impossibly 295 * when root cell is not set. Return an impossibly
296 * bizzare errno to alert the user. Things like 296 * bizarre errno to alert the user. Things like
297 * ENOENT might be "more appropriate" but they happen 297 * ENOENT might be "more appropriate" but they happen
298 * for other reasons. 298 * for other reasons.
299 */ 299 */
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 15690bb1d3b5..789b3afb3423 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -140,6 +140,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
140 candidate->first = candidate->last = index; 140 candidate->first = candidate->last = index;
141 candidate->offset_first = from; 141 candidate->offset_first = from;
142 candidate->to_last = to; 142 candidate->to_last = to;
143 INIT_LIST_HEAD(&candidate->link);
143 candidate->usage = 1; 144 candidate->usage = 1;
144 candidate->state = AFS_WBACK_PENDING; 145 candidate->state = AFS_WBACK_PENDING;
145 init_waitqueue_head(&candidate->waitq); 146 init_waitqueue_head(&candidate->waitq);
diff --git a/fs/aio.c b/fs/aio.c
index fc557a3be0a9..e29ec485af25 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -34,8 +34,6 @@
34#include <linux/security.h> 34#include <linux/security.h>
35#include <linux/eventfd.h> 35#include <linux/eventfd.h>
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/mempool.h>
38#include <linux/hash.h>
39#include <linux/compat.h> 37#include <linux/compat.h>
40 38
41#include <asm/kmap_types.h> 39#include <asm/kmap_types.h>
@@ -65,14 +63,6 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
65static DEFINE_SPINLOCK(fput_lock); 63static DEFINE_SPINLOCK(fput_lock);
66static LIST_HEAD(fput_head); 64static LIST_HEAD(fput_head);
67 65
68#define AIO_BATCH_HASH_BITS 3 /* allocated on-stack, so don't go crazy */
69#define AIO_BATCH_HASH_SIZE (1 << AIO_BATCH_HASH_BITS)
70struct aio_batch_entry {
71 struct hlist_node list;
72 struct address_space *mapping;
73};
74mempool_t *abe_pool;
75
76static void aio_kick_handler(struct work_struct *); 66static void aio_kick_handler(struct work_struct *);
77static void aio_queue_work(struct kioctx *); 67static void aio_queue_work(struct kioctx *);
78 68
@@ -85,9 +75,8 @@ static int __init aio_setup(void)
85 kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); 75 kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
86 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); 76 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
87 77
88 aio_wq = create_workqueue("aio"); 78 aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */
89 abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); 79 BUG_ON(!aio_wq);
90 BUG_ON(!aio_wq || !abe_pool);
91 80
92 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); 81 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
93 82
@@ -239,15 +228,23 @@ static void __put_ioctx(struct kioctx *ctx)
239 call_rcu(&ctx->rcu_head, ctx_rcu_free); 228 call_rcu(&ctx->rcu_head, ctx_rcu_free);
240} 229}
241 230
242#define get_ioctx(kioctx) do { \ 231static inline void get_ioctx(struct kioctx *kioctx)
243 BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ 232{
244 atomic_inc(&(kioctx)->users); \ 233 BUG_ON(atomic_read(&kioctx->users) <= 0);
245} while (0) 234 atomic_inc(&kioctx->users);
246#define put_ioctx(kioctx) do { \ 235}
247 BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ 236
248 if (unlikely(atomic_dec_and_test(&(kioctx)->users))) \ 237static inline int try_get_ioctx(struct kioctx *kioctx)
249 __put_ioctx(kioctx); \ 238{
250} while (0) 239 return atomic_inc_not_zero(&kioctx->users);
240}
241
242static inline void put_ioctx(struct kioctx *kioctx)
243{
244 BUG_ON(atomic_read(&kioctx->users) <= 0);
245 if (unlikely(atomic_dec_and_test(&kioctx->users)))
246 __put_ioctx(kioctx);
247}
251 248
252/* ioctx_alloc 249/* ioctx_alloc
253 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. 250 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.
@@ -512,7 +509,7 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
512 ctx->reqs_active--; 509 ctx->reqs_active--;
513 510
514 if (unlikely(!ctx->reqs_active && ctx->dead)) 511 if (unlikely(!ctx->reqs_active && ctx->dead))
515 wake_up(&ctx->wait); 512 wake_up_all(&ctx->wait);
516} 513}
517 514
518static void aio_fput_routine(struct work_struct *data) 515static void aio_fput_routine(struct work_struct *data)
@@ -569,7 +566,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
569 spin_lock(&fput_lock); 566 spin_lock(&fput_lock);
570 list_add(&req->ki_list, &fput_head); 567 list_add(&req->ki_list, &fput_head);
571 spin_unlock(&fput_lock); 568 spin_unlock(&fput_lock);
572 queue_work(aio_wq, &fput_work); 569 schedule_work(&fput_work);
573 } else { 570 } else {
574 req->ki_filp = NULL; 571 req->ki_filp = NULL;
575 really_put_req(ctx, req); 572 really_put_req(ctx, req);
@@ -601,8 +598,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
601 rcu_read_lock(); 598 rcu_read_lock();
602 599
603 hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { 600 hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
604 if (ctx->user_id == ctx_id && !ctx->dead) { 601 /*
605 get_ioctx(ctx); 602 * RCU protects us against accessing freed memory but
603 * we have to be careful not to get a reference when the
604 * reference count already dropped to 0 (ctx->dead test
605 * is unreliable because of races).
606 */
607 if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){
606 ret = ctx; 608 ret = ctx;
607 break; 609 break;
608 } 610 }
@@ -1216,7 +1218,7 @@ static void io_destroy(struct kioctx *ioctx)
1216 * by other CPUs at this point. Right now, we rely on the 1218 * by other CPUs at this point. Right now, we rely on the
1217 * locking done by the above calls to ensure this consistency. 1219 * locking done by the above calls to ensure this consistency.
1218 */ 1220 */
1219 wake_up(&ioctx->wait); 1221 wake_up_all(&ioctx->wait);
1220 put_ioctx(ioctx); /* once for the lookup */ 1222 put_ioctx(ioctx); /* once for the lookup */
1221} 1223}
1222 1224
@@ -1512,57 +1514,8 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
1512 return 0; 1514 return 0;
1513} 1515}
1514 1516
1515static void aio_batch_add(struct address_space *mapping,
1516 struct hlist_head *batch_hash)
1517{
1518 struct aio_batch_entry *abe;
1519 struct hlist_node *pos;
1520 unsigned bucket;
1521
1522 bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
1523 hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
1524 if (abe->mapping == mapping)
1525 return;
1526 }
1527
1528 abe = mempool_alloc(abe_pool, GFP_KERNEL);
1529
1530 /*
1531 * we should be using igrab here, but
1532 * we don't want to hammer on the global
1533 * inode spinlock just to take an extra
1534 * reference on a file that we must already
1535 * have a reference to.
1536 *
1537 * When we're called, we always have a reference
1538 * on the file, so we must always have a reference
1539 * on the inode, so ihold() is safe here.
1540 */
1541 ihold(mapping->host);
1542 abe->mapping = mapping;
1543 hlist_add_head(&abe->list, &batch_hash[bucket]);
1544 return;
1545}
1546
1547static void aio_batch_free(struct hlist_head *batch_hash)
1548{
1549 struct aio_batch_entry *abe;
1550 struct hlist_node *pos, *n;
1551 int i;
1552
1553 for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
1554 hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
1555 blk_run_address_space(abe->mapping);
1556 iput(abe->mapping->host);
1557 hlist_del(&abe->list);
1558 mempool_free(abe, abe_pool);
1559 }
1560 }
1561}
1562
1563static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1517static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1564 struct iocb *iocb, struct hlist_head *batch_hash, 1518 struct iocb *iocb, bool compat)
1565 bool compat)
1566{ 1519{
1567 struct kiocb *req; 1520 struct kiocb *req;
1568 struct file *file; 1521 struct file *file;
@@ -1629,6 +1582,23 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1629 goto out_put_req; 1582 goto out_put_req;
1630 1583
1631 spin_lock_irq(&ctx->ctx_lock); 1584 spin_lock_irq(&ctx->ctx_lock);
1585 /*
1586 * We could have raced with io_destroy() and are currently holding a
1587 * reference to ctx which should be destroyed. We cannot submit IO
1588 * since ctx gets freed as soon as io_submit() puts its reference. The
1589 * check here is reliable: io_destroy() sets ctx->dead before waiting
1590 * for outstanding IO and the barrier between these two is realized by
1591 * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we
1592 * increment ctx->reqs_active before checking for ctx->dead and the
1593 * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we
1594 * don't see ctx->dead set here, io_destroy() waits for our IO to
1595 * finish.
1596 */
1597 if (ctx->dead) {
1598 spin_unlock_irq(&ctx->ctx_lock);
1599 ret = -EINVAL;
1600 goto out_put_req;
1601 }
1632 aio_run_iocb(req); 1602 aio_run_iocb(req);
1633 if (!list_empty(&ctx->run_list)) { 1603 if (!list_empty(&ctx->run_list)) {
1634 /* drain the run list */ 1604 /* drain the run list */
@@ -1636,11 +1606,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1636 ; 1606 ;
1637 } 1607 }
1638 spin_unlock_irq(&ctx->ctx_lock); 1608 spin_unlock_irq(&ctx->ctx_lock);
1639 if (req->ki_opcode == IOCB_CMD_PREAD ||
1640 req->ki_opcode == IOCB_CMD_PREADV ||
1641 req->ki_opcode == IOCB_CMD_PWRITE ||
1642 req->ki_opcode == IOCB_CMD_PWRITEV)
1643 aio_batch_add(file->f_mapping, batch_hash);
1644 1609
1645 aio_put_req(req); /* drop extra ref to req */ 1610 aio_put_req(req); /* drop extra ref to req */
1646 return 0; 1611 return 0;
@@ -1657,7 +1622,7 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1657 struct kioctx *ctx; 1622 struct kioctx *ctx;
1658 long ret = 0; 1623 long ret = 0;
1659 int i; 1624 int i;
1660 struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, }; 1625 struct blk_plug plug;
1661 1626
1662 if (unlikely(nr < 0)) 1627 if (unlikely(nr < 0))
1663 return -EINVAL; 1628 return -EINVAL;
@@ -1674,6 +1639,8 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1674 return -EINVAL; 1639 return -EINVAL;
1675 } 1640 }
1676 1641
1642 blk_start_plug(&plug);
1643
1677 /* 1644 /*
1678 * AKPM: should this return a partial result if some of the IOs were 1645 * AKPM: should this return a partial result if some of the IOs were
1679 * successfully submitted? 1646 * successfully submitted?
@@ -1692,11 +1659,11 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1692 break; 1659 break;
1693 } 1660 }
1694 1661
1695 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat); 1662 ret = io_submit_one(ctx, user_iocb, &tmp, compat);
1696 if (ret) 1663 if (ret)
1697 break; 1664 break;
1698 } 1665 }
1699 aio_batch_free(batch_hash); 1666 blk_finish_plug(&plug);
1700 1667
1701 put_ioctx(ctx); 1668 put_ioctx(ctx);
1702 return i ? i : ret; 1669 return i ? i : ret;
diff --git a/fs/attr.c b/fs/attr.c
index 7ca41811afa1..91dbe2a107f2 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -59,7 +59,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
59 59
60 /* Make sure a caller can chmod. */ 60 /* Make sure a caller can chmod. */
61 if (ia_valid & ATTR_MODE) { 61 if (ia_valid & ATTR_MODE) {
62 if (!is_owner_or_cap(inode)) 62 if (!inode_owner_or_capable(inode))
63 return -EPERM; 63 return -EPERM;
64 /* Also check the setgid bit! */ 64 /* Also check the setgid bit! */
65 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : 65 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
@@ -69,7 +69,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
69 69
70 /* Check for setting the inode time. */ 70 /* Check for setting the inode time. */
71 if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { 71 if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
72 if (!is_owner_or_cap(inode)) 72 if (!inode_owner_or_capable(inode))
73 return -EPERM; 73 return -EPERM;
74 } 74 }
75 75
@@ -128,7 +128,7 @@ EXPORT_SYMBOL(inode_newsize_ok);
128 * setattr_copy must be called with i_mutex held. 128 * setattr_copy must be called with i_mutex held.
129 * 129 *
130 * setattr_copy updates the inode's metadata with that specified 130 * setattr_copy updates the inode's metadata with that specified
131 * in attr. Noticably missing is inode size update, which is more complex 131 * in attr. Noticeably missing is inode size update, which is more complex
132 * as it requires pagecache updates. 132 * as it requires pagecache updates.
133 * 133 *
134 * The inode is not marked as dirty after this operation. The rationale is 134 * The inode is not marked as dirty after this operation. The rationale is
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 54f923792728..475f9c597cb7 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -61,8 +61,6 @@ do { \
61 current->pid, __func__, ##args); \ 61 current->pid, __func__, ##args); \
62} while (0) 62} while (0)
63 63
64extern spinlock_t autofs4_lock;
65
66/* Unified info structure. This is pointed to by both the dentry and 64/* Unified info structure. This is pointed to by both the dentry and
67 inode structures. Each file in the filesystem has an instance of this 65 inode structures. Each file in the filesystem has an instance of this
68 structure. It holds a reference to the dentry, so dentries are never 66 structure. It holds a reference to the dentry, so dentries are never
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 1442da4860e5..509fe1eb66ae 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -372,6 +372,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
372 return -EBUSY; 372 return -EBUSY;
373 } else { 373 } else {
374 struct file *pipe = fget(pipefd); 374 struct file *pipe = fget(pipefd);
375 if (!pipe) {
376 err = -EBADF;
377 goto out;
378 }
375 if (!pipe->f_op || !pipe->f_op->write) { 379 if (!pipe->f_op || !pipe->f_op->write) {
376 err = -EPIPE; 380 err = -EPIPE;
377 fput(pipe); 381 fput(pipe);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index f43100b9662b..450f529a4eae 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -87,18 +87,70 @@ done:
87} 87}
88 88
89/* 89/*
90 * Calculate and dget next entry in the subdirs list under root.
91 */
92static struct dentry *get_next_positive_subdir(struct dentry *prev,
93 struct dentry *root)
94{
95 struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
96 struct list_head *next;
97 struct dentry *p, *q;
98
99 spin_lock(&sbi->lookup_lock);
100
101 if (prev == NULL) {
102 spin_lock(&root->d_lock);
103 prev = dget_dlock(root);
104 next = prev->d_subdirs.next;
105 p = prev;
106 goto start;
107 }
108
109 p = prev;
110 spin_lock(&p->d_lock);
111again:
112 next = p->d_u.d_child.next;
113start:
114 if (next == &root->d_subdirs) {
115 spin_unlock(&p->d_lock);
116 spin_unlock(&sbi->lookup_lock);
117 dput(prev);
118 return NULL;
119 }
120
121 q = list_entry(next, struct dentry, d_u.d_child);
122
123 spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
124 /* Negative dentry - try next */
125 if (!simple_positive(q)) {
126 spin_unlock(&p->d_lock);
127 p = q;
128 goto again;
129 }
130 dget_dlock(q);
131 spin_unlock(&q->d_lock);
132 spin_unlock(&p->d_lock);
133 spin_unlock(&sbi->lookup_lock);
134
135 dput(prev);
136
137 return q;
138}
139
140/*
90 * Calculate and dget next entry in top down tree traversal. 141 * Calculate and dget next entry in top down tree traversal.
91 */ 142 */
92static struct dentry *get_next_positive_dentry(struct dentry *prev, 143static struct dentry *get_next_positive_dentry(struct dentry *prev,
93 struct dentry *root) 144 struct dentry *root)
94{ 145{
146 struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
95 struct list_head *next; 147 struct list_head *next;
96 struct dentry *p, *ret; 148 struct dentry *p, *ret;
97 149
98 if (prev == NULL) 150 if (prev == NULL)
99 return dget(root); 151 return dget(root);
100 152
101 spin_lock(&autofs4_lock); 153 spin_lock(&sbi->lookup_lock);
102relock: 154relock:
103 p = prev; 155 p = prev;
104 spin_lock(&p->d_lock); 156 spin_lock(&p->d_lock);
@@ -110,7 +162,7 @@ again:
110 162
111 if (p == root) { 163 if (p == root) {
112 spin_unlock(&p->d_lock); 164 spin_unlock(&p->d_lock);
113 spin_unlock(&autofs4_lock); 165 spin_unlock(&sbi->lookup_lock);
114 dput(prev); 166 dput(prev);
115 return NULL; 167 return NULL;
116 } 168 }
@@ -140,7 +192,7 @@ again:
140 dget_dlock(ret); 192 dget_dlock(ret);
141 spin_unlock(&ret->d_lock); 193 spin_unlock(&ret->d_lock);
142 spin_unlock(&p->d_lock); 194 spin_unlock(&p->d_lock);
143 spin_unlock(&autofs4_lock); 195 spin_unlock(&sbi->lookup_lock);
144 196
145 dput(prev); 197 dput(prev);
146 198
@@ -290,11 +342,8 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
290 spin_lock(&sbi->fs_lock); 342 spin_lock(&sbi->fs_lock);
291 ino = autofs4_dentry_ino(root); 343 ino = autofs4_dentry_ino(root);
292 /* No point expiring a pending mount */ 344 /* No point expiring a pending mount */
293 if (ino->flags & AUTOFS_INF_PENDING) { 345 if (ino->flags & AUTOFS_INF_PENDING)
294 spin_unlock(&sbi->fs_lock); 346 goto out;
295 return NULL;
296 }
297 managed_dentry_set_transit(root);
298 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { 347 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
299 struct autofs_info *ino = autofs4_dentry_ino(root); 348 struct autofs_info *ino = autofs4_dentry_ino(root);
300 ino->flags |= AUTOFS_INF_EXPIRING; 349 ino->flags |= AUTOFS_INF_EXPIRING;
@@ -302,7 +351,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
302 spin_unlock(&sbi->fs_lock); 351 spin_unlock(&sbi->fs_lock);
303 return root; 352 return root;
304 } 353 }
305 managed_dentry_clear_transit(root); 354out:
306 spin_unlock(&sbi->fs_lock); 355 spin_unlock(&sbi->fs_lock);
307 dput(root); 356 dput(root);
308 357
@@ -336,13 +385,12 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
336 timeout = sbi->exp_timeout; 385 timeout = sbi->exp_timeout;
337 386
338 dentry = NULL; 387 dentry = NULL;
339 while ((dentry = get_next_positive_dentry(dentry, root))) { 388 while ((dentry = get_next_positive_subdir(dentry, root))) {
340 spin_lock(&sbi->fs_lock); 389 spin_lock(&sbi->fs_lock);
341 ino = autofs4_dentry_ino(dentry); 390 ino = autofs4_dentry_ino(dentry);
342 /* No point expiring a pending mount */ 391 /* No point expiring a pending mount */
343 if (ino->flags & AUTOFS_INF_PENDING) 392 if (ino->flags & AUTOFS_INF_PENDING)
344 goto cont; 393 goto next;
345 managed_dentry_set_transit(dentry);
346 394
347 /* 395 /*
348 * Case 1: (i) indirect mount or top level pseudo direct mount 396 * Case 1: (i) indirect mount or top level pseudo direct mount
@@ -402,8 +450,6 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
402 } 450 }
403 } 451 }
404next: 452next:
405 managed_dentry_clear_transit(dentry);
406cont:
407 spin_unlock(&sbi->fs_lock); 453 spin_unlock(&sbi->fs_lock);
408 } 454 }
409 return NULL; 455 return NULL;
@@ -415,13 +461,13 @@ found:
415 ino->flags |= AUTOFS_INF_EXPIRING; 461 ino->flags |= AUTOFS_INF_EXPIRING;
416 init_completion(&ino->expire_complete); 462 init_completion(&ino->expire_complete);
417 spin_unlock(&sbi->fs_lock); 463 spin_unlock(&sbi->fs_lock);
418 spin_lock(&autofs4_lock); 464 spin_lock(&sbi->lookup_lock);
419 spin_lock(&expired->d_parent->d_lock); 465 spin_lock(&expired->d_parent->d_lock);
420 spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED); 466 spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
421 list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); 467 list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
422 spin_unlock(&expired->d_lock); 468 spin_unlock(&expired->d_lock);
423 spin_unlock(&expired->d_parent->d_lock); 469 spin_unlock(&expired->d_parent->d_lock);
424 spin_unlock(&autofs4_lock); 470 spin_unlock(&sbi->lookup_lock);
425 return expired; 471 return expired;
426} 472}
427 473
@@ -484,8 +530,6 @@ int autofs4_expire_run(struct super_block *sb,
484 spin_lock(&sbi->fs_lock); 530 spin_lock(&sbi->fs_lock);
485 ino = autofs4_dentry_ino(dentry); 531 ino = autofs4_dentry_ino(dentry);
486 ino->flags &= ~AUTOFS_INF_EXPIRING; 532 ino->flags &= ~AUTOFS_INF_EXPIRING;
487 if (!d_unhashed(dentry))
488 managed_dentry_clear_transit(dentry);
489 complete_all(&ino->expire_complete); 533 complete_all(&ino->expire_complete);
490 spin_unlock(&sbi->fs_lock); 534 spin_unlock(&sbi->fs_lock);
491 535
@@ -513,9 +557,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
513 spin_lock(&sbi->fs_lock); 557 spin_lock(&sbi->fs_lock);
514 ino->flags &= ~AUTOFS_INF_EXPIRING; 558 ino->flags &= ~AUTOFS_INF_EXPIRING;
515 spin_lock(&dentry->d_lock); 559 spin_lock(&dentry->d_lock);
516 if (ret) 560 if (!ret) {
517 __managed_dentry_clear_transit(dentry);
518 else {
519 if ((IS_ROOT(dentry) || 561 if ((IS_ROOT(dentry) ||
520 (autofs_type_indirect(sbi->type) && 562 (autofs_type_indirect(sbi->type) &&
521 IS_ROOT(dentry->d_parent))) && 563 IS_ROOT(dentry->d_parent))) &&
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 014e7aba3b08..f55ae23b137e 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -23,8 +23,6 @@
23 23
24#include "autofs_i.h" 24#include "autofs_i.h"
25 25
26DEFINE_SPINLOCK(autofs4_lock);
27
28static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); 26static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
29static int autofs4_dir_unlink(struct inode *,struct dentry *); 27static int autofs4_dir_unlink(struct inode *,struct dentry *);
30static int autofs4_dir_rmdir(struct inode *,struct dentry *); 28static int autofs4_dir_rmdir(struct inode *,struct dentry *);
@@ -36,7 +34,7 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
36static int autofs4_dir_open(struct inode *inode, struct file *file); 34static int autofs4_dir_open(struct inode *inode, struct file *file);
37static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 35static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
38static struct vfsmount *autofs4_d_automount(struct path *); 36static struct vfsmount *autofs4_d_automount(struct path *);
39static int autofs4_d_manage(struct dentry *, bool, bool); 37static int autofs4_d_manage(struct dentry *, bool);
40static void autofs4_dentry_release(struct dentry *); 38static void autofs4_dentry_release(struct dentry *);
41 39
42const struct file_operations autofs4_root_operations = { 40const struct file_operations autofs4_root_operations = {
@@ -125,15 +123,15 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
125 * autofs file system so just let the libfs routines handle 123 * autofs file system so just let the libfs routines handle
126 * it. 124 * it.
127 */ 125 */
128 spin_lock(&autofs4_lock); 126 spin_lock(&sbi->lookup_lock);
129 spin_lock(&dentry->d_lock); 127 spin_lock(&dentry->d_lock);
130 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { 128 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
131 spin_unlock(&dentry->d_lock); 129 spin_unlock(&dentry->d_lock);
132 spin_unlock(&autofs4_lock); 130 spin_unlock(&sbi->lookup_lock);
133 return -ENOENT; 131 return -ENOENT;
134 } 132 }
135 spin_unlock(&dentry->d_lock); 133 spin_unlock(&dentry->d_lock);
136 spin_unlock(&autofs4_lock); 134 spin_unlock(&sbi->lookup_lock);
137 135
138out: 136out:
139 return dcache_dir_open(inode, file); 137 return dcache_dir_open(inode, file);
@@ -171,7 +169,6 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
171 const unsigned char *str = name->name; 169 const unsigned char *str = name->name;
172 struct list_head *p, *head; 170 struct list_head *p, *head;
173 171
174 spin_lock(&autofs4_lock);
175 spin_lock(&sbi->lookup_lock); 172 spin_lock(&sbi->lookup_lock);
176 head = &sbi->active_list; 173 head = &sbi->active_list;
177 list_for_each(p, head) { 174 list_for_each(p, head) {
@@ -204,14 +201,12 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
204 dget_dlock(active); 201 dget_dlock(active);
205 spin_unlock(&active->d_lock); 202 spin_unlock(&active->d_lock);
206 spin_unlock(&sbi->lookup_lock); 203 spin_unlock(&sbi->lookup_lock);
207 spin_unlock(&autofs4_lock);
208 return active; 204 return active;
209 } 205 }
210next: 206next:
211 spin_unlock(&active->d_lock); 207 spin_unlock(&active->d_lock);
212 } 208 }
213 spin_unlock(&sbi->lookup_lock); 209 spin_unlock(&sbi->lookup_lock);
214 spin_unlock(&autofs4_lock);
215 210
216 return NULL; 211 return NULL;
217} 212}
@@ -226,7 +221,6 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
226 const unsigned char *str = name->name; 221 const unsigned char *str = name->name;
227 struct list_head *p, *head; 222 struct list_head *p, *head;
228 223
229 spin_lock(&autofs4_lock);
230 spin_lock(&sbi->lookup_lock); 224 spin_lock(&sbi->lookup_lock);
231 head = &sbi->expiring_list; 225 head = &sbi->expiring_list;
232 list_for_each(p, head) { 226 list_for_each(p, head) {
@@ -259,14 +253,12 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
259 dget_dlock(expiring); 253 dget_dlock(expiring);
260 spin_unlock(&expiring->d_lock); 254 spin_unlock(&expiring->d_lock);
261 spin_unlock(&sbi->lookup_lock); 255 spin_unlock(&sbi->lookup_lock);
262 spin_unlock(&autofs4_lock);
263 return expiring; 256 return expiring;
264 } 257 }
265next: 258next:
266 spin_unlock(&expiring->d_lock); 259 spin_unlock(&expiring->d_lock);
267 } 260 }
268 spin_unlock(&sbi->lookup_lock); 261 spin_unlock(&sbi->lookup_lock);
269 spin_unlock(&autofs4_lock);
270 262
271 return NULL; 263 return NULL;
272} 264}
@@ -275,17 +267,16 @@ static int autofs4_mount_wait(struct dentry *dentry)
275{ 267{
276 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 268 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
277 struct autofs_info *ino = autofs4_dentry_ino(dentry); 269 struct autofs_info *ino = autofs4_dentry_ino(dentry);
278 int status; 270 int status = 0;
279 271
280 if (ino->flags & AUTOFS_INF_PENDING) { 272 if (ino->flags & AUTOFS_INF_PENDING) {
281 DPRINTK("waiting for mount name=%.*s", 273 DPRINTK("waiting for mount name=%.*s",
282 dentry->d_name.len, dentry->d_name.name); 274 dentry->d_name.len, dentry->d_name.name);
283 status = autofs4_wait(sbi, dentry, NFY_MOUNT); 275 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
284 DPRINTK("mount wait done status=%d", status); 276 DPRINTK("mount wait done status=%d", status);
285 ino->last_used = jiffies;
286 return status;
287 } 277 }
288 return 0; 278 ino->last_used = jiffies;
279 return status;
289} 280}
290 281
291static int do_expire_wait(struct dentry *dentry) 282static int do_expire_wait(struct dentry *dentry)
@@ -319,9 +310,12 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
319 */ 310 */
320 if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { 311 if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
321 struct dentry *parent = dentry->d_parent; 312 struct dentry *parent = dentry->d_parent;
313 struct autofs_info *ino;
322 struct dentry *new = d_lookup(parent, &dentry->d_name); 314 struct dentry *new = d_lookup(parent, &dentry->d_name);
323 if (!new) 315 if (!new)
324 return NULL; 316 return NULL;
317 ino = autofs4_dentry_ino(new);
318 ino->last_used = jiffies;
325 dput(path->dentry); 319 dput(path->dentry);
326 path->dentry = new; 320 path->dentry = new;
327 } 321 }
@@ -338,18 +332,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
338 DPRINTK("dentry=%p %.*s", 332 DPRINTK("dentry=%p %.*s",
339 dentry, dentry->d_name.len, dentry->d_name.name); 333 dentry, dentry->d_name.len, dentry->d_name.name);
340 334
341 /*
342 * Someone may have manually umounted this or it was a submount
343 * that has gone away.
344 */
345 spin_lock(&dentry->d_lock);
346 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
347 if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
348 (dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
349 __managed_dentry_set_transit(path->dentry);
350 }
351 spin_unlock(&dentry->d_lock);
352
353 /* The daemon never triggers a mount. */ 335 /* The daemon never triggers a mount. */
354 if (autofs4_oz_mode(sbi)) 336 if (autofs4_oz_mode(sbi))
355 return NULL; 337 return NULL;
@@ -418,18 +400,17 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
418done: 400done:
419 if (!(ino->flags & AUTOFS_INF_EXPIRING)) { 401 if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
420 /* 402 /*
421 * Any needed mounting has been completed and the path updated 403 * Any needed mounting has been completed and the path
422 * so turn this into a normal dentry so we don't continually 404 * updated so clear DCACHE_NEED_AUTOMOUNT so we don't
423 * call ->d_automount() and ->d_manage(). 405 * call ->d_automount() on rootless multi-mounts since
424 */ 406 * it can lead to an incorrect ELOOP error return.
425 spin_lock(&dentry->d_lock); 407 *
426 __managed_dentry_clear_transit(dentry);
427 /*
428 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and 408 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
429 * symlinks as in all other cases the dentry will be covered by 409 * symlinks as in all other cases the dentry will be covered by
430 * an actual mount so ->d_automount() won't be called during 410 * an actual mount so ->d_automount() won't be called during
431 * the follow. 411 * the follow.
432 */ 412 */
413 spin_lock(&dentry->d_lock);
433 if ((!d_mountpoint(dentry) && 414 if ((!d_mountpoint(dentry) &&
434 !list_empty(&dentry->d_subdirs)) || 415 !list_empty(&dentry->d_subdirs)) ||
435 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) 416 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
@@ -446,7 +427,7 @@ done:
446 return NULL; 427 return NULL;
447} 428}
448 429
449int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk) 430int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
450{ 431{
451 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 432 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
452 433
@@ -454,7 +435,9 @@ int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
454 dentry, dentry->d_name.len, dentry->d_name.name); 435 dentry, dentry->d_name.len, dentry->d_name.name);
455 436
456 /* The daemon never waits. */ 437 /* The daemon never waits. */
457 if (autofs4_oz_mode(sbi) || mounting_here) { 438 if (autofs4_oz_mode(sbi)) {
439 if (rcu_walk)
440 return 0;
458 if (!d_mountpoint(dentry)) 441 if (!d_mountpoint(dentry))
459 return -EISDIR; 442 return -EISDIR;
460 return 0; 443 return 0;
@@ -612,12 +595,12 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
612 595
613 dir->i_mtime = CURRENT_TIME; 596 dir->i_mtime = CURRENT_TIME;
614 597
615 spin_lock(&autofs4_lock); 598 spin_lock(&sbi->lookup_lock);
616 autofs4_add_expiring(dentry); 599 __autofs4_add_expiring(dentry);
617 spin_lock(&dentry->d_lock); 600 spin_lock(&dentry->d_lock);
618 __d_drop(dentry); 601 __d_drop(dentry);
619 spin_unlock(&dentry->d_lock); 602 spin_unlock(&dentry->d_lock);
620 spin_unlock(&autofs4_lock); 603 spin_unlock(&sbi->lookup_lock);
621 604
622 return 0; 605 return 0;
623} 606}
@@ -629,7 +612,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
629 * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves 612 * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves
630 * of the directory tree. There is no need to clear the automount flag 613 * of the directory tree. There is no need to clear the automount flag
631 * following a mount or restore it after an expire because these mounts 614 * following a mount or restore it after an expire because these mounts
632 * are always covered. However, it is neccessary to ensure that these 615 * are always covered. However, it is necessary to ensure that these
633 * flags are clear on non-empty directories to avoid unnecessary calls 616 * flags are clear on non-empty directories to avoid unnecessary calls
634 * during path walks. 617 * during path walks.
635 */ 618 */
@@ -686,20 +669,17 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
686 if (!autofs4_oz_mode(sbi)) 669 if (!autofs4_oz_mode(sbi))
687 return -EACCES; 670 return -EACCES;
688 671
689 spin_lock(&autofs4_lock);
690 spin_lock(&sbi->lookup_lock); 672 spin_lock(&sbi->lookup_lock);
691 spin_lock(&dentry->d_lock); 673 spin_lock(&dentry->d_lock);
692 if (!list_empty(&dentry->d_subdirs)) { 674 if (!list_empty(&dentry->d_subdirs)) {
693 spin_unlock(&dentry->d_lock); 675 spin_unlock(&dentry->d_lock);
694 spin_unlock(&sbi->lookup_lock); 676 spin_unlock(&sbi->lookup_lock);
695 spin_unlock(&autofs4_lock);
696 return -ENOTEMPTY; 677 return -ENOTEMPTY;
697 } 678 }
698 __autofs4_add_expiring(dentry); 679 __autofs4_add_expiring(dentry);
699 spin_unlock(&sbi->lookup_lock);
700 __d_drop(dentry); 680 __d_drop(dentry);
701 spin_unlock(&dentry->d_lock); 681 spin_unlock(&dentry->d_lock);
702 spin_unlock(&autofs4_lock); 682 spin_unlock(&sbi->lookup_lock);
703 683
704 if (sbi->version < 5) 684 if (sbi->version < 5)
705 autofs_clear_leaf_automount_flags(dentry); 685 autofs_clear_leaf_automount_flags(dentry);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 56010056b2e6..25435987d6ae 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -197,12 +197,12 @@ rename_retry:
197 197
198 seq = read_seqbegin(&rename_lock); 198 seq = read_seqbegin(&rename_lock);
199 rcu_read_lock(); 199 rcu_read_lock();
200 spin_lock(&autofs4_lock); 200 spin_lock(&sbi->fs_lock);
201 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) 201 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
202 len += tmp->d_name.len + 1; 202 len += tmp->d_name.len + 1;
203 203
204 if (!len || --len > NAME_MAX) { 204 if (!len || --len > NAME_MAX) {
205 spin_unlock(&autofs4_lock); 205 spin_unlock(&sbi->fs_lock);
206 rcu_read_unlock(); 206 rcu_read_unlock();
207 if (read_seqretry(&rename_lock, seq)) 207 if (read_seqretry(&rename_lock, seq))
208 goto rename_retry; 208 goto rename_retry;
@@ -218,7 +218,7 @@ rename_retry:
218 p -= tmp->d_name.len; 218 p -= tmp->d_name.len;
219 strncpy(p, tmp->d_name.name, tmp->d_name.len); 219 strncpy(p, tmp->d_name.name, tmp->d_name.len);
220 } 220 }
221 spin_unlock(&autofs4_lock); 221 spin_unlock(&sbi->fs_lock);
222 rcu_read_unlock(); 222 rcu_read_unlock();
223 if (read_seqretry(&rename_lock, seq)) 223 if (read_seqretry(&rename_lock, seq))
224 goto rename_retry; 224 goto rename_retry;
diff --git a/fs/befs/ChangeLog b/fs/befs/ChangeLog
index ce8c787916be..75a461cfaca6 100644
--- a/fs/befs/ChangeLog
+++ b/fs/befs/ChangeLog
@@ -24,7 +24,7 @@ Version 0.9 (2002-03-14)
24 24
25Version 0.64 (2002-02-07) 25Version 0.64 (2002-02-07)
26========== 26==========
27* Did the string comparision really right this time (btree.c) [WD] 27* Did the string comparison really right this time (btree.c) [WD]
28 28
29* Fixed up some places where I assumed that a long int could hold 29* Fixed up some places where I assumed that a long int could hold
30 a pointer value. (btree.c) [WD] 30 a pointer value. (btree.c) [WD]
@@ -114,7 +114,7 @@ Version 0.6 (2001-12-15)
114 More flexible. Will soon be controllable at mount time 114 More flexible. Will soon be controllable at mount time
115 (see TODO). [WD] 115 (see TODO). [WD]
116 116
117* Rewrote datastream positon lookups. 117* Rewrote datastream position lookups.
118 (datastream.c) [WD] 118 (datastream.c) [WD]
119 119
120* Moved the TODO list to its own file. 120* Moved the TODO list to its own file.
@@ -150,7 +150,7 @@ Version 0.50 (2001-11-13)
150* Anton also told me that the blocksize is not allowed to be larger than 150* Anton also told me that the blocksize is not allowed to be larger than
151 the page size in linux, which is 4k i386. Oops. Added a test for 151 the page size in linux, which is 4k i386. Oops. Added a test for
152 (blocksize > PAGE_SIZE), and refuse to mount in that case. What this 152 (blocksize > PAGE_SIZE), and refuse to mount in that case. What this
153 practicaly means is that 8k blocksize volumes won't work without a major 153 practically means is that 8k blocksize volumes won't work without a major
154 restructuring of the driver (or an alpha or other 64bit hardware). [WD] 154 restructuring of the driver (or an alpha or other 64bit hardware). [WD]
155 155
156* Cleaned up the befs_count_blocks() function. Much smarter now. 156* Cleaned up the befs_count_blocks() function. Much smarter now.
@@ -183,7 +183,7 @@ Version 0.45 (2001-10-29)
183 structures into the generic pointer fields of the public structures 183 structures into the generic pointer fields of the public structures
184 with kmalloc(). put_super and put_inode free them. This allows us not 184 with kmalloc(). put_super and put_inode free them. This allows us not
185 to have to touch the definitions of the public structures in 185 to have to touch the definitions of the public structures in
186 include/linux/fs.h. Also, befs_inode_info is huge (becuase of the 186 include/linux/fs.h. Also, befs_inode_info is huge (because of the
187 symlink string). (super.c, inode.c, befs_fs.h) [WD] 187 symlink string). (super.c, inode.c, befs_fs.h) [WD]
188 188
189* Fixed a thinko that was corrupting file reads after the first block_run 189* Fixed a thinko that was corrupting file reads after the first block_run
@@ -404,7 +404,7 @@ Version 0.4 (2001-10-28)
404 404
405* Fixed compile errors on 2.4.1 kernel (WD) 405* Fixed compile errors on 2.4.1 kernel (WD)
406 Resolve rejected patches 406 Resolve rejected patches
407 Accomodate changed NLS interface (util.h) 407 Accommodate changed NLS interface (util.h)
408 Needed to include <linux/slab.h> in most files 408 Needed to include <linux/slab.h> in most files
409 Makefile changes 409 Makefile changes
410 fs/Config.in changes 410 fs/Config.in changes
diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h
index 7893eaa1e58c..eb557d9dc8be 100644
--- a/fs/befs/befs_fs_types.h
+++ b/fs/befs/befs_fs_types.h
@@ -234,7 +234,7 @@ typedef struct {
234} PACKED befs_btree_super; 234} PACKED befs_btree_super;
235 235
236/* 236/*
237 * Header stucture of each btree node 237 * Header structure of each btree node
238 */ 238 */
239typedef struct { 239typedef struct {
240 fs64 left; 240 fs64 left;
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 4202db7496cb..a66c9b1136e0 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * Licensed under the GNU GPL. See the file COPYING for details. 6 * Licensed under the GNU GPL. See the file COPYING for details.
7 * 7 *
8 * 2002-02-05: Sergey S. Kostyliov added binary search withing 8 * 2002-02-05: Sergey S. Kostyliov added binary search within
9 * btree nodes. 9 * btree nodes.
10 * 10 *
11 * Many thanks to: 11 * Many thanks to:
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b1d0c794747b..54b8c28bebc8 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -75,7 +75,6 @@ static const struct inode_operations befs_dir_inode_operations = {
75 75
76static const struct address_space_operations befs_aops = { 76static const struct address_space_operations befs_aops = {
77 .readpage = befs_readpage, 77 .readpage = befs_readpage,
78 .sync_page = block_sync_page,
79 .bmap = befs_bmap, 78 .bmap = befs_bmap,
80}; 79};
81 80
@@ -735,7 +734,7 @@ parse_options(char *options, befs_mount_options * opts)
735 734
736/* This function has the responsibiltiy of getting the 735/* This function has the responsibiltiy of getting the
737 * filesystem ready for unmounting. 736 * filesystem ready for unmounting.
738 * Basicly, we free everything that we allocated in 737 * Basically, we free everything that we allocated in
739 * befs_read_inode 738 * befs_read_inode
740 */ 739 */
741static void 740static void
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 685ecff3ab31..b14cebfd9047 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -97,7 +97,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
97 if (!inode) 97 if (!inode)
98 return -ENOSPC; 98 return -ENOSPC;
99 mutex_lock(&info->bfs_lock); 99 mutex_lock(&info->bfs_lock);
100 ino = find_first_zero_bit(info->si_imap, info->si_lasti); 100 ino = find_first_zero_bit(info->si_imap, info->si_lasti + 1);
101 if (ino > info->si_lasti) { 101 if (ino > info->si_lasti) {
102 mutex_unlock(&info->bfs_lock); 102 mutex_unlock(&info->bfs_lock);
103 iput(inode); 103 iput(inode);
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index eb67edd0f8ea..f20e8a71062f 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -186,7 +186,6 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
186const struct address_space_operations bfs_aops = { 186const struct address_space_operations bfs_aops = {
187 .readpage = bfs_readpage, 187 .readpage = bfs_readpage,
188 .writepage = bfs_writepage, 188 .writepage = bfs_writepage,
189 .sync_page = block_sync_page,
190 .write_begin = bfs_write_begin, 189 .write_begin = bfs_write_begin,
191 .write_end = generic_write_end, 190 .write_end = generic_write_end,
192 .bmap = bfs_bmap, 191 .bmap = bfs_bmap,
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d5b640ba6cb1..303983fabfd6 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -570,7 +570,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
570 unsigned long elf_entry; 570 unsigned long elf_entry;
571 unsigned long interp_load_addr = 0; 571 unsigned long interp_load_addr = 0;
572 unsigned long start_code, end_code, start_data, end_data; 572 unsigned long start_code, end_code, start_data, end_data;
573 unsigned long reloc_func_desc = 0; 573 unsigned long reloc_func_desc __maybe_unused = 0;
574 int executable_stack = EXSTACK_DEFAULT; 574 int executable_stack = EXSTACK_DEFAULT;
575 unsigned long def_flags = 0; 575 unsigned long def_flags = 0;
576 struct { 576 struct {
@@ -941,9 +941,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
941 current->mm->start_stack = bprm->p; 941 current->mm->start_stack = bprm->p;
942 942
943#ifdef arch_randomize_brk 943#ifdef arch_randomize_brk
944 if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) 944 if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
945 current->mm->brk = current->mm->start_brk = 945 current->mm->brk = current->mm->start_brk =
946 arch_randomize_brk(current->mm); 946 arch_randomize_brk(current->mm);
947#ifdef CONFIG_COMPAT_BRK
948 current->brk_randomized = 1;
949#endif
950 }
947#endif 951#endif
948 952
949 if (current->personality & MMAP_PAGE_ZERO) { 953 if (current->personality & MMAP_PAGE_ZERO) {
@@ -1906,7 +1910,7 @@ static int elf_core_dump(struct coredump_params *cprm)
1906 segs = current->mm->map_count; 1910 segs = current->mm->map_count;
1907 segs += elf_core_extra_phdrs(); 1911 segs += elf_core_extra_phdrs();
1908 1912
1909 gate_vma = get_gate_vma(current); 1913 gate_vma = get_gate_vma(current->mm);
1910 if (gate_vma != NULL) 1914 if (gate_vma != NULL)
1911 segs++; 1915 segs++;
1912 1916
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 811384bec8de..397d3057d336 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -717,7 +717,7 @@ static int load_flat_file(struct linux_binprm * bprm,
717 * help simplify all this mumbo jumbo 717 * help simplify all this mumbo jumbo
718 * 718 *
719 * We've got two different sections of relocation entries. 719 * We've got two different sections of relocation entries.
720 * The first is the GOT which resides at the begining of the data segment 720 * The first is the GOT which resides at the beginning of the data segment
721 * and is terminated with a -1. This one can be relocated in place. 721 * and is terminated with a -1. This one can be relocated in place.
722 * The second is the extra relocation entries tacked after the image's 722 * The second is the extra relocation entries tacked after the image's
723 * data segment. These require a little more processing as the entry is 723 * data segment. These require a little more processing as the entry is
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index e49cce234c65..9c5e6b2cd11a 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -761,6 +761,9 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size)
761{ 761{
762 unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES); 762 unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES);
763 763
764 if (bs->bio_integrity_pool)
765 return 0;
766
764 bs->bio_integrity_pool = 767 bs->bio_integrity_pool =
765 mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab); 768 mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab);
766 769
diff --git a/fs/bio.c b/fs/bio.c
index 4bd454fa844e..840a0d755248 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -43,7 +43,7 @@ static mempool_t *bio_split_pool __read_mostly;
43 * unsigned short 43 * unsigned short
44 */ 44 */
45#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } 45#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
46struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { 46static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
47 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), 47 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
48}; 48};
49#undef BV 49#undef BV
@@ -111,7 +111,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
111 if (!slab) 111 if (!slab)
112 goto out_unlock; 112 goto out_unlock;
113 113
114 printk("bio: create slab <%s> at %d\n", bslab->name, entry); 114 printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry);
115 bslab->slab = slab; 115 bslab->slab = slab;
116 bslab->slab_ref = 1; 116 bslab->slab_ref = 1;
117 bslab->slab_size = sz; 117 bslab->slab_size = sz;
@@ -1436,7 +1436,7 @@ EXPORT_SYMBOL(bio_flush_dcache_pages);
1436 * preferred way to end I/O on a bio, it takes care of clearing 1436 * preferred way to end I/O on a bio, it takes care of clearing
1437 * BIO_UPTODATE on error. @error is 0 on success, and and one of the 1437 * BIO_UPTODATE on error. @error is 0 on success, and and one of the
1438 * established -Exxxx (-EIO, for instance) error values in case 1438 * established -Exxxx (-EIO, for instance) error values in case
1439 * something went wrong. Noone should call bi_end_io() directly on a 1439 * something went wrong. No one should call bi_end_io() directly on a
1440 * bio unless they own it and thus know that it has an end_io 1440 * bio unless they own it and thus know that it has an end_io
1441 * function. 1441 * function.
1442 **/ 1442 **/
@@ -1636,9 +1636,6 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1636 if (!bs->bio_pool) 1636 if (!bs->bio_pool)
1637 goto bad; 1637 goto bad;
1638 1638
1639 if (bioset_integrity_create(bs, pool_size))
1640 goto bad;
1641
1642 if (!biovec_create_pools(bs, pool_size)) 1639 if (!biovec_create_pools(bs, pool_size))
1643 return bs; 1640 return bs;
1644 1641
@@ -1656,12 +1653,10 @@ static void __init biovec_init_slabs(void)
1656 int size; 1653 int size;
1657 struct biovec_slab *bvs = bvec_slabs + i; 1654 struct biovec_slab *bvs = bvec_slabs + i;
1658 1655
1659#ifndef CONFIG_BLK_DEV_INTEGRITY
1660 if (bvs->nr_vecs <= BIO_INLINE_VECS) { 1656 if (bvs->nr_vecs <= BIO_INLINE_VECS) {
1661 bvs->slab = NULL; 1657 bvs->slab = NULL;
1662 continue; 1658 continue;
1663 } 1659 }
1664#endif
1665 1660
1666 size = bvs->nr_vecs * sizeof(struct bio_vec); 1661 size = bvs->nr_vecs * sizeof(struct bio_vec);
1667 bvs->slab = kmem_cache_create(bvs->name, size, 0, 1662 bvs->slab = kmem_cache_create(bvs->name, size, 0,
@@ -1684,6 +1679,9 @@ static int __init init_bio(void)
1684 if (!fs_bio_set) 1679 if (!fs_bio_set)
1685 panic("bio: can't allocate bios\n"); 1680 panic("bio: can't allocate bios\n");
1686 1681
1682 if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
1683 panic("bio: can't create integrity pool\n");
1684
1687 bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES, 1685 bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
1688 sizeof(struct bio_pair)); 1686 sizeof(struct bio_pair));
1689 if (!bio_split_pool) 1687 if (!bio_split_pool)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 333a7bb4cb9c..5147bdd3b8e1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -55,11 +55,13 @@ EXPORT_SYMBOL(I_BDEV);
55static void bdev_inode_switch_bdi(struct inode *inode, 55static void bdev_inode_switch_bdi(struct inode *inode,
56 struct backing_dev_info *dst) 56 struct backing_dev_info *dst)
57{ 57{
58 spin_lock(&inode_lock); 58 spin_lock(&inode_wb_list_lock);
59 spin_lock(&inode->i_lock);
59 inode->i_data.backing_dev_info = dst; 60 inode->i_data.backing_dev_info = dst;
60 if (inode->i_state & I_DIRTY) 61 if (inode->i_state & I_DIRTY)
61 list_move(&inode->i_wb_list, &dst->wb.b_dirty); 62 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
62 spin_unlock(&inode_lock); 63 spin_unlock(&inode->i_lock);
64 spin_unlock(&inode_wb_list_lock);
63} 65}
64 66
65static sector_t max_block(struct block_device *bdev) 67static sector_t max_block(struct block_device *bdev)
@@ -651,7 +653,7 @@ void bd_forget(struct inode *inode)
651 * @whole: whole block device containing @bdev, may equal @bdev 653 * @whole: whole block device containing @bdev, may equal @bdev
652 * @holder: holder trying to claim @bdev 654 * @holder: holder trying to claim @bdev
653 * 655 *
654 * Test whther @bdev can be claimed by @holder. 656 * Test whether @bdev can be claimed by @holder.
655 * 657 *
656 * CONTEXT: 658 * CONTEXT:
657 * spin_lock(&bdev_lock). 659 * spin_lock(&bdev_lock).
@@ -873,6 +875,11 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
873 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); 875 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
874 if (ret) 876 if (ret)
875 goto out_del; 877 goto out_del;
878 /*
879 * bdev could be deleted beneath us which would implicitly destroy
880 * the holder directory. Hold on to it.
881 */
882 kobject_get(bdev->bd_part->holder_dir);
876 883
877 list_add(&holder->list, &bdev->bd_holder_disks); 884 list_add(&holder->list, &bdev->bd_holder_disks);
878 goto out_unlock; 885 goto out_unlock;
@@ -909,6 +916,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
909 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 916 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
910 del_symlink(bdev->bd_part->holder_dir, 917 del_symlink(bdev->bd_part->holder_dir,
911 &disk_to_dev(disk)->kobj); 918 &disk_to_dev(disk)->kobj);
919 kobject_put(bdev->bd_part->holder_dir);
912 list_del_init(&holder->list); 920 list_del_init(&holder->list);
913 kfree(holder); 921 kfree(holder);
914 } 922 }
@@ -922,14 +930,15 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
922 * flush_disk - invalidates all buffer-cache entries on a disk 930 * flush_disk - invalidates all buffer-cache entries on a disk
923 * 931 *
924 * @bdev: struct block device to be flushed 932 * @bdev: struct block device to be flushed
933 * @kill_dirty: flag to guide handling of dirty inodes
925 * 934 *
926 * Invalidates all buffer-cache entries on a disk. It should be called 935 * Invalidates all buffer-cache entries on a disk. It should be called
927 * when a disk has been changed -- either by a media change or online 936 * when a disk has been changed -- either by a media change or online
928 * resize. 937 * resize.
929 */ 938 */
930static void flush_disk(struct block_device *bdev) 939static void flush_disk(struct block_device *bdev, bool kill_dirty)
931{ 940{
932 if (__invalidate_device(bdev)) { 941 if (__invalidate_device(bdev, kill_dirty)) {
933 char name[BDEVNAME_SIZE] = ""; 942 char name[BDEVNAME_SIZE] = "";
934 943
935 if (bdev->bd_disk) 944 if (bdev->bd_disk)
@@ -966,7 +975,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
966 "%s: detected capacity change from %lld to %lld\n", 975 "%s: detected capacity change from %lld to %lld\n",
967 name, bdev_size, disk_size); 976 name, bdev_size, disk_size);
968 i_size_write(bdev->bd_inode, disk_size); 977 i_size_write(bdev->bd_inode, disk_size);
969 flush_disk(bdev); 978 flush_disk(bdev, false);
970 } 979 }
971} 980}
972EXPORT_SYMBOL(check_disk_size_change); 981EXPORT_SYMBOL(check_disk_size_change);
@@ -1019,7 +1028,7 @@ int check_disk_change(struct block_device *bdev)
1019 if (!(events & DISK_EVENT_MEDIA_CHANGE)) 1028 if (!(events & DISK_EVENT_MEDIA_CHANGE))
1020 return 0; 1029 return 0;
1021 1030
1022 flush_disk(bdev); 1031 flush_disk(bdev, true);
1023 if (bdops->revalidate_disk) 1032 if (bdops->revalidate_disk)
1024 bdops->revalidate_disk(bdev->bd_disk); 1033 bdops->revalidate_disk(bdev->bd_disk);
1025 return 1; 1034 return 1;
@@ -1080,6 +1089,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1080 if (!disk) 1089 if (!disk)
1081 goto out; 1090 goto out;
1082 1091
1092 disk_block_events(disk);
1083 mutex_lock_nested(&bdev->bd_mutex, for_part); 1093 mutex_lock_nested(&bdev->bd_mutex, for_part);
1084 if (!bdev->bd_openers) { 1094 if (!bdev->bd_openers) {
1085 bdev->bd_disk = disk; 1095 bdev->bd_disk = disk;
@@ -1101,10 +1111,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1101 */ 1111 */
1102 disk_put_part(bdev->bd_part); 1112 disk_put_part(bdev->bd_part);
1103 bdev->bd_part = NULL; 1113 bdev->bd_part = NULL;
1104 module_put(disk->fops->owner);
1105 put_disk(disk);
1106 bdev->bd_disk = NULL; 1114 bdev->bd_disk = NULL;
1107 mutex_unlock(&bdev->bd_mutex); 1115 mutex_unlock(&bdev->bd_mutex);
1116 disk_unblock_events(disk);
1117 module_put(disk->fops->owner);
1118 put_disk(disk);
1108 goto restart; 1119 goto restart;
1109 } 1120 }
1110 if (ret) 1121 if (ret)
@@ -1141,9 +1152,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1141 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); 1152 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
1142 } 1153 }
1143 } else { 1154 } else {
1144 module_put(disk->fops->owner);
1145 put_disk(disk);
1146 disk = NULL;
1147 if (bdev->bd_contains == bdev) { 1155 if (bdev->bd_contains == bdev) {
1148 if (bdev->bd_disk->fops->open) { 1156 if (bdev->bd_disk->fops->open) {
1149 ret = bdev->bd_disk->fops->open(bdev, mode); 1157 ret = bdev->bd_disk->fops->open(bdev, mode);
@@ -1153,11 +1161,15 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1153 if (bdev->bd_invalidated) 1161 if (bdev->bd_invalidated)
1154 rescan_partitions(bdev->bd_disk, bdev); 1162 rescan_partitions(bdev->bd_disk, bdev);
1155 } 1163 }
1164 /* only one opener holds refs to the module and disk */
1165 module_put(disk->fops->owner);
1166 put_disk(disk);
1156 } 1167 }
1157 bdev->bd_openers++; 1168 bdev->bd_openers++;
1158 if (for_part) 1169 if (for_part)
1159 bdev->bd_part_count++; 1170 bdev->bd_part_count++;
1160 mutex_unlock(&bdev->bd_mutex); 1171 mutex_unlock(&bdev->bd_mutex);
1172 disk_unblock_events(disk);
1161 return 0; 1173 return 0;
1162 1174
1163 out_clear: 1175 out_clear:
@@ -1170,10 +1182,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1170 bdev->bd_contains = NULL; 1182 bdev->bd_contains = NULL;
1171 out_unlock_bdev: 1183 out_unlock_bdev:
1172 mutex_unlock(&bdev->bd_mutex); 1184 mutex_unlock(&bdev->bd_mutex);
1173 out: 1185 disk_unblock_events(disk);
1174 if (disk) 1186 module_put(disk->fops->owner);
1175 module_put(disk->fops->owner);
1176 put_disk(disk); 1187 put_disk(disk);
1188 out:
1177 bdput(bdev); 1189 bdput(bdev);
1178 1190
1179 return ret; 1191 return ret;
@@ -1215,12 +1227,6 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1215 1227
1216 res = __blkdev_get(bdev, mode, 0); 1228 res = __blkdev_get(bdev, mode, 0);
1217 1229
1218 /* __blkdev_get() may alter read only status, check it afterwards */
1219 if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1220 __blkdev_put(bdev, mode, 0);
1221 res = -EACCES;
1222 }
1223
1224 if (whole) { 1230 if (whole) {
1225 /* finish claiming */ 1231 /* finish claiming */
1226 mutex_lock(&bdev->bd_mutex); 1232 mutex_lock(&bdev->bd_mutex);
@@ -1298,6 +1304,11 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
1298 if (err) 1304 if (err)
1299 return ERR_PTR(err); 1305 return ERR_PTR(err);
1300 1306
1307 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1308 blkdev_put(bdev, mode);
1309 return ERR_PTR(-EACCES);
1310 }
1311
1301 return bdev; 1312 return bdev;
1302} 1313}
1303EXPORT_SYMBOL(blkdev_get_by_path); 1314EXPORT_SYMBOL(blkdev_get_by_path);
@@ -1440,14 +1451,13 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
1440 if (bdev_free) { 1451 if (bdev_free) {
1441 if (bdev->bd_write_holder) { 1452 if (bdev->bd_write_holder) {
1442 disk_unblock_events(bdev->bd_disk); 1453 disk_unblock_events(bdev->bd_disk);
1443 bdev->bd_write_holder = false;
1444 } else
1445 disk_check_events(bdev->bd_disk); 1454 disk_check_events(bdev->bd_disk);
1455 bdev->bd_write_holder = false;
1456 }
1446 } 1457 }
1447 1458
1448 mutex_unlock(&bdev->bd_mutex); 1459 mutex_unlock(&bdev->bd_mutex);
1449 } else 1460 }
1450 disk_check_events(bdev->bd_disk);
1451 1461
1452 return __blkdev_put(bdev, mode, 0); 1462 return __blkdev_put(bdev, mode, 0);
1453} 1463}
@@ -1521,7 +1531,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
1521static const struct address_space_operations def_blk_aops = { 1531static const struct address_space_operations def_blk_aops = {
1522 .readpage = blkdev_readpage, 1532 .readpage = blkdev_readpage,
1523 .writepage = blkdev_writepage, 1533 .writepage = blkdev_writepage,
1524 .sync_page = block_sync_page,
1525 .write_begin = blkdev_write_begin, 1534 .write_begin = blkdev_write_begin,
1526 .write_end = blkdev_write_end, 1535 .write_end = blkdev_write_end,
1527 .writepages = generic_writepages, 1536 .writepages = generic_writepages,
@@ -1601,7 +1610,7 @@ fail:
1601} 1610}
1602EXPORT_SYMBOL(lookup_bdev); 1611EXPORT_SYMBOL(lookup_bdev);
1603 1612
1604int __invalidate_device(struct block_device *bdev) 1613int __invalidate_device(struct block_device *bdev, bool kill_dirty)
1605{ 1614{
1606 struct super_block *sb = get_super(bdev); 1615 struct super_block *sb = get_super(bdev);
1607 int res = 0; 1616 int res = 0;
@@ -1614,7 +1623,7 @@ int __invalidate_device(struct block_device *bdev)
1614 * hold). 1623 * hold).
1615 */ 1624 */
1616 shrink_dcache_sb(sb); 1625 shrink_dcache_sb(sb);
1617 res = invalidate_inodes(sb); 1626 res = invalidate_inodes(sb, kill_dirty);
1618 drop_super(sb); 1627 drop_super(sb);
1619 } 1628 }
1620 invalidate_bdev(bdev); 1629 invalidate_bdev(bdev);
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 15b5ca2a2606..5d505aaa72fb 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -37,6 +37,9 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
37 char *value = NULL; 37 char *value = NULL;
38 struct posix_acl *acl; 38 struct posix_acl *acl;
39 39
40 if (!IS_POSIXACL(inode))
41 return NULL;
42
40 acl = get_cached_acl(inode, type); 43 acl = get_cached_acl(inode, type);
41 if (acl != ACL_NOT_CACHED) 44 if (acl != ACL_NOT_CACHED)
42 return acl; 45 return acl;
@@ -84,6 +87,9 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
84 struct posix_acl *acl; 87 struct posix_acl *acl;
85 int ret = 0; 88 int ret = 0;
86 89
90 if (!IS_POSIXACL(dentry->d_inode))
91 return -EOPNOTSUPP;
92
87 acl = btrfs_get_acl(dentry->d_inode, type); 93 acl = btrfs_get_acl(dentry->d_inode, type);
88 94
89 if (IS_ERR(acl)) 95 if (IS_ERR(acl))
@@ -164,7 +170,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
164 int ret; 170 int ret;
165 struct posix_acl *acl = NULL; 171 struct posix_acl *acl = NULL;
166 172
167 if (!is_owner_or_cap(dentry->d_inode)) 173 if (!inode_owner_or_capable(dentry->d_inode))
168 return -EPERM; 174 return -EPERM;
169 175
170 if (!IS_POSIXACL(dentry->d_inode)) 176 if (!IS_POSIXACL(dentry->d_inode))
@@ -172,16 +178,17 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
172 178
173 if (value) { 179 if (value) {
174 acl = posix_acl_from_xattr(value, size); 180 acl = posix_acl_from_xattr(value, size);
175 if (acl == NULL) { 181 if (acl) {
176 value = NULL; 182 ret = posix_acl_valid(acl);
177 size = 0; 183 if (ret)
184 goto out;
178 } else if (IS_ERR(acl)) { 185 } else if (IS_ERR(acl)) {
179 return PTR_ERR(acl); 186 return PTR_ERR(acl);
180 } 187 }
181 } 188 }
182 189
183 ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type); 190 ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
184 191out:
185 posix_acl_release(acl); 192 posix_acl_release(acl);
186 193
187 return ret; 194 return ret;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ccc991c542df..57c3bb2884ce 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -136,9 +136,8 @@ struct btrfs_inode {
136 * items we think we'll end up using, and reserved_extents is the number 136 * items we think we'll end up using, and reserved_extents is the number
137 * of extent items we've reserved metadata for. 137 * of extent items we've reserved metadata for.
138 */ 138 */
139 spinlock_t accounting_lock;
140 atomic_t outstanding_extents; 139 atomic_t outstanding_extents;
141 int reserved_extents; 140 atomic_t reserved_extents;
142 141
143 /* 142 /*
144 * ordered_data_close is set by truncate when a file that used 143 * ordered_data_close is set by truncate when a file that used
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index f745287fbf2e..41d1d7c70e29 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -340,6 +340,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
340 340
341 WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); 341 WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
342 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); 342 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
343 if (!cb)
344 return -ENOMEM;
343 atomic_set(&cb->pending_bios, 0); 345 atomic_set(&cb->pending_bios, 0);
344 cb->errors = 0; 346 cb->errors = 0;
345 cb->inode = inode; 347 cb->inode = inode;
@@ -354,6 +356,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
354 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 356 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
355 357
356 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); 358 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
359 if(!bio) {
360 kfree(cb);
361 return -ENOMEM;
362 }
357 bio->bi_private = cb; 363 bio->bi_private = cb;
358 bio->bi_end_io = end_compressed_bio_write; 364 bio->bi_end_io = end_compressed_bio_write;
359 atomic_inc(&cb->pending_bios); 365 atomic_inc(&cb->pending_bios);
@@ -562,7 +568,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
562 u64 em_len; 568 u64 em_len;
563 u64 em_start; 569 u64 em_start;
564 struct extent_map *em; 570 struct extent_map *em;
565 int ret; 571 int ret = -ENOMEM;
566 u32 *sums; 572 u32 *sums;
567 573
568 tree = &BTRFS_I(inode)->io_tree; 574 tree = &BTRFS_I(inode)->io_tree;
@@ -577,6 +583,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
577 583
578 compressed_len = em->block_len; 584 compressed_len = em->block_len;
579 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); 585 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
586 if (!cb)
587 goto out;
588
580 atomic_set(&cb->pending_bios, 0); 589 atomic_set(&cb->pending_bios, 0);
581 cb->errors = 0; 590 cb->errors = 0;
582 cb->inode = inode; 591 cb->inode = inode;
@@ -597,13 +606,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
597 606
598 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / 607 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
599 PAGE_CACHE_SIZE; 608 PAGE_CACHE_SIZE;
600 cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, 609 cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
601 GFP_NOFS); 610 GFP_NOFS);
611 if (!cb->compressed_pages)
612 goto fail1;
613
602 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 614 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
603 615
604 for (page_index = 0; page_index < nr_pages; page_index++) { 616 for (page_index = 0; page_index < nr_pages; page_index++) {
605 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | 617 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
606 __GFP_HIGHMEM); 618 __GFP_HIGHMEM);
619 if (!cb->compressed_pages[page_index])
620 goto fail2;
607 } 621 }
608 cb->nr_pages = nr_pages; 622 cb->nr_pages = nr_pages;
609 623
@@ -614,6 +628,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
614 cb->len = uncompressed_len; 628 cb->len = uncompressed_len;
615 629
616 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); 630 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
631 if (!comp_bio)
632 goto fail2;
617 comp_bio->bi_private = cb; 633 comp_bio->bi_private = cb;
618 comp_bio->bi_end_io = end_compressed_bio_read; 634 comp_bio->bi_end_io = end_compressed_bio_read;
619 atomic_inc(&cb->pending_bios); 635 atomic_inc(&cb->pending_bios);
@@ -647,8 +663,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
647 atomic_inc(&cb->pending_bios); 663 atomic_inc(&cb->pending_bios);
648 664
649 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 665 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
650 btrfs_lookup_bio_sums(root, inode, comp_bio, 666 ret = btrfs_lookup_bio_sums(root, inode,
651 sums); 667 comp_bio, sums);
668 BUG_ON(ret);
652 } 669 }
653 sums += (comp_bio->bi_size + root->sectorsize - 1) / 670 sums += (comp_bio->bi_size + root->sectorsize - 1) /
654 root->sectorsize; 671 root->sectorsize;
@@ -673,14 +690,27 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
673 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); 690 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
674 BUG_ON(ret); 691 BUG_ON(ret);
675 692
676 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) 693 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
677 btrfs_lookup_bio_sums(root, inode, comp_bio, sums); 694 ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
695 BUG_ON(ret);
696 }
678 697
679 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); 698 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
680 BUG_ON(ret); 699 BUG_ON(ret);
681 700
682 bio_put(comp_bio); 701 bio_put(comp_bio);
683 return 0; 702 return 0;
703
704fail2:
705 for (page_index = 0; page_index < nr_pages; page_index++)
706 free_page((unsigned long)cb->compressed_pages[page_index]);
707
708 kfree(cb->compressed_pages);
709fail1:
710 kfree(cb);
711out:
712 free_extent_map(em);
713 return ret;
684} 714}
685 715
686static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; 716static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
@@ -900,7 +930,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
900 return ret; 930 return ret;
901} 931}
902 932
903void __exit btrfs_exit_compress(void) 933void btrfs_exit_compress(void)
904{ 934{
905 free_workspaces(); 935 free_workspaces();
906} 936}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b5baff0dccfe..84d7ca1fe0ba 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -147,10 +147,11 @@ noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
147struct extent_buffer *btrfs_root_node(struct btrfs_root *root) 147struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
148{ 148{
149 struct extent_buffer *eb; 149 struct extent_buffer *eb;
150 spin_lock(&root->node_lock); 150
151 eb = root->node; 151 rcu_read_lock();
152 eb = rcu_dereference(root->node);
152 extent_buffer_get(eb); 153 extent_buffer_get(eb);
153 spin_unlock(&root->node_lock); 154 rcu_read_unlock();
154 return eb; 155 return eb;
155} 156}
156 157
@@ -165,14 +166,8 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
165 while (1) { 166 while (1) {
166 eb = btrfs_root_node(root); 167 eb = btrfs_root_node(root);
167 btrfs_tree_lock(eb); 168 btrfs_tree_lock(eb);
168 169 if (eb == root->node)
169 spin_lock(&root->node_lock);
170 if (eb == root->node) {
171 spin_unlock(&root->node_lock);
172 break; 170 break;
173 }
174 spin_unlock(&root->node_lock);
175
176 btrfs_tree_unlock(eb); 171 btrfs_tree_unlock(eb);
177 free_extent_buffer(eb); 172 free_extent_buffer(eb);
178 } 173 }
@@ -458,10 +453,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
458 else 453 else
459 parent_start = 0; 454 parent_start = 0;
460 455
461 spin_lock(&root->node_lock);
462 root->node = cow;
463 extent_buffer_get(cow); 456 extent_buffer_get(cow);
464 spin_unlock(&root->node_lock); 457 rcu_assign_pointer(root->node, cow);
465 458
466 btrfs_free_tree_block(trans, root, buf, parent_start, 459 btrfs_free_tree_block(trans, root, buf, parent_start,
467 last_ref); 460 last_ref);
@@ -542,6 +535,9 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
542 535
543 ret = __btrfs_cow_block(trans, root, buf, parent, 536 ret = __btrfs_cow_block(trans, root, buf, parent,
544 parent_slot, cow_ret, search_start, 0); 537 parent_slot, cow_ret, search_start, 0);
538
539 trace_btrfs_cow_block(root, buf, *cow_ret);
540
545 return ret; 541 return ret;
546} 542}
547 543
@@ -686,6 +682,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
686 if (!cur) { 682 if (!cur) {
687 cur = read_tree_block(root, blocknr, 683 cur = read_tree_block(root, blocknr,
688 blocksize, gen); 684 blocksize, gen);
685 if (!cur)
686 return -EIO;
689 } else if (!uptodate) { 687 } else if (!uptodate) {
690 btrfs_read_buffer(cur, gen); 688 btrfs_read_buffer(cur, gen);
691 } 689 }
@@ -732,122 +730,6 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root,
732 return btrfs_item_offset_nr(leaf, nr - 1); 730 return btrfs_item_offset_nr(leaf, nr - 1);
733} 731}
734 732
735/*
736 * extra debugging checks to make sure all the items in a key are
737 * well formed and in the proper order
738 */
739static int check_node(struct btrfs_root *root, struct btrfs_path *path,
740 int level)
741{
742 struct extent_buffer *parent = NULL;
743 struct extent_buffer *node = path->nodes[level];
744 struct btrfs_disk_key parent_key;
745 struct btrfs_disk_key node_key;
746 int parent_slot;
747 int slot;
748 struct btrfs_key cpukey;
749 u32 nritems = btrfs_header_nritems(node);
750
751 if (path->nodes[level + 1])
752 parent = path->nodes[level + 1];
753
754 slot = path->slots[level];
755 BUG_ON(nritems == 0);
756 if (parent) {
757 parent_slot = path->slots[level + 1];
758 btrfs_node_key(parent, &parent_key, parent_slot);
759 btrfs_node_key(node, &node_key, 0);
760 BUG_ON(memcmp(&parent_key, &node_key,
761 sizeof(struct btrfs_disk_key)));
762 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
763 btrfs_header_bytenr(node));
764 }
765 BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
766 if (slot != 0) {
767 btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
768 btrfs_node_key(node, &node_key, slot);
769 BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
770 }
771 if (slot < nritems - 1) {
772 btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
773 btrfs_node_key(node, &node_key, slot);
774 BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
775 }
776 return 0;
777}
778
779/*
780 * extra checking to make sure all the items in a leaf are
781 * well formed and in the proper order
782 */
783static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
784 int level)
785{
786 struct extent_buffer *leaf = path->nodes[level];
787 struct extent_buffer *parent = NULL;
788 int parent_slot;
789 struct btrfs_key cpukey;
790 struct btrfs_disk_key parent_key;
791 struct btrfs_disk_key leaf_key;
792 int slot = path->slots[0];
793
794 u32 nritems = btrfs_header_nritems(leaf);
795
796 if (path->nodes[level + 1])
797 parent = path->nodes[level + 1];
798
799 if (nritems == 0)
800 return 0;
801
802 if (parent) {
803 parent_slot = path->slots[level + 1];
804 btrfs_node_key(parent, &parent_key, parent_slot);
805 btrfs_item_key(leaf, &leaf_key, 0);
806
807 BUG_ON(memcmp(&parent_key, &leaf_key,
808 sizeof(struct btrfs_disk_key)));
809 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
810 btrfs_header_bytenr(leaf));
811 }
812 if (slot != 0 && slot < nritems - 1) {
813 btrfs_item_key(leaf, &leaf_key, slot);
814 btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
815 if (comp_keys(&leaf_key, &cpukey) <= 0) {
816 btrfs_print_leaf(root, leaf);
817 printk(KERN_CRIT "slot %d offset bad key\n", slot);
818 BUG_ON(1);
819 }
820 if (btrfs_item_offset_nr(leaf, slot - 1) !=
821 btrfs_item_end_nr(leaf, slot)) {
822 btrfs_print_leaf(root, leaf);
823 printk(KERN_CRIT "slot %d offset bad\n", slot);
824 BUG_ON(1);
825 }
826 }
827 if (slot < nritems - 1) {
828 btrfs_item_key(leaf, &leaf_key, slot);
829 btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
830 BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
831 if (btrfs_item_offset_nr(leaf, slot) !=
832 btrfs_item_end_nr(leaf, slot + 1)) {
833 btrfs_print_leaf(root, leaf);
834 printk(KERN_CRIT "slot %d offset bad\n", slot);
835 BUG_ON(1);
836 }
837 }
838 BUG_ON(btrfs_item_offset_nr(leaf, 0) +
839 btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
840 return 0;
841}
842
843static noinline int check_block(struct btrfs_root *root,
844 struct btrfs_path *path, int level)
845{
846 return 0;
847 if (level == 0)
848 return check_leaf(root, path, level);
849 return check_node(root, path, level);
850}
851 733
852/* 734/*
853 * search for key in the extent_buffer. The items start at offset p, 735 * search for key in the extent_buffer. The items start at offset p,
@@ -1046,9 +928,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1046 goto enospc; 928 goto enospc;
1047 } 929 }
1048 930
1049 spin_lock(&root->node_lock); 931 rcu_assign_pointer(root->node, child);
1050 root->node = child;
1051 spin_unlock(&root->node_lock);
1052 932
1053 add_root_to_dirty_list(root); 933 add_root_to_dirty_list(root);
1054 btrfs_tree_unlock(child); 934 btrfs_tree_unlock(child);
@@ -1188,7 +1068,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1188 } 1068 }
1189 } 1069 }
1190 /* double check we haven't messed things up */ 1070 /* double check we haven't messed things up */
1191 check_block(root, path, level);
1192 if (orig_ptr != 1071 if (orig_ptr !=
1193 btrfs_node_blockptr(path->nodes[level], path->slots[level])) 1072 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1194 BUG(); 1073 BUG();
@@ -1798,12 +1677,6 @@ cow_done:
1798 if (!cow) 1677 if (!cow)
1799 btrfs_unlock_up_safe(p, level + 1); 1678 btrfs_unlock_up_safe(p, level + 1);
1800 1679
1801 ret = check_block(root, p, level);
1802 if (ret) {
1803 ret = -1;
1804 goto done;
1805 }
1806
1807 ret = bin_search(b, key, level, &slot); 1680 ret = bin_search(b, key, level, &slot);
1808 1681
1809 if (level != 0) { 1682 if (level != 0) {
@@ -2130,10 +2003,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2130 2003
2131 btrfs_mark_buffer_dirty(c); 2004 btrfs_mark_buffer_dirty(c);
2132 2005
2133 spin_lock(&root->node_lock);
2134 old = root->node; 2006 old = root->node;
2135 root->node = c; 2007 rcu_assign_pointer(root->node, c);
2136 spin_unlock(&root->node_lock);
2137 2008
2138 /* the super has an extra ref to root->node */ 2009 /* the super has an extra ref to root->node */
2139 free_extent_buffer(old); 2010 free_extent_buffer(old);
@@ -3840,7 +3711,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3840 unsigned long ptr; 3711 unsigned long ptr;
3841 3712
3842 path = btrfs_alloc_path(); 3713 path = btrfs_alloc_path();
3843 BUG_ON(!path); 3714 if (!path)
3715 return -ENOMEM;
3844 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); 3716 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3845 if (!ret) { 3717 if (!ret) {
3846 leaf = path->nodes[0]; 3718 leaf = path->nodes[0];
@@ -4217,6 +4089,7 @@ find_next_key:
4217 } 4089 }
4218 btrfs_set_path_blocking(path); 4090 btrfs_set_path_blocking(path);
4219 cur = read_node_slot(root, cur, slot); 4091 cur = read_node_slot(root, cur, slot);
4092 BUG_ON(!cur);
4220 4093
4221 btrfs_tree_lock(cur); 4094 btrfs_tree_lock(cur);
4222 4095
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c98b3af6052..2e61fe1b6b8c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -28,6 +28,7 @@
28#include <linux/wait.h> 28#include <linux/wait.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/kobject.h> 30#include <linux/kobject.h>
31#include <trace/events/btrfs.h>
31#include <asm/kmap_types.h> 32#include <asm/kmap_types.h>
32#include "extent_io.h" 33#include "extent_io.h"
33#include "extent_map.h" 34#include "extent_map.h"
@@ -40,6 +41,7 @@ extern struct kmem_cache *btrfs_trans_handle_cachep;
40extern struct kmem_cache *btrfs_transaction_cachep; 41extern struct kmem_cache *btrfs_transaction_cachep;
41extern struct kmem_cache *btrfs_bit_radix_cachep; 42extern struct kmem_cache *btrfs_bit_radix_cachep;
42extern struct kmem_cache *btrfs_path_cachep; 43extern struct kmem_cache *btrfs_path_cachep;
44extern struct kmem_cache *btrfs_free_space_cachep;
43struct btrfs_ordered_sum; 45struct btrfs_ordered_sum;
44 46
45#define BTRFS_MAGIC "_BHRfS_M" 47#define BTRFS_MAGIC "_BHRfS_M"
@@ -729,8 +731,19 @@ struct btrfs_space_info {
729 u64 disk_total; /* total bytes on disk, takes mirrors into 731 u64 disk_total; /* total bytes on disk, takes mirrors into
730 account */ 732 account */
731 733
732 int full; /* indicates that we cannot allocate any more 734 /*
735 * we bump reservation progress every time we decrement
736 * bytes_reserved. This way people waiting for reservations
737 * know something good has happened and they can check
738 * for progress. The number here isn't to be trusted, it
739 * just shows reclaim activity
740 */
741 unsigned long reservation_progress;
742
743 int full:1; /* indicates that we cannot allocate any more
733 chunks for this space */ 744 chunks for this space */
745 int chunk_alloc:1; /* set if we are allocating a chunk */
746
734 int force_alloc; /* set if we need to force a chunk alloc for 747 int force_alloc; /* set if we need to force a chunk alloc for
735 this space */ 748 this space */
736 749
@@ -773,9 +786,6 @@ struct btrfs_free_cluster {
773 /* first extent starting offset */ 786 /* first extent starting offset */
774 u64 window_start; 787 u64 window_start;
775 788
776 /* if this cluster simply points at a bitmap in the block group */
777 bool points_to_bitmap;
778
779 struct btrfs_block_group_cache *block_group; 789 struct btrfs_block_group_cache *block_group;
780 /* 790 /*
781 * when a cluster is allocated from a block group, we put the 791 * when a cluster is allocated from a block group, we put the
@@ -1254,6 +1264,7 @@ struct btrfs_root {
1254#define BTRFS_MOUNT_SPACE_CACHE (1 << 12) 1264#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
1255#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) 1265#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
1256#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) 1266#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
1267#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
1257 1268
1258#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1269#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1259#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1270#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1273,6 +1284,9 @@ struct btrfs_root {
1273#define BTRFS_INODE_NODUMP (1 << 8) 1284#define BTRFS_INODE_NODUMP (1 << 8)
1274#define BTRFS_INODE_NOATIME (1 << 9) 1285#define BTRFS_INODE_NOATIME (1 << 9)
1275#define BTRFS_INODE_DIRSYNC (1 << 10) 1286#define BTRFS_INODE_DIRSYNC (1 << 10)
1287#define BTRFS_INODE_COMPRESS (1 << 11)
1288
1289#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31)
1276 1290
1277/* some macros to generate set/get funcs for the struct fields. This 1291/* some macros to generate set/get funcs for the struct fields. This
1278 * assumes there is a lefoo_to_cpu for every type, so lets make a simple 1292 * assumes there is a lefoo_to_cpu for every type, so lets make a simple
@@ -2147,6 +2161,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
2147 u64 root_objectid, u64 owner, u64 offset); 2161 u64 root_objectid, u64 owner, u64 offset);
2148 2162
2149int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2163int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2164int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
2165 u64 num_bytes, int reserve, int sinfo);
2150int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 2166int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
2151 struct btrfs_root *root); 2167 struct btrfs_root *root);
2152int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 2168int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2217,8 +2233,12 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
2217int btrfs_error_unpin_extent_range(struct btrfs_root *root, 2233int btrfs_error_unpin_extent_range(struct btrfs_root *root,
2218 u64 start, u64 end); 2234 u64 start, u64 end);
2219int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, 2235int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
2220 u64 num_bytes); 2236 u64 num_bytes, u64 *actual_bytes);
2237int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
2238 struct btrfs_root *root, u64 type);
2239int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
2221 2240
2241int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
2222/* ctree.c */ 2242/* ctree.c */
2223int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2243int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2224 int level, int *slot); 2244 int level, int *slot);
@@ -2343,6 +2363,8 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
2343int btrfs_find_orphan_roots(struct btrfs_root *tree_root); 2363int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
2344int btrfs_set_root_node(struct btrfs_root_item *item, 2364int btrfs_set_root_node(struct btrfs_root_item *item,
2345 struct extent_buffer *node); 2365 struct extent_buffer *node);
2366void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
2367
2346/* dir-item.c */ 2368/* dir-item.c */
2347int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 2369int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
2348 struct btrfs_root *root, const char *name, 2370 struct btrfs_root *root, const char *name,
@@ -2380,6 +2402,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
2380 struct btrfs_path *path, u64 dir, 2402 struct btrfs_path *path, u64 dir,
2381 const char *name, u16 name_len, 2403 const char *name, u16 name_len,
2382 int mod); 2404 int mod);
2405int verify_dir_item(struct btrfs_root *root,
2406 struct extent_buffer *leaf,
2407 struct btrfs_dir_item *dir_item);
2383 2408
2384/* orphan.c */ 2409/* orphan.c */
2385int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, 2410int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -2516,7 +2541,7 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2516 struct inode *inode); 2541 struct inode *inode);
2517int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2542int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2518int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2543int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2519void btrfs_orphan_cleanup(struct btrfs_root *root); 2544int btrfs_orphan_cleanup(struct btrfs_root *root);
2520void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, 2545void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2521 struct btrfs_pending_snapshot *pending, 2546 struct btrfs_pending_snapshot *pending,
2522 u64 *bytes_to_reserve); 2547 u64 *bytes_to_reserve);
@@ -2524,7 +2549,7 @@ void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2524 struct btrfs_pending_snapshot *pending); 2549 struct btrfs_pending_snapshot *pending);
2525void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2550void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2526 struct btrfs_root *root); 2551 struct btrfs_root *root);
2527int btrfs_cont_expand(struct inode *inode, loff_t size); 2552int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
2528int btrfs_invalidate_inodes(struct btrfs_root *root); 2553int btrfs_invalidate_inodes(struct btrfs_root *root);
2529void btrfs_add_delayed_iput(struct inode *inode); 2554void btrfs_add_delayed_iput(struct inode *inode);
2530void btrfs_run_delayed_iputs(struct btrfs_root *root); 2555void btrfs_run_delayed_iputs(struct btrfs_root *root);
@@ -2553,6 +2578,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
2553int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 2578int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
2554 struct inode *inode, u64 start, u64 end); 2579 struct inode *inode, u64 start, u64 end);
2555int btrfs_release_file(struct inode *inode, struct file *file); 2580int btrfs_release_file(struct inode *inode, struct file *file);
2581void btrfs_drop_pages(struct page **pages, size_t num_pages);
2582int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
2583 struct page **pages, size_t num_pages,
2584 loff_t pos, size_t write_bytes,
2585 struct extent_state **cached);
2556 2586
2557/* tree-defrag.c */ 2587/* tree-defrag.c */
2558int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, 2588int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e807b143b857..bce28f653899 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -483,6 +483,8 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
483 INIT_LIST_HEAD(&head_ref->cluster); 483 INIT_LIST_HEAD(&head_ref->cluster);
484 mutex_init(&head_ref->mutex); 484 mutex_init(&head_ref->mutex);
485 485
486 trace_btrfs_delayed_ref_head(ref, head_ref, action);
487
486 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 488 existing = tree_insert(&delayed_refs->root, &ref->rb_node);
487 489
488 if (existing) { 490 if (existing) {
@@ -537,6 +539,8 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
537 } 539 }
538 full_ref->level = level; 540 full_ref->level = level;
539 541
542 trace_btrfs_delayed_tree_ref(ref, full_ref, action);
543
540 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 544 existing = tree_insert(&delayed_refs->root, &ref->rb_node);
541 545
542 if (existing) { 546 if (existing) {
@@ -591,6 +595,8 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
591 full_ref->objectid = owner; 595 full_ref->objectid = owner;
592 full_ref->offset = offset; 596 full_ref->offset = offset;
593 597
598 trace_btrfs_delayed_data_ref(ref, full_ref, action);
599
594 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 600 existing = tree_insert(&delayed_refs->root, &ref->rb_node);
595 601
596 if (existing) { 602 if (existing) {
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index f0cad5ae5be7..c62f02f6ae69 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -151,7 +151,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
151 ret = PTR_ERR(dir_item); 151 ret = PTR_ERR(dir_item);
152 if (ret == -EEXIST) 152 if (ret == -EEXIST)
153 goto second_insert; 153 goto second_insert;
154 goto out; 154 goto out_free;
155 } 155 }
156 156
157 leaf = path->nodes[0]; 157 leaf = path->nodes[0];
@@ -170,7 +170,7 @@ second_insert:
170 /* FIXME, use some real flag for selecting the extra index */ 170 /* FIXME, use some real flag for selecting the extra index */
171 if (root == root->fs_info->tree_root) { 171 if (root == root->fs_info->tree_root) {
172 ret = 0; 172 ret = 0;
173 goto out; 173 goto out_free;
174 } 174 }
175 btrfs_release_path(root, path); 175 btrfs_release_path(root, path);
176 176
@@ -180,7 +180,7 @@ second_insert:
180 name, name_len); 180 name, name_len);
181 if (IS_ERR(dir_item)) { 181 if (IS_ERR(dir_item)) {
182 ret2 = PTR_ERR(dir_item); 182 ret2 = PTR_ERR(dir_item);
183 goto out; 183 goto out_free;
184 } 184 }
185 leaf = path->nodes[0]; 185 leaf = path->nodes[0];
186 btrfs_cpu_key_to_disk(&disk_key, location); 186 btrfs_cpu_key_to_disk(&disk_key, location);
@@ -192,7 +192,9 @@ second_insert:
192 name_ptr = (unsigned long)(dir_item + 1); 192 name_ptr = (unsigned long)(dir_item + 1);
193 write_extent_buffer(leaf, name, name_ptr, name_len); 193 write_extent_buffer(leaf, name, name_ptr, name_len);
194 btrfs_mark_buffer_dirty(leaf); 194 btrfs_mark_buffer_dirty(leaf);
195out: 195
196out_free:
197
196 btrfs_free_path(path); 198 btrfs_free_path(path);
197 if (ret) 199 if (ret)
198 return ret; 200 return ret;
@@ -377,6 +379,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
377 379
378 leaf = path->nodes[0]; 380 leaf = path->nodes[0];
379 dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); 381 dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
382 if (verify_dir_item(root, leaf, dir_item))
383 return NULL;
384
380 total_len = btrfs_item_size_nr(leaf, path->slots[0]); 385 total_len = btrfs_item_size_nr(leaf, path->slots[0]);
381 while (cur < total_len) { 386 while (cur < total_len) {
382 this_len = sizeof(*dir_item) + 387 this_len = sizeof(*dir_item) +
@@ -429,3 +434,35 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
429 } 434 }
430 return ret; 435 return ret;
431} 436}
437
438int verify_dir_item(struct btrfs_root *root,
439 struct extent_buffer *leaf,
440 struct btrfs_dir_item *dir_item)
441{
442 u16 namelen = BTRFS_NAME_LEN;
443 u8 type = btrfs_dir_type(leaf, dir_item);
444
445 if (type >= BTRFS_FT_MAX) {
446 printk(KERN_CRIT "btrfs: invalid dir item type: %d\n",
447 (int)type);
448 return 1;
449 }
450
451 if (type == BTRFS_FT_XATTR)
452 namelen = XATTR_NAME_MAX;
453
454 if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
455 printk(KERN_CRIT "btrfS: invalid dir item name len: %u\n",
456 (unsigned)btrfs_dir_data_len(leaf, dir_item));
457 return 1;
458 }
459
460 /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
461 if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) {
462 printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n",
463 (unsigned)btrfs_dir_data_len(leaf, dir_item));
464 return 1;
465 }
466
467 return 0;
468}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b531c36455d8..68c84c8c24bd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,6 +29,7 @@
29#include <linux/crc32c.h> 29#include <linux/crc32c.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/migrate.h> 31#include <linux/migrate.h>
32#include <asm/unaligned.h>
32#include "compat.h" 33#include "compat.h"
33#include "ctree.h" 34#include "ctree.h"
34#include "disk-io.h" 35#include "disk-io.h"
@@ -198,7 +199,7 @@ u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
198 199
199void btrfs_csum_final(u32 crc, char *result) 200void btrfs_csum_final(u32 crc, char *result)
200{ 201{
201 *(__le32 *)result = ~cpu_to_le32(crc); 202 put_unaligned_le32(~crc, result);
202} 203}
203 204
204/* 205/*
@@ -323,6 +324,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
323 int num_copies = 0; 324 int num_copies = 0;
324 int mirror_num = 0; 325 int mirror_num = 0;
325 326
327 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
326 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 328 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
327 while (1) { 329 while (1) {
328 ret = read_extent_buffer_pages(io_tree, eb, start, 1, 330 ret = read_extent_buffer_pages(io_tree, eb, start, 1,
@@ -331,6 +333,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
331 !verify_parent_transid(io_tree, eb, parent_transid)) 333 !verify_parent_transid(io_tree, eb, parent_transid))
332 return ret; 334 return ret;
333 335
336 /*
337 * This buffer's crc is fine, but its contents are corrupted, so
338 * there is no reason to read the other copies, they won't be
339 * any less wrong.
340 */
341 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
342 return ret;
343
334 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, 344 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
335 eb->start, eb->len); 345 eb->start, eb->len);
336 if (num_copies == 1) 346 if (num_copies == 1)
@@ -359,10 +369,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
359 369
360 tree = &BTRFS_I(page->mapping->host)->io_tree; 370 tree = &BTRFS_I(page->mapping->host)->io_tree;
361 371
362 if (page->private == EXTENT_PAGE_PRIVATE) 372 if (page->private == EXTENT_PAGE_PRIVATE) {
373 WARN_ON(1);
363 goto out; 374 goto out;
364 if (!page->private) 375 }
376 if (!page->private) {
377 WARN_ON(1);
365 goto out; 378 goto out;
379 }
366 len = page->private >> 2; 380 len = page->private >> 2;
367 WARN_ON(len == 0); 381 WARN_ON(len == 0);
368 382
@@ -415,6 +429,73 @@ static int check_tree_block_fsid(struct btrfs_root *root,
415 return ret; 429 return ret;
416} 430}
417 431
432#define CORRUPT(reason, eb, root, slot) \
433 printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \
434 "root=%llu, slot=%d\n", reason, \
435 (unsigned long long)btrfs_header_bytenr(eb), \
436 (unsigned long long)root->objectid, slot)
437
438static noinline int check_leaf(struct btrfs_root *root,
439 struct extent_buffer *leaf)
440{
441 struct btrfs_key key;
442 struct btrfs_key leaf_key;
443 u32 nritems = btrfs_header_nritems(leaf);
444 int slot;
445
446 if (nritems == 0)
447 return 0;
448
449 /* Check the 0 item */
450 if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
451 BTRFS_LEAF_DATA_SIZE(root)) {
452 CORRUPT("invalid item offset size pair", leaf, root, 0);
453 return -EIO;
454 }
455
456 /*
457 * Check to make sure each items keys are in the correct order and their
458 * offsets make sense. We only have to loop through nritems-1 because
459 * we check the current slot against the next slot, which verifies the
460 * next slot's offset+size makes sense and that the current's slot
461 * offset is correct.
462 */
463 for (slot = 0; slot < nritems - 1; slot++) {
464 btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
465 btrfs_item_key_to_cpu(leaf, &key, slot + 1);
466
467 /* Make sure the keys are in the right order */
468 if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
469 CORRUPT("bad key order", leaf, root, slot);
470 return -EIO;
471 }
472
473 /*
474 * Make sure the offset and ends are right, remember that the
475 * item data starts at the end of the leaf and grows towards the
476 * front.
477 */
478 if (btrfs_item_offset_nr(leaf, slot) !=
479 btrfs_item_end_nr(leaf, slot + 1)) {
480 CORRUPT("slot offset bad", leaf, root, slot);
481 return -EIO;
482 }
483
484 /*
485 * Check to make sure that we don't point outside of the leaf,
486 * just incase all the items are consistent to eachother, but
487 * all point outside of the leaf.
488 */
489 if (btrfs_item_end_nr(leaf, slot) >
490 BTRFS_LEAF_DATA_SIZE(root)) {
491 CORRUPT("slot end outside of leaf", leaf, root, slot);
492 return -EIO;
493 }
494 }
495
496 return 0;
497}
498
418#ifdef CONFIG_DEBUG_LOCK_ALLOC 499#ifdef CONFIG_DEBUG_LOCK_ALLOC
419void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) 500void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
420{ 501{
@@ -481,8 +562,20 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
481 btrfs_set_buffer_lockdep_class(eb, found_level); 562 btrfs_set_buffer_lockdep_class(eb, found_level);
482 563
483 ret = csum_tree_block(root, eb, 1); 564 ret = csum_tree_block(root, eb, 1);
484 if (ret) 565 if (ret) {
566 ret = -EIO;
567 goto err;
568 }
569
570 /*
571 * If this is a leaf block and it is corrupt, set the corrupt bit so
572 * that we don't try and read the other copies of this block, just
573 * return -EIO.
574 */
575 if (found_level == 0 && check_leaf(root, eb)) {
576 set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
485 ret = -EIO; 577 ret = -EIO;
578 }
486 579
487 end = min_t(u64, eb->len, PAGE_CACHE_SIZE); 580 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
488 end = eb->start + end - 1; 581 end = eb->start + end - 1;
@@ -843,7 +936,6 @@ static const struct address_space_operations btree_aops = {
843 .writepages = btree_writepages, 936 .writepages = btree_writepages,
844 .releasepage = btree_releasepage, 937 .releasepage = btree_releasepage,
845 .invalidatepage = btree_invalidatepage, 938 .invalidatepage = btree_invalidatepage,
846 .sync_page = block_sync_page,
847#ifdef CONFIG_MIGRATION 939#ifdef CONFIG_MIGRATION
848 .migratepage = btree_migratepage, 940 .migratepage = btree_migratepage,
849#endif 941#endif
@@ -1156,7 +1248,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1156 root, fs_info, location->objectid); 1248 root, fs_info, location->objectid);
1157 1249
1158 path = btrfs_alloc_path(); 1250 path = btrfs_alloc_path();
1159 BUG_ON(!path); 1251 if (!path) {
1252 kfree(root);
1253 return ERR_PTR(-ENOMEM);
1254 }
1160 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); 1255 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1161 if (ret == 0) { 1256 if (ret == 0) {
1162 l = path->nodes[0]; 1257 l = path->nodes[0];
@@ -1180,8 +1275,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1180 root->commit_root = btrfs_root_node(root); 1275 root->commit_root = btrfs_root_node(root);
1181 BUG_ON(!root->node); 1276 BUG_ON(!root->node);
1182out: 1277out:
1183 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) 1278 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
1184 root->ref_cows = 1; 1279 root->ref_cows = 1;
1280 btrfs_check_and_init_root_item(&root->root_item);
1281 }
1185 1282
1186 return root; 1283 return root;
1187} 1284}
@@ -1327,82 +1424,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1327} 1424}
1328 1425
1329/* 1426/*
1330 * this unplugs every device on the box, and it is only used when page
1331 * is null
1332 */
1333static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1334{
1335 struct btrfs_device *device;
1336 struct btrfs_fs_info *info;
1337
1338 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1339 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1340 if (!device->bdev)
1341 continue;
1342
1343 bdi = blk_get_backing_dev_info(device->bdev);
1344 if (bdi->unplug_io_fn)
1345 bdi->unplug_io_fn(bdi, page);
1346 }
1347}
1348
1349static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1350{
1351 struct inode *inode;
1352 struct extent_map_tree *em_tree;
1353 struct extent_map *em;
1354 struct address_space *mapping;
1355 u64 offset;
1356
1357 /* the generic O_DIRECT read code does this */
1358 if (1 || !page) {
1359 __unplug_io_fn(bdi, page);
1360 return;
1361 }
1362
1363 /*
1364 * page->mapping may change at any time. Get a consistent copy
1365 * and use that for everything below
1366 */
1367 smp_mb();
1368 mapping = page->mapping;
1369 if (!mapping)
1370 return;
1371
1372 inode = mapping->host;
1373
1374 /*
1375 * don't do the expensive searching for a small number of
1376 * devices
1377 */
1378 if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
1379 __unplug_io_fn(bdi, page);
1380 return;
1381 }
1382
1383 offset = page_offset(page);
1384
1385 em_tree = &BTRFS_I(inode)->extent_tree;
1386 read_lock(&em_tree->lock);
1387 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1388 read_unlock(&em_tree->lock);
1389 if (!em) {
1390 __unplug_io_fn(bdi, page);
1391 return;
1392 }
1393
1394 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1395 free_extent_map(em);
1396 __unplug_io_fn(bdi, page);
1397 return;
1398 }
1399 offset = offset - em->start;
1400 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1401 em->block_start + offset, page);
1402 free_extent_map(em);
1403}
1404
1405/*
1406 * If this fails, caller must call bdi_destroy() to get rid of the 1427 * If this fails, caller must call bdi_destroy() to get rid of the
1407 * bdi again. 1428 * bdi again.
1408 */ 1429 */
@@ -1416,8 +1437,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1416 return err; 1437 return err;
1417 1438
1418 bdi->ra_pages = default_backing_dev_info.ra_pages; 1439 bdi->ra_pages = default_backing_dev_info.ra_pages;
1419 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1420 bdi->unplug_io_data = info;
1421 bdi->congested_fn = btrfs_congested_fn; 1440 bdi->congested_fn = btrfs_congested_fn;
1422 bdi->congested_data = info; 1441 bdi->congested_data = info;
1423 return 0; 1442 return 0;
@@ -1550,6 +1569,7 @@ static int transaction_kthread(void *arg)
1550 spin_unlock(&root->fs_info->new_trans_lock); 1569 spin_unlock(&root->fs_info->new_trans_lock);
1551 1570
1552 trans = btrfs_join_transaction(root, 1); 1571 trans = btrfs_join_transaction(root, 1);
1572 BUG_ON(IS_ERR(trans));
1553 if (transid == trans->transid) { 1573 if (transid == trans->transid) {
1554 ret = btrfs_commit_transaction(trans, root); 1574 ret = btrfs_commit_transaction(trans, root);
1555 BUG_ON(ret); 1575 BUG_ON(ret);
@@ -1627,6 +1647,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1627 goto fail_bdi; 1647 goto fail_bdi;
1628 } 1648 }
1629 1649
1650 fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS;
1651
1630 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 1652 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
1631 INIT_LIST_HEAD(&fs_info->trans_list); 1653 INIT_LIST_HEAD(&fs_info->trans_list);
1632 INIT_LIST_HEAD(&fs_info->dead_roots); 1654 INIT_LIST_HEAD(&fs_info->dead_roots);
@@ -1757,6 +1779,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1757 1779
1758 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); 1780 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1759 1781
1782 /*
1783 * In the long term, we'll store the compression type in the super
1784 * block, and it'll be used for per file compression control.
1785 */
1786 fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
1787
1760 ret = btrfs_parse_options(tree_root, options); 1788 ret = btrfs_parse_options(tree_root, options);
1761 if (ret) { 1789 if (ret) {
1762 err = ret; 1790 err = ret;
@@ -1962,6 +1990,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1962 fs_info->metadata_alloc_profile = (u64)-1; 1990 fs_info->metadata_alloc_profile = (u64)-1;
1963 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; 1991 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1964 1992
1993 ret = btrfs_init_space_info(fs_info);
1994 if (ret) {
1995 printk(KERN_ERR "Failed to initial space info: %d\n", ret);
1996 goto fail_block_groups;
1997 }
1998
1965 ret = btrfs_read_block_groups(extent_root); 1999 ret = btrfs_read_block_groups(extent_root);
1966 if (ret) { 2000 if (ret) {
1967 printk(KERN_ERR "Failed to read block groups: %d\n", ret); 2001 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
@@ -2053,9 +2087,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
2053 2087
2054 if (!(sb->s_flags & MS_RDONLY)) { 2088 if (!(sb->s_flags & MS_RDONLY)) {
2055 down_read(&fs_info->cleanup_work_sem); 2089 down_read(&fs_info->cleanup_work_sem);
2056 btrfs_orphan_cleanup(fs_info->fs_root); 2090 err = btrfs_orphan_cleanup(fs_info->fs_root);
2057 btrfs_orphan_cleanup(fs_info->tree_root); 2091 if (!err)
2092 err = btrfs_orphan_cleanup(fs_info->tree_root);
2058 up_read(&fs_info->cleanup_work_sem); 2093 up_read(&fs_info->cleanup_work_sem);
2094 if (err) {
2095 close_ctree(tree_root);
2096 return ERR_PTR(err);
2097 }
2059 } 2098 }
2060 2099
2061 return tree_root; 2100 return tree_root;
@@ -2430,8 +2469,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2430 2469
2431 root_objectid = gang[ret - 1]->root_key.objectid + 1; 2470 root_objectid = gang[ret - 1]->root_key.objectid + 1;
2432 for (i = 0; i < ret; i++) { 2471 for (i = 0; i < ret; i++) {
2472 int err;
2473
2433 root_objectid = gang[i]->root_key.objectid; 2474 root_objectid = gang[i]->root_key.objectid;
2434 btrfs_orphan_cleanup(gang[i]); 2475 err = btrfs_orphan_cleanup(gang[i]);
2476 if (err)
2477 return err;
2435 } 2478 }
2436 root_objectid++; 2479 root_objectid++;
2437 } 2480 }
@@ -2453,10 +2496,14 @@ int btrfs_commit_super(struct btrfs_root *root)
2453 up_write(&root->fs_info->cleanup_work_sem); 2496 up_write(&root->fs_info->cleanup_work_sem);
2454 2497
2455 trans = btrfs_join_transaction(root, 1); 2498 trans = btrfs_join_transaction(root, 1);
2499 if (IS_ERR(trans))
2500 return PTR_ERR(trans);
2456 ret = btrfs_commit_transaction(trans, root); 2501 ret = btrfs_commit_transaction(trans, root);
2457 BUG_ON(ret); 2502 BUG_ON(ret);
2458 /* run commit again to drop the original snapshot */ 2503 /* run commit again to drop the original snapshot */
2459 trans = btrfs_join_transaction(root, 1); 2504 trans = btrfs_join_transaction(root, 1);
2505 if (IS_ERR(trans))
2506 return PTR_ERR(trans);
2460 btrfs_commit_transaction(trans, root); 2507 btrfs_commit_transaction(trans, root);
2461 ret = btrfs_write_and_wait_transaction(NULL, root); 2508 ret = btrfs_write_and_wait_transaction(NULL, root);
2462 BUG_ON(ret); 2509 BUG_ON(ret);
@@ -2484,7 +2531,7 @@ int close_ctree(struct btrfs_root *root)
2484 * ERROR state on disk. 2531 * ERROR state on disk.
2485 * 2532 *
2486 * 2. when btrfs flips readonly just in btrfs_commit_super, 2533 * 2. when btrfs flips readonly just in btrfs_commit_super,
2487 * and in such case, btrfs cannnot write sb via btrfs_commit_super, 2534 * and in such case, btrfs cannot write sb via btrfs_commit_super,
2488 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, 2535 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
2489 * btrfs will cleanup all FS resources first and write sb then. 2536 * btrfs will cleanup all FS resources first and write sb then.
2490 */ 2537 */
@@ -2554,6 +2601,8 @@ int close_ctree(struct btrfs_root *root)
2554 kfree(fs_info->chunk_root); 2601 kfree(fs_info->chunk_root);
2555 kfree(fs_info->dev_root); 2602 kfree(fs_info->dev_root);
2556 kfree(fs_info->csum_root); 2603 kfree(fs_info->csum_root);
2604 kfree(fs_info);
2605
2557 return 0; 2606 return 0;
2558} 2607}
2559 2608
@@ -2936,7 +2985,10 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
2936 break; 2985 break;
2937 2986
2938 /* opt_discard */ 2987 /* opt_discard */
2939 ret = btrfs_error_discard_extent(root, start, end + 1 - start); 2988 if (btrfs_test_opt(root, DISCARD))
2989 ret = btrfs_error_discard_extent(root, start,
2990 end + 1 - start,
2991 NULL);
2940 2992
2941 clear_extent_dirty(unpin, start, end, GFP_NOFS); 2993 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2942 btrfs_error_unpin_extent_range(root, start, end); 2994 btrfs_error_unpin_extent_range(root, start, end);
@@ -3005,7 +3057,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3005 btrfs_destroy_pinned_extent(root, 3057 btrfs_destroy_pinned_extent(root,
3006 root->fs_info->pinned_extents); 3058 root->fs_info->pinned_extents);
3007 3059
3008 t->use_count = 0; 3060 atomic_set(&t->use_count, 0);
3009 list_del_init(&t->list); 3061 list_del_init(&t->list);
3010 memset(t, 0, sizeof(*t)); 3062 memset(t, 0, sizeof(*t));
3011 kmem_cache_free(btrfs_transaction_cachep, t); 3063 kmem_cache_free(btrfs_transaction_cachep, t);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 9786963b07e5..b4ffad859adb 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
21 int len = *max_len; 21 int len = *max_len;
22 int type; 22 int type;
23 23
24 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || 24 if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
25 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) 25 *max_len = BTRFS_FID_SIZE_CONNECTABLE;
26 return 255; 26 return 255;
27 } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
28 *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 return 255;
30 }
27 31
28 len = BTRFS_FID_SIZE_NON_CONNECTABLE; 32 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 type = FILEID_BTRFS_WITHOUT_PARENT; 33 type = FILEID_BTRFS_WITHOUT_PARENT;
@@ -171,6 +175,8 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
171 int ret; 175 int ret;
172 176
173 path = btrfs_alloc_path(); 177 path = btrfs_alloc_path();
178 if (!path)
179 return ERR_PTR(-ENOMEM);
174 180
175 if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { 181 if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
176 key.objectid = root->root_key.objectid; 182 key.objectid = root->root_key.objectid;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b55269340cec..31f33ba56fe8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,11 +33,28 @@
33#include "locking.h" 33#include "locking.h"
34#include "free-space-cache.h" 34#include "free-space-cache.h"
35 35
36/* control flags for do_chunk_alloc's force field
37 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
38 * if we really need one.
39 *
40 * CHUNK_ALLOC_FORCE means it must try to allocate one
41 *
42 * CHUNK_ALLOC_LIMITED means to only try and allocate one
43 * if we have very few chunks already allocated. This is
44 * used as part of the clustering code to help make sure
45 * we have a good pool of storage to cluster in, without
46 * filling the FS with empty chunks
47 *
48 */
49enum {
50 CHUNK_ALLOC_NO_FORCE = 0,
51 CHUNK_ALLOC_FORCE = 1,
52 CHUNK_ALLOC_LIMITED = 2,
53};
54
36static int update_block_group(struct btrfs_trans_handle *trans, 55static int update_block_group(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root, 56 struct btrfs_root *root,
38 u64 bytenr, u64 num_bytes, int alloc); 57 u64 bytenr, u64 num_bytes, int alloc);
39static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
40 u64 num_bytes, int reserve, int sinfo);
41static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 58static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
42 struct btrfs_root *root, 59 struct btrfs_root *root,
43 u64 bytenr, u64 num_bytes, u64 parent, 60 u64 bytenr, u64 num_bytes, u64 parent,
@@ -320,11 +337,6 @@ static int caching_kthread(void *data)
320 if (!path) 337 if (!path)
321 return -ENOMEM; 338 return -ENOMEM;
322 339
323 exclude_super_stripes(extent_root, block_group);
324 spin_lock(&block_group->space_info->lock);
325 block_group->space_info->bytes_readonly += block_group->bytes_super;
326 spin_unlock(&block_group->space_info->lock);
327
328 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 340 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
329 341
330 /* 342 /*
@@ -447,7 +459,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
447 * allocate blocks for the tree root we can't do the fast caching since 459 * allocate blocks for the tree root we can't do the fast caching since
448 * we likely hold important locks. 460 * we likely hold important locks.
449 */ 461 */
450 if (!trans->transaction->in_commit && 462 if (trans && (!trans->transaction->in_commit) &&
451 (root && root != root->fs_info->tree_root)) { 463 (root && root != root->fs_info->tree_root)) {
452 spin_lock(&cache->lock); 464 spin_lock(&cache->lock);
453 if (cache->cached != BTRFS_CACHE_NO) { 465 if (cache->cached != BTRFS_CACHE_NO) {
@@ -467,14 +479,16 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
467 cache->cached = BTRFS_CACHE_NO; 479 cache->cached = BTRFS_CACHE_NO;
468 } 480 }
469 spin_unlock(&cache->lock); 481 spin_unlock(&cache->lock);
470 if (ret == 1) 482 if (ret == 1) {
483 free_excluded_extents(fs_info->extent_root, cache);
471 return 0; 484 return 0;
485 }
472 } 486 }
473 487
474 if (load_cache_only) 488 if (load_cache_only)
475 return 0; 489 return 0;
476 490
477 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); 491 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
478 BUG_ON(!caching_ctl); 492 BUG_ON(!caching_ctl);
479 493
480 INIT_LIST_HEAD(&caching_ctl->list); 494 INIT_LIST_HEAD(&caching_ctl->list);
@@ -1743,39 +1757,45 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1743 return ret; 1757 return ret;
1744} 1758}
1745 1759
1746static void btrfs_issue_discard(struct block_device *bdev, 1760static int btrfs_issue_discard(struct block_device *bdev,
1747 u64 start, u64 len) 1761 u64 start, u64 len)
1748{ 1762{
1749 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0); 1763 return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1750} 1764}
1751 1765
1752static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1766static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1753 u64 num_bytes) 1767 u64 num_bytes, u64 *actual_bytes)
1754{ 1768{
1755 int ret; 1769 int ret;
1756 u64 map_length = num_bytes; 1770 u64 discarded_bytes = 0;
1757 struct btrfs_multi_bio *multi = NULL; 1771 struct btrfs_multi_bio *multi = NULL;
1758 1772
1759 if (!btrfs_test_opt(root, DISCARD))
1760 return 0;
1761 1773
1762 /* Tell the block device(s) that the sectors can be discarded */ 1774 /* Tell the block device(s) that the sectors can be discarded */
1763 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, 1775 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1764 bytenr, &map_length, &multi, 0); 1776 bytenr, &num_bytes, &multi, 0);
1765 if (!ret) { 1777 if (!ret) {
1766 struct btrfs_bio_stripe *stripe = multi->stripes; 1778 struct btrfs_bio_stripe *stripe = multi->stripes;
1767 int i; 1779 int i;
1768 1780
1769 if (map_length > num_bytes)
1770 map_length = num_bytes;
1771 1781
1772 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1782 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1773 btrfs_issue_discard(stripe->dev->bdev, 1783 ret = btrfs_issue_discard(stripe->dev->bdev,
1774 stripe->physical, 1784 stripe->physical,
1775 map_length); 1785 stripe->length);
1786 if (!ret)
1787 discarded_bytes += stripe->length;
1788 else if (ret != -EOPNOTSUPP)
1789 break;
1776 } 1790 }
1777 kfree(multi); 1791 kfree(multi);
1778 } 1792 }
1793 if (discarded_bytes && ret == -EOPNOTSUPP)
1794 ret = 0;
1795
1796 if (actual_bytes)
1797 *actual_bytes = discarded_bytes;
1798
1779 1799
1780 return ret; 1800 return ret;
1781} 1801}
@@ -3018,7 +3038,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3018 found->bytes_readonly = 0; 3038 found->bytes_readonly = 0;
3019 found->bytes_may_use = 0; 3039 found->bytes_may_use = 0;
3020 found->full = 0; 3040 found->full = 0;
3021 found->force_alloc = 0; 3041 found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3042 found->chunk_alloc = 0;
3022 *space_info = found; 3043 *space_info = found;
3023 list_add_rcu(&found->list, &info->space_info); 3044 list_add_rcu(&found->list, &info->space_info);
3024 atomic_set(&found->caching_threads, 0); 3045 atomic_set(&found->caching_threads, 0);
@@ -3149,7 +3170,7 @@ again:
3149 if (!data_sinfo->full && alloc_chunk) { 3170 if (!data_sinfo->full && alloc_chunk) {
3150 u64 alloc_target; 3171 u64 alloc_target;
3151 3172
3152 data_sinfo->force_alloc = 1; 3173 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3153 spin_unlock(&data_sinfo->lock); 3174 spin_unlock(&data_sinfo->lock);
3154alloc: 3175alloc:
3155 alloc_target = btrfs_get_alloc_profile(root, 1); 3176 alloc_target = btrfs_get_alloc_profile(root, 1);
@@ -3159,7 +3180,8 @@ alloc:
3159 3180
3160 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3181 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3161 bytes + 2 * 1024 * 1024, 3182 bytes + 2 * 1024 * 1024,
3162 alloc_target, 0); 3183 alloc_target,
3184 CHUNK_ALLOC_NO_FORCE);
3163 btrfs_end_transaction(trans, root); 3185 btrfs_end_transaction(trans, root);
3164 if (ret < 0) { 3186 if (ret < 0) {
3165 if (ret != -ENOSPC) 3187 if (ret != -ENOSPC)
@@ -3238,31 +3260,56 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3238 rcu_read_lock(); 3260 rcu_read_lock();
3239 list_for_each_entry_rcu(found, head, list) { 3261 list_for_each_entry_rcu(found, head, list) {
3240 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3262 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3241 found->force_alloc = 1; 3263 found->force_alloc = CHUNK_ALLOC_FORCE;
3242 } 3264 }
3243 rcu_read_unlock(); 3265 rcu_read_unlock();
3244} 3266}
3245 3267
3246static int should_alloc_chunk(struct btrfs_root *root, 3268static int should_alloc_chunk(struct btrfs_root *root,
3247 struct btrfs_space_info *sinfo, u64 alloc_bytes) 3269 struct btrfs_space_info *sinfo, u64 alloc_bytes,
3270 int force)
3248{ 3271{
3249 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3272 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3273 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3250 u64 thresh; 3274 u64 thresh;
3251 3275
3252 if (sinfo->bytes_used + sinfo->bytes_reserved + 3276 if (force == CHUNK_ALLOC_FORCE)
3253 alloc_bytes + 256 * 1024 * 1024 < num_bytes) 3277 return 1;
3278
3279 /*
3280 * in limited mode, we want to have some free space up to
3281 * about 1% of the FS size.
3282 */
3283 if (force == CHUNK_ALLOC_LIMITED) {
3284 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
3285 thresh = max_t(u64, 64 * 1024 * 1024,
3286 div_factor_fine(thresh, 1));
3287
3288 if (num_bytes - num_allocated < thresh)
3289 return 1;
3290 }
3291
3292 /*
3293 * we have two similar checks here, one based on percentage
3294 * and once based on a hard number of 256MB. The idea
3295 * is that if we have a good amount of free
3296 * room, don't allocate a chunk. A good mount is
3297 * less than 80% utilized of the chunks we have allocated,
3298 * or more than 256MB free
3299 */
3300 if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3254 return 0; 3301 return 0;
3255 3302
3256 if (sinfo->bytes_used + sinfo->bytes_reserved + 3303 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3257 alloc_bytes < div_factor(num_bytes, 8))
3258 return 0; 3304 return 0;
3259 3305
3260 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3306 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
3307
3308 /* 256MB or 5% of the FS */
3261 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3309 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
3262 3310
3263 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) 3311 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
3264 return 0; 3312 return 0;
3265
3266 return 1; 3313 return 1;
3267} 3314}
3268 3315
@@ -3272,10 +3319,9 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3272{ 3319{
3273 struct btrfs_space_info *space_info; 3320 struct btrfs_space_info *space_info;
3274 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3321 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3322 int wait_for_alloc = 0;
3275 int ret = 0; 3323 int ret = 0;
3276 3324
3277 mutex_lock(&fs_info->chunk_mutex);
3278
3279 flags = btrfs_reduce_alloc_profile(extent_root, flags); 3325 flags = btrfs_reduce_alloc_profile(extent_root, flags);
3280 3326
3281 space_info = __find_space_info(extent_root->fs_info, flags); 3327 space_info = __find_space_info(extent_root->fs_info, flags);
@@ -3286,21 +3332,40 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3286 } 3332 }
3287 BUG_ON(!space_info); 3333 BUG_ON(!space_info);
3288 3334
3335again:
3289 spin_lock(&space_info->lock); 3336 spin_lock(&space_info->lock);
3290 if (space_info->force_alloc) 3337 if (space_info->force_alloc)
3291 force = 1; 3338 force = space_info->force_alloc;
3292 if (space_info->full) { 3339 if (space_info->full) {
3293 spin_unlock(&space_info->lock); 3340 spin_unlock(&space_info->lock);
3294 goto out; 3341 return 0;
3295 } 3342 }
3296 3343
3297 if (!force && !should_alloc_chunk(extent_root, space_info, 3344 if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
3298 alloc_bytes)) {
3299 spin_unlock(&space_info->lock); 3345 spin_unlock(&space_info->lock);
3300 goto out; 3346 return 0;
3347 } else if (space_info->chunk_alloc) {
3348 wait_for_alloc = 1;
3349 } else {
3350 space_info->chunk_alloc = 1;
3301 } 3351 }
3352
3302 spin_unlock(&space_info->lock); 3353 spin_unlock(&space_info->lock);
3303 3354
3355 mutex_lock(&fs_info->chunk_mutex);
3356
3357 /*
3358 * The chunk_mutex is held throughout the entirety of a chunk
3359 * allocation, so once we've acquired the chunk_mutex we know that the
3360 * other guy is done and we need to recheck and see if we should
3361 * allocate.
3362 */
3363 if (wait_for_alloc) {
3364 mutex_unlock(&fs_info->chunk_mutex);
3365 wait_for_alloc = 0;
3366 goto again;
3367 }
3368
3304 /* 3369 /*
3305 * If we have mixed data/metadata chunks we want to make sure we keep 3370 * If we have mixed data/metadata chunks we want to make sure we keep
3306 * allocating mixed chunks instead of individual chunks. 3371 * allocating mixed chunks instead of individual chunks.
@@ -3326,9 +3391,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3326 space_info->full = 1; 3391 space_info->full = 1;
3327 else 3392 else
3328 ret = 1; 3393 ret = 1;
3329 space_info->force_alloc = 0; 3394
3395 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3396 space_info->chunk_alloc = 0;
3330 spin_unlock(&space_info->lock); 3397 spin_unlock(&space_info->lock);
3331out:
3332 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3398 mutex_unlock(&extent_root->fs_info->chunk_mutex);
3333 return ret; 3399 return ret;
3334} 3400}
@@ -3344,21 +3410,24 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3344 u64 reserved; 3410 u64 reserved;
3345 u64 max_reclaim; 3411 u64 max_reclaim;
3346 u64 reclaimed = 0; 3412 u64 reclaimed = 0;
3347 int pause = 1; 3413 long time_left;
3348 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3414 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3415 int loops = 0;
3416 unsigned long progress;
3349 3417
3350 block_rsv = &root->fs_info->delalloc_block_rsv; 3418 block_rsv = &root->fs_info->delalloc_block_rsv;
3351 space_info = block_rsv->space_info; 3419 space_info = block_rsv->space_info;
3352 3420
3353 smp_mb(); 3421 smp_mb();
3354 reserved = space_info->bytes_reserved; 3422 reserved = space_info->bytes_reserved;
3423 progress = space_info->reservation_progress;
3355 3424
3356 if (reserved == 0) 3425 if (reserved == 0)
3357 return 0; 3426 return 0;
3358 3427
3359 max_reclaim = min(reserved, to_reclaim); 3428 max_reclaim = min(reserved, to_reclaim);
3360 3429
3361 while (1) { 3430 while (loops < 1024) {
3362 /* have the flusher threads jump in and do some IO */ 3431 /* have the flusher threads jump in and do some IO */
3363 smp_mb(); 3432 smp_mb();
3364 nr_pages = min_t(unsigned long, nr_pages, 3433 nr_pages = min_t(unsigned long, nr_pages,
@@ -3371,17 +3440,31 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3371 reserved = space_info->bytes_reserved; 3440 reserved = space_info->bytes_reserved;
3372 spin_unlock(&space_info->lock); 3441 spin_unlock(&space_info->lock);
3373 3442
3443 loops++;
3444
3374 if (reserved == 0 || reclaimed >= max_reclaim) 3445 if (reserved == 0 || reclaimed >= max_reclaim)
3375 break; 3446 break;
3376 3447
3377 if (trans && trans->transaction->blocked) 3448 if (trans && trans->transaction->blocked)
3378 return -EAGAIN; 3449 return -EAGAIN;
3379 3450
3380 __set_current_state(TASK_INTERRUPTIBLE); 3451 time_left = schedule_timeout_interruptible(1);
3381 schedule_timeout(pause); 3452
3382 pause <<= 1; 3453 /* We were interrupted, exit */
3383 if (pause > HZ / 10) 3454 if (time_left)
3384 pause = HZ / 10; 3455 break;
3456
3457 /* we've kicked the IO a few times, if anything has been freed,
3458 * exit. There is no sense in looping here for a long time
3459 * when we really need to commit the transaction, or there are
3460 * just too many writers without enough free space
3461 */
3462
3463 if (loops > 3) {
3464 smp_mb();
3465 if (progress != space_info->reservation_progress)
3466 break;
3467 }
3385 3468
3386 } 3469 }
3387 return reclaimed >= to_reclaim; 3470 return reclaimed >= to_reclaim;
@@ -3588,10 +3671,23 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3588 3671
3589 if (num_bytes > 0) { 3672 if (num_bytes > 0) {
3590 if (dest) { 3673 if (dest) {
3591 block_rsv_add_bytes(dest, num_bytes, 0); 3674 spin_lock(&dest->lock);
3592 } else { 3675 if (!dest->full) {
3676 u64 bytes_to_add;
3677
3678 bytes_to_add = dest->size - dest->reserved;
3679 bytes_to_add = min(num_bytes, bytes_to_add);
3680 dest->reserved += bytes_to_add;
3681 if (dest->reserved >= dest->size)
3682 dest->full = 1;
3683 num_bytes -= bytes_to_add;
3684 }
3685 spin_unlock(&dest->lock);
3686 }
3687 if (num_bytes) {
3593 spin_lock(&space_info->lock); 3688 spin_lock(&space_info->lock);
3594 space_info->bytes_reserved -= num_bytes; 3689 space_info->bytes_reserved -= num_bytes;
3690 space_info->reservation_progress++;
3595 spin_unlock(&space_info->lock); 3691 spin_unlock(&space_info->lock);
3596 } 3692 }
3597 } 3693 }
@@ -3824,6 +3920,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3824 if (block_rsv->reserved >= block_rsv->size) { 3920 if (block_rsv->reserved >= block_rsv->size) {
3825 num_bytes = block_rsv->reserved - block_rsv->size; 3921 num_bytes = block_rsv->reserved - block_rsv->size;
3826 sinfo->bytes_reserved -= num_bytes; 3922 sinfo->bytes_reserved -= num_bytes;
3923 sinfo->reservation_progress++;
3827 block_rsv->reserved = block_rsv->size; 3924 block_rsv->reserved = block_rsv->size;
3828 block_rsv->full = 1; 3925 block_rsv->full = 1;
3829 } 3926 }
@@ -3968,6 +4065,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3968 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4065 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3969 u64 to_reserve; 4066 u64 to_reserve;
3970 int nr_extents; 4067 int nr_extents;
4068 int reserved_extents;
3971 int ret; 4069 int ret;
3972 4070
3973 if (btrfs_transaction_in_commit(root->fs_info)) 4071 if (btrfs_transaction_in_commit(root->fs_info))
@@ -3975,26 +4073,24 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3975 4073
3976 num_bytes = ALIGN(num_bytes, root->sectorsize); 4074 num_bytes = ALIGN(num_bytes, root->sectorsize);
3977 4075
3978 spin_lock(&BTRFS_I(inode)->accounting_lock);
3979 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; 4076 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3980 if (nr_extents > BTRFS_I(inode)->reserved_extents) { 4077 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
3981 nr_extents -= BTRFS_I(inode)->reserved_extents; 4078
4079 if (nr_extents > reserved_extents) {
4080 nr_extents -= reserved_extents;
3982 to_reserve = calc_trans_metadata_size(root, nr_extents); 4081 to_reserve = calc_trans_metadata_size(root, nr_extents);
3983 } else { 4082 } else {
3984 nr_extents = 0; 4083 nr_extents = 0;
3985 to_reserve = 0; 4084 to_reserve = 0;
3986 } 4085 }
3987 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3988 4086
3989 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4087 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3990 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); 4088 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
3991 if (ret) 4089 if (ret)
3992 return ret; 4090 return ret;
3993 4091
3994 spin_lock(&BTRFS_I(inode)->accounting_lock); 4092 atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents);
3995 BTRFS_I(inode)->reserved_extents += nr_extents;
3996 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 4093 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3997 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3998 4094
3999 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4095 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4000 4096
@@ -4009,19 +4105,30 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4009 struct btrfs_root *root = BTRFS_I(inode)->root; 4105 struct btrfs_root *root = BTRFS_I(inode)->root;
4010 u64 to_free; 4106 u64 to_free;
4011 int nr_extents; 4107 int nr_extents;
4108 int reserved_extents;
4012 4109
4013 num_bytes = ALIGN(num_bytes, root->sectorsize); 4110 num_bytes = ALIGN(num_bytes, root->sectorsize);
4014 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 4111 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
4112 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
4015 4113
4016 spin_lock(&BTRFS_I(inode)->accounting_lock); 4114 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
4017 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); 4115 do {
4018 if (nr_extents < BTRFS_I(inode)->reserved_extents) { 4116 int old, new;
4019 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents; 4117
4020 BTRFS_I(inode)->reserved_extents -= nr_extents; 4118 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
4021 } else { 4119 if (nr_extents >= reserved_extents) {
4022 nr_extents = 0; 4120 nr_extents = 0;
4023 } 4121 break;
4024 spin_unlock(&BTRFS_I(inode)->accounting_lock); 4122 }
4123 old = reserved_extents;
4124 nr_extents = reserved_extents - nr_extents;
4125 new = reserved_extents - nr_extents;
4126 old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents,
4127 reserved_extents, new);
4128 if (likely(old == reserved_extents))
4129 break;
4130 reserved_extents = old;
4131 } while (1);
4025 4132
4026 to_free = calc_csum_metadata_size(inode, num_bytes); 4133 to_free = calc_csum_metadata_size(inode, num_bytes);
4027 if (nr_extents > 0) 4134 if (nr_extents > 0)
@@ -4112,6 +4219,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4112 btrfs_set_block_group_used(&cache->item, old_val); 4219 btrfs_set_block_group_used(&cache->item, old_val);
4113 cache->reserved -= num_bytes; 4220 cache->reserved -= num_bytes;
4114 cache->space_info->bytes_reserved -= num_bytes; 4221 cache->space_info->bytes_reserved -= num_bytes;
4222 cache->space_info->reservation_progress++;
4115 cache->space_info->bytes_used += num_bytes; 4223 cache->space_info->bytes_used += num_bytes;
4116 cache->space_info->disk_used += num_bytes * factor; 4224 cache->space_info->disk_used += num_bytes * factor;
4117 spin_unlock(&cache->lock); 4225 spin_unlock(&cache->lock);
@@ -4163,6 +4271,7 @@ static int pin_down_extent(struct btrfs_root *root,
4163 if (reserved) { 4271 if (reserved) {
4164 cache->reserved -= num_bytes; 4272 cache->reserved -= num_bytes;
4165 cache->space_info->bytes_reserved -= num_bytes; 4273 cache->space_info->bytes_reserved -= num_bytes;
4274 cache->space_info->reservation_progress++;
4166 } 4275 }
4167 spin_unlock(&cache->lock); 4276 spin_unlock(&cache->lock);
4168 spin_unlock(&cache->space_info->lock); 4277 spin_unlock(&cache->space_info->lock);
@@ -4193,8 +4302,8 @@ int btrfs_pin_extent(struct btrfs_root *root,
4193 * update size of reserved extents. this function may return -EAGAIN 4302 * update size of reserved extents. this function may return -EAGAIN
4194 * if 'reserve' is true or 'sinfo' is false. 4303 * if 'reserve' is true or 'sinfo' is false.
4195 */ 4304 */
4196static int update_reserved_bytes(struct btrfs_block_group_cache *cache, 4305int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4197 u64 num_bytes, int reserve, int sinfo) 4306 u64 num_bytes, int reserve, int sinfo)
4198{ 4307{
4199 int ret = 0; 4308 int ret = 0;
4200 if (sinfo) { 4309 if (sinfo) {
@@ -4213,6 +4322,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
4213 space_info->bytes_readonly += num_bytes; 4322 space_info->bytes_readonly += num_bytes;
4214 cache->reserved -= num_bytes; 4323 cache->reserved -= num_bytes;
4215 space_info->bytes_reserved -= num_bytes; 4324 space_info->bytes_reserved -= num_bytes;
4325 space_info->reservation_progress++;
4216 } 4326 }
4217 spin_unlock(&cache->lock); 4327 spin_unlock(&cache->lock);
4218 spin_unlock(&space_info->lock); 4328 spin_unlock(&space_info->lock);
@@ -4332,7 +4442,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4332 if (ret) 4442 if (ret)
4333 break; 4443 break;
4334 4444
4335 ret = btrfs_discard_extent(root, start, end + 1 - start); 4445 if (btrfs_test_opt(root, DISCARD))
4446 ret = btrfs_discard_extent(root, start,
4447 end + 1 - start, NULL);
4336 4448
4337 clear_extent_dirty(unpin, start, end, GFP_NOFS); 4449 clear_extent_dirty(unpin, start, end, GFP_NOFS);
4338 unpin_extent_range(root, start, end); 4450 unpin_extent_range(root, start, end);
@@ -4673,10 +4785,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4673 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4785 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4674 4786
4675 btrfs_add_free_space(cache, buf->start, buf->len); 4787 btrfs_add_free_space(cache, buf->start, buf->len);
4676 ret = update_reserved_bytes(cache, buf->len, 0, 0); 4788 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0);
4677 if (ret == -EAGAIN) { 4789 if (ret == -EAGAIN) {
4678 /* block group became read-only */ 4790 /* block group became read-only */
4679 update_reserved_bytes(cache, buf->len, 0, 1); 4791 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4680 goto out; 4792 goto out;
4681 } 4793 }
4682 4794
@@ -4691,6 +4803,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4691 if (ret) { 4803 if (ret) {
4692 spin_lock(&cache->space_info->lock); 4804 spin_lock(&cache->space_info->lock);
4693 cache->space_info->bytes_reserved -= buf->len; 4805 cache->space_info->bytes_reserved -= buf->len;
4806 cache->space_info->reservation_progress++;
4694 spin_unlock(&cache->space_info->lock); 4807 spin_unlock(&cache->space_info->lock);
4695 } 4808 }
4696 goto out; 4809 goto out;
@@ -4712,6 +4825,11 @@ pin:
4712 } 4825 }
4713 } 4826 }
4714out: 4827out:
4828 /*
4829 * Deleting the buffer, clear the corrupt flag since it doesn't matter
4830 * anymore.
4831 */
4832 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
4715 btrfs_put_block_group(cache); 4833 btrfs_put_block_group(cache);
4716} 4834}
4717 4835
@@ -5159,7 +5277,7 @@ checks:
5159 search_start - offset); 5277 search_start - offset);
5160 BUG_ON(offset > search_start); 5278 BUG_ON(offset > search_start);
5161 5279
5162 ret = update_reserved_bytes(block_group, num_bytes, 1, 5280 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1,
5163 (data & BTRFS_BLOCK_GROUP_DATA)); 5281 (data & BTRFS_BLOCK_GROUP_DATA));
5164 if (ret == -EAGAIN) { 5282 if (ret == -EAGAIN) {
5165 btrfs_add_free_space(block_group, offset, num_bytes); 5283 btrfs_add_free_space(block_group, offset, num_bytes);
@@ -5250,11 +5368,13 @@ loop:
5250 5368
5251 if (allowed_chunk_alloc) { 5369 if (allowed_chunk_alloc) {
5252 ret = do_chunk_alloc(trans, root, num_bytes + 5370 ret = do_chunk_alloc(trans, root, num_bytes +
5253 2 * 1024 * 1024, data, 1); 5371 2 * 1024 * 1024, data,
5372 CHUNK_ALLOC_LIMITED);
5254 allowed_chunk_alloc = 0; 5373 allowed_chunk_alloc = 0;
5255 done_chunk_alloc = 1; 5374 done_chunk_alloc = 1;
5256 } else if (!done_chunk_alloc) { 5375 } else if (!done_chunk_alloc &&
5257 space_info->force_alloc = 1; 5376 space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) {
5377 space_info->force_alloc = CHUNK_ALLOC_LIMITED;
5258 } 5378 }
5259 5379
5260 if (loop < LOOP_NO_EMPTY_SIZE) { 5380 if (loop < LOOP_NO_EMPTY_SIZE) {
@@ -5340,7 +5460,8 @@ again:
5340 */ 5460 */
5341 if (empty_size || root->ref_cows) 5461 if (empty_size || root->ref_cows)
5342 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 5462 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5343 num_bytes + 2 * 1024 * 1024, data, 0); 5463 num_bytes + 2 * 1024 * 1024, data,
5464 CHUNK_ALLOC_NO_FORCE);
5344 5465
5345 WARN_ON(num_bytes < root->sectorsize); 5466 WARN_ON(num_bytes < root->sectorsize);
5346 ret = find_free_extent(trans, root, num_bytes, empty_size, 5467 ret = find_free_extent(trans, root, num_bytes, empty_size,
@@ -5352,10 +5473,10 @@ again:
5352 num_bytes = num_bytes & ~(root->sectorsize - 1); 5473 num_bytes = num_bytes & ~(root->sectorsize - 1);
5353 num_bytes = max(num_bytes, min_alloc_size); 5474 num_bytes = max(num_bytes, min_alloc_size);
5354 do_chunk_alloc(trans, root->fs_info->extent_root, 5475 do_chunk_alloc(trans, root->fs_info->extent_root,
5355 num_bytes, data, 1); 5476 num_bytes, data, CHUNK_ALLOC_FORCE);
5356 goto again; 5477 goto again;
5357 } 5478 }
5358 if (ret == -ENOSPC) { 5479 if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
5359 struct btrfs_space_info *sinfo; 5480 struct btrfs_space_info *sinfo;
5360 5481
5361 sinfo = __find_space_info(root->fs_info, data); 5482 sinfo = __find_space_info(root->fs_info, data);
@@ -5365,6 +5486,8 @@ again:
5365 dump_space_info(sinfo, num_bytes, 1); 5486 dump_space_info(sinfo, num_bytes, 1);
5366 } 5487 }
5367 5488
5489 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
5490
5368 return ret; 5491 return ret;
5369} 5492}
5370 5493
@@ -5380,12 +5503,15 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5380 return -ENOSPC; 5503 return -ENOSPC;
5381 } 5504 }
5382 5505
5383 ret = btrfs_discard_extent(root, start, len); 5506 if (btrfs_test_opt(root, DISCARD))
5507 ret = btrfs_discard_extent(root, start, len, NULL);
5384 5508
5385 btrfs_add_free_space(cache, start, len); 5509 btrfs_add_free_space(cache, start, len);
5386 update_reserved_bytes(cache, len, 0, 1); 5510 btrfs_update_reserved_bytes(cache, len, 0, 1);
5387 btrfs_put_block_group(cache); 5511 btrfs_put_block_group(cache);
5388 5512
5513 trace_btrfs_reserved_extent_free(root, start, len);
5514
5389 return ret; 5515 return ret;
5390} 5516}
5391 5517
@@ -5412,7 +5538,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5412 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 5538 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
5413 5539
5414 path = btrfs_alloc_path(); 5540 path = btrfs_alloc_path();
5415 BUG_ON(!path); 5541 if (!path)
5542 return -ENOMEM;
5416 5543
5417 path->leave_spinning = 1; 5544 path->leave_spinning = 1;
5418 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 5545 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
@@ -5582,7 +5709,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5582 put_caching_control(caching_ctl); 5709 put_caching_control(caching_ctl);
5583 } 5710 }
5584 5711
5585 ret = update_reserved_bytes(block_group, ins->offset, 1, 1); 5712 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1);
5586 BUG_ON(ret); 5713 BUG_ON(ret);
5587 btrfs_put_block_group(block_group); 5714 btrfs_put_block_group(block_group);
5588 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5715 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5633,6 +5760,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5633 struct btrfs_root *root, u32 blocksize) 5760 struct btrfs_root *root, u32 blocksize)
5634{ 5761{
5635 struct btrfs_block_rsv *block_rsv; 5762 struct btrfs_block_rsv *block_rsv;
5763 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5636 int ret; 5764 int ret;
5637 5765
5638 block_rsv = get_block_rsv(trans, root); 5766 block_rsv = get_block_rsv(trans, root);
@@ -5640,14 +5768,39 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5640 if (block_rsv->size == 0) { 5768 if (block_rsv->size == 0) {
5641 ret = reserve_metadata_bytes(trans, root, block_rsv, 5769 ret = reserve_metadata_bytes(trans, root, block_rsv,
5642 blocksize, 0); 5770 blocksize, 0);
5643 if (ret) 5771 /*
5772 * If we couldn't reserve metadata bytes try and use some from
5773 * the global reserve.
5774 */
5775 if (ret && block_rsv != global_rsv) {
5776 ret = block_rsv_use_bytes(global_rsv, blocksize);
5777 if (!ret)
5778 return global_rsv;
5644 return ERR_PTR(ret); 5779 return ERR_PTR(ret);
5780 } else if (ret) {
5781 return ERR_PTR(ret);
5782 }
5645 return block_rsv; 5783 return block_rsv;
5646 } 5784 }
5647 5785
5648 ret = block_rsv_use_bytes(block_rsv, blocksize); 5786 ret = block_rsv_use_bytes(block_rsv, blocksize);
5649 if (!ret) 5787 if (!ret)
5650 return block_rsv; 5788 return block_rsv;
5789 if (ret) {
5790 WARN_ON(1);
5791 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
5792 0);
5793 if (!ret) {
5794 spin_lock(&block_rsv->lock);
5795 block_rsv->size += blocksize;
5796 spin_unlock(&block_rsv->lock);
5797 return block_rsv;
5798 } else if (ret && block_rsv != global_rsv) {
5799 ret = block_rsv_use_bytes(global_rsv, blocksize);
5800 if (!ret)
5801 return global_rsv;
5802 }
5803 }
5651 5804
5652 return ERR_PTR(-ENOSPC); 5805 return ERR_PTR(-ENOSPC);
5653} 5806}
@@ -5989,6 +6142,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5989 if (reada && level == 1) 6142 if (reada && level == 1)
5990 reada_walk_down(trans, root, wc, path); 6143 reada_walk_down(trans, root, wc, path);
5991 next = read_tree_block(root, bytenr, blocksize, generation); 6144 next = read_tree_block(root, bytenr, blocksize, generation);
6145 if (!next)
6146 return -EIO;
5992 btrfs_tree_lock(next); 6147 btrfs_tree_lock(next);
5993 btrfs_set_lock_blocking(next); 6148 btrfs_set_lock_blocking(next);
5994 } 6149 }
@@ -6221,6 +6376,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6221 BUG_ON(!wc); 6376 BUG_ON(!wc);
6222 6377
6223 trans = btrfs_start_transaction(tree_root, 0); 6378 trans = btrfs_start_transaction(tree_root, 0);
6379 BUG_ON(IS_ERR(trans));
6380
6224 if (block_rsv) 6381 if (block_rsv)
6225 trans->block_rsv = block_rsv; 6382 trans->block_rsv = block_rsv;
6226 6383
@@ -6318,6 +6475,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6318 6475
6319 btrfs_end_transaction_throttle(trans, tree_root); 6476 btrfs_end_transaction_throttle(trans, tree_root);
6320 trans = btrfs_start_transaction(tree_root, 0); 6477 trans = btrfs_start_transaction(tree_root, 0);
6478 BUG_ON(IS_ERR(trans));
6321 if (block_rsv) 6479 if (block_rsv)
6322 trans->block_rsv = block_rsv; 6480 trans->block_rsv = block_rsv;
6323 } 6481 }
@@ -6377,10 +6535,14 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6377 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 6535 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6378 6536
6379 path = btrfs_alloc_path(); 6537 path = btrfs_alloc_path();
6380 BUG_ON(!path); 6538 if (!path)
6539 return -ENOMEM;
6381 6540
6382 wc = kzalloc(sizeof(*wc), GFP_NOFS); 6541 wc = kzalloc(sizeof(*wc), GFP_NOFS);
6383 BUG_ON(!wc); 6542 if (!wc) {
6543 btrfs_free_path(path);
6544 return -ENOMEM;
6545 }
6384 6546
6385 btrfs_assert_tree_locked(parent); 6547 btrfs_assert_tree_locked(parent);
6386 parent_level = btrfs_header_level(parent); 6548 parent_level = btrfs_header_level(parent);
@@ -6446,6 +6608,8 @@ static noinline int relocate_inode_pages(struct inode *inode, u64 start,
6446 int ret = 0; 6608 int ret = 0;
6447 6609
6448 ra = kzalloc(sizeof(*ra), GFP_NOFS); 6610 ra = kzalloc(sizeof(*ra), GFP_NOFS);
6611 if (!ra)
6612 return -ENOMEM;
6449 6613
6450 mutex_lock(&inode->i_mutex); 6614 mutex_lock(&inode->i_mutex);
6451 first_index = start >> PAGE_CACHE_SHIFT; 6615 first_index = start >> PAGE_CACHE_SHIFT;
@@ -6531,7 +6695,7 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
6531 u64 end = start + extent_key->offset - 1; 6695 u64 end = start + extent_key->offset - 1;
6532 6696
6533 em = alloc_extent_map(GFP_NOFS); 6697 em = alloc_extent_map(GFP_NOFS);
6534 BUG_ON(!em || IS_ERR(em)); 6698 BUG_ON(!em);
6535 6699
6536 em->start = start; 6700 em->start = start;
6537 em->len = extent_key->offset; 6701 em->len = extent_key->offset;
@@ -6836,7 +7000,11 @@ static noinline int get_new_locations(struct inode *reloc_inode,
6836 } 7000 }
6837 7001
6838 path = btrfs_alloc_path(); 7002 path = btrfs_alloc_path();
6839 BUG_ON(!path); 7003 if (!path) {
7004 if (exts != *extents)
7005 kfree(exts);
7006 return -ENOMEM;
7007 }
6840 7008
6841 cur_pos = extent_key->objectid - offset; 7009 cur_pos = extent_key->objectid - offset;
6842 last_byte = extent_key->objectid + extent_key->offset; 7010 last_byte = extent_key->objectid + extent_key->offset;
@@ -6878,6 +7046,10 @@ static noinline int get_new_locations(struct inode *reloc_inode,
6878 struct disk_extent *old = exts; 7046 struct disk_extent *old = exts;
6879 max *= 2; 7047 max *= 2;
6880 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); 7048 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
7049 if (!exts) {
7050 ret = -ENOMEM;
7051 goto out;
7052 }
6881 memcpy(exts, old, sizeof(*exts) * nr); 7053 memcpy(exts, old, sizeof(*exts) * nr);
6882 if (old != *extents) 7054 if (old != *extents)
6883 kfree(old); 7055 kfree(old);
@@ -7360,7 +7532,8 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
7360 int ret; 7532 int ret;
7361 7533
7362 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); 7534 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
7363 BUG_ON(!new_extent); 7535 if (!new_extent)
7536 return -ENOMEM;
7364 7537
7365 ref = btrfs_lookup_leaf_ref(root, leaf->start); 7538 ref = btrfs_lookup_leaf_ref(root, leaf->start);
7366 BUG_ON(!ref); 7539 BUG_ON(!ref);
@@ -7477,7 +7650,7 @@ int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
7477 BUG_ON(reloc_root->commit_root != NULL); 7650 BUG_ON(reloc_root->commit_root != NULL);
7478 while (1) { 7651 while (1) {
7479 trans = btrfs_join_transaction(root, 1); 7652 trans = btrfs_join_transaction(root, 1);
7480 BUG_ON(!trans); 7653 BUG_ON(IS_ERR(trans));
7481 7654
7482 mutex_lock(&root->fs_info->drop_mutex); 7655 mutex_lock(&root->fs_info->drop_mutex);
7483 ret = btrfs_drop_snapshot(trans, reloc_root); 7656 ret = btrfs_drop_snapshot(trans, reloc_root);
@@ -7535,7 +7708,7 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
7535 7708
7536 if (found) { 7709 if (found) {
7537 trans = btrfs_start_transaction(root, 1); 7710 trans = btrfs_start_transaction(root, 1);
7538 BUG_ON(!trans); 7711 BUG_ON(IS_ERR(trans));
7539 ret = btrfs_commit_transaction(trans, root); 7712 ret = btrfs_commit_transaction(trans, root);
7540 BUG_ON(ret); 7713 BUG_ON(ret);
7541 } 7714 }
@@ -7546,7 +7719,8 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
7546 7719
7547 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 7720 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
7548 BUG_ON(!reloc_root); 7721 BUG_ON(!reloc_root);
7549 btrfs_orphan_cleanup(reloc_root); 7722 ret = btrfs_orphan_cleanup(reloc_root);
7723 BUG_ON(ret);
7550 return 0; 7724 return 0;
7551} 7725}
7552 7726
@@ -7564,7 +7738,8 @@ static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
7564 return 0; 7738 return 0;
7565 7739
7566 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 7740 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
7567 BUG_ON(!root_item); 7741 if (!root_item)
7742 return -ENOMEM;
7568 7743
7569 ret = btrfs_copy_root(trans, root, root->commit_root, 7744 ret = btrfs_copy_root(trans, root, root->commit_root,
7570 &eb, BTRFS_TREE_RELOC_OBJECTID); 7745 &eb, BTRFS_TREE_RELOC_OBJECTID);
@@ -7590,7 +7765,7 @@ static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
7590 7765
7591 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, 7766 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
7592 &root_key); 7767 &root_key);
7593 BUG_ON(!reloc_root); 7768 BUG_ON(IS_ERR(reloc_root));
7594 reloc_root->last_trans = trans->transid; 7769 reloc_root->last_trans = trans->transid;
7595 reloc_root->commit_root = NULL; 7770 reloc_root->commit_root = NULL;
7596 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; 7771 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
@@ -7779,7 +7954,7 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
7779 7954
7780 7955
7781 trans = btrfs_start_transaction(extent_root, 1); 7956 trans = btrfs_start_transaction(extent_root, 1);
7782 BUG_ON(!trans); 7957 BUG_ON(IS_ERR(trans));
7783 7958
7784 if (extent_key->objectid == 0) { 7959 if (extent_key->objectid == 0) {
7785 ret = del_extent_zero(trans, extent_root, path, extent_key); 7960 ret = del_extent_zero(trans, extent_root, path, extent_key);
@@ -7843,6 +8018,10 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
7843 8018
7844 eb = read_tree_block(found_root, block_start, 8019 eb = read_tree_block(found_root, block_start,
7845 block_size, 0); 8020 block_size, 0);
8021 if (!eb) {
8022 ret = -EIO;
8023 goto out;
8024 }
7846 btrfs_tree_lock(eb); 8025 btrfs_tree_lock(eb);
7847 BUG_ON(level != btrfs_header_level(eb)); 8026 BUG_ON(level != btrfs_header_level(eb));
7848 8027
@@ -7998,13 +8177,15 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
7998 8177
7999 alloc_flags = update_block_group_flags(root, cache->flags); 8178 alloc_flags = update_block_group_flags(root, cache->flags);
8000 if (alloc_flags != cache->flags) 8179 if (alloc_flags != cache->flags)
8001 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); 8180 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
8181 CHUNK_ALLOC_FORCE);
8002 8182
8003 ret = set_block_group_ro(cache); 8183 ret = set_block_group_ro(cache);
8004 if (!ret) 8184 if (!ret)
8005 goto out; 8185 goto out;
8006 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 8186 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
8007 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); 8187 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
8188 CHUNK_ALLOC_FORCE);
8008 if (ret < 0) 8189 if (ret < 0)
8009 goto out; 8190 goto out;
8010 ret = set_block_group_ro(cache); 8191 ret = set_block_group_ro(cache);
@@ -8013,6 +8194,14 @@ out:
8013 return ret; 8194 return ret;
8014} 8195}
8015 8196
8197int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
8198 struct btrfs_root *root, u64 type)
8199{
8200 u64 alloc_flags = get_alloc_profile(root, type);
8201 return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
8202 CHUNK_ALLOC_FORCE);
8203}
8204
8016/* 8205/*
8017 * helper to account the unused space of all the readonly block group in the 8206 * helper to account the unused space of all the readonly block group in the
8018 * list. takes mirrors into account. 8207 * list. takes mirrors into account.
@@ -8270,6 +8459,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8270 if (block_group->cached == BTRFS_CACHE_STARTED) 8459 if (block_group->cached == BTRFS_CACHE_STARTED)
8271 wait_block_group_cache_done(block_group); 8460 wait_block_group_cache_done(block_group);
8272 8461
8462 /*
8463 * We haven't cached this block group, which means we could
8464 * possibly have excluded extents on this block group.
8465 */
8466 if (block_group->cached == BTRFS_CACHE_NO)
8467 free_excluded_extents(info->extent_root, block_group);
8468
8273 btrfs_remove_free_space_cache(block_group); 8469 btrfs_remove_free_space_cache(block_group);
8274 btrfs_put_block_group(block_group); 8470 btrfs_put_block_group(block_group);
8275 8471
@@ -8385,6 +8581,13 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8385 cache->sectorsize = root->sectorsize; 8581 cache->sectorsize = root->sectorsize;
8386 8582
8387 /* 8583 /*
8584 * We need to exclude the super stripes now so that the space
8585 * info has super bytes accounted for, otherwise we'll think
8586 * we have more space than we actually do.
8587 */
8588 exclude_super_stripes(root, cache);
8589
8590 /*
8388 * check for two cases, either we are full, and therefore 8591 * check for two cases, either we are full, and therefore
8389 * don't need to bother with the caching work since we won't 8592 * don't need to bother with the caching work since we won't
8390 * find any space, or we are empty, and we can just add all 8593 * find any space, or we are empty, and we can just add all
@@ -8392,12 +8595,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8392 * time, particularly in the full case. 8595 * time, particularly in the full case.
8393 */ 8596 */
8394 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 8597 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
8395 exclude_super_stripes(root, cache);
8396 cache->last_byte_to_unpin = (u64)-1; 8598 cache->last_byte_to_unpin = (u64)-1;
8397 cache->cached = BTRFS_CACHE_FINISHED; 8599 cache->cached = BTRFS_CACHE_FINISHED;
8398 free_excluded_extents(root, cache); 8600 free_excluded_extents(root, cache);
8399 } else if (btrfs_block_group_used(&cache->item) == 0) { 8601 } else if (btrfs_block_group_used(&cache->item) == 0) {
8400 exclude_super_stripes(root, cache);
8401 cache->last_byte_to_unpin = (u64)-1; 8602 cache->last_byte_to_unpin = (u64)-1;
8402 cache->cached = BTRFS_CACHE_FINISHED; 8603 cache->cached = BTRFS_CACHE_FINISHED;
8403 add_new_free_space(cache, root->fs_info, 8604 add_new_free_space(cache, root->fs_info,
@@ -8539,6 +8740,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8539 BUG_ON(!block_group); 8740 BUG_ON(!block_group);
8540 BUG_ON(!block_group->ro); 8741 BUG_ON(!block_group->ro);
8541 8742
8743 /*
8744 * Free the reserved super bytes from this block group before
8745 * remove it.
8746 */
8747 free_excluded_extents(root, block_group);
8748
8542 memcpy(&key, &block_group->key, sizeof(key)); 8749 memcpy(&key, &block_group->key, sizeof(key));
8543 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 8750 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
8544 BTRFS_BLOCK_GROUP_RAID1 | 8751 BTRFS_BLOCK_GROUP_RAID1 |
@@ -8642,13 +8849,84 @@ out:
8642 return ret; 8849 return ret;
8643} 8850}
8644 8851
8852int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
8853{
8854 struct btrfs_space_info *space_info;
8855 int ret;
8856
8857 ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM, 0, 0,
8858 &space_info);
8859 if (ret)
8860 return ret;
8861
8862 ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA, 0, 0,
8863 &space_info);
8864 if (ret)
8865 return ret;
8866
8867 ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA, 0, 0,
8868 &space_info);
8869 if (ret)
8870 return ret;
8871
8872 return ret;
8873}
8874
8645int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 8875int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8646{ 8876{
8647 return unpin_extent_range(root, start, end); 8877 return unpin_extent_range(root, start, end);
8648} 8878}
8649 8879
8650int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, 8880int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8651 u64 num_bytes) 8881 u64 num_bytes, u64 *actual_bytes)
8652{ 8882{
8653 return btrfs_discard_extent(root, bytenr, num_bytes); 8883 return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
8884}
8885
8886int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8887{
8888 struct btrfs_fs_info *fs_info = root->fs_info;
8889 struct btrfs_block_group_cache *cache = NULL;
8890 u64 group_trimmed;
8891 u64 start;
8892 u64 end;
8893 u64 trimmed = 0;
8894 int ret = 0;
8895
8896 cache = btrfs_lookup_block_group(fs_info, range->start);
8897
8898 while (cache) {
8899 if (cache->key.objectid >= (range->start + range->len)) {
8900 btrfs_put_block_group(cache);
8901 break;
8902 }
8903
8904 start = max(range->start, cache->key.objectid);
8905 end = min(range->start + range->len,
8906 cache->key.objectid + cache->key.offset);
8907
8908 if (end - start >= range->minlen) {
8909 if (!block_group_cache_done(cache)) {
8910 ret = cache_block_group(cache, NULL, root, 0);
8911 if (!ret)
8912 wait_block_group_cache_done(cache);
8913 }
8914 ret = btrfs_trim_block_group(cache,
8915 &group_trimmed,
8916 start,
8917 end,
8918 range->minlen);
8919
8920 trimmed += group_trimmed;
8921 if (ret) {
8922 btrfs_put_block_group(cache);
8923 break;
8924 }
8925 }
8926
8927 cache = next_block_group(fs_info->tree_root, cache);
8928 }
8929
8930 range->len = trimmed;
8931 return ret;
8654} 8932}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2e993cf1766e..315138605088 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -690,6 +690,15 @@ static void cache_state(struct extent_state *state,
690 } 690 }
691} 691}
692 692
693static void uncache_state(struct extent_state **cached_ptr)
694{
695 if (cached_ptr && (*cached_ptr)) {
696 struct extent_state *state = *cached_ptr;
697 *cached_ptr = NULL;
698 free_extent_state(state);
699 }
700}
701
693/* 702/*
694 * set some bits on a range in the tree. This may require allocations or 703 * set some bits on a range in the tree. This may require allocations or
695 * sleeping, so the gfp mask is used to indicate what is allowed. 704 * sleeping, so the gfp mask is used to indicate what is allowed.
@@ -940,10 +949,10 @@ static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
940} 949}
941 950
942int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 951int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
943 gfp_t mask) 952 struct extent_state **cached_state, gfp_t mask)
944{ 953{
945 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, 954 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
946 NULL, mask); 955 NULL, cached_state, mask);
947} 956}
948 957
949static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 958static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -1012,8 +1021,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1012 mask); 1021 mask);
1013} 1022}
1014 1023
1015int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, 1024int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1016 gfp_t mask)
1017{ 1025{
1018 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, 1026 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1019 mask); 1027 mask);
@@ -1433,12 +1441,13 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1433 */ 1441 */
1434u64 count_range_bits(struct extent_io_tree *tree, 1442u64 count_range_bits(struct extent_io_tree *tree,
1435 u64 *start, u64 search_end, u64 max_bytes, 1443 u64 *start, u64 search_end, u64 max_bytes,
1436 unsigned long bits) 1444 unsigned long bits, int contig)
1437{ 1445{
1438 struct rb_node *node; 1446 struct rb_node *node;
1439 struct extent_state *state; 1447 struct extent_state *state;
1440 u64 cur_start = *start; 1448 u64 cur_start = *start;
1441 u64 total_bytes = 0; 1449 u64 total_bytes = 0;
1450 u64 last = 0;
1442 int found = 0; 1451 int found = 0;
1443 1452
1444 if (search_end <= cur_start) { 1453 if (search_end <= cur_start) {
@@ -1463,7 +1472,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
1463 state = rb_entry(node, struct extent_state, rb_node); 1472 state = rb_entry(node, struct extent_state, rb_node);
1464 if (state->start > search_end) 1473 if (state->start > search_end)
1465 break; 1474 break;
1466 if (state->end >= cur_start && (state->state & bits)) { 1475 if (contig && found && state->start > last + 1)
1476 break;
1477 if (state->end >= cur_start && (state->state & bits) == bits) {
1467 total_bytes += min(search_end, state->end) + 1 - 1478 total_bytes += min(search_end, state->end) + 1 -
1468 max(cur_start, state->start); 1479 max(cur_start, state->start);
1469 if (total_bytes >= max_bytes) 1480 if (total_bytes >= max_bytes)
@@ -1472,6 +1483,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
1472 *start = state->start; 1483 *start = state->start;
1473 found = 1; 1484 found = 1;
1474 } 1485 }
1486 last = state->end;
1487 } else if (contig && found) {
1488 break;
1475 } 1489 }
1476 node = rb_next(node); 1490 node = rb_next(node);
1477 if (!node) 1491 if (!node)
@@ -1729,6 +1743,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1729 1743
1730 do { 1744 do {
1731 struct page *page = bvec->bv_page; 1745 struct page *page = bvec->bv_page;
1746 struct extent_state *cached = NULL;
1747 struct extent_state *state;
1748
1732 tree = &BTRFS_I(page->mapping->host)->io_tree; 1749 tree = &BTRFS_I(page->mapping->host)->io_tree;
1733 1750
1734 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1751 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1743,9 +1760,20 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1743 if (++bvec <= bvec_end) 1760 if (++bvec <= bvec_end)
1744 prefetchw(&bvec->bv_page->flags); 1761 prefetchw(&bvec->bv_page->flags);
1745 1762
1763 spin_lock(&tree->lock);
1764 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
1765 if (state && state->start == start) {
1766 /*
1767 * take a reference on the state, unlock will drop
1768 * the ref
1769 */
1770 cache_state(state, &cached);
1771 }
1772 spin_unlock(&tree->lock);
1773
1746 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 1774 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1747 ret = tree->ops->readpage_end_io_hook(page, start, end, 1775 ret = tree->ops->readpage_end_io_hook(page, start, end,
1748 NULL); 1776 state);
1749 if (ret) 1777 if (ret)
1750 uptodate = 0; 1778 uptodate = 0;
1751 } 1779 }
@@ -1758,15 +1786,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1758 test_bit(BIO_UPTODATE, &bio->bi_flags); 1786 test_bit(BIO_UPTODATE, &bio->bi_flags);
1759 if (err) 1787 if (err)
1760 uptodate = 0; 1788 uptodate = 0;
1789 uncache_state(&cached);
1761 continue; 1790 continue;
1762 } 1791 }
1763 } 1792 }
1764 1793
1765 if (uptodate) { 1794 if (uptodate) {
1766 set_extent_uptodate(tree, start, end, 1795 set_extent_uptodate(tree, start, end, &cached,
1767 GFP_ATOMIC); 1796 GFP_ATOMIC);
1768 } 1797 }
1769 unlock_extent(tree, start, end, GFP_ATOMIC); 1798 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
1770 1799
1771 if (whole_page) { 1800 if (whole_page) {
1772 if (uptodate) { 1801 if (uptodate) {
@@ -1805,6 +1834,7 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
1805 1834
1806 do { 1835 do {
1807 struct page *page = bvec->bv_page; 1836 struct page *page = bvec->bv_page;
1837 struct extent_state *cached = NULL;
1808 tree = &BTRFS_I(page->mapping->host)->io_tree; 1838 tree = &BTRFS_I(page->mapping->host)->io_tree;
1809 1839
1810 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1840 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1815,13 +1845,14 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
1815 prefetchw(&bvec->bv_page->flags); 1845 prefetchw(&bvec->bv_page->flags);
1816 1846
1817 if (uptodate) { 1847 if (uptodate) {
1818 set_extent_uptodate(tree, start, end, GFP_ATOMIC); 1848 set_extent_uptodate(tree, start, end, &cached,
1849 GFP_ATOMIC);
1819 } else { 1850 } else {
1820 ClearPageUptodate(page); 1851 ClearPageUptodate(page);
1821 SetPageError(page); 1852 SetPageError(page);
1822 } 1853 }
1823 1854
1824 unlock_extent(tree, start, end, GFP_ATOMIC); 1855 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
1825 1856
1826 } while (bvec >= bio->bi_io_vec); 1857 } while (bvec >= bio->bi_io_vec);
1827 1858
@@ -1865,7 +1896,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1865 bio_get(bio); 1896 bio_get(bio);
1866 1897
1867 if (tree->ops && tree->ops->submit_bio_hook) 1898 if (tree->ops && tree->ops->submit_bio_hook)
1868 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1899 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1869 mirror_num, bio_flags, start); 1900 mirror_num, bio_flags, start);
1870 else 1901 else
1871 submit_bio(rw, bio); 1902 submit_bio(rw, bio);
@@ -1920,6 +1951,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
1920 nr = bio_get_nr_vecs(bdev); 1951 nr = bio_get_nr_vecs(bdev);
1921 1952
1922 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1953 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1954 if (!bio)
1955 return -ENOMEM;
1923 1956
1924 bio_add_page(bio, page, page_size, offset); 1957 bio_add_page(bio, page, page_size, offset);
1925 bio->bi_end_io = end_io_func; 1958 bio->bi_end_io = end_io_func;
@@ -1944,6 +1977,7 @@ void set_page_extent_mapped(struct page *page)
1944 1977
1945static void set_page_extent_head(struct page *page, unsigned long len) 1978static void set_page_extent_head(struct page *page, unsigned long len)
1946{ 1979{
1980 WARN_ON(!PagePrivate(page));
1947 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); 1981 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1948} 1982}
1949 1983
@@ -2007,14 +2041,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2007 while (cur <= end) { 2041 while (cur <= end) {
2008 if (cur >= last_byte) { 2042 if (cur >= last_byte) {
2009 char *userpage; 2043 char *userpage;
2044 struct extent_state *cached = NULL;
2045
2010 iosize = PAGE_CACHE_SIZE - page_offset; 2046 iosize = PAGE_CACHE_SIZE - page_offset;
2011 userpage = kmap_atomic(page, KM_USER0); 2047 userpage = kmap_atomic(page, KM_USER0);
2012 memset(userpage + page_offset, 0, iosize); 2048 memset(userpage + page_offset, 0, iosize);
2013 flush_dcache_page(page); 2049 flush_dcache_page(page);
2014 kunmap_atomic(userpage, KM_USER0); 2050 kunmap_atomic(userpage, KM_USER0);
2015 set_extent_uptodate(tree, cur, cur + iosize - 1, 2051 set_extent_uptodate(tree, cur, cur + iosize - 1,
2016 GFP_NOFS); 2052 &cached, GFP_NOFS);
2017 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2053 unlock_extent_cached(tree, cur, cur + iosize - 1,
2054 &cached, GFP_NOFS);
2018 break; 2055 break;
2019 } 2056 }
2020 em = get_extent(inode, page, page_offset, cur, 2057 em = get_extent(inode, page, page_offset, cur,
@@ -2054,14 +2091,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2054 /* we've found a hole, just zero and go on */ 2091 /* we've found a hole, just zero and go on */
2055 if (block_start == EXTENT_MAP_HOLE) { 2092 if (block_start == EXTENT_MAP_HOLE) {
2056 char *userpage; 2093 char *userpage;
2094 struct extent_state *cached = NULL;
2095
2057 userpage = kmap_atomic(page, KM_USER0); 2096 userpage = kmap_atomic(page, KM_USER0);
2058 memset(userpage + page_offset, 0, iosize); 2097 memset(userpage + page_offset, 0, iosize);
2059 flush_dcache_page(page); 2098 flush_dcache_page(page);
2060 kunmap_atomic(userpage, KM_USER0); 2099 kunmap_atomic(userpage, KM_USER0);
2061 2100
2062 set_extent_uptodate(tree, cur, cur + iosize - 1, 2101 set_extent_uptodate(tree, cur, cur + iosize - 1,
2063 GFP_NOFS); 2102 &cached, GFP_NOFS);
2064 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2103 unlock_extent_cached(tree, cur, cur + iosize - 1,
2104 &cached, GFP_NOFS);
2065 cur = cur + iosize; 2105 cur = cur + iosize;
2066 page_offset += iosize; 2106 page_offset += iosize;
2067 continue; 2107 continue;
@@ -2126,7 +2166,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2126 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2166 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2127 &bio_flags); 2167 &bio_flags);
2128 if (bio) 2168 if (bio)
2129 submit_one_bio(READ, bio, 0, bio_flags); 2169 ret = submit_one_bio(READ, bio, 0, bio_flags);
2130 return ret; 2170 return ret;
2131} 2171}
2132 2172
@@ -2179,10 +2219,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2179 unsigned long nr_written = 0; 2219 unsigned long nr_written = 0;
2180 2220
2181 if (wbc->sync_mode == WB_SYNC_ALL) 2221 if (wbc->sync_mode == WB_SYNC_ALL)
2182 write_flags = WRITE_SYNC_PLUG; 2222 write_flags = WRITE_SYNC;
2183 else 2223 else
2184 write_flags = WRITE; 2224 write_flags = WRITE;
2185 2225
2226 trace___extent_writepage(page, inode, wbc);
2227
2186 WARN_ON(!PageLocked(page)); 2228 WARN_ON(!PageLocked(page));
2187 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2229 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2188 if (page->index > end_index || 2230 if (page->index > end_index ||
@@ -2778,9 +2820,12 @@ int extent_prepare_write(struct extent_io_tree *tree,
2778 iocount++; 2820 iocount++;
2779 block_start = block_start + iosize; 2821 block_start = block_start + iosize;
2780 } else { 2822 } else {
2781 set_extent_uptodate(tree, block_start, cur_end, 2823 struct extent_state *cached = NULL;
2824
2825 set_extent_uptodate(tree, block_start, cur_end, &cached,
2782 GFP_NOFS); 2826 GFP_NOFS);
2783 unlock_extent(tree, block_start, cur_end, GFP_NOFS); 2827 unlock_extent_cached(tree, block_start, cur_end,
2828 &cached, GFP_NOFS);
2784 block_start = cur_end + 1; 2829 block_start = cur_end + 1;
2785 } 2830 }
2786 page_offset = block_start & (PAGE_CACHE_SIZE - 1); 2831 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
@@ -2819,9 +2864,17 @@ int try_release_extent_state(struct extent_map_tree *map,
2819 * at this point we can safely clear everything except the 2864 * at this point we can safely clear everything except the
2820 * locked bit and the nodatasum bit 2865 * locked bit and the nodatasum bit
2821 */ 2866 */
2822 clear_extent_bit(tree, start, end, 2867 ret = clear_extent_bit(tree, start, end,
2823 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 2868 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2824 0, 0, NULL, mask); 2869 0, 0, NULL, mask);
2870
2871 /* if clear_extent_bit failed for enomem reasons,
2872 * we can't allow the release to continue.
2873 */
2874 if (ret < 0)
2875 ret = 0;
2876 else
2877 ret = 1;
2825 } 2878 }
2826 return ret; 2879 return ret;
2827} 2880}
@@ -2901,6 +2954,46 @@ out:
2901 return sector; 2954 return sector;
2902} 2955}
2903 2956
2957/*
2958 * helper function for fiemap, which doesn't want to see any holes.
2959 * This maps until we find something past 'last'
2960 */
2961static struct extent_map *get_extent_skip_holes(struct inode *inode,
2962 u64 offset,
2963 u64 last,
2964 get_extent_t *get_extent)
2965{
2966 u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
2967 struct extent_map *em;
2968 u64 len;
2969
2970 if (offset >= last)
2971 return NULL;
2972
2973 while(1) {
2974 len = last - offset;
2975 if (len == 0)
2976 break;
2977 len = (len + sectorsize - 1) & ~(sectorsize - 1);
2978 em = get_extent(inode, NULL, 0, offset, len, 0);
2979 if (!em || IS_ERR(em))
2980 return em;
2981
2982 /* if this isn't a hole return it */
2983 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
2984 em->block_start != EXTENT_MAP_HOLE) {
2985 return em;
2986 }
2987
2988 /* this is a hole, advance to the next extent */
2989 offset = extent_map_end(em);
2990 free_extent_map(em);
2991 if (offset >= last)
2992 break;
2993 }
2994 return NULL;
2995}
2996
2904int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2997int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2905 __u64 start, __u64 len, get_extent_t *get_extent) 2998 __u64 start, __u64 len, get_extent_t *get_extent)
2906{ 2999{
@@ -2910,16 +3003,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2910 u32 flags = 0; 3003 u32 flags = 0;
2911 u32 found_type; 3004 u32 found_type;
2912 u64 last; 3005 u64 last;
3006 u64 last_for_get_extent = 0;
2913 u64 disko = 0; 3007 u64 disko = 0;
3008 u64 isize = i_size_read(inode);
2914 struct btrfs_key found_key; 3009 struct btrfs_key found_key;
2915 struct extent_map *em = NULL; 3010 struct extent_map *em = NULL;
2916 struct extent_state *cached_state = NULL; 3011 struct extent_state *cached_state = NULL;
2917 struct btrfs_path *path; 3012 struct btrfs_path *path;
2918 struct btrfs_file_extent_item *item; 3013 struct btrfs_file_extent_item *item;
2919 int end = 0; 3014 int end = 0;
2920 u64 em_start = 0, em_len = 0; 3015 u64 em_start = 0;
3016 u64 em_len = 0;
3017 u64 em_end = 0;
2921 unsigned long emflags; 3018 unsigned long emflags;
2922 int hole = 0;
2923 3019
2924 if (len == 0) 3020 if (len == 0)
2925 return -EINVAL; 3021 return -EINVAL;
@@ -2929,6 +3025,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2929 return -ENOMEM; 3025 return -ENOMEM;
2930 path->leave_spinning = 1; 3026 path->leave_spinning = 1;
2931 3027
3028 /*
3029 * lookup the last file extent. We're not using i_size here
3030 * because there might be preallocation past i_size
3031 */
2932 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 3032 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2933 path, inode->i_ino, -1, 0); 3033 path, inode->i_ino, -1, 0);
2934 if (ret < 0) { 3034 if (ret < 0) {
@@ -2942,18 +3042,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2942 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 3042 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2943 found_type = btrfs_key_type(&found_key); 3043 found_type = btrfs_key_type(&found_key);
2944 3044
2945 /* No extents, just return */ 3045 /* No extents, but there might be delalloc bits */
2946 if (found_key.objectid != inode->i_ino || 3046 if (found_key.objectid != inode->i_ino ||
2947 found_type != BTRFS_EXTENT_DATA_KEY) { 3047 found_type != BTRFS_EXTENT_DATA_KEY) {
2948 btrfs_free_path(path); 3048 /* have to trust i_size as the end */
2949 return 0; 3049 last = (u64)-1;
3050 last_for_get_extent = isize;
3051 } else {
3052 /*
3053 * remember the start of the last extent. There are a
3054 * bunch of different factors that go into the length of the
3055 * extent, so its much less complex to remember where it started
3056 */
3057 last = found_key.offset;
3058 last_for_get_extent = last + 1;
2950 } 3059 }
2951 last = found_key.offset;
2952 btrfs_free_path(path); 3060 btrfs_free_path(path);
2953 3061
3062 /*
3063 * we might have some extents allocated but more delalloc past those
3064 * extents. so, we trust isize unless the start of the last extent is
3065 * beyond isize
3066 */
3067 if (last < isize) {
3068 last = (u64)-1;
3069 last_for_get_extent = isize;
3070 }
3071
2954 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3072 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2955 &cached_state, GFP_NOFS); 3073 &cached_state, GFP_NOFS);
2956 em = get_extent(inode, NULL, 0, off, max - off, 0); 3074
3075 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3076 get_extent);
2957 if (!em) 3077 if (!em)
2958 goto out; 3078 goto out;
2959 if (IS_ERR(em)) { 3079 if (IS_ERR(em)) {
@@ -2962,22 +3082,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2962 } 3082 }
2963 3083
2964 while (!end) { 3084 while (!end) {
2965 hole = 0; 3085 u64 offset_in_extent;
2966 off = em->start + em->len;
2967 if (off >= max)
2968 end = 1;
2969 3086
2970 if (em->block_start == EXTENT_MAP_HOLE) { 3087 /* break if the extent we found is outside the range */
2971 hole = 1; 3088 if (em->start >= max || extent_map_end(em) < off)
2972 goto next; 3089 break;
2973 }
2974 3090
2975 em_start = em->start; 3091 /*
2976 em_len = em->len; 3092 * get_extent may return an extent that starts before our
3093 * requested range. We have to make sure the ranges
3094 * we return to fiemap always move forward and don't
3095 * overlap, so adjust the offsets here
3096 */
3097 em_start = max(em->start, off);
2977 3098
3099 /*
3100 * record the offset from the start of the extent
3101 * for adjusting the disk offset below
3102 */
3103 offset_in_extent = em_start - em->start;
3104 em_end = extent_map_end(em);
3105 em_len = em_end - em_start;
3106 emflags = em->flags;
2978 disko = 0; 3107 disko = 0;
2979 flags = 0; 3108 flags = 0;
2980 3109
3110 /*
3111 * bump off for our next call to get_extent
3112 */
3113 off = extent_map_end(em);
3114 if (off >= max)
3115 end = 1;
3116
2981 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 3117 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2982 end = 1; 3118 end = 1;
2983 flags |= FIEMAP_EXTENT_LAST; 3119 flags |= FIEMAP_EXTENT_LAST;
@@ -2988,42 +3124,34 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2988 flags |= (FIEMAP_EXTENT_DELALLOC | 3124 flags |= (FIEMAP_EXTENT_DELALLOC |
2989 FIEMAP_EXTENT_UNKNOWN); 3125 FIEMAP_EXTENT_UNKNOWN);
2990 } else { 3126 } else {
2991 disko = em->block_start; 3127 disko = em->block_start + offset_in_extent;
2992 } 3128 }
2993 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 3129 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2994 flags |= FIEMAP_EXTENT_ENCODED; 3130 flags |= FIEMAP_EXTENT_ENCODED;
2995 3131
2996next:
2997 emflags = em->flags;
2998 free_extent_map(em); 3132 free_extent_map(em);
2999 em = NULL; 3133 em = NULL;
3000 if (!end) { 3134 if ((em_start >= last) || em_len == (u64)-1 ||
3001 em = get_extent(inode, NULL, 0, off, max - off, 0); 3135 (last == (u64)-1 && isize <= em_end)) {
3002 if (!em)
3003 goto out;
3004 if (IS_ERR(em)) {
3005 ret = PTR_ERR(em);
3006 goto out;
3007 }
3008 emflags = em->flags;
3009 }
3010
3011 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
3012 flags |= FIEMAP_EXTENT_LAST; 3136 flags |= FIEMAP_EXTENT_LAST;
3013 end = 1; 3137 end = 1;
3014 } 3138 }
3015 3139
3016 if (em_start == last) { 3140 /* now scan forward to see if this is really the last extent. */
3141 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3142 get_extent);
3143 if (IS_ERR(em)) {
3144 ret = PTR_ERR(em);
3145 goto out;
3146 }
3147 if (!em) {
3017 flags |= FIEMAP_EXTENT_LAST; 3148 flags |= FIEMAP_EXTENT_LAST;
3018 end = 1; 3149 end = 1;
3019 } 3150 }
3020 3151 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3021 if (!hole) { 3152 em_len, flags);
3022 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3153 if (ret)
3023 em_len, flags); 3154 goto out_free;
3024 if (ret)
3025 goto out_free;
3026 }
3027 } 3155 }
3028out_free: 3156out_free:
3029 free_extent_map(em); 3157 free_extent_map(em);
@@ -3192,7 +3320,13 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3192 } 3320 }
3193 if (!PageUptodate(p)) 3321 if (!PageUptodate(p))
3194 uptodate = 0; 3322 uptodate = 0;
3195 unlock_page(p); 3323
3324 /*
3325 * see below about how we avoid a nasty race with release page
3326 * and why we unlock later
3327 */
3328 if (i != 0)
3329 unlock_page(p);
3196 } 3330 }
3197 if (uptodate) 3331 if (uptodate)
3198 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3332 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -3216,9 +3350,26 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3216 atomic_inc(&eb->refs); 3350 atomic_inc(&eb->refs);
3217 spin_unlock(&tree->buffer_lock); 3351 spin_unlock(&tree->buffer_lock);
3218 radix_tree_preload_end(); 3352 radix_tree_preload_end();
3353
3354 /*
3355 * there is a race where release page may have
3356 * tried to find this extent buffer in the radix
3357 * but failed. It will tell the VM it is safe to
3358 * reclaim the, and it will clear the page private bit.
3359 * We must make sure to set the page private bit properly
3360 * after the extent buffer is in the radix tree so
3361 * it doesn't get lost
3362 */
3363 set_page_extent_mapped(eb->first_page);
3364 set_page_extent_head(eb->first_page, eb->len);
3365 if (!page0)
3366 unlock_page(eb->first_page);
3219 return eb; 3367 return eb;
3220 3368
3221free_eb: 3369free_eb:
3370 if (eb->first_page && !page0)
3371 unlock_page(eb->first_page);
3372
3222 if (!atomic_dec_and_test(&eb->refs)) 3373 if (!atomic_dec_and_test(&eb->refs))
3223 return exists; 3374 return exists;
3224 btrfs_release_extent_buffer(eb); 3375 btrfs_release_extent_buffer(eb);
@@ -3269,10 +3420,11 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3269 continue; 3420 continue;
3270 3421
3271 lock_page(page); 3422 lock_page(page);
3423 WARN_ON(!PagePrivate(page));
3424
3425 set_page_extent_mapped(page);
3272 if (i == 0) 3426 if (i == 0)
3273 set_page_extent_head(page, eb->len); 3427 set_page_extent_head(page, eb->len);
3274 else
3275 set_page_private(page, EXTENT_PAGE_PRIVATE);
3276 3428
3277 clear_page_dirty_for_io(page); 3429 clear_page_dirty_for_io(page);
3278 spin_lock_irq(&page->mapping->tree_lock); 3430 spin_lock_irq(&page->mapping->tree_lock);
@@ -3339,7 +3491,7 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3339 num_pages = num_extent_pages(eb->start, eb->len); 3491 num_pages = num_extent_pages(eb->start, eb->len);
3340 3492
3341 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3493 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3342 GFP_NOFS); 3494 NULL, GFP_NOFS);
3343 for (i = 0; i < num_pages; i++) { 3495 for (i = 0; i < num_pages; i++) {
3344 page = extent_buffer_page(eb, i); 3496 page = extent_buffer_page(eb, i);
3345 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3497 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
@@ -3462,6 +3614,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3462 3614
3463 for (i = start_i; i < num_pages; i++) { 3615 for (i = start_i; i < num_pages; i++) {
3464 page = extent_buffer_page(eb, i); 3616 page = extent_buffer_page(eb, i);
3617
3618 WARN_ON(!PagePrivate(page));
3619
3620 set_page_extent_mapped(page);
3621 if (i == 0)
3622 set_page_extent_head(page, eb->len);
3623
3465 if (inc_all_pages) 3624 if (inc_all_pages)
3466 page_cache_get(page); 3625 page_cache_get(page);
3467 if (!PageUptodate(page)) { 3626 if (!PageUptodate(page)) {
@@ -3567,6 +3726,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3567 "wanted %lu %lu\n", (unsigned long long)eb->start, 3726 "wanted %lu %lu\n", (unsigned long long)eb->start,
3568 eb->len, start, min_len); 3727 eb->len, start, min_len);
3569 WARN_ON(1); 3728 WARN_ON(1);
3729 return -EINVAL;
3570 } 3730 }
3571 3731
3572 p = extent_buffer_page(eb, i); 3732 p = extent_buffer_page(eb, i);
@@ -3759,6 +3919,12 @@ static void move_pages(struct page *dst_page, struct page *src_page,
3759 kunmap_atomic(dst_kaddr, KM_USER0); 3919 kunmap_atomic(dst_kaddr, KM_USER0);
3760} 3920}
3761 3921
3922static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
3923{
3924 unsigned long distance = (src > dst) ? src - dst : dst - src;
3925 return distance < len;
3926}
3927
3762static void copy_pages(struct page *dst_page, struct page *src_page, 3928static void copy_pages(struct page *dst_page, struct page *src_page,
3763 unsigned long dst_off, unsigned long src_off, 3929 unsigned long dst_off, unsigned long src_off,
3764 unsigned long len) 3930 unsigned long len)
@@ -3766,10 +3932,12 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
3766 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3932 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3767 char *src_kaddr; 3933 char *src_kaddr;
3768 3934
3769 if (dst_page != src_page) 3935 if (dst_page != src_page) {
3770 src_kaddr = kmap_atomic(src_page, KM_USER1); 3936 src_kaddr = kmap_atomic(src_page, KM_USER1);
3771 else 3937 } else {
3772 src_kaddr = dst_kaddr; 3938 src_kaddr = dst_kaddr;
3939 BUG_ON(areas_overlap(src_off, dst_off, len));
3940 }
3773 3941
3774 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 3942 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3775 kunmap_atomic(dst_kaddr, KM_USER0); 3943 kunmap_atomic(dst_kaddr, KM_USER0);
@@ -3844,7 +4012,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3844 "len %lu len %lu\n", dst_offset, len, dst->len); 4012 "len %lu len %lu\n", dst_offset, len, dst->len);
3845 BUG_ON(1); 4013 BUG_ON(1);
3846 } 4014 }
3847 if (dst_offset < src_offset) { 4015 if (!areas_overlap(src_offset, dst_offset, len)) {
3848 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 4016 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3849 return; 4017 return;
3850 } 4018 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7083cfafd061..af2d7179c372 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -31,6 +31,7 @@
31#define EXTENT_BUFFER_UPTODATE 0 31#define EXTENT_BUFFER_UPTODATE 0
32#define EXTENT_BUFFER_BLOCKING 1 32#define EXTENT_BUFFER_BLOCKING 1
33#define EXTENT_BUFFER_DIRTY 2 33#define EXTENT_BUFFER_DIRTY 2
34#define EXTENT_BUFFER_CORRUPT 3
34 35
35/* these are flags for extent_clear_unlock_delalloc */ 36/* these are flags for extent_clear_unlock_delalloc */
36#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 37#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -191,7 +192,7 @@ void extent_io_exit(void);
191 192
192u64 count_range_bits(struct extent_io_tree *tree, 193u64 count_range_bits(struct extent_io_tree *tree,
193 u64 *start, u64 search_end, 194 u64 *start, u64 search_end,
194 u64 max_bytes, unsigned long bits); 195 u64 max_bytes, unsigned long bits, int contig);
195 196
196void free_extent_state(struct extent_state *state); 197void free_extent_state(struct extent_state *state);
197int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 198int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -207,7 +208,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
207 int bits, int exclusive_bits, u64 *failed_start, 208 int bits, int exclusive_bits, u64 *failed_start,
208 struct extent_state **cached_state, gfp_t mask); 209 struct extent_state **cached_state, gfp_t mask);
209int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 210int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
210 gfp_t mask); 211 struct extent_state **cached_state, gfp_t mask);
211int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 212int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
212 gfp_t mask); 213 gfp_t mask);
213int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 214int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b0e1fce12530..a24a3f2fa13e 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -51,8 +51,8 @@ struct extent_map *alloc_extent_map(gfp_t mask)
51{ 51{
52 struct extent_map *em; 52 struct extent_map *em;
53 em = kmem_cache_alloc(extent_map_cache, mask); 53 em = kmem_cache_alloc(extent_map_cache, mask);
54 if (!em || IS_ERR(em)) 54 if (!em)
55 return em; 55 return NULL;
56 em->in_tree = 0; 56 em->in_tree = 0;
57 em->flags = 0; 57 em->flags = 0;
58 em->compress_type = BTRFS_COMPRESS_NONE; 58 em->compress_type = BTRFS_COMPRESS_NONE;
@@ -243,7 +243,7 @@ out:
243 * Insert @em into @tree or perform a simple forward/backward merge with 243 * Insert @em into @tree or perform a simple forward/backward merge with
244 * existing mappings. The extent_map struct passed in will be inserted 244 * existing mappings. The extent_map struct passed in will be inserted
245 * into the tree directly, with an additional reference taken, or a 245 * into the tree directly, with an additional reference taken, or a
246 * reference dropped if the merge attempt was successfull. 246 * reference dropped if the merge attempt was successful.
247 */ 247 */
248int add_extent_mapping(struct extent_map_tree *tree, 248int add_extent_mapping(struct extent_map_tree *tree,
249 struct extent_map *em) 249 struct extent_map *em)
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a562a250ae77..a6a9d4e8b491 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -48,7 +48,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
48 struct extent_buffer *leaf; 48 struct extent_buffer *leaf;
49 49
50 path = btrfs_alloc_path(); 50 path = btrfs_alloc_path();
51 BUG_ON(!path); 51 if (!path)
52 return -ENOMEM;
52 file_key.objectid = objectid; 53 file_key.objectid = objectid;
53 file_key.offset = pos; 54 file_key.offset = pos;
54 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 55 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
@@ -169,6 +170,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
169 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 170 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
170 171
171 path = btrfs_alloc_path(); 172 path = btrfs_alloc_path();
173 if (!path)
174 return -ENOMEM;
172 if (bio->bi_size > PAGE_CACHE_SIZE * 8) 175 if (bio->bi_size > PAGE_CACHE_SIZE * 8)
173 path->reada = 2; 176 path->reada = 2;
174 177
@@ -536,6 +539,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
536 root = root->fs_info->csum_root; 539 root = root->fs_info->csum_root;
537 540
538 path = btrfs_alloc_path(); 541 path = btrfs_alloc_path();
542 if (!path)
543 return -ENOMEM;
539 544
540 while (1) { 545 while (1) {
541 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 546 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -548,7 +553,10 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
548 if (path->slots[0] == 0) 553 if (path->slots[0] == 0)
549 goto out; 554 goto out;
550 path->slots[0]--; 555 path->slots[0]--;
556 } else if (ret < 0) {
557 goto out;
551 } 558 }
559
552 leaf = path->nodes[0]; 560 leaf = path->nodes[0];
553 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 561 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
554 562
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c800d58f3013..75899a01dded 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -45,14 +45,14 @@
45 * and be replaced with calls into generic code. 45 * and be replaced with calls into generic code.
46 */ 46 */
47static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 47static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
48 int write_bytes, 48 size_t write_bytes,
49 struct page **prepared_pages, 49 struct page **prepared_pages,
50 struct iov_iter *i) 50 struct iov_iter *i)
51{ 51{
52 size_t copied = 0; 52 size_t copied = 0;
53 size_t total_copied = 0;
53 int pg = 0; 54 int pg = 0;
54 int offset = pos & (PAGE_CACHE_SIZE - 1); 55 int offset = pos & (PAGE_CACHE_SIZE - 1);
55 int total_copied = 0;
56 56
57 while (write_bytes > 0) { 57 while (write_bytes > 0) {
58 size_t count = min_t(size_t, 58 size_t count = min_t(size_t,
@@ -70,14 +70,26 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
70 70
71 /* Flush processor's dcache for this page */ 71 /* Flush processor's dcache for this page */
72 flush_dcache_page(page); 72 flush_dcache_page(page);
73
74 /*
75 * if we get a partial write, we can end up with
76 * partially up to date pages. These add
77 * a lot of complexity, so make sure they don't
78 * happen by forcing this copy to be retried.
79 *
80 * The rest of the btrfs_file_write code will fall
81 * back to page at a time copies after we return 0.
82 */
83 if (!PageUptodate(page) && copied < count)
84 copied = 0;
85
73 iov_iter_advance(i, copied); 86 iov_iter_advance(i, copied);
74 write_bytes -= copied; 87 write_bytes -= copied;
75 total_copied += copied; 88 total_copied += copied;
76 89
77 /* Return to btrfs_file_aio_write to fault page */ 90 /* Return to btrfs_file_aio_write to fault page */
78 if (unlikely(copied == 0)) { 91 if (unlikely(copied == 0))
79 break; 92 break;
80 }
81 93
82 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 94 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
83 offset += copied; 95 offset += copied;
@@ -92,12 +104,10 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
92/* 104/*
93 * unlocks pages after btrfs_file_write is done with them 105 * unlocks pages after btrfs_file_write is done with them
94 */ 106 */
95static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) 107void btrfs_drop_pages(struct page **pages, size_t num_pages)
96{ 108{
97 size_t i; 109 size_t i;
98 for (i = 0; i < num_pages; i++) { 110 for (i = 0; i < num_pages; i++) {
99 if (!pages[i])
100 break;
101 /* page checked is some magic around finding pages that 111 /* page checked is some magic around finding pages that
102 * have been modified without going through btrfs_set_page_dirty 112 * have been modified without going through btrfs_set_page_dirty
103 * clear it here 113 * clear it here
@@ -117,17 +127,13 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
117 * this also makes the decision about creating an inline extent vs 127 * this also makes the decision about creating an inline extent vs
118 * doing real data extents, marking pages dirty and delalloc as required. 128 * doing real data extents, marking pages dirty and delalloc as required.
119 */ 129 */
120static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, 130int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
121 struct btrfs_root *root, 131 struct page **pages, size_t num_pages,
122 struct file *file, 132 loff_t pos, size_t write_bytes,
123 struct page **pages, 133 struct extent_state **cached)
124 size_t num_pages,
125 loff_t pos,
126 size_t write_bytes)
127{ 134{
128 int err = 0; 135 int err = 0;
129 int i; 136 int i;
130 struct inode *inode = fdentry(file)->d_inode;
131 u64 num_bytes; 137 u64 num_bytes;
132 u64 start_pos; 138 u64 start_pos;
133 u64 end_of_last_block; 139 u64 end_of_last_block;
@@ -140,8 +146,9 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
140 146
141 end_of_last_block = start_pos + num_bytes - 1; 147 end_of_last_block = start_pos + num_bytes - 1;
142 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 148 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
143 NULL); 149 cached);
144 BUG_ON(err); 150 if (err)
151 return err;
145 152
146 for (i = 0; i < num_pages; i++) { 153 for (i = 0; i < num_pages; i++) {
147 struct page *p = pages[i]; 154 struct page *p = pages[i];
@@ -149,13 +156,14 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
149 ClearPageChecked(p); 156 ClearPageChecked(p);
150 set_page_dirty(p); 157 set_page_dirty(p);
151 } 158 }
152 if (end_pos > isize) { 159
160 /*
161 * we've only changed i_size in ram, and we haven't updated
162 * the disk i_size. There is no need to log the inode
163 * at this time.
164 */
165 if (end_pos > isize)
153 i_size_write(inode, end_pos); 166 i_size_write(inode, end_pos);
154 /* we've only changed i_size in ram, and we haven't updated
155 * the disk i_size. There is no need to log the inode
156 * at this time.
157 */
158 }
159 return 0; 167 return 0;
160} 168}
161 169
@@ -186,6 +194,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
186 split = alloc_extent_map(GFP_NOFS); 194 split = alloc_extent_map(GFP_NOFS);
187 if (!split2) 195 if (!split2)
188 split2 = alloc_extent_map(GFP_NOFS); 196 split2 = alloc_extent_map(GFP_NOFS);
197 BUG_ON(!split || !split2);
189 198
190 write_lock(&em_tree->lock); 199 write_lock(&em_tree->lock);
191 em = lookup_extent_mapping(em_tree, start, len); 200 em = lookup_extent_mapping(em_tree, start, len);
@@ -596,6 +605,8 @@ again:
596 key.offset = split; 605 key.offset = split;
597 606
598 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 607 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
608 if (ret < 0)
609 goto out;
599 if (ret > 0 && path->slots[0] > 0) 610 if (ret > 0 && path->slots[0] > 0)
600 path->slots[0]--; 611 path->slots[0]--;
601 612
@@ -762,6 +773,27 @@ out:
762} 773}
763 774
764/* 775/*
776 * on error we return an unlocked page and the error value
777 * on success we return a locked page and 0
778 */
779static int prepare_uptodate_page(struct page *page, u64 pos)
780{
781 int ret = 0;
782
783 if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
784 ret = btrfs_readpage(NULL, page);
785 if (ret)
786 return ret;
787 lock_page(page);
788 if (!PageUptodate(page)) {
789 unlock_page(page);
790 return -EIO;
791 }
792 }
793 return 0;
794}
795
796/*
765 * this gets pages into the page cache and locks them down, it also properly 797 * this gets pages into the page cache and locks them down, it also properly
766 * waits for data=ordered extents to finish before allowing the pages to be 798 * waits for data=ordered extents to finish before allowing the pages to be
767 * modified. 799 * modified.
@@ -776,6 +808,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
776 unsigned long index = pos >> PAGE_CACHE_SHIFT; 808 unsigned long index = pos >> PAGE_CACHE_SHIFT;
777 struct inode *inode = fdentry(file)->d_inode; 809 struct inode *inode = fdentry(file)->d_inode;
778 int err = 0; 810 int err = 0;
811 int faili = 0;
779 u64 start_pos; 812 u64 start_pos;
780 u64 last_pos; 813 u64 last_pos;
781 814
@@ -783,21 +816,33 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
783 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; 816 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
784 817
785 if (start_pos > inode->i_size) { 818 if (start_pos > inode->i_size) {
786 err = btrfs_cont_expand(inode, start_pos); 819 err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
787 if (err) 820 if (err)
788 return err; 821 return err;
789 } 822 }
790 823
791 memset(pages, 0, num_pages * sizeof(struct page *));
792again: 824again:
793 for (i = 0; i < num_pages; i++) { 825 for (i = 0; i < num_pages; i++) {
794 pages[i] = grab_cache_page(inode->i_mapping, index + i); 826 pages[i] = grab_cache_page(inode->i_mapping, index + i);
795 if (!pages[i]) { 827 if (!pages[i]) {
828 faili = i - 1;
796 err = -ENOMEM; 829 err = -ENOMEM;
797 BUG_ON(1); 830 goto fail;
831 }
832
833 if (i == 0)
834 err = prepare_uptodate_page(pages[i], pos);
835 if (i == num_pages - 1)
836 err = prepare_uptodate_page(pages[i],
837 pos + write_bytes);
838 if (err) {
839 page_cache_release(pages[i]);
840 faili = i - 1;
841 goto fail;
798 } 842 }
799 wait_on_page_writeback(pages[i]); 843 wait_on_page_writeback(pages[i]);
800 } 844 }
845 err = 0;
801 if (start_pos < inode->i_size) { 846 if (start_pos < inode->i_size) {
802 struct btrfs_ordered_extent *ordered; 847 struct btrfs_ordered_extent *ordered;
803 lock_extent_bits(&BTRFS_I(inode)->io_tree, 848 lock_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -837,187 +882,103 @@ again:
837 WARN_ON(!PageLocked(pages[i])); 882 WARN_ON(!PageLocked(pages[i]));
838 } 883 }
839 return 0; 884 return 0;
885fail:
886 while (faili >= 0) {
887 unlock_page(pages[faili]);
888 page_cache_release(pages[faili]);
889 faili--;
890 }
891 return err;
892
840} 893}
841 894
842static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 895static noinline ssize_t __btrfs_buffered_write(struct file *file,
843 const struct iovec *iov, 896 struct iov_iter *i,
844 unsigned long nr_segs, loff_t pos) 897 loff_t pos)
845{ 898{
846 struct file *file = iocb->ki_filp;
847 struct inode *inode = fdentry(file)->d_inode; 899 struct inode *inode = fdentry(file)->d_inode;
848 struct btrfs_root *root = BTRFS_I(inode)->root; 900 struct btrfs_root *root = BTRFS_I(inode)->root;
849 struct page *pinned[2];
850 struct page **pages = NULL; 901 struct page **pages = NULL;
851 struct iov_iter i;
852 loff_t *ppos = &iocb->ki_pos;
853 loff_t start_pos;
854 ssize_t num_written = 0;
855 ssize_t err = 0;
856 size_t count;
857 size_t ocount;
858 int ret = 0;
859 int nrptrs;
860 unsigned long first_index; 902 unsigned long first_index;
861 unsigned long last_index; 903 unsigned long last_index;
862 int will_write; 904 size_t num_written = 0;
863 int buffered = 0; 905 int nrptrs;
864 int copied = 0; 906 int ret = 0;
865 int dirty_pages = 0;
866
867 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
868 (file->f_flags & O_DIRECT));
869
870 pinned[0] = NULL;
871 pinned[1] = NULL;
872
873 start_pos = pos;
874
875 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
876
877 mutex_lock(&inode->i_mutex);
878
879 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
880 if (err)
881 goto out;
882 count = ocount;
883
884 current->backing_dev_info = inode->i_mapping->backing_dev_info;
885 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
886 if (err)
887 goto out;
888
889 if (count == 0)
890 goto out;
891
892 err = file_remove_suid(file);
893 if (err)
894 goto out;
895
896 /*
897 * If BTRFS flips readonly due to some impossible error
898 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
899 * although we have opened a file as writable, we have
900 * to stop this write operation to ensure FS consistency.
901 */
902 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
903 err = -EROFS;
904 goto out;
905 }
906
907 file_update_time(file);
908 BTRFS_I(inode)->sequence++;
909
910 if (unlikely(file->f_flags & O_DIRECT)) {
911 num_written = generic_file_direct_write(iocb, iov, &nr_segs,
912 pos, ppos, count,
913 ocount);
914 /*
915 * the generic O_DIRECT will update in-memory i_size after the
916 * DIOs are done. But our endio handlers that update the on
917 * disk i_size never update past the in memory i_size. So we
918 * need one more update here to catch any additions to the
919 * file
920 */
921 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
922 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
923 mark_inode_dirty(inode);
924 }
925
926 if (num_written < 0) {
927 ret = num_written;
928 num_written = 0;
929 goto out;
930 } else if (num_written == count) {
931 /* pick up pos changes done by the generic code */
932 pos = *ppos;
933 goto out;
934 }
935 /*
936 * We are going to do buffered for the rest of the range, so we
937 * need to make sure to invalidate the buffered pages when we're
938 * done.
939 */
940 buffered = 1;
941 pos += num_written;
942 }
943 907
944 iov_iter_init(&i, iov, nr_segs, count, num_written); 908 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
945 nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
946 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 909 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
947 (sizeof(struct page *))); 910 (sizeof(struct page *)));
948 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 911 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
949 912 if (!pages)
950 /* generic_write_checks can change our pos */ 913 return -ENOMEM;
951 start_pos = pos;
952 914
953 first_index = pos >> PAGE_CACHE_SHIFT; 915 first_index = pos >> PAGE_CACHE_SHIFT;
954 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; 916 last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;
955 917
956 /* 918 while (iov_iter_count(i) > 0) {
957 * there are lots of better ways to do this, but this code
958 * makes sure the first and last page in the file range are
959 * up to date and ready for cow
960 */
961 if ((pos & (PAGE_CACHE_SIZE - 1))) {
962 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
963 if (!PageUptodate(pinned[0])) {
964 ret = btrfs_readpage(NULL, pinned[0]);
965 BUG_ON(ret);
966 wait_on_page_locked(pinned[0]);
967 } else {
968 unlock_page(pinned[0]);
969 }
970 }
971 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
972 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
973 if (!PageUptodate(pinned[1])) {
974 ret = btrfs_readpage(NULL, pinned[1]);
975 BUG_ON(ret);
976 wait_on_page_locked(pinned[1]);
977 } else {
978 unlock_page(pinned[1]);
979 }
980 }
981
982 while (iov_iter_count(&i) > 0) {
983 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 919 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
984 size_t write_bytes = min(iov_iter_count(&i), 920 size_t write_bytes = min(iov_iter_count(i),
985 nrptrs * (size_t)PAGE_CACHE_SIZE - 921 nrptrs * (size_t)PAGE_CACHE_SIZE -
986 offset); 922 offset);
987 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> 923 size_t num_pages = (write_bytes + offset +
988 PAGE_CACHE_SHIFT; 924 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
925 size_t dirty_pages;
926 size_t copied;
989 927
990 WARN_ON(num_pages > nrptrs); 928 WARN_ON(num_pages > nrptrs);
991 memset(pages, 0, sizeof(struct page *) * nrptrs);
992 929
993 /* 930 /*
994 * Fault pages before locking them in prepare_pages 931 * Fault pages before locking them in prepare_pages
995 * to avoid recursive lock 932 * to avoid recursive lock
996 */ 933 */
997 if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) { 934 if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
998 ret = -EFAULT; 935 ret = -EFAULT;
999 goto out; 936 break;
1000 } 937 }
1001 938
1002 ret = btrfs_delalloc_reserve_space(inode, 939 ret = btrfs_delalloc_reserve_space(inode,
1003 num_pages << PAGE_CACHE_SHIFT); 940 num_pages << PAGE_CACHE_SHIFT);
1004 if (ret) 941 if (ret)
1005 goto out; 942 break;
1006 943
944 /*
945 * This is going to setup the pages array with the number of
946 * pages we want, so we don't really need to worry about the
947 * contents of pages from loop to loop
948 */
1007 ret = prepare_pages(root, file, pages, num_pages, 949 ret = prepare_pages(root, file, pages, num_pages,
1008 pos, first_index, last_index, 950 pos, first_index, last_index,
1009 write_bytes); 951 write_bytes);
1010 if (ret) { 952 if (ret) {
1011 btrfs_delalloc_release_space(inode, 953 btrfs_delalloc_release_space(inode,
1012 num_pages << PAGE_CACHE_SHIFT); 954 num_pages << PAGE_CACHE_SHIFT);
1013 goto out; 955 break;
1014 } 956 }
1015 957
1016 copied = btrfs_copy_from_user(pos, num_pages, 958 copied = btrfs_copy_from_user(pos, num_pages,
1017 write_bytes, pages, &i); 959 write_bytes, pages, i);
1018 dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >> 960
1019 PAGE_CACHE_SHIFT; 961 /*
962 * if we have trouble faulting in the pages, fall
963 * back to one page at a time
964 */
965 if (copied < write_bytes)
966 nrptrs = 1;
967
968 if (copied == 0)
969 dirty_pages = 0;
970 else
971 dirty_pages = (copied + offset +
972 PAGE_CACHE_SIZE - 1) >>
973 PAGE_CACHE_SHIFT;
1020 974
975 /*
976 * If we had a short copy we need to release the excess delaloc
977 * bytes we reserved. We need to increment outstanding_extents
978 * because btrfs_delalloc_release_space will decrement it, but
979 * we still have an outstanding extent for the chunk we actually
980 * managed to copy.
981 */
1021 if (num_pages > dirty_pages) { 982 if (num_pages > dirty_pages) {
1022 if (copied > 0) 983 if (copied > 0)
1023 atomic_inc( 984 atomic_inc(
@@ -1028,43 +989,157 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1028 } 989 }
1029 990
1030 if (copied > 0) { 991 if (copied > 0) {
1031 dirty_and_release_pages(NULL, root, file, pages, 992 ret = btrfs_dirty_pages(root, inode, pages,
1032 dirty_pages, pos, copied); 993 dirty_pages, pos, copied,
994 NULL);
995 if (ret) {
996 btrfs_delalloc_release_space(inode,
997 dirty_pages << PAGE_CACHE_SHIFT);
998 btrfs_drop_pages(pages, num_pages);
999 break;
1000 }
1033 } 1001 }
1034 1002
1035 btrfs_drop_pages(pages, num_pages); 1003 btrfs_drop_pages(pages, num_pages);
1036 1004
1037 if (copied > 0) { 1005 cond_resched();
1038 if (will_write) { 1006
1039 filemap_fdatawrite_range(inode->i_mapping, pos, 1007 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1040 pos + copied - 1); 1008 dirty_pages);
1041 } else { 1009 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1042 balance_dirty_pages_ratelimited_nr( 1010 btrfs_btree_balance_dirty(root, 1);
1043 inode->i_mapping, 1011 btrfs_throttle(root);
1044 dirty_pages);
1045 if (dirty_pages <
1046 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1047 btrfs_btree_balance_dirty(root, 1);
1048 btrfs_throttle(root);
1049 }
1050 }
1051 1012
1052 pos += copied; 1013 pos += copied;
1053 num_written += copied; 1014 num_written += copied;
1015 }
1054 1016
1055 cond_resched(); 1017 kfree(pages);
1018
1019 return num_written ? num_written : ret;
1020}
1021
1022static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1023 const struct iovec *iov,
1024 unsigned long nr_segs, loff_t pos,
1025 loff_t *ppos, size_t count, size_t ocount)
1026{
1027 struct file *file = iocb->ki_filp;
1028 struct inode *inode = fdentry(file)->d_inode;
1029 struct iov_iter i;
1030 ssize_t written;
1031 ssize_t written_buffered;
1032 loff_t endbyte;
1033 int err;
1034
1035 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
1036 count, ocount);
1037
1038 /*
1039 * the generic O_DIRECT will update in-memory i_size after the
1040 * DIOs are done. But our endio handlers that update the on
1041 * disk i_size never update past the in memory i_size. So we
1042 * need one more update here to catch any additions to the
1043 * file
1044 */
1045 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
1046 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
1047 mark_inode_dirty(inode);
1056 } 1048 }
1049
1050 if (written < 0 || written == count)
1051 return written;
1052
1053 pos += written;
1054 count -= written;
1055 iov_iter_init(&i, iov, nr_segs, count, written);
1056 written_buffered = __btrfs_buffered_write(file, &i, pos);
1057 if (written_buffered < 0) {
1058 err = written_buffered;
1059 goto out;
1060 }
1061 endbyte = pos + written_buffered - 1;
1062 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
1063 if (err)
1064 goto out;
1065 written += written_buffered;
1066 *ppos = pos + written_buffered;
1067 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
1068 endbyte >> PAGE_CACHE_SHIFT);
1057out: 1069out:
1058 mutex_unlock(&inode->i_mutex); 1070 return written ? written : err;
1059 if (ret) 1071}
1060 err = ret;
1061 1072
1062 kfree(pages); 1073static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1063 if (pinned[0]) 1074 const struct iovec *iov,
1064 page_cache_release(pinned[0]); 1075 unsigned long nr_segs, loff_t pos)
1065 if (pinned[1]) 1076{
1066 page_cache_release(pinned[1]); 1077 struct file *file = iocb->ki_filp;
1067 *ppos = pos; 1078 struct inode *inode = fdentry(file)->d_inode;
1079 struct btrfs_root *root = BTRFS_I(inode)->root;
1080 loff_t *ppos = &iocb->ki_pos;
1081 ssize_t num_written = 0;
1082 ssize_t err = 0;
1083 size_t count, ocount;
1084
1085 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1086
1087 mutex_lock(&inode->i_mutex);
1088
1089 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
1090 if (err) {
1091 mutex_unlock(&inode->i_mutex);
1092 goto out;
1093 }
1094 count = ocount;
1095
1096 current->backing_dev_info = inode->i_mapping->backing_dev_info;
1097 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1098 if (err) {
1099 mutex_unlock(&inode->i_mutex);
1100 goto out;
1101 }
1102
1103 if (count == 0) {
1104 mutex_unlock(&inode->i_mutex);
1105 goto out;
1106 }
1107
1108 err = file_remove_suid(file);
1109 if (err) {
1110 mutex_unlock(&inode->i_mutex);
1111 goto out;
1112 }
1113
1114 /*
1115 * If BTRFS flips readonly due to some impossible error
1116 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
1117 * although we have opened a file as writable, we have
1118 * to stop this write operation to ensure FS consistency.
1119 */
1120 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
1121 mutex_unlock(&inode->i_mutex);
1122 err = -EROFS;
1123 goto out;
1124 }
1125
1126 file_update_time(file);
1127 BTRFS_I(inode)->sequence++;
1128
1129 if (unlikely(file->f_flags & O_DIRECT)) {
1130 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1131 pos, ppos, count, ocount);
1132 } else {
1133 struct iov_iter i;
1134
1135 iov_iter_init(&i, iov, nr_segs, count, num_written);
1136
1137 num_written = __btrfs_buffered_write(file, &i, pos);
1138 if (num_written > 0)
1139 *ppos = pos + num_written;
1140 }
1141
1142 mutex_unlock(&inode->i_mutex);
1068 1143
1069 /* 1144 /*
1070 * we want to make sure fsync finds this change 1145 * we want to make sure fsync finds this change
@@ -1079,43 +1154,12 @@ out:
1079 * one running right now. 1154 * one running right now.
1080 */ 1155 */
1081 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1156 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1082 1157 if (num_written > 0 || num_written == -EIOCBQUEUED) {
1083 if (num_written > 0 && will_write) { 1158 err = generic_write_sync(file, pos, num_written);
1084 struct btrfs_trans_handle *trans; 1159 if (err < 0 && num_written > 0)
1085
1086 err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1087 if (err)
1088 num_written = err; 1160 num_written = err;
1089
1090 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1091 trans = btrfs_start_transaction(root, 0);
1092 if (IS_ERR(trans)) {
1093 num_written = PTR_ERR(trans);
1094 goto done;
1095 }
1096 mutex_lock(&inode->i_mutex);
1097 ret = btrfs_log_dentry_safe(trans, root,
1098 file->f_dentry);
1099 mutex_unlock(&inode->i_mutex);
1100 if (ret == 0) {
1101 ret = btrfs_sync_log(trans, root);
1102 if (ret == 0)
1103 btrfs_end_transaction(trans, root);
1104 else
1105 btrfs_commit_transaction(trans, root);
1106 } else if (ret != BTRFS_NO_LOG_SYNC) {
1107 btrfs_commit_transaction(trans, root);
1108 } else {
1109 btrfs_end_transaction(trans, root);
1110 }
1111 }
1112 if (file->f_flags & O_DIRECT && buffered) {
1113 invalidate_mapping_pages(inode->i_mapping,
1114 start_pos >> PAGE_CACHE_SHIFT,
1115 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1116 }
1117 } 1161 }
1118done: 1162out:
1119 current->backing_dev_info = NULL; 1163 current->backing_dev_info = NULL;
1120 return num_written ? num_written : err; 1164 return num_written ? num_written : err;
1121} 1165}
@@ -1158,6 +1202,7 @@ int btrfs_sync_file(struct file *file, int datasync)
1158 int ret = 0; 1202 int ret = 0;
1159 struct btrfs_trans_handle *trans; 1203 struct btrfs_trans_handle *trans;
1160 1204
1205 trace_btrfs_sync_file(file, datasync);
1161 1206
1162 /* we wait first, since the writeback may change the inode */ 1207 /* we wait first, since the writeback may change the inode */
1163 root->log_batch++; 1208 root->log_batch++;
@@ -1285,7 +1330,8 @@ static long btrfs_fallocate(struct file *file, int mode,
1285 goto out; 1330 goto out;
1286 1331
1287 if (alloc_start > inode->i_size) { 1332 if (alloc_start > inode->i_size) {
1288 ret = btrfs_cont_expand(inode, alloc_start); 1333 ret = btrfs_cont_expand(inode, i_size_read(inode),
1334 alloc_start);
1289 if (ret) 1335 if (ret)
1290 goto out; 1336 goto out;
1291 } 1337 }
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 60d684266959..11d2e9cea09e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -24,6 +24,7 @@
24#include "free-space-cache.h" 24#include "free-space-cache.h"
25#include "transaction.h" 25#include "transaction.h"
26#include "disk-io.h" 26#include "disk-io.h"
27#include "extent_io.h"
27 28
28#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 29#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
29#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 30#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
@@ -81,6 +82,8 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
81 return ERR_PTR(-ENOENT); 82 return ERR_PTR(-ENOENT);
82 } 83 }
83 84
85 inode->i_mapping->flags &= ~__GFP_FS;
86
84 spin_lock(&block_group->lock); 87 spin_lock(&block_group->lock);
85 if (!root->fs_info->closing) { 88 if (!root->fs_info->closing) {
86 block_group->inode = igrab(inode); 89 block_group->inode = igrab(inode);
@@ -222,6 +225,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
222 u64 num_entries; 225 u64 num_entries;
223 u64 num_bitmaps; 226 u64 num_bitmaps;
224 u64 generation; 227 u64 generation;
228 u64 used = btrfs_block_group_used(&block_group->item);
225 u32 cur_crc = ~(u32)0; 229 u32 cur_crc = ~(u32)0;
226 pgoff_t index = 0; 230 pgoff_t index = 0;
227 unsigned long first_page_offset; 231 unsigned long first_page_offset;
@@ -393,7 +397,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
393 break; 397 break;
394 398
395 need_loop = 1; 399 need_loop = 1;
396 e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 400 e = kmem_cache_zalloc(btrfs_free_space_cachep,
401 GFP_NOFS);
397 if (!e) { 402 if (!e) {
398 kunmap(page); 403 kunmap(page);
399 unlock_page(page); 404 unlock_page(page);
@@ -405,7 +410,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
405 e->bytes = le64_to_cpu(entry->bytes); 410 e->bytes = le64_to_cpu(entry->bytes);
406 if (!e->bytes) { 411 if (!e->bytes) {
407 kunmap(page); 412 kunmap(page);
408 kfree(e); 413 kmem_cache_free(btrfs_free_space_cachep, e);
409 unlock_page(page); 414 unlock_page(page);
410 page_cache_release(page); 415 page_cache_release(page);
411 goto free_cache; 416 goto free_cache;
@@ -420,7 +425,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
420 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); 425 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
421 if (!e->bitmap) { 426 if (!e->bitmap) {
422 kunmap(page); 427 kunmap(page);
423 kfree(e); 428 kmem_cache_free(
429 btrfs_free_space_cachep, e);
424 unlock_page(page); 430 unlock_page(page);
425 page_cache_release(page); 431 page_cache_release(page);
426 goto free_cache; 432 goto free_cache;
@@ -465,6 +471,17 @@ next:
465 index++; 471 index++;
466 } 472 }
467 473
474 spin_lock(&block_group->tree_lock);
475 if (block_group->free_space != (block_group->key.offset - used -
476 block_group->bytes_super)) {
477 spin_unlock(&block_group->tree_lock);
478 printk(KERN_ERR "block group %llu has an wrong amount of free "
479 "space\n", block_group->key.objectid);
480 ret = 0;
481 goto free_cache;
482 }
483 spin_unlock(&block_group->tree_lock);
484
468 ret = 1; 485 ret = 1;
469out: 486out:
470 kfree(checksums); 487 kfree(checksums);
@@ -491,18 +508,23 @@ int btrfs_write_out_cache(struct btrfs_root *root,
491 struct inode *inode; 508 struct inode *inode;
492 struct rb_node *node; 509 struct rb_node *node;
493 struct list_head *pos, *n; 510 struct list_head *pos, *n;
511 struct page **pages;
494 struct page *page; 512 struct page *page;
495 struct extent_state *cached_state = NULL; 513 struct extent_state *cached_state = NULL;
514 struct btrfs_free_cluster *cluster = NULL;
515 struct extent_io_tree *unpin = NULL;
496 struct list_head bitmap_list; 516 struct list_head bitmap_list;
497 struct btrfs_key key; 517 struct btrfs_key key;
518 u64 start, end, len;
498 u64 bytes = 0; 519 u64 bytes = 0;
499 u32 *crc, *checksums; 520 u32 *crc, *checksums;
500 pgoff_t index = 0, last_index = 0;
501 unsigned long first_page_offset; 521 unsigned long first_page_offset;
502 int num_checksums; 522 int index = 0, num_pages = 0;
503 int entries = 0; 523 int entries = 0;
504 int bitmaps = 0; 524 int bitmaps = 0;
505 int ret = 0; 525 int ret = 0;
526 bool next_page = false;
527 bool out_of_space = false;
506 528
507 root = root->fs_info->tree_root; 529 root = root->fs_info->tree_root;
508 530
@@ -530,24 +552,43 @@ int btrfs_write_out_cache(struct btrfs_root *root,
530 return 0; 552 return 0;
531 } 553 }
532 554
533 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 555 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
556 PAGE_CACHE_SHIFT;
534 filemap_write_and_wait(inode->i_mapping); 557 filemap_write_and_wait(inode->i_mapping);
535 btrfs_wait_ordered_range(inode, inode->i_size & 558 btrfs_wait_ordered_range(inode, inode->i_size &
536 ~(root->sectorsize - 1), (u64)-1); 559 ~(root->sectorsize - 1), (u64)-1);
537 560
538 /* We need a checksum per page. */ 561 /* We need a checksum per page. */
539 num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; 562 crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
540 crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
541 if (!crc) { 563 if (!crc) {
542 iput(inode); 564 iput(inode);
543 return 0; 565 return 0;
544 } 566 }
545 567
568 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
569 if (!pages) {
570 kfree(crc);
571 iput(inode);
572 return 0;
573 }
574
546 /* Since the first page has all of our checksums and our generation we 575 /* Since the first page has all of our checksums and our generation we
547 * need to calculate the offset into the page that we can start writing 576 * need to calculate the offset into the page that we can start writing
548 * our entries. 577 * our entries.
549 */ 578 */
550 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); 579 first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
580
581 /* Get the cluster for this block_group if it exists */
582 if (!list_empty(&block_group->cluster_list))
583 cluster = list_entry(block_group->cluster_list.next,
584 struct btrfs_free_cluster,
585 block_group_list);
586
587 /*
588 * We shouldn't have switched the pinned extents yet so this is the
589 * right one
590 */
591 unpin = root->fs_info->pinned_extents;
551 592
552 /* 593 /*
553 * Lock all pages first so we can lock the extent safely. 594 * Lock all pages first so we can lock the extent safely.
@@ -557,20 +598,18 @@ int btrfs_write_out_cache(struct btrfs_root *root,
557 * after find_get_page at this point. Just putting this here so people 598 * after find_get_page at this point. Just putting this here so people
558 * know and don't freak out. 599 * know and don't freak out.
559 */ 600 */
560 while (index <= last_index) { 601 while (index < num_pages) {
561 page = grab_cache_page(inode->i_mapping, index); 602 page = grab_cache_page(inode->i_mapping, index);
562 if (!page) { 603 if (!page) {
563 pgoff_t i = 0; 604 int i;
564 605
565 while (i < index) { 606 for (i = 0; i < num_pages; i++) {
566 page = find_get_page(inode->i_mapping, i); 607 unlock_page(pages[i]);
567 unlock_page(page); 608 page_cache_release(pages[i]);
568 page_cache_release(page);
569 page_cache_release(page);
570 i++;
571 } 609 }
572 goto out_free; 610 goto out_free;
573 } 611 }
612 pages[index] = page;
574 index++; 613 index++;
575 } 614 }
576 615
@@ -578,6 +617,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
578 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 617 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
579 0, &cached_state, GFP_NOFS); 618 0, &cached_state, GFP_NOFS);
580 619
620 /*
621 * When searching for pinned extents, we need to start at our start
622 * offset.
623 */
624 start = block_group->key.objectid;
625
581 /* Write out the extent entries */ 626 /* Write out the extent entries */
582 do { 627 do {
583 struct btrfs_free_space_entry *entry; 628 struct btrfs_free_space_entry *entry;
@@ -585,18 +630,25 @@ int btrfs_write_out_cache(struct btrfs_root *root,
585 unsigned long offset = 0; 630 unsigned long offset = 0;
586 unsigned long start_offset = 0; 631 unsigned long start_offset = 0;
587 632
633 next_page = false;
634
588 if (index == 0) { 635 if (index == 0) {
589 start_offset = first_page_offset; 636 start_offset = first_page_offset;
590 offset = start_offset; 637 offset = start_offset;
591 } 638 }
592 639
593 page = find_get_page(inode->i_mapping, index); 640 if (index >= num_pages) {
641 out_of_space = true;
642 break;
643 }
644
645 page = pages[index];
594 646
595 addr = kmap(page); 647 addr = kmap(page);
596 entry = addr + start_offset; 648 entry = addr + start_offset;
597 649
598 memset(addr, 0, PAGE_CACHE_SIZE); 650 memset(addr, 0, PAGE_CACHE_SIZE);
599 while (1) { 651 while (node && !next_page) {
600 struct btrfs_free_space *e; 652 struct btrfs_free_space *e;
601 653
602 e = rb_entry(node, struct btrfs_free_space, offset_index); 654 e = rb_entry(node, struct btrfs_free_space, offset_index);
@@ -612,12 +664,49 @@ int btrfs_write_out_cache(struct btrfs_root *root,
612 entry->type = BTRFS_FREE_SPACE_EXTENT; 664 entry->type = BTRFS_FREE_SPACE_EXTENT;
613 } 665 }
614 node = rb_next(node); 666 node = rb_next(node);
615 if (!node) 667 if (!node && cluster) {
616 break; 668 node = rb_first(&cluster->root);
669 cluster = NULL;
670 }
617 offset += sizeof(struct btrfs_free_space_entry); 671 offset += sizeof(struct btrfs_free_space_entry);
618 if (offset + sizeof(struct btrfs_free_space_entry) >= 672 if (offset + sizeof(struct btrfs_free_space_entry) >=
619 PAGE_CACHE_SIZE) 673 PAGE_CACHE_SIZE)
674 next_page = true;
675 entry++;
676 }
677
678 /*
679 * We want to add any pinned extents to our free space cache
680 * so we don't leak the space
681 */
682 while (!next_page && (start < block_group->key.objectid +
683 block_group->key.offset)) {
684 ret = find_first_extent_bit(unpin, start, &start, &end,
685 EXTENT_DIRTY);
686 if (ret) {
687 ret = 0;
620 break; 688 break;
689 }
690
691 /* This pinned extent is out of our range */
692 if (start >= block_group->key.objectid +
693 block_group->key.offset)
694 break;
695
696 len = block_group->key.objectid +
697 block_group->key.offset - start;
698 len = min(len, end + 1 - start);
699
700 entries++;
701 entry->offset = cpu_to_le64(start);
702 entry->bytes = cpu_to_le64(len);
703 entry->type = BTRFS_FREE_SPACE_EXTENT;
704
705 start = end + 1;
706 offset += sizeof(struct btrfs_free_space_entry);
707 if (offset + sizeof(struct btrfs_free_space_entry) >=
708 PAGE_CACHE_SIZE)
709 next_page = true;
621 entry++; 710 entry++;
622 } 711 }
623 *crc = ~(u32)0; 712 *crc = ~(u32)0;
@@ -630,25 +719,8 @@ int btrfs_write_out_cache(struct btrfs_root *root,
630 719
631 bytes += PAGE_CACHE_SIZE; 720 bytes += PAGE_CACHE_SIZE;
632 721
633 ClearPageChecked(page);
634 set_page_extent_mapped(page);
635 SetPageUptodate(page);
636 set_page_dirty(page);
637
638 /*
639 * We need to release our reference we got for grab_cache_page,
640 * except for the first page which will hold our checksums, we
641 * do that below.
642 */
643 if (index != 0) {
644 unlock_page(page);
645 page_cache_release(page);
646 }
647
648 page_cache_release(page);
649
650 index++; 722 index++;
651 } while (node); 723 } while (node || next_page);
652 724
653 /* Write out the bitmaps */ 725 /* Write out the bitmaps */
654 list_for_each_safe(pos, n, &bitmap_list) { 726 list_for_each_safe(pos, n, &bitmap_list) {
@@ -656,7 +728,11 @@ int btrfs_write_out_cache(struct btrfs_root *root,
656 struct btrfs_free_space *entry = 728 struct btrfs_free_space *entry =
657 list_entry(pos, struct btrfs_free_space, list); 729 list_entry(pos, struct btrfs_free_space, list);
658 730
659 page = find_get_page(inode->i_mapping, index); 731 if (index >= num_pages) {
732 out_of_space = true;
733 break;
734 }
735 page = pages[index];
660 736
661 addr = kmap(page); 737 addr = kmap(page);
662 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); 738 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
@@ -667,64 +743,58 @@ int btrfs_write_out_cache(struct btrfs_root *root,
667 crc++; 743 crc++;
668 bytes += PAGE_CACHE_SIZE; 744 bytes += PAGE_CACHE_SIZE;
669 745
670 ClearPageChecked(page);
671 set_page_extent_mapped(page);
672 SetPageUptodate(page);
673 set_page_dirty(page);
674 unlock_page(page);
675 page_cache_release(page);
676 page_cache_release(page);
677 list_del_init(&entry->list); 746 list_del_init(&entry->list);
678 index++; 747 index++;
679 } 748 }
680 749
750 if (out_of_space) {
751 btrfs_drop_pages(pages, num_pages);
752 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
753 i_size_read(inode) - 1, &cached_state,
754 GFP_NOFS);
755 ret = 0;
756 goto out_free;
757 }
758
681 /* Zero out the rest of the pages just to make sure */ 759 /* Zero out the rest of the pages just to make sure */
682 while (index <= last_index) { 760 while (index < num_pages) {
683 void *addr; 761 void *addr;
684 762
685 page = find_get_page(inode->i_mapping, index); 763 page = pages[index];
686
687 addr = kmap(page); 764 addr = kmap(page);
688 memset(addr, 0, PAGE_CACHE_SIZE); 765 memset(addr, 0, PAGE_CACHE_SIZE);
689 kunmap(page); 766 kunmap(page);
690 ClearPageChecked(page);
691 set_page_extent_mapped(page);
692 SetPageUptodate(page);
693 set_page_dirty(page);
694 unlock_page(page);
695 page_cache_release(page);
696 page_cache_release(page);
697 bytes += PAGE_CACHE_SIZE; 767 bytes += PAGE_CACHE_SIZE;
698 index++; 768 index++;
699 } 769 }
700 770
701 btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state);
702
703 /* Write the checksums and trans id to the first page */ 771 /* Write the checksums and trans id to the first page */
704 { 772 {
705 void *addr; 773 void *addr;
706 u64 *gen; 774 u64 *gen;
707 775
708 page = find_get_page(inode->i_mapping, 0); 776 page = pages[0];
709 777
710 addr = kmap(page); 778 addr = kmap(page);
711 memcpy(addr, checksums, sizeof(u32) * num_checksums); 779 memcpy(addr, checksums, sizeof(u32) * num_pages);
712 gen = addr + (sizeof(u32) * num_checksums); 780 gen = addr + (sizeof(u32) * num_pages);
713 *gen = trans->transid; 781 *gen = trans->transid;
714 kunmap(page); 782 kunmap(page);
715 ClearPageChecked(page);
716 set_page_extent_mapped(page);
717 SetPageUptodate(page);
718 set_page_dirty(page);
719 unlock_page(page);
720 page_cache_release(page);
721 page_cache_release(page);
722 } 783 }
723 BTRFS_I(inode)->generation = trans->transid;
724 784
785 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
786 bytes, &cached_state);
787 btrfs_drop_pages(pages, num_pages);
725 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 788 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
726 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 789 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
727 790
791 if (ret) {
792 ret = 0;
793 goto out_free;
794 }
795
796 BTRFS_I(inode)->generation = trans->transid;
797
728 filemap_write_and_wait(inode->i_mapping); 798 filemap_write_and_wait(inode->i_mapping);
729 799
730 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 800 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -775,6 +845,7 @@ out_free:
775 BTRFS_I(inode)->generation = 0; 845 BTRFS_I(inode)->generation = 0;
776 } 846 }
777 kfree(checksums); 847 kfree(checksums);
848 kfree(pages);
778 btrfs_update_inode(trans, root, inode); 849 btrfs_update_inode(trans, root, inode);
779 iput(inode); 850 iput(inode);
780 return ret; 851 return ret;
@@ -987,11 +1058,18 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
987 return entry; 1058 return entry;
988} 1059}
989 1060
990static void unlink_free_space(struct btrfs_block_group_cache *block_group, 1061static inline void
991 struct btrfs_free_space *info) 1062__unlink_free_space(struct btrfs_block_group_cache *block_group,
1063 struct btrfs_free_space *info)
992{ 1064{
993 rb_erase(&info->offset_index, &block_group->free_space_offset); 1065 rb_erase(&info->offset_index, &block_group->free_space_offset);
994 block_group->free_extents--; 1066 block_group->free_extents--;
1067}
1068
1069static void unlink_free_space(struct btrfs_block_group_cache *block_group,
1070 struct btrfs_free_space *info)
1071{
1072 __unlink_free_space(block_group, info);
995 block_group->free_space -= info->bytes; 1073 block_group->free_space -= info->bytes;
996} 1074}
997 1075
@@ -1016,14 +1094,18 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
1016 u64 max_bytes; 1094 u64 max_bytes;
1017 u64 bitmap_bytes; 1095 u64 bitmap_bytes;
1018 u64 extent_bytes; 1096 u64 extent_bytes;
1097 u64 size = block_group->key.offset;
1019 1098
1020 /* 1099 /*
1021 * The goal is to keep the total amount of memory used per 1gb of space 1100 * The goal is to keep the total amount of memory used per 1gb of space
1022 * at or below 32k, so we need to adjust how much memory we allow to be 1101 * at or below 32k, so we need to adjust how much memory we allow to be
1023 * used by extent based free space tracking 1102 * used by extent based free space tracking
1024 */ 1103 */
1025 max_bytes = MAX_CACHE_BYTES_PER_GIG * 1104 if (size < 1024 * 1024 * 1024)
1026 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); 1105 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1106 else
1107 max_bytes = MAX_CACHE_BYTES_PER_GIG *
1108 div64_u64(size, 1024 * 1024 * 1024);
1027 1109
1028 /* 1110 /*
1029 * we want to account for 1 more bitmap than what we have so we can make 1111 * we want to account for 1 more bitmap than what we have so we can make
@@ -1171,6 +1253,16 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
1171 recalculate_thresholds(block_group); 1253 recalculate_thresholds(block_group);
1172} 1254}
1173 1255
1256static void free_bitmap(struct btrfs_block_group_cache *block_group,
1257 struct btrfs_free_space *bitmap_info)
1258{
1259 unlink_free_space(block_group, bitmap_info);
1260 kfree(bitmap_info->bitmap);
1261 kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1262 block_group->total_bitmaps--;
1263 recalculate_thresholds(block_group);
1264}
1265
1174static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, 1266static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
1175 struct btrfs_free_space *bitmap_info, 1267 struct btrfs_free_space *bitmap_info,
1176 u64 *offset, u64 *bytes) 1268 u64 *offset, u64 *bytes)
@@ -1195,6 +1287,7 @@ again:
1195 */ 1287 */
1196 search_start = *offset; 1288 search_start = *offset;
1197 search_bytes = *bytes; 1289 search_bytes = *bytes;
1290 search_bytes = min(search_bytes, end - search_start + 1);
1198 ret = search_bitmap(block_group, bitmap_info, &search_start, 1291 ret = search_bitmap(block_group, bitmap_info, &search_start,
1199 &search_bytes); 1292 &search_bytes);
1200 BUG_ON(ret < 0 || search_start != *offset); 1293 BUG_ON(ret < 0 || search_start != *offset);
@@ -1211,13 +1304,8 @@ again:
1211 1304
1212 if (*bytes) { 1305 if (*bytes) {
1213 struct rb_node *next = rb_next(&bitmap_info->offset_index); 1306 struct rb_node *next = rb_next(&bitmap_info->offset_index);
1214 if (!bitmap_info->bytes) { 1307 if (!bitmap_info->bytes)
1215 unlink_free_space(block_group, bitmap_info); 1308 free_bitmap(block_group, bitmap_info);
1216 kfree(bitmap_info->bitmap);
1217 kfree(bitmap_info);
1218 block_group->total_bitmaps--;
1219 recalculate_thresholds(block_group);
1220 }
1221 1309
1222 /* 1310 /*
1223 * no entry after this bitmap, but we still have bytes to 1311 * no entry after this bitmap, but we still have bytes to
@@ -1250,13 +1338,8 @@ again:
1250 return -EAGAIN; 1338 return -EAGAIN;
1251 1339
1252 goto again; 1340 goto again;
1253 } else if (!bitmap_info->bytes) { 1341 } else if (!bitmap_info->bytes)
1254 unlink_free_space(block_group, bitmap_info); 1342 free_bitmap(block_group, bitmap_info);
1255 kfree(bitmap_info->bitmap);
1256 kfree(bitmap_info);
1257 block_group->total_bitmaps--;
1258 recalculate_thresholds(block_group);
1259 }
1260 1343
1261 return 0; 1344 return 0;
1262} 1345}
@@ -1273,9 +1356,22 @@ static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
1273 * If we are below the extents threshold then we can add this as an 1356 * If we are below the extents threshold then we can add this as an
1274 * extent, and don't have to deal with the bitmap 1357 * extent, and don't have to deal with the bitmap
1275 */ 1358 */
1276 if (block_group->free_extents < block_group->extents_thresh && 1359 if (block_group->free_extents < block_group->extents_thresh) {
1277 info->bytes > block_group->sectorsize * 4) 1360 /*
1278 return 0; 1361 * If this block group has some small extents we don't want to
1362 * use up all of our free slots in the cache with them, we want
1363 * to reserve them to larger extents, however if we have plent
1364 * of cache left then go ahead an dadd them, no sense in adding
1365 * the overhead of a bitmap if we don't have to.
1366 */
1367 if (info->bytes <= block_group->sectorsize * 4) {
1368 if (block_group->free_extents * 2 <=
1369 block_group->extents_thresh)
1370 return 0;
1371 } else {
1372 return 0;
1373 }
1374 }
1279 1375
1280 /* 1376 /*
1281 * some block groups are so tiny they can't be enveloped by a bitmap, so 1377 * some block groups are so tiny they can't be enveloped by a bitmap, so
@@ -1330,8 +1426,8 @@ new_bitmap:
1330 1426
1331 /* no pre-allocated info, allocate a new one */ 1427 /* no pre-allocated info, allocate a new one */
1332 if (!info) { 1428 if (!info) {
1333 info = kzalloc(sizeof(struct btrfs_free_space), 1429 info = kmem_cache_zalloc(btrfs_free_space_cachep,
1334 GFP_NOFS); 1430 GFP_NOFS);
1335 if (!info) { 1431 if (!info) {
1336 spin_lock(&block_group->tree_lock); 1432 spin_lock(&block_group->tree_lock);
1337 ret = -ENOMEM; 1433 ret = -ENOMEM;
@@ -1353,28 +1449,20 @@ out:
1353 if (info) { 1449 if (info) {
1354 if (info->bitmap) 1450 if (info->bitmap)
1355 kfree(info->bitmap); 1451 kfree(info->bitmap);
1356 kfree(info); 1452 kmem_cache_free(btrfs_free_space_cachep, info);
1357 } 1453 }
1358 1454
1359 return ret; 1455 return ret;
1360} 1456}
1361 1457
1362int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 1458bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
1363 u64 offset, u64 bytes) 1459 struct btrfs_free_space *info, bool update_stat)
1364{ 1460{
1365 struct btrfs_free_space *right_info = NULL; 1461 struct btrfs_free_space *left_info;
1366 struct btrfs_free_space *left_info = NULL; 1462 struct btrfs_free_space *right_info;
1367 struct btrfs_free_space *info = NULL; 1463 bool merged = false;
1368 int ret = 0; 1464 u64 offset = info->offset;
1369 1465 u64 bytes = info->bytes;
1370 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
1371 if (!info)
1372 return -ENOMEM;
1373
1374 info->offset = offset;
1375 info->bytes = bytes;
1376
1377 spin_lock(&block_group->tree_lock);
1378 1466
1379 /* 1467 /*
1380 * first we want to see if there is free space adjacent to the range we 1468 * first we want to see if there is free space adjacent to the range we
@@ -1388,40 +1476,65 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
1388 else 1476 else
1389 left_info = tree_search_offset(block_group, offset - 1, 0, 0); 1477 left_info = tree_search_offset(block_group, offset - 1, 0, 0);
1390 1478
1391 /*
1392 * If there was no extent directly to the left or right of this new
1393 * extent then we know we're going to have to allocate a new extent, so
1394 * before we do that see if we need to drop this into a bitmap
1395 */
1396 if ((!left_info || left_info->bitmap) &&
1397 (!right_info || right_info->bitmap)) {
1398 ret = insert_into_bitmap(block_group, info);
1399
1400 if (ret < 0) {
1401 goto out;
1402 } else if (ret) {
1403 ret = 0;
1404 goto out;
1405 }
1406 }
1407
1408 if (right_info && !right_info->bitmap) { 1479 if (right_info && !right_info->bitmap) {
1409 unlink_free_space(block_group, right_info); 1480 if (update_stat)
1481 unlink_free_space(block_group, right_info);
1482 else
1483 __unlink_free_space(block_group, right_info);
1410 info->bytes += right_info->bytes; 1484 info->bytes += right_info->bytes;
1411 kfree(right_info); 1485 kmem_cache_free(btrfs_free_space_cachep, right_info);
1486 merged = true;
1412 } 1487 }
1413 1488
1414 if (left_info && !left_info->bitmap && 1489 if (left_info && !left_info->bitmap &&
1415 left_info->offset + left_info->bytes == offset) { 1490 left_info->offset + left_info->bytes == offset) {
1416 unlink_free_space(block_group, left_info); 1491 if (update_stat)
1492 unlink_free_space(block_group, left_info);
1493 else
1494 __unlink_free_space(block_group, left_info);
1417 info->offset = left_info->offset; 1495 info->offset = left_info->offset;
1418 info->bytes += left_info->bytes; 1496 info->bytes += left_info->bytes;
1419 kfree(left_info); 1497 kmem_cache_free(btrfs_free_space_cachep, left_info);
1498 merged = true;
1420 } 1499 }
1421 1500
1501 return merged;
1502}
1503
1504int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
1505 u64 offset, u64 bytes)
1506{
1507 struct btrfs_free_space *info;
1508 int ret = 0;
1509
1510 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
1511 if (!info)
1512 return -ENOMEM;
1513
1514 info->offset = offset;
1515 info->bytes = bytes;
1516
1517 spin_lock(&block_group->tree_lock);
1518
1519 if (try_merge_free_space(block_group, info, true))
1520 goto link;
1521
1522 /*
1523 * There was no extent directly to the left or right of this new
1524 * extent then we know we're going to have to allocate a new extent, so
1525 * before we do that see if we need to drop this into a bitmap
1526 */
1527 ret = insert_into_bitmap(block_group, info);
1528 if (ret < 0) {
1529 goto out;
1530 } else if (ret) {
1531 ret = 0;
1532 goto out;
1533 }
1534link:
1422 ret = link_free_space(block_group, info); 1535 ret = link_free_space(block_group, info);
1423 if (ret) 1536 if (ret)
1424 kfree(info); 1537 kmem_cache_free(btrfs_free_space_cachep, info);
1425out: 1538out:
1426 spin_unlock(&block_group->tree_lock); 1539 spin_unlock(&block_group->tree_lock);
1427 1540
@@ -1491,7 +1604,7 @@ again:
1491 kfree(info->bitmap); 1604 kfree(info->bitmap);
1492 block_group->total_bitmaps--; 1605 block_group->total_bitmaps--;
1493 } 1606 }
1494 kfree(info); 1607 kmem_cache_free(btrfs_free_space_cachep, info);
1495 goto out_lock; 1608 goto out_lock;
1496 } 1609 }
1497 1610
@@ -1527,7 +1640,7 @@ again:
1527 /* the hole we're creating ends at the end 1640 /* the hole we're creating ends at the end
1528 * of the info struct, just free the info 1641 * of the info struct, just free the info
1529 */ 1642 */
1530 kfree(info); 1643 kmem_cache_free(btrfs_free_space_cachep, info);
1531 } 1644 }
1532 spin_unlock(&block_group->tree_lock); 1645 spin_unlock(&block_group->tree_lock);
1533 1646
@@ -1600,29 +1713,28 @@ __btrfs_return_cluster_to_free_space(
1600{ 1713{
1601 struct btrfs_free_space *entry; 1714 struct btrfs_free_space *entry;
1602 struct rb_node *node; 1715 struct rb_node *node;
1603 bool bitmap;
1604 1716
1605 spin_lock(&cluster->lock); 1717 spin_lock(&cluster->lock);
1606 if (cluster->block_group != block_group) 1718 if (cluster->block_group != block_group)
1607 goto out; 1719 goto out;
1608 1720
1609 bitmap = cluster->points_to_bitmap;
1610 cluster->block_group = NULL; 1721 cluster->block_group = NULL;
1611 cluster->window_start = 0; 1722 cluster->window_start = 0;
1612 list_del_init(&cluster->block_group_list); 1723 list_del_init(&cluster->block_group_list);
1613 cluster->points_to_bitmap = false;
1614
1615 if (bitmap)
1616 goto out;
1617 1724
1618 node = rb_first(&cluster->root); 1725 node = rb_first(&cluster->root);
1619 while (node) { 1726 while (node) {
1727 bool bitmap;
1728
1620 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1729 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1621 node = rb_next(&entry->offset_index); 1730 node = rb_next(&entry->offset_index);
1622 rb_erase(&entry->offset_index, &cluster->root); 1731 rb_erase(&entry->offset_index, &cluster->root);
1623 BUG_ON(entry->bitmap); 1732
1733 bitmap = (entry->bitmap != NULL);
1734 if (!bitmap)
1735 try_merge_free_space(block_group, entry, false);
1624 tree_insert_offset(&block_group->free_space_offset, 1736 tree_insert_offset(&block_group->free_space_offset,
1625 entry->offset, &entry->offset_index, 0); 1737 entry->offset, &entry->offset_index, bitmap);
1626 } 1738 }
1627 cluster->root = RB_ROOT; 1739 cluster->root = RB_ROOT;
1628 1740
@@ -1659,7 +1771,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
1659 unlink_free_space(block_group, info); 1771 unlink_free_space(block_group, info);
1660 if (info->bitmap) 1772 if (info->bitmap)
1661 kfree(info->bitmap); 1773 kfree(info->bitmap);
1662 kfree(info); 1774 kmem_cache_free(btrfs_free_space_cachep, info);
1663 if (need_resched()) { 1775 if (need_resched()) {
1664 spin_unlock(&block_group->tree_lock); 1776 spin_unlock(&block_group->tree_lock);
1665 cond_resched(); 1777 cond_resched();
@@ -1685,19 +1797,14 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
1685 ret = offset; 1797 ret = offset;
1686 if (entry->bitmap) { 1798 if (entry->bitmap) {
1687 bitmap_clear_bits(block_group, entry, offset, bytes); 1799 bitmap_clear_bits(block_group, entry, offset, bytes);
1688 if (!entry->bytes) { 1800 if (!entry->bytes)
1689 unlink_free_space(block_group, entry); 1801 free_bitmap(block_group, entry);
1690 kfree(entry->bitmap);
1691 kfree(entry);
1692 block_group->total_bitmaps--;
1693 recalculate_thresholds(block_group);
1694 }
1695 } else { 1802 } else {
1696 unlink_free_space(block_group, entry); 1803 unlink_free_space(block_group, entry);
1697 entry->offset += bytes; 1804 entry->offset += bytes;
1698 entry->bytes -= bytes; 1805 entry->bytes -= bytes;
1699 if (!entry->bytes) 1806 if (!entry->bytes)
1700 kfree(entry); 1807 kmem_cache_free(btrfs_free_space_cachep, entry);
1701 else 1808 else
1702 link_free_space(block_group, entry); 1809 link_free_space(block_group, entry);
1703 } 1810 }
@@ -1750,48 +1857,24 @@ int btrfs_return_cluster_to_free_space(
1750 1857
1751static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, 1858static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
1752 struct btrfs_free_cluster *cluster, 1859 struct btrfs_free_cluster *cluster,
1860 struct btrfs_free_space *entry,
1753 u64 bytes, u64 min_start) 1861 u64 bytes, u64 min_start)
1754{ 1862{
1755 struct btrfs_free_space *entry;
1756 int err; 1863 int err;
1757 u64 search_start = cluster->window_start; 1864 u64 search_start = cluster->window_start;
1758 u64 search_bytes = bytes; 1865 u64 search_bytes = bytes;
1759 u64 ret = 0; 1866 u64 ret = 0;
1760 1867
1761 spin_lock(&block_group->tree_lock);
1762 spin_lock(&cluster->lock);
1763
1764 if (!cluster->points_to_bitmap)
1765 goto out;
1766
1767 if (cluster->block_group != block_group)
1768 goto out;
1769
1770 /*
1771 * search_start is the beginning of the bitmap, but at some point it may
1772 * be a good idea to point to the actual start of the free area in the
1773 * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
1774 * to 1 to make sure we get the bitmap entry
1775 */
1776 entry = tree_search_offset(block_group,
1777 offset_to_bitmap(block_group, search_start),
1778 1, 0);
1779 if (!entry || !entry->bitmap)
1780 goto out;
1781
1782 search_start = min_start; 1868 search_start = min_start;
1783 search_bytes = bytes; 1869 search_bytes = bytes;
1784 1870
1785 err = search_bitmap(block_group, entry, &search_start, 1871 err = search_bitmap(block_group, entry, &search_start,
1786 &search_bytes); 1872 &search_bytes);
1787 if (err) 1873 if (err)
1788 goto out; 1874 return 0;
1789 1875
1790 ret = search_start; 1876 ret = search_start;
1791 bitmap_clear_bits(block_group, entry, ret, bytes); 1877 bitmap_clear_bits(block_group, entry, ret, bytes);
1792out:
1793 spin_unlock(&cluster->lock);
1794 spin_unlock(&block_group->tree_lock);
1795 1878
1796 return ret; 1879 return ret;
1797} 1880}
@@ -1809,10 +1892,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1809 struct rb_node *node; 1892 struct rb_node *node;
1810 u64 ret = 0; 1893 u64 ret = 0;
1811 1894
1812 if (cluster->points_to_bitmap)
1813 return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
1814 min_start);
1815
1816 spin_lock(&cluster->lock); 1895 spin_lock(&cluster->lock);
1817 if (bytes > cluster->max_size) 1896 if (bytes > cluster->max_size)
1818 goto out; 1897 goto out;
@@ -1825,9 +1904,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1825 goto out; 1904 goto out;
1826 1905
1827 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1906 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1828
1829 while(1) { 1907 while(1) {
1830 if (entry->bytes < bytes || entry->offset < min_start) { 1908 if (entry->bytes < bytes ||
1909 (!entry->bitmap && entry->offset < min_start)) {
1831 struct rb_node *node; 1910 struct rb_node *node;
1832 1911
1833 node = rb_next(&entry->offset_index); 1912 node = rb_next(&entry->offset_index);
@@ -1837,20 +1916,53 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1837 offset_index); 1916 offset_index);
1838 continue; 1917 continue;
1839 } 1918 }
1840 ret = entry->offset;
1841 1919
1842 entry->offset += bytes; 1920 if (entry->bitmap) {
1843 entry->bytes -= bytes; 1921 ret = btrfs_alloc_from_bitmap(block_group,
1922 cluster, entry, bytes,
1923 min_start);
1924 if (ret == 0) {
1925 struct rb_node *node;
1926 node = rb_next(&entry->offset_index);
1927 if (!node)
1928 break;
1929 entry = rb_entry(node, struct btrfs_free_space,
1930 offset_index);
1931 continue;
1932 }
1933 } else {
1844 1934
1845 if (entry->bytes == 0) { 1935 ret = entry->offset;
1846 rb_erase(&entry->offset_index, &cluster->root); 1936
1847 kfree(entry); 1937 entry->offset += bytes;
1938 entry->bytes -= bytes;
1848 } 1939 }
1940
1941 if (entry->bytes == 0)
1942 rb_erase(&entry->offset_index, &cluster->root);
1849 break; 1943 break;
1850 } 1944 }
1851out: 1945out:
1852 spin_unlock(&cluster->lock); 1946 spin_unlock(&cluster->lock);
1853 1947
1948 if (!ret)
1949 return 0;
1950
1951 spin_lock(&block_group->tree_lock);
1952
1953 block_group->free_space -= bytes;
1954 if (entry->bytes == 0) {
1955 block_group->free_extents--;
1956 if (entry->bitmap) {
1957 kfree(entry->bitmap);
1958 block_group->total_bitmaps--;
1959 recalculate_thresholds(block_group);
1960 }
1961 kmem_cache_free(btrfs_free_space_cachep, entry);
1962 }
1963
1964 spin_unlock(&block_group->tree_lock);
1965
1854 return ret; 1966 return ret;
1855} 1967}
1856 1968
@@ -1866,12 +1978,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1866 unsigned long found_bits; 1978 unsigned long found_bits;
1867 unsigned long start = 0; 1979 unsigned long start = 0;
1868 unsigned long total_found = 0; 1980 unsigned long total_found = 0;
1981 int ret;
1869 bool found = false; 1982 bool found = false;
1870 1983
1871 i = offset_to_bit(entry->offset, block_group->sectorsize, 1984 i = offset_to_bit(entry->offset, block_group->sectorsize,
1872 max_t(u64, offset, entry->offset)); 1985 max_t(u64, offset, entry->offset));
1873 search_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 1986 search_bits = bytes_to_bits(bytes, block_group->sectorsize);
1874 total_bits = bytes_to_bits(bytes, block_group->sectorsize); 1987 total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
1875 1988
1876again: 1989again:
1877 found_bits = 0; 1990 found_bits = 0;
@@ -1888,7 +2001,7 @@ again:
1888 } 2001 }
1889 2002
1890 if (!found_bits) 2003 if (!found_bits)
1891 return -1; 2004 return -ENOSPC;
1892 2005
1893 if (!found) { 2006 if (!found) {
1894 start = i; 2007 start = i;
@@ -1912,189 +2025,208 @@ again:
1912 2025
1913 cluster->window_start = start * block_group->sectorsize + 2026 cluster->window_start = start * block_group->sectorsize +
1914 entry->offset; 2027 entry->offset;
1915 cluster->points_to_bitmap = true; 2028 rb_erase(&entry->offset_index, &block_group->free_space_offset);
2029 ret = tree_insert_offset(&cluster->root, entry->offset,
2030 &entry->offset_index, 1);
2031 BUG_ON(ret);
1916 2032
1917 return 0; 2033 return 0;
1918} 2034}
1919 2035
1920/* 2036/*
1921 * here we try to find a cluster of blocks in a block group. The goal 2037 * This searches the block group for just extents to fill the cluster with.
1922 * is to find at least bytes free and up to empty_size + bytes free.
1923 * We might not find them all in one contiguous area.
1924 *
1925 * returns zero and sets up cluster if things worked out, otherwise
1926 * it returns -enospc
1927 */ 2038 */
1928int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, 2039static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
1929 struct btrfs_root *root, 2040 struct btrfs_free_cluster *cluster,
1930 struct btrfs_block_group_cache *block_group, 2041 u64 offset, u64 bytes, u64 min_bytes)
1931 struct btrfs_free_cluster *cluster,
1932 u64 offset, u64 bytes, u64 empty_size)
1933{ 2042{
2043 struct btrfs_free_space *first = NULL;
1934 struct btrfs_free_space *entry = NULL; 2044 struct btrfs_free_space *entry = NULL;
2045 struct btrfs_free_space *prev = NULL;
2046 struct btrfs_free_space *last;
1935 struct rb_node *node; 2047 struct rb_node *node;
1936 struct btrfs_free_space *next;
1937 struct btrfs_free_space *last = NULL;
1938 u64 min_bytes;
1939 u64 window_start; 2048 u64 window_start;
1940 u64 window_free; 2049 u64 window_free;
1941 u64 max_extent = 0; 2050 u64 max_extent;
1942 bool found_bitmap = false; 2051 u64 max_gap = 128 * 1024;
1943 int ret;
1944 2052
1945 /* for metadata, allow allocates with more holes */ 2053 entry = tree_search_offset(block_group, offset, 0, 1);
1946 if (btrfs_test_opt(root, SSD_SPREAD)) { 2054 if (!entry)
1947 min_bytes = bytes + empty_size; 2055 return -ENOSPC;
1948 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
1949 /*
1950 * we want to do larger allocations when we are
1951 * flushing out the delayed refs, it helps prevent
1952 * making more work as we go along.
1953 */
1954 if (trans->transaction->delayed_refs.flushing)
1955 min_bytes = max(bytes, (bytes + empty_size) >> 1);
1956 else
1957 min_bytes = max(bytes, (bytes + empty_size) >> 4);
1958 } else
1959 min_bytes = max(bytes, (bytes + empty_size) >> 2);
1960
1961 spin_lock(&block_group->tree_lock);
1962 spin_lock(&cluster->lock);
1963
1964 /* someone already found a cluster, hooray */
1965 if (cluster->block_group) {
1966 ret = 0;
1967 goto out;
1968 }
1969again:
1970 entry = tree_search_offset(block_group, offset, found_bitmap, 1);
1971 if (!entry) {
1972 ret = -ENOSPC;
1973 goto out;
1974 }
1975 2056
1976 /* 2057 /*
1977 * If found_bitmap is true, we exhausted our search for extent entries, 2058 * We don't want bitmaps, so just move along until we find a normal
1978 * and we just want to search all of the bitmaps that we can find, and 2059 * extent entry.
1979 * ignore any extent entries we find.
1980 */ 2060 */
1981 while (entry->bitmap || found_bitmap || 2061 while (entry->bitmap) {
1982 (!entry->bitmap && entry->bytes < min_bytes)) { 2062 node = rb_next(&entry->offset_index);
1983 struct rb_node *node = rb_next(&entry->offset_index); 2063 if (!node)
1984 2064 return -ENOSPC;
1985 if (entry->bitmap && entry->bytes > bytes + empty_size) {
1986 ret = btrfs_bitmap_cluster(block_group, entry, cluster,
1987 offset, bytes + empty_size,
1988 min_bytes);
1989 if (!ret)
1990 goto got_it;
1991 }
1992
1993 if (!node) {
1994 ret = -ENOSPC;
1995 goto out;
1996 }
1997 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2065 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1998 } 2066 }
1999 2067
2000 /*
2001 * We already searched all the extent entries from the passed in offset
2002 * to the end and didn't find enough space for the cluster, and we also
2003 * didn't find any bitmaps that met our criteria, just go ahead and exit
2004 */
2005 if (found_bitmap) {
2006 ret = -ENOSPC;
2007 goto out;
2008 }
2009
2010 cluster->points_to_bitmap = false;
2011 window_start = entry->offset; 2068 window_start = entry->offset;
2012 window_free = entry->bytes; 2069 window_free = entry->bytes;
2013 last = entry;
2014 max_extent = entry->bytes; 2070 max_extent = entry->bytes;
2071 first = entry;
2072 last = entry;
2073 prev = entry;
2015 2074
2016 while (1) { 2075 while (window_free <= min_bytes) {
2017 /* out window is just right, lets fill it */ 2076 node = rb_next(&entry->offset_index);
2018 if (window_free >= bytes + empty_size) 2077 if (!node)
2019 break; 2078 return -ENOSPC;
2020 2079 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2021 node = rb_next(&last->offset_index);
2022 if (!node) {
2023 if (found_bitmap)
2024 goto again;
2025 ret = -ENOSPC;
2026 goto out;
2027 }
2028 next = rb_entry(node, struct btrfs_free_space, offset_index);
2029 2080
2030 /* 2081 if (entry->bitmap)
2031 * we found a bitmap, so if this search doesn't result in a
2032 * cluster, we know to go and search again for the bitmaps and
2033 * start looking for space there
2034 */
2035 if (next->bitmap) {
2036 if (!found_bitmap)
2037 offset = next->offset;
2038 found_bitmap = true;
2039 last = next;
2040 continue; 2082 continue;
2041 }
2042
2043 /* 2083 /*
2044 * we haven't filled the empty size and the window is 2084 * we haven't filled the empty size and the window is
2045 * very large. reset and try again 2085 * very large. reset and try again
2046 */ 2086 */
2047 if (next->offset - (last->offset + last->bytes) > 128 * 1024 || 2087 if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
2048 next->offset - window_start > (bytes + empty_size) * 2) { 2088 entry->offset - window_start > (min_bytes * 2)) {
2049 entry = next; 2089 first = entry;
2050 window_start = entry->offset; 2090 window_start = entry->offset;
2051 window_free = entry->bytes; 2091 window_free = entry->bytes;
2052 last = entry; 2092 last = entry;
2053 max_extent = entry->bytes; 2093 max_extent = entry->bytes;
2054 } else { 2094 } else {
2055 last = next; 2095 last = entry;
2056 window_free += next->bytes; 2096 window_free += entry->bytes;
2057 if (entry->bytes > max_extent) 2097 if (entry->bytes > max_extent)
2058 max_extent = entry->bytes; 2098 max_extent = entry->bytes;
2059 } 2099 }
2100 prev = entry;
2060 } 2101 }
2061 2102
2062 cluster->window_start = entry->offset; 2103 cluster->window_start = first->offset;
2104
2105 node = &first->offset_index;
2063 2106
2064 /* 2107 /*
2065 * now we've found our entries, pull them out of the free space 2108 * now we've found our entries, pull them out of the free space
2066 * cache and put them into the cluster rbtree 2109 * cache and put them into the cluster rbtree
2067 *
2068 * The cluster includes an rbtree, but only uses the offset index
2069 * of each free space cache entry.
2070 */ 2110 */
2071 while (1) { 2111 do {
2112 int ret;
2113
2114 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2072 node = rb_next(&entry->offset_index); 2115 node = rb_next(&entry->offset_index);
2073 if (entry->bitmap && node) { 2116 if (entry->bitmap)
2074 entry = rb_entry(node, struct btrfs_free_space,
2075 offset_index);
2076 continue; 2117 continue;
2077 } else if (entry->bitmap && !node) {
2078 break;
2079 }
2080 2118
2081 rb_erase(&entry->offset_index, &block_group->free_space_offset); 2119 rb_erase(&entry->offset_index, &block_group->free_space_offset);
2082 ret = tree_insert_offset(&cluster->root, entry->offset, 2120 ret = tree_insert_offset(&cluster->root, entry->offset,
2083 &entry->offset_index, 0); 2121 &entry->offset_index, 0);
2084 BUG_ON(ret); 2122 BUG_ON(ret);
2123 } while (node && entry != last);
2085 2124
2086 if (!node || entry == last) 2125 cluster->max_size = max_extent;
2087 break; 2126
2127 return 0;
2128}
2129
2130/*
2131 * This specifically looks for bitmaps that may work in the cluster, we assume
2132 * that we have already failed to find extents that will work.
2133 */
2134static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2135 struct btrfs_free_cluster *cluster,
2136 u64 offset, u64 bytes, u64 min_bytes)
2137{
2138 struct btrfs_free_space *entry;
2139 struct rb_node *node;
2140 int ret = -ENOSPC;
2141
2142 if (block_group->total_bitmaps == 0)
2143 return -ENOSPC;
2144
2145 entry = tree_search_offset(block_group,
2146 offset_to_bitmap(block_group, offset),
2147 0, 1);
2148 if (!entry)
2149 return -ENOSPC;
2088 2150
2151 node = &entry->offset_index;
2152 do {
2089 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2153 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2154 node = rb_next(&entry->offset_index);
2155 if (!entry->bitmap)
2156 continue;
2157 if (entry->bytes < min_bytes)
2158 continue;
2159 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2160 bytes, min_bytes);
2161 } while (ret && node);
2162
2163 return ret;
2164}
2165
2166/*
2167 * here we try to find a cluster of blocks in a block group. The goal
2168 * is to find at least bytes free and up to empty_size + bytes free.
2169 * We might not find them all in one contiguous area.
2170 *
2171 * returns zero and sets up cluster if things worked out, otherwise
2172 * it returns -enospc
2173 */
2174int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2175 struct btrfs_root *root,
2176 struct btrfs_block_group_cache *block_group,
2177 struct btrfs_free_cluster *cluster,
2178 u64 offset, u64 bytes, u64 empty_size)
2179{
2180 u64 min_bytes;
2181 int ret;
2182
2183 /* for metadata, allow allocates with more holes */
2184 if (btrfs_test_opt(root, SSD_SPREAD)) {
2185 min_bytes = bytes + empty_size;
2186 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
2187 /*
2188 * we want to do larger allocations when we are
2189 * flushing out the delayed refs, it helps prevent
2190 * making more work as we go along.
2191 */
2192 if (trans->transaction->delayed_refs.flushing)
2193 min_bytes = max(bytes, (bytes + empty_size) >> 1);
2194 else
2195 min_bytes = max(bytes, (bytes + empty_size) >> 4);
2196 } else
2197 min_bytes = max(bytes, (bytes + empty_size) >> 2);
2198
2199 spin_lock(&block_group->tree_lock);
2200
2201 /*
2202 * If we know we don't have enough space to make a cluster don't even
2203 * bother doing all the work to try and find one.
2204 */
2205 if (block_group->free_space < min_bytes) {
2206 spin_unlock(&block_group->tree_lock);
2207 return -ENOSPC;
2090 } 2208 }
2091 2209
2092 cluster->max_size = max_extent; 2210 spin_lock(&cluster->lock);
2093got_it: 2211
2094 ret = 0; 2212 /* someone already found a cluster, hooray */
2095 atomic_inc(&block_group->count); 2213 if (cluster->block_group) {
2096 list_add_tail(&cluster->block_group_list, &block_group->cluster_list); 2214 ret = 0;
2097 cluster->block_group = block_group; 2215 goto out;
2216 }
2217
2218 ret = setup_cluster_no_bitmap(block_group, cluster, offset, bytes,
2219 min_bytes);
2220 if (ret)
2221 ret = setup_cluster_bitmap(block_group, cluster, offset,
2222 bytes, min_bytes);
2223
2224 if (!ret) {
2225 atomic_inc(&block_group->count);
2226 list_add_tail(&cluster->block_group_list,
2227 &block_group->cluster_list);
2228 cluster->block_group = block_group;
2229 }
2098out: 2230out:
2099 spin_unlock(&cluster->lock); 2231 spin_unlock(&cluster->lock);
2100 spin_unlock(&block_group->tree_lock); 2232 spin_unlock(&block_group->tree_lock);
@@ -2111,8 +2243,99 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2111 spin_lock_init(&cluster->refill_lock); 2243 spin_lock_init(&cluster->refill_lock);
2112 cluster->root = RB_ROOT; 2244 cluster->root = RB_ROOT;
2113 cluster->max_size = 0; 2245 cluster->max_size = 0;
2114 cluster->points_to_bitmap = false;
2115 INIT_LIST_HEAD(&cluster->block_group_list); 2246 INIT_LIST_HEAD(&cluster->block_group_list);
2116 cluster->block_group = NULL; 2247 cluster->block_group = NULL;
2117} 2248}
2118 2249
2250int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2251 u64 *trimmed, u64 start, u64 end, u64 minlen)
2252{
2253 struct btrfs_free_space *entry = NULL;
2254 struct btrfs_fs_info *fs_info = block_group->fs_info;
2255 u64 bytes = 0;
2256 u64 actually_trimmed;
2257 int ret = 0;
2258
2259 *trimmed = 0;
2260
2261 while (start < end) {
2262 spin_lock(&block_group->tree_lock);
2263
2264 if (block_group->free_space < minlen) {
2265 spin_unlock(&block_group->tree_lock);
2266 break;
2267 }
2268
2269 entry = tree_search_offset(block_group, start, 0, 1);
2270 if (!entry)
2271 entry = tree_search_offset(block_group,
2272 offset_to_bitmap(block_group,
2273 start),
2274 1, 1);
2275
2276 if (!entry || entry->offset >= end) {
2277 spin_unlock(&block_group->tree_lock);
2278 break;
2279 }
2280
2281 if (entry->bitmap) {
2282 ret = search_bitmap(block_group, entry, &start, &bytes);
2283 if (!ret) {
2284 if (start >= end) {
2285 spin_unlock(&block_group->tree_lock);
2286 break;
2287 }
2288 bytes = min(bytes, end - start);
2289 bitmap_clear_bits(block_group, entry,
2290 start, bytes);
2291 if (entry->bytes == 0)
2292 free_bitmap(block_group, entry);
2293 } else {
2294 start = entry->offset + BITS_PER_BITMAP *
2295 block_group->sectorsize;
2296 spin_unlock(&block_group->tree_lock);
2297 ret = 0;
2298 continue;
2299 }
2300 } else {
2301 start = entry->offset;
2302 bytes = min(entry->bytes, end - start);
2303 unlink_free_space(block_group, entry);
2304 kfree(entry);
2305 }
2306
2307 spin_unlock(&block_group->tree_lock);
2308
2309 if (bytes >= minlen) {
2310 int update_ret;
2311 update_ret = btrfs_update_reserved_bytes(block_group,
2312 bytes, 1, 1);
2313
2314 ret = btrfs_error_discard_extent(fs_info->extent_root,
2315 start,
2316 bytes,
2317 &actually_trimmed);
2318
2319 btrfs_add_free_space(block_group,
2320 start, bytes);
2321 if (!update_ret)
2322 btrfs_update_reserved_bytes(block_group,
2323 bytes, 0, 1);
2324
2325 if (ret)
2326 break;
2327 *trimmed += actually_trimmed;
2328 }
2329 start += bytes;
2330 bytes = 0;
2331
2332 if (fatal_signal_pending(current)) {
2333 ret = -ERESTARTSYS;
2334 break;
2335 }
2336
2337 cond_resched();
2338 }
2339
2340 return ret;
2341}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index e49ca5c321b5..65c3b935289f 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -68,4 +68,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
68int btrfs_return_cluster_to_free_space( 68int btrfs_return_cluster_to_free_space(
69 struct btrfs_block_group_cache *block_group, 69 struct btrfs_block_group_cache *block_group,
70 struct btrfs_free_cluster *cluster); 70 struct btrfs_free_cluster *cluster);
71int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
72 u64 *trimmed, u64 start, u64 end, u64 minlen);
71#endif 73#endif
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index c56eb5909172..c05a08f4c411 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -30,7 +30,8 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
30 int slot; 30 int slot;
31 31
32 path = btrfs_alloc_path(); 32 path = btrfs_alloc_path();
33 BUG_ON(!path); 33 if (!path)
34 return -ENOMEM;
34 35
35 search_key.objectid = BTRFS_LAST_FREE_OBJECTID; 36 search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
36 search_key.type = -1; 37 search_key.type = -1;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 160b55b3e132..fcd66b6a8086 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -50,6 +50,7 @@
50#include "tree-log.h" 50#include "tree-log.h"
51#include "compression.h" 51#include "compression.h"
52#include "locking.h" 52#include "locking.h"
53#include "free-space-cache.h"
53 54
54struct btrfs_iget_args { 55struct btrfs_iget_args {
55 u64 ino; 56 u64 ino;
@@ -70,6 +71,7 @@ static struct kmem_cache *btrfs_inode_cachep;
70struct kmem_cache *btrfs_trans_handle_cachep; 71struct kmem_cache *btrfs_trans_handle_cachep;
71struct kmem_cache *btrfs_transaction_cachep; 72struct kmem_cache *btrfs_transaction_cachep;
72struct kmem_cache *btrfs_path_cachep; 73struct kmem_cache *btrfs_path_cachep;
74struct kmem_cache *btrfs_free_space_cachep;
73 75
74#define S_SHIFT 12 76#define S_SHIFT 12
75static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { 77static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -82,7 +84,8 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
82 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, 84 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
83}; 85};
84 86
85static void btrfs_truncate(struct inode *inode); 87static int btrfs_setsize(struct inode *inode, loff_t newsize);
88static int btrfs_truncate(struct inode *inode);
86static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); 89static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
87static noinline int cow_file_range(struct inode *inode, 90static noinline int cow_file_range(struct inode *inode,
88 struct page *locked_page, 91 struct page *locked_page,
@@ -90,13 +93,14 @@ static noinline int cow_file_range(struct inode *inode,
90 unsigned long *nr_written, int unlock); 93 unsigned long *nr_written, int unlock);
91 94
92static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 95static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
93 struct inode *inode, struct inode *dir) 96 struct inode *inode, struct inode *dir,
97 const struct qstr *qstr)
94{ 98{
95 int err; 99 int err;
96 100
97 err = btrfs_init_acl(trans, inode, dir); 101 err = btrfs_init_acl(trans, inode, dir);
98 if (!err) 102 if (!err)
99 err = btrfs_xattr_security_init(trans, inode, dir); 103 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
100 return err; 104 return err;
101} 105}
102 106
@@ -108,6 +112,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
108static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, 112static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
109 struct btrfs_root *root, struct inode *inode, 113 struct btrfs_root *root, struct inode *inode,
110 u64 start, size_t size, size_t compressed_size, 114 u64 start, size_t size, size_t compressed_size,
115 int compress_type,
111 struct page **compressed_pages) 116 struct page **compressed_pages)
112{ 117{
113 struct btrfs_key key; 118 struct btrfs_key key;
@@ -122,12 +127,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
122 size_t cur_size = size; 127 size_t cur_size = size;
123 size_t datasize; 128 size_t datasize;
124 unsigned long offset; 129 unsigned long offset;
125 int compress_type = BTRFS_COMPRESS_NONE;
126 130
127 if (compressed_size && compressed_pages) { 131 if (compressed_size && compressed_pages)
128 compress_type = root->fs_info->compress_type;
129 cur_size = compressed_size; 132 cur_size = compressed_size;
130 }
131 133
132 path = btrfs_alloc_path(); 134 path = btrfs_alloc_path();
133 if (!path) 135 if (!path)
@@ -217,7 +219,7 @@ fail:
217static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, 219static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
218 struct btrfs_root *root, 220 struct btrfs_root *root,
219 struct inode *inode, u64 start, u64 end, 221 struct inode *inode, u64 start, u64 end,
220 size_t compressed_size, 222 size_t compressed_size, int compress_type,
221 struct page **compressed_pages) 223 struct page **compressed_pages)
222{ 224{
223 u64 isize = i_size_read(inode); 225 u64 isize = i_size_read(inode);
@@ -250,7 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
250 inline_len = min_t(u64, isize, actual_end); 252 inline_len = min_t(u64, isize, actual_end);
251 ret = insert_inline_extent(trans, root, inode, start, 253 ret = insert_inline_extent(trans, root, inode, start,
252 inline_len, compressed_size, 254 inline_len, compressed_size,
253 compressed_pages); 255 compress_type, compressed_pages);
254 BUG_ON(ret); 256 BUG_ON(ret);
255 btrfs_delalloc_release_metadata(inode, end + 1 - start); 257 btrfs_delalloc_release_metadata(inode, end + 1 - start);
256 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 258 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
@@ -287,6 +289,7 @@ static noinline int add_async_extent(struct async_cow *cow,
287 struct async_extent *async_extent; 289 struct async_extent *async_extent;
288 290
289 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); 291 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
292 BUG_ON(!async_extent);
290 async_extent->start = start; 293 async_extent->start = start;
291 async_extent->ram_size = ram_size; 294 async_extent->ram_size = ram_size;
292 async_extent->compressed_size = compressed_size; 295 async_extent->compressed_size = compressed_size;
@@ -381,9 +384,11 @@ again:
381 */ 384 */
382 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 385 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
383 (btrfs_test_opt(root, COMPRESS) || 386 (btrfs_test_opt(root, COMPRESS) ||
384 (BTRFS_I(inode)->force_compress))) { 387 (BTRFS_I(inode)->force_compress) ||
388 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
385 WARN_ON(pages); 389 WARN_ON(pages);
386 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 390 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
391 BUG_ON(!pages);
387 392
388 if (BTRFS_I(inode)->force_compress) 393 if (BTRFS_I(inode)->force_compress)
389 compress_type = BTRFS_I(inode)->force_compress; 394 compress_type = BTRFS_I(inode)->force_compress;
@@ -416,7 +421,7 @@ again:
416 } 421 }
417 if (start == 0) { 422 if (start == 0) {
418 trans = btrfs_join_transaction(root, 1); 423 trans = btrfs_join_transaction(root, 1);
419 BUG_ON(!trans); 424 BUG_ON(IS_ERR(trans));
420 btrfs_set_trans_block_group(trans, inode); 425 btrfs_set_trans_block_group(trans, inode);
421 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 426 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
422 427
@@ -426,12 +431,13 @@ again:
426 * to make an uncompressed inline extent. 431 * to make an uncompressed inline extent.
427 */ 432 */
428 ret = cow_file_range_inline(trans, root, inode, 433 ret = cow_file_range_inline(trans, root, inode,
429 start, end, 0, NULL); 434 start, end, 0, 0, NULL);
430 } else { 435 } else {
431 /* try making a compressed inline extent */ 436 /* try making a compressed inline extent */
432 ret = cow_file_range_inline(trans, root, inode, 437 ret = cow_file_range_inline(trans, root, inode,
433 start, end, 438 start, end,
434 total_compressed, pages); 439 total_compressed,
440 compress_type, pages);
435 } 441 }
436 if (ret == 0) { 442 if (ret == 0) {
437 /* 443 /*
@@ -612,6 +618,7 @@ retry:
612 GFP_NOFS); 618 GFP_NOFS);
613 619
614 trans = btrfs_join_transaction(root, 1); 620 trans = btrfs_join_transaction(root, 1);
621 BUG_ON(IS_ERR(trans));
615 ret = btrfs_reserve_extent(trans, root, 622 ret = btrfs_reserve_extent(trans, root,
616 async_extent->compressed_size, 623 async_extent->compressed_size,
617 async_extent->compressed_size, 624 async_extent->compressed_size,
@@ -643,6 +650,7 @@ retry:
643 async_extent->ram_size - 1, 0); 650 async_extent->ram_size - 1, 0);
644 651
645 em = alloc_extent_map(GFP_NOFS); 652 em = alloc_extent_map(GFP_NOFS);
653 BUG_ON(!em);
646 em->start = async_extent->start; 654 em->start = async_extent->start;
647 em->len = async_extent->ram_size; 655 em->len = async_extent->ram_size;
648 em->orig_start = em->start; 656 em->orig_start = em->start;
@@ -771,7 +779,7 @@ static noinline int cow_file_range(struct inode *inode,
771 779
772 BUG_ON(root == root->fs_info->tree_root); 780 BUG_ON(root == root->fs_info->tree_root);
773 trans = btrfs_join_transaction(root, 1); 781 trans = btrfs_join_transaction(root, 1);
774 BUG_ON(!trans); 782 BUG_ON(IS_ERR(trans));
775 btrfs_set_trans_block_group(trans, inode); 783 btrfs_set_trans_block_group(trans, inode);
776 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 784 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
777 785
@@ -783,7 +791,7 @@ static noinline int cow_file_range(struct inode *inode,
783 if (start == 0) { 791 if (start == 0) {
784 /* lets try to make an inline extent */ 792 /* lets try to make an inline extent */
785 ret = cow_file_range_inline(trans, root, inode, 793 ret = cow_file_range_inline(trans, root, inode,
786 start, end, 0, NULL); 794 start, end, 0, 0, NULL);
787 if (ret == 0) { 795 if (ret == 0) {
788 extent_clear_unlock_delalloc(inode, 796 extent_clear_unlock_delalloc(inode,
789 &BTRFS_I(inode)->io_tree, 797 &BTRFS_I(inode)->io_tree,
@@ -819,6 +827,7 @@ static noinline int cow_file_range(struct inode *inode,
819 BUG_ON(ret); 827 BUG_ON(ret);
820 828
821 em = alloc_extent_map(GFP_NOFS); 829 em = alloc_extent_map(GFP_NOFS);
830 BUG_ON(!em);
822 em->start = start; 831 em->start = start;
823 em->orig_start = em->start; 832 em->orig_start = em->start;
824 ram_size = ins.offset; 833 ram_size = ins.offset;
@@ -1049,7 +1058,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1049 } else { 1058 } else {
1050 trans = btrfs_join_transaction(root, 1); 1059 trans = btrfs_join_transaction(root, 1);
1051 } 1060 }
1052 BUG_ON(!trans); 1061 BUG_ON(IS_ERR(trans));
1053 1062
1054 cow_start = (u64)-1; 1063 cow_start = (u64)-1;
1055 cur_offset = start; 1064 cur_offset = start;
@@ -1168,6 +1177,7 @@ out_check:
1168 struct extent_map_tree *em_tree; 1177 struct extent_map_tree *em_tree;
1169 em_tree = &BTRFS_I(inode)->extent_tree; 1178 em_tree = &BTRFS_I(inode)->extent_tree;
1170 em = alloc_extent_map(GFP_NOFS); 1179 em = alloc_extent_map(GFP_NOFS);
1180 BUG_ON(!em);
1171 em->start = cur_offset; 1181 em->start = cur_offset;
1172 em->orig_start = em->start; 1182 em->orig_start = em->start;
1173 em->len = num_bytes; 1183 em->len = num_bytes;
@@ -1249,7 +1259,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1249 ret = run_delalloc_nocow(inode, locked_page, start, end, 1259 ret = run_delalloc_nocow(inode, locked_page, start, end,
1250 page_started, 0, nr_written); 1260 page_started, 0, nr_written);
1251 else if (!btrfs_test_opt(root, COMPRESS) && 1261 else if (!btrfs_test_opt(root, COMPRESS) &&
1252 !(BTRFS_I(inode)->force_compress)) 1262 !(BTRFS_I(inode)->force_compress) &&
1263 !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))
1253 ret = cow_file_range(inode, locked_page, start, end, 1264 ret = cow_file_range(inode, locked_page, start, end,
1254 page_started, nr_written, 1); 1265 page_started, nr_written, 1);
1255 else 1266 else
@@ -1456,8 +1467,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1456 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1467 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1457 return btrfs_submit_compressed_read(inode, bio, 1468 return btrfs_submit_compressed_read(inode, bio,
1458 mirror_num, bio_flags); 1469 mirror_num, bio_flags);
1459 } else if (!skip_sum) 1470 } else if (!skip_sum) {
1460 btrfs_lookup_bio_sums(root, inode, bio, NULL); 1471 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1472 if (ret)
1473 return ret;
1474 }
1461 goto mapit; 1475 goto mapit;
1462 } else if (!skip_sum) { 1476 } else if (!skip_sum) {
1463 /* csum items have already been cloned */ 1477 /* csum items have already been cloned */
@@ -1557,6 +1571,7 @@ out:
1557out_page: 1571out_page:
1558 unlock_page(page); 1572 unlock_page(page);
1559 page_cache_release(page); 1573 page_cache_release(page);
1574 kfree(fixup);
1560} 1575}
1561 1576
1562/* 1577/*
@@ -1703,7 +1718,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1703 trans = btrfs_join_transaction_nolock(root, 1); 1718 trans = btrfs_join_transaction_nolock(root, 1);
1704 else 1719 else
1705 trans = btrfs_join_transaction(root, 1); 1720 trans = btrfs_join_transaction(root, 1);
1706 BUG_ON(!trans); 1721 BUG_ON(IS_ERR(trans));
1707 btrfs_set_trans_block_group(trans, inode); 1722 btrfs_set_trans_block_group(trans, inode);
1708 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1723 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1709 ret = btrfs_update_inode(trans, root, inode); 1724 ret = btrfs_update_inode(trans, root, inode);
@@ -1720,6 +1735,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1720 trans = btrfs_join_transaction_nolock(root, 1); 1735 trans = btrfs_join_transaction_nolock(root, 1);
1721 else 1736 else
1722 trans = btrfs_join_transaction(root, 1); 1737 trans = btrfs_join_transaction(root, 1);
1738 BUG_ON(IS_ERR(trans));
1723 btrfs_set_trans_block_group(trans, inode); 1739 btrfs_set_trans_block_group(trans, inode);
1724 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1740 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1725 1741
@@ -1754,9 +1770,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1754 add_pending_csums(trans, inode, ordered_extent->file_offset, 1770 add_pending_csums(trans, inode, ordered_extent->file_offset,
1755 &ordered_extent->list); 1771 &ordered_extent->list);
1756 1772
1757 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1773 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1758 ret = btrfs_update_inode(trans, root, inode); 1774 if (!ret) {
1759 BUG_ON(ret); 1775 ret = btrfs_update_inode(trans, root, inode);
1776 BUG_ON(ret);
1777 }
1778 ret = 0;
1760out: 1779out:
1761 if (nolock) { 1780 if (nolock) {
1762 if (trans) 1781 if (trans)
@@ -1778,6 +1797,8 @@ out:
1778static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 1797static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1779 struct extent_state *state, int uptodate) 1798 struct extent_state *state, int uptodate)
1780{ 1799{
1800 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
1801
1781 ClearPagePrivate2(page); 1802 ClearPagePrivate2(page);
1782 return btrfs_finish_ordered_io(page->mapping->host, start, end); 1803 return btrfs_finish_ordered_io(page->mapping->host, start, end);
1783} 1804}
@@ -1888,10 +1909,10 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1888 else 1909 else
1889 rw = READ; 1910 rw = READ;
1890 1911
1891 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1912 ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1892 failrec->last_mirror, 1913 failrec->last_mirror,
1893 failrec->bio_flags, 0); 1914 failrec->bio_flags, 0);
1894 return 0; 1915 return ret;
1895} 1916}
1896 1917
1897/* 1918/*
@@ -1907,7 +1928,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1907 1928
1908 private = 0; 1929 private = 0;
1909 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 1930 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1910 (u64)-1, 1, EXTENT_DIRTY)) { 1931 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1911 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, 1932 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1912 start, &private_failure); 1933 start, &private_failure);
1913 if (ret == 0) { 1934 if (ret == 0) {
@@ -2203,8 +2224,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2203 insert = 1; 2224 insert = 1;
2204#endif 2225#endif
2205 insert = 1; 2226 insert = 1;
2206 } else {
2207 WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
2208 } 2227 }
2209 2228
2210 if (!BTRFS_I(inode)->orphan_meta_reserved) { 2229 if (!BTRFS_I(inode)->orphan_meta_reserved) {
@@ -2275,7 +2294,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2275 * this cleans up any orphans that may be left on the list from the last use 2294 * this cleans up any orphans that may be left on the list from the last use
2276 * of this root. 2295 * of this root.
2277 */ 2296 */
2278void btrfs_orphan_cleanup(struct btrfs_root *root) 2297int btrfs_orphan_cleanup(struct btrfs_root *root)
2279{ 2298{
2280 struct btrfs_path *path; 2299 struct btrfs_path *path;
2281 struct extent_buffer *leaf; 2300 struct extent_buffer *leaf;
@@ -2285,10 +2304,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2285 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2304 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2286 2305
2287 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2306 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2288 return; 2307 return 0;
2289 2308
2290 path = btrfs_alloc_path(); 2309 path = btrfs_alloc_path();
2291 BUG_ON(!path); 2310 if (!path) {
2311 ret = -ENOMEM;
2312 goto out;
2313 }
2292 path->reada = -1; 2314 path->reada = -1;
2293 2315
2294 key.objectid = BTRFS_ORPHAN_OBJECTID; 2316 key.objectid = BTRFS_ORPHAN_OBJECTID;
@@ -2297,18 +2319,16 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2297 2319
2298 while (1) { 2320 while (1) {
2299 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2321 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2300 if (ret < 0) { 2322 if (ret < 0)
2301 printk(KERN_ERR "Error searching slot for orphan: %d" 2323 goto out;
2302 "\n", ret);
2303 break;
2304 }
2305 2324
2306 /* 2325 /*
2307 * if ret == 0 means we found what we were searching for, which 2326 * if ret == 0 means we found what we were searching for, which
2308 * is weird, but possible, so only screw with path if we didnt 2327 * is weird, but possible, so only screw with path if we didn't
2309 * find the key and see if we have stuff that matches 2328 * find the key and see if we have stuff that matches
2310 */ 2329 */
2311 if (ret > 0) { 2330 if (ret > 0) {
2331 ret = 0;
2312 if (path->slots[0] == 0) 2332 if (path->slots[0] == 0)
2313 break; 2333 break;
2314 path->slots[0]--; 2334 path->slots[0]--;
@@ -2336,7 +2356,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2336 found_key.type = BTRFS_INODE_ITEM_KEY; 2356 found_key.type = BTRFS_INODE_ITEM_KEY;
2337 found_key.offset = 0; 2357 found_key.offset = 0;
2338 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2358 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2339 BUG_ON(IS_ERR(inode)); 2359 if (IS_ERR(inode)) {
2360 ret = PTR_ERR(inode);
2361 goto out;
2362 }
2340 2363
2341 /* 2364 /*
2342 * add this inode to the orphan list so btrfs_orphan_del does 2365 * add this inode to the orphan list so btrfs_orphan_del does
@@ -2354,6 +2377,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2354 */ 2377 */
2355 if (is_bad_inode(inode)) { 2378 if (is_bad_inode(inode)) {
2356 trans = btrfs_start_transaction(root, 0); 2379 trans = btrfs_start_transaction(root, 0);
2380 if (IS_ERR(trans)) {
2381 ret = PTR_ERR(trans);
2382 goto out;
2383 }
2357 btrfs_orphan_del(trans, inode); 2384 btrfs_orphan_del(trans, inode);
2358 btrfs_end_transaction(trans, root); 2385 btrfs_end_transaction(trans, root);
2359 iput(inode); 2386 iput(inode);
@@ -2362,17 +2389,22 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2362 2389
2363 /* if we have links, this was a truncate, lets do that */ 2390 /* if we have links, this was a truncate, lets do that */
2364 if (inode->i_nlink) { 2391 if (inode->i_nlink) {
2392 if (!S_ISREG(inode->i_mode)) {
2393 WARN_ON(1);
2394 iput(inode);
2395 continue;
2396 }
2365 nr_truncate++; 2397 nr_truncate++;
2366 btrfs_truncate(inode); 2398 ret = btrfs_truncate(inode);
2367 } else { 2399 } else {
2368 nr_unlink++; 2400 nr_unlink++;
2369 } 2401 }
2370 2402
2371 /* this will do delete_inode and everything for us */ 2403 /* this will do delete_inode and everything for us */
2372 iput(inode); 2404 iput(inode);
2405 if (ret)
2406 goto out;
2373 } 2407 }
2374 btrfs_free_path(path);
2375
2376 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2408 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2377 2409
2378 if (root->orphan_block_rsv) 2410 if (root->orphan_block_rsv)
@@ -2381,13 +2413,20 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2381 2413
2382 if (root->orphan_block_rsv || root->orphan_item_inserted) { 2414 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2383 trans = btrfs_join_transaction(root, 1); 2415 trans = btrfs_join_transaction(root, 1);
2384 btrfs_end_transaction(trans, root); 2416 if (!IS_ERR(trans))
2417 btrfs_end_transaction(trans, root);
2385 } 2418 }
2386 2419
2387 if (nr_unlink) 2420 if (nr_unlink)
2388 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2421 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2389 if (nr_truncate) 2422 if (nr_truncate)
2390 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2423 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2424
2425out:
2426 if (ret)
2427 printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret);
2428 btrfs_free_path(path);
2429 return ret;
2391} 2430}
2392 2431
2393/* 2432/*
@@ -2554,6 +2593,13 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2554 struct btrfs_inode_item *item, 2593 struct btrfs_inode_item *item,
2555 struct inode *inode) 2594 struct inode *inode)
2556{ 2595{
2596 if (!leaf->map_token)
2597 map_private_extent_buffer(leaf, (unsigned long)item,
2598 sizeof(struct btrfs_inode_item),
2599 &leaf->map_token, &leaf->kaddr,
2600 &leaf->map_start, &leaf->map_len,
2601 KM_USER1);
2602
2557 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2603 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2558 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2604 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2559 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2605 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
@@ -2582,6 +2628,11 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2582 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2628 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2583 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2629 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2584 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); 2630 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
2631
2632 if (leaf->map_token) {
2633 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2634 leaf->map_token = NULL;
2635 }
2585} 2636}
2586 2637
2587/* 2638/*
@@ -2626,10 +2677,10 @@ failed:
2626 * recovery code. It remove a link in a directory with a given name, and 2677 * recovery code. It remove a link in a directory with a given name, and
2627 * also drops the back refs in the inode to the directory 2678 * also drops the back refs in the inode to the directory
2628 */ 2679 */
2629int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2680static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2630 struct btrfs_root *root, 2681 struct btrfs_root *root,
2631 struct inode *dir, struct inode *inode, 2682 struct inode *dir, struct inode *inode,
2632 const char *name, int name_len) 2683 const char *name, int name_len)
2633{ 2684{
2634 struct btrfs_path *path; 2685 struct btrfs_path *path;
2635 int ret = 0; 2686 int ret = 0;
@@ -2641,7 +2692,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2641 path = btrfs_alloc_path(); 2692 path = btrfs_alloc_path();
2642 if (!path) { 2693 if (!path) {
2643 ret = -ENOMEM; 2694 ret = -ENOMEM;
2644 goto err; 2695 goto out;
2645 } 2696 }
2646 2697
2647 path->leave_spinning = 1; 2698 path->leave_spinning = 1;
@@ -2701,12 +2752,25 @@ err:
2701 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2752 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2702 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2753 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2703 btrfs_update_inode(trans, root, dir); 2754 btrfs_update_inode(trans, root, dir);
2704 btrfs_drop_nlink(inode);
2705 ret = btrfs_update_inode(trans, root, inode);
2706out: 2755out:
2707 return ret; 2756 return ret;
2708} 2757}
2709 2758
2759int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2760 struct btrfs_root *root,
2761 struct inode *dir, struct inode *inode,
2762 const char *name, int name_len)
2763{
2764 int ret;
2765 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
2766 if (!ret) {
2767 btrfs_drop_nlink(inode);
2768 ret = btrfs_update_inode(trans, root, inode);
2769 }
2770 return ret;
2771}
2772
2773
2710/* helper to check if there is any shared block in the path */ 2774/* helper to check if there is any shared block in the path */
2711static int check_path_shared(struct btrfs_root *root, 2775static int check_path_shared(struct btrfs_root *root,
2712 struct btrfs_path *path) 2776 struct btrfs_path *path)
@@ -2714,9 +2778,10 @@ static int check_path_shared(struct btrfs_root *root,
2714 struct extent_buffer *eb; 2778 struct extent_buffer *eb;
2715 int level; 2779 int level;
2716 u64 refs = 1; 2780 u64 refs = 1;
2717 int uninitialized_var(ret);
2718 2781
2719 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2782 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2783 int ret;
2784
2720 if (!path->nodes[level]) 2785 if (!path->nodes[level])
2721 break; 2786 break;
2722 eb = path->nodes[level]; 2787 eb = path->nodes[level];
@@ -2727,7 +2792,7 @@ static int check_path_shared(struct btrfs_root *root,
2727 if (refs > 1) 2792 if (refs > 1)
2728 return 1; 2793 return 1;
2729 } 2794 }
2730 return ret; /* XXX callers? */ 2795 return 0;
2731} 2796}
2732 2797
2733/* 2798/*
@@ -3527,7 +3592,13 @@ out:
3527 return ret; 3592 return ret;
3528} 3593}
3529 3594
3530int btrfs_cont_expand(struct inode *inode, loff_t size) 3595/*
3596 * This function puts in dummy file extents for the area we're creating a hole
3597 * for. So if we are truncating this file to a larger size we need to insert
3598 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
3599 * the range between oldsize and size
3600 */
3601int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3531{ 3602{
3532 struct btrfs_trans_handle *trans; 3603 struct btrfs_trans_handle *trans;
3533 struct btrfs_root *root = BTRFS_I(inode)->root; 3604 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3535,7 +3606,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3535 struct extent_map *em = NULL; 3606 struct extent_map *em = NULL;
3536 struct extent_state *cached_state = NULL; 3607 struct extent_state *cached_state = NULL;
3537 u64 mask = root->sectorsize - 1; 3608 u64 mask = root->sectorsize - 1;
3538 u64 hole_start = (inode->i_size + mask) & ~mask; 3609 u64 hole_start = (oldsize + mask) & ~mask;
3539 u64 block_end = (size + mask) & ~mask; 3610 u64 block_end = (size + mask) & ~mask;
3540 u64 last_byte; 3611 u64 last_byte;
3541 u64 cur_offset; 3612 u64 cur_offset;
@@ -3580,13 +3651,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3580 err = btrfs_drop_extents(trans, inode, cur_offset, 3651 err = btrfs_drop_extents(trans, inode, cur_offset,
3581 cur_offset + hole_size, 3652 cur_offset + hole_size,
3582 &hint_byte, 1); 3653 &hint_byte, 1);
3583 BUG_ON(err); 3654 if (err)
3655 break;
3584 3656
3585 err = btrfs_insert_file_extent(trans, root, 3657 err = btrfs_insert_file_extent(trans, root,
3586 inode->i_ino, cur_offset, 0, 3658 inode->i_ino, cur_offset, 0,
3587 0, hole_size, 0, hole_size, 3659 0, hole_size, 0, hole_size,
3588 0, 0, 0); 3660 0, 0, 0);
3589 BUG_ON(err); 3661 if (err)
3662 break;
3590 3663
3591 btrfs_drop_extent_cache(inode, hole_start, 3664 btrfs_drop_extent_cache(inode, hole_start,
3592 last_byte - 1, 0); 3665 last_byte - 1, 0);
@@ -3606,81 +3679,41 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3606 return err; 3679 return err;
3607} 3680}
3608 3681
3609static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) 3682static int btrfs_setsize(struct inode *inode, loff_t newsize)
3610{ 3683{
3611 struct btrfs_root *root = BTRFS_I(inode)->root; 3684 loff_t oldsize = i_size_read(inode);
3612 struct btrfs_trans_handle *trans;
3613 unsigned long nr;
3614 int ret; 3685 int ret;
3615 3686
3616 if (attr->ia_size == inode->i_size) 3687 if (newsize == oldsize)
3617 return 0; 3688 return 0;
3618 3689
3619 if (attr->ia_size > inode->i_size) { 3690 if (newsize > oldsize) {
3620 unsigned long limit; 3691 i_size_write(inode, newsize);
3621 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 3692 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3622 if (attr->ia_size > inode->i_sb->s_maxbytes) 3693 truncate_pagecache(inode, oldsize, newsize);
3623 return -EFBIG; 3694 ret = btrfs_cont_expand(inode, oldsize, newsize);
3624 if (limit != RLIM_INFINITY && attr->ia_size > limit) {
3625 send_sig(SIGXFSZ, current, 0);
3626 return -EFBIG;
3627 }
3628 }
3629
3630 trans = btrfs_start_transaction(root, 5);
3631 if (IS_ERR(trans))
3632 return PTR_ERR(trans);
3633
3634 btrfs_set_trans_block_group(trans, inode);
3635
3636 ret = btrfs_orphan_add(trans, inode);
3637 BUG_ON(ret);
3638
3639 nr = trans->blocks_used;
3640 btrfs_end_transaction(trans, root);
3641 btrfs_btree_balance_dirty(root, nr);
3642
3643 if (attr->ia_size > inode->i_size) {
3644 ret = btrfs_cont_expand(inode, attr->ia_size);
3645 if (ret) { 3695 if (ret) {
3646 btrfs_truncate(inode); 3696 btrfs_setsize(inode, oldsize);
3647 return ret; 3697 return ret;
3648 } 3698 }
3649 3699
3650 i_size_write(inode, attr->ia_size); 3700 mark_inode_dirty(inode);
3651 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3701 } else {
3652 3702
3653 trans = btrfs_start_transaction(root, 0); 3703 /*
3654 BUG_ON(IS_ERR(trans)); 3704 * We're truncating a file that used to have good data down to
3655 btrfs_set_trans_block_group(trans, inode); 3705 * zero. Make sure it gets into the ordered flush list so that
3656 trans->block_rsv = root->orphan_block_rsv; 3706 * any new writes get down to disk quickly.
3657 BUG_ON(!trans->block_rsv); 3707 */
3708 if (newsize == 0)
3709 BTRFS_I(inode)->ordered_data_close = 1;
3658 3710
3659 ret = btrfs_update_inode(trans, root, inode); 3711 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3660 BUG_ON(ret); 3712 truncate_setsize(inode, newsize);
3661 if (inode->i_nlink > 0) { 3713 ret = btrfs_truncate(inode);
3662 ret = btrfs_orphan_del(trans, inode);
3663 BUG_ON(ret);
3664 }
3665 nr = trans->blocks_used;
3666 btrfs_end_transaction(trans, root);
3667 btrfs_btree_balance_dirty(root, nr);
3668 return 0;
3669 } 3714 }
3670 3715
3671 /* 3716 return ret;
3672 * We're truncating a file that used to have good data down to
3673 * zero. Make sure it gets into the ordered flush list so that
3674 * any new writes get down to disk quickly.
3675 */
3676 if (attr->ia_size == 0)
3677 BTRFS_I(inode)->ordered_data_close = 1;
3678
3679 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3680 ret = vmtruncate(inode, attr->ia_size);
3681 BUG_ON(ret);
3682
3683 return 0;
3684} 3717}
3685 3718
3686static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3719static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -3697,7 +3730,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3697 return err; 3730 return err;
3698 3731
3699 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3732 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
3700 err = btrfs_setattr_size(inode, attr); 3733 err = btrfs_setsize(inode, attr->ia_size);
3701 if (err) 3734 if (err)
3702 return err; 3735 return err;
3703 } 3736 }
@@ -3720,6 +3753,8 @@ void btrfs_evict_inode(struct inode *inode)
3720 unsigned long nr; 3753 unsigned long nr;
3721 int ret; 3754 int ret;
3722 3755
3756 trace_btrfs_inode_evict(inode);
3757
3723 truncate_inode_pages(&inode->i_data, 0); 3758 truncate_inode_pages(&inode->i_data, 0);
3724 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3759 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3725 root == root->fs_info->tree_root)) 3760 root == root->fs_info->tree_root))
@@ -4062,7 +4097,6 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
4062 BTRFS_I(inode)->root = root; 4097 BTRFS_I(inode)->root = root;
4063 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 4098 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
4064 btrfs_read_locked_inode(inode); 4099 btrfs_read_locked_inode(inode);
4065
4066 inode_tree_add(inode); 4100 inode_tree_add(inode);
4067 unlock_new_inode(inode); 4101 unlock_new_inode(inode);
4068 if (new) 4102 if (new)
@@ -4134,11 +4168,13 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4134 } 4168 }
4135 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 4169 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
4136 4170
4137 if (root != sub_root) { 4171 if (!IS_ERR(inode) && root != sub_root) {
4138 down_read(&root->fs_info->cleanup_work_sem); 4172 down_read(&root->fs_info->cleanup_work_sem);
4139 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4173 if (!(inode->i_sb->s_flags & MS_RDONLY))
4140 btrfs_orphan_cleanup(sub_root); 4174 ret = btrfs_orphan_cleanup(sub_root);
4141 up_read(&root->fs_info->cleanup_work_sem); 4175 up_read(&root->fs_info->cleanup_work_sem);
4176 if (ret)
4177 inode = ERR_PTR(ret);
4142 } 4178 }
4143 4179
4144 return inode; 4180 return inode;
@@ -4186,10 +4222,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4186 struct btrfs_key found_key; 4222 struct btrfs_key found_key;
4187 struct btrfs_path *path; 4223 struct btrfs_path *path;
4188 int ret; 4224 int ret;
4189 u32 nritems;
4190 struct extent_buffer *leaf; 4225 struct extent_buffer *leaf;
4191 int slot; 4226 int slot;
4192 int advance;
4193 unsigned char d_type; 4227 unsigned char d_type;
4194 int over = 0; 4228 int over = 0;
4195 u32 di_cur; 4229 u32 di_cur;
@@ -4232,27 +4266,19 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4232 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4266 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4233 if (ret < 0) 4267 if (ret < 0)
4234 goto err; 4268 goto err;
4235 advance = 0;
4236 4269
4237 while (1) { 4270 while (1) {
4238 leaf = path->nodes[0]; 4271 leaf = path->nodes[0];
4239 nritems = btrfs_header_nritems(leaf);
4240 slot = path->slots[0]; 4272 slot = path->slots[0];
4241 if (advance || slot >= nritems) { 4273 if (slot >= btrfs_header_nritems(leaf)) {
4242 if (slot >= nritems - 1) { 4274 ret = btrfs_next_leaf(root, path);
4243 ret = btrfs_next_leaf(root, path); 4275 if (ret < 0)
4244 if (ret) 4276 goto err;
4245 break; 4277 else if (ret > 0)
4246 leaf = path->nodes[0]; 4278 break;
4247 nritems = btrfs_header_nritems(leaf); 4279 continue;
4248 slot = path->slots[0];
4249 } else {
4250 slot++;
4251 path->slots[0]++;
4252 }
4253 } 4280 }
4254 4281
4255 advance = 1;
4256 item = btrfs_item_nr(leaf, slot); 4282 item = btrfs_item_nr(leaf, slot);
4257 btrfs_item_key_to_cpu(leaf, &found_key, slot); 4283 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4258 4284
@@ -4261,7 +4287,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4261 if (btrfs_key_type(&found_key) != key_type) 4287 if (btrfs_key_type(&found_key) != key_type)
4262 break; 4288 break;
4263 if (found_key.offset < filp->f_pos) 4289 if (found_key.offset < filp->f_pos)
4264 continue; 4290 goto next;
4265 4291
4266 filp->f_pos = found_key.offset; 4292 filp->f_pos = found_key.offset;
4267 4293
@@ -4272,6 +4298,9 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4272 while (di_cur < di_total) { 4298 while (di_cur < di_total) {
4273 struct btrfs_key location; 4299 struct btrfs_key location;
4274 4300
4301 if (verify_dir_item(root, leaf, di))
4302 break;
4303
4275 name_len = btrfs_dir_name_len(leaf, di); 4304 name_len = btrfs_dir_name_len(leaf, di);
4276 if (name_len <= sizeof(tmp_name)) { 4305 if (name_len <= sizeof(tmp_name)) {
4277 name_ptr = tmp_name; 4306 name_ptr = tmp_name;
@@ -4311,6 +4340,8 @@ skip:
4311 di_cur += di_len; 4340 di_cur += di_len;
4312 di = (struct btrfs_dir_item *)((char *)di + di_len); 4341 di = (struct btrfs_dir_item *)((char *)di + di_len);
4313 } 4342 }
4343next:
4344 path->slots[0]++;
4314 } 4345 }
4315 4346
4316 /* Reached end of directory/root. Bump pos past the last item. */ 4347 /* Reached end of directory/root. Bump pos past the last item. */
@@ -4347,6 +4378,8 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4347 trans = btrfs_join_transaction_nolock(root, 1); 4378 trans = btrfs_join_transaction_nolock(root, 1);
4348 else 4379 else
4349 trans = btrfs_join_transaction(root, 1); 4380 trans = btrfs_join_transaction(root, 1);
4381 if (IS_ERR(trans))
4382 return PTR_ERR(trans);
4350 btrfs_set_trans_block_group(trans, inode); 4383 btrfs_set_trans_block_group(trans, inode);
4351 if (nolock) 4384 if (nolock)
4352 ret = btrfs_end_transaction_nolock(trans, root); 4385 ret = btrfs_end_transaction_nolock(trans, root);
@@ -4372,6 +4405,7 @@ void btrfs_dirty_inode(struct inode *inode)
4372 return; 4405 return;
4373 4406
4374 trans = btrfs_join_transaction(root, 1); 4407 trans = btrfs_join_transaction(root, 1);
4408 BUG_ON(IS_ERR(trans));
4375 btrfs_set_trans_block_group(trans, inode); 4409 btrfs_set_trans_block_group(trans, inode);
4376 4410
4377 ret = btrfs_update_inode(trans, root, inode); 4411 ret = btrfs_update_inode(trans, root, inode);
@@ -4500,12 +4534,17 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4500 BUG_ON(!path); 4534 BUG_ON(!path);
4501 4535
4502 inode = new_inode(root->fs_info->sb); 4536 inode = new_inode(root->fs_info->sb);
4503 if (!inode) 4537 if (!inode) {
4538 btrfs_free_path(path);
4504 return ERR_PTR(-ENOMEM); 4539 return ERR_PTR(-ENOMEM);
4540 }
4505 4541
4506 if (dir) { 4542 if (dir) {
4543 trace_btrfs_inode_request(dir);
4544
4507 ret = btrfs_set_inode_index(dir, index); 4545 ret = btrfs_set_inode_index(dir, index);
4508 if (ret) { 4546 if (ret) {
4547 btrfs_free_path(path);
4509 iput(inode); 4548 iput(inode);
4510 return ERR_PTR(ret); 4549 return ERR_PTR(ret);
4511 } 4550 }
@@ -4572,12 +4611,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4572 if ((mode & S_IFREG)) { 4611 if ((mode & S_IFREG)) {
4573 if (btrfs_test_opt(root, NODATASUM)) 4612 if (btrfs_test_opt(root, NODATASUM))
4574 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4613 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4575 if (btrfs_test_opt(root, NODATACOW)) 4614 if (btrfs_test_opt(root, NODATACOW) ||
4615 (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
4576 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4616 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
4577 } 4617 }
4578 4618
4579 insert_inode_hash(inode); 4619 insert_inode_hash(inode);
4580 inode_tree_add(inode); 4620 inode_tree_add(inode);
4621
4622 trace_btrfs_inode_new(inode);
4623
4581 return inode; 4624 return inode;
4582fail: 4625fail:
4583 if (dir) 4626 if (dir)
@@ -4692,7 +4735,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4692 if (IS_ERR(inode)) 4735 if (IS_ERR(inode))
4693 goto out_unlock; 4736 goto out_unlock;
4694 4737
4695 err = btrfs_init_inode_security(trans, inode, dir); 4738 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4696 if (err) { 4739 if (err) {
4697 drop_inode = 1; 4740 drop_inode = 1;
4698 goto out_unlock; 4741 goto out_unlock;
@@ -4753,7 +4796,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4753 if (IS_ERR(inode)) 4796 if (IS_ERR(inode))
4754 goto out_unlock; 4797 goto out_unlock;
4755 4798
4756 err = btrfs_init_inode_security(trans, inode, dir); 4799 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4757 if (err) { 4800 if (err) {
4758 drop_inode = 1; 4801 drop_inode = 1;
4759 goto out_unlock; 4802 goto out_unlock;
@@ -4794,30 +4837,31 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4794 int err; 4837 int err;
4795 int drop_inode = 0; 4838 int drop_inode = 0;
4796 4839
4797 if (inode->i_nlink == 0)
4798 return -ENOENT;
4799
4800 /* do not allow sys_link's with other subvols of the same device */ 4840 /* do not allow sys_link's with other subvols of the same device */
4801 if (root->objectid != BTRFS_I(inode)->root->objectid) 4841 if (root->objectid != BTRFS_I(inode)->root->objectid)
4802 return -EPERM; 4842 return -EXDEV;
4803 4843
4804 btrfs_inc_nlink(inode); 4844 if (inode->i_nlink == ~0U)
4805 inode->i_ctime = CURRENT_TIME; 4845 return -EMLINK;
4806 4846
4807 err = btrfs_set_inode_index(dir, &index); 4847 err = btrfs_set_inode_index(dir, &index);
4808 if (err) 4848 if (err)
4809 goto fail; 4849 goto fail;
4810 4850
4811 /* 4851 /*
4812 * 1 item for inode ref 4852 * 2 items for inode and inode ref
4813 * 2 items for dir items 4853 * 2 items for dir items
4854 * 1 item for parent inode
4814 */ 4855 */
4815 trans = btrfs_start_transaction(root, 3); 4856 trans = btrfs_start_transaction(root, 5);
4816 if (IS_ERR(trans)) { 4857 if (IS_ERR(trans)) {
4817 err = PTR_ERR(trans); 4858 err = PTR_ERR(trans);
4818 goto fail; 4859 goto fail;
4819 } 4860 }
4820 4861
4862 btrfs_inc_nlink(inode);
4863 inode->i_ctime = CURRENT_TIME;
4864
4821 btrfs_set_trans_block_group(trans, dir); 4865 btrfs_set_trans_block_group(trans, dir);
4822 ihold(inode); 4866 ihold(inode);
4823 4867
@@ -4881,7 +4925,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4881 4925
4882 drop_on_err = 1; 4926 drop_on_err = 1;
4883 4927
4884 err = btrfs_init_inode_security(trans, inode, dir); 4928 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4885 if (err) 4929 if (err)
4886 goto out_fail; 4930 goto out_fail;
4887 4931
@@ -5176,6 +5220,8 @@ again:
5176 em = NULL; 5220 em = NULL;
5177 btrfs_release_path(root, path); 5221 btrfs_release_path(root, path);
5178 trans = btrfs_join_transaction(root, 1); 5222 trans = btrfs_join_transaction(root, 1);
5223 if (IS_ERR(trans))
5224 return ERR_CAST(trans);
5179 goto again; 5225 goto again;
5180 } 5226 }
5181 map = kmap(page); 5227 map = kmap(page);
@@ -5185,7 +5231,7 @@ again:
5185 btrfs_mark_buffer_dirty(leaf); 5231 btrfs_mark_buffer_dirty(leaf);
5186 } 5232 }
5187 set_extent_uptodate(io_tree, em->start, 5233 set_extent_uptodate(io_tree, em->start,
5188 extent_map_end(em) - 1, GFP_NOFS); 5234 extent_map_end(em) - 1, NULL, GFP_NOFS);
5189 goto insert; 5235 goto insert;
5190 } else { 5236 } else {
5191 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5237 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
@@ -5252,6 +5298,9 @@ insert:
5252 } 5298 }
5253 write_unlock(&em_tree->lock); 5299 write_unlock(&em_tree->lock);
5254out: 5300out:
5301
5302 trace_btrfs_get_extent(root, em);
5303
5255 if (path) 5304 if (path)
5256 btrfs_free_path(path); 5305 btrfs_free_path(path);
5257 if (trans) { 5306 if (trans) {
@@ -5266,22 +5315,157 @@ out:
5266 return em; 5315 return em;
5267} 5316}
5268 5317
5318struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
5319 size_t pg_offset, u64 start, u64 len,
5320 int create)
5321{
5322 struct extent_map *em;
5323 struct extent_map *hole_em = NULL;
5324 u64 range_start = start;
5325 u64 end;
5326 u64 found;
5327 u64 found_end;
5328 int err = 0;
5329
5330 em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
5331 if (IS_ERR(em))
5332 return em;
5333 if (em) {
5334 /*
5335 * if our em maps to a hole, there might
5336 * actually be delalloc bytes behind it
5337 */
5338 if (em->block_start != EXTENT_MAP_HOLE)
5339 return em;
5340 else
5341 hole_em = em;
5342 }
5343
5344 /* check to see if we've wrapped (len == -1 or similar) */
5345 end = start + len;
5346 if (end < start)
5347 end = (u64)-1;
5348 else
5349 end -= 1;
5350
5351 em = NULL;
5352
5353 /* ok, we didn't find anything, lets look for delalloc */
5354 found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
5355 end, len, EXTENT_DELALLOC, 1);
5356 found_end = range_start + found;
5357 if (found_end < range_start)
5358 found_end = (u64)-1;
5359
5360 /*
5361 * we didn't find anything useful, return
5362 * the original results from get_extent()
5363 */
5364 if (range_start > end || found_end <= start) {
5365 em = hole_em;
5366 hole_em = NULL;
5367 goto out;
5368 }
5369
5370 /* adjust the range_start to make sure it doesn't
5371 * go backwards from the start they passed in
5372 */
5373 range_start = max(start,range_start);
5374 found = found_end - range_start;
5375
5376 if (found > 0) {
5377 u64 hole_start = start;
5378 u64 hole_len = len;
5379
5380 em = alloc_extent_map(GFP_NOFS);
5381 if (!em) {
5382 err = -ENOMEM;
5383 goto out;
5384 }
5385 /*
5386 * when btrfs_get_extent can't find anything it
5387 * returns one huge hole
5388 *
5389 * make sure what it found really fits our range, and
5390 * adjust to make sure it is based on the start from
5391 * the caller
5392 */
5393 if (hole_em) {
5394 u64 calc_end = extent_map_end(hole_em);
5395
5396 if (calc_end <= start || (hole_em->start > end)) {
5397 free_extent_map(hole_em);
5398 hole_em = NULL;
5399 } else {
5400 hole_start = max(hole_em->start, start);
5401 hole_len = calc_end - hole_start;
5402 }
5403 }
5404 em->bdev = NULL;
5405 if (hole_em && range_start > hole_start) {
5406 /* our hole starts before our delalloc, so we
5407 * have to return just the parts of the hole
5408 * that go until the delalloc starts
5409 */
5410 em->len = min(hole_len,
5411 range_start - hole_start);
5412 em->start = hole_start;
5413 em->orig_start = hole_start;
5414 /*
5415 * don't adjust block start at all,
5416 * it is fixed at EXTENT_MAP_HOLE
5417 */
5418 em->block_start = hole_em->block_start;
5419 em->block_len = hole_len;
5420 } else {
5421 em->start = range_start;
5422 em->len = found;
5423 em->orig_start = range_start;
5424 em->block_start = EXTENT_MAP_DELALLOC;
5425 em->block_len = found;
5426 }
5427 } else if (hole_em) {
5428 return hole_em;
5429 }
5430out:
5431
5432 free_extent_map(hole_em);
5433 if (err) {
5434 free_extent_map(em);
5435 return ERR_PTR(err);
5436 }
5437 return em;
5438}
5439
5269static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5440static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5441 struct extent_map *em,
5270 u64 start, u64 len) 5442 u64 start, u64 len)
5271{ 5443{
5272 struct btrfs_root *root = BTRFS_I(inode)->root; 5444 struct btrfs_root *root = BTRFS_I(inode)->root;
5273 struct btrfs_trans_handle *trans; 5445 struct btrfs_trans_handle *trans;
5274 struct extent_map *em;
5275 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5446 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5276 struct btrfs_key ins; 5447 struct btrfs_key ins;
5277 u64 alloc_hint; 5448 u64 alloc_hint;
5278 int ret; 5449 int ret;
5450 bool insert = false;
5279 5451
5280 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5452 /*
5453 * Ok if the extent map we looked up is a hole and is for the exact
5454 * range we want, there is no reason to allocate a new one, however if
5455 * it is not right then we need to free this one and drop the cache for
5456 * our range.
5457 */
5458 if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
5459 em->len != len) {
5460 free_extent_map(em);
5461 em = NULL;
5462 insert = true;
5463 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5464 }
5281 5465
5282 trans = btrfs_join_transaction(root, 0); 5466 trans = btrfs_join_transaction(root, 0);
5283 if (!trans) 5467 if (IS_ERR(trans))
5284 return ERR_PTR(-ENOMEM); 5468 return ERR_CAST(trans);
5285 5469
5286 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5470 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5287 5471
@@ -5293,10 +5477,12 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5293 goto out; 5477 goto out;
5294 } 5478 }
5295 5479
5296 em = alloc_extent_map(GFP_NOFS);
5297 if (!em) { 5480 if (!em) {
5298 em = ERR_PTR(-ENOMEM); 5481 em = alloc_extent_map(GFP_NOFS);
5299 goto out; 5482 if (!em) {
5483 em = ERR_PTR(-ENOMEM);
5484 goto out;
5485 }
5300 } 5486 }
5301 5487
5302 em->start = start; 5488 em->start = start;
@@ -5306,9 +5492,15 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5306 em->block_start = ins.objectid; 5492 em->block_start = ins.objectid;
5307 em->block_len = ins.offset; 5493 em->block_len = ins.offset;
5308 em->bdev = root->fs_info->fs_devices->latest_bdev; 5494 em->bdev = root->fs_info->fs_devices->latest_bdev;
5495
5496 /*
5497 * We need to do this because if we're using the original em we searched
5498 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
5499 */
5500 em->flags = 0;
5309 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5501 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5310 5502
5311 while (1) { 5503 while (insert) {
5312 write_lock(&em_tree->lock); 5504 write_lock(&em_tree->lock);
5313 ret = add_extent_mapping(em_tree, em); 5505 ret = add_extent_mapping(em_tree, em);
5314 write_unlock(&em_tree->lock); 5506 write_unlock(&em_tree->lock);
@@ -5505,7 +5697,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5505 * while we look for nocow cross refs 5697 * while we look for nocow cross refs
5506 */ 5698 */
5507 trans = btrfs_join_transaction(root, 0); 5699 trans = btrfs_join_transaction(root, 0);
5508 if (!trans) 5700 if (IS_ERR(trans))
5509 goto must_cow; 5701 goto must_cow;
5510 5702
5511 if (can_nocow_odirect(trans, inode, start, len) == 1) { 5703 if (can_nocow_odirect(trans, inode, start, len) == 1) {
@@ -5526,8 +5718,7 @@ must_cow:
5526 * it above 5718 * it above
5527 */ 5719 */
5528 len = bh_result->b_size; 5720 len = bh_result->b_size;
5529 free_extent_map(em); 5721 em = btrfs_new_extent_direct(inode, em, start, len);
5530 em = btrfs_new_extent_direct(inode, start, len);
5531 if (IS_ERR(em)) 5722 if (IS_ERR(em))
5532 return PTR_ERR(em); 5723 return PTR_ERR(em);
5533 len = min(len, em->len - (start - em->start)); 5724 len = min(len, em->len - (start - em->start));
@@ -5613,6 +5804,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
5613 5804
5614 kfree(dip->csums); 5805 kfree(dip->csums);
5615 kfree(dip); 5806 kfree(dip);
5807
5808 /* If we had a csum failure make sure to clear the uptodate flag */
5809 if (err)
5810 clear_bit(BIO_UPTODATE, &bio->bi_flags);
5616 dio_end_io(bio, err); 5811 dio_end_io(bio, err);
5617} 5812}
5618 5813
@@ -5640,7 +5835,7 @@ again:
5640 BUG_ON(!ordered); 5835 BUG_ON(!ordered);
5641 5836
5642 trans = btrfs_join_transaction(root, 1); 5837 trans = btrfs_join_transaction(root, 1);
5643 if (!trans) { 5838 if (IS_ERR(trans)) {
5644 err = -ENOMEM; 5839 err = -ENOMEM;
5645 goto out; 5840 goto out;
5646 } 5841 }
@@ -5686,8 +5881,10 @@ again:
5686 } 5881 }
5687 5882
5688 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5883 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5689 btrfs_ordered_update_i_size(inode, 0, ordered); 5884 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5690 btrfs_update_inode(trans, root, inode); 5885 if (!ret)
5886 btrfs_update_inode(trans, root, inode);
5887 ret = 0;
5691out_unlock: 5888out_unlock:
5692 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5889 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5693 ordered->file_offset + ordered->len - 1, 5890 ordered->file_offset + ordered->len - 1,
@@ -5714,6 +5911,10 @@ out_done:
5714 5911
5715 kfree(dip->csums); 5912 kfree(dip->csums);
5716 kfree(dip); 5913 kfree(dip);
5914
5915 /* If we had an error make sure to clear the uptodate flag */
5916 if (err)
5917 clear_bit(BIO_UPTODATE, &bio->bi_flags);
5717 dio_end_io(bio, err); 5918 dio_end_io(bio, err);
5718} 5919}
5719 5920
@@ -5769,7 +5970,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
5769 5970
5770static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 5971static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5771 int rw, u64 file_offset, int skip_sum, 5972 int rw, u64 file_offset, int skip_sum,
5772 u32 *csums) 5973 u32 *csums, int async_submit)
5773{ 5974{
5774 int write = rw & REQ_WRITE; 5975 int write = rw & REQ_WRITE;
5775 struct btrfs_root *root = BTRFS_I(inode)->root; 5976 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -5780,18 +5981,33 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5780 if (ret) 5981 if (ret)
5781 goto err; 5982 goto err;
5782 5983
5783 if (write && !skip_sum) { 5984 if (skip_sum)
5985 goto map;
5986
5987 if (write && async_submit) {
5784 ret = btrfs_wq_submit_bio(root->fs_info, 5988 ret = btrfs_wq_submit_bio(root->fs_info,
5785 inode, rw, bio, 0, 0, 5989 inode, rw, bio, 0, 0,
5786 file_offset, 5990 file_offset,
5787 __btrfs_submit_bio_start_direct_io, 5991 __btrfs_submit_bio_start_direct_io,
5788 __btrfs_submit_bio_done); 5992 __btrfs_submit_bio_done);
5789 goto err; 5993 goto err;
5790 } else if (!skip_sum) 5994 } else if (write) {
5791 btrfs_lookup_bio_sums_dio(root, inode, bio, 5995 /*
5996 * If we aren't doing async submit, calculate the csum of the
5997 * bio now.
5998 */
5999 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
6000 if (ret)
6001 goto err;
6002 } else if (!skip_sum) {
6003 ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
5792 file_offset, csums); 6004 file_offset, csums);
6005 if (ret)
6006 goto err;
6007 }
5793 6008
5794 ret = btrfs_map_bio(root, rw, bio, 0, 1); 6009map:
6010 ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
5795err: 6011err:
5796 bio_put(bio); 6012 bio_put(bio);
5797 return ret; 6013 return ret;
@@ -5813,13 +6029,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5813 int nr_pages = 0; 6029 int nr_pages = 0;
5814 u32 *csums = dip->csums; 6030 u32 *csums = dip->csums;
5815 int ret = 0; 6031 int ret = 0;
5816 6032 int async_submit = 0;
5817 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 6033 int write = rw & REQ_WRITE;
5818 if (!bio)
5819 return -ENOMEM;
5820 bio->bi_private = dip;
5821 bio->bi_end_io = btrfs_end_dio_bio;
5822 atomic_inc(&dip->pending_bios);
5823 6034
5824 map_length = orig_bio->bi_size; 6035 map_length = orig_bio->bi_size;
5825 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6036 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
@@ -5829,6 +6040,19 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5829 return -EIO; 6040 return -EIO;
5830 } 6041 }
5831 6042
6043 if (map_length >= orig_bio->bi_size) {
6044 bio = orig_bio;
6045 goto submit;
6046 }
6047
6048 async_submit = 1;
6049 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
6050 if (!bio)
6051 return -ENOMEM;
6052 bio->bi_private = dip;
6053 bio->bi_end_io = btrfs_end_dio_bio;
6054 atomic_inc(&dip->pending_bios);
6055
5832 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 6056 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
5833 if (unlikely(map_length < submit_len + bvec->bv_len || 6057 if (unlikely(map_length < submit_len + bvec->bv_len ||
5834 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 6058 bio_add_page(bio, bvec->bv_page, bvec->bv_len,
@@ -5842,14 +6066,15 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5842 atomic_inc(&dip->pending_bios); 6066 atomic_inc(&dip->pending_bios);
5843 ret = __btrfs_submit_dio_bio(bio, inode, rw, 6067 ret = __btrfs_submit_dio_bio(bio, inode, rw,
5844 file_offset, skip_sum, 6068 file_offset, skip_sum,
5845 csums); 6069 csums, async_submit);
5846 if (ret) { 6070 if (ret) {
5847 bio_put(bio); 6071 bio_put(bio);
5848 atomic_dec(&dip->pending_bios); 6072 atomic_dec(&dip->pending_bios);
5849 goto out_err; 6073 goto out_err;
5850 } 6074 }
5851 6075
5852 if (!skip_sum) 6076 /* Write's use the ordered csums */
6077 if (!write && !skip_sum)
5853 csums = csums + nr_pages; 6078 csums = csums + nr_pages;
5854 start_sector += submit_len >> 9; 6079 start_sector += submit_len >> 9;
5855 file_offset += submit_len; 6080 file_offset += submit_len;
@@ -5878,8 +6103,9 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5878 } 6103 }
5879 } 6104 }
5880 6105
6106submit:
5881 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, 6107 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
5882 csums); 6108 csums, async_submit);
5883 if (!ret) 6109 if (!ret)
5884 return 0; 6110 return 0;
5885 6111
@@ -5917,9 +6143,11 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5917 } 6143 }
5918 dip->csums = NULL; 6144 dip->csums = NULL;
5919 6145
5920 if (!skip_sum) { 6146 /* Write's use the ordered csum stuff, so we don't need dip->csums */
6147 if (!write && !skip_sum) {
5921 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); 6148 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5922 if (!dip->csums) { 6149 if (!dip->csums) {
6150 kfree(dip);
5923 ret = -ENOMEM; 6151 ret = -ENOMEM;
5924 goto free_ordered; 6152 goto free_ordered;
5925 } 6153 }
@@ -5972,6 +6200,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
5972 unsigned long nr_segs) 6200 unsigned long nr_segs)
5973{ 6201{
5974 int seg; 6202 int seg;
6203 int i;
5975 size_t size; 6204 size_t size;
5976 unsigned long addr; 6205 unsigned long addr;
5977 unsigned blocksize_mask = root->sectorsize - 1; 6206 unsigned blocksize_mask = root->sectorsize - 1;
@@ -5986,8 +6215,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
5986 addr = (unsigned long)iov[seg].iov_base; 6215 addr = (unsigned long)iov[seg].iov_base;
5987 size = iov[seg].iov_len; 6216 size = iov[seg].iov_len;
5988 end += size; 6217 end += size;
5989 if ((addr & blocksize_mask) || (size & blocksize_mask)) 6218 if ((addr & blocksize_mask) || (size & blocksize_mask))
5990 goto out; 6219 goto out;
6220
6221 /* If this is a write we don't need to check anymore */
6222 if (rw & WRITE)
6223 continue;
6224
6225 /*
6226 * Check to make sure we don't have duplicate iov_base's in this
6227 * iovec, if so return EINVAL, otherwise we'll get csum errors
6228 * when reading back.
6229 */
6230 for (i = seg + 1; i < nr_segs; i++) {
6231 if (iov[seg].iov_base == iov[i].iov_base)
6232 goto out;
6233 }
5991 } 6234 }
5992 retval = 0; 6235 retval = 0;
5993out: 6236out:
@@ -6088,7 +6331,7 @@ out:
6088static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6331static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
6089 __u64 start, __u64 len) 6332 __u64 start, __u64 len)
6090{ 6333{
6091 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); 6334 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
6092} 6335}
6093 6336
6094int btrfs_readpage(struct file *file, struct page *page) 6337int btrfs_readpage(struct file *file, struct page *page)
@@ -6338,28 +6581,42 @@ out:
6338 return ret; 6581 return ret;
6339} 6582}
6340 6583
6341static void btrfs_truncate(struct inode *inode) 6584static int btrfs_truncate(struct inode *inode)
6342{ 6585{
6343 struct btrfs_root *root = BTRFS_I(inode)->root; 6586 struct btrfs_root *root = BTRFS_I(inode)->root;
6344 int ret; 6587 int ret;
6588 int err = 0;
6345 struct btrfs_trans_handle *trans; 6589 struct btrfs_trans_handle *trans;
6346 unsigned long nr; 6590 unsigned long nr;
6347 u64 mask = root->sectorsize - 1; 6591 u64 mask = root->sectorsize - 1;
6348 6592
6349 if (!S_ISREG(inode->i_mode)) {
6350 WARN_ON(1);
6351 return;
6352 }
6353
6354 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6593 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6355 if (ret) 6594 if (ret)
6356 return; 6595 return ret;
6357 6596
6358 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6597 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
6359 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6598 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
6360 6599
6600 trans = btrfs_start_transaction(root, 5);
6601 if (IS_ERR(trans))
6602 return PTR_ERR(trans);
6603
6604 btrfs_set_trans_block_group(trans, inode);
6605
6606 ret = btrfs_orphan_add(trans, inode);
6607 if (ret) {
6608 btrfs_end_transaction(trans, root);
6609 return ret;
6610 }
6611
6612 nr = trans->blocks_used;
6613 btrfs_end_transaction(trans, root);
6614 btrfs_btree_balance_dirty(root, nr);
6615
6616 /* Now start a transaction for the truncate */
6361 trans = btrfs_start_transaction(root, 0); 6617 trans = btrfs_start_transaction(root, 0);
6362 BUG_ON(IS_ERR(trans)); 6618 if (IS_ERR(trans))
6619 return PTR_ERR(trans);
6363 btrfs_set_trans_block_group(trans, inode); 6620 btrfs_set_trans_block_group(trans, inode);
6364 trans->block_rsv = root->orphan_block_rsv; 6621 trans->block_rsv = root->orphan_block_rsv;
6365 6622
@@ -6386,29 +6643,38 @@ static void btrfs_truncate(struct inode *inode)
6386 while (1) { 6643 while (1) {
6387 if (!trans) { 6644 if (!trans) {
6388 trans = btrfs_start_transaction(root, 0); 6645 trans = btrfs_start_transaction(root, 0);
6389 BUG_ON(IS_ERR(trans)); 6646 if (IS_ERR(trans))
6647 return PTR_ERR(trans);
6390 btrfs_set_trans_block_group(trans, inode); 6648 btrfs_set_trans_block_group(trans, inode);
6391 trans->block_rsv = root->orphan_block_rsv; 6649 trans->block_rsv = root->orphan_block_rsv;
6392 } 6650 }
6393 6651
6394 ret = btrfs_block_rsv_check(trans, root, 6652 ret = btrfs_block_rsv_check(trans, root,
6395 root->orphan_block_rsv, 0, 5); 6653 root->orphan_block_rsv, 0, 5);
6396 if (ret) { 6654 if (ret == -EAGAIN) {
6397 BUG_ON(ret != -EAGAIN);
6398 ret = btrfs_commit_transaction(trans, root); 6655 ret = btrfs_commit_transaction(trans, root);
6399 BUG_ON(ret); 6656 if (ret)
6657 return ret;
6400 trans = NULL; 6658 trans = NULL;
6401 continue; 6659 continue;
6660 } else if (ret) {
6661 err = ret;
6662 break;
6402 } 6663 }
6403 6664
6404 ret = btrfs_truncate_inode_items(trans, root, inode, 6665 ret = btrfs_truncate_inode_items(trans, root, inode,
6405 inode->i_size, 6666 inode->i_size,
6406 BTRFS_EXTENT_DATA_KEY); 6667 BTRFS_EXTENT_DATA_KEY);
6407 if (ret != -EAGAIN) 6668 if (ret != -EAGAIN) {
6669 err = ret;
6408 break; 6670 break;
6671 }
6409 6672
6410 ret = btrfs_update_inode(trans, root, inode); 6673 ret = btrfs_update_inode(trans, root, inode);
6411 BUG_ON(ret); 6674 if (ret) {
6675 err = ret;
6676 break;
6677 }
6412 6678
6413 nr = trans->blocks_used; 6679 nr = trans->blocks_used;
6414 btrfs_end_transaction(trans, root); 6680 btrfs_end_transaction(trans, root);
@@ -6418,16 +6684,27 @@ static void btrfs_truncate(struct inode *inode)
6418 6684
6419 if (ret == 0 && inode->i_nlink > 0) { 6685 if (ret == 0 && inode->i_nlink > 0) {
6420 ret = btrfs_orphan_del(trans, inode); 6686 ret = btrfs_orphan_del(trans, inode);
6421 BUG_ON(ret); 6687 if (ret)
6688 err = ret;
6689 } else if (ret && inode->i_nlink > 0) {
6690 /*
6691 * Failed to do the truncate, remove us from the in memory
6692 * orphan list.
6693 */
6694 ret = btrfs_orphan_del(NULL, inode);
6422 } 6695 }
6423 6696
6424 ret = btrfs_update_inode(trans, root, inode); 6697 ret = btrfs_update_inode(trans, root, inode);
6425 BUG_ON(ret); 6698 if (ret && !err)
6699 err = ret;
6426 6700
6427 nr = trans->blocks_used; 6701 nr = trans->blocks_used;
6428 ret = btrfs_end_transaction_throttle(trans, root); 6702 ret = btrfs_end_transaction_throttle(trans, root);
6429 BUG_ON(ret); 6703 if (ret && !err)
6704 err = ret;
6430 btrfs_btree_balance_dirty(root, nr); 6705 btrfs_btree_balance_dirty(root, nr);
6706
6707 return err;
6431} 6708}
6432 6709
6433/* 6710/*
@@ -6494,9 +6771,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6494 ei->index_cnt = (u64)-1; 6771 ei->index_cnt = (u64)-1;
6495 ei->last_unlink_trans = 0; 6772 ei->last_unlink_trans = 0;
6496 6773
6497 spin_lock_init(&ei->accounting_lock);
6498 atomic_set(&ei->outstanding_extents, 0); 6774 atomic_set(&ei->outstanding_extents, 0);
6499 ei->reserved_extents = 0; 6775 atomic_set(&ei->reserved_extents, 0);
6500 6776
6501 ei->ordered_data_close = 0; 6777 ei->ordered_data_close = 0;
6502 ei->orphan_meta_reserved = 0; 6778 ei->orphan_meta_reserved = 0;
@@ -6532,7 +6808,7 @@ void btrfs_destroy_inode(struct inode *inode)
6532 WARN_ON(!list_empty(&inode->i_dentry)); 6808 WARN_ON(!list_empty(&inode->i_dentry));
6533 WARN_ON(inode->i_data.nrpages); 6809 WARN_ON(inode->i_data.nrpages);
6534 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); 6810 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
6535 WARN_ON(BTRFS_I(inode)->reserved_extents); 6811 WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents));
6536 6812
6537 /* 6813 /*
6538 * This can happen where we create an inode, but somebody else also 6814 * This can happen where we create an inode, but somebody else also
@@ -6624,6 +6900,8 @@ void btrfs_destroy_cachep(void)
6624 kmem_cache_destroy(btrfs_transaction_cachep); 6900 kmem_cache_destroy(btrfs_transaction_cachep);
6625 if (btrfs_path_cachep) 6901 if (btrfs_path_cachep)
6626 kmem_cache_destroy(btrfs_path_cachep); 6902 kmem_cache_destroy(btrfs_path_cachep);
6903 if (btrfs_free_space_cachep)
6904 kmem_cache_destroy(btrfs_free_space_cachep);
6627} 6905}
6628 6906
6629int btrfs_init_cachep(void) 6907int btrfs_init_cachep(void)
@@ -6652,6 +6930,12 @@ int btrfs_init_cachep(void)
6652 if (!btrfs_path_cachep) 6930 if (!btrfs_path_cachep)
6653 goto fail; 6931 goto fail;
6654 6932
6933 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
6934 sizeof(struct btrfs_free_space), 0,
6935 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
6936 if (!btrfs_free_space_cachep)
6937 goto fail;
6938
6655 return 0; 6939 return 0;
6656fail: 6940fail:
6657 btrfs_destroy_cachep(); 6941 btrfs_destroy_cachep();
@@ -6670,6 +6954,26 @@ static int btrfs_getattr(struct vfsmount *mnt,
6670 return 0; 6954 return 0;
6671} 6955}
6672 6956
6957/*
6958 * If a file is moved, it will inherit the cow and compression flags of the new
6959 * directory.
6960 */
6961static void fixup_inode_flags(struct inode *dir, struct inode *inode)
6962{
6963 struct btrfs_inode *b_dir = BTRFS_I(dir);
6964 struct btrfs_inode *b_inode = BTRFS_I(inode);
6965
6966 if (b_dir->flags & BTRFS_INODE_NODATACOW)
6967 b_inode->flags |= BTRFS_INODE_NODATACOW;
6968 else
6969 b_inode->flags &= ~BTRFS_INODE_NODATACOW;
6970
6971 if (b_dir->flags & BTRFS_INODE_COMPRESS)
6972 b_inode->flags |= BTRFS_INODE_COMPRESS;
6973 else
6974 b_inode->flags &= ~BTRFS_INODE_COMPRESS;
6975}
6976
6673static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 6977static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6674 struct inode *new_dir, struct dentry *new_dentry) 6978 struct inode *new_dir, struct dentry *new_dentry)
6675{ 6979{
@@ -6718,8 +7022,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6718 * should cover the worst case number of items we'll modify. 7022 * should cover the worst case number of items we'll modify.
6719 */ 7023 */
6720 trans = btrfs_start_transaction(root, 20); 7024 trans = btrfs_start_transaction(root, 20);
6721 if (IS_ERR(trans)) 7025 if (IS_ERR(trans)) {
6722 return PTR_ERR(trans); 7026 ret = PTR_ERR(trans);
7027 goto out_notrans;
7028 }
6723 7029
6724 btrfs_set_trans_block_group(trans, new_dir); 7030 btrfs_set_trans_block_group(trans, new_dir);
6725 7031
@@ -6772,11 +7078,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6772 old_dentry->d_name.name, 7078 old_dentry->d_name.name,
6773 old_dentry->d_name.len); 7079 old_dentry->d_name.len);
6774 } else { 7080 } else {
6775 btrfs_inc_nlink(old_dentry->d_inode); 7081 ret = __btrfs_unlink_inode(trans, root, old_dir,
6776 ret = btrfs_unlink_inode(trans, root, old_dir, 7082 old_dentry->d_inode,
6777 old_dentry->d_inode, 7083 old_dentry->d_name.name,
6778 old_dentry->d_name.name, 7084 old_dentry->d_name.len);
6779 old_dentry->d_name.len); 7085 if (!ret)
7086 ret = btrfs_update_inode(trans, root, old_inode);
6780 } 7087 }
6781 BUG_ON(ret); 7088 BUG_ON(ret);
6782 7089
@@ -6803,6 +7110,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6803 } 7110 }
6804 } 7111 }
6805 7112
7113 fixup_inode_flags(new_dir, old_inode);
7114
6806 ret = btrfs_add_link(trans, new_dir, old_inode, 7115 ret = btrfs_add_link(trans, new_dir, old_inode,
6807 new_dentry->d_name.name, 7116 new_dentry->d_name.name,
6808 new_dentry->d_name.len, 0, index); 7117 new_dentry->d_name.len, 0, index);
@@ -6816,7 +7125,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6816 } 7125 }
6817out_fail: 7126out_fail:
6818 btrfs_end_transaction_throttle(trans, root); 7127 btrfs_end_transaction_throttle(trans, root);
6819 7128out_notrans:
6820 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 7129 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
6821 up_read(&root->fs_info->subvol_sem); 7130 up_read(&root->fs_info->subvol_sem);
6822 7131
@@ -6968,7 +7277,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6968 if (IS_ERR(inode)) 7277 if (IS_ERR(inode))
6969 goto out_unlock; 7278 goto out_unlock;
6970 7279
6971 err = btrfs_init_inode_security(trans, inode, dir); 7280 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6972 if (err) { 7281 if (err) {
6973 drop_inode = 1; 7282 drop_inode = 1;
6974 goto out_unlock; 7283 goto out_unlock;
@@ -7204,7 +7513,6 @@ static const struct address_space_operations btrfs_aops = {
7204 .writepage = btrfs_writepage, 7513 .writepage = btrfs_writepage,
7205 .writepages = btrfs_writepages, 7514 .writepages = btrfs_writepages,
7206 .readpages = btrfs_readpages, 7515 .readpages = btrfs_readpages,
7207 .sync_page = block_sync_page,
7208 .direct_IO = btrfs_direct_IO, 7516 .direct_IO = btrfs_direct_IO,
7209 .invalidatepage = btrfs_invalidatepage, 7517 .invalidatepage = btrfs_invalidatepage,
7210 .releasepage = btrfs_releasepage, 7518 .releasepage = btrfs_releasepage,
@@ -7220,7 +7528,6 @@ static const struct address_space_operations btrfs_symlink_aops = {
7220}; 7528};
7221 7529
7222static const struct inode_operations btrfs_file_inode_operations = { 7530static const struct inode_operations btrfs_file_inode_operations = {
7223 .truncate = btrfs_truncate,
7224 .getattr = btrfs_getattr, 7531 .getattr = btrfs_getattr,
7225 .setattr = btrfs_setattr, 7532 .setattr = btrfs_setattr,
7226 .setxattr = btrfs_setxattr, 7533 .setxattr = btrfs_setxattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a506a22b522a..ffb48d6c5433 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -40,6 +40,7 @@
40#include <linux/xattr.h> 40#include <linux/xattr.h>
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/blkdev.h>
43#include "compat.h" 44#include "compat.h"
44#include "ctree.h" 45#include "ctree.h"
45#include "disk-io.h" 46#include "disk-io.h"
@@ -138,6 +139,24 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
138 return 0; 139 return 0;
139} 140}
140 141
142static int check_flags(unsigned int flags)
143{
144 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
145 FS_NOATIME_FL | FS_NODUMP_FL | \
146 FS_SYNC_FL | FS_DIRSYNC_FL | \
147 FS_NOCOMP_FL | FS_COMPR_FL | \
148 FS_NOCOW_FL | FS_COW_FL))
149 return -EOPNOTSUPP;
150
151 if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
152 return -EINVAL;
153
154 if ((flags & FS_NOCOW_FL) && (flags & FS_COW_FL))
155 return -EINVAL;
156
157 return 0;
158}
159
141static int btrfs_ioctl_setflags(struct file *file, void __user *arg) 160static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
142{ 161{
143 struct inode *inode = file->f_path.dentry->d_inode; 162 struct inode *inode = file->f_path.dentry->d_inode;
@@ -153,12 +172,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
153 if (copy_from_user(&flags, arg, sizeof(flags))) 172 if (copy_from_user(&flags, arg, sizeof(flags)))
154 return -EFAULT; 173 return -EFAULT;
155 174
156 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ 175 ret = check_flags(flags);
157 FS_NOATIME_FL | FS_NODUMP_FL | \ 176 if (ret)
158 FS_SYNC_FL | FS_DIRSYNC_FL)) 177 return ret;
159 return -EOPNOTSUPP;
160 178
161 if (!is_owner_or_cap(inode)) 179 if (!inode_owner_or_capable(inode))
162 return -EACCES; 180 return -EACCES;
163 181
164 mutex_lock(&inode->i_mutex); 182 mutex_lock(&inode->i_mutex);
@@ -201,9 +219,25 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
201 else 219 else
202 ip->flags &= ~BTRFS_INODE_DIRSYNC; 220 ip->flags &= ~BTRFS_INODE_DIRSYNC;
203 221
222 /*
223 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
224 * flag may be changed automatically if compression code won't make
225 * things smaller.
226 */
227 if (flags & FS_NOCOMP_FL) {
228 ip->flags &= ~BTRFS_INODE_COMPRESS;
229 ip->flags |= BTRFS_INODE_NOCOMPRESS;
230 } else if (flags & FS_COMPR_FL) {
231 ip->flags |= BTRFS_INODE_COMPRESS;
232 ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
233 }
234 if (flags & FS_NOCOW_FL)
235 ip->flags |= BTRFS_INODE_NODATACOW;
236 else if (flags & FS_COW_FL)
237 ip->flags &= ~BTRFS_INODE_NODATACOW;
204 238
205 trans = btrfs_join_transaction(root, 1); 239 trans = btrfs_join_transaction(root, 1);
206 BUG_ON(!trans); 240 BUG_ON(IS_ERR(trans));
207 241
208 ret = btrfs_update_inode(trans, root, inode); 242 ret = btrfs_update_inode(trans, root, inode);
209 BUG_ON(ret); 243 BUG_ON(ret);
@@ -213,9 +247,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
213 btrfs_end_transaction(trans, root); 247 btrfs_end_transaction(trans, root);
214 248
215 mnt_drop_write(file->f_path.mnt); 249 mnt_drop_write(file->f_path.mnt);
250
251 ret = 0;
216 out_unlock: 252 out_unlock:
217 mutex_unlock(&inode->i_mutex); 253 mutex_unlock(&inode->i_mutex);
218 return 0; 254 return ret;
219} 255}
220 256
221static int btrfs_ioctl_getversion(struct file *file, int __user *arg) 257static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
@@ -225,6 +261,49 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
225 return put_user(inode->i_generation, arg); 261 return put_user(inode->i_generation, arg);
226} 262}
227 263
264static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
265{
266 struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info;
267 struct btrfs_fs_info *fs_info = root->fs_info;
268 struct btrfs_device *device;
269 struct request_queue *q;
270 struct fstrim_range range;
271 u64 minlen = ULLONG_MAX;
272 u64 num_devices = 0;
273 int ret;
274
275 if (!capable(CAP_SYS_ADMIN))
276 return -EPERM;
277
278 mutex_lock(&fs_info->fs_devices->device_list_mutex);
279 list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
280 if (!device->bdev)
281 continue;
282 q = bdev_get_queue(device->bdev);
283 if (blk_queue_discard(q)) {
284 num_devices++;
285 minlen = min((u64)q->limits.discard_granularity,
286 minlen);
287 }
288 }
289 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
290 if (!num_devices)
291 return -EOPNOTSUPP;
292
293 if (copy_from_user(&range, arg, sizeof(range)))
294 return -EFAULT;
295
296 range.minlen = max(range.minlen, minlen);
297 ret = btrfs_trim_fs(root, &range);
298 if (ret < 0)
299 return ret;
300
301 if (copy_to_user(arg, &range, sizeof(range)))
302 return -EFAULT;
303
304 return 0;
305}
306
228static noinline int create_subvol(struct btrfs_root *root, 307static noinline int create_subvol(struct btrfs_root *root,
229 struct dentry *dentry, 308 struct dentry *dentry,
230 char *name, int namelen, 309 char *name, int namelen,
@@ -294,6 +373,10 @@ static noinline int create_subvol(struct btrfs_root *root,
294 inode_item->nbytes = cpu_to_le64(root->leafsize); 373 inode_item->nbytes = cpu_to_le64(root->leafsize);
295 inode_item->mode = cpu_to_le32(S_IFDIR | 0755); 374 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
296 375
376 root_item.flags = 0;
377 root_item.byte_limit = 0;
378 inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT);
379
297 btrfs_set_root_bytenr(&root_item, leaf->start); 380 btrfs_set_root_bytenr(&root_item, leaf->start);
298 btrfs_set_root_generation(&root_item, trans->transid); 381 btrfs_set_root_generation(&root_item, trans->transid);
299 btrfs_set_root_level(&root_item, 0); 382 btrfs_set_root_level(&root_item, 0);
@@ -409,7 +492,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
409 if (ret) 492 if (ret)
410 goto fail; 493 goto fail;
411 494
412 btrfs_orphan_cleanup(pending_snapshot->snap); 495 ret = btrfs_orphan_cleanup(pending_snapshot->snap);
496 if (ret)
497 goto fail;
413 498
414 parent = dget_parent(dentry); 499 parent = dget_parent(dentry);
415 inode = btrfs_lookup_dentry(parent->d_inode, dentry); 500 inode = btrfs_lookup_dentry(parent->d_inode, dentry);
@@ -907,6 +992,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
907 992
908 if (new_size > old_size) { 993 if (new_size > old_size) {
909 trans = btrfs_start_transaction(root, 0); 994 trans = btrfs_start_transaction(root, 0);
995 if (IS_ERR(trans)) {
996 ret = PTR_ERR(trans);
997 goto out_unlock;
998 }
910 ret = btrfs_grow_device(trans, device, new_size); 999 ret = btrfs_grow_device(trans, device, new_size);
911 btrfs_commit_transaction(trans, root); 1000 btrfs_commit_transaction(trans, root);
912 } else { 1001 } else {
@@ -1067,12 +1156,15 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1067 if (copy_from_user(&flags, arg, sizeof(flags))) 1156 if (copy_from_user(&flags, arg, sizeof(flags)))
1068 return -EFAULT; 1157 return -EFAULT;
1069 1158
1070 if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC) 1159 if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
1071 return -EINVAL; 1160 return -EINVAL;
1072 1161
1073 if (flags & ~BTRFS_SUBVOL_RDONLY) 1162 if (flags & ~BTRFS_SUBVOL_RDONLY)
1074 return -EOPNOTSUPP; 1163 return -EOPNOTSUPP;
1075 1164
1165 if (!inode_owner_or_capable(inode))
1166 return -EACCES;
1167
1076 down_write(&root->fs_info->subvol_sem); 1168 down_write(&root->fs_info->subvol_sem);
1077 1169
1078 /* nothing to do */ 1170 /* nothing to do */
@@ -1093,7 +1185,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1093 goto out_reset; 1185 goto out_reset;
1094 } 1186 }
1095 1187
1096 ret = btrfs_update_root(trans, root, 1188 ret = btrfs_update_root(trans, root->fs_info->tree_root,
1097 &root->root_key, &root->root_item); 1189 &root->root_key, &root->root_item);
1098 1190
1099 btrfs_commit_transaction(trans, root); 1191 btrfs_commit_transaction(trans, root);
@@ -1898,7 +1990,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1898 1990
1899 memcpy(&new_key, &key, sizeof(new_key)); 1991 memcpy(&new_key, &key, sizeof(new_key));
1900 new_key.objectid = inode->i_ino; 1992 new_key.objectid = inode->i_ino;
1901 new_key.offset = key.offset + destoff - off; 1993 if (off <= key.offset)
1994 new_key.offset = key.offset + destoff - off;
1995 else
1996 new_key.offset = destoff;
1902 1997
1903 trans = btrfs_start_transaction(root, 1); 1998 trans = btrfs_start_transaction(root, 1);
1904 if (IS_ERR(trans)) { 1999 if (IS_ERR(trans)) {
@@ -2082,7 +2177,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
2082 2177
2083 ret = -ENOMEM; 2178 ret = -ENOMEM;
2084 trans = btrfs_start_ioctl_transaction(root, 0); 2179 trans = btrfs_start_ioctl_transaction(root, 0);
2085 if (!trans) 2180 if (IS_ERR(trans))
2086 goto out_drop; 2181 goto out_drop;
2087 2182
2088 file->private_data = trans; 2183 file->private_data = trans;
@@ -2138,9 +2233,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2138 path->leave_spinning = 1; 2233 path->leave_spinning = 1;
2139 2234
2140 trans = btrfs_start_transaction(root, 1); 2235 trans = btrfs_start_transaction(root, 1);
2141 if (!trans) { 2236 if (IS_ERR(trans)) {
2142 btrfs_free_path(path); 2237 btrfs_free_path(path);
2143 return -ENOMEM; 2238 return PTR_ERR(trans);
2144 } 2239 }
2145 2240
2146 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2241 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
@@ -2192,7 +2287,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2192 struct btrfs_ioctl_space_info space; 2287 struct btrfs_ioctl_space_info space;
2193 struct btrfs_ioctl_space_info *dest; 2288 struct btrfs_ioctl_space_info *dest;
2194 struct btrfs_ioctl_space_info *dest_orig; 2289 struct btrfs_ioctl_space_info *dest_orig;
2195 struct btrfs_ioctl_space_info *user_dest; 2290 struct btrfs_ioctl_space_info __user *user_dest;
2196 struct btrfs_space_info *info; 2291 struct btrfs_space_info *info;
2197 u64 types[] = {BTRFS_BLOCK_GROUP_DATA, 2292 u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
2198 BTRFS_BLOCK_GROUP_SYSTEM, 2293 BTRFS_BLOCK_GROUP_SYSTEM,
@@ -2201,7 +2296,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2201 int num_types = 4; 2296 int num_types = 4;
2202 int alloc_size; 2297 int alloc_size;
2203 int ret = 0; 2298 int ret = 0;
2204 int slot_count = 0; 2299 u64 slot_count = 0;
2205 int i, c; 2300 int i, c;
2206 2301
2207 if (copy_from_user(&space_args, 2302 if (copy_from_user(&space_args,
@@ -2240,7 +2335,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2240 goto out; 2335 goto out;
2241 } 2336 }
2242 2337
2243 slot_count = min_t(int, space_args.space_slots, slot_count); 2338 slot_count = min_t(u64, space_args.space_slots, slot_count);
2244 2339
2245 alloc_size = sizeof(*dest) * slot_count; 2340 alloc_size = sizeof(*dest) * slot_count;
2246 2341
@@ -2260,6 +2355,9 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2260 for (i = 0; i < num_types; i++) { 2355 for (i = 0; i < num_types; i++) {
2261 struct btrfs_space_info *tmp; 2356 struct btrfs_space_info *tmp;
2262 2357
2358 if (!slot_count)
2359 break;
2360
2263 info = NULL; 2361 info = NULL;
2264 rcu_read_lock(); 2362 rcu_read_lock();
2265 list_for_each_entry_rcu(tmp, &root->fs_info->space_info, 2363 list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
@@ -2281,7 +2379,10 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2281 memcpy(dest, &space, sizeof(space)); 2379 memcpy(dest, &space, sizeof(space));
2282 dest++; 2380 dest++;
2283 space_args.total_spaces++; 2381 space_args.total_spaces++;
2382 slot_count--;
2284 } 2383 }
2384 if (!slot_count)
2385 break;
2285 } 2386 }
2286 up_read(&info->groups_sem); 2387 up_read(&info->groups_sem);
2287 } 2388 }
@@ -2332,10 +2433,17 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp
2332 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; 2433 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
2333 struct btrfs_trans_handle *trans; 2434 struct btrfs_trans_handle *trans;
2334 u64 transid; 2435 u64 transid;
2436 int ret;
2335 2437
2336 trans = btrfs_start_transaction(root, 0); 2438 trans = btrfs_start_transaction(root, 0);
2439 if (IS_ERR(trans))
2440 return PTR_ERR(trans);
2337 transid = trans->transid; 2441 transid = trans->transid;
2338 btrfs_commit_transaction_async(trans, root, 0); 2442 ret = btrfs_commit_transaction_async(trans, root, 0);
2443 if (ret) {
2444 btrfs_end_transaction(trans, root);
2445 return ret;
2446 }
2339 2447
2340 if (argp) 2448 if (argp)
2341 if (copy_to_user(argp, &transid, sizeof(transid))) 2449 if (copy_to_user(argp, &transid, sizeof(transid)))
@@ -2370,6 +2478,8 @@ long btrfs_ioctl(struct file *file, unsigned int
2370 return btrfs_ioctl_setflags(file, argp); 2478 return btrfs_ioctl_setflags(file, argp);
2371 case FS_IOC_GETVERSION: 2479 case FS_IOC_GETVERSION:
2372 return btrfs_ioctl_getversion(file, argp); 2480 return btrfs_ioctl_getversion(file, argp);
2481 case FITRIM:
2482 return btrfs_ioctl_fitrim(file, argp);
2373 case BTRFS_IOC_SNAP_CREATE: 2483 case BTRFS_IOC_SNAP_CREATE:
2374 return btrfs_ioctl_snap_create(file, argp, 0); 2484 return btrfs_ioctl_snap_create(file, argp, 0);
2375 case BTRFS_IOC_SNAP_CREATE_V2: 2485 case BTRFS_IOC_SNAP_CREATE_V2:
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index cc9b450399df..a178f5ebea78 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -280,6 +280,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
280 unsigned long tot_out; 280 unsigned long tot_out;
281 unsigned long tot_len; 281 unsigned long tot_len;
282 char *buf; 282 char *buf;
283 bool may_late_unmap, need_unmap;
283 284
284 data_in = kmap(pages_in[0]); 285 data_in = kmap(pages_in[0]);
285 tot_len = read_compress_length(data_in); 286 tot_len = read_compress_length(data_in);
@@ -300,11 +301,13 @@ static int lzo_decompress_biovec(struct list_head *ws,
300 301
301 tot_in += in_len; 302 tot_in += in_len;
302 working_bytes = in_len; 303 working_bytes = in_len;
304 may_late_unmap = need_unmap = false;
303 305
304 /* fast path: avoid using the working buffer */ 306 /* fast path: avoid using the working buffer */
305 if (in_page_bytes_left >= in_len) { 307 if (in_page_bytes_left >= in_len) {
306 buf = data_in + in_offset; 308 buf = data_in + in_offset;
307 bytes = in_len; 309 bytes = in_len;
310 may_late_unmap = true;
308 goto cont; 311 goto cont;
309 } 312 }
310 313
@@ -329,14 +332,17 @@ cont:
329 if (working_bytes == 0 && tot_in >= tot_len) 332 if (working_bytes == 0 && tot_in >= tot_len)
330 break; 333 break;
331 334
332 kunmap(pages_in[page_in_index]); 335 if (page_in_index + 1 >= total_pages_in) {
333 page_in_index++;
334 if (page_in_index >= total_pages_in) {
335 ret = -1; 336 ret = -1;
336 data_in = NULL;
337 goto done; 337 goto done;
338 } 338 }
339 data_in = kmap(pages_in[page_in_index]); 339
340 if (may_late_unmap)
341 need_unmap = true;
342 else
343 kunmap(pages_in[page_in_index]);
344
345 data_in = kmap(pages_in[++page_in_index]);
340 346
341 in_page_bytes_left = PAGE_CACHE_SIZE; 347 in_page_bytes_left = PAGE_CACHE_SIZE;
342 in_offset = 0; 348 in_offset = 0;
@@ -346,6 +352,8 @@ cont:
346 out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); 352 out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
347 ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, 353 ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
348 &out_len); 354 &out_len);
355 if (need_unmap)
356 kunmap(pages_in[page_in_index - 1]);
349 if (ret != LZO_E_OK) { 357 if (ret != LZO_E_OK) {
350 printk(KERN_WARNING "btrfs decompress failed\n"); 358 printk(KERN_WARNING "btrfs decompress failed\n");
351 ret = -1; 359 ret = -1;
@@ -363,8 +371,7 @@ cont:
363 break; 371 break;
364 } 372 }
365done: 373done:
366 if (data_in) 374 kunmap(pages_in[page_in_index]);
367 kunmap(pages_in[page_in_index]);
368 return ret; 375 return ret;
369} 376}
370 377
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2b61e1ddcd99..a1c940425307 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -141,7 +141,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
141 u64 file_offset) 141 u64 file_offset)
142{ 142{
143 struct rb_root *root = &tree->tree; 143 struct rb_root *root = &tree->tree;
144 struct rb_node *prev; 144 struct rb_node *prev = NULL;
145 struct rb_node *ret; 145 struct rb_node *ret;
146 struct btrfs_ordered_extent *entry; 146 struct btrfs_ordered_extent *entry;
147 147
@@ -202,6 +202,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
202 INIT_LIST_HEAD(&entry->list); 202 INIT_LIST_HEAD(&entry->list);
203 INIT_LIST_HEAD(&entry->root_extent_list); 203 INIT_LIST_HEAD(&entry->root_extent_list);
204 204
205 trace_btrfs_ordered_extent_add(inode, entry);
206
205 spin_lock(&tree->lock); 207 spin_lock(&tree->lock);
206 node = tree_insert(&tree->tree, file_offset, 208 node = tree_insert(&tree->tree, file_offset,
207 &entry->rb_node); 209 &entry->rb_node);
@@ -387,6 +389,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
387 struct list_head *cur; 389 struct list_head *cur;
388 struct btrfs_ordered_sum *sum; 390 struct btrfs_ordered_sum *sum;
389 391
392 trace_btrfs_ordered_extent_put(entry->inode, entry);
393
390 if (atomic_dec_and_test(&entry->refs)) { 394 if (atomic_dec_and_test(&entry->refs)) {
391 while (!list_empty(&entry->list)) { 395 while (!list_empty(&entry->list)) {
392 cur = entry->list.next; 396 cur = entry->list.next;
@@ -420,6 +424,8 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
420 spin_lock(&root->fs_info->ordered_extent_lock); 424 spin_lock(&root->fs_info->ordered_extent_lock);
421 list_del_init(&entry->root_extent_list); 425 list_del_init(&entry->root_extent_list);
422 426
427 trace_btrfs_ordered_extent_remove(inode, entry);
428
423 /* 429 /*
424 * we have no more ordered extents for this inode and 430 * we have no more ordered extents for this inode and
425 * no dirty pages. We can safely remove it from the 431 * no dirty pages. We can safely remove it from the
@@ -585,6 +591,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
585 u64 start = entry->file_offset; 591 u64 start = entry->file_offset;
586 u64 end = start + entry->len - 1; 592 u64 end = start + entry->len - 1;
587 593
594 trace_btrfs_ordered_extent_start(inode, entry);
595
588 /* 596 /*
589 * pages in the range can be dirty, clean or writeback. We 597 * pages in the range can be dirty, clean or writeback. We
590 * start IO on any dirty ones so the wait doesn't stall waiting 598 * start IO on any dirty ones so the wait doesn't stall waiting
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0d126be22b63..fb2605d998e9 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -260,6 +260,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
260#else 260#else
261 BUG(); 261 BUG();
262#endif 262#endif
263 break;
263 case BTRFS_BLOCK_GROUP_ITEM_KEY: 264 case BTRFS_BLOCK_GROUP_ITEM_KEY:
264 bi = btrfs_item_ptr(l, i, 265 bi = btrfs_item_ptr(l, i,
265 struct btrfs_block_group_item); 266 struct btrfs_block_group_item);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 045c9c2b2d7e..199a80134312 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1157,6 +1157,7 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
1157 new_node->bytenr = dest->node->start; 1157 new_node->bytenr = dest->node->start;
1158 new_node->level = node->level; 1158 new_node->level = node->level;
1159 new_node->lowest = node->lowest; 1159 new_node->lowest = node->lowest;
1160 new_node->checked = 1;
1160 new_node->root = dest; 1161 new_node->root = dest;
1161 1162
1162 if (!node->lowest) { 1163 if (!node->lowest) {
@@ -1723,6 +1724,7 @@ again:
1723 1724
1724 eb = read_tree_block(dest, old_bytenr, blocksize, 1725 eb = read_tree_block(dest, old_bytenr, blocksize,
1725 old_ptr_gen); 1726 old_ptr_gen);
1727 BUG_ON(!eb);
1726 btrfs_tree_lock(eb); 1728 btrfs_tree_lock(eb);
1727 if (cow) { 1729 if (cow) {
1728 ret = btrfs_cow_block(trans, dest, eb, parent, 1730 ret = btrfs_cow_block(trans, dest, eb, parent,
@@ -2028,6 +2030,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2028 2030
2029 while (1) { 2031 while (1) {
2030 trans = btrfs_start_transaction(root, 0); 2032 trans = btrfs_start_transaction(root, 0);
2033 BUG_ON(IS_ERR(trans));
2031 trans->block_rsv = rc->block_rsv; 2034 trans->block_rsv = rc->block_rsv;
2032 2035
2033 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2036 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
@@ -2147,6 +2150,12 @@ again:
2147 } 2150 }
2148 2151
2149 trans = btrfs_join_transaction(rc->extent_root, 1); 2152 trans = btrfs_join_transaction(rc->extent_root, 1);
2153 if (IS_ERR(trans)) {
2154 if (!err)
2155 btrfs_block_rsv_release(rc->extent_root,
2156 rc->block_rsv, num_bytes);
2157 return PTR_ERR(trans);
2158 }
2150 2159
2151 if (!err) { 2160 if (!err) {
2152 if (num_bytes != rc->merging_rsv_size) { 2161 if (num_bytes != rc->merging_rsv_size) {
@@ -2337,7 +2346,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
2337 root = next->root; 2346 root = next->root;
2338 BUG_ON(!root); 2347 BUG_ON(!root);
2339 2348
2340 /* no other choice for non-refernce counted tree */ 2349 /* no other choice for non-references counted tree */
2341 if (!root->ref_cows) 2350 if (!root->ref_cows)
2342 return root; 2351 return root;
2343 2352
@@ -2505,6 +2514,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2505 blocksize = btrfs_level_size(root, node->level); 2514 blocksize = btrfs_level_size(root, node->level);
2506 generation = btrfs_node_ptr_generation(upper->eb, slot); 2515 generation = btrfs_node_ptr_generation(upper->eb, slot);
2507 eb = read_tree_block(root, bytenr, blocksize, generation); 2516 eb = read_tree_block(root, bytenr, blocksize, generation);
2517 if (!eb) {
2518 err = -EIO;
2519 goto next;
2520 }
2508 btrfs_tree_lock(eb); 2521 btrfs_tree_lock(eb);
2509 btrfs_set_lock_blocking(eb); 2522 btrfs_set_lock_blocking(eb);
2510 2523
@@ -2662,6 +2675,7 @@ static int get_tree_block_key(struct reloc_control *rc,
2662 BUG_ON(block->key_ready); 2675 BUG_ON(block->key_ready);
2663 eb = read_tree_block(rc->extent_root, block->bytenr, 2676 eb = read_tree_block(rc->extent_root, block->bytenr,
2664 block->key.objectid, block->key.offset); 2677 block->key.objectid, block->key.offset);
2678 BUG_ON(!eb);
2665 WARN_ON(btrfs_header_level(eb) != block->level); 2679 WARN_ON(btrfs_header_level(eb) != block->level);
2666 if (block->level == 0) 2680 if (block->level == 0)
2667 btrfs_item_key_to_cpu(eb, &block->key, 0); 2681 btrfs_item_key_to_cpu(eb, &block->key, 0);
@@ -3222,6 +3236,7 @@ truncate:
3222 trans = btrfs_join_transaction(root, 0); 3236 trans = btrfs_join_transaction(root, 0);
3223 if (IS_ERR(trans)) { 3237 if (IS_ERR(trans)) {
3224 btrfs_free_path(path); 3238 btrfs_free_path(path);
3239 ret = PTR_ERR(trans);
3225 goto out; 3240 goto out;
3226 } 3241 }
3227 3242
@@ -3628,6 +3643,7 @@ int prepare_to_relocate(struct reloc_control *rc)
3628 set_reloc_control(rc); 3643 set_reloc_control(rc);
3629 3644
3630 trans = btrfs_join_transaction(rc->extent_root, 1); 3645 trans = btrfs_join_transaction(rc->extent_root, 1);
3646 BUG_ON(IS_ERR(trans));
3631 btrfs_commit_transaction(trans, rc->extent_root); 3647 btrfs_commit_transaction(trans, rc->extent_root);
3632 return 0; 3648 return 0;
3633} 3649}
@@ -3644,6 +3660,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3644 u32 item_size; 3660 u32 item_size;
3645 int ret; 3661 int ret;
3646 int err = 0; 3662 int err = 0;
3663 int progress = 0;
3647 3664
3648 path = btrfs_alloc_path(); 3665 path = btrfs_alloc_path();
3649 if (!path) 3666 if (!path)
@@ -3656,8 +3673,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3656 } 3673 }
3657 3674
3658 while (1) { 3675 while (1) {
3676 progress++;
3659 trans = btrfs_start_transaction(rc->extent_root, 0); 3677 trans = btrfs_start_transaction(rc->extent_root, 0);
3660 3678 BUG_ON(IS_ERR(trans));
3679restart:
3661 if (update_backref_cache(trans, &rc->backref_cache)) { 3680 if (update_backref_cache(trans, &rc->backref_cache)) {
3662 btrfs_end_transaction(trans, rc->extent_root); 3681 btrfs_end_transaction(trans, rc->extent_root);
3663 continue; 3682 continue;
@@ -3770,6 +3789,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3770 } 3789 }
3771 } 3790 }
3772 } 3791 }
3792 if (trans && progress && err == -ENOSPC) {
3793 ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
3794 rc->block_group->flags);
3795 if (ret == 0) {
3796 err = 0;
3797 progress = 0;
3798 goto restart;
3799 }
3800 }
3773 3801
3774 btrfs_release_path(rc->extent_root, path); 3802 btrfs_release_path(rc->extent_root, path);
3775 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, 3803 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
@@ -3804,7 +3832,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3804 3832
3805 /* get rid of pinned extents */ 3833 /* get rid of pinned extents */
3806 trans = btrfs_join_transaction(rc->extent_root, 1); 3834 trans = btrfs_join_transaction(rc->extent_root, 1);
3807 btrfs_commit_transaction(trans, rc->extent_root); 3835 if (IS_ERR(trans))
3836 err = PTR_ERR(trans);
3837 else
3838 btrfs_commit_transaction(trans, rc->extent_root);
3808out_free: 3839out_free:
3809 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); 3840 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3810 btrfs_free_path(path); 3841 btrfs_free_path(path);
@@ -4022,6 +4053,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4022 int ret; 4053 int ret;
4023 4054
4024 trans = btrfs_start_transaction(root->fs_info->tree_root, 0); 4055 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
4056 BUG_ON(IS_ERR(trans));
4025 4057
4026 memset(&root->root_item.drop_progress, 0, 4058 memset(&root->root_item.drop_progress, 0,
4027 sizeof(root->root_item.drop_progress)); 4059 sizeof(root->root_item.drop_progress));
@@ -4125,6 +4157,11 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4125 set_reloc_control(rc); 4157 set_reloc_control(rc);
4126 4158
4127 trans = btrfs_join_transaction(rc->extent_root, 1); 4159 trans = btrfs_join_transaction(rc->extent_root, 1);
4160 if (IS_ERR(trans)) {
4161 unset_reloc_control(rc);
4162 err = PTR_ERR(trans);
4163 goto out_free;
4164 }
4128 4165
4129 rc->merge_reloc_tree = 1; 4166 rc->merge_reloc_tree = 1;
4130 4167
@@ -4154,9 +4191,13 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4154 unset_reloc_control(rc); 4191 unset_reloc_control(rc);
4155 4192
4156 trans = btrfs_join_transaction(rc->extent_root, 1); 4193 trans = btrfs_join_transaction(rc->extent_root, 1);
4157 btrfs_commit_transaction(trans, rc->extent_root); 4194 if (IS_ERR(trans))
4158out: 4195 err = PTR_ERR(trans);
4196 else
4197 btrfs_commit_transaction(trans, rc->extent_root);
4198out_free:
4159 kfree(rc); 4199 kfree(rc);
4200out:
4160 while (!list_empty(&reloc_roots)) { 4201 while (!list_empty(&reloc_roots)) {
4161 reloc_root = list_entry(reloc_roots.next, 4202 reloc_root = list_entry(reloc_roots.next,
4162 struct btrfs_root, root_list); 4203 struct btrfs_root, root_list);
@@ -4174,7 +4215,7 @@ out:
4174 if (IS_ERR(fs_root)) 4215 if (IS_ERR(fs_root))
4175 err = PTR_ERR(fs_root); 4216 err = PTR_ERR(fs_root);
4176 else 4217 else
4177 btrfs_orphan_cleanup(fs_root); 4218 err = btrfs_orphan_cleanup(fs_root);
4178 } 4219 }
4179 return err; 4220 return err;
4180} 4221}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 6a1086e83ffc..6928bff62daa 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -88,7 +88,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
88 search_key.offset = (u64)-1; 88 search_key.offset = (u64)-1;
89 89
90 path = btrfs_alloc_path(); 90 path = btrfs_alloc_path();
91 BUG_ON(!path); 91 if (!path)
92 return -ENOMEM;
92 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 93 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
93 if (ret < 0) 94 if (ret < 0)
94 goto out; 95 goto out;
@@ -332,7 +333,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
332 struct extent_buffer *leaf; 333 struct extent_buffer *leaf;
333 334
334 path = btrfs_alloc_path(); 335 path = btrfs_alloc_path();
335 BUG_ON(!path); 336 if (!path)
337 return -ENOMEM;
336 ret = btrfs_search_slot(trans, root, key, path, -1, 1); 338 ret = btrfs_search_slot(trans, root, key, path, -1, 1);
337 if (ret < 0) 339 if (ret < 0)
338 goto out; 340 goto out;
@@ -471,3 +473,21 @@ again:
471 btrfs_free_path(path); 473 btrfs_free_path(path);
472 return 0; 474 return 0;
473} 475}
476
477/*
478 * Old btrfs forgets to init root_item->flags and root_item->byte_limit
479 * for subvolumes. To work around this problem, we steal a bit from
480 * root_item->inode_item->flags, and use it to indicate if those fields
481 * have been properly initialized.
482 */
483void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
484{
485 u64 inode_flags = le64_to_cpu(root_item->inode.flags);
486
487 if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) {
488 inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT;
489 root_item->inode.flags = cpu_to_le64(inode_flags);
490 root_item->flags = 0;
491 root_item->byte_limit = 0;
492 }
493}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b2130c46fdb5..0ac712efcdf2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -52,6 +52,9 @@
52#include "export.h" 52#include "export.h"
53#include "compression.h" 53#include "compression.h"
54 54
55#define CREATE_TRACE_POINTS
56#include <trace/events/btrfs.h>
57
55static const struct super_operations btrfs_super_ops; 58static const struct super_operations btrfs_super_ops;
56 59
57static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 60static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
@@ -155,7 +158,8 @@ enum {
155 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, 158 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
156 Opt_compress_type, Opt_compress_force, Opt_compress_force_type, 159 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
157 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 160 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
158 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err, 161 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
162 Opt_enospc_debug, Opt_subvolrootid, Opt_err,
159}; 163};
160 164
161static match_table_t tokens = { 165static match_table_t tokens = {
@@ -184,6 +188,8 @@ static match_table_t tokens = {
184 {Opt_space_cache, "space_cache"}, 188 {Opt_space_cache, "space_cache"},
185 {Opt_clear_cache, "clear_cache"}, 189 {Opt_clear_cache, "clear_cache"},
186 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, 190 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
191 {Opt_enospc_debug, "enospc_debug"},
192 {Opt_subvolrootid, "subvolrootid=%d"},
187 {Opt_err, NULL}, 193 {Opt_err, NULL},
188}; 194};
189 195
@@ -227,6 +233,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
227 break; 233 break;
228 case Opt_subvol: 234 case Opt_subvol:
229 case Opt_subvolid: 235 case Opt_subvolid:
236 case Opt_subvolrootid:
230 case Opt_device: 237 case Opt_device:
231 /* 238 /*
232 * These are parsed by btrfs_parse_early_options 239 * These are parsed by btrfs_parse_early_options
@@ -358,6 +365,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
358 case Opt_user_subvol_rm_allowed: 365 case Opt_user_subvol_rm_allowed:
359 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); 366 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
360 break; 367 break;
368 case Opt_enospc_debug:
369 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
370 break;
361 case Opt_err: 371 case Opt_err:
362 printk(KERN_INFO "btrfs: unrecognized mount option " 372 printk(KERN_INFO "btrfs: unrecognized mount option "
363 "'%s'\n", p); 373 "'%s'\n", p);
@@ -380,10 +390,10 @@ out:
380 */ 390 */
381static int btrfs_parse_early_options(const char *options, fmode_t flags, 391static int btrfs_parse_early_options(const char *options, fmode_t flags,
382 void *holder, char **subvol_name, u64 *subvol_objectid, 392 void *holder, char **subvol_name, u64 *subvol_objectid,
383 struct btrfs_fs_devices **fs_devices) 393 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
384{ 394{
385 substring_t args[MAX_OPT_ARGS]; 395 substring_t args[MAX_OPT_ARGS];
386 char *opts, *p; 396 char *opts, *orig, *p;
387 int error = 0; 397 int error = 0;
388 int intarg; 398 int intarg;
389 399
@@ -397,6 +407,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
397 opts = kstrdup(options, GFP_KERNEL); 407 opts = kstrdup(options, GFP_KERNEL);
398 if (!opts) 408 if (!opts)
399 return -ENOMEM; 409 return -ENOMEM;
410 orig = opts;
400 411
401 while ((p = strsep(&opts, ",")) != NULL) { 412 while ((p = strsep(&opts, ",")) != NULL) {
402 int token; 413 int token;
@@ -420,6 +431,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
420 *subvol_objectid = intarg; 431 *subvol_objectid = intarg;
421 } 432 }
422 break; 433 break;
434 case Opt_subvolrootid:
435 intarg = 0;
436 error = match_int(&args[0], &intarg);
437 if (!error) {
438 /* we want the original fs_tree */
439 if (!intarg)
440 *subvol_rootid =
441 BTRFS_FS_TREE_OBJECTID;
442 else
443 *subvol_rootid = intarg;
444 }
445 break;
423 case Opt_device: 446 case Opt_device:
424 error = btrfs_scan_one_device(match_strdup(&args[0]), 447 error = btrfs_scan_one_device(match_strdup(&args[0]),
425 flags, holder, fs_devices); 448 flags, holder, fs_devices);
@@ -432,7 +455,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
432 } 455 }
433 456
434 out_free_opts: 457 out_free_opts:
435 kfree(opts); 458 kfree(orig);
436 out: 459 out:
437 /* 460 /*
438 * If no subvolume name is specified we use the default one. Allocate 461 * If no subvolume name is specified we use the default one. Allocate
@@ -614,6 +637,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
614 struct btrfs_root *root = btrfs_sb(sb); 637 struct btrfs_root *root = btrfs_sb(sb);
615 int ret; 638 int ret;
616 639
640 trace_btrfs_sync_fs(wait);
641
617 if (!wait) { 642 if (!wait) {
618 filemap_flush(root->fs_info->btree_inode->i_mapping); 643 filemap_flush(root->fs_info->btree_inode->i_mapping);
619 return 0; 644 return 0;
@@ -623,6 +648,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
623 btrfs_wait_ordered_extents(root, 0, 0); 648 btrfs_wait_ordered_extents(root, 0, 0);
624 649
625 trans = btrfs_start_transaction(root, 0); 650 trans = btrfs_start_transaction(root, 0);
651 if (IS_ERR(trans))
652 return PTR_ERR(trans);
626 ret = btrfs_commit_transaction(trans, root); 653 ret = btrfs_commit_transaction(trans, root);
627 return ret; 654 return ret;
628} 655}
@@ -631,6 +658,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
631{ 658{
632 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); 659 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
633 struct btrfs_fs_info *info = root->fs_info; 660 struct btrfs_fs_info *info = root->fs_info;
661 char *compress_type;
634 662
635 if (btrfs_test_opt(root, DEGRADED)) 663 if (btrfs_test_opt(root, DEGRADED))
636 seq_puts(seq, ",degraded"); 664 seq_puts(seq, ",degraded");
@@ -649,8 +677,16 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
649 if (info->thread_pool_size != min_t(unsigned long, 677 if (info->thread_pool_size != min_t(unsigned long,
650 num_online_cpus() + 2, 8)) 678 num_online_cpus() + 2, 8))
651 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); 679 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
652 if (btrfs_test_opt(root, COMPRESS)) 680 if (btrfs_test_opt(root, COMPRESS)) {
653 seq_puts(seq, ",compress"); 681 if (info->compress_type == BTRFS_COMPRESS_ZLIB)
682 compress_type = "zlib";
683 else
684 compress_type = "lzo";
685 if (btrfs_test_opt(root, FORCE_COMPRESS))
686 seq_printf(seq, ",compress-force=%s", compress_type);
687 else
688 seq_printf(seq, ",compress=%s", compress_type);
689 }
654 if (btrfs_test_opt(root, NOSSD)) 690 if (btrfs_test_opt(root, NOSSD))
655 seq_puts(seq, ",nossd"); 691 seq_puts(seq, ",nossd");
656 if (btrfs_test_opt(root, SSD_SPREAD)) 692 if (btrfs_test_opt(root, SSD_SPREAD))
@@ -665,6 +701,12 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
665 seq_puts(seq, ",discard"); 701 seq_puts(seq, ",discard");
666 if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) 702 if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
667 seq_puts(seq, ",noacl"); 703 seq_puts(seq, ",noacl");
704 if (btrfs_test_opt(root, SPACE_CACHE))
705 seq_puts(seq, ",space_cache");
706 if (btrfs_test_opt(root, CLEAR_CACHE))
707 seq_puts(seq, ",clear_cache");
708 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
709 seq_puts(seq, ",user_subvol_rm_allowed");
668 return 0; 710 return 0;
669} 711}
670 712
@@ -708,6 +750,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
708 fmode_t mode = FMODE_READ; 750 fmode_t mode = FMODE_READ;
709 char *subvol_name = NULL; 751 char *subvol_name = NULL;
710 u64 subvol_objectid = 0; 752 u64 subvol_objectid = 0;
753 u64 subvol_rootid = 0;
711 int error = 0; 754 int error = 0;
712 755
713 if (!(flags & MS_RDONLY)) 756 if (!(flags & MS_RDONLY))
@@ -715,7 +758,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
715 758
716 error = btrfs_parse_early_options(data, mode, fs_type, 759 error = btrfs_parse_early_options(data, mode, fs_type,
717 &subvol_name, &subvol_objectid, 760 &subvol_name, &subvol_objectid,
718 &fs_devices); 761 &subvol_rootid, &fs_devices);
719 if (error) 762 if (error)
720 return ERR_PTR(error); 763 return ERR_PTR(error);
721 764
@@ -761,6 +804,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
761 } 804 }
762 805
763 btrfs_close_devices(fs_devices); 806 btrfs_close_devices(fs_devices);
807 kfree(fs_info);
808 kfree(tree_root);
764 } else { 809 } else {
765 char b[BDEVNAME_SIZE]; 810 char b[BDEVNAME_SIZE];
766 811
@@ -777,15 +822,17 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
777 s->s_flags |= MS_ACTIVE; 822 s->s_flags |= MS_ACTIVE;
778 } 823 }
779 824
780 root = get_default_root(s, subvol_objectid);
781 if (IS_ERR(root)) {
782 error = PTR_ERR(root);
783 deactivate_locked_super(s);
784 goto error_free_subvol_name;
785 }
786 /* if they gave us a subvolume name bind mount into that */ 825 /* if they gave us a subvolume name bind mount into that */
787 if (strcmp(subvol_name, ".")) { 826 if (strcmp(subvol_name, ".")) {
788 struct dentry *new_root; 827 struct dentry *new_root;
828
829 root = get_default_root(s, subvol_rootid);
830 if (IS_ERR(root)) {
831 error = PTR_ERR(root);
832 deactivate_locked_super(s);
833 goto error_free_subvol_name;
834 }
835
789 mutex_lock(&root->d_inode->i_mutex); 836 mutex_lock(&root->d_inode->i_mutex);
790 new_root = lookup_one_len(subvol_name, root, 837 new_root = lookup_one_len(subvol_name, root,
791 strlen(subvol_name)); 838 strlen(subvol_name));
@@ -806,6 +853,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
806 } 853 }
807 dput(root); 854 dput(root);
808 root = new_root; 855 root = new_root;
856 } else {
857 root = get_default_root(s, subvol_objectid);
858 if (IS_ERR(root)) {
859 error = PTR_ERR(root);
860 deactivate_locked_super(s);
861 goto error_free_subvol_name;
862 }
809 } 863 }
810 864
811 kfree(subvol_name); 865 kfree(subvol_name);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index bae5c7b8bbe2..c571734d5e5a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -32,10 +32,8 @@
32 32
33static noinline void put_transaction(struct btrfs_transaction *transaction) 33static noinline void put_transaction(struct btrfs_transaction *transaction)
34{ 34{
35 WARN_ON(transaction->use_count == 0); 35 WARN_ON(atomic_read(&transaction->use_count) == 0);
36 transaction->use_count--; 36 if (atomic_dec_and_test(&transaction->use_count)) {
37 if (transaction->use_count == 0) {
38 list_del_init(&transaction->list);
39 memset(transaction, 0, sizeof(*transaction)); 37 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction); 38 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 } 39 }
@@ -57,16 +55,17 @@ static noinline int join_transaction(struct btrfs_root *root)
57 if (!cur_trans) { 55 if (!cur_trans) {
58 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, 56 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
59 GFP_NOFS); 57 GFP_NOFS);
60 BUG_ON(!cur_trans); 58 if (!cur_trans)
59 return -ENOMEM;
61 root->fs_info->generation++; 60 root->fs_info->generation++;
62 cur_trans->num_writers = 1; 61 atomic_set(&cur_trans->num_writers, 1);
63 cur_trans->num_joined = 0; 62 cur_trans->num_joined = 0;
64 cur_trans->transid = root->fs_info->generation; 63 cur_trans->transid = root->fs_info->generation;
65 init_waitqueue_head(&cur_trans->writer_wait); 64 init_waitqueue_head(&cur_trans->writer_wait);
66 init_waitqueue_head(&cur_trans->commit_wait); 65 init_waitqueue_head(&cur_trans->commit_wait);
67 cur_trans->in_commit = 0; 66 cur_trans->in_commit = 0;
68 cur_trans->blocked = 0; 67 cur_trans->blocked = 0;
69 cur_trans->use_count = 1; 68 atomic_set(&cur_trans->use_count, 1);
70 cur_trans->commit_done = 0; 69 cur_trans->commit_done = 0;
71 cur_trans->start_time = get_seconds(); 70 cur_trans->start_time = get_seconds();
72 71
@@ -87,7 +86,7 @@ static noinline int join_transaction(struct btrfs_root *root)
87 root->fs_info->running_transaction = cur_trans; 86 root->fs_info->running_transaction = cur_trans;
88 spin_unlock(&root->fs_info->new_trans_lock); 87 spin_unlock(&root->fs_info->new_trans_lock);
89 } else { 88 } else {
90 cur_trans->num_writers++; 89 atomic_inc(&cur_trans->num_writers);
91 cur_trans->num_joined++; 90 cur_trans->num_joined++;
92 } 91 }
93 92
@@ -144,7 +143,7 @@ static void wait_current_trans(struct btrfs_root *root)
144 cur_trans = root->fs_info->running_transaction; 143 cur_trans = root->fs_info->running_transaction;
145 if (cur_trans && cur_trans->blocked) { 144 if (cur_trans && cur_trans->blocked) {
146 DEFINE_WAIT(wait); 145 DEFINE_WAIT(wait);
147 cur_trans->use_count++; 146 atomic_inc(&cur_trans->use_count);
148 while (1) { 147 while (1) {
149 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 148 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
150 TASK_UNINTERRUPTIBLE); 149 TASK_UNINTERRUPTIBLE);
@@ -180,6 +179,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
180{ 179{
181 struct btrfs_trans_handle *h; 180 struct btrfs_trans_handle *h;
182 struct btrfs_transaction *cur_trans; 181 struct btrfs_transaction *cur_trans;
182 int retries = 0;
183 int ret; 183 int ret;
184 184
185 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 185 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
@@ -195,10 +195,15 @@ again:
195 wait_current_trans(root); 195 wait_current_trans(root);
196 196
197 ret = join_transaction(root); 197 ret = join_transaction(root);
198 BUG_ON(ret); 198 if (ret < 0) {
199 kmem_cache_free(btrfs_trans_handle_cachep, h);
200 if (type != TRANS_JOIN_NOLOCK)
201 mutex_unlock(&root->fs_info->trans_mutex);
202 return ERR_PTR(ret);
203 }
199 204
200 cur_trans = root->fs_info->running_transaction; 205 cur_trans = root->fs_info->running_transaction;
201 cur_trans->use_count++; 206 atomic_inc(&cur_trans->use_count);
202 if (type != TRANS_JOIN_NOLOCK) 207 if (type != TRANS_JOIN_NOLOCK)
203 mutex_unlock(&root->fs_info->trans_mutex); 208 mutex_unlock(&root->fs_info->trans_mutex);
204 209
@@ -218,10 +223,18 @@ again:
218 223
219 if (num_items > 0) { 224 if (num_items > 0) {
220 ret = btrfs_trans_reserve_metadata(h, root, num_items); 225 ret = btrfs_trans_reserve_metadata(h, root, num_items);
221 if (ret == -EAGAIN) { 226 if (ret == -EAGAIN && !retries) {
227 retries++;
222 btrfs_commit_transaction(h, root); 228 btrfs_commit_transaction(h, root);
223 goto again; 229 goto again;
230 } else if (ret == -EAGAIN) {
231 /*
232 * We have already retried and got EAGAIN, so really we
233 * don't have space, so set ret to -ENOSPC.
234 */
235 ret = -ENOSPC;
224 } 236 }
237
225 if (ret < 0) { 238 if (ret < 0) {
226 btrfs_end_transaction(h, root); 239 btrfs_end_transaction(h, root);
227 return ERR_PTR(ret); 240 return ERR_PTR(ret);
@@ -321,7 +334,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
321 goto out_unlock; /* nothing committing|committed */ 334 goto out_unlock; /* nothing committing|committed */
322 } 335 }
323 336
324 cur_trans->use_count++; 337 atomic_inc(&cur_trans->use_count);
325 mutex_unlock(&root->fs_info->trans_mutex); 338 mutex_unlock(&root->fs_info->trans_mutex);
326 339
327 wait_for_commit(root, cur_trans); 340 wait_for_commit(root, cur_trans);
@@ -451,18 +464,14 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
451 wake_up_process(info->transaction_kthread); 464 wake_up_process(info->transaction_kthread);
452 } 465 }
453 466
454 if (lock)
455 mutex_lock(&info->trans_mutex);
456 WARN_ON(cur_trans != info->running_transaction); 467 WARN_ON(cur_trans != info->running_transaction);
457 WARN_ON(cur_trans->num_writers < 1); 468 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
458 cur_trans->num_writers--; 469 atomic_dec(&cur_trans->num_writers);
459 470
460 smp_mb(); 471 smp_mb();
461 if (waitqueue_active(&cur_trans->writer_wait)) 472 if (waitqueue_active(&cur_trans->writer_wait))
462 wake_up(&cur_trans->writer_wait); 473 wake_up(&cur_trans->writer_wait);
463 put_transaction(cur_trans); 474 put_transaction(cur_trans);
464 if (lock)
465 mutex_unlock(&info->trans_mutex);
466 475
467 if (current->journal_info == trans) 476 if (current->journal_info == trans)
468 current->journal_info = NULL; 477 current->journal_info = NULL;
@@ -970,6 +979,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
970 record_root_in_trans(trans, root); 979 record_root_in_trans(trans, root);
971 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 980 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
972 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 981 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
982 btrfs_check_and_init_root_item(new_root_item);
973 983
974 root_flags = btrfs_root_flags(new_root_item); 984 root_flags = btrfs_root_flags(new_root_item);
975 if (pending->readonly) 985 if (pending->readonly)
@@ -1156,16 +1166,22 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1156 struct btrfs_transaction *cur_trans; 1166 struct btrfs_transaction *cur_trans;
1157 1167
1158 ac = kmalloc(sizeof(*ac), GFP_NOFS); 1168 ac = kmalloc(sizeof(*ac), GFP_NOFS);
1159 BUG_ON(!ac); 1169 if (!ac)
1170 return -ENOMEM;
1160 1171
1161 INIT_DELAYED_WORK(&ac->work, do_async_commit); 1172 INIT_DELAYED_WORK(&ac->work, do_async_commit);
1162 ac->root = root; 1173 ac->root = root;
1163 ac->newtrans = btrfs_join_transaction(root, 0); 1174 ac->newtrans = btrfs_join_transaction(root, 0);
1175 if (IS_ERR(ac->newtrans)) {
1176 int err = PTR_ERR(ac->newtrans);
1177 kfree(ac);
1178 return err;
1179 }
1164 1180
1165 /* take transaction reference */ 1181 /* take transaction reference */
1166 mutex_lock(&root->fs_info->trans_mutex); 1182 mutex_lock(&root->fs_info->trans_mutex);
1167 cur_trans = trans->transaction; 1183 cur_trans = trans->transaction;
1168 cur_trans->use_count++; 1184 atomic_inc(&cur_trans->use_count);
1169 mutex_unlock(&root->fs_info->trans_mutex); 1185 mutex_unlock(&root->fs_info->trans_mutex);
1170 1186
1171 btrfs_end_transaction(trans, root); 1187 btrfs_end_transaction(trans, root);
@@ -1224,7 +1240,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1224 1240
1225 mutex_lock(&root->fs_info->trans_mutex); 1241 mutex_lock(&root->fs_info->trans_mutex);
1226 if (cur_trans->in_commit) { 1242 if (cur_trans->in_commit) {
1227 cur_trans->use_count++; 1243 atomic_inc(&cur_trans->use_count);
1228 mutex_unlock(&root->fs_info->trans_mutex); 1244 mutex_unlock(&root->fs_info->trans_mutex);
1229 btrfs_end_transaction(trans, root); 1245 btrfs_end_transaction(trans, root);
1230 1246
@@ -1246,7 +1262,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1246 prev_trans = list_entry(cur_trans->list.prev, 1262 prev_trans = list_entry(cur_trans->list.prev,
1247 struct btrfs_transaction, list); 1263 struct btrfs_transaction, list);
1248 if (!prev_trans->commit_done) { 1264 if (!prev_trans->commit_done) {
1249 prev_trans->use_count++; 1265 atomic_inc(&prev_trans->use_count);
1250 mutex_unlock(&root->fs_info->trans_mutex); 1266 mutex_unlock(&root->fs_info->trans_mutex);
1251 1267
1252 wait_for_commit(root, prev_trans); 1268 wait_for_commit(root, prev_trans);
@@ -1287,14 +1303,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1287 TASK_UNINTERRUPTIBLE); 1303 TASK_UNINTERRUPTIBLE);
1288 1304
1289 smp_mb(); 1305 smp_mb();
1290 if (cur_trans->num_writers > 1) 1306 if (atomic_read(&cur_trans->num_writers) > 1)
1291 schedule_timeout(MAX_SCHEDULE_TIMEOUT); 1307 schedule_timeout(MAX_SCHEDULE_TIMEOUT);
1292 else if (should_grow) 1308 else if (should_grow)
1293 schedule_timeout(1); 1309 schedule_timeout(1);
1294 1310
1295 mutex_lock(&root->fs_info->trans_mutex); 1311 mutex_lock(&root->fs_info->trans_mutex);
1296 finish_wait(&cur_trans->writer_wait, &wait); 1312 finish_wait(&cur_trans->writer_wait, &wait);
1297 } while (cur_trans->num_writers > 1 || 1313 } while (atomic_read(&cur_trans->num_writers) > 1 ||
1298 (should_grow && cur_trans->num_joined != joined)); 1314 (should_grow && cur_trans->num_joined != joined));
1299 1315
1300 ret = create_pending_snapshots(trans, root->fs_info); 1316 ret = create_pending_snapshots(trans, root->fs_info);
@@ -1381,9 +1397,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1381 1397
1382 wake_up(&cur_trans->commit_wait); 1398 wake_up(&cur_trans->commit_wait);
1383 1399
1400 list_del_init(&cur_trans->list);
1384 put_transaction(cur_trans); 1401 put_transaction(cur_trans);
1385 put_transaction(cur_trans); 1402 put_transaction(cur_trans);
1386 1403
1404 trace_btrfs_transaction_commit(root);
1405
1387 mutex_unlock(&root->fs_info->trans_mutex); 1406 mutex_unlock(&root->fs_info->trans_mutex);
1388 1407
1389 if (current->journal_info == trans) 1408 if (current->journal_info == trans)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 229a594cacd5..e441acc6c584 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -27,11 +27,11 @@ struct btrfs_transaction {
27 * total writers in this transaction, it must be zero before the 27 * total writers in this transaction, it must be zero before the
28 * transaction can end 28 * transaction can end
29 */ 29 */
30 unsigned long num_writers; 30 atomic_t num_writers;
31 31
32 unsigned long num_joined; 32 unsigned long num_joined;
33 int in_commit; 33 int in_commit;
34 int use_count; 34 atomic_t use_count;
35 int commit_done; 35 int commit_done;
36 int blocked; 36 int blocked;
37 struct list_head list; 37 struct list_head list;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 054744ac5719..c50271ad3157 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -338,6 +338,12 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
338 } 338 }
339 dst_copy = kmalloc(item_size, GFP_NOFS); 339 dst_copy = kmalloc(item_size, GFP_NOFS);
340 src_copy = kmalloc(item_size, GFP_NOFS); 340 src_copy = kmalloc(item_size, GFP_NOFS);
341 if (!dst_copy || !src_copy) {
342 btrfs_release_path(root, path);
343 kfree(dst_copy);
344 kfree(src_copy);
345 return -ENOMEM;
346 }
341 347
342 read_extent_buffer(eb, src_copy, src_ptr, item_size); 348 read_extent_buffer(eb, src_copy, src_ptr, item_size);
343 349
@@ -665,6 +671,9 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
665 btrfs_dir_item_key_to_cpu(leaf, di, &location); 671 btrfs_dir_item_key_to_cpu(leaf, di, &location);
666 name_len = btrfs_dir_name_len(leaf, di); 672 name_len = btrfs_dir_name_len(leaf, di);
667 name = kmalloc(name_len, GFP_NOFS); 673 name = kmalloc(name_len, GFP_NOFS);
674 if (!name)
675 return -ENOMEM;
676
668 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 677 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
669 btrfs_release_path(root, path); 678 btrfs_release_path(root, path);
670 679
@@ -744,6 +753,9 @@ static noinline int backref_in_log(struct btrfs_root *log,
744 int match = 0; 753 int match = 0;
745 754
746 path = btrfs_alloc_path(); 755 path = btrfs_alloc_path();
756 if (!path)
757 return -ENOMEM;
758
747 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 759 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
748 if (ret != 0) 760 if (ret != 0)
749 goto out; 761 goto out;
@@ -787,12 +799,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
787 struct inode *dir; 799 struct inode *dir;
788 int ret; 800 int ret;
789 struct btrfs_inode_ref *ref; 801 struct btrfs_inode_ref *ref;
790 struct btrfs_dir_item *di;
791 struct inode *inode; 802 struct inode *inode;
792 char *name; 803 char *name;
793 int namelen; 804 int namelen;
794 unsigned long ref_ptr; 805 unsigned long ref_ptr;
795 unsigned long ref_end; 806 unsigned long ref_end;
807 int search_done = 0;
796 808
797 /* 809 /*
798 * it is possible that we didn't log all the parent directories 810 * it is possible that we didn't log all the parent directories
@@ -833,7 +845,10 @@ again:
833 * existing back reference, and we don't want to create 845 * existing back reference, and we don't want to create
834 * dangling pointers in the directory. 846 * dangling pointers in the directory.
835 */ 847 */
836conflict_again: 848
849 if (search_done)
850 goto insert;
851
837 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 852 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
838 if (ret == 0) { 853 if (ret == 0) {
839 char *victim_name; 854 char *victim_name;
@@ -874,37 +889,21 @@ conflict_again:
874 ret = btrfs_unlink_inode(trans, root, dir, 889 ret = btrfs_unlink_inode(trans, root, dir,
875 inode, victim_name, 890 inode, victim_name,
876 victim_name_len); 891 victim_name_len);
877 kfree(victim_name);
878 btrfs_release_path(root, path);
879 goto conflict_again;
880 } 892 }
881 kfree(victim_name); 893 kfree(victim_name);
882 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 894 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
883 } 895 }
884 BUG_ON(ret); 896 BUG_ON(ret);
885 }
886 btrfs_release_path(root, path);
887 897
888 /* look for a conflicting sequence number */ 898 /*
889 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 899 * NOTE: we have searched root tree and checked the
890 btrfs_inode_ref_index(eb, ref), 900 * coresponding ref, it does not need to check again.
891 name, namelen, 0); 901 */
892 if (di && !IS_ERR(di)) { 902 search_done = 1;
893 ret = drop_one_dir_item(trans, root, path, dir, di);
894 BUG_ON(ret);
895 }
896 btrfs_release_path(root, path);
897
898
899 /* look for a conflicting name */
900 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
901 name, namelen, 0);
902 if (di && !IS_ERR(di)) {
903 ret = drop_one_dir_item(trans, root, path, dir, di);
904 BUG_ON(ret);
905 } 903 }
906 btrfs_release_path(root, path); 904 btrfs_release_path(root, path);
907 905
906insert:
908 /* insert our name */ 907 /* insert our name */
909 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, 908 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
910 btrfs_inode_ref_index(eb, ref)); 909 btrfs_inode_ref_index(eb, ref));
@@ -967,6 +966,8 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
967 key.offset = (u64)-1; 966 key.offset = (u64)-1;
968 967
969 path = btrfs_alloc_path(); 968 path = btrfs_alloc_path();
969 if (!path)
970 return -ENOMEM;
970 971
971 while (1) { 972 while (1) {
972 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 973 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -1178,6 +1179,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1178 1179
1179 name_len = btrfs_dir_name_len(eb, di); 1180 name_len = btrfs_dir_name_len(eb, di);
1180 name = kmalloc(name_len, GFP_NOFS); 1181 name = kmalloc(name_len, GFP_NOFS);
1182 if (!name)
1183 return -ENOMEM;
1184
1181 log_type = btrfs_dir_type(eb, di); 1185 log_type = btrfs_dir_type(eb, di);
1182 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1186 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1183 name_len); 1187 name_len);
@@ -1269,6 +1273,8 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1269 ptr_end = ptr + item_size; 1273 ptr_end = ptr + item_size;
1270 while (ptr < ptr_end) { 1274 while (ptr < ptr_end) {
1271 di = (struct btrfs_dir_item *)ptr; 1275 di = (struct btrfs_dir_item *)ptr;
1276 if (verify_dir_item(root, eb, di))
1277 return -EIO;
1272 name_len = btrfs_dir_name_len(eb, di); 1278 name_len = btrfs_dir_name_len(eb, di);
1273 ret = replay_one_name(trans, root, path, eb, di, key); 1279 ret = replay_one_name(trans, root, path, eb, di, key);
1274 BUG_ON(ret); 1280 BUG_ON(ret);
@@ -1395,6 +1401,11 @@ again:
1395 ptr_end = ptr + item_size; 1401 ptr_end = ptr + item_size;
1396 while (ptr < ptr_end) { 1402 while (ptr < ptr_end) {
1397 di = (struct btrfs_dir_item *)ptr; 1403 di = (struct btrfs_dir_item *)ptr;
1404 if (verify_dir_item(root, eb, di)) {
1405 ret = -EIO;
1406 goto out;
1407 }
1408
1398 name_len = btrfs_dir_name_len(eb, di); 1409 name_len = btrfs_dir_name_len(eb, di);
1399 name = kmalloc(name_len, GFP_NOFS); 1410 name = kmalloc(name_len, GFP_NOFS);
1400 if (!name) { 1411 if (!name) {
@@ -1692,6 +1703,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1692 root_owner = btrfs_header_owner(parent); 1703 root_owner = btrfs_header_owner(parent);
1693 1704
1694 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1705 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1706 if (!next)
1707 return -ENOMEM;
1695 1708
1696 if (*level == 1) { 1709 if (*level == 1) {
1697 wc->process_func(root, next, wc, ptr_gen); 1710 wc->process_func(root, next, wc, ptr_gen);
@@ -1802,7 +1815,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1802 int orig_level; 1815 int orig_level;
1803 1816
1804 path = btrfs_alloc_path(); 1817 path = btrfs_alloc_path();
1805 BUG_ON(!path); 1818 if (!path)
1819 return -ENOMEM;
1806 1820
1807 level = btrfs_header_level(log->node); 1821 level = btrfs_header_level(log->node);
1808 orig_level = level; 1822 orig_level = level;
@@ -2032,6 +2046,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2032 wait_log_commit(trans, log_root_tree, 2046 wait_log_commit(trans, log_root_tree,
2033 log_root_tree->log_transid); 2047 log_root_tree->log_transid);
2034 mutex_unlock(&log_root_tree->log_mutex); 2048 mutex_unlock(&log_root_tree->log_mutex);
2049 ret = 0;
2035 goto out; 2050 goto out;
2036 } 2051 }
2037 atomic_set(&log_root_tree->log_commit[index2], 1); 2052 atomic_set(&log_root_tree->log_commit[index2], 1);
@@ -2096,7 +2111,7 @@ out:
2096 smp_mb(); 2111 smp_mb();
2097 if (waitqueue_active(&root->log_commit_wait[index1])) 2112 if (waitqueue_active(&root->log_commit_wait[index1]))
2098 wake_up(&root->log_commit_wait[index1]); 2113 wake_up(&root->log_commit_wait[index1]);
2099 return 0; 2114 return ret;
2100} 2115}
2101 2116
2102static void free_log_tree(struct btrfs_trans_handle *trans, 2117static void free_log_tree(struct btrfs_trans_handle *trans,
@@ -2194,6 +2209,9 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2194 2209
2195 log = root->log_root; 2210 log = root->log_root;
2196 path = btrfs_alloc_path(); 2211 path = btrfs_alloc_path();
2212 if (!path)
2213 return -ENOMEM;
2214
2197 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2215 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2198 name, name_len, -1); 2216 name, name_len, -1);
2199 if (IS_ERR(di)) { 2217 if (IS_ERR(di)) {
@@ -2594,6 +2612,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2594 2612
2595 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2613 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2596 nr * sizeof(u32), GFP_NOFS); 2614 nr * sizeof(u32), GFP_NOFS);
2615 if (!ins_data)
2616 return -ENOMEM;
2617
2597 ins_sizes = (u32 *)ins_data; 2618 ins_sizes = (u32 *)ins_data;
2598 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 2619 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2599 2620
@@ -2725,7 +2746,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2725 log = root->log_root; 2746 log = root->log_root;
2726 2747
2727 path = btrfs_alloc_path(); 2748 path = btrfs_alloc_path();
2749 if (!path)
2750 return -ENOMEM;
2728 dst_path = btrfs_alloc_path(); 2751 dst_path = btrfs_alloc_path();
2752 if (!dst_path) {
2753 btrfs_free_path(path);
2754 return -ENOMEM;
2755 }
2729 2756
2730 min_key.objectid = inode->i_ino; 2757 min_key.objectid = inode->i_ino;
2731 min_key.type = BTRFS_INODE_ITEM_KEY; 2758 min_key.type = BTRFS_INODE_ITEM_KEY;
@@ -3075,16 +3102,20 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3075 .stage = 0, 3102 .stage = 0,
3076 }; 3103 };
3077 3104
3078 fs_info->log_root_recovering = 1;
3079 path = btrfs_alloc_path(); 3105 path = btrfs_alloc_path();
3080 BUG_ON(!path); 3106 if (!path)
3107 return -ENOMEM;
3108
3109 fs_info->log_root_recovering = 1;
3081 3110
3082 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3111 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3112 BUG_ON(IS_ERR(trans));
3083 3113
3084 wc.trans = trans; 3114 wc.trans = trans;
3085 wc.pin = 1; 3115 wc.pin = 1;
3086 3116
3087 walk_log_tree(trans, log_root_tree, &wc); 3117 ret = walk_log_tree(trans, log_root_tree, &wc);
3118 BUG_ON(ret);
3088 3119
3089again: 3120again:
3090 key.objectid = BTRFS_TREE_LOG_OBJECTID; 3121 key.objectid = BTRFS_TREE_LOG_OBJECTID;
@@ -3108,8 +3139,7 @@ again:
3108 3139
3109 log = btrfs_read_fs_root_no_radix(log_root_tree, 3140 log = btrfs_read_fs_root_no_radix(log_root_tree,
3110 &found_key); 3141 &found_key);
3111 BUG_ON(!log); 3142 BUG_ON(IS_ERR(log));
3112
3113 3143
3114 tmp_key.objectid = found_key.offset; 3144 tmp_key.objectid = found_key.offset;
3115 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 3145 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d158530233b7..309a57b9fc85 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -33,17 +33,6 @@
33#include "volumes.h" 33#include "volumes.h"
34#include "async-thread.h" 34#include "async-thread.h"
35 35
36struct map_lookup {
37 u64 type;
38 int io_align;
39 int io_width;
40 int stripe_len;
41 int sector_size;
42 int num_stripes;
43 int sub_stripes;
44 struct btrfs_bio_stripe stripes[];
45};
46
47static int init_first_rw_device(struct btrfs_trans_handle *trans, 36static int init_first_rw_device(struct btrfs_trans_handle *trans,
48 struct btrfs_root *root, 37 struct btrfs_root *root,
49 struct btrfs_device *device); 38 struct btrfs_device *device);
@@ -162,7 +151,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
162 struct bio *cur; 151 struct bio *cur;
163 int again = 0; 152 int again = 0;
164 unsigned long num_run; 153 unsigned long num_run;
165 unsigned long num_sync_run;
166 unsigned long batch_run = 0; 154 unsigned long batch_run = 0;
167 unsigned long limit; 155 unsigned long limit;
168 unsigned long last_waited = 0; 156 unsigned long last_waited = 0;
@@ -173,11 +161,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
173 limit = btrfs_async_submit_limit(fs_info); 161 limit = btrfs_async_submit_limit(fs_info);
174 limit = limit * 2 / 3; 162 limit = limit * 2 / 3;
175 163
176 /* we want to make sure that every time we switch from the sync
177 * list to the normal list, we unplug
178 */
179 num_sync_run = 0;
180
181loop: 164loop:
182 spin_lock(&device->io_lock); 165 spin_lock(&device->io_lock);
183 166
@@ -223,15 +206,6 @@ loop_lock:
223 206
224 spin_unlock(&device->io_lock); 207 spin_unlock(&device->io_lock);
225 208
226 /*
227 * if we're doing the regular priority list, make sure we unplug
228 * for any high prio bios we've sent down
229 */
230 if (pending_bios == &device->pending_bios && num_sync_run > 0) {
231 num_sync_run = 0;
232 blk_run_backing_dev(bdi, NULL);
233 }
234
235 while (pending) { 209 while (pending) {
236 210
237 rmb(); 211 rmb();
@@ -259,19 +233,11 @@ loop_lock:
259 233
260 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 234 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
261 235
262 if (cur->bi_rw & REQ_SYNC)
263 num_sync_run++;
264
265 submit_bio(cur->bi_rw, cur); 236 submit_bio(cur->bi_rw, cur);
266 num_run++; 237 num_run++;
267 batch_run++; 238 batch_run++;
268 if (need_resched()) { 239 if (need_resched())
269 if (num_sync_run) {
270 blk_run_backing_dev(bdi, NULL);
271 num_sync_run = 0;
272 }
273 cond_resched(); 240 cond_resched();
274 }
275 241
276 /* 242 /*
277 * we made progress, there is more work to do and the bdi 243 * we made progress, there is more work to do and the bdi
@@ -304,13 +270,8 @@ loop_lock:
304 * against it before looping 270 * against it before looping
305 */ 271 */
306 last_waited = ioc->last_waited; 272 last_waited = ioc->last_waited;
307 if (need_resched()) { 273 if (need_resched())
308 if (num_sync_run) {
309 blk_run_backing_dev(bdi, NULL);
310 num_sync_run = 0;
311 }
312 cond_resched(); 274 cond_resched();
313 }
314 continue; 275 continue;
315 } 276 }
316 spin_lock(&device->io_lock); 277 spin_lock(&device->io_lock);
@@ -323,22 +284,6 @@ loop_lock:
323 } 284 }
324 } 285 }
325 286
326 if (num_sync_run) {
327 num_sync_run = 0;
328 blk_run_backing_dev(bdi, NULL);
329 }
330 /*
331 * IO has already been through a long path to get here. Checksumming,
332 * async helper threads, perhaps compression. We've done a pretty
333 * good job of collecting a batch of IO and should just unplug
334 * the device right away.
335 *
336 * This will help anyone who is waiting on the IO, they might have
337 * already unplugged, but managed to do so before the bio they
338 * cared about found its way down here.
339 */
340 blk_run_backing_dev(bdi, NULL);
341
342 cond_resched(); 287 cond_resched();
343 if (again) 288 if (again)
344 goto loop; 289 goto loop;
@@ -1213,6 +1158,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1213 return -ENOMEM; 1158 return -ENOMEM;
1214 1159
1215 trans = btrfs_start_transaction(root, 0); 1160 trans = btrfs_start_transaction(root, 0);
1161 if (IS_ERR(trans)) {
1162 btrfs_free_path(path);
1163 return PTR_ERR(trans);
1164 }
1216 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1165 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1217 key.type = BTRFS_DEV_ITEM_KEY; 1166 key.type = BTRFS_DEV_ITEM_KEY;
1218 key.offset = device->devid; 1167 key.offset = device->devid;
@@ -1334,11 +1283,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1334 1283
1335 ret = btrfs_shrink_device(device, 0); 1284 ret = btrfs_shrink_device(device, 0);
1336 if (ret) 1285 if (ret)
1337 goto error_brelse; 1286 goto error_undo;
1338 1287
1339 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1288 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1340 if (ret) 1289 if (ret)
1341 goto error_brelse; 1290 goto error_undo;
1342 1291
1343 device->in_fs_metadata = 0; 1292 device->in_fs_metadata = 0;
1344 1293
@@ -1412,6 +1361,13 @@ out:
1412 mutex_unlock(&root->fs_info->volume_mutex); 1361 mutex_unlock(&root->fs_info->volume_mutex);
1413 mutex_unlock(&uuid_mutex); 1362 mutex_unlock(&uuid_mutex);
1414 return ret; 1363 return ret;
1364error_undo:
1365 if (device->writeable) {
1366 list_add(&device->dev_alloc_list,
1367 &root->fs_info->fs_devices->alloc_list);
1368 root->fs_info->fs_devices->rw_devices++;
1369 }
1370 goto error_brelse;
1415} 1371}
1416 1372
1417/* 1373/*
@@ -1601,11 +1557,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1601 1557
1602 ret = find_next_devid(root, &device->devid); 1558 ret = find_next_devid(root, &device->devid);
1603 if (ret) { 1559 if (ret) {
1560 kfree(device->name);
1604 kfree(device); 1561 kfree(device);
1605 goto error; 1562 goto error;
1606 } 1563 }
1607 1564
1608 trans = btrfs_start_transaction(root, 0); 1565 trans = btrfs_start_transaction(root, 0);
1566 if (IS_ERR(trans)) {
1567 kfree(device->name);
1568 kfree(device);
1569 ret = PTR_ERR(trans);
1570 goto error;
1571 }
1572
1609 lock_chunks(root); 1573 lock_chunks(root);
1610 1574
1611 device->writeable = 1; 1575 device->writeable = 1;
@@ -1621,7 +1585,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1621 device->dev_root = root->fs_info->dev_root; 1585 device->dev_root = root->fs_info->dev_root;
1622 device->bdev = bdev; 1586 device->bdev = bdev;
1623 device->in_fs_metadata = 1; 1587 device->in_fs_metadata = 1;
1624 device->mode = 0; 1588 device->mode = FMODE_EXCL;
1625 set_blocksize(device->bdev, 4096); 1589 set_blocksize(device->bdev, 4096);
1626 1590
1627 if (seeding_dev) { 1591 if (seeding_dev) {
@@ -1873,7 +1837,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1873 return ret; 1837 return ret;
1874 1838
1875 trans = btrfs_start_transaction(root, 0); 1839 trans = btrfs_start_transaction(root, 0);
1876 BUG_ON(!trans); 1840 BUG_ON(IS_ERR(trans));
1877 1841
1878 lock_chunks(root); 1842 lock_chunks(root);
1879 1843
@@ -1904,6 +1868,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1904 1868
1905 BUG_ON(ret); 1869 BUG_ON(ret);
1906 1870
1871 trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
1872
1907 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 1873 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1908 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 1874 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1909 BUG_ON(ret); 1875 BUG_ON(ret);
@@ -2047,7 +2013,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
2047 BUG_ON(ret); 2013 BUG_ON(ret);
2048 2014
2049 trans = btrfs_start_transaction(dev_root, 0); 2015 trans = btrfs_start_transaction(dev_root, 0);
2050 BUG_ON(!trans); 2016 BUG_ON(IS_ERR(trans));
2051 2017
2052 ret = btrfs_grow_device(trans, device, old_size); 2018 ret = btrfs_grow_device(trans, device, old_size);
2053 BUG_ON(ret); 2019 BUG_ON(ret);
@@ -2213,6 +2179,11 @@ again:
2213 2179
2214 /* Shrinking succeeded, else we would be at "done". */ 2180 /* Shrinking succeeded, else we would be at "done". */
2215 trans = btrfs_start_transaction(root, 0); 2181 trans = btrfs_start_transaction(root, 0);
2182 if (IS_ERR(trans)) {
2183 ret = PTR_ERR(trans);
2184 goto done;
2185 }
2186
2216 lock_chunks(root); 2187 lock_chunks(root);
2217 2188
2218 device->disk_total_bytes = new_size; 2189 device->disk_total_bytes = new_size;
@@ -2626,6 +2597,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2626 *num_bytes = chunk_bytes_by_type(type, calc_size, 2597 *num_bytes = chunk_bytes_by_type(type, calc_size,
2627 map->num_stripes, sub_stripes); 2598 map->num_stripes, sub_stripes);
2628 2599
2600 trace_btrfs_chunk_alloc(info->chunk_root, map, start, *num_bytes);
2601
2629 em = alloc_extent_map(GFP_NOFS); 2602 em = alloc_extent_map(GFP_NOFS);
2630 if (!em) { 2603 if (!em) {
2631 ret = -ENOMEM; 2604 ret = -ENOMEM;
@@ -2734,6 +2707,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2734 item_size); 2707 item_size);
2735 BUG_ON(ret); 2708 BUG_ON(ret);
2736 } 2709 }
2710
2737 kfree(chunk); 2711 kfree(chunk);
2738 return 0; 2712 return 0;
2739} 2713}
@@ -2931,14 +2905,17 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2931static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2905static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2932 u64 logical, u64 *length, 2906 u64 logical, u64 *length,
2933 struct btrfs_multi_bio **multi_ret, 2907 struct btrfs_multi_bio **multi_ret,
2934 int mirror_num, struct page *unplug_page) 2908 int mirror_num)
2935{ 2909{
2936 struct extent_map *em; 2910 struct extent_map *em;
2937 struct map_lookup *map; 2911 struct map_lookup *map;
2938 struct extent_map_tree *em_tree = &map_tree->map_tree; 2912 struct extent_map_tree *em_tree = &map_tree->map_tree;
2939 u64 offset; 2913 u64 offset;
2940 u64 stripe_offset; 2914 u64 stripe_offset;
2915 u64 stripe_end_offset;
2941 u64 stripe_nr; 2916 u64 stripe_nr;
2917 u64 stripe_nr_orig;
2918 u64 stripe_nr_end;
2942 int stripes_allocated = 8; 2919 int stripes_allocated = 8;
2943 int stripes_required = 1; 2920 int stripes_required = 1;
2944 int stripe_index; 2921 int stripe_index;
@@ -2947,7 +2924,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2947 int max_errors = 0; 2924 int max_errors = 0;
2948 struct btrfs_multi_bio *multi = NULL; 2925 struct btrfs_multi_bio *multi = NULL;
2949 2926
2950 if (multi_ret && !(rw & REQ_WRITE)) 2927 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2951 stripes_allocated = 1; 2928 stripes_allocated = 1;
2952again: 2929again:
2953 if (multi_ret) { 2930 if (multi_ret) {
@@ -2963,11 +2940,6 @@ again:
2963 em = lookup_extent_mapping(em_tree, logical, *length); 2940 em = lookup_extent_mapping(em_tree, logical, *length);
2964 read_unlock(&em_tree->lock); 2941 read_unlock(&em_tree->lock);
2965 2942
2966 if (!em && unplug_page) {
2967 kfree(multi);
2968 return 0;
2969 }
2970
2971 if (!em) { 2943 if (!em) {
2972 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2944 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2973 (unsigned long long)logical, 2945 (unsigned long long)logical,
@@ -2993,7 +2965,15 @@ again:
2993 max_errors = 1; 2965 max_errors = 1;
2994 } 2966 }
2995 } 2967 }
2996 if (multi_ret && (rw & REQ_WRITE) && 2968 if (rw & REQ_DISCARD) {
2969 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2970 BTRFS_BLOCK_GROUP_RAID1 |
2971 BTRFS_BLOCK_GROUP_DUP |
2972 BTRFS_BLOCK_GROUP_RAID10)) {
2973 stripes_required = map->num_stripes;
2974 }
2975 }
2976 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2997 stripes_allocated < stripes_required) { 2977 stripes_allocated < stripes_required) {
2998 stripes_allocated = map->num_stripes; 2978 stripes_allocated = map->num_stripes;
2999 free_extent_map(em); 2979 free_extent_map(em);
@@ -3013,23 +2993,37 @@ again:
3013 /* stripe_offset is the offset of this block in its stripe*/ 2993 /* stripe_offset is the offset of this block in its stripe*/
3014 stripe_offset = offset - stripe_offset; 2994 stripe_offset = offset - stripe_offset;
3015 2995
3016 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 2996 if (rw & REQ_DISCARD)
3017 BTRFS_BLOCK_GROUP_RAID10 | 2997 *length = min_t(u64, em->len - offset, *length);
3018 BTRFS_BLOCK_GROUP_DUP)) { 2998 else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2999 BTRFS_BLOCK_GROUP_RAID1 |
3000 BTRFS_BLOCK_GROUP_RAID10 |
3001 BTRFS_BLOCK_GROUP_DUP)) {
3019 /* we limit the length of each bio to what fits in a stripe */ 3002 /* we limit the length of each bio to what fits in a stripe */
3020 *length = min_t(u64, em->len - offset, 3003 *length = min_t(u64, em->len - offset,
3021 map->stripe_len - stripe_offset); 3004 map->stripe_len - stripe_offset);
3022 } else { 3005 } else {
3023 *length = em->len - offset; 3006 *length = em->len - offset;
3024 } 3007 }
3025 3008
3026 if (!multi_ret && !unplug_page) 3009 if (!multi_ret)
3027 goto out; 3010 goto out;
3028 3011
3029 num_stripes = 1; 3012 num_stripes = 1;
3030 stripe_index = 0; 3013 stripe_index = 0;
3031 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3014 stripe_nr_orig = stripe_nr;
3032 if (unplug_page || (rw & REQ_WRITE)) 3015 stripe_nr_end = (offset + *length + map->stripe_len - 1) &
3016 (~(map->stripe_len - 1));
3017 do_div(stripe_nr_end, map->stripe_len);
3018 stripe_end_offset = stripe_nr_end * map->stripe_len -
3019 (offset + *length);
3020 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3021 if (rw & REQ_DISCARD)
3022 num_stripes = min_t(u64, map->num_stripes,
3023 stripe_nr_end - stripe_nr_orig);
3024 stripe_index = do_div(stripe_nr, map->num_stripes);
3025 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3026 if (rw & (REQ_WRITE | REQ_DISCARD))
3033 num_stripes = map->num_stripes; 3027 num_stripes = map->num_stripes;
3034 else if (mirror_num) 3028 else if (mirror_num)
3035 stripe_index = mirror_num - 1; 3029 stripe_index = mirror_num - 1;
@@ -3040,7 +3034,7 @@ again:
3040 } 3034 }
3041 3035
3042 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3036 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3043 if (rw & REQ_WRITE) 3037 if (rw & (REQ_WRITE | REQ_DISCARD))
3044 num_stripes = map->num_stripes; 3038 num_stripes = map->num_stripes;
3045 else if (mirror_num) 3039 else if (mirror_num)
3046 stripe_index = mirror_num - 1; 3040 stripe_index = mirror_num - 1;
@@ -3051,8 +3045,12 @@ again:
3051 stripe_index = do_div(stripe_nr, factor); 3045 stripe_index = do_div(stripe_nr, factor);
3052 stripe_index *= map->sub_stripes; 3046 stripe_index *= map->sub_stripes;
3053 3047
3054 if (unplug_page || (rw & REQ_WRITE)) 3048 if (rw & REQ_WRITE)
3055 num_stripes = map->sub_stripes; 3049 num_stripes = map->sub_stripes;
3050 else if (rw & REQ_DISCARD)
3051 num_stripes = min_t(u64, map->sub_stripes *
3052 (stripe_nr_end - stripe_nr_orig),
3053 map->num_stripes);
3056 else if (mirror_num) 3054 else if (mirror_num)
3057 stripe_index += mirror_num - 1; 3055 stripe_index += mirror_num - 1;
3058 else { 3056 else {
@@ -3070,24 +3068,101 @@ again:
3070 } 3068 }
3071 BUG_ON(stripe_index >= map->num_stripes); 3069 BUG_ON(stripe_index >= map->num_stripes);
3072 3070
3073 for (i = 0; i < num_stripes; i++) { 3071 if (rw & REQ_DISCARD) {
3074 if (unplug_page) { 3072 for (i = 0; i < num_stripes; i++) {
3075 struct btrfs_device *device;
3076 struct backing_dev_info *bdi;
3077
3078 device = map->stripes[stripe_index].dev;
3079 if (device->bdev) {
3080 bdi = blk_get_backing_dev_info(device->bdev);
3081 if (bdi->unplug_io_fn)
3082 bdi->unplug_io_fn(bdi, unplug_page);
3083 }
3084 } else {
3085 multi->stripes[i].physical = 3073 multi->stripes[i].physical =
3086 map->stripes[stripe_index].physical + 3074 map->stripes[stripe_index].physical +
3087 stripe_offset + stripe_nr * map->stripe_len; 3075 stripe_offset + stripe_nr * map->stripe_len;
3088 multi->stripes[i].dev = map->stripes[stripe_index].dev; 3076 multi->stripes[i].dev = map->stripes[stripe_index].dev;
3077
3078 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3079 u64 stripes;
3080 u32 last_stripe = 0;
3081 int j;
3082
3083 div_u64_rem(stripe_nr_end - 1,
3084 map->num_stripes,
3085 &last_stripe);
3086
3087 for (j = 0; j < map->num_stripes; j++) {
3088 u32 test;
3089
3090 div_u64_rem(stripe_nr_end - 1 - j,
3091 map->num_stripes, &test);
3092 if (test == stripe_index)
3093 break;
3094 }
3095 stripes = stripe_nr_end - 1 - j;
3096 do_div(stripes, map->num_stripes);
3097 multi->stripes[i].length = map->stripe_len *
3098 (stripes - stripe_nr + 1);
3099
3100 if (i == 0) {
3101 multi->stripes[i].length -=
3102 stripe_offset;
3103 stripe_offset = 0;
3104 }
3105 if (stripe_index == last_stripe)
3106 multi->stripes[i].length -=
3107 stripe_end_offset;
3108 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3109 u64 stripes;
3110 int j;
3111 int factor = map->num_stripes /
3112 map->sub_stripes;
3113 u32 last_stripe = 0;
3114
3115 div_u64_rem(stripe_nr_end - 1,
3116 factor, &last_stripe);
3117 last_stripe *= map->sub_stripes;
3118
3119 for (j = 0; j < factor; j++) {
3120 u32 test;
3121
3122 div_u64_rem(stripe_nr_end - 1 - j,
3123 factor, &test);
3124
3125 if (test ==
3126 stripe_index / map->sub_stripes)
3127 break;
3128 }
3129 stripes = stripe_nr_end - 1 - j;
3130 do_div(stripes, factor);
3131 multi->stripes[i].length = map->stripe_len *
3132 (stripes - stripe_nr + 1);
3133
3134 if (i < map->sub_stripes) {
3135 multi->stripes[i].length -=
3136 stripe_offset;
3137 if (i == map->sub_stripes - 1)
3138 stripe_offset = 0;
3139 }
3140 if (stripe_index >= last_stripe &&
3141 stripe_index <= (last_stripe +
3142 map->sub_stripes - 1)) {
3143 multi->stripes[i].length -=
3144 stripe_end_offset;
3145 }
3146 } else
3147 multi->stripes[i].length = *length;
3148
3149 stripe_index++;
3150 if (stripe_index == map->num_stripes) {
3151 /* This could only happen for RAID0/10 */
3152 stripe_index = 0;
3153 stripe_nr++;
3154 }
3155 }
3156 } else {
3157 for (i = 0; i < num_stripes; i++) {
3158 multi->stripes[i].physical =
3159 map->stripes[stripe_index].physical +
3160 stripe_offset +
3161 stripe_nr * map->stripe_len;
3162 multi->stripes[i].dev =
3163 map->stripes[stripe_index].dev;
3164 stripe_index++;
3089 } 3165 }
3090 stripe_index++;
3091 } 3166 }
3092 if (multi_ret) { 3167 if (multi_ret) {
3093 *multi_ret = multi; 3168 *multi_ret = multi;
@@ -3104,7 +3179,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3104 struct btrfs_multi_bio **multi_ret, int mirror_num) 3179 struct btrfs_multi_bio **multi_ret, int mirror_num)
3105{ 3180{
3106 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3181 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
3107 mirror_num, NULL); 3182 mirror_num);
3108} 3183}
3109 3184
3110int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 3185int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -3172,14 +3247,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3172 return 0; 3247 return 0;
3173} 3248}
3174 3249
3175int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
3176 u64 logical, struct page *page)
3177{
3178 u64 length = PAGE_CACHE_SIZE;
3179 return __btrfs_map_block(map_tree, READ, logical, &length,
3180 NULL, 0, page);
3181}
3182
3183static void end_bio_multi_stripe(struct bio *bio, int err) 3250static void end_bio_multi_stripe(struct bio *bio, int err)
3184{ 3251{
3185 struct btrfs_multi_bio *multi = bio->bi_private; 3252 struct btrfs_multi_bio *multi = bio->bi_private;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7fb59d45fe8c..cc2eadaf7a27 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -126,6 +126,7 @@ struct btrfs_fs_devices {
126struct btrfs_bio_stripe { 126struct btrfs_bio_stripe {
127 struct btrfs_device *dev; 127 struct btrfs_device *dev;
128 u64 physical; 128 u64 physical;
129 u64 length; /* only used for discard mappings */
129}; 130};
130 131
131struct btrfs_multi_bio { 132struct btrfs_multi_bio {
@@ -145,6 +146,17 @@ struct btrfs_device_info {
145 u64 max_avail; 146 u64 max_avail;
146}; 147};
147 148
149struct map_lookup {
150 u64 type;
151 int io_align;
152 int io_width;
153 int stripe_len;
154 int sector_size;
155 int num_stripes;
156 int sub_stripes;
157 struct btrfs_bio_stripe stripes[];
158};
159
148/* Used to sort the devices by max_avail(descending sort) */ 160/* Used to sort the devices by max_avail(descending sort) */
149int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2); 161int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
150 162
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a5776531dc2b..cfd660550ded 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -180,11 +180,10 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
180 struct btrfs_path *path; 180 struct btrfs_path *path;
181 struct extent_buffer *leaf; 181 struct extent_buffer *leaf;
182 struct btrfs_dir_item *di; 182 struct btrfs_dir_item *di;
183 int ret = 0, slot, advance; 183 int ret = 0, slot;
184 size_t total_size = 0, size_left = size; 184 size_t total_size = 0, size_left = size;
185 unsigned long name_ptr; 185 unsigned long name_ptr;
186 size_t name_len; 186 size_t name_len;
187 u32 nritems;
188 187
189 /* 188 /*
190 * ok we want all objects associated with this id. 189 * ok we want all objects associated with this id.
@@ -204,34 +203,24 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
204 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 203 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
205 if (ret < 0) 204 if (ret < 0)
206 goto err; 205 goto err;
207 advance = 0; 206
208 while (1) { 207 while (1) {
209 leaf = path->nodes[0]; 208 leaf = path->nodes[0];
210 nritems = btrfs_header_nritems(leaf);
211 slot = path->slots[0]; 209 slot = path->slots[0];
212 210
213 /* this is where we start walking through the path */ 211 /* this is where we start walking through the path */
214 if (advance || slot >= nritems) { 212 if (slot >= btrfs_header_nritems(leaf)) {
215 /* 213 /*
216 * if we've reached the last slot in this leaf we need 214 * if we've reached the last slot in this leaf we need
217 * to go to the next leaf and reset everything 215 * to go to the next leaf and reset everything
218 */ 216 */
219 if (slot >= nritems-1) { 217 ret = btrfs_next_leaf(root, path);
220 ret = btrfs_next_leaf(root, path); 218 if (ret < 0)
221 if (ret) 219 goto err;
222 break; 220 else if (ret > 0)
223 leaf = path->nodes[0]; 221 break;
224 nritems = btrfs_header_nritems(leaf); 222 continue;
225 slot = path->slots[0];
226 } else {
227 /*
228 * just walking through the slots on this leaf
229 */
230 slot++;
231 path->slots[0]++;
232 }
233 } 223 }
234 advance = 1;
235 224
236 btrfs_item_key_to_cpu(leaf, &found_key, slot); 225 btrfs_item_key_to_cpu(leaf, &found_key, slot);
237 226
@@ -242,13 +231,15 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
242 break; 231 break;
243 232
244 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 233 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
234 if (verify_dir_item(root, leaf, di))
235 continue;
245 236
246 name_len = btrfs_dir_name_len(leaf, di); 237 name_len = btrfs_dir_name_len(leaf, di);
247 total_size += name_len + 1; 238 total_size += name_len + 1;
248 239
249 /* we are just looking for how big our buffer needs to be */ 240 /* we are just looking for how big our buffer needs to be */
250 if (!size) 241 if (!size)
251 continue; 242 goto next;
252 243
253 if (!buffer || (name_len + 1) > size_left) { 244 if (!buffer || (name_len + 1) > size_left) {
254 ret = -ERANGE; 245 ret = -ERANGE;
@@ -261,6 +252,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
261 252
262 size_left -= name_len + 1; 253 size_left -= name_len + 1;
263 buffer += name_len + 1; 254 buffer += name_len + 1;
255next:
256 path->slots[0]++;
264 } 257 }
265 ret = total_size; 258 ret = total_size;
266 259
@@ -370,7 +363,8 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
370} 363}
371 364
372int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, 365int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
373 struct inode *inode, struct inode *dir) 366 struct inode *inode, struct inode *dir,
367 const struct qstr *qstr)
374{ 368{
375 int err; 369 int err;
376 size_t len; 370 size_t len;
@@ -378,7 +372,8 @@ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
378 char *suffix; 372 char *suffix;
379 char *name; 373 char *name;
380 374
381 err = security_inode_init_security(inode, dir, &suffix, &value, &len); 375 err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
376 &len);
382 if (err) { 377 if (err) {
383 if (err == -EOPNOTSUPP) 378 if (err == -EOPNOTSUPP)
384 return 0; 379 return 0;
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 7a43fd640bbb..b3cc8039134b 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -37,6 +37,7 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
37extern int btrfs_removexattr(struct dentry *dentry, const char *name); 37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38 38
39extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, 39extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
40 struct inode *inode, struct inode *dir); 40 struct inode *inode, struct inode *dir,
41 const struct qstr *qstr);
41 42
42#endif /* __XATTR__ */ 43#endif /* __XATTR__ */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index f5ec2d44150d..faccd47c6c46 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -57,7 +57,8 @@ static struct list_head *zlib_alloc_workspace(void)
57 if (!workspace) 57 if (!workspace)
58 return ERR_PTR(-ENOMEM); 58 return ERR_PTR(-ENOMEM);
59 59
60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); 60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize(
61 MAX_WBITS, MAX_MEM_LEVEL));
61 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 62 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
62 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); 63 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
63 if (!workspace->def_strm.workspace || 64 if (!workspace->def_strm.workspace ||
diff --git a/fs/buffer.c b/fs/buffer.c
index 2219a76e2caf..a08bb8e61c6f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -54,23 +54,15 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
54} 54}
55EXPORT_SYMBOL(init_buffer); 55EXPORT_SYMBOL(init_buffer);
56 56
57static int sync_buffer(void *word) 57static int sleep_on_buffer(void *word)
58{ 58{
59 struct block_device *bd;
60 struct buffer_head *bh
61 = container_of(word, struct buffer_head, b_state);
62
63 smp_mb();
64 bd = bh->b_bdev;
65 if (bd)
66 blk_run_address_space(bd->bd_inode->i_mapping);
67 io_schedule(); 59 io_schedule();
68 return 0; 60 return 0;
69} 61}
70 62
71void __lock_buffer(struct buffer_head *bh) 63void __lock_buffer(struct buffer_head *bh)
72{ 64{
73 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, 65 wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
74 TASK_UNINTERRUPTIBLE); 66 TASK_UNINTERRUPTIBLE);
75} 67}
76EXPORT_SYMBOL(__lock_buffer); 68EXPORT_SYMBOL(__lock_buffer);
@@ -90,7 +82,7 @@ EXPORT_SYMBOL(unlock_buffer);
90 */ 82 */
91void __wait_on_buffer(struct buffer_head * bh) 83void __wait_on_buffer(struct buffer_head * bh)
92{ 84{
93 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); 85 wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
94} 86}
95EXPORT_SYMBOL(__wait_on_buffer); 87EXPORT_SYMBOL(__wait_on_buffer);
96 88
@@ -749,10 +741,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
749{ 741{
750 struct buffer_head *bh; 742 struct buffer_head *bh;
751 struct list_head tmp; 743 struct list_head tmp;
752 struct address_space *mapping, *prev_mapping = NULL; 744 struct address_space *mapping;
753 int err = 0, err2; 745 int err = 0, err2;
746 struct blk_plug plug;
754 747
755 INIT_LIST_HEAD(&tmp); 748 INIT_LIST_HEAD(&tmp);
749 blk_start_plug(&plug);
756 750
757 spin_lock(lock); 751 spin_lock(lock);
758 while (!list_empty(list)) { 752 while (!list_empty(list)) {
@@ -775,7 +769,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
775 * still in flight on potentially older 769 * still in flight on potentially older
776 * contents. 770 * contents.
777 */ 771 */
778 write_dirty_buffer(bh, WRITE_SYNC_PLUG); 772 write_dirty_buffer(bh, WRITE_SYNC);
779 773
780 /* 774 /*
781 * Kick off IO for the previous mapping. Note 775 * Kick off IO for the previous mapping. Note
@@ -783,16 +777,16 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
783 * wait_on_buffer() will do that for us 777 * wait_on_buffer() will do that for us
784 * through sync_buffer(). 778 * through sync_buffer().
785 */ 779 */
786 if (prev_mapping && prev_mapping != mapping)
787 blk_run_address_space(prev_mapping);
788 prev_mapping = mapping;
789
790 brelse(bh); 780 brelse(bh);
791 spin_lock(lock); 781 spin_lock(lock);
792 } 782 }
793 } 783 }
794 } 784 }
795 785
786 spin_unlock(lock);
787 blk_finish_plug(&plug);
788 spin_lock(lock);
789
796 while (!list_empty(&tmp)) { 790 while (!list_empty(&tmp)) {
797 bh = BH_ENTRY(tmp.prev); 791 bh = BH_ENTRY(tmp.prev);
798 get_bh(bh); 792 get_bh(bh);
@@ -1144,7 +1138,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
1144 * inode list. 1138 * inode list.
1145 * 1139 *
1146 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, 1140 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1147 * mapping->tree_lock and the global inode_lock. 1141 * mapping->tree_lock and mapping->host->i_lock.
1148 */ 1142 */
1149void mark_buffer_dirty(struct buffer_head *bh) 1143void mark_buffer_dirty(struct buffer_head *bh)
1150{ 1144{
@@ -1614,14 +1608,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
1614 * prevents this contention from occurring. 1608 * prevents this contention from occurring.
1615 * 1609 *
1616 * If block_write_full_page() is called with wbc->sync_mode == 1610 * If block_write_full_page() is called with wbc->sync_mode ==
1617 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this 1611 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
1618 * causes the writes to be flagged as synchronous writes, but the 1612 * causes the writes to be flagged as synchronous writes.
1619 * block device queue will NOT be unplugged, since usually many pages
1620 * will be pushed to the out before the higher-level caller actually
1621 * waits for the writes to be completed. The various wait functions,
1622 * such as wait_on_writeback_range() will ultimately call sync_page()
1623 * which will ultimately call blk_run_backing_dev(), which will end up
1624 * unplugging the device queue.
1625 */ 1613 */
1626static int __block_write_full_page(struct inode *inode, struct page *page, 1614static int __block_write_full_page(struct inode *inode, struct page *page,
1627 get_block_t *get_block, struct writeback_control *wbc, 1615 get_block_t *get_block, struct writeback_control *wbc,
@@ -1634,7 +1622,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1634 const unsigned blocksize = 1 << inode->i_blkbits; 1622 const unsigned blocksize = 1 << inode->i_blkbits;
1635 int nr_underway = 0; 1623 int nr_underway = 0;
1636 int write_op = (wbc->sync_mode == WB_SYNC_ALL ? 1624 int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1637 WRITE_SYNC_PLUG : WRITE); 1625 WRITE_SYNC : WRITE);
1638 1626
1639 BUG_ON(!PageLocked(page)); 1627 BUG_ON(!PageLocked(page));
1640 1628
@@ -3138,17 +3126,6 @@ out:
3138} 3126}
3139EXPORT_SYMBOL(try_to_free_buffers); 3127EXPORT_SYMBOL(try_to_free_buffers);
3140 3128
3141void block_sync_page(struct page *page)
3142{
3143 struct address_space *mapping;
3144
3145 smp_mb();
3146 mapping = page_mapping(page);
3147 if (mapping)
3148 blk_run_backing_dev(mapping->backing_dev_info, page);
3149}
3150EXPORT_SYMBOL(block_sync_page);
3151
3152/* 3129/*
3153 * There are no bdflush tunables left. But distributions are 3130 * There are no bdflush tunables left. But distributions are
3154 * still running obsolete flush daemons, so we terminate them here. 3131 * still running obsolete flush daemons, so we terminate them here.
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 37fe101a4e0d..1064805e653b 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -197,7 +197,7 @@ struct fscache_object *cachefiles_grab_object(struct fscache_object *_object)
197} 197}
198 198
199/* 199/*
200 * update the auxilliary data for an object object on disk 200 * update the auxiliary data for an object object on disk
201 */ 201 */
202static void cachefiles_update_object(struct fscache_object *_object) 202static void cachefiles_update_object(struct fscache_object *_object)
203{ 203{
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 42c7fafc8bfe..a0358c2189cb 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -275,6 +275,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
275 bool preemptive) 275 bool preemptive)
276{ 276{
277 struct dentry *grave, *trap; 277 struct dentry *grave, *trap;
278 struct path path, path_to_graveyard;
278 char nbuffer[8 + 8 + 1]; 279 char nbuffer[8 + 8 + 1];
279 int ret; 280 int ret;
280 281
@@ -287,10 +288,18 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
287 /* non-directories can just be unlinked */ 288 /* non-directories can just be unlinked */
288 if (!S_ISDIR(rep->d_inode->i_mode)) { 289 if (!S_ISDIR(rep->d_inode->i_mode)) {
289 _debug("unlink stale object"); 290 _debug("unlink stale object");
290 ret = vfs_unlink(dir->d_inode, rep);
291 291
292 if (preemptive) 292 path.mnt = cache->mnt;
293 cachefiles_mark_object_buried(cache, rep); 293 path.dentry = dir;
294 ret = security_path_unlink(&path, rep);
295 if (ret < 0) {
296 cachefiles_io_error(cache, "Unlink security error");
297 } else {
298 ret = vfs_unlink(dir->d_inode, rep);
299
300 if (preemptive)
301 cachefiles_mark_object_buried(cache, rep);
302 }
294 303
295 mutex_unlock(&dir->d_inode->i_mutex); 304 mutex_unlock(&dir->d_inode->i_mutex);
296 305
@@ -379,12 +388,23 @@ try_again:
379 } 388 }
380 389
381 /* attempt the rename */ 390 /* attempt the rename */
382 ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave); 391 path.mnt = cache->mnt;
383 if (ret != 0 && ret != -ENOMEM) 392 path.dentry = dir;
384 cachefiles_io_error(cache, "Rename failed with error %d", ret); 393 path_to_graveyard.mnt = cache->mnt;
394 path_to_graveyard.dentry = cache->graveyard;
395 ret = security_path_rename(&path, rep, &path_to_graveyard, grave);
396 if (ret < 0) {
397 cachefiles_io_error(cache, "Rename security error %d", ret);
398 } else {
399 ret = vfs_rename(dir->d_inode, rep,
400 cache->graveyard->d_inode, grave);
401 if (ret != 0 && ret != -ENOMEM)
402 cachefiles_io_error(cache,
403 "Rename failed with error %d", ret);
385 404
386 if (preemptive) 405 if (preemptive)
387 cachefiles_mark_object_buried(cache, rep); 406 cachefiles_mark_object_buried(cache, rep);
407 }
388 408
389 unlock_rename(cache->graveyard, dir); 409 unlock_rename(cache->graveyard, dir);
390 dput(grave); 410 dput(grave);
@@ -448,6 +468,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
448{ 468{
449 struct cachefiles_cache *cache; 469 struct cachefiles_cache *cache;
450 struct dentry *dir, *next = NULL; 470 struct dentry *dir, *next = NULL;
471 struct path path;
451 unsigned long start; 472 unsigned long start;
452 const char *name; 473 const char *name;
453 int ret, nlen; 474 int ret, nlen;
@@ -458,6 +479,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
458 479
459 cache = container_of(parent->fscache.cache, 480 cache = container_of(parent->fscache.cache,
460 struct cachefiles_cache, cache); 481 struct cachefiles_cache, cache);
482 path.mnt = cache->mnt;
461 483
462 ASSERT(parent->dentry); 484 ASSERT(parent->dentry);
463 ASSERT(parent->dentry->d_inode); 485 ASSERT(parent->dentry->d_inode);
@@ -511,6 +533,10 @@ lookup_again:
511 if (ret < 0) 533 if (ret < 0)
512 goto create_error; 534 goto create_error;
513 535
536 path.dentry = dir;
537 ret = security_path_mkdir(&path, next, 0);
538 if (ret < 0)
539 goto create_error;
514 start = jiffies; 540 start = jiffies;
515 ret = vfs_mkdir(dir->d_inode, next, 0); 541 ret = vfs_mkdir(dir->d_inode, next, 0);
516 cachefiles_hist(cachefiles_mkdir_histogram, start); 542 cachefiles_hist(cachefiles_mkdir_histogram, start);
@@ -536,6 +562,10 @@ lookup_again:
536 if (ret < 0) 562 if (ret < 0)
537 goto create_error; 563 goto create_error;
538 564
565 path.dentry = dir;
566 ret = security_path_mknod(&path, next, S_IFREG, 0);
567 if (ret < 0)
568 goto create_error;
539 start = jiffies; 569 start = jiffies;
540 ret = vfs_create(dir->d_inode, next, S_IFREG, NULL); 570 ret = vfs_create(dir->d_inode, next, S_IFREG, NULL);
541 cachefiles_hist(cachefiles_create_histogram, start); 571 cachefiles_hist(cachefiles_create_histogram, start);
@@ -692,6 +722,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
692{ 722{
693 struct dentry *subdir; 723 struct dentry *subdir;
694 unsigned long start; 724 unsigned long start;
725 struct path path;
695 int ret; 726 int ret;
696 727
697 _enter(",,%s", dirname); 728 _enter(",,%s", dirname);
@@ -719,6 +750,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
719 750
720 _debug("attempt mkdir"); 751 _debug("attempt mkdir");
721 752
753 path.mnt = cache->mnt;
754 path.dentry = dir;
755 ret = security_path_mkdir(&path, subdir, 0700);
756 if (ret < 0)
757 goto mkdir_error;
722 ret = vfs_mkdir(dir->d_inode, subdir, 0700); 758 ret = vfs_mkdir(dir->d_inode, subdir, 0700);
723 if (ret < 0) 759 if (ret < 0)
724 goto mkdir_error; 760 goto mkdir_error;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 561438b6a50c..e159c529fd2b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -24,7 +24,7 @@
24 * context needs to be associated with the osd write during writeback. 24 * context needs to be associated with the osd write during writeback.
25 * 25 *
26 * Similarly, struct ceph_inode_info maintains a set of counters to 26 * Similarly, struct ceph_inode_info maintains a set of counters to
27 * count dirty pages on the inode. In the absense of snapshots, 27 * count dirty pages on the inode. In the absence of snapshots,
28 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count. 28 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
29 * 29 *
30 * When a snapshot is taken (that is, when the client receives 30 * When a snapshot is taken (that is, when the client receives
@@ -92,7 +92,7 @@ static int ceph_set_page_dirty(struct page *page)
92 ci->i_head_snapc = ceph_get_snap_context(snapc); 92 ci->i_head_snapc = ceph_get_snap_context(snapc);
93 ++ci->i_wrbuffer_ref_head; 93 ++ci->i_wrbuffer_ref_head;
94 if (ci->i_wrbuffer_ref == 0) 94 if (ci->i_wrbuffer_ref == 0)
95 igrab(inode); 95 ihold(inode);
96 ++ci->i_wrbuffer_ref; 96 ++ci->i_wrbuffer_ref;
97 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d " 97 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
98 "snapc %p seq %lld (%d snaps)\n", 98 "snapc %p seq %lld (%d snaps)\n",
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 6b61ded701e1..5323c330bbf3 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -765,7 +765,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
765 if (touch) { 765 if (touch) {
766 struct rb_node *q; 766 struct rb_node *q;
767 767
768 /* touch this + preceeding caps */ 768 /* touch this + preceding caps */
769 __touch_cap(cap); 769 __touch_cap(cap);
770 for (q = rb_first(&ci->i_caps); q != p; 770 for (q = rb_first(&ci->i_caps); q != p;
771 q = rb_next(q)) { 771 q = rb_next(q)) {
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 08f65faac112..0dba6915712b 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -210,8 +210,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
210 if (!fsc->debugfs_congestion_kb) 210 if (!fsc->debugfs_congestion_kb)
211 goto out; 211 goto out;
212 212
213 dout("a\n");
214
215 snprintf(name, sizeof(name), "../../bdi/%s", 213 snprintf(name, sizeof(name), "../../bdi/%s",
216 dev_name(fsc->backing_dev_info.dev)); 214 dev_name(fsc->backing_dev_info.dev));
217 fsc->debugfs_bdi = 215 fsc->debugfs_bdi =
@@ -221,7 +219,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
221 if (!fsc->debugfs_bdi) 219 if (!fsc->debugfs_bdi)
222 goto out; 220 goto out;
223 221
224 dout("b\n");
225 fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", 222 fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
226 0600, 223 0600,
227 fsc->client->debugfs_dir, 224 fsc->client->debugfs_dir,
@@ -230,7 +227,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
230 if (!fsc->debugfs_mdsmap) 227 if (!fsc->debugfs_mdsmap)
231 goto out; 228 goto out;
232 229
233 dout("ca\n");
234 fsc->debugfs_mdsc = debugfs_create_file("mdsc", 230 fsc->debugfs_mdsc = debugfs_create_file("mdsc",
235 0600, 231 0600,
236 fsc->client->debugfs_dir, 232 fsc->client->debugfs_dir,
@@ -239,7 +235,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
239 if (!fsc->debugfs_mdsc) 235 if (!fsc->debugfs_mdsc)
240 goto out; 236 goto out;
241 237
242 dout("da\n");
243 fsc->debugfs_caps = debugfs_create_file("caps", 238 fsc->debugfs_caps = debugfs_create_file("caps",
244 0400, 239 0400,
245 fsc->client->debugfs_dir, 240 fsc->client->debugfs_dir,
@@ -248,7 +243,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
248 if (!fsc->debugfs_caps) 243 if (!fsc->debugfs_caps)
249 goto out; 244 goto out;
250 245
251 dout("ea\n");
252 fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", 246 fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
253 0600, 247 0600,
254 fsc->client->debugfs_dir, 248 fsc->client->debugfs_dir,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0bc68de8edd7..1a867a3601ae 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -161,7 +161,7 @@ more:
161 filp->f_pos = di->offset; 161 filp->f_pos = di->offset;
162 err = filldir(dirent, dentry->d_name.name, 162 err = filldir(dirent, dentry->d_name.name,
163 dentry->d_name.len, di->offset, 163 dentry->d_name.len, di->offset,
164 dentry->d_inode->i_ino, 164 ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
165 dentry->d_inode->i_mode >> 12); 165 dentry->d_inode->i_mode >> 12);
166 166
167 if (last) { 167 if (last) {
@@ -245,15 +245,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
245 245
246 dout("readdir off 0 -> '.'\n"); 246 dout("readdir off 0 -> '.'\n");
247 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), 247 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
248 inode->i_ino, inode->i_mode >> 12) < 0) 248 ceph_translate_ino(inode->i_sb, inode->i_ino),
249 inode->i_mode >> 12) < 0)
249 return 0; 250 return 0;
250 filp->f_pos = 1; 251 filp->f_pos = 1;
251 off = 1; 252 off = 1;
252 } 253 }
253 if (filp->f_pos == 1) { 254 if (filp->f_pos == 1) {
255 ino_t ino = filp->f_dentry->d_parent->d_inode->i_ino;
254 dout("readdir off 1 -> '..'\n"); 256 dout("readdir off 1 -> '..'\n");
255 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), 257 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
256 filp->f_dentry->d_parent->d_inode->i_ino, 258 ceph_translate_ino(inode->i_sb, ino),
257 inode->i_mode >> 12) < 0) 259 inode->i_mode >> 12) < 0)
258 return 0; 260 return 0;
259 filp->f_pos = 2; 261 filp->f_pos = 2;
@@ -377,7 +379,8 @@ more:
377 if (filldir(dirent, 379 if (filldir(dirent,
378 rinfo->dir_dname[off - fi->offset], 380 rinfo->dir_dname[off - fi->offset],
379 rinfo->dir_dname_len[off - fi->offset], 381 rinfo->dir_dname_len[off - fi->offset],
380 pos, ino, ftype) < 0) { 382 pos,
383 ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
381 dout("filldir stopping us...\n"); 384 dout("filldir stopping us...\n");
382 return 0; 385 return 0;
383 } 386 }
@@ -409,7 +412,7 @@ more:
409 spin_lock(&inode->i_lock); 412 spin_lock(&inode->i_lock);
410 if (ci->i_release_count == fi->dir_release_count) { 413 if (ci->i_release_count == fi->dir_release_count) {
411 dout(" marking %p complete\n", inode); 414 dout(" marking %p complete\n", inode);
412 ci->i_ceph_flags |= CEPH_I_COMPLETE; 415 /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
413 ci->i_max_offset = filp->f_pos; 416 ci->i_max_offset = filp->f_pos;
414 } 417 }
415 spin_unlock(&inode->i_lock); 418 spin_unlock(&inode->i_lock);
@@ -496,6 +499,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
496 499
497 /* .snap dir? */ 500 /* .snap dir? */
498 if (err == -ENOENT && 501 if (err == -ENOENT &&
502 ceph_snap(parent) == CEPH_NOSNAP &&
499 strcmp(dentry->d_name.name, 503 strcmp(dentry->d_name.name,
500 fsc->mount_options->snapdir_name) == 0) { 504 fsc->mount_options->snapdir_name) == 0) {
501 struct inode *inode = ceph_get_snapdir(parent); 505 struct inode *inode = ceph_get_snapdir(parent);
@@ -992,7 +996,7 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
992{ 996{
993 struct inode *dir; 997 struct inode *dir;
994 998
995 if (nd->flags & LOOKUP_RCU) 999 if (nd && nd->flags & LOOKUP_RCU)
996 return -ECHILD; 1000 return -ECHILD;
997 1001
998 dir = dentry->d_parent->d_inode; 1002 dir = dentry->d_parent->d_inode;
@@ -1023,34 +1027,13 @@ out_touch:
1023} 1027}
1024 1028
1025/* 1029/*
1026 * When a dentry is released, clear the dir I_COMPLETE if it was part 1030 * Release our ceph_dentry_info.
1027 * of the current dir gen or if this is in the snapshot namespace.
1028 */ 1031 */
1029static void ceph_dentry_release(struct dentry *dentry) 1032static void ceph_d_release(struct dentry *dentry)
1030{ 1033{
1031 struct ceph_dentry_info *di = ceph_dentry(dentry); 1034 struct ceph_dentry_info *di = ceph_dentry(dentry);
1032 struct inode *parent_inode = NULL;
1033 u64 snapid = CEPH_NOSNAP;
1034 1035
1035 if (!IS_ROOT(dentry)) { 1036 dout("d_release %p\n", dentry);
1036 parent_inode = dentry->d_parent->d_inode;
1037 if (parent_inode)
1038 snapid = ceph_snap(parent_inode);
1039 }
1040 dout("dentry_release %p parent %p\n", dentry, parent_inode);
1041 if (parent_inode && snapid != CEPH_SNAPDIR) {
1042 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1043
1044 spin_lock(&parent_inode->i_lock);
1045 if (ci->i_shared_gen == di->lease_shared_gen ||
1046 snapid <= CEPH_MAXSNAP) {
1047 dout(" clearing %p complete (d_release)\n",
1048 parent_inode);
1049 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1050 ci->i_release_count++;
1051 }
1052 spin_unlock(&parent_inode->i_lock);
1053 }
1054 if (di) { 1037 if (di) {
1055 ceph_dentry_lru_del(dentry); 1038 ceph_dentry_lru_del(dentry);
1056 if (di->lease_session) 1039 if (di->lease_session)
@@ -1275,14 +1258,14 @@ const struct inode_operations ceph_dir_iops = {
1275 1258
1276const struct dentry_operations ceph_dentry_ops = { 1259const struct dentry_operations ceph_dentry_ops = {
1277 .d_revalidate = ceph_d_revalidate, 1260 .d_revalidate = ceph_d_revalidate,
1278 .d_release = ceph_dentry_release, 1261 .d_release = ceph_d_release,
1279}; 1262};
1280 1263
1281const struct dentry_operations ceph_snapdir_dentry_ops = { 1264const struct dentry_operations ceph_snapdir_dentry_ops = {
1282 .d_revalidate = ceph_snapdir_d_revalidate, 1265 .d_revalidate = ceph_snapdir_d_revalidate,
1283 .d_release = ceph_dentry_release, 1266 .d_release = ceph_d_release,
1284}; 1267};
1285 1268
1286const struct dentry_operations ceph_snap_dentry_ops = { 1269const struct dentry_operations ceph_snap_dentry_ops = {
1287 .d_release = ceph_dentry_release, 1270 .d_release = ceph_d_release,
1288}; 1271};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7d0e4a82d898..159b512d5a27 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -564,11 +564,19 @@ more:
564 * start_request so that a tid has been assigned. 564 * start_request so that a tid has been assigned.
565 */ 565 */
566 spin_lock(&ci->i_unsafe_lock); 566 spin_lock(&ci->i_unsafe_lock);
567 list_add(&req->r_unsafe_item, &ci->i_unsafe_writes); 567 list_add_tail(&req->r_unsafe_item,
568 &ci->i_unsafe_writes);
568 spin_unlock(&ci->i_unsafe_lock); 569 spin_unlock(&ci->i_unsafe_lock);
569 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); 570 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
570 } 571 }
572
571 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 573 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
574 if (ret < 0 && req->r_safe_callback) {
575 spin_lock(&ci->i_unsafe_lock);
576 list_del_init(&req->r_unsafe_item);
577 spin_unlock(&ci->i_unsafe_lock);
578 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
579 }
572 } 580 }
573 581
574 if (file->f_flags & O_DIRECT) 582 if (file->f_flags & O_DIRECT)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 5625463aa479..b54c97da1c43 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -36,6 +36,13 @@ static void ceph_vmtruncate_work(struct work_struct *work);
36/* 36/*
37 * find or create an inode, given the ceph ino number 37 * find or create an inode, given the ceph ino number
38 */ 38 */
39static int ceph_set_ino_cb(struct inode *inode, void *data)
40{
41 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
42 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
43 return 0;
44}
45
39struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) 46struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
40{ 47{
41 struct inode *inode; 48 struct inode *inode;
@@ -707,7 +714,7 @@ static int fill_inode(struct inode *inode,
707 (issued & CEPH_CAP_FILE_EXCL) == 0 && 714 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
708 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { 715 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
709 dout(" marking %p complete (empty)\n", inode); 716 dout(" marking %p complete (empty)\n", inode);
710 ci->i_ceph_flags |= CEPH_I_COMPLETE; 717 /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
711 ci->i_max_offset = 2; 718 ci->i_max_offset = 2;
712 } 719 }
713 break; 720 break;
@@ -1030,9 +1037,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1030 dout("fill_trace doing d_move %p -> %p\n", 1037 dout("fill_trace doing d_move %p -> %p\n",
1031 req->r_old_dentry, dn); 1038 req->r_old_dentry, dn);
1032 1039
1033 /* d_move screws up d_subdirs order */
1034 ceph_i_clear(dir, CEPH_I_COMPLETE);
1035
1036 d_move(req->r_old_dentry, dn); 1040 d_move(req->r_old_dentry, dn);
1037 dout(" src %p '%.*s' dst %p '%.*s'\n", 1041 dout(" src %p '%.*s' dst %p '%.*s'\n",
1038 req->r_old_dentry, 1042 req->r_old_dentry,
@@ -1044,12 +1048,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1044 rehashing bug in vfs_rename_dir */ 1048 rehashing bug in vfs_rename_dir */
1045 ceph_invalidate_dentry_lease(dn); 1049 ceph_invalidate_dentry_lease(dn);
1046 1050
1047 /* take overwritten dentry's readdir offset */ 1051 /*
1048 dout("dn %p gets %p offset %lld (old offset %lld)\n", 1052 * d_move() puts the renamed dentry at the end of
1049 req->r_old_dentry, dn, ceph_dentry(dn)->offset, 1053 * d_subdirs. We need to assign it an appropriate
1054 * directory offset so we can behave when holding
1055 * I_COMPLETE.
1056 */
1057 ceph_set_dentry_offset(req->r_old_dentry);
1058 dout("dn %p gets new offset %lld\n", req->r_old_dentry,
1050 ceph_dentry(req->r_old_dentry)->offset); 1059 ceph_dentry(req->r_old_dentry)->offset);
1051 ceph_dentry(req->r_old_dentry)->offset =
1052 ceph_dentry(dn)->offset;
1053 1060
1054 dn = req->r_old_dentry; /* use old_dentry */ 1061 dn = req->r_old_dentry; /* use old_dentry */
1055 in = dn->d_inode; 1062 in = dn->d_inode;
@@ -1809,7 +1816,7 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1809 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL); 1816 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1810 if (!err) { 1817 if (!err) {
1811 generic_fillattr(inode, stat); 1818 generic_fillattr(inode, stat);
1812 stat->ino = inode->i_ino; 1819 stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
1813 if (ceph_snap(inode) != CEPH_NOSNAP) 1820 if (ceph_snap(inode) != CEPH_NOSNAP)
1814 stat->dev = ceph_snap(inode); 1821 stat->dev = ceph_snap(inode);
1815 else 1822 else
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a1ee8fa3a8e7..f60b07b0feb0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3215,9 +3215,15 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
3215{ 3215{
3216 struct ceph_mds_client *mdsc = fsc->mdsc; 3216 struct ceph_mds_client *mdsc = fsc->mdsc;
3217 3217
3218 dout("mdsc_destroy %p\n", mdsc);
3218 ceph_mdsc_stop(mdsc); 3219 ceph_mdsc_stop(mdsc);
3220
3221 /* flush out any connection work with references to us */
3222 ceph_msgr_flush();
3223
3219 fsc->mdsc = NULL; 3224 fsc->mdsc = NULL;
3220 kfree(mdsc); 3225 kfree(mdsc);
3226 dout("mdsc_destroy %p done\n", mdsc);
3221} 3227}
3222 3228
3223 3229
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 39c243acd062..e86ec1155f8f 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -342,7 +342,7 @@ static int build_snap_context(struct ceph_snap_realm *realm)
342 num = 0; 342 num = 0;
343 snapc->seq = realm->seq; 343 snapc->seq = realm->seq;
344 if (parent) { 344 if (parent) {
345 /* include any of parent's snaps occuring _after_ my 345 /* include any of parent's snaps occurring _after_ my
346 parent became my parent */ 346 parent became my parent */
347 for (i = 0; i < parent->cached_context->num_snaps; i++) 347 for (i = 0; i < parent->cached_context->num_snaps; i++)
348 if (parent->cached_context->snaps[i] >= 348 if (parent->cached_context->snaps[i] >=
@@ -463,8 +463,8 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
463 463
464 dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode, 464 dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode,
465 capsnap, snapc); 465 capsnap, snapc);
466 igrab(inode); 466 ihold(inode);
467 467
468 atomic_set(&capsnap->nref, 1); 468 atomic_set(&capsnap->nref, 1);
469 capsnap->ci = ci; 469 capsnap->ci = ci;
470 INIT_LIST_HEAD(&capsnap->ci_item); 470 INIT_LIST_HEAD(&capsnap->ci_item);
@@ -584,10 +584,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
584 if (lastinode) 584 if (lastinode)
585 iput(lastinode); 585 iput(lastinode);
586 586
587 dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino); 587 list_for_each_entry(child, &realm->children, child_item) {
588 list_for_each_entry(child, &realm->children, child_item) 588 dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n",
589 queue_realm_cap_snaps(child); 589 realm, realm->ino, child, child->ino);
590 list_del_init(&child->dirty_item);
591 list_add(&child->dirty_item, &realm->dirty_item);
592 }
590 593
594 list_del_init(&realm->dirty_item);
591 dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); 595 dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
592} 596}
593 597
@@ -683,7 +687,9 @@ more:
683 * queue cap snaps _after_ we've built the new snap contexts, 687 * queue cap snaps _after_ we've built the new snap contexts,
684 * so that i_head_snapc can be set appropriately. 688 * so that i_head_snapc can be set appropriately.
685 */ 689 */
686 list_for_each_entry(realm, &dirty_realms, dirty_item) { 690 while (!list_empty(&dirty_realms)) {
691 realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
692 dirty_item);
687 queue_realm_cap_snaps(realm); 693 queue_realm_cap_snaps(realm);
688 } 694 }
689 695
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9c5085465a63..f2f77fd3c14c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -131,6 +131,7 @@ enum {
131 Opt_rbytes, 131 Opt_rbytes,
132 Opt_norbytes, 132 Opt_norbytes,
133 Opt_noasyncreaddir, 133 Opt_noasyncreaddir,
134 Opt_ino32,
134}; 135};
135 136
136static match_table_t fsopt_tokens = { 137static match_table_t fsopt_tokens = {
@@ -150,6 +151,7 @@ static match_table_t fsopt_tokens = {
150 {Opt_rbytes, "rbytes"}, 151 {Opt_rbytes, "rbytes"},
151 {Opt_norbytes, "norbytes"}, 152 {Opt_norbytes, "norbytes"},
152 {Opt_noasyncreaddir, "noasyncreaddir"}, 153 {Opt_noasyncreaddir, "noasyncreaddir"},
154 {Opt_ino32, "ino32"},
153 {-1, NULL} 155 {-1, NULL}
154}; 156};
155 157
@@ -225,6 +227,9 @@ static int parse_fsopt_token(char *c, void *private)
225 case Opt_noasyncreaddir: 227 case Opt_noasyncreaddir:
226 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; 228 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
227 break; 229 break;
230 case Opt_ino32:
231 fsopt->flags |= CEPH_MOUNT_OPT_INO32;
232 break;
228 default: 233 default:
229 BUG_ON(token); 234 BUG_ON(token);
230 } 235 }
@@ -288,7 +293,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
288 fsopt->sb_flags = flags; 293 fsopt->sb_flags = flags;
289 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; 294 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
290 295
291 fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT; 296 fsopt->rsize = CEPH_RSIZE_DEFAULT;
292 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 297 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
293 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; 298 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
294 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 299 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
@@ -348,7 +353,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
348 353
349 if (opt->name) 354 if (opt->name)
350 seq_printf(m, ",name=%s", opt->name); 355 seq_printf(m, ",name=%s", opt->name);
351 if (opt->secret) 356 if (opt->key)
352 seq_puts(m, ",secret=<hidden>"); 357 seq_puts(m, ",secret=<hidden>");
353 358
354 if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) 359 if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
@@ -370,7 +375,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
370 375
371 if (fsopt->wsize) 376 if (fsopt->wsize)
372 seq_printf(m, ",wsize=%d", fsopt->wsize); 377 seq_printf(m, ",wsize=%d", fsopt->wsize);
373 if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT) 378 if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
374 seq_printf(m, ",rsize=%d", fsopt->rsize); 379 seq_printf(m, ",rsize=%d", fsopt->rsize);
375 if (fsopt->congestion_kb != default_congestion_kb()) 380 if (fsopt->congestion_kb != default_congestion_kb())
376 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); 381 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 20b907d76ae2..619fe719968f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -27,6 +27,7 @@
27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ 27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
28#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ 28#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ 29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
30#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
30 31
31#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) 32#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
32 33
@@ -35,6 +36,7 @@
35#define ceph_test_mount_opt(fsc, opt) \ 36#define ceph_test_mount_opt(fsc, opt) \
36 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) 37 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
37 38
39#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */
38#define CEPH_MAX_READDIR_DEFAULT 1024 40#define CEPH_MAX_READDIR_DEFAULT 1024
39#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) 41#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
40#define CEPH_SNAPDIRNAME_DEFAULT ".snap" 42#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
@@ -319,6 +321,16 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
319 return container_of(inode, struct ceph_inode_info, vfs_inode); 321 return container_of(inode, struct ceph_inode_info, vfs_inode);
320} 322}
321 323
324static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
325{
326 return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
327}
328
329static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
330{
331 return (struct ceph_fs_client *)sb->s_fs_info;
332}
333
322static inline struct ceph_vino ceph_vino(struct inode *inode) 334static inline struct ceph_vino ceph_vino(struct inode *inode)
323{ 335{
324 return ceph_inode(inode)->i_vino; 336 return ceph_inode(inode)->i_vino;
@@ -327,19 +339,49 @@ static inline struct ceph_vino ceph_vino(struct inode *inode)
327/* 339/*
328 * ino_t is <64 bits on many architectures, blech. 340 * ino_t is <64 bits on many architectures, blech.
329 * 341 *
330 * don't include snap in ino hash, at least for now. 342 * i_ino (kernel inode) st_ino (userspace)
343 * i386 32 32
344 * x86_64+ino32 64 32
345 * x86_64 64 64
346 */
347static inline u32 ceph_ino_to_ino32(ino_t ino)
348{
349 ino ^= ino >> (sizeof(ino) * 8 - 32);
350 if (!ino)
351 ino = 1;
352 return ino;
353}
354
355/*
356 * kernel i_ino value
331 */ 357 */
332static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) 358static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
333{ 359{
334 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ 360 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
335#if BITS_PER_LONG == 32 361#if BITS_PER_LONG == 32
336 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; 362 ino = ceph_ino_to_ino32(ino);
337 if (!ino)
338 ino = 1;
339#endif 363#endif
340 return ino; 364 return ino;
341} 365}
342 366
367/*
368 * user-visible ino (stat, filldir)
369 */
370#if BITS_PER_LONG == 32
371static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
372{
373 return ino;
374}
375#else
376static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
377{
378 if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
379 ino = ceph_ino_to_ino32(ino);
380 return ino;
381}
382#endif
383
384
343/* for printf-style formatting */ 385/* for printf-style formatting */
344#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap 386#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
345 387
@@ -428,13 +470,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
428 return ((loff_t)frag << 32) | (loff_t)off; 470 return ((loff_t)frag << 32) | (loff_t)off;
429} 471}
430 472
431static inline int ceph_set_ino_cb(struct inode *inode, void *data)
432{
433 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
434 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
435 return 0;
436}
437
438/* 473/*
439 * caps helpers 474 * caps helpers
440 */ 475 */
@@ -503,15 +538,6 @@ extern void ceph_reservation_status(struct ceph_fs_client *client,
503 int *total, int *avail, int *used, 538 int *total, int *avail, int *used,
504 int *reserved, int *min); 539 int *reserved, int *min);
505 540
506static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
507{
508 return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
509}
510
511static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
512{
513 return (struct ceph_fs_client *)sb->s_fs_info;
514}
515 541
516 542
517/* 543/*
diff --git a/fs/cifs/AUTHORS b/fs/cifs/AUTHORS
index 7f7fa3c302af..ea940b1db77b 100644
--- a/fs/cifs/AUTHORS
+++ b/fs/cifs/AUTHORS
@@ -35,7 +35,7 @@ Adrian Bunk (kcalloc cleanups)
35Miklos Szeredi 35Miklos Szeredi
36Kazeon team for various fixes especially for 2.4 version. 36Kazeon team for various fixes especially for 2.4 version.
37Asser Ferno (Change Notify support) 37Asser Ferno (Change Notify support)
38Shaggy (Dave Kleikamp) for inumerable small fs suggestions and some good cleanup 38Shaggy (Dave Kleikamp) for innumerable small fs suggestions and some good cleanup
39Gunter Kukkukk (testing and suggestions for support of old servers) 39Gunter Kukkukk (testing and suggestions for support of old servers)
40Igor Mammedov (DFS support) 40Igor Mammedov (DFS support)
41Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code) 41Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code)
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index ee45648b0d1a..7cb0f7f847e4 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -3,6 +3,7 @@ config CIFS
3 depends on INET 3 depends on INET
4 select NLS 4 select NLS
5 select CRYPTO 5 select CRYPTO
6 select CRYPTO_MD4
6 select CRYPTO_MD5 7 select CRYPTO_MD5
7 select CRYPTO_HMAC 8 select CRYPTO_HMAC
8 select CRYPTO_ARC4 9 select CRYPTO_ARC4
diff --git a/fs/cifs/README b/fs/cifs/README
index fe1683590828..74ab165fc646 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -685,22 +685,6 @@ LinuxExtensionsEnabled If set to one then the client will attempt to
685 support and want to map the uid and gid fields 685 support and want to map the uid and gid fields
686 to values supplied at mount (rather than the 686 to values supplied at mount (rather than the
687 actual values, then set this to zero. (default 1) 687 actual values, then set this to zero. (default 1)
688Experimental When set to 1 used to enable certain experimental
689 features (currently enables multipage writes
690 when signing is enabled, the multipage write
691 performance enhancement was disabled when
692 signing turned on in case buffer was modified
693 just before it was sent, also this flag will
694 be used to use the new experimental directory change
695 notification code). When set to 2 enables
696 an additional experimental feature, "raw ntlmssp"
697 session establishment support (which allows
698 specifying "sec=ntlmssp" on mount). The Linux cifs
699 module will use ntlmv2 authentication encapsulated
700 in "raw ntlmssp" (not using SPNEGO) when
701 "sec=ntlmssp" is specified on mount.
702 This support also requires building cifs with
703 the CONFIG_CIFS_EXPERIMENTAL configuration flag.
704 688
705These experimental features and tracing can be enabled by changing flags in 689These experimental features and tracing can be enabled by changing flags in
706/proc/fs/cifs (after the cifs module has been installed or built into the 690/proc/fs/cifs (after the cifs module has been installed or built into the
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index e654dfd092c3..53d57a3fe427 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -50,7 +50,7 @@ void cifs_fscache_unregister(void)
50 */ 50 */
51struct cifs_server_key { 51struct cifs_server_key {
52 uint16_t family; /* address family */ 52 uint16_t family; /* address family */
53 uint16_t port; /* IP port */ 53 __be16 port; /* IP port */
54 union { 54 union {
55 struct in_addr ipv4_addr; 55 struct in_addr ipv4_addr;
56 struct in6_addr ipv6_addr; 56 struct in6_addr ipv6_addr;
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 65829d32128c..30d01bc90855 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -423,7 +423,6 @@ static const struct file_operations cifs_lookup_cache_proc_fops;
423static const struct file_operations traceSMB_proc_fops; 423static const struct file_operations traceSMB_proc_fops;
424static const struct file_operations cifs_multiuser_mount_proc_fops; 424static const struct file_operations cifs_multiuser_mount_proc_fops;
425static const struct file_operations cifs_security_flags_proc_fops; 425static const struct file_operations cifs_security_flags_proc_fops;
426static const struct file_operations cifs_experimental_proc_fops;
427static const struct file_operations cifs_linux_ext_proc_fops; 426static const struct file_operations cifs_linux_ext_proc_fops;
428 427
429void 428void
@@ -441,8 +440,6 @@ cifs_proc_init(void)
441 proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops); 440 proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops);
442 proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops); 441 proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops);
443 proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops); 442 proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops);
444 proc_create("Experimental", 0, proc_fs_cifs,
445 &cifs_experimental_proc_fops);
446 proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs, 443 proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs,
447 &cifs_linux_ext_proc_fops); 444 &cifs_linux_ext_proc_fops);
448 proc_create("MultiuserMount", 0, proc_fs_cifs, 445 proc_create("MultiuserMount", 0, proc_fs_cifs,
@@ -469,7 +466,6 @@ cifs_proc_clean(void)
469 remove_proc_entry("OplockEnabled", proc_fs_cifs); 466 remove_proc_entry("OplockEnabled", proc_fs_cifs);
470 remove_proc_entry("SecurityFlags", proc_fs_cifs); 467 remove_proc_entry("SecurityFlags", proc_fs_cifs);
471 remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); 468 remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs);
472 remove_proc_entry("Experimental", proc_fs_cifs);
473 remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); 469 remove_proc_entry("LookupCacheEnabled", proc_fs_cifs);
474 remove_proc_entry("fs/cifs", NULL); 470 remove_proc_entry("fs/cifs", NULL);
475} 471}
@@ -550,45 +546,6 @@ static const struct file_operations cifs_oplock_proc_fops = {
550 .write = cifs_oplock_proc_write, 546 .write = cifs_oplock_proc_write,
551}; 547};
552 548
553static int cifs_experimental_proc_show(struct seq_file *m, void *v)
554{
555 seq_printf(m, "%d\n", experimEnabled);
556 return 0;
557}
558
559static int cifs_experimental_proc_open(struct inode *inode, struct file *file)
560{
561 return single_open(file, cifs_experimental_proc_show, NULL);
562}
563
564static ssize_t cifs_experimental_proc_write(struct file *file,
565 const char __user *buffer, size_t count, loff_t *ppos)
566{
567 char c;
568 int rc;
569
570 rc = get_user(c, buffer);
571 if (rc)
572 return rc;
573 if (c == '0' || c == 'n' || c == 'N')
574 experimEnabled = 0;
575 else if (c == '1' || c == 'y' || c == 'Y')
576 experimEnabled = 1;
577 else if (c == '2')
578 experimEnabled = 2;
579
580 return count;
581}
582
583static const struct file_operations cifs_experimental_proc_fops = {
584 .owner = THIS_MODULE,
585 .open = cifs_experimental_proc_open,
586 .read = seq_read,
587 .llseek = seq_lseek,
588 .release = single_release,
589 .write = cifs_experimental_proc_write,
590};
591
592static int cifs_linux_ext_proc_show(struct seq_file *m, void *v) 549static int cifs_linux_ext_proc_show(struct seq_file *m, void *v)
593{ 550{
594 seq_printf(m, "%d\n", linuxExtEnabled); 551 seq_printf(m, "%d\n", linuxExtEnabled);
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index f1c68629f277..2b68ac57d97d 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -53,7 +53,7 @@ void cifs_dfs_release_automount_timer(void)
53 * 53 *
54 * Extracts sharename from full UNC. 54 * Extracts sharename from full UNC.
55 * i.e. strips from UNC trailing path that is not part of share 55 * i.e. strips from UNC trailing path that is not part of share
56 * name and fixup missing '\' in the begining of DFS node referral 56 * name and fixup missing '\' in the beginning of DFS node referral
57 * if necessary. 57 * if necessary.
58 * Returns pointer to share name on success or ERR_PTR on error. 58 * Returns pointer to share name on success or ERR_PTR on error.
59 * Caller is responsible for freeing returned string. 59 * Caller is responsible for freeing returned string.
@@ -282,8 +282,6 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
282 cFYI(1, "in %s", __func__); 282 cFYI(1, "in %s", __func__);
283 BUG_ON(IS_ROOT(mntpt)); 283 BUG_ON(IS_ROOT(mntpt));
284 284
285 xid = GetXid();
286
287 /* 285 /*
288 * The MSDFS spec states that paths in DFS referral requests and 286 * The MSDFS spec states that paths in DFS referral requests and
289 * responses must be prefixed by a single '\' character instead of 287 * responses must be prefixed by a single '\' character instead of
@@ -293,7 +291,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
293 mnt = ERR_PTR(-ENOMEM); 291 mnt = ERR_PTR(-ENOMEM);
294 full_path = build_path_from_dentry(mntpt); 292 full_path = build_path_from_dentry(mntpt);
295 if (full_path == NULL) 293 if (full_path == NULL)
296 goto free_xid; 294 goto cdda_exit;
297 295
298 cifs_sb = CIFS_SB(mntpt->d_inode->i_sb); 296 cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
299 tlink = cifs_sb_tlink(cifs_sb); 297 tlink = cifs_sb_tlink(cifs_sb);
@@ -303,9 +301,11 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
303 } 301 }
304 ses = tlink_tcon(tlink)->ses; 302 ses = tlink_tcon(tlink)->ses;
305 303
304 xid = GetXid();
306 rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls, 305 rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
307 &num_referrals, &referrals, 306 &num_referrals, &referrals,
308 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 307 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
308 FreeXid(xid);
309 309
310 cifs_put_tlink(tlink); 310 cifs_put_tlink(tlink);
311 311
@@ -338,8 +338,7 @@ success:
338 free_dfs_info_array(referrals, num_referrals); 338 free_dfs_info_array(referrals, num_referrals);
339free_full_path: 339free_full_path:
340 kfree(full_path); 340 kfree(full_path);
341free_xid: 341cdda_exit:
342 FreeXid(xid);
343 cFYI(1, "leaving %s" , __func__); 342 cFYI(1, "leaving %s" , __func__);
344 return mnt; 343 return mnt;
345} 344}
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 4dfba8283165..33d221394aca 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -113,7 +113,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
113 MAX_MECH_STR_LEN + 113 MAX_MECH_STR_LEN +
114 UID_KEY_LEN + (sizeof(uid_t) * 2) + 114 UID_KEY_LEN + (sizeof(uid_t) * 2) +
115 CREDUID_KEY_LEN + (sizeof(uid_t) * 2) + 115 CREDUID_KEY_LEN + (sizeof(uid_t) * 2) +
116 USER_KEY_LEN + strlen(sesInfo->userName) + 116 USER_KEY_LEN + strlen(sesInfo->user_name) +
117 PID_KEY_LEN + (sizeof(pid_t) * 2) + 1; 117 PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
118 118
119 spnego_key = ERR_PTR(-ENOMEM); 119 spnego_key = ERR_PTR(-ENOMEM);
@@ -153,7 +153,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
153 sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid); 153 sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
154 154
155 dp = description + strlen(description); 155 dp = description + strlen(description);
156 sprintf(dp, ";user=%s", sesInfo->userName); 156 sprintf(dp, ";user=%s", sesInfo->user_name);
157 157
158 dp = description + strlen(description); 158 dp = description + strlen(description);
159 sprintf(dp, ";pid=0x%x", current->pid); 159 sprintf(dp, ";pid=0x%x", current->pid);
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index fc0fd4fde306..23d43cde4306 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -90,7 +90,7 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
90 case UNI_COLON: 90 case UNI_COLON:
91 *target = ':'; 91 *target = ':';
92 break; 92 break;
93 case UNI_ASTERIK: 93 case UNI_ASTERISK:
94 *target = '*'; 94 *target = '*';
95 break; 95 break;
96 case UNI_QUESTION: 96 case UNI_QUESTION:
@@ -264,40 +264,40 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
264 * names are little endian 16 bit Unicode on the wire 264 * names are little endian 16 bit Unicode on the wire
265 */ 265 */
266int 266int
267cifsConvertToUCS(__le16 *target, const char *source, int maxlen, 267cifsConvertToUCS(__le16 *target, const char *source, int srclen,
268 const struct nls_table *cp, int mapChars) 268 const struct nls_table *cp, int mapChars)
269{ 269{
270 int i, j, charlen; 270 int i, j, charlen;
271 int len_remaining = maxlen;
272 char src_char; 271 char src_char;
273 __u16 temp; 272 __le16 dst_char;
273 wchar_t tmp;
274 274
275 if (!mapChars) 275 if (!mapChars)
276 return cifs_strtoUCS(target, source, PATH_MAX, cp); 276 return cifs_strtoUCS(target, source, PATH_MAX, cp);
277 277
278 for (i = 0, j = 0; i < maxlen; j++) { 278 for (i = 0, j = 0; i < srclen; j++) {
279 src_char = source[i]; 279 src_char = source[i];
280 switch (src_char) { 280 switch (src_char) {
281 case 0: 281 case 0:
282 put_unaligned_le16(0, &target[j]); 282 put_unaligned(0, &target[j]);
283 goto ctoUCS_out; 283 goto ctoUCS_out;
284 case ':': 284 case ':':
285 temp = UNI_COLON; 285 dst_char = cpu_to_le16(UNI_COLON);
286 break; 286 break;
287 case '*': 287 case '*':
288 temp = UNI_ASTERIK; 288 dst_char = cpu_to_le16(UNI_ASTERISK);
289 break; 289 break;
290 case '?': 290 case '?':
291 temp = UNI_QUESTION; 291 dst_char = cpu_to_le16(UNI_QUESTION);
292 break; 292 break;
293 case '<': 293 case '<':
294 temp = UNI_LESSTHAN; 294 dst_char = cpu_to_le16(UNI_LESSTHAN);
295 break; 295 break;
296 case '>': 296 case '>':
297 temp = UNI_GRTRTHAN; 297 dst_char = cpu_to_le16(UNI_GRTRTHAN);
298 break; 298 break;
299 case '|': 299 case '|':
300 temp = UNI_PIPE; 300 dst_char = cpu_to_le16(UNI_PIPE);
301 break; 301 break;
302 /* 302 /*
303 * FIXME: We can not handle remapping backslash (UNI_SLASH) 303 * FIXME: We can not handle remapping backslash (UNI_SLASH)
@@ -305,17 +305,17 @@ cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
305 * as they use backslash as separator. 305 * as they use backslash as separator.
306 */ 306 */
307 default: 307 default:
308 charlen = cp->char2uni(source+i, len_remaining, 308 charlen = cp->char2uni(source + i, srclen - i, &tmp);
309 &temp); 309 dst_char = cpu_to_le16(tmp);
310
310 /* 311 /*
311 * if no match, use question mark, which at least in 312 * if no match, use question mark, which at least in
312 * some cases serves as wild card 313 * some cases serves as wild card
313 */ 314 */
314 if (charlen < 1) { 315 if (charlen < 1) {
315 temp = 0x003f; 316 dst_char = cpu_to_le16(0x003f);
316 charlen = 1; 317 charlen = 1;
317 } 318 }
318 len_remaining -= charlen;
319 /* 319 /*
320 * character may take more than one byte in the source 320 * character may take more than one byte in the source
321 * string, but will take exactly two bytes in the 321 * string, but will take exactly two bytes in the
@@ -324,9 +324,8 @@ cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
324 i += charlen; 324 i += charlen;
325 continue; 325 continue;
326 } 326 }
327 put_unaligned_le16(temp, &target[j]); 327 put_unaligned(dst_char, &target[j]);
328 i++; /* move to next char in source string */ 328 i++; /* move to next char in source string */
329 len_remaining--;
330 } 329 }
331 330
332ctoUCS_out: 331ctoUCS_out:
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 7fe6b52df507..644dd882a560 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -44,7 +44,7 @@
44 * reserved symbols (along with \ and /), otherwise illegal to store 44 * reserved symbols (along with \ and /), otherwise illegal to store
45 * in filenames in NTFS 45 * in filenames in NTFS
46 */ 46 */
47#define UNI_ASTERIK (__u16) ('*' + 0xF000) 47#define UNI_ASTERISK (__u16) ('*' + 0xF000)
48#define UNI_QUESTION (__u16) ('?' + 0xF000) 48#define UNI_QUESTION (__u16) ('?' + 0xF000)
49#define UNI_COLON (__u16) (':' + 0xF000) 49#define UNI_COLON (__u16) (':' + 0xF000)
50#define UNI_GRTRTHAN (__u16) ('>' + 0xF000) 50#define UNI_GRTRTHAN (__u16) ('>' + 0xF000)
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 1e7636b145a8..beeebf194234 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -372,6 +372,10 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
372 372
373 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), 373 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
374 GFP_KERNEL); 374 GFP_KERNEL);
375 if (!ppace) {
376 cERROR(1, "DACL memory allocation error");
377 return;
378 }
375 379
376 for (i = 0; i < num_aces; ++i) { 380 for (i = 0; i < num_aces; ++i) {
377 ppace[i] = (struct cifs_ace *) (acl_base + acl_size); 381 ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 0db5f1de0227..d1a016be73ba 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -30,12 +30,13 @@
30#include <linux/ctype.h> 30#include <linux/ctype.h>
31#include <linux/random.h> 31#include <linux/random.h>
32 32
33/* Calculate and return the CIFS signature based on the mac key and SMB PDU */ 33/*
34/* the 16 byte signature must be allocated by the caller */ 34 * Calculate and return the CIFS signature based on the mac key and SMB PDU.
35/* Note we only use the 1st eight bytes */ 35 * The 16 byte signature must be allocated by the caller. Note we only use the
36/* Note that the smb header signature field on input contains the 36 * 1st eight bytes and that the smb header signature field on input contains
37 sequence number before this function is called */ 37 * the sequence number before this function is called. Also, this function
38 38 * should be called with the server->srv_mutex held.
39 */
39static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, 40static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
40 struct TCP_Server_Info *server, char *signature) 41 struct TCP_Server_Info *server, char *signature)
41{ 42{
@@ -209,8 +210,10 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
209 cpu_to_le32(expected_sequence_number); 210 cpu_to_le32(expected_sequence_number);
210 cifs_pdu->Signature.Sequence.Reserved = 0; 211 cifs_pdu->Signature.Sequence.Reserved = 0;
211 212
213 mutex_lock(&server->srv_mutex);
212 rc = cifs_calculate_signature(cifs_pdu, server, 214 rc = cifs_calculate_signature(cifs_pdu, server,
213 what_we_think_sig_should_be); 215 what_we_think_sig_should_be);
216 mutex_unlock(&server->srv_mutex);
214 217
215 if (rc) 218 if (rc)
216 return rc; 219 return rc;
@@ -469,15 +472,15 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash,
469 return rc; 472 return rc;
470 } 473 }
471 474
472 /* convert ses->userName to unicode and uppercase */ 475 /* convert ses->user_name to unicode and uppercase */
473 len = strlen(ses->userName); 476 len = strlen(ses->user_name);
474 user = kmalloc(2 + (len * 2), GFP_KERNEL); 477 user = kmalloc(2 + (len * 2), GFP_KERNEL);
475 if (user == NULL) { 478 if (user == NULL) {
476 cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n"); 479 cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
477 rc = -ENOMEM; 480 rc = -ENOMEM;
478 goto calc_exit_2; 481 goto calc_exit_2;
479 } 482 }
480 len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp); 483 len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp);
481 UniStrupr(user); 484 UniStrupr(user);
482 485
483 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, 486 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
@@ -657,9 +660,10 @@ calc_seckey(struct cifsSesInfo *ses)
657 get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE); 660 get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
658 661
659 tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); 662 tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
660 if (!tfm_arc4 || IS_ERR(tfm_arc4)) { 663 if (IS_ERR(tfm_arc4)) {
664 rc = PTR_ERR(tfm_arc4);
661 cERROR(1, "could not allocate crypto API arc4\n"); 665 cERROR(1, "could not allocate crypto API arc4\n");
662 return PTR_ERR(tfm_arc4); 666 return rc;
663 } 667 }
664 668
665 desc.tfm = tfm_arc4; 669 desc.tfm = tfm_arc4;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index f2970136d17d..5c412b33cd7c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -53,7 +53,6 @@ int cifsFYI = 0;
53int cifsERROR = 1; 53int cifsERROR = 1;
54int traceSMB = 0; 54int traceSMB = 0;
55unsigned int oplockEnabled = 1; 55unsigned int oplockEnabled = 1;
56unsigned int experimEnabled = 0;
57unsigned int linuxExtEnabled = 1; 56unsigned int linuxExtEnabled = 1;
58unsigned int lookupCacheEnabled = 1; 57unsigned int lookupCacheEnabled = 1;
59unsigned int multiuser_mount = 0; 58unsigned int multiuser_mount = 0;
@@ -127,6 +126,7 @@ cifs_read_super(struct super_block *sb, void *data,
127 kfree(cifs_sb); 126 kfree(cifs_sb);
128 return rc; 127 return rc;
129 } 128 }
129 cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages;
130 130
131#ifdef CONFIG_CIFS_DFS_UPCALL 131#ifdef CONFIG_CIFS_DFS_UPCALL
132 /* copy mount params to sb for use in submounts */ 132 /* copy mount params to sb for use in submounts */
@@ -409,8 +409,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
409 409
410 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) 410 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
411 seq_printf(s, ",multiuser"); 411 seq_printf(s, ",multiuser");
412 else if (tcon->ses->userName) 412 else if (tcon->ses->user_name)
413 seq_printf(s, ",username=%s", tcon->ses->userName); 413 seq_printf(s, ",username=%s", tcon->ses->user_name);
414 414
415 if (tcon->ses->domainName) 415 if (tcon->ses->domainName)
416 seq_printf(s, ",domain=%s", tcon->ses->domainName); 416 seq_printf(s, ",domain=%s", tcon->ses->domainName);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 14789a97304e..a9371b6578c0 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -127,5 +127,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
127extern const struct export_operations cifs_export_ops; 127extern const struct export_operations cifs_export_ops;
128#endif /* EXPERIMENTAL */ 128#endif /* EXPERIMENTAL */
129 129
130#define CIFS_VERSION "1.69" 130#define CIFS_VERSION "1.71"
131#endif /* _CIFSFS_H */ 131#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index edd5b29b53c9..a5d1106fcbde 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -37,10 +37,9 @@
37 37
38#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1) 38#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1)
39#define MAX_SERVER_SIZE 15 39#define MAX_SERVER_SIZE 15
40#define MAX_SHARE_SIZE 64 /* used to be 20, this should still be enough */ 40#define MAX_SHARE_SIZE 80
41#define MAX_USERNAME_SIZE 32 /* 32 is to allow for 15 char names + null 41#define MAX_USERNAME_SIZE 256 /* reasonable maximum for current servers */
42 termination then *2 for unicode versions */ 42#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */
43#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */
44 43
45#define CIFS_MIN_RCV_POOL 4 44#define CIFS_MIN_RCV_POOL 4
46 45
@@ -92,7 +91,8 @@ enum statusEnum {
92 CifsNew = 0, 91 CifsNew = 0,
93 CifsGood, 92 CifsGood,
94 CifsExiting, 93 CifsExiting,
95 CifsNeedReconnect 94 CifsNeedReconnect,
95 CifsNeedNegotiate
96}; 96};
97 97
98enum securityEnum { 98enum securityEnum {
@@ -188,6 +188,8 @@ struct TCP_Server_Info {
188 /* multiplexed reads or writes */ 188 /* multiplexed reads or writes */
189 unsigned int maxBuf; /* maxBuf specifies the maximum */ 189 unsigned int maxBuf; /* maxBuf specifies the maximum */
190 /* message size the server can send or receive for non-raw SMBs */ 190 /* message size the server can send or receive for non-raw SMBs */
191 /* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */
192 /* when socket is setup (and during reconnect) before NegProt sent */
191 unsigned int max_rw; /* maxRw specifies the maximum */ 193 unsigned int max_rw; /* maxRw specifies the maximum */
192 /* message size the server can send or receive for */ 194 /* message size the server can send or receive for */
193 /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */ 195 /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
@@ -272,7 +274,7 @@ struct cifsSesInfo {
272 int capabilities; 274 int capabilities;
273 char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for 275 char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for
274 TCP names - will ipv6 and sctp addresses fit? */ 276 TCP names - will ipv6 and sctp addresses fit? */
275 char userName[MAX_USERNAME_SIZE + 1]; 277 char *user_name;
276 char *domainName; 278 char *domainName;
277 char *password; 279 char *password;
278 struct session_key auth_key; 280 struct session_key auth_key;
@@ -652,7 +654,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
652#define MID_REQUEST_SUBMITTED 2 654#define MID_REQUEST_SUBMITTED 2
653#define MID_RESPONSE_RECEIVED 4 655#define MID_RESPONSE_RECEIVED 4
654#define MID_RETRY_NEEDED 8 /* session closed while this request out */ 656#define MID_RETRY_NEEDED 8 /* session closed while this request out */
655#define MID_NO_RESP_NEEDED 0x10 657#define MID_RESPONSE_MALFORMED 0x10
656 658
657/* Types of response buffer returned from SendReceive2 */ 659/* Types of response buffer returned from SendReceive2 */
658#define CIFS_NO_BUFFER 0 /* Response buffer not returned */ 660#define CIFS_NO_BUFFER 0 /* Response buffer not returned */
@@ -815,7 +817,6 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
815 have the uid/password or Kerberos credential 817 have the uid/password or Kerberos credential
816 or equivalent for current user */ 818 or equivalent for current user */
817GLOBAL_EXTERN unsigned int oplockEnabled; 819GLOBAL_EXTERN unsigned int oplockEnabled;
818GLOBAL_EXTERN unsigned int experimEnabled;
819GLOBAL_EXTERN unsigned int lookupCacheEnabled; 820GLOBAL_EXTERN unsigned int lookupCacheEnabled;
820GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent 821GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
821 with more secure ntlmssp2 challenge/resp */ 822 with more secure ntlmssp2 challenge/resp */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 3106f5e5c633..df959bae6728 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -136,18 +136,15 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
136 } 136 }
137 } 137 }
138 138
139 if (ses->status == CifsExiting)
140 return -EIO;
141
142 /* 139 /*
143 * Give demultiplex thread up to 10 seconds to reconnect, should be 140 * Give demultiplex thread up to 10 seconds to reconnect, should be
144 * greater than cifs socket timeout which is 7 seconds 141 * greater than cifs socket timeout which is 7 seconds
145 */ 142 */
146 while (server->tcpStatus == CifsNeedReconnect) { 143 while (server->tcpStatus == CifsNeedReconnect) {
147 wait_event_interruptible_timeout(server->response_q, 144 wait_event_interruptible_timeout(server->response_q,
148 (server->tcpStatus == CifsGood), 10 * HZ); 145 (server->tcpStatus != CifsNeedReconnect), 10 * HZ);
149 146
150 /* is TCP session is reestablished now ?*/ 147 /* are we still trying to reconnect? */
151 if (server->tcpStatus != CifsNeedReconnect) 148 if (server->tcpStatus != CifsNeedReconnect)
152 break; 149 break;
153 150
@@ -156,7 +153,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
156 * retrying until process is killed or server comes 153 * retrying until process is killed or server comes
157 * back on-line 154 * back on-line
158 */ 155 */
159 if (!tcon->retry || ses->status == CifsExiting) { 156 if (!tcon->retry) {
160 cFYI(1, "gave up waiting on reconnect in smb_init"); 157 cFYI(1, "gave up waiting on reconnect in smb_init");
161 return -EHOSTDOWN; 158 return -EHOSTDOWN;
162 } 159 }
@@ -732,7 +729,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
732 return rc; 729 return rc;
733 730
734 /* set up echo request */ 731 /* set up echo request */
735 smb->hdr.Tid = cpu_to_le16(0xffff); 732 smb->hdr.Tid = 0xffff;
736 smb->hdr.WordCount = 1; 733 smb->hdr.WordCount = 1;
737 put_unaligned_le16(1, &smb->EchoCount); 734 put_unaligned_le16(1, &smb->EchoCount);
738 put_bcc_le(1, &smb->hdr); 735 put_bcc_le(1, &smb->hdr);
@@ -1887,10 +1884,10 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1887 __constant_cpu_to_le16(CIFS_WRLCK)) 1884 __constant_cpu_to_le16(CIFS_WRLCK))
1888 pLockData->fl_type = F_WRLCK; 1885 pLockData->fl_type = F_WRLCK;
1889 1886
1890 pLockData->fl_start = parm_data->start; 1887 pLockData->fl_start = le64_to_cpu(parm_data->start);
1891 pLockData->fl_end = parm_data->start + 1888 pLockData->fl_end = pLockData->fl_start +
1892 parm_data->length - 1; 1889 le64_to_cpu(parm_data->length) - 1;
1893 pLockData->fl_pid = parm_data->pid; 1890 pLockData->fl_pid = le32_to_cpu(parm_data->pid);
1894 } 1891 }
1895 } 1892 }
1896 1893
@@ -4914,7 +4911,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4914 __u16 fid, __u32 pid_of_opener, bool SetAllocation) 4911 __u16 fid, __u32 pid_of_opener, bool SetAllocation)
4915{ 4912{
4916 struct smb_com_transaction2_sfi_req *pSMB = NULL; 4913 struct smb_com_transaction2_sfi_req *pSMB = NULL;
4917 char *data_offset;
4918 struct file_end_of_file_info *parm_data; 4914 struct file_end_of_file_info *parm_data;
4919 int rc = 0; 4915 int rc = 0;
4920 __u16 params, param_offset, offset, byte_count, count; 4916 __u16 params, param_offset, offset, byte_count, count;
@@ -4938,8 +4934,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4938 param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; 4934 param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
4939 offset = param_offset + params; 4935 offset = param_offset + params;
4940 4936
4941 data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
4942
4943 count = sizeof(struct file_end_of_file_info); 4937 count = sizeof(struct file_end_of_file_info);
4944 pSMB->MaxParameterCount = cpu_to_le16(2); 4938 pSMB->MaxParameterCount = cpu_to_le16(2);
4945 /* BB find exact max SMB PDU from sess structure BB */ 4939 /* BB find exact max SMB PDU from sess structure BB */
@@ -5253,7 +5247,7 @@ cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
5253 * Samba server ignores set of file size to zero due to bugs in some 5247 * Samba server ignores set of file size to zero due to bugs in some
5254 * older clients, but we should be precise - we use SetFileSize to 5248 * older clients, but we should be precise - we use SetFileSize to
5255 * set file size and do not want to truncate file size to zero 5249 * set file size and do not want to truncate file size to zero
5256 * accidently as happened on one Samba server beta by putting 5250 * accidentally as happened on one Samba server beta by putting
5257 * zero instead of -1 here 5251 * zero instead of -1 here
5258 */ 5252 */
5259 data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64); 5253 data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 47d8ff623683..db9d55b507d0 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -199,8 +199,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
199 } 199 }
200 spin_unlock(&GlobalMid_Lock); 200 spin_unlock(&GlobalMid_Lock);
201 201
202 while ((server->tcpStatus != CifsExiting) && 202 while (server->tcpStatus == CifsNeedReconnect) {
203 (server->tcpStatus != CifsGood)) {
204 try_to_freeze(); 203 try_to_freeze();
205 204
206 /* we should try only the port we connected to before */ 205 /* we should try only the port we connected to before */
@@ -212,7 +211,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
212 atomic_inc(&tcpSesReconnectCount); 211 atomic_inc(&tcpSesReconnectCount);
213 spin_lock(&GlobalMid_Lock); 212 spin_lock(&GlobalMid_Lock);
214 if (server->tcpStatus != CifsExiting) 213 if (server->tcpStatus != CifsExiting)
215 server->tcpStatus = CifsGood; 214 server->tcpStatus = CifsNeedNegotiate;
216 spin_unlock(&GlobalMid_Lock); 215 spin_unlock(&GlobalMid_Lock);
217 } 216 }
218 } 217 }
@@ -248,24 +247,24 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
248 total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); 247 total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
249 data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); 248 data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
250 249
251 remaining = total_data_size - data_in_this_rsp; 250 if (total_data_size == data_in_this_rsp)
252
253 if (remaining == 0)
254 return 0; 251 return 0;
255 else if (remaining < 0) { 252 else if (total_data_size < data_in_this_rsp) {
256 cFYI(1, "total data %d smaller than data in frame %d", 253 cFYI(1, "total data %d smaller than data in frame %d",
257 total_data_size, data_in_this_rsp); 254 total_data_size, data_in_this_rsp);
258 return -EINVAL; 255 return -EINVAL;
259 } else {
260 cFYI(1, "missing %d bytes from transact2, check next response",
261 remaining);
262 if (total_data_size > maxBufSize) {
263 cERROR(1, "TotalDataSize %d is over maximum buffer %d",
264 total_data_size, maxBufSize);
265 return -EINVAL;
266 }
267 return remaining;
268 } 256 }
257
258 remaining = total_data_size - data_in_this_rsp;
259
260 cFYI(1, "missing %d bytes from transact2, check next response",
261 remaining);
262 if (total_data_size > maxBufSize) {
263 cERROR(1, "TotalDataSize %d is over maximum buffer %d",
264 total_data_size, maxBufSize);
265 return -EINVAL;
266 }
267 return remaining;
269} 268}
270 269
271static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) 270static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
@@ -337,8 +336,13 @@ cifs_echo_request(struct work_struct *work)
337 struct TCP_Server_Info *server = container_of(work, 336 struct TCP_Server_Info *server = container_of(work,
338 struct TCP_Server_Info, echo.work); 337 struct TCP_Server_Info, echo.work);
339 338
340 /* no need to ping if we got a response recently */ 339 /*
341 if (time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) 340 * We cannot send an echo until the NEGOTIATE_PROTOCOL request is
341 * done, which is indicated by maxBuf != 0. Also, no need to ping if
342 * we got a response recently
343 */
344 if (server->maxBuf == 0 ||
345 time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
342 goto requeue_echo; 346 goto requeue_echo;
343 347
344 rc = CIFSSMBEcho(server); 348 rc = CIFSSMBEcho(server);
@@ -416,7 +420,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
416 pdu_length = 4; /* enough to get RFC1001 header */ 420 pdu_length = 4; /* enough to get RFC1001 header */
417 421
418incomplete_rcv: 422incomplete_rcv:
419 if (echo_retries > 0 && 423 if (echo_retries > 0 && server->tcpStatus == CifsGood &&
420 time_after(jiffies, server->lstrp + 424 time_after(jiffies, server->lstrp +
421 (echo_retries * SMB_ECHO_INTERVAL))) { 425 (echo_retries * SMB_ECHO_INTERVAL))) {
422 cERROR(1, "Server %s has not responded in %d seconds. " 426 cERROR(1, "Server %s has not responded in %d seconds. "
@@ -578,14 +582,23 @@ incomplete_rcv:
578 else if (reconnect == 1) 582 else if (reconnect == 1)
579 continue; 583 continue;
580 584
581 length += 4; /* account for rfc1002 hdr */ 585 total_read += 4; /* account for rfc1002 hdr */
582 586
587 dump_smb(smb_buffer, total_read);
583 588
584 dump_smb(smb_buffer, length); 589 /*
585 if (checkSMB(smb_buffer, smb_buffer->Mid, total_read+4)) { 590 * We know that we received enough to get to the MID as we
586 cifs_dump_mem("Bad SMB: ", smb_buffer, 48); 591 * checked the pdu_length earlier. Now check to see
587 continue; 592 * if the rest of the header is OK. We borrow the length
588 } 593 * var for the rest of the loop to avoid a new stack var.
594 *
595 * 48 bytes is enough to display the header and a little bit
596 * into the payload for debugging purposes.
597 */
598 length = checkSMB(smb_buffer, smb_buffer->Mid, total_read);
599 if (length != 0)
600 cifs_dump_mem("Bad SMB: ", smb_buffer,
601 min_t(unsigned int, total_read, 48));
589 602
590 mid_entry = NULL; 603 mid_entry = NULL;
591 server->lstrp = jiffies; 604 server->lstrp = jiffies;
@@ -597,7 +610,8 @@ incomplete_rcv:
597 if ((mid_entry->mid == smb_buffer->Mid) && 610 if ((mid_entry->mid == smb_buffer->Mid) &&
598 (mid_entry->midState == MID_REQUEST_SUBMITTED) && 611 (mid_entry->midState == MID_REQUEST_SUBMITTED) &&
599 (mid_entry->command == smb_buffer->Command)) { 612 (mid_entry->command == smb_buffer->Command)) {
600 if (check2ndT2(smb_buffer,server->maxBuf) > 0) { 613 if (length == 0 &&
614 check2ndT2(smb_buffer, server->maxBuf) > 0) {
601 /* We have a multipart transact2 resp */ 615 /* We have a multipart transact2 resp */
602 isMultiRsp = true; 616 isMultiRsp = true;
603 if (mid_entry->resp_buf) { 617 if (mid_entry->resp_buf) {
@@ -632,12 +646,17 @@ incomplete_rcv:
632 mid_entry->resp_buf = smb_buffer; 646 mid_entry->resp_buf = smb_buffer;
633 mid_entry->largeBuf = isLargeBuf; 647 mid_entry->largeBuf = isLargeBuf;
634multi_t2_fnd: 648multi_t2_fnd:
635 mid_entry->midState = MID_RESPONSE_RECEIVED; 649 if (length == 0)
636 list_del_init(&mid_entry->qhead); 650 mid_entry->midState =
637 mid_entry->callback(mid_entry); 651 MID_RESPONSE_RECEIVED;
652 else
653 mid_entry->midState =
654 MID_RESPONSE_MALFORMED;
638#ifdef CONFIG_CIFS_STATS2 655#ifdef CONFIG_CIFS_STATS2
639 mid_entry->when_received = jiffies; 656 mid_entry->when_received = jiffies;
640#endif 657#endif
658 list_del_init(&mid_entry->qhead);
659 mid_entry->callback(mid_entry);
641 break; 660 break;
642 } 661 }
643 mid_entry = NULL; 662 mid_entry = NULL;
@@ -653,6 +672,9 @@ multi_t2_fnd:
653 else 672 else
654 smallbuf = NULL; 673 smallbuf = NULL;
655 } 674 }
675 } else if (length != 0) {
676 /* response sanity checks failed */
677 continue;
656 } else if (!is_valid_oplock_break(smb_buffer, server) && 678 } else if (!is_valid_oplock_break(smb_buffer, server) &&
657 !isMultiRsp) { 679 !isMultiRsp) {
658 cERROR(1, "No task to wake, unknown frame received! " 680 cERROR(1, "No task to wake, unknown frame received! "
@@ -858,7 +880,8 @@ cifs_parse_mount_options(char *options, const char *devname,
858 /* null user, ie anonymous, authentication */ 880 /* null user, ie anonymous, authentication */
859 vol->nullauth = 1; 881 vol->nullauth = 1;
860 } 882 }
861 if (strnlen(value, 200) < 200) { 883 if (strnlen(value, MAX_USERNAME_SIZE) <
884 MAX_USERNAME_SIZE) {
862 vol->username = value; 885 vol->username = value;
863 } else { 886 } else {
864 printk(KERN_WARNING "CIFS: username too long\n"); 887 printk(KERN_WARNING "CIFS: username too long\n");
@@ -1449,7 +1472,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
1449static bool 1472static bool
1450match_port(struct TCP_Server_Info *server, struct sockaddr *addr) 1473match_port(struct TCP_Server_Info *server, struct sockaddr *addr)
1451{ 1474{
1452 unsigned short int port, *sport; 1475 __be16 port, *sport;
1453 1476
1454 switch (addr->sa_family) { 1477 switch (addr->sa_family) {
1455 case AF_INET: 1478 case AF_INET:
@@ -1549,7 +1572,7 @@ match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
1549 return false; 1572 return false;
1550 } 1573 }
1551 1574
1552 /* now check if signing mode is acceptible */ 1575 /* now check if signing mode is acceptable */
1553 if ((secFlags & CIFSSEC_MAY_SIGN) == 0 && 1576 if ((secFlags & CIFSSEC_MAY_SIGN) == 0 &&
1554 (server->secMode & SECMODE_SIGN_REQUIRED)) 1577 (server->secMode & SECMODE_SIGN_REQUIRED))
1555 return false; 1578 return false;
@@ -1742,6 +1765,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1742 module_put(THIS_MODULE); 1765 module_put(THIS_MODULE);
1743 goto out_err_crypto_release; 1766 goto out_err_crypto_release;
1744 } 1767 }
1768 tcp_ses->tcpStatus = CifsNeedNegotiate;
1745 1769
1746 /* thread spawned, put it on the list */ 1770 /* thread spawned, put it on the list */
1747 spin_lock(&cifs_tcp_ses_lock); 1771 spin_lock(&cifs_tcp_ses_lock);
@@ -1785,7 +1809,9 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
1785 break; 1809 break;
1786 default: 1810 default:
1787 /* anything else takes username/password */ 1811 /* anything else takes username/password */
1788 if (strncmp(ses->userName, vol->username, 1812 if (ses->user_name == NULL)
1813 continue;
1814 if (strncmp(ses->user_name, vol->username,
1789 MAX_USERNAME_SIZE)) 1815 MAX_USERNAME_SIZE))
1790 continue; 1816 continue;
1791 if (strlen(vol->username) != 0 && 1817 if (strlen(vol->username) != 0 &&
@@ -1828,6 +1854,8 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
1828 cifs_put_tcp_session(server); 1854 cifs_put_tcp_session(server);
1829} 1855}
1830 1856
1857static bool warned_on_ntlm; /* globals init to false automatically */
1858
1831static struct cifsSesInfo * 1859static struct cifsSesInfo *
1832cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) 1860cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1833{ 1861{
@@ -1883,9 +1911,11 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1883 else 1911 else
1884 sprintf(ses->serverName, "%pI4", &addr->sin_addr); 1912 sprintf(ses->serverName, "%pI4", &addr->sin_addr);
1885 1913
1886 if (volume_info->username) 1914 if (volume_info->username) {
1887 strncpy(ses->userName, volume_info->username, 1915 ses->user_name = kstrdup(volume_info->username, GFP_KERNEL);
1888 MAX_USERNAME_SIZE); 1916 if (!ses->user_name)
1917 goto get_ses_fail;
1918 }
1889 1919
1890 /* volume_info->password freed at unmount */ 1920 /* volume_info->password freed at unmount */
1891 if (volume_info->password) { 1921 if (volume_info->password) {
@@ -1900,6 +1930,15 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1900 } 1930 }
1901 ses->cred_uid = volume_info->cred_uid; 1931 ses->cred_uid = volume_info->cred_uid;
1902 ses->linux_uid = volume_info->linux_uid; 1932 ses->linux_uid = volume_info->linux_uid;
1933
1934 /* ntlmv2 is much stronger than ntlm security, and has been broadly
1935 supported for many years, time to update default security mechanism */
1936 if ((volume_info->secFlg == 0) && warned_on_ntlm == false) {
1937 warned_on_ntlm = true;
1938 cERROR(1, "default security mechanism requested. The default "
1939 "security mechanism will be upgraded from ntlm to "
1940 "ntlmv2 in kernel release 2.6.41");
1941 }
1903 ses->overrideSecFlg = volume_info->secFlg; 1942 ses->overrideSecFlg = volume_info->secFlg;
1904 1943
1905 mutex_lock(&ses->session_mutex); 1944 mutex_lock(&ses->session_mutex);
@@ -2253,7 +2292,7 @@ static int
2253generic_ip_connect(struct TCP_Server_Info *server) 2292generic_ip_connect(struct TCP_Server_Info *server)
2254{ 2293{
2255 int rc = 0; 2294 int rc = 0;
2256 unsigned short int sport; 2295 __be16 sport;
2257 int slen, sfamily; 2296 int slen, sfamily;
2258 struct socket *socket = server->ssocket; 2297 struct socket *socket = server->ssocket;
2259 struct sockaddr *saddr; 2298 struct sockaddr *saddr;
@@ -2338,7 +2377,7 @@ generic_ip_connect(struct TCP_Server_Info *server)
2338static int 2377static int
2339ip_connect(struct TCP_Server_Info *server) 2378ip_connect(struct TCP_Server_Info *server)
2340{ 2379{
2341 unsigned short int *sport; 2380 __be16 *sport;
2342 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; 2381 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
2343 struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; 2382 struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
2344 2383
@@ -2803,7 +2842,7 @@ try_mount_again:
2803 2842
2804remote_path_check: 2843remote_path_check:
2805 /* check if a whole path (including prepath) is not remote */ 2844 /* check if a whole path (including prepath) is not remote */
2806 if (!rc && cifs_sb->prepathlen && tcon) { 2845 if (!rc && tcon) {
2807 /* build_path_to_root works only when we have a valid tcon */ 2846 /* build_path_to_root works only when we have a valid tcon */
2808 full_path = cifs_build_path_to_root(cifs_sb, tcon); 2847 full_path = cifs_build_path_to_root(cifs_sb, tcon);
2809 if (full_path == NULL) { 2848 if (full_path == NULL) {
@@ -2910,7 +2949,7 @@ mount_fail_check:
2910 if (mount_data != mount_data_global) 2949 if (mount_data != mount_data_global)
2911 kfree(mount_data); 2950 kfree(mount_data);
2912 /* If find_unc succeeded then rc == 0 so we can not end */ 2951 /* If find_unc succeeded then rc == 0 so we can not end */
2913 /* up accidently freeing someone elses tcon struct */ 2952 /* up accidentally freeing someone elses tcon struct */
2914 if (tcon) 2953 if (tcon)
2915 cifs_put_tcon(tcon); 2954 cifs_put_tcon(tcon);
2916 else if (pSesInfo) 2955 else if (pSesInfo)
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index dd5f22918c33..9ea65cf36714 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -189,7 +189,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
189 inode->i_sb, mode, oflags, &oplock, &fileHandle, xid); 189 inode->i_sb, mode, oflags, &oplock, &fileHandle, xid);
190 /* EIO could indicate that (posix open) operation is not 190 /* EIO could indicate that (posix open) operation is not
191 supported, despite what server claimed in capability 191 supported, despite what server claimed in capability
192 negotation. EREMOTE indicates DFS junction, which is not 192 negotiation. EREMOTE indicates DFS junction, which is not
193 handled in posix open */ 193 handled in posix open */
194 194
195 if (rc == 0) { 195 if (rc == 0) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0de17c1db608..faf59529e847 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -346,7 +346,6 @@ int cifs_open(struct inode *inode, struct file *file)
346 struct cifsTconInfo *tcon; 346 struct cifsTconInfo *tcon;
347 struct tcon_link *tlink; 347 struct tcon_link *tlink;
348 struct cifsFileInfo *pCifsFile = NULL; 348 struct cifsFileInfo *pCifsFile = NULL;
349 struct cifsInodeInfo *pCifsInode;
350 char *full_path = NULL; 349 char *full_path = NULL;
351 bool posix_open_ok = false; 350 bool posix_open_ok = false;
352 __u16 netfid; 351 __u16 netfid;
@@ -361,8 +360,6 @@ int cifs_open(struct inode *inode, struct file *file)
361 } 360 }
362 tcon = tlink_tcon(tlink); 361 tcon = tlink_tcon(tlink);
363 362
364 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
365
366 full_path = build_path_from_dentry(file->f_path.dentry); 363 full_path = build_path_from_dentry(file->f_path.dentry);
367 if (full_path == NULL) { 364 if (full_path == NULL) {
368 rc = -ENOMEM; 365 rc = -ENOMEM;
@@ -578,8 +575,10 @@ reopen_error_exit:
578 575
579int cifs_close(struct inode *inode, struct file *file) 576int cifs_close(struct inode *inode, struct file *file)
580{ 577{
581 cifsFileInfo_put(file->private_data); 578 if (file->private_data != NULL) {
582 file->private_data = NULL; 579 cifsFileInfo_put(file->private_data);
580 file->private_data = NULL;
581 }
583 582
584 /* return code from the ->release op is always ignored */ 583 /* return code from the ->release op is always ignored */
585 return 0; 584 return 0;
@@ -973,6 +972,9 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
973 total_written += bytes_written) { 972 total_written += bytes_written) {
974 rc = -EAGAIN; 973 rc = -EAGAIN;
975 while (rc == -EAGAIN) { 974 while (rc == -EAGAIN) {
975 struct kvec iov[2];
976 unsigned int len;
977
976 if (open_file->invalidHandle) { 978 if (open_file->invalidHandle) {
977 /* we could deadlock if we called 979 /* we could deadlock if we called
978 filemap_fdatawait from here so tell 980 filemap_fdatawait from here so tell
@@ -982,31 +984,14 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
982 if (rc != 0) 984 if (rc != 0)
983 break; 985 break;
984 } 986 }
985 if (experimEnabled || (pTcon->ses->server && 987
986 ((pTcon->ses->server->secMode & 988 len = min((size_t)cifs_sb->wsize,
987 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 989 write_size - total_written);
988 == 0))) { 990 /* iov[0] is reserved for smb header */
989 struct kvec iov[2]; 991 iov[1].iov_base = (char *)write_data + total_written;
990 unsigned int len; 992 iov[1].iov_len = len;
991 993 rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid, len,
992 len = min((size_t)cifs_sb->wsize, 994 *poffset, &bytes_written, iov, 1, 0);
993 write_size - total_written);
994 /* iov[0] is reserved for smb header */
995 iov[1].iov_base = (char *)write_data +
996 total_written;
997 iov[1].iov_len = len;
998 rc = CIFSSMBWrite2(xid, pTcon,
999 open_file->netfid, len,
1000 *poffset, &bytes_written,
1001 iov, 1, 0);
1002 } else
1003 rc = CIFSSMBWrite(xid, pTcon,
1004 open_file->netfid,
1005 min_t(const int, cifs_sb->wsize,
1006 write_size - total_written),
1007 *poffset, &bytes_written,
1008 write_data + total_written,
1009 NULL, 0);
1010 } 995 }
1011 if (rc || (bytes_written == 0)) { 996 if (rc || (bytes_written == 0)) {
1012 if (total_written) 997 if (total_written)
@@ -1146,7 +1131,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1146 char *write_data; 1131 char *write_data;
1147 int rc = -EFAULT; 1132 int rc = -EFAULT;
1148 int bytes_written = 0; 1133 int bytes_written = 0;
1149 struct cifs_sb_info *cifs_sb;
1150 struct inode *inode; 1134 struct inode *inode;
1151 struct cifsFileInfo *open_file; 1135 struct cifsFileInfo *open_file;
1152 1136
@@ -1154,7 +1138,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1154 return -EFAULT; 1138 return -EFAULT;
1155 1139
1156 inode = page->mapping->host; 1140 inode = page->mapping->host;
1157 cifs_sb = CIFS_SB(inode->i_sb);
1158 1141
1159 offset += (loff_t)from; 1142 offset += (loff_t)from;
1160 write_data = kmap(page); 1143 write_data = kmap(page);
@@ -1245,12 +1228,6 @@ static int cifs_writepages(struct address_space *mapping,
1245 } 1228 }
1246 1229
1247 tcon = tlink_tcon(open_file->tlink); 1230 tcon = tlink_tcon(open_file->tlink);
1248 if (!experimEnabled && tcon->ses->server->secMode &
1249 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
1250 cifsFileInfo_put(open_file);
1251 kfree(iov);
1252 return generic_writepages(mapping, wbc);
1253 }
1254 cifsFileInfo_put(open_file); 1231 cifsFileInfo_put(open_file);
1255 1232
1256 xid = GetXid(); 1233 xid = GetXid();
@@ -1574,34 +1551,6 @@ int cifs_fsync(struct file *file, int datasync)
1574 return rc; 1551 return rc;
1575} 1552}
1576 1553
1577/* static void cifs_sync_page(struct page *page)
1578{
1579 struct address_space *mapping;
1580 struct inode *inode;
1581 unsigned long index = page->index;
1582 unsigned int rpages = 0;
1583 int rc = 0;
1584
1585 cFYI(1, "sync page %p", page);
1586 mapping = page->mapping;
1587 if (!mapping)
1588 return 0;
1589 inode = mapping->host;
1590 if (!inode)
1591 return; */
1592
1593/* fill in rpages then
1594 result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
1595
1596/* cFYI(1, "rpages is %d for sync page of Index %ld", rpages, index);
1597
1598#if 0
1599 if (rc < 0)
1600 return rc;
1601 return 0;
1602#endif
1603} */
1604
1605/* 1554/*
1606 * As file closes, flush all cached write data for this inode checking 1555 * As file closes, flush all cached write data for this inode checking
1607 * for write behind errors. 1556 * for write behind errors.
@@ -1667,9 +1616,10 @@ static ssize_t
1667cifs_iovec_write(struct file *file, const struct iovec *iov, 1616cifs_iovec_write(struct file *file, const struct iovec *iov,
1668 unsigned long nr_segs, loff_t *poffset) 1617 unsigned long nr_segs, loff_t *poffset)
1669{ 1618{
1670 size_t total_written = 0, written = 0; 1619 unsigned int written;
1671 unsigned long num_pages, npages; 1620 unsigned long num_pages, npages, i;
1672 size_t copied, len, cur_len, i; 1621 size_t copied, len, cur_len;
1622 ssize_t total_written = 0;
1673 struct kvec *to_send; 1623 struct kvec *to_send;
1674 struct page **pages; 1624 struct page **pages;
1675 struct iov_iter it; 1625 struct iov_iter it;
@@ -1825,7 +1775,8 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
1825{ 1775{
1826 int rc; 1776 int rc;
1827 int xid; 1777 int xid;
1828 unsigned int total_read, bytes_read = 0; 1778 ssize_t total_read;
1779 unsigned int bytes_read = 0;
1829 size_t len, cur_len; 1780 size_t len, cur_len;
1830 int iov_offset = 0; 1781 int iov_offset = 0;
1831 struct cifs_sb_info *cifs_sb; 1782 struct cifs_sb_info *cifs_sb;
@@ -2011,6 +1962,24 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
2011 return total_read; 1962 return total_read;
2012} 1963}
2013 1964
1965/*
1966 * If the page is mmap'ed into a process' page tables, then we need to make
1967 * sure that it doesn't change while being written back.
1968 */
1969static int
1970cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1971{
1972 struct page *page = vmf->page;
1973
1974 lock_page(page);
1975 return VM_FAULT_LOCKED;
1976}
1977
1978static struct vm_operations_struct cifs_file_vm_ops = {
1979 .fault = filemap_fault,
1980 .page_mkwrite = cifs_page_mkwrite,
1981};
1982
2014int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) 1983int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
2015{ 1984{
2016 int rc, xid; 1985 int rc, xid;
@@ -2022,6 +1991,8 @@ int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
2022 cifs_invalidate_mapping(inode); 1991 cifs_invalidate_mapping(inode);
2023 1992
2024 rc = generic_file_mmap(file, vma); 1993 rc = generic_file_mmap(file, vma);
1994 if (rc == 0)
1995 vma->vm_ops = &cifs_file_vm_ops;
2025 FreeXid(xid); 1996 FreeXid(xid);
2026 return rc; 1997 return rc;
2027} 1998}
@@ -2038,6 +2009,8 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
2038 return rc; 2009 return rc;
2039 } 2010 }
2040 rc = generic_file_mmap(file, vma); 2011 rc = generic_file_mmap(file, vma);
2012 if (rc == 0)
2013 vma->vm_ops = &cifs_file_vm_ops;
2041 FreeXid(xid); 2014 FreeXid(xid);
2042 return rc; 2015 return rc;
2043} 2016}
@@ -2513,7 +2486,6 @@ const struct address_space_operations cifs_addr_ops = {
2513 .set_page_dirty = __set_page_dirty_nobuffers, 2486 .set_page_dirty = __set_page_dirty_nobuffers,
2514 .releasepage = cifs_release_page, 2487 .releasepage = cifs_release_page,
2515 .invalidatepage = cifs_invalidate_page, 2488 .invalidatepage = cifs_invalidate_page,
2516 /* .sync_page = cifs_sync_page, */
2517 /* .direct_IO = */ 2489 /* .direct_IO = */
2518}; 2490};
2519 2491
@@ -2531,6 +2503,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
2531 .set_page_dirty = __set_page_dirty_nobuffers, 2503 .set_page_dirty = __set_page_dirty_nobuffers,
2532 .releasepage = cifs_release_page, 2504 .releasepage = cifs_release_page,
2533 .invalidatepage = cifs_invalidate_page, 2505 .invalidatepage = cifs_invalidate_page,
2534 /* .sync_page = cifs_sync_page, */
2535 /* .direct_IO = */ 2506 /* .direct_IO = */
2536}; 2507};
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 02cd60aefbff..ce417a9764a3 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -55,8 +55,9 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
55 55
56 md5 = crypto_alloc_shash("md5", 0, 0); 56 md5 = crypto_alloc_shash("md5", 0, 0);
57 if (IS_ERR(md5)) { 57 if (IS_ERR(md5)) {
58 rc = PTR_ERR(md5);
58 cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc); 59 cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc);
59 return PTR_ERR(md5); 60 return rc;
60 } 61 }
61 size = sizeof(struct shash_desc) + crypto_shash_descsize(md5); 62 size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
62 sdescmd5 = kmalloc(size, GFP_KERNEL); 63 sdescmd5 = kmalloc(size, GFP_KERNEL);
@@ -238,7 +239,7 @@ CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
238 if (rc != 0) 239 if (rc != 0)
239 return rc; 240 return rc;
240 241
241 if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) { 242 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
242 CIFSSMBClose(xid, tcon, netfid); 243 CIFSSMBClose(xid, tcon, netfid);
243 /* it's not a symlink */ 244 /* it's not a symlink */
244 return -EINVAL; 245 return -EINVAL;
@@ -315,7 +316,7 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr,
315 if (rc != 0) 316 if (rc != 0)
316 goto out; 317 goto out;
317 318
318 if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) { 319 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
319 CIFSSMBClose(xid, pTcon, netfid); 320 CIFSSMBClose(xid, pTcon, netfid);
320 /* it's not a symlink */ 321 /* it's not a symlink */
321 goto out; 322 goto out;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index a09e077ba925..0c684ae4c071 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -100,6 +100,7 @@ sesInfoFree(struct cifsSesInfo *buf_to_free)
100 memset(buf_to_free->password, 0, strlen(buf_to_free->password)); 100 memset(buf_to_free->password, 0, strlen(buf_to_free->password));
101 kfree(buf_to_free->password); 101 kfree(buf_to_free->password);
102 } 102 }
103 kfree(buf_to_free->user_name);
103 kfree(buf_to_free->domainName); 104 kfree(buf_to_free->domainName);
104 kfree(buf_to_free); 105 kfree(buf_to_free);
105} 106}
@@ -236,10 +237,7 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
236{ 237{
237 __u16 mid = 0; 238 __u16 mid = 0;
238 __u16 last_mid; 239 __u16 last_mid;
239 int collision; 240 bool collision;
240
241 if (server == NULL)
242 return mid;
243 241
244 spin_lock(&GlobalMid_Lock); 242 spin_lock(&GlobalMid_Lock);
245 last_mid = server->CurrentMid; /* we do not want to loop forever */ 243 last_mid = server->CurrentMid; /* we do not want to loop forever */
@@ -252,24 +250,38 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
252 (and it would also have to have been a request that 250 (and it would also have to have been a request that
253 did not time out) */ 251 did not time out) */
254 while (server->CurrentMid != last_mid) { 252 while (server->CurrentMid != last_mid) {
255 struct list_head *tmp;
256 struct mid_q_entry *mid_entry; 253 struct mid_q_entry *mid_entry;
254 unsigned int num_mids;
257 255
258 collision = 0; 256 collision = false;
259 if (server->CurrentMid == 0) 257 if (server->CurrentMid == 0)
260 server->CurrentMid++; 258 server->CurrentMid++;
261 259
262 list_for_each(tmp, &server->pending_mid_q) { 260 num_mids = 0;
263 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 261 list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
264 262 ++num_mids;
265 if ((mid_entry->mid == server->CurrentMid) && 263 if (mid_entry->mid == server->CurrentMid &&
266 (mid_entry->midState == MID_REQUEST_SUBMITTED)) { 264 mid_entry->midState == MID_REQUEST_SUBMITTED) {
267 /* This mid is in use, try a different one */ 265 /* This mid is in use, try a different one */
268 collision = 1; 266 collision = true;
269 break; 267 break;
270 } 268 }
271 } 269 }
272 if (collision == 0) { 270
271 /*
272 * if we have more than 32k mids in the list, then something
273 * is very wrong. Possibly a local user is trying to DoS the
274 * box by issuing long-running calls and SIGKILL'ing them. If
275 * we get to 2^16 mids then we're in big trouble as this
276 * function could loop forever.
277 *
278 * Go ahead and assign out the mid in this situation, but force
279 * an eventual reconnect to clean out the pending_mid_q.
280 */
281 if (num_mids > 32768)
282 server->tcpStatus = CifsNeedReconnect;
283
284 if (!collision) {
273 mid = server->CurrentMid; 285 mid = server->CurrentMid;
274 break; 286 break;
275 } 287 }
@@ -381,29 +393,31 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
381} 393}
382 394
383static int 395static int
384checkSMBhdr(struct smb_hdr *smb, __u16 mid) 396check_smb_hdr(struct smb_hdr *smb, __u16 mid)
385{ 397{
386 /* Make sure that this really is an SMB, that it is a response, 398 /* does it have the right SMB "signature" ? */
387 and that the message ids match */ 399 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) {
388 if ((*(__le32 *) smb->Protocol == cpu_to_le32(0x424d53ff)) && 400 cERROR(1, "Bad protocol string signature header 0x%x",
389 (mid == smb->Mid)) { 401 *(unsigned int *)smb->Protocol);
390 if (smb->Flags & SMBFLG_RESPONSE) 402 return 1;
391 return 0; 403 }
392 else { 404
393 /* only one valid case where server sends us request */ 405 /* Make sure that message ids match */
394 if (smb->Command == SMB_COM_LOCKING_ANDX) 406 if (mid != smb->Mid) {
395 return 0; 407 cERROR(1, "Mids do not match. received=%u expected=%u",
396 else 408 smb->Mid, mid);
397 cERROR(1, "Received Request not response"); 409 return 1;
398 }
399 } else { /* bad signature or mid */
400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
401 cERROR(1, "Bad protocol string signature header %x",
402 *(unsigned int *) smb->Protocol);
403 if (mid != smb->Mid)
404 cERROR(1, "Mids do not match");
405 } 410 }
406 cERROR(1, "bad smb detected. The Mid=%d", smb->Mid); 411
412 /* if it's a response then accept */
413 if (smb->Flags & SMBFLG_RESPONSE)
414 return 0;
415
416 /* only one valid case where server sends us request */
417 if (smb->Command == SMB_COM_LOCKING_ANDX)
418 return 0;
419
420 cERROR(1, "Server sent request, not response. mid=%u", smb->Mid);
407 return 1; 421 return 1;
408} 422}
409 423
@@ -448,7 +462,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
448 return 1; 462 return 1;
449 } 463 }
450 464
451 if (checkSMBhdr(smb, mid)) 465 if (check_smb_hdr(smb, mid))
452 return 1; 466 return 1;
453 clc_len = smbCalcSize_LE(smb); 467 clc_len = smbCalcSize_LE(smb);
454 468
@@ -465,25 +479,26 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
465 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) 479 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
466 return 0; /* bcc wrapped */ 480 return 0; /* bcc wrapped */
467 } 481 }
468 cFYI(1, "Calculated size %d vs length %d mismatch for mid %d", 482 cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u",
469 clc_len, 4 + len, smb->Mid); 483 clc_len, 4 + len, smb->Mid);
470 /* Windows XP can return a few bytes too much, presumably 484
471 an illegal pad, at the end of byte range lock responses 485 if (4 + len < clc_len) {
472 so we allow for that three byte pad, as long as actual 486 cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u",
473 received length is as long or longer than calculated length */
474 /* We have now had to extend this more, since there is a
475 case in which it needs to be bigger still to handle a
476 malformed response to transact2 findfirst from WinXP when
477 access denied is returned and thus bcc and wct are zero
478 but server says length is 0x21 bytes too long as if the server
479 forget to reset the smb rfc1001 length when it reset the
480 wct and bcc to minimum size and drop the t2 parms and data */
481 if ((4+len > clc_len) && (len <= clc_len + 512))
482 return 0;
483 else {
484 cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d",
485 len, smb->Mid); 487 len, smb->Mid);
486 return 1; 488 return 1;
489 } else if (len > clc_len + 512) {
490 /*
491 * Some servers (Windows XP in particular) send more
492 * data than the lengths in the SMB packet would
493 * indicate on certain calls (byte range locks and
494 * trans2 find first calls in particular). While the
495 * client can handle such a frame by ignoring the
496 * trailing data, we choose limit the amount of extra
497 * data to 512 bytes.
498 */
499 cERROR(1, "RFC1001 size %u more than 512 bytes larger "
500 "than SMB for mid=%u", len, smb->Mid);
501 return 1;
487 } 502 }
488 } 503 }
489 return 0; 504 return 0;
@@ -506,7 +521,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
506 (struct smb_com_transaction_change_notify_rsp *)buf; 521 (struct smb_com_transaction_change_notify_rsp *)buf;
507 struct file_notify_information *pnotify; 522 struct file_notify_information *pnotify;
508 __u32 data_offset = 0; 523 __u32 data_offset = 0;
509 if (pSMBr->ByteCount > sizeof(struct file_notify_information)) { 524 if (get_bcc_le(buf) > sizeof(struct file_notify_information)) {
510 data_offset = le32_to_cpu(pSMBr->DataOffset); 525 data_offset = le32_to_cpu(pSMBr->DataOffset);
511 526
512 pnotify = (struct file_notify_information *) 527 pnotify = (struct file_notify_information *)
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 8d9189f64477..79f641eeda30 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -170,7 +170,7 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
170{ 170{
171 int rc, alen, slen; 171 int rc, alen, slen;
172 const char *pct; 172 const char *pct;
173 char *endp, scope_id[13]; 173 char scope_id[13];
174 struct sockaddr_in *s4 = (struct sockaddr_in *) dst; 174 struct sockaddr_in *s4 = (struct sockaddr_in *) dst;
175 struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst; 175 struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst;
176 176
@@ -197,9 +197,9 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
197 memcpy(scope_id, pct + 1, slen); 197 memcpy(scope_id, pct + 1, slen);
198 scope_id[slen] = '\0'; 198 scope_id[slen] = '\0';
199 199
200 s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0); 200 rc = strict_strtoul(scope_id, 0,
201 if (endp != scope_id + slen) 201 (unsigned long *)&s6->sin6_scope_id);
202 return 0; 202 rc = (rc == 0) ? 1 : 0;
203 } 203 }
204 204
205 return rc; 205 return rc;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 7f25cc3d2256..f8e4cd2a7912 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -764,7 +764,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
764{ 764{
765 int rc = 0; 765 int rc = 0;
766 int xid, i; 766 int xid, i;
767 struct cifs_sb_info *cifs_sb;
768 struct cifsTconInfo *pTcon; 767 struct cifsTconInfo *pTcon;
769 struct cifsFileInfo *cifsFile = NULL; 768 struct cifsFileInfo *cifsFile = NULL;
770 char *current_entry; 769 char *current_entry;
@@ -775,8 +774,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
775 774
776 xid = GetXid(); 775 xid = GetXid();
777 776
778 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
779
780 /* 777 /*
781 * Ensure FindFirst doesn't fail before doing filldir() for '.' and 778 * Ensure FindFirst doesn't fail before doing filldir() for '.' and
782 * '..'. Otherwise we won't be able to notify VFS in case of failure. 779 * '..'. Otherwise we won't be able to notify VFS in case of failure.
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 1adc9625a344..f6728eb6f4b9 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -219,12 +219,12 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
219 bcc_ptr++; 219 bcc_ptr++;
220 } */ 220 } */
221 /* copy user */ 221 /* copy user */
222 if (ses->userName == NULL) { 222 if (ses->user_name == NULL) {
223 /* null user mount */ 223 /* null user mount */
224 *bcc_ptr = 0; 224 *bcc_ptr = 0;
225 *(bcc_ptr+1) = 0; 225 *(bcc_ptr+1) = 0;
226 } else { 226 } else {
227 bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName, 227 bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->user_name,
228 MAX_USERNAME_SIZE, nls_cp); 228 MAX_USERNAME_SIZE, nls_cp);
229 } 229 }
230 bcc_ptr += 2 * bytes_ret; 230 bcc_ptr += 2 * bytes_ret;
@@ -244,12 +244,11 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
244 /* copy user */ 244 /* copy user */
245 /* BB what about null user mounts - check that we do this BB */ 245 /* BB what about null user mounts - check that we do this BB */
246 /* copy user */ 246 /* copy user */
247 if (ses->userName == NULL) { 247 if (ses->user_name != NULL)
248 /* BB what about null user mounts - check that we do this BB */ 248 strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE);
249 } else { 249 /* else null user mount */
250 strncpy(bcc_ptr, ses->userName, MAX_USERNAME_SIZE); 250
251 } 251 bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE);
252 bcc_ptr += strnlen(ses->userName, MAX_USERNAME_SIZE);
253 *bcc_ptr = 0; 252 *bcc_ptr = 0;
254 bcc_ptr++; /* account for null termination */ 253 bcc_ptr++; /* account for null termination */
255 254
@@ -405,8 +404,8 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
405 /* BB spec says that if AvId field of MsvAvTimestamp is populated then 404 /* BB spec says that if AvId field of MsvAvTimestamp is populated then
406 we must set the MIC field of the AUTHENTICATE_MESSAGE */ 405 we must set the MIC field of the AUTHENTICATE_MESSAGE */
407 ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags); 406 ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
408 tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset); 407 tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
409 tilen = cpu_to_le16(pblob->TargetInfoArray.Length); 408 tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
410 if (tilen) { 409 if (tilen) {
411 ses->auth_key.response = kmalloc(tilen, GFP_KERNEL); 410 ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
412 if (!ses->auth_key.response) { 411 if (!ses->auth_key.response) {
@@ -523,14 +522,14 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
523 tmp += len; 522 tmp += len;
524 } 523 }
525 524
526 if (ses->userName == NULL) { 525 if (ses->user_name == NULL) {
527 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); 526 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
528 sec_blob->UserName.Length = 0; 527 sec_blob->UserName.Length = 0;
529 sec_blob->UserName.MaximumLength = 0; 528 sec_blob->UserName.MaximumLength = 0;
530 tmp += 2; 529 tmp += 2;
531 } else { 530 } else {
532 int len; 531 int len;
533 len = cifs_strtoUCS((__le16 *)tmp, ses->userName, 532 len = cifs_strtoUCS((__le16 *)tmp, ses->user_name,
534 MAX_USERNAME_SIZE, nls_cp); 533 MAX_USERNAME_SIZE, nls_cp);
535 len *= 2; /* unicode is 2 bytes each */ 534 len *= 2; /* unicode is 2 bytes each */
536 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); 535 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
@@ -656,13 +655,13 @@ ssetup_ntlmssp_authenticate:
656 655
657 if (type == LANMAN) { 656 if (type == LANMAN) {
658#ifdef CONFIG_CIFS_WEAK_PW_HASH 657#ifdef CONFIG_CIFS_WEAK_PW_HASH
659 char lnm_session_key[CIFS_SESS_KEY_SIZE]; 658 char lnm_session_key[CIFS_AUTH_RESP_SIZE];
660 659
661 pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; 660 pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
662 661
663 /* no capabilities flags in old lanman negotiation */ 662 /* no capabilities flags in old lanman negotiation */
664 663
665 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); 664 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
666 665
667 /* Calculate hash with password and copy into bcc_ptr. 666 /* Calculate hash with password and copy into bcc_ptr.
668 * Encryption Key (stored as in cryptkey) gets used if the 667 * Encryption Key (stored as in cryptkey) gets used if the
@@ -675,8 +674,8 @@ ssetup_ntlmssp_authenticate:
675 true : false, lnm_session_key); 674 true : false, lnm_session_key);
676 675
677 ses->flags |= CIFS_SES_LANMAN; 676 ses->flags |= CIFS_SES_LANMAN;
678 memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_SESS_KEY_SIZE); 677 memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
679 bcc_ptr += CIFS_SESS_KEY_SIZE; 678 bcc_ptr += CIFS_AUTH_RESP_SIZE;
680 679
681 /* can not sign if LANMAN negotiated so no need 680 /* can not sign if LANMAN negotiated so no need
682 to calculate signing key? but what if server 681 to calculate signing key? but what if server
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index b5450e9f40c0..b5041c849981 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -58,8 +58,9 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
58 58
59 md4 = crypto_alloc_shash("md4", 0, 0); 59 md4 = crypto_alloc_shash("md4", 0, 0);
60 if (IS_ERR(md4)) { 60 if (IS_ERR(md4)) {
61 rc = PTR_ERR(md4);
61 cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc); 62 cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc);
62 return PTR_ERR(md4); 63 return rc;
63 } 64 }
64 size = sizeof(struct shash_desc) + crypto_shash_descsize(md4); 65 size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
65 sdescmd4 = kmalloc(size, GFP_KERNEL); 66 sdescmd4 = kmalloc(size, GFP_KERNEL);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index c1ccca1a933f..46d8756f2b24 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -236,9 +236,9 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
236 server->tcpStatus = CifsNeedReconnect; 236 server->tcpStatus = CifsNeedReconnect;
237 } 237 }
238 238
239 if (rc < 0) { 239 if (rc < 0 && rc != -EINTR)
240 cERROR(1, "Error %d sending data on socket to server", rc); 240 cERROR(1, "Error %d sending data on socket to server", rc);
241 } else 241 else
242 rc = 0; 242 rc = 0;
243 243
244 /* Don't want to modify the buffer as a 244 /* Don't want to modify the buffer as a
@@ -359,6 +359,10 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
359 if (rc) 359 if (rc)
360 return rc; 360 return rc;
361 361
362 /* enable signing if server requires it */
363 if (server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
364 in_buf->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
365
362 mutex_lock(&server->srv_mutex); 366 mutex_lock(&server->srv_mutex);
363 mid = AllocMidQEntry(in_buf, server); 367 mid = AllocMidQEntry(in_buf, server);
364 if (mid == NULL) { 368 if (mid == NULL) {
@@ -453,6 +457,9 @@ sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
453 case MID_RETRY_NEEDED: 457 case MID_RETRY_NEEDED:
454 rc = -EAGAIN; 458 rc = -EAGAIN;
455 break; 459 break;
460 case MID_RESPONSE_MALFORMED:
461 rc = -EIO;
462 break;
456 default: 463 default:
457 cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__, 464 cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
458 mid->mid, mid->midState); 465 mid->mid, mid->midState);
@@ -570,17 +577,33 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
570#endif 577#endif
571 578
572 mutex_unlock(&ses->server->srv_mutex); 579 mutex_unlock(&ses->server->srv_mutex);
573 cifs_small_buf_release(in_buf);
574 580
575 if (rc < 0) 581 if (rc < 0) {
582 cifs_small_buf_release(in_buf);
576 goto out; 583 goto out;
584 }
577 585
578 if (long_op == CIFS_ASYNC_OP) 586 if (long_op == CIFS_ASYNC_OP) {
587 cifs_small_buf_release(in_buf);
579 goto out; 588 goto out;
589 }
580 590
581 rc = wait_for_response(ses->server, midQ); 591 rc = wait_for_response(ses->server, midQ);
582 if (rc != 0) 592 if (rc != 0) {
583 goto out; 593 send_nt_cancel(ses->server, in_buf, midQ);
594 spin_lock(&GlobalMid_Lock);
595 if (midQ->midState == MID_REQUEST_SUBMITTED) {
596 midQ->callback = DeleteMidQEntry;
597 spin_unlock(&GlobalMid_Lock);
598 cifs_small_buf_release(in_buf);
599 atomic_dec(&ses->server->inFlight);
600 wake_up(&ses->server->request_q);
601 return rc;
602 }
603 spin_unlock(&GlobalMid_Lock);
604 }
605
606 cifs_small_buf_release(in_buf);
584 607
585 rc = sync_mid_result(midQ, ses->server); 608 rc = sync_mid_result(midQ, ses->server);
586 if (rc != 0) { 609 if (rc != 0) {
@@ -724,8 +747,19 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
724 goto out; 747 goto out;
725 748
726 rc = wait_for_response(ses->server, midQ); 749 rc = wait_for_response(ses->server, midQ);
727 if (rc != 0) 750 if (rc != 0) {
728 goto out; 751 send_nt_cancel(ses->server, in_buf, midQ);
752 spin_lock(&GlobalMid_Lock);
753 if (midQ->midState == MID_REQUEST_SUBMITTED) {
754 /* no longer considered to be "in-flight" */
755 midQ->callback = DeleteMidQEntry;
756 spin_unlock(&GlobalMid_Lock);
757 atomic_dec(&ses->server->inFlight);
758 wake_up(&ses->server->request_q);
759 return rc;
760 }
761 spin_unlock(&GlobalMid_Lock);
762 }
729 763
730 rc = sync_mid_result(midQ, ses->server); 764 rc = sync_mid_result(midQ, ses->server);
731 if (rc != 0) { 765 if (rc != 0) {
@@ -922,10 +956,21 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
922 } 956 }
923 } 957 }
924 958
925 if (wait_for_response(ses->server, midQ) == 0) { 959 rc = wait_for_response(ses->server, midQ);
926 /* We got the response - restart system call. */ 960 if (rc) {
927 rstart = 1; 961 send_nt_cancel(ses->server, in_buf, midQ);
962 spin_lock(&GlobalMid_Lock);
963 if (midQ->midState == MID_REQUEST_SUBMITTED) {
964 /* no longer considered to be "in-flight" */
965 midQ->callback = DeleteMidQEntry;
966 spin_unlock(&GlobalMid_Lock);
967 return rc;
968 }
969 spin_unlock(&GlobalMid_Lock);
928 } 970 }
971
972 /* We got the response - restart system call. */
973 rstart = 1;
929 } 974 }
930 975
931 rc = sync_mid_result(midQ, ses->server); 976 rc = sync_mid_result(midQ, ses->server);
diff --git a/fs/coda/Makefile b/fs/coda/Makefile
index 6c22e61da397..1bab69a0d347 100644
--- a/fs/coda/Makefile
+++ b/fs/coda/Makefile
@@ -9,4 +9,4 @@ coda-objs := psdev.o cache.o cnode.o inode.o dir.o file.o upcall.o \
9 9
10# If you want debugging output, please uncomment the following line. 10# If you want debugging output, please uncomment the following line.
11 11
12# EXTRA_CFLAGS += -DDEBUG -DDEBUG_SMB_MALLOC=1 12# ccflags-y := -DDEBUG -DDEBUG_SMB_MALLOC=1
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index c6405ce3c50e..af56ad56a89a 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -13,7 +13,6 @@
13 13
14#ifdef CONFIG_SYSCTL 14#ifdef CONFIG_SYSCTL
15static struct ctl_table_header *fs_table_header; 15static struct ctl_table_header *fs_table_header;
16#endif
17 16
18static ctl_table coda_table[] = { 17static ctl_table coda_table[] = {
19 { 18 {
@@ -40,7 +39,6 @@ static ctl_table coda_table[] = {
40 {} 39 {}
41}; 40};
42 41
43#ifdef CONFIG_SYSCTL
44static ctl_table fs_table[] = { 42static ctl_table fs_table[] = {
45 { 43 {
46 .procname = "coda", 44 .procname = "coda",
@@ -49,22 +47,27 @@ static ctl_table fs_table[] = {
49 }, 47 },
50 {} 48 {}
51}; 49};
52#endif
53 50
54void coda_sysctl_init(void) 51void coda_sysctl_init(void)
55{ 52{
56#ifdef CONFIG_SYSCTL
57 if ( !fs_table_header ) 53 if ( !fs_table_header )
58 fs_table_header = register_sysctl_table(fs_table); 54 fs_table_header = register_sysctl_table(fs_table);
59#endif
60} 55}
61 56
62void coda_sysctl_clean(void) 57void coda_sysctl_clean(void)
63{ 58{
64#ifdef CONFIG_SYSCTL
65 if ( fs_table_header ) { 59 if ( fs_table_header ) {
66 unregister_sysctl_table(fs_table_header); 60 unregister_sysctl_table(fs_table_header);
67 fs_table_header = NULL; 61 fs_table_header = NULL;
68 } 62 }
69#endif
70} 63}
64
65#else
66void coda_sysctl_init(void)
67{
68}
69
70void coda_sysctl_clean(void)
71{
72}
73#endif
diff --git a/fs/compat.c b/fs/compat.c
index f6fd0a00e6cc..72fe6cda9108 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -262,35 +262,19 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
262 */ 262 */
263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) 263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
264{ 264{
265 struct path path; 265 struct kstatfs tmp;
266 int error; 266 int error = user_statfs(pathname, &tmp);
267 267 if (!error)
268 error = user_path(pathname, &path); 268 error = put_compat_statfs(buf, &tmp);
269 if (!error) {
270 struct kstatfs tmp;
271 error = vfs_statfs(&path, &tmp);
272 if (!error)
273 error = put_compat_statfs(buf, &tmp);
274 path_put(&path);
275 }
276 return error; 269 return error;
277} 270}
278 271
279asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf) 272asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf)
280{ 273{
281 struct file * file;
282 struct kstatfs tmp; 274 struct kstatfs tmp;
283 int error; 275 int error = fd_statfs(fd, &tmp);
284
285 error = -EBADF;
286 file = fget(fd);
287 if (!file)
288 goto out;
289 error = vfs_statfs(&file->f_path, &tmp);
290 if (!error) 276 if (!error)
291 error = put_compat_statfs(buf, &tmp); 277 error = put_compat_statfs(buf, &tmp);
292 fput(file);
293out:
294 return error; 278 return error;
295} 279}
296 280
@@ -329,41 +313,29 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
329 313
330asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) 314asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf)
331{ 315{
332 struct path path; 316 struct kstatfs tmp;
333 int error; 317 int error;
334 318
335 if (sz != sizeof(*buf)) 319 if (sz != sizeof(*buf))
336 return -EINVAL; 320 return -EINVAL;
337 321
338 error = user_path(pathname, &path); 322 error = user_statfs(pathname, &tmp);
339 if (!error) { 323 if (!error)
340 struct kstatfs tmp; 324 error = put_compat_statfs64(buf, &tmp);
341 error = vfs_statfs(&path, &tmp);
342 if (!error)
343 error = put_compat_statfs64(buf, &tmp);
344 path_put(&path);
345 }
346 return error; 325 return error;
347} 326}
348 327
349asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf) 328asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf)
350{ 329{
351 struct file * file;
352 struct kstatfs tmp; 330 struct kstatfs tmp;
353 int error; 331 int error;
354 332
355 if (sz != sizeof(*buf)) 333 if (sz != sizeof(*buf))
356 return -EINVAL; 334 return -EINVAL;
357 335
358 error = -EBADF; 336 error = fd_statfs(fd, &tmp);
359 file = fget(fd);
360 if (!file)
361 goto out;
362 error = vfs_statfs(&file->f_path, &tmp);
363 if (!error) 337 if (!error)
364 error = put_compat_statfs64(buf, &tmp); 338 error = put_compat_statfs64(buf, &tmp);
365 fput(file);
366out:
367 return error; 339 return error;
368} 340}
369 341
@@ -1228,7 +1200,9 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
1228 file = fget_light(fd, &fput_needed); 1200 file = fget_light(fd, &fput_needed);
1229 if (!file) 1201 if (!file)
1230 return -EBADF; 1202 return -EBADF;
1231 ret = compat_readv(file, vec, vlen, &pos); 1203 ret = -ESPIPE;
1204 if (file->f_mode & FMODE_PREAD)
1205 ret = compat_readv(file, vec, vlen, &pos);
1232 fput_light(file, fput_needed); 1206 fput_light(file, fput_needed);
1233 return ret; 1207 return ret;
1234} 1208}
@@ -1285,7 +1259,9 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
1285 file = fget_light(fd, &fput_needed); 1259 file = fget_light(fd, &fput_needed);
1286 if (!file) 1260 if (!file)
1287 return -EBADF; 1261 return -EBADF;
1288 ret = compat_writev(file, vec, vlen, &pos); 1262 ret = -ESPIPE;
1263 if (file->f_mode & FMODE_PWRITE)
1264 ret = compat_writev(file, vec, vlen, &pos);
1289 fput_light(file, fput_needed); 1265 fput_light(file, fput_needed);
1290 return ret; 1266 return ret;
1291} 1267}
@@ -1695,9 +1671,6 @@ int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
1695 * Update: ERESTARTSYS breaks at least the xview clock binary, so 1671 * Update: ERESTARTSYS breaks at least the xview clock binary, so
1696 * I'm trying ERESTARTNOHAND which restart only when you want to. 1672 * I'm trying ERESTARTNOHAND which restart only when you want to.
1697 */ 1673 */
1698#define MAX_SELECT_SECONDS \
1699 ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
1700
1701int compat_core_sys_select(int n, compat_ulong_t __user *inp, 1674int compat_core_sys_select(int n, compat_ulong_t __user *inp,
1702 compat_ulong_t __user *outp, compat_ulong_t __user *exp, 1675 compat_ulong_t __user *outp, compat_ulong_t __user *exp,
1703 struct timespec *end_time) 1676 struct timespec *end_time)
@@ -2308,3 +2281,16 @@ asmlinkage long compat_sys_timerfd_gettime(int ufd,
2308} 2281}
2309 2282
2310#endif /* CONFIG_TIMERFD */ 2283#endif /* CONFIG_TIMERFD */
2284
2285#ifdef CONFIG_FHANDLE
2286/*
2287 * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
2288 * doesn't set the O_LARGEFILE flag.
2289 */
2290asmlinkage long
2291compat_sys_open_by_handle_at(int mountdirfd,
2292 struct file_handle __user *handle, int flags)
2293{
2294 return do_handle_open(mountdirfd, handle, flags);
2295}
2296#endif
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 90ff3cb10de3..3313dd19f543 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -990,7 +990,7 @@ static int configfs_dump(struct configfs_dirent *sd, int level)
990 * This describes these functions and their helpers. 990 * This describes these functions and their helpers.
991 * 991 *
992 * Allow another kernel system to depend on a config_item. If this 992 * Allow another kernel system to depend on a config_item. If this
993 * happens, the item cannot go away until the dependant can live without 993 * happens, the item cannot go away until the dependent can live without
994 * it. The idea is to give client modules as simple an interface as 994 * it. The idea is to give client modules as simple an interface as
995 * possible. When a system asks them to depend on an item, they just 995 * possible. When a system asks them to depend on an item, they just
996 * call configfs_depend_item(). If the item is live and the client 996 * call configfs_depend_item(). If the item is live and the client
diff --git a/fs/dcache.c b/fs/dcache.c
index 2a6bd9a4ae97..129a35730994 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -296,8 +296,12 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
296 __releases(parent->d_lock) 296 __releases(parent->d_lock)
297 __releases(dentry->d_inode->i_lock) 297 __releases(dentry->d_inode->i_lock)
298{ 298{
299 dentry->d_parent = NULL;
300 list_del(&dentry->d_u.d_child); 299 list_del(&dentry->d_u.d_child);
300 /*
301 * Inform try_to_ascend() that we are no longer attached to the
302 * dentry tree
303 */
304 dentry->d_flags |= DCACHE_DISCONNECTED;
301 if (parent) 305 if (parent)
302 spin_unlock(&parent->d_lock); 306 spin_unlock(&parent->d_lock);
303 dentry_iput(dentry); 307 dentry_iput(dentry);
@@ -1012,6 +1016,35 @@ void shrink_dcache_for_umount(struct super_block *sb)
1012} 1016}
1013 1017
1014/* 1018/*
1019 * This tries to ascend one level of parenthood, but
1020 * we can race with renaming, so we need to re-check
1021 * the parenthood after dropping the lock and check
1022 * that the sequence number still matches.
1023 */
1024static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq)
1025{
1026 struct dentry *new = old->d_parent;
1027
1028 rcu_read_lock();
1029 spin_unlock(&old->d_lock);
1030 spin_lock(&new->d_lock);
1031
1032 /*
1033 * might go back up the wrong parent if we have had a rename
1034 * or deletion
1035 */
1036 if (new != old->d_parent ||
1037 (old->d_flags & DCACHE_DISCONNECTED) ||
1038 (!locked && read_seqretry(&rename_lock, seq))) {
1039 spin_unlock(&new->d_lock);
1040 new = NULL;
1041 }
1042 rcu_read_unlock();
1043 return new;
1044}
1045
1046
1047/*
1015 * Search for at least 1 mount point in the dentry's subdirs. 1048 * Search for at least 1 mount point in the dentry's subdirs.
1016 * We descend to the next level whenever the d_subdirs 1049 * We descend to the next level whenever the d_subdirs
1017 * list is non-empty and continue searching. 1050 * list is non-empty and continue searching.
@@ -1066,24 +1099,10 @@ resume:
1066 * All done at this level ... ascend and resume the search. 1099 * All done at this level ... ascend and resume the search.
1067 */ 1100 */
1068 if (this_parent != parent) { 1101 if (this_parent != parent) {
1069 struct dentry *tmp; 1102 struct dentry *child = this_parent;
1070 struct dentry *child; 1103 this_parent = try_to_ascend(this_parent, locked, seq);
1071 1104 if (!this_parent)
1072 tmp = this_parent->d_parent;
1073 rcu_read_lock();
1074 spin_unlock(&this_parent->d_lock);
1075 child = this_parent;
1076 this_parent = tmp;
1077 spin_lock(&this_parent->d_lock);
1078 /* might go back up the wrong parent if we have had a rename
1079 * or deletion */
1080 if (this_parent != child->d_parent ||
1081 (!locked && read_seqretry(&rename_lock, seq))) {
1082 spin_unlock(&this_parent->d_lock);
1083 rcu_read_unlock();
1084 goto rename_retry; 1105 goto rename_retry;
1085 }
1086 rcu_read_unlock();
1087 next = child->d_u.d_child.next; 1106 next = child->d_u.d_child.next;
1088 goto resume; 1107 goto resume;
1089 } 1108 }
@@ -1181,24 +1200,10 @@ resume:
1181 * All done at this level ... ascend and resume the search. 1200 * All done at this level ... ascend and resume the search.
1182 */ 1201 */
1183 if (this_parent != parent) { 1202 if (this_parent != parent) {
1184 struct dentry *tmp; 1203 struct dentry *child = this_parent;
1185 struct dentry *child; 1204 this_parent = try_to_ascend(this_parent, locked, seq);
1186 1205 if (!this_parent)
1187 tmp = this_parent->d_parent;
1188 rcu_read_lock();
1189 spin_unlock(&this_parent->d_lock);
1190 child = this_parent;
1191 this_parent = tmp;
1192 spin_lock(&this_parent->d_lock);
1193 /* might go back up the wrong parent if we have had a rename
1194 * or deletion */
1195 if (this_parent != child->d_parent ||
1196 (!locked && read_seqretry(&rename_lock, seq))) {
1197 spin_unlock(&this_parent->d_lock);
1198 rcu_read_unlock();
1199 goto rename_retry; 1206 goto rename_retry;
1200 }
1201 rcu_read_unlock();
1202 next = child->d_u.d_child.next; 1207 next = child->d_u.d_child.next;
1203 goto resume; 1208 goto resume;
1204 } 1209 }
@@ -1523,6 +1528,28 @@ struct dentry * d_alloc_root(struct inode * root_inode)
1523} 1528}
1524EXPORT_SYMBOL(d_alloc_root); 1529EXPORT_SYMBOL(d_alloc_root);
1525 1530
1531static struct dentry * __d_find_any_alias(struct inode *inode)
1532{
1533 struct dentry *alias;
1534
1535 if (list_empty(&inode->i_dentry))
1536 return NULL;
1537 alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias);
1538 __dget(alias);
1539 return alias;
1540}
1541
1542static struct dentry * d_find_any_alias(struct inode *inode)
1543{
1544 struct dentry *de;
1545
1546 spin_lock(&inode->i_lock);
1547 de = __d_find_any_alias(inode);
1548 spin_unlock(&inode->i_lock);
1549 return de;
1550}
1551
1552
1526/** 1553/**
1527 * d_obtain_alias - find or allocate a dentry for a given inode 1554 * d_obtain_alias - find or allocate a dentry for a given inode
1528 * @inode: inode to allocate the dentry for 1555 * @inode: inode to allocate the dentry for
@@ -1552,7 +1579,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
1552 if (IS_ERR(inode)) 1579 if (IS_ERR(inode))
1553 return ERR_CAST(inode); 1580 return ERR_CAST(inode);
1554 1581
1555 res = d_find_alias(inode); 1582 res = d_find_any_alias(inode);
1556 if (res) 1583 if (res)
1557 goto out_iput; 1584 goto out_iput;
1558 1585
@@ -1565,7 +1592,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
1565 1592
1566 1593
1567 spin_lock(&inode->i_lock); 1594 spin_lock(&inode->i_lock);
1568 res = __d_find_alias(inode, 0); 1595 res = __d_find_any_alias(inode);
1569 if (res) { 1596 if (res) {
1570 spin_unlock(&inode->i_lock); 1597 spin_unlock(&inode->i_lock);
1571 dput(tmp); 1598 dput(tmp);
@@ -1585,10 +1612,13 @@ struct dentry *d_obtain_alias(struct inode *inode)
1585 __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first); 1612 __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
1586 spin_unlock(&tmp->d_lock); 1613 spin_unlock(&tmp->d_lock);
1587 spin_unlock(&inode->i_lock); 1614 spin_unlock(&inode->i_lock);
1615 security_d_instantiate(tmp, inode);
1588 1616
1589 return tmp; 1617 return tmp;
1590 1618
1591 out_iput: 1619 out_iput:
1620 if (res && !IS_ERR(res))
1621 security_d_instantiate(res, inode);
1592 iput(inode); 1622 iput(inode);
1593 return res; 1623 return res;
1594} 1624}
@@ -1781,7 +1811,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
1781 * false-negative result. d_lookup() protects against concurrent 1811 * false-negative result. d_lookup() protects against concurrent
1782 * renames using rename_lock seqlock. 1812 * renames using rename_lock seqlock.
1783 * 1813 *
1784 * See Documentation/vfs/dcache-locking.txt for more details. 1814 * See Documentation/filesystems/path-lookup.txt for more details.
1785 */ 1815 */
1786 hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) { 1816 hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
1787 struct inode *i; 1817 struct inode *i;
@@ -1901,7 +1931,7 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
1901 * false-negative result. d_lookup() protects against concurrent 1931 * false-negative result. d_lookup() protects against concurrent
1902 * renames using rename_lock seqlock. 1932 * renames using rename_lock seqlock.
1903 * 1933 *
1904 * See Documentation/vfs/dcache-locking.txt for more details. 1934 * See Documentation/filesystems/path-lookup.txt for more details.
1905 */ 1935 */
1906 rcu_read_lock(); 1936 rcu_read_lock();
1907 1937
@@ -2101,7 +2131,7 @@ EXPORT_SYMBOL(d_rehash);
2101 */ 2131 */
2102void dentry_update_name_case(struct dentry *dentry, struct qstr *name) 2132void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
2103{ 2133{
2104 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 2134 BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex));
2105 BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ 2135 BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
2106 2136
2107 spin_lock(&dentry->d_lock); 2137 spin_lock(&dentry->d_lock);
@@ -2920,28 +2950,14 @@ resume:
2920 spin_unlock(&dentry->d_lock); 2950 spin_unlock(&dentry->d_lock);
2921 } 2951 }
2922 if (this_parent != root) { 2952 if (this_parent != root) {
2923 struct dentry *tmp; 2953 struct dentry *child = this_parent;
2924 struct dentry *child;
2925
2926 tmp = this_parent->d_parent;
2927 if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { 2954 if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
2928 this_parent->d_flags |= DCACHE_GENOCIDE; 2955 this_parent->d_flags |= DCACHE_GENOCIDE;
2929 this_parent->d_count--; 2956 this_parent->d_count--;
2930 } 2957 }
2931 rcu_read_lock(); 2958 this_parent = try_to_ascend(this_parent, locked, seq);
2932 spin_unlock(&this_parent->d_lock); 2959 if (!this_parent)
2933 child = this_parent;
2934 this_parent = tmp;
2935 spin_lock(&this_parent->d_lock);
2936 /* might go back up the wrong parent if we have had a rename
2937 * or deletion */
2938 if (this_parent != child->d_parent ||
2939 (!locked && read_seqretry(&rename_lock, seq))) {
2940 spin_unlock(&this_parent->d_lock);
2941 rcu_read_unlock();
2942 goto rename_retry; 2960 goto rename_retry;
2943 }
2944 rcu_read_unlock();
2945 next = child->d_u.d_child.next; 2961 next = child->d_u.d_child.next;
2946 goto resume; 2962 goto resume;
2947 } 2963 }
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 37a8ca7c1222..e7a7a2f07324 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -13,9 +13,6 @@
13 * 13 *
14 */ 14 */
15 15
16/* uncomment to get debug messages from the debug filesystem, ah the irony. */
17/* #define DEBUG */
18
19#include <linux/module.h> 16#include <linux/module.h>
20#include <linux/fs.h> 17#include <linux/fs.h>
21#include <linux/mount.h> 18#include <linux/mount.h>
@@ -310,7 +307,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
310} 307}
311EXPORT_SYMBOL_GPL(debugfs_create_symlink); 308EXPORT_SYMBOL_GPL(debugfs_create_symlink);
312 309
313static void __debugfs_remove(struct dentry *dentry, struct dentry *parent) 310static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
314{ 311{
315 int ret = 0; 312 int ret = 0;
316 313
@@ -333,6 +330,7 @@ static void __debugfs_remove(struct dentry *dentry, struct dentry *parent)
333 dput(dentry); 330 dput(dentry);
334 } 331 }
335 } 332 }
333 return ret;
336} 334}
337 335
338/** 336/**
@@ -351,7 +349,8 @@ static void __debugfs_remove(struct dentry *dentry, struct dentry *parent)
351void debugfs_remove(struct dentry *dentry) 349void debugfs_remove(struct dentry *dentry)
352{ 350{
353 struct dentry *parent; 351 struct dentry *parent;
354 352 int ret;
353
355 if (!dentry) 354 if (!dentry)
356 return; 355 return;
357 356
@@ -360,9 +359,10 @@ void debugfs_remove(struct dentry *dentry)
360 return; 359 return;
361 360
362 mutex_lock(&parent->d_inode->i_mutex); 361 mutex_lock(&parent->d_inode->i_mutex);
363 __debugfs_remove(dentry, parent); 362 ret = __debugfs_remove(dentry, parent);
364 mutex_unlock(&parent->d_inode->i_mutex); 363 mutex_unlock(&parent->d_inode->i_mutex);
365 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 364 if (!ret)
365 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
366} 366}
367EXPORT_SYMBOL_GPL(debugfs_remove); 367EXPORT_SYMBOL_GPL(debugfs_remove);
368 368
@@ -540,17 +540,5 @@ static int __init debugfs_init(void)
540 540
541 return retval; 541 return retval;
542} 542}
543
544static void __exit debugfs_exit(void)
545{
546 debugfs_registered = false;
547
548 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
549 unregister_filesystem(&debug_fs_type);
550 kobject_put(debug_kobj);
551}
552
553core_initcall(debugfs_init); 543core_initcall(debugfs_init);
554module_exit(debugfs_exit);
555MODULE_LICENSE("GPL");
556 544
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 1bb547c9cad6..2f27e578d466 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -479,6 +479,7 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
479 struct dentry *root = sb->s_root; 479 struct dentry *root = sb->s_root;
480 struct pts_fs_info *fsi = DEVPTS_SB(sb); 480 struct pts_fs_info *fsi = DEVPTS_SB(sb);
481 struct pts_mount_opts *opts = &fsi->mount_opts; 481 struct pts_mount_opts *opts = &fsi->mount_opts;
482 int ret = 0;
482 char s[12]; 483 char s[12];
483 484
484 /* We're supposed to be given the slave end of a pty */ 485 /* We're supposed to be given the slave end of a pty */
@@ -501,14 +502,17 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
501 mutex_lock(&root->d_inode->i_mutex); 502 mutex_lock(&root->d_inode->i_mutex);
502 503
503 dentry = d_alloc_name(root, s); 504 dentry = d_alloc_name(root, s);
504 if (!IS_ERR(dentry)) { 505 if (dentry) {
505 d_add(dentry, inode); 506 d_add(dentry, inode);
506 fsnotify_create(root->d_inode, dentry); 507 fsnotify_create(root->d_inode, dentry);
508 } else {
509 iput(inode);
510 ret = -ENOMEM;
507 } 511 }
508 512
509 mutex_unlock(&root->d_inode->i_mutex); 513 mutex_unlock(&root->d_inode->i_mutex);
510 514
511 return 0; 515 return ret;
512} 516}
513 517
514struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) 518struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
@@ -544,17 +548,12 @@ void devpts_pty_kill(struct tty_struct *tty)
544 mutex_lock(&root->d_inode->i_mutex); 548 mutex_lock(&root->d_inode->i_mutex);
545 549
546 dentry = d_find_alias(inode); 550 dentry = d_find_alias(inode);
547 if (IS_ERR(dentry))
548 goto out;
549
550 if (dentry) {
551 inode->i_nlink--;
552 d_delete(dentry);
553 dput(dentry); /* d_alloc_name() in devpts_pty_new() */
554 }
555 551
552 inode->i_nlink--;
553 d_delete(dentry);
554 dput(dentry); /* d_alloc_name() in devpts_pty_new() */
556 dput(dentry); /* d_find_alias above */ 555 dput(dentry); /* d_find_alias above */
557out: 556
558 mutex_unlock(&root->d_inode->i_mutex); 557 mutex_unlock(&root->d_inode->i_mutex);
559} 558}
560 559
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b044705eedd4..ac5f164170e3 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -645,11 +645,11 @@ static int dio_send_cur_page(struct dio *dio)
645 /* 645 /*
646 * See whether this new request is contiguous with the old. 646 * See whether this new request is contiguous with the old.
647 * 647 *
648 * Btrfs cannot handl having logically non-contiguous requests 648 * Btrfs cannot handle having logically non-contiguous requests
649 * submitted. For exmple if you have 649 * submitted. For example if you have
650 * 650 *
651 * Logical: [0-4095][HOLE][8192-12287] 651 * Logical: [0-4095][HOLE][8192-12287]
652 * Phyiscal: [0-4095] [4096-8181] 652 * Physical: [0-4095] [4096-8191]
653 * 653 *
654 * We cannot submit those pages together as one BIO. So if our 654 * We cannot submit those pages together as one BIO. So if our
655 * current logical offset in the file does not equal what would 655 * current logical offset in the file does not equal what would
@@ -1110,11 +1110,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1110 ((rw & READ) || (dio->result == dio->size))) 1110 ((rw & READ) || (dio->result == dio->size)))
1111 ret = -EIOCBQUEUED; 1111 ret = -EIOCBQUEUED;
1112 1112
1113 if (ret != -EIOCBQUEUED) { 1113 if (ret != -EIOCBQUEUED)
1114 /* All IO is now issued, send it on its way */
1115 blk_run_address_space(inode->i_mapping);
1116 dio_await_completion(dio); 1114 dio_await_completion(dio);
1117 }
1118 1115
1119 /* 1116 /*
1120 * Sync will always be dropping the final ref and completing the 1117 * Sync will always be dropping the final ref and completing the
@@ -1176,7 +1173,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1176 struct dio *dio; 1173 struct dio *dio;
1177 1174
1178 if (rw & WRITE) 1175 if (rw & WRITE)
1179 rw = WRITE_ODIRECT_PLUG; 1176 rw = WRITE_ODIRECT;
1180 1177
1181 if (bdev) 1178 if (bdev)
1182 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); 1179 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 4314f0d48d85..abc49f292454 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -18,6 +18,7 @@
18 18
19#define WAKE_ASTS 0 19#define WAKE_ASTS 0
20 20
21static uint64_t ast_seq_count;
21static struct list_head ast_queue; 22static struct list_head ast_queue;
22static spinlock_t ast_queue_lock; 23static spinlock_t ast_queue_lock;
23static struct task_struct * astd_task; 24static struct task_struct * astd_task;
@@ -25,40 +26,186 @@ static unsigned long astd_wakeflags;
25static struct mutex astd_running; 26static struct mutex astd_running;
26 27
27 28
29static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb)
30{
31 int i;
32
33 log_print("last_bast %x %llu flags %x mode %d sb %d %x",
34 lkb->lkb_id,
35 (unsigned long long)lkb->lkb_last_bast.seq,
36 lkb->lkb_last_bast.flags,
37 lkb->lkb_last_bast.mode,
38 lkb->lkb_last_bast.sb_status,
39 lkb->lkb_last_bast.sb_flags);
40
41 log_print("last_cast %x %llu flags %x mode %d sb %d %x",
42 lkb->lkb_id,
43 (unsigned long long)lkb->lkb_last_cast.seq,
44 lkb->lkb_last_cast.flags,
45 lkb->lkb_last_cast.mode,
46 lkb->lkb_last_cast.sb_status,
47 lkb->lkb_last_cast.sb_flags);
48
49 for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
50 log_print("cb %x %llu flags %x mode %d sb %d %x",
51 lkb->lkb_id,
52 (unsigned long long)lkb->lkb_callbacks[i].seq,
53 lkb->lkb_callbacks[i].flags,
54 lkb->lkb_callbacks[i].mode,
55 lkb->lkb_callbacks[i].sb_status,
56 lkb->lkb_callbacks[i].sb_flags);
57 }
58}
59
28void dlm_del_ast(struct dlm_lkb *lkb) 60void dlm_del_ast(struct dlm_lkb *lkb)
29{ 61{
30 spin_lock(&ast_queue_lock); 62 spin_lock(&ast_queue_lock);
31 if (lkb->lkb_ast_type & (AST_COMP | AST_BAST)) 63 if (!list_empty(&lkb->lkb_astqueue))
32 list_del(&lkb->lkb_astqueue); 64 list_del_init(&lkb->lkb_astqueue);
33 spin_unlock(&ast_queue_lock); 65 spin_unlock(&ast_queue_lock);
34} 66}
35 67
36void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode) 68int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
69 int status, uint32_t sbflags, uint64_t seq)
37{ 70{
71 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
72 uint64_t prev_seq;
73 int prev_mode;
74 int i;
75
76 for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
77 if (lkb->lkb_callbacks[i].seq)
78 continue;
79
80 /*
81 * Suppress some redundant basts here, do more on removal.
82 * Don't even add a bast if the callback just before it
83 * is a bast for the same mode or a more restrictive mode.
84 * (the addional > PR check is needed for PR/CW inversion)
85 */
86
87 if ((i > 0) && (flags & DLM_CB_BAST) &&
88 (lkb->lkb_callbacks[i-1].flags & DLM_CB_BAST)) {
89
90 prev_seq = lkb->lkb_callbacks[i-1].seq;
91 prev_mode = lkb->lkb_callbacks[i-1].mode;
92
93 if ((prev_mode == mode) ||
94 (prev_mode > mode && prev_mode > DLM_LOCK_PR)) {
95
96 log_debug(ls, "skip %x add bast %llu mode %d "
97 "for bast %llu mode %d",
98 lkb->lkb_id,
99 (unsigned long long)seq,
100 mode,
101 (unsigned long long)prev_seq,
102 prev_mode);
103 return 0;
104 }
105 }
106
107 lkb->lkb_callbacks[i].seq = seq;
108 lkb->lkb_callbacks[i].flags = flags;
109 lkb->lkb_callbacks[i].mode = mode;
110 lkb->lkb_callbacks[i].sb_status = status;
111 lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF);
112 break;
113 }
114
115 if (i == DLM_CALLBACKS_SIZE) {
116 log_error(ls, "no callbacks %x %llu flags %x mode %d sb %d %x",
117 lkb->lkb_id, (unsigned long long)seq,
118 flags, mode, status, sbflags);
119 dlm_dump_lkb_callbacks(lkb);
120 return -1;
121 }
122
123 return 0;
124}
125
126int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
127 struct dlm_callback *cb, int *resid)
128{
129 int i;
130
131 *resid = 0;
132
133 if (!lkb->lkb_callbacks[0].seq)
134 return -ENOENT;
135
136 /* oldest undelivered cb is callbacks[0] */
137
138 memcpy(cb, &lkb->lkb_callbacks[0], sizeof(struct dlm_callback));
139 memset(&lkb->lkb_callbacks[0], 0, sizeof(struct dlm_callback));
140
141 /* shift others down */
142
143 for (i = 1; i < DLM_CALLBACKS_SIZE; i++) {
144 if (!lkb->lkb_callbacks[i].seq)
145 break;
146 memcpy(&lkb->lkb_callbacks[i-1], &lkb->lkb_callbacks[i],
147 sizeof(struct dlm_callback));
148 memset(&lkb->lkb_callbacks[i], 0, sizeof(struct dlm_callback));
149 (*resid)++;
150 }
151
152 /* if cb is a bast, it should be skipped if the blocking mode is
153 compatible with the last granted mode */
154
155 if ((cb->flags & DLM_CB_BAST) && lkb->lkb_last_cast.seq) {
156 if (dlm_modes_compat(cb->mode, lkb->lkb_last_cast.mode)) {
157 cb->flags |= DLM_CB_SKIP;
158
159 log_debug(ls, "skip %x bast %llu mode %d "
160 "for cast %llu mode %d",
161 lkb->lkb_id,
162 (unsigned long long)cb->seq,
163 cb->mode,
164 (unsigned long long)lkb->lkb_last_cast.seq,
165 lkb->lkb_last_cast.mode);
166 return 0;
167 }
168 }
169
170 if (cb->flags & DLM_CB_CAST) {
171 memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback));
172 lkb->lkb_last_cast_time = ktime_get();
173 }
174
175 if (cb->flags & DLM_CB_BAST) {
176 memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback));
177 lkb->lkb_last_bast_time = ktime_get();
178 }
179
180 return 0;
181}
182
183void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
184 uint32_t sbflags)
185{
186 uint64_t seq;
187 int rv;
188
189 spin_lock(&ast_queue_lock);
190
191 seq = ++ast_seq_count;
192
38 if (lkb->lkb_flags & DLM_IFL_USER) { 193 if (lkb->lkb_flags & DLM_IFL_USER) {
39 dlm_user_add_ast(lkb, type, mode); 194 spin_unlock(&ast_queue_lock);
195 dlm_user_add_ast(lkb, flags, mode, status, sbflags, seq);
40 return; 196 return;
41 } 197 }
42 198
43 spin_lock(&ast_queue_lock); 199 rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq);
44 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) { 200 if (rv < 0) {
201 spin_unlock(&ast_queue_lock);
202 return;
203 }
204
205 if (list_empty(&lkb->lkb_astqueue)) {
45 kref_get(&lkb->lkb_ref); 206 kref_get(&lkb->lkb_ref);
46 list_add_tail(&lkb->lkb_astqueue, &ast_queue); 207 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
47 lkb->lkb_ast_first = type;
48 } 208 }
49
50 /* sanity check, this should not happen */
51
52 if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP))
53 log_print("repeat cast %d castmode %d lock %x %s",
54 mode, lkb->lkb_castmode,
55 lkb->lkb_id, lkb->lkb_resource->res_name);
56
57 lkb->lkb_ast_type |= type;
58 if (type == AST_BAST)
59 lkb->lkb_bastmode = mode;
60 else
61 lkb->lkb_castmode = mode;
62 spin_unlock(&ast_queue_lock); 209 spin_unlock(&ast_queue_lock);
63 210
64 set_bit(WAKE_ASTS, &astd_wakeflags); 211 set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -72,7 +219,8 @@ static void process_asts(void)
72 struct dlm_lkb *lkb; 219 struct dlm_lkb *lkb;
73 void (*castfn) (void *astparam); 220 void (*castfn) (void *astparam);
74 void (*bastfn) (void *astparam, int mode); 221 void (*bastfn) (void *astparam, int mode);
75 int type, first, bastmode, castmode, do_bast, do_cast, last_castmode; 222 struct dlm_callback callbacks[DLM_CALLBACKS_SIZE];
223 int i, rv, resid;
76 224
77repeat: 225repeat:
78 spin_lock(&ast_queue_lock); 226 spin_lock(&ast_queue_lock);
@@ -83,54 +231,45 @@ repeat:
83 if (dlm_locking_stopped(ls)) 231 if (dlm_locking_stopped(ls))
84 continue; 232 continue;
85 233
86 list_del(&lkb->lkb_astqueue); 234 /* we remove from astqueue list and remove everything in
87 type = lkb->lkb_ast_type; 235 lkb_callbacks before releasing the spinlock so empty
88 lkb->lkb_ast_type = 0; 236 lkb_astqueue is always consistent with empty lkb_callbacks */
89 first = lkb->lkb_ast_first; 237
90 lkb->lkb_ast_first = 0; 238 list_del_init(&lkb->lkb_astqueue);
91 bastmode = lkb->lkb_bastmode; 239
92 castmode = lkb->lkb_castmode;
93 castfn = lkb->lkb_astfn; 240 castfn = lkb->lkb_astfn;
94 bastfn = lkb->lkb_bastfn; 241 bastfn = lkb->lkb_bastfn;
95 spin_unlock(&ast_queue_lock);
96 242
97 do_cast = (type & AST_COMP) && castfn; 243 memset(&callbacks, 0, sizeof(callbacks));
98 do_bast = (type & AST_BAST) && bastfn;
99 244
100 /* Skip a bast if its blocking mode is compatible with the 245 for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
101 granted mode of the preceding cast. */ 246 rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid);
247 if (rv < 0)
248 break;
249 }
250 spin_unlock(&ast_queue_lock);
102 251
103 if (do_bast) { 252 if (resid) {
104 if (first == AST_COMP) 253 /* shouldn't happen, for loop should have removed all */
105 last_castmode = castmode; 254 log_error(ls, "callback resid %d lkb %x",
106 else 255 resid, lkb->lkb_id);
107 last_castmode = lkb->lkb_castmode_done;
108 if (dlm_modes_compat(bastmode, last_castmode))
109 do_bast = 0;
110 } 256 }
111 257
112 if (first == AST_COMP) { 258 for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
113 if (do_cast) 259 if (!callbacks[i].seq)
114 castfn(lkb->lkb_astparam); 260 break;
115 if (do_bast) 261 if (callbacks[i].flags & DLM_CB_SKIP) {
116 bastfn(lkb->lkb_astparam, bastmode); 262 continue;
117 } else if (first == AST_BAST) { 263 } else if (callbacks[i].flags & DLM_CB_BAST) {
118 if (do_bast) 264 bastfn(lkb->lkb_astparam, callbacks[i].mode);
119 bastfn(lkb->lkb_astparam, bastmode); 265 } else if (callbacks[i].flags & DLM_CB_CAST) {
120 if (do_cast) 266 lkb->lkb_lksb->sb_status = callbacks[i].sb_status;
267 lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags;
121 castfn(lkb->lkb_astparam); 268 castfn(lkb->lkb_astparam);
122 } else { 269 }
123 log_error(ls, "bad ast_first %d ast_type %d",
124 first, type);
125 } 270 }
126 271
127 if (do_cast) 272 /* removes ref for ast_queue, may cause lkb to be freed */
128 lkb->lkb_castmode_done = castmode;
129 if (do_bast)
130 lkb->lkb_bastmode_done = bastmode;
131
132 /* this removes the reference added by dlm_add_ast
133 and may result in the lkb being freed */
134 dlm_put_lkb(lkb); 273 dlm_put_lkb(lkb);
135 274
136 cond_resched(); 275 cond_resched();
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index bcb1aaba519d..8aa89c9b5611 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -13,8 +13,13 @@
13#ifndef __ASTD_DOT_H__ 13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__ 14#define __ASTD_DOT_H__
15 15
16void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode);
17void dlm_del_ast(struct dlm_lkb *lkb); 16void dlm_del_ast(struct dlm_lkb *lkb);
17int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
18 int status, uint32_t sbflags, uint64_t seq);
19int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
20 struct dlm_callback *cb, int *resid);
21void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
22 uint32_t sbflags);
18 23
19void dlm_astd_wake(void); 24void dlm_astd_wake(void);
20int dlm_astd_start(void); 25int dlm_astd_start(void);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index b54bca03d92f..0d329ff8ed4c 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -977,9 +977,9 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
977/* Config file defaults */ 977/* Config file defaults */
978#define DEFAULT_TCP_PORT 21064 978#define DEFAULT_TCP_PORT 21064
979#define DEFAULT_BUFFER_SIZE 4096 979#define DEFAULT_BUFFER_SIZE 4096
980#define DEFAULT_RSBTBL_SIZE 256 980#define DEFAULT_RSBTBL_SIZE 1024
981#define DEFAULT_LKBTBL_SIZE 1024 981#define DEFAULT_LKBTBL_SIZE 1024
982#define DEFAULT_DIRTBL_SIZE 512 982#define DEFAULT_DIRTBL_SIZE 1024
983#define DEFAULT_RECOVER_TIMER 5 983#define DEFAULT_RECOVER_TIMER 5
984#define DEFAULT_TOSS_SECS 10 984#define DEFAULT_TOSS_SECS 10
985#define DEFAULT_SCAN_SECS 5 985#define DEFAULT_SCAN_SECS 5
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 6b42ba807dfd..59779237e2b4 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -257,12 +257,12 @@ static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
257 lkb->lkb_status, 257 lkb->lkb_status,
258 lkb->lkb_grmode, 258 lkb->lkb_grmode,
259 lkb->lkb_rqmode, 259 lkb->lkb_rqmode,
260 lkb->lkb_bastmode, 260 lkb->lkb_last_bast.mode,
261 rsb_lookup, 261 rsb_lookup,
262 lkb->lkb_wait_type, 262 lkb->lkb_wait_type,
263 lkb->lkb_lvbseq, 263 lkb->lkb_lvbseq,
264 (unsigned long long)ktime_to_ns(lkb->lkb_timestamp), 264 (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
265 (unsigned long long)ktime_to_ns(lkb->lkb_time_bast)); 265 (unsigned long long)ktime_to_ns(lkb->lkb_last_bast_time));
266 return rv; 266 return rv;
267} 267}
268 268
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index f632b58cd222..b94204913011 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -192,11 +192,6 @@ struct dlm_args {
192 * lkb is a process copy, the nodeid specifies the lock master. 192 * lkb is a process copy, the nodeid specifies the lock master.
193 */ 193 */
194 194
195/* lkb_ast_type */
196
197#define AST_COMP 1
198#define AST_BAST 2
199
200/* lkb_status */ 195/* lkb_status */
201 196
202#define DLM_LKSTS_WAITING 1 197#define DLM_LKSTS_WAITING 1
@@ -217,6 +212,20 @@ struct dlm_args {
217#define DLM_IFL_USER 0x00000001 212#define DLM_IFL_USER 0x00000001
218#define DLM_IFL_ORPHAN 0x00000002 213#define DLM_IFL_ORPHAN 0x00000002
219 214
215#define DLM_CALLBACKS_SIZE 6
216
217#define DLM_CB_CAST 0x00000001
218#define DLM_CB_BAST 0x00000002
219#define DLM_CB_SKIP 0x00000004
220
221struct dlm_callback {
222 uint64_t seq;
223 uint32_t flags; /* DLM_CBF_ */
224 int sb_status; /* copy to lksb status */
225 uint8_t sb_flags; /* copy to lksb flags */
226 int8_t mode; /* rq mode of bast, gr mode of cast */
227};
228
220struct dlm_lkb { 229struct dlm_lkb {
221 struct dlm_rsb *lkb_resource; /* the rsb */ 230 struct dlm_rsb *lkb_resource; /* the rsb */
222 struct kref lkb_ref; 231 struct kref lkb_ref;
@@ -236,13 +245,6 @@ struct dlm_lkb {
236 245
237 int8_t lkb_wait_type; /* type of reply waiting for */ 246 int8_t lkb_wait_type; /* type of reply waiting for */
238 int8_t lkb_wait_count; 247 int8_t lkb_wait_count;
239 int8_t lkb_ast_type; /* type of ast queued for */
240 int8_t lkb_ast_first; /* type of first ast queued */
241
242 int8_t lkb_bastmode; /* req mode of queued bast */
243 int8_t lkb_castmode; /* gr mode of queued cast */
244 int8_t lkb_bastmode_done; /* last delivered bastmode */
245 int8_t lkb_castmode_done; /* last delivered castmode */
246 248
247 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ 249 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
248 struct list_head lkb_statequeue; /* rsb g/c/w list */ 250 struct list_head lkb_statequeue; /* rsb g/c/w list */
@@ -251,10 +253,15 @@ struct dlm_lkb {
251 struct list_head lkb_astqueue; /* need ast to be sent */ 253 struct list_head lkb_astqueue; /* need ast to be sent */
252 struct list_head lkb_ownqueue; /* list of locks for a process */ 254 struct list_head lkb_ownqueue; /* list of locks for a process */
253 struct list_head lkb_time_list; 255 struct list_head lkb_time_list;
254 ktime_t lkb_time_bast; /* for debugging */
255 ktime_t lkb_timestamp; 256 ktime_t lkb_timestamp;
256 unsigned long lkb_timeout_cs; 257 unsigned long lkb_timeout_cs;
257 258
259 struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE];
260 struct dlm_callback lkb_last_cast;
261 struct dlm_callback lkb_last_bast;
262 ktime_t lkb_last_cast_time; /* for debugging */
263 ktime_t lkb_last_bast_time; /* for debugging */
264
258 char *lkb_lvbptr; 265 char *lkb_lvbptr;
259 struct dlm_lksb *lkb_lksb; /* caller's status block */ 266 struct dlm_lksb *lkb_lksb; /* caller's status block */
260 void (*lkb_astfn) (void *astparam); 267 void (*lkb_astfn) (void *astparam);
@@ -544,8 +551,6 @@ struct dlm_user_args {
544 (dlm_user_proc) on the struct file, 551 (dlm_user_proc) on the struct file,
545 the process's locks point back to it*/ 552 the process's locks point back to it*/
546 struct dlm_lksb lksb; 553 struct dlm_lksb lksb;
547 int old_mode;
548 int update_user_lvb;
549 struct dlm_lksb __user *user_lksb; 554 struct dlm_lksb __user *user_lksb;
550 void __user *castparam; 555 void __user *castparam;
551 void __user *castaddr; 556 void __user *castaddr;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 64e5f3efdd81..56d6bfcc1e48 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -160,10 +160,10 @@ static const int __quecvt_compat_matrix[8][8] = {
160void dlm_print_lkb(struct dlm_lkb *lkb) 160void dlm_print_lkb(struct dlm_lkb *lkb)
161{ 161{
162 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n" 162 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
163 " status %d rqmode %d grmode %d wait_type %d ast_type %d\n", 163 " status %d rqmode %d grmode %d wait_type %d\n",
164 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags, 164 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
165 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode, 165 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
166 lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type); 166 lkb->lkb_grmode, lkb->lkb_wait_type);
167} 167}
168 168
169static void dlm_print_rsb(struct dlm_rsb *r) 169static void dlm_print_rsb(struct dlm_rsb *r)
@@ -305,10 +305,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
305 rv = -EDEADLK; 305 rv = -EDEADLK;
306 } 306 }
307 307
308 lkb->lkb_lksb->sb_status = rv; 308 dlm_add_ast(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, lkb->lkb_sbflags);
309 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
310
311 dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode);
312} 309}
313 310
314static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) 311static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -319,13 +316,10 @@ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
319 316
320static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) 317static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
321{ 318{
322 lkb->lkb_time_bast = ktime_get();
323
324 if (is_master_copy(lkb)) { 319 if (is_master_copy(lkb)) {
325 lkb->lkb_bastmode = rqmode; /* printed by debugfs */
326 send_bast(r, lkb, rqmode); 320 send_bast(r, lkb, rqmode);
327 } else { 321 } else {
328 dlm_add_ast(lkb, AST_BAST, rqmode); 322 dlm_add_ast(lkb, DLM_CB_BAST, rqmode, 0, 0);
329 } 323 }
330} 324}
331 325
@@ -525,7 +519,7 @@ static void toss_rsb(struct kref *kref)
525 } 519 }
526} 520}
527 521
528/* When all references to the rsb are gone it's transfered to 522/* When all references to the rsb are gone it's transferred to
529 the tossed list for later disposal. */ 523 the tossed list for later disposal. */
530 524
531static void put_rsb(struct dlm_rsb *r) 525static void put_rsb(struct dlm_rsb *r)
@@ -600,6 +594,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
600 INIT_LIST_HEAD(&lkb->lkb_ownqueue); 594 INIT_LIST_HEAD(&lkb->lkb_ownqueue);
601 INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); 595 INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
602 INIT_LIST_HEAD(&lkb->lkb_time_list); 596 INIT_LIST_HEAD(&lkb->lkb_time_list);
597 INIT_LIST_HEAD(&lkb->lkb_astqueue);
603 598
604 get_random_bytes(&bucket, sizeof(bucket)); 599 get_random_bytes(&bucket, sizeof(bucket));
605 bucket &= (ls->ls_lkbtbl_size - 1); 600 bucket &= (ls->ls_lkbtbl_size - 1);
@@ -2819,9 +2814,9 @@ static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
2819 not from lkb fields */ 2814 not from lkb fields */
2820 2815
2821 if (lkb->lkb_bastfn) 2816 if (lkb->lkb_bastfn)
2822 ms->m_asts |= AST_BAST; 2817 ms->m_asts |= DLM_CB_BAST;
2823 if (lkb->lkb_astfn) 2818 if (lkb->lkb_astfn)
2824 ms->m_asts |= AST_COMP; 2819 ms->m_asts |= DLM_CB_CAST;
2825 2820
2826 /* compare with switch in create_message; send_remove() doesn't 2821 /* compare with switch in create_message; send_remove() doesn't
2827 use send_args() */ 2822 use send_args() */
@@ -3122,8 +3117,8 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3122 lkb->lkb_grmode = DLM_LOCK_IV; 3117 lkb->lkb_grmode = DLM_LOCK_IV;
3123 lkb->lkb_rqmode = ms->m_rqmode; 3118 lkb->lkb_rqmode = ms->m_rqmode;
3124 3119
3125 lkb->lkb_bastfn = (ms->m_asts & AST_BAST) ? &fake_bastfn : NULL; 3120 lkb->lkb_bastfn = (ms->m_asts & DLM_CB_BAST) ? &fake_bastfn : NULL;
3126 lkb->lkb_astfn = (ms->m_asts & AST_COMP) ? &fake_astfn : NULL; 3121 lkb->lkb_astfn = (ms->m_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
3127 3122
3128 if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 3123 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3129 /* lkb was just created so there won't be an lvb yet */ 3124 /* lkb was just created so there won't be an lvb yet */
@@ -4412,8 +4407,8 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
4412 lkb->lkb_grmode = rl->rl_grmode; 4407 lkb->lkb_grmode = rl->rl_grmode;
4413 /* don't set lkb_status because add_lkb wants to itself */ 4408 /* don't set lkb_status because add_lkb wants to itself */
4414 4409
4415 lkb->lkb_bastfn = (rl->rl_asts & AST_BAST) ? &fake_bastfn : NULL; 4410 lkb->lkb_bastfn = (rl->rl_asts & DLM_CB_BAST) ? &fake_bastfn : NULL;
4416 lkb->lkb_astfn = (rl->rl_asts & AST_COMP) ? &fake_astfn : NULL; 4411 lkb->lkb_astfn = (rl->rl_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
4417 4412
4418 if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 4413 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
4419 int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - 4414 int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
@@ -4589,7 +4584,6 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
4589 error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, 4584 error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs,
4590 fake_astfn, ua, fake_bastfn, &args); 4585 fake_astfn, ua, fake_bastfn, &args);
4591 lkb->lkb_flags |= DLM_IFL_USER; 4586 lkb->lkb_flags |= DLM_IFL_USER;
4592 ua->old_mode = DLM_LOCK_IV;
4593 4587
4594 if (error) { 4588 if (error) {
4595 __put_lkb(ls, lkb); 4589 __put_lkb(ls, lkb);
@@ -4658,7 +4652,6 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4658 ua->bastparam = ua_tmp->bastparam; 4652 ua->bastparam = ua_tmp->bastparam;
4659 ua->bastaddr = ua_tmp->bastaddr; 4653 ua->bastaddr = ua_tmp->bastaddr;
4660 ua->user_lksb = ua_tmp->user_lksb; 4654 ua->user_lksb = ua_tmp->user_lksb;
4661 ua->old_mode = lkb->lkb_grmode;
4662 4655
4663 error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs, 4656 error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs,
4664 fake_astfn, ua, fake_bastfn, &args); 4657 fake_astfn, ua, fake_bastfn, &args);
@@ -4917,8 +4910,9 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
4917 } 4910 }
4918 4911
4919 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { 4912 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
4920 lkb->lkb_ast_type = 0; 4913 memset(&lkb->lkb_callbacks, 0,
4921 list_del(&lkb->lkb_astqueue); 4914 sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
4915 list_del_init(&lkb->lkb_astqueue);
4922 dlm_put_lkb(lkb); 4916 dlm_put_lkb(lkb);
4923 } 4917 }
4924 4918
@@ -4958,7 +4952,9 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
4958 4952
4959 spin_lock(&proc->asts_spin); 4953 spin_lock(&proc->asts_spin);
4960 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { 4954 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
4961 list_del(&lkb->lkb_astqueue); 4955 memset(&lkb->lkb_callbacks, 0,
4956 sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
4957 list_del_init(&lkb->lkb_astqueue);
4962 dlm_put_lkb(lkb); 4958 dlm_put_lkb(lkb);
4963 } 4959 }
4964 spin_unlock(&proc->asts_spin); 4960 spin_unlock(&proc->asts_spin);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 9c64ae9e4c1a..5e2c71f05e46 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -810,7 +810,7 @@ static int tcp_accept_from_sock(struct connection *con)
810 810
811 /* 811 /*
812 * Add it to the active queue in case we got data 812 * Add it to the active queue in case we got data
813 * beween processing the accept adding the socket 813 * between processing the accept adding the socket
814 * to the read_sockets list 814 * to the read_sockets list
815 */ 815 */
816 if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) 816 if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
@@ -1468,15 +1468,15 @@ static void work_stop(void)
1468 1468
1469static int work_start(void) 1469static int work_start(void)
1470{ 1470{
1471 recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM | 1471 recv_workqueue = alloc_workqueue("dlm_recv",
1472 WQ_HIGHPRI | WQ_FREEZEABLE, 0); 1472 WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
1473 if (!recv_workqueue) { 1473 if (!recv_workqueue) {
1474 log_print("can't start dlm_recv"); 1474 log_print("can't start dlm_recv");
1475 return -ENOMEM; 1475 return -ENOMEM;
1476 } 1476 }
1477 1477
1478 send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM | 1478 send_workqueue = alloc_workqueue("dlm_send",
1479 WQ_HIGHPRI | WQ_FREEZEABLE, 0); 1479 WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
1480 if (!send_workqueue) { 1480 if (!send_workqueue) {
1481 log_print("can't start dlm_send"); 1481 log_print("can't start dlm_send");
1482 destroy_workqueue(recv_workqueue); 1482 destroy_workqueue(recv_workqueue);
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 3c83a49a48a3..f10a50f24e8f 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -321,9 +321,9 @@ static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
321 rl->rl_wait_type = cpu_to_le16(lkb->lkb_wait_type); 321 rl->rl_wait_type = cpu_to_le16(lkb->lkb_wait_type);
322 322
323 if (lkb->lkb_bastfn) 323 if (lkb->lkb_bastfn)
324 rl->rl_asts |= AST_BAST; 324 rl->rl_asts |= DLM_CB_BAST;
325 if (lkb->lkb_astfn) 325 if (lkb->lkb_astfn)
326 rl->rl_asts |= AST_COMP; 326 rl->rl_asts |= DLM_CB_CAST;
327 327
328 rl->rl_namelen = cpu_to_le16(r->res_length); 328 rl->rl_namelen = cpu_to_le16(r->res_length);
329 memcpy(rl->rl_name, r->res_name, r->res_length); 329 memcpy(rl->rl_name, r->res_name, r->res_length);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index eda43f362616..14638235f7b2 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -304,7 +304,7 @@ static void set_master_lkbs(struct dlm_rsb *r)
304} 304}
305 305
306/* 306/*
307 * Propogate the new master nodeid to locks 307 * Propagate the new master nodeid to locks
308 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider. 308 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
309 * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which 309 * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which
310 * rsb's to consider. 310 * rsb's to consider.
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 66d6c16bf440..d5ab3fe7c198 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -24,6 +24,7 @@
24#include "lock.h" 24#include "lock.h"
25#include "lvb_table.h" 25#include "lvb_table.h"
26#include "user.h" 26#include "user.h"
27#include "ast.h"
27 28
28static const char name_prefix[] = "dlm"; 29static const char name_prefix[] = "dlm";
29static const struct file_operations device_fops; 30static const struct file_operations device_fops;
@@ -152,19 +153,16 @@ static void compat_output(struct dlm_lock_result *res,
152 not related to the lifetime of the lkb struct which is managed 153 not related to the lifetime of the lkb struct which is managed
153 entirely by refcount. */ 154 entirely by refcount. */
154 155
155static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type) 156static int lkb_is_endoflife(int mode, int status)
156{ 157{
157 switch (sb_status) { 158 switch (status) {
158 case -DLM_EUNLOCK: 159 case -DLM_EUNLOCK:
159 return 1; 160 return 1;
160 case -DLM_ECANCEL: 161 case -DLM_ECANCEL:
161 case -ETIMEDOUT: 162 case -ETIMEDOUT:
162 case -EDEADLK: 163 case -EDEADLK:
163 if (lkb->lkb_grmode == DLM_LOCK_IV)
164 return 1;
165 break;
166 case -EAGAIN: 164 case -EAGAIN:
167 if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV) 165 if (mode == DLM_LOCK_IV)
168 return 1; 166 return 1;
169 break; 167 break;
170 } 168 }
@@ -174,12 +172,13 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
174/* we could possibly check if the cancel of an orphan has resulted in the lkb 172/* we could possibly check if the cancel of an orphan has resulted in the lkb
175 being removed and then remove that lkb from the orphans list and free it */ 173 being removed and then remove that lkb from the orphans list and free it */
176 174
177void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode) 175void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
176 int status, uint32_t sbflags, uint64_t seq)
178{ 177{
179 struct dlm_ls *ls; 178 struct dlm_ls *ls;
180 struct dlm_user_args *ua; 179 struct dlm_user_args *ua;
181 struct dlm_user_proc *proc; 180 struct dlm_user_proc *proc;
182 int eol = 0, ast_type; 181 int rv;
183 182
184 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) 183 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
185 return; 184 return;
@@ -200,49 +199,29 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
200 ua = lkb->lkb_ua; 199 ua = lkb->lkb_ua;
201 proc = ua->proc; 200 proc = ua->proc;
202 201
203 if (type == AST_BAST && ua->bastaddr == NULL) 202 if ((flags & DLM_CB_BAST) && ua->bastaddr == NULL)
204 goto out; 203 goto out;
205 204
205 if ((flags & DLM_CB_CAST) && lkb_is_endoflife(mode, status))
206 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
207
206 spin_lock(&proc->asts_spin); 208 spin_lock(&proc->asts_spin);
207 209
208 ast_type = lkb->lkb_ast_type; 210 rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq);
209 lkb->lkb_ast_type |= type; 211 if (rv < 0) {
210 if (type == AST_BAST) 212 spin_unlock(&proc->asts_spin);
211 lkb->lkb_bastmode = mode; 213 goto out;
212 else 214 }
213 lkb->lkb_castmode = mode;
214 215
215 if (!ast_type) { 216 if (list_empty(&lkb->lkb_astqueue)) {
216 kref_get(&lkb->lkb_ref); 217 kref_get(&lkb->lkb_ref);
217 list_add_tail(&lkb->lkb_astqueue, &proc->asts); 218 list_add_tail(&lkb->lkb_astqueue, &proc->asts);
218 lkb->lkb_ast_first = type;
219 wake_up_interruptible(&proc->wait); 219 wake_up_interruptible(&proc->wait);
220 } 220 }
221 if (type == AST_COMP && (ast_type & AST_COMP))
222 log_debug(ls, "ast overlap %x status %x %x",
223 lkb->lkb_id, ua->lksb.sb_status, lkb->lkb_flags);
224
225 eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
226 if (eol) {
227 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
228 }
229
230 /* We want to copy the lvb to userspace when the completion
231 ast is read if the status is 0, the lock has an lvb and
232 lvb_ops says we should. We could probably have set_lvb_lock()
233 set update_user_lvb instead and not need old_mode */
234
235 if ((lkb->lkb_ast_type & AST_COMP) &&
236 (lkb->lkb_lksb->sb_status == 0) &&
237 lkb->lkb_lksb->sb_lvbptr &&
238 dlm_lvb_operations[ua->old_mode + 1][lkb->lkb_grmode + 1])
239 ua->update_user_lvb = 1;
240 else
241 ua->update_user_lvb = 0;
242
243 spin_unlock(&proc->asts_spin); 221 spin_unlock(&proc->asts_spin);
244 222
245 if (eol) { 223 if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
224 /* N.B. spin_lock locks_spin, not asts_spin */
246 spin_lock(&proc->locks_spin); 225 spin_lock(&proc->locks_spin);
247 if (!list_empty(&lkb->lkb_ownqueue)) { 226 if (!list_empty(&lkb->lkb_ownqueue)) {
248 list_del_init(&lkb->lkb_ownqueue); 227 list_del_init(&lkb->lkb_ownqueue);
@@ -705,8 +684,9 @@ static int device_close(struct inode *inode, struct file *file)
705 return 0; 684 return 0;
706} 685}
707 686
708static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type, 687static int copy_result_to_user(struct dlm_user_args *ua, int compat,
709 int mode, char __user *buf, size_t count) 688 uint32_t flags, int mode, int copy_lvb,
689 char __user *buf, size_t count)
710{ 690{
711#ifdef CONFIG_COMPAT 691#ifdef CONFIG_COMPAT
712 struct dlm_lock_result32 result32; 692 struct dlm_lock_result32 result32;
@@ -730,7 +710,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
730 notes that a new blocking AST address and parameter are set even if 710 notes that a new blocking AST address and parameter are set even if
731 the conversion fails, so maybe we should just do that. */ 711 the conversion fails, so maybe we should just do that. */
732 712
733 if (type == AST_BAST) { 713 if (flags & DLM_CB_BAST) {
734 result.user_astaddr = ua->bastaddr; 714 result.user_astaddr = ua->bastaddr;
735 result.user_astparam = ua->bastparam; 715 result.user_astparam = ua->bastparam;
736 result.bast_mode = mode; 716 result.bast_mode = mode;
@@ -750,8 +730,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
750 /* copy lvb to userspace if there is one, it's been updated, and 730 /* copy lvb to userspace if there is one, it's been updated, and
751 the user buffer has space for it */ 731 the user buffer has space for it */
752 732
753 if (ua->update_user_lvb && ua->lksb.sb_lvbptr && 733 if (copy_lvb && ua->lksb.sb_lvbptr && count >= len + DLM_USER_LVB_LEN) {
754 count >= len + DLM_USER_LVB_LEN) {
755 if (copy_to_user(buf+len, ua->lksb.sb_lvbptr, 734 if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
756 DLM_USER_LVB_LEN)) { 735 DLM_USER_LVB_LEN)) {
757 error = -EFAULT; 736 error = -EFAULT;
@@ -801,13 +780,12 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
801 struct dlm_user_proc *proc = file->private_data; 780 struct dlm_user_proc *proc = file->private_data;
802 struct dlm_lkb *lkb; 781 struct dlm_lkb *lkb;
803 DECLARE_WAITQUEUE(wait, current); 782 DECLARE_WAITQUEUE(wait, current);
804 int error = 0, removed; 783 struct dlm_callback cb;
805 int ret_type, ret_mode; 784 int rv, resid, copy_lvb = 0;
806 int bastmode, castmode, do_bast, do_cast;
807 785
808 if (count == sizeof(struct dlm_device_version)) { 786 if (count == sizeof(struct dlm_device_version)) {
809 error = copy_version_to_user(buf, count); 787 rv = copy_version_to_user(buf, count);
810 return error; 788 return rv;
811 } 789 }
812 790
813 if (!proc) { 791 if (!proc) {
@@ -854,92 +832,57 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
854 } 832 }
855 } 833 }
856 834
857 /* there may be both completion and blocking asts to return for 835 /* if we empty lkb_callbacks, we don't want to unlock the spinlock
858 the lkb, don't remove lkb from asts list unless no asts remain */ 836 without removing lkb_astqueue; so empty lkb_astqueue is always
837 consistent with empty lkb_callbacks */
859 838
860 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue); 839 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
861 840
862 removed = 0; 841 rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid);
863 ret_type = 0; 842 if (rv < 0) {
864 ret_mode = 0; 843 /* this shouldn't happen; lkb should have been removed from
865 do_bast = lkb->lkb_ast_type & AST_BAST; 844 list when resid was zero */
866 do_cast = lkb->lkb_ast_type & AST_COMP; 845 log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id);
867 bastmode = lkb->lkb_bastmode; 846 list_del_init(&lkb->lkb_astqueue);
868 castmode = lkb->lkb_castmode; 847 spin_unlock(&proc->asts_spin);
869 848 /* removes ref for proc->asts, may cause lkb to be freed */
870 /* when both are queued figure out which to do first and 849 dlm_put_lkb(lkb);
871 switch first so the other goes in the next read */ 850 goto try_another;
872
873 if (do_cast && do_bast) {
874 if (lkb->lkb_ast_first == AST_COMP) {
875 ret_type = AST_COMP;
876 ret_mode = castmode;
877 lkb->lkb_ast_type &= ~AST_COMP;
878 lkb->lkb_ast_first = AST_BAST;
879 } else {
880 ret_type = AST_BAST;
881 ret_mode = bastmode;
882 lkb->lkb_ast_type &= ~AST_BAST;
883 lkb->lkb_ast_first = AST_COMP;
884 }
885 } else {
886 ret_type = lkb->lkb_ast_first;
887 ret_mode = (ret_type == AST_COMP) ? castmode : bastmode;
888 lkb->lkb_ast_type &= ~ret_type;
889 lkb->lkb_ast_first = 0;
890 } 851 }
852 if (!resid)
853 list_del_init(&lkb->lkb_astqueue);
854 spin_unlock(&proc->asts_spin);
891 855
892 /* if we're doing a bast but the bast is unnecessary, then 856 if (cb.flags & DLM_CB_SKIP) {
893 switch to do nothing or do a cast if that was needed next */ 857 /* removes ref for proc->asts, may cause lkb to be freed */
894 858 if (!resid)
895 if ((ret_type == AST_BAST) && 859 dlm_put_lkb(lkb);
896 dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) { 860 goto try_another;
897 ret_type = 0;
898 ret_mode = 0;
899
900 if (do_cast) {
901 ret_type = AST_COMP;
902 ret_mode = castmode;
903 lkb->lkb_ast_type &= ~AST_COMP;
904 lkb->lkb_ast_first = 0;
905 }
906 } 861 }
907 862
908 if (lkb->lkb_ast_first != lkb->lkb_ast_type) { 863 if (cb.flags & DLM_CB_CAST) {
909 log_print("device_read %x ast_first %x ast_type %x", 864 int old_mode, new_mode;
910 lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type);
911 }
912 865
913 if (!lkb->lkb_ast_type) { 866 old_mode = lkb->lkb_last_cast.mode;
914 list_del(&lkb->lkb_astqueue); 867 new_mode = cb.mode;
915 removed = 1;
916 }
917 spin_unlock(&proc->asts_spin);
918 868
919 if (ret_type) { 869 if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr &&
920 error = copy_result_to_user(lkb->lkb_ua, 870 dlm_lvb_operations[old_mode + 1][new_mode + 1])
921 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), 871 copy_lvb = 1;
922 ret_type, ret_mode, buf, count);
923 872
924 if (ret_type == AST_COMP) 873 lkb->lkb_lksb->sb_status = cb.sb_status;
925 lkb->lkb_castmode_done = castmode; 874 lkb->lkb_lksb->sb_flags = cb.sb_flags;
926 if (ret_type == AST_BAST)
927 lkb->lkb_bastmode_done = bastmode;
928 } 875 }
929 876
930 /* removes reference for the proc->asts lists added by 877 rv = copy_result_to_user(lkb->lkb_ua,
931 dlm_user_add_ast() and may result in the lkb being freed */ 878 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
879 cb.flags, cb.mode, copy_lvb, buf, count);
932 880
933 if (removed) 881 /* removes ref for proc->asts, may cause lkb to be freed */
882 if (!resid)
934 dlm_put_lkb(lkb); 883 dlm_put_lkb(lkb);
935 884
936 /* the bast that was queued was eliminated (see unnecessary above), 885 return rv;
937 leaving nothing to return */
938
939 if (!ret_type)
940 goto try_another;
941
942 return error;
943} 886}
944 887
945static unsigned int device_poll(struct file *file, poll_table *wait) 888static unsigned int device_poll(struct file *file, poll_table *wait)
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index f196091dd7ff..00499ab8835f 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -9,7 +9,8 @@
9#ifndef __USER_DOT_H__ 9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__ 10#define __USER_DOT_H__
11 11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode); 12void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
13 int status, uint32_t sbflags, uint64_t seq);
13int dlm_user_init(void); 14int dlm_user_init(void);
14void dlm_user_exit(void); 15void dlm_user_exit(void);
15int dlm_device_deregister(struct dlm_ls *ls); 16int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 2195c213ab2f..98b77c89494c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -8,6 +8,7 @@
8#include <linux/writeback.h> 8#include <linux/writeback.h>
9#include <linux/sysctl.h> 9#include <linux/sysctl.h>
10#include <linux/gfp.h> 10#include <linux/gfp.h>
11#include "internal.h"
11 12
12/* A global variable is a bit ugly, but it keeps the code simple */ 13/* A global variable is a bit ugly, but it keeps the code simple */
13int sysctl_drop_caches; 14int sysctl_drop_caches;
@@ -16,20 +17,23 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
16{ 17{
17 struct inode *inode, *toput_inode = NULL; 18 struct inode *inode, *toput_inode = NULL;
18 19
19 spin_lock(&inode_lock); 20 spin_lock(&inode_sb_list_lock);
20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 21 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 22 spin_lock(&inode->i_lock);
22 continue; 23 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
23 if (inode->i_mapping->nrpages == 0) 24 (inode->i_mapping->nrpages == 0)) {
25 spin_unlock(&inode->i_lock);
24 continue; 26 continue;
27 }
25 __iget(inode); 28 __iget(inode);
26 spin_unlock(&inode_lock); 29 spin_unlock(&inode->i_lock);
30 spin_unlock(&inode_sb_list_lock);
27 invalidate_mapping_pages(inode->i_mapping, 0, -1); 31 invalidate_mapping_pages(inode->i_mapping, 0, -1);
28 iput(toput_inode); 32 iput(toput_inode);
29 toput_inode = inode; 33 toput_inode = inode;
30 spin_lock(&inode_lock); 34 spin_lock(&inode_sb_list_lock);
31 } 35 }
32 spin_unlock(&inode_lock); 36 spin_unlock(&inode_sb_list_lock);
33 iput(toput_inode); 37 iput(toput_inode);
34} 38}
35 39
@@ -45,7 +49,11 @@ static void drop_slab(void)
45int drop_caches_sysctl_handler(ctl_table *table, int write, 49int drop_caches_sysctl_handler(ctl_table *table, int write,
46 void __user *buffer, size_t *length, loff_t *ppos) 50 void __user *buffer, size_t *length, loff_t *ppos)
47{ 51{
48 proc_dointvec_minmax(table, write, buffer, length, ppos); 52 int ret;
53
54 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
55 if (ret)
56 return ret;
49 if (write) { 57 if (write) {
50 if (sysctl_drop_caches & 1) 58 if (sysctl_drop_caches & 1)
51 iterate_supers(drop_pagecache_sb, NULL); 59 iterate_supers(drop_pagecache_sb, NULL);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index bfd8b680e648..d2a70a4561f9 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -266,7 +266,6 @@ void ecryptfs_destroy_mount_crypt_stat(
266 &mount_crypt_stat->global_auth_tok_list, 266 &mount_crypt_stat->global_auth_tok_list,
267 mount_crypt_stat_list) { 267 mount_crypt_stat_list) {
268 list_del(&auth_tok->mount_crypt_stat_list); 268 list_del(&auth_tok->mount_crypt_stat_list);
269 mount_crypt_stat->num_global_auth_toks--;
270 if (auth_tok->global_auth_tok_key 269 if (auth_tok->global_auth_tok_key
271 && !(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID)) 270 && !(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID))
272 key_put(auth_tok->global_auth_tok_key); 271 key_put(auth_tok->global_auth_tok_key);
@@ -1389,6 +1388,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
1389 rc = -ENOMEM; 1388 rc = -ENOMEM;
1390 goto out; 1389 goto out;
1391 } 1390 }
1391 /* Zeroed page ensures the in-header unencrypted i_size is set to 0 */
1392 rc = ecryptfs_write_headers_virt(virt, virt_len, &size, crypt_stat, 1392 rc = ecryptfs_write_headers_virt(virt, virt_len, &size, crypt_stat,
1393 ecryptfs_dentry); 1393 ecryptfs_dentry);
1394 if (unlikely(rc)) { 1394 if (unlikely(rc)) {
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 6fc4f319b550..534c1d46e69e 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -46,24 +46,28 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
46{ 46{
47 struct dentry *lower_dentry; 47 struct dentry *lower_dentry;
48 struct vfsmount *lower_mnt; 48 struct vfsmount *lower_mnt;
49 struct dentry *dentry_save; 49 struct dentry *dentry_save = NULL;
50 struct vfsmount *vfsmount_save; 50 struct vfsmount *vfsmount_save = NULL;
51 int rc = 1; 51 int rc = 1;
52 52
53 if (nd->flags & LOOKUP_RCU) 53 if (nd && nd->flags & LOOKUP_RCU)
54 return -ECHILD; 54 return -ECHILD;
55 55
56 lower_dentry = ecryptfs_dentry_to_lower(dentry); 56 lower_dentry = ecryptfs_dentry_to_lower(dentry);
57 lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); 57 lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
58 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) 58 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
59 goto out; 59 goto out;
60 dentry_save = nd->path.dentry; 60 if (nd) {
61 vfsmount_save = nd->path.mnt; 61 dentry_save = nd->path.dentry;
62 nd->path.dentry = lower_dentry; 62 vfsmount_save = nd->path.mnt;
63 nd->path.mnt = lower_mnt; 63 nd->path.dentry = lower_dentry;
64 nd->path.mnt = lower_mnt;
65 }
64 rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd); 66 rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd);
65 nd->path.dentry = dentry_save; 67 if (nd) {
66 nd->path.mnt = vfsmount_save; 68 nd->path.dentry = dentry_save;
69 nd->path.mnt = vfsmount_save;
70 }
67 if (dentry->d_inode) { 71 if (dentry->d_inode) {
68 struct inode *lower_inode = 72 struct inode *lower_inode =
69 ecryptfs_inode_to_lower(dentry->d_inode); 73 ecryptfs_inode_to_lower(dentry->d_inode);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index dbc84ed96336..bd3cafd0949d 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -233,7 +233,7 @@ ecryptfs_get_key_payload_data(struct key *key)
233 233
234struct ecryptfs_key_sig { 234struct ecryptfs_key_sig {
235 struct list_head crypt_stat_list; 235 struct list_head crypt_stat_list;
236 char keysig[ECRYPTFS_SIG_SIZE_HEX]; 236 char keysig[ECRYPTFS_SIG_SIZE_HEX + 1];
237}; 237};
238 238
239struct ecryptfs_filename { 239struct ecryptfs_filename {
@@ -257,19 +257,18 @@ struct ecryptfs_filename {
257struct ecryptfs_crypt_stat { 257struct ecryptfs_crypt_stat {
258#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 258#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001
259#define ECRYPTFS_POLICY_APPLIED 0x00000002 259#define ECRYPTFS_POLICY_APPLIED 0x00000002
260#define ECRYPTFS_NEW_FILE 0x00000004 260#define ECRYPTFS_ENCRYPTED 0x00000004
261#define ECRYPTFS_ENCRYPTED 0x00000008 261#define ECRYPTFS_SECURITY_WARNING 0x00000008
262#define ECRYPTFS_SECURITY_WARNING 0x00000010 262#define ECRYPTFS_ENABLE_HMAC 0x00000010
263#define ECRYPTFS_ENABLE_HMAC 0x00000020 263#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000020
264#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 264#define ECRYPTFS_KEY_VALID 0x00000040
265#define ECRYPTFS_KEY_VALID 0x00000080 265#define ECRYPTFS_METADATA_IN_XATTR 0x00000080
266#define ECRYPTFS_METADATA_IN_XATTR 0x00000100 266#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000100
267#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 267#define ECRYPTFS_KEY_SET 0x00000200
268#define ECRYPTFS_KEY_SET 0x00000400 268#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000400
269#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800 269#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00000800
270#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000 270#define ECRYPTFS_ENCFN_USE_FEK 0x00001000
271#define ECRYPTFS_ENCFN_USE_FEK 0x00002000 271#define ECRYPTFS_UNLINK_SIGS 0x00002000
272#define ECRYPTFS_UNLINK_SIGS 0x00004000
273 u32 flags; 272 u32 flags;
274 unsigned int file_version; 273 unsigned int file_version;
275 size_t iv_bytes; 274 size_t iv_bytes;
@@ -297,7 +296,6 @@ struct ecryptfs_inode_info {
297 struct inode vfs_inode; 296 struct inode vfs_inode;
298 struct inode *wii_inode; 297 struct inode *wii_inode;
299 struct file *lower_file; 298 struct file *lower_file;
300 struct mutex lower_file_mutex;
301 struct ecryptfs_crypt_stat crypt_stat; 299 struct ecryptfs_crypt_stat crypt_stat;
302}; 300};
303 301
@@ -333,7 +331,6 @@ struct ecryptfs_global_auth_tok {
333 u32 flags; 331 u32 flags;
334 struct list_head mount_crypt_stat_list; 332 struct list_head mount_crypt_stat_list;
335 struct key *global_auth_tok_key; 333 struct key *global_auth_tok_key;
336 struct ecryptfs_auth_tok *global_auth_tok;
337 unsigned char sig[ECRYPTFS_SIG_SIZE_HEX + 1]; 334 unsigned char sig[ECRYPTFS_SIG_SIZE_HEX + 1];
338}; 335};
339 336
@@ -380,7 +377,6 @@ struct ecryptfs_mount_crypt_stat {
380 u32 flags; 377 u32 flags;
381 struct list_head global_auth_tok_list; 378 struct list_head global_auth_tok_list;
382 struct mutex global_auth_tok_list_mutex; 379 struct mutex global_auth_tok_list_mutex;
383 size_t num_global_auth_toks;
384 size_t global_default_cipher_key_size; 380 size_t global_default_cipher_key_size;
385 size_t global_default_fn_cipher_key_bytes; 381 size_t global_default_fn_cipher_key_bytes;
386 unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE 382 unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE
@@ -632,8 +628,7 @@ int ecryptfs_interpose(struct dentry *hidden_dentry,
632 u32 flags); 628 u32 flags);
633int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, 629int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
634 struct dentry *lower_dentry, 630 struct dentry *lower_dentry,
635 struct inode *ecryptfs_dir_inode, 631 struct inode *ecryptfs_dir_inode);
636 struct nameidata *ecryptfs_nd);
637int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, 632int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
638 size_t *decrypted_name_size, 633 size_t *decrypted_name_size,
639 struct dentry *ecryptfs_dentry, 634 struct dentry *ecryptfs_dentry,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 81e10e6a9443..cedc913d11ba 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -273,7 +273,14 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
273static int 273static int
274ecryptfs_fsync(struct file *file, int datasync) 274ecryptfs_fsync(struct file *file, int datasync)
275{ 275{
276 return vfs_fsync(ecryptfs_file_to_lower(file), datasync); 276 int rc = 0;
277
278 rc = generic_file_fsync(file, datasync);
279 if (rc)
280 goto out;
281 rc = vfs_fsync(ecryptfs_file_to_lower(file), datasync);
282out:
283 return rc;
277} 284}
278 285
279static int ecryptfs_fasync(int fd, struct file *file, int flag) 286static int ecryptfs_fasync(int fd, struct file *file, int flag)
@@ -317,6 +324,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
317 324
318const struct file_operations ecryptfs_dir_fops = { 325const struct file_operations ecryptfs_dir_fops = {
319 .readdir = ecryptfs_readdir, 326 .readdir = ecryptfs_readdir,
327 .read = generic_read_dir,
320 .unlocked_ioctl = ecryptfs_unlocked_ioctl, 328 .unlocked_ioctl = ecryptfs_unlocked_ioctl,
321#ifdef CONFIG_COMPAT 329#ifdef CONFIG_COMPAT
322 .compat_ioctl = ecryptfs_compat_ioctl, 330 .compat_ioctl = ecryptfs_compat_ioctl,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index bd33f87a1907..f99051b7adab 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -74,16 +74,20 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
74 unsigned int flags_save; 74 unsigned int flags_save;
75 int rc; 75 int rc;
76 76
77 dentry_save = nd->path.dentry; 77 if (nd) {
78 vfsmount_save = nd->path.mnt; 78 dentry_save = nd->path.dentry;
79 flags_save = nd->flags; 79 vfsmount_save = nd->path.mnt;
80 nd->path.dentry = lower_dentry; 80 flags_save = nd->flags;
81 nd->path.mnt = lower_mnt; 81 nd->path.dentry = lower_dentry;
82 nd->flags &= ~LOOKUP_OPEN; 82 nd->path.mnt = lower_mnt;
83 nd->flags &= ~LOOKUP_OPEN;
84 }
83 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd); 85 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
84 nd->path.dentry = dentry_save; 86 if (nd) {
85 nd->path.mnt = vfsmount_save; 87 nd->path.dentry = dentry_save;
86 nd->flags = flags_save; 88 nd->path.mnt = vfsmount_save;
89 nd->flags = flags_save;
90 }
87 return rc; 91 return rc;
88} 92}
89 93
@@ -139,26 +143,6 @@ out:
139} 143}
140 144
141/** 145/**
142 * grow_file
143 * @ecryptfs_dentry: the eCryptfs dentry
144 *
145 * This is the code which will grow the file to its correct size.
146 */
147static int grow_file(struct dentry *ecryptfs_dentry)
148{
149 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
150 char zero_virt[] = { 0x00 };
151 int rc = 0;
152
153 rc = ecryptfs_write(ecryptfs_inode, zero_virt, 0, 1);
154 i_size_write(ecryptfs_inode, 0);
155 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
156 ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |=
157 ECRYPTFS_NEW_FILE;
158 return rc;
159}
160
161/**
162 * ecryptfs_initialize_file 146 * ecryptfs_initialize_file
163 * 147 *
164 * Cause the file to be changed from a basic empty file to an ecryptfs 148 * Cause the file to be changed from a basic empty file to an ecryptfs
@@ -177,7 +161,6 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
177 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); 161 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
178 goto out; 162 goto out;
179 } 163 }
180 crypt_stat->flags |= ECRYPTFS_NEW_FILE;
181 ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); 164 ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n");
182 rc = ecryptfs_new_file_context(ecryptfs_dentry); 165 rc = ecryptfs_new_file_context(ecryptfs_dentry);
183 if (rc) { 166 if (rc) {
@@ -198,9 +181,6 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
198 printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); 181 printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc);
199 goto out; 182 goto out;
200 } 183 }
201 rc = grow_file(ecryptfs_dentry);
202 if (rc)
203 printk(KERN_ERR "Error growing file; rc = [%d]\n", rc);
204out: 184out:
205 return rc; 185 return rc;
206} 186}
@@ -241,8 +221,7 @@ out:
241 */ 221 */
242int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, 222int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
243 struct dentry *lower_dentry, 223 struct dentry *lower_dentry,
244 struct inode *ecryptfs_dir_inode, 224 struct inode *ecryptfs_dir_inode)
245 struct nameidata *ecryptfs_nd)
246{ 225{
247 struct dentry *lower_dir_dentry; 226 struct dentry *lower_dir_dentry;
248 struct vfsmount *lower_mnt; 227 struct vfsmount *lower_mnt;
@@ -290,8 +269,6 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
290 goto out; 269 goto out;
291 if (special_file(lower_inode->i_mode)) 270 if (special_file(lower_inode->i_mode))
292 goto out; 271 goto out;
293 if (!ecryptfs_nd)
294 goto out;
295 /* Released in this function */ 272 /* Released in this function */
296 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER); 273 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
297 if (!page_virt) { 274 if (!page_virt) {
@@ -349,75 +326,6 @@ out:
349} 326}
350 327
351/** 328/**
352 * ecryptfs_new_lower_dentry
353 * @name: The name of the new dentry.
354 * @lower_dir_dentry: Parent directory of the new dentry.
355 * @nd: nameidata from last lookup.
356 *
357 * Create a new dentry or get it from lower parent dir.
358 */
359static struct dentry *
360ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry,
361 struct nameidata *nd)
362{
363 struct dentry *new_dentry;
364 struct dentry *tmp;
365 struct inode *lower_dir_inode;
366
367 lower_dir_inode = lower_dir_dentry->d_inode;
368
369 tmp = d_alloc(lower_dir_dentry, name);
370 if (!tmp)
371 return ERR_PTR(-ENOMEM);
372
373 mutex_lock(&lower_dir_inode->i_mutex);
374 new_dentry = lower_dir_inode->i_op->lookup(lower_dir_inode, tmp, nd);
375 mutex_unlock(&lower_dir_inode->i_mutex);
376
377 if (!new_dentry)
378 new_dentry = tmp;
379 else
380 dput(tmp);
381
382 return new_dentry;
383}
384
385
386/**
387 * ecryptfs_lookup_one_lower
388 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
389 * @lower_dir_dentry: lower parent directory
390 * @name: lower file name
391 *
392 * Get the lower dentry from vfs. If lower dentry does not exist yet,
393 * create it.
394 */
395static struct dentry *
396ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry,
397 struct dentry *lower_dir_dentry, struct qstr *name)
398{
399 struct nameidata nd;
400 struct vfsmount *lower_mnt;
401 int err;
402
403 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
404 ecryptfs_dentry->d_parent));
405 err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd);
406 mntput(lower_mnt);
407
408 if (!err) {
409 /* we dont need the mount */
410 mntput(nd.path.mnt);
411 return nd.path.dentry;
412 }
413 if (err != -ENOENT)
414 return ERR_PTR(err);
415
416 /* create a new lower dentry */
417 return ecryptfs_new_lower_dentry(name, lower_dir_dentry, &nd);
418}
419
420/**
421 * ecryptfs_lookup 329 * ecryptfs_lookup
422 * @ecryptfs_dir_inode: The eCryptfs directory inode 330 * @ecryptfs_dir_inode: The eCryptfs directory inode
423 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up 331 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
@@ -434,7 +342,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
434 size_t encrypted_and_encoded_name_size; 342 size_t encrypted_and_encoded_name_size;
435 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; 343 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
436 struct dentry *lower_dir_dentry, *lower_dentry; 344 struct dentry *lower_dir_dentry, *lower_dentry;
437 struct qstr lower_name;
438 int rc = 0; 345 int rc = 0;
439 346
440 if ((ecryptfs_dentry->d_name.len == 1 347 if ((ecryptfs_dentry->d_name.len == 1
@@ -444,20 +351,14 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
444 goto out_d_drop; 351 goto out_d_drop;
445 } 352 }
446 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); 353 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
447 lower_name.name = ecryptfs_dentry->d_name.name; 354 mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
448 lower_name.len = ecryptfs_dentry->d_name.len; 355 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
449 lower_name.hash = ecryptfs_dentry->d_name.hash; 356 lower_dir_dentry,
450 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { 357 ecryptfs_dentry->d_name.len);
451 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, 358 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
452 lower_dir_dentry->d_inode, &lower_name);
453 if (rc < 0)
454 goto out_d_drop;
455 }
456 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
457 lower_dir_dentry, &lower_name);
458 if (IS_ERR(lower_dentry)) { 359 if (IS_ERR(lower_dentry)) {
459 rc = PTR_ERR(lower_dentry); 360 rc = PTR_ERR(lower_dentry);
460 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " 361 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
461 "[%d] on lower_dentry = [%s]\n", __func__, rc, 362 "[%d] on lower_dentry = [%s]\n", __func__, rc,
462 encrypted_and_encoded_name); 363 encrypted_and_encoded_name);
463 goto out_d_drop; 364 goto out_d_drop;
@@ -479,28 +380,21 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
479 "filename; rc = [%d]\n", __func__, rc); 380 "filename; rc = [%d]\n", __func__, rc);
480 goto out_d_drop; 381 goto out_d_drop;
481 } 382 }
482 lower_name.name = encrypted_and_encoded_name; 383 mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
483 lower_name.len = encrypted_and_encoded_name_size; 384 lower_dentry = lookup_one_len(encrypted_and_encoded_name,
484 lower_name.hash = full_name_hash(lower_name.name, lower_name.len); 385 lower_dir_dentry,
485 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { 386 encrypted_and_encoded_name_size);
486 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, 387 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
487 lower_dir_dentry->d_inode, &lower_name);
488 if (rc < 0)
489 goto out_d_drop;
490 }
491 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
492 lower_dir_dentry, &lower_name);
493 if (IS_ERR(lower_dentry)) { 388 if (IS_ERR(lower_dentry)) {
494 rc = PTR_ERR(lower_dentry); 389 rc = PTR_ERR(lower_dentry);
495 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " 390 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
496 "[%d] on lower_dentry = [%s]\n", __func__, rc, 391 "[%d] on lower_dentry = [%s]\n", __func__, rc,
497 encrypted_and_encoded_name); 392 encrypted_and_encoded_name);
498 goto out_d_drop; 393 goto out_d_drop;
499 } 394 }
500lookup_and_interpose: 395lookup_and_interpose:
501 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, 396 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
502 ecryptfs_dir_inode, 397 ecryptfs_dir_inode);
503 ecryptfs_nd);
504 goto out; 398 goto out;
505out_d_drop: 399out_d_drop:
506 d_drop(ecryptfs_dentry); 400 d_drop(ecryptfs_dentry);
@@ -1092,6 +986,8 @@ int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1092 rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry), 986 rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry),
1093 ecryptfs_dentry_to_lower(dentry), &lower_stat); 987 ecryptfs_dentry_to_lower(dentry), &lower_stat);
1094 if (!rc) { 988 if (!rc) {
989 fsstack_copy_attr_all(dentry->d_inode,
990 ecryptfs_inode_to_lower(dentry->d_inode));
1095 generic_fillattr(dentry->d_inode, stat); 991 generic_fillattr(dentry->d_inode, stat);
1096 stat->blocks = lower_stat.blocks; 992 stat->blocks = lower_stat.blocks;
1097 } 993 }
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index c1436cff6f2d..03e609c45012 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -65,6 +65,24 @@ static int process_request_key_err(long err_code)
65 return rc; 65 return rc;
66} 66}
67 67
68static int process_find_global_auth_tok_for_sig_err(int err_code)
69{
70 int rc = err_code;
71
72 switch (err_code) {
73 case -ENOENT:
74 ecryptfs_printk(KERN_WARNING, "Missing auth tok\n");
75 break;
76 case -EINVAL:
77 ecryptfs_printk(KERN_WARNING, "Invalid auth tok\n");
78 break;
79 default:
80 rc = process_request_key_err(err_code);
81 break;
82 }
83 return rc;
84}
85
68/** 86/**
69 * ecryptfs_parse_packet_length 87 * ecryptfs_parse_packet_length
70 * @data: Pointer to memory containing length at offset 88 * @data: Pointer to memory containing length at offset
@@ -403,27 +421,120 @@ out:
403 return rc; 421 return rc;
404} 422}
405 423
424/**
425 * ecryptfs_verify_version
426 * @version: The version number to confirm
427 *
428 * Returns zero on good version; non-zero otherwise
429 */
430static int ecryptfs_verify_version(u16 version)
431{
432 int rc = 0;
433 unsigned char major;
434 unsigned char minor;
435
436 major = ((version >> 8) & 0xFF);
437 minor = (version & 0xFF);
438 if (major != ECRYPTFS_VERSION_MAJOR) {
439 ecryptfs_printk(KERN_ERR, "Major version number mismatch. "
440 "Expected [%d]; got [%d]\n",
441 ECRYPTFS_VERSION_MAJOR, major);
442 rc = -EINVAL;
443 goto out;
444 }
445 if (minor != ECRYPTFS_VERSION_MINOR) {
446 ecryptfs_printk(KERN_ERR, "Minor version number mismatch. "
447 "Expected [%d]; got [%d]\n",
448 ECRYPTFS_VERSION_MINOR, minor);
449 rc = -EINVAL;
450 goto out;
451 }
452out:
453 return rc;
454}
455
456/**
457 * ecryptfs_verify_auth_tok_from_key
458 * @auth_tok_key: key containing the authentication token
459 * @auth_tok: authentication token
460 *
461 * Returns zero on valid auth tok; -EINVAL otherwise
462 */
463static int
464ecryptfs_verify_auth_tok_from_key(struct key *auth_tok_key,
465 struct ecryptfs_auth_tok **auth_tok)
466{
467 int rc = 0;
468
469 (*auth_tok) = ecryptfs_get_key_payload_data(auth_tok_key);
470 if (ecryptfs_verify_version((*auth_tok)->version)) {
471 printk(KERN_ERR "Data structure version mismatch. Userspace "
472 "tools must match eCryptfs kernel module with major "
473 "version [%d] and minor version [%d]\n",
474 ECRYPTFS_VERSION_MAJOR, ECRYPTFS_VERSION_MINOR);
475 rc = -EINVAL;
476 goto out;
477 }
478 if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD
479 && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) {
480 printk(KERN_ERR "Invalid auth_tok structure "
481 "returned from key query\n");
482 rc = -EINVAL;
483 goto out;
484 }
485out:
486 return rc;
487}
488
406static int 489static int
407ecryptfs_find_global_auth_tok_for_sig( 490ecryptfs_find_global_auth_tok_for_sig(
408 struct ecryptfs_global_auth_tok **global_auth_tok, 491 struct key **auth_tok_key,
492 struct ecryptfs_auth_tok **auth_tok,
409 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) 493 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
410{ 494{
411 struct ecryptfs_global_auth_tok *walker; 495 struct ecryptfs_global_auth_tok *walker;
412 int rc = 0; 496 int rc = 0;
413 497
414 (*global_auth_tok) = NULL; 498 (*auth_tok_key) = NULL;
499 (*auth_tok) = NULL;
415 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); 500 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
416 list_for_each_entry(walker, 501 list_for_each_entry(walker,
417 &mount_crypt_stat->global_auth_tok_list, 502 &mount_crypt_stat->global_auth_tok_list,
418 mount_crypt_stat_list) { 503 mount_crypt_stat_list) {
419 if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) { 504 if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX))
420 rc = key_validate(walker->global_auth_tok_key); 505 continue;
421 if (!rc) 506
422 (*global_auth_tok) = walker; 507 if (walker->flags & ECRYPTFS_AUTH_TOK_INVALID) {
508 rc = -EINVAL;
423 goto out; 509 goto out;
424 } 510 }
511
512 rc = key_validate(walker->global_auth_tok_key);
513 if (rc) {
514 if (rc == -EKEYEXPIRED)
515 goto out;
516 goto out_invalid_auth_tok;
517 }
518
519 down_write(&(walker->global_auth_tok_key->sem));
520 rc = ecryptfs_verify_auth_tok_from_key(
521 walker->global_auth_tok_key, auth_tok);
522 if (rc)
523 goto out_invalid_auth_tok_unlock;
524
525 (*auth_tok_key) = walker->global_auth_tok_key;
526 key_get(*auth_tok_key);
527 goto out;
425 } 528 }
426 rc = -EINVAL; 529 rc = -ENOENT;
530 goto out;
531out_invalid_auth_tok_unlock:
532 up_write(&(walker->global_auth_tok_key->sem));
533out_invalid_auth_tok:
534 printk(KERN_WARNING "Invalidating auth tok with sig = [%s]\n", sig);
535 walker->flags |= ECRYPTFS_AUTH_TOK_INVALID;
536 key_put(walker->global_auth_tok_key);
537 walker->global_auth_tok_key = NULL;
427out: 538out:
428 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); 539 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
429 return rc; 540 return rc;
@@ -451,14 +562,11 @@ ecryptfs_find_auth_tok_for_sig(
451 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, 562 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
452 char *sig) 563 char *sig)
453{ 564{
454 struct ecryptfs_global_auth_tok *global_auth_tok;
455 int rc = 0; 565 int rc = 0;
456 566
457 (*auth_tok_key) = NULL; 567 rc = ecryptfs_find_global_auth_tok_for_sig(auth_tok_key, auth_tok,
458 (*auth_tok) = NULL; 568 mount_crypt_stat, sig);
459 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, 569 if (rc == -ENOENT) {
460 mount_crypt_stat, sig)) {
461
462 /* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the 570 /* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the
463 * mount_crypt_stat structure, we prevent to use auth toks that 571 * mount_crypt_stat structure, we prevent to use auth toks that
464 * are not inserted through the ecryptfs_add_global_auth_tok 572 * are not inserted through the ecryptfs_add_global_auth_tok
@@ -470,8 +578,7 @@ ecryptfs_find_auth_tok_for_sig(
470 578
471 rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok, 579 rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok,
472 sig); 580 sig);
473 } else 581 }
474 (*auth_tok) = global_auth_tok->global_auth_tok;
475 return rc; 582 return rc;
476} 583}
477 584
@@ -531,6 +638,16 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
531 } 638 }
532 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; 639 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
533 (*packet_size) = 0; 640 (*packet_size) = 0;
641 rc = ecryptfs_find_auth_tok_for_sig(
642 &auth_tok_key,
643 &s->auth_tok, mount_crypt_stat,
644 mount_crypt_stat->global_default_fnek_sig);
645 if (rc) {
646 printk(KERN_ERR "%s: Error attempting to find auth tok for "
647 "fnek sig [%s]; rc = [%d]\n", __func__,
648 mount_crypt_stat->global_default_fnek_sig, rc);
649 goto out;
650 }
534 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name( 651 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(
535 &s->desc.tfm, 652 &s->desc.tfm,
536 &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name); 653 &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name);
@@ -616,16 +733,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
616 goto out_free_unlock; 733 goto out_free_unlock;
617 } 734 }
618 dest[s->i++] = s->cipher_code; 735 dest[s->i++] = s->cipher_code;
619 rc = ecryptfs_find_auth_tok_for_sig(
620 &auth_tok_key,
621 &s->auth_tok, mount_crypt_stat,
622 mount_crypt_stat->global_default_fnek_sig);
623 if (rc) {
624 printk(KERN_ERR "%s: Error attempting to find auth tok for "
625 "fnek sig [%s]; rc = [%d]\n", __func__,
626 mount_crypt_stat->global_default_fnek_sig, rc);
627 goto out_free_unlock;
628 }
629 /* TODO: Support other key modules than passphrase for 736 /* TODO: Support other key modules than passphrase for
630 * filename encryption */ 737 * filename encryption */
631 if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) { 738 if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
@@ -765,8 +872,10 @@ out_free_unlock:
765out_unlock: 872out_unlock:
766 mutex_unlock(s->tfm_mutex); 873 mutex_unlock(s->tfm_mutex);
767out: 874out:
768 if (auth_tok_key) 875 if (auth_tok_key) {
876 up_write(&(auth_tok_key->sem));
769 key_put(auth_tok_key); 877 key_put(auth_tok_key);
878 }
770 kfree(s); 879 kfree(s);
771 return rc; 880 return rc;
772} 881}
@@ -879,6 +988,15 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
879 __func__, s->cipher_code); 988 __func__, s->cipher_code);
880 goto out; 989 goto out;
881 } 990 }
991 rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
992 &s->auth_tok, mount_crypt_stat,
993 s->fnek_sig_hex);
994 if (rc) {
995 printk(KERN_ERR "%s: Error attempting to find auth tok for "
996 "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex,
997 rc);
998 goto out;
999 }
882 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm, 1000 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm,
883 &s->tfm_mutex, 1001 &s->tfm_mutex,
884 s->cipher_string); 1002 s->cipher_string);
@@ -925,15 +1043,6 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
925 * >= ECRYPTFS_MAX_IV_BYTES. */ 1043 * >= ECRYPTFS_MAX_IV_BYTES. */
926 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); 1044 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
927 s->desc.info = s->iv; 1045 s->desc.info = s->iv;
928 rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
929 &s->auth_tok, mount_crypt_stat,
930 s->fnek_sig_hex);
931 if (rc) {
932 printk(KERN_ERR "%s: Error attempting to find auth tok for "
933 "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex,
934 rc);
935 goto out_free_unlock;
936 }
937 /* TODO: Support other key modules than passphrase for 1046 /* TODO: Support other key modules than passphrase for
938 * filename encryption */ 1047 * filename encryption */
939 if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) { 1048 if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
@@ -1002,8 +1111,10 @@ out:
1002 (*filename_size) = 0; 1111 (*filename_size) = 0;
1003 (*filename) = NULL; 1112 (*filename) = NULL;
1004 } 1113 }
1005 if (auth_tok_key) 1114 if (auth_tok_key) {
1115 up_write(&(auth_tok_key->sem));
1006 key_put(auth_tok_key); 1116 key_put(auth_tok_key);
1117 }
1007 kfree(s); 1118 kfree(s);
1008 return rc; 1119 return rc;
1009} 1120}
@@ -1520,38 +1631,6 @@ out:
1520 return rc; 1631 return rc;
1521} 1632}
1522 1633
1523/**
1524 * ecryptfs_verify_version
1525 * @version: The version number to confirm
1526 *
1527 * Returns zero on good version; non-zero otherwise
1528 */
1529static int ecryptfs_verify_version(u16 version)
1530{
1531 int rc = 0;
1532 unsigned char major;
1533 unsigned char minor;
1534
1535 major = ((version >> 8) & 0xFF);
1536 minor = (version & 0xFF);
1537 if (major != ECRYPTFS_VERSION_MAJOR) {
1538 ecryptfs_printk(KERN_ERR, "Major version number mismatch. "
1539 "Expected [%d]; got [%d]\n",
1540 ECRYPTFS_VERSION_MAJOR, major);
1541 rc = -EINVAL;
1542 goto out;
1543 }
1544 if (minor != ECRYPTFS_VERSION_MINOR) {
1545 ecryptfs_printk(KERN_ERR, "Minor version number mismatch. "
1546 "Expected [%d]; got [%d]\n",
1547 ECRYPTFS_VERSION_MINOR, minor);
1548 rc = -EINVAL;
1549 goto out;
1550 }
1551out:
1552 return rc;
1553}
1554
1555int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, 1634int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
1556 struct ecryptfs_auth_tok **auth_tok, 1635 struct ecryptfs_auth_tok **auth_tok,
1557 char *sig) 1636 char *sig)
@@ -1563,31 +1642,16 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
1563 printk(KERN_ERR "Could not find key with description: [%s]\n", 1642 printk(KERN_ERR "Could not find key with description: [%s]\n",
1564 sig); 1643 sig);
1565 rc = process_request_key_err(PTR_ERR(*auth_tok_key)); 1644 rc = process_request_key_err(PTR_ERR(*auth_tok_key));
1645 (*auth_tok_key) = NULL;
1566 goto out; 1646 goto out;
1567 } 1647 }
1568 (*auth_tok) = ecryptfs_get_key_payload_data(*auth_tok_key); 1648 down_write(&(*auth_tok_key)->sem);
1569 if (ecryptfs_verify_version((*auth_tok)->version)) { 1649 rc = ecryptfs_verify_auth_tok_from_key(*auth_tok_key, auth_tok);
1570 printk(KERN_ERR
1571 "Data structure version mismatch. "
1572 "Userspace tools must match eCryptfs "
1573 "kernel module with major version [%d] "
1574 "and minor version [%d]\n",
1575 ECRYPTFS_VERSION_MAJOR,
1576 ECRYPTFS_VERSION_MINOR);
1577 rc = -EINVAL;
1578 goto out_release_key;
1579 }
1580 if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD
1581 && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) {
1582 printk(KERN_ERR "Invalid auth_tok structure "
1583 "returned from key query\n");
1584 rc = -EINVAL;
1585 goto out_release_key;
1586 }
1587out_release_key:
1588 if (rc) { 1650 if (rc) {
1651 up_write(&(*auth_tok_key)->sem);
1589 key_put(*auth_tok_key); 1652 key_put(*auth_tok_key);
1590 (*auth_tok_key) = NULL; 1653 (*auth_tok_key) = NULL;
1654 goto out;
1591 } 1655 }
1592out: 1656out:
1593 return rc; 1657 return rc;
@@ -1809,6 +1873,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1809find_next_matching_auth_tok: 1873find_next_matching_auth_tok:
1810 found_auth_tok = 0; 1874 found_auth_tok = 0;
1811 if (auth_tok_key) { 1875 if (auth_tok_key) {
1876 up_write(&(auth_tok_key->sem));
1812 key_put(auth_tok_key); 1877 key_put(auth_tok_key);
1813 auth_tok_key = NULL; 1878 auth_tok_key = NULL;
1814 } 1879 }
@@ -1895,8 +1960,10 @@ found_matching_auth_tok:
1895out_wipe_list: 1960out_wipe_list:
1896 wipe_auth_tok_list(&auth_tok_list); 1961 wipe_auth_tok_list(&auth_tok_list);
1897out: 1962out:
1898 if (auth_tok_key) 1963 if (auth_tok_key) {
1964 up_write(&(auth_tok_key->sem));
1899 key_put(auth_tok_key); 1965 key_put(auth_tok_key);
1966 }
1900 return rc; 1967 return rc;
1901} 1968}
1902 1969
@@ -2324,7 +2391,7 @@ ecryptfs_generate_key_packet_set(char *dest_base,
2324 size_t max) 2391 size_t max)
2325{ 2392{
2326 struct ecryptfs_auth_tok *auth_tok; 2393 struct ecryptfs_auth_tok *auth_tok;
2327 struct ecryptfs_global_auth_tok *global_auth_tok; 2394 struct key *auth_tok_key = NULL;
2328 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 2395 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
2329 &ecryptfs_superblock_to_private( 2396 &ecryptfs_superblock_to_private(
2330 ecryptfs_dentry->d_sb)->mount_crypt_stat; 2397 ecryptfs_dentry->d_sb)->mount_crypt_stat;
@@ -2343,21 +2410,16 @@ ecryptfs_generate_key_packet_set(char *dest_base,
2343 list_for_each_entry(key_sig, &crypt_stat->keysig_list, 2410 list_for_each_entry(key_sig, &crypt_stat->keysig_list,
2344 crypt_stat_list) { 2411 crypt_stat_list) {
2345 memset(key_rec, 0, sizeof(*key_rec)); 2412 memset(key_rec, 0, sizeof(*key_rec));
2346 rc = ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, 2413 rc = ecryptfs_find_global_auth_tok_for_sig(&auth_tok_key,
2414 &auth_tok,
2347 mount_crypt_stat, 2415 mount_crypt_stat,
2348 key_sig->keysig); 2416 key_sig->keysig);
2349 if (rc) { 2417 if (rc) {
2350 printk(KERN_ERR "Error attempting to get the global " 2418 printk(KERN_WARNING "Unable to retrieve auth tok with "
2351 "auth_tok; rc = [%d]\n", rc); 2419 "sig = [%s]\n", key_sig->keysig);
2420 rc = process_find_global_auth_tok_for_sig_err(rc);
2352 goto out_free; 2421 goto out_free;
2353 } 2422 }
2354 if (global_auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID) {
2355 printk(KERN_WARNING
2356 "Skipping invalid auth tok with sig = [%s]\n",
2357 global_auth_tok->sig);
2358 continue;
2359 }
2360 auth_tok = global_auth_tok->global_auth_tok;
2361 if (auth_tok->token_type == ECRYPTFS_PASSWORD) { 2423 if (auth_tok->token_type == ECRYPTFS_PASSWORD) {
2362 rc = write_tag_3_packet((dest_base + (*len)), 2424 rc = write_tag_3_packet((dest_base + (*len)),
2363 &max, auth_tok, 2425 &max, auth_tok,
@@ -2395,6 +2457,9 @@ ecryptfs_generate_key_packet_set(char *dest_base,
2395 rc = -EINVAL; 2457 rc = -EINVAL;
2396 goto out_free; 2458 goto out_free;
2397 } 2459 }
2460 up_write(&(auth_tok_key->sem));
2461 key_put(auth_tok_key);
2462 auth_tok_key = NULL;
2398 } 2463 }
2399 if (likely(max > 0)) { 2464 if (likely(max > 0)) {
2400 dest_base[(*len)] = 0x00; 2465 dest_base[(*len)] = 0x00;
@@ -2407,6 +2472,11 @@ out_free:
2407out: 2472out:
2408 if (rc) 2473 if (rc)
2409 (*len) = 0; 2474 (*len) = 0;
2475 if (auth_tok_key) {
2476 up_write(&(auth_tok_key->sem));
2477 key_put(auth_tok_key);
2478 }
2479
2410 mutex_unlock(&crypt_stat->keysig_list_mutex); 2480 mutex_unlock(&crypt_stat->keysig_list_mutex);
2411 return rc; 2481 return rc;
2412} 2482}
@@ -2424,6 +2494,7 @@ int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig)
2424 return -ENOMEM; 2494 return -ENOMEM;
2425 } 2495 }
2426 memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX); 2496 memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX);
2497 new_key_sig->keysig[ECRYPTFS_SIG_SIZE_HEX] = '\0';
2427 /* Caller must hold keysig_list_mutex */ 2498 /* Caller must hold keysig_list_mutex */
2428 list_add(&new_key_sig->crypt_stat_list, &crypt_stat->keysig_list); 2499 list_add(&new_key_sig->crypt_stat_list, &crypt_stat->keysig_list);
2429 2500
@@ -2453,7 +2524,6 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
2453 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); 2524 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
2454 list_add(&new_auth_tok->mount_crypt_stat_list, 2525 list_add(&new_auth_tok->mount_crypt_stat_list,
2455 &mount_crypt_stat->global_auth_tok_list); 2526 &mount_crypt_stat->global_auth_tok_list);
2456 mount_crypt_stat->num_global_auth_toks++;
2457 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); 2527 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
2458out: 2528out:
2459 return rc; 2529 return rc;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 758323a0f09a..fdb2eb0ad09e 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -122,7 +122,6 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
122 ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); 122 ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
123 int rc = 0; 123 int rc = 0;
124 124
125 mutex_lock(&inode_info->lower_file_mutex);
126 if (!inode_info->lower_file) { 125 if (!inode_info->lower_file) {
127 struct dentry *lower_dentry; 126 struct dentry *lower_dentry;
128 struct vfsmount *lower_mnt = 127 struct vfsmount *lower_mnt =
@@ -138,7 +137,6 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
138 inode_info->lower_file = NULL; 137 inode_info->lower_file = NULL;
139 } 138 }
140 } 139 }
141 mutex_unlock(&inode_info->lower_file_mutex);
142 return rc; 140 return rc;
143} 141}
144 142
@@ -241,14 +239,14 @@ static int ecryptfs_init_global_auth_toks(
241 struct ecryptfs_mount_crypt_stat *mount_crypt_stat) 239 struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
242{ 240{
243 struct ecryptfs_global_auth_tok *global_auth_tok; 241 struct ecryptfs_global_auth_tok *global_auth_tok;
242 struct ecryptfs_auth_tok *auth_tok;
244 int rc = 0; 243 int rc = 0;
245 244
246 list_for_each_entry(global_auth_tok, 245 list_for_each_entry(global_auth_tok,
247 &mount_crypt_stat->global_auth_tok_list, 246 &mount_crypt_stat->global_auth_tok_list,
248 mount_crypt_stat_list) { 247 mount_crypt_stat_list) {
249 rc = ecryptfs_keyring_auth_tok_for_sig( 248 rc = ecryptfs_keyring_auth_tok_for_sig(
250 &global_auth_tok->global_auth_tok_key, 249 &global_auth_tok->global_auth_tok_key, &auth_tok,
251 &global_auth_tok->global_auth_tok,
252 global_auth_tok->sig); 250 global_auth_tok->sig);
253 if (rc) { 251 if (rc) {
254 printk(KERN_ERR "Could not find valid key in user " 252 printk(KERN_ERR "Could not find valid key in user "
@@ -256,8 +254,10 @@ static int ecryptfs_init_global_auth_toks(
256 "option: [%s]\n", global_auth_tok->sig); 254 "option: [%s]\n", global_auth_tok->sig);
257 global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID; 255 global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID;
258 goto out; 256 goto out;
259 } else 257 } else {
260 global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID; 258 global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID;
259 up_write(&(global_auth_tok->global_auth_tok_key)->sem);
260 }
261 } 261 }
262out: 262out:
263 return rc; 263 return rc;
@@ -276,7 +276,7 @@ static void ecryptfs_init_mount_crypt_stat(
276/** 276/**
277 * ecryptfs_parse_options 277 * ecryptfs_parse_options
278 * @sb: The ecryptfs super block 278 * @sb: The ecryptfs super block
279 * @options: The options pased to the kernel 279 * @options: The options passed to the kernel
280 * 280 *
281 * Parse mount options: 281 * Parse mount options:
282 * debug=N - ecryptfs_verbosity level for debug output 282 * debug=N - ecryptfs_verbosity level for debug output
@@ -840,7 +840,7 @@ static int __init ecryptfs_init(void)
840 } 840 }
841 rc = ecryptfs_init_messaging(); 841 rc = ecryptfs_init_messaging();
842 if (rc) { 842 if (rc) {
843 printk(KERN_ERR "Failure occured while attempting to " 843 printk(KERN_ERR "Failure occurred while attempting to "
844 "initialize the communications channel to " 844 "initialize the communications channel to "
845 "ecryptfsd\n"); 845 "ecryptfsd\n");
846 goto out_destroy_kthread; 846 goto out_destroy_kthread;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index cc64fca89f8d..6a44148c5fb9 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -62,6 +62,18 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
62{ 62{
63 int rc; 63 int rc;
64 64
65 /*
66 * Refuse to write the page out if we are called from reclaim context
67 * since our writepage() path may potentially allocate memory when
68 * calling into the lower fs vfs_write() which may in turn invoke
69 * us again.
70 */
71 if (current->flags & PF_MEMALLOC) {
72 redirty_page_for_writepage(wbc, page);
73 rc = 0;
74 goto out;
75 }
76
65 rc = ecryptfs_encrypt_page(page); 77 rc = ecryptfs_encrypt_page(page);
66 if (rc) { 78 if (rc) {
67 ecryptfs_printk(KERN_WARNING, "Error encrypting " 79 ecryptfs_printk(KERN_WARNING, "Error encrypting "
@@ -70,8 +82,8 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
70 goto out; 82 goto out;
71 } 83 }
72 SetPageUptodate(page); 84 SetPageUptodate(page);
73 unlock_page(page);
74out: 85out:
86 unlock_page(page);
75 return rc; 87 return rc;
76} 88}
77 89
@@ -193,11 +205,7 @@ static int ecryptfs_readpage(struct file *file, struct page *page)
193 &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat; 205 &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
194 int rc = 0; 206 int rc = 0;
195 207
196 if (!crypt_stat 208 if (!crypt_stat || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
197 || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)
198 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
199 ecryptfs_printk(KERN_DEBUG,
200 "Passing through unencrypted page\n");
201 rc = ecryptfs_read_lower_page_segment(page, page->index, 0, 209 rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
202 PAGE_CACHE_SIZE, 210 PAGE_CACHE_SIZE,
203 page->mapping->host); 211 page->mapping->host);
@@ -295,8 +303,7 @@ static int ecryptfs_write_begin(struct file *file,
295 struct ecryptfs_crypt_stat *crypt_stat = 303 struct ecryptfs_crypt_stat *crypt_stat =
296 &ecryptfs_inode_to_private(mapping->host)->crypt_stat; 304 &ecryptfs_inode_to_private(mapping->host)->crypt_stat;
297 305
298 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED) 306 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
299 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
300 rc = ecryptfs_read_lower_page_segment( 307 rc = ecryptfs_read_lower_page_segment(
301 page, index, 0, PAGE_CACHE_SIZE, mapping->host); 308 page, index, 0, PAGE_CACHE_SIZE, mapping->host);
302 if (rc) { 309 if (rc) {
@@ -374,6 +381,11 @@ static int ecryptfs_write_begin(struct file *file,
374 && (pos != 0)) 381 && (pos != 0))
375 zero_user(page, 0, PAGE_CACHE_SIZE); 382 zero_user(page, 0, PAGE_CACHE_SIZE);
376out: 383out:
384 if (unlikely(rc)) {
385 unlock_page(page);
386 page_cache_release(page);
387 *pagep = NULL;
388 }
377 return rc; 389 return rc;
378} 390}
379 391
@@ -486,13 +498,8 @@ static int ecryptfs_write_end(struct file *file,
486 struct ecryptfs_crypt_stat *crypt_stat = 498 struct ecryptfs_crypt_stat *crypt_stat =
487 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; 499 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
488 int rc; 500 int rc;
501 int need_unlock_page = 1;
489 502
490 if (crypt_stat->flags & ECRYPTFS_NEW_FILE) {
491 ecryptfs_printk(KERN_DEBUG, "ECRYPTFS_NEW_FILE flag set in "
492 "crypt_stat at memory location [%p]\n", crypt_stat);
493 crypt_stat->flags &= ~(ECRYPTFS_NEW_FILE);
494 } else
495 ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
496 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" 503 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
497 "(page w/ index = [0x%.16lx], to = [%d])\n", index, to); 504 "(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
498 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 505 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
@@ -512,26 +519,26 @@ static int ecryptfs_write_end(struct file *file,
512 "zeros in page with index = [0x%.16lx]\n", index); 519 "zeros in page with index = [0x%.16lx]\n", index);
513 goto out; 520 goto out;
514 } 521 }
515 rc = ecryptfs_encrypt_page(page); 522 set_page_dirty(page);
516 if (rc) { 523 unlock_page(page);
517 ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " 524 need_unlock_page = 0;
518 "index [0x%.16lx])\n", index);
519 goto out;
520 }
521 if (pos + copied > i_size_read(ecryptfs_inode)) { 525 if (pos + copied > i_size_read(ecryptfs_inode)) {
522 i_size_write(ecryptfs_inode, pos + copied); 526 i_size_write(ecryptfs_inode, pos + copied);
523 ecryptfs_printk(KERN_DEBUG, "Expanded file size to " 527 ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
524 "[0x%.16llx]\n", 528 "[0x%.16llx]\n",
525 (unsigned long long)i_size_read(ecryptfs_inode)); 529 (unsigned long long)i_size_read(ecryptfs_inode));
530 balance_dirty_pages_ratelimited(mapping);
531 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
532 if (rc) {
533 printk(KERN_ERR "Error writing inode size to metadata; "
534 "rc = [%d]\n", rc);
535 goto out;
536 }
526 } 537 }
527 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); 538 rc = copied;
528 if (rc)
529 printk(KERN_ERR "Error writing inode size to metadata; "
530 "rc = [%d]\n", rc);
531 else
532 rc = copied;
533out: 539out:
534 unlock_page(page); 540 if (need_unlock_page)
541 unlock_page(page);
535 page_cache_release(page); 542 page_cache_release(page);
536 return rc; 543 return rc;
537} 544}
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index db184ef15d3d..85d430963116 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -44,15 +44,11 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
44 ssize_t rc; 44 ssize_t rc;
45 45
46 inode_info = ecryptfs_inode_to_private(ecryptfs_inode); 46 inode_info = ecryptfs_inode_to_private(ecryptfs_inode);
47 mutex_lock(&inode_info->lower_file_mutex);
48 BUG_ON(!inode_info->lower_file); 47 BUG_ON(!inode_info->lower_file);
49 inode_info->lower_file->f_pos = offset;
50 fs_save = get_fs(); 48 fs_save = get_fs();
51 set_fs(get_ds()); 49 set_fs(get_ds());
52 rc = vfs_write(inode_info->lower_file, data, size, 50 rc = vfs_write(inode_info->lower_file, data, size, &offset);
53 &inode_info->lower_file->f_pos);
54 set_fs(fs_save); 51 set_fs(fs_save);
55 mutex_unlock(&inode_info->lower_file_mutex);
56 mark_inode_dirty_sync(ecryptfs_inode); 52 mark_inode_dirty_sync(ecryptfs_inode);
57 return rc; 53 return rc;
58} 54}
@@ -234,15 +230,11 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
234 mm_segment_t fs_save; 230 mm_segment_t fs_save;
235 ssize_t rc; 231 ssize_t rc;
236 232
237 mutex_lock(&inode_info->lower_file_mutex);
238 BUG_ON(!inode_info->lower_file); 233 BUG_ON(!inode_info->lower_file);
239 inode_info->lower_file->f_pos = offset;
240 fs_save = get_fs(); 234 fs_save = get_fs();
241 set_fs(get_ds()); 235 set_fs(get_ds());
242 rc = vfs_read(inode_info->lower_file, data, size, 236 rc = vfs_read(inode_info->lower_file, data, size, &offset);
243 &inode_info->lower_file->f_pos);
244 set_fs(fs_save); 237 set_fs(fs_save);
245 mutex_unlock(&inode_info->lower_file_mutex);
246 return rc; 238 return rc;
247} 239}
248 240
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 3042fe123a34..bacc882e1ae4 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -55,7 +55,6 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb)
55 if (unlikely(!inode_info)) 55 if (unlikely(!inode_info))
56 goto out; 56 goto out;
57 ecryptfs_init_crypt_stat(&inode_info->crypt_stat); 57 ecryptfs_init_crypt_stat(&inode_info->crypt_stat);
58 mutex_init(&inode_info->lower_file_mutex);
59 inode_info->lower_file = NULL; 58 inode_info->lower_file = NULL;
60 inode = &inode_info->vfs_inode; 59 inode = &inode_info->vfs_inode;
61out: 60out:
@@ -198,7 +197,7 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
198const struct super_operations ecryptfs_sops = { 197const struct super_operations ecryptfs_sops = {
199 .alloc_inode = ecryptfs_alloc_inode, 198 .alloc_inode = ecryptfs_alloc_inode,
200 .destroy_inode = ecryptfs_destroy_inode, 199 .destroy_inode = ecryptfs_destroy_inode,
201 .drop_inode = generic_delete_inode, 200 .drop_inode = generic_drop_inode,
202 .statfs = ecryptfs_statfs, 201 .statfs = ecryptfs_statfs,
203 .remount_fs = NULL, 202 .remount_fs = NULL,
204 .evict_inode = ecryptfs_evict_inode, 203 .evict_inode = ecryptfs_evict_inode,
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index a8e7797b9477..9c13412e6c99 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -23,7 +23,6 @@ static sector_t _efs_bmap(struct address_space *mapping, sector_t block)
23} 23}
24static const struct address_space_operations efs_aops = { 24static const struct address_space_operations efs_aops = {
25 .readpage = efs_readpage, 25 .readpage = efs_readpage,
26 .sync_page = block_sync_page,
27 .bmap = _efs_bmap 26 .bmap = _efs_bmap
28}; 27};
29 28
diff --git a/fs/eventfd.c b/fs/eventfd.c
index e0194b3e14d6..d9a591773919 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -99,7 +99,7 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_get);
99 * @ctx: [in] Pointer to eventfd context. 99 * @ctx: [in] Pointer to eventfd context.
100 * 100 *
101 * The eventfd context reference must have been previously acquired either 101 * The eventfd context reference must have been previously acquired either
102 * with eventfd_ctx_get() or eventfd_ctx_fdget()). 102 * with eventfd_ctx_get() or eventfd_ctx_fdget().
103 */ 103 */
104void eventfd_ctx_put(struct eventfd_ctx *ctx) 104void eventfd_ctx_put(struct eventfd_ctx *ctx)
105{ 105{
@@ -146,9 +146,9 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
146 * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. 146 * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
147 * @ctx: [in] Pointer to eventfd context. 147 * @ctx: [in] Pointer to eventfd context.
148 * @wait: [in] Wait queue to be removed. 148 * @wait: [in] Wait queue to be removed.
149 * @cnt: [out] Pointer to the 64bit conter value. 149 * @cnt: [out] Pointer to the 64-bit counter value.
150 * 150 *
151 * Returns zero if successful, or the following error codes: 151 * Returns %0 if successful, or the following error codes:
152 * 152 *
153 * -EAGAIN : The operation would have blocked. 153 * -EAGAIN : The operation would have blocked.
154 * 154 *
@@ -175,11 +175,11 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
175 * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero. 175 * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
176 * @ctx: [in] Pointer to eventfd context. 176 * @ctx: [in] Pointer to eventfd context.
177 * @no_wait: [in] Different from zero if the operation should not block. 177 * @no_wait: [in] Different from zero if the operation should not block.
178 * @cnt: [out] Pointer to the 64bit conter value. 178 * @cnt: [out] Pointer to the 64-bit counter value.
179 * 179 *
180 * Returns zero if successful, or the following error codes: 180 * Returns %0 if successful, or the following error codes:
181 * 181 *
182 * -EAGAIN : The operation would have blocked but @no_wait was nonzero. 182 * -EAGAIN : The operation would have blocked but @no_wait was non-zero.
183 * -ERESTARTSYS : A signal interrupted the wait operation. 183 * -ERESTARTSYS : A signal interrupted the wait operation.
184 * 184 *
185 * If @no_wait is zero, the function might sleep until the eventfd internal 185 * If @no_wait is zero, the function might sleep until the eventfd internal
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cc8a9b7d6064..f9cfd168fbe2 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -62,7 +62,14 @@
62 * This mutex is acquired by ep_free() during the epoll file 62 * This mutex is acquired by ep_free() during the epoll file
63 * cleanup path and it is also acquired by eventpoll_release_file() 63 * cleanup path and it is also acquired by eventpoll_release_file()
64 * if a file has been pushed inside an epoll set and it is then 64 * if a file has been pushed inside an epoll set and it is then
65 * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL). 65 * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
66 * It is also acquired when inserting an epoll fd onto another epoll
67 * fd. We do this so that we walk the epoll tree and ensure that this
68 * insertion does not create a cycle of epoll file descriptors, which
69 * could lead to deadlock. We need a global mutex to prevent two
70 * simultaneous inserts (A into B and B into A) from racing and
71 * constructing a cycle without either insert observing that it is
72 * going to.
66 * It is possible to drop the "ep->mtx" and to use the global 73 * It is possible to drop the "ep->mtx" and to use the global
67 * mutex "epmutex" (together with "ep->lock") to have it working, 74 * mutex "epmutex" (together with "ep->lock") to have it working,
68 * but having "ep->mtx" will make the interface more scalable. 75 * but having "ep->mtx" will make the interface more scalable.
@@ -145,11 +152,11 @@ struct epitem {
145 152
146/* 153/*
147 * This structure is stored inside the "private_data" member of the file 154 * This structure is stored inside the "private_data" member of the file
148 * structure and rapresent the main data sructure for the eventpoll 155 * structure and represents the main data structure for the eventpoll
149 * interface. 156 * interface.
150 */ 157 */
151struct eventpoll { 158struct eventpoll {
152 /* Protect the this structure access */ 159 /* Protect the access to this structure */
153 spinlock_t lock; 160 spinlock_t lock;
154 161
155 /* 162 /*
@@ -174,7 +181,7 @@ struct eventpoll {
174 181
175 /* 182 /*
176 * This is a single linked list that chains all the "struct epitem" that 183 * This is a single linked list that chains all the "struct epitem" that
177 * happened while transfering ready events to userspace w/out 184 * happened while transferring ready events to userspace w/out
178 * holding ->lock. 185 * holding ->lock.
179 */ 186 */
180 struct epitem *ovflist; 187 struct epitem *ovflist;
@@ -224,6 +231,9 @@ static long max_user_watches __read_mostly;
224 */ 231 */
225static DEFINE_MUTEX(epmutex); 232static DEFINE_MUTEX(epmutex);
226 233
234/* Used to check for epoll file descriptor inclusion loops */
235static struct nested_calls poll_loop_ncalls;
236
227/* Used for safe wake up implementation */ 237/* Used for safe wake up implementation */
228static struct nested_calls poll_safewake_ncalls; 238static struct nested_calls poll_safewake_ncalls;
229 239
@@ -306,6 +316,19 @@ static void ep_nested_calls_init(struct nested_calls *ncalls)
306} 316}
307 317
308/** 318/**
319 * ep_events_available - Checks if ready events might be available.
320 *
321 * @ep: Pointer to the eventpoll context.
322 *
323 * Returns: Returns a value different than zero if ready events are available,
324 * or zero otherwise.
325 */
326static inline int ep_events_available(struct eventpoll *ep)
327{
328 return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
329}
330
331/**
309 * ep_call_nested - Perform a bound (possibly) nested call, by checking 332 * ep_call_nested - Perform a bound (possibly) nested call, by checking
310 * that the recursion limit is not exceeded, and that 333 * that the recursion limit is not exceeded, and that
311 * the same nested call (by the meaning of same cookie) is 334 * the same nested call (by the meaning of same cookie) is
@@ -583,7 +606,7 @@ static void ep_free(struct eventpoll *ep)
583 * We do not need to hold "ep->mtx" here because the epoll file 606 * We do not need to hold "ep->mtx" here because the epoll file
584 * is on the way to be removed and no one has references to it 607 * is on the way to be removed and no one has references to it
585 * anymore. The only hit might come from eventpoll_release_file() but 608 * anymore. The only hit might come from eventpoll_release_file() but
586 * holding "epmutex" is sufficent here. 609 * holding "epmutex" is sufficient here.
587 */ 610 */
588 mutex_lock(&epmutex); 611 mutex_lock(&epmutex);
589 612
@@ -697,7 +720,7 @@ void eventpoll_release_file(struct file *file)
697 /* 720 /*
698 * We don't want to get "file->f_lock" because it is not 721 * We don't want to get "file->f_lock" because it is not
699 * necessary. It is not necessary because we're in the "struct file" 722 * necessary. It is not necessary because we're in the "struct file"
700 * cleanup path, and this means that noone is using this file anymore. 723 * cleanup path, and this means that no one is using this file anymore.
701 * So, for example, epoll_ctl() cannot hit here since if we reach this 724 * So, for example, epoll_ctl() cannot hit here since if we reach this
702 * point, the file counter already went to zero and fget() would fail. 725 * point, the file counter already went to zero and fget() would fail.
703 * The only hit might come from ep_free() but by holding the mutex 726 * The only hit might come from ep_free() but by holding the mutex
@@ -783,7 +806,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
783 806
784/* 807/*
785 * This is the callback that is passed to the wait queue wakeup 808 * This is the callback that is passed to the wait queue wakeup
786 * machanism. It is called by the stored file descriptors when they 809 * mechanism. It is called by the stored file descriptors when they
787 * have events to report. 810 * have events to report.
788 */ 811 */
789static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) 812static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
@@ -814,9 +837,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
814 goto out_unlock; 837 goto out_unlock;
815 838
816 /* 839 /*
817 * If we are trasfering events to userspace, we can hold no locks 840 * If we are transferring events to userspace, we can hold no locks
818 * (because we're accessing user memory, and because of linux f_op->poll() 841 * (because we're accessing user memory, and because of linux f_op->poll()
819 * semantics). All the events that happens during that period of time are 842 * semantics). All the events that happen during that period of time are
820 * chained in ep->ovflist and requeued later on. 843 * chained in ep->ovflist and requeued later on.
821 */ 844 */
822 if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { 845 if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
@@ -1089,7 +1112,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
1089 * Trigger mode, we need to insert back inside 1112 * Trigger mode, we need to insert back inside
1090 * the ready list, so that the next call to 1113 * the ready list, so that the next call to
1091 * epoll_wait() will check again the events 1114 * epoll_wait() will check again the events
1092 * availability. At this point, noone can insert 1115 * availability. At this point, no one can insert
1093 * into ep->rdllist besides us. The epoll_ctl() 1116 * into ep->rdllist besides us. The epoll_ctl()
1094 * callers are locked out by 1117 * callers are locked out by
1095 * ep_scan_ready_list() holding "mtx" and the 1118 * ep_scan_ready_list() holding "mtx" and the
@@ -1114,31 +1137,63 @@ static int ep_send_events(struct eventpoll *ep,
1114 return ep_scan_ready_list(ep, ep_send_events_proc, &esed); 1137 return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
1115} 1138}
1116 1139
1140static inline struct timespec ep_set_mstimeout(long ms)
1141{
1142 struct timespec now, ts = {
1143 .tv_sec = ms / MSEC_PER_SEC,
1144 .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
1145 };
1146
1147 ktime_get_ts(&now);
1148 return timespec_add_safe(now, ts);
1149}
1150
1151/**
1152 * ep_poll - Retrieves ready events, and delivers them to the caller supplied
1153 * event buffer.
1154 *
1155 * @ep: Pointer to the eventpoll context.
1156 * @events: Pointer to the userspace buffer where the ready events should be
1157 * stored.
1158 * @maxevents: Size (in terms of number of events) of the caller event buffer.
1159 * @timeout: Maximum timeout for the ready events fetch operation, in
1160 * milliseconds. If the @timeout is zero, the function will not block,
1161 * while if the @timeout is less than zero, the function will block
1162 * until at least one event has been retrieved (or an error
1163 * occurred).
1164 *
1165 * Returns: Returns the number of ready events which have been fetched, or an
1166 * error code, in case of error.
1167 */
1117static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1168static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1118 int maxevents, long timeout) 1169 int maxevents, long timeout)
1119{ 1170{
1120 int res, eavail, timed_out = 0; 1171 int res = 0, eavail, timed_out = 0;
1121 unsigned long flags; 1172 unsigned long flags;
1122 long slack; 1173 long slack = 0;
1123 wait_queue_t wait; 1174 wait_queue_t wait;
1124 struct timespec end_time;
1125 ktime_t expires, *to = NULL; 1175 ktime_t expires, *to = NULL;
1126 1176
1127 if (timeout > 0) { 1177 if (timeout > 0) {
1128 ktime_get_ts(&end_time); 1178 struct timespec end_time = ep_set_mstimeout(timeout);
1129 timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC); 1179
1130 slack = select_estimate_accuracy(&end_time); 1180 slack = select_estimate_accuracy(&end_time);
1131 to = &expires; 1181 to = &expires;
1132 *to = timespec_to_ktime(end_time); 1182 *to = timespec_to_ktime(end_time);
1133 } else if (timeout == 0) { 1183 } else if (timeout == 0) {
1184 /*
1185 * Avoid the unnecessary trip to the wait queue loop, if the
1186 * caller specified a non blocking operation.
1187 */
1134 timed_out = 1; 1188 timed_out = 1;
1189 spin_lock_irqsave(&ep->lock, flags);
1190 goto check_events;
1135 } 1191 }
1136 1192
1137retry: 1193fetch_events:
1138 spin_lock_irqsave(&ep->lock, flags); 1194 spin_lock_irqsave(&ep->lock, flags);
1139 1195
1140 res = 0; 1196 if (!ep_events_available(ep)) {
1141 if (list_empty(&ep->rdllist)) {
1142 /* 1197 /*
1143 * We don't have any available event to return to the caller. 1198 * We don't have any available event to return to the caller.
1144 * We need to sleep here, and we will be wake up by 1199 * We need to sleep here, and we will be wake up by
@@ -1154,7 +1209,7 @@ retry:
1154 * to TASK_INTERRUPTIBLE before doing the checks. 1209 * to TASK_INTERRUPTIBLE before doing the checks.
1155 */ 1210 */
1156 set_current_state(TASK_INTERRUPTIBLE); 1211 set_current_state(TASK_INTERRUPTIBLE);
1157 if (!list_empty(&ep->rdllist) || timed_out) 1212 if (ep_events_available(ep) || timed_out)
1158 break; 1213 break;
1159 if (signal_pending(current)) { 1214 if (signal_pending(current)) {
1160 res = -EINTR; 1215 res = -EINTR;
@@ -1171,8 +1226,9 @@ retry:
1171 1226
1172 set_current_state(TASK_RUNNING); 1227 set_current_state(TASK_RUNNING);
1173 } 1228 }
1229check_events:
1174 /* Is it worth to try to dig for events ? */ 1230 /* Is it worth to try to dig for events ? */
1175 eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; 1231 eavail = ep_events_available(ep);
1176 1232
1177 spin_unlock_irqrestore(&ep->lock, flags); 1233 spin_unlock_irqrestore(&ep->lock, flags);
1178 1234
@@ -1183,11 +1239,67 @@ retry:
1183 */ 1239 */
1184 if (!res && eavail && 1240 if (!res && eavail &&
1185 !(res = ep_send_events(ep, events, maxevents)) && !timed_out) 1241 !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
1186 goto retry; 1242 goto fetch_events;
1187 1243
1188 return res; 1244 return res;
1189} 1245}
1190 1246
1247/**
1248 * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
1249 * API, to verify that adding an epoll file inside another
1250 * epoll structure, does not violate the constraints, in
1251 * terms of closed loops, or too deep chains (which can
1252 * result in excessive stack usage).
1253 *
1254 * @priv: Pointer to the epoll file to be currently checked.
1255 * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
1256 * data structure pointer.
1257 * @call_nests: Current dept of the @ep_call_nested() call stack.
1258 *
1259 * Returns: Returns zero if adding the epoll @file inside current epoll
1260 * structure @ep does not violate the constraints, or -1 otherwise.
1261 */
1262static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1263{
1264 int error = 0;
1265 struct file *file = priv;
1266 struct eventpoll *ep = file->private_data;
1267 struct rb_node *rbp;
1268 struct epitem *epi;
1269
1270 mutex_lock(&ep->mtx);
1271 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1272 epi = rb_entry(rbp, struct epitem, rbn);
1273 if (unlikely(is_file_epoll(epi->ffd.file))) {
1274 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1275 ep_loop_check_proc, epi->ffd.file,
1276 epi->ffd.file->private_data, current);
1277 if (error != 0)
1278 break;
1279 }
1280 }
1281 mutex_unlock(&ep->mtx);
1282
1283 return error;
1284}
1285
1286/**
1287 * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
1288 * another epoll file (represented by @ep) does not create
1289 * closed loops or too deep chains.
1290 *
1291 * @ep: Pointer to the epoll private data structure.
1292 * @file: Pointer to the epoll file to be checked.
1293 *
1294 * Returns: Returns zero if adding the epoll @file inside current epoll
1295 * structure @ep does not violate the constraints, or -1 otherwise.
1296 */
1297static int ep_loop_check(struct eventpoll *ep, struct file *file)
1298{
1299 return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1300 ep_loop_check_proc, file, ep, current);
1301}
1302
1191/* 1303/*
1192 * Open an eventpoll file descriptor. 1304 * Open an eventpoll file descriptor.
1193 */ 1305 */
@@ -1236,6 +1348,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1236 struct epoll_event __user *, event) 1348 struct epoll_event __user *, event)
1237{ 1349{
1238 int error; 1350 int error;
1351 int did_lock_epmutex = 0;
1239 struct file *file, *tfile; 1352 struct file *file, *tfile;
1240 struct eventpoll *ep; 1353 struct eventpoll *ep;
1241 struct epitem *epi; 1354 struct epitem *epi;
@@ -1277,6 +1390,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1277 */ 1390 */
1278 ep = file->private_data; 1391 ep = file->private_data;
1279 1392
1393 /*
1394 * When we insert an epoll file descriptor, inside another epoll file
1395 * descriptor, there is the change of creating closed loops, which are
1396 * better be handled here, than in more critical paths.
1397 *
1398 * We hold epmutex across the loop check and the insert in this case, in
1399 * order to prevent two separate inserts from racing and each doing the
1400 * insert "at the same time" such that ep_loop_check passes on both
1401 * before either one does the insert, thereby creating a cycle.
1402 */
1403 if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
1404 mutex_lock(&epmutex);
1405 did_lock_epmutex = 1;
1406 error = -ELOOP;
1407 if (ep_loop_check(ep, tfile) != 0)
1408 goto error_tgt_fput;
1409 }
1410
1411
1280 mutex_lock(&ep->mtx); 1412 mutex_lock(&ep->mtx);
1281 1413
1282 /* 1414 /*
@@ -1312,6 +1444,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1312 mutex_unlock(&ep->mtx); 1444 mutex_unlock(&ep->mtx);
1313 1445
1314error_tgt_fput: 1446error_tgt_fput:
1447 if (unlikely(did_lock_epmutex))
1448 mutex_unlock(&epmutex);
1449
1315 fput(tfile); 1450 fput(tfile);
1316error_fput: 1451error_fput:
1317 fput(file); 1452 fput(file);
@@ -1431,6 +1566,12 @@ static int __init eventpoll_init(void)
1431 EP_ITEM_COST; 1566 EP_ITEM_COST;
1432 BUG_ON(max_user_watches < 0); 1567 BUG_ON(max_user_watches < 0);
1433 1568
1569 /*
1570 * Initialize the structure used to perform epoll file descriptor
1571 * inclusion loops checks.
1572 */
1573 ep_nested_calls_init(&poll_loop_ncalls);
1574
1434 /* Initialize the structure used to perform safe poll wait head wake ups */ 1575 /* Initialize the structure used to perform safe poll wait head wake ups */
1435 ep_nested_calls_init(&poll_safewake_ncalls); 1576 ep_nested_calls_init(&poll_safewake_ncalls);
1436 1577
diff --git a/fs/exec.c b/fs/exec.c
index c62efcb959c7..5e62d26a4fec 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -115,13 +115,16 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
115 struct file *file; 115 struct file *file;
116 char *tmp = getname(library); 116 char *tmp = getname(library);
117 int error = PTR_ERR(tmp); 117 int error = PTR_ERR(tmp);
118 static const struct open_flags uselib_flags = {
119 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
120 .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
121 .intent = LOOKUP_OPEN
122 };
118 123
119 if (IS_ERR(tmp)) 124 if (IS_ERR(tmp))
120 goto out; 125 goto out;
121 126
122 file = do_filp_open(AT_FDCWD, tmp, 127 file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
123 O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
124 MAY_READ | MAY_EXEC | MAY_OPEN);
125 putname(tmp); 128 putname(tmp);
126 error = PTR_ERR(file); 129 error = PTR_ERR(file);
127 if (IS_ERR(file)) 130 if (IS_ERR(file))
@@ -721,10 +724,13 @@ struct file *open_exec(const char *name)
721{ 724{
722 struct file *file; 725 struct file *file;
723 int err; 726 int err;
727 static const struct open_flags open_exec_flags = {
728 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
729 .acc_mode = MAY_EXEC | MAY_OPEN,
730 .intent = LOOKUP_OPEN
731 };
724 732
725 file = do_filp_open(AT_FDCWD, name, 733 file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW);
726 O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
727 MAY_EXEC | MAY_OPEN);
728 if (IS_ERR(file)) 734 if (IS_ERR(file))
729 goto out; 735 goto out;
730 736
@@ -1869,7 +1875,7 @@ static void wait_for_dump_helpers(struct file *file)
1869 1875
1870 1876
1871/* 1877/*
1872 * uhm_pipe_setup 1878 * umh_pipe_setup
1873 * helper function to customize the process used 1879 * helper function to customize the process used
1874 * to collect the core in userspace. Specifically 1880 * to collect the core in userspace. Specifically
1875 * it sets up a pipe and installs it as fd 0 (stdin) 1881 * it sets up a pipe and installs it as fd 0 (stdin)
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index f0d520312d8b..3bbd46956d77 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -53,10 +53,14 @@
53#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ 53#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
54 54
55/* exofs Application specific page/attribute */ 55/* exofs Application specific page/attribute */
56/* Inode attrs */
56# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) 57# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
57# define EXOFS_ATTR_INODE_DATA 1 58# define EXOFS_ATTR_INODE_DATA 1
58# define EXOFS_ATTR_INODE_FILE_LAYOUT 2 59# define EXOFS_ATTR_INODE_FILE_LAYOUT 2
59# define EXOFS_ATTR_INODE_DIR_LAYOUT 3 60# define EXOFS_ATTR_INODE_DIR_LAYOUT 3
61/* Partition attrs */
62# define EXOFS_APAGE_SB_DATA (0xF0000000U + 3)
63# define EXOFS_ATTR_SB_STATS 1
60 64
61/* 65/*
62 * The maximum number of files we can have is limited by the size of the 66 * The maximum number of files we can have is limited by the size of the
@@ -86,8 +90,8 @@ enum {
86 */ 90 */
87enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1}; 91enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
88struct exofs_fscb { 92struct exofs_fscb {
89 __le64 s_nextid; /* Highest object ID used */ 93 __le64 s_nextid; /* Only used after mkfs */
90 __le64 s_numfiles; /* Number of files on fs */ 94 __le64 s_numfiles; /* Only used after mkfs */
91 __le32 s_version; /* == EXOFS_FSCB_VER */ 95 __le32 s_version; /* == EXOFS_FSCB_VER */
92 __le16 s_magic; /* Magic signature */ 96 __le16 s_magic; /* Magic signature */
93 __le16 s_newfs; /* Non-zero if this is a new fs */ 97 __le16 s_newfs; /* Non-zero if this is a new fs */
@@ -98,10 +102,20 @@ struct exofs_fscb {
98} __packed; 102} __packed;
99 103
100/* 104/*
105 * This struct is set on the FS partition's attributes.
106 * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together
107 * with the create command, to atomically persist the sb writeable information.
108 */
109struct exofs_sb_stats {
110 __le64 s_nextid; /* Highest object ID used */
111 __le64 s_numfiles; /* Number of files on fs */
112} __packed;
113
114/*
101 * Describes the raid used in the FS. It is part of the device table. 115 * Describes the raid used in the FS. It is part of the device table.
102 * This here is taken from the pNFS-objects definition. In exofs we 116 * This here is taken from the pNFS-objects definition. In exofs we
103 * use one raid policy through-out the filesystem. (NOTE: the funny 117 * use one raid policy through-out the filesystem. (NOTE: the funny
104 * alignment at begining. We take care of it at exofs_device_table. 118 * alignment at beginning. We take care of it at exofs_device_table.
105 */ 119 */
106struct exofs_dt_data_map { 120struct exofs_dt_data_map {
107 __le32 cb_num_comps; 121 __le32 cb_num_comps;
@@ -122,7 +136,7 @@ struct exofs_dt_device_info {
122 u8 systemid[OSD_SYSTEMID_LEN]; 136 u8 systemid[OSD_SYSTEMID_LEN];
123 __le64 long_name_offset; /* If !0 then offset-in-file */ 137 __le64 long_name_offset; /* If !0 then offset-in-file */
124 __le32 osdname_len; /* */ 138 __le32 osdname_len; /* */
125 u8 osdname[44]; /* Embbeded, Ususally an asci uuid */ 139 u8 osdname[44]; /* Embbeded, Usually an asci uuid */
126} __packed; 140} __packed;
127 141
128/* 142/*
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index dcc941d82d67..d0941c6a1f72 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -124,7 +124,7 @@ out:
124 124
125Ebadsize: 125Ebadsize:
126 EXOFS_ERR("ERROR [exofs_check_page]: " 126 EXOFS_ERR("ERROR [exofs_check_page]: "
127 "size of directory #%lu is not a multiple of chunk size", 127 "size of directory(0x%lx) is not a multiple of chunk size\n",
128 dir->i_ino 128 dir->i_ino
129 ); 129 );
130 goto fail; 130 goto fail;
@@ -142,8 +142,8 @@ Espan:
142 goto bad_entry; 142 goto bad_entry;
143bad_entry: 143bad_entry:
144 EXOFS_ERR( 144 EXOFS_ERR(
145 "ERROR [exofs_check_page]: bad entry in directory #%lu: %s - " 145 "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - "
146 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d", 146 "offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n",
147 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, 147 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
148 _LLU(le64_to_cpu(p->inode_no)), 148 _LLU(le64_to_cpu(p->inode_no)),
149 rec_len, p->name_len); 149 rec_len, p->name_len);
@@ -151,8 +151,8 @@ bad_entry:
151Eend: 151Eend:
152 p = (struct exofs_dir_entry *)(kaddr + offs); 152 p = (struct exofs_dir_entry *)(kaddr + offs);
153 EXOFS_ERR("ERROR [exofs_check_page]: " 153 EXOFS_ERR("ERROR [exofs_check_page]: "
154 "entry in directory #%lu spans the page boundary" 154 "entry in directory(0x%lx) spans the page boundary"
155 "offset=%lu, inode=%llu", 155 "offset=%lu, inode=0x%llx\n",
156 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, 156 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
157 _LLU(le64_to_cpu(p->inode_no))); 157 _LLU(le64_to_cpu(p->inode_no)));
158fail: 158fail:
@@ -261,9 +261,8 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
261 struct page *page = exofs_get_page(inode, n); 261 struct page *page = exofs_get_page(inode, n);
262 262
263 if (IS_ERR(page)) { 263 if (IS_ERR(page)) {
264 EXOFS_ERR("ERROR: " 264 EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
265 "bad page in #%lu", 265 inode->i_ino);
266 inode->i_ino);
267 filp->f_pos += PAGE_CACHE_SIZE - offset; 266 filp->f_pos += PAGE_CACHE_SIZE - offset;
268 return PTR_ERR(page); 267 return PTR_ERR(page);
269 } 268 }
@@ -283,7 +282,8 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
283 for (; (char *)de <= limit; de = exofs_next_entry(de)) { 282 for (; (char *)de <= limit; de = exofs_next_entry(de)) {
284 if (de->rec_len == 0) { 283 if (de->rec_len == 0) {
285 EXOFS_ERR("ERROR: " 284 EXOFS_ERR("ERROR: "
286 "zero-length directory entry"); 285 "zero-length entry in directory(0x%lx)\n",
286 inode->i_ino);
287 exofs_put_page(page); 287 exofs_put_page(page);
288 return -EIO; 288 return -EIO;
289 } 289 }
@@ -342,9 +342,9 @@ struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
342 kaddr += exofs_last_byte(dir, n) - reclen; 342 kaddr += exofs_last_byte(dir, n) - reclen;
343 while ((char *) de <= kaddr) { 343 while ((char *) de <= kaddr) {
344 if (de->rec_len == 0) { 344 if (de->rec_len == 0) {
345 EXOFS_ERR( 345 EXOFS_ERR("ERROR: zero-length entry in "
346 "ERROR: exofs_find_entry: " 346 "directory(0x%lx)\n",
347 "zero-length directory entry"); 347 dir->i_ino);
348 exofs_put_page(page); 348 exofs_put_page(page);
349 goto out; 349 goto out;
350 } 350 }
@@ -472,7 +472,8 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode)
472 } 472 }
473 if (de->rec_len == 0) { 473 if (de->rec_len == 0) {
474 EXOFS_ERR("ERROR: exofs_add_link: " 474 EXOFS_ERR("ERROR: exofs_add_link: "
475 "zero-length directory entry"); 475 "zero-length entry in directory(0x%lx)\n",
476 inode->i_ino);
476 err = -EIO; 477 err = -EIO;
477 goto out_unlock; 478 goto out_unlock;
478 } 479 }
@@ -491,7 +492,8 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode)
491 exofs_put_page(page); 492 exofs_put_page(page);
492 } 493 }
493 494
494 EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode); 495 EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=0x%lx\n",
496 dentry, inode->i_ino);
495 return -EINVAL; 497 return -EINVAL;
496 498
497got_it: 499got_it:
@@ -542,7 +544,8 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
542 while (de < dir) { 544 while (de < dir) {
543 if (de->rec_len == 0) { 545 if (de->rec_len == 0) {
544 EXOFS_ERR("ERROR: exofs_delete_entry:" 546 EXOFS_ERR("ERROR: exofs_delete_entry:"
545 "zero-length directory entry"); 547 "zero-length entry in directory(0x%lx)\n",
548 inode->i_ino);
546 err = -EIO; 549 err = -EIO;
547 goto out; 550 goto out;
548 } 551 }
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 2dc925fa1010..c965806c2821 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -77,7 +77,7 @@ struct exofs_layout {
77 * our extension to the in-memory superblock 77 * our extension to the in-memory superblock
78 */ 78 */
79struct exofs_sb_info { 79struct exofs_sb_info {
80 struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ 80 struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/
81 int s_timeout; /* timeout for OSD operations */ 81 int s_timeout; /* timeout for OSD operations */
82 uint64_t s_nextid; /* highest object ID used */ 82 uint64_t s_nextid; /* highest object ID used */
83 uint32_t s_numfiles; /* number of files on fs */ 83 uint32_t s_numfiles; /* number of files on fs */
@@ -256,6 +256,8 @@ static inline int exofs_oi_read(struct exofs_i_info *oi,
256} 256}
257 257
258/* inode.c */ 258/* inode.c */
259unsigned exofs_max_io_pages(struct exofs_layout *layout,
260 unsigned expected_pages);
259int exofs_setattr(struct dentry *, struct iattr *); 261int exofs_setattr(struct dentry *, struct iattr *);
260int exofs_write_begin(struct file *file, struct address_space *mapping, 262int exofs_write_begin(struct file *file, struct address_space *mapping,
261 loff_t pos, unsigned len, unsigned flags, 263 loff_t pos, unsigned len, unsigned flags,
@@ -279,7 +281,7 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
279 struct inode *); 281 struct inode *);
280 282
281/* super.c */ 283/* super.c */
282int exofs_sync_fs(struct super_block *sb, int wait); 284int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
283 285
284/********************* 286/*********************
285 * operation vectors * 287 * operation vectors *
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index b905c79b4f0a..45ca323d8363 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -45,22 +45,8 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
45static int exofs_file_fsync(struct file *filp, int datasync) 45static int exofs_file_fsync(struct file *filp, int datasync)
46{ 46{
47 int ret; 47 int ret;
48 struct inode *inode = filp->f_mapping->host;
49 struct super_block *sb;
50
51 if (!(inode->i_state & I_DIRTY))
52 return 0;
53 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
54 return 0;
55
56 ret = sync_inode_metadata(inode, 1);
57
58 /* This is a good place to write the sb */
59 /* TODO: Sechedule an sb-sync on create */
60 sb = inode->i_sb;
61 if (sb->s_dirt)
62 exofs_sync_fs(sb, 1);
63 48
49 ret = sync_inode_metadata(filp->f_mapping->host, 1);
64 return ret; 50 return ret;
65} 51}
66 52
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 42685424817b..8472c098445d 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -43,6 +43,17 @@ enum { BIO_MAX_PAGES_KMALLOC =
43 PAGE_SIZE / sizeof(struct page *), 43 PAGE_SIZE / sizeof(struct page *),
44}; 44};
45 45
46unsigned exofs_max_io_pages(struct exofs_layout *layout,
47 unsigned expected_pages)
48{
49 unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
50
51 /* TODO: easily support bio chaining */
52 pages = min_t(unsigned, pages,
53 layout->group_width * BIO_MAX_PAGES_KMALLOC);
54 return pages;
55}
56
46struct page_collect { 57struct page_collect {
47 struct exofs_sb_info *sbi; 58 struct exofs_sb_info *sbi;
48 struct inode *inode; 59 struct inode *inode;
@@ -97,8 +108,7 @@ static void _pcol_reset(struct page_collect *pcol)
97 108
98static int pcol_try_alloc(struct page_collect *pcol) 109static int pcol_try_alloc(struct page_collect *pcol)
99{ 110{
100 unsigned pages = min_t(unsigned, pcol->expected_pages, 111 unsigned pages;
101 MAX_PAGES_KMALLOC);
102 112
103 if (!pcol->ios) { /* First time allocate io_state */ 113 if (!pcol->ios) { /* First time allocate io_state */
104 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); 114 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
@@ -108,8 +118,7 @@ static int pcol_try_alloc(struct page_collect *pcol)
108 } 118 }
109 119
110 /* TODO: easily support bio chaining */ 120 /* TODO: easily support bio chaining */
111 pages = min_t(unsigned, pages, 121 pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages);
112 pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC);
113 122
114 for (; pages; pages >>= 1) { 123 for (; pages; pages >>= 1) {
115 pcol->pages = kmalloc(pages * sizeof(struct page *), 124 pcol->pages = kmalloc(pages * sizeof(struct page *),
@@ -350,8 +359,10 @@ static int readpage_strip(void *data, struct page *page)
350 359
351 if (!pcol->read_4_write) 360 if (!pcol->read_4_write)
352 unlock_page(page); 361 unlock_page(page);
353 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," 362 EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx "
354 " splitting\n", inode->i_ino, page->index); 363 "read_4_write=%d index=0x%lx end_index=0x%lx "
364 "splitting\n", inode->i_ino, len,
365 pcol->read_4_write, page->index, end_index);
355 366
356 return read_exec(pcol); 367 return read_exec(pcol);
357 } 368 }
@@ -722,11 +733,28 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
722 733
723 /* read modify write */ 734 /* read modify write */
724 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { 735 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
736 loff_t i_size = i_size_read(mapping->host);
737 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
738 size_t rlen;
739
740 if (page->index < end_index)
741 rlen = PAGE_CACHE_SIZE;
742 else if (page->index == end_index)
743 rlen = i_size & ~PAGE_CACHE_MASK;
744 else
745 rlen = 0;
746
747 if (!rlen) {
748 clear_highpage(page);
749 SetPageUptodate(page);
750 goto out;
751 }
752
725 ret = _readpage(page, true); 753 ret = _readpage(page, true);
726 if (ret) { 754 if (ret) {
727 /*SetPageError was done by _readpage. Is it ok?*/ 755 /*SetPageError was done by _readpage. Is it ok?*/
728 unlock_page(page); 756 unlock_page(page);
729 EXOFS_DBGMSG("__readpage_filler failed\n"); 757 EXOFS_DBGMSG("__readpage failed\n");
730 } 758 }
731 } 759 }
732out: 760out:
@@ -795,7 +823,6 @@ const struct address_space_operations exofs_aops = {
795 .direct_IO = NULL, /* TODO: Should be trivial to do */ 823 .direct_IO = NULL, /* TODO: Should be trivial to do */
796 824
797 /* With these NULL has special meaning or default is not exported */ 825 /* With these NULL has special meaning or default is not exported */
798 .sync_page = NULL,
799 .get_xip_mem = NULL, 826 .get_xip_mem = NULL,
800 .migratepage = NULL, 827 .migratepage = NULL,
801 .launder_page = NULL, 828 .launder_page = NULL,
@@ -1074,6 +1101,7 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
1074 } 1101 }
1075 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; 1102 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
1076} 1103}
1104
1077/* 1105/*
1078 * Callback function from exofs_new_inode(). The important thing is that we 1106 * Callback function from exofs_new_inode(). The important thing is that we
1079 * set the obj_created flag so that other methods know that the object exists on 1107 * set the obj_created flag so that other methods know that the object exists on
@@ -1132,7 +1160,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1132 sbi = sb->s_fs_info; 1160 sbi = sb->s_fs_info;
1133 1161
1134 inode->i_mapping->backing_dev_info = sb->s_bdi; 1162 inode->i_mapping->backing_dev_info = sb->s_bdi;
1135 sb->s_dirt = 1;
1136 inode_init_owner(inode, dir, mode); 1163 inode_init_owner(inode, dir, mode);
1137 inode->i_ino = sbi->s_nextid++; 1164 inode->i_ino = sbi->s_nextid++;
1138 inode->i_blkbits = EXOFS_BLKSHIFT; 1165 inode->i_blkbits = EXOFS_BLKSHIFT;
@@ -1143,6 +1170,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1143 spin_unlock(&sbi->s_next_gen_lock); 1170 spin_unlock(&sbi->s_next_gen_lock);
1144 insert_inode_hash(inode); 1171 insert_inode_hash(inode);
1145 1172
1173 exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
1174
1146 mark_inode_dirty(inode); 1175 mark_inode_dirty(inode);
1147 1176
1148 ret = exofs_get_io_state(&sbi->layout, &ios); 1177 ret = exofs_get_io_state(&sbi->layout, &ios);
@@ -1273,7 +1302,8 @@ out:
1273 1302
1274int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) 1303int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
1275{ 1304{
1276 return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 1305 /* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */
1306 return exofs_update_inode(inode, 1);
1277} 1307}
1278 1308
1279/* 1309/*
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 264e95d02830..4d70db110cfc 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -272,7 +272,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
272 new_de = exofs_find_entry(new_dir, new_dentry, &new_page); 272 new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
273 if (!new_de) 273 if (!new_de)
274 goto out_dir; 274 goto out_dir;
275 inode_inc_link_count(old_inode);
276 err = exofs_set_link(new_dir, new_de, new_page, old_inode); 275 err = exofs_set_link(new_dir, new_de, new_page, old_inode);
277 new_inode->i_ctime = CURRENT_TIME; 276 new_inode->i_ctime = CURRENT_TIME;
278 if (dir_de) 277 if (dir_de)
@@ -286,12 +285,9 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
286 if (new_dir->i_nlink >= EXOFS_LINK_MAX) 285 if (new_dir->i_nlink >= EXOFS_LINK_MAX)
287 goto out_dir; 286 goto out_dir;
288 } 287 }
289 inode_inc_link_count(old_inode);
290 err = exofs_add_link(new_dentry, old_inode); 288 err = exofs_add_link(new_dentry, old_inode);
291 if (err) { 289 if (err)
292 inode_dec_link_count(old_inode);
293 goto out_dir; 290 goto out_dir;
294 }
295 if (dir_de) 291 if (dir_de)
296 inode_inc_link_count(new_dir); 292 inode_inc_link_count(new_dir);
297 } 293 }
@@ -299,7 +295,7 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
299 old_inode->i_ctime = CURRENT_TIME; 295 old_inode->i_ctime = CURRENT_TIME;
300 296
301 exofs_delete_entry(old_de, old_page); 297 exofs_delete_entry(old_de, old_page);
302 inode_dec_link_count(old_inode); 298 mark_inode_dirty(old_inode);
303 299
304 if (dir_de) { 300 if (dir_de) {
305 err = exofs_set_link(old_inode, dir_de, dir_page, new_dir); 301 err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 8c6c4669b381..06065bd37fc3 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -48,6 +48,7 @@
48 * struct to hold what we get from mount options 48 * struct to hold what we get from mount options
49 */ 49 */
50struct exofs_mountopt { 50struct exofs_mountopt {
51 bool is_osdname;
51 const char *dev_name; 52 const char *dev_name;
52 uint64_t pid; 53 uint64_t pid;
53 int timeout; 54 int timeout;
@@ -56,7 +57,7 @@ struct exofs_mountopt {
56/* 57/*
57 * exofs-specific mount-time options. 58 * exofs-specific mount-time options.
58 */ 59 */
59enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err }; 60enum { Opt_name, Opt_pid, Opt_to, Opt_err };
60 61
61/* 62/*
62 * Our mount-time options. These should ideally be 64-bit unsigned, but the 63 * Our mount-time options. These should ideally be 64-bit unsigned, but the
@@ -64,6 +65,7 @@ enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
64 * sufficient for most applications now. 65 * sufficient for most applications now.
65 */ 66 */
66static match_table_t tokens = { 67static match_table_t tokens = {
68 {Opt_name, "osdname=%s"},
67 {Opt_pid, "pid=%u"}, 69 {Opt_pid, "pid=%u"},
68 {Opt_to, "to=%u"}, 70 {Opt_to, "to=%u"},
69 {Opt_err, NULL} 71 {Opt_err, NULL}
@@ -94,6 +96,14 @@ static int parse_options(char *options, struct exofs_mountopt *opts)
94 96
95 token = match_token(p, tokens, args); 97 token = match_token(p, tokens, args);
96 switch (token) { 98 switch (token) {
99 case Opt_name:
100 opts->dev_name = match_strdup(&args[0]);
101 if (unlikely(!opts->dev_name)) {
102 EXOFS_ERR("Error allocating dev_name");
103 return -ENOMEM;
104 }
105 opts->is_osdname = true;
106 break;
97 case Opt_pid: 107 case Opt_pid:
98 if (0 == match_strlcpy(str, &args[0], sizeof(str))) 108 if (0 == match_strlcpy(str, &args[0], sizeof(str)))
99 return -EINVAL; 109 return -EINVAL;
@@ -203,6 +213,101 @@ static void destroy_inodecache(void)
203static const struct super_operations exofs_sops; 213static const struct super_operations exofs_sops;
204static const struct export_operations exofs_export_ops; 214static const struct export_operations exofs_export_ops;
205 215
216static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
217 EXOFS_APAGE_SB_DATA,
218 EXOFS_ATTR_SB_STATS,
219 sizeof(struct exofs_sb_stats));
220
221static int __sbi_read_stats(struct exofs_sb_info *sbi)
222{
223 struct osd_attr attrs[] = {
224 [0] = g_attr_sb_stats,
225 };
226 struct exofs_io_state *ios;
227 int ret;
228
229 ret = exofs_get_io_state(&sbi->layout, &ios);
230 if (unlikely(ret)) {
231 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
232 return ret;
233 }
234
235 ios->cred = sbi->s_cred;
236
237 ios->in_attr = attrs;
238 ios->in_attr_len = ARRAY_SIZE(attrs);
239
240 ret = exofs_sbi_read(ios);
241 if (unlikely(ret)) {
242 EXOFS_ERR("Error reading super_block stats => %d\n", ret);
243 goto out;
244 }
245
246 ret = extract_attr_from_ios(ios, &attrs[0]);
247 if (ret) {
248 EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__);
249 goto out;
250 }
251 if (attrs[0].len) {
252 struct exofs_sb_stats *ess;
253
254 if (unlikely(attrs[0].len != sizeof(*ess))) {
255 EXOFS_ERR("%s: Wrong version of exofs_sb_stats "
256 "size(%d) != expected(%zd)\n",
257 __func__, attrs[0].len, sizeof(*ess));
258 goto out;
259 }
260
261 ess = attrs[0].val_ptr;
262 sbi->s_nextid = le64_to_cpu(ess->s_nextid);
263 sbi->s_numfiles = le32_to_cpu(ess->s_numfiles);
264 }
265
266out:
267 exofs_put_io_state(ios);
268 return ret;
269}
270
271static void stats_done(struct exofs_io_state *ios, void *p)
272{
273 exofs_put_io_state(ios);
274 /* Good thanks nothing to do anymore */
275}
276
277/* Asynchronously write the stats attribute */
278int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
279{
280 struct osd_attr attrs[] = {
281 [0] = g_attr_sb_stats,
282 };
283 struct exofs_io_state *ios;
284 int ret;
285
286 ret = exofs_get_io_state(&sbi->layout, &ios);
287 if (unlikely(ret)) {
288 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
289 return ret;
290 }
291
292 sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid);
293 sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
294 attrs[0].val_ptr = &sbi->s_ess;
295
296 ios->cred = sbi->s_cred;
297 ios->done = stats_done;
298 ios->private = sbi;
299 ios->out_attr = attrs;
300 ios->out_attr_len = ARRAY_SIZE(attrs);
301
302 ret = exofs_sbi_write(ios);
303 if (unlikely(ret)) {
304 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
305 exofs_put_io_state(ios);
306 }
307
308 return ret;
309}
310
206/* 311/*
207 * Write the superblock to the OSD 312 * Write the superblock to the OSD
208 */ 313 */
@@ -213,18 +318,25 @@ int exofs_sync_fs(struct super_block *sb, int wait)
213 struct exofs_io_state *ios; 318 struct exofs_io_state *ios;
214 int ret = -ENOMEM; 319 int ret = -ENOMEM;
215 320
216 lock_super(sb); 321 fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
322 if (unlikely(!fscb))
323 return -ENOMEM;
324
217 sbi = sb->s_fs_info; 325 sbi = sb->s_fs_info;
218 fscb = &sbi->s_fscb;
219 326
327 /* NOTE: We no longer dirty the super_block anywhere in exofs. The
328 * reason we write the fscb here on unmount is so we can stay backwards
329 * compatible with fscb->s_version == 1. (What we are not compatible
330 * with is if a new version FS crashed and then we try to mount an old
331 * version). Otherwise the exofs_fscb is read-only from mkfs time. All
332 * the writeable info is set in exofs_sbi_write_stats() above.
333 */
220 ret = exofs_get_io_state(&sbi->layout, &ios); 334 ret = exofs_get_io_state(&sbi->layout, &ios);
221 if (ret) 335 if (unlikely(ret))
222 goto out; 336 goto out;
223 337
224 /* Note: We only write the changing part of the fscb. .i.e upto the 338 lock_super(sb);
225 * the fscb->s_dev_table_oid member. There is no read-modify-write 339
226 * here.
227 */
228 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); 340 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
229 memset(fscb, 0, ios->length); 341 memset(fscb, 0, ios->length);
230 fscb->s_nextid = cpu_to_le64(sbi->s_nextid); 342 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
@@ -239,16 +351,17 @@ int exofs_sync_fs(struct super_block *sb, int wait)
239 ios->cred = sbi->s_cred; 351 ios->cred = sbi->s_cred;
240 352
241 ret = exofs_sbi_write(ios); 353 ret = exofs_sbi_write(ios);
242 if (unlikely(ret)) { 354 if (unlikely(ret))
243 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); 355 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
244 goto out; 356 else
245 } 357 sb->s_dirt = 0;
246 sb->s_dirt = 0; 358
247 359
360 unlock_super(sb);
248out: 361out:
249 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); 362 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
250 exofs_put_io_state(ios); 363 exofs_put_io_state(ios);
251 unlock_super(sb); 364 kfree(fscb);
252 return ret; 365 return ret;
253} 366}
254 367
@@ -292,13 +405,14 @@ static void exofs_put_super(struct super_block *sb)
292 int num_pend; 405 int num_pend;
293 struct exofs_sb_info *sbi = sb->s_fs_info; 406 struct exofs_sb_info *sbi = sb->s_fs_info;
294 407
295 if (sb->s_dirt)
296 exofs_write_super(sb);
297
298 /* make sure there are no pending commands */ 408 /* make sure there are no pending commands */
299 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; 409 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
300 num_pend = atomic_read(&sbi->s_curr_pending)) { 410 num_pend = atomic_read(&sbi->s_curr_pending)) {
301 wait_queue_head_t wq; 411 wait_queue_head_t wq;
412
413 printk(KERN_NOTICE "%s: !!Pending operations in flight. "
414 "This is a BUG. please report to osd-dev@open-osd.org\n",
415 __func__);
302 init_waitqueue_head(&wq); 416 init_waitqueue_head(&wq);
303 wait_event_timeout(wq, 417 wait_event_timeout(wq,
304 (atomic_read(&sbi->s_curr_pending) == 0), 418 (atomic_read(&sbi->s_curr_pending) == 0),
@@ -390,6 +504,23 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
390 return 0; 504 return 0;
391} 505}
392 506
507static unsigned __ra_pages(struct exofs_layout *layout)
508{
509 const unsigned _MIN_RA = 32; /* min 128K read-ahead */
510 unsigned ra_pages = layout->group_width * layout->stripe_unit /
511 PAGE_SIZE;
512 unsigned max_io_pages = exofs_max_io_pages(layout, ~0);
513
514 ra_pages *= 2; /* two stripes */
515 if (ra_pages < _MIN_RA)
516 ra_pages = roundup(_MIN_RA, ra_pages / 2);
517
518 if (ra_pages > max_io_pages)
519 ra_pages = max_io_pages;
520
521 return ra_pages;
522}
523
393/* @odi is valid only as long as @fscb_dev is valid */ 524/* @odi is valid only as long as @fscb_dev is valid */
394static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, 525static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
395 struct osd_dev_info *odi) 526 struct osd_dev_info *odi)
@@ -495,7 +626,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
495 } 626 }
496 627
497 od = osduld_info_lookup(&odi); 628 od = osduld_info_lookup(&odi);
498 if (unlikely(IS_ERR(od))) { 629 if (IS_ERR(od)) {
499 ret = PTR_ERR(od); 630 ret = PTR_ERR(od);
500 EXOFS_ERR("ERROR: device requested is not found " 631 EXOFS_ERR("ERROR: device requested is not found "
501 "osd_name-%s =>%d\n", odi.osdname, ret); 632 "osd_name-%s =>%d\n", odi.osdname, ret);
@@ -558,9 +689,17 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
558 goto free_bdi; 689 goto free_bdi;
559 690
560 /* use mount options to fill superblock */ 691 /* use mount options to fill superblock */
561 od = osduld_path_lookup(opts->dev_name); 692 if (opts->is_osdname) {
693 struct osd_dev_info odi = {.systemid_len = 0};
694
695 odi.osdname_len = strlen(opts->dev_name);
696 odi.osdname = (u8 *)opts->dev_name;
697 od = osduld_info_lookup(&odi);
698 } else {
699 od = osduld_path_lookup(opts->dev_name);
700 }
562 if (IS_ERR(od)) { 701 if (IS_ERR(od)) {
563 ret = PTR_ERR(od); 702 ret = -EINVAL;
564 goto free_sbi; 703 goto free_sbi;
565 } 704 }
566 705
@@ -594,6 +733,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
594 goto free_sbi; 733 goto free_sbi;
595 734
596 sb->s_magic = le16_to_cpu(fscb.s_magic); 735 sb->s_magic = le16_to_cpu(fscb.s_magic);
736 /* NOTE: we read below to be backward compatible with old versions */
597 sbi->s_nextid = le64_to_cpu(fscb.s_nextid); 737 sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
598 sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles); 738 sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
599 739
@@ -604,7 +744,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
604 ret = -EINVAL; 744 ret = -EINVAL;
605 goto free_sbi; 745 goto free_sbi;
606 } 746 }
607 if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) { 747 if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) {
608 EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n", 748 EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
609 EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version)); 749 EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
610 ret = -EINVAL; 750 ret = -EINVAL;
@@ -622,7 +762,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
622 goto free_sbi; 762 goto free_sbi;
623 } 763 }
624 764
765 __sbi_read_stats(sbi);
766
625 /* set up operation vectors */ 767 /* set up operation vectors */
768 sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
626 sb->s_bdi = &sbi->bdi; 769 sb->s_bdi = &sbi->bdi;
627 sb->s_fs_info = sbi; 770 sb->s_fs_info = sbi;
628 sb->s_op = &exofs_sops; 771 sb->s_op = &exofs_sops;
@@ -652,6 +795,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
652 795
653 _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0], 796 _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0],
654 sbi->layout.s_pid); 797 sbi->layout.s_pid);
798 if (opts->is_osdname)
799 kfree(opts->dev_name);
655 return 0; 800 return 0;
656 801
657free_sbi: 802free_sbi:
@@ -660,6 +805,8 @@ free_bdi:
660 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", 805 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
661 opts->dev_name, sbi->layout.s_pid, ret); 806 opts->dev_name, sbi->layout.s_pid, ret);
662 exofs_free_sbi(sbi); 807 exofs_free_sbi(sbi);
808 if (opts->is_osdname)
809 kfree(opts->dev_name);
663 return ret; 810 return ret;
664} 811}
665 812
@@ -677,7 +824,8 @@ static struct dentry *exofs_mount(struct file_system_type *type,
677 if (ret) 824 if (ret)
678 return ERR_PTR(ret); 825 return ERR_PTR(ret);
679 826
680 opts.dev_name = dev_name; 827 if (!opts.dev_name)
828 opts.dev_name = dev_name;
681 return mount_nodev(type, flags, &opts, exofs_fill_super); 829 return mount_nodev(type, flags, &opts, exofs_fill_super);
682} 830}
683 831
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 4b6825740dd5..b05acb796135 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -320,9 +320,14 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid,
320 struct inode * inode = dentry->d_inode; 320 struct inode * inode = dentry->d_inode;
321 int len = *max_len; 321 int len = *max_len;
322 int type = FILEID_INO32_GEN; 322 int type = FILEID_INO32_GEN;
323 323
324 if (len < 2 || (connectable && len < 4)) 324 if (connectable && (len < 4)) {
325 *max_len = 4;
326 return 255;
327 } else if (len < 2) {
328 *max_len = 2;
325 return 255; 329 return 255;
330 }
326 331
327 len = 2; 332 len = 2;
328 fid->i32.ino = inode->i_ino; 333 fid->i32.ino = inode->i_ino;
@@ -369,6 +374,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
369 /* 374 /*
370 * Try to get any dentry for the given file handle from the filesystem. 375 * Try to get any dentry for the given file handle from the filesystem.
371 */ 376 */
377 if (!nop || !nop->fh_to_dentry)
378 return ERR_PTR(-ESTALE);
372 result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); 379 result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
373 if (!result) 380 if (!result)
374 result = ERR_PTR(-ESTALE); 381 result = ERR_PTR(-ESTALE);
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 7b4180554a62..abea5a17c764 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -406,7 +406,7 @@ ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
406 return -EINVAL; 406 return -EINVAL;
407 if (!test_opt(dentry->d_sb, POSIX_ACL)) 407 if (!test_opt(dentry->d_sb, POSIX_ACL))
408 return -EOPNOTSUPP; 408 return -EOPNOTSUPP;
409 if (!is_owner_or_cap(dentry->d_inode)) 409 if (!inode_owner_or_capable(dentry->d_inode))
410 return -EPERM; 410 return -EPERM;
411 411
412 if (value) { 412 if (value) {
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 0d06f4e75699..8f44cef1b3ef 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -850,7 +850,7 @@ static int find_next_reservable_window(
850 rsv_window_remove(sb, my_rsv); 850 rsv_window_remove(sb, my_rsv);
851 851
852 /* 852 /*
853 * Let's book the whole avaliable window for now. We will check the 853 * Let's book the whole available window for now. We will check the
854 * disk bitmap later and then, if there are free blocks then we adjust 854 * disk bitmap later and then, if there are free blocks then we adjust
855 * the window size if it's larger than requested. 855 * the window size if it's larger than requested.
856 * Otherwise, we will remove this node from the tree next time 856 * Otherwise, we will remove this node from the tree next time
@@ -1357,9 +1357,9 @@ retry_alloc:
1357 goto allocated; 1357 goto allocated;
1358 } 1358 }
1359 /* 1359 /*
1360 * We may end up a bogus ealier ENOSPC error due to 1360 * We may end up a bogus earlier ENOSPC error due to
1361 * filesystem is "full" of reservations, but 1361 * filesystem is "full" of reservations, but
1362 * there maybe indeed free blocks avaliable on disk 1362 * there maybe indeed free blocks available on disk
1363 * In this case, we just forget about the reservations 1363 * In this case, we just forget about the reservations
1364 * just do block allocation as without reservations. 1364 * just do block allocation as without reservations.
1365 */ 1365 */
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 6346a2acf326..645be9e7ee47 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -110,7 +110,7 @@ extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
110extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int); 110extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
111 111
112/* ialloc.c */ 112/* ialloc.c */
113extern struct inode * ext2_new_inode (struct inode *, int); 113extern struct inode * ext2_new_inode (struct inode *, int, const struct qstr *);
114extern void ext2_free_inode (struct inode *); 114extern void ext2_free_inode (struct inode *);
115extern unsigned long ext2_count_free_inodes (struct super_block *); 115extern unsigned long ext2_count_free_inodes (struct super_block *);
116extern void ext2_check_inodes_bitmap (struct super_block *); 116extern void ext2_check_inodes_bitmap (struct super_block *);
@@ -174,3 +174,9 @@ ext2_group_first_block_no(struct super_block *sb, unsigned long group_no)
174 return group_no * (ext2_fsblk_t)EXT2_BLOCKS_PER_GROUP(sb) + 174 return group_no * (ext2_fsblk_t)EXT2_BLOCKS_PER_GROUP(sb) +
175 le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block); 175 le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block);
176} 176}
177
178#define ext2_set_bit __test_and_set_bit_le
179#define ext2_clear_bit __test_and_clear_bit_le
180#define ext2_test_bit test_bit_le
181#define ext2_find_first_zero_bit find_first_zero_bit_le
182#define ext2_find_next_zero_bit find_next_zero_bit_le
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index ad70479aabff..ee9ed31948e1 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -429,7 +429,8 @@ found:
429 return group; 429 return group;
430} 430}
431 431
432struct inode *ext2_new_inode(struct inode *dir, int mode) 432struct inode *ext2_new_inode(struct inode *dir, int mode,
433 const struct qstr *qstr)
433{ 434{
434 struct super_block *sb; 435 struct super_block *sb;
435 struct buffer_head *bitmap_bh = NULL; 436 struct buffer_head *bitmap_bh = NULL;
@@ -585,7 +586,7 @@ got:
585 if (err) 586 if (err)
586 goto fail_free_drop; 587 goto fail_free_drop;
587 588
588 err = ext2_init_security(inode,dir); 589 err = ext2_init_security(inode, dir, qstr);
589 if (err) 590 if (err)
590 goto fail_free_drop; 591 goto fail_free_drop;
591 592
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 40ad210a5049..788e09a07f7e 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -305,7 +305,7 @@ static ext2_fsblk_t ext2_find_near(struct inode *inode, Indirect *ind)
305 return ind->bh->b_blocknr; 305 return ind->bh->b_blocknr;
306 306
307 /* 307 /*
308 * It is going to be refered from inode itself? OK, just put it into 308 * It is going to be referred from inode itself? OK, just put it into
309 * the same cylinder group then. 309 * the same cylinder group then.
310 */ 310 */
311 bg_start = ext2_group_first_block_no(inode->i_sb, ei->i_block_group); 311 bg_start = ext2_group_first_block_no(inode->i_sb, ei->i_block_group);
@@ -860,7 +860,6 @@ const struct address_space_operations ext2_aops = {
860 .readpage = ext2_readpage, 860 .readpage = ext2_readpage,
861 .readpages = ext2_readpages, 861 .readpages = ext2_readpages,
862 .writepage = ext2_writepage, 862 .writepage = ext2_writepage,
863 .sync_page = block_sync_page,
864 .write_begin = ext2_write_begin, 863 .write_begin = ext2_write_begin,
865 .write_end = ext2_write_end, 864 .write_end = ext2_write_end,
866 .bmap = ext2_bmap, 865 .bmap = ext2_bmap,
@@ -880,7 +879,6 @@ const struct address_space_operations ext2_nobh_aops = {
880 .readpage = ext2_readpage, 879 .readpage = ext2_readpage,
881 .readpages = ext2_readpages, 880 .readpages = ext2_readpages,
882 .writepage = ext2_nobh_writepage, 881 .writepage = ext2_nobh_writepage,
883 .sync_page = block_sync_page,
884 .write_begin = ext2_nobh_write_begin, 882 .write_begin = ext2_nobh_write_begin,
885 .write_end = nobh_write_end, 883 .write_end = nobh_write_end,
886 .bmap = ext2_bmap, 884 .bmap = ext2_bmap,
@@ -915,7 +913,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
915 * 913 *
916 * When we do truncate() we may have to clean the ends of several indirect 914 * When we do truncate() we may have to clean the ends of several indirect
917 * blocks but leave the blocks themselves alive. Block is partially 915 * blocks but leave the blocks themselves alive. Block is partially
918 * truncated if some data below the new i_size is refered from it (and 916 * truncated if some data below the new i_size is referred from it (and
919 * it is on the path to the first completely truncated data block, indeed). 917 * it is on the path to the first completely truncated data block, indeed).
920 * We have to free the top of that path along with everything to the right 918 * We have to free the top of that path along with everything to the right
921 * of the path. Since no allocation past the truncation point is possible 919 * of the path. Since no allocation past the truncation point is possible
@@ -992,7 +990,7 @@ no_top:
992 * @p: array of block numbers 990 * @p: array of block numbers
993 * @q: points immediately past the end of array 991 * @q: points immediately past the end of array
994 * 992 *
995 * We are freeing all blocks refered from that array (numbers are 993 * We are freeing all blocks referred from that array (numbers are
996 * stored as little-endian 32-bit) and updating @inode->i_blocks 994 * stored as little-endian 32-bit) and updating @inode->i_blocks
997 * appropriately. 995 * appropriately.
998 */ 996 */
@@ -1032,7 +1030,7 @@ static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q)
1032 * @q: pointer immediately past the end of array 1030 * @q: pointer immediately past the end of array
1033 * @depth: depth of the branches to free 1031 * @depth: depth of the branches to free
1034 * 1032 *
1035 * We are freeing all blocks refered from these branches (numbers are 1033 * We are freeing all blocks referred from these branches (numbers are
1036 * stored as little-endian 32-bit) and updating @inode->i_blocks 1034 * stored as little-endian 32-bit) and updating @inode->i_blocks
1037 * appropriately. 1035 * appropriately.
1038 */ 1036 */
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index e7431309bdca..f81e250ac5c4 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -39,7 +39,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
39 if (ret) 39 if (ret)
40 return ret; 40 return ret;
41 41
42 if (!is_owner_or_cap(inode)) { 42 if (!inode_owner_or_capable(inode)) {
43 ret = -EACCES; 43 ret = -EACCES;
44 goto setflags_out; 44 goto setflags_out;
45 } 45 }
@@ -89,7 +89,7 @@ setflags_out:
89 case EXT2_IOC_GETVERSION: 89 case EXT2_IOC_GETVERSION:
90 return put_user(inode->i_generation, (int __user *) arg); 90 return put_user(inode->i_generation, (int __user *) arg);
91 case EXT2_IOC_SETVERSION: 91 case EXT2_IOC_SETVERSION:
92 if (!is_owner_or_cap(inode)) 92 if (!inode_owner_or_capable(inode))
93 return -EPERM; 93 return -EPERM;
94 ret = mnt_want_write(filp->f_path.mnt); 94 ret = mnt_want_write(filp->f_path.mnt);
95 if (ret) 95 if (ret)
@@ -115,7 +115,7 @@ setflags_out:
115 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) 115 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
116 return -ENOTTY; 116 return -ENOTTY;
117 117
118 if (!is_owner_or_cap(inode)) 118 if (!inode_owner_or_capable(inode))
119 return -EACCES; 119 return -EACCES;
120 120
121 if (get_user(rsv_window_size, (int __user *)arg)) 121 if (get_user(rsv_window_size, (int __user *)arg))
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 2e1d8341d827..ed5c5d496ee9 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -104,7 +104,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, st
104 104
105 dquot_initialize(dir); 105 dquot_initialize(dir);
106 106
107 inode = ext2_new_inode(dir, mode); 107 inode = ext2_new_inode(dir, mode, &dentry->d_name);
108 if (IS_ERR(inode)) 108 if (IS_ERR(inode))
109 return PTR_ERR(inode); 109 return PTR_ERR(inode);
110 110
@@ -133,7 +133,7 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_
133 133
134 dquot_initialize(dir); 134 dquot_initialize(dir);
135 135
136 inode = ext2_new_inode (dir, mode); 136 inode = ext2_new_inode (dir, mode, &dentry->d_name);
137 err = PTR_ERR(inode); 137 err = PTR_ERR(inode);
138 if (!IS_ERR(inode)) { 138 if (!IS_ERR(inode)) {
139 init_special_inode(inode, inode->i_mode, rdev); 139 init_special_inode(inode, inode->i_mode, rdev);
@@ -159,7 +159,7 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
159 159
160 dquot_initialize(dir); 160 dquot_initialize(dir);
161 161
162 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO); 162 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO, &dentry->d_name);
163 err = PTR_ERR(inode); 163 err = PTR_ERR(inode);
164 if (IS_ERR(inode)) 164 if (IS_ERR(inode))
165 goto out; 165 goto out;
@@ -230,7 +230,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
230 230
231 inode_inc_link_count(dir); 231 inode_inc_link_count(dir);
232 232
233 inode = ext2_new_inode (dir, S_IFDIR | mode); 233 inode = ext2_new_inode(dir, S_IFDIR | mode, &dentry->d_name);
234 err = PTR_ERR(inode); 234 err = PTR_ERR(inode);
235 if (IS_ERR(inode)) 235 if (IS_ERR(inode))
236 goto out_dir; 236 goto out_dir;
@@ -344,7 +344,6 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
344 new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page); 344 new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page);
345 if (!new_de) 345 if (!new_de)
346 goto out_dir; 346 goto out_dir;
347 inode_inc_link_count(old_inode);
348 ext2_set_link(new_dir, new_de, new_page, old_inode, 1); 347 ext2_set_link(new_dir, new_de, new_page, old_inode, 1);
349 new_inode->i_ctime = CURRENT_TIME_SEC; 348 new_inode->i_ctime = CURRENT_TIME_SEC;
350 if (dir_de) 349 if (dir_de)
@@ -356,12 +355,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
356 if (new_dir->i_nlink >= EXT2_LINK_MAX) 355 if (new_dir->i_nlink >= EXT2_LINK_MAX)
357 goto out_dir; 356 goto out_dir;
358 } 357 }
359 inode_inc_link_count(old_inode);
360 err = ext2_add_link(new_dentry, old_inode); 358 err = ext2_add_link(new_dentry, old_inode);
361 if (err) { 359 if (err)
362 inode_dec_link_count(old_inode);
363 goto out_dir; 360 goto out_dir;
364 }
365 if (dir_de) 361 if (dir_de)
366 inode_inc_link_count(new_dir); 362 inode_inc_link_count(new_dir);
367 } 363 }
@@ -369,12 +365,11 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
369 /* 365 /*
370 * Like most other Unix systems, set the ctime for inodes on a 366 * Like most other Unix systems, set the ctime for inodes on a
371 * rename. 367 * rename.
372 * inode_dec_link_count() will mark the inode dirty.
373 */ 368 */
374 old_inode->i_ctime = CURRENT_TIME_SEC; 369 old_inode->i_ctime = CURRENT_TIME_SEC;
370 mark_inode_dirty(old_inode);
375 371
376 ext2_delete_entry (old_de, old_page); 372 ext2_delete_entry (old_de, old_page);
377 inode_dec_link_count(old_inode);
378 373
379 if (dir_de) { 374 if (dir_de) {
380 if (old_dir != new_dir) 375 if (old_dir != new_dir)
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 7731695e65d9..0a78dae7e2cb 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1382,7 +1382,7 @@ static struct dentry *ext2_mount(struct file_system_type *fs_type,
1382 1382
1383/* Read data from quotafile - avoid pagecache and such because we cannot afford 1383/* Read data from quotafile - avoid pagecache and such because we cannot afford
1384 * acquiring the locks... As quota files are never truncated and quota code 1384 * acquiring the locks... As quota files are never truncated and quota code
1385 * itself serializes the operations (and noone else should touch the files) 1385 * itself serializes the operations (and no one else should touch the files)
1386 * we don't have to be afraid of races */ 1386 * we don't have to be afraid of races */
1387static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, 1387static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data,
1388 size_t len, loff_t off) 1388 size_t len, loff_t off)
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index c2e4dce984d2..529970617a21 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -35,7 +35,7 @@
35 * +------------------+ 35 * +------------------+
36 * 36 *
37 * The block header is followed by multiple entry descriptors. These entry 37 * The block header is followed by multiple entry descriptors. These entry
38 * descriptors are variable in size, and alligned to EXT2_XATTR_PAD 38 * descriptors are variable in size, and aligned to EXT2_XATTR_PAD
39 * byte boundaries. The entry descriptors are sorted by attribute name, 39 * byte boundaries. The entry descriptors are sorted by attribute name,
40 * so that two extended attribute blocks can be compared efficiently. 40 * so that two extended attribute blocks can be compared efficiently.
41 * 41 *
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index a1a1c2184616..5e41cccff762 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -116,9 +116,11 @@ exit_ext2_xattr(void)
116# endif /* CONFIG_EXT2_FS_XATTR */ 116# endif /* CONFIG_EXT2_FS_XATTR */
117 117
118#ifdef CONFIG_EXT2_FS_SECURITY 118#ifdef CONFIG_EXT2_FS_SECURITY
119extern int ext2_init_security(struct inode *inode, struct inode *dir); 119extern int ext2_init_security(struct inode *inode, struct inode *dir,
120 const struct qstr *qstr);
120#else 121#else
121static inline int ext2_init_security(struct inode *inode, struct inode *dir) 122static inline int ext2_init_security(struct inode *inode, struct inode *dir,
123 const struct qstr *qstr)
122{ 124{
123 return 0; 125 return 0;
124} 126}
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 3004e15d5da5..5d979b4347b0 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -47,14 +47,15 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name,
47} 47}
48 48
49int 49int
50ext2_init_security(struct inode *inode, struct inode *dir) 50ext2_init_security(struct inode *inode, struct inode *dir,
51 const struct qstr *qstr)
51{ 52{
52 int err; 53 int err;
53 size_t len; 54 size_t len;
54 void *value; 55 void *value;
55 char *name; 56 char *name;
56 57
57 err = security_inode_init_security(inode, dir, &name, &value, &len); 58 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
58 if (err) { 59 if (err) {
59 if (err == -EOPNOTSUPP) 60 if (err == -EOPNOTSUPP)
60 return 0; 61 return 0;
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index e4fa49e6c539..9d021c0d472a 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -435,7 +435,7 @@ ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
435 return -EINVAL; 435 return -EINVAL;
436 if (!test_opt(inode->i_sb, POSIX_ACL)) 436 if (!test_opt(inode->i_sb, POSIX_ACL))
437 return -EOPNOTSUPP; 437 return -EOPNOTSUPP;
438 if (!is_owner_or_cap(inode)) 438 if (!inode_owner_or_capable(inode))
439 return -EPERM; 439 return -EPERM;
440 440
441 if (value) { 441 if (value) {
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 045995c8ce5a..fe52297e31ad 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -590,7 +590,7 @@ do_more:
590 BUFFER_TRACE(debug_bh, "Deleted!"); 590 BUFFER_TRACE(debug_bh, "Deleted!");
591 if (!bh2jh(bitmap_bh)->b_committed_data) 591 if (!bh2jh(bitmap_bh)->b_committed_data)
592 BUFFER_TRACE(debug_bh, 592 BUFFER_TRACE(debug_bh,
593 "No commited data in bitmap"); 593 "No committed data in bitmap");
594 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); 594 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
595 __brelse(debug_bh); 595 __brelse(debug_bh);
596 } 596 }
@@ -1063,7 +1063,7 @@ static int find_next_reservable_window(
1063 rsv_window_remove(sb, my_rsv); 1063 rsv_window_remove(sb, my_rsv);
1064 1064
1065 /* 1065 /*
1066 * Let's book the whole avaliable window for now. We will check the 1066 * Let's book the whole available window for now. We will check the
1067 * disk bitmap later and then, if there are free blocks then we adjust 1067 * disk bitmap later and then, if there are free blocks then we adjust
1068 * the window size if it's larger than requested. 1068 * the window size if it's larger than requested.
1069 * Otherwise, we will remove this node from the tree next time 1069 * Otherwise, we will remove this node from the tree next time
@@ -1456,7 +1456,7 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
1456 * 1456 *
1457 * ext3_should_retry_alloc() is called when ENOSPC is returned, and if 1457 * ext3_should_retry_alloc() is called when ENOSPC is returned, and if
1458 * it is profitable to retry the operation, this function will wait 1458 * it is profitable to retry the operation, this function will wait
1459 * for the current or commiting transaction to complete, and then 1459 * for the current or committing transaction to complete, and then
1460 * return TRUE. 1460 * return TRUE.
1461 * 1461 *
1462 * if the total number of retries exceed three times, return FALSE. 1462 * if the total number of retries exceed three times, return FALSE.
@@ -1632,9 +1632,9 @@ retry_alloc:
1632 goto allocated; 1632 goto allocated;
1633 } 1633 }
1634 /* 1634 /*
1635 * We may end up a bogus ealier ENOSPC error due to 1635 * We may end up a bogus earlier ENOSPC error due to
1636 * filesystem is "full" of reservations, but 1636 * filesystem is "full" of reservations, but
1637 * there maybe indeed free blocks avaliable on disk 1637 * there maybe indeed free blocks available on disk
1638 * In this case, we just forget about the reservations 1638 * In this case, we just forget about the reservations
1639 * just do block allocation as without reservations. 1639 * just do block allocation as without reservations.
1640 */ 1640 */
@@ -1991,6 +1991,7 @@ ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
1991 spin_unlock(sb_bgl_lock(sbi, group)); 1991 spin_unlock(sb_bgl_lock(sbi, group));
1992 percpu_counter_sub(&sbi->s_freeblocks_counter, next - start); 1992 percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
1993 1993
1994 free_blocks -= next - start;
1994 /* Do not issue a TRIM on extents smaller than minblocks */ 1995 /* Do not issue a TRIM on extents smaller than minblocks */
1995 if ((next - start) < minblocks) 1996 if ((next - start) < minblocks)
1996 goto free_extent; 1997 goto free_extent;
@@ -2040,7 +2041,7 @@ free_extent:
2040 cond_resched(); 2041 cond_resched();
2041 2042
2042 /* No more suitable extents */ 2043 /* No more suitable extents */
2043 if ((free_blocks - count) < minblocks) 2044 if (free_blocks < minblocks)
2044 break; 2045 break;
2045 } 2046 }
2046 2047
@@ -2090,7 +2091,8 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2090 ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count); 2091 ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
2091 int ret = 0; 2092 int ret = 0;
2092 2093
2093 start = range->start >> sb->s_blocksize_bits; 2094 start = (range->start >> sb->s_blocksize_bits) +
2095 le32_to_cpu(es->s_first_data_block);
2094 len = range->len >> sb->s_blocksize_bits; 2096 len = range->len >> sb->s_blocksize_bits;
2095 minlen = range->minlen >> sb->s_blocksize_bits; 2097 minlen = range->minlen >> sb->s_blocksize_bits;
2096 trimmed = 0; 2098 trimmed = 0;
@@ -2099,10 +2101,6 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2099 return -EINVAL; 2101 return -EINVAL;
2100 if (start >= max_blks) 2102 if (start >= max_blks)
2101 goto out; 2103 goto out;
2102 if (start < le32_to_cpu(es->s_first_data_block)) {
2103 len -= le32_to_cpu(es->s_first_data_block) - start;
2104 start = le32_to_cpu(es->s_first_data_block);
2105 }
2106 if (start + len > max_blks) 2104 if (start + len > max_blks)
2107 len = max_blks - start; 2105 len = max_blks - start;
2108 2106
@@ -2129,10 +2127,15 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2129 if (free_blocks < minlen) 2127 if (free_blocks < minlen)
2130 continue; 2128 continue;
2131 2129
2132 if (len >= EXT3_BLOCKS_PER_GROUP(sb)) 2130 /*
2133 len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block); 2131 * For all the groups except the last one, last block will
2134 else 2132 * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to
2133 * change it for the last group in which case first_block +
2134 * len < EXT3_BLOCKS_PER_GROUP(sb).
2135 */
2136 if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb))
2135 last_block = first_block + len; 2137 last_block = first_block + len;
2138 len -= last_block - first_block;
2136 2139
2137 ret = ext3_trim_all_free(sb, group, first_block, 2140 ret = ext3_trim_all_free(sb, group, first_block,
2138 last_block, minlen); 2141 last_block, minlen);
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 9724aef22460..bfc2dc43681d 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -404,7 +404,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent)
404 * For other inodes, search forward from the parent directory's block 404 * For other inodes, search forward from the parent directory's block
405 * group to find a free inode. 405 * group to find a free inode.
406 */ 406 */
407struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) 407struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
408 const struct qstr *qstr, int mode)
408{ 409{
409 struct super_block *sb; 410 struct super_block *sb;
410 struct buffer_head *bitmap_bh = NULL; 411 struct buffer_head *bitmap_bh = NULL;
@@ -589,7 +590,7 @@ got:
589 if (err) 590 if (err)
590 goto fail_free_drop; 591 goto fail_free_drop;
591 592
592 err = ext3_init_security(handle,inode, dir); 593 err = ext3_init_security(handle, inode, dir, qstr);
593 if (err) 594 if (err)
594 goto fail_free_drop; 595 goto fail_free_drop;
595 596
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index ae94f6d949f5..68b2e43d7c35 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1894,7 +1894,6 @@ static const struct address_space_operations ext3_ordered_aops = {
1894 .readpage = ext3_readpage, 1894 .readpage = ext3_readpage,
1895 .readpages = ext3_readpages, 1895 .readpages = ext3_readpages,
1896 .writepage = ext3_ordered_writepage, 1896 .writepage = ext3_ordered_writepage,
1897 .sync_page = block_sync_page,
1898 .write_begin = ext3_write_begin, 1897 .write_begin = ext3_write_begin,
1899 .write_end = ext3_ordered_write_end, 1898 .write_end = ext3_ordered_write_end,
1900 .bmap = ext3_bmap, 1899 .bmap = ext3_bmap,
@@ -1910,7 +1909,6 @@ static const struct address_space_operations ext3_writeback_aops = {
1910 .readpage = ext3_readpage, 1909 .readpage = ext3_readpage,
1911 .readpages = ext3_readpages, 1910 .readpages = ext3_readpages,
1912 .writepage = ext3_writeback_writepage, 1911 .writepage = ext3_writeback_writepage,
1913 .sync_page = block_sync_page,
1914 .write_begin = ext3_write_begin, 1912 .write_begin = ext3_write_begin,
1915 .write_end = ext3_writeback_write_end, 1913 .write_end = ext3_writeback_write_end,
1916 .bmap = ext3_bmap, 1914 .bmap = ext3_bmap,
@@ -1926,7 +1924,6 @@ static const struct address_space_operations ext3_journalled_aops = {
1926 .readpage = ext3_readpage, 1924 .readpage = ext3_readpage,
1927 .readpages = ext3_readpages, 1925 .readpages = ext3_readpages,
1928 .writepage = ext3_journalled_writepage, 1926 .writepage = ext3_journalled_writepage,
1929 .sync_page = block_sync_page,
1930 .write_begin = ext3_write_begin, 1927 .write_begin = ext3_write_begin,
1931 .write_end = ext3_journalled_write_end, 1928 .write_end = ext3_journalled_write_end,
1932 .set_page_dirty = ext3_journalled_set_page_dirty, 1929 .set_page_dirty = ext3_journalled_set_page_dirty,
@@ -2058,7 +2055,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
2058 * 2055 *
2059 * When we do truncate() we may have to clean the ends of several 2056 * When we do truncate() we may have to clean the ends of several
2060 * indirect blocks but leave the blocks themselves alive. Block is 2057 * indirect blocks but leave the blocks themselves alive. Block is
2061 * partially truncated if some data below the new i_size is refered 2058 * partially truncated if some data below the new i_size is referred
2062 * from it (and it is on the path to the first completely truncated 2059 * from it (and it is on the path to the first completely truncated
2063 * data block, indeed). We have to free the top of that path along 2060 * data block, indeed). We have to free the top of that path along
2064 * with everything to the right of the path. Since no allocation 2061 * with everything to the right of the path. Since no allocation
@@ -2187,7 +2184,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
2187 * @first: array of block numbers 2184 * @first: array of block numbers
2188 * @last: points immediately past the end of array 2185 * @last: points immediately past the end of array
2189 * 2186 *
2190 * We are freeing all blocks refered from that array (numbers are stored as 2187 * We are freeing all blocks referred from that array (numbers are stored as
2191 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 2188 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
2192 * 2189 *
2193 * We accumulate contiguous runs of blocks to free. Conveniently, if these 2190 * We accumulate contiguous runs of blocks to free. Conveniently, if these
@@ -2275,7 +2272,7 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
2275 * @last: pointer immediately past the end of array 2272 * @last: pointer immediately past the end of array
2276 * @depth: depth of the branches to free 2273 * @depth: depth of the branches to free
2277 * 2274 *
2278 * We are freeing all blocks refered from these branches (numbers are 2275 * We are freeing all blocks referred from these branches (numbers are
2279 * stored as little-endian 32-bit) and updating @inode->i_blocks 2276 * stored as little-endian 32-bit) and updating @inode->i_blocks
2280 * appropriately. 2277 * appropriately.
2281 */ 2278 */
@@ -3294,7 +3291,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
3294 if (ext3_should_journal_data(inode)) 3291 if (ext3_should_journal_data(inode))
3295 ret = 3 * (bpp + indirects) + 2; 3292 ret = 3 * (bpp + indirects) + 2;
3296 else 3293 else
3297 ret = 2 * (bpp + indirects) + 2; 3294 ret = 2 * (bpp + indirects) + indirects + 2;
3298 3295
3299#ifdef CONFIG_QUOTA 3296#ifdef CONFIG_QUOTA
3300 /* We know that structure was already allocated during dquot_initialize so 3297 /* We know that structure was already allocated during dquot_initialize so
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index fc080dd561f7..f4090bd2f345 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -38,7 +38,7 @@ long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
38 unsigned int oldflags; 38 unsigned int oldflags;
39 unsigned int jflag; 39 unsigned int jflag;
40 40
41 if (!is_owner_or_cap(inode)) 41 if (!inode_owner_or_capable(inode))
42 return -EACCES; 42 return -EACCES;
43 43
44 if (get_user(flags, (int __user *) arg)) 44 if (get_user(flags, (int __user *) arg))
@@ -123,7 +123,7 @@ flags_out:
123 __u32 generation; 123 __u32 generation;
124 int err; 124 int err;
125 125
126 if (!is_owner_or_cap(inode)) 126 if (!inode_owner_or_capable(inode))
127 return -EPERM; 127 return -EPERM;
128 128
129 err = mnt_want_write(filp->f_path.mnt); 129 err = mnt_want_write(filp->f_path.mnt);
@@ -192,7 +192,7 @@ setversion_out:
192 if (err) 192 if (err)
193 return err; 193 return err;
194 194
195 if (!is_owner_or_cap(inode)) { 195 if (!inode_owner_or_capable(inode)) {
196 err = -EACCES; 196 err = -EACCES;
197 goto setrsvsz_out; 197 goto setrsvsz_out;
198 } 198 }
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index b27ba71810ec..32f3b8695859 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1540,8 +1540,8 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1540 goto cleanup; 1540 goto cleanup;
1541 node2 = (struct dx_node *)(bh2->b_data); 1541 node2 = (struct dx_node *)(bh2->b_data);
1542 entries2 = node2->entries; 1542 entries2 = node2->entries;
1543 memset(&node2->fake, 0, sizeof(struct fake_dirent));
1543 node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize); 1544 node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
1544 node2->fake.inode = 0;
1545 BUFFER_TRACE(frame->bh, "get_write_access"); 1545 BUFFER_TRACE(frame->bh, "get_write_access");
1546 err = ext3_journal_get_write_access(handle, frame->bh); 1546 err = ext3_journal_get_write_access(handle, frame->bh);
1547 if (err) 1547 if (err)
@@ -1710,7 +1710,7 @@ retry:
1710 if (IS_DIRSYNC(dir)) 1710 if (IS_DIRSYNC(dir))
1711 handle->h_sync = 1; 1711 handle->h_sync = 1;
1712 1712
1713 inode = ext3_new_inode (handle, dir, mode); 1713 inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
1714 err = PTR_ERR(inode); 1714 err = PTR_ERR(inode);
1715 if (!IS_ERR(inode)) { 1715 if (!IS_ERR(inode)) {
1716 inode->i_op = &ext3_file_inode_operations; 1716 inode->i_op = &ext3_file_inode_operations;
@@ -1746,7 +1746,7 @@ retry:
1746 if (IS_DIRSYNC(dir)) 1746 if (IS_DIRSYNC(dir))
1747 handle->h_sync = 1; 1747 handle->h_sync = 1;
1748 1748
1749 inode = ext3_new_inode (handle, dir, mode); 1749 inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
1750 err = PTR_ERR(inode); 1750 err = PTR_ERR(inode);
1751 if (!IS_ERR(inode)) { 1751 if (!IS_ERR(inode)) {
1752 init_special_inode(inode, inode->i_mode, rdev); 1752 init_special_inode(inode, inode->i_mode, rdev);
@@ -1784,7 +1784,7 @@ retry:
1784 if (IS_DIRSYNC(dir)) 1784 if (IS_DIRSYNC(dir))
1785 handle->h_sync = 1; 1785 handle->h_sync = 1;
1786 1786
1787 inode = ext3_new_inode (handle, dir, S_IFDIR | mode); 1787 inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode);
1788 err = PTR_ERR(inode); 1788 err = PTR_ERR(inode);
1789 if (IS_ERR(inode)) 1789 if (IS_ERR(inode))
1790 goto out_stop; 1790 goto out_stop;
@@ -2206,7 +2206,7 @@ retry:
2206 if (IS_DIRSYNC(dir)) 2206 if (IS_DIRSYNC(dir))
2207 handle->h_sync = 1; 2207 handle->h_sync = 1;
2208 2208
2209 inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); 2209 inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO);
2210 err = PTR_ERR(inode); 2210 err = PTR_ERR(inode);
2211 if (IS_ERR(inode)) 2211 if (IS_ERR(inode))
2212 goto out_stop; 2212 goto out_stop;
@@ -2253,13 +2253,6 @@ static int ext3_link (struct dentry * old_dentry,
2253 2253
2254 dquot_initialize(dir); 2254 dquot_initialize(dir);
2255 2255
2256 /*
2257 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2258 * otherwise has the potential to corrupt the orphan inode list.
2259 */
2260 if (inode->i_nlink == 0)
2261 return -ENOENT;
2262
2263retry: 2256retry:
2264 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 2257 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2265 EXT3_INDEX_EXTRA_TRANS_BLOCKS); 2258 EXT3_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 108b142e11ed..7916e4ce166a 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -1009,7 +1009,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
1009 1009
1010 if (test_opt(sb, DEBUG)) 1010 if (test_opt(sb, DEBUG))
1011 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK 1011 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK
1012 " upto "E3FSBLK" blocks\n", 1012 " up to "E3FSBLK" blocks\n",
1013 o_blocks_count, n_blocks_count); 1013 o_blocks_count, n_blocks_count);
1014 1014
1015 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) 1015 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 85c8cc8f2473..3c6a9e0eadc1 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1464,6 +1464,13 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1464 return; 1464 return;
1465 } 1465 }
1466 1466
1467 /* Check if feature set allows readwrite operations */
1468 if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) {
1469 ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
1470 "unknown ROCOMPAT features");
1471 return;
1472 }
1473
1467 if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { 1474 if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
1468 if (es->s_last_orphan) 1475 if (es->s_last_orphan)
1469 jbd_debug(1, "Errors on filesystem, " 1476 jbd_debug(1, "Errors on filesystem, "
@@ -1936,6 +1943,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1936 sb->s_qcop = &ext3_qctl_operations; 1943 sb->s_qcop = &ext3_qctl_operations;
1937 sb->dq_op = &ext3_quota_operations; 1944 sb->dq_op = &ext3_quota_operations;
1938#endif 1945#endif
1946 memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
1939 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 1947 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
1940 mutex_init(&sbi->s_orphan_lock); 1948 mutex_init(&sbi->s_orphan_lock);
1941 mutex_init(&sbi->s_resize_lock); 1949 mutex_init(&sbi->s_resize_lock);
@@ -2917,7 +2925,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2917 2925
2918/* Read data from quotafile - avoid pagecache and such because we cannot afford 2926/* Read data from quotafile - avoid pagecache and such because we cannot afford
2919 * acquiring the locks... As quota files are never truncated and quota code 2927 * acquiring the locks... As quota files are never truncated and quota code
2920 * itself serializes the operations (and noone else should touch the files) 2928 * itself serializes the operations (and no one else should touch the files)
2921 * we don't have to be afraid of races */ 2929 * we don't have to be afraid of races */
2922static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, 2930static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
2923 size_t len, loff_t off) 2931 size_t len, loff_t off)
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 377fe7201169..2be4f69bfa64 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -128,10 +128,10 @@ exit_ext3_xattr(void)
128 128
129#ifdef CONFIG_EXT3_FS_SECURITY 129#ifdef CONFIG_EXT3_FS_SECURITY
130extern int ext3_init_security(handle_t *handle, struct inode *inode, 130extern int ext3_init_security(handle_t *handle, struct inode *inode,
131 struct inode *dir); 131 struct inode *dir, const struct qstr *qstr);
132#else 132#else
133static inline int ext3_init_security(handle_t *handle, struct inode *inode, 133static inline int ext3_init_security(handle_t *handle, struct inode *inode,
134 struct inode *dir) 134 struct inode *dir, const struct qstr *qstr)
135{ 135{
136 return 0; 136 return 0;
137} 137}
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 03a99bfc59f9..b8d9f83aa5c5 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -49,14 +49,15 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name,
49} 49}
50 50
51int 51int
52ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir) 52ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
53 const struct qstr *qstr)
53{ 54{
54 int err; 55 int err;
55 size_t len; 56 size_t len;
56 void *value; 57 void *value;
57 char *name; 58 char *name;
58 59
59 err = security_inode_init_security(inode, dir, &name, &value, &len); 60 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
60 if (err) { 61 if (err) {
61 if (err == -EOPNOTSUPP) 62 if (err == -EOPNOTSUPP)
62 return 0; 63 return 0;
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index e0270d1f8d82..21eacd7b7d79 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -433,7 +433,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
433 return -EINVAL; 433 return -EINVAL;
434 if (!test_opt(inode->i_sb, POSIX_ACL)) 434 if (!test_opt(inode->i_sb, POSIX_ACL))
435 return -EOPNOTSUPP; 435 return -EOPNOTSUPP;
436 if (!is_owner_or_cap(inode)) 436 if (!inode_owner_or_capable(inode))
437 return -EPERM; 437 return -EPERM;
438 438
439 if (value) { 439 if (value) {
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index adf96b822781..1c67139ad4b4 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -21,6 +21,8 @@
21#include "ext4_jbd2.h" 21#include "ext4_jbd2.h"
22#include "mballoc.h" 22#include "mballoc.h"
23 23
24#include <trace/events/ext4.h>
25
24/* 26/*
25 * balloc.c contains the blocks allocation and deallocation routines 27 * balloc.c contains the blocks allocation and deallocation routines
26 */ 28 */
@@ -342,6 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
342 * We do it here so the bitmap uptodate bit 344 * We do it here so the bitmap uptodate bit
343 * get set with buffer lock held. 345 * get set with buffer lock held.
344 */ 346 */
347 trace_ext4_read_block_bitmap_load(sb, block_group);
345 set_bitmap_uptodate(bh); 348 set_bitmap_uptodate(bh);
346 if (bh_submit_read(bh) < 0) { 349 if (bh_submit_read(bh) < 0) {
347 put_bh(bh); 350 put_bh(bh);
@@ -544,7 +547,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
544 * 547 *
545 * ext4_should_retry_alloc() is called when ENOSPC is returned, and if 548 * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
546 * it is profitable to retry the operation, this function will wait 549 * it is profitable to retry the operation, this function will wait
547 * for the current or commiting transaction to complete, and then 550 * for the current or committing transaction to complete, and then
548 * return TRUE. 551 * return TRUE.
549 * 552 *
550 * if the total number of retries exceed three times, return FALSE. 553 * if the total number of retries exceed three times, return FALSE.
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0c8d97b56f34..4daaf2b753f4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -848,6 +848,7 @@ struct ext4_inode_info {
848 atomic_t i_ioend_count; /* Number of outstanding io_end structs */ 848 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
849 /* current io_end structure for async DIO write*/ 849 /* current io_end structure for async DIO write*/
850 ext4_io_end_t *cur_aio_dio; 850 ext4_io_end_t *cur_aio_dio;
851 atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
851 852
852 spinlock_t i_block_reservation_lock; 853 spinlock_t i_block_reservation_lock;
853 854
@@ -922,14 +923,14 @@ struct ext4_inode_info {
922#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ 923#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \
923 EXT4_MOUNT2_##opt) 924 EXT4_MOUNT2_##opt)
924 925
925#define ext4_set_bit ext2_set_bit 926#define ext4_set_bit __test_and_set_bit_le
926#define ext4_set_bit_atomic ext2_set_bit_atomic 927#define ext4_set_bit_atomic ext2_set_bit_atomic
927#define ext4_clear_bit ext2_clear_bit 928#define ext4_clear_bit __test_and_clear_bit_le
928#define ext4_clear_bit_atomic ext2_clear_bit_atomic 929#define ext4_clear_bit_atomic ext2_clear_bit_atomic
929#define ext4_test_bit ext2_test_bit 930#define ext4_test_bit test_bit_le
930#define ext4_find_first_zero_bit ext2_find_first_zero_bit 931#define ext4_find_first_zero_bit find_first_zero_bit_le
931#define ext4_find_next_zero_bit ext2_find_next_zero_bit 932#define ext4_find_next_zero_bit find_next_zero_bit_le
932#define ext4_find_next_bit ext2_find_next_bit 933#define ext4_find_next_bit find_next_bit_le
933 934
934/* 935/*
935 * Maximal mount counts between two filesystem checks 936 * Maximal mount counts between two filesystem checks
@@ -2119,6 +2120,15 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
2119 2120
2120#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 2121#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
2121 2122
2123/* For ioend & aio unwritten conversion wait queues */
2124#define EXT4_WQ_HASH_SZ 37
2125#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
2126 EXT4_WQ_HASH_SZ])
2127#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
2128 EXT4_WQ_HASH_SZ])
2129extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
2130extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
2131
2122#endif /* __KERNEL__ */ 2132#endif /* __KERNEL__ */
2123 2133
2124#endif /* _EXT4_H */ 2134#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index d8b992e658c1..d0f53538a57f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -86,8 +86,8 @@
86 86
87#ifdef CONFIG_QUOTA 87#ifdef CONFIG_QUOTA
88/* Amount of blocks needed for quota update - we know that the structure was 88/* Amount of blocks needed for quota update - we know that the structure was
89 * allocated so we need to update only inode+data */ 89 * allocated so we need to update only data block */
90#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0) 90#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0)
91/* Amount of blocks needed for quota insert/delete - we do some block writes 91/* Amount of blocks needed for quota insert/delete - we do some block writes
92 * but inode, sb and group updates are done only once */ 92 * but inode, sb and group updates are done only once */
93#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ 93#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
@@ -202,13 +202,6 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
202 return 1; 202 return 1;
203} 203}
204 204
205static inline void ext4_journal_release_buffer(handle_t *handle,
206 struct buffer_head *bh)
207{
208 if (ext4_handle_valid(handle))
209 jbd2_journal_release_buffer(handle, bh);
210}
211
212static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) 205static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
213{ 206{
214 return ext4_journal_start_sb(inode->i_sb, nblocks); 207 return ext4_journal_start_sb(inode->i_sb, nblocks);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 63a75810b7c3..4890d6f3ad15 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,6 +44,8 @@
44#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
45#include "ext4_extents.h" 45#include "ext4_extents.h"
46 46
47#include <trace/events/ext4.h>
48
47static int ext4_ext_truncate_extend_restart(handle_t *handle, 49static int ext4_ext_truncate_extend_restart(handle_t *handle,
48 struct inode *inode, 50 struct inode *inode,
49 int needed) 51 int needed)
@@ -131,7 +133,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
131 * fragmenting the file system's free space. Maybe we 133 * fragmenting the file system's free space. Maybe we
132 * should have some hueristics or some way to allow 134 * should have some hueristics or some way to allow
133 * userspace to pass a hint to file system, 135 * userspace to pass a hint to file system,
134 * especiially if the latter case turns out to be 136 * especially if the latter case turns out to be
135 * common. 137 * common.
136 */ 138 */
137 ex = path[depth].p_ext; 139 ex = path[depth].p_ext;
@@ -664,6 +666,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
664 if (unlikely(!bh)) 666 if (unlikely(!bh))
665 goto err; 667 goto err;
666 if (!bh_uptodate_or_lock(bh)) { 668 if (!bh_uptodate_or_lock(bh)) {
669 trace_ext4_ext_load_extent(inode, block,
670 path[ppos].p_block);
667 if (bh_submit_read(bh) < 0) { 671 if (bh_submit_read(bh) < 0) {
668 put_bh(bh); 672 put_bh(bh);
669 goto err; 673 goto err;
@@ -1034,7 +1038,7 @@ cleanup:
1034 for (i = 0; i < depth; i++) { 1038 for (i = 0; i < depth; i++) {
1035 if (!ablocks[i]) 1039 if (!ablocks[i])
1036 continue; 1040 continue;
1037 ext4_free_blocks(handle, inode, 0, ablocks[i], 1, 1041 ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1038 EXT4_FREE_BLOCKS_METADATA); 1042 EXT4_FREE_BLOCKS_METADATA);
1039 } 1043 }
1040 } 1044 }
@@ -1725,7 +1729,7 @@ repeat:
1725 BUG_ON(npath->p_depth != path->p_depth); 1729 BUG_ON(npath->p_depth != path->p_depth);
1726 eh = npath[depth].p_hdr; 1730 eh = npath[depth].p_hdr;
1727 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { 1731 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1728 ext_debug("next leaf isnt full(%d)\n", 1732 ext_debug("next leaf isn't full(%d)\n",
1729 le16_to_cpu(eh->eh_entries)); 1733 le16_to_cpu(eh->eh_entries));
1730 path = npath; 1734 path = npath;
1731 goto repeat; 1735 goto repeat;
@@ -2059,7 +2063,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2059 if (err) 2063 if (err)
2060 return err; 2064 return err;
2061 ext_debug("index is empty, remove it, free block %llu\n", leaf); 2065 ext_debug("index is empty, remove it, free block %llu\n", leaf);
2062 ext4_free_blocks(handle, inode, 0, leaf, 1, 2066 ext4_free_blocks(handle, inode, NULL, leaf, 1,
2063 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 2067 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2064 return err; 2068 return err;
2065} 2069}
@@ -2156,7 +2160,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2156 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2160 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2157 start = ext4_ext_pblock(ex) + ee_len - num; 2161 start = ext4_ext_pblock(ex) + ee_len - num;
2158 ext_debug("free last %u blocks starting %llu\n", num, start); 2162 ext_debug("free last %u blocks starting %llu\n", num, start);
2159 ext4_free_blocks(handle, inode, 0, start, num, flags); 2163 ext4_free_blocks(handle, inode, NULL, start, num, flags);
2160 } else if (from == le32_to_cpu(ex->ee_block) 2164 } else if (from == le32_to_cpu(ex->ee_block)
2161 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2165 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2162 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", 2166 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -2529,7 +2533,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2529/* 2533/*
2530 * This function is called by ext4_ext_map_blocks() if someone tries to write 2534 * This function is called by ext4_ext_map_blocks() if someone tries to write
2531 * to an uninitialized extent. It may result in splitting the uninitialized 2535 * to an uninitialized extent. It may result in splitting the uninitialized
2532 * extent into multiple extents (upto three - one initialized and two 2536 * extent into multiple extents (up to three - one initialized and two
2533 * uninitialized). 2537 * uninitialized).
2534 * There are three possibilities: 2538 * There are three possibilities:
2535 * a> There is no split required: Entire extent should be initialized 2539 * a> There is no split required: Entire extent should be initialized
@@ -2844,7 +2848,7 @@ fix_extent_len:
2844 * ext4_get_blocks_dio_write() when DIO to write 2848 * ext4_get_blocks_dio_write() when DIO to write
2845 * to an uninitialized extent. 2849 * to an uninitialized extent.
2846 * 2850 *
2847 * Writing to an uninitized extent may result in splitting the uninitialized 2851 * Writing to an uninitialized extent may result in splitting the uninitialized
2848 * extent into multiple /initialized uninitialized extents (up to three) 2852 * extent into multiple /initialized uninitialized extents (up to three)
2849 * There are three possibilities: 2853 * There are three possibilities:
2850 * a> There is no split required: Entire extent should be uninitialized 2854 * a> There is no split required: Entire extent should be uninitialized
@@ -3108,14 +3112,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3108{ 3112{
3109 int i, depth; 3113 int i, depth;
3110 struct ext4_extent_header *eh; 3114 struct ext4_extent_header *eh;
3111 struct ext4_extent *ex, *last_ex; 3115 struct ext4_extent *last_ex;
3112 3116
3113 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) 3117 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3114 return 0; 3118 return 0;
3115 3119
3116 depth = ext_depth(inode); 3120 depth = ext_depth(inode);
3117 eh = path[depth].p_hdr; 3121 eh = path[depth].p_hdr;
3118 ex = path[depth].p_ext;
3119 3122
3120 if (unlikely(!eh->eh_entries)) { 3123 if (unlikely(!eh->eh_entries)) {
3121 EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " 3124 EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
@@ -3171,12 +3174,13 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3171 path, flags); 3174 path, flags);
3172 /* 3175 /*
3173 * Flag the inode(non aio case) or end_io struct (aio case) 3176 * Flag the inode(non aio case) or end_io struct (aio case)
3174 * that this IO needs to convertion to written when IO is 3177 * that this IO needs to conversion to written when IO is
3175 * completed 3178 * completed
3176 */ 3179 */
3177 if (io) 3180 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3178 io->flag = EXT4_IO_END_UNWRITTEN; 3181 io->flag = EXT4_IO_END_UNWRITTEN;
3179 else 3182 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3183 } else
3180 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3184 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3181 if (ext4_should_dioread_nolock(inode)) 3185 if (ext4_should_dioread_nolock(inode))
3182 map->m_flags |= EXT4_MAP_UNINIT; 3186 map->m_flags |= EXT4_MAP_UNINIT;
@@ -3294,9 +3298,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3294 struct ext4_map_blocks *map, int flags) 3298 struct ext4_map_blocks *map, int flags)
3295{ 3299{
3296 struct ext4_ext_path *path = NULL; 3300 struct ext4_ext_path *path = NULL;
3297 struct ext4_extent_header *eh;
3298 struct ext4_extent newex, *ex; 3301 struct ext4_extent newex, *ex;
3299 ext4_fsblk_t newblock; 3302 ext4_fsblk_t newblock = 0;
3300 int err = 0, depth, ret; 3303 int err = 0, depth, ret;
3301 unsigned int allocated = 0; 3304 unsigned int allocated = 0;
3302 struct ext4_allocation_request ar; 3305 struct ext4_allocation_request ar;
@@ -3304,6 +3307,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3304 3307
3305 ext_debug("blocks %u/%u requested for inode %lu\n", 3308 ext_debug("blocks %u/%u requested for inode %lu\n",
3306 map->m_lblk, map->m_len, inode->i_ino); 3309 map->m_lblk, map->m_len, inode->i_ino);
3310 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3307 3311
3308 /* check in cache */ 3312 /* check in cache */
3309 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { 3313 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
@@ -3351,7 +3355,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3351 err = -EIO; 3355 err = -EIO;
3352 goto out2; 3356 goto out2;
3353 } 3357 }
3354 eh = path[depth].p_hdr;
3355 3358
3356 ex = path[depth].p_ext; 3359 ex = path[depth].p_ext;
3357 if (ex) { 3360 if (ex) {
@@ -3457,15 +3460,16 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3457 ext4_ext_mark_uninitialized(&newex); 3460 ext4_ext_mark_uninitialized(&newex);
3458 /* 3461 /*
3459 * io_end structure was created for every IO write to an 3462 * io_end structure was created for every IO write to an
3460 * uninitialized extent. To avoid unecessary conversion, 3463 * uninitialized extent. To avoid unnecessary conversion,
3461 * here we flag the IO that really needs the conversion. 3464 * here we flag the IO that really needs the conversion.
3462 * For non asycn direct IO case, flag the inode state 3465 * For non asycn direct IO case, flag the inode state
3463 * that we need to perform convertion when IO is done. 3466 * that we need to perform conversion when IO is done.
3464 */ 3467 */
3465 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3468 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3466 if (io) 3469 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3467 io->flag = EXT4_IO_END_UNWRITTEN; 3470 io->flag = EXT4_IO_END_UNWRITTEN;
3468 else 3471 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3472 } else
3469 ext4_set_inode_state(inode, 3473 ext4_set_inode_state(inode,
3470 EXT4_STATE_DIO_UNWRITTEN); 3474 EXT4_STATE_DIO_UNWRITTEN);
3471 } 3475 }
@@ -3483,7 +3487,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3483 /* not a good idea to call discard here directly, 3487 /* not a good idea to call discard here directly,
3484 * but otherwise we'd need to call it every free() */ 3488 * but otherwise we'd need to call it every free() */
3485 ext4_discard_preallocations(inode); 3489 ext4_discard_preallocations(inode);
3486 ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex), 3490 ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
3487 ext4_ext_get_actual_len(&newex), 0); 3491 ext4_ext_get_actual_len(&newex), 0);
3488 goto out2; 3492 goto out2;
3489 } 3493 }
@@ -3523,6 +3527,8 @@ out2:
3523 ext4_ext_drop_refs(path); 3527 ext4_ext_drop_refs(path);
3524 kfree(path); 3528 kfree(path);
3525 } 3529 }
3530 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
3531 newblock, map->m_len, err ? err : allocated);
3526 return err ? err : allocated; 3532 return err ? err : allocated;
3527} 3533}
3528 3534
@@ -3656,6 +3662,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3656 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3662 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3657 return -EOPNOTSUPP; 3663 return -EOPNOTSUPP;
3658 3664
3665 trace_ext4_fallocate_enter(inode, offset, len, mode);
3659 map.m_lblk = offset >> blkbits; 3666 map.m_lblk = offset >> blkbits;
3660 /* 3667 /*
3661 * We can't just convert len to max_blocks because 3668 * We can't just convert len to max_blocks because
@@ -3671,6 +3678,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3671 ret = inode_newsize_ok(inode, (len + offset)); 3678 ret = inode_newsize_ok(inode, (len + offset));
3672 if (ret) { 3679 if (ret) {
3673 mutex_unlock(&inode->i_mutex); 3680 mutex_unlock(&inode->i_mutex);
3681 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
3674 return ret; 3682 return ret;
3675 } 3683 }
3676retry: 3684retry:
@@ -3715,6 +3723,8 @@ retry:
3715 goto retry; 3723 goto retry;
3716 } 3724 }
3717 mutex_unlock(&inode->i_mutex); 3725 mutex_unlock(&inode->i_mutex);
3726 trace_ext4_fallocate_exit(inode, offset, max_blocks,
3727 ret > 0 ? ret2 : ret);
3718 return ret > 0 ? ret2 : ret; 3728 return ret > 0 ? ret2 : ret;
3719} 3729}
3720 3730
@@ -3773,6 +3783,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3773 } 3783 }
3774 return ret > 0 ? ret2 : ret; 3784 return ret > 0 ? ret2 : ret;
3775} 3785}
3786
3776/* 3787/*
3777 * Callback function called for each extent to gather FIEMAP information. 3788 * Callback function called for each extent to gather FIEMAP information.
3778 */ 3789 */
@@ -3780,38 +3791,162 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3780 struct ext4_ext_cache *newex, struct ext4_extent *ex, 3791 struct ext4_ext_cache *newex, struct ext4_extent *ex,
3781 void *data) 3792 void *data)
3782{ 3793{
3783 struct fiemap_extent_info *fieinfo = data;
3784 unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
3785 __u64 logical; 3794 __u64 logical;
3786 __u64 physical; 3795 __u64 physical;
3787 __u64 length; 3796 __u64 length;
3797 loff_t size;
3788 __u32 flags = 0; 3798 __u32 flags = 0;
3789 int error; 3799 int ret = 0;
3800 struct fiemap_extent_info *fieinfo = data;
3801 unsigned char blksize_bits;
3790 3802
3791 logical = (__u64)newex->ec_block << blksize_bits; 3803 blksize_bits = inode->i_sb->s_blocksize_bits;
3804 logical = (__u64)newex->ec_block << blksize_bits;
3792 3805
3793 if (newex->ec_start == 0) { 3806 if (newex->ec_start == 0) {
3794 pgoff_t offset; 3807 /*
3795 struct page *page; 3808 * No extent in extent-tree contains block @newex->ec_start,
3809 * then the block may stay in 1)a hole or 2)delayed-extent.
3810 *
3811 * Holes or delayed-extents are processed as follows.
3812 * 1. lookup dirty pages with specified range in pagecache.
3813 * If no page is got, then there is no delayed-extent and
3814 * return with EXT_CONTINUE.
3815 * 2. find the 1st mapped buffer,
3816 * 3. check if the mapped buffer is both in the request range
3817 * and a delayed buffer. If not, there is no delayed-extent,
3818 * then return.
3819 * 4. a delayed-extent is found, the extent will be collected.
3820 */
3821 ext4_lblk_t end = 0;
3822 pgoff_t last_offset;
3823 pgoff_t offset;
3824 pgoff_t index;
3825 struct page **pages = NULL;
3796 struct buffer_head *bh = NULL; 3826 struct buffer_head *bh = NULL;
3827 struct buffer_head *head = NULL;
3828 unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
3829
3830 pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
3831 if (pages == NULL)
3832 return -ENOMEM;
3797 3833
3798 offset = logical >> PAGE_SHIFT; 3834 offset = logical >> PAGE_SHIFT;
3799 page = find_get_page(inode->i_mapping, offset); 3835repeat:
3800 if (!page || !page_has_buffers(page)) 3836 last_offset = offset;
3801 return EXT_CONTINUE; 3837 head = NULL;
3838 ret = find_get_pages_tag(inode->i_mapping, &offset,
3839 PAGECACHE_TAG_DIRTY, nr_pages, pages);
3840
3841 if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
3842 /* First time, try to find a mapped buffer. */
3843 if (ret == 0) {
3844out:
3845 for (index = 0; index < ret; index++)
3846 page_cache_release(pages[index]);
3847 /* just a hole. */
3848 kfree(pages);
3849 return EXT_CONTINUE;
3850 }
3802 3851
3803 bh = page_buffers(page); 3852 /* Try to find the 1st mapped buffer. */
3853 end = ((__u64)pages[0]->index << PAGE_SHIFT) >>
3854 blksize_bits;
3855 if (!page_has_buffers(pages[0]))
3856 goto out;
3857 head = page_buffers(pages[0]);
3858 if (!head)
3859 goto out;
3804 3860
3805 if (!bh) 3861 bh = head;
3806 return EXT_CONTINUE; 3862 do {
3863 if (buffer_mapped(bh)) {
3864 /* get the 1st mapped buffer. */
3865 if (end > newex->ec_block +
3866 newex->ec_len)
3867 /* The buffer is out of
3868 * the request range.
3869 */
3870 goto out;
3871 goto found_mapped_buffer;
3872 }
3873 bh = bh->b_this_page;
3874 end++;
3875 } while (bh != head);
3807 3876
3808 if (buffer_delay(bh)) { 3877 /* No mapped buffer found. */
3809 flags |= FIEMAP_EXTENT_DELALLOC; 3878 goto out;
3810 page_cache_release(page);
3811 } else { 3879 } else {
3812 page_cache_release(page); 3880 /*Find contiguous delayed buffers. */
3813 return EXT_CONTINUE; 3881 if (ret > 0 && pages[0]->index == last_offset)
3882 head = page_buffers(pages[0]);
3883 bh = head;
3884 }
3885
3886found_mapped_buffer:
3887 if (bh != NULL && buffer_delay(bh)) {
3888 /* 1st or contiguous delayed buffer found. */
3889 if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
3890 /*
3891 * 1st delayed buffer found, record
3892 * the start of extent.
3893 */
3894 flags |= FIEMAP_EXTENT_DELALLOC;
3895 newex->ec_block = end;
3896 logical = (__u64)end << blksize_bits;
3897 }
3898 /* Find contiguous delayed buffers. */
3899 do {
3900 if (!buffer_delay(bh))
3901 goto found_delayed_extent;
3902 bh = bh->b_this_page;
3903 end++;
3904 } while (bh != head);
3905
3906 for (index = 1; index < ret; index++) {
3907 if (!page_has_buffers(pages[index])) {
3908 bh = NULL;
3909 break;
3910 }
3911 head = page_buffers(pages[index]);
3912 if (!head) {
3913 bh = NULL;
3914 break;
3915 }
3916 if (pages[index]->index !=
3917 pages[0]->index + index) {
3918 /* Blocks are not contiguous. */
3919 bh = NULL;
3920 break;
3921 }
3922 bh = head;
3923 do {
3924 if (!buffer_delay(bh))
3925 /* Delayed-extent ends. */
3926 goto found_delayed_extent;
3927 bh = bh->b_this_page;
3928 end++;
3929 } while (bh != head);
3930 }
3931 } else if (!(flags & FIEMAP_EXTENT_DELALLOC))
3932 /* a hole found. */
3933 goto out;
3934
3935found_delayed_extent:
3936 newex->ec_len = min(end - newex->ec_block,
3937 (ext4_lblk_t)EXT_INIT_MAX_LEN);
3938 if (ret == nr_pages && bh != NULL &&
3939 newex->ec_len < EXT_INIT_MAX_LEN &&
3940 buffer_delay(bh)) {
3941 /* Have not collected an extent and continue. */
3942 for (index = 0; index < ret; index++)
3943 page_cache_release(pages[index]);
3944 goto repeat;
3814 } 3945 }
3946
3947 for (index = 0; index < ret; index++)
3948 page_cache_release(pages[index]);
3949 kfree(pages);
3815 } 3950 }
3816 3951
3817 physical = (__u64)newex->ec_start << blksize_bits; 3952 physical = (__u64)newex->ec_start << blksize_bits;
@@ -3820,32 +3955,16 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3820 if (ex && ext4_ext_is_uninitialized(ex)) 3955 if (ex && ext4_ext_is_uninitialized(ex))
3821 flags |= FIEMAP_EXTENT_UNWRITTEN; 3956 flags |= FIEMAP_EXTENT_UNWRITTEN;
3822 3957
3823 /* 3958 size = i_size_read(inode);
3824 * If this extent reaches EXT_MAX_BLOCK, it must be last. 3959 if (logical + length >= size)
3825 *
3826 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
3827 * this also indicates no more allocated blocks.
3828 *
3829 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3830 */
3831 if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
3832 newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
3833 loff_t size = i_size_read(inode);
3834 loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
3835
3836 flags |= FIEMAP_EXTENT_LAST; 3960 flags |= FIEMAP_EXTENT_LAST;
3837 if ((flags & FIEMAP_EXTENT_DELALLOC) &&
3838 logical+length > size)
3839 length = (size - logical + bs - 1) & ~(bs-1);
3840 }
3841 3961
3842 error = fiemap_fill_next_extent(fieinfo, logical, physical, 3962 ret = fiemap_fill_next_extent(fieinfo, logical, physical,
3843 length, flags); 3963 length, flags);
3844 if (error < 0) 3964 if (ret < 0)
3845 return error; 3965 return ret;
3846 if (error == 1) 3966 if (ret == 1)
3847 return EXT_BREAK; 3967 return EXT_BREAK;
3848
3849 return EXT_CONTINUE; 3968 return EXT_CONTINUE;
3850} 3969}
3851 3970
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2e8322c8aa88..7b80d543b89e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,47 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
55 return 0; 55 return 0;
56} 56}
57 57
58static void ext4_aiodio_wait(struct inode *inode)
59{
60 wait_queue_head_t *wq = ext4_ioend_wq(inode);
61
62 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
63}
64
65/*
66 * This tests whether the IO in question is block-aligned or not.
67 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
68 * are converted to written only after the IO is complete. Until they are
69 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
70 * it needs to zero out portions of the start and/or end block. If 2 AIO
71 * threads are at work on the same unwritten block, they must be synchronized
72 * or one thread will zero the other's data, causing corruption.
73 */
74static int
75ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
76 unsigned long nr_segs, loff_t pos)
77{
78 struct super_block *sb = inode->i_sb;
79 int blockmask = sb->s_blocksize - 1;
80 size_t count = iov_length(iov, nr_segs);
81 loff_t final_size = pos + count;
82
83 if (pos >= inode->i_size)
84 return 0;
85
86 if ((pos & blockmask) || (final_size & blockmask))
87 return 1;
88
89 return 0;
90}
91
58static ssize_t 92static ssize_t
59ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 93ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
60 unsigned long nr_segs, loff_t pos) 94 unsigned long nr_segs, loff_t pos)
61{ 95{
62 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 96 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
97 int unaligned_aio = 0;
98 int ret;
63 99
64 /* 100 /*
65 * If we have encountered a bitmap-format file, the size limit 101 * If we have encountered a bitmap-format file, the size limit
@@ -78,9 +114,31 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
78 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, 114 nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
79 sbi->s_bitmap_maxbytes - pos); 115 sbi->s_bitmap_maxbytes - pos);
80 } 116 }
117 } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
118 !is_sync_kiocb(iocb))) {
119 unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
81 } 120 }
82 121
83 return generic_file_aio_write(iocb, iov, nr_segs, pos); 122 /* Unaligned direct AIO must be serialized; see comment above */
123 if (unaligned_aio) {
124 static unsigned long unaligned_warn_time;
125
126 /* Warn about this once per day */
127 if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
128 ext4_msg(inode->i_sb, KERN_WARNING,
129 "Unaligned AIO/DIO on inode %ld by %s; "
130 "performance will be poor.",
131 inode->i_ino, current->comm);
132 mutex_lock(ext4_aio_mutex(inode));
133 ext4_aiodio_wait(inode);
134 }
135
136 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
137
138 if (unaligned_aio)
139 mutex_unlock(ext4_aio_mutex(inode));
140
141 return ret;
84} 142}
85 143
86static const struct vm_operations_struct ext4_file_vm_ops = { 144static const struct vm_operations_struct ext4_file_vm_ops = {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 7829b287822a..e9473cbe80df 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -101,7 +101,7 @@ extern int ext4_flush_completed_IO(struct inode *inode)
101 * to the work-to-be schedule is freed. 101 * to the work-to-be schedule is freed.
102 * 102 *
103 * Thus we need to keep the io structure still valid here after 103 * Thus we need to keep the io structure still valid here after
104 * convertion finished. The io structure has a flag to 104 * conversion finished. The io structure has a flag to
105 * avoid double converting from both fsync and background work 105 * avoid double converting from both fsync and background work
106 * queue work. 106 * queue work.
107 */ 107 */
@@ -125,9 +125,11 @@ extern int ext4_flush_completed_IO(struct inode *inode)
125 * the parent directory's parent as well, and so on recursively, if 125 * the parent directory's parent as well, and so on recursively, if
126 * they are also freshly created. 126 * they are also freshly created.
127 */ 127 */
128static void ext4_sync_parent(struct inode *inode) 128static int ext4_sync_parent(struct inode *inode)
129{ 129{
130 struct writeback_control wbc;
130 struct dentry *dentry = NULL; 131 struct dentry *dentry = NULL;
132 int ret = 0;
131 133
132 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { 134 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
133 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); 135 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
@@ -136,8 +138,17 @@ static void ext4_sync_parent(struct inode *inode)
136 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) 138 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
137 break; 139 break;
138 inode = dentry->d_parent->d_inode; 140 inode = dentry->d_parent->d_inode;
139 sync_mapping_buffers(inode->i_mapping); 141 ret = sync_mapping_buffers(inode->i_mapping);
142 if (ret)
143 break;
144 memset(&wbc, 0, sizeof(wbc));
145 wbc.sync_mode = WB_SYNC_ALL;
146 wbc.nr_to_write = 0; /* only write out the inode */
147 ret = sync_inode(inode, &wbc);
148 if (ret)
149 break;
140 } 150 }
151 return ret;
141} 152}
142 153
143/* 154/*
@@ -164,20 +175,20 @@ int ext4_sync_file(struct file *file, int datasync)
164 175
165 J_ASSERT(ext4_journal_current_handle() == NULL); 176 J_ASSERT(ext4_journal_current_handle() == NULL);
166 177
167 trace_ext4_sync_file(file, datasync); 178 trace_ext4_sync_file_enter(file, datasync);
168 179
169 if (inode->i_sb->s_flags & MS_RDONLY) 180 if (inode->i_sb->s_flags & MS_RDONLY)
170 return 0; 181 return 0;
171 182
172 ret = ext4_flush_completed_IO(inode); 183 ret = ext4_flush_completed_IO(inode);
173 if (ret < 0) 184 if (ret < 0)
174 return ret; 185 goto out;
175 186
176 if (!journal) { 187 if (!journal) {
177 ret = generic_file_fsync(file, datasync); 188 ret = generic_file_fsync(file, datasync);
178 if (!ret && !list_empty(&inode->i_dentry)) 189 if (!ret && !list_empty(&inode->i_dentry))
179 ext4_sync_parent(inode); 190 ret = ext4_sync_parent(inode);
180 return ret; 191 goto out;
181 } 192 }
182 193
183 /* 194 /*
@@ -194,8 +205,10 @@ int ext4_sync_file(struct file *file, int datasync)
194 * (they were dirtied by commit). But that's OK - the blocks are 205 * (they were dirtied by commit). But that's OK - the blocks are
195 * safe in-journal, which is all fsync() needs to ensure. 206 * safe in-journal, which is all fsync() needs to ensure.
196 */ 207 */
197 if (ext4_should_journal_data(inode)) 208 if (ext4_should_journal_data(inode)) {
198 return ext4_force_commit(inode->i_sb); 209 ret = ext4_force_commit(inode->i_sb);
210 goto out;
211 }
199 212
200 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; 213 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
201 if (jbd2_log_start_commit(journal, commit_tid)) { 214 if (jbd2_log_start_commit(journal, commit_tid)) {
@@ -215,5 +228,7 @@ int ext4_sync_file(struct file *file, int datasync)
215 ret = jbd2_log_wait_commit(journal, commit_tid); 228 ret = jbd2_log_wait_commit(journal, commit_tid);
216 } else if (journal->j_flags & JBD2_BARRIER) 229 } else if (journal->j_flags & JBD2_BARRIER)
217 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 230 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
231 out:
232 trace_ext4_sync_file_exit(inode, ret);
218 return ret; 233 return ret;
219} 234}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index eb9097aec6f0..21bb2f61e502 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -152,6 +152,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
152 * We do it here so the bitmap uptodate bit 152 * We do it here so the bitmap uptodate bit
153 * get set with buffer lock held. 153 * get set with buffer lock held.
154 */ 154 */
155 trace_ext4_load_inode_bitmap(sb, block_group);
155 set_bitmap_uptodate(bh); 156 set_bitmap_uptodate(bh);
156 if (bh_submit_read(bh) < 0) { 157 if (bh_submit_read(bh) < 0) {
157 put_bh(bh); 158 put_bh(bh);
@@ -649,7 +650,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
649 *group = parent_group + flex_size; 650 *group = parent_group + flex_size;
650 if (*group > ngroups) 651 if (*group > ngroups)
651 *group = 0; 652 *group = 0;
652 return find_group_orlov(sb, parent, group, mode, 0); 653 return find_group_orlov(sb, parent, group, mode, NULL);
653 } 654 }
654 655
655 /* 656 /*
@@ -1042,7 +1043,7 @@ got:
1042 if (err) 1043 if (err)
1043 goto fail_free_drop; 1044 goto fail_free_drop;
1044 1045
1045 err = ext4_init_security(handle, inode, dir); 1046 err = ext4_init_security(handle, inode, dir, qstr);
1046 if (err) 1047 if (err)
1047 goto fail_free_drop; 1048 goto fail_free_drop;
1048 1049
@@ -1054,6 +1055,11 @@ got:
1054 } 1055 }
1055 } 1056 }
1056 1057
1058 if (ext4_handle_valid(handle)) {
1059 ei->i_sync_tid = handle->h_transaction->t_tid;
1060 ei->i_datasync_tid = handle->h_transaction->t_tid;
1061 }
1062
1057 err = ext4_mark_inode_dirty(handle, inode); 1063 err = ext4_mark_inode_dirty(handle, inode);
1058 if (err) { 1064 if (err) {
1059 ext4_std_error(sb, err); 1065 ext4_std_error(sb, err);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9f7f9e49914f..f2fa5e8a582c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -173,7 +173,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
173 BUG_ON(EXT4_JOURNAL(inode) == NULL); 173 BUG_ON(EXT4_JOURNAL(inode) == NULL);
174 jbd_debug(2, "restarting handle %p\n", handle); 174 jbd_debug(2, "restarting handle %p\n", handle);
175 up_write(&EXT4_I(inode)->i_data_sem); 175 up_write(&EXT4_I(inode)->i_data_sem);
176 ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); 176 ret = ext4_journal_restart(handle, nblocks);
177 down_write(&EXT4_I(inode)->i_data_sem); 177 down_write(&EXT4_I(inode)->i_data_sem);
178 ext4_discard_preallocations(inode); 178 ext4_discard_preallocations(inode);
179 179
@@ -720,7 +720,7 @@ allocated:
720 return ret; 720 return ret;
721failed_out: 721failed_out:
722 for (i = 0; i < index; i++) 722 for (i = 0; i < index; i++)
723 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); 723 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
724 return ret; 724 return ret;
725} 725}
726 726
@@ -823,20 +823,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
823 return err; 823 return err;
824failed: 824failed:
825 /* Allocation failed, free what we already allocated */ 825 /* Allocation failed, free what we already allocated */
826 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); 826 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
827 for (i = 1; i <= n ; i++) { 827 for (i = 1; i <= n ; i++) {
828 /* 828 /*
829 * branch[i].bh is newly allocated, so there is no 829 * branch[i].bh is newly allocated, so there is no
830 * need to revoke the block, which is why we don't 830 * need to revoke the block, which is why we don't
831 * need to set EXT4_FREE_BLOCKS_METADATA. 831 * need to set EXT4_FREE_BLOCKS_METADATA.
832 */ 832 */
833 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 833 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
834 EXT4_FREE_BLOCKS_FORGET); 834 EXT4_FREE_BLOCKS_FORGET);
835 } 835 }
836 for (i = n+1; i < indirect_blks; i++) 836 for (i = n+1; i < indirect_blks; i++)
837 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); 837 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
838 838
839 ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0); 839 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
840 840
841 return err; 841 return err;
842} 842}
@@ -924,7 +924,7 @@ err_out:
924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1, 924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
925 EXT4_FREE_BLOCKS_FORGET); 925 EXT4_FREE_BLOCKS_FORGET);
926 } 926 }
927 ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key), 927 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
928 blks, 0); 928 blks, 0);
929 929
930 return err; 930 return err;
@@ -973,6 +973,7 @@ static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
973 int count = 0; 973 int count = 0;
974 ext4_fsblk_t first_block = 0; 974 ext4_fsblk_t first_block = 0;
975 975
976 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
976 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); 977 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
977 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 978 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
978 depth = ext4_block_to_path(inode, map->m_lblk, offsets, 979 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
@@ -1058,6 +1059,8 @@ cleanup:
1058 partial--; 1059 partial--;
1059 } 1060 }
1060out: 1061out:
1062 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
1063 map->m_pblk, map->m_len, err);
1061 return err; 1064 return err;
1062} 1065}
1063 1066
@@ -2060,7 +2063,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
2060 if (nr_pages == 0) 2063 if (nr_pages == 0)
2061 break; 2064 break;
2062 for (i = 0; i < nr_pages; i++) { 2065 for (i = 0; i < nr_pages; i++) {
2063 int commit_write = 0, redirty_page = 0; 2066 int commit_write = 0, skip_page = 0;
2064 struct page *page = pvec.pages[i]; 2067 struct page *page = pvec.pages[i];
2065 2068
2066 index = page->index; 2069 index = page->index;
@@ -2086,14 +2089,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
2086 * If the page does not have buffers (for 2089 * If the page does not have buffers (for
2087 * whatever reason), try to create them using 2090 * whatever reason), try to create them using
2088 * __block_write_begin. If this fails, 2091 * __block_write_begin. If this fails,
2089 * redirty the page and move on. 2092 * skip the page and move on.
2090 */ 2093 */
2091 if (!page_has_buffers(page)) { 2094 if (!page_has_buffers(page)) {
2092 if (__block_write_begin(page, 0, len, 2095 if (__block_write_begin(page, 0, len,
2093 noalloc_get_block_write)) { 2096 noalloc_get_block_write)) {
2094 redirty_page: 2097 skip_page:
2095 redirty_page_for_writepage(mpd->wbc,
2096 page);
2097 unlock_page(page); 2098 unlock_page(page);
2098 continue; 2099 continue;
2099 } 2100 }
@@ -2104,7 +2105,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
2104 block_start = 0; 2105 block_start = 0;
2105 do { 2106 do {
2106 if (!bh) 2107 if (!bh)
2107 goto redirty_page; 2108 goto skip_page;
2108 if (map && (cur_logical >= map->m_lblk) && 2109 if (map && (cur_logical >= map->m_lblk) &&
2109 (cur_logical <= (map->m_lblk + 2110 (cur_logical <= (map->m_lblk +
2110 (map->m_len - 1)))) { 2111 (map->m_len - 1)))) {
@@ -2120,22 +2121,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
2120 clear_buffer_unwritten(bh); 2121 clear_buffer_unwritten(bh);
2121 } 2122 }
2122 2123
2123 /* redirty page if block allocation undone */ 2124 /* skip page if block allocation undone */
2124 if (buffer_delay(bh) || buffer_unwritten(bh)) 2125 if (buffer_delay(bh) || buffer_unwritten(bh))
2125 redirty_page = 1; 2126 skip_page = 1;
2126 bh = bh->b_this_page; 2127 bh = bh->b_this_page;
2127 block_start += bh->b_size; 2128 block_start += bh->b_size;
2128 cur_logical++; 2129 cur_logical++;
2129 pblock++; 2130 pblock++;
2130 } while (bh != page_bufs); 2131 } while (bh != page_bufs);
2131 2132
2132 if (redirty_page) 2133 if (skip_page)
2133 goto redirty_page; 2134 goto skip_page;
2134 2135
2135 if (commit_write) 2136 if (commit_write)
2136 /* mark the buffer_heads as dirty & uptodate */ 2137 /* mark the buffer_heads as dirty & uptodate */
2137 block_commit_write(page, 0, len); 2138 block_commit_write(page, 0, len);
2138 2139
2140 clear_page_dirty_for_io(page);
2139 /* 2141 /*
2140 * Delalloc doesn't support data journalling, 2142 * Delalloc doesn't support data journalling,
2141 * but eventually maybe we'll lift this 2143 * but eventually maybe we'll lift this
@@ -2165,8 +2167,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
2165 return ret; 2167 return ret;
2166} 2168}
2167 2169
2168static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 2170static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
2169 sector_t logical, long blk_cnt)
2170{ 2171{
2171 int nr_pages, i; 2172 int nr_pages, i;
2172 pgoff_t index, end; 2173 pgoff_t index, end;
@@ -2174,9 +2175,8 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2174 struct inode *inode = mpd->inode; 2175 struct inode *inode = mpd->inode;
2175 struct address_space *mapping = inode->i_mapping; 2176 struct address_space *mapping = inode->i_mapping;
2176 2177
2177 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2178 index = mpd->first_page;
2178 end = (logical + blk_cnt - 1) >> 2179 end = mpd->next_page - 1;
2179 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2180 while (index <= end) { 2180 while (index <= end) {
2181 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 2181 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2182 if (nr_pages == 0) 2182 if (nr_pages == 0)
@@ -2279,9 +2279,8 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2279 err = blks; 2279 err = blks;
2280 /* 2280 /*
2281 * If get block returns EAGAIN or ENOSPC and there 2281 * If get block returns EAGAIN or ENOSPC and there
2282 * appears to be free blocks we will call 2282 * appears to be free blocks we will just let
2283 * ext4_writepage() for all of the pages which will 2283 * mpage_da_submit_io() unlock all of the pages.
2284 * just redirty the pages.
2285 */ 2284 */
2286 if (err == -EAGAIN) 2285 if (err == -EAGAIN)
2287 goto submit_io; 2286 goto submit_io;
@@ -2312,8 +2311,10 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2312 ext4_print_free_blocks(mpd->inode); 2311 ext4_print_free_blocks(mpd->inode);
2313 } 2312 }
2314 /* invalidate all the pages */ 2313 /* invalidate all the pages */
2315 ext4_da_block_invalidatepages(mpd, next, 2314 ext4_da_block_invalidatepages(mpd);
2316 mpd->b_size >> mpd->inode->i_blkbits); 2315
2316 /* Mark this page range as having been completed */
2317 mpd->io_done = 1;
2317 return; 2318 return;
2318 } 2319 }
2319 BUG_ON(blks == 0); 2320 BUG_ON(blks == 0);
@@ -2438,102 +2439,6 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2438} 2439}
2439 2440
2440/* 2441/*
2441 * __mpage_da_writepage - finds extent of pages and blocks
2442 *
2443 * @page: page to consider
2444 * @wbc: not used, we just follow rules
2445 * @data: context
2446 *
2447 * The function finds extents of pages and scan them for all blocks.
2448 */
2449static int __mpage_da_writepage(struct page *page,
2450 struct writeback_control *wbc,
2451 struct mpage_da_data *mpd)
2452{
2453 struct inode *inode = mpd->inode;
2454 struct buffer_head *bh, *head;
2455 sector_t logical;
2456
2457 /*
2458 * Can we merge this page to current extent?
2459 */
2460 if (mpd->next_page != page->index) {
2461 /*
2462 * Nope, we can't. So, we map non-allocated blocks
2463 * and start IO on them
2464 */
2465 if (mpd->next_page != mpd->first_page) {
2466 mpage_da_map_and_submit(mpd);
2467 /*
2468 * skip rest of the page in the page_vec
2469 */
2470 redirty_page_for_writepage(wbc, page);
2471 unlock_page(page);
2472 return MPAGE_DA_EXTENT_TAIL;
2473 }
2474
2475 /*
2476 * Start next extent of pages ...
2477 */
2478 mpd->first_page = page->index;
2479
2480 /*
2481 * ... and blocks
2482 */
2483 mpd->b_size = 0;
2484 mpd->b_state = 0;
2485 mpd->b_blocknr = 0;
2486 }
2487
2488 mpd->next_page = page->index + 1;
2489 logical = (sector_t) page->index <<
2490 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2491
2492 if (!page_has_buffers(page)) {
2493 mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
2494 (1 << BH_Dirty) | (1 << BH_Uptodate));
2495 if (mpd->io_done)
2496 return MPAGE_DA_EXTENT_TAIL;
2497 } else {
2498 /*
2499 * Page with regular buffer heads, just add all dirty ones
2500 */
2501 head = page_buffers(page);
2502 bh = head;
2503 do {
2504 BUG_ON(buffer_locked(bh));
2505 /*
2506 * We need to try to allocate
2507 * unmapped blocks in the same page.
2508 * Otherwise we won't make progress
2509 * with the page in ext4_writepage
2510 */
2511 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2512 mpage_add_bh_to_extent(mpd, logical,
2513 bh->b_size,
2514 bh->b_state);
2515 if (mpd->io_done)
2516 return MPAGE_DA_EXTENT_TAIL;
2517 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2518 /*
2519 * mapped dirty buffer. We need to update
2520 * the b_state because we look at
2521 * b_state in mpage_da_map_blocks. We don't
2522 * update b_size because if we find an
2523 * unmapped buffer_head later we need to
2524 * use the b_state flag of that buffer_head.
2525 */
2526 if (mpd->b_size == 0)
2527 mpd->b_state = bh->b_state & BH_FLAGS;
2528 }
2529 logical++;
2530 } while ((bh = bh->b_this_page) != head);
2531 }
2532
2533 return 0;
2534}
2535
2536/*
2537 * This is a special get_blocks_t callback which is used by 2442 * This is a special get_blocks_t callback which is used by
2538 * ext4_da_write_begin(). It will either return mapped block or 2443 * ext4_da_write_begin(). It will either return mapped block or
2539 * reserve space for a single block. 2444 * reserve space for a single block.
@@ -2684,7 +2589,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2684 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2589 * because we should have holes filled from ext4_page_mkwrite(). We even don't
2685 * need to file the inode to the transaction's list in ordered mode because if 2590 * need to file the inode to the transaction's list in ordered mode because if
2686 * we are writing back data added by write(), the inode is already there and if 2591 * we are writing back data added by write(), the inode is already there and if
2687 * we are writing back data modified via mmap(), noone guarantees in which 2592 * we are writing back data modified via mmap(), no one guarantees in which
2688 * transaction the data will hit the disk. In case we are journaling data, we 2593 * transaction the data will hit the disk. In case we are journaling data, we
2689 * cannot start transaction directly because transaction start ranks above page 2594 * cannot start transaction directly because transaction start ranks above page
2690 * lock so we have to do some magic. 2595 * lock so we have to do some magic.
@@ -2786,7 +2691,7 @@ static int ext4_writepage(struct page *page,
2786 2691
2787/* 2692/*
2788 * This is called via ext4_da_writepages() to 2693 * This is called via ext4_da_writepages() to
2789 * calulate the total number of credits to reserve to fit 2694 * calculate the total number of credits to reserve to fit
2790 * a single extent allocation into a single transaction, 2695 * a single extent allocation into a single transaction,
2791 * ext4_da_writpeages() will loop calling this before 2696 * ext4_da_writpeages() will loop calling this before
2792 * the block allocation. 2697 * the block allocation.
@@ -2811,27 +2716,27 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2811 2716
2812/* 2717/*
2813 * write_cache_pages_da - walk the list of dirty pages of the given 2718 * write_cache_pages_da - walk the list of dirty pages of the given
2814 * address space and call the callback function (which usually writes 2719 * address space and accumulate pages that need writing, and call
2815 * the pages). 2720 * mpage_da_map_and_submit to map a single contiguous memory region
2816 * 2721 * and then write them.
2817 * This is a forked version of write_cache_pages(). Differences:
2818 * Range cyclic is ignored.
2819 * no_nrwrite_index_update is always presumed true
2820 */ 2722 */
2821static int write_cache_pages_da(struct address_space *mapping, 2723static int write_cache_pages_da(struct address_space *mapping,
2822 struct writeback_control *wbc, 2724 struct writeback_control *wbc,
2823 struct mpage_da_data *mpd, 2725 struct mpage_da_data *mpd,
2824 pgoff_t *done_index) 2726 pgoff_t *done_index)
2825{ 2727{
2826 int ret = 0; 2728 struct buffer_head *bh, *head;
2827 int done = 0; 2729 struct inode *inode = mapping->host;
2828 struct pagevec pvec; 2730 struct pagevec pvec;
2829 unsigned nr_pages; 2731 unsigned int nr_pages;
2830 pgoff_t index; 2732 sector_t logical;
2831 pgoff_t end; /* Inclusive */ 2733 pgoff_t index, end;
2832 long nr_to_write = wbc->nr_to_write; 2734 long nr_to_write = wbc->nr_to_write;
2833 int tag; 2735 int i, tag, ret = 0;
2834 2736
2737 memset(mpd, 0, sizeof(struct mpage_da_data));
2738 mpd->wbc = wbc;
2739 mpd->inode = inode;
2835 pagevec_init(&pvec, 0); 2740 pagevec_init(&pvec, 0);
2836 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2741 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2837 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2742 end = wbc->range_end >> PAGE_CACHE_SHIFT;
@@ -2842,13 +2747,11 @@ static int write_cache_pages_da(struct address_space *mapping,
2842 tag = PAGECACHE_TAG_DIRTY; 2747 tag = PAGECACHE_TAG_DIRTY;
2843 2748
2844 *done_index = index; 2749 *done_index = index;
2845 while (!done && (index <= end)) { 2750 while (index <= end) {
2846 int i;
2847
2848 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 2751 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2849 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2752 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2850 if (nr_pages == 0) 2753 if (nr_pages == 0)
2851 break; 2754 return 0;
2852 2755
2853 for (i = 0; i < nr_pages; i++) { 2756 for (i = 0; i < nr_pages; i++) {
2854 struct page *page = pvec.pages[i]; 2757 struct page *page = pvec.pages[i];
@@ -2860,60 +2763,100 @@ static int write_cache_pages_da(struct address_space *mapping,
2860 * mapping. However, page->index will not change 2763 * mapping. However, page->index will not change
2861 * because we have a reference on the page. 2764 * because we have a reference on the page.
2862 */ 2765 */
2863 if (page->index > end) { 2766 if (page->index > end)
2864 done = 1; 2767 goto out;
2865 break;
2866 }
2867 2768
2868 *done_index = page->index + 1; 2769 *done_index = page->index + 1;
2869 2770
2771 /*
2772 * If we can't merge this page, and we have
2773 * accumulated an contiguous region, write it
2774 */
2775 if ((mpd->next_page != page->index) &&
2776 (mpd->next_page != mpd->first_page)) {
2777 mpage_da_map_and_submit(mpd);
2778 goto ret_extent_tail;
2779 }
2780
2870 lock_page(page); 2781 lock_page(page);
2871 2782
2872 /* 2783 /*
2873 * Page truncated or invalidated. We can freely skip it 2784 * If the page is no longer dirty, or its
2874 * then, even for data integrity operations: the page 2785 * mapping no longer corresponds to inode we
2875 * has disappeared concurrently, so there could be no 2786 * are writing (which means it has been
2876 * real expectation of this data interity operation 2787 * truncated or invalidated), or the page is
2877 * even if there is now a new, dirty page at the same 2788 * already under writeback and we are not
2878 * pagecache address. 2789 * doing a data integrity writeback, skip the page
2879 */ 2790 */
2880 if (unlikely(page->mapping != mapping)) { 2791 if (!PageDirty(page) ||
2881continue_unlock: 2792 (PageWriteback(page) &&
2793 (wbc->sync_mode == WB_SYNC_NONE)) ||
2794 unlikely(page->mapping != mapping)) {
2882 unlock_page(page); 2795 unlock_page(page);
2883 continue; 2796 continue;
2884 } 2797 }
2885 2798
2886 if (!PageDirty(page)) { 2799 if (PageWriteback(page))
2887 /* someone wrote it for us */ 2800 wait_on_page_writeback(page);
2888 goto continue_unlock;
2889 }
2890
2891 if (PageWriteback(page)) {
2892 if (wbc->sync_mode != WB_SYNC_NONE)
2893 wait_on_page_writeback(page);
2894 else
2895 goto continue_unlock;
2896 }
2897 2801
2898 BUG_ON(PageWriteback(page)); 2802 BUG_ON(PageWriteback(page));
2899 if (!clear_page_dirty_for_io(page))
2900 goto continue_unlock;
2901 2803
2902 ret = __mpage_da_writepage(page, wbc, mpd); 2804 if (mpd->next_page != page->index)
2903 if (unlikely(ret)) { 2805 mpd->first_page = page->index;
2904 if (ret == AOP_WRITEPAGE_ACTIVATE) { 2806 mpd->next_page = page->index + 1;
2905 unlock_page(page); 2807 logical = (sector_t) page->index <<
2906 ret = 0; 2808 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2907 } else { 2809
2908 done = 1; 2810 if (!page_has_buffers(page)) {
2909 break; 2811 mpage_add_bh_to_extent(mpd, logical,
2910 } 2812 PAGE_CACHE_SIZE,
2813 (1 << BH_Dirty) | (1 << BH_Uptodate));
2814 if (mpd->io_done)
2815 goto ret_extent_tail;
2816 } else {
2817 /*
2818 * Page with regular buffer heads,
2819 * just add all dirty ones
2820 */
2821 head = page_buffers(page);
2822 bh = head;
2823 do {
2824 BUG_ON(buffer_locked(bh));
2825 /*
2826 * We need to try to allocate
2827 * unmapped blocks in the same page.
2828 * Otherwise we won't make progress
2829 * with the page in ext4_writepage
2830 */
2831 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2832 mpage_add_bh_to_extent(mpd, logical,
2833 bh->b_size,
2834 bh->b_state);
2835 if (mpd->io_done)
2836 goto ret_extent_tail;
2837 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2838 /*
2839 * mapped dirty buffer. We need
2840 * to update the b_state
2841 * because we look at b_state
2842 * in mpage_da_map_blocks. We
2843 * don't update b_size because
2844 * if we find an unmapped
2845 * buffer_head later we need to
2846 * use the b_state flag of that
2847 * buffer_head.
2848 */
2849 if (mpd->b_size == 0)
2850 mpd->b_state = bh->b_state & BH_FLAGS;
2851 }
2852 logical++;
2853 } while ((bh = bh->b_this_page) != head);
2911 } 2854 }
2912 2855
2913 if (nr_to_write > 0) { 2856 if (nr_to_write > 0) {
2914 nr_to_write--; 2857 nr_to_write--;
2915 if (nr_to_write == 0 && 2858 if (nr_to_write == 0 &&
2916 wbc->sync_mode == WB_SYNC_NONE) { 2859 wbc->sync_mode == WB_SYNC_NONE)
2917 /* 2860 /*
2918 * We stop writing back only if we are 2861 * We stop writing back only if we are
2919 * not doing integrity sync. In case of 2862 * not doing integrity sync. In case of
@@ -2924,14 +2867,18 @@ continue_unlock:
2924 * pages, but have not synced all of the 2867 * pages, but have not synced all of the
2925 * old dirty pages. 2868 * old dirty pages.
2926 */ 2869 */
2927 done = 1; 2870 goto out;
2928 break;
2929 }
2930 } 2871 }
2931 } 2872 }
2932 pagevec_release(&pvec); 2873 pagevec_release(&pvec);
2933 cond_resched(); 2874 cond_resched();
2934 } 2875 }
2876 return 0;
2877ret_extent_tail:
2878 ret = MPAGE_DA_EXTENT_TAIL;
2879out:
2880 pagevec_release(&pvec);
2881 cond_resched();
2935 return ret; 2882 return ret;
2936} 2883}
2937 2884
@@ -2945,7 +2892,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2945 struct mpage_da_data mpd; 2892 struct mpage_da_data mpd;
2946 struct inode *inode = mapping->host; 2893 struct inode *inode = mapping->host;
2947 int pages_written = 0; 2894 int pages_written = 0;
2948 long pages_skipped;
2949 unsigned int max_pages; 2895 unsigned int max_pages;
2950 int range_cyclic, cycled = 1, io_done = 0; 2896 int range_cyclic, cycled = 1, io_done = 0;
2951 int needed_blocks, ret = 0; 2897 int needed_blocks, ret = 0;
@@ -3028,11 +2974,6 @@ static int ext4_da_writepages(struct address_space *mapping,
3028 wbc->nr_to_write = desired_nr_to_write; 2974 wbc->nr_to_write = desired_nr_to_write;
3029 } 2975 }
3030 2976
3031 mpd.wbc = wbc;
3032 mpd.inode = mapping->host;
3033
3034 pages_skipped = wbc->pages_skipped;
3035
3036retry: 2977retry:
3037 if (wbc->sync_mode == WB_SYNC_ALL) 2978 if (wbc->sync_mode == WB_SYNC_ALL)
3038 tag_pages_for_writeback(mapping, index, end); 2979 tag_pages_for_writeback(mapping, index, end);
@@ -3059,22 +3000,10 @@ retry:
3059 } 3000 }
3060 3001
3061 /* 3002 /*
3062 * Now call __mpage_da_writepage to find the next 3003 * Now call write_cache_pages_da() to find the next
3063 * contiguous region of logical blocks that need 3004 * contiguous region of logical blocks that need
3064 * blocks to be allocated by ext4. We don't actually 3005 * blocks to be allocated by ext4 and submit them.
3065 * submit the blocks for I/O here, even though
3066 * write_cache_pages thinks it will, and will set the
3067 * pages as clean for write before calling
3068 * __mpage_da_writepage().
3069 */ 3006 */
3070 mpd.b_size = 0;
3071 mpd.b_state = 0;
3072 mpd.b_blocknr = 0;
3073 mpd.first_page = 0;
3074 mpd.next_page = 0;
3075 mpd.io_done = 0;
3076 mpd.pages_written = 0;
3077 mpd.retval = 0;
3078 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); 3007 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
3079 /* 3008 /*
3080 * If we have a contiguous extent of pages and we 3009 * If we have a contiguous extent of pages and we
@@ -3096,7 +3025,6 @@ retry:
3096 * and try again 3025 * and try again
3097 */ 3026 */
3098 jbd2_journal_force_commit_nested(sbi->s_journal); 3027 jbd2_journal_force_commit_nested(sbi->s_journal);
3099 wbc->pages_skipped = pages_skipped;
3100 ret = 0; 3028 ret = 0;
3101 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 3029 } else if (ret == MPAGE_DA_EXTENT_TAIL) {
3102 /* 3030 /*
@@ -3104,7 +3032,6 @@ retry:
3104 * rest of the pages 3032 * rest of the pages
3105 */ 3033 */
3106 pages_written += mpd.pages_written; 3034 pages_written += mpd.pages_written;
3107 wbc->pages_skipped = pages_skipped;
3108 ret = 0; 3035 ret = 0;
3109 io_done = 1; 3036 io_done = 1;
3110 } else if (wbc->nr_to_write) 3037 } else if (wbc->nr_to_write)
@@ -3122,11 +3049,6 @@ retry:
3122 wbc->range_end = mapping->writeback_index - 1; 3049 wbc->range_end = mapping->writeback_index - 1;
3123 goto retry; 3050 goto retry;
3124 } 3051 }
3125 if (pages_skipped != wbc->pages_skipped)
3126 ext4_msg(inode->i_sb, KERN_CRIT,
3127 "This should not happen leaving %s "
3128 "with nr_to_write = %ld ret = %d",
3129 __func__, wbc->nr_to_write, ret);
3130 3052
3131 /* Update index */ 3053 /* Update index */
3132 wbc->range_cyclic = range_cyclic; 3054 wbc->range_cyclic = range_cyclic;
@@ -3383,7 +3305,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
3383 * the pages by calling redirty_page_for_writepage() but that 3305 * the pages by calling redirty_page_for_writepage() but that
3384 * would be ugly in the extreme. So instead we would need to 3306 * would be ugly in the extreme. So instead we would need to
3385 * replicate parts of the code in the above functions, 3307 * replicate parts of the code in the above functions,
3386 * simplifying them becuase we wouldn't actually intend to 3308 * simplifying them because we wouldn't actually intend to
3387 * write out the pages, but rather only collect contiguous 3309 * write out the pages, but rather only collect contiguous
3388 * logical block extents, call the multi-block allocator, and 3310 * logical block extents, call the multi-block allocator, and
3389 * then update the buffer heads with the block allocations. 3311 * then update the buffer heads with the block allocations.
@@ -3460,6 +3382,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3460 3382
3461static int ext4_readpage(struct file *file, struct page *page) 3383static int ext4_readpage(struct file *file, struct page *page)
3462{ 3384{
3385 trace_ext4_readpage(page);
3463 return mpage_readpage(page, ext4_get_block); 3386 return mpage_readpage(page, ext4_get_block);
3464} 3387}
3465 3388
@@ -3494,6 +3417,8 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
3494{ 3417{
3495 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3418 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3496 3419
3420 trace_ext4_invalidatepage(page, offset);
3421
3497 /* 3422 /*
3498 * free any io_end structure allocated for buffers to be discarded 3423 * free any io_end structure allocated for buffers to be discarded
3499 */ 3424 */
@@ -3515,6 +3440,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3515{ 3440{
3516 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3441 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3517 3442
3443 trace_ext4_releasepage(page);
3444
3518 WARN_ON(PageChecked(page)); 3445 WARN_ON(PageChecked(page));
3519 if (!page_has_buffers(page)) 3446 if (!page_has_buffers(page))
3520 return 0; 3447 return 0;
@@ -3768,7 +3695,7 @@ retry:
3768 * 3695 *
3769 * The unwrritten extents will be converted to written when DIO is completed. 3696 * The unwrritten extents will be converted to written when DIO is completed.
3770 * For async direct IO, since the IO may still pending when return, we 3697 * For async direct IO, since the IO may still pending when return, we
3771 * set up an end_io call back function, which will do the convertion 3698 * set up an end_io call back function, which will do the conversion
3772 * when async direct IO completed. 3699 * when async direct IO completed.
3773 * 3700 *
3774 * If the O_DIRECT write will extend the file then add this inode to the 3701 * If the O_DIRECT write will extend the file then add this inode to the
@@ -3791,7 +3718,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3791 * We could direct write to holes and fallocate. 3718 * We could direct write to holes and fallocate.
3792 * 3719 *
3793 * Allocated blocks to fill the hole are marked as uninitialized 3720 * Allocated blocks to fill the hole are marked as uninitialized
3794 * to prevent paralel buffered read to expose the stale data 3721 * to prevent parallel buffered read to expose the stale data
3795 * before DIO complete the data IO. 3722 * before DIO complete the data IO.
3796 * 3723 *
3797 * As to previously fallocated extents, ext4 get_block 3724 * As to previously fallocated extents, ext4 get_block
@@ -3852,7 +3779,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3852 int err; 3779 int err;
3853 /* 3780 /*
3854 * for non AIO case, since the IO is already 3781 * for non AIO case, since the IO is already
3855 * completed, we could do the convertion right here 3782 * completed, we could do the conversion right here
3856 */ 3783 */
3857 err = ext4_convert_unwritten_extents(inode, 3784 err = ext4_convert_unwritten_extents(inode,
3858 offset, ret); 3785 offset, ret);
@@ -3873,11 +3800,16 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3873{ 3800{
3874 struct file *file = iocb->ki_filp; 3801 struct file *file = iocb->ki_filp;
3875 struct inode *inode = file->f_mapping->host; 3802 struct inode *inode = file->f_mapping->host;
3803 ssize_t ret;
3876 3804
3805 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
3877 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3806 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3878 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3807 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3879 3808 else
3880 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3809 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3810 trace_ext4_direct_IO_exit(inode, offset,
3811 iov_length(iov, nr_segs), rw, ret);
3812 return ret;
3881} 3813}
3882 3814
3883/* 3815/*
@@ -3903,7 +3835,6 @@ static const struct address_space_operations ext4_ordered_aops = {
3903 .readpage = ext4_readpage, 3835 .readpage = ext4_readpage,
3904 .readpages = ext4_readpages, 3836 .readpages = ext4_readpages,
3905 .writepage = ext4_writepage, 3837 .writepage = ext4_writepage,
3906 .sync_page = block_sync_page,
3907 .write_begin = ext4_write_begin, 3838 .write_begin = ext4_write_begin,
3908 .write_end = ext4_ordered_write_end, 3839 .write_end = ext4_ordered_write_end,
3909 .bmap = ext4_bmap, 3840 .bmap = ext4_bmap,
@@ -3919,7 +3850,6 @@ static const struct address_space_operations ext4_writeback_aops = {
3919 .readpage = ext4_readpage, 3850 .readpage = ext4_readpage,
3920 .readpages = ext4_readpages, 3851 .readpages = ext4_readpages,
3921 .writepage = ext4_writepage, 3852 .writepage = ext4_writepage,
3922 .sync_page = block_sync_page,
3923 .write_begin = ext4_write_begin, 3853 .write_begin = ext4_write_begin,
3924 .write_end = ext4_writeback_write_end, 3854 .write_end = ext4_writeback_write_end,
3925 .bmap = ext4_bmap, 3855 .bmap = ext4_bmap,
@@ -3935,7 +3865,6 @@ static const struct address_space_operations ext4_journalled_aops = {
3935 .readpage = ext4_readpage, 3865 .readpage = ext4_readpage,
3936 .readpages = ext4_readpages, 3866 .readpages = ext4_readpages,
3937 .writepage = ext4_writepage, 3867 .writepage = ext4_writepage,
3938 .sync_page = block_sync_page,
3939 .write_begin = ext4_write_begin, 3868 .write_begin = ext4_write_begin,
3940 .write_end = ext4_journalled_write_end, 3869 .write_end = ext4_journalled_write_end,
3941 .set_page_dirty = ext4_journalled_set_page_dirty, 3870 .set_page_dirty = ext4_journalled_set_page_dirty,
@@ -3951,7 +3880,6 @@ static const struct address_space_operations ext4_da_aops = {
3951 .readpages = ext4_readpages, 3880 .readpages = ext4_readpages,
3952 .writepage = ext4_writepage, 3881 .writepage = ext4_writepage,
3953 .writepages = ext4_da_writepages, 3882 .writepages = ext4_da_writepages,
3954 .sync_page = block_sync_page,
3955 .write_begin = ext4_da_write_begin, 3883 .write_begin = ext4_da_write_begin,
3956 .write_end = ext4_da_write_end, 3884 .write_end = ext4_da_write_end,
3957 .bmap = ext4_bmap, 3885 .bmap = ext4_bmap,
@@ -4098,7 +4026,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
4098 * 4026 *
4099 * When we do truncate() we may have to clean the ends of several 4027 * When we do truncate() we may have to clean the ends of several
4100 * indirect blocks but leave the blocks themselves alive. Block is 4028 * indirect blocks but leave the blocks themselves alive. Block is
4101 * partially truncated if some data below the new i_size is refered 4029 * partially truncated if some data below the new i_size is referred
4102 * from it (and it is on the path to the first completely truncated 4030 * from it (and it is on the path to the first completely truncated
4103 * data block, indeed). We have to free the top of that path along 4031 * data block, indeed). We have to free the top of that path along
4104 * with everything to the right of the path. Since no allocation 4032 * with everything to the right of the path. Since no allocation
@@ -4177,6 +4105,9 @@ no_top:
4177 * 4105 *
4178 * We release `count' blocks on disk, but (last - first) may be greater 4106 * We release `count' blocks on disk, but (last - first) may be greater
4179 * than `count' because there can be holes in there. 4107 * than `count' because there can be holes in there.
4108 *
4109 * Return 0 on success, 1 on invalid block range
4110 * and < 0 on fatal error.
4180 */ 4111 */
4181static int ext4_clear_blocks(handle_t *handle, struct inode *inode, 4112static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4182 struct buffer_head *bh, 4113 struct buffer_head *bh,
@@ -4203,33 +4134,32 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4203 if (bh) { 4134 if (bh) {
4204 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4135 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4205 err = ext4_handle_dirty_metadata(handle, inode, bh); 4136 err = ext4_handle_dirty_metadata(handle, inode, bh);
4206 if (unlikely(err)) { 4137 if (unlikely(err))
4207 ext4_std_error(inode->i_sb, err); 4138 goto out_err;
4208 return 1;
4209 }
4210 } 4139 }
4211 err = ext4_mark_inode_dirty(handle, inode); 4140 err = ext4_mark_inode_dirty(handle, inode);
4212 if (unlikely(err)) { 4141 if (unlikely(err))
4213 ext4_std_error(inode->i_sb, err); 4142 goto out_err;
4214 return 1;
4215 }
4216 err = ext4_truncate_restart_trans(handle, inode, 4143 err = ext4_truncate_restart_trans(handle, inode,
4217 blocks_for_truncate(inode)); 4144 blocks_for_truncate(inode));
4218 if (unlikely(err)) { 4145 if (unlikely(err))
4219 ext4_std_error(inode->i_sb, err); 4146 goto out_err;
4220 return 1;
4221 }
4222 if (bh) { 4147 if (bh) {
4223 BUFFER_TRACE(bh, "retaking write access"); 4148 BUFFER_TRACE(bh, "retaking write access");
4224 ext4_journal_get_write_access(handle, bh); 4149 err = ext4_journal_get_write_access(handle, bh);
4150 if (unlikely(err))
4151 goto out_err;
4225 } 4152 }
4226 } 4153 }
4227 4154
4228 for (p = first; p < last; p++) 4155 for (p = first; p < last; p++)
4229 *p = 0; 4156 *p = 0;
4230 4157
4231 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); 4158 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
4232 return 0; 4159 return 0;
4160out_err:
4161 ext4_std_error(inode->i_sb, err);
4162 return err;
4233} 4163}
4234 4164
4235/** 4165/**
@@ -4240,7 +4170,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4240 * @first: array of block numbers 4170 * @first: array of block numbers
4241 * @last: points immediately past the end of array 4171 * @last: points immediately past the end of array
4242 * 4172 *
4243 * We are freeing all blocks refered from that array (numbers are stored as 4173 * We are freeing all blocks referred from that array (numbers are stored as
4244 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 4174 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
4245 * 4175 *
4246 * We accumulate contiguous runs of blocks to free. Conveniently, if these 4176 * We accumulate contiguous runs of blocks to free. Conveniently, if these
@@ -4263,7 +4193,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4263 ext4_fsblk_t nr; /* Current block # */ 4193 ext4_fsblk_t nr; /* Current block # */
4264 __le32 *p; /* Pointer into inode/ind 4194 __le32 *p; /* Pointer into inode/ind
4265 for current block */ 4195 for current block */
4266 int err; 4196 int err = 0;
4267 4197
4268 if (this_bh) { /* For indirect block */ 4198 if (this_bh) { /* For indirect block */
4269 BUFFER_TRACE(this_bh, "get_write_access"); 4199 BUFFER_TRACE(this_bh, "get_write_access");
@@ -4285,9 +4215,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4285 } else if (nr == block_to_free + count) { 4215 } else if (nr == block_to_free + count) {
4286 count++; 4216 count++;
4287 } else { 4217 } else {
4288 if (ext4_clear_blocks(handle, inode, this_bh, 4218 err = ext4_clear_blocks(handle, inode, this_bh,
4289 block_to_free, count, 4219 block_to_free, count,
4290 block_to_free_p, p)) 4220 block_to_free_p, p);
4221 if (err)
4291 break; 4222 break;
4292 block_to_free = nr; 4223 block_to_free = nr;
4293 block_to_free_p = p; 4224 block_to_free_p = p;
@@ -4296,9 +4227,12 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4296 } 4227 }
4297 } 4228 }
4298 4229
4299 if (count > 0) 4230 if (!err && count > 0)
4300 ext4_clear_blocks(handle, inode, this_bh, block_to_free, 4231 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
4301 count, block_to_free_p, p); 4232 count, block_to_free_p, p);
4233 if (err < 0)
4234 /* fatal error */
4235 return;
4302 4236
4303 if (this_bh) { 4237 if (this_bh) {
4304 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 4238 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
@@ -4328,7 +4262,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4328 * @last: pointer immediately past the end of array 4262 * @last: pointer immediately past the end of array
4329 * @depth: depth of the branches to free 4263 * @depth: depth of the branches to free
4330 * 4264 *
4331 * We are freeing all blocks refered from these branches (numbers are 4265 * We are freeing all blocks referred from these branches (numbers are
4332 * stored as little-endian 32-bit) and updating @inode->i_blocks 4266 * stored as little-endian 32-bit) and updating @inode->i_blocks
4333 * appropriately. 4267 * appropriately.
4334 */ 4268 */
@@ -4416,7 +4350,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4416 * transaction where the data blocks are 4350 * transaction where the data blocks are
4417 * actually freed. 4351 * actually freed.
4418 */ 4352 */
4419 ext4_free_blocks(handle, inode, 0, nr, 1, 4353 ext4_free_blocks(handle, inode, NULL, nr, 1,
4420 EXT4_FREE_BLOCKS_METADATA| 4354 EXT4_FREE_BLOCKS_METADATA|
4421 EXT4_FREE_BLOCKS_FORGET); 4355 EXT4_FREE_BLOCKS_FORGET);
4422 4356
@@ -4496,10 +4430,12 @@ void ext4_truncate(struct inode *inode)
4496 Indirect chain[4]; 4430 Indirect chain[4];
4497 Indirect *partial; 4431 Indirect *partial;
4498 __le32 nr = 0; 4432 __le32 nr = 0;
4499 int n; 4433 int n = 0;
4500 ext4_lblk_t last_block; 4434 ext4_lblk_t last_block, max_block;
4501 unsigned blocksize = inode->i_sb->s_blocksize; 4435 unsigned blocksize = inode->i_sb->s_blocksize;
4502 4436
4437 trace_ext4_truncate_enter(inode);
4438
4503 if (!ext4_can_truncate(inode)) 4439 if (!ext4_can_truncate(inode))
4504 return; 4440 return;
4505 4441
@@ -4510,6 +4446,7 @@ void ext4_truncate(struct inode *inode)
4510 4446
4511 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4447 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4512 ext4_ext_truncate(inode); 4448 ext4_ext_truncate(inode);
4449 trace_ext4_truncate_exit(inode);
4513 return; 4450 return;
4514 } 4451 }
4515 4452
@@ -4519,14 +4456,18 @@ void ext4_truncate(struct inode *inode)
4519 4456
4520 last_block = (inode->i_size + blocksize-1) 4457 last_block = (inode->i_size + blocksize-1)
4521 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 4458 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4459 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
4460 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4522 4461
4523 if (inode->i_size & (blocksize - 1)) 4462 if (inode->i_size & (blocksize - 1))
4524 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 4463 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4525 goto out_stop; 4464 goto out_stop;
4526 4465
4527 n = ext4_block_to_path(inode, last_block, offsets, NULL); 4466 if (last_block != max_block) {
4528 if (n == 0) 4467 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4529 goto out_stop; /* error */ 4468 if (n == 0)
4469 goto out_stop; /* error */
4470 }
4530 4471
4531 /* 4472 /*
4532 * OK. This truncate is going to happen. We add the inode to the 4473 * OK. This truncate is going to happen. We add the inode to the
@@ -4557,7 +4498,13 @@ void ext4_truncate(struct inode *inode)
4557 */ 4498 */
4558 ei->i_disksize = inode->i_size; 4499 ei->i_disksize = inode->i_size;
4559 4500
4560 if (n == 1) { /* direct blocks */ 4501 if (last_block == max_block) {
4502 /*
4503 * It is unnecessary to free any data blocks if last_block is
4504 * equal to the indirect block limit.
4505 */
4506 goto out_unlock;
4507 } else if (n == 1) { /* direct blocks */
4561 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 4508 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4562 i_data + EXT4_NDIR_BLOCKS); 4509 i_data + EXT4_NDIR_BLOCKS);
4563 goto do_indirects; 4510 goto do_indirects;
@@ -4617,6 +4564,7 @@ do_indirects:
4617 ; 4564 ;
4618 } 4565 }
4619 4566
4567out_unlock:
4620 up_write(&ei->i_data_sem); 4568 up_write(&ei->i_data_sem);
4621 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4569 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4622 ext4_mark_inode_dirty(handle, inode); 4570 ext4_mark_inode_dirty(handle, inode);
@@ -4639,6 +4587,7 @@ out_stop:
4639 ext4_orphan_del(handle, inode); 4587 ext4_orphan_del(handle, inode);
4640 4588
4641 ext4_journal_stop(handle); 4589 ext4_journal_stop(handle);
4590 trace_ext4_truncate_exit(inode);
4642} 4591}
4643 4592
4644/* 4593/*
@@ -4770,6 +4719,7 @@ make_io:
4770 * has in-inode xattrs, or we don't have this inode in memory. 4719 * has in-inode xattrs, or we don't have this inode in memory.
4771 * Read the block from disk. 4720 * Read the block from disk.
4772 */ 4721 */
4722 trace_ext4_load_inode(inode);
4773 get_bh(bh); 4723 get_bh(bh);
4774 bh->b_end_io = end_buffer_read_sync; 4724 bh->b_end_io = end_buffer_read_sync;
4775 submit_bh(READ_META, bh); 4725 submit_bh(READ_META, bh);
@@ -4875,7 +4825,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4875 return inode; 4825 return inode;
4876 4826
4877 ei = EXT4_I(inode); 4827 ei = EXT4_I(inode);
4878 iloc.bh = 0; 4828 iloc.bh = NULL;
4879 4829
4880 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4830 ret = __ext4_get_inode_loc(inode, &iloc, 0);
4881 if (ret < 0) 4831 if (ret < 0)
@@ -5460,13 +5410,12 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5460 /* if nrblocks are contiguous */ 5410 /* if nrblocks are contiguous */
5461 if (chunk) { 5411 if (chunk) {
5462 /* 5412 /*
5463 * With N contiguous data blocks, it need at most 5413 * With N contiguous data blocks, we need at most
5464 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks 5414 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
5465 * 2 dindirect blocks 5415 * 2 dindirect blocks, and 1 tindirect block
5466 * 1 tindirect block
5467 */ 5416 */
5468 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); 5417 return DIV_ROUND_UP(nrblocks,
5469 return indirects + 3; 5418 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
5470 } 5419 }
5471 /* 5420 /*
5472 * if nrblocks are not contiguous, worse case, each block touch 5421 * if nrblocks are not contiguous, worse case, each block touch
@@ -5540,7 +5489,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5540} 5489}
5541 5490
5542/* 5491/*
5543 * Calulate the total number of credits to reserve to fit 5492 * Calculate the total number of credits to reserve to fit
5544 * the modification of a single pages into a single transaction, 5493 * the modification of a single pages into a single transaction,
5545 * which may include multiple chunks of block allocations. 5494 * which may include multiple chunks of block allocations.
5546 * 5495 *
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index eb3bc2fe647e..808c554e773f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -38,7 +38,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
38 unsigned int oldflags; 38 unsigned int oldflags;
39 unsigned int jflag; 39 unsigned int jflag;
40 40
41 if (!is_owner_or_cap(inode)) 41 if (!inode_owner_or_capable(inode))
42 return -EACCES; 42 return -EACCES;
43 43
44 if (get_user(flags, (int __user *) arg)) 44 if (get_user(flags, (int __user *) arg))
@@ -146,7 +146,7 @@ flags_out:
146 __u32 generation; 146 __u32 generation;
147 int err; 147 int err;
148 148
149 if (!is_owner_or_cap(inode)) 149 if (!inode_owner_or_capable(inode))
150 return -EPERM; 150 return -EPERM;
151 151
152 err = mnt_want_write(filp->f_path.mnt); 152 err = mnt_want_write(filp->f_path.mnt);
@@ -298,7 +298,7 @@ mext_out:
298 case EXT4_IOC_MIGRATE: 298 case EXT4_IOC_MIGRATE:
299 { 299 {
300 int err; 300 int err;
301 if (!is_owner_or_cap(inode)) 301 if (!inode_owner_or_capable(inode))
302 return -EACCES; 302 return -EACCES;
303 303
304 err = mnt_want_write(filp->f_path.mnt); 304 err = mnt_want_write(filp->f_path.mnt);
@@ -320,7 +320,7 @@ mext_out:
320 case EXT4_IOC_ALLOC_DA_BLKS: 320 case EXT4_IOC_ALLOC_DA_BLKS:
321 { 321 {
322 int err; 322 int err;
323 if (!is_owner_or_cap(inode)) 323 if (!inode_owner_or_capable(inode))
324 return -EACCES; 324 return -EACCES;
325 325
326 err = mnt_want_write(filp->f_path.mnt); 326 err = mnt_want_write(filp->f_path.mnt);
@@ -334,16 +334,22 @@ mext_out:
334 case FITRIM: 334 case FITRIM:
335 { 335 {
336 struct super_block *sb = inode->i_sb; 336 struct super_block *sb = inode->i_sb;
337 struct request_queue *q = bdev_get_queue(sb->s_bdev);
337 struct fstrim_range range; 338 struct fstrim_range range;
338 int ret = 0; 339 int ret = 0;
339 340
340 if (!capable(CAP_SYS_ADMIN)) 341 if (!capable(CAP_SYS_ADMIN))
341 return -EPERM; 342 return -EPERM;
342 343
344 if (!blk_queue_discard(q))
345 return -EOPNOTSUPP;
346
343 if (copy_from_user(&range, (struct fstrim_range *)arg, 347 if (copy_from_user(&range, (struct fstrim_range *)arg,
344 sizeof(range))) 348 sizeof(range)))
345 return -EFAULT; 349 return -EFAULT;
346 350
351 range.minlen = max((unsigned int)range.minlen,
352 q->limits.discard_granularity);
347 ret = ext4_trim_fs(sb, &range); 353 ret = ext4_trim_fs(sb, &range);
348 if (ret < 0) 354 if (ret < 0)
349 return ret; 355 return ret;
@@ -421,6 +427,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
421 return err; 427 return err;
422 } 428 }
423 case EXT4_IOC_MOVE_EXT: 429 case EXT4_IOC_MOVE_EXT:
430 case FITRIM:
424 break; 431 break;
425 default: 432 default:
426 return -ENOIOCTLCMD; 433 return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 851f49b2f9d2..d8a16eecf1d5 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -92,7 +92,7 @@
92 * between CPUs. It is possible to get scheduled at this point. 92 * between CPUs. It is possible to get scheduled at this point.
93 * 93 *
94 * The locality group prealloc space is used looking at whether we have 94 * The locality group prealloc space is used looking at whether we have
95 * enough free space (pa_free) withing the prealloc space. 95 * enough free space (pa_free) within the prealloc space.
96 * 96 *
97 * If we can't allocate blocks via inode prealloc or/and locality group 97 * If we can't allocate blocks via inode prealloc or/and locality group
98 * prealloc then we look at the buddy cache. The buddy cache is represented 98 * prealloc then we look at the buddy cache. The buddy cache is represented
@@ -342,10 +342,15 @@ static struct kmem_cache *ext4_free_ext_cachep;
342/* We create slab caches for groupinfo data structures based on the 342/* We create slab caches for groupinfo data structures based on the
343 * superblock block size. There will be one per mounted filesystem for 343 * superblock block size. There will be one per mounted filesystem for
344 * each unique s_blocksize_bits */ 344 * each unique s_blocksize_bits */
345#define NR_GRPINFO_CACHES \ 345#define NR_GRPINFO_CACHES 8
346 (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
347static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; 346static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
348 347
348static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
349 "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
350 "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
351 "ext4_groupinfo_64k", "ext4_groupinfo_128k"
352};
353
349static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 354static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
350 ext4_group_t group); 355 ext4_group_t group);
351static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 356static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@ -427,9 +432,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
427 } 432 }
428 433
429 /* at order 0 we see each particular block */ 434 /* at order 0 we see each particular block */
430 *max = 1 << (e4b->bd_blkbits + 3); 435 if (order == 0) {
431 if (order == 0) 436 *max = 1 << (e4b->bd_blkbits + 3);
432 return EXT4_MB_BITMAP(e4b); 437 return EXT4_MB_BITMAP(e4b);
438 }
433 439
434 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 440 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
435 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 441 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
@@ -611,7 +617,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
611 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); 617 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
612 618
613 grp = ext4_get_group_info(sb, e4b->bd_group); 619 grp = ext4_get_group_info(sb, e4b->bd_group);
614 buddy = mb_find_buddy(e4b, 0, &max);
615 list_for_each(cur, &grp->bb_prealloc_list) { 620 list_for_each(cur, &grp->bb_prealloc_list) {
616 ext4_group_t groupnr; 621 ext4_group_t groupnr;
617 struct ext4_prealloc_space *pa; 622 struct ext4_prealloc_space *pa;
@@ -630,7 +635,12 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
630#define mb_check_buddy(e4b) 635#define mb_check_buddy(e4b)
631#endif 636#endif
632 637
633/* FIXME!! need more doc */ 638/*
639 * Divide blocks started from @first with length @len into
640 * smaller chunks with power of 2 blocks.
641 * Clear the bits in bitmap which the blocks of the chunk(s) covered,
642 * then increase bb_counters[] for corresponded chunk size.
643 */
634static void ext4_mb_mark_free_simple(struct super_block *sb, 644static void ext4_mb_mark_free_simple(struct super_block *sb,
635 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, 645 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
636 struct ext4_group_info *grp) 646 struct ext4_group_info *grp)
@@ -2376,7 +2386,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2376 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte 2386 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2377 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. 2387 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2378 * So a two level scheme suffices for now. */ 2388 * So a two level scheme suffices for now. */
2379 sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); 2389 sbi->s_group_info = kzalloc(array_size, GFP_KERNEL);
2380 if (sbi->s_group_info == NULL) { 2390 if (sbi->s_group_info == NULL) {
2381 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); 2391 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
2382 return -ENOMEM; 2392 return -ENOMEM;
@@ -2414,6 +2424,55 @@ err_freesgi:
2414 return -ENOMEM; 2424 return -ENOMEM;
2415} 2425}
2416 2426
2427static void ext4_groupinfo_destroy_slabs(void)
2428{
2429 int i;
2430
2431 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2432 if (ext4_groupinfo_caches[i])
2433 kmem_cache_destroy(ext4_groupinfo_caches[i]);
2434 ext4_groupinfo_caches[i] = NULL;
2435 }
2436}
2437
2438static int ext4_groupinfo_create_slab(size_t size)
2439{
2440 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2441 int slab_size;
2442 int blocksize_bits = order_base_2(size);
2443 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2444 struct kmem_cache *cachep;
2445
2446 if (cache_index >= NR_GRPINFO_CACHES)
2447 return -EINVAL;
2448
2449 if (unlikely(cache_index < 0))
2450 cache_index = 0;
2451
2452 mutex_lock(&ext4_grpinfo_slab_create_mutex);
2453 if (ext4_groupinfo_caches[cache_index]) {
2454 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2455 return 0; /* Already created */
2456 }
2457
2458 slab_size = offsetof(struct ext4_group_info,
2459 bb_counters[blocksize_bits + 2]);
2460
2461 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2462 slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2463 NULL);
2464
2465 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2466 if (!cachep) {
2467 printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n");
2468 return -ENOMEM;
2469 }
2470
2471 ext4_groupinfo_caches[cache_index] = cachep;
2472
2473 return 0;
2474}
2475
2417int ext4_mb_init(struct super_block *sb, int needs_recovery) 2476int ext4_mb_init(struct super_block *sb, int needs_recovery)
2418{ 2477{
2419 struct ext4_sb_info *sbi = EXT4_SB(sb); 2478 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2421,9 +2480,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2421 unsigned offset; 2480 unsigned offset;
2422 unsigned max; 2481 unsigned max;
2423 int ret; 2482 int ret;
2424 int cache_index;
2425 struct kmem_cache *cachep;
2426 char *namep = NULL;
2427 2483
2428 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); 2484 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2429 2485
@@ -2440,30 +2496,9 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2440 goto out; 2496 goto out;
2441 } 2497 }
2442 2498
2443 cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; 2499 ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2444 cachep = ext4_groupinfo_caches[cache_index]; 2500 if (ret < 0)
2445 if (!cachep) { 2501 goto out;
2446 char name[32];
2447 int len = offsetof(struct ext4_group_info,
2448 bb_counters[sb->s_blocksize_bits + 2]);
2449
2450 sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
2451 namep = kstrdup(name, GFP_KERNEL);
2452 if (!namep) {
2453 ret = -ENOMEM;
2454 goto out;
2455 }
2456
2457 /* Need to free the kmem_cache_name() when we
2458 * destroy the slab */
2459 cachep = kmem_cache_create(namep, len, 0,
2460 SLAB_RECLAIM_ACCOUNT, NULL);
2461 if (!cachep) {
2462 ret = -ENOMEM;
2463 goto out;
2464 }
2465 ext4_groupinfo_caches[cache_index] = cachep;
2466 }
2467 2502
2468 /* order 0 is regular bitmap */ 2503 /* order 0 is regular bitmap */
2469 sbi->s_mb_maxs[0] = sb->s_blocksize << 3; 2504 sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
@@ -2520,7 +2555,6 @@ out:
2520 if (ret) { 2555 if (ret) {
2521 kfree(sbi->s_mb_offsets); 2556 kfree(sbi->s_mb_offsets);
2522 kfree(sbi->s_mb_maxs); 2557 kfree(sbi->s_mb_maxs);
2523 kfree(namep);
2524 } 2558 }
2525 return ret; 2559 return ret;
2526} 2560}
@@ -2734,7 +2768,6 @@ int __init ext4_init_mballoc(void)
2734 2768
2735void ext4_exit_mballoc(void) 2769void ext4_exit_mballoc(void)
2736{ 2770{
2737 int i;
2738 /* 2771 /*
2739 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2772 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2740 * before destroying the slab cache. 2773 * before destroying the slab cache.
@@ -2743,15 +2776,7 @@ void ext4_exit_mballoc(void)
2743 kmem_cache_destroy(ext4_pspace_cachep); 2776 kmem_cache_destroy(ext4_pspace_cachep);
2744 kmem_cache_destroy(ext4_ac_cachep); 2777 kmem_cache_destroy(ext4_ac_cachep);
2745 kmem_cache_destroy(ext4_free_ext_cachep); 2778 kmem_cache_destroy(ext4_free_ext_cachep);
2746 2779 ext4_groupinfo_destroy_slabs();
2747 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2748 struct kmem_cache *cachep = ext4_groupinfo_caches[i];
2749 if (cachep) {
2750 char *name = (char *)kmem_cache_name(cachep);
2751 kmem_cache_destroy(cachep);
2752 kfree(name);
2753 }
2754 }
2755 ext4_remove_debugfs_entry(); 2780 ext4_remove_debugfs_entry();
2756} 2781}
2757 2782
@@ -3188,7 +3213,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3188 cur_distance = abs(goal_block - cpa->pa_pstart); 3213 cur_distance = abs(goal_block - cpa->pa_pstart);
3189 new_distance = abs(goal_block - pa->pa_pstart); 3214 new_distance = abs(goal_block - pa->pa_pstart);
3190 3215
3191 if (cur_distance < new_distance) 3216 if (cur_distance <= new_distance)
3192 return cpa; 3217 return cpa;
3193 3218
3194 /* drop the previous reference */ 3219 /* drop the previous reference */
@@ -3887,7 +3912,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3887 struct super_block *sb = ac->ac_sb; 3912 struct super_block *sb = ac->ac_sb;
3888 ext4_group_t ngroups, i; 3913 ext4_group_t ngroups, i;
3889 3914
3890 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) 3915 if (!mb_enable_debug ||
3916 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
3891 return; 3917 return;
3892 3918
3893 printk(KERN_ERR "EXT4-fs: Can't allocate:" 3919 printk(KERN_ERR "EXT4-fs: Can't allocate:"
@@ -4733,7 +4759,8 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
4733 * bitmap. Then issue a TRIM command on this extent and free the extent in 4759 * bitmap. Then issue a TRIM command on this extent and free the extent in
4734 * the group buddy bitmap. This is done until whole group is scanned. 4760 * the group buddy bitmap. This is done until whole group is scanned.
4735 */ 4761 */
4736ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, 4762static ext4_grpblk_t
4763ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4737 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) 4764 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
4738{ 4765{
4739 void *bitmap; 4766 void *bitmap;
@@ -4843,10 +4870,15 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4843 break; 4870 break;
4844 } 4871 }
4845 4872
4846 if (len >= EXT4_BLOCKS_PER_GROUP(sb)) 4873 /*
4847 len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block); 4874 * For all the groups except the last one, last block will
4848 else 4875 * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to
4876 * change it for the last group in which case start +
4877 * len < EXT4_BLOCKS_PER_GROUP(sb).
4878 */
4879 if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb))
4849 last_block = first_block + len; 4880 last_block = first_block + len;
4881 len -= last_block - first_block;
4850 4882
4851 if (e4b.bd_info->bb_free >= minlen) { 4883 if (e4b.bd_info->bb_free >= minlen) {
4852 cnt = ext4_trim_all_free(sb, &e4b, first_block, 4884 cnt = ext4_trim_all_free(sb, &e4b, first_block,
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b619322c76f0..22bd4d7f289b 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -169,7 +169,7 @@ struct ext4_allocation_context {
169 /* original request */ 169 /* original request */
170 struct ext4_free_extent ac_o_ex; 170 struct ext4_free_extent ac_o_ex;
171 171
172 /* goal request (after normalization) */ 172 /* goal request (normalized ac_o_ex) */
173 struct ext4_free_extent ac_g_ex; 173 struct ext4_free_extent ac_g_ex;
174 174
175 /* the best found extent */ 175 /* the best found extent */
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b0a126f23c20..92816b4e0f16 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -263,7 +263,7 @@ static int free_dind_blocks(handle_t *handle,
263 for (i = 0; i < max_entries; i++) { 263 for (i = 0; i < max_entries; i++) {
264 if (tmp_idata[i]) { 264 if (tmp_idata[i]) {
265 extend_credit_for_blkdel(handle, inode); 265 extend_credit_for_blkdel(handle, inode);
266 ext4_free_blocks(handle, inode, 0, 266 ext4_free_blocks(handle, inode, NULL,
267 le32_to_cpu(tmp_idata[i]), 1, 267 le32_to_cpu(tmp_idata[i]), 1,
268 EXT4_FREE_BLOCKS_METADATA | 268 EXT4_FREE_BLOCKS_METADATA |
269 EXT4_FREE_BLOCKS_FORGET); 269 EXT4_FREE_BLOCKS_FORGET);
@@ -271,7 +271,7 @@ static int free_dind_blocks(handle_t *handle,
271 } 271 }
272 put_bh(bh); 272 put_bh(bh);
273 extend_credit_for_blkdel(handle, inode); 273 extend_credit_for_blkdel(handle, inode);
274 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, 274 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
275 EXT4_FREE_BLOCKS_METADATA | 275 EXT4_FREE_BLOCKS_METADATA |
276 EXT4_FREE_BLOCKS_FORGET); 276 EXT4_FREE_BLOCKS_FORGET);
277 return 0; 277 return 0;
@@ -302,7 +302,7 @@ static int free_tind_blocks(handle_t *handle,
302 } 302 }
303 put_bh(bh); 303 put_bh(bh);
304 extend_credit_for_blkdel(handle, inode); 304 extend_credit_for_blkdel(handle, inode);
305 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, 305 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
306 EXT4_FREE_BLOCKS_METADATA | 306 EXT4_FREE_BLOCKS_METADATA |
307 EXT4_FREE_BLOCKS_FORGET); 307 EXT4_FREE_BLOCKS_FORGET);
308 return 0; 308 return 0;
@@ -315,7 +315,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
315 /* ei->i_data[EXT4_IND_BLOCK] */ 315 /* ei->i_data[EXT4_IND_BLOCK] */
316 if (i_data[0]) { 316 if (i_data[0]) {
317 extend_credit_for_blkdel(handle, inode); 317 extend_credit_for_blkdel(handle, inode);
318 ext4_free_blocks(handle, inode, 0, 318 ext4_free_blocks(handle, inode, NULL,
319 le32_to_cpu(i_data[0]), 1, 319 le32_to_cpu(i_data[0]), 1,
320 EXT4_FREE_BLOCKS_METADATA | 320 EXT4_FREE_BLOCKS_METADATA |
321 EXT4_FREE_BLOCKS_FORGET); 321 EXT4_FREE_BLOCKS_FORGET);
@@ -428,7 +428,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
428 } 428 }
429 put_bh(bh); 429 put_bh(bh);
430 extend_credit_for_blkdel(handle, inode); 430 extend_credit_for_blkdel(handle, inode);
431 ext4_free_blocks(handle, inode, 0, block, 1, 431 ext4_free_blocks(handle, inode, NULL, block, 1,
432 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 432 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
433 return retval; 433 return retval;
434} 434}
@@ -517,7 +517,7 @@ int ext4_ext_migrate(struct inode *inode)
517 * start with one credit accounted for 517 * start with one credit accounted for
518 * superblock modification. 518 * superblock modification.
519 * 519 *
520 * For the tmp_inode we already have commited the 520 * For the tmp_inode we already have committed the
521 * trascation that created the inode. Later as and 521 * trascation that created the inode. Later as and
522 * when we add extents we extent the journal 522 * when we add extents we extent the journal
523 */ 523 */
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5485390d32c5..67fd0b025858 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -40,6 +40,7 @@
40#include "xattr.h" 40#include "xattr.h"
41#include "acl.h" 41#include "acl.h"
42 42
43#include <trace/events/ext4.h>
43/* 44/*
44 * define how far ahead to read directories while searching them. 45 * define how far ahead to read directories while searching them.
45 */ 46 */
@@ -2183,6 +2184,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2183 struct ext4_dir_entry_2 *de; 2184 struct ext4_dir_entry_2 *de;
2184 handle_t *handle; 2185 handle_t *handle;
2185 2186
2187 trace_ext4_unlink_enter(dir, dentry);
2186 /* Initialize quotas before so that eventual writes go 2188 /* Initialize quotas before so that eventual writes go
2187 * in separate transaction */ 2189 * in separate transaction */
2188 dquot_initialize(dir); 2190 dquot_initialize(dir);
@@ -2228,6 +2230,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2228end_unlink: 2230end_unlink:
2229 ext4_journal_stop(handle); 2231 ext4_journal_stop(handle);
2230 brelse(bh); 2232 brelse(bh);
2233 trace_ext4_unlink_exit(dentry, retval);
2231 return retval; 2234 return retval;
2232} 2235}
2233 2236
@@ -2304,13 +2307,6 @@ static int ext4_link(struct dentry *old_dentry,
2304 2307
2305 dquot_initialize(dir); 2308 dquot_initialize(dir);
2306 2309
2307 /*
2308 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2309 * otherwise has the potential to corrupt the orphan inode list.
2310 */
2311 if (inode->i_nlink == 0)
2312 return -ENOENT;
2313
2314retry: 2310retry:
2315 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2311 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2316 EXT4_INDEX_EXTRA_TRANS_BLOCKS); 2312 EXT4_INDEX_EXTRA_TRANS_BLOCKS);
@@ -2409,6 +2405,10 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2409 if (!new_inode && new_dir != old_dir && 2405 if (!new_inode && new_dir != old_dir &&
2410 EXT4_DIR_LINK_MAX(new_dir)) 2406 EXT4_DIR_LINK_MAX(new_dir))
2411 goto end_rename; 2407 goto end_rename;
2408 BUFFER_TRACE(dir_bh, "get_write_access");
2409 retval = ext4_journal_get_write_access(handle, dir_bh);
2410 if (retval)
2411 goto end_rename;
2412 } 2412 }
2413 if (!new_bh) { 2413 if (!new_bh) {
2414 retval = ext4_add_entry(handle, new_dentry, old_inode); 2414 retval = ext4_add_entry(handle, new_dentry, old_inode);
@@ -2416,7 +2416,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2416 goto end_rename; 2416 goto end_rename;
2417 } else { 2417 } else {
2418 BUFFER_TRACE(new_bh, "get write access"); 2418 BUFFER_TRACE(new_bh, "get write access");
2419 ext4_journal_get_write_access(handle, new_bh); 2419 retval = ext4_journal_get_write_access(handle, new_bh);
2420 if (retval)
2421 goto end_rename;
2420 new_de->inode = cpu_to_le32(old_inode->i_ino); 2422 new_de->inode = cpu_to_le32(old_inode->i_ino);
2421 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, 2423 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2422 EXT4_FEATURE_INCOMPAT_FILETYPE)) 2424 EXT4_FEATURE_INCOMPAT_FILETYPE))
@@ -2477,8 +2479,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2477 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); 2479 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
2478 ext4_update_dx_flag(old_dir); 2480 ext4_update_dx_flag(old_dir);
2479 if (dir_bh) { 2481 if (dir_bh) {
2480 BUFFER_TRACE(dir_bh, "get_write_access");
2481 ext4_journal_get_write_access(handle, dir_bh);
2482 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 2482 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2483 cpu_to_le32(new_dir->i_ino); 2483 cpu_to_le32(new_dir->i_ino);
2484 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2484 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7270dcfca92a..b6dbd056fcb1 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,14 +32,8 @@
32 32
33static struct kmem_cache *io_page_cachep, *io_end_cachep; 33static struct kmem_cache *io_page_cachep, *io_end_cachep;
34 34
35#define WQ_HASH_SZ 37
36#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
37static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
38
39int __init ext4_init_pageio(void) 35int __init ext4_init_pageio(void)
40{ 36{
41 int i;
42
43 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); 37 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
44 if (io_page_cachep == NULL) 38 if (io_page_cachep == NULL)
45 return -ENOMEM; 39 return -ENOMEM;
@@ -48,9 +42,6 @@ int __init ext4_init_pageio(void)
48 kmem_cache_destroy(io_page_cachep); 42 kmem_cache_destroy(io_page_cachep);
49 return -ENOMEM; 43 return -ENOMEM;
50 } 44 }
51 for (i = 0; i < WQ_HASH_SZ; i++)
52 init_waitqueue_head(&ioend_wq[i]);
53
54 return 0; 45 return 0;
55} 46}
56 47
@@ -62,7 +53,7 @@ void ext4_exit_pageio(void)
62 53
63void ext4_ioend_wait(struct inode *inode) 54void ext4_ioend_wait(struct inode *inode)
64{ 55{
65 wait_queue_head_t *wq = to_ioend_wq(inode); 56 wait_queue_head_t *wq = ext4_ioend_wq(inode);
66 57
67 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); 58 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
68} 59}
@@ -87,7 +78,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
87 for (i = 0; i < io->num_io_pages; i++) 78 for (i = 0; i < io->num_io_pages; i++)
88 put_io_page(io->pages[i]); 79 put_io_page(io->pages[i]);
89 io->num_io_pages = 0; 80 io->num_io_pages = 0;
90 wq = to_ioend_wq(io->inode); 81 wq = ext4_ioend_wq(io->inode);
91 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && 82 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
92 waitqueue_active(wq)) 83 waitqueue_active(wq))
93 wake_up_all(wq); 84 wake_up_all(wq);
@@ -102,6 +93,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
102 struct inode *inode = io->inode; 93 struct inode *inode = io->inode;
103 loff_t offset = io->offset; 94 loff_t offset = io->offset;
104 ssize_t size = io->size; 95 ssize_t size = io->size;
96 wait_queue_head_t *wq;
105 int ret = 0; 97 int ret = 0;
106 98
107 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 99 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
@@ -126,7 +118,16 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
126 if (io->iocb) 118 if (io->iocb)
127 aio_complete(io->iocb, io->result, 0); 119 aio_complete(io->iocb, io->result, 0);
128 /* clear the DIO AIO unwritten flag */ 120 /* clear the DIO AIO unwritten flag */
129 io->flag &= ~EXT4_IO_END_UNWRITTEN; 121 if (io->flag & EXT4_IO_END_UNWRITTEN) {
122 io->flag &= ~EXT4_IO_END_UNWRITTEN;
123 /* Wake up anyone waiting on unwritten extent conversion */
124 wq = ext4_ioend_wq(io->inode);
125 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
126 waitqueue_active(wq)) {
127 wake_up_all(wq);
128 }
129 }
130
130 return ret; 131 return ret;
131} 132}
132 133
@@ -190,6 +191,7 @@ static void ext4_end_bio(struct bio *bio, int error)
190 struct inode *inode; 191 struct inode *inode;
191 unsigned long flags; 192 unsigned long flags;
192 int i; 193 int i;
194 sector_t bi_sector = bio->bi_sector;
193 195
194 BUG_ON(!io_end); 196 BUG_ON(!io_end);
195 bio->bi_private = NULL; 197 bio->bi_private = NULL;
@@ -207,9 +209,7 @@ static void ext4_end_bio(struct bio *bio, int error)
207 if (error) 209 if (error)
208 SetPageError(page); 210 SetPageError(page);
209 BUG_ON(!head); 211 BUG_ON(!head);
210 if (head->b_size == PAGE_CACHE_SIZE) 212 if (head->b_size != PAGE_CACHE_SIZE) {
211 clear_buffer_dirty(head);
212 else {
213 loff_t offset; 213 loff_t offset;
214 loff_t io_end_offset = io_end->offset + io_end->size; 214 loff_t io_end_offset = io_end->offset + io_end->size;
215 215
@@ -221,7 +221,6 @@ static void ext4_end_bio(struct bio *bio, int error)
221 if (error) 221 if (error)
222 buffer_io_error(bh); 222 buffer_io_error(bh);
223 223
224 clear_buffer_dirty(bh);
225 } 224 }
226 if (buffer_delay(bh)) 225 if (buffer_delay(bh))
227 partial_write = 1; 226 partial_write = 1;
@@ -257,7 +256,12 @@ static void ext4_end_bio(struct bio *bio, int error)
257 (unsigned long long) io_end->offset, 256 (unsigned long long) io_end->offset,
258 (long) io_end->size, 257 (long) io_end->size,
259 (unsigned long long) 258 (unsigned long long)
260 bio->bi_sector >> (inode->i_blkbits - 9)); 259 bi_sector >> (inode->i_blkbits - 9));
260 }
261
262 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
263 ext4_free_io_end(io_end);
264 return;
261 } 265 }
262 266
263 /* Add the io_end to per-inode completed io list*/ 267 /* Add the io_end to per-inode completed io list*/
@@ -280,9 +284,9 @@ void ext4_io_submit(struct ext4_io_submit *io)
280 BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); 284 BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
281 bio_put(io->io_bio); 285 bio_put(io->io_bio);
282 } 286 }
283 io->io_bio = 0; 287 io->io_bio = NULL;
284 io->io_op = 0; 288 io->io_op = 0;
285 io->io_end = 0; 289 io->io_end = NULL;
286} 290}
287 291
288static int io_submit_init(struct ext4_io_submit *io, 292static int io_submit_init(struct ext4_io_submit *io,
@@ -311,8 +315,7 @@ static int io_submit_init(struct ext4_io_submit *io,
311 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); 315 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
312 316
313 io->io_bio = bio; 317 io->io_bio = bio;
314 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? 318 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
315 WRITE_SYNC_PLUG : WRITE);
316 io->io_next_block = bh->b_blocknr; 319 io->io_next_block = bh->b_blocknr;
317 return 0; 320 return 0;
318} 321}
@@ -380,9 +383,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
380 383
381 blocksize = 1 << inode->i_blkbits; 384 blocksize = 1 << inode->i_blkbits;
382 385
386 BUG_ON(!PageLocked(page));
383 BUG_ON(PageWriteback(page)); 387 BUG_ON(PageWriteback(page));
384 set_page_writeback(page);
385 ClearPageError(page);
386 388
387 io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); 389 io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
388 if (!io_page) { 390 if (!io_page) {
@@ -393,16 +395,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
393 io_page->p_page = page; 395 io_page->p_page = page;
394 atomic_set(&io_page->p_count, 1); 396 atomic_set(&io_page->p_count, 1);
395 get_page(page); 397 get_page(page);
398 set_page_writeback(page);
399 ClearPageError(page);
396 400
397 for (bh = head = page_buffers(page), block_start = 0; 401 for (bh = head = page_buffers(page), block_start = 0;
398 bh != head || !block_start; 402 bh != head || !block_start;
399 block_start = block_end, bh = bh->b_this_page) { 403 block_start = block_end, bh = bh->b_this_page) {
404
400 block_end = block_start + blocksize; 405 block_end = block_start + blocksize;
401 if (block_start >= len) { 406 if (block_start >= len) {
402 clear_buffer_dirty(bh); 407 clear_buffer_dirty(bh);
403 set_buffer_uptodate(bh); 408 set_buffer_uptodate(bh);
404 continue; 409 continue;
405 } 410 }
411 clear_buffer_dirty(bh);
406 ret = io_submit_add_bh(io, io_page, inode, wbc, bh); 412 ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
407 if (ret) { 413 if (ret) {
408 /* 414 /*
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3ecc6e45d2f9..80bbc9c60c24 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -230,7 +230,7 @@ static int setup_new_group_blocks(struct super_block *sb,
230 } 230 }
231 231
232 /* Zero out all of the reserved backup group descriptor table blocks */ 232 /* Zero out all of the reserved backup group descriptor table blocks */
233 ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", 233 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
234 block, sbi->s_itb_per_group); 234 block, sbi->s_itb_per_group);
235 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, 235 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
236 GFP_NOFS); 236 GFP_NOFS);
@@ -248,7 +248,7 @@ static int setup_new_group_blocks(struct super_block *sb,
248 248
249 /* Zero out all of the inode table blocks */ 249 /* Zero out all of the inode table blocks */
250 block = input->inode_table; 250 block = input->inode_table;
251 ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", 251 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
252 block, sbi->s_itb_per_group); 252 block, sbi->s_itb_per_group);
253 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); 253 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
254 if (err) 254 if (err)
@@ -499,12 +499,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
499 return err; 499 return err;
500 500
501exit_inode: 501exit_inode:
502 /* ext4_journal_release_buffer(handle, iloc.bh); */ 502 /* ext4_handle_release_buffer(handle, iloc.bh); */
503 brelse(iloc.bh); 503 brelse(iloc.bh);
504exit_dindj: 504exit_dindj:
505 /* ext4_journal_release_buffer(handle, dind); */ 505 /* ext4_handle_release_buffer(handle, dind); */
506exit_sbh: 506exit_sbh:
507 /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ 507 /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
508exit_dind: 508exit_dind:
509 brelse(dind); 509 brelse(dind);
510exit_bh: 510exit_bh:
@@ -586,7 +586,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
586 /* 586 /*
587 int j; 587 int j;
588 for (j = 0; j < i; j++) 588 for (j = 0; j < i; j++)
589 ext4_journal_release_buffer(handle, primary[j]); 589 ext4_handle_release_buffer(handle, primary[j]);
590 */ 590 */
591 goto exit_bh; 591 goto exit_bh;
592 } 592 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 48ce561fafac..8553dfb310af 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -54,9 +54,9 @@
54 54
55static struct proc_dir_entry *ext4_proc_root; 55static struct proc_dir_entry *ext4_proc_root;
56static struct kset *ext4_kset; 56static struct kset *ext4_kset;
57struct ext4_lazy_init *ext4_li_info; 57static struct ext4_lazy_init *ext4_li_info;
58struct mutex ext4_li_mtx; 58static struct mutex ext4_li_mtx;
59struct ext4_features *ext4_feat; 59static struct ext4_features *ext4_feat;
60 60
61static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 61static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
62 unsigned long journal_devnum); 62 unsigned long journal_devnum);
@@ -75,8 +75,10 @@ static void ext4_write_super(struct super_block *sb);
75static int ext4_freeze(struct super_block *sb); 75static int ext4_freeze(struct super_block *sb);
76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, 76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
77 const char *dev_name, void *data); 77 const char *dev_name, void *data);
78static int ext4_feature_set_ok(struct super_block *sb, int readonly);
78static void ext4_destroy_lazyinit_thread(void); 79static void ext4_destroy_lazyinit_thread(void);
79static void ext4_unregister_li_request(struct super_block *sb); 80static void ext4_unregister_li_request(struct super_block *sb);
81static void ext4_clear_request_list(void);
80 82
81#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 83#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
82static struct file_system_type ext3_fs_type = { 84static struct file_system_type ext3_fs_type = {
@@ -240,27 +242,44 @@ static void ext4_put_nojournal(handle_t *handle)
240 * journal_end calls result in the superblock being marked dirty, so 242 * journal_end calls result in the superblock being marked dirty, so
241 * that sync() will call the filesystem's write_super callback if 243 * that sync() will call the filesystem's write_super callback if
242 * appropriate. 244 * appropriate.
245 *
246 * To avoid j_barrier hold in userspace when a user calls freeze(),
247 * ext4 prevents a new handle from being started by s_frozen, which
248 * is in an upper layer.
243 */ 249 */
244handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) 250handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
245{ 251{
246 journal_t *journal; 252 journal_t *journal;
253 handle_t *handle;
247 254
248 if (sb->s_flags & MS_RDONLY) 255 if (sb->s_flags & MS_RDONLY)
249 return ERR_PTR(-EROFS); 256 return ERR_PTR(-EROFS);
250 257
251 vfs_check_frozen(sb, SB_FREEZE_TRANS);
252 /* Special case here: if the journal has aborted behind our
253 * backs (eg. EIO in the commit thread), then we still need to
254 * take the FS itself readonly cleanly. */
255 journal = EXT4_SB(sb)->s_journal; 258 journal = EXT4_SB(sb)->s_journal;
256 if (journal) { 259 handle = ext4_journal_current_handle();
257 if (is_journal_aborted(journal)) { 260
258 ext4_abort(sb, "Detected aborted journal"); 261 /*
259 return ERR_PTR(-EROFS); 262 * If a handle has been started, it should be allowed to
260 } 263 * finish, otherwise deadlock could happen between freeze
261 return jbd2_journal_start(journal, nblocks); 264 * and others(e.g. truncate) due to the restart of the
265 * journal handle if the filesystem is forzen and active
266 * handles are not stopped.
267 */
268 if (!handle)
269 vfs_check_frozen(sb, SB_FREEZE_TRANS);
270
271 if (!journal)
272 return ext4_get_nojournal();
273 /*
274 * Special case here: if the journal has aborted behind our
275 * backs (eg. EIO in the commit thread), then we still need to
276 * take the FS itself readonly cleanly.
277 */
278 if (is_journal_aborted(journal)) {
279 ext4_abort(sb, "Detected aborted journal");
280 return ERR_PTR(-EROFS);
262 } 281 }
263 return ext4_get_nojournal(); 282 return jbd2_journal_start(journal, nblocks);
264} 283}
265 284
266/* 285/*
@@ -593,7 +612,7 @@ __acquires(bitlock)
593 612
594 vaf.fmt = fmt; 613 vaf.fmt = fmt;
595 vaf.va = &args; 614 vaf.va = &args;
596 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", 615 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
597 sb->s_id, function, line, grp); 616 sb->s_id, function, line, grp);
598 if (ino) 617 if (ino)
599 printk(KERN_CONT "inode %lu: ", ino); 618 printk(KERN_CONT "inode %lu: ", ino);
@@ -615,7 +634,7 @@ __acquires(bitlock)
615 * filesystem will have already been marked read/only and the 634 * filesystem will have already been marked read/only and the
616 * journal has been aborted. We return 1 as a hint to callers 635 * journal has been aborted. We return 1 as a hint to callers
617 * who might what to use the return value from 636 * who might what to use the return value from
618 * ext4_grp_locked_error() to distinguish beween the 637 * ext4_grp_locked_error() to distinguish between the
619 * ERRORS_CONT and ERRORS_RO case, and perhaps return more 638 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
620 * aggressively from the ext4 function in question, with a 639 * aggressively from the ext4 function in question, with a
621 * more appropriate error code. 640 * more appropriate error code.
@@ -832,6 +851,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
832 ei->i_sync_tid = 0; 851 ei->i_sync_tid = 0;
833 ei->i_datasync_tid = 0; 852 ei->i_datasync_tid = 0;
834 atomic_set(&ei->i_ioend_count, 0); 853 atomic_set(&ei->i_ioend_count, 0);
854 atomic_set(&ei->i_aiodio_unwritten, 0);
835 855
836 return &ei->vfs_inode; 856 return &ei->vfs_inode;
837} 857}
@@ -995,13 +1015,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
995 if (test_opt(sb, OLDALLOC)) 1015 if (test_opt(sb, OLDALLOC))
996 seq_puts(seq, ",oldalloc"); 1016 seq_puts(seq, ",oldalloc");
997#ifdef CONFIG_EXT4_FS_XATTR 1017#ifdef CONFIG_EXT4_FS_XATTR
998 if (test_opt(sb, XATTR_USER) && 1018 if (test_opt(sb, XATTR_USER))
999 !(def_mount_opts & EXT4_DEFM_XATTR_USER))
1000 seq_puts(seq, ",user_xattr"); 1019 seq_puts(seq, ",user_xattr");
1001 if (!test_opt(sb, XATTR_USER) && 1020 if (!test_opt(sb, XATTR_USER))
1002 (def_mount_opts & EXT4_DEFM_XATTR_USER)) {
1003 seq_puts(seq, ",nouser_xattr"); 1021 seq_puts(seq, ",nouser_xattr");
1004 }
1005#endif 1022#endif
1006#ifdef CONFIG_EXT4_FS_POSIX_ACL 1023#ifdef CONFIG_EXT4_FS_POSIX_ACL
1007 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) 1024 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
@@ -1039,8 +1056,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1039 !(def_mount_opts & EXT4_DEFM_NODELALLOC)) 1056 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
1040 seq_puts(seq, ",nodelalloc"); 1057 seq_puts(seq, ",nodelalloc");
1041 1058
1042 if (test_opt(sb, MBLK_IO_SUBMIT)) 1059 if (!test_opt(sb, MBLK_IO_SUBMIT))
1043 seq_puts(seq, ",mblk_io_submit"); 1060 seq_puts(seq, ",nomblk_io_submit");
1044 if (sbi->s_stripe) 1061 if (sbi->s_stripe)
1045 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 1062 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
1046 /* 1063 /*
@@ -1449,7 +1466,7 @@ static int parse_options(char *options, struct super_block *sb,
1449 * Initialize args struct so we know whether arg was 1466 * Initialize args struct so we know whether arg was
1450 * found; some options take optional arguments. 1467 * found; some options take optional arguments.
1451 */ 1468 */
1452 args[0].to = args[0].from = 0; 1469 args[0].to = args[0].from = NULL;
1453 token = match_token(p, tokens, args); 1470 token = match_token(p, tokens, args);
1454 switch (token) { 1471 switch (token) {
1455 case Opt_bsd_df: 1472 case Opt_bsd_df:
@@ -1769,7 +1786,7 @@ set_qf_format:
1769 return 0; 1786 return 0;
1770 if (option < 0 || option > (1 << 30)) 1787 if (option < 0 || option > (1 << 30))
1771 return 0; 1788 return 0;
1772 if (!is_power_of_2(option)) { 1789 if (option && !is_power_of_2(option)) {
1773 ext4_msg(sb, KERN_ERR, 1790 ext4_msg(sb, KERN_ERR,
1774 "EXT4-fs: inode_readahead_blks" 1791 "EXT4-fs: inode_readahead_blks"
1775 " must be a power of 2"); 1792 " must be a power of 2");
@@ -2118,6 +2135,13 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2118 return; 2135 return;
2119 } 2136 }
2120 2137
2138 /* Check if feature set would not allow a r/w mount */
2139 if (!ext4_feature_set_ok(sb, 0)) {
2140 ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
2141 "unknown ROCOMPAT features");
2142 return;
2143 }
2144
2121 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { 2145 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
2122 if (es->s_last_orphan) 2146 if (es->s_last_orphan)
2123 jbd_debug(1, "Errors on filesystem, " 2147 jbd_debug(1, "Errors on filesystem, "
@@ -2410,7 +2434,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2410 if (parse_strtoul(buf, 0x40000000, &t)) 2434 if (parse_strtoul(buf, 0x40000000, &t))
2411 return -EINVAL; 2435 return -EINVAL;
2412 2436
2413 if (!is_power_of_2(t)) 2437 if (t && !is_power_of_2(t))
2414 return -EINVAL; 2438 return -EINVAL;
2415 2439
2416 sbi->s_inode_readahead_blks = t; 2440 sbi->s_inode_readahead_blks = t;
@@ -2716,6 +2740,8 @@ static void ext4_unregister_li_request(struct super_block *sb)
2716 mutex_unlock(&ext4_li_info->li_list_mtx); 2740 mutex_unlock(&ext4_li_info->li_list_mtx);
2717} 2741}
2718 2742
2743static struct task_struct *ext4_lazyinit_task;
2744
2719/* 2745/*
2720 * This is the function where ext4lazyinit thread lives. It walks 2746 * This is the function where ext4lazyinit thread lives. It walks
2721 * through the request list searching for next scheduled filesystem. 2747 * through the request list searching for next scheduled filesystem.
@@ -2784,6 +2810,10 @@ cont_thread:
2784 if (time_before(jiffies, next_wakeup)) 2810 if (time_before(jiffies, next_wakeup))
2785 schedule(); 2811 schedule();
2786 finish_wait(&eli->li_wait_daemon, &wait); 2812 finish_wait(&eli->li_wait_daemon, &wait);
2813 if (kthread_should_stop()) {
2814 ext4_clear_request_list();
2815 goto exit_thread;
2816 }
2787 } 2817 }
2788 2818
2789exit_thread: 2819exit_thread:
@@ -2808,6 +2838,7 @@ exit_thread:
2808 wake_up(&eli->li_wait_task); 2838 wake_up(&eli->li_wait_task);
2809 2839
2810 kfree(ext4_li_info); 2840 kfree(ext4_li_info);
2841 ext4_lazyinit_task = NULL;
2811 ext4_li_info = NULL; 2842 ext4_li_info = NULL;
2812 mutex_unlock(&ext4_li_mtx); 2843 mutex_unlock(&ext4_li_mtx);
2813 2844
@@ -2830,11 +2861,10 @@ static void ext4_clear_request_list(void)
2830 2861
2831static int ext4_run_lazyinit_thread(void) 2862static int ext4_run_lazyinit_thread(void)
2832{ 2863{
2833 struct task_struct *t; 2864 ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
2834 2865 ext4_li_info, "ext4lazyinit");
2835 t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit"); 2866 if (IS_ERR(ext4_lazyinit_task)) {
2836 if (IS_ERR(t)) { 2867 int err = PTR_ERR(ext4_lazyinit_task);
2837 int err = PTR_ERR(t);
2838 ext4_clear_request_list(); 2868 ext4_clear_request_list();
2839 del_timer_sync(&ext4_li_info->li_timer); 2869 del_timer_sync(&ext4_li_info->li_timer);
2840 kfree(ext4_li_info); 2870 kfree(ext4_li_info);
@@ -2962,6 +2992,12 @@ static int ext4_register_li_request(struct super_block *sb,
2962 mutex_unlock(&ext4_li_info->li_list_mtx); 2992 mutex_unlock(&ext4_li_info->li_list_mtx);
2963 2993
2964 sbi->s_li_request = elr; 2994 sbi->s_li_request = elr;
2995 /*
2996 * set elr to NULL here since it has been inserted to
2997 * the request_list and the removal and free of it is
2998 * handled by ext4_clear_request_list from now on.
2999 */
3000 elr = NULL;
2965 3001
2966 if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { 3002 if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
2967 ret = ext4_run_lazyinit_thread(); 3003 ret = ext4_run_lazyinit_thread();
@@ -2985,16 +3021,10 @@ static void ext4_destroy_lazyinit_thread(void)
2985 * If thread exited earlier 3021 * If thread exited earlier
2986 * there's nothing to be done. 3022 * there's nothing to be done.
2987 */ 3023 */
2988 if (!ext4_li_info) 3024 if (!ext4_li_info || !ext4_lazyinit_task)
2989 return; 3025 return;
2990 3026
2991 ext4_clear_request_list(); 3027 kthread_stop(ext4_lazyinit_task);
2992
2993 while (ext4_li_info->li_task) {
2994 wake_up(&ext4_li_info->li_wait_daemon);
2995 wait_event(ext4_li_info->li_wait_task,
2996 ext4_li_info->li_task == NULL);
2997 }
2998} 3028}
2999 3029
3000static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3030static int ext4_fill_super(struct super_block *sb, void *data, int silent)
@@ -3093,14 +3123,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3093 } 3123 }
3094 if (def_mount_opts & EXT4_DEFM_UID16) 3124 if (def_mount_opts & EXT4_DEFM_UID16)
3095 set_opt(sb, NO_UID32); 3125 set_opt(sb, NO_UID32);
3126 /* xattr user namespace & acls are now defaulted on */
3096#ifdef CONFIG_EXT4_FS_XATTR 3127#ifdef CONFIG_EXT4_FS_XATTR
3097 if (def_mount_opts & EXT4_DEFM_XATTR_USER) 3128 set_opt(sb, XATTR_USER);
3098 set_opt(sb, XATTR_USER);
3099#endif 3129#endif
3100#ifdef CONFIG_EXT4_FS_POSIX_ACL 3130#ifdef CONFIG_EXT4_FS_POSIX_ACL
3101 if (def_mount_opts & EXT4_DEFM_ACL) 3131 set_opt(sb, POSIX_ACL);
3102 set_opt(sb, POSIX_ACL);
3103#endif 3132#endif
3133 set_opt(sb, MBLK_IO_SUBMIT);
3104 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 3134 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
3105 set_opt(sb, JOURNAL_DATA); 3135 set_opt(sb, JOURNAL_DATA);
3106 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 3136 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
@@ -3378,6 +3408,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3378 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 3408 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
3379 spin_lock_init(&sbi->s_next_gen_lock); 3409 spin_lock_init(&sbi->s_next_gen_lock);
3380 3410
3411 init_timer(&sbi->s_err_report);
3412 sbi->s_err_report.function = print_daily_error_info;
3413 sbi->s_err_report.data = (unsigned long) sb;
3414
3381 err = percpu_counter_init(&sbi->s_freeblocks_counter, 3415 err = percpu_counter_init(&sbi->s_freeblocks_counter,
3382 ext4_count_free_blocks(sb)); 3416 ext4_count_free_blocks(sb));
3383 if (!err) { 3417 if (!err) {
@@ -3413,6 +3447,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3413 sb->s_qcop = &ext4_qctl_operations; 3447 sb->s_qcop = &ext4_qctl_operations;
3414 sb->dq_op = &ext4_quota_operations; 3448 sb->dq_op = &ext4_quota_operations;
3415#endif 3449#endif
3450 memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
3451
3416 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 3452 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
3417 mutex_init(&sbi->s_orphan_lock); 3453 mutex_init(&sbi->s_orphan_lock);
3418 mutex_init(&sbi->s_resize_lock); 3454 mutex_init(&sbi->s_resize_lock);
@@ -3507,7 +3543,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3507 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); 3543 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
3508 3544
3509no_journal: 3545no_journal:
3510 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 3546 /*
3547 * The maximum number of concurrent works can be high and
3548 * concurrency isn't really necessary. Limit it to 1.
3549 */
3550 EXT4_SB(sb)->dio_unwritten_wq =
3551 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3511 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3552 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3512 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3553 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
3513 goto failed_mount_wq; 3554 goto failed_mount_wq;
@@ -3522,17 +3563,16 @@ no_journal:
3522 if (IS_ERR(root)) { 3563 if (IS_ERR(root)) {
3523 ext4_msg(sb, KERN_ERR, "get root inode failed"); 3564 ext4_msg(sb, KERN_ERR, "get root inode failed");
3524 ret = PTR_ERR(root); 3565 ret = PTR_ERR(root);
3566 root = NULL;
3525 goto failed_mount4; 3567 goto failed_mount4;
3526 } 3568 }
3527 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 3569 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
3528 iput(root);
3529 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); 3570 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
3530 goto failed_mount4; 3571 goto failed_mount4;
3531 } 3572 }
3532 sb->s_root = d_alloc_root(root); 3573 sb->s_root = d_alloc_root(root);
3533 if (!sb->s_root) { 3574 if (!sb->s_root) {
3534 ext4_msg(sb, KERN_ERR, "get root dentry failed"); 3575 ext4_msg(sb, KERN_ERR, "get root dentry failed");
3535 iput(root);
3536 ret = -ENOMEM; 3576 ret = -ENOMEM;
3537 goto failed_mount4; 3577 goto failed_mount4;
3538 } 3578 }
@@ -3633,9 +3673,6 @@ no_journal:
3633 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, 3673 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
3634 *sbi->s_es->s_mount_opts ? "; " : "", orig_data); 3674 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
3635 3675
3636 init_timer(&sbi->s_err_report);
3637 sbi->s_err_report.function = print_daily_error_info;
3638 sbi->s_err_report.data = (unsigned long) sb;
3639 if (es->s_error_count) 3676 if (es->s_error_count)
3640 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ 3677 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
3641 3678
@@ -3648,6 +3685,8 @@ cantfind_ext4:
3648 goto failed_mount; 3685 goto failed_mount;
3649 3686
3650failed_mount4: 3687failed_mount4:
3688 iput(root);
3689 sb->s_root = NULL;
3651 ext4_msg(sb, KERN_ERR, "mount failed"); 3690 ext4_msg(sb, KERN_ERR, "mount failed");
3652 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); 3691 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
3653failed_mount_wq: 3692failed_mount_wq:
@@ -3657,6 +3696,7 @@ failed_mount_wq:
3657 sbi->s_journal = NULL; 3696 sbi->s_journal = NULL;
3658 } 3697 }
3659failed_mount3: 3698failed_mount3:
3699 del_timer(&sbi->s_err_report);
3660 if (sbi->s_flex_groups) { 3700 if (sbi->s_flex_groups) {
3661 if (is_vmalloc_addr(sbi->s_flex_groups)) 3701 if (is_vmalloc_addr(sbi->s_flex_groups))
3662 vfree(sbi->s_flex_groups); 3702 vfree(sbi->s_flex_groups);
@@ -4123,6 +4163,11 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
4123/* 4163/*
4124 * LVM calls this function before a (read-only) snapshot is created. This 4164 * LVM calls this function before a (read-only) snapshot is created. This
4125 * gives us a chance to flush the journal completely and mark the fs clean. 4165 * gives us a chance to flush the journal completely and mark the fs clean.
4166 *
4167 * Note that only this function cannot bring a filesystem to be in a clean
4168 * state independently, because ext4 prevents a new handle from being started
4169 * by @sb->s_frozen, which stays in an upper layer. It thus needs help from
4170 * the upper layer.
4126 */ 4171 */
4127static int ext4_freeze(struct super_block *sb) 4172static int ext4_freeze(struct super_block *sb)
4128{ 4173{
@@ -4599,17 +4644,30 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4599 4644
4600static int ext4_quota_off(struct super_block *sb, int type) 4645static int ext4_quota_off(struct super_block *sb, int type)
4601{ 4646{
4647 struct inode *inode = sb_dqopt(sb)->files[type];
4648 handle_t *handle;
4649
4602 /* Force all delayed allocation blocks to be allocated. 4650 /* Force all delayed allocation blocks to be allocated.
4603 * Caller already holds s_umount sem */ 4651 * Caller already holds s_umount sem */
4604 if (test_opt(sb, DELALLOC)) 4652 if (test_opt(sb, DELALLOC))
4605 sync_filesystem(sb); 4653 sync_filesystem(sb);
4606 4654
4655 /* Update modification times of quota files when userspace can
4656 * start looking at them */
4657 handle = ext4_journal_start(inode, 1);
4658 if (IS_ERR(handle))
4659 goto out;
4660 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
4661 ext4_mark_inode_dirty(handle, inode);
4662 ext4_journal_stop(handle);
4663
4664out:
4607 return dquot_quota_off(sb, type); 4665 return dquot_quota_off(sb, type);
4608} 4666}
4609 4667
4610/* Read data from quotafile - avoid pagecache and such because we cannot afford 4668/* Read data from quotafile - avoid pagecache and such because we cannot afford
4611 * acquiring the locks... As quota files are never truncated and quota code 4669 * acquiring the locks... As quota files are never truncated and quota code
4612 * itself serializes the operations (and noone else should touch the files) 4670 * itself serializes the operations (and no one else should touch the files)
4613 * we don't have to be afraid of races */ 4671 * we don't have to be afraid of races */
4614static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 4672static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
4615 size_t len, loff_t off) 4673 size_t len, loff_t off)
@@ -4699,9 +4757,8 @@ out:
4699 if (inode->i_size < off + len) { 4757 if (inode->i_size < off + len) {
4700 i_size_write(inode, off + len); 4758 i_size_write(inode, off + len);
4701 EXT4_I(inode)->i_disksize = inode->i_size; 4759 EXT4_I(inode)->i_disksize = inode->i_size;
4760 ext4_mark_inode_dirty(handle, inode);
4702 } 4761 }
4703 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
4704 ext4_mark_inode_dirty(handle, inode);
4705 mutex_unlock(&inode->i_mutex); 4762 mutex_unlock(&inode->i_mutex);
4706 return len; 4763 return len;
4707} 4764}
@@ -4768,7 +4825,7 @@ static struct file_system_type ext4_fs_type = {
4768 .fs_flags = FS_REQUIRES_DEV, 4825 .fs_flags = FS_REQUIRES_DEV,
4769}; 4826};
4770 4827
4771int __init ext4_init_feat_adverts(void) 4828static int __init ext4_init_feat_adverts(void)
4772{ 4829{
4773 struct ext4_features *ef; 4830 struct ext4_features *ef;
4774 int ret = -ENOMEM; 4831 int ret = -ENOMEM;
@@ -4792,23 +4849,44 @@ out:
4792 return ret; 4849 return ret;
4793} 4850}
4794 4851
4852static void ext4_exit_feat_adverts(void)
4853{
4854 kobject_put(&ext4_feat->f_kobj);
4855 wait_for_completion(&ext4_feat->f_kobj_unregister);
4856 kfree(ext4_feat);
4857}
4858
4859/* Shared across all ext4 file systems */
4860wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
4861struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
4862
4795static int __init ext4_init_fs(void) 4863static int __init ext4_init_fs(void)
4796{ 4864{
4797 int err; 4865 int i, err;
4798 4866
4799 ext4_check_flag_values(); 4867 ext4_check_flag_values();
4868
4869 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
4870 mutex_init(&ext4__aio_mutex[i]);
4871 init_waitqueue_head(&ext4__ioend_wq[i]);
4872 }
4873
4800 err = ext4_init_pageio(); 4874 err = ext4_init_pageio();
4801 if (err) 4875 if (err)
4802 return err; 4876 return err;
4803 err = ext4_init_system_zone(); 4877 err = ext4_init_system_zone();
4804 if (err) 4878 if (err)
4805 goto out5; 4879 goto out7;
4806 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 4880 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
4807 if (!ext4_kset) 4881 if (!ext4_kset)
4808 goto out4; 4882 goto out6;
4809 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 4883 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4884 if (!ext4_proc_root)
4885 goto out5;
4810 4886
4811 err = ext4_init_feat_adverts(); 4887 err = ext4_init_feat_adverts();
4888 if (err)
4889 goto out4;
4812 4890
4813 err = ext4_init_mballoc(); 4891 err = ext4_init_mballoc();
4814 if (err) 4892 if (err)
@@ -4838,12 +4916,14 @@ out1:
4838out2: 4916out2:
4839 ext4_exit_mballoc(); 4917 ext4_exit_mballoc();
4840out3: 4918out3:
4841 kfree(ext4_feat); 4919 ext4_exit_feat_adverts();
4920out4:
4842 remove_proc_entry("fs/ext4", NULL); 4921 remove_proc_entry("fs/ext4", NULL);
4922out5:
4843 kset_unregister(ext4_kset); 4923 kset_unregister(ext4_kset);
4844out4: 4924out6:
4845 ext4_exit_system_zone(); 4925 ext4_exit_system_zone();
4846out5: 4926out7:
4847 ext4_exit_pageio(); 4927 ext4_exit_pageio();
4848 return err; 4928 return err;
4849} 4929}
@@ -4857,6 +4937,7 @@ static void __exit ext4_exit_fs(void)
4857 destroy_inodecache(); 4937 destroy_inodecache();
4858 ext4_exit_xattr(); 4938 ext4_exit_xattr();
4859 ext4_exit_mballoc(); 4939 ext4_exit_mballoc();
4940 ext4_exit_feat_adverts();
4860 remove_proc_entry("fs/ext4", NULL); 4941 remove_proc_entry("fs/ext4", NULL);
4861 kset_unregister(ext4_kset); 4942 kset_unregister(ext4_kset);
4862 ext4_exit_system_zone(); 4943 ext4_exit_system_zone();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fc32176eee39..b545ca1c459c 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -735,7 +735,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
735 int offset = (char *)s->here - bs->bh->b_data; 735 int offset = (char *)s->here - bs->bh->b_data;
736 736
737 unlock_buffer(bs->bh); 737 unlock_buffer(bs->bh);
738 jbd2_journal_release_buffer(handle, bs->bh); 738 ext4_handle_release_buffer(handle, bs->bh);
739 if (ce) { 739 if (ce) {
740 mb_cache_entry_release(ce); 740 mb_cache_entry_release(ce);
741 ce = NULL; 741 ce = NULL;
@@ -833,7 +833,7 @@ inserted:
833 new_bh = sb_getblk(sb, block); 833 new_bh = sb_getblk(sb, block);
834 if (!new_bh) { 834 if (!new_bh) {
835getblk_failed: 835getblk_failed:
836 ext4_free_blocks(handle, inode, 0, block, 1, 836 ext4_free_blocks(handle, inode, NULL, block, 1,
837 EXT4_FREE_BLOCKS_METADATA); 837 EXT4_FREE_BLOCKS_METADATA);
838 error = -EIO; 838 error = -EIO;
839 goto cleanup; 839 goto cleanup;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 1ef16520b950..25b7387ff183 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -145,10 +145,10 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
145 145
146#ifdef CONFIG_EXT4_FS_SECURITY 146#ifdef CONFIG_EXT4_FS_SECURITY
147extern int ext4_init_security(handle_t *handle, struct inode *inode, 147extern int ext4_init_security(handle_t *handle, struct inode *inode,
148 struct inode *dir); 148 struct inode *dir, const struct qstr *qstr);
149#else 149#else
150static inline int ext4_init_security(handle_t *handle, struct inode *inode, 150static inline int ext4_init_security(handle_t *handle, struct inode *inode,
151 struct inode *dir) 151 struct inode *dir, const struct qstr *qstr)
152{ 152{
153 return 0; 153 return 0;
154} 154}
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 9b21268e121c..007c3bfbf094 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -49,14 +49,15 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
49} 49}
50 50
51int 51int
52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir) 52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
53 const struct qstr *qstr)
53{ 54{
54 int err; 55 int err;
55 size_t len; 56 size_t len;
56 void *value; 57 void *value;
57 char *name; 58 char *name;
58 59
59 err = security_inode_init_security(inode, dir, &name, &value, &len); 60 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
60 if (err) { 61 if (err) {
61 if (err == -EOPNOTSUPP) 62 if (err == -EOPNOTSUPP)
62 return 0; 63 return 0;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 86753fe10bd1..8d68690bdcf1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -236,7 +236,6 @@ static const struct address_space_operations fat_aops = {
236 .readpages = fat_readpages, 236 .readpages = fat_readpages,
237 .writepage = fat_writepage, 237 .writepage = fat_writepage,
238 .writepages = fat_writepages, 238 .writepages = fat_writepages,
239 .sync_page = block_sync_page,
240 .write_begin = fat_write_begin, 239 .write_begin = fat_write_begin,
241 .write_end = fat_write_end, 240 .write_end = fat_write_end,
242 .direct_IO = fat_direct_IO, 241 .direct_IO = fat_direct_IO,
@@ -757,8 +756,10 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
757 struct inode *inode = de->d_inode; 756 struct inode *inode = de->d_inode;
758 u32 ipos_h, ipos_m, ipos_l; 757 u32 ipos_h, ipos_m, ipos_l;
759 758
760 if (len < 5) 759 if (len < 5) {
760 *lenp = 5;
761 return 255; /* no room */ 761 return 255; /* no room */
762 }
762 763
763 ipos_h = MSDOS_I(inode)->i_pos >> 8; 764 ipos_h = MSDOS_I(inode)->i_pos >> 8;
764 ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24; 765 ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index f88f752babd9..adae3fb7451a 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -43,7 +43,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
43 43
44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) 44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
45{ 45{
46 if (nd->flags & LOOKUP_RCU) 46 if (nd && nd->flags & LOOKUP_RCU)
47 return -ECHILD; 47 return -ECHILD;
48 48
49 /* This is not negative dentry. Always valid. */ 49 /* This is not negative dentry. Always valid. */
@@ -54,7 +54,7 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
54 54
55static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) 55static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
56{ 56{
57 if (nd->flags & LOOKUP_RCU) 57 if (nd && nd->flags & LOOKUP_RCU)
58 return -ECHILD; 58 return -ECHILD;
59 59
60 /* 60 /*
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ecc8b3954ed6..22764c7c8382 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -131,7 +131,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
131SYSCALL_DEFINE1(dup, unsigned int, fildes) 131SYSCALL_DEFINE1(dup, unsigned int, fildes)
132{ 132{
133 int ret = -EBADF; 133 int ret = -EBADF;
134 struct file *file = fget(fildes); 134 struct file *file = fget_raw(fildes);
135 135
136 if (file) { 136 if (file) {
137 ret = get_unused_fd(); 137 ret = get_unused_fd();
@@ -159,7 +159,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
159 159
160 /* O_NOATIME can only be set by the owner or superuser */ 160 /* O_NOATIME can only be set by the owner or superuser */
161 if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) 161 if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
162 if (!is_owner_or_cap(inode)) 162 if (!inode_owner_or_capable(inode))
163 return -EPERM; 163 return -EPERM;
164 164
165 /* required for strict SunOS emulation */ 165 /* required for strict SunOS emulation */
@@ -426,15 +426,35 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
426 return err; 426 return err;
427} 427}
428 428
429static int check_fcntl_cmd(unsigned cmd)
430{
431 switch (cmd) {
432 case F_DUPFD:
433 case F_DUPFD_CLOEXEC:
434 case F_GETFD:
435 case F_SETFD:
436 case F_GETFL:
437 return 1;
438 }
439 return 0;
440}
441
429SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) 442SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
430{ 443{
431 struct file *filp; 444 struct file *filp;
432 long err = -EBADF; 445 long err = -EBADF;
433 446
434 filp = fget(fd); 447 filp = fget_raw(fd);
435 if (!filp) 448 if (!filp)
436 goto out; 449 goto out;
437 450
451 if (unlikely(filp->f_mode & FMODE_PATH)) {
452 if (!check_fcntl_cmd(cmd)) {
453 fput(filp);
454 goto out;
455 }
456 }
457
438 err = security_file_fcntl(filp, cmd, arg); 458 err = security_file_fcntl(filp, cmd, arg);
439 if (err) { 459 if (err) {
440 fput(filp); 460 fput(filp);
@@ -456,10 +476,17 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
456 long err; 476 long err;
457 477
458 err = -EBADF; 478 err = -EBADF;
459 filp = fget(fd); 479 filp = fget_raw(fd);
460 if (!filp) 480 if (!filp)
461 goto out; 481 goto out;
462 482
483 if (unlikely(filp->f_mode & FMODE_PATH)) {
484 if (!check_fcntl_cmd(cmd)) {
485 fput(filp);
486 goto out;
487 }
488 }
489
463 err = security_file_fcntl(filp, cmd, arg); 490 err = security_file_fcntl(filp, cmd, arg);
464 if (err) { 491 if (err) {
465 fput(filp); 492 fput(filp);
@@ -808,14 +835,14 @@ static int __init fcntl_init(void)
808 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY 835 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
809 * is defined as O_NONBLOCK on some platforms and not on others. 836 * is defined as O_NONBLOCK on some platforms and not on others.
810 */ 837 */
811 BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( 838 BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
812 O_RDONLY | O_WRONLY | O_RDWR | 839 O_RDONLY | O_WRONLY | O_RDWR |
813 O_CREAT | O_EXCL | O_NOCTTY | 840 O_CREAT | O_EXCL | O_NOCTTY |
814 O_TRUNC | O_APPEND | /* O_NONBLOCK | */ 841 O_TRUNC | O_APPEND | /* O_NONBLOCK | */
815 __O_SYNC | O_DSYNC | FASYNC | 842 __O_SYNC | O_DSYNC | FASYNC |
816 O_DIRECT | O_LARGEFILE | O_DIRECTORY | 843 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
817 O_NOFOLLOW | O_NOATIME | O_CLOEXEC | 844 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
818 FMODE_EXEC 845 __FMODE_EXEC | O_PATH
819 )); 846 ));
820 847
821 fasync_cache = kmem_cache_create("fasync_cache", 848 fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/fs/fhandle.c b/fs/fhandle.c
new file mode 100644
index 000000000000..6b088641f5bf
--- /dev/null
+++ b/fs/fhandle.c
@@ -0,0 +1,266 @@
1#include <linux/syscalls.h>
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/file.h>
5#include <linux/mount.h>
6#include <linux/namei.h>
7#include <linux/exportfs.h>
8#include <linux/fs_struct.h>
9#include <linux/fsnotify.h>
10#include <linux/personality.h>
11#include <asm/uaccess.h>
12#include "internal.h"
13
14static long do_sys_name_to_handle(struct path *path,
15 struct file_handle __user *ufh,
16 int __user *mnt_id)
17{
18 long retval;
19 struct file_handle f_handle;
20 int handle_dwords, handle_bytes;
21 struct file_handle *handle = NULL;
22
23 /*
24 * We need t make sure wether the file system
25 * support decoding of the file handle
26 */
27 if (!path->mnt->mnt_sb->s_export_op ||
28 !path->mnt->mnt_sb->s_export_op->fh_to_dentry)
29 return -EOPNOTSUPP;
30
31 if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
32 return -EFAULT;
33
34 if (f_handle.handle_bytes > MAX_HANDLE_SZ)
35 return -EINVAL;
36
37 handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
38 GFP_KERNEL);
39 if (!handle)
40 return -ENOMEM;
41
42 /* convert handle size to multiple of sizeof(u32) */
43 handle_dwords = f_handle.handle_bytes >> 2;
44
45 /* we ask for a non connected handle */
46 retval = exportfs_encode_fh(path->dentry,
47 (struct fid *)handle->f_handle,
48 &handle_dwords, 0);
49 handle->handle_type = retval;
50 /* convert handle size to bytes */
51 handle_bytes = handle_dwords * sizeof(u32);
52 handle->handle_bytes = handle_bytes;
53 if ((handle->handle_bytes > f_handle.handle_bytes) ||
54 (retval == 255) || (retval == -ENOSPC)) {
55 /* As per old exportfs_encode_fh documentation
56 * we could return ENOSPC to indicate overflow
57 * But file system returned 255 always. So handle
58 * both the values
59 */
60 /*
61 * set the handle size to zero so we copy only
62 * non variable part of the file_handle
63 */
64 handle_bytes = 0;
65 retval = -EOVERFLOW;
66 } else
67 retval = 0;
68 /* copy the mount id */
69 if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) ||
70 copy_to_user(ufh, handle,
71 sizeof(struct file_handle) + handle_bytes))
72 retval = -EFAULT;
73 kfree(handle);
74 return retval;
75}
76
77/**
78 * sys_name_to_handle_at: convert name to handle
79 * @dfd: directory relative to which name is interpreted if not absolute
80 * @name: name that should be converted to handle.
81 * @handle: resulting file handle
82 * @mnt_id: mount id of the file system containing the file
83 * @flag: flag value to indicate whether to follow symlink or not
84 *
85 * @handle->handle_size indicate the space available to store the
86 * variable part of the file handle in bytes. If there is not
87 * enough space, the field is updated to return the minimum
88 * value required.
89 */
90SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
91 struct file_handle __user *, handle, int __user *, mnt_id,
92 int, flag)
93{
94 struct path path;
95 int lookup_flags;
96 int err;
97
98 if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
99 return -EINVAL;
100
101 lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
102 if (flag & AT_EMPTY_PATH)
103 lookup_flags |= LOOKUP_EMPTY;
104 err = user_path_at(dfd, name, lookup_flags, &path);
105 if (!err) {
106 err = do_sys_name_to_handle(&path, handle, mnt_id);
107 path_put(&path);
108 }
109 return err;
110}
111
112static struct vfsmount *get_vfsmount_from_fd(int fd)
113{
114 struct path path;
115
116 if (fd == AT_FDCWD) {
117 struct fs_struct *fs = current->fs;
118 spin_lock(&fs->lock);
119 path = fs->pwd;
120 mntget(path.mnt);
121 spin_unlock(&fs->lock);
122 } else {
123 int fput_needed;
124 struct file *file = fget_light(fd, &fput_needed);
125 if (!file)
126 return ERR_PTR(-EBADF);
127 path = file->f_path;
128 mntget(path.mnt);
129 fput_light(file, fput_needed);
130 }
131 return path.mnt;
132}
133
134static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
135{
136 return 1;
137}
138
139static int do_handle_to_path(int mountdirfd, struct file_handle *handle,
140 struct path *path)
141{
142 int retval = 0;
143 int handle_dwords;
144
145 path->mnt = get_vfsmount_from_fd(mountdirfd);
146 if (IS_ERR(path->mnt)) {
147 retval = PTR_ERR(path->mnt);
148 goto out_err;
149 }
150 /* change the handle size to multiple of sizeof(u32) */
151 handle_dwords = handle->handle_bytes >> 2;
152 path->dentry = exportfs_decode_fh(path->mnt,
153 (struct fid *)handle->f_handle,
154 handle_dwords, handle->handle_type,
155 vfs_dentry_acceptable, NULL);
156 if (IS_ERR(path->dentry)) {
157 retval = PTR_ERR(path->dentry);
158 goto out_mnt;
159 }
160 return 0;
161out_mnt:
162 mntput(path->mnt);
163out_err:
164 return retval;
165}
166
167static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
168 struct path *path)
169{
170 int retval = 0;
171 struct file_handle f_handle;
172 struct file_handle *handle = NULL;
173
174 /*
175 * With handle we don't look at the execute bit on the
176 * the directory. Ideally we would like CAP_DAC_SEARCH.
177 * But we don't have that
178 */
179 if (!capable(CAP_DAC_READ_SEARCH)) {
180 retval = -EPERM;
181 goto out_err;
182 }
183 if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
184 retval = -EFAULT;
185 goto out_err;
186 }
187 if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
188 (f_handle.handle_bytes == 0)) {
189 retval = -EINVAL;
190 goto out_err;
191 }
192 handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
193 GFP_KERNEL);
194 if (!handle) {
195 retval = -ENOMEM;
196 goto out_err;
197 }
198 /* copy the full handle */
199 if (copy_from_user(handle, ufh,
200 sizeof(struct file_handle) +
201 f_handle.handle_bytes)) {
202 retval = -EFAULT;
203 goto out_handle;
204 }
205
206 retval = do_handle_to_path(mountdirfd, handle, path);
207
208out_handle:
209 kfree(handle);
210out_err:
211 return retval;
212}
213
214long do_handle_open(int mountdirfd,
215 struct file_handle __user *ufh, int open_flag)
216{
217 long retval = 0;
218 struct path path;
219 struct file *file;
220 int fd;
221
222 retval = handle_to_path(mountdirfd, ufh, &path);
223 if (retval)
224 return retval;
225
226 fd = get_unused_fd_flags(open_flag);
227 if (fd < 0) {
228 path_put(&path);
229 return fd;
230 }
231 file = file_open_root(path.dentry, path.mnt, "", open_flag);
232 if (IS_ERR(file)) {
233 put_unused_fd(fd);
234 retval = PTR_ERR(file);
235 } else {
236 retval = fd;
237 fsnotify_open(file);
238 fd_install(fd, file);
239 }
240 path_put(&path);
241 return retval;
242}
243
244/**
245 * sys_open_by_handle_at: Open the file handle
246 * @mountdirfd: directory file descriptor
247 * @handle: file handle to be opened
248 * @flag: open flags.
249 *
250 * @mountdirfd indicate the directory file descriptor
251 * of the mount point. file handle is decoded relative
252 * to the vfsmount pointed by the @mountdirfd. @flags
253 * value is same as the open(2) flags.
254 */
255SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
256 struct file_handle __user *, handle,
257 int, flags)
258{
259 long ret;
260
261 if (force_o_largefile())
262 flags |= O_LARGEFILE;
263
264 ret = do_handle_open(mountdirfd, handle, flags);
265 return ret;
266}
diff --git a/fs/fifo.c b/fs/fifo.c
index 4e303c22d5ee..b1a524d798e7 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -66,8 +66,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
66 /* suppress POLLHUP until we have 66 /* suppress POLLHUP until we have
67 * seen a writer */ 67 * seen a writer */
68 filp->f_version = pipe->w_counter; 68 filp->f_version = pipe->w_counter;
69 } else 69 } else {
70 {
71 wait_for_partner(inode, &pipe->w_counter); 70 wait_for_partner(inode, &pipe->w_counter);
72 if(signal_pending(current)) 71 if(signal_pending(current))
73 goto err_rd; 72 goto err_rd;
diff --git a/fs/file_table.c b/fs/file_table.c
index c3e89adf53c0..01e4c1e8e6b6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -125,13 +125,13 @@ struct file *get_empty_filp(void)
125 goto fail; 125 goto fail;
126 126
127 percpu_counter_inc(&nr_files); 127 percpu_counter_inc(&nr_files);
128 f->f_cred = get_cred(cred);
128 if (security_file_alloc(f)) 129 if (security_file_alloc(f))
129 goto fail_sec; 130 goto fail_sec;
130 131
131 INIT_LIST_HEAD(&f->f_u.fu_list); 132 INIT_LIST_HEAD(&f->f_u.fu_list);
132 atomic_long_set(&f->f_count, 1); 133 atomic_long_set(&f->f_count, 1);
133 rwlock_init(&f->f_owner.lock); 134 rwlock_init(&f->f_owner.lock);
134 f->f_cred = get_cred(cred);
135 spin_lock_init(&f->f_lock); 135 spin_lock_init(&f->f_lock);
136 eventpoll_init_file(f); 136 eventpoll_init_file(f);
137 /* f->f_version: 0 */ 137 /* f->f_version: 0 */
@@ -190,7 +190,8 @@ struct file *alloc_file(struct path *path, fmode_t mode,
190 file_take_write(file); 190 file_take_write(file);
191 WARN_ON(mnt_clone_write(path->mnt)); 191 WARN_ON(mnt_clone_write(path->mnt));
192 } 192 }
193 ima_counts_get(file); 193 if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
194 i_readcount_inc(path->dentry->d_inode);
194 return file; 195 return file;
195} 196}
196EXPORT_SYMBOL(alloc_file); 197EXPORT_SYMBOL(alloc_file);
@@ -246,11 +247,15 @@ static void __fput(struct file *file)
246 file->f_op->release(inode, file); 247 file->f_op->release(inode, file);
247 security_file_free(file); 248 security_file_free(file);
248 ima_file_free(file); 249 ima_file_free(file);
249 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) 250 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
251 !(file->f_mode & FMODE_PATH))) {
250 cdev_put(inode->i_cdev); 252 cdev_put(inode->i_cdev);
253 }
251 fops_put(file->f_op); 254 fops_put(file->f_op);
252 put_pid(file->f_owner.pid); 255 put_pid(file->f_owner.pid);
253 file_sb_list_del(file); 256 file_sb_list_del(file);
257 if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
258 i_readcount_dec(inode);
254 if (file->f_mode & FMODE_WRITE) 259 if (file->f_mode & FMODE_WRITE)
255 drop_file_write_access(file); 260 drop_file_write_access(file);
256 file->f_path.dentry = NULL; 261 file->f_path.dentry = NULL;
@@ -276,11 +281,10 @@ struct file *fget(unsigned int fd)
276 rcu_read_lock(); 281 rcu_read_lock();
277 file = fcheck_files(files, fd); 282 file = fcheck_files(files, fd);
278 if (file) { 283 if (file) {
279 if (!atomic_long_inc_not_zero(&file->f_count)) { 284 /* File object ref couldn't be taken */
280 /* File object ref couldn't be taken */ 285 if (file->f_mode & FMODE_PATH ||
281 rcu_read_unlock(); 286 !atomic_long_inc_not_zero(&file->f_count))
282 return NULL; 287 file = NULL;
283 }
284 } 288 }
285 rcu_read_unlock(); 289 rcu_read_unlock();
286 290
@@ -289,6 +293,25 @@ struct file *fget(unsigned int fd)
289 293
290EXPORT_SYMBOL(fget); 294EXPORT_SYMBOL(fget);
291 295
296struct file *fget_raw(unsigned int fd)
297{
298 struct file *file;
299 struct files_struct *files = current->files;
300
301 rcu_read_lock();
302 file = fcheck_files(files, fd);
303 if (file) {
304 /* File object ref couldn't be taken */
305 if (!atomic_long_inc_not_zero(&file->f_count))
306 file = NULL;
307 }
308 rcu_read_unlock();
309
310 return file;
311}
312
313EXPORT_SYMBOL(fget_raw);
314
292/* 315/*
293 * Lightweight file lookup - no refcnt increment if fd table isn't shared. 316 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
294 * 317 *
@@ -313,6 +336,33 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
313 *fput_needed = 0; 336 *fput_needed = 0;
314 if (atomic_read(&files->count) == 1) { 337 if (atomic_read(&files->count) == 1) {
315 file = fcheck_files(files, fd); 338 file = fcheck_files(files, fd);
339 if (file && (file->f_mode & FMODE_PATH))
340 file = NULL;
341 } else {
342 rcu_read_lock();
343 file = fcheck_files(files, fd);
344 if (file) {
345 if (!(file->f_mode & FMODE_PATH) &&
346 atomic_long_inc_not_zero(&file->f_count))
347 *fput_needed = 1;
348 else
349 /* Didn't get the reference, someone's freed */
350 file = NULL;
351 }
352 rcu_read_unlock();
353 }
354
355 return file;
356}
357
358struct file *fget_raw_light(unsigned int fd, int *fput_needed)
359{
360 struct file *file;
361 struct files_struct *files = current->files;
362
363 *fput_needed = 0;
364 if (atomic_read(&files->count) == 1) {
365 file = fcheck_files(files, fd);
316 } else { 366 } else {
317 rcu_read_lock(); 367 rcu_read_lock();
318 file = fcheck_files(files, fd); 368 file = fcheck_files(files, fd);
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 751d6b255a12..0845f84f2a5f 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -110,14 +110,13 @@ int unregister_filesystem(struct file_system_type * fs)
110 *tmp = fs->next; 110 *tmp = fs->next;
111 fs->next = NULL; 111 fs->next = NULL;
112 write_unlock(&file_systems_lock); 112 write_unlock(&file_systems_lock);
113 synchronize_rcu();
113 return 0; 114 return 0;
114 } 115 }
115 tmp = &(*tmp)->next; 116 tmp = &(*tmp)->next;
116 } 117 }
117 write_unlock(&file_systems_lock); 118 write_unlock(&file_systems_lock);
118 119
119 synchronize_rcu();
120
121 return -EINVAL; 120 return -EINVAL;
122} 121}
123 122
diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c
index 78948b4b1894..c9a6a94e58e9 100644
--- a/fs/freevxfs/vxfs_fshead.c
+++ b/fs/freevxfs/vxfs_fshead.c
@@ -164,7 +164,7 @@ vxfs_read_fshead(struct super_block *sbp)
164 goto out_free_pfp; 164 goto out_free_pfp;
165 } 165 }
166 if (!VXFS_ISILT(VXFS_INO(infp->vsi_stilist))) { 166 if (!VXFS_ISILT(VXFS_INO(infp->vsi_stilist))) {
167 printk(KERN_ERR "vxfs: structual list inode is of wrong type (%x)\n", 167 printk(KERN_ERR "vxfs: structural list inode is of wrong type (%x)\n",
168 VXFS_INO(infp->vsi_stilist)->vii_mode & VXFS_TYPE_MASK); 168 VXFS_INO(infp->vsi_stilist)->vii_mode & VXFS_TYPE_MASK);
169 goto out_iput_stilist; 169 goto out_iput_stilist;
170 } 170 }
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 6c5131d592f0..3360f1e678ad 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -162,7 +162,7 @@ vxfs_find_entry(struct inode *ip, struct dentry *dp, struct page **ppp)
162/** 162/**
163 * vxfs_inode_by_name - find inode number for dentry 163 * vxfs_inode_by_name - find inode number for dentry
164 * @dip: directory to search in 164 * @dip: directory to search in
165 * @dp: dentry we seach for 165 * @dp: dentry we search for
166 * 166 *
167 * Description: 167 * Description:
168 * vxfs_inode_by_name finds out the inode number of 168 * vxfs_inode_by_name finds out the inode number of
diff --git a/fs/freevxfs/vxfs_olt.h b/fs/freevxfs/vxfs_olt.h
index d8324296486f..b7b3af502615 100644
--- a/fs/freevxfs/vxfs_olt.h
+++ b/fs/freevxfs/vxfs_olt.h
@@ -60,7 +60,7 @@ enum {
60 * 60 *
61 * The Object Location Table header is placed at the beginning of each 61 * The Object Location Table header is placed at the beginning of each
62 * OLT extent. It is used to fing certain filesystem-wide metadata, e.g. 62 * OLT extent. It is used to fing certain filesystem-wide metadata, e.g.
63 * the inital inode list, the fileset header or the device configuration. 63 * the initial inode list, the fileset header or the device configuration.
64 */ 64 */
65struct vxfs_olt { 65struct vxfs_olt {
66 u_int32_t olt_magic; /* magic number */ 66 u_int32_t olt_magic; /* magic number */
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index 1429f3ae1e86..5d318c44f855 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -44,7 +44,6 @@ static sector_t vxfs_bmap(struct address_space *, sector_t);
44const struct address_space_operations vxfs_aops = { 44const struct address_space_operations vxfs_aops = {
45 .readpage = vxfs_readpage, 45 .readpage = vxfs_readpage,
46 .bmap = vxfs_bmap, 46 .bmap = vxfs_bmap,
47 .sync_page = block_sync_page,
48}; 47};
49 48
50inline void 49inline void
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 59c6e4956786..34591ee804b5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -144,7 +144,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
144 * 144 *
145 * Description: 145 * Description:
146 * This does WB_SYNC_NONE opportunistic writeback. The IO is only 146 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
147 * started when this function returns, we make no guarentees on 147 * started when this function returns, we make no guarantees on
148 * completion. Caller need not hold sb s_umount semaphore. 148 * completion. Caller need not hold sb s_umount semaphore.
149 * 149 *
150 */ 150 */
@@ -176,6 +176,17 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
176} 176}
177 177
178/* 178/*
179 * Remove the inode from the writeback list it is on.
180 */
181void inode_wb_list_del(struct inode *inode)
182{
183 spin_lock(&inode_wb_list_lock);
184 list_del_init(&inode->i_wb_list);
185 spin_unlock(&inode_wb_list_lock);
186}
187
188
189/*
179 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 190 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
180 * furthest end of its superblock's dirty-inode list. 191 * furthest end of its superblock's dirty-inode list.
181 * 192 *
@@ -188,6 +199,7 @@ static void redirty_tail(struct inode *inode)
188{ 199{
189 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 200 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
190 201
202 assert_spin_locked(&inode_wb_list_lock);
191 if (!list_empty(&wb->b_dirty)) { 203 if (!list_empty(&wb->b_dirty)) {
192 struct inode *tail; 204 struct inode *tail;
193 205
@@ -205,14 +217,17 @@ static void requeue_io(struct inode *inode)
205{ 217{
206 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 218 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
207 219
220 assert_spin_locked(&inode_wb_list_lock);
208 list_move(&inode->i_wb_list, &wb->b_more_io); 221 list_move(&inode->i_wb_list, &wb->b_more_io);
209} 222}
210 223
211static void inode_sync_complete(struct inode *inode) 224static void inode_sync_complete(struct inode *inode)
212{ 225{
213 /* 226 /*
214 * Prevent speculative execution through spin_unlock(&inode_lock); 227 * Prevent speculative execution through
228 * spin_unlock(&inode_wb_list_lock);
215 */ 229 */
230
216 smp_mb(); 231 smp_mb();
217 wake_up_bit(&inode->i_state, __I_SYNC); 232 wake_up_bit(&inode->i_state, __I_SYNC);
218} 233}
@@ -286,6 +301,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
286 */ 301 */
287static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 302static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
288{ 303{
304 assert_spin_locked(&inode_wb_list_lock);
289 list_splice_init(&wb->b_more_io, &wb->b_io); 305 list_splice_init(&wb->b_more_io, &wb->b_io);
290 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 306 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
291} 307}
@@ -306,25 +322,25 @@ static void inode_wait_for_writeback(struct inode *inode)
306 wait_queue_head_t *wqh; 322 wait_queue_head_t *wqh;
307 323
308 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 324 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
309 while (inode->i_state & I_SYNC) { 325 while (inode->i_state & I_SYNC) {
310 spin_unlock(&inode_lock); 326 spin_unlock(&inode->i_lock);
327 spin_unlock(&inode_wb_list_lock);
311 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 328 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
312 spin_lock(&inode_lock); 329 spin_lock(&inode_wb_list_lock);
330 spin_lock(&inode->i_lock);
313 } 331 }
314} 332}
315 333
316/* 334/*
317 * Write out an inode's dirty pages. Called under inode_lock. Either the 335 * Write out an inode's dirty pages. Called under inode_wb_list_lock and
318 * caller has ref on the inode (either via __iget or via syscall against an fd) 336 * inode->i_lock. Either the caller has an active reference on the inode or
319 * or the inode has I_WILL_FREE set (via generic_forget_inode) 337 * the inode has I_WILL_FREE set.
320 * 338 *
321 * If `wait' is set, wait on the writeout. 339 * If `wait' is set, wait on the writeout.
322 * 340 *
323 * The whole writeout design is quite complex and fragile. We want to avoid 341 * The whole writeout design is quite complex and fragile. We want to avoid
324 * starvation of particular inodes when others are being redirtied, prevent 342 * starvation of particular inodes when others are being redirtied, prevent
325 * livelocks, etc. 343 * livelocks, etc.
326 *
327 * Called under inode_lock.
328 */ 344 */
329static int 345static int
330writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 346writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
@@ -333,6 +349,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
333 unsigned dirty; 349 unsigned dirty;
334 int ret; 350 int ret;
335 351
352 assert_spin_locked(&inode_wb_list_lock);
353 assert_spin_locked(&inode->i_lock);
354
336 if (!atomic_read(&inode->i_count)) 355 if (!atomic_read(&inode->i_count))
337 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); 356 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
338 else 357 else
@@ -363,7 +382,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
363 /* Set I_SYNC, reset I_DIRTY_PAGES */ 382 /* Set I_SYNC, reset I_DIRTY_PAGES */
364 inode->i_state |= I_SYNC; 383 inode->i_state |= I_SYNC;
365 inode->i_state &= ~I_DIRTY_PAGES; 384 inode->i_state &= ~I_DIRTY_PAGES;
366 spin_unlock(&inode_lock); 385 spin_unlock(&inode->i_lock);
386 spin_unlock(&inode_wb_list_lock);
367 387
368 ret = do_writepages(mapping, wbc); 388 ret = do_writepages(mapping, wbc);
369 389
@@ -383,10 +403,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
383 * due to delalloc, clear dirty metadata flags right before 403 * due to delalloc, clear dirty metadata flags right before
384 * write_inode() 404 * write_inode()
385 */ 405 */
386 spin_lock(&inode_lock); 406 spin_lock(&inode->i_lock);
387 dirty = inode->i_state & I_DIRTY; 407 dirty = inode->i_state & I_DIRTY;
388 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); 408 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
389 spin_unlock(&inode_lock); 409 spin_unlock(&inode->i_lock);
390 /* Don't write the inode if only I_DIRTY_PAGES was set */ 410 /* Don't write the inode if only I_DIRTY_PAGES was set */
391 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 411 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
392 int err = write_inode(inode, wbc); 412 int err = write_inode(inode, wbc);
@@ -394,7 +414,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
394 ret = err; 414 ret = err;
395 } 415 }
396 416
397 spin_lock(&inode_lock); 417 spin_lock(&inode_wb_list_lock);
418 spin_lock(&inode->i_lock);
398 inode->i_state &= ~I_SYNC; 419 inode->i_state &= ~I_SYNC;
399 if (!(inode->i_state & I_FREEING)) { 420 if (!(inode->i_state & I_FREEING)) {
400 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 421 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
@@ -506,7 +527,9 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
506 * kind does not need peridic writeout yet, and for the latter 527 * kind does not need peridic writeout yet, and for the latter
507 * kind writeout is handled by the freer. 528 * kind writeout is handled by the freer.
508 */ 529 */
530 spin_lock(&inode->i_lock);
509 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 531 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
532 spin_unlock(&inode->i_lock);
510 requeue_io(inode); 533 requeue_io(inode);
511 continue; 534 continue;
512 } 535 }
@@ -515,10 +538,13 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
515 * Was this inode dirtied after sync_sb_inodes was called? 538 * Was this inode dirtied after sync_sb_inodes was called?
516 * This keeps sync from extra jobs and livelock. 539 * This keeps sync from extra jobs and livelock.
517 */ 540 */
518 if (inode_dirtied_after(inode, wbc->wb_start)) 541 if (inode_dirtied_after(inode, wbc->wb_start)) {
542 spin_unlock(&inode->i_lock);
519 return 1; 543 return 1;
544 }
520 545
521 __iget(inode); 546 __iget(inode);
547
522 pages_skipped = wbc->pages_skipped; 548 pages_skipped = wbc->pages_skipped;
523 writeback_single_inode(inode, wbc); 549 writeback_single_inode(inode, wbc);
524 if (wbc->pages_skipped != pages_skipped) { 550 if (wbc->pages_skipped != pages_skipped) {
@@ -528,10 +554,11 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
528 */ 554 */
529 redirty_tail(inode); 555 redirty_tail(inode);
530 } 556 }
531 spin_unlock(&inode_lock); 557 spin_unlock(&inode->i_lock);
558 spin_unlock(&inode_wb_list_lock);
532 iput(inode); 559 iput(inode);
533 cond_resched(); 560 cond_resched();
534 spin_lock(&inode_lock); 561 spin_lock(&inode_wb_list_lock);
535 if (wbc->nr_to_write <= 0) { 562 if (wbc->nr_to_write <= 0) {
536 wbc->more_io = 1; 563 wbc->more_io = 1;
537 return 1; 564 return 1;
@@ -550,7 +577,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
550 577
551 if (!wbc->wb_start) 578 if (!wbc->wb_start)
552 wbc->wb_start = jiffies; /* livelock avoidance */ 579 wbc->wb_start = jiffies; /* livelock avoidance */
553 spin_lock(&inode_lock); 580 spin_lock(&inode_wb_list_lock);
554 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 581 if (!wbc->for_kupdate || list_empty(&wb->b_io))
555 queue_io(wb, wbc->older_than_this); 582 queue_io(wb, wbc->older_than_this);
556 583
@@ -568,7 +595,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
568 if (ret) 595 if (ret)
569 break; 596 break;
570 } 597 }
571 spin_unlock(&inode_lock); 598 spin_unlock(&inode_wb_list_lock);
572 /* Leave any unwritten inodes on b_io */ 599 /* Leave any unwritten inodes on b_io */
573} 600}
574 601
@@ -577,11 +604,11 @@ static void __writeback_inodes_sb(struct super_block *sb,
577{ 604{
578 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 605 WARN_ON(!rwsem_is_locked(&sb->s_umount));
579 606
580 spin_lock(&inode_lock); 607 spin_lock(&inode_wb_list_lock);
581 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 608 if (!wbc->for_kupdate || list_empty(&wb->b_io))
582 queue_io(wb, wbc->older_than_this); 609 queue_io(wb, wbc->older_than_this);
583 writeback_sb_inodes(sb, wb, wbc, true); 610 writeback_sb_inodes(sb, wb, wbc, true);
584 spin_unlock(&inode_lock); 611 spin_unlock(&inode_wb_list_lock);
585} 612}
586 613
587/* 614/*
@@ -720,13 +747,15 @@ static long wb_writeback(struct bdi_writeback *wb,
720 * become available for writeback. Otherwise 747 * become available for writeback. Otherwise
721 * we'll just busyloop. 748 * we'll just busyloop.
722 */ 749 */
723 spin_lock(&inode_lock); 750 spin_lock(&inode_wb_list_lock);
724 if (!list_empty(&wb->b_more_io)) { 751 if (!list_empty(&wb->b_more_io)) {
725 inode = wb_inode(wb->b_more_io.prev); 752 inode = wb_inode(wb->b_more_io.prev);
726 trace_wbc_writeback_wait(&wbc, wb->bdi); 753 trace_wbc_writeback_wait(&wbc, wb->bdi);
754 spin_lock(&inode->i_lock);
727 inode_wait_for_writeback(inode); 755 inode_wait_for_writeback(inode);
756 spin_unlock(&inode->i_lock);
728 } 757 }
729 spin_unlock(&inode_lock); 758 spin_unlock(&inode_wb_list_lock);
730 } 759 }
731 760
732 return wrote; 761 return wrote;
@@ -992,7 +1021,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
992{ 1021{
993 struct super_block *sb = inode->i_sb; 1022 struct super_block *sb = inode->i_sb;
994 struct backing_dev_info *bdi = NULL; 1023 struct backing_dev_info *bdi = NULL;
995 bool wakeup_bdi = false;
996 1024
997 /* 1025 /*
998 * Don't do this for I_DIRTY_PAGES - that doesn't actually 1026 * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -1016,7 +1044,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1016 if (unlikely(block_dump)) 1044 if (unlikely(block_dump))
1017 block_dump___mark_inode_dirty(inode); 1045 block_dump___mark_inode_dirty(inode);
1018 1046
1019 spin_lock(&inode_lock); 1047 spin_lock(&inode->i_lock);
1020 if ((inode->i_state & flags) != flags) { 1048 if ((inode->i_state & flags) != flags) {
1021 const int was_dirty = inode->i_state & I_DIRTY; 1049 const int was_dirty = inode->i_state & I_DIRTY;
1022 1050
@@ -1028,7 +1056,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1028 * superblock list, based upon its state. 1056 * superblock list, based upon its state.
1029 */ 1057 */
1030 if (inode->i_state & I_SYNC) 1058 if (inode->i_state & I_SYNC)
1031 goto out; 1059 goto out_unlock_inode;
1032 1060
1033 /* 1061 /*
1034 * Only add valid (hashed) inodes to the superblock's 1062 * Only add valid (hashed) inodes to the superblock's
@@ -1036,16 +1064,17 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1036 */ 1064 */
1037 if (!S_ISBLK(inode->i_mode)) { 1065 if (!S_ISBLK(inode->i_mode)) {
1038 if (inode_unhashed(inode)) 1066 if (inode_unhashed(inode))
1039 goto out; 1067 goto out_unlock_inode;
1040 } 1068 }
1041 if (inode->i_state & I_FREEING) 1069 if (inode->i_state & I_FREEING)
1042 goto out; 1070 goto out_unlock_inode;
1043 1071
1044 /* 1072 /*
1045 * If the inode was already on b_dirty/b_io/b_more_io, don't 1073 * If the inode was already on b_dirty/b_io/b_more_io, don't
1046 * reposition it (that would break b_dirty time-ordering). 1074 * reposition it (that would break b_dirty time-ordering).
1047 */ 1075 */
1048 if (!was_dirty) { 1076 if (!was_dirty) {
1077 bool wakeup_bdi = false;
1049 bdi = inode_to_bdi(inode); 1078 bdi = inode_to_bdi(inode);
1050 1079
1051 if (bdi_cap_writeback_dirty(bdi)) { 1080 if (bdi_cap_writeback_dirty(bdi)) {
@@ -1062,15 +1091,20 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1062 wakeup_bdi = true; 1091 wakeup_bdi = true;
1063 } 1092 }
1064 1093
1094 spin_unlock(&inode->i_lock);
1095 spin_lock(&inode_wb_list_lock);
1065 inode->dirtied_when = jiffies; 1096 inode->dirtied_when = jiffies;
1066 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1097 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1098 spin_unlock(&inode_wb_list_lock);
1099
1100 if (wakeup_bdi)
1101 bdi_wakeup_thread_delayed(bdi);
1102 return;
1067 } 1103 }
1068 } 1104 }
1069out: 1105out_unlock_inode:
1070 spin_unlock(&inode_lock); 1106 spin_unlock(&inode->i_lock);
1071 1107
1072 if (wakeup_bdi)
1073 bdi_wakeup_thread_delayed(bdi);
1074} 1108}
1075EXPORT_SYMBOL(__mark_inode_dirty); 1109EXPORT_SYMBOL(__mark_inode_dirty);
1076 1110
@@ -1101,7 +1135,7 @@ static void wait_sb_inodes(struct super_block *sb)
1101 */ 1135 */
1102 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1136 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1103 1137
1104 spin_lock(&inode_lock); 1138 spin_lock(&inode_sb_list_lock);
1105 1139
1106 /* 1140 /*
1107 * Data integrity sync. Must wait for all pages under writeback, 1141 * Data integrity sync. Must wait for all pages under writeback,
@@ -1111,22 +1145,25 @@ static void wait_sb_inodes(struct super_block *sb)
1111 * we still have to wait for that writeout. 1145 * we still have to wait for that writeout.
1112 */ 1146 */
1113 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1147 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1114 struct address_space *mapping; 1148 struct address_space *mapping = inode->i_mapping;
1115 1149
1116 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 1150 spin_lock(&inode->i_lock);
1117 continue; 1151 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
1118 mapping = inode->i_mapping; 1152 (mapping->nrpages == 0)) {
1119 if (mapping->nrpages == 0) 1153 spin_unlock(&inode->i_lock);
1120 continue; 1154 continue;
1155 }
1121 __iget(inode); 1156 __iget(inode);
1122 spin_unlock(&inode_lock); 1157 spin_unlock(&inode->i_lock);
1158 spin_unlock(&inode_sb_list_lock);
1159
1123 /* 1160 /*
1124 * We hold a reference to 'inode' so it couldn't have 1161 * We hold a reference to 'inode' so it couldn't have been
1125 * been removed from s_inodes list while we dropped the 1162 * removed from s_inodes list while we dropped the
1126 * inode_lock. We cannot iput the inode now as we can 1163 * inode_sb_list_lock. We cannot iput the inode now as we can
1127 * be holding the last reference and we cannot iput it 1164 * be holding the last reference and we cannot iput it under
1128 * under inode_lock. So we keep the reference and iput 1165 * inode_sb_list_lock. So we keep the reference and iput it
1129 * it later. 1166 * later.
1130 */ 1167 */
1131 iput(old_inode); 1168 iput(old_inode);
1132 old_inode = inode; 1169 old_inode = inode;
@@ -1135,9 +1172,9 @@ static void wait_sb_inodes(struct super_block *sb)
1135 1172
1136 cond_resched(); 1173 cond_resched();
1137 1174
1138 spin_lock(&inode_lock); 1175 spin_lock(&inode_sb_list_lock);
1139 } 1176 }
1140 spin_unlock(&inode_lock); 1177 spin_unlock(&inode_sb_list_lock);
1141 iput(old_inode); 1178 iput(old_inode);
1142} 1179}
1143 1180
@@ -1271,9 +1308,11 @@ int write_inode_now(struct inode *inode, int sync)
1271 wbc.nr_to_write = 0; 1308 wbc.nr_to_write = 0;
1272 1309
1273 might_sleep(); 1310 might_sleep();
1274 spin_lock(&inode_lock); 1311 spin_lock(&inode_wb_list_lock);
1312 spin_lock(&inode->i_lock);
1275 ret = writeback_single_inode(inode, &wbc); 1313 ret = writeback_single_inode(inode, &wbc);
1276 spin_unlock(&inode_lock); 1314 spin_unlock(&inode->i_lock);
1315 spin_unlock(&inode_wb_list_lock);
1277 if (sync) 1316 if (sync)
1278 inode_sync_wait(inode); 1317 inode_sync_wait(inode);
1279 return ret; 1318 return ret;
@@ -1295,9 +1334,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
1295{ 1334{
1296 int ret; 1335 int ret;
1297 1336
1298 spin_lock(&inode_lock); 1337 spin_lock(&inode_wb_list_lock);
1338 spin_lock(&inode->i_lock);
1299 ret = writeback_single_inode(inode, wbc); 1339 ret = writeback_single_inode(inode, wbc);
1300 spin_unlock(&inode_lock); 1340 spin_unlock(&inode->i_lock);
1341 spin_unlock(&inode_wb_list_lock);
1301 return ret; 1342 return ret;
1302} 1343}
1303EXPORT_SYMBOL(sync_inode); 1344EXPORT_SYMBOL(sync_inode);
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 3e87cce5837d..b6cca47f7b07 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -305,7 +305,7 @@ static void cuse_gendev_release(struct device *dev)
305static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) 305static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
306{ 306{
307 struct cuse_conn *cc = fc_to_cc(fc); 307 struct cuse_conn *cc = fc_to_cc(fc);
308 struct cuse_init_out *arg = &req->misc.cuse_init_out; 308 struct cuse_init_out *arg = req->out.args[0].value;
309 struct page *page = req->pages[0]; 309 struct page *page = req->pages[0];
310 struct cuse_devinfo devinfo = { }; 310 struct cuse_devinfo devinfo = { };
311 struct device *dev; 311 struct device *dev;
@@ -384,6 +384,7 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
384 dev_set_uevent_suppress(dev, 0); 384 dev_set_uevent_suppress(dev, 0);
385 kobject_uevent(&dev->kobj, KOBJ_ADD); 385 kobject_uevent(&dev->kobj, KOBJ_ADD);
386out: 386out:
387 kfree(arg);
387 __free_page(page); 388 __free_page(page);
388 return; 389 return;
389 390
@@ -405,6 +406,7 @@ static int cuse_send_init(struct cuse_conn *cc)
405 struct page *page; 406 struct page *page;
406 struct fuse_conn *fc = &cc->fc; 407 struct fuse_conn *fc = &cc->fc;
407 struct cuse_init_in *arg; 408 struct cuse_init_in *arg;
409 void *outarg;
408 410
409 BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); 411 BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
410 412
@@ -419,6 +421,10 @@ static int cuse_send_init(struct cuse_conn *cc)
419 if (!page) 421 if (!page)
420 goto err_put_req; 422 goto err_put_req;
421 423
424 outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL);
425 if (!outarg)
426 goto err_free_page;
427
422 arg = &req->misc.cuse_init_in; 428 arg = &req->misc.cuse_init_in;
423 arg->major = FUSE_KERNEL_VERSION; 429 arg->major = FUSE_KERNEL_VERSION;
424 arg->minor = FUSE_KERNEL_MINOR_VERSION; 430 arg->minor = FUSE_KERNEL_MINOR_VERSION;
@@ -429,7 +435,7 @@ static int cuse_send_init(struct cuse_conn *cc)
429 req->in.args[0].value = arg; 435 req->in.args[0].value = arg;
430 req->out.numargs = 2; 436 req->out.numargs = 2;
431 req->out.args[0].size = sizeof(struct cuse_init_out); 437 req->out.args[0].size = sizeof(struct cuse_init_out);
432 req->out.args[0].value = &req->misc.cuse_init_out; 438 req->out.args[0].value = outarg;
433 req->out.args[1].size = CUSE_INIT_INFO_MAX; 439 req->out.args[1].size = CUSE_INIT_INFO_MAX;
434 req->out.argvar = 1; 440 req->out.argvar = 1;
435 req->out.argpages = 1; 441 req->out.argpages = 1;
@@ -440,6 +446,8 @@ static int cuse_send_init(struct cuse_conn *cc)
440 446
441 return 0; 447 return 0;
442 448
449err_free_page:
450 __free_page(page);
443err_put_req: 451err_put_req:
444 fuse_put_request(fc, req); 452 fuse_put_request(fc, req);
445err: 453err:
@@ -458,7 +466,7 @@ static void cuse_fc_release(struct fuse_conn *fc)
458 * @file: file struct being opened 466 * @file: file struct being opened
459 * 467 *
460 * Userland CUSE server can create a CUSE device by opening /dev/cuse 468 * Userland CUSE server can create a CUSE device by opening /dev/cuse
461 * and replying to the initilaization request kernel sends. This 469 * and replying to the initialization request kernel sends. This
462 * function is responsible for handling CUSE device initialization. 470 * function is responsible for handling CUSE device initialization.
463 * Because the fd opened by this function is used during 471 * Because the fd opened by this function is used during
464 * initialization, this function only creates cuse_conn and sends 472 * initialization, this function only creates cuse_conn and sends
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cf8d28d1fbad..640fc229df10 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -737,14 +737,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
737 if (WARN_ON(PageMlocked(oldpage))) 737 if (WARN_ON(PageMlocked(oldpage)))
738 goto out_fallback_unlock; 738 goto out_fallback_unlock;
739 739
740 remove_from_page_cache(oldpage); 740 err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
741 page_cache_release(oldpage);
742
743 err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL);
744 if (err) { 741 if (err) {
745 printk(KERN_WARNING "fuse_try_move_page: failed to add page"); 742 unlock_page(newpage);
746 goto out_fallback_unlock; 743 return err;
747 } 744 }
745
748 page_cache_get(newpage); 746 page_cache_get(newpage);
749 747
750 if (!(buf->flags & PIPE_BUF_FLAG_LRU)) 748 if (!(buf->flags & PIPE_BUF_FLAG_LRU))
@@ -1910,6 +1908,21 @@ __acquires(fc->lock)
1910 kfree(dequeue_forget(fc, 1, NULL)); 1908 kfree(dequeue_forget(fc, 1, NULL));
1911} 1909}
1912 1910
1911static void end_polls(struct fuse_conn *fc)
1912{
1913 struct rb_node *p;
1914
1915 p = rb_first(&fc->polled_files);
1916
1917 while (p) {
1918 struct fuse_file *ff;
1919 ff = rb_entry(p, struct fuse_file, polled_node);
1920 wake_up_interruptible_all(&ff->poll_wait);
1921
1922 p = rb_next(p);
1923 }
1924}
1925
1913/* 1926/*
1914 * Abort all requests. 1927 * Abort all requests.
1915 * 1928 *
@@ -1937,6 +1950,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
1937 fc->blocked = 0; 1950 fc->blocked = 0;
1938 end_io_requests(fc); 1951 end_io_requests(fc);
1939 end_queued_requests(fc); 1952 end_queued_requests(fc);
1953 end_polls(fc);
1940 wake_up_all(&fc->waitq); 1954 wake_up_all(&fc->waitq);
1941 wake_up_all(&fc->blocked_waitq); 1955 wake_up_all(&fc->blocked_waitq);
1942 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 1956 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
@@ -1953,6 +1967,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
1953 fc->connected = 0; 1967 fc->connected = 0;
1954 fc->blocked = 0; 1968 fc->blocked = 0;
1955 end_queued_requests(fc); 1969 end_queued_requests(fc);
1970 end_polls(fc);
1956 wake_up_all(&fc->blocked_waitq); 1971 wake_up_all(&fc->blocked_waitq);
1957 spin_unlock(&fc->lock); 1972 spin_unlock(&fc->lock);
1958 fuse_conn_put(fc); 1973 fuse_conn_put(fc);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index bfed8447ed80..c6ba49bd95b3 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -158,10 +158,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
158{ 158{
159 struct inode *inode; 159 struct inode *inode;
160 160
161 if (nd->flags & LOOKUP_RCU) 161 inode = ACCESS_ONCE(entry->d_inode);
162 return -ECHILD;
163
164 inode = entry->d_inode;
165 if (inode && is_bad_inode(inode)) 162 if (inode && is_bad_inode(inode))
166 return 0; 163 return 0;
167 else if (fuse_dentry_time(entry) < get_jiffies_64()) { 164 else if (fuse_dentry_time(entry) < get_jiffies_64()) {
@@ -177,6 +174,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
177 if (!inode) 174 if (!inode)
178 return 0; 175 return 0;
179 176
177 if (nd->flags & LOOKUP_RCU)
178 return -ECHILD;
179
180 fc = get_fuse_conn(inode); 180 fc = get_fuse_conn(inode);
181 req = fuse_get_req(fc); 181 req = fuse_get_req(fc);
182 if (IS_ERR(req)) 182 if (IS_ERR(req))
@@ -970,6 +970,14 @@ static int fuse_access(struct inode *inode, int mask)
970 return err; 970 return err;
971} 971}
972 972
973static int fuse_perm_getattr(struct inode *inode, int flags)
974{
975 if (flags & IPERM_FLAG_RCU)
976 return -ECHILD;
977
978 return fuse_do_getattr(inode, NULL, NULL);
979}
980
973/* 981/*
974 * Check permission. The two basic access models of FUSE are: 982 * Check permission. The two basic access models of FUSE are:
975 * 983 *
@@ -989,9 +997,6 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
989 bool refreshed = false; 997 bool refreshed = false;
990 int err = 0; 998 int err = 0;
991 999
992 if (flags & IPERM_FLAG_RCU)
993 return -ECHILD;
994
995 if (!fuse_allow_task(fc, current)) 1000 if (!fuse_allow_task(fc, current))
996 return -EACCES; 1001 return -EACCES;
997 1002
@@ -1000,9 +1005,15 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
1000 */ 1005 */
1001 if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) || 1006 if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) ||
1002 ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { 1007 ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) {
1003 err = fuse_update_attributes(inode, NULL, NULL, &refreshed); 1008 struct fuse_inode *fi = get_fuse_inode(inode);
1004 if (err) 1009
1005 return err; 1010 if (fi->i_time < get_jiffies_64()) {
1011 refreshed = true;
1012
1013 err = fuse_perm_getattr(inode, flags);
1014 if (err)
1015 return err;
1016 }
1006 } 1017 }
1007 1018
1008 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { 1019 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
@@ -1012,7 +1023,7 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
1012 attributes. This is also needed, because the root 1023 attributes. This is also needed, because the root
1013 node will at first have no permissions */ 1024 node will at first have no permissions */
1014 if (err == -EACCES && !refreshed) { 1025 if (err == -EACCES && !refreshed) {
1015 err = fuse_do_getattr(inode, NULL, NULL); 1026 err = fuse_perm_getattr(inode, flags);
1016 if (!err) 1027 if (!err)
1017 err = generic_permission(inode, mask, 1028 err = generic_permission(inode, mask,
1018 flags, NULL); 1029 flags, NULL);
@@ -1023,13 +1034,16 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
1023 noticed immediately, only after the attribute 1034 noticed immediately, only after the attribute
1024 timeout has expired */ 1035 timeout has expired */
1025 } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { 1036 } else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
1037 if (flags & IPERM_FLAG_RCU)
1038 return -ECHILD;
1039
1026 err = fuse_access(inode, mask); 1040 err = fuse_access(inode, mask);
1027 } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { 1041 } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
1028 if (!(inode->i_mode & S_IXUGO)) { 1042 if (!(inode->i_mode & S_IXUGO)) {
1029 if (refreshed) 1043 if (refreshed)
1030 return -EACCES; 1044 return -EACCES;
1031 1045
1032 err = fuse_do_getattr(inode, NULL, NULL); 1046 err = fuse_perm_getattr(inode, flags);
1033 if (!err && !(inode->i_mode & S_IXUGO)) 1047 if (!err && !(inode->i_mode & S_IXUGO))
1034 return -EACCES; 1048 return -EACCES;
1035 } 1049 }
@@ -1283,8 +1297,11 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
1283 if (err) 1297 if (err)
1284 return err; 1298 return err;
1285 1299
1286 if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc) 1300 if (attr->ia_valid & ATTR_OPEN) {
1287 return 0; 1301 if (fc->atomic_o_trunc)
1302 return 0;
1303 file = NULL;
1304 }
1288 1305
1289 if (attr->ia_valid & ATTR_SIZE) 1306 if (attr->ia_valid & ATTR_SIZE)
1290 is_truncate = true; 1307 is_truncate = true;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 95da1bc1c826..82a66466a24c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -86,18 +86,52 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff)
86 return ff; 86 return ff;
87} 87}
88 88
89static void fuse_release_async(struct work_struct *work)
90{
91 struct fuse_req *req;
92 struct fuse_conn *fc;
93 struct path path;
94
95 req = container_of(work, struct fuse_req, misc.release.work);
96 path = req->misc.release.path;
97 fc = get_fuse_conn(path.dentry->d_inode);
98
99 fuse_put_request(fc, req);
100 path_put(&path);
101}
102
89static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) 103static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
90{ 104{
91 path_put(&req->misc.release.path); 105 if (fc->destroy_req) {
106 /*
107 * If this is a fuseblk mount, then it's possible that
108 * releasing the path will result in releasing the
109 * super block and sending the DESTROY request. If
110 * the server is single threaded, this would hang.
111 * For this reason do the path_put() in a separate
112 * thread.
113 */
114 atomic_inc(&req->count);
115 INIT_WORK(&req->misc.release.work, fuse_release_async);
116 schedule_work(&req->misc.release.work);
117 } else {
118 path_put(&req->misc.release.path);
119 }
92} 120}
93 121
94static void fuse_file_put(struct fuse_file *ff) 122static void fuse_file_put(struct fuse_file *ff, bool sync)
95{ 123{
96 if (atomic_dec_and_test(&ff->count)) { 124 if (atomic_dec_and_test(&ff->count)) {
97 struct fuse_req *req = ff->reserved_req; 125 struct fuse_req *req = ff->reserved_req;
98 126
99 req->end = fuse_release_end; 127 if (sync) {
100 fuse_request_send_background(ff->fc, req); 128 fuse_request_send(ff->fc, req);
129 path_put(&req->misc.release.path);
130 fuse_put_request(ff->fc, req);
131 } else {
132 req->end = fuse_release_end;
133 fuse_request_send_background(ff->fc, req);
134 }
101 kfree(ff); 135 kfree(ff);
102 } 136 }
103} 137}
@@ -188,7 +222,7 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
188 rb_erase(&ff->polled_node, &fc->polled_files); 222 rb_erase(&ff->polled_node, &fc->polled_files);
189 spin_unlock(&fc->lock); 223 spin_unlock(&fc->lock);
190 224
191 wake_up_interruptible_sync(&ff->poll_wait); 225 wake_up_interruptible_all(&ff->poll_wait);
192 226
193 inarg->fh = ff->fh; 227 inarg->fh = ff->fh;
194 inarg->flags = flags; 228 inarg->flags = flags;
@@ -219,8 +253,12 @@ void fuse_release_common(struct file *file, int opcode)
219 * Normally this will send the RELEASE request, however if 253 * Normally this will send the RELEASE request, however if
220 * some asynchronous READ or WRITE requests are outstanding, 254 * some asynchronous READ or WRITE requests are outstanding,
221 * the sending will be delayed. 255 * the sending will be delayed.
256 *
257 * Make the release synchronous if this is a fuseblk mount,
258 * synchronous RELEASE is allowed (and desirable) in this case
259 * because the server can be trusted not to screw up.
222 */ 260 */
223 fuse_file_put(ff); 261 fuse_file_put(ff, ff->fc->destroy_req != NULL);
224} 262}
225 263
226static int fuse_open(struct inode *inode, struct file *file) 264static int fuse_open(struct inode *inode, struct file *file)
@@ -485,7 +523,7 @@ static int fuse_readpage(struct file *file, struct page *page)
485 goto out; 523 goto out;
486 524
487 /* 525 /*
488 * Page writeback can extend beyond the liftime of the 526 * Page writeback can extend beyond the lifetime of the
489 * page-cache page, so make sure we read a properly synced 527 * page-cache page, so make sure we read a properly synced
490 * page. 528 * page.
491 */ 529 */
@@ -558,7 +596,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
558 page_cache_release(page); 596 page_cache_release(page);
559 } 597 }
560 if (req->ff) 598 if (req->ff)
561 fuse_file_put(req->ff); 599 fuse_file_put(req->ff, false);
562} 600}
563 601
564static void fuse_send_readpages(struct fuse_req *req, struct file *file) 602static void fuse_send_readpages(struct fuse_req *req, struct file *file)
@@ -1137,7 +1175,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
1137static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) 1175static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
1138{ 1176{
1139 __free_page(req->pages[0]); 1177 __free_page(req->pages[0]);
1140 fuse_file_put(req->ff); 1178 fuse_file_put(req->ff, false);
1141} 1179}
1142 1180
1143static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) 1181static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ae5744a2f9e9..b788becada76 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -21,6 +21,7 @@
21#include <linux/rwsem.h> 21#include <linux/rwsem.h>
22#include <linux/rbtree.h> 22#include <linux/rbtree.h>
23#include <linux/poll.h> 23#include <linux/poll.h>
24#include <linux/workqueue.h>
24 25
25/** Max number of pages that can be used in a single read request */ 26/** Max number of pages that can be used in a single read request */
26#define FUSE_MAX_PAGES_PER_REQ 32 27#define FUSE_MAX_PAGES_PER_REQ 32
@@ -262,13 +263,15 @@ struct fuse_req {
262 /** Data for asynchronous requests */ 263 /** Data for asynchronous requests */
263 union { 264 union {
264 struct { 265 struct {
265 struct fuse_release_in in; 266 union {
267 struct fuse_release_in in;
268 struct work_struct work;
269 };
266 struct path path; 270 struct path path;
267 } release; 271 } release;
268 struct fuse_init_in init_in; 272 struct fuse_init_in init_in;
269 struct fuse_init_out init_out; 273 struct fuse_init_out init_out;
270 struct cuse_init_in cuse_init_in; 274 struct cuse_init_in cuse_init_in;
271 struct cuse_init_out cuse_init_out;
272 struct { 275 struct {
273 struct fuse_read_in in; 276 struct fuse_read_in in;
274 u64 attr_ver; 277 u64 attr_ver;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9e3f68cc1bd1..cc6ec4b2f0ff 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -637,8 +637,10 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
637 u64 nodeid; 637 u64 nodeid;
638 u32 generation; 638 u32 generation;
639 639
640 if (*max_len < len) 640 if (*max_len < len) {
641 *max_len = len;
641 return 255; 642 return 255;
643 }
642 644
643 nodeid = get_fuse_inode(inode)->nodeid; 645 nodeid = get_fuse_inode(inode)->nodeid;
644 generation = inode->i_generation; 646 generation = inode->i_generation;
@@ -868,7 +870,6 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
868 870
869 fc->bdi.name = "fuse"; 871 fc->bdi.name = "fuse";
870 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 872 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
871 fc->bdi.unplug_io_fn = default_unplug_io_fn;
872 /* fuse does it's own writeback accounting */ 873 /* fuse does it's own writeback accounting */
873 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; 874 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
874 875
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 06c48a891832..8f26d1a58912 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -74,7 +74,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
74 return -EINVAL; 74 return -EINVAL;
75 if (S_ISLNK(inode->i_mode)) 75 if (S_ISLNK(inode->i_mode))
76 return -EOPNOTSUPP; 76 return -EOPNOTSUPP;
77 if (!is_owner_or_cap(inode)) 77 if (!inode_owner_or_capable(inode))
78 return -EPERM; 78 return -EPERM;
79 if (value) { 79 if (value) {
80 acl = posix_acl_from_xattr(value, size); 80 acl = posix_acl_from_xattr(value, size);
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 21f7e46da4c0..f3d23ef4e876 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,4 +1,4 @@
1EXTRA_CFLAGS := -I$(src) 1ccflags-y := -I$(src)
2obj-$(CONFIG_GFS2_FS) += gfs2.o 2obj-$(CONFIG_GFS2_FS) += gfs2.o
3gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \ 3gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
4 glops.o inode.o log.o lops.o main.o meta_io.o \ 4 glops.o inode.o log.o lops.o main.o meta_io.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 7118f1a780a9..cbc07155b1a0 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -80,8 +80,11 @@ int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
80 struct posix_acl *acl; 80 struct posix_acl *acl;
81 int error; 81 int error;
82 82
83 if (flags & IPERM_FLAG_RCU) 83 if (flags & IPERM_FLAG_RCU) {
84 return -ECHILD; 84 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
85 return -ECHILD;
86 return -EAGAIN;
87 }
85 88
86 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); 89 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
87 if (IS_ERR(acl)) 90 if (IS_ERR(acl))
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 4f36f8832b9b..c71995b111bf 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -695,6 +695,7 @@ out:
695 if (error == 0) 695 if (error == 0)
696 return 0; 696 return 0;
697 697
698 unlock_page(page);
698 page_cache_release(page); 699 page_cache_release(page);
699 700
700 gfs2_trans_end(sdp); 701 gfs2_trans_end(sdp);
@@ -1116,7 +1117,6 @@ static const struct address_space_operations gfs2_writeback_aops = {
1116 .writepages = gfs2_writeback_writepages, 1117 .writepages = gfs2_writeback_writepages,
1117 .readpage = gfs2_readpage, 1118 .readpage = gfs2_readpage,
1118 .readpages = gfs2_readpages, 1119 .readpages = gfs2_readpages,
1119 .sync_page = block_sync_page,
1120 .write_begin = gfs2_write_begin, 1120 .write_begin = gfs2_write_begin,
1121 .write_end = gfs2_write_end, 1121 .write_end = gfs2_write_end,
1122 .bmap = gfs2_bmap, 1122 .bmap = gfs2_bmap,
@@ -1132,7 +1132,6 @@ static const struct address_space_operations gfs2_ordered_aops = {
1132 .writepage = gfs2_ordered_writepage, 1132 .writepage = gfs2_ordered_writepage,
1133 .readpage = gfs2_readpage, 1133 .readpage = gfs2_readpage,
1134 .readpages = gfs2_readpages, 1134 .readpages = gfs2_readpages,
1135 .sync_page = block_sync_page,
1136 .write_begin = gfs2_write_begin, 1135 .write_begin = gfs2_write_begin,
1137 .write_end = gfs2_write_end, 1136 .write_end = gfs2_write_end,
1138 .set_page_dirty = gfs2_set_page_dirty, 1137 .set_page_dirty = gfs2_set_page_dirty,
@@ -1150,7 +1149,6 @@ static const struct address_space_operations gfs2_jdata_aops = {
1150 .writepages = gfs2_jdata_writepages, 1149 .writepages = gfs2_jdata_writepages,
1151 .readpage = gfs2_readpage, 1150 .readpage = gfs2_readpage,
1152 .readpages = gfs2_readpages, 1151 .readpages = gfs2_readpages,
1153 .sync_page = block_sync_page,
1154 .write_begin = gfs2_write_begin, 1152 .write_begin = gfs2_write_begin,
1155 .write_end = gfs2_write_end, 1153 .write_end = gfs2_write_end,
1156 .set_page_dirty = gfs2_set_page_dirty, 1154 .set_page_dirty = gfs2_set_page_dirty,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3c4039d5eef1..74add2ddcc3f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -21,6 +21,7 @@
21#include "meta_io.h" 21#include "meta_io.h"
22#include "quota.h" 22#include "quota.h"
23#include "rgrp.h" 23#include "rgrp.h"
24#include "super.h"
24#include "trans.h" 25#include "trans.h"
25#include "dir.h" 26#include "dir.h"
26#include "util.h" 27#include "util.h"
@@ -757,7 +758,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
757 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 758 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
758 struct gfs2_rgrp_list rlist; 759 struct gfs2_rgrp_list rlist;
759 u64 bn, bstart; 760 u64 bn, bstart;
760 u32 blen; 761 u32 blen, btotal;
761 __be64 *p; 762 __be64 *p;
762 unsigned int rg_blocks = 0; 763 unsigned int rg_blocks = 0;
763 int metadata; 764 int metadata;
@@ -839,6 +840,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
839 840
840 bstart = 0; 841 bstart = 0;
841 blen = 0; 842 blen = 0;
843 btotal = 0;
842 844
843 for (p = top; p < bottom; p++) { 845 for (p = top; p < bottom; p++) {
844 if (!*p) 846 if (!*p)
@@ -851,9 +853,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
851 else { 853 else {
852 if (bstart) { 854 if (bstart) {
853 if (metadata) 855 if (metadata)
854 gfs2_free_meta(ip, bstart, blen); 856 __gfs2_free_meta(ip, bstart, blen);
855 else 857 else
856 gfs2_free_data(ip, bstart, blen); 858 __gfs2_free_data(ip, bstart, blen);
859
860 btotal += blen;
857 } 861 }
858 862
859 bstart = bn; 863 bstart = bn;
@@ -865,11 +869,17 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
865 } 869 }
866 if (bstart) { 870 if (bstart) {
867 if (metadata) 871 if (metadata)
868 gfs2_free_meta(ip, bstart, blen); 872 __gfs2_free_meta(ip, bstart, blen);
869 else 873 else
870 gfs2_free_data(ip, bstart, blen); 874 __gfs2_free_data(ip, bstart, blen);
875
876 btotal += blen;
871 } 877 }
872 878
879 gfs2_statfs_change(sdp, 0, +btotal, 0);
880 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
881 ip->i_inode.i_gid);
882
873 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 883 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
874 884
875 gfs2_dinode_out(ip, dibh->b_data); 885 gfs2_dinode_out(ip, dibh->b_data);
@@ -1126,7 +1136,7 @@ void gfs2_trim_blocks(struct inode *inode)
1126 * earlier versions of GFS2 have a bug in the stuffed file reading 1136 * earlier versions of GFS2 have a bug in the stuffed file reading
1127 * code which will result in a buffer overrun if the size is larger 1137 * code which will result in a buffer overrun if the size is larger
1128 * than the max stuffed file size. In order to prevent this from 1138 * than the max stuffed file size. In order to prevent this from
1129 * occuring, such files are unstuffed, but in other cases we can 1139 * occurring, such files are unstuffed, but in other cases we can
1130 * just update the inode size directly. 1140 * just update the inode size directly.
1131 * 1141 *
1132 * Returns: 0 on success, or -ve on error 1142 * Returns: 0 on success, or -ve on error
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 4a456338b873..0da8da2c991d 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -44,7 +44,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
44 int error; 44 int error;
45 int had_lock = 0; 45 int had_lock = 0;
46 46
47 if (nd->flags & LOOKUP_RCU) 47 if (nd && nd->flags & LOOKUP_RCU)
48 return -ECHILD; 48 return -ECHILD;
49 49
50 parent = dget_parent(dentry); 50 parent = dget_parent(dentry);
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9023db8184f9..b5a5e60df0d5 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -36,9 +36,13 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
36 struct super_block *sb = inode->i_sb; 36 struct super_block *sb = inode->i_sb;
37 struct gfs2_inode *ip = GFS2_I(inode); 37 struct gfs2_inode *ip = GFS2_I(inode);
38 38
39 if (*len < GFS2_SMALL_FH_SIZE || 39 if (connectable && (*len < GFS2_LARGE_FH_SIZE)) {
40 (connectable && *len < GFS2_LARGE_FH_SIZE)) 40 *len = GFS2_LARGE_FH_SIZE;
41 return 255; 41 return 255;
42 } else if (*len < GFS2_SMALL_FH_SIZE) {
43 *len = GFS2_SMALL_FH_SIZE;
44 return 255;
45 }
42 46
43 fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); 47 fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32);
44 fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); 48 fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 7cfdcb913363..b2682e073eee 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -221,7 +221,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
221 goto out_drop_write; 221 goto out_drop_write;
222 222
223 error = -EACCES; 223 error = -EACCES;
224 if (!is_owner_or_cap(inode)) 224 if (!inode_owner_or_capable(inode))
225 goto out; 225 goto out;
226 226
227 error = 0; 227 error = 0;
@@ -448,15 +448,20 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
448{ 448{
449 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 449 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
450 450
451 if (!(file->f_flags & O_NOATIME)) { 451 if (!(file->f_flags & O_NOATIME) &&
452 !IS_NOATIME(&ip->i_inode)) {
452 struct gfs2_holder i_gh; 453 struct gfs2_holder i_gh;
453 int error; 454 int error;
454 455
455 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); 456 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
456 error = gfs2_glock_nq(&i_gh); 457 error = gfs2_glock_nq(&i_gh);
457 file_accessed(file); 458 if (error == 0) {
458 if (error == 0) 459 file_accessed(file);
459 gfs2_glock_dq_uninit(&i_gh); 460 gfs2_glock_dq(&i_gh);
461 }
462 gfs2_holder_uninit(&i_gh);
463 if (error)
464 return error;
460 } 465 }
461 vma->vm_ops = &gfs2_vm_ops; 466 vma->vm_ops = &gfs2_vm_ops;
462 vma->vm_flags |= VM_CAN_NONLINEAR; 467 vma->vm_flags |= VM_CAN_NONLINEAR;
@@ -617,8 +622,7 @@ static void empty_write_end(struct page *page, unsigned from,
617{ 622{
618 struct gfs2_inode *ip = GFS2_I(page->mapping->host); 623 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
619 624
620 page_zero_new_buffers(page, from, to); 625 zero_user(page, from, to-from);
621 flush_dcache_page(page);
622 mark_page_accessed(page); 626 mark_page_accessed(page);
623 627
624 if (!gfs2_is_writeback(ip)) 628 if (!gfs2_is_writeback(ip))
@@ -627,36 +631,43 @@ static void empty_write_end(struct page *page, unsigned from,
627 block_commit_write(page, from, to); 631 block_commit_write(page, from, to);
628} 632}
629 633
630static int write_empty_blocks(struct page *page, unsigned from, unsigned to) 634static int needs_empty_write(sector_t block, struct inode *inode)
631{ 635{
632 unsigned start, end, next;
633 struct buffer_head *bh, *head;
634 int error; 636 int error;
637 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
635 638
636 if (!page_has_buffers(page)) { 639 bh_map.b_size = 1 << inode->i_blkbits;
637 error = __block_write_begin(page, from, to - from, gfs2_block_map); 640 error = gfs2_block_map(inode, block, &bh_map, 0);
638 if (unlikely(error)) 641 if (unlikely(error))
639 return error; 642 return error;
643 return !buffer_mapped(&bh_map);
644}
640 645
641 empty_write_end(page, from, to); 646static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
642 return 0; 647{
643 } 648 struct inode *inode = page->mapping->host;
649 unsigned start, end, next, blksize;
650 sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
651 int ret;
644 652
645 bh = head = page_buffers(page); 653 blksize = 1 << inode->i_blkbits;
646 next = end = 0; 654 next = end = 0;
647 while (next < from) { 655 while (next < from) {
648 next += bh->b_size; 656 next += blksize;
649 bh = bh->b_this_page; 657 block++;
650 } 658 }
651 start = next; 659 start = next;
652 do { 660 do {
653 next += bh->b_size; 661 next += blksize;
654 if (buffer_mapped(bh)) { 662 ret = needs_empty_write(block, inode);
663 if (unlikely(ret < 0))
664 return ret;
665 if (ret == 0) {
655 if (end) { 666 if (end) {
656 error = __block_write_begin(page, start, end - start, 667 ret = __block_write_begin(page, start, end - start,
657 gfs2_block_map); 668 gfs2_block_map);
658 if (unlikely(error)) 669 if (unlikely(ret))
659 return error; 670 return ret;
660 empty_write_end(page, start, end); 671 empty_write_end(page, start, end);
661 end = 0; 672 end = 0;
662 } 673 }
@@ -664,13 +675,13 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
664 } 675 }
665 else 676 else
666 end = next; 677 end = next;
667 bh = bh->b_this_page; 678 block++;
668 } while (next < to); 679 } while (next < to);
669 680
670 if (end) { 681 if (end) {
671 error = __block_write_begin(page, start, end - start, gfs2_block_map); 682 ret = __block_write_begin(page, start, end - start, gfs2_block_map);
672 if (unlikely(error)) 683 if (unlikely(ret))
673 return error; 684 return ret;
674 empty_write_end(page, start, end); 685 empty_write_end(page, start, end);
675 } 686 }
676 687
@@ -976,8 +987,10 @@ static void do_unflock(struct file *file, struct file_lock *fl)
976 987
977 mutex_lock(&fp->f_fl_mutex); 988 mutex_lock(&fp->f_fl_mutex);
978 flock_lock_file_wait(file, fl); 989 flock_lock_file_wait(file, fl);
979 if (fl_gh->gh_gl) 990 if (fl_gh->gh_gl) {
980 gfs2_glock_dq_uninit(fl_gh); 991 gfs2_glock_dq_wait(fl_gh);
992 gfs2_holder_uninit(fl_gh);
993 }
981 mutex_unlock(&fp->f_fl_mutex); 994 mutex_unlock(&fp->f_fl_mutex);
982} 995}
983 996
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 08a8beb152e6..f07643e21bfa 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -26,6 +26,9 @@
26#include <linux/freezer.h> 26#include <linux/freezer.h>
27#include <linux/workqueue.h> 27#include <linux/workqueue.h>
28#include <linux/jiffies.h> 28#include <linux/jiffies.h>
29#include <linux/rcupdate.h>
30#include <linux/rculist_bl.h>
31#include <linux/bit_spinlock.h>
29 32
30#include "gfs2.h" 33#include "gfs2.h"
31#include "incore.h" 34#include "incore.h"
@@ -41,10 +44,6 @@
41#define CREATE_TRACE_POINTS 44#define CREATE_TRACE_POINTS
42#include "trace_gfs2.h" 45#include "trace_gfs2.h"
43 46
44struct gfs2_gl_hash_bucket {
45 struct hlist_head hb_list;
46};
47
48struct gfs2_glock_iter { 47struct gfs2_glock_iter {
49 int hash; /* hash bucket index */ 48 int hash; /* hash bucket index */
50 struct gfs2_sbd *sdp; /* incore superblock */ 49 struct gfs2_sbd *sdp; /* incore superblock */
@@ -54,7 +53,6 @@ struct gfs2_glock_iter {
54 53
55typedef void (*glock_examiner) (struct gfs2_glock * gl); 54typedef void (*glock_examiner) (struct gfs2_glock * gl);
56 55
57static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
58static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); 56static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
59#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) 57#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
60static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); 58static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
@@ -70,57 +68,9 @@ static DEFINE_SPINLOCK(lru_lock);
70#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) 68#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
71#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1) 69#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
72 70
73static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE]; 71static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE];
74static struct dentry *gfs2_root; 72static struct dentry *gfs2_root;
75 73
76/*
77 * Despite what you might think, the numbers below are not arbitrary :-)
78 * They are taken from the ipv4 routing hash code, which is well tested
79 * and thus should be nearly optimal. Later on we might tweek the numbers
80 * but for now this should be fine.
81 *
82 * The reason for putting the locks in a separate array from the list heads
83 * is that we can have fewer locks than list heads and save memory. We use
84 * the same hash function for both, but with a different hash mask.
85 */
86#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
87 defined(CONFIG_PROVE_LOCKING)
88
89#ifdef CONFIG_LOCKDEP
90# define GL_HASH_LOCK_SZ 256
91#else
92# if NR_CPUS >= 32
93# define GL_HASH_LOCK_SZ 4096
94# elif NR_CPUS >= 16
95# define GL_HASH_LOCK_SZ 2048
96# elif NR_CPUS >= 8
97# define GL_HASH_LOCK_SZ 1024
98# elif NR_CPUS >= 4
99# define GL_HASH_LOCK_SZ 512
100# else
101# define GL_HASH_LOCK_SZ 256
102# endif
103#endif
104
105/* We never want more locks than chains */
106#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ
107# undef GL_HASH_LOCK_SZ
108# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE
109#endif
110
111static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ];
112
113static inline rwlock_t *gl_lock_addr(unsigned int x)
114{
115 return &gl_hash_locks[x & (GL_HASH_LOCK_SZ-1)];
116}
117#else /* not SMP, so no spinlocks required */
118static inline rwlock_t *gl_lock_addr(unsigned int x)
119{
120 return NULL;
121}
122#endif
123
124/** 74/**
125 * gl_hash() - Turn glock number into hash bucket number 75 * gl_hash() - Turn glock number into hash bucket number
126 * @lock: The glock number 76 * @lock: The glock number
@@ -141,25 +91,35 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp,
141 return h; 91 return h;
142} 92}
143 93
144/** 94static inline void spin_lock_bucket(unsigned int hash)
145 * glock_free() - Perform a few checks and then release struct gfs2_glock 95{
146 * @gl: The glock to release 96 struct hlist_bl_head *bl = &gl_hash_table[hash];
147 * 97 bit_spin_lock(0, (unsigned long *)bl);
148 * Also calls lock module to release its internal structure for this glock. 98}
149 *
150 */
151 99
152static void glock_free(struct gfs2_glock *gl) 100static inline void spin_unlock_bucket(unsigned int hash)
101{
102 struct hlist_bl_head *bl = &gl_hash_table[hash];
103 __bit_spin_unlock(0, (unsigned long *)bl);
104}
105
106static void gfs2_glock_dealloc(struct rcu_head *rcu)
107{
108 struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
109
110 if (gl->gl_ops->go_flags & GLOF_ASPACE)
111 kmem_cache_free(gfs2_glock_aspace_cachep, gl);
112 else
113 kmem_cache_free(gfs2_glock_cachep, gl);
114}
115
116void gfs2_glock_free(struct gfs2_glock *gl)
153{ 117{
154 struct gfs2_sbd *sdp = gl->gl_sbd; 118 struct gfs2_sbd *sdp = gl->gl_sbd;
155 struct address_space *mapping = gfs2_glock2aspace(gl);
156 struct kmem_cache *cachep = gfs2_glock_cachep;
157 119
158 GLOCK_BUG_ON(gl, mapping && mapping->nrpages); 120 call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
159 trace_gfs2_glock_put(gl); 121 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
160 if (mapping) 122 wake_up(&sdp->sd_glock_wait);
161 cachep = gfs2_glock_aspace_cachep;
162 sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl);
163} 123}
164 124
165/** 125/**
@@ -185,34 +145,49 @@ static int demote_ok(const struct gfs2_glock *gl)
185{ 145{
186 const struct gfs2_glock_operations *glops = gl->gl_ops; 146 const struct gfs2_glock_operations *glops = gl->gl_ops;
187 147
148 /* assert_spin_locked(&gl->gl_spin); */
149
188 if (gl->gl_state == LM_ST_UNLOCKED) 150 if (gl->gl_state == LM_ST_UNLOCKED)
189 return 0; 151 return 0;
190 if (!list_empty(&gl->gl_holders)) 152 if (test_bit(GLF_LFLUSH, &gl->gl_flags))
153 return 0;
154 if ((gl->gl_name.ln_type != LM_TYPE_INODE) &&
155 !list_empty(&gl->gl_holders))
191 return 0; 156 return 0;
192 if (glops->go_demote_ok) 157 if (glops->go_demote_ok)
193 return glops->go_demote_ok(gl); 158 return glops->go_demote_ok(gl);
194 return 1; 159 return 1;
195} 160}
196 161
162
197/** 163/**
198 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list 164 * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
199 * @gl: the glock 165 * @gl: the glock
200 * 166 *
167 * If the glock is demotable, then we add it (or move it) to the end
168 * of the glock LRU list.
201 */ 169 */
202 170
203static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) 171static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
204{ 172{
205 int may_reclaim; 173 if (demote_ok(gl)) {
206 may_reclaim = (demote_ok(gl) && 174 spin_lock(&lru_lock);
207 (atomic_read(&gl->gl_ref) == 1 || 175
208 (gl->gl_name.ln_type == LM_TYPE_INODE && 176 if (!list_empty(&gl->gl_lru))
209 atomic_read(&gl->gl_ref) <= 2))); 177 list_del_init(&gl->gl_lru);
210 spin_lock(&lru_lock); 178 else
211 if (list_empty(&gl->gl_lru) && may_reclaim) { 179 atomic_inc(&lru_count);
180
212 list_add_tail(&gl->gl_lru, &lru_list); 181 list_add_tail(&gl->gl_lru, &lru_list);
213 atomic_inc(&lru_count); 182 spin_unlock(&lru_lock);
214 } 183 }
215 spin_unlock(&lru_lock); 184}
185
186void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
187{
188 spin_lock(&gl->gl_spin);
189 __gfs2_glock_schedule_for_reclaim(gl);
190 spin_unlock(&gl->gl_spin);
216} 191}
217 192
218/** 193/**
@@ -227,7 +202,6 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl)
227{ 202{
228 if (atomic_dec_and_test(&gl->gl_ref)) 203 if (atomic_dec_and_test(&gl->gl_ref))
229 GLOCK_BUG_ON(gl, 1); 204 GLOCK_BUG_ON(gl, 1);
230 gfs2_glock_schedule_for_reclaim(gl);
231} 205}
232 206
233/** 207/**
@@ -236,30 +210,26 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl)
236 * 210 *
237 */ 211 */
238 212
239int gfs2_glock_put(struct gfs2_glock *gl) 213void gfs2_glock_put(struct gfs2_glock *gl)
240{ 214{
241 int rv = 0; 215 struct gfs2_sbd *sdp = gl->gl_sbd;
216 struct address_space *mapping = gfs2_glock2aspace(gl);
242 217
243 write_lock(gl_lock_addr(gl->gl_hash)); 218 if (atomic_dec_and_test(&gl->gl_ref)) {
244 if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) { 219 spin_lock_bucket(gl->gl_hash);
245 hlist_del(&gl->gl_list); 220 hlist_bl_del_rcu(&gl->gl_list);
221 spin_unlock_bucket(gl->gl_hash);
222 spin_lock(&lru_lock);
246 if (!list_empty(&gl->gl_lru)) { 223 if (!list_empty(&gl->gl_lru)) {
247 list_del_init(&gl->gl_lru); 224 list_del_init(&gl->gl_lru);
248 atomic_dec(&lru_count); 225 atomic_dec(&lru_count);
249 } 226 }
250 spin_unlock(&lru_lock); 227 spin_unlock(&lru_lock);
251 write_unlock(gl_lock_addr(gl->gl_hash));
252 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); 228 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
253 glock_free(gl); 229 GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
254 rv = 1; 230 trace_gfs2_glock_put(gl);
255 goto out; 231 sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
256 } 232 }
257 spin_lock(&gl->gl_spin);
258 gfs2_glock_schedule_for_reclaim(gl);
259 spin_unlock(&gl->gl_spin);
260 write_unlock(gl_lock_addr(gl->gl_hash));
261out:
262 return rv;
263} 233}
264 234
265/** 235/**
@@ -275,17 +245,15 @@ static struct gfs2_glock *search_bucket(unsigned int hash,
275 const struct lm_lockname *name) 245 const struct lm_lockname *name)
276{ 246{
277 struct gfs2_glock *gl; 247 struct gfs2_glock *gl;
278 struct hlist_node *h; 248 struct hlist_bl_node *h;
279 249
280 hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) { 250 hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) {
281 if (!lm_name_equal(&gl->gl_name, name)) 251 if (!lm_name_equal(&gl->gl_name, name))
282 continue; 252 continue;
283 if (gl->gl_sbd != sdp) 253 if (gl->gl_sbd != sdp)
284 continue; 254 continue;
285 255 if (atomic_inc_not_zero(&gl->gl_ref))
286 atomic_inc(&gl->gl_ref); 256 return gl;
287
288 return gl;
289 } 257 }
290 258
291 return NULL; 259 return NULL;
@@ -743,10 +711,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
743 struct gfs2_glock *gl, *tmp; 711 struct gfs2_glock *gl, *tmp;
744 unsigned int hash = gl_hash(sdp, &name); 712 unsigned int hash = gl_hash(sdp, &name);
745 struct address_space *mapping; 713 struct address_space *mapping;
714 struct kmem_cache *cachep;
746 715
747 read_lock(gl_lock_addr(hash)); 716 rcu_read_lock();
748 gl = search_bucket(hash, sdp, &name); 717 gl = search_bucket(hash, sdp, &name);
749 read_unlock(gl_lock_addr(hash)); 718 rcu_read_unlock();
750 719
751 *glp = gl; 720 *glp = gl;
752 if (gl) 721 if (gl)
@@ -755,9 +724,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
755 return -ENOENT; 724 return -ENOENT;
756 725
757 if (glops->go_flags & GLOF_ASPACE) 726 if (glops->go_flags & GLOF_ASPACE)
758 gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL); 727 cachep = gfs2_glock_aspace_cachep;
759 else 728 else
760 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); 729 cachep = gfs2_glock_cachep;
730 gl = kmem_cache_alloc(cachep, GFP_KERNEL);
761 if (!gl) 731 if (!gl)
762 return -ENOMEM; 732 return -ENOMEM;
763 733
@@ -790,15 +760,16 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
790 mapping->writeback_index = 0; 760 mapping->writeback_index = 0;
791 } 761 }
792 762
793 write_lock(gl_lock_addr(hash)); 763 spin_lock_bucket(hash);
794 tmp = search_bucket(hash, sdp, &name); 764 tmp = search_bucket(hash, sdp, &name);
795 if (tmp) { 765 if (tmp) {
796 write_unlock(gl_lock_addr(hash)); 766 spin_unlock_bucket(hash);
797 glock_free(gl); 767 kmem_cache_free(cachep, gl);
768 atomic_dec(&sdp->sd_glock_disposal);
798 gl = tmp; 769 gl = tmp;
799 } else { 770 } else {
800 hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list); 771 hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]);
801 write_unlock(gl_lock_addr(hash)); 772 spin_unlock_bucket(hash);
802 } 773 }
803 774
804 *glp = gl; 775 *glp = gl;
@@ -1007,13 +978,13 @@ fail:
1007 insert_pt = &gh2->gh_list; 978 insert_pt = &gh2->gh_list;
1008 } 979 }
1009 set_bit(GLF_QUEUED, &gl->gl_flags); 980 set_bit(GLF_QUEUED, &gl->gl_flags);
981 trace_gfs2_glock_queue(gh, 1);
1010 if (likely(insert_pt == NULL)) { 982 if (likely(insert_pt == NULL)) {
1011 list_add_tail(&gh->gh_list, &gl->gl_holders); 983 list_add_tail(&gh->gh_list, &gl->gl_holders);
1012 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) 984 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
1013 goto do_cancel; 985 goto do_cancel;
1014 return; 986 return;
1015 } 987 }
1016 trace_gfs2_glock_queue(gh, 1);
1017 list_add_tail(&gh->gh_list, insert_pt); 988 list_add_tail(&gh->gh_list, insert_pt);
1018do_cancel: 989do_cancel:
1019 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); 990 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
@@ -1113,6 +1084,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1113 !test_bit(GLF_DEMOTE, &gl->gl_flags)) 1084 !test_bit(GLF_DEMOTE, &gl->gl_flags))
1114 fast_path = 1; 1085 fast_path = 1;
1115 } 1086 }
1087 __gfs2_glock_schedule_for_reclaim(gl);
1116 trace_gfs2_glock_queue(gh, 0); 1088 trace_gfs2_glock_queue(gh, 0);
1117 spin_unlock(&gl->gl_spin); 1089 spin_unlock(&gl->gl_spin);
1118 if (likely(fast_path)) 1090 if (likely(fast_path))
@@ -1151,7 +1123,7 @@ void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
1151 * @number: the lock number 1123 * @number: the lock number
1152 * @glops: the glock operations for the type of glock 1124 * @glops: the glock operations for the type of glock
1153 * @state: the state to acquire the glock in 1125 * @state: the state to acquire the glock in
1154 * @flags: modifier flags for the aquisition 1126 * @flags: modifier flags for the acquisition
1155 * @gh: the struct gfs2_holder 1127 * @gh: the struct gfs2_holder
1156 * 1128 *
1157 * Returns: errno 1129 * Returns: errno
@@ -1276,10 +1248,8 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1276 1248
1277void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) 1249void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1278{ 1250{
1279 unsigned int x; 1251 while (num_gh--)
1280 1252 gfs2_glock_dq(&ghs[num_gh]);
1281 for (x = 0; x < num_gh; x++)
1282 gfs2_glock_dq(&ghs[x]);
1283} 1253}
1284 1254
1285/** 1255/**
@@ -1291,10 +1261,8 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1291 1261
1292void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) 1262void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1293{ 1263{
1294 unsigned int x; 1264 while (num_gh--)
1295 1265 gfs2_glock_dq_uninit(&ghs[num_gh]);
1296 for (x = 0; x < num_gh; x++)
1297 gfs2_glock_dq_uninit(&ghs[x]);
1298} 1266}
1299 1267
1300void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) 1268void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
@@ -1440,42 +1408,30 @@ static struct shrinker glock_shrinker = {
1440 * @sdp: the filesystem 1408 * @sdp: the filesystem
1441 * @bucket: the bucket 1409 * @bucket: the bucket
1442 * 1410 *
1443 * Returns: 1 if the bucket has entries
1444 */ 1411 */
1445 1412
1446static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp, 1413static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
1447 unsigned int hash) 1414 unsigned int hash)
1448{ 1415{
1449 struct gfs2_glock *gl, *prev = NULL; 1416 struct gfs2_glock *gl;
1450 int has_entries = 0; 1417 struct hlist_bl_head *head = &gl_hash_table[hash];
1451 struct hlist_head *head = &gl_hash_table[hash].hb_list; 1418 struct hlist_bl_node *pos;
1452 1419
1453 read_lock(gl_lock_addr(hash)); 1420 rcu_read_lock();
1454 /* Can't use hlist_for_each_entry - don't want prefetch here */ 1421 hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
1455 if (hlist_empty(head)) 1422 if ((gl->gl_sbd == sdp) && atomic_read(&gl->gl_ref))
1456 goto out;
1457 gl = list_entry(head->first, struct gfs2_glock, gl_list);
1458 while(1) {
1459 if (!sdp || gl->gl_sbd == sdp) {
1460 gfs2_glock_hold(gl);
1461 read_unlock(gl_lock_addr(hash));
1462 if (prev)
1463 gfs2_glock_put(prev);
1464 prev = gl;
1465 examiner(gl); 1423 examiner(gl);
1466 has_entries = 1;
1467 read_lock(gl_lock_addr(hash));
1468 }
1469 if (gl->gl_list.next == NULL)
1470 break;
1471 gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list);
1472 } 1424 }
1473out: 1425 rcu_read_unlock();
1474 read_unlock(gl_lock_addr(hash));
1475 if (prev)
1476 gfs2_glock_put(prev);
1477 cond_resched(); 1426 cond_resched();
1478 return has_entries; 1427}
1428
1429static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
1430{
1431 unsigned x;
1432
1433 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1434 examine_bucket(examiner, sdp, x);
1479} 1435}
1480 1436
1481 1437
@@ -1529,10 +1485,21 @@ static void clear_glock(struct gfs2_glock *gl)
1529 1485
1530void gfs2_glock_thaw(struct gfs2_sbd *sdp) 1486void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1531{ 1487{
1532 unsigned x; 1488 glock_hash_walk(thaw_glock, sdp);
1489}
1533 1490
1534 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) 1491static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1535 examine_bucket(thaw_glock, sdp, x); 1492{
1493 int ret;
1494 spin_lock(&gl->gl_spin);
1495 ret = __dump_glock(seq, gl);
1496 spin_unlock(&gl->gl_spin);
1497 return ret;
1498}
1499
1500static void dump_glock_func(struct gfs2_glock *gl)
1501{
1502 dump_glock(NULL, gl);
1536} 1503}
1537 1504
1538/** 1505/**
@@ -1545,13 +1512,10 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1545 1512
1546void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) 1513void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1547{ 1514{
1548 unsigned int x; 1515 glock_hash_walk(clear_glock, sdp);
1549
1550 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1551 examine_bucket(clear_glock, sdp, x);
1552 flush_workqueue(glock_workqueue); 1516 flush_workqueue(glock_workqueue);
1553 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); 1517 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
1554 gfs2_dump_lockstate(sdp); 1518 glock_hash_walk(dump_glock_func, sdp);
1555} 1519}
1556 1520
1557void gfs2_glock_finish_truncate(struct gfs2_inode *ip) 1521void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
@@ -1717,73 +1681,22 @@ out:
1717 return error; 1681 return error;
1718} 1682}
1719 1683
1720static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1721{
1722 int ret;
1723 spin_lock(&gl->gl_spin);
1724 ret = __dump_glock(seq, gl);
1725 spin_unlock(&gl->gl_spin);
1726 return ret;
1727}
1728 1684
1729/**
1730 * gfs2_dump_lockstate - print out the current lockstate
1731 * @sdp: the filesystem
1732 * @ub: the buffer to copy the information into
1733 *
1734 * If @ub is NULL, dump the lockstate to the console.
1735 *
1736 */
1737
1738static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
1739{
1740 struct gfs2_glock *gl;
1741 struct hlist_node *h;
1742 unsigned int x;
1743 int error = 0;
1744
1745 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
1746
1747 read_lock(gl_lock_addr(x));
1748
1749 hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) {
1750 if (gl->gl_sbd != sdp)
1751 continue;
1752
1753 error = dump_glock(NULL, gl);
1754 if (error)
1755 break;
1756 }
1757
1758 read_unlock(gl_lock_addr(x));
1759
1760 if (error)
1761 break;
1762 }
1763
1764
1765 return error;
1766}
1767 1685
1768 1686
1769int __init gfs2_glock_init(void) 1687int __init gfs2_glock_init(void)
1770{ 1688{
1771 unsigned i; 1689 unsigned i;
1772 for(i = 0; i < GFS2_GL_HASH_SIZE; i++) { 1690 for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
1773 INIT_HLIST_HEAD(&gl_hash_table[i].hb_list); 1691 INIT_HLIST_BL_HEAD(&gl_hash_table[i]);
1774 }
1775#ifdef GL_HASH_LOCK_SZ
1776 for(i = 0; i < GL_HASH_LOCK_SZ; i++) {
1777 rwlock_init(&gl_hash_locks[i]);
1778 } 1692 }
1779#endif
1780 1693
1781 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | 1694 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
1782 WQ_HIGHPRI | WQ_FREEZEABLE, 0); 1695 WQ_HIGHPRI | WQ_FREEZABLE, 0);
1783 if (IS_ERR(glock_workqueue)) 1696 if (IS_ERR(glock_workqueue))
1784 return PTR_ERR(glock_workqueue); 1697 return PTR_ERR(glock_workqueue);
1785 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", 1698 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
1786 WQ_MEM_RECLAIM | WQ_FREEZEABLE, 1699 WQ_MEM_RECLAIM | WQ_FREEZABLE,
1787 0); 1700 0);
1788 if (IS_ERR(gfs2_delete_workqueue)) { 1701 if (IS_ERR(gfs2_delete_workqueue)) {
1789 destroy_workqueue(glock_workqueue); 1702 destroy_workqueue(glock_workqueue);
@@ -1802,62 +1715,54 @@ void gfs2_glock_exit(void)
1802 destroy_workqueue(gfs2_delete_workqueue); 1715 destroy_workqueue(gfs2_delete_workqueue);
1803} 1716}
1804 1717
1718static inline struct gfs2_glock *glock_hash_chain(unsigned hash)
1719{
1720 return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]),
1721 struct gfs2_glock, gl_list);
1722}
1723
1724static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl)
1725{
1726 return hlist_bl_entry(rcu_dereference(gl->gl_list.next),
1727 struct gfs2_glock, gl_list);
1728}
1729
1805static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) 1730static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
1806{ 1731{
1807 struct gfs2_glock *gl; 1732 struct gfs2_glock *gl;
1808 1733
1809restart: 1734 do {
1810 read_lock(gl_lock_addr(gi->hash)); 1735 gl = gi->gl;
1811 gl = gi->gl; 1736 if (gl) {
1812 if (gl) { 1737 gi->gl = glock_hash_next(gl);
1813 gi->gl = hlist_entry(gl->gl_list.next, 1738 } else {
1814 struct gfs2_glock, gl_list); 1739 gi->gl = glock_hash_chain(gi->hash);
1815 } else { 1740 }
1816 gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first, 1741 while (gi->gl == NULL) {
1817 struct gfs2_glock, gl_list); 1742 gi->hash++;
1818 } 1743 if (gi->hash >= GFS2_GL_HASH_SIZE) {
1819 if (gi->gl) 1744 rcu_read_unlock();
1820 gfs2_glock_hold(gi->gl); 1745 return 1;
1821 read_unlock(gl_lock_addr(gi->hash)); 1746 }
1822 if (gl) 1747 gi->gl = glock_hash_chain(gi->hash);
1823 gfs2_glock_put(gl); 1748 }
1824 while (gi->gl == NULL) { 1749 /* Skip entries for other sb and dead entries */
1825 gi->hash++; 1750 } while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0);
1826 if (gi->hash >= GFS2_GL_HASH_SIZE)
1827 return 1;
1828 read_lock(gl_lock_addr(gi->hash));
1829 gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
1830 struct gfs2_glock, gl_list);
1831 if (gi->gl)
1832 gfs2_glock_hold(gi->gl);
1833 read_unlock(gl_lock_addr(gi->hash));
1834 }
1835
1836 if (gi->sdp != gi->gl->gl_sbd)
1837 goto restart;
1838 1751
1839 return 0; 1752 return 0;
1840} 1753}
1841 1754
1842static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi)
1843{
1844 if (gi->gl)
1845 gfs2_glock_put(gi->gl);
1846 gi->gl = NULL;
1847}
1848
1849static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) 1755static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
1850{ 1756{
1851 struct gfs2_glock_iter *gi = seq->private; 1757 struct gfs2_glock_iter *gi = seq->private;
1852 loff_t n = *pos; 1758 loff_t n = *pos;
1853 1759
1854 gi->hash = 0; 1760 gi->hash = 0;
1761 rcu_read_lock();
1855 1762
1856 do { 1763 do {
1857 if (gfs2_glock_iter_next(gi)) { 1764 if (gfs2_glock_iter_next(gi))
1858 gfs2_glock_iter_free(gi);
1859 return NULL; 1765 return NULL;
1860 }
1861 } while (n--); 1766 } while (n--);
1862 1767
1863 return gi->gl; 1768 return gi->gl;
@@ -1870,10 +1775,8 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
1870 1775
1871 (*pos)++; 1776 (*pos)++;
1872 1777
1873 if (gfs2_glock_iter_next(gi)) { 1778 if (gfs2_glock_iter_next(gi))
1874 gfs2_glock_iter_free(gi);
1875 return NULL; 1779 return NULL;
1876 }
1877 1780
1878 return gi->gl; 1781 return gi->gl;
1879} 1782}
@@ -1881,7 +1784,10 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
1881static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) 1784static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
1882{ 1785{
1883 struct gfs2_glock_iter *gi = seq->private; 1786 struct gfs2_glock_iter *gi = seq->private;
1884 gfs2_glock_iter_free(gi); 1787
1788 if (gi->gl)
1789 rcu_read_unlock();
1790 gi->gl = NULL;
1885} 1791}
1886 1792
1887static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) 1793static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 691851ceb615..aea160690e94 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -118,7 +118,7 @@ struct lm_lockops {
118 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); 118 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
119 void (*lm_unmount) (struct gfs2_sbd *sdp); 119 void (*lm_unmount) (struct gfs2_sbd *sdp);
120 void (*lm_withdraw) (struct gfs2_sbd *sdp); 120 void (*lm_withdraw) (struct gfs2_sbd *sdp);
121 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); 121 void (*lm_put_lock) (struct gfs2_glock *gl);
122 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, 122 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
123 unsigned int flags); 123 unsigned int flags);
124 void (*lm_cancel) (struct gfs2_glock *gl); 124 void (*lm_cancel) (struct gfs2_glock *gl);
@@ -174,7 +174,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp,
174 int create, struct gfs2_glock **glp); 174 int create, struct gfs2_glock **glp);
175void gfs2_glock_hold(struct gfs2_glock *gl); 175void gfs2_glock_hold(struct gfs2_glock *gl);
176void gfs2_glock_put_nolock(struct gfs2_glock *gl); 176void gfs2_glock_put_nolock(struct gfs2_glock *gl);
177int gfs2_glock_put(struct gfs2_glock *gl); 177void gfs2_glock_put(struct gfs2_glock *gl);
178void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, 178void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
179 struct gfs2_holder *gh); 179 struct gfs2_holder *gh);
180void gfs2_holder_reinit(unsigned int state, unsigned flags, 180void gfs2_holder_reinit(unsigned int state, unsigned flags,
@@ -223,25 +223,22 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
223 return error; 223 return error;
224} 224}
225 225
226/* Lock Value Block functions */ 226extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
227 227extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
228int gfs2_lvb_hold(struct gfs2_glock *gl); 228extern void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
229void gfs2_lvb_unhold(struct gfs2_glock *gl); 229extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
230 230extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
231void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); 231extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
232void gfs2_glock_complete(struct gfs2_glock *gl, int ret); 232extern void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
233void gfs2_reclaim_glock(struct gfs2_sbd *sdp); 233extern void gfs2_glock_free(struct gfs2_glock *gl);
234void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); 234
235void gfs2_glock_finish_truncate(struct gfs2_inode *ip); 235extern int __init gfs2_glock_init(void);
236void gfs2_glock_thaw(struct gfs2_sbd *sdp); 236extern void gfs2_glock_exit(void);
237 237
238int __init gfs2_glock_init(void); 238extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
239void gfs2_glock_exit(void); 239extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
240 240extern int gfs2_register_debugfs(void);
241int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); 241extern void gfs2_unregister_debugfs(void);
242void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
243int gfs2_register_debugfs(void);
244void gfs2_unregister_debugfs(void);
245 242
246extern const struct lm_lockops gfs2_dlm_ops; 243extern const struct lm_lockops gfs2_dlm_ops;
247 244
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 263561bf1a50..3754e3cbf02b 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -56,20 +56,26 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
56 BUG_ON(current->journal_info); 56 BUG_ON(current->journal_info);
57 current->journal_info = &tr; 57 current->journal_info = &tr;
58 58
59 gfs2_log_lock(sdp); 59 spin_lock(&sdp->sd_ail_lock);
60 while (!list_empty(head)) { 60 while (!list_empty(head)) {
61 bd = list_entry(head->next, struct gfs2_bufdata, 61 bd = list_entry(head->next, struct gfs2_bufdata,
62 bd_ail_gl_list); 62 bd_ail_gl_list);
63 bh = bd->bd_bh; 63 bh = bd->bd_bh;
64 gfs2_remove_from_ail(bd); 64 gfs2_remove_from_ail(bd);
65 spin_unlock(&sdp->sd_ail_lock);
66
65 bd->bd_bh = NULL; 67 bd->bd_bh = NULL;
66 bh->b_private = NULL; 68 bh->b_private = NULL;
67 bd->bd_blkno = bh->b_blocknr; 69 bd->bd_blkno = bh->b_blocknr;
70 gfs2_log_lock(sdp);
68 gfs2_assert_withdraw(sdp, !buffer_busy(bh)); 71 gfs2_assert_withdraw(sdp, !buffer_busy(bh));
69 gfs2_trans_add_revoke(sdp, bd); 72 gfs2_trans_add_revoke(sdp, bd);
73 gfs2_log_unlock(sdp);
74
75 spin_lock(&sdp->sd_ail_lock);
70 } 76 }
71 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); 77 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
72 gfs2_log_unlock(sdp); 78 spin_unlock(&sdp->sd_ail_lock);
73 79
74 gfs2_trans_end(sdp); 80 gfs2_trans_end(sdp);
75 gfs2_log_flush(sdp, NULL); 81 gfs2_log_flush(sdp, NULL);
@@ -206,8 +212,17 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
206static int inode_go_demote_ok(const struct gfs2_glock *gl) 212static int inode_go_demote_ok(const struct gfs2_glock *gl)
207{ 213{
208 struct gfs2_sbd *sdp = gl->gl_sbd; 214 struct gfs2_sbd *sdp = gl->gl_sbd;
215 struct gfs2_holder *gh;
216
209 if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) 217 if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
210 return 0; 218 return 0;
219
220 if (!list_empty(&gl->gl_holders)) {
221 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
222 if (gh->gh_list.next != &gl->gl_holders)
223 return 0;
224 }
225
211 return 1; 226 return 1;
212} 227}
213 228
@@ -272,19 +287,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
272} 287}
273 288
274/** 289/**
275 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
276 * @gl: the glock
277 *
278 * Returns: 1 if it's ok
279 */
280
281static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
282{
283 const struct address_space *mapping = (const struct address_space *)(gl + 1);
284 return !mapping->nrpages;
285}
286
287/**
288 * rgrp_go_lock - operation done after an rgrp lock is locked by 290 * rgrp_go_lock - operation done after an rgrp lock is locked by
289 * a first holder on this node. 291 * a first holder on this node.
290 * @gl: the glock 292 * @gl: the glock
@@ -410,7 +412,6 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
410const struct gfs2_glock_operations gfs2_rgrp_glops = { 412const struct gfs2_glock_operations gfs2_rgrp_glops = {
411 .go_xmote_th = rgrp_go_sync, 413 .go_xmote_th = rgrp_go_sync,
412 .go_inval = rgrp_go_inval, 414 .go_inval = rgrp_go_inval,
413 .go_demote_ok = rgrp_go_demote_ok,
414 .go_lock = rgrp_go_lock, 415 .go_lock = rgrp_go_lock,
415 .go_unlock = rgrp_go_unlock, 416 .go_unlock = rgrp_go_unlock,
416 .go_dump = gfs2_rgrp_dump, 417 .go_dump = gfs2_rgrp_dump,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index a79790c06275..870a89d6d4dc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -15,6 +15,8 @@
15#include <linux/workqueue.h> 15#include <linux/workqueue.h>
16#include <linux/dlm.h> 16#include <linux/dlm.h>
17#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
18#include <linux/rcupdate.h>
19#include <linux/rculist_bl.h>
18 20
19#define DIO_WAIT 0x00000010 21#define DIO_WAIT 0x00000010
20#define DIO_METADATA 0x00000020 22#define DIO_METADATA 0x00000020
@@ -201,7 +203,7 @@ enum {
201}; 203};
202 204
203struct gfs2_glock { 205struct gfs2_glock {
204 struct hlist_node gl_list; 206 struct hlist_bl_node gl_list;
205 unsigned long gl_flags; /* GLF_... */ 207 unsigned long gl_flags; /* GLF_... */
206 struct lm_lockname gl_name; 208 struct lm_lockname gl_name;
207 atomic_t gl_ref; 209 atomic_t gl_ref;
@@ -234,6 +236,7 @@ struct gfs2_glock {
234 atomic_t gl_ail_count; 236 atomic_t gl_ail_count;
235 struct delayed_work gl_work; 237 struct delayed_work gl_work;
236 struct work_struct gl_delete; 238 struct work_struct gl_delete;
239 struct rcu_head gl_rcu;
237}; 240};
238 241
239#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ 242#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
@@ -314,6 +317,7 @@ enum {
314 QDF_USER = 0, 317 QDF_USER = 0,
315 QDF_CHANGE = 1, 318 QDF_CHANGE = 1,
316 QDF_LOCKED = 2, 319 QDF_LOCKED = 2,
320 QDF_REFRESH = 3,
317}; 321};
318 322
319struct gfs2_quota_data { 323struct gfs2_quota_data {
@@ -647,6 +651,7 @@ struct gfs2_sbd {
647 unsigned int sd_log_flush_head; 651 unsigned int sd_log_flush_head;
648 u64 sd_log_flush_wrapped; 652 u64 sd_log_flush_wrapped;
649 653
654 spinlock_t sd_ail_lock;
650 struct list_head sd_ail1_list; 655 struct list_head sd_ail1_list;
651 struct list_head sd_ail2_list; 656 struct list_head sd_ail2_list;
652 u64 sd_ail_sync_gen; 657 u64 sd_ail_sync_gen;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7aa7d4f8984a..97d54a28776a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -763,14 +763,15 @@ fail:
763 return error; 763 return error;
764} 764}
765 765
766static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) 766static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
767 const struct qstr *qstr)
767{ 768{
768 int err; 769 int err;
769 size_t len; 770 size_t len;
770 void *value; 771 void *value;
771 char *name; 772 char *name;
772 773
773 err = security_inode_init_security(&ip->i_inode, &dip->i_inode, 774 err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
774 &name, &value, &len); 775 &name, &value, &len);
775 776
776 if (err) { 777 if (err) {
@@ -854,7 +855,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
854 if (error) 855 if (error)
855 goto fail_gunlock2; 856 goto fail_gunlock2;
856 857
857 error = gfs2_security_init(dip, GFS2_I(inode)); 858 error = gfs2_security_init(dip, GFS2_I(inode), name);
858 if (error) 859 if (error)
859 goto fail_gunlock2; 860 goto fail_gunlock2;
860 861
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 6e493aee28f8..98c80d8c2a62 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -22,7 +22,6 @@ static void gdlm_ast(void *arg)
22{ 22{
23 struct gfs2_glock *gl = arg; 23 struct gfs2_glock *gl = arg;
24 unsigned ret = gl->gl_state; 24 unsigned ret = gl->gl_state;
25 struct gfs2_sbd *sdp = gl->gl_sbd;
26 25
27 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); 26 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
28 27
@@ -31,12 +30,7 @@ static void gdlm_ast(void *arg)
31 30
32 switch (gl->gl_lksb.sb_status) { 31 switch (gl->gl_lksb.sb_status) {
33 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ 32 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
34 if (gl->gl_ops->go_flags & GLOF_ASPACE) 33 gfs2_glock_free(gl);
35 kmem_cache_free(gfs2_glock_aspace_cachep, gl);
36 else
37 kmem_cache_free(gfs2_glock_cachep, gl);
38 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
39 wake_up(&sdp->sd_glock_wait);
40 return; 34 return;
41 case -DLM_ECANCEL: /* Cancel while getting lock */ 35 case -DLM_ECANCEL: /* Cancel while getting lock */
42 ret |= LM_OUT_CANCELED; 36 ret |= LM_OUT_CANCELED;
@@ -164,16 +158,14 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
164 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); 158 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
165} 159}
166 160
167static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) 161static void gdlm_put_lock(struct gfs2_glock *gl)
168{ 162{
169 struct gfs2_sbd *sdp = gl->gl_sbd; 163 struct gfs2_sbd *sdp = gl->gl_sbd;
170 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 164 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
171 int error; 165 int error;
172 166
173 if (gl->gl_lksb.sb_lkid == 0) { 167 if (gl->gl_lksb.sb_lkid == 0) {
174 kmem_cache_free(cachep, gl); 168 gfs2_glock_free(gl);
175 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
176 wake_up(&sdp->sd_glock_wait);
177 return; 169 return;
178 } 170 }
179 171
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index eb01f3575e10..5b102c1887fd 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -67,7 +67,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
67 * @mapping: The associated mapping (maybe NULL) 67 * @mapping: The associated mapping (maybe NULL)
68 * @bd: The gfs2_bufdata to remove 68 * @bd: The gfs2_bufdata to remove
69 * 69 *
70 * The log lock _must_ be held when calling this function 70 * The ail lock _must_ be held when calling this function
71 * 71 *
72 */ 72 */
73 73
@@ -88,8 +88,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
88 */ 88 */
89 89
90static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) 90static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
91__releases(&sdp->sd_log_lock) 91__releases(&sdp->sd_ail_lock)
92__acquires(&sdp->sd_log_lock) 92__acquires(&sdp->sd_ail_lock)
93{ 93{
94 struct gfs2_bufdata *bd, *s; 94 struct gfs2_bufdata *bd, *s;
95 struct buffer_head *bh; 95 struct buffer_head *bh;
@@ -117,16 +117,16 @@ __acquires(&sdp->sd_log_lock)
117 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); 117 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
118 118
119 get_bh(bh); 119 get_bh(bh);
120 gfs2_log_unlock(sdp); 120 spin_unlock(&sdp->sd_ail_lock);
121 lock_buffer(bh); 121 lock_buffer(bh);
122 if (test_clear_buffer_dirty(bh)) { 122 if (test_clear_buffer_dirty(bh)) {
123 bh->b_end_io = end_buffer_write_sync; 123 bh->b_end_io = end_buffer_write_sync;
124 submit_bh(WRITE_SYNC_PLUG, bh); 124 submit_bh(WRITE_SYNC, bh);
125 } else { 125 } else {
126 unlock_buffer(bh); 126 unlock_buffer(bh);
127 brelse(bh); 127 brelse(bh);
128 } 128 }
129 gfs2_log_lock(sdp); 129 spin_lock(&sdp->sd_ail_lock);
130 130
131 retry = 1; 131 retry = 1;
132 break; 132 break;
@@ -175,10 +175,10 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
175 struct gfs2_ail *ai; 175 struct gfs2_ail *ai;
176 int done = 0; 176 int done = 0;
177 177
178 gfs2_log_lock(sdp); 178 spin_lock(&sdp->sd_ail_lock);
179 head = &sdp->sd_ail1_list; 179 head = &sdp->sd_ail1_list;
180 if (list_empty(head)) { 180 if (list_empty(head)) {
181 gfs2_log_unlock(sdp); 181 spin_unlock(&sdp->sd_ail_lock);
182 return; 182 return;
183 } 183 }
184 sync_gen = sdp->sd_ail_sync_gen++; 184 sync_gen = sdp->sd_ail_sync_gen++;
@@ -189,13 +189,13 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
189 if (ai->ai_sync_gen >= sync_gen) 189 if (ai->ai_sync_gen >= sync_gen)
190 continue; 190 continue;
191 ai->ai_sync_gen = sync_gen; 191 ai->ai_sync_gen = sync_gen;
192 gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */ 192 gfs2_ail1_start_one(sdp, ai); /* This may drop ail lock */
193 done = 0; 193 done = 0;
194 break; 194 break;
195 } 195 }
196 } 196 }
197 197
198 gfs2_log_unlock(sdp); 198 spin_unlock(&sdp->sd_ail_lock);
199} 199}
200 200
201static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) 201static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
@@ -203,7 +203,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
203 struct gfs2_ail *ai, *s; 203 struct gfs2_ail *ai, *s;
204 int ret; 204 int ret;
205 205
206 gfs2_log_lock(sdp); 206 spin_lock(&sdp->sd_ail_lock);
207 207
208 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) { 208 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
209 if (gfs2_ail1_empty_one(sdp, ai, flags)) 209 if (gfs2_ail1_empty_one(sdp, ai, flags))
@@ -214,7 +214,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
214 214
215 ret = list_empty(&sdp->sd_ail1_list); 215 ret = list_empty(&sdp->sd_ail1_list);
216 216
217 gfs2_log_unlock(sdp); 217 spin_unlock(&sdp->sd_ail_lock);
218 218
219 return ret; 219 return ret;
220} 220}
@@ -247,7 +247,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
247 int wrap = (new_tail < old_tail); 247 int wrap = (new_tail < old_tail);
248 int a, b, rm; 248 int a, b, rm;
249 249
250 gfs2_log_lock(sdp); 250 spin_lock(&sdp->sd_ail_lock);
251 251
252 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) { 252 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
253 a = (old_tail <= ai->ai_first); 253 a = (old_tail <= ai->ai_first);
@@ -263,7 +263,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
263 kfree(ai); 263 kfree(ai);
264 } 264 }
265 265
266 gfs2_log_unlock(sdp); 266 spin_unlock(&sdp->sd_ail_lock);
267} 267}
268 268
269/** 269/**
@@ -421,7 +421,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
421 struct gfs2_ail *ai; 421 struct gfs2_ail *ai;
422 unsigned int tail; 422 unsigned int tail;
423 423
424 gfs2_log_lock(sdp); 424 spin_lock(&sdp->sd_ail_lock);
425 425
426 if (list_empty(&sdp->sd_ail1_list)) { 426 if (list_empty(&sdp->sd_ail1_list)) {
427 tail = sdp->sd_log_head; 427 tail = sdp->sd_log_head;
@@ -430,7 +430,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
430 tail = ai->ai_first; 430 tail = ai->ai_first;
431 } 431 }
432 432
433 gfs2_log_unlock(sdp); 433 spin_unlock(&sdp->sd_ail_lock);
434 434
435 return tail; 435 return tail;
436} 436}
@@ -647,7 +647,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
647 lock_buffer(bh); 647 lock_buffer(bh);
648 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { 648 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
649 bh->b_end_io = end_buffer_write_sync; 649 bh->b_end_io = end_buffer_write_sync;
650 submit_bh(WRITE_SYNC_PLUG, bh); 650 submit_bh(WRITE_SYNC, bh);
651 } else { 651 } else {
652 unlock_buffer(bh); 652 unlock_buffer(bh);
653 brelse(bh); 653 brelse(bh);
@@ -743,10 +743,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
743 sdp->sd_log_commited_databuf = 0; 743 sdp->sd_log_commited_databuf = 0;
744 sdp->sd_log_commited_revoke = 0; 744 sdp->sd_log_commited_revoke = 0;
745 745
746 spin_lock(&sdp->sd_ail_lock);
746 if (!list_empty(&ai->ai_ail1_list)) { 747 if (!list_empty(&ai->ai_ail1_list)) {
747 list_add(&ai->ai_list, &sdp->sd_ail1_list); 748 list_add(&ai->ai_list, &sdp->sd_ail1_list);
748 ai = NULL; 749 ai = NULL;
749 } 750 }
751 spin_unlock(&sdp->sd_ail_lock);
750 gfs2_log_unlock(sdp); 752 gfs2_log_unlock(sdp);
751 trace_gfs2_log_flush(sdp, 0); 753 trace_gfs2_log_flush(sdp, 0);
752 up_write(&sdp->sd_log_flush_lock); 754 up_write(&sdp->sd_log_flush_lock);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index bf33f822058d..51d27f00ebb4 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -51,8 +51,10 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
51 /* If this buffer is in the AIL and it has already been written 51 /* If this buffer is in the AIL and it has already been written
52 * to in-place disk block, remove it from the AIL. 52 * to in-place disk block, remove it from the AIL.
53 */ 53 */
54 spin_lock(&sdp->sd_ail_lock);
54 if (bd->bd_ail) 55 if (bd->bd_ail)
55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); 56 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
57 spin_unlock(&sdp->sd_ail_lock);
56 get_bh(bh); 58 get_bh(bh);
57 atomic_inc(&sdp->sd_log_pinned); 59 atomic_inc(&sdp->sd_log_pinned);
58 trace_gfs2_pin(bd, 1); 60 trace_gfs2_pin(bd, 1);
@@ -80,7 +82,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
80 mark_buffer_dirty(bh); 82 mark_buffer_dirty(bh);
81 clear_buffer_pinned(bh); 83 clear_buffer_pinned(bh);
82 84
83 gfs2_log_lock(sdp); 85 spin_lock(&sdp->sd_ail_lock);
84 if (bd->bd_ail) { 86 if (bd->bd_ail) {
85 list_del(&bd->bd_ail_st_list); 87 list_del(&bd->bd_ail_st_list);
86 brelse(bh); 88 brelse(bh);
@@ -91,9 +93,11 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
91 } 93 }
92 bd->bd_ail = ai; 94 bd->bd_ail = ai;
93 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); 95 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
94 clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); 96 spin_unlock(&sdp->sd_ail_lock);
97
98 if (test_and_clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags))
99 gfs2_glock_schedule_for_reclaim(bd->bd_gl);
95 trace_gfs2_pin(bd, 0); 100 trace_gfs2_pin(bd, 0);
96 gfs2_log_unlock(sdp);
97 unlock_buffer(bh); 101 unlock_buffer(bh);
98 atomic_dec(&sdp->sd_log_pinned); 102 atomic_dec(&sdp->sd_log_pinned);
99} 103}
@@ -200,7 +204,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
200 } 204 }
201 205
202 gfs2_log_unlock(sdp); 206 gfs2_log_unlock(sdp);
203 submit_bh(WRITE_SYNC_PLUG, bh); 207 submit_bh(WRITE_SYNC, bh);
204 gfs2_log_lock(sdp); 208 gfs2_log_lock(sdp);
205 209
206 n = 0; 210 n = 0;
@@ -210,7 +214,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
210 gfs2_log_unlock(sdp); 214 gfs2_log_unlock(sdp);
211 lock_buffer(bd2->bd_bh); 215 lock_buffer(bd2->bd_bh);
212 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); 216 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
213 submit_bh(WRITE_SYNC_PLUG, bh); 217 submit_bh(WRITE_SYNC, bh);
214 gfs2_log_lock(sdp); 218 gfs2_log_lock(sdp);
215 if (++n >= num) 219 if (++n >= num)
216 break; 220 break;
@@ -352,7 +356,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
352 sdp->sd_log_num_revoke--; 356 sdp->sd_log_num_revoke--;
353 357
354 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { 358 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
355 submit_bh(WRITE_SYNC_PLUG, bh); 359 submit_bh(WRITE_SYNC, bh);
356 360
357 bh = gfs2_log_get_buf(sdp); 361 bh = gfs2_log_get_buf(sdp);
358 mh = (struct gfs2_meta_header *)bh->b_data; 362 mh = (struct gfs2_meta_header *)bh->b_data;
@@ -369,7 +373,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
369 } 373 }
370 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 374 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
371 375
372 submit_bh(WRITE_SYNC_PLUG, bh); 376 submit_bh(WRITE_SYNC, bh);
373} 377}
374 378
375static void revoke_lo_before_scan(struct gfs2_jdesc *jd, 379static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
@@ -571,7 +575,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
571 ptr = bh_log_ptr(bh); 575 ptr = bh_log_ptr(bh);
572 576
573 get_bh(bh); 577 get_bh(bh);
574 submit_bh(WRITE_SYNC_PLUG, bh); 578 submit_bh(WRITE_SYNC, bh);
575 gfs2_log_lock(sdp); 579 gfs2_log_lock(sdp);
576 while(!list_empty(list)) { 580 while(!list_empty(list)) {
577 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); 581 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
@@ -597,7 +601,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
597 } else { 601 } else {
598 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); 602 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
599 } 603 }
600 submit_bh(WRITE_SYNC_PLUG, bh1); 604 submit_bh(WRITE_SYNC, bh1);
601 gfs2_log_lock(sdp); 605 gfs2_log_lock(sdp);
602 ptr += 2; 606 ptr += 2;
603 } 607 }
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index ebef7ab6e17e..888a5f5a1a58 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -14,6 +14,8 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <linux/rcupdate.h>
18#include <linux/rculist_bl.h>
17#include <asm/atomic.h> 19#include <asm/atomic.h>
18 20
19#include "gfs2.h" 21#include "gfs2.h"
@@ -45,7 +47,7 @@ static void gfs2_init_glock_once(void *foo)
45{ 47{
46 struct gfs2_glock *gl = foo; 48 struct gfs2_glock *gl = foo;
47 49
48 INIT_HLIST_NODE(&gl->gl_list); 50 INIT_HLIST_BL_NODE(&gl->gl_list);
49 spin_lock_init(&gl->gl_spin); 51 spin_lock_init(&gl->gl_spin);
50 INIT_LIST_HEAD(&gl->gl_holders); 52 INIT_LIST_HEAD(&gl->gl_holders);
51 INIT_LIST_HEAD(&gl->gl_lru); 53 INIT_LIST_HEAD(&gl->gl_lru);
@@ -59,14 +61,7 @@ static void gfs2_init_gl_aspace_once(void *foo)
59 struct address_space *mapping = (struct address_space *)(gl + 1); 61 struct address_space *mapping = (struct address_space *)(gl + 1);
60 62
61 gfs2_init_glock_once(gl); 63 gfs2_init_glock_once(gl);
62 memset(mapping, 0, sizeof(*mapping)); 64 address_space_init_once(mapping);
63 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
64 spin_lock_init(&mapping->tree_lock);
65 spin_lock_init(&mapping->i_mmap_lock);
66 INIT_LIST_HEAD(&mapping->private_list);
67 spin_lock_init(&mapping->private_lock);
68 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
69 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
70} 65}
71 66
72/** 67/**
@@ -144,7 +139,7 @@ static int __init init_gfs2_fs(void)
144 139
145 error = -ENOMEM; 140 error = -ENOMEM;
146 gfs_recovery_wq = alloc_workqueue("gfs_recovery", 141 gfs_recovery_wq = alloc_workqueue("gfs_recovery",
147 WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0); 142 WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
148 if (!gfs_recovery_wq) 143 if (!gfs_recovery_wq)
149 goto fail_wq; 144 goto fail_wq;
150 145
@@ -198,6 +193,8 @@ static void __exit exit_gfs2_fs(void)
198 unregister_filesystem(&gfs2meta_fs_type); 193 unregister_filesystem(&gfs2meta_fs_type);
199 destroy_workqueue(gfs_recovery_wq); 194 destroy_workqueue(gfs_recovery_wq);
200 195
196 rcu_barrier();
197
201 kmem_cache_destroy(gfs2_quotad_cachep); 198 kmem_cache_destroy(gfs2_quotad_cachep);
202 kmem_cache_destroy(gfs2_rgrpd_cachep); 199 kmem_cache_destroy(gfs2_rgrpd_cachep);
203 kmem_cache_destroy(gfs2_bufdata_cachep); 200 kmem_cache_destroy(gfs2_bufdata_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 939739c7b3f9..675349b5a133 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,7 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
37 struct buffer_head *bh, *head; 37 struct buffer_head *bh, *head;
38 int nr_underway = 0; 38 int nr_underway = 0;
39 int write_op = REQ_META | 39 int write_op = REQ_META |
40 (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE); 40 (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
41 41
42 BUG_ON(!PageLocked(page)); 42 BUG_ON(!PageLocked(page));
43 BUG_ON(!page_has_buffers(page)); 43 BUG_ON(!page_has_buffers(page));
@@ -94,7 +94,6 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
94const struct address_space_operations gfs2_meta_aops = { 94const struct address_space_operations gfs2_meta_aops = {
95 .writepage = gfs2_aspace_writepage, 95 .writepage = gfs2_aspace_writepage,
96 .releasepage = gfs2_releasepage, 96 .releasepage = gfs2_releasepage,
97 .sync_page = block_sync_page,
98}; 97};
99 98
100/** 99/**
@@ -326,6 +325,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
326 brelse(bh); 325 brelse(bh);
327 } 326 }
328 if (bd) { 327 if (bd) {
328 spin_lock(&sdp->sd_ail_lock);
329 if (bd->bd_ail) { 329 if (bd->bd_ail) {
330 gfs2_remove_from_ail(bd); 330 gfs2_remove_from_ail(bd);
331 bh->b_private = NULL; 331 bh->b_private = NULL;
@@ -333,6 +333,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
333 bd->bd_blkno = bh->b_blocknr; 333 bd->bd_blkno = bh->b_blocknr;
334 gfs2_trans_add_revoke(sdp, bd); 334 gfs2_trans_add_revoke(sdp, bd);
335 } 335 }
336 spin_unlock(&sdp->sd_ail_lock);
336 } 337 }
337 clear_buffer_dirty(bh); 338 clear_buffer_dirty(bh);
338 clear_buffer_uptodate(bh); 339 clear_buffer_uptodate(bh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 777927ce6f79..42ef24355afb 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -99,6 +99,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
99 99
100 init_waitqueue_head(&sdp->sd_log_waitq); 100 init_waitqueue_head(&sdp->sd_log_waitq);
101 init_waitqueue_head(&sdp->sd_logd_waitq); 101 init_waitqueue_head(&sdp->sd_logd_waitq);
102 spin_lock_init(&sdp->sd_ail_lock);
102 INIT_LIST_HEAD(&sdp->sd_ail1_list); 103 INIT_LIST_HEAD(&sdp->sd_ail1_list);
103 INIT_LIST_HEAD(&sdp->sd_ail2_list); 104 INIT_LIST_HEAD(&sdp->sd_ail2_list);
104 105
@@ -928,17 +929,9 @@ static const match_table_t nolock_tokens = {
928 { Opt_err, NULL }, 929 { Opt_err, NULL },
929}; 930};
930 931
931static void nolock_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
932{
933 struct gfs2_sbd *sdp = gl->gl_sbd;
934 kmem_cache_free(cachep, gl);
935 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
936 wake_up(&sdp->sd_glock_wait);
937}
938
939static const struct lm_lockops nolock_ops = { 932static const struct lm_lockops nolock_ops = {
940 .lm_proto_name = "lock_nolock", 933 .lm_proto_name = "lock_nolock",
941 .lm_put_lock = nolock_put_lock, 934 .lm_put_lock = gfs2_glock_free,
942 .lm_tokens = &nolock_tokens, 935 .lm_tokens = &nolock_tokens,
943}; 936};
944 937
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index d8b26ac2e20b..09e436a50723 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1026,9 +1026,9 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1026 1026
1027/** 1027/**
1028 * gfs2_permission - 1028 * gfs2_permission -
1029 * @inode: 1029 * @inode: The inode
1030 * @mask: 1030 * @mask: The mask to be tested
1031 * @nd: passed from Linux VFS, ignored by us 1031 * @flags: Indicates whether this is an RCU path walk or not
1032 * 1032 *
1033 * This may be called from the VFS directly, or from within GFS2 with the 1033 * This may be called from the VFS directly, or from within GFS2 with the
1034 * inode locked, so we look to see if the glock is already locked and only 1034 * inode locked, so we look to see if the glock is already locked and only
@@ -1044,11 +1044,11 @@ int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
1044 int error; 1044 int error;
1045 int unlock = 0; 1045 int unlock = 0;
1046 1046
1047 if (flags & IPERM_FLAG_RCU)
1048 return -ECHILD;
1049 1047
1050 ip = GFS2_I(inode); 1048 ip = GFS2_I(inode);
1051 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { 1049 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
1050 if (flags & IPERM_FLAG_RCU)
1051 return -ECHILD;
1052 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 1052 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
1053 if (error) 1053 if (error)
1054 return error; 1054 return error;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a689901963de..e23d9864c418 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -834,6 +834,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
834 goto out_end_trans; 834 goto out_end_trans;
835 835
836 do_qc(qd, -qd->qd_change_sync); 836 do_qc(qd, -qd->qd_change_sync);
837 set_bit(QDF_REFRESH, &qd->qd_flags);
837 } 838 }
838 839
839 error = 0; 840 error = 0;
@@ -929,6 +930,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
929{ 930{
930 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 931 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
931 struct gfs2_alloc *al = ip->i_alloc; 932 struct gfs2_alloc *al = ip->i_alloc;
933 struct gfs2_quota_data *qd;
932 unsigned int x; 934 unsigned int x;
933 int error = 0; 935 int error = 0;
934 936
@@ -942,7 +944,11 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
942 sort_qd, NULL); 944 sort_qd, NULL);
943 945
944 for (x = 0; x < al->al_qd_num; x++) { 946 for (x = 0; x < al->al_qd_num; x++) {
945 error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]); 947 int force = NO_FORCE;
948 qd = al->al_qd[x];
949 if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
950 force = FORCE;
951 error = do_glock(qd, force, &al->al_qd_ghs[x]);
946 if (error) 952 if (error)
947 break; 953 break;
948 } 954 }
@@ -1587,6 +1593,8 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1587 1593
1588 offset = qd2offset(qd); 1594 offset = qd2offset(qd);
1589 alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota)); 1595 alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota));
1596 if (gfs2_is_stuffed(ip))
1597 alloc_required = 1;
1590 if (alloc_required) { 1598 if (alloc_required) {
1591 al = gfs2_alloc_get(ip); 1599 al = gfs2_alloc_get(ip);
1592 if (al == NULL) 1600 if (al == NULL)
@@ -1600,7 +1608,9 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1600 blocks += gfs2_rg_blocks(al); 1608 blocks += gfs2_rg_blocks(al);
1601 } 1609 }
1602 1610
1603 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); 1611 /* Some quotas span block boundaries and can update two blocks,
1612 adding an extra block to the transaction to handle such quotas */
1613 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 2, 0);
1604 if (error) 1614 if (error)
1605 goto out_release; 1615 goto out_release;
1606 1616
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7293ea27020c..cf930cd9664a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1602,7 +1602,7 @@ rgrp_error:
1602 * 1602 *
1603 */ 1603 */
1604 1604
1605void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) 1605void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1606{ 1606{
1607 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1607 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1608 struct gfs2_rgrpd *rgd; 1608 struct gfs2_rgrpd *rgd;
@@ -1617,7 +1617,21 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1617 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1617 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1618 1618
1619 gfs2_trans_add_rg(rgd); 1619 gfs2_trans_add_rg(rgd);
1620}
1620 1621
1622/**
1623 * gfs2_free_data - free a contiguous run of data block(s)
1624 * @ip: the inode these blocks are being freed from
1625 * @bstart: first block of a run of contiguous blocks
1626 * @blen: the length of the block run
1627 *
1628 */
1629
1630void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1631{
1632 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1633
1634 __gfs2_free_data(ip, bstart, blen);
1621 gfs2_statfs_change(sdp, 0, +blen, 0); 1635 gfs2_statfs_change(sdp, 0, +blen, 0);
1622 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); 1636 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
1623} 1637}
@@ -1630,7 +1644,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1630 * 1644 *
1631 */ 1645 */
1632 1646
1633void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) 1647void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1634{ 1648{
1635 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1649 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1636 struct gfs2_rgrpd *rgd; 1650 struct gfs2_rgrpd *rgd;
@@ -1645,10 +1659,24 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1645 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1659 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1646 1660
1647 gfs2_trans_add_rg(rgd); 1661 gfs2_trans_add_rg(rgd);
1662 gfs2_meta_wipe(ip, bstart, blen);
1663}
1648 1664
1665/**
1666 * gfs2_free_meta - free a contiguous run of metadata block(s)
1667 * @ip: the inode these blocks are being freed from
1668 * @bstart: first block of a run of contiguous blocks
1669 * @blen: the length of the block run
1670 *
1671 */
1672
1673void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1674{
1675 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1676
1677 __gfs2_free_meta(ip, bstart, blen);
1649 gfs2_statfs_change(sdp, 0, +blen, 0); 1678 gfs2_statfs_change(sdp, 0, +blen, 0);
1650 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); 1679 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
1651 gfs2_meta_wipe(ip, bstart, blen);
1652} 1680}
1653 1681
1654void gfs2_unlink_di(struct inode *inode) 1682void gfs2_unlink_di(struct inode *inode)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 50c2bb04369c..a80e3034ac47 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -52,7 +52,9 @@ extern int gfs2_ri_update(struct gfs2_inode *ip);
52extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); 52extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
53extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); 53extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
54 54
55extern void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
55extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); 56extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
57extern void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
56extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); 58extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
57extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); 59extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
58extern void gfs2_unlink_di(struct inode *inode); 60extern void gfs2_unlink_di(struct inode *inode);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index ec73ed70bae1..a4e23d68a398 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -657,7 +657,7 @@ out:
657 * @sdp: the file system 657 * @sdp: the file system
658 * 658 *
659 * This function flushes data and meta data for all machines by 659 * This function flushes data and meta data for all machines by
660 * aquiring the transaction log exclusively. All journals are 660 * acquiring the transaction log exclusively. All journals are
661 * ensured to be in a clean state as well. 661 * ensured to be in a clean state as well.
662 * 662 *
663 * Returns: errno 663 * Returns: errno
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index afa66aaa2237..b4d70b13be92 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -238,46 +238,22 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
238} 238}
239 239
240/* 240/*
241 * hfs_unlink() 241 * hfs_remove()
242 * 242 *
243 * This is the unlink() entry in the inode_operations structure for 243 * This serves as both unlink() and rmdir() in the inode_operations
244 * regular HFS directories. The purpose is to delete an existing 244 * structure for regular HFS directories. The purpose is to delete
245 * file, given the inode for the parent directory and the name 245 * an existing child, given the inode for the parent directory and
246 * (and its length) of the existing file. 246 * the name (and its length) of the existing directory.
247 */
248static int hfs_unlink(struct inode *dir, struct dentry *dentry)
249{
250 struct inode *inode;
251 int res;
252
253 inode = dentry->d_inode;
254 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
255 if (res)
256 return res;
257
258 drop_nlink(inode);
259 hfs_delete_inode(inode);
260 inode->i_ctime = CURRENT_TIME_SEC;
261 mark_inode_dirty(inode);
262
263 return res;
264}
265
266/*
267 * hfs_rmdir()
268 * 247 *
269 * This is the rmdir() entry in the inode_operations structure for 248 * HFS does not have hardlinks, so both rmdir and unlink set the
270 * regular HFS directories. The purpose is to delete an existing 249 * link count to 0. The only difference is the emptiness check.
271 * directory, given the inode for the parent directory and the name
272 * (and its length) of the existing directory.
273 */ 250 */
274static int hfs_rmdir(struct inode *dir, struct dentry *dentry) 251static int hfs_remove(struct inode *dir, struct dentry *dentry)
275{ 252{
276 struct inode *inode; 253 struct inode *inode = dentry->d_inode;
277 int res; 254 int res;
278 255
279 inode = dentry->d_inode; 256 if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
280 if (inode->i_size != 2)
281 return -ENOTEMPTY; 257 return -ENOTEMPTY;
282 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); 258 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
283 if (res) 259 if (res)
@@ -307,7 +283,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
307 283
308 /* Unlink destination if it already exists */ 284 /* Unlink destination if it already exists */
309 if (new_dentry->d_inode) { 285 if (new_dentry->d_inode) {
310 res = hfs_unlink(new_dir, new_dentry); 286 res = hfs_remove(new_dir, new_dentry);
311 if (res) 287 if (res)
312 return res; 288 return res;
313 } 289 }
@@ -332,9 +308,9 @@ const struct file_operations hfs_dir_operations = {
332const struct inode_operations hfs_dir_inode_operations = { 308const struct inode_operations hfs_dir_inode_operations = {
333 .create = hfs_create, 309 .create = hfs_create,
334 .lookup = hfs_lookup, 310 .lookup = hfs_lookup,
335 .unlink = hfs_unlink, 311 .unlink = hfs_remove,
336 .mkdir = hfs_mkdir, 312 .mkdir = hfs_mkdir,
337 .rmdir = hfs_rmdir, 313 .rmdir = hfs_remove,
338 .rename = hfs_rename, 314 .rename = hfs_rename,
339 .setattr = hfs_inode_setattr, 315 .setattr = hfs_inode_setattr,
340}; 316};
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index dffb4e996643..fff16c968e67 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -150,7 +150,6 @@ static int hfs_writepages(struct address_space *mapping,
150const struct address_space_operations hfs_btree_aops = { 150const struct address_space_operations hfs_btree_aops = {
151 .readpage = hfs_readpage, 151 .readpage = hfs_readpage,
152 .writepage = hfs_writepage, 152 .writepage = hfs_writepage,
153 .sync_page = block_sync_page,
154 .write_begin = hfs_write_begin, 153 .write_begin = hfs_write_begin,
155 .write_end = generic_write_end, 154 .write_end = generic_write_end,
156 .bmap = hfs_bmap, 155 .bmap = hfs_bmap,
@@ -160,7 +159,6 @@ const struct address_space_operations hfs_btree_aops = {
160const struct address_space_operations hfs_aops = { 159const struct address_space_operations hfs_aops = {
161 .readpage = hfs_readpage, 160 .readpage = hfs_readpage,
162 .writepage = hfs_writepage, 161 .writepage = hfs_writepage,
163 .sync_page = block_sync_page,
164 .write_begin = hfs_write_begin, 162 .write_begin = hfs_write_begin,
165 .write_end = generic_write_end, 163 .write_end = generic_write_end,
166 .bmap = hfs_bmap, 164 .bmap = hfs_bmap,
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 52a0bcaa7b6d..b1991a2a08e0 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -397,8 +397,8 @@ int hfsplus_file_extend(struct inode *inode)
397 u32 start, len, goal; 397 u32 start, len, goal;
398 int res; 398 int res;
399 399
400 if (sbi->total_blocks - sbi->free_blocks + 8 > 400 if (sbi->alloc_file->i_size * 8 <
401 sbi->alloc_file->i_size * 8) { 401 sbi->total_blocks - sbi->free_blocks + 8) {
402 /* extend alloc file */ 402 /* extend alloc file */
403 printk(KERN_ERR "hfs: extend alloc file! " 403 printk(KERN_ERR "hfs: extend alloc file! "
404 "(%llu,%u,%u)\n", 404 "(%llu,%u,%u)\n",
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index a8df651747f0..b248a6cfcad9 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -146,7 +146,6 @@ static int hfsplus_writepages(struct address_space *mapping,
146const struct address_space_operations hfsplus_btree_aops = { 146const struct address_space_operations hfsplus_btree_aops = {
147 .readpage = hfsplus_readpage, 147 .readpage = hfsplus_readpage,
148 .writepage = hfsplus_writepage, 148 .writepage = hfsplus_writepage,
149 .sync_page = block_sync_page,
150 .write_begin = hfsplus_write_begin, 149 .write_begin = hfsplus_write_begin,
151 .write_end = generic_write_end, 150 .write_end = generic_write_end,
152 .bmap = hfsplus_bmap, 151 .bmap = hfsplus_bmap,
@@ -156,7 +155,6 @@ const struct address_space_operations hfsplus_btree_aops = {
156const struct address_space_operations hfsplus_aops = { 155const struct address_space_operations hfsplus_aops = {
157 .readpage = hfsplus_readpage, 156 .readpage = hfsplus_readpage,
158 .writepage = hfsplus_writepage, 157 .writepage = hfsplus_writepage,
159 .sync_page = block_sync_page,
160 .write_begin = hfsplus_write_begin, 158 .write_begin = hfsplus_write_begin,
161 .write_end = generic_write_end, 159 .write_end = generic_write_end,
162 .bmap = hfsplus_bmap, 160 .bmap = hfsplus_bmap,
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 508ce662ce12..fbaa6690c8e0 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -47,7 +47,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
47 if (err) 47 if (err)
48 goto out; 48 goto out;
49 49
50 if (!is_owner_or_cap(inode)) { 50 if (!inode_owner_or_capable(inode)) {
51 err = -EACCES; 51 err = -EACCES;
52 goto out_drop_write; 52 goto out_drop_write;
53 } 53 }
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index d66ad113b1cc..40ad88c12c64 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -134,7 +134,7 @@ int hfs_part_find(struct super_block *sb,
134 res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK, 134 res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK,
135 data, READ); 135 data, READ);
136 if (res) 136 if (res)
137 return res; 137 goto out;
138 138
139 switch (be16_to_cpu(*((__be16 *)data))) { 139 switch (be16_to_cpu(*((__be16 *)data))) {
140 case HFS_OLD_PMAP_MAGIC: 140 case HFS_OLD_PMAP_MAGIC:
@@ -147,7 +147,7 @@ int hfs_part_find(struct super_block *sb,
147 res = -ENOENT; 147 res = -ENOENT;
148 break; 148 break;
149 } 149 }
150 150out:
151 kfree(data); 151 kfree(data);
152 return res; 152 return res;
153} 153}
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 9a3b4795f43c..b49b55584c84 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -338,20 +338,22 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
338 struct inode *root, *inode; 338 struct inode *root, *inode;
339 struct qstr str; 339 struct qstr str;
340 struct nls_table *nls = NULL; 340 struct nls_table *nls = NULL;
341 int err = -EINVAL; 341 int err;
342 342
343 err = -EINVAL;
343 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 344 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
344 if (!sbi) 345 if (!sbi)
345 return -ENOMEM; 346 goto out;
346 347
347 sb->s_fs_info = sbi; 348 sb->s_fs_info = sbi;
348 mutex_init(&sbi->alloc_mutex); 349 mutex_init(&sbi->alloc_mutex);
349 mutex_init(&sbi->vh_mutex); 350 mutex_init(&sbi->vh_mutex);
350 hfsplus_fill_defaults(sbi); 351 hfsplus_fill_defaults(sbi);
352
353 err = -EINVAL;
351 if (!hfsplus_parse_options(data, sbi)) { 354 if (!hfsplus_parse_options(data, sbi)) {
352 printk(KERN_ERR "hfs: unable to parse mount options\n"); 355 printk(KERN_ERR "hfs: unable to parse mount options\n");
353 err = -EINVAL; 356 goto out_unload_nls;
354 goto cleanup;
355 } 357 }
356 358
357 /* temporarily use utf8 to correctly find the hidden dir below */ 359 /* temporarily use utf8 to correctly find the hidden dir below */
@@ -359,16 +361,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
359 sbi->nls = load_nls("utf8"); 361 sbi->nls = load_nls("utf8");
360 if (!sbi->nls) { 362 if (!sbi->nls) {
361 printk(KERN_ERR "hfs: unable to load nls for utf8\n"); 363 printk(KERN_ERR "hfs: unable to load nls for utf8\n");
362 err = -EINVAL; 364 goto out_unload_nls;
363 goto cleanup;
364 } 365 }
365 366
366 /* Grab the volume header */ 367 /* Grab the volume header */
367 if (hfsplus_read_wrapper(sb)) { 368 if (hfsplus_read_wrapper(sb)) {
368 if (!silent) 369 if (!silent)
369 printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n"); 370 printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n");
370 err = -EINVAL; 371 goto out_unload_nls;
371 goto cleanup;
372 } 372 }
373 vhdr = sbi->s_vhdr; 373 vhdr = sbi->s_vhdr;
374 374
@@ -377,7 +377,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
377 if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION || 377 if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION ||
378 be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) { 378 be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) {
379 printk(KERN_ERR "hfs: wrong filesystem version\n"); 379 printk(KERN_ERR "hfs: wrong filesystem version\n");
380 goto cleanup; 380 goto out_free_vhdr;
381 } 381 }
382 sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); 382 sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
383 sbi->free_blocks = be32_to_cpu(vhdr->free_blocks); 383 sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
@@ -421,19 +421,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
421 sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); 421 sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
422 if (!sbi->ext_tree) { 422 if (!sbi->ext_tree) {
423 printk(KERN_ERR "hfs: failed to load extents file\n"); 423 printk(KERN_ERR "hfs: failed to load extents file\n");
424 goto cleanup; 424 goto out_free_vhdr;
425 } 425 }
426 sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); 426 sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
427 if (!sbi->cat_tree) { 427 if (!sbi->cat_tree) {
428 printk(KERN_ERR "hfs: failed to load catalog file\n"); 428 printk(KERN_ERR "hfs: failed to load catalog file\n");
429 goto cleanup; 429 goto out_close_ext_tree;
430 } 430 }
431 431
432 inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); 432 inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID);
433 if (IS_ERR(inode)) { 433 if (IS_ERR(inode)) {
434 printk(KERN_ERR "hfs: failed to load allocation file\n"); 434 printk(KERN_ERR "hfs: failed to load allocation file\n");
435 err = PTR_ERR(inode); 435 err = PTR_ERR(inode);
436 goto cleanup; 436 goto out_close_cat_tree;
437 } 437 }
438 sbi->alloc_file = inode; 438 sbi->alloc_file = inode;
439 439
@@ -442,14 +442,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
442 if (IS_ERR(root)) { 442 if (IS_ERR(root)) {
443 printk(KERN_ERR "hfs: failed to load root directory\n"); 443 printk(KERN_ERR "hfs: failed to load root directory\n");
444 err = PTR_ERR(root); 444 err = PTR_ERR(root);
445 goto cleanup; 445 goto out_put_alloc_file;
446 }
447 sb->s_d_op = &hfsplus_dentry_operations;
448 sb->s_root = d_alloc_root(root);
449 if (!sb->s_root) {
450 iput(root);
451 err = -ENOMEM;
452 goto cleanup;
453 } 446 }
454 447
455 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; 448 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
@@ -459,46 +452,69 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
459 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { 452 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
460 hfs_find_exit(&fd); 453 hfs_find_exit(&fd);
461 if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) 454 if (entry.type != cpu_to_be16(HFSPLUS_FOLDER))
462 goto cleanup; 455 goto out_put_root;
463 inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); 456 inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id));
464 if (IS_ERR(inode)) { 457 if (IS_ERR(inode)) {
465 err = PTR_ERR(inode); 458 err = PTR_ERR(inode);
466 goto cleanup; 459 goto out_put_root;
467 } 460 }
468 sbi->hidden_dir = inode; 461 sbi->hidden_dir = inode;
469 } else 462 } else
470 hfs_find_exit(&fd); 463 hfs_find_exit(&fd);
471 464
472 if (sb->s_flags & MS_RDONLY) 465 if (!(sb->s_flags & MS_RDONLY)) {
473 goto out; 466 /*
467 * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused
468 * all three are registered with Apple for our use
469 */
470 vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION);
471 vhdr->modify_date = hfsp_now2mt();
472 be32_add_cpu(&vhdr->write_count, 1);
473 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
474 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
475 hfsplus_sync_fs(sb, 1);
474 476
475 /* H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused 477 if (!sbi->hidden_dir) {
476 * all three are registered with Apple for our use 478 mutex_lock(&sbi->vh_mutex);
477 */ 479 sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
478 vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); 480 hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str,
479 vhdr->modify_date = hfsp_now2mt(); 481 sbi->hidden_dir);
480 be32_add_cpu(&vhdr->write_count, 1); 482 mutex_unlock(&sbi->vh_mutex);
481 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); 483
482 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); 484 hfsplus_mark_inode_dirty(sbi->hidden_dir,
483 hfsplus_sync_fs(sb, 1); 485 HFSPLUS_I_CAT_DIRTY);
484 486 }
485 if (!sbi->hidden_dir) {
486 mutex_lock(&sbi->vh_mutex);
487 sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
488 hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
489 &str, sbi->hidden_dir);
490 mutex_unlock(&sbi->vh_mutex);
491
492 hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY);
493 } 487 }
494out: 488
489 sb->s_d_op = &hfsplus_dentry_operations;
490 sb->s_root = d_alloc_root(root);
491 if (!sb->s_root) {
492 err = -ENOMEM;
493 goto out_put_hidden_dir;
494 }
495
495 unload_nls(sbi->nls); 496 unload_nls(sbi->nls);
496 sbi->nls = nls; 497 sbi->nls = nls;
497 return 0; 498 return 0;
498 499
499cleanup: 500out_put_hidden_dir:
500 hfsplus_put_super(sb); 501 iput(sbi->hidden_dir);
502out_put_root:
503 iput(root);
504out_put_alloc_file:
505 iput(sbi->alloc_file);
506out_close_cat_tree:
507 hfs_btree_close(sbi->cat_tree);
508out_close_ext_tree:
509 hfs_btree_close(sbi->ext_tree);
510out_free_vhdr:
511 kfree(sbi->s_vhdr);
512 kfree(sbi->s_backup_vhdr);
513out_unload_nls:
514 unload_nls(sbi->nls);
501 unload_nls(nls); 515 unload_nls(nls);
516 kfree(sbi);
517out:
502 return err; 518 return err;
503} 519}
504 520
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 196231794f64..3031d81f5f0f 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -167,7 +167,7 @@ reread:
167 break; 167 break;
168 case cpu_to_be16(HFSP_WRAP_MAGIC): 168 case cpu_to_be16(HFSP_WRAP_MAGIC):
169 if (!hfsplus_read_mdb(sbi->s_vhdr, &wd)) 169 if (!hfsplus_read_mdb(sbi->s_vhdr, &wd))
170 goto out; 170 goto out_free_backup_vhdr;
171 wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT; 171 wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
172 part_start += wd.ablk_start + wd.embed_start * wd.ablk_size; 172 part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
173 part_size = wd.embed_count * wd.ablk_size; 173 part_size = wd.embed_count * wd.ablk_size;
@@ -179,7 +179,7 @@ reread:
179 * (should do this only for cdrom/loop though) 179 * (should do this only for cdrom/loop though)
180 */ 180 */
181 if (hfs_part_find(sb, &part_start, &part_size)) 181 if (hfs_part_find(sb, &part_start, &part_size))
182 goto out; 182 goto out_free_backup_vhdr;
183 goto reread; 183 goto reread;
184 } 184 }
185 185
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 63b6f5632318..0c39dc3ef7d7 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -1,7 +1,7 @@
1config HPFS_FS 1config HPFS_FS
2 tristate "OS/2 HPFS file system support" 2 tristate "OS/2 HPFS file system support"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # nontrivial to fix 4 depends on BROKEN || !PREEMPT
5 help 5 help
6 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS 6 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
7 is the file system used for organizing files on OS/2 hard disk 7 is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index d32f63a569f7..b3d7c0ddb609 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -6,16 +6,15 @@
6 * directory VFS functions 6 * directory VFS functions
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include <linux/slab.h> 9#include <linux/slab.h>
11#include "hpfs_fn.h" 10#include "hpfs_fn.h"
12 11
13static int hpfs_dir_release(struct inode *inode, struct file *filp) 12static int hpfs_dir_release(struct inode *inode, struct file *filp)
14{ 13{
15 lock_kernel(); 14 hpfs_lock(inode->i_sb);
16 hpfs_del_pos(inode, &filp->f_pos); 15 hpfs_del_pos(inode, &filp->f_pos);
17 /*hpfs_write_if_changed(inode);*/ 16 /*hpfs_write_if_changed(inode);*/
18 unlock_kernel(); 17 hpfs_unlock(inode->i_sb);
19 return 0; 18 return 0;
20} 19}
21 20
@@ -30,7 +29,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
30 struct hpfs_inode_info *hpfs_inode = hpfs_i(i); 29 struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
31 struct super_block *s = i->i_sb; 30 struct super_block *s = i->i_sb;
32 31
33 lock_kernel(); 32 hpfs_lock(s);
34 33
35 /*printk("dir lseek\n");*/ 34 /*printk("dir lseek\n");*/
36 if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; 35 if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok;
@@ -43,12 +42,12 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
43 } 42 }
44 mutex_unlock(&i->i_mutex); 43 mutex_unlock(&i->i_mutex);
45ok: 44ok:
46 unlock_kernel(); 45 hpfs_unlock(s);
47 return filp->f_pos = new_off; 46 return filp->f_pos = new_off;
48fail: 47fail:
49 mutex_unlock(&i->i_mutex); 48 mutex_unlock(&i->i_mutex);
50 /*printk("illegal lseek: %016llx\n", new_off);*/ 49 /*printk("illegal lseek: %016llx\n", new_off);*/
51 unlock_kernel(); 50 hpfs_unlock(s);
52 return -ESPIPE; 51 return -ESPIPE;
53} 52}
54 53
@@ -64,7 +63,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
64 int c1, c2 = 0; 63 int c1, c2 = 0;
65 int ret = 0; 64 int ret = 0;
66 65
67 lock_kernel(); 66 hpfs_lock(inode->i_sb);
68 67
69 if (hpfs_sb(inode->i_sb)->sb_chk) { 68 if (hpfs_sb(inode->i_sb)->sb_chk) {
70 if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) { 69 if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) {
@@ -167,7 +166,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
167 hpfs_brelse4(&qbh); 166 hpfs_brelse4(&qbh);
168 } 167 }
169out: 168out:
170 unlock_kernel(); 169 hpfs_unlock(inode->i_sb);
171 return ret; 170 return ret;
172} 171}
173 172
@@ -197,10 +196,10 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
197 struct inode *result = NULL; 196 struct inode *result = NULL;
198 struct hpfs_inode_info *hpfs_result; 197 struct hpfs_inode_info *hpfs_result;
199 198
200 lock_kernel(); 199 hpfs_lock(dir->i_sb);
201 if ((err = hpfs_chk_name(name, &len))) { 200 if ((err = hpfs_chk_name(name, &len))) {
202 if (err == -ENAMETOOLONG) { 201 if (err == -ENAMETOOLONG) {
203 unlock_kernel(); 202 hpfs_unlock(dir->i_sb);
204 return ERR_PTR(-ENAMETOOLONG); 203 return ERR_PTR(-ENAMETOOLONG);
205 } 204 }
206 goto end_add; 205 goto end_add;
@@ -298,7 +297,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
298 297
299 end: 298 end:
300 end_add: 299 end_add:
301 unlock_kernel(); 300 hpfs_unlock(dir->i_sb);
302 d_add(dentry, result); 301 d_add(dentry, result);
303 return NULL; 302 return NULL;
304 303
@@ -311,7 +310,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
311 310
312 /*bail:*/ 311 /*bail:*/
313 312
314 unlock_kernel(); 313 hpfs_unlock(dir->i_sb);
315 return ERR_PTR(-ENOENT); 314 return ERR_PTR(-ENOENT);
316} 315}
317 316
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index c0340887c7ea..9b9eb6933e43 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -6,16 +6,15 @@
6 * file VFS functions 6 * file VFS functions
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include "hpfs_fn.h" 9#include "hpfs_fn.h"
11 10
12#define BLOCKS(size) (((size) + 511) >> 9) 11#define BLOCKS(size) (((size) + 511) >> 9)
13 12
14static int hpfs_file_release(struct inode *inode, struct file *file) 13static int hpfs_file_release(struct inode *inode, struct file *file)
15{ 14{
16 lock_kernel(); 15 hpfs_lock(inode->i_sb);
17 hpfs_write_if_changed(inode); 16 hpfs_write_if_changed(inode);
18 unlock_kernel(); 17 hpfs_unlock(inode->i_sb);
19 return 0; 18 return 0;
20} 19}
21 20
@@ -49,14 +48,14 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
49static void hpfs_truncate(struct inode *i) 48static void hpfs_truncate(struct inode *i)
50{ 49{
51 if (IS_IMMUTABLE(i)) return /*-EPERM*/; 50 if (IS_IMMUTABLE(i)) return /*-EPERM*/;
52 lock_kernel(); 51 hpfs_lock(i->i_sb);
53 hpfs_i(i)->i_n_secs = 0; 52 hpfs_i(i)->i_n_secs = 0;
54 i->i_blocks = 1 + ((i->i_size + 511) >> 9); 53 i->i_blocks = 1 + ((i->i_size + 511) >> 9);
55 hpfs_i(i)->mmu_private = i->i_size; 54 hpfs_i(i)->mmu_private = i->i_size;
56 hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9)); 55 hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9));
57 hpfs_write_inode(i); 56 hpfs_write_inode(i);
58 hpfs_i(i)->i_n_secs = 0; 57 hpfs_i(i)->i_n_secs = 0;
59 unlock_kernel(); 58 hpfs_unlock(i->i_sb);
60} 59}
61 60
62static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) 61static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
@@ -120,7 +119,6 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
120const struct address_space_operations hpfs_aops = { 119const struct address_space_operations hpfs_aops = {
121 .readpage = hpfs_readpage, 120 .readpage = hpfs_readpage,
122 .writepage = hpfs_writepage, 121 .writepage = hpfs_writepage,
123 .sync_page = block_sync_page,
124 .write_begin = hpfs_write_begin, 122 .write_begin = hpfs_write_begin,
125 .write_end = generic_write_end, 123 .write_end = generic_write_end,
126 .bmap = _hpfs_bmap 124 .bmap = _hpfs_bmap
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 1c43dbea55e8..c15adbca07ff 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -342,3 +342,25 @@ static inline time32_t gmt_to_local(struct super_block *s, time_t t)
342 extern struct timezone sys_tz; 342 extern struct timezone sys_tz;
343 return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift; 343 return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift;
344} 344}
345
346/*
347 * Locking:
348 *
349 * hpfs_lock() is a leftover from the big kernel lock.
350 * Right now, these functions are empty and only left
351 * for documentation purposes. The file system no longer
352 * works on SMP systems, so the lock is not needed
353 * any more.
354 *
355 * If someone is interested in making it work again, this
356 * would be the place to start by adding a per-superblock
357 * mutex and fixing all the bugs and performance issues
358 * caused by that.
359 */
360static inline void hpfs_lock(struct super_block *s)
361{
362}
363
364static inline void hpfs_unlock(struct super_block *s)
365{
366}
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 1ae35baa539e..87f1f787e767 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -6,7 +6,6 @@
6 * inode VFS functions 6 * inode VFS functions
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include <linux/slab.h> 9#include <linux/slab.h>
11#include "hpfs_fn.h" 10#include "hpfs_fn.h"
12 11
@@ -267,7 +266,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
267 struct inode *inode = dentry->d_inode; 266 struct inode *inode = dentry->d_inode;
268 int error = -EINVAL; 267 int error = -EINVAL;
269 268
270 lock_kernel(); 269 hpfs_lock(inode->i_sb);
271 if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root) 270 if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root)
272 goto out_unlock; 271 goto out_unlock;
273 if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) 272 if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
@@ -290,7 +289,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
290 hpfs_write_inode(inode); 289 hpfs_write_inode(inode);
291 290
292 out_unlock: 291 out_unlock:
293 unlock_kernel(); 292 hpfs_unlock(inode->i_sb);
294 return error; 293 return error;
295} 294}
296 295
@@ -307,8 +306,8 @@ void hpfs_evict_inode(struct inode *inode)
307 truncate_inode_pages(&inode->i_data, 0); 306 truncate_inode_pages(&inode->i_data, 0);
308 end_writeback(inode); 307 end_writeback(inode);
309 if (!inode->i_nlink) { 308 if (!inode->i_nlink) {
310 lock_kernel(); 309 hpfs_lock(inode->i_sb);
311 hpfs_remove_fnode(inode->i_sb, inode->i_ino); 310 hpfs_remove_fnode(inode->i_sb, inode->i_ino);
312 unlock_kernel(); 311 hpfs_unlock(inode->i_sb);
313 } 312 }
314} 313}
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index f4ad9e31ddc4..d5f8c8a19023 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -6,7 +6,6 @@
6 * adding & removing files & directories 6 * adding & removing files & directories
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/smp_lock.h>
10#include "hpfs_fn.h" 9#include "hpfs_fn.h"
11 10
12static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 11static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
@@ -25,7 +24,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
25 struct hpfs_dirent dee; 24 struct hpfs_dirent dee;
26 int err; 25 int err;
27 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; 26 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
28 lock_kernel(); 27 hpfs_lock(dir->i_sb);
29 err = -ENOSPC; 28 err = -ENOSPC;
30 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 29 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
31 if (!fnode) 30 if (!fnode)
@@ -103,7 +102,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
103 } 102 }
104 d_instantiate(dentry, result); 103 d_instantiate(dentry, result);
105 mutex_unlock(&hpfs_i(dir)->i_mutex); 104 mutex_unlock(&hpfs_i(dir)->i_mutex);
106 unlock_kernel(); 105 hpfs_unlock(dir->i_sb);
107 return 0; 106 return 0;
108bail3: 107bail3:
109 mutex_unlock(&hpfs_i(dir)->i_mutex); 108 mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -115,7 +114,7 @@ bail1:
115 brelse(bh); 114 brelse(bh);
116 hpfs_free_sectors(dir->i_sb, fno, 1); 115 hpfs_free_sectors(dir->i_sb, fno, 1);
117bail: 116bail:
118 unlock_kernel(); 117 hpfs_unlock(dir->i_sb);
119 return err; 118 return err;
120} 119}
121 120
@@ -132,7 +131,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
132 int err; 131 int err;
133 if ((err = hpfs_chk_name(name, &len))) 132 if ((err = hpfs_chk_name(name, &len)))
134 return err==-ENOENT ? -EINVAL : err; 133 return err==-ENOENT ? -EINVAL : err;
135 lock_kernel(); 134 hpfs_lock(dir->i_sb);
136 err = -ENOSPC; 135 err = -ENOSPC;
137 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 136 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
138 if (!fnode) 137 if (!fnode)
@@ -195,7 +194,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
195 } 194 }
196 d_instantiate(dentry, result); 195 d_instantiate(dentry, result);
197 mutex_unlock(&hpfs_i(dir)->i_mutex); 196 mutex_unlock(&hpfs_i(dir)->i_mutex);
198 unlock_kernel(); 197 hpfs_unlock(dir->i_sb);
199 return 0; 198 return 0;
200 199
201bail2: 200bail2:
@@ -205,7 +204,7 @@ bail1:
205 brelse(bh); 204 brelse(bh);
206 hpfs_free_sectors(dir->i_sb, fno, 1); 205 hpfs_free_sectors(dir->i_sb, fno, 1);
207bail: 206bail:
208 unlock_kernel(); 207 hpfs_unlock(dir->i_sb);
209 return err; 208 return err;
210} 209}
211 210
@@ -224,7 +223,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
224 if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM; 223 if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
225 if (!new_valid_dev(rdev)) 224 if (!new_valid_dev(rdev))
226 return -EINVAL; 225 return -EINVAL;
227 lock_kernel(); 226 hpfs_lock(dir->i_sb);
228 err = -ENOSPC; 227 err = -ENOSPC;
229 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 228 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
230 if (!fnode) 229 if (!fnode)
@@ -274,7 +273,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
274 d_instantiate(dentry, result); 273 d_instantiate(dentry, result);
275 mutex_unlock(&hpfs_i(dir)->i_mutex); 274 mutex_unlock(&hpfs_i(dir)->i_mutex);
276 brelse(bh); 275 brelse(bh);
277 unlock_kernel(); 276 hpfs_unlock(dir->i_sb);
278 return 0; 277 return 0;
279bail2: 278bail2:
280 mutex_unlock(&hpfs_i(dir)->i_mutex); 279 mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -283,7 +282,7 @@ bail1:
283 brelse(bh); 282 brelse(bh);
284 hpfs_free_sectors(dir->i_sb, fno, 1); 283 hpfs_free_sectors(dir->i_sb, fno, 1);
285bail: 284bail:
286 unlock_kernel(); 285 hpfs_unlock(dir->i_sb);
287 return err; 286 return err;
288} 287}
289 288
@@ -299,9 +298,9 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
299 struct inode *result; 298 struct inode *result;
300 int err; 299 int err;
301 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; 300 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
302 lock_kernel(); 301 hpfs_lock(dir->i_sb);
303 if (hpfs_sb(dir->i_sb)->sb_eas < 2) { 302 if (hpfs_sb(dir->i_sb)->sb_eas < 2) {
304 unlock_kernel(); 303 hpfs_unlock(dir->i_sb);
305 return -EPERM; 304 return -EPERM;
306 } 305 }
307 err = -ENOSPC; 306 err = -ENOSPC;
@@ -354,7 +353,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
354 hpfs_write_inode_nolock(result); 353 hpfs_write_inode_nolock(result);
355 d_instantiate(dentry, result); 354 d_instantiate(dentry, result);
356 mutex_unlock(&hpfs_i(dir)->i_mutex); 355 mutex_unlock(&hpfs_i(dir)->i_mutex);
357 unlock_kernel(); 356 hpfs_unlock(dir->i_sb);
358 return 0; 357 return 0;
359bail2: 358bail2:
360 mutex_unlock(&hpfs_i(dir)->i_mutex); 359 mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -363,7 +362,7 @@ bail1:
363 brelse(bh); 362 brelse(bh);
364 hpfs_free_sectors(dir->i_sb, fno, 1); 363 hpfs_free_sectors(dir->i_sb, fno, 1);
365bail: 364bail:
366 unlock_kernel(); 365 hpfs_unlock(dir->i_sb);
367 return err; 366 return err;
368} 367}
369 368
@@ -380,7 +379,7 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
380 int rep = 0; 379 int rep = 0;
381 int err; 380 int err;
382 381
383 lock_kernel(); 382 hpfs_lock(dir->i_sb);
384 hpfs_adjust_length(name, &len); 383 hpfs_adjust_length(name, &len);
385again: 384again:
386 mutex_lock(&hpfs_i(inode)->i_parent_mutex); 385 mutex_lock(&hpfs_i(inode)->i_parent_mutex);
@@ -416,7 +415,7 @@ again:
416 dentry_unhash(dentry); 415 dentry_unhash(dentry);
417 if (!d_unhashed(dentry)) { 416 if (!d_unhashed(dentry)) {
418 dput(dentry); 417 dput(dentry);
419 unlock_kernel(); 418 hpfs_unlock(dir->i_sb);
420 return -ENOSPC; 419 return -ENOSPC;
421 } 420 }
422 if (generic_permission(inode, MAY_WRITE, 0, NULL) || 421 if (generic_permission(inode, MAY_WRITE, 0, NULL) ||
@@ -435,7 +434,7 @@ again:
435 if (!err) 434 if (!err)
436 goto again; 435 goto again;
437 } 436 }
438 unlock_kernel(); 437 hpfs_unlock(dir->i_sb);
439 return -ENOSPC; 438 return -ENOSPC;
440 default: 439 default:
441 drop_nlink(inode); 440 drop_nlink(inode);
@@ -448,7 +447,7 @@ out1:
448out: 447out:
449 mutex_unlock(&hpfs_i(dir)->i_mutex); 448 mutex_unlock(&hpfs_i(dir)->i_mutex);
450 mutex_unlock(&hpfs_i(inode)->i_parent_mutex); 449 mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
451 unlock_kernel(); 450 hpfs_unlock(dir->i_sb);
452 return err; 451 return err;
453} 452}
454 453
@@ -466,7 +465,7 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
466 int r; 465 int r;
467 466
468 hpfs_adjust_length(name, &len); 467 hpfs_adjust_length(name, &len);
469 lock_kernel(); 468 hpfs_lock(dir->i_sb);
470 mutex_lock(&hpfs_i(inode)->i_parent_mutex); 469 mutex_lock(&hpfs_i(inode)->i_parent_mutex);
471 mutex_lock(&hpfs_i(dir)->i_mutex); 470 mutex_lock(&hpfs_i(dir)->i_mutex);
472 err = -ENOENT; 471 err = -ENOENT;
@@ -508,7 +507,7 @@ out1:
508out: 507out:
509 mutex_unlock(&hpfs_i(dir)->i_mutex); 508 mutex_unlock(&hpfs_i(dir)->i_mutex);
510 mutex_unlock(&hpfs_i(inode)->i_parent_mutex); 509 mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
511 unlock_kernel(); 510 hpfs_unlock(dir->i_sb);
512 return err; 511 return err;
513} 512}
514 513
@@ -521,21 +520,21 @@ static int hpfs_symlink_readpage(struct file *file, struct page *page)
521 int err; 520 int err;
522 521
523 err = -EIO; 522 err = -EIO;
524 lock_kernel(); 523 hpfs_lock(i->i_sb);
525 if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) 524 if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh)))
526 goto fail; 525 goto fail;
527 err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE); 526 err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE);
528 brelse(bh); 527 brelse(bh);
529 if (err) 528 if (err)
530 goto fail; 529 goto fail;
531 unlock_kernel(); 530 hpfs_unlock(i->i_sb);
532 SetPageUptodate(page); 531 SetPageUptodate(page);
533 kunmap(page); 532 kunmap(page);
534 unlock_page(page); 533 unlock_page(page);
535 return 0; 534 return 0;
536 535
537fail: 536fail:
538 unlock_kernel(); 537 hpfs_unlock(i->i_sb);
539 SetPageError(page); 538 SetPageError(page);
540 kunmap(page); 539 kunmap(page);
541 unlock_page(page); 540 unlock_page(page);
@@ -567,7 +566,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
567 err = 0; 566 err = 0;
568 hpfs_adjust_length(old_name, &old_len); 567 hpfs_adjust_length(old_name, &old_len);
569 568
570 lock_kernel(); 569 hpfs_lock(i->i_sb);
571 /* order doesn't matter, due to VFS exclusion */ 570 /* order doesn't matter, due to VFS exclusion */
572 mutex_lock(&hpfs_i(i)->i_parent_mutex); 571 mutex_lock(&hpfs_i(i)->i_parent_mutex);
573 if (new_inode) 572 if (new_inode)
@@ -659,7 +658,7 @@ end1:
659 mutex_unlock(&hpfs_i(i)->i_parent_mutex); 658 mutex_unlock(&hpfs_i(i)->i_parent_mutex);
660 if (new_inode) 659 if (new_inode)
661 mutex_unlock(&hpfs_i(new_inode)->i_parent_mutex); 660 mutex_unlock(&hpfs_i(new_inode)->i_parent_mutex);
662 unlock_kernel(); 661 hpfs_unlock(i->i_sb);
663 return err; 662 return err;
664} 663}
665 664
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index b30426b1fc97..c89b40808587 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -13,7 +13,6 @@
13#include <linux/statfs.h> 13#include <linux/statfs.h>
14#include <linux/magic.h> 14#include <linux/magic.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/smp_lock.h>
17#include <linux/bitmap.h> 16#include <linux/bitmap.h>
18#include <linux/slab.h> 17#include <linux/slab.h>
19 18
@@ -103,15 +102,11 @@ static void hpfs_put_super(struct super_block *s)
103{ 102{
104 struct hpfs_sb_info *sbi = hpfs_sb(s); 103 struct hpfs_sb_info *sbi = hpfs_sb(s);
105 104
106 lock_kernel();
107
108 kfree(sbi->sb_cp_table); 105 kfree(sbi->sb_cp_table);
109 kfree(sbi->sb_bmp_dir); 106 kfree(sbi->sb_bmp_dir);
110 unmark_dirty(s); 107 unmark_dirty(s);
111 s->s_fs_info = NULL; 108 s->s_fs_info = NULL;
112 kfree(sbi); 109 kfree(sbi);
113
114 unlock_kernel();
115} 110}
116 111
117unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) 112unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
@@ -143,7 +138,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
143 struct super_block *s = dentry->d_sb; 138 struct super_block *s = dentry->d_sb;
144 struct hpfs_sb_info *sbi = hpfs_sb(s); 139 struct hpfs_sb_info *sbi = hpfs_sb(s);
145 u64 id = huge_encode_dev(s->s_bdev->bd_dev); 140 u64 id = huge_encode_dev(s->s_bdev->bd_dev);
146 lock_kernel(); 141 hpfs_lock(s);
147 142
148 /*if (sbi->sb_n_free == -1) {*/ 143 /*if (sbi->sb_n_free == -1) {*/
149 sbi->sb_n_free = count_bitmaps(s); 144 sbi->sb_n_free = count_bitmaps(s);
@@ -160,7 +155,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
160 buf->f_fsid.val[1] = (u32)(id >> 32); 155 buf->f_fsid.val[1] = (u32)(id >> 32);
161 buf->f_namelen = 254; 156 buf->f_namelen = 254;
162 157
163 unlock_kernel(); 158 hpfs_unlock(s);
164 159
165 return 0; 160 return 0;
166} 161}
@@ -406,7 +401,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
406 401
407 *flags |= MS_NOATIME; 402 *flags |= MS_NOATIME;
408 403
409 lock_kernel(); 404 hpfs_lock(s);
410 lock_super(s); 405 lock_super(s);
411 uid = sbi->sb_uid; gid = sbi->sb_gid; 406 uid = sbi->sb_uid; gid = sbi->sb_gid;
412 umask = 0777 & ~sbi->sb_mode; 407 umask = 0777 & ~sbi->sb_mode;
@@ -441,12 +436,12 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
441 replace_mount_options(s, new_opts); 436 replace_mount_options(s, new_opts);
442 437
443 unlock_super(s); 438 unlock_super(s);
444 unlock_kernel(); 439 hpfs_unlock(s);
445 return 0; 440 return 0;
446 441
447out_err: 442out_err:
448 unlock_super(s); 443 unlock_super(s);
449 unlock_kernel(); 444 hpfs_unlock(s);
450 kfree(new_opts); 445 kfree(new_opts);
451 return -EINVAL; 446 return -EINVAL;
452} 447}
@@ -484,13 +479,15 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
484 479
485 int o; 480 int o;
486 481
487 lock_kernel(); 482 if (num_possible_cpus() > 1) {
483 printk(KERN_ERR "HPFS is not SMP safe\n");
484 return -EINVAL;
485 }
488 486
489 save_mount_options(s, options); 487 save_mount_options(s, options);
490 488
491 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 489 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
492 if (!sbi) { 490 if (!sbi) {
493 unlock_kernel();
494 return -ENOMEM; 491 return -ENOMEM;
495 } 492 }
496 s->s_fs_info = sbi; 493 s->s_fs_info = sbi;
@@ -677,7 +674,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
677 root->i_blocks = 5; 674 root->i_blocks = 5;
678 hpfs_brelse4(&qbh); 675 hpfs_brelse4(&qbh);
679 } 676 }
680 unlock_kernel();
681 return 0; 677 return 0;
682 678
683bail4: brelse(bh2); 679bail4: brelse(bh2);
@@ -689,7 +685,6 @@ bail0:
689 kfree(sbi->sb_cp_table); 685 kfree(sbi->sb_cp_table);
690 s->s_fs_info = NULL; 686 s->s_fs_info = NULL;
691 kfree(sbi); 687 kfree(sbi);
692 unlock_kernel();
693 return -EINVAL; 688 return -EINVAL;
694} 689}
695 690
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9885082b470f..b9eeb1cd03ff 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -332,8 +332,7 @@ static void truncate_huge_page(struct page *page)
332{ 332{
333 cancel_dirty_page(page, /* No IO accounting for huge pages? */0); 333 cancel_dirty_page(page, /* No IO accounting for huge pages? */0);
334 ClearPageUptodate(page); 334 ClearPageUptodate(page);
335 remove_from_page_cache(page); 335 delete_from_page_cache(page);
336 put_page(page);
337} 336}
338 337
339static void truncate_hugepages(struct inode *inode, loff_t lstart) 338static void truncate_hugepages(struct inode *inode, loff_t lstart)
diff --git a/fs/inode.c b/fs/inode.c
index da85e56378f3..33c963d08ab4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -25,6 +25,39 @@
25#include <linux/async.h> 25#include <linux/async.h>
26#include <linux/posix_acl.h> 26#include <linux/posix_acl.h>
27#include <linux/ima.h> 27#include <linux/ima.h>
28#include <linux/cred.h>
29#include "internal.h"
30
31/*
32 * inode locking rules.
33 *
34 * inode->i_lock protects:
35 * inode->i_state, inode->i_hash, __iget()
36 * inode_lru_lock protects:
37 * inode_lru, inode->i_lru
38 * inode_sb_list_lock protects:
39 * sb->s_inodes, inode->i_sb_list
40 * inode_wb_list_lock protects:
41 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
42 * inode_hash_lock protects:
43 * inode_hashtable, inode->i_hash
44 *
45 * Lock ordering:
46 *
47 * inode_sb_list_lock
48 * inode->i_lock
49 * inode_lru_lock
50 *
51 * inode_wb_list_lock
52 * inode->i_lock
53 *
54 * inode_hash_lock
55 * inode_sb_list_lock
56 * inode->i_lock
57 *
58 * iunique_lock
59 * inode_hash_lock
60 */
28 61
29/* 62/*
30 * This is needed for the following functions: 63 * This is needed for the following functions:
@@ -59,6 +92,8 @@
59 92
60static unsigned int i_hash_mask __read_mostly; 93static unsigned int i_hash_mask __read_mostly;
61static unsigned int i_hash_shift __read_mostly; 94static unsigned int i_hash_shift __read_mostly;
95static struct hlist_head *inode_hashtable __read_mostly;
96static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
62 97
63/* 98/*
64 * Each inode can be on two separate lists. One is 99 * Each inode can be on two separate lists. One is
@@ -73,29 +108,29 @@ static unsigned int i_hash_shift __read_mostly;
73 */ 108 */
74 109
75static LIST_HEAD(inode_lru); 110static LIST_HEAD(inode_lru);
76static struct hlist_head *inode_hashtable __read_mostly; 111static DEFINE_SPINLOCK(inode_lru_lock);
112
113__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
114__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
77 115
78/* 116/*
79 * A simple spinlock to protect the list manipulations. 117 * iprune_sem provides exclusion between the icache shrinking and the
118 * umount path.
80 * 119 *
81 * NOTE! You also have to own the lock if you change 120 * We don't actually need it to protect anything in the umount path,
82 * the i_state of an inode while it is in use.. 121 * but only need to cycle through it to make sure any inode that
122 * prune_icache took off the LRU list has been fully torn down by the
123 * time we are past evict_inodes.
83 */ 124 */
84DEFINE_SPINLOCK(inode_lock); 125static DECLARE_RWSEM(iprune_sem);
85 126
86/* 127/*
87 * iprune_sem provides exclusion between the kswapd or try_to_free_pages 128 * Empty aops. Can be used for the cases where the user does not
88 * icache shrinking path, and the umount path. Without this exclusion, 129 * define any of the address_space operations.
89 * by the time prune_icache calls iput for the inode whose pages it has
90 * been invalidating, or by the time it calls clear_inode & destroy_inode
91 * from its final dispose_list, the struct super_block they refer to
92 * (for inode->i_sb->s_op) may already have been freed and reused.
93 *
94 * We make this an rwsem because the fastpath is icache shrinking. In
95 * some cases a filesystem may be doing a significant amount of work in
96 * its inode reclaim code, so this should improve parallelism.
97 */ 130 */
98static DECLARE_RWSEM(iprune_sem); 131const struct address_space_operations empty_aops = {
132};
133EXPORT_SYMBOL(empty_aops);
99 134
100/* 135/*
101 * Statistics gathering.. 136 * Statistics gathering..
@@ -139,15 +174,6 @@ int proc_nr_inodes(ctl_table *table, int write,
139} 174}
140#endif 175#endif
141 176
142static void wake_up_inode(struct inode *inode)
143{
144 /*
145 * Prevent speculative execution through spin_unlock(&inode_lock);
146 */
147 smp_mb();
148 wake_up_bit(&inode->i_state, __I_NEW);
149}
150
151/** 177/**
152 * inode_init_always - perform inode structure intialisation 178 * inode_init_always - perform inode structure intialisation
153 * @sb: superblock inode belongs to 179 * @sb: superblock inode belongs to
@@ -158,7 +184,6 @@ static void wake_up_inode(struct inode *inode)
158 */ 184 */
159int inode_init_always(struct super_block *sb, struct inode *inode) 185int inode_init_always(struct super_block *sb, struct inode *inode)
160{ 186{
161 static const struct address_space_operations empty_aops;
162 static const struct inode_operations empty_iops; 187 static const struct inode_operations empty_iops;
163 static const struct file_operations empty_fops; 188 static const struct file_operations empty_fops;
164 struct address_space *const mapping = &inode->i_data; 189 struct address_space *const mapping = &inode->i_data;
@@ -295,6 +320,20 @@ static void destroy_inode(struct inode *inode)
295 call_rcu(&inode->i_rcu, i_callback); 320 call_rcu(&inode->i_rcu, i_callback);
296} 321}
297 322
323void address_space_init_once(struct address_space *mapping)
324{
325 memset(mapping, 0, sizeof(*mapping));
326 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
327 spin_lock_init(&mapping->tree_lock);
328 spin_lock_init(&mapping->i_mmap_lock);
329 INIT_LIST_HEAD(&mapping->private_list);
330 spin_lock_init(&mapping->private_lock);
331 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
332 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
333 mutex_init(&mapping->unmap_mutex);
334}
335EXPORT_SYMBOL(address_space_init_once);
336
298/* 337/*
299 * These are initializations that only need to be done 338 * These are initializations that only need to be done
300 * once, because the fields are idempotent across use 339 * once, because the fields are idempotent across use
@@ -308,13 +347,7 @@ void inode_init_once(struct inode *inode)
308 INIT_LIST_HEAD(&inode->i_devices); 347 INIT_LIST_HEAD(&inode->i_devices);
309 INIT_LIST_HEAD(&inode->i_wb_list); 348 INIT_LIST_HEAD(&inode->i_wb_list);
310 INIT_LIST_HEAD(&inode->i_lru); 349 INIT_LIST_HEAD(&inode->i_lru);
311 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); 350 address_space_init_once(&inode->i_data);
312 spin_lock_init(&inode->i_data.tree_lock);
313 spin_lock_init(&inode->i_data.i_mmap_lock);
314 INIT_LIST_HEAD(&inode->i_data.private_list);
315 spin_lock_init(&inode->i_data.private_lock);
316 INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
317 INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
318 i_size_ordered_init(inode); 351 i_size_ordered_init(inode);
319#ifdef CONFIG_FSNOTIFY 352#ifdef CONFIG_FSNOTIFY
320 INIT_HLIST_HEAD(&inode->i_fsnotify_marks); 353 INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
@@ -330,7 +363,7 @@ static void init_once(void *foo)
330} 363}
331 364
332/* 365/*
333 * inode_lock must be held 366 * inode->i_lock must be held
334 */ 367 */
335void __iget(struct inode *inode) 368void __iget(struct inode *inode)
336{ 369{
@@ -348,23 +381,22 @@ EXPORT_SYMBOL(ihold);
348 381
349static void inode_lru_list_add(struct inode *inode) 382static void inode_lru_list_add(struct inode *inode)
350{ 383{
384 spin_lock(&inode_lru_lock);
351 if (list_empty(&inode->i_lru)) { 385 if (list_empty(&inode->i_lru)) {
352 list_add(&inode->i_lru, &inode_lru); 386 list_add(&inode->i_lru, &inode_lru);
353 inodes_stat.nr_unused++; 387 inodes_stat.nr_unused++;
354 } 388 }
389 spin_unlock(&inode_lru_lock);
355} 390}
356 391
357static void inode_lru_list_del(struct inode *inode) 392static void inode_lru_list_del(struct inode *inode)
358{ 393{
394 spin_lock(&inode_lru_lock);
359 if (!list_empty(&inode->i_lru)) { 395 if (!list_empty(&inode->i_lru)) {
360 list_del_init(&inode->i_lru); 396 list_del_init(&inode->i_lru);
361 inodes_stat.nr_unused--; 397 inodes_stat.nr_unused--;
362 } 398 }
363} 399 spin_unlock(&inode_lru_lock);
364
365static inline void __inode_sb_list_add(struct inode *inode)
366{
367 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
368} 400}
369 401
370/** 402/**
@@ -373,15 +405,17 @@ static inline void __inode_sb_list_add(struct inode *inode)
373 */ 405 */
374void inode_sb_list_add(struct inode *inode) 406void inode_sb_list_add(struct inode *inode)
375{ 407{
376 spin_lock(&inode_lock); 408 spin_lock(&inode_sb_list_lock);
377 __inode_sb_list_add(inode); 409 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
378 spin_unlock(&inode_lock); 410 spin_unlock(&inode_sb_list_lock);
379} 411}
380EXPORT_SYMBOL_GPL(inode_sb_list_add); 412EXPORT_SYMBOL_GPL(inode_sb_list_add);
381 413
382static inline void __inode_sb_list_del(struct inode *inode) 414static inline void inode_sb_list_del(struct inode *inode)
383{ 415{
416 spin_lock(&inode_sb_list_lock);
384 list_del_init(&inode->i_sb_list); 417 list_del_init(&inode->i_sb_list);
418 spin_unlock(&inode_sb_list_lock);
385} 419}
386 420
387static unsigned long hash(struct super_block *sb, unsigned long hashval) 421static unsigned long hash(struct super_block *sb, unsigned long hashval)
@@ -406,24 +440,15 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
406{ 440{
407 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); 441 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
408 442
409 spin_lock(&inode_lock); 443 spin_lock(&inode_hash_lock);
444 spin_lock(&inode->i_lock);
410 hlist_add_head(&inode->i_hash, b); 445 hlist_add_head(&inode->i_hash, b);
411 spin_unlock(&inode_lock); 446 spin_unlock(&inode->i_lock);
447 spin_unlock(&inode_hash_lock);
412} 448}
413EXPORT_SYMBOL(__insert_inode_hash); 449EXPORT_SYMBOL(__insert_inode_hash);
414 450
415/** 451/**
416 * __remove_inode_hash - remove an inode from the hash
417 * @inode: inode to unhash
418 *
419 * Remove an inode from the superblock.
420 */
421static void __remove_inode_hash(struct inode *inode)
422{
423 hlist_del_init(&inode->i_hash);
424}
425
426/**
427 * remove_inode_hash - remove an inode from the hash 452 * remove_inode_hash - remove an inode from the hash
428 * @inode: inode to unhash 453 * @inode: inode to unhash
429 * 454 *
@@ -431,9 +456,11 @@ static void __remove_inode_hash(struct inode *inode)
431 */ 456 */
432void remove_inode_hash(struct inode *inode) 457void remove_inode_hash(struct inode *inode)
433{ 458{
434 spin_lock(&inode_lock); 459 spin_lock(&inode_hash_lock);
460 spin_lock(&inode->i_lock);
435 hlist_del_init(&inode->i_hash); 461 hlist_del_init(&inode->i_hash);
436 spin_unlock(&inode_lock); 462 spin_unlock(&inode->i_lock);
463 spin_unlock(&inode_hash_lock);
437} 464}
438EXPORT_SYMBOL(remove_inode_hash); 465EXPORT_SYMBOL(remove_inode_hash);
439 466
@@ -450,10 +477,29 @@ void end_writeback(struct inode *inode)
450} 477}
451EXPORT_SYMBOL(end_writeback); 478EXPORT_SYMBOL(end_writeback);
452 479
480/*
481 * Free the inode passed in, removing it from the lists it is still connected
482 * to. We remove any pages still attached to the inode and wait for any IO that
483 * is still in progress before finally destroying the inode.
484 *
485 * An inode must already be marked I_FREEING so that we avoid the inode being
486 * moved back onto lists if we race with other code that manipulates the lists
487 * (e.g. writeback_single_inode). The caller is responsible for setting this.
488 *
489 * An inode must already be removed from the LRU list before being evicted from
490 * the cache. This should occur atomically with setting the I_FREEING state
491 * flag, so no inodes here should ever be on the LRU when being evicted.
492 */
453static void evict(struct inode *inode) 493static void evict(struct inode *inode)
454{ 494{
455 const struct super_operations *op = inode->i_sb->s_op; 495 const struct super_operations *op = inode->i_sb->s_op;
456 496
497 BUG_ON(!(inode->i_state & I_FREEING));
498 BUG_ON(!list_empty(&inode->i_lru));
499
500 inode_wb_list_del(inode);
501 inode_sb_list_del(inode);
502
457 if (op->evict_inode) { 503 if (op->evict_inode) {
458 op->evict_inode(inode); 504 op->evict_inode(inode);
459 } else { 505 } else {
@@ -465,6 +511,15 @@ static void evict(struct inode *inode)
465 bd_forget(inode); 511 bd_forget(inode);
466 if (S_ISCHR(inode->i_mode) && inode->i_cdev) 512 if (S_ISCHR(inode->i_mode) && inode->i_cdev)
467 cd_forget(inode); 513 cd_forget(inode);
514
515 remove_inode_hash(inode);
516
517 spin_lock(&inode->i_lock);
518 wake_up_bit(&inode->i_state, __I_NEW);
519 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
520 spin_unlock(&inode->i_lock);
521
522 destroy_inode(inode);
468} 523}
469 524
470/* 525/*
@@ -483,14 +538,6 @@ static void dispose_list(struct list_head *head)
483 list_del_init(&inode->i_lru); 538 list_del_init(&inode->i_lru);
484 539
485 evict(inode); 540 evict(inode);
486
487 spin_lock(&inode_lock);
488 __remove_inode_hash(inode);
489 __inode_sb_list_del(inode);
490 spin_unlock(&inode_lock);
491
492 wake_up_inode(inode);
493 destroy_inode(inode);
494 } 541 }
495} 542}
496 543
@@ -508,74 +555,77 @@ void evict_inodes(struct super_block *sb)
508 struct inode *inode, *next; 555 struct inode *inode, *next;
509 LIST_HEAD(dispose); 556 LIST_HEAD(dispose);
510 557
511 down_write(&iprune_sem); 558 spin_lock(&inode_sb_list_lock);
512
513 spin_lock(&inode_lock);
514 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 559 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
515 if (atomic_read(&inode->i_count)) 560 if (atomic_read(&inode->i_count))
516 continue; 561 continue;
517 562
563 spin_lock(&inode->i_lock);
518 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 564 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
519 WARN_ON(1); 565 spin_unlock(&inode->i_lock);
520 continue; 566 continue;
521 } 567 }
522 568
523 inode->i_state |= I_FREEING; 569 inode->i_state |= I_FREEING;
524 570 inode_lru_list_del(inode);
525 /* 571 spin_unlock(&inode->i_lock);
526 * Move the inode off the IO lists and LRU once I_FREEING is 572 list_add(&inode->i_lru, &dispose);
527 * set so that it won't get moved back on there if it is dirty.
528 */
529 list_move(&inode->i_lru, &dispose);
530 list_del_init(&inode->i_wb_list);
531 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
532 inodes_stat.nr_unused--;
533 } 573 }
534 spin_unlock(&inode_lock); 574 spin_unlock(&inode_sb_list_lock);
535 575
536 dispose_list(&dispose); 576 dispose_list(&dispose);
577
578 /*
579 * Cycle through iprune_sem to make sure any inode that prune_icache
580 * moved off the list before we took the lock has been fully torn
581 * down.
582 */
583 down_write(&iprune_sem);
537 up_write(&iprune_sem); 584 up_write(&iprune_sem);
538} 585}
539 586
540/** 587/**
541 * invalidate_inodes - attempt to free all inodes on a superblock 588 * invalidate_inodes - attempt to free all inodes on a superblock
542 * @sb: superblock to operate on 589 * @sb: superblock to operate on
590 * @kill_dirty: flag to guide handling of dirty inodes
543 * 591 *
544 * Attempts to free all inodes for a given superblock. If there were any 592 * Attempts to free all inodes for a given superblock. If there were any
545 * busy inodes return a non-zero value, else zero. 593 * busy inodes return a non-zero value, else zero.
594 * If @kill_dirty is set, discard dirty inodes too, otherwise treat
595 * them as busy.
546 */ 596 */
547int invalidate_inodes(struct super_block *sb) 597int invalidate_inodes(struct super_block *sb, bool kill_dirty)
548{ 598{
549 int busy = 0; 599 int busy = 0;
550 struct inode *inode, *next; 600 struct inode *inode, *next;
551 LIST_HEAD(dispose); 601 LIST_HEAD(dispose);
552 602
553 down_write(&iprune_sem); 603 spin_lock(&inode_sb_list_lock);
554
555 spin_lock(&inode_lock);
556 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 604 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
557 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) 605 spin_lock(&inode->i_lock);
606 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
607 spin_unlock(&inode->i_lock);
558 continue; 608 continue;
609 }
610 if (inode->i_state & I_DIRTY && !kill_dirty) {
611 spin_unlock(&inode->i_lock);
612 busy = 1;
613 continue;
614 }
559 if (atomic_read(&inode->i_count)) { 615 if (atomic_read(&inode->i_count)) {
616 spin_unlock(&inode->i_lock);
560 busy = 1; 617 busy = 1;
561 continue; 618 continue;
562 } 619 }
563 620
564 inode->i_state |= I_FREEING; 621 inode->i_state |= I_FREEING;
565 622 inode_lru_list_del(inode);
566 /* 623 spin_unlock(&inode->i_lock);
567 * Move the inode off the IO lists and LRU once I_FREEING is 624 list_add(&inode->i_lru, &dispose);
568 * set so that it won't get moved back on there if it is dirty.
569 */
570 list_move(&inode->i_lru, &dispose);
571 list_del_init(&inode->i_wb_list);
572 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
573 inodes_stat.nr_unused--;
574 } 625 }
575 spin_unlock(&inode_lock); 626 spin_unlock(&inode_sb_list_lock);
576 627
577 dispose_list(&dispose); 628 dispose_list(&dispose);
578 up_write(&iprune_sem);
579 629
580 return busy; 630 return busy;
581} 631}
@@ -595,7 +645,7 @@ static int can_unuse(struct inode *inode)
595 645
596/* 646/*
597 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a 647 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
598 * temporary list and then are freed outside inode_lock by dispose_list(). 648 * temporary list and then are freed outside inode_lru_lock by dispose_list().
599 * 649 *
600 * Any inodes which are pinned purely because of attached pagecache have their 650 * Any inodes which are pinned purely because of attached pagecache have their
601 * pagecache removed. If the inode has metadata buffers attached to 651 * pagecache removed. If the inode has metadata buffers attached to
@@ -616,7 +666,7 @@ static void prune_icache(int nr_to_scan)
616 unsigned long reap = 0; 666 unsigned long reap = 0;
617 667
618 down_read(&iprune_sem); 668 down_read(&iprune_sem);
619 spin_lock(&inode_lock); 669 spin_lock(&inode_lru_lock);
620 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { 670 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
621 struct inode *inode; 671 struct inode *inode;
622 672
@@ -626,53 +676,67 @@ static void prune_icache(int nr_to_scan)
626 inode = list_entry(inode_lru.prev, struct inode, i_lru); 676 inode = list_entry(inode_lru.prev, struct inode, i_lru);
627 677
628 /* 678 /*
679 * we are inverting the inode_lru_lock/inode->i_lock here,
680 * so use a trylock. If we fail to get the lock, just move the
681 * inode to the back of the list so we don't spin on it.
682 */
683 if (!spin_trylock(&inode->i_lock)) {
684 list_move(&inode->i_lru, &inode_lru);
685 continue;
686 }
687
688 /*
629 * Referenced or dirty inodes are still in use. Give them 689 * Referenced or dirty inodes are still in use. Give them
630 * another pass through the LRU as we canot reclaim them now. 690 * another pass through the LRU as we canot reclaim them now.
631 */ 691 */
632 if (atomic_read(&inode->i_count) || 692 if (atomic_read(&inode->i_count) ||
633 (inode->i_state & ~I_REFERENCED)) { 693 (inode->i_state & ~I_REFERENCED)) {
634 list_del_init(&inode->i_lru); 694 list_del_init(&inode->i_lru);
695 spin_unlock(&inode->i_lock);
635 inodes_stat.nr_unused--; 696 inodes_stat.nr_unused--;
636 continue; 697 continue;
637 } 698 }
638 699
639 /* recently referenced inodes get one more pass */ 700 /* recently referenced inodes get one more pass */
640 if (inode->i_state & I_REFERENCED) { 701 if (inode->i_state & I_REFERENCED) {
641 list_move(&inode->i_lru, &inode_lru);
642 inode->i_state &= ~I_REFERENCED; 702 inode->i_state &= ~I_REFERENCED;
703 list_move(&inode->i_lru, &inode_lru);
704 spin_unlock(&inode->i_lock);
643 continue; 705 continue;
644 } 706 }
645 if (inode_has_buffers(inode) || inode->i_data.nrpages) { 707 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
646 __iget(inode); 708 __iget(inode);
647 spin_unlock(&inode_lock); 709 spin_unlock(&inode->i_lock);
710 spin_unlock(&inode_lru_lock);
648 if (remove_inode_buffers(inode)) 711 if (remove_inode_buffers(inode))
649 reap += invalidate_mapping_pages(&inode->i_data, 712 reap += invalidate_mapping_pages(&inode->i_data,
650 0, -1); 713 0, -1);
651 iput(inode); 714 iput(inode);
652 spin_lock(&inode_lock); 715 spin_lock(&inode_lru_lock);
653 716
654 if (inode != list_entry(inode_lru.next, 717 if (inode != list_entry(inode_lru.next,
655 struct inode, i_lru)) 718 struct inode, i_lru))
656 continue; /* wrong inode or list_empty */ 719 continue; /* wrong inode or list_empty */
657 if (!can_unuse(inode)) 720 /* avoid lock inversions with trylock */
721 if (!spin_trylock(&inode->i_lock))
722 continue;
723 if (!can_unuse(inode)) {
724 spin_unlock(&inode->i_lock);
658 continue; 725 continue;
726 }
659 } 727 }
660 WARN_ON(inode->i_state & I_NEW); 728 WARN_ON(inode->i_state & I_NEW);
661 inode->i_state |= I_FREEING; 729 inode->i_state |= I_FREEING;
730 spin_unlock(&inode->i_lock);
662 731
663 /*
664 * Move the inode off the IO lists and LRU once I_FREEING is
665 * set so that it won't get moved back on there if it is dirty.
666 */
667 list_move(&inode->i_lru, &freeable); 732 list_move(&inode->i_lru, &freeable);
668 list_del_init(&inode->i_wb_list);
669 inodes_stat.nr_unused--; 733 inodes_stat.nr_unused--;
670 } 734 }
671 if (current_is_kswapd()) 735 if (current_is_kswapd())
672 __count_vm_events(KSWAPD_INODESTEAL, reap); 736 __count_vm_events(KSWAPD_INODESTEAL, reap);
673 else 737 else
674 __count_vm_events(PGINODESTEAL, reap); 738 __count_vm_events(PGINODESTEAL, reap);
675 spin_unlock(&inode_lock); 739 spin_unlock(&inode_lru_lock);
676 740
677 dispose_list(&freeable); 741 dispose_list(&freeable);
678 up_read(&iprune_sem); 742 up_read(&iprune_sem);
@@ -721,15 +785,21 @@ static struct inode *find_inode(struct super_block *sb,
721 785
722repeat: 786repeat:
723 hlist_for_each_entry(inode, node, head, i_hash) { 787 hlist_for_each_entry(inode, node, head, i_hash) {
724 if (inode->i_sb != sb) 788 spin_lock(&inode->i_lock);
789 if (inode->i_sb != sb) {
790 spin_unlock(&inode->i_lock);
725 continue; 791 continue;
726 if (!test(inode, data)) 792 }
793 if (!test(inode, data)) {
794 spin_unlock(&inode->i_lock);
727 continue; 795 continue;
796 }
728 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 797 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
729 __wait_on_freeing_inode(inode); 798 __wait_on_freeing_inode(inode);
730 goto repeat; 799 goto repeat;
731 } 800 }
732 __iget(inode); 801 __iget(inode);
802 spin_unlock(&inode->i_lock);
733 return inode; 803 return inode;
734 } 804 }
735 return NULL; 805 return NULL;
@@ -747,15 +817,21 @@ static struct inode *find_inode_fast(struct super_block *sb,
747 817
748repeat: 818repeat:
749 hlist_for_each_entry(inode, node, head, i_hash) { 819 hlist_for_each_entry(inode, node, head, i_hash) {
750 if (inode->i_ino != ino) 820 spin_lock(&inode->i_lock);
821 if (inode->i_ino != ino) {
822 spin_unlock(&inode->i_lock);
751 continue; 823 continue;
752 if (inode->i_sb != sb) 824 }
825 if (inode->i_sb != sb) {
826 spin_unlock(&inode->i_lock);
753 continue; 827 continue;
828 }
754 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 829 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
755 __wait_on_freeing_inode(inode); 830 __wait_on_freeing_inode(inode);
756 goto repeat; 831 goto repeat;
757 } 832 }
758 __iget(inode); 833 __iget(inode);
834 spin_unlock(&inode->i_lock);
759 return inode; 835 return inode;
760 } 836 }
761 return NULL; 837 return NULL;
@@ -815,19 +891,26 @@ struct inode *new_inode(struct super_block *sb)
815{ 891{
816 struct inode *inode; 892 struct inode *inode;
817 893
818 spin_lock_prefetch(&inode_lock); 894 spin_lock_prefetch(&inode_sb_list_lock);
819 895
820 inode = alloc_inode(sb); 896 inode = alloc_inode(sb);
821 if (inode) { 897 if (inode) {
822 spin_lock(&inode_lock); 898 spin_lock(&inode->i_lock);
823 __inode_sb_list_add(inode);
824 inode->i_state = 0; 899 inode->i_state = 0;
825 spin_unlock(&inode_lock); 900 spin_unlock(&inode->i_lock);
901 inode_sb_list_add(inode);
826 } 902 }
827 return inode; 903 return inode;
828} 904}
829EXPORT_SYMBOL(new_inode); 905EXPORT_SYMBOL(new_inode);
830 906
907/**
908 * unlock_new_inode - clear the I_NEW state and wake up any waiters
909 * @inode: new inode to unlock
910 *
911 * Called when the inode is fully initialised to clear the new state of the
912 * inode and wake up anyone waiting for the inode to finish initialisation.
913 */
831void unlock_new_inode(struct inode *inode) 914void unlock_new_inode(struct inode *inode)
832{ 915{
833#ifdef CONFIG_DEBUG_LOCK_ALLOC 916#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -847,51 +930,67 @@ void unlock_new_inode(struct inode *inode)
847 } 930 }
848 } 931 }
849#endif 932#endif
850 /* 933 spin_lock(&inode->i_lock);
851 * This is special! We do not need the spinlock when clearing I_NEW,
852 * because we're guaranteed that nobody else tries to do anything about
853 * the state of the inode when it is locked, as we just created it (so
854 * there can be no old holders that haven't tested I_NEW).
855 * However we must emit the memory barrier so that other CPUs reliably
856 * see the clearing of I_NEW after the other inode initialisation has
857 * completed.
858 */
859 smp_mb();
860 WARN_ON(!(inode->i_state & I_NEW)); 934 WARN_ON(!(inode->i_state & I_NEW));
861 inode->i_state &= ~I_NEW; 935 inode->i_state &= ~I_NEW;
862 wake_up_inode(inode); 936 wake_up_bit(&inode->i_state, __I_NEW);
937 spin_unlock(&inode->i_lock);
863} 938}
864EXPORT_SYMBOL(unlock_new_inode); 939EXPORT_SYMBOL(unlock_new_inode);
865 940
866/* 941/**
867 * This is called without the inode lock held.. Be careful. 942 * iget5_locked - obtain an inode from a mounted file system
943 * @sb: super block of file system
944 * @hashval: hash value (usually inode number) to get
945 * @test: callback used for comparisons between inodes
946 * @set: callback used to initialize a new struct inode
947 * @data: opaque data pointer to pass to @test and @set
948 *
949 * Search for the inode specified by @hashval and @data in the inode cache,
950 * and if present it is return it with an increased reference count. This is
951 * a generalized version of iget_locked() for file systems where the inode
952 * number is not sufficient for unique identification of an inode.
868 * 953 *
869 * We no longer cache the sb_flags in i_flags - see fs.h 954 * If the inode is not in cache, allocate a new inode and return it locked,
870 * -- rmk@arm.uk.linux.org 955 * hashed, and with the I_NEW flag set. The file system gets to fill it in
956 * before unlocking it via unlock_new_inode().
957 *
958 * Note both @test and @set are called with the inode_hash_lock held, so can't
959 * sleep.
871 */ 960 */
872static struct inode *get_new_inode(struct super_block *sb, 961struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
873 struct hlist_head *head, 962 int (*test)(struct inode *, void *),
874 int (*test)(struct inode *, void *), 963 int (*set)(struct inode *, void *), void *data)
875 int (*set)(struct inode *, void *),
876 void *data)
877{ 964{
965 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
878 struct inode *inode; 966 struct inode *inode;
879 967
968 spin_lock(&inode_hash_lock);
969 inode = find_inode(sb, head, test, data);
970 spin_unlock(&inode_hash_lock);
971
972 if (inode) {
973 wait_on_inode(inode);
974 return inode;
975 }
976
880 inode = alloc_inode(sb); 977 inode = alloc_inode(sb);
881 if (inode) { 978 if (inode) {
882 struct inode *old; 979 struct inode *old;
883 980
884 spin_lock(&inode_lock); 981 spin_lock(&inode_hash_lock);
885 /* We released the lock, so.. */ 982 /* We released the lock, so.. */
886 old = find_inode(sb, head, test, data); 983 old = find_inode(sb, head, test, data);
887 if (!old) { 984 if (!old) {
888 if (set(inode, data)) 985 if (set(inode, data))
889 goto set_failed; 986 goto set_failed;
890 987
891 hlist_add_head(&inode->i_hash, head); 988 spin_lock(&inode->i_lock);
892 __inode_sb_list_add(inode);
893 inode->i_state = I_NEW; 989 inode->i_state = I_NEW;
894 spin_unlock(&inode_lock); 990 hlist_add_head(&inode->i_hash, head);
991 spin_unlock(&inode->i_lock);
992 inode_sb_list_add(inode);
993 spin_unlock(&inode_hash_lock);
895 994
896 /* Return the locked inode with I_NEW set, the 995 /* Return the locked inode with I_NEW set, the
897 * caller is responsible for filling in the contents 996 * caller is responsible for filling in the contents
@@ -904,7 +1003,7 @@ static struct inode *get_new_inode(struct super_block *sb,
904 * us. Use the old inode instead of the one we just 1003 * us. Use the old inode instead of the one we just
905 * allocated. 1004 * allocated.
906 */ 1005 */
907 spin_unlock(&inode_lock); 1006 spin_unlock(&inode_hash_lock);
908 destroy_inode(inode); 1007 destroy_inode(inode);
909 inode = old; 1008 inode = old;
910 wait_on_inode(inode); 1009 wait_on_inode(inode);
@@ -912,33 +1011,53 @@ static struct inode *get_new_inode(struct super_block *sb,
912 return inode; 1011 return inode;
913 1012
914set_failed: 1013set_failed:
915 spin_unlock(&inode_lock); 1014 spin_unlock(&inode_hash_lock);
916 destroy_inode(inode); 1015 destroy_inode(inode);
917 return NULL; 1016 return NULL;
918} 1017}
1018EXPORT_SYMBOL(iget5_locked);
919 1019
920/* 1020/**
921 * get_new_inode_fast is the fast path version of get_new_inode, see the 1021 * iget_locked - obtain an inode from a mounted file system
922 * comment at iget_locked for details. 1022 * @sb: super block of file system
1023 * @ino: inode number to get
1024 *
1025 * Search for the inode specified by @ino in the inode cache and if present
1026 * return it with an increased reference count. This is for file systems
1027 * where the inode number is sufficient for unique identification of an inode.
1028 *
1029 * If the inode is not in cache, allocate a new inode and return it locked,
1030 * hashed, and with the I_NEW flag set. The file system gets to fill it in
1031 * before unlocking it via unlock_new_inode().
923 */ 1032 */
924static struct inode *get_new_inode_fast(struct super_block *sb, 1033struct inode *iget_locked(struct super_block *sb, unsigned long ino)
925 struct hlist_head *head, unsigned long ino)
926{ 1034{
1035 struct hlist_head *head = inode_hashtable + hash(sb, ino);
927 struct inode *inode; 1036 struct inode *inode;
928 1037
1038 spin_lock(&inode_hash_lock);
1039 inode = find_inode_fast(sb, head, ino);
1040 spin_unlock(&inode_hash_lock);
1041 if (inode) {
1042 wait_on_inode(inode);
1043 return inode;
1044 }
1045
929 inode = alloc_inode(sb); 1046 inode = alloc_inode(sb);
930 if (inode) { 1047 if (inode) {
931 struct inode *old; 1048 struct inode *old;
932 1049
933 spin_lock(&inode_lock); 1050 spin_lock(&inode_hash_lock);
934 /* We released the lock, so.. */ 1051 /* We released the lock, so.. */
935 old = find_inode_fast(sb, head, ino); 1052 old = find_inode_fast(sb, head, ino);
936 if (!old) { 1053 if (!old) {
937 inode->i_ino = ino; 1054 inode->i_ino = ino;
938 hlist_add_head(&inode->i_hash, head); 1055 spin_lock(&inode->i_lock);
939 __inode_sb_list_add(inode);
940 inode->i_state = I_NEW; 1056 inode->i_state = I_NEW;
941 spin_unlock(&inode_lock); 1057 hlist_add_head(&inode->i_hash, head);
1058 spin_unlock(&inode->i_lock);
1059 inode_sb_list_add(inode);
1060 spin_unlock(&inode_hash_lock);
942 1061
943 /* Return the locked inode with I_NEW set, the 1062 /* Return the locked inode with I_NEW set, the
944 * caller is responsible for filling in the contents 1063 * caller is responsible for filling in the contents
@@ -951,13 +1070,14 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
951 * us. Use the old inode instead of the one we just 1070 * us. Use the old inode instead of the one we just
952 * allocated. 1071 * allocated.
953 */ 1072 */
954 spin_unlock(&inode_lock); 1073 spin_unlock(&inode_hash_lock);
955 destroy_inode(inode); 1074 destroy_inode(inode);
956 inode = old; 1075 inode = old;
957 wait_on_inode(inode); 1076 wait_on_inode(inode);
958 } 1077 }
959 return inode; 1078 return inode;
960} 1079}
1080EXPORT_SYMBOL(iget_locked);
961 1081
962/* 1082/*
963 * search the inode cache for a matching inode number. 1083 * search the inode cache for a matching inode number.
@@ -972,10 +1092,14 @@ static int test_inode_iunique(struct super_block *sb, unsigned long ino)
972 struct hlist_node *node; 1092 struct hlist_node *node;
973 struct inode *inode; 1093 struct inode *inode;
974 1094
1095 spin_lock(&inode_hash_lock);
975 hlist_for_each_entry(inode, node, b, i_hash) { 1096 hlist_for_each_entry(inode, node, b, i_hash) {
976 if (inode->i_ino == ino && inode->i_sb == sb) 1097 if (inode->i_ino == ino && inode->i_sb == sb) {
1098 spin_unlock(&inode_hash_lock);
977 return 0; 1099 return 0;
1100 }
978 } 1101 }
1102 spin_unlock(&inode_hash_lock);
979 1103
980 return 1; 1104 return 1;
981} 1105}
@@ -1005,7 +1129,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
1005 static unsigned int counter; 1129 static unsigned int counter;
1006 ino_t res; 1130 ino_t res;
1007 1131
1008 spin_lock(&inode_lock);
1009 spin_lock(&iunique_lock); 1132 spin_lock(&iunique_lock);
1010 do { 1133 do {
1011 if (counter <= max_reserved) 1134 if (counter <= max_reserved)
@@ -1013,7 +1136,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
1013 res = counter++; 1136 res = counter++;
1014 } while (!test_inode_iunique(sb, res)); 1137 } while (!test_inode_iunique(sb, res));
1015 spin_unlock(&iunique_lock); 1138 spin_unlock(&iunique_lock);
1016 spin_unlock(&inode_lock);
1017 1139
1018 return res; 1140 return res;
1019} 1141}
@@ -1021,116 +1143,50 @@ EXPORT_SYMBOL(iunique);
1021 1143
1022struct inode *igrab(struct inode *inode) 1144struct inode *igrab(struct inode *inode)
1023{ 1145{
1024 spin_lock(&inode_lock); 1146 spin_lock(&inode->i_lock);
1025 if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) 1147 if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
1026 __iget(inode); 1148 __iget(inode);
1027 else 1149 spin_unlock(&inode->i_lock);
1150 } else {
1151 spin_unlock(&inode->i_lock);
1028 /* 1152 /*
1029 * Handle the case where s_op->clear_inode is not been 1153 * Handle the case where s_op->clear_inode is not been
1030 * called yet, and somebody is calling igrab 1154 * called yet, and somebody is calling igrab
1031 * while the inode is getting freed. 1155 * while the inode is getting freed.
1032 */ 1156 */
1033 inode = NULL; 1157 inode = NULL;
1034 spin_unlock(&inode_lock); 1158 }
1035 return inode; 1159 return inode;
1036} 1160}
1037EXPORT_SYMBOL(igrab); 1161EXPORT_SYMBOL(igrab);
1038 1162
1039/** 1163/**
1040 * ifind - internal function, you want ilookup5() or iget5().
1041 * @sb: super block of file system to search
1042 * @head: the head of the list to search
1043 * @test: callback used for comparisons between inodes
1044 * @data: opaque data pointer to pass to @test
1045 * @wait: if true wait for the inode to be unlocked, if false do not
1046 *
1047 * ifind() searches for the inode specified by @data in the inode
1048 * cache. This is a generalized version of ifind_fast() for file systems where
1049 * the inode number is not sufficient for unique identification of an inode.
1050 *
1051 * If the inode is in the cache, the inode is returned with an incremented
1052 * reference count.
1053 *
1054 * Otherwise NULL is returned.
1055 *
1056 * Note, @test is called with the inode_lock held, so can't sleep.
1057 */
1058static struct inode *ifind(struct super_block *sb,
1059 struct hlist_head *head, int (*test)(struct inode *, void *),
1060 void *data, const int wait)
1061{
1062 struct inode *inode;
1063
1064 spin_lock(&inode_lock);
1065 inode = find_inode(sb, head, test, data);
1066 if (inode) {
1067 spin_unlock(&inode_lock);
1068 if (likely(wait))
1069 wait_on_inode(inode);
1070 return inode;
1071 }
1072 spin_unlock(&inode_lock);
1073 return NULL;
1074}
1075
1076/**
1077 * ifind_fast - internal function, you want ilookup() or iget().
1078 * @sb: super block of file system to search
1079 * @head: head of the list to search
1080 * @ino: inode number to search for
1081 *
1082 * ifind_fast() searches for the inode @ino in the inode cache. This is for
1083 * file systems where the inode number is sufficient for unique identification
1084 * of an inode.
1085 *
1086 * If the inode is in the cache, the inode is returned with an incremented
1087 * reference count.
1088 *
1089 * Otherwise NULL is returned.
1090 */
1091static struct inode *ifind_fast(struct super_block *sb,
1092 struct hlist_head *head, unsigned long ino)
1093{
1094 struct inode *inode;
1095
1096 spin_lock(&inode_lock);
1097 inode = find_inode_fast(sb, head, ino);
1098 if (inode) {
1099 spin_unlock(&inode_lock);
1100 wait_on_inode(inode);
1101 return inode;
1102 }
1103 spin_unlock(&inode_lock);
1104 return NULL;
1105}
1106
1107/**
1108 * ilookup5_nowait - search for an inode in the inode cache 1164 * ilookup5_nowait - search for an inode in the inode cache
1109 * @sb: super block of file system to search 1165 * @sb: super block of file system to search
1110 * @hashval: hash value (usually inode number) to search for 1166 * @hashval: hash value (usually inode number) to search for
1111 * @test: callback used for comparisons between inodes 1167 * @test: callback used for comparisons between inodes
1112 * @data: opaque data pointer to pass to @test 1168 * @data: opaque data pointer to pass to @test
1113 * 1169 *
1114 * ilookup5() uses ifind() to search for the inode specified by @hashval and 1170 * Search for the inode specified by @hashval and @data in the inode cache.
1115 * @data in the inode cache. This is a generalized version of ilookup() for
1116 * file systems where the inode number is not sufficient for unique
1117 * identification of an inode.
1118 *
1119 * If the inode is in the cache, the inode is returned with an incremented 1171 * If the inode is in the cache, the inode is returned with an incremented
1120 * reference count. Note, the inode lock is not waited upon so you have to be 1172 * reference count.
1121 * very careful what you do with the returned inode. You probably should be
1122 * using ilookup5() instead.
1123 * 1173 *
1124 * Otherwise NULL is returned. 1174 * Note: I_NEW is not waited upon so you have to be very careful what you do
1175 * with the returned inode. You probably should be using ilookup5() instead.
1125 * 1176 *
1126 * Note, @test is called with the inode_lock held, so can't sleep. 1177 * Note2: @test is called with the inode_hash_lock held, so can't sleep.
1127 */ 1178 */
1128struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, 1179struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
1129 int (*test)(struct inode *, void *), void *data) 1180 int (*test)(struct inode *, void *), void *data)
1130{ 1181{
1131 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1182 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1183 struct inode *inode;
1132 1184
1133 return ifind(sb, head, test, data, 0); 1185 spin_lock(&inode_hash_lock);
1186 inode = find_inode(sb, head, test, data);
1187 spin_unlock(&inode_hash_lock);
1188
1189 return inode;
1134} 1190}
1135EXPORT_SYMBOL(ilookup5_nowait); 1191EXPORT_SYMBOL(ilookup5_nowait);
1136 1192
@@ -1141,24 +1197,24 @@ EXPORT_SYMBOL(ilookup5_nowait);
1141 * @test: callback used for comparisons between inodes 1197 * @test: callback used for comparisons between inodes
1142 * @data: opaque data pointer to pass to @test 1198 * @data: opaque data pointer to pass to @test
1143 * 1199 *
1144 * ilookup5() uses ifind() to search for the inode specified by @hashval and 1200 * Search for the inode specified by @hashval and @data in the inode cache,
1145 * @data in the inode cache. This is a generalized version of ilookup() for 1201 * and if the inode is in the cache, return the inode with an incremented
1146 * file systems where the inode number is not sufficient for unique 1202 * reference count. Waits on I_NEW before returning the inode.
1147 * identification of an inode.
1148 *
1149 * If the inode is in the cache, the inode lock is waited upon and the inode is
1150 * returned with an incremented reference count. 1203 * returned with an incremented reference count.
1151 * 1204 *
1152 * Otherwise NULL is returned. 1205 * This is a generalized version of ilookup() for file systems where the
1206 * inode number is not sufficient for unique identification of an inode.
1153 * 1207 *
1154 * Note, @test is called with the inode_lock held, so can't sleep. 1208 * Note: @test is called with the inode_hash_lock held, so can't sleep.
1155 */ 1209 */
1156struct inode *ilookup5(struct super_block *sb, unsigned long hashval, 1210struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
1157 int (*test)(struct inode *, void *), void *data) 1211 int (*test)(struct inode *, void *), void *data)
1158{ 1212{
1159 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1213 struct inode *inode = ilookup5_nowait(sb, hashval, test, data);
1160 1214
1161 return ifind(sb, head, test, data, 1); 1215 if (inode)
1216 wait_on_inode(inode);
1217 return inode;
1162} 1218}
1163EXPORT_SYMBOL(ilookup5); 1219EXPORT_SYMBOL(ilookup5);
1164 1220
@@ -1167,91 +1223,23 @@ EXPORT_SYMBOL(ilookup5);
1167 * @sb: super block of file system to search 1223 * @sb: super block of file system to search
1168 * @ino: inode number to search for 1224 * @ino: inode number to search for
1169 * 1225 *
1170 * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. 1226 * Search for the inode @ino in the inode cache, and if the inode is in the
1171 * This is for file systems where the inode number is sufficient for unique 1227 * cache, the inode is returned with an incremented reference count.
1172 * identification of an inode.
1173 *
1174 * If the inode is in the cache, the inode is returned with an incremented
1175 * reference count.
1176 *
1177 * Otherwise NULL is returned.
1178 */ 1228 */
1179struct inode *ilookup(struct super_block *sb, unsigned long ino) 1229struct inode *ilookup(struct super_block *sb, unsigned long ino)
1180{ 1230{
1181 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1231 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1182
1183 return ifind_fast(sb, head, ino);
1184}
1185EXPORT_SYMBOL(ilookup);
1186
1187/**
1188 * iget5_locked - obtain an inode from a mounted file system
1189 * @sb: super block of file system
1190 * @hashval: hash value (usually inode number) to get
1191 * @test: callback used for comparisons between inodes
1192 * @set: callback used to initialize a new struct inode
1193 * @data: opaque data pointer to pass to @test and @set
1194 *
1195 * iget5_locked() uses ifind() to search for the inode specified by @hashval
1196 * and @data in the inode cache and if present it is returned with an increased
1197 * reference count. This is a generalized version of iget_locked() for file
1198 * systems where the inode number is not sufficient for unique identification
1199 * of an inode.
1200 *
1201 * If the inode is not in cache, get_new_inode() is called to allocate a new
1202 * inode and this is returned locked, hashed, and with the I_NEW flag set. The
1203 * file system gets to fill it in before unlocking it via unlock_new_inode().
1204 *
1205 * Note both @test and @set are called with the inode_lock held, so can't sleep.
1206 */
1207struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
1208 int (*test)(struct inode *, void *),
1209 int (*set)(struct inode *, void *), void *data)
1210{
1211 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1212 struct inode *inode; 1232 struct inode *inode;
1213 1233
1214 inode = ifind(sb, head, test, data, 1); 1234 spin_lock(&inode_hash_lock);
1215 if (inode) 1235 inode = find_inode_fast(sb, head, ino);
1216 return inode; 1236 spin_unlock(&inode_hash_lock);
1217 /*
1218 * get_new_inode() will do the right thing, re-trying the search
1219 * in case it had to block at any point.
1220 */
1221 return get_new_inode(sb, head, test, set, data);
1222}
1223EXPORT_SYMBOL(iget5_locked);
1224
1225/**
1226 * iget_locked - obtain an inode from a mounted file system
1227 * @sb: super block of file system
1228 * @ino: inode number to get
1229 *
1230 * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
1231 * the inode cache and if present it is returned with an increased reference
1232 * count. This is for file systems where the inode number is sufficient for
1233 * unique identification of an inode.
1234 *
1235 * If the inode is not in cache, get_new_inode_fast() is called to allocate a
1236 * new inode and this is returned locked, hashed, and with the I_NEW flag set.
1237 * The file system gets to fill it in before unlocking it via
1238 * unlock_new_inode().
1239 */
1240struct inode *iget_locked(struct super_block *sb, unsigned long ino)
1241{
1242 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1243 struct inode *inode;
1244 1237
1245 inode = ifind_fast(sb, head, ino);
1246 if (inode) 1238 if (inode)
1247 return inode; 1239 wait_on_inode(inode);
1248 /* 1240 return inode;
1249 * get_new_inode_fast() will do the right thing, re-trying the search
1250 * in case it had to block at any point.
1251 */
1252 return get_new_inode_fast(sb, head, ino);
1253} 1241}
1254EXPORT_SYMBOL(iget_locked); 1242EXPORT_SYMBOL(ilookup);
1255 1243
1256int insert_inode_locked(struct inode *inode) 1244int insert_inode_locked(struct inode *inode)
1257{ 1245{
@@ -1259,27 +1247,33 @@ int insert_inode_locked(struct inode *inode)
1259 ino_t ino = inode->i_ino; 1247 ino_t ino = inode->i_ino;
1260 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1248 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1261 1249
1262 inode->i_state |= I_NEW;
1263 while (1) { 1250 while (1) {
1264 struct hlist_node *node; 1251 struct hlist_node *node;
1265 struct inode *old = NULL; 1252 struct inode *old = NULL;
1266 spin_lock(&inode_lock); 1253 spin_lock(&inode_hash_lock);
1267 hlist_for_each_entry(old, node, head, i_hash) { 1254 hlist_for_each_entry(old, node, head, i_hash) {
1268 if (old->i_ino != ino) 1255 if (old->i_ino != ino)
1269 continue; 1256 continue;
1270 if (old->i_sb != sb) 1257 if (old->i_sb != sb)
1271 continue; 1258 continue;
1272 if (old->i_state & (I_FREEING|I_WILL_FREE)) 1259 spin_lock(&old->i_lock);
1260 if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1261 spin_unlock(&old->i_lock);
1273 continue; 1262 continue;
1263 }
1274 break; 1264 break;
1275 } 1265 }
1276 if (likely(!node)) { 1266 if (likely(!node)) {
1267 spin_lock(&inode->i_lock);
1268 inode->i_state |= I_NEW;
1277 hlist_add_head(&inode->i_hash, head); 1269 hlist_add_head(&inode->i_hash, head);
1278 spin_unlock(&inode_lock); 1270 spin_unlock(&inode->i_lock);
1271 spin_unlock(&inode_hash_lock);
1279 return 0; 1272 return 0;
1280 } 1273 }
1281 __iget(old); 1274 __iget(old);
1282 spin_unlock(&inode_lock); 1275 spin_unlock(&old->i_lock);
1276 spin_unlock(&inode_hash_lock);
1283 wait_on_inode(old); 1277 wait_on_inode(old);
1284 if (unlikely(!inode_unhashed(old))) { 1278 if (unlikely(!inode_unhashed(old))) {
1285 iput(old); 1279 iput(old);
@@ -1296,29 +1290,34 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1296 struct super_block *sb = inode->i_sb; 1290 struct super_block *sb = inode->i_sb;
1297 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1291 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1298 1292
1299 inode->i_state |= I_NEW;
1300
1301 while (1) { 1293 while (1) {
1302 struct hlist_node *node; 1294 struct hlist_node *node;
1303 struct inode *old = NULL; 1295 struct inode *old = NULL;
1304 1296
1305 spin_lock(&inode_lock); 1297 spin_lock(&inode_hash_lock);
1306 hlist_for_each_entry(old, node, head, i_hash) { 1298 hlist_for_each_entry(old, node, head, i_hash) {
1307 if (old->i_sb != sb) 1299 if (old->i_sb != sb)
1308 continue; 1300 continue;
1309 if (!test(old, data)) 1301 if (!test(old, data))
1310 continue; 1302 continue;
1311 if (old->i_state & (I_FREEING|I_WILL_FREE)) 1303 spin_lock(&old->i_lock);
1304 if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1305 spin_unlock(&old->i_lock);
1312 continue; 1306 continue;
1307 }
1313 break; 1308 break;
1314 } 1309 }
1315 if (likely(!node)) { 1310 if (likely(!node)) {
1311 spin_lock(&inode->i_lock);
1312 inode->i_state |= I_NEW;
1316 hlist_add_head(&inode->i_hash, head); 1313 hlist_add_head(&inode->i_hash, head);
1317 spin_unlock(&inode_lock); 1314 spin_unlock(&inode->i_lock);
1315 spin_unlock(&inode_hash_lock);
1318 return 0; 1316 return 0;
1319 } 1317 }
1320 __iget(old); 1318 __iget(old);
1321 spin_unlock(&inode_lock); 1319 spin_unlock(&old->i_lock);
1320 spin_unlock(&inode_hash_lock);
1322 wait_on_inode(old); 1321 wait_on_inode(old);
1323 if (unlikely(!inode_unhashed(old))) { 1322 if (unlikely(!inode_unhashed(old))) {
1324 iput(old); 1323 iput(old);
@@ -1363,47 +1362,35 @@ static void iput_final(struct inode *inode)
1363 const struct super_operations *op = inode->i_sb->s_op; 1362 const struct super_operations *op = inode->i_sb->s_op;
1364 int drop; 1363 int drop;
1365 1364
1365 WARN_ON(inode->i_state & I_NEW);
1366
1366 if (op && op->drop_inode) 1367 if (op && op->drop_inode)
1367 drop = op->drop_inode(inode); 1368 drop = op->drop_inode(inode);
1368 else 1369 else
1369 drop = generic_drop_inode(inode); 1370 drop = generic_drop_inode(inode);
1370 1371
1372 if (!drop && (sb->s_flags & MS_ACTIVE)) {
1373 inode->i_state |= I_REFERENCED;
1374 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
1375 inode_lru_list_add(inode);
1376 spin_unlock(&inode->i_lock);
1377 return;
1378 }
1379
1371 if (!drop) { 1380 if (!drop) {
1372 if (sb->s_flags & MS_ACTIVE) {
1373 inode->i_state |= I_REFERENCED;
1374 if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
1375 inode_lru_list_add(inode);
1376 }
1377 spin_unlock(&inode_lock);
1378 return;
1379 }
1380 WARN_ON(inode->i_state & I_NEW);
1381 inode->i_state |= I_WILL_FREE; 1381 inode->i_state |= I_WILL_FREE;
1382 spin_unlock(&inode_lock); 1382 spin_unlock(&inode->i_lock);
1383 write_inode_now(inode, 1); 1383 write_inode_now(inode, 1);
1384 spin_lock(&inode_lock); 1384 spin_lock(&inode->i_lock);
1385 WARN_ON(inode->i_state & I_NEW); 1385 WARN_ON(inode->i_state & I_NEW);
1386 inode->i_state &= ~I_WILL_FREE; 1386 inode->i_state &= ~I_WILL_FREE;
1387 __remove_inode_hash(inode);
1388 } 1387 }
1389 1388
1390 WARN_ON(inode->i_state & I_NEW);
1391 inode->i_state |= I_FREEING; 1389 inode->i_state |= I_FREEING;
1392
1393 /*
1394 * Move the inode off the IO lists and LRU once I_FREEING is
1395 * set so that it won't get moved back on there if it is dirty.
1396 */
1397 inode_lru_list_del(inode); 1390 inode_lru_list_del(inode);
1398 list_del_init(&inode->i_wb_list); 1391 spin_unlock(&inode->i_lock);
1399 1392
1400 __inode_sb_list_del(inode);
1401 spin_unlock(&inode_lock);
1402 evict(inode); 1393 evict(inode);
1403 remove_inode_hash(inode);
1404 wake_up_inode(inode);
1405 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
1406 destroy_inode(inode);
1407} 1394}
1408 1395
1409/** 1396/**
@@ -1420,7 +1407,7 @@ void iput(struct inode *inode)
1420 if (inode) { 1407 if (inode) {
1421 BUG_ON(inode->i_state & I_CLEAR); 1408 BUG_ON(inode->i_state & I_CLEAR);
1422 1409
1423 if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) 1410 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
1424 iput_final(inode); 1411 iput_final(inode);
1425 } 1412 }
1426} 1413}
@@ -1599,9 +1586,8 @@ EXPORT_SYMBOL(inode_wait);
1599 * to recheck inode state. 1586 * to recheck inode state.
1600 * 1587 *
1601 * It doesn't matter if I_NEW is not set initially, a call to 1588 * It doesn't matter if I_NEW is not set initially, a call to
1602 * wake_up_inode() after removing from the hash list will DTRT. 1589 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
1603 * 1590 * will DTRT.
1604 * This is called with inode_lock held.
1605 */ 1591 */
1606static void __wait_on_freeing_inode(struct inode *inode) 1592static void __wait_on_freeing_inode(struct inode *inode)
1607{ 1593{
@@ -1609,10 +1595,11 @@ static void __wait_on_freeing_inode(struct inode *inode)
1609 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); 1595 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
1610 wq = bit_waitqueue(&inode->i_state, __I_NEW); 1596 wq = bit_waitqueue(&inode->i_state, __I_NEW);
1611 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); 1597 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
1612 spin_unlock(&inode_lock); 1598 spin_unlock(&inode->i_lock);
1599 spin_unlock(&inode_hash_lock);
1613 schedule(); 1600 schedule();
1614 finish_wait(wq, &wait.wait); 1601 finish_wait(wq, &wait.wait);
1615 spin_lock(&inode_lock); 1602 spin_lock(&inode_hash_lock);
1616} 1603}
1617 1604
1618static __initdata unsigned long ihash_entries; 1605static __initdata unsigned long ihash_entries;
@@ -1704,7 +1691,7 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1704EXPORT_SYMBOL(init_special_inode); 1691EXPORT_SYMBOL(init_special_inode);
1705 1692
1706/** 1693/**
1707 * Init uid,gid,mode for new inode according to posix standards 1694 * inode_init_owner - Init uid,gid,mode for new inode according to posix standards
1708 * @inode: New inode 1695 * @inode: New inode
1709 * @dir: Directory inode 1696 * @dir: Directory inode
1710 * @mode: mode of the new inode 1697 * @mode: mode of the new inode
@@ -1722,3 +1709,22 @@ void inode_init_owner(struct inode *inode, const struct inode *dir,
1722 inode->i_mode = mode; 1709 inode->i_mode = mode;
1723} 1710}
1724EXPORT_SYMBOL(inode_init_owner); 1711EXPORT_SYMBOL(inode_init_owner);
1712
1713/**
1714 * inode_owner_or_capable - check current task permissions to inode
1715 * @inode: inode being checked
1716 *
1717 * Return true if current either has CAP_FOWNER to the inode, or
1718 * owns the file.
1719 */
1720bool inode_owner_or_capable(const struct inode *inode)
1721{
1722 struct user_namespace *ns = inode_userns(inode);
1723
1724 if (current_user_ns() == ns && current_fsuid() == inode->i_uid)
1725 return true;
1726 if (ns_capable(ns, CAP_FOWNER))
1727 return true;
1728 return false;
1729}
1730EXPORT_SYMBOL(inode_owner_or_capable);
diff --git a/fs/internal.h b/fs/internal.h
index 0663568b1247..b29c46e4e32f 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -12,6 +12,7 @@
12#include <linux/lglock.h> 12#include <linux/lglock.h>
13 13
14struct super_block; 14struct super_block;
15struct file_system_type;
15struct linux_binprm; 16struct linux_binprm;
16struct path; 17struct path;
17 18
@@ -61,10 +62,9 @@ extern int check_unsafe_exec(struct linux_binprm *);
61extern int copy_mount_options(const void __user *, unsigned long *); 62extern int copy_mount_options(const void __user *, unsigned long *);
62extern int copy_mount_string(const void __user *, char **); 63extern int copy_mount_string(const void __user *, char **);
63 64
64extern void free_vfsmnt(struct vfsmount *);
65extern struct vfsmount *alloc_vfsmnt(const char *);
66extern unsigned int mnt_get_count(struct vfsmount *mnt); 65extern unsigned int mnt_get_count(struct vfsmount *mnt);
67extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int); 66extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
67extern struct vfsmount *lookup_mnt(struct path *);
68extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, 68extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
69 struct vfsmount *); 69 struct vfsmount *);
70extern void release_mounts(struct list_head *); 70extern void release_mounts(struct list_head *);
@@ -99,6 +99,8 @@ extern struct file *get_empty_filp(void);
99extern int do_remount_sb(struct super_block *, int, void *, int); 99extern int do_remount_sb(struct super_block *, int, void *, int);
100extern void __put_super(struct super_block *sb); 100extern void __put_super(struct super_block *sb);
101extern void put_super(struct super_block *sb); 101extern void put_super(struct super_block *sb);
102extern struct dentry *mount_fs(struct file_system_type *,
103 int, const char *, void *);
102 104
103/* 105/*
104 * open.c 106 * open.c
@@ -106,10 +108,30 @@ extern void put_super(struct super_block *sb);
106struct nameidata; 108struct nameidata;
107extern struct file *nameidata_to_filp(struct nameidata *); 109extern struct file *nameidata_to_filp(struct nameidata *);
108extern void release_open_intent(struct nameidata *); 110extern void release_open_intent(struct nameidata *);
111struct open_flags {
112 int open_flag;
113 int mode;
114 int acc_mode;
115 int intent;
116};
117extern struct file *do_filp_open(int dfd, const char *pathname,
118 const struct open_flags *op, int lookup_flags);
119extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
120 const char *, const struct open_flags *, int lookup_flags);
121
122extern long do_handle_open(int mountdirfd,
123 struct file_handle __user *ufh, int open_flag);
109 124
110/* 125/*
111 * inode.c 126 * inode.c
112 */ 127 */
128extern spinlock_t inode_sb_list_lock;
129
130/*
131 * fs-writeback.c
132 */
133extern void inode_wb_list_del(struct inode *inode);
134
113extern int get_nr_dirty_inodes(void); 135extern int get_nr_dirty_inodes(void);
114extern void evict_inodes(struct super_block *); 136extern void evict_inodes(struct super_block *);
115extern int invalidate_inodes(struct super_block *); 137extern int invalidate_inodes(struct super_block *, bool);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index a59635e295fa..1d9b9fcb2db4 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -273,6 +273,13 @@ int __generic_block_fiemap(struct inode *inode,
273 len = isize; 273 len = isize;
274 } 274 }
275 275
276 /*
277 * Some filesystems can't deal with being asked to map less than
278 * blocksize, so make sure our len is at least block length.
279 */
280 if (logical_to_blk(inode, len) == 0)
281 len = blk_to_logical(inode, 1);
282
276 start_blk = logical_to_blk(inode, start); 283 start_blk = logical_to_blk(inode, start);
277 last_blk = logical_to_blk(inode, start + len - 1); 284 last_blk = logical_to_blk(inode, start + len - 1);
278 285
@@ -541,6 +548,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
541{ 548{
542 int error = 0; 549 int error = 0;
543 int __user *argp = (int __user *)arg; 550 int __user *argp = (int __user *)arg;
551 struct inode *inode = filp->f_path.dentry->d_inode;
544 552
545 switch (cmd) { 553 switch (cmd) {
546 case FIOCLEX: 554 case FIOCLEX:
@@ -560,13 +568,11 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
560 break; 568 break;
561 569
562 case FIOQSIZE: 570 case FIOQSIZE:
563 if (S_ISDIR(filp->f_path.dentry->d_inode->i_mode) || 571 if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
564 S_ISREG(filp->f_path.dentry->d_inode->i_mode) || 572 S_ISLNK(inode->i_mode)) {
565 S_ISLNK(filp->f_path.dentry->d_inode->i_mode)) { 573 loff_t res = inode_get_bytes(inode);
566 loff_t res = 574 error = copy_to_user(argp, &res, sizeof(res)) ?
567 inode_get_bytes(filp->f_path.dentry->d_inode); 575 -EFAULT : 0;
568 error = copy_to_user((loff_t __user *)arg, &res,
569 sizeof(res)) ? -EFAULT : 0;
570 } else 576 } else
571 error = -ENOTTY; 577 error = -ENOTTY;
572 break; 578 break;
@@ -583,14 +589,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
583 return ioctl_fiemap(filp, arg); 589 return ioctl_fiemap(filp, arg);
584 590
585 case FIGETBSZ: 591 case FIGETBSZ:
586 { 592 return put_user(inode->i_sb->s_blocksize, argp);
587 struct inode *inode = filp->f_path.dentry->d_inode;
588 int __user *p = (int __user *)arg;
589 return put_user(inode->i_sb->s_blocksize, p);
590 }
591 593
592 default: 594 default:
593 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) 595 if (S_ISREG(inode->i_mode))
594 error = file_ioctl(filp, cmd, arg); 596 error = file_ioctl(filp, cmd, arg);
595 else 597 else
596 error = vfs_ioctl(filp, cmd, arg); 598 error = vfs_ioctl(filp, cmd, arg);
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index ed752cb38474..dd4687ff30d0 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -124,9 +124,13 @@ isofs_export_encode_fh(struct dentry *dentry,
124 * offset of the inode and the upper 16 bits of fh32[1] to 124 * offset of the inode and the upper 16 bits of fh32[1] to
125 * hold the offset of the parent. 125 * hold the offset of the parent.
126 */ 126 */
127 127 if (connectable && (len < 5)) {
128 if (len < 3 || (connectable && len < 5)) 128 *max_len = 5;
129 return 255;
130 } else if (len < 3) {
131 *max_len = 3;
129 return 255; 132 return 255;
133 }
130 134
131 len = 3; 135 len = 3;
132 fh32[0] = ei->i_iget5_block; 136 fh32[0] = ei->i_iget5_block;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index a0f3833c0dbf..3db5ba4568fc 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1158,7 +1158,6 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
1158 1158
1159static const struct address_space_operations isofs_aops = { 1159static const struct address_space_operations isofs_aops = {
1160 .readpage = isofs_readpage, 1160 .readpage = isofs_readpage,
1161 .sync_page = block_sync_page,
1162 .bmap = _isofs_bmap 1161 .bmap = _isofs_bmap
1163}; 1162};
1164 1163
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 34a4861c14b8..69b180459463 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -20,6 +20,7 @@
20#include <linux/mm.h> 20#include <linux/mm.h>
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/blkdev.h>
23 24
24/* 25/*
25 * Default IO end handler for temporary BJ_IO buffer_heads. 26 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -294,7 +295,7 @@ void journal_commit_transaction(journal_t *journal)
294 int first_tag = 0; 295 int first_tag = 0;
295 int tag_flag; 296 int tag_flag;
296 int i; 297 int i;
297 int write_op = WRITE_SYNC; 298 struct blk_plug plug;
298 299
299 /* 300 /*
300 * First job: lock down the current transaction and wait for 301 * First job: lock down the current transaction and wait for
@@ -327,13 +328,6 @@ void journal_commit_transaction(journal_t *journal)
327 spin_lock(&journal->j_state_lock); 328 spin_lock(&journal->j_state_lock);
328 commit_transaction->t_state = T_LOCKED; 329 commit_transaction->t_state = T_LOCKED;
329 330
330 /*
331 * Use plugged writes here, since we want to submit several before
332 * we unplug the device. We don't do explicit unplugging in here,
333 * instead we rely on sync_buffer() doing the unplug for us.
334 */
335 if (commit_transaction->t_synchronous_commit)
336 write_op = WRITE_SYNC_PLUG;
337 spin_lock(&commit_transaction->t_handle_lock); 331 spin_lock(&commit_transaction->t_handle_lock);
338 while (commit_transaction->t_updates) { 332 while (commit_transaction->t_updates) {
339 DEFINE_WAIT(wait); 333 DEFINE_WAIT(wait);
@@ -368,7 +362,7 @@ void journal_commit_transaction(journal_t *journal)
368 * we do not require it to remember exactly which old buffers it 362 * we do not require it to remember exactly which old buffers it
369 * has reserved. This is consistent with the existing behaviour 363 * has reserved. This is consistent with the existing behaviour
370 * that multiple journal_get_write_access() calls to the same 364 * that multiple journal_get_write_access() calls to the same
371 * buffer are perfectly permissable. 365 * buffer are perfectly permissible.
372 */ 366 */
373 while (commit_transaction->t_reserved_list) { 367 while (commit_transaction->t_reserved_list) {
374 jh = commit_transaction->t_reserved_list; 368 jh = commit_transaction->t_reserved_list;
@@ -418,8 +412,10 @@ void journal_commit_transaction(journal_t *journal)
418 * Now start flushing things to disk, in the order they appear 412 * Now start flushing things to disk, in the order they appear
419 * on the transaction lists. Data blocks go first. 413 * on the transaction lists. Data blocks go first.
420 */ 414 */
415 blk_start_plug(&plug);
421 err = journal_submit_data_buffers(journal, commit_transaction, 416 err = journal_submit_data_buffers(journal, commit_transaction,
422 write_op); 417 WRITE_SYNC);
418 blk_finish_plug(&plug);
423 419
424 /* 420 /*
425 * Wait for all previously submitted IO to complete. 421 * Wait for all previously submitted IO to complete.
@@ -480,7 +476,9 @@ void journal_commit_transaction(journal_t *journal)
480 err = 0; 476 err = 0;
481 } 477 }
482 478
483 journal_write_revoke_records(journal, commit_transaction, write_op); 479 blk_start_plug(&plug);
480
481 journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC);
484 482
485 /* 483 /*
486 * If we found any dirty or locked buffers, then we should have 484 * If we found any dirty or locked buffers, then we should have
@@ -650,7 +648,7 @@ start_journal_io:
650 clear_buffer_dirty(bh); 648 clear_buffer_dirty(bh);
651 set_buffer_uptodate(bh); 649 set_buffer_uptodate(bh);
652 bh->b_end_io = journal_end_buffer_io_sync; 650 bh->b_end_io = journal_end_buffer_io_sync;
653 submit_bh(write_op, bh); 651 submit_bh(WRITE_SYNC, bh);
654 } 652 }
655 cond_resched(); 653 cond_resched();
656 654
@@ -661,6 +659,8 @@ start_journal_io:
661 } 659 }
662 } 660 }
663 661
662 blk_finish_plug(&plug);
663
664 /* Lo and behold: we have just managed to send a transaction to 664 /* Lo and behold: we have just managed to send a transaction to
665 the log. Before we can commit it, wait for the IO so far to 665 the log. Before we can commit it, wait for the IO so far to
666 complete. Control buffers being written are on the 666 complete. Control buffers being written are on the
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index da1b5e4ffce1..b3713afaaa9e 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -770,7 +770,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
770 journal->j_wbufsize = n; 770 journal->j_wbufsize = n;
771 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 771 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
772 if (!journal->j_wbuf) { 772 if (!journal->j_wbuf) {
773 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 773 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
774 __func__); 774 __func__);
775 goto out_err; 775 goto out_err;
776 } 776 }
@@ -831,7 +831,7 @@ journal_t * journal_init_inode (struct inode *inode)
831 journal->j_wbufsize = n; 831 journal->j_wbufsize = n;
832 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 832 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
833 if (!journal->j_wbuf) { 833 if (!journal->j_wbuf) {
834 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 834 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
835 __func__); 835 __func__);
836 goto out_err; 836 goto out_err;
837 } 837 }
@@ -839,7 +839,7 @@ journal_t * journal_init_inode (struct inode *inode)
839 err = journal_bmap(journal, 0, &blocknr); 839 err = journal_bmap(journal, 0, &blocknr);
840 /* If that failed, give up */ 840 /* If that failed, give up */
841 if (err) { 841 if (err) {
842 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 842 printk(KERN_ERR "%s: Cannot locate journal superblock\n",
843 __func__); 843 __func__);
844 goto out_err; 844 goto out_err;
845 } 845 }
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index d29018307e2e..305a90763154 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -71,7 +71,7 @@
71 * switching hash tables under them. For operations on the lists of entries in 71 * switching hash tables under them. For operations on the lists of entries in
72 * the hash table j_revoke_lock is used. 72 * the hash table j_revoke_lock is used.
73 * 73 *
74 * Finally, also replay code uses the hash tables but at this moment noone else 74 * Finally, also replay code uses the hash tables but at this moment no one else
75 * can touch them (filesystem isn't mounted yet) and hence no locking is 75 * can touch them (filesystem isn't mounted yet) and hence no locking is
76 * needed. 76 * needed.
77 */ 77 */
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 5b2e4c30a2a1..60d2319651b2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1392,7 +1392,7 @@ int journal_stop(handle_t *handle)
1392 * by 30x or more... 1392 * by 30x or more...
1393 * 1393 *
1394 * We try and optimize the sleep time against what the underlying disk 1394 * We try and optimize the sleep time against what the underlying disk
1395 * can do, instead of having a static sleep time. This is usefull for 1395 * can do, instead of having a static sleep time. This is useful for
1396 * the case where our storage is so fast that it is more optimal to go 1396 * the case where our storage is so fast that it is more optimal to go
1397 * ahead and force a flush and wait for the transaction to be committed 1397 * ahead and force a flush and wait for the transaction to be committed
1398 * than it is to wait for an arbitrary amount of time for new writers to 1398 * than it is to wait for an arbitrary amount of time for new writers to
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f3ad1598b201..6e28000a4b21 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -105,6 +105,8 @@ static int journal_submit_commit_record(journal_t *journal,
105 int ret; 105 int ret;
106 struct timespec now = current_kernel_time(); 106 struct timespec now = current_kernel_time();
107 107
108 *cbh = NULL;
109
108 if (is_journal_aborted(journal)) 110 if (is_journal_aborted(journal))
109 return 0; 111 return 0;
110 112
@@ -137,9 +139,9 @@ static int journal_submit_commit_record(journal_t *journal,
137 if (journal->j_flags & JBD2_BARRIER && 139 if (journal->j_flags & JBD2_BARRIER &&
138 !JBD2_HAS_INCOMPAT_FEATURE(journal, 140 !JBD2_HAS_INCOMPAT_FEATURE(journal,
139 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) 141 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
140 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh); 142 ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
141 else 143 else
142 ret = submit_bh(WRITE_SYNC_PLUG, bh); 144 ret = submit_bh(WRITE_SYNC, bh);
143 145
144 *cbh = bh; 146 *cbh = bh;
145 return ret; 147 return ret;
@@ -329,7 +331,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
329 int tag_bytes = journal_tag_bytes(journal); 331 int tag_bytes = journal_tag_bytes(journal);
330 struct buffer_head *cbh = NULL; /* For transactional checksums */ 332 struct buffer_head *cbh = NULL; /* For transactional checksums */
331 __u32 crc32_sum = ~0; 333 __u32 crc32_sum = ~0;
332 int write_op = WRITE_SYNC; 334 struct blk_plug plug;
333 335
334 /* 336 /*
335 * First job: lock down the current transaction and wait for 337 * First job: lock down the current transaction and wait for
@@ -363,13 +365,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
363 write_lock(&journal->j_state_lock); 365 write_lock(&journal->j_state_lock);
364 commit_transaction->t_state = T_LOCKED; 366 commit_transaction->t_state = T_LOCKED;
365 367
366 /*
367 * Use plugged writes here, since we want to submit several before
368 * we unplug the device. We don't do explicit unplugging in here,
369 * instead we rely on sync_buffer() doing the unplug for us.
370 */
371 if (commit_transaction->t_synchronous_commit)
372 write_op = WRITE_SYNC_PLUG;
373 trace_jbd2_commit_locking(journal, commit_transaction); 368 trace_jbd2_commit_locking(journal, commit_transaction);
374 stats.run.rs_wait = commit_transaction->t_max_wait; 369 stats.run.rs_wait = commit_transaction->t_max_wait;
375 stats.run.rs_locked = jiffies; 370 stats.run.rs_locked = jiffies;
@@ -410,7 +405,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
410 * we do not require it to remember exactly which old buffers it 405 * we do not require it to remember exactly which old buffers it
411 * has reserved. This is consistent with the existing behaviour 406 * has reserved. This is consistent with the existing behaviour
412 * that multiple jbd2_journal_get_write_access() calls to the same 407 * that multiple jbd2_journal_get_write_access() calls to the same
413 * buffer are perfectly permissable. 408 * buffer are perfectly permissible.
414 */ 409 */
415 while (commit_transaction->t_reserved_list) { 410 while (commit_transaction->t_reserved_list) {
416 jh = commit_transaction->t_reserved_list; 411 jh = commit_transaction->t_reserved_list;
@@ -469,8 +464,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
469 if (err) 464 if (err)
470 jbd2_journal_abort(journal, err); 465 jbd2_journal_abort(journal, err);
471 466
467 blk_start_plug(&plug);
472 jbd2_journal_write_revoke_records(journal, commit_transaction, 468 jbd2_journal_write_revoke_records(journal, commit_transaction,
473 write_op); 469 WRITE_SYNC);
470 blk_finish_plug(&plug);
474 471
475 jbd_debug(3, "JBD: commit phase 2\n"); 472 jbd_debug(3, "JBD: commit phase 2\n");
476 473
@@ -497,6 +494,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
497 err = 0; 494 err = 0;
498 descriptor = NULL; 495 descriptor = NULL;
499 bufs = 0; 496 bufs = 0;
497 blk_start_plug(&plug);
500 while (commit_transaction->t_buffers) { 498 while (commit_transaction->t_buffers) {
501 499
502 /* Find the next buffer to be journaled... */ 500 /* Find the next buffer to be journaled... */
@@ -658,7 +656,7 @@ start_journal_io:
658 clear_buffer_dirty(bh); 656 clear_buffer_dirty(bh);
659 set_buffer_uptodate(bh); 657 set_buffer_uptodate(bh);
660 bh->b_end_io = journal_end_buffer_io_sync; 658 bh->b_end_io = journal_end_buffer_io_sync;
661 submit_bh(write_op, bh); 659 submit_bh(WRITE_SYNC, bh);
662 } 660 }
663 cond_resched(); 661 cond_resched();
664 stats.run.rs_blocks_logged += bufs; 662 stats.run.rs_blocks_logged += bufs;
@@ -699,6 +697,8 @@ start_journal_io:
699 __jbd2_journal_abort_hard(journal); 697 __jbd2_journal_abort_hard(journal);
700 } 698 }
701 699
700 blk_finish_plug(&plug);
701
702 /* Lo and behold: we have just managed to send a transaction to 702 /* Lo and behold: we have just managed to send a transaction to
703 the log. Before we can commit it, wait for the IO so far to 703 the log. Before we can commit it, wait for the IO so far to
704 complete. Control buffers being written are on the 704 complete. Control buffers being written are on the
@@ -808,7 +808,7 @@ wait_for_iobuf:
808 if (err) 808 if (err)
809 __jbd2_journal_abort_hard(journal); 809 __jbd2_journal_abort_hard(journal);
810 } 810 }
811 if (!err && !is_journal_aborted(journal)) 811 if (cbh)
812 err = journal_wait_on_commit_record(journal, cbh); 812 err = journal_wait_on_commit_record(journal, cbh);
813 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 813 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
814 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) && 814 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9e4686900f18..e0ec3db1c395 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -473,7 +473,8 @@ int __jbd2_log_space_left(journal_t *journal)
473} 473}
474 474
475/* 475/*
476 * Called under j_state_lock. Returns true if a transaction commit was started. 476 * Called with j_state_lock locked for writing.
477 * Returns true if a transaction commit was started.
477 */ 478 */
478int __jbd2_log_start_commit(journal_t *journal, tid_t target) 479int __jbd2_log_start_commit(journal_t *journal, tid_t target)
479{ 480{
@@ -520,11 +521,13 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
520{ 521{
521 transaction_t *transaction = NULL; 522 transaction_t *transaction = NULL;
522 tid_t tid; 523 tid_t tid;
524 int need_to_start = 0;
523 525
524 read_lock(&journal->j_state_lock); 526 read_lock(&journal->j_state_lock);
525 if (journal->j_running_transaction && !current->journal_info) { 527 if (journal->j_running_transaction && !current->journal_info) {
526 transaction = journal->j_running_transaction; 528 transaction = journal->j_running_transaction;
527 __jbd2_log_start_commit(journal, transaction->t_tid); 529 if (!tid_geq(journal->j_commit_request, transaction->t_tid))
530 need_to_start = 1;
528 } else if (journal->j_committing_transaction) 531 } else if (journal->j_committing_transaction)
529 transaction = journal->j_committing_transaction; 532 transaction = journal->j_committing_transaction;
530 533
@@ -535,6 +538,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
535 538
536 tid = transaction->t_tid; 539 tid = transaction->t_tid;
537 read_unlock(&journal->j_state_lock); 540 read_unlock(&journal->j_state_lock);
541 if (need_to_start)
542 jbd2_log_start_commit(journal, tid);
538 jbd2_log_wait_commit(journal, tid); 543 jbd2_log_wait_commit(journal, tid);
539 return 1; 544 return 1;
540} 545}
@@ -912,7 +917,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
912 journal->j_wbufsize = n; 917 journal->j_wbufsize = n;
913 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 918 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
914 if (!journal->j_wbuf) { 919 if (!journal->j_wbuf) {
915 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 920 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
916 __func__); 921 __func__);
917 goto out_err; 922 goto out_err;
918 } 923 }
@@ -978,7 +983,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
978 journal->j_wbufsize = n; 983 journal->j_wbufsize = n;
979 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 984 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
980 if (!journal->j_wbuf) { 985 if (!journal->j_wbuf) {
981 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 986 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
982 __func__); 987 __func__);
983 goto out_err; 988 goto out_err;
984 } 989 }
@@ -986,7 +991,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
986 err = jbd2_journal_bmap(journal, 0, &blocknr); 991 err = jbd2_journal_bmap(journal, 0, &blocknr);
987 /* If that failed, give up */ 992 /* If that failed, give up */
988 if (err) { 993 if (err) {
989 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 994 printk(KERN_ERR "%s: Cannot locate journal superblock\n",
990 __func__); 995 __func__);
991 goto out_err; 996 goto out_err;
992 } 997 }
@@ -2408,10 +2413,12 @@ const char *jbd2_dev_to_name(dev_t device)
2408 new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); 2413 new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
2409 if (!new_dev) 2414 if (!new_dev)
2410 return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ 2415 return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
2416 bd = bdget(device);
2411 spin_lock(&devname_cache_lock); 2417 spin_lock(&devname_cache_lock);
2412 if (devcache[i]) { 2418 if (devcache[i]) {
2413 if (devcache[i]->device == device) { 2419 if (devcache[i]->device == device) {
2414 kfree(new_dev); 2420 kfree(new_dev);
2421 bdput(bd);
2415 ret = devcache[i]->devname; 2422 ret = devcache[i]->devname;
2416 spin_unlock(&devname_cache_lock); 2423 spin_unlock(&devname_cache_lock);
2417 return ret; 2424 return ret;
@@ -2420,7 +2427,6 @@ const char *jbd2_dev_to_name(dev_t device)
2420 } 2427 }
2421 devcache[i] = new_dev; 2428 devcache[i] = new_dev;
2422 devcache[i]->device = device; 2429 devcache[i]->device = device;
2423 bd = bdget(device);
2424 if (bd) { 2430 if (bd) {
2425 bdevname(bd, devcache[i]->devname); 2431 bdevname(bd, devcache[i]->devname);
2426 bdput(bd); 2432 bdput(bd);
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 9ad321fd63fd..69fd93588118 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -71,7 +71,7 @@
71 * switching hash tables under them. For operations on the lists of entries in 71 * switching hash tables under them. For operations on the lists of entries in
72 * the hash table j_revoke_lock is used. 72 * the hash table j_revoke_lock is used.
73 * 73 *
74 * Finally, also replay code uses the hash tables but at this moment noone else 74 * Finally, also replay code uses the hash tables but at this moment no one else
75 * can touch them (filesystem isn't mounted yet) and hence no locking is 75 * can touch them (filesystem isn't mounted yet) and hence no locking is
76 * needed. 76 * needed.
77 */ 77 */
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index faad2bd787c7..05fa77a23711 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -117,10 +117,10 @@ static inline void update_t_max_wait(transaction_t *transaction)
117static int start_this_handle(journal_t *journal, handle_t *handle, 117static int start_this_handle(journal_t *journal, handle_t *handle,
118 int gfp_mask) 118 int gfp_mask)
119{ 119{
120 transaction_t *transaction; 120 transaction_t *transaction, *new_transaction = NULL;
121 int needed; 121 tid_t tid;
122 int nblocks = handle->h_buffer_credits; 122 int needed, need_to_start;
123 transaction_t *new_transaction = NULL; 123 int nblocks = handle->h_buffer_credits;
124 124
125 if (nblocks > journal->j_max_transaction_buffers) { 125 if (nblocks > journal->j_max_transaction_buffers) {
126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", 126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -222,8 +222,11 @@ repeat:
222 atomic_sub(nblocks, &transaction->t_outstanding_credits); 222 atomic_sub(nblocks, &transaction->t_outstanding_credits);
223 prepare_to_wait(&journal->j_wait_transaction_locked, &wait, 223 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
224 TASK_UNINTERRUPTIBLE); 224 TASK_UNINTERRUPTIBLE);
225 __jbd2_log_start_commit(journal, transaction->t_tid); 225 tid = transaction->t_tid;
226 need_to_start = !tid_geq(journal->j_commit_request, tid);
226 read_unlock(&journal->j_state_lock); 227 read_unlock(&journal->j_state_lock);
228 if (need_to_start)
229 jbd2_log_start_commit(journal, tid);
227 schedule(); 230 schedule();
228 finish_wait(&journal->j_wait_transaction_locked, &wait); 231 finish_wait(&journal->j_wait_transaction_locked, &wait);
229 goto repeat; 232 goto repeat;
@@ -442,7 +445,8 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
442{ 445{
443 transaction_t *transaction = handle->h_transaction; 446 transaction_t *transaction = handle->h_transaction;
444 journal_t *journal = transaction->t_journal; 447 journal_t *journal = transaction->t_journal;
445 int ret; 448 tid_t tid;
449 int need_to_start, ret;
446 450
447 /* If we've had an abort of any type, don't even think about 451 /* If we've had an abort of any type, don't even think about
448 * actually doing the restart! */ 452 * actually doing the restart! */
@@ -465,8 +469,11 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
465 spin_unlock(&transaction->t_handle_lock); 469 spin_unlock(&transaction->t_handle_lock);
466 470
467 jbd_debug(2, "restarting handle %p\n", handle); 471 jbd_debug(2, "restarting handle %p\n", handle);
468 __jbd2_log_start_commit(journal, transaction->t_tid); 472 tid = transaction->t_tid;
473 need_to_start = !tid_geq(journal->j_commit_request, tid);
469 read_unlock(&journal->j_state_lock); 474 read_unlock(&journal->j_state_lock);
475 if (need_to_start)
476 jbd2_log_start_commit(journal, tid);
470 477
471 lock_map_release(&handle->h_lockdep_map); 478 lock_map_release(&handle->h_lockdep_map);
472 handle->h_buffer_credits = nblocks; 479 handle->h_buffer_credits = nblocks;
@@ -1396,7 +1403,7 @@ int jbd2_journal_stop(handle_t *handle)
1396 1403
1397 /* 1404 /*
1398 * Once we drop t_updates, if it goes to zero the transaction 1405 * Once we drop t_updates, if it goes to zero the transaction
1399 * could start commiting on us and eventually disappear. So 1406 * could start committing on us and eventually disappear. So
1400 * once we do this, we must not dereference transaction 1407 * once we do this, we must not dereference transaction
1401 * pointer again. 1408 * pointer again.
1402 */ 1409 */
diff --git a/fs/jffs2/TODO b/fs/jffs2/TODO
index 5d3ea4070f01..ca28964abd4b 100644
--- a/fs/jffs2/TODO
+++ b/fs/jffs2/TODO
@@ -11,7 +11,7 @@
11 - checkpointing (do we need this? scan is quite fast) 11 - checkpointing (do we need this? scan is quite fast)
12 - make the scan code populate real inodes so read_inode just after 12 - make the scan code populate real inodes so read_inode just after
13 mount doesn't have to read the flash twice for large files. 13 mount doesn't have to read the flash twice for large files.
14 Make this a per-inode option, changable with chattr, so you can 14 Make this a per-inode option, changeable with chattr, so you can
15 decide which inodes should be in-core immediately after mount. 15 decide which inodes should be in-core immediately after mount.
16 - test, test, test 16 - test, test, test
17 17
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 95b79672150a..828a0e1ea438 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -402,7 +402,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
402 402
403 if (name[0] != '\0') 403 if (name[0] != '\0')
404 return -EINVAL; 404 return -EINVAL;
405 if (!is_owner_or_cap(dentry->d_inode)) 405 if (!inode_owner_or_capable(dentry->d_inode))
406 return -EPERM; 406 return -EPERM;
407 407
408 if (value) { 408 if (value) {
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index fd05a0b9431d..5a001020c542 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -40,12 +40,13 @@ static z_stream inf_strm, def_strm;
40 40
41static int __init alloc_workspaces(void) 41static int __init alloc_workspaces(void)
42{ 42{
43 def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); 43 def_strm.workspace = vmalloc(zlib_deflate_workspacesize(MAX_WBITS,
44 MAX_MEM_LEVEL));
44 if (!def_strm.workspace) { 45 if (!def_strm.workspace) {
45 printk(KERN_WARNING "Failed to allocate %d bytes for deflate workspace\n", zlib_deflate_workspacesize()); 46 printk(KERN_WARNING "Failed to allocate %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL));
46 return -ENOMEM; 47 return -ENOMEM;
47 } 48 }
48 D1(printk(KERN_DEBUG "Allocated %d bytes for deflate workspace\n", zlib_deflate_workspacesize())); 49 D1(printk(KERN_DEBUG "Allocated %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL)));
49 inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 50 inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
50 if (!inf_strm.workspace) { 51 if (!inf_strm.workspace) {
51 printk(KERN_WARNING "Failed to allocate %d bytes for inflate workspace\n", zlib_inflate_workspacesize()); 52 printk(KERN_WARNING "Failed to allocate %d bytes for inflate workspace\n", zlib_inflate_workspacesize());
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 92978658ed18..82faddd1f321 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -215,8 +215,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
215 no chance of AB-BA deadlock involving its f->sem). */ 215 no chance of AB-BA deadlock involving its f->sem). */
216 mutex_unlock(&f->sem); 216 mutex_unlock(&f->sem);
217 217
218 ret = jffs2_do_create(c, dir_f, f, ri, 218 ret = jffs2_do_create(c, dir_f, f, ri, &dentry->d_name);
219 dentry->d_name.name, dentry->d_name.len);
220 if (ret) 219 if (ret)
221 goto fail; 220 goto fail;
222 221
@@ -386,7 +385,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
386 385
387 jffs2_complete_reservation(c); 386 jffs2_complete_reservation(c);
388 387
389 ret = jffs2_init_security(inode, dir_i); 388 ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
390 if (ret) 389 if (ret)
391 goto fail; 390 goto fail;
392 391
@@ -530,7 +529,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
530 529
531 jffs2_complete_reservation(c); 530 jffs2_complete_reservation(c);
532 531
533 ret = jffs2_init_security(inode, dir_i); 532 ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
534 if (ret) 533 if (ret)
535 goto fail; 534 goto fail;
536 535
@@ -703,7 +702,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
703 702
704 jffs2_complete_reservation(c); 703 jffs2_complete_reservation(c);
705 704
706 ret = jffs2_init_security(inode, dir_i); 705 ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
707 if (ret) 706 if (ret)
708 goto fail; 707 goto fail;
709 708
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 5a53d9bdb2b5..e4619b00f7c5 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -401,7 +401,7 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
401 struct jffs2_raw_inode *ri, unsigned char *buf, 401 struct jffs2_raw_inode *ri, unsigned char *buf,
402 uint32_t offset, uint32_t writelen, uint32_t *retlen); 402 uint32_t offset, uint32_t writelen, uint32_t *retlen);
403int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, 403int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f,
404 struct jffs2_raw_inode *ri, const char *name, int namelen); 404 struct jffs2_raw_inode *ri, const struct qstr *qstr);
405int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name, 405int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name,
406 int namelen, struct jffs2_inode_info *dead_f, uint32_t time); 406 int namelen, struct jffs2_inode_info *dead_f, uint32_t time);
407int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino, 407int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino,
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index d32ee9412cb9..2ab1a0d91210 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -24,7 +24,7 @@
24 * 24 *
25 * Returns: 0 if the data CRC is correct; 25 * Returns: 0 if the data CRC is correct;
26 * 1 - if incorrect; 26 * 1 - if incorrect;
27 * error code if an error occured. 27 * error code if an error occurred.
28 */ 28 */
29static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn) 29static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn)
30{ 30{
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 239f51216a68..cfeb7164b085 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -23,14 +23,15 @@
23#include "nodelist.h" 23#include "nodelist.h"
24 24
25/* ---- Initial Security Label Attachment -------------- */ 25/* ---- Initial Security Label Attachment -------------- */
26int jffs2_init_security(struct inode *inode, struct inode *dir) 26int jffs2_init_security(struct inode *inode, struct inode *dir,
27 const struct qstr *qstr)
27{ 28{
28 int rc; 29 int rc;
29 size_t len; 30 size_t len;
30 void *value; 31 void *value;
31 char *name; 32 char *name;
32 33
33 rc = security_inode_init_security(inode, dir, &name, &value, &len); 34 rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
34 if (rc) { 35 if (rc) {
35 if (rc == -EOPNOTSUPP) 36 if (rc == -EOPNOTSUPP)
36 return 0; 37 return 0;
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 800171dca53b..e537fb0e0184 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -121,7 +121,7 @@ int jffs2_sum_add_inode_mem(struct jffs2_summary *s, struct jffs2_raw_inode *ri,
121 temp->nodetype = ri->nodetype; 121 temp->nodetype = ri->nodetype;
122 temp->inode = ri->ino; 122 temp->inode = ri->ino;
123 temp->version = ri->version; 123 temp->version = ri->version;
124 temp->offset = cpu_to_je32(ofs); /* relative offset from the begining of the jeb */ 124 temp->offset = cpu_to_je32(ofs); /* relative offset from the beginning of the jeb */
125 temp->totlen = ri->totlen; 125 temp->totlen = ri->totlen;
126 temp->next = NULL; 126 temp->next = NULL;
127 127
@@ -139,7 +139,7 @@ int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *r
139 139
140 temp->nodetype = rd->nodetype; 140 temp->nodetype = rd->nodetype;
141 temp->totlen = rd->totlen; 141 temp->totlen = rd->totlen;
142 temp->offset = cpu_to_je32(ofs); /* relative from the begining of the jeb */ 142 temp->offset = cpu_to_je32(ofs); /* relative from the beginning of the jeb */
143 temp->pino = rd->pino; 143 temp->pino = rd->pino;
144 temp->version = rd->version; 144 temp->version = rd->version;
145 temp->ino = rd->ino; 145 temp->ino = rd->ino;
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 07ee1546b2fa..4515bea0268f 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1116,7 +1116,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
1116 1116
1117/* 1117/*
1118 * On NAND we try to mark this block bad. If the block was erased more 1118 * On NAND we try to mark this block bad. If the block was erased more
1119 * than MAX_ERASE_FAILURES we mark it finaly bad. 1119 * than MAX_ERASE_FAILURES we mark it finally bad.
1120 * Don't care about failures. This block remains on the erase-pending 1120 * Don't care about failures. This block remains on the erase-pending
1121 * or badblock list as long as nobody manipulates the flash with 1121 * or badblock list as long as nobody manipulates the flash with
1122 * a bootloader or something like that. 1122 * a bootloader or something like that.
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index c819eb0e982d..30d175b6d290 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -424,7 +424,9 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
424 return ret; 424 return ret;
425} 425}
426 426
427int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, const char *name, int namelen) 427int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
428 struct jffs2_inode_info *f, struct jffs2_raw_inode *ri,
429 const struct qstr *qstr)
428{ 430{
429 struct jffs2_raw_dirent *rd; 431 struct jffs2_raw_dirent *rd;
430 struct jffs2_full_dnode *fn; 432 struct jffs2_full_dnode *fn;
@@ -466,15 +468,15 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
466 mutex_unlock(&f->sem); 468 mutex_unlock(&f->sem);
467 jffs2_complete_reservation(c); 469 jffs2_complete_reservation(c);
468 470
469 ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode); 471 ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode, qstr);
470 if (ret) 472 if (ret)
471 return ret; 473 return ret;
472 ret = jffs2_init_acl_post(&f->vfs_inode); 474 ret = jffs2_init_acl_post(&f->vfs_inode);
473 if (ret) 475 if (ret)
474 return ret; 476 return ret;
475 477
476 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 478 ret = jffs2_reserve_space(c, sizeof(*rd)+qstr->len, &alloclen,
477 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 479 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(qstr->len));
478 480
479 if (ret) { 481 if (ret) {
480 /* Eep. */ 482 /* Eep. */
@@ -493,19 +495,19 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
493 495
494 rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); 496 rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
495 rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); 497 rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT);
496 rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); 498 rd->totlen = cpu_to_je32(sizeof(*rd) + qstr->len);
497 rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); 499 rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4));
498 500
499 rd->pino = cpu_to_je32(dir_f->inocache->ino); 501 rd->pino = cpu_to_je32(dir_f->inocache->ino);
500 rd->version = cpu_to_je32(++dir_f->highest_version); 502 rd->version = cpu_to_je32(++dir_f->highest_version);
501 rd->ino = ri->ino; 503 rd->ino = ri->ino;
502 rd->mctime = ri->ctime; 504 rd->mctime = ri->ctime;
503 rd->nsize = namelen; 505 rd->nsize = qstr->len;
504 rd->type = DT_REG; 506 rd->type = DT_REG;
505 rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); 507 rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
506 rd->name_crc = cpu_to_je32(crc32(0, name, namelen)); 508 rd->name_crc = cpu_to_je32(crc32(0, qstr->name, qstr->len));
507 509
508 fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL); 510 fd = jffs2_write_dirent(c, dir_f, rd, qstr->name, qstr->len, ALLOC_NORMAL);
509 511
510 jffs2_free_raw_dirent(rd); 512 jffs2_free_raw_dirent(rd);
511 513
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 4f9cc0482949..3e93cdd19005 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -31,7 +31,7 @@
31 * is used to release xattr name/value pair and detach from c->xattrindex. 31 * is used to release xattr name/value pair and detach from c->xattrindex.
32 * reclaim_xattr_datum(c) 32 * reclaim_xattr_datum(c)
33 * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when 33 * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
34 * memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold 34 * memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold
35 * is hard coded as 32KiB. 35 * is hard coded as 32KiB.
36 * do_verify_xattr_datum(c, xd) 36 * do_verify_xattr_datum(c, xd)
37 * is used to load the xdatum informations without name/value pair from the medium. 37 * is used to load the xdatum informations without name/value pair from the medium.
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index cf4f5759b42b..7be4beb306f3 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -121,10 +121,11 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
121#endif /* CONFIG_JFFS2_FS_XATTR */ 121#endif /* CONFIG_JFFS2_FS_XATTR */
122 122
123#ifdef CONFIG_JFFS2_FS_SECURITY 123#ifdef CONFIG_JFFS2_FS_SECURITY
124extern int jffs2_init_security(struct inode *inode, struct inode *dir); 124extern int jffs2_init_security(struct inode *inode, struct inode *dir,
125 const struct qstr *qstr);
125extern const struct xattr_handler jffs2_security_xattr_handler; 126extern const struct xattr_handler jffs2_security_xattr_handler;
126#else 127#else
127#define jffs2_init_security(inode,dir) (0) 128#define jffs2_init_security(inode,dir,qstr) (0)
128#endif /* CONFIG_JFFS2_FS_SECURITY */ 129#endif /* CONFIG_JFFS2_FS_SECURITY */
129 130
130#endif /* _JFFS2_FS_XATTR_H_ */ 131#endif /* _JFFS2_FS_XATTR_H_ */
diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile
index 3adb6395e42d..a58fa72d7e59 100644
--- a/fs/jfs/Makefile
+++ b/fs/jfs/Makefile
@@ -13,4 +13,4 @@ jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \
13 13
14jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o 14jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o
15 15
16EXTRA_CFLAGS += -D_JFS_4K 16ccflags-y := -D_JFS_4K
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9978803ceedc..eddbb373209e 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -352,7 +352,6 @@ const struct address_space_operations jfs_aops = {
352 .readpages = jfs_readpages, 352 .readpages = jfs_readpages,
353 .writepage = jfs_writepage, 353 .writepage = jfs_writepage,
354 .writepages = jfs_writepages, 354 .writepages = jfs_writepages,
355 .sync_page = block_sync_page,
356 .write_begin = jfs_write_begin, 355 .write_begin = jfs_write_begin,
357 .write_end = nobh_write_end, 356 .write_end = nobh_write_end,
358 .bmap = jfs_bmap, 357 .bmap = jfs_bmap,
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index afe222bf300f..6f98a1866776 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -72,7 +72,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
72 if (err) 72 if (err)
73 return err; 73 return err;
74 74
75 if (!is_owner_or_cap(inode)) { 75 if (!inode_owner_or_capable(inode)) {
76 err = -EACCES; 76 err = -EACCES;
77 goto setflags_out; 77 goto setflags_out;
78 } 78 }
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index c92ea3b3ea5e..4496872cf4e7 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -1649,7 +1649,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1649 } 1649 }
1650 1650
1651 /* search the tree within the dmap control page for 1651 /* search the tree within the dmap control page for
1652 * sufficent free space. if sufficient free space is found, 1652 * sufficient free space. if sufficient free space is found,
1653 * dbFindLeaf() returns the index of the leaf at which 1653 * dbFindLeaf() returns the index of the leaf at which
1654 * free space was found. 1654 * free space was found.
1655 */ 1655 */
@@ -2744,7 +2744,7 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval)
2744 /* check which (leafno or buddy) is the left buddy. 2744 /* check which (leafno or buddy) is the left buddy.
2745 * the left buddy gets to claim the blocks resulting 2745 * the left buddy gets to claim the blocks resulting
2746 * from the join while the right gets to claim none. 2746 * from the join while the right gets to claim none.
2747 * the left buddy is also eligable to participate in 2747 * the left buddy is also eligible to participate in
2748 * a join at the next higher level while the right 2748 * a join at the next higher level while the right
2749 * is not. 2749 * is not.
2750 * 2750 *
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 5d3bbd10f8db..e5fe8506ed16 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -126,7 +126,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
126 126
127 /* allocate the disk blocks for the extent. initially, extBalloc() 127 /* allocate the disk blocks for the extent. initially, extBalloc()
128 * will try to allocate disk blocks for the requested size (xlen). 128 * will try to allocate disk blocks for the requested size (xlen).
129 * if this fails (xlen contiguous free blocks not avaliable), it'll 129 * if this fails (xlen contiguous free blocks not available), it'll
130 * try to allocate a smaller number of blocks (producing a smaller 130 * try to allocate a smaller number of blocks (producing a smaller
131 * extent), with this smaller number of blocks consisting of the 131 * extent), with this smaller number of blocks consisting of the
132 * requested number of blocks rounded down to the next smaller 132 * requested number of blocks rounded down to the next smaller
@@ -481,7 +481,7 @@ int extFill(struct inode *ip, xad_t * xp)
481 * 481 *
482 * initially, we will try to allocate disk blocks for the 482 * initially, we will try to allocate disk blocks for the
483 * requested size (nblocks). if this fails (nblocks 483 * requested size (nblocks). if this fails (nblocks
484 * contiguous free blocks not avaliable), we'll try to allocate 484 * contiguous free blocks not available), we'll try to allocate
485 * a smaller number of blocks (producing a smaller extent), with 485 * a smaller number of blocks (producing a smaller extent), with
486 * this smaller number of blocks consisting of the requested 486 * this smaller number of blocks consisting of the requested
487 * number of blocks rounded down to the next smaller power of 2 487 * number of blocks rounded down to the next smaller power of 2
@@ -575,7 +575,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
575 * to a new set of blocks. If moving the extent, we initially 575 * to a new set of blocks. If moving the extent, we initially
576 * will try to allocate disk blocks for the requested size 576 * will try to allocate disk blocks for the requested size
577 * (newnblks). if this fails (new contiguous free blocks not 577 * (newnblks). if this fails (new contiguous free blocks not
578 * avaliable), we'll try to allocate a smaller number of 578 * available), we'll try to allocate a smaller number of
579 * blocks (producing a smaller extent), with this smaller 579 * blocks (producing a smaller extent), with this smaller
580 * number of blocks consisting of the requested number of 580 * number of blocks consisting of the requested number of
581 * blocks rounded down to the next smaller power of 2 581 * blocks rounded down to the next smaller power of 2
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 3a09423b6c22..ed53a4740168 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -1069,7 +1069,7 @@ int diFree(struct inode *ip)
1069 */ 1069 */
1070 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) { 1070 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
1071 /* in preparation for removing the iag from the 1071 /* in preparation for removing the iag from the
1072 * ag extent free list, read the iags preceeding 1072 * ag extent free list, read the iags preceding
1073 * and following the iag on the ag extent free 1073 * and following the iag on the ag extent free
1074 * list. 1074 * list.
1075 */ 1075 */
@@ -1095,7 +1095,7 @@ int diFree(struct inode *ip)
1095 int inofreefwd = le32_to_cpu(iagp->inofreefwd); 1095 int inofreefwd = le32_to_cpu(iagp->inofreefwd);
1096 1096
1097 /* in preparation for removing the iag from the 1097 /* in preparation for removing the iag from the
1098 * ag inode free list, read the iags preceeding 1098 * ag inode free list, read the iags preceding
1099 * and following the iag on the ag inode free 1099 * and following the iag on the ag inode free
1100 * list. before reading these iags, we must make 1100 * list. before reading these iags, we must make
1101 * sure that we already don't have them in hand 1101 * sure that we already don't have them in hand
@@ -1681,7 +1681,7 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
1681 * try to allocate a new extent of free inodes. 1681 * try to allocate a new extent of free inodes.
1682 */ 1682 */
1683 if (addext) { 1683 if (addext) {
1684 /* if free space is not avaliable for this new extent, try 1684 /* if free space is not available for this new extent, try
1685 * below to allocate a free and existing (already backed) 1685 * below to allocate a free and existing (already backed)
1686 * inode from the ag. 1686 * inode from the ag.
1687 */ 1687 */
@@ -2036,7 +2036,7 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2036 2036
2037 /* check if this is the last free inode within the iag. 2037 /* check if this is the last free inode within the iag.
2038 * if so, it will have to be removed from the ag free 2038 * if so, it will have to be removed from the ag free
2039 * inode list, so get the iags preceeding and following 2039 * inode list, so get the iags preceding and following
2040 * it on the list. 2040 * it on the list.
2041 */ 2041 */
2042 if (iagp->nfreeinos == cpu_to_le32(1)) { 2042 if (iagp->nfreeinos == cpu_to_le32(1)) {
@@ -2208,7 +2208,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2208 2208
2209 /* check if this is the last free extent within the 2209 /* check if this is the last free extent within the
2210 * iag. if so, the iag must be removed from the ag 2210 * iag. if so, the iag must be removed from the ag
2211 * free extent list, so get the iags preceeding and 2211 * free extent list, so get the iags preceding and
2212 * following the iag on this list. 2212 * following the iag on this list.
2213 */ 2213 */
2214 if (iagp->nfreeexts == cpu_to_le32(1)) { 2214 if (iagp->nfreeexts == cpu_to_le32(1)) {
@@ -2504,7 +2504,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2504 } 2504 }
2505 2505
2506 2506
2507 /* get the next avaliable iag number */ 2507 /* get the next available iag number */
2508 iagno = imap->im_nextiag; 2508 iagno = imap->im_nextiag;
2509 2509
2510 /* make sure that we have not exceeded the maximum inode 2510 /* make sure that we have not exceeded the maximum inode
@@ -2615,7 +2615,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2615 2615
2616 duplicateIXtree(sb, blkno, xlen, &xaddr); 2616 duplicateIXtree(sb, blkno, xlen, &xaddr);
2617 2617
2618 /* update the next avaliable iag number */ 2618 /* update the next available iag number */
2619 imap->im_nextiag += 1; 2619 imap->im_nextiag += 1;
2620 2620
2621 /* Add the iag to the iag free list so we don't lose the iag 2621 /* Add the iag to the iag free list so we don't lose the iag
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
index 9236bc49ae7f..e38c21598850 100644
--- a/fs/jfs/jfs_logmgr.h
+++ b/fs/jfs/jfs_logmgr.h
@@ -288,7 +288,7 @@ struct lrd {
288 /* 288 /*
289 * SYNCPT: log sync point 289 * SYNCPT: log sync point
290 * 290 *
291 * replay log upto syncpt address specified; 291 * replay log up to syncpt address specified;
292 */ 292 */
293 struct { 293 struct {
294 __le32 sync; /* 4: syncpt address (0 = here) */ 294 __le32 sync; /* 4: syncpt address (0 = here) */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 48b44bd8267b..6740d34cd82b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -583,7 +583,6 @@ static void metapage_invalidatepage(struct page *page, unsigned long offset)
583const struct address_space_operations jfs_metapage_aops = { 583const struct address_space_operations jfs_metapage_aops = {
584 .readpage = metapage_readpage, 584 .readpage = metapage_readpage,
585 .writepage = metapage_writepage, 585 .writepage = metapage_writepage,
586 .sync_page = block_sync_page,
587 .releasepage = metapage_releasepage, 586 .releasepage = metapage_releasepage,
588 .invalidatepage = metapage_invalidatepage, 587 .invalidatepage = metapage_invalidatepage,
589 .set_page_dirty = __set_page_dirty_nobuffers, 588 .set_page_dirty = __set_page_dirty_nobuffers,
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h
index d94f8d9e87d7..a78beda85f68 100644
--- a/fs/jfs/jfs_metapage.h
+++ b/fs/jfs/jfs_metapage.h
@@ -75,7 +75,7 @@ extern void grab_metapage(struct metapage *);
75extern void force_metapage(struct metapage *); 75extern void force_metapage(struct metapage *);
76 76
77/* 77/*
78 * hold_metapage and put_metapage are used in conjuction. The page lock 78 * hold_metapage and put_metapage are used in conjunction. The page lock
79 * is not dropped between the two, so no other threads can get or release 79 * is not dropped between the two, so no other threads can get or release
80 * the metapage 80 * the metapage
81 */ 81 */
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 9466957ec841..f6cc0c09ec63 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -636,7 +636,7 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
636 * the inode of the page and available to all anonymous 636 * the inode of the page and available to all anonymous
637 * transactions until txCommit() time at which point 637 * transactions until txCommit() time at which point
638 * they are transferred to the transaction tlock list of 638 * they are transferred to the transaction tlock list of
639 * the commiting transaction of the inode) 639 * the committing transaction of the inode)
640 */ 640 */
641 if (xtid == 0) { 641 if (xtid == 0) {
642 tlck->tid = tid; 642 tlck->tid = tid;
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index 88b6cc535bf2..e9e100fd7c09 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -62,10 +62,11 @@ extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
62extern int jfs_removexattr(struct dentry *, const char *); 62extern int jfs_removexattr(struct dentry *, const char *);
63 63
64#ifdef CONFIG_JFS_SECURITY 64#ifdef CONFIG_JFS_SECURITY
65extern int jfs_init_security(tid_t, struct inode *, struct inode *); 65extern int jfs_init_security(tid_t, struct inode *, struct inode *,
66 const struct qstr *);
66#else 67#else
67static inline int jfs_init_security(tid_t tid, struct inode *inode, 68static inline int jfs_init_security(tid_t tid, struct inode *inode,
68 struct inode *dir) 69 struct inode *dir, const struct qstr *qstr)
69{ 70{
70 return 0; 71 return 0;
71} 72}
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 81ead850ddb6..eaaf2b511e89 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -115,7 +115,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
115 if (rc) 115 if (rc)
116 goto out3; 116 goto out3;
117 117
118 rc = jfs_init_security(tid, ip, dip); 118 rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
119 if (rc) { 119 if (rc) {
120 txAbort(tid, 0); 120 txAbort(tid, 0);
121 goto out3; 121 goto out3;
@@ -253,7 +253,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
253 if (rc) 253 if (rc)
254 goto out3; 254 goto out3;
255 255
256 rc = jfs_init_security(tid, ip, dip); 256 rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
257 if (rc) { 257 if (rc) {
258 txAbort(tid, 0); 258 txAbort(tid, 0);
259 goto out3; 259 goto out3;
@@ -809,9 +809,6 @@ static int jfs_link(struct dentry *old_dentry,
809 if (ip->i_nlink == JFS_LINK_MAX) 809 if (ip->i_nlink == JFS_LINK_MAX)
810 return -EMLINK; 810 return -EMLINK;
811 811
812 if (ip->i_nlink == 0)
813 return -ENOENT;
814
815 dquot_initialize(dir); 812 dquot_initialize(dir);
816 813
817 tid = txBegin(ip->i_sb, 0); 814 tid = txBegin(ip->i_sb, 0);
@@ -932,7 +929,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
932 mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); 929 mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
933 mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); 930 mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
934 931
935 rc = jfs_init_security(tid, ip, dip); 932 rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
936 if (rc) 933 if (rc)
937 goto out3; 934 goto out3;
938 935
@@ -1395,7 +1392,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1395 if (rc) 1392 if (rc)
1396 goto out3; 1393 goto out3;
1397 1394
1398 rc = jfs_init_security(tid, ip, dir); 1395 rc = jfs_init_security(tid, ip, dir, &dentry->d_name);
1399 if (rc) { 1396 if (rc) {
1400 txAbort(tid, 0); 1397 txAbort(tid, 0);
1401 goto out3; 1398 goto out3;
@@ -1600,7 +1597,7 @@ out:
1600 1597
1601static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) 1598static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
1602{ 1599{
1603 if (nd->flags & LOOKUP_RCU) 1600 if (nd && nd->flags & LOOKUP_RCU)
1604 return -ECHILD; 1601 return -ECHILD;
1605 /* 1602 /*
1606 * This is not negative dentry. Always valid. 1603 * This is not negative dentry. Always valid.
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 1aba0039f1c9..8ea5efb5a34e 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -57,7 +57,7 @@
57 * 2. compute new FSCKSize from new LVSize; 57 * 2. compute new FSCKSize from new LVSize;
58 * 3. set new FSSize as MIN(FSSize, LVSize-(LogSize+FSCKSize)) where 58 * 3. set new FSSize as MIN(FSSize, LVSize-(LogSize+FSCKSize)) where
59 * assert(new FSSize >= old FSSize), 59 * assert(new FSSize >= old FSSize),
60 * i.e., file system must not be shrinked; 60 * i.e., file system must not be shrunk;
61 */ 61 */
62int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) 62int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
63{ 63{
@@ -182,7 +182,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
182 */ 182 */
183 newFSSize = newLVSize - newLogSize - newFSCKSize; 183 newFSSize = newLVSize - newLogSize - newFSCKSize;
184 184
185 /* file system cannot be shrinked */ 185 /* file system cannot be shrunk */
186 if (newFSSize < bmp->db_mapsize) { 186 if (newFSSize < bmp->db_mapsize) {
187 rc = -EINVAL; 187 rc = -EINVAL;
188 goto out; 188 goto out;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index eeca48a031ab..06c8a67cbe76 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -644,7 +644,7 @@ static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
644 644
645/* Read data from quotafile - avoid pagecache and such because we cannot afford 645/* Read data from quotafile - avoid pagecache and such because we cannot afford
646 * acquiring the locks... As quota files are never truncated and quota code 646 * acquiring the locks... As quota files are never truncated and quota code
647 * itself serializes the operations (and noone else should touch the files) 647 * itself serializes the operations (and no one else should touch the files)
648 * we don't have to be afraid of races */ 648 * we don't have to be afraid of races */
649static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, 649static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data,
650 size_t len, loff_t off) 650 size_t len, loff_t off)
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 2d7f165d0f1d..24838f1eeee5 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -678,7 +678,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
678 struct posix_acl *acl; 678 struct posix_acl *acl;
679 int rc; 679 int rc;
680 680
681 if (!is_owner_or_cap(inode)) 681 if (!inode_owner_or_capable(inode))
682 return -EPERM; 682 return -EPERM;
683 683
684 /* 684 /*
@@ -1091,7 +1091,8 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
1091} 1091}
1092 1092
1093#ifdef CONFIG_JFS_SECURITY 1093#ifdef CONFIG_JFS_SECURITY
1094int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir) 1094int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
1095 const struct qstr *qstr)
1095{ 1096{
1096 int rc; 1097 int rc;
1097 size_t len; 1098 size_t len;
@@ -1099,7 +1100,8 @@ int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir)
1099 char *suffix; 1100 char *suffix;
1100 char *name; 1101 char *name;
1101 1102
1102 rc = security_inode_init_security(inode, dir, &suffix, &value, &len); 1103 rc = security_inode_init_security(inode, dir, qstr, &suffix, &value,
1104 &len);
1103 if (rc) { 1105 if (rc) {
1104 if (rc == -EOPNOTSUPP) 1106 if (rc == -EOPNOTSUPP)
1105 return 0; 1107 return 0;
diff --git a/fs/locks.c b/fs/locks.c
index 0f3998291f78..0a4f50dfadfb 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -145,7 +145,6 @@ static DEFINE_SPINLOCK(file_lock_lock);
145 145
146/* 146/*
147 * Protects the two list heads above, plus the inode->i_flock list 147 * Protects the two list heads above, plus the inode->i_flock list
148 * FIXME: should use a spinlock, once lockd and ceph are ready.
149 */ 148 */
150void lock_flocks(void) 149void lock_flocks(void)
151{ 150{
@@ -415,17 +414,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
415 fl->fl_ops = NULL; 414 fl->fl_ops = NULL;
416 fl->fl_lmops = NULL; 415 fl->fl_lmops = NULL;
417 416
418 switch (l->l_type) { 417 return assign_type(fl, l->l_type);
419 case F_RDLCK:
420 case F_WRLCK:
421 case F_UNLCK:
422 fl->fl_type = l->l_type;
423 break;
424 default:
425 return -EINVAL;
426 }
427
428 return (0);
429} 418}
430#endif 419#endif
431 420
diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c
index 44bbfd249abc..961f02b86d97 100644
--- a/fs/logfs/compr.c
+++ b/fs/logfs/compr.c
@@ -81,7 +81,7 @@ error:
81 81
82int __init logfs_compr_init(void) 82int __init logfs_compr_init(void)
83{ 83{
84 size_t size = max(zlib_deflate_workspacesize(), 84 size_t size = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
85 zlib_inflate_workspacesize()); 85 zlib_inflate_workspacesize());
86 stream.workspace = vmalloc(size); 86 stream.workspace = vmalloc(size);
87 if (!stream.workspace) 87 if (!stream.workspace)
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 723bc5bca09a..1adc8d455f0e 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -39,7 +39,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
39 bio.bi_end_io = request_complete; 39 bio.bi_end_io = request_complete;
40 40
41 submit_bio(rw, &bio); 41 submit_bio(rw, &bio);
42 generic_unplug_device(bdev_get_queue(bdev));
43 wait_for_completion(&complete); 42 wait_for_completion(&complete);
44 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO; 43 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
45} 44}
@@ -168,7 +167,6 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
168 } 167 }
169 len = PAGE_ALIGN(len); 168 len = PAGE_ALIGN(len);
170 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); 169 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
171 generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev));
172} 170}
173 171
174 172
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index 7466e9dcc8c5..339e17e9133d 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -60,7 +60,7 @@ static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
60 * asynchronous properties. So just to prevent the first implementor of such 60 * asynchronous properties. So just to prevent the first implementor of such
61 * a thing from breaking logfs in 2350, we do the usual pointless dance to 61 * a thing from breaking logfs in 2350, we do the usual pointless dance to
62 * declare a completion variable and wait for completion before returning 62 * declare a completion variable and wait for completion before returning
63 * from mtd_erase(). What an excercise in futility! 63 * from mtd_erase(). What an exercise in futility!
64 */ 64 */
65static void logfs_erase_callback(struct erase_info *ei) 65static void logfs_erase_callback(struct erase_info *ei)
66{ 66{
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index f9ddf0c388c8..9ed89d1663f8 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -92,7 +92,7 @@ static int beyond_eof(struct inode *inode, loff_t bix)
92 * so short names (len <= 9) don't even occupy the complete 32bit name 92 * so short names (len <= 9) don't even occupy the complete 32bit name
93 * space. A prime >256 ensures short names quickly spread the 32bit 93 * space. A prime >256 ensures short names quickly spread the 32bit
94 * name space. Add about 26 for the estimated amount of information 94 * name space. Add about 26 for the estimated amount of information
95 * of each character and pick a prime nearby, preferrably a bit-sparse 95 * of each character and pick a prime nearby, preferably a bit-sparse
96 * one. 96 * one.
97 */ 97 */
98static u32 hash_32(const char *s, int len, u32 seed) 98static u32 hash_32(const char *s, int len, u32 seed)
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index e86376b87af1..c2ad7028def4 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -196,7 +196,7 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
196 if (IS_RDONLY(inode)) 196 if (IS_RDONLY(inode))
197 return -EROFS; 197 return -EROFS;
198 198
199 if (!is_owner_or_cap(inode)) 199 if (!inode_owner_or_capable(inode))
200 return -EACCES; 200 return -EACCES;
201 201
202 err = get_user(flags, (int __user *)arg); 202 err = get_user(flags, (int __user *)arg);
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 03b8c240aeda..edfea7a3a747 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -293,7 +293,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
293 return ret; 293 return ret;
294} 294}
295 295
296/* called with inode_lock held */ 296/* called with inode->i_lock held */
297static int logfs_drop_inode(struct inode *inode) 297static int logfs_drop_inode(struct inode *inode)
298{ 298{
299 struct logfs_super *super = logfs_super(inode->i_sb); 299 struct logfs_super *super = logfs_super(inode->i_sb);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index ee99a9f5dfd3..9e22085231b3 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1616,7 +1616,7 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
1616 err = logfs_write_buf(inode, page, flags); 1616 err = logfs_write_buf(inode, page, flags);
1617 if (!err && shrink_level(gc_level) == 0) { 1617 if (!err && shrink_level(gc_level) == 0) {
1618 /* Rewrite cannot mark the inode dirty but has to 1618 /* Rewrite cannot mark the inode dirty but has to
1619 * write it immediatly. 1619 * write it immediately.
1620 * Q: Can't we just create an alias for the inode 1620 * Q: Can't we just create an alias for the inode
1621 * instead? And if not, why not? 1621 * instead? And if not, why not?
1622 */ 1622 */
diff --git a/fs/mbcache.c b/fs/mbcache.c
index a25444ab2baf..2f174be06555 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -542,7 +542,7 @@ __mb_cache_entry_find(struct list_head *l, struct list_head *head,
542 * mb_cache_entry_find_first() 542 * mb_cache_entry_find_first()
543 * 543 *
544 * Find the first cache entry on a given device with a certain key in 544 * Find the first cache entry on a given device with a certain key in
545 * an additional index. Additonal matches can be found with 545 * an additional index. Additional matches can be found with
546 * mb_cache_entry_find_next(). Returns NULL if no match was found. The 546 * mb_cache_entry_find_next(). Returns NULL if no match was found. The
547 * returned cache entry is locked for shared access ("multiple readers"). 547 * returned cache entry is locked for shared access ("multiple readers").
548 * 548 *
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
index 0fd7ca994264..6624684dd5de 100644
--- a/fs/minix/Kconfig
+++ b/fs/minix/Kconfig
@@ -15,3 +15,11 @@ config MINIX_FS
15 module will be called minix. Note that the file system of your root 15 module will be called minix. Note that the file system of your root
16 partition (the one containing the directory /) cannot be compiled as 16 partition (the one containing the directory /) cannot be compiled as
17 a module. 17 a module.
18
19config MINIX_FS_NATIVE_ENDIAN
20 def_bool MINIX_FS
21 depends on H8300 || M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU)
22
23config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED
24 def_bool MINIX_FS
25 depends on M68K && MMU
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index ae0b83f476a6..adcdc0a4e182 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -399,7 +399,6 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block)
399static const struct address_space_operations minix_aops = { 399static const struct address_space_operations minix_aops = {
400 .readpage = minix_readpage, 400 .readpage = minix_readpage,
401 .writepage = minix_writepage, 401 .writepage = minix_writepage,
402 .sync_page = block_sync_page,
403 .write_begin = minix_write_begin, 402 .write_begin = minix_write_begin,
404 .write_end = generic_write_end, 403 .write_end = generic_write_end,
405 .bmap = minix_bmap 404 .bmap = minix_bmap
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 407b1c84911e..341e2122879a 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -88,4 +88,78 @@ static inline struct minix_inode_info *minix_i(struct inode *inode)
88 return list_entry(inode, struct minix_inode_info, vfs_inode); 88 return list_entry(inode, struct minix_inode_info, vfs_inode);
89} 89}
90 90
91#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \
92 defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
93
94#error Minix file system byte order broken
95
96#elif defined(CONFIG_MINIX_FS_NATIVE_ENDIAN)
97
98/*
99 * big-endian 32 or 64 bit indexed bitmaps on big-endian system or
100 * little-endian bitmaps on little-endian system
101 */
102
103#define minix_test_and_set_bit(nr, addr) \
104 __test_and_set_bit((nr), (unsigned long *)(addr))
105#define minix_set_bit(nr, addr) \
106 __set_bit((nr), (unsigned long *)(addr))
107#define minix_test_and_clear_bit(nr, addr) \
108 __test_and_clear_bit((nr), (unsigned long *)(addr))
109#define minix_test_bit(nr, addr) \
110 test_bit((nr), (unsigned long *)(addr))
111#define minix_find_first_zero_bit(addr, size) \
112 find_first_zero_bit((unsigned long *)(addr), (size))
113
114#elif defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
115
116/*
117 * big-endian 16bit indexed bitmaps
118 */
119
120static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size)
121{
122 const unsigned short *p = vaddr, *addr = vaddr;
123 unsigned short num;
124
125 if (!size)
126 return 0;
127
128 size = (size >> 4) + ((size & 15) > 0);
129 while (*p++ == 0xffff) {
130 if (--size == 0)
131 return (p - addr) << 4;
132 }
133
134 num = *--p;
135 return ((p - addr) << 4) + ffz(num);
136}
137
138#define minix_test_and_set_bit(nr, addr) \
139 __test_and_set_bit((nr) ^ 16, (unsigned long *)(addr))
140#define minix_set_bit(nr, addr) \
141 __set_bit((nr) ^ 16, (unsigned long *)(addr))
142#define minix_test_and_clear_bit(nr, addr) \
143 __test_and_clear_bit((nr) ^ 16, (unsigned long *)(addr))
144
145static inline int minix_test_bit(int nr, const void *vaddr)
146{
147 const unsigned short *p = vaddr;
148 return (p[nr >> 4] & (1U << (nr & 15))) != 0;
149}
150
151#else
152
153/*
154 * little-endian bitmaps
155 */
156
157#define minix_test_and_set_bit __test_and_set_bit_le
158#define minix_set_bit __set_bit_le
159#define minix_test_and_clear_bit __test_and_clear_bit_le
160#define minix_test_bit test_bit_le
161#define minix_find_first_zero_bit find_first_zero_bit_le
162
163#endif
164
91#endif /* FS_MINIX_H */ 165#endif /* FS_MINIX_H */
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index ce7337ddfdbf..6e6777f1b4b2 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -213,7 +213,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
213 new_de = minix_find_entry(new_dentry, &new_page); 213 new_de = minix_find_entry(new_dentry, &new_page);
214 if (!new_de) 214 if (!new_de)
215 goto out_dir; 215 goto out_dir;
216 inode_inc_link_count(old_inode);
217 minix_set_link(new_de, new_page, old_inode); 216 minix_set_link(new_de, new_page, old_inode);
218 new_inode->i_ctime = CURRENT_TIME_SEC; 217 new_inode->i_ctime = CURRENT_TIME_SEC;
219 if (dir_de) 218 if (dir_de)
@@ -225,18 +224,15 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
225 if (new_dir->i_nlink >= info->s_link_max) 224 if (new_dir->i_nlink >= info->s_link_max)
226 goto out_dir; 225 goto out_dir;
227 } 226 }
228 inode_inc_link_count(old_inode);
229 err = minix_add_link(new_dentry, old_inode); 227 err = minix_add_link(new_dentry, old_inode);
230 if (err) { 228 if (err)
231 inode_dec_link_count(old_inode);
232 goto out_dir; 229 goto out_dir;
233 }
234 if (dir_de) 230 if (dir_de)
235 inode_inc_link_count(new_dir); 231 inode_inc_link_count(new_dir);
236 } 232 }
237 233
238 minix_delete_entry(old_de, old_page); 234 minix_delete_entry(old_de, old_page);
239 inode_dec_link_count(old_inode); 235 mark_inode_dirty(old_inode);
240 236
241 if (dir_de) { 237 if (dir_de) {
242 minix_set_link(dir_de, dir_page, new_dir); 238 minix_set_link(dir_de, dir_page, new_dir);
diff --git a/fs/mpage.c b/fs/mpage.c
index d78455a81ec9..0afc809e46e0 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -364,6 +364,9 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
364 sector_t last_block_in_bio = 0; 364 sector_t last_block_in_bio = 0;
365 struct buffer_head map_bh; 365 struct buffer_head map_bh;
366 unsigned long first_logical_block = 0; 366 unsigned long first_logical_block = 0;
367 struct blk_plug plug;
368
369 blk_start_plug(&plug);
367 370
368 map_bh.b_state = 0; 371 map_bh.b_state = 0;
369 map_bh.b_size = 0; 372 map_bh.b_size = 0;
@@ -385,6 +388,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
385 BUG_ON(!list_empty(pages)); 388 BUG_ON(!list_empty(pages));
386 if (bio) 389 if (bio)
387 mpage_bio_submit(READ, bio); 390 mpage_bio_submit(READ, bio);
391 blk_finish_plug(&plug);
388 return 0; 392 return 0;
389} 393}
390EXPORT_SYMBOL(mpage_readpages); 394EXPORT_SYMBOL(mpage_readpages);
@@ -666,8 +670,11 @@ int
666mpage_writepages(struct address_space *mapping, 670mpage_writepages(struct address_space *mapping,
667 struct writeback_control *wbc, get_block_t get_block) 671 struct writeback_control *wbc, get_block_t get_block)
668{ 672{
673 struct blk_plug plug;
669 int ret; 674 int ret;
670 675
676 blk_start_plug(&plug);
677
671 if (!get_block) 678 if (!get_block)
672 ret = generic_writepages(mapping, wbc); 679 ret = generic_writepages(mapping, wbc);
673 else { 680 else {
@@ -682,6 +689,7 @@ mpage_writepages(struct address_space *mapping,
682 if (mpd.bio) 689 if (mpd.bio)
683 mpage_bio_submit(WRITE, mpd.bio); 690 mpage_bio_submit(WRITE, mpd.bio);
684 } 691 }
692 blk_finish_plug(&plug);
685 return ret; 693 return ret;
686} 694}
687EXPORT_SYMBOL(mpage_writepages); 695EXPORT_SYMBOL(mpage_writepages);
diff --git a/fs/namei.c b/fs/namei.c
index 7d77f24d32a9..54fc993e3027 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -70,7 +70,7 @@
70 * name indicated by the symlink. The old code always complained that the 70 * name indicated by the symlink. The old code always complained that the
71 * name already exists, due to not following the symlink even if its target 71 * name already exists, due to not following the symlink even if its target
72 * is nonexistent. The new semantics affects also mknod() and link() when 72 * is nonexistent. The new semantics affects also mknod() and link() when
73 * the name is a symlink pointing to a non-existant name. 73 * the name is a symlink pointing to a non-existent name.
74 * 74 *
75 * I don't know which semantics is the right one, since I have no access 75 * I don't know which semantics is the right one, since I have no access
76 * to standards. But I found by trial that HP-UX 9.0 has the full "new" 76 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
@@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page)
136 return retval; 136 return retval;
137} 137}
138 138
139char * getname(const char __user * filename) 139static char *getname_flags(const char __user * filename, int flags)
140{ 140{
141 char *tmp, *result; 141 char *tmp, *result;
142 142
@@ -147,14 +147,21 @@ char * getname(const char __user * filename)
147 147
148 result = tmp; 148 result = tmp;
149 if (retval < 0) { 149 if (retval < 0) {
150 __putname(tmp); 150 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
151 result = ERR_PTR(retval); 151 __putname(tmp);
152 result = ERR_PTR(retval);
153 }
152 } 154 }
153 } 155 }
154 audit_getname(result); 156 audit_getname(result);
155 return result; 157 return result;
156} 158}
157 159
160char *getname(const char __user * filename)
161{
162 return getname_flags(filename, 0);
163}
164
158#ifdef CONFIG_AUDITSYSCALL 165#ifdef CONFIG_AUDITSYSCALL
159void putname(const char *name) 166void putname(const char *name)
160{ 167{
@@ -176,6 +183,9 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
176 183
177 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 184 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
178 185
186 if (current_user_ns() != inode_userns(inode))
187 goto other_perms;
188
179 if (current_fsuid() == inode->i_uid) 189 if (current_fsuid() == inode->i_uid)
180 mode >>= 6; 190 mode >>= 6;
181 else { 191 else {
@@ -189,6 +199,7 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
189 mode >>= 3; 199 mode >>= 3;
190 } 200 }
191 201
202other_perms:
192 /* 203 /*
193 * If the DACs are ok we don't need any capability check. 204 * If the DACs are ok we don't need any capability check.
194 */ 205 */
@@ -230,7 +241,7 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags,
230 * Executable DACs are overridable if at least one exec bit is set. 241 * Executable DACs are overridable if at least one exec bit is set.
231 */ 242 */
232 if (!(mask & MAY_EXEC) || execute_ok(inode)) 243 if (!(mask & MAY_EXEC) || execute_ok(inode))
233 if (capable(CAP_DAC_OVERRIDE)) 244 if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
234 return 0; 245 return 0;
235 246
236 /* 247 /*
@@ -238,7 +249,7 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags,
238 */ 249 */
239 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 250 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
240 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) 251 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
241 if (capable(CAP_DAC_READ_SEARCH)) 252 if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
242 return 0; 253 return 0;
243 254
244 return -EACCES; 255 return -EACCES;
@@ -401,9 +412,11 @@ static int nameidata_drop_rcu(struct nameidata *nd)
401{ 412{
402 struct fs_struct *fs = current->fs; 413 struct fs_struct *fs = current->fs;
403 struct dentry *dentry = nd->path.dentry; 414 struct dentry *dentry = nd->path.dentry;
415 int want_root = 0;
404 416
405 BUG_ON(!(nd->flags & LOOKUP_RCU)); 417 BUG_ON(!(nd->flags & LOOKUP_RCU));
406 if (nd->root.mnt) { 418 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
419 want_root = 1;
407 spin_lock(&fs->lock); 420 spin_lock(&fs->lock);
408 if (nd->root.mnt != fs->root.mnt || 421 if (nd->root.mnt != fs->root.mnt ||
409 nd->root.dentry != fs->root.dentry) 422 nd->root.dentry != fs->root.dentry)
@@ -414,7 +427,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
414 goto err; 427 goto err;
415 BUG_ON(nd->inode != dentry->d_inode); 428 BUG_ON(nd->inode != dentry->d_inode);
416 spin_unlock(&dentry->d_lock); 429 spin_unlock(&dentry->d_lock);
417 if (nd->root.mnt) { 430 if (want_root) {
418 path_get(&nd->root); 431 path_get(&nd->root);
419 spin_unlock(&fs->lock); 432 spin_unlock(&fs->lock);
420 } 433 }
@@ -427,7 +440,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
427err: 440err:
428 spin_unlock(&dentry->d_lock); 441 spin_unlock(&dentry->d_lock);
429err_root: 442err_root:
430 if (nd->root.mnt) 443 if (want_root)
431 spin_unlock(&fs->lock); 444 spin_unlock(&fs->lock);
432 return -ECHILD; 445 return -ECHILD;
433} 446}
@@ -454,17 +467,11 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
454{ 467{
455 struct fs_struct *fs = current->fs; 468 struct fs_struct *fs = current->fs;
456 struct dentry *parent = nd->path.dentry; 469 struct dentry *parent = nd->path.dentry;
457 470 int want_root = 0;
458 /*
459 * It can be possible to revalidate the dentry that we started
460 * the path walk with. force_reval_path may also revalidate the
461 * dentry already committed to the nameidata.
462 */
463 if (unlikely(parent == dentry))
464 return nameidata_drop_rcu(nd);
465 471
466 BUG_ON(!(nd->flags & LOOKUP_RCU)); 472 BUG_ON(!(nd->flags & LOOKUP_RCU));
467 if (nd->root.mnt) { 473 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
474 want_root = 1;
468 spin_lock(&fs->lock); 475 spin_lock(&fs->lock);
469 if (nd->root.mnt != fs->root.mnt || 476 if (nd->root.mnt != fs->root.mnt ||
470 nd->root.dentry != fs->root.dentry) 477 nd->root.dentry != fs->root.dentry)
@@ -484,7 +491,7 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
484 parent->d_count++; 491 parent->d_count++;
485 spin_unlock(&dentry->d_lock); 492 spin_unlock(&dentry->d_lock);
486 spin_unlock(&parent->d_lock); 493 spin_unlock(&parent->d_lock);
487 if (nd->root.mnt) { 494 if (want_root) {
488 path_get(&nd->root); 495 path_get(&nd->root);
489 spin_unlock(&fs->lock); 496 spin_unlock(&fs->lock);
490 } 497 }
@@ -498,7 +505,7 @@ err:
498 spin_unlock(&dentry->d_lock); 505 spin_unlock(&dentry->d_lock);
499 spin_unlock(&parent->d_lock); 506 spin_unlock(&parent->d_lock);
500err_root: 507err_root:
501 if (nd->root.mnt) 508 if (want_root)
502 spin_unlock(&fs->lock); 509 spin_unlock(&fs->lock);
503 return -ECHILD; 510 return -ECHILD;
504} 511}
@@ -506,8 +513,16 @@ err_root:
506/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ 513/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
507static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) 514static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
508{ 515{
509 if (nd->flags & LOOKUP_RCU) 516 if (nd->flags & LOOKUP_RCU) {
510 return nameidata_dentry_drop_rcu(nd, dentry); 517 if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
518 nd->flags &= ~LOOKUP_RCU;
519 if (!(nd->flags & LOOKUP_ROOT))
520 nd->root.mnt = NULL;
521 rcu_read_unlock();
522 br_read_unlock(vfsmount_lock);
523 return -ECHILD;
524 }
525 }
511 return 0; 526 return 0;
512} 527}
513 528
@@ -526,7 +541,8 @@ static int nameidata_drop_rcu_last(struct nameidata *nd)
526 541
527 BUG_ON(!(nd->flags & LOOKUP_RCU)); 542 BUG_ON(!(nd->flags & LOOKUP_RCU));
528 nd->flags &= ~LOOKUP_RCU; 543 nd->flags &= ~LOOKUP_RCU;
529 nd->root.mnt = NULL; 544 if (!(nd->flags & LOOKUP_ROOT))
545 nd->root.mnt = NULL;
530 spin_lock(&dentry->d_lock); 546 spin_lock(&dentry->d_lock);
531 if (!__d_rcu_to_refcount(dentry, nd->seq)) 547 if (!__d_rcu_to_refcount(dentry, nd->seq))
532 goto err_unlock; 548 goto err_unlock;
@@ -547,53 +563,31 @@ err_unlock:
547 return -ECHILD; 563 return -ECHILD;
548} 564}
549 565
550/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
551static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
552{
553 if (likely(nd->flags & LOOKUP_RCU))
554 return nameidata_drop_rcu_last(nd);
555 return 0;
556}
557
558/** 566/**
559 * release_open_intent - free up open intent resources 567 * release_open_intent - free up open intent resources
560 * @nd: pointer to nameidata 568 * @nd: pointer to nameidata
561 */ 569 */
562void release_open_intent(struct nameidata *nd) 570void release_open_intent(struct nameidata *nd)
563{ 571{
564 if (nd->intent.open.file->f_path.dentry == NULL) 572 struct file *file = nd->intent.open.file;
565 put_filp(nd->intent.open.file);
566 else
567 fput(nd->intent.open.file);
568}
569
570/*
571 * Call d_revalidate and handle filesystems that request rcu-walk
572 * to be dropped. This may be called and return in rcu-walk mode,
573 * regardless of success or error. If -ECHILD is returned, the caller
574 * must return -ECHILD back up the path walk stack so path walk may
575 * be restarted in ref-walk mode.
576 */
577static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
578{
579 int status;
580 573
581 status = dentry->d_op->d_revalidate(dentry, nd); 574 if (file && !IS_ERR(file)) {
582 if (status == -ECHILD) { 575 if (file->f_path.dentry == NULL)
583 if (nameidata_dentry_drop_rcu(nd, dentry)) 576 put_filp(file);
584 return status; 577 else
585 status = dentry->d_op->d_revalidate(dentry, nd); 578 fput(file);
586 } 579 }
580}
587 581
588 return status; 582static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd)
583{
584 return dentry->d_op->d_revalidate(dentry, nd);
589} 585}
590 586
591static inline struct dentry * 587static struct dentry *
592do_revalidate(struct dentry *dentry, struct nameidata *nd) 588do_revalidate(struct dentry *dentry, struct nameidata *nd)
593{ 589{
594 int status; 590 int status = d_revalidate(dentry, nd);
595
596 status = d_revalidate(dentry, nd);
597 if (unlikely(status <= 0)) { 591 if (unlikely(status <= 0)) {
598 /* 592 /*
599 * The dentry failed validation. 593 * The dentry failed validation.
@@ -602,37 +596,18 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
602 * to return a fail status. 596 * to return a fail status.
603 */ 597 */
604 if (status < 0) { 598 if (status < 0) {
605 /* If we're in rcu-walk, we don't have a ref */ 599 dput(dentry);
606 if (!(nd->flags & LOOKUP_RCU))
607 dput(dentry);
608 dentry = ERR_PTR(status); 600 dentry = ERR_PTR(status);
609 601 } else if (!d_invalidate(dentry)) {
610 } else { 602 dput(dentry);
611 /* Don't d_invalidate in rcu-walk mode */ 603 dentry = NULL;
612 if (nameidata_dentry_drop_rcu_maybe(nd, dentry))
613 return ERR_PTR(-ECHILD);
614 if (!d_invalidate(dentry)) {
615 dput(dentry);
616 dentry = NULL;
617 }
618 } 604 }
619 } 605 }
620 return dentry; 606 return dentry;
621} 607}
622 608
623static inline int need_reval_dot(struct dentry *dentry)
624{
625 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
626 return 0;
627
628 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
629 return 0;
630
631 return 1;
632}
633
634/* 609/*
635 * force_reval_path - force revalidation of a dentry 610 * handle_reval_path - force revalidation of a dentry
636 * 611 *
637 * In some situations the path walking code will trust dentries without 612 * In some situations the path walking code will trust dentries without
638 * revalidating them. This causes problems for filesystems that depend on 613 * revalidating them. This causes problems for filesystems that depend on
@@ -646,30 +621,28 @@ static inline int need_reval_dot(struct dentry *dentry)
646 * invalidate the dentry. It's up to the caller to handle putting references 621 * invalidate the dentry. It's up to the caller to handle putting references
647 * to the path if necessary. 622 * to the path if necessary.
648 */ 623 */
649static int 624static inline int handle_reval_path(struct nameidata *nd)
650force_reval_path(struct path *path, struct nameidata *nd)
651{ 625{
626 struct dentry *dentry = nd->path.dentry;
652 int status; 627 int status;
653 struct dentry *dentry = path->dentry;
654 628
655 /* 629 if (likely(!(nd->flags & LOOKUP_JUMPED)))
656 * only check on filesystems where it's possible for the dentry to 630 return 0;
657 * become stale. 631
658 */ 632 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
659 if (!need_reval_dot(dentry)) 633 return 0;
634
635 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
660 return 0; 636 return 0;
661 637
638 /* Note: we do not d_invalidate() */
662 status = d_revalidate(dentry, nd); 639 status = d_revalidate(dentry, nd);
663 if (status > 0) 640 if (status > 0)
664 return 0; 641 return 0;
665 642
666 if (!status) { 643 if (!status)
667 /* Don't d_invalidate in rcu-walk mode */
668 if (nameidata_drop_rcu(nd))
669 return -ECHILD;
670 d_invalidate(dentry);
671 status = -ESTALE; 644 status = -ESTALE;
672 } 645
673 return status; 646 return status;
674} 647}
675 648
@@ -685,6 +658,7 @@ force_reval_path(struct path *path, struct nameidata *nd)
685static inline int exec_permission(struct inode *inode, unsigned int flags) 658static inline int exec_permission(struct inode *inode, unsigned int flags)
686{ 659{
687 int ret; 660 int ret;
661 struct user_namespace *ns = inode_userns(inode);
688 662
689 if (inode->i_op->permission) { 663 if (inode->i_op->permission) {
690 ret = inode->i_op->permission(inode, MAY_EXEC, flags); 664 ret = inode->i_op->permission(inode, MAY_EXEC, flags);
@@ -697,7 +671,8 @@ static inline int exec_permission(struct inode *inode, unsigned int flags)
697 if (ret == -ECHILD) 671 if (ret == -ECHILD)
698 return ret; 672 return ret;
699 673
700 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) 674 if (ns_capable(ns, CAP_DAC_OVERRIDE) ||
675 ns_capable(ns, CAP_DAC_READ_SEARCH))
701 goto ok; 676 goto ok;
702 677
703 return ret; 678 return ret;
@@ -722,6 +697,7 @@ static __always_inline void set_root_rcu(struct nameidata *nd)
722 do { 697 do {
723 seq = read_seqcount_begin(&fs->seq); 698 seq = read_seqcount_begin(&fs->seq);
724 nd->root = fs->root; 699 nd->root = fs->root;
700 nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
725 } while (read_seqcount_retry(&fs->seq, seq)); 701 } while (read_seqcount_retry(&fs->seq, seq));
726 } 702 }
727} 703}
@@ -738,6 +714,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
738 path_put(&nd->path); 714 path_put(&nd->path);
739 nd->path = nd->root; 715 nd->path = nd->root;
740 path_get(&nd->root); 716 path_get(&nd->root);
717 nd->flags |= LOOKUP_JUMPED;
741 } 718 }
742 nd->inode = nd->path.dentry->d_inode; 719 nd->inode = nd->path.dentry->d_inode;
743 720
@@ -767,18 +744,43 @@ static inline void path_to_nameidata(const struct path *path,
767 nd->path.dentry = path->dentry; 744 nd->path.dentry = path->dentry;
768} 745}
769 746
747static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
748{
749 struct inode *inode = link->dentry->d_inode;
750 if (!IS_ERR(cookie) && inode->i_op->put_link)
751 inode->i_op->put_link(link->dentry, nd, cookie);
752 path_put(link);
753}
754
770static __always_inline int 755static __always_inline int
771__do_follow_link(const struct path *link, struct nameidata *nd, void **p) 756follow_link(struct path *link, struct nameidata *nd, void **p)
772{ 757{
773 int error; 758 int error;
774 struct dentry *dentry = link->dentry; 759 struct dentry *dentry = link->dentry;
775 760
776 touch_atime(link->mnt, dentry); 761 BUG_ON(nd->flags & LOOKUP_RCU);
777 nd_set_link(nd, NULL);
778 762
779 if (link->mnt == nd->path.mnt) 763 if (link->mnt == nd->path.mnt)
780 mntget(link->mnt); 764 mntget(link->mnt);
781 765
766 if (unlikely(current->total_link_count >= 40)) {
767 *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
768 path_put(&nd->path);
769 return -ELOOP;
770 }
771 cond_resched();
772 current->total_link_count++;
773
774 touch_atime(link->mnt, dentry);
775 nd_set_link(nd, NULL);
776
777 error = security_inode_follow_link(link->dentry, nd);
778 if (error) {
779 *p = ERR_PTR(error); /* no ->put_link(), please */
780 path_put(&nd->path);
781 return error;
782 }
783
782 nd->last_type = LAST_BIND; 784 nd->last_type = LAST_BIND;
783 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 785 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
784 error = PTR_ERR(*p); 786 error = PTR_ERR(*p);
@@ -788,50 +790,18 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
788 if (s) 790 if (s)
789 error = __vfs_follow_link(nd, s); 791 error = __vfs_follow_link(nd, s);
790 else if (nd->last_type == LAST_BIND) { 792 else if (nd->last_type == LAST_BIND) {
791 error = force_reval_path(&nd->path, nd); 793 nd->flags |= LOOKUP_JUMPED;
792 if (error) 794 nd->inode = nd->path.dentry->d_inode;
795 if (nd->inode->i_op->follow_link) {
796 /* stepped on a _really_ weird one */
793 path_put(&nd->path); 797 path_put(&nd->path);
798 error = -ELOOP;
799 }
794 } 800 }
795 } 801 }
796 return error; 802 return error;
797} 803}
798 804
799/*
800 * This limits recursive symlink follows to 8, while
801 * limiting consecutive symlinks to 40.
802 *
803 * Without that kind of total limit, nasty chains of consecutive
804 * symlinks can cause almost arbitrarily long lookups.
805 */
806static inline int do_follow_link(struct path *path, struct nameidata *nd)
807{
808 void *cookie;
809 int err = -ELOOP;
810 if (current->link_count >= MAX_NESTED_LINKS)
811 goto loop;
812 if (current->total_link_count >= 40)
813 goto loop;
814 BUG_ON(nd->depth >= MAX_NESTED_LINKS);
815 cond_resched();
816 err = security_inode_follow_link(path->dentry, nd);
817 if (err)
818 goto loop;
819 current->link_count++;
820 current->total_link_count++;
821 nd->depth++;
822 err = __do_follow_link(path, nd, &cookie);
823 if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
824 path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
825 path_put(path);
826 current->link_count--;
827 nd->depth--;
828 return err;
829loop:
830 path_put_conditional(path, nd);
831 path_put(&nd->path);
832 return err;
833}
834
835static int follow_up_rcu(struct path *path) 805static int follow_up_rcu(struct path *path)
836{ 806{
837 struct vfsmount *parent; 807 struct vfsmount *parent;
@@ -970,8 +940,7 @@ static int follow_managed(struct path *path, unsigned flags)
970 if (managed & DCACHE_MANAGE_TRANSIT) { 940 if (managed & DCACHE_MANAGE_TRANSIT) {
971 BUG_ON(!path->dentry->d_op); 941 BUG_ON(!path->dentry->d_op);
972 BUG_ON(!path->dentry->d_op->d_manage); 942 BUG_ON(!path->dentry->d_op->d_manage);
973 ret = path->dentry->d_op->d_manage(path->dentry, 943 ret = path->dentry->d_op->d_manage(path->dentry, false);
974 false, false);
975 if (ret < 0) 944 if (ret < 0)
976 return ret == -EISDIR ? 0 : ret; 945 return ret == -EISDIR ? 0 : ret;
977 } 946 }
@@ -1024,6 +993,12 @@ int follow_down_one(struct path *path)
1024 return 0; 993 return 0;
1025} 994}
1026 995
996static inline bool managed_dentry_might_block(struct dentry *dentry)
997{
998 return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
999 dentry->d_op->d_manage(dentry, true) < 0);
1000}
1001
1027/* 1002/*
1028 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we 1003 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we
1029 * meet a managed dentry and we're not walking to "..". True is returned to 1004 * meet a managed dentry and we're not walking to "..". True is returned to
@@ -1032,19 +1007,26 @@ int follow_down_one(struct path *path)
1032static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, 1007static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1033 struct inode **inode, bool reverse_transit) 1008 struct inode **inode, bool reverse_transit)
1034{ 1009{
1035 while (d_mountpoint(path->dentry)) { 1010 for (;;) {
1036 struct vfsmount *mounted; 1011 struct vfsmount *mounted;
1037 if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) && 1012 /*
1038 !reverse_transit && 1013 * Don't forget we might have a non-mountpoint managed dentry
1039 path->dentry->d_op->d_manage(path->dentry, false, true) < 0) 1014 * that wants to block transit.
1015 */
1016 *inode = path->dentry->d_inode;
1017 if (!reverse_transit &&
1018 unlikely(managed_dentry_might_block(path->dentry)))
1040 return false; 1019 return false;
1020
1021 if (!d_mountpoint(path->dentry))
1022 break;
1023
1041 mounted = __lookup_mnt(path->mnt, path->dentry, 1); 1024 mounted = __lookup_mnt(path->mnt, path->dentry, 1);
1042 if (!mounted) 1025 if (!mounted)
1043 break; 1026 break;
1044 path->mnt = mounted; 1027 path->mnt = mounted;
1045 path->dentry = mounted->mnt_root; 1028 path->dentry = mounted->mnt_root;
1046 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 1029 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
1047 *inode = path->dentry->d_inode;
1048 } 1030 }
1049 1031
1050 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) 1032 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
@@ -1070,7 +1052,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1070 1052
1071 seq = read_seqcount_begin(&parent->d_seq); 1053 seq = read_seqcount_begin(&parent->d_seq);
1072 if (read_seqcount_retry(&old->d_seq, nd->seq)) 1054 if (read_seqcount_retry(&old->d_seq, nd->seq))
1073 return -ECHILD; 1055 goto failed;
1074 inode = parent->d_inode; 1056 inode = parent->d_inode;
1075 nd->path.dentry = parent; 1057 nd->path.dentry = parent;
1076 nd->seq = seq; 1058 nd->seq = seq;
@@ -1083,8 +1065,15 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1083 } 1065 }
1084 __follow_mount_rcu(nd, &nd->path, &inode, true); 1066 __follow_mount_rcu(nd, &nd->path, &inode, true);
1085 nd->inode = inode; 1067 nd->inode = inode;
1086
1087 return 0; 1068 return 0;
1069
1070failed:
1071 nd->flags &= ~LOOKUP_RCU;
1072 if (!(nd->flags & LOOKUP_ROOT))
1073 nd->root.mnt = NULL;
1074 rcu_read_unlock();
1075 br_read_unlock(vfsmount_lock);
1076 return -ECHILD;
1088} 1077}
1089 1078
1090/* 1079/*
@@ -1095,7 +1084,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1095 * Care must be taken as namespace_sem may be held (indicated by mounting_here 1084 * Care must be taken as namespace_sem may be held (indicated by mounting_here
1096 * being true). 1085 * being true).
1097 */ 1086 */
1098int follow_down(struct path *path, bool mounting_here) 1087int follow_down(struct path *path)
1099{ 1088{
1100 unsigned managed; 1089 unsigned managed;
1101 int ret; 1090 int ret;
@@ -1116,7 +1105,7 @@ int follow_down(struct path *path, bool mounting_here)
1116 BUG_ON(!path->dentry->d_op); 1105 BUG_ON(!path->dentry->d_op);
1117 BUG_ON(!path->dentry->d_op->d_manage); 1106 BUG_ON(!path->dentry->d_op->d_manage);
1118 ret = path->dentry->d_op->d_manage( 1107 ret = path->dentry->d_op->d_manage(
1119 path->dentry, mounting_here, false); 1108 path->dentry, false);
1120 if (ret < 0) 1109 if (ret < 0)
1121 return ret == -EISDIR ? 0 : ret; 1110 return ret == -EISDIR ? 0 : ret;
1122 } 1111 }
@@ -1218,57 +1207,85 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1218{ 1207{
1219 struct vfsmount *mnt = nd->path.mnt; 1208 struct vfsmount *mnt = nd->path.mnt;
1220 struct dentry *dentry, *parent = nd->path.dentry; 1209 struct dentry *dentry, *parent = nd->path.dentry;
1221 struct inode *dir; 1210 int need_reval = 1;
1211 int status = 1;
1222 int err; 1212 int err;
1223 1213
1224 /* 1214 /*
1225 * See if the low-level filesystem might want
1226 * to use its own hash..
1227 */
1228 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1229 err = parent->d_op->d_hash(parent, nd->inode, name);
1230 if (err < 0)
1231 return err;
1232 }
1233
1234 /*
1235 * Rename seqlock is not required here because in the off chance 1215 * Rename seqlock is not required here because in the off chance
1236 * of a false negative due to a concurrent rename, we're going to 1216 * of a false negative due to a concurrent rename, we're going to
1237 * do the non-racy lookup, below. 1217 * do the non-racy lookup, below.
1238 */ 1218 */
1239 if (nd->flags & LOOKUP_RCU) { 1219 if (nd->flags & LOOKUP_RCU) {
1240 unsigned seq; 1220 unsigned seq;
1241
1242 *inode = nd->inode; 1221 *inode = nd->inode;
1243 dentry = __d_lookup_rcu(parent, name, &seq, inode); 1222 dentry = __d_lookup_rcu(parent, name, &seq, inode);
1244 if (!dentry) { 1223 if (!dentry)
1245 if (nameidata_drop_rcu(nd)) 1224 goto unlazy;
1246 return -ECHILD; 1225
1247 goto need_lookup;
1248 }
1249 /* Memory barrier in read_seqcount_begin of child is enough */ 1226 /* Memory barrier in read_seqcount_begin of child is enough */
1250 if (__read_seqcount_retry(&parent->d_seq, nd->seq)) 1227 if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1251 return -ECHILD; 1228 return -ECHILD;
1252
1253 nd->seq = seq; 1229 nd->seq = seq;
1254 if (dentry->d_flags & DCACHE_OP_REVALIDATE) 1230
1255 goto need_revalidate; 1231 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
1256done2: 1232 status = d_revalidate(dentry, nd);
1233 if (unlikely(status <= 0)) {
1234 if (status != -ECHILD)
1235 need_reval = 0;
1236 goto unlazy;
1237 }
1238 }
1257 path->mnt = mnt; 1239 path->mnt = mnt;
1258 path->dentry = dentry; 1240 path->dentry = dentry;
1259 if (likely(__follow_mount_rcu(nd, path, inode, false))) 1241 if (likely(__follow_mount_rcu(nd, path, inode, false)))
1260 return 0; 1242 return 0;
1261 if (nameidata_drop_rcu(nd)) 1243unlazy:
1262 return -ECHILD; 1244 if (dentry) {
1263 /* fallthru */ 1245 if (nameidata_dentry_drop_rcu(nd, dentry))
1246 return -ECHILD;
1247 } else {
1248 if (nameidata_drop_rcu(nd))
1249 return -ECHILD;
1250 }
1251 } else {
1252 dentry = __d_lookup(parent, name);
1264 } 1253 }
1265 dentry = __d_lookup(parent, name); 1254
1266 if (!dentry) 1255retry:
1267 goto need_lookup; 1256 if (unlikely(!dentry)) {
1268found: 1257 struct inode *dir = parent->d_inode;
1269 if (dentry->d_flags & DCACHE_OP_REVALIDATE) 1258 BUG_ON(nd->inode != dir);
1270 goto need_revalidate; 1259
1271done: 1260 mutex_lock(&dir->i_mutex);
1261 dentry = d_lookup(parent, name);
1262 if (likely(!dentry)) {
1263 dentry = d_alloc_and_lookup(parent, name, nd);
1264 if (IS_ERR(dentry)) {
1265 mutex_unlock(&dir->i_mutex);
1266 return PTR_ERR(dentry);
1267 }
1268 /* known good */
1269 need_reval = 0;
1270 status = 1;
1271 }
1272 mutex_unlock(&dir->i_mutex);
1273 }
1274 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1275 status = d_revalidate(dentry, nd);
1276 if (unlikely(status <= 0)) {
1277 if (status < 0) {
1278 dput(dentry);
1279 return status;
1280 }
1281 if (!d_invalidate(dentry)) {
1282 dput(dentry);
1283 dentry = NULL;
1284 need_reval = 1;
1285 goto retry;
1286 }
1287 }
1288
1272 path->mnt = mnt; 1289 path->mnt = mnt;
1273 path->dentry = dentry; 1290 path->dentry = dentry;
1274 err = follow_managed(path, nd->flags); 1291 err = follow_managed(path, nd->flags);
@@ -1278,49 +1295,113 @@ done:
1278 } 1295 }
1279 *inode = path->dentry->d_inode; 1296 *inode = path->dentry->d_inode;
1280 return 0; 1297 return 0;
1298}
1281 1299
1282need_lookup: 1300static inline int may_lookup(struct nameidata *nd)
1283 dir = parent->d_inode; 1301{
1284 BUG_ON(nd->inode != dir); 1302 if (nd->flags & LOOKUP_RCU) {
1303 int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
1304 if (err != -ECHILD)
1305 return err;
1306 if (nameidata_drop_rcu(nd))
1307 return -ECHILD;
1308 }
1309 return exec_permission(nd->inode, 0);
1310}
1285 1311
1286 mutex_lock(&dir->i_mutex); 1312static inline int handle_dots(struct nameidata *nd, int type)
1287 /* 1313{
1288 * First re-do the cached lookup just in case it was created 1314 if (type == LAST_DOTDOT) {
1289 * while we waited for the directory semaphore, or the first 1315 if (nd->flags & LOOKUP_RCU) {
1290 * lookup failed due to an unrelated rename. 1316 if (follow_dotdot_rcu(nd))
1291 * 1317 return -ECHILD;
1292 * This could use version numbering or similar to avoid unnecessary 1318 } else
1293 * cache lookups, but then we'd have to do the first lookup in the 1319 follow_dotdot(nd);
1294 * non-racy way. However in the common case here, everything should 1320 }
1295 * be hot in cache, so would it be a big win? 1321 return 0;
1296 */ 1322}
1297 dentry = d_lookup(parent, name); 1323
1298 if (likely(!dentry)) { 1324static void terminate_walk(struct nameidata *nd)
1299 dentry = d_alloc_and_lookup(parent, name, nd); 1325{
1300 mutex_unlock(&dir->i_mutex); 1326 if (!(nd->flags & LOOKUP_RCU)) {
1301 if (IS_ERR(dentry)) 1327 path_put(&nd->path);
1302 goto fail; 1328 } else {
1303 goto done; 1329 nd->flags &= ~LOOKUP_RCU;
1330 if (!(nd->flags & LOOKUP_ROOT))
1331 nd->root.mnt = NULL;
1332 rcu_read_unlock();
1333 br_read_unlock(vfsmount_lock);
1304 } 1334 }
1335}
1336
1337static inline int walk_component(struct nameidata *nd, struct path *path,
1338 struct qstr *name, int type, int follow)
1339{
1340 struct inode *inode;
1341 int err;
1305 /* 1342 /*
1306 * Uhhuh! Nasty case: the cache was re-populated while 1343 * "." and ".." are special - ".." especially so because it has
1307 * we waited on the semaphore. Need to revalidate. 1344 * to be able to know about the current root directory and
1345 * parent relationships.
1308 */ 1346 */
1309 mutex_unlock(&dir->i_mutex); 1347 if (unlikely(type != LAST_NORM))
1310 goto found; 1348 return handle_dots(nd, type);
1349 err = do_lookup(nd, name, path, &inode);
1350 if (unlikely(err)) {
1351 terminate_walk(nd);
1352 return err;
1353 }
1354 if (!inode) {
1355 path_to_nameidata(path, nd);
1356 terminate_walk(nd);
1357 return -ENOENT;
1358 }
1359 if (unlikely(inode->i_op->follow_link) && follow) {
1360 if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
1361 return -ECHILD;
1362 BUG_ON(inode != path->dentry->d_inode);
1363 return 1;
1364 }
1365 path_to_nameidata(path, nd);
1366 nd->inode = inode;
1367 return 0;
1368}
1311 1369
1312need_revalidate: 1370/*
1313 dentry = do_revalidate(dentry, nd); 1371 * This limits recursive symlink follows to 8, while
1314 if (!dentry) 1372 * limiting consecutive symlinks to 40.
1315 goto need_lookup; 1373 *
1316 if (IS_ERR(dentry)) 1374 * Without that kind of total limit, nasty chains of consecutive
1317 goto fail; 1375 * symlinks can cause almost arbitrarily long lookups.
1318 if (nd->flags & LOOKUP_RCU) 1376 */
1319 goto done2; 1377static inline int nested_symlink(struct path *path, struct nameidata *nd)
1320 goto done; 1378{
1379 int res;
1321 1380
1322fail: 1381 BUG_ON(nd->depth >= MAX_NESTED_LINKS);
1323 return PTR_ERR(dentry); 1382 if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
1383 path_put_conditional(path, nd);
1384 path_put(&nd->path);
1385 return -ELOOP;
1386 }
1387
1388 nd->depth++;
1389 current->link_count++;
1390
1391 do {
1392 struct path link = *path;
1393 void *cookie;
1394
1395 res = follow_link(&link, nd, &cookie);
1396 if (!res)
1397 res = walk_component(nd, path, &nd->last,
1398 nd->last_type, LOOKUP_FOLLOW);
1399 put_link(nd, &link, cookie);
1400 } while (res > 0);
1401
1402 current->link_count--;
1403 nd->depth--;
1404 return res;
1324} 1405}
1325 1406
1326/* 1407/*
@@ -1340,30 +1421,18 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1340 while (*name=='/') 1421 while (*name=='/')
1341 name++; 1422 name++;
1342 if (!*name) 1423 if (!*name)
1343 goto return_reval; 1424 return 0;
1344
1345 if (nd->depth)
1346 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
1347 1425
1348 /* At this point we know we have a real path component. */ 1426 /* At this point we know we have a real path component. */
1349 for(;;) { 1427 for(;;) {
1350 struct inode *inode;
1351 unsigned long hash; 1428 unsigned long hash;
1352 struct qstr this; 1429 struct qstr this;
1353 unsigned int c; 1430 unsigned int c;
1431 int type;
1354 1432
1355 nd->flags |= LOOKUP_CONTINUE; 1433 nd->flags |= LOOKUP_CONTINUE;
1356 if (nd->flags & LOOKUP_RCU) { 1434
1357 err = exec_permission(nd->inode, IPERM_FLAG_RCU); 1435 err = may_lookup(nd);
1358 if (err == -ECHILD) {
1359 if (nameidata_drop_rcu(nd))
1360 return -ECHILD;
1361 goto exec_again;
1362 }
1363 } else {
1364exec_again:
1365 err = exec_permission(nd->inode, 0);
1366 }
1367 if (err) 1436 if (err)
1368 break; 1437 break;
1369 1438
@@ -1379,56 +1448,43 @@ exec_again:
1379 this.len = name - (const char *) this.name; 1448 this.len = name - (const char *) this.name;
1380 this.hash = end_name_hash(hash); 1449 this.hash = end_name_hash(hash);
1381 1450
1451 type = LAST_NORM;
1452 if (this.name[0] == '.') switch (this.len) {
1453 case 2:
1454 if (this.name[1] == '.') {
1455 type = LAST_DOTDOT;
1456 nd->flags |= LOOKUP_JUMPED;
1457 }
1458 break;
1459 case 1:
1460 type = LAST_DOT;
1461 }
1462 if (likely(type == LAST_NORM)) {
1463 struct dentry *parent = nd->path.dentry;
1464 nd->flags &= ~LOOKUP_JUMPED;
1465 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1466 err = parent->d_op->d_hash(parent, nd->inode,
1467 &this);
1468 if (err < 0)
1469 break;
1470 }
1471 }
1472
1382 /* remove trailing slashes? */ 1473 /* remove trailing slashes? */
1383 if (!c) 1474 if (!c)
1384 goto last_component; 1475 goto last_component;
1385 while (*++name == '/'); 1476 while (*++name == '/');
1386 if (!*name) 1477 if (!*name)
1387 goto last_with_slashes; 1478 goto last_component;
1388 1479
1389 /* 1480 err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
1390 * "." and ".." are special - ".." especially so because it has 1481 if (err < 0)
1391 * to be able to know about the current root directory and 1482 return err;
1392 * parent relationships.
1393 */
1394 if (this.name[0] == '.') switch (this.len) {
1395 default:
1396 break;
1397 case 2:
1398 if (this.name[1] != '.')
1399 break;
1400 if (nd->flags & LOOKUP_RCU) {
1401 if (follow_dotdot_rcu(nd))
1402 return -ECHILD;
1403 } else
1404 follow_dotdot(nd);
1405 /* fallthrough */
1406 case 1:
1407 continue;
1408 }
1409 /* This does the actual lookups.. */
1410 err = do_lookup(nd, &this, &next, &inode);
1411 if (err)
1412 break;
1413 err = -ENOENT;
1414 if (!inode)
1415 goto out_dput;
1416 1483
1417 if (inode->i_op->follow_link) { 1484 if (err) {
1418 /* We commonly drop rcu-walk here */ 1485 err = nested_symlink(&next, nd);
1419 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1420 return -ECHILD;
1421 BUG_ON(inode != next.dentry->d_inode);
1422 err = do_follow_link(&next, nd);
1423 if (err) 1486 if (err)
1424 goto return_err; 1487 return err;
1425 nd->inode = nd->path.dentry->d_inode;
1426 err = -ENOENT;
1427 if (!nd->inode)
1428 break;
1429 } else {
1430 path_to_nameidata(&next, nd);
1431 nd->inode = inode;
1432 } 1488 }
1433 err = -ENOTDIR; 1489 err = -ENOTDIR;
1434 if (!nd->inode->i_op->lookup) 1490 if (!nd->inode->i_op->lookup)
@@ -1436,209 +1492,109 @@ exec_again:
1436 continue; 1492 continue;
1437 /* here ends the main loop */ 1493 /* here ends the main loop */
1438 1494
1439last_with_slashes:
1440 lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1441last_component: 1495last_component:
1442 /* Clear LOOKUP_CONTINUE iff it was previously unset */ 1496 /* Clear LOOKUP_CONTINUE iff it was previously unset */
1443 nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; 1497 nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
1444 if (lookup_flags & LOOKUP_PARENT)
1445 goto lookup_parent;
1446 if (this.name[0] == '.') switch (this.len) {
1447 default:
1448 break;
1449 case 2:
1450 if (this.name[1] != '.')
1451 break;
1452 if (nd->flags & LOOKUP_RCU) {
1453 if (follow_dotdot_rcu(nd))
1454 return -ECHILD;
1455 } else
1456 follow_dotdot(nd);
1457 /* fallthrough */
1458 case 1:
1459 goto return_reval;
1460 }
1461 err = do_lookup(nd, &this, &next, &inode);
1462 if (err)
1463 break;
1464 if (inode && unlikely(inode->i_op->follow_link) &&
1465 (lookup_flags & LOOKUP_FOLLOW)) {
1466 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1467 return -ECHILD;
1468 BUG_ON(inode != next.dentry->d_inode);
1469 err = do_follow_link(&next, nd);
1470 if (err)
1471 goto return_err;
1472 nd->inode = nd->path.dentry->d_inode;
1473 } else {
1474 path_to_nameidata(&next, nd);
1475 nd->inode = inode;
1476 }
1477 err = -ENOENT;
1478 if (!nd->inode)
1479 break;
1480 if (lookup_flags & LOOKUP_DIRECTORY) {
1481 err = -ENOTDIR;
1482 if (!nd->inode->i_op->lookup)
1483 break;
1484 }
1485 goto return_base;
1486lookup_parent:
1487 nd->last = this; 1498 nd->last = this;
1488 nd->last_type = LAST_NORM; 1499 nd->last_type = type;
1489 if (this.name[0] != '.')
1490 goto return_base;
1491 if (this.len == 1)
1492 nd->last_type = LAST_DOT;
1493 else if (this.len == 2 && this.name[1] == '.')
1494 nd->last_type = LAST_DOTDOT;
1495 else
1496 goto return_base;
1497return_reval:
1498 /*
1499 * We bypassed the ordinary revalidation routines.
1500 * We may need to check the cached dentry for staleness.
1501 */
1502 if (need_reval_dot(nd->path.dentry)) {
1503 /* Note: we do not d_invalidate() */
1504 err = d_revalidate(nd->path.dentry, nd);
1505 if (!err)
1506 err = -ESTALE;
1507 if (err < 0)
1508 break;
1509 }
1510return_base:
1511 if (nameidata_drop_rcu_last_maybe(nd))
1512 return -ECHILD;
1513 return 0; 1500 return 0;
1514out_dput:
1515 if (!(nd->flags & LOOKUP_RCU))
1516 path_put_conditional(&next, nd);
1517 break;
1518 } 1501 }
1519 if (!(nd->flags & LOOKUP_RCU)) 1502 terminate_walk(nd);
1520 path_put(&nd->path);
1521return_err:
1522 return err; 1503 return err;
1523} 1504}
1524 1505
1525static inline int path_walk_rcu(const char *name, struct nameidata *nd) 1506static int path_init(int dfd, const char *name, unsigned int flags,
1526{ 1507 struct nameidata *nd, struct file **fp)
1527 current->total_link_count = 0;
1528
1529 return link_path_walk(name, nd);
1530}
1531
1532static inline int path_walk_simple(const char *name, struct nameidata *nd)
1533{
1534 current->total_link_count = 0;
1535
1536 return link_path_walk(name, nd);
1537}
1538
1539static int path_walk(const char *name, struct nameidata *nd)
1540{
1541 struct path save = nd->path;
1542 int result;
1543
1544 current->total_link_count = 0;
1545
1546 /* make sure the stuff we saved doesn't go away */
1547 path_get(&save);
1548
1549 result = link_path_walk(name, nd);
1550 if (result == -ESTALE) {
1551 /* nd->path had been dropped */
1552 current->total_link_count = 0;
1553 nd->path = save;
1554 path_get(&nd->path);
1555 nd->flags |= LOOKUP_REVAL;
1556 result = link_path_walk(name, nd);
1557 }
1558
1559 path_put(&save);
1560
1561 return result;
1562}
1563
1564static void path_finish_rcu(struct nameidata *nd)
1565{
1566 if (nd->flags & LOOKUP_RCU) {
1567 /* RCU dangling. Cancel it. */
1568 nd->flags &= ~LOOKUP_RCU;
1569 nd->root.mnt = NULL;
1570 rcu_read_unlock();
1571 br_read_unlock(vfsmount_lock);
1572 }
1573 if (nd->file)
1574 fput(nd->file);
1575}
1576
1577static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1578{ 1508{
1579 int retval = 0; 1509 int retval = 0;
1580 int fput_needed; 1510 int fput_needed;
1581 struct file *file; 1511 struct file *file;
1582 1512
1583 nd->last_type = LAST_ROOT; /* if there are only slashes... */ 1513 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1584 nd->flags = flags | LOOKUP_RCU; 1514 nd->flags = flags | LOOKUP_JUMPED;
1585 nd->depth = 0; 1515 nd->depth = 0;
1516 if (flags & LOOKUP_ROOT) {
1517 struct inode *inode = nd->root.dentry->d_inode;
1518 if (*name) {
1519 if (!inode->i_op->lookup)
1520 return -ENOTDIR;
1521 retval = inode_permission(inode, MAY_EXEC);
1522 if (retval)
1523 return retval;
1524 }
1525 nd->path = nd->root;
1526 nd->inode = inode;
1527 if (flags & LOOKUP_RCU) {
1528 br_read_lock(vfsmount_lock);
1529 rcu_read_lock();
1530 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1531 } else {
1532 path_get(&nd->path);
1533 }
1534 return 0;
1535 }
1536
1586 nd->root.mnt = NULL; 1537 nd->root.mnt = NULL;
1587 nd->file = NULL;
1588 1538
1589 if (*name=='/') { 1539 if (*name=='/') {
1590 struct fs_struct *fs = current->fs; 1540 if (flags & LOOKUP_RCU) {
1591 unsigned seq; 1541 br_read_lock(vfsmount_lock);
1592 1542 rcu_read_lock();
1593 br_read_lock(vfsmount_lock); 1543 set_root_rcu(nd);
1594 rcu_read_lock(); 1544 } else {
1595 1545 set_root(nd);
1596 do { 1546 path_get(&nd->root);
1597 seq = read_seqcount_begin(&fs->seq); 1547 }
1598 nd->root = fs->root; 1548 nd->path = nd->root;
1599 nd->path = nd->root;
1600 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1601 } while (read_seqcount_retry(&fs->seq, seq));
1602
1603 } else if (dfd == AT_FDCWD) { 1549 } else if (dfd == AT_FDCWD) {
1604 struct fs_struct *fs = current->fs; 1550 if (flags & LOOKUP_RCU) {
1605 unsigned seq; 1551 struct fs_struct *fs = current->fs;
1552 unsigned seq;
1606 1553
1607 br_read_lock(vfsmount_lock); 1554 br_read_lock(vfsmount_lock);
1608 rcu_read_lock(); 1555 rcu_read_lock();
1609
1610 do {
1611 seq = read_seqcount_begin(&fs->seq);
1612 nd->path = fs->pwd;
1613 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1614 } while (read_seqcount_retry(&fs->seq, seq));
1615 1556
1557 do {
1558 seq = read_seqcount_begin(&fs->seq);
1559 nd->path = fs->pwd;
1560 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1561 } while (read_seqcount_retry(&fs->seq, seq));
1562 } else {
1563 get_fs_pwd(current->fs, &nd->path);
1564 }
1616 } else { 1565 } else {
1617 struct dentry *dentry; 1566 struct dentry *dentry;
1618 1567
1619 file = fget_light(dfd, &fput_needed); 1568 file = fget_raw_light(dfd, &fput_needed);
1620 retval = -EBADF; 1569 retval = -EBADF;
1621 if (!file) 1570 if (!file)
1622 goto out_fail; 1571 goto out_fail;
1623 1572
1624 dentry = file->f_path.dentry; 1573 dentry = file->f_path.dentry;
1625 1574
1626 retval = -ENOTDIR; 1575 if (*name) {
1627 if (!S_ISDIR(dentry->d_inode->i_mode)) 1576 retval = -ENOTDIR;
1628 goto fput_fail; 1577 if (!S_ISDIR(dentry->d_inode->i_mode))
1578 goto fput_fail;
1629 1579
1630 retval = file_permission(file, MAY_EXEC); 1580 retval = file_permission(file, MAY_EXEC);
1631 if (retval) 1581 if (retval)
1632 goto fput_fail; 1582 goto fput_fail;
1583 }
1633 1584
1634 nd->path = file->f_path; 1585 nd->path = file->f_path;
1635 if (fput_needed) 1586 if (flags & LOOKUP_RCU) {
1636 nd->file = file; 1587 if (fput_needed)
1637 1588 *fp = file;
1638 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); 1589 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1639 br_read_lock(vfsmount_lock); 1590 br_read_lock(vfsmount_lock);
1640 rcu_read_lock(); 1591 rcu_read_lock();
1592 } else {
1593 path_get(&file->f_path);
1594 fput_light(file, fput_needed);
1595 }
1641 } 1596 }
1597
1642 nd->inode = nd->path.dentry->d_inode; 1598 nd->inode = nd->path.dentry->d_inode;
1643 return 0; 1599 return 0;
1644 1600
@@ -1648,60 +1604,23 @@ out_fail:
1648 return retval; 1604 return retval;
1649} 1605}
1650 1606
1651static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) 1607static inline int lookup_last(struct nameidata *nd, struct path *path)
1652{ 1608{
1653 int retval = 0; 1609 if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
1654 int fput_needed; 1610 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1655 struct file *file;
1656
1657 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1658 nd->flags = flags;
1659 nd->depth = 0;
1660 nd->root.mnt = NULL;
1661
1662 if (*name=='/') {
1663 set_root(nd);
1664 nd->path = nd->root;
1665 path_get(&nd->root);
1666 } else if (dfd == AT_FDCWD) {
1667 get_fs_pwd(current->fs, &nd->path);
1668 } else {
1669 struct dentry *dentry;
1670
1671 file = fget_light(dfd, &fput_needed);
1672 retval = -EBADF;
1673 if (!file)
1674 goto out_fail;
1675
1676 dentry = file->f_path.dentry;
1677 1611
1678 retval = -ENOTDIR; 1612 nd->flags &= ~LOOKUP_PARENT;
1679 if (!S_ISDIR(dentry->d_inode->i_mode)) 1613 return walk_component(nd, path, &nd->last, nd->last_type,
1680 goto fput_fail; 1614 nd->flags & LOOKUP_FOLLOW);
1681
1682 retval = file_permission(file, MAY_EXEC);
1683 if (retval)
1684 goto fput_fail;
1685
1686 nd->path = file->f_path;
1687 path_get(&file->f_path);
1688
1689 fput_light(file, fput_needed);
1690 }
1691 nd->inode = nd->path.dentry->d_inode;
1692 return 0;
1693
1694fput_fail:
1695 fput_light(file, fput_needed);
1696out_fail:
1697 return retval;
1698} 1615}
1699 1616
1700/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ 1617/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
1701static int do_path_lookup(int dfd, const char *name, 1618static int path_lookupat(int dfd, const char *name,
1702 unsigned int flags, struct nameidata *nd) 1619 unsigned int flags, struct nameidata *nd)
1703{ 1620{
1704 int retval; 1621 struct file *base = NULL;
1622 struct path path;
1623 int err;
1705 1624
1706 /* 1625 /*
1707 * Path walking is largely split up into 2 different synchronisation 1626 * Path walking is largely split up into 2 different synchronisation
@@ -1717,44 +1636,78 @@ static int do_path_lookup(int dfd, const char *name,
1717 * be handled by restarting a traditional ref-walk (which will always 1636 * be handled by restarting a traditional ref-walk (which will always
1718 * be able to complete). 1637 * be able to complete).
1719 */ 1638 */
1720 retval = path_init_rcu(dfd, name, flags, nd); 1639 err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
1721 if (unlikely(retval)) 1640
1722 return retval; 1641 if (unlikely(err))
1723 retval = path_walk_rcu(name, nd); 1642 return err;
1724 path_finish_rcu(nd); 1643
1725 if (nd->root.mnt) { 1644 current->total_link_count = 0;
1726 path_put(&nd->root); 1645 err = link_path_walk(name, nd);
1727 nd->root.mnt = NULL; 1646
1647 if (!err && !(flags & LOOKUP_PARENT)) {
1648 err = lookup_last(nd, &path);
1649 while (err > 0) {
1650 void *cookie;
1651 struct path link = path;
1652 nd->flags |= LOOKUP_PARENT;
1653 err = follow_link(&link, nd, &cookie);
1654 if (!err)
1655 err = lookup_last(nd, &path);
1656 put_link(nd, &link, cookie);
1657 }
1728 } 1658 }
1729 1659
1730 if (unlikely(retval == -ECHILD || retval == -ESTALE)) { 1660 if (nd->flags & LOOKUP_RCU) {
1731 /* slower, locked walk */ 1661 /* went all way through without dropping RCU */
1732 if (retval == -ESTALE) 1662 BUG_ON(err);
1733 flags |= LOOKUP_REVAL; 1663 if (nameidata_drop_rcu_last(nd))
1734 retval = path_init(dfd, name, flags, nd); 1664 err = -ECHILD;
1735 if (unlikely(retval)) 1665 }
1736 return retval; 1666
1737 retval = path_walk(name, nd); 1667 if (!err) {
1738 if (nd->root.mnt) { 1668 err = handle_reval_path(nd);
1739 path_put(&nd->root); 1669 if (err)
1740 nd->root.mnt = NULL; 1670 path_put(&nd->path);
1671 }
1672
1673 if (!err && nd->flags & LOOKUP_DIRECTORY) {
1674 if (!nd->inode->i_op->lookup) {
1675 path_put(&nd->path);
1676 err = -ENOTDIR;
1741 } 1677 }
1742 } 1678 }
1743 1679
1680 if (base)
1681 fput(base);
1682
1683 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
1684 path_put(&nd->root);
1685 nd->root.mnt = NULL;
1686 }
1687 return err;
1688}
1689
1690static int do_path_lookup(int dfd, const char *name,
1691 unsigned int flags, struct nameidata *nd)
1692{
1693 int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
1694 if (unlikely(retval == -ECHILD))
1695 retval = path_lookupat(dfd, name, flags, nd);
1696 if (unlikely(retval == -ESTALE))
1697 retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
1698
1744 if (likely(!retval)) { 1699 if (likely(!retval)) {
1745 if (unlikely(!audit_dummy_context())) { 1700 if (unlikely(!audit_dummy_context())) {
1746 if (nd->path.dentry && nd->inode) 1701 if (nd->path.dentry && nd->inode)
1747 audit_inode(name, nd->path.dentry); 1702 audit_inode(name, nd->path.dentry);
1748 } 1703 }
1749 } 1704 }
1750
1751 return retval; 1705 return retval;
1752} 1706}
1753 1707
1754int path_lookup(const char *name, unsigned int flags, 1708int kern_path_parent(const char *name, struct nameidata *nd)
1755 struct nameidata *nd)
1756{ 1709{
1757 return do_path_lookup(AT_FDCWD, name, flags, nd); 1710 return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd);
1758} 1711}
1759 1712
1760int kern_path(const char *name, unsigned int flags, struct path *path) 1713int kern_path(const char *name, unsigned int flags, struct path *path)
@@ -1778,29 +1731,10 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1778 const char *name, unsigned int flags, 1731 const char *name, unsigned int flags,
1779 struct nameidata *nd) 1732 struct nameidata *nd)
1780{ 1733{
1781 int retval; 1734 nd->root.dentry = dentry;
1782 1735 nd->root.mnt = mnt;
1783 /* same as do_path_lookup */ 1736 /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
1784 nd->last_type = LAST_ROOT; 1737 return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd);
1785 nd->flags = flags;
1786 nd->depth = 0;
1787
1788 nd->path.dentry = dentry;
1789 nd->path.mnt = mnt;
1790 path_get(&nd->path);
1791 nd->root = nd->path;
1792 path_get(&nd->root);
1793 nd->inode = nd->path.dentry->d_inode;
1794
1795 retval = path_walk(name, nd);
1796 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1797 nd->inode))
1798 audit_inode(name, nd->path.dentry);
1799
1800 path_put(&nd->root);
1801 nd->root.mnt = NULL;
1802
1803 return retval;
1804} 1738}
1805 1739
1806static struct dentry *__lookup_hash(struct qstr *name, 1740static struct dentry *__lookup_hash(struct qstr *name,
@@ -1815,17 +1749,6 @@ static struct dentry *__lookup_hash(struct qstr *name,
1815 return ERR_PTR(err); 1749 return ERR_PTR(err);
1816 1750
1817 /* 1751 /*
1818 * See if the low-level filesystem might want
1819 * to use its own hash..
1820 */
1821 if (base->d_flags & DCACHE_OP_HASH) {
1822 err = base->d_op->d_hash(base, inode, name);
1823 dentry = ERR_PTR(err);
1824 if (err < 0)
1825 goto out;
1826 }
1827
1828 /*
1829 * Don't bother with __d_lookup: callers are for creat as 1752 * Don't bother with __d_lookup: callers are for creat as
1830 * well as unlink, so a lot of the time it would cost 1753 * well as unlink, so a lot of the time it would cost
1831 * a double lookup. 1754 * a double lookup.
@@ -1837,7 +1760,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
1837 1760
1838 if (!dentry) 1761 if (!dentry)
1839 dentry = d_alloc_and_lookup(base, name, nd); 1762 dentry = d_alloc_and_lookup(base, name, nd);
1840out: 1763
1841 return dentry; 1764 return dentry;
1842} 1765}
1843 1766
@@ -1851,28 +1774,6 @@ static struct dentry *lookup_hash(struct nameidata *nd)
1851 return __lookup_hash(&nd->last, nd->path.dentry, nd); 1774 return __lookup_hash(&nd->last, nd->path.dentry, nd);
1852} 1775}
1853 1776
1854static int __lookup_one_len(const char *name, struct qstr *this,
1855 struct dentry *base, int len)
1856{
1857 unsigned long hash;
1858 unsigned int c;
1859
1860 this->name = name;
1861 this->len = len;
1862 if (!len)
1863 return -EACCES;
1864
1865 hash = init_name_hash();
1866 while (len--) {
1867 c = *(const unsigned char *)name++;
1868 if (c == '/' || c == '\0')
1869 return -EACCES;
1870 hash = partial_name_hash(c, hash);
1871 }
1872 this->hash = end_name_hash(hash);
1873 return 0;
1874}
1875
1876/** 1777/**
1877 * lookup_one_len - filesystem helper to lookup single pathname component 1778 * lookup_one_len - filesystem helper to lookup single pathname component
1878 * @name: pathname component to lookup 1779 * @name: pathname component to lookup
@@ -1886,14 +1787,34 @@ static int __lookup_one_len(const char *name, struct qstr *this,
1886 */ 1787 */
1887struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) 1788struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1888{ 1789{
1889 int err;
1890 struct qstr this; 1790 struct qstr this;
1791 unsigned long hash;
1792 unsigned int c;
1891 1793
1892 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); 1794 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
1893 1795
1894 err = __lookup_one_len(name, &this, base, len); 1796 this.name = name;
1895 if (err) 1797 this.len = len;
1896 return ERR_PTR(err); 1798 if (!len)
1799 return ERR_PTR(-EACCES);
1800
1801 hash = init_name_hash();
1802 while (len--) {
1803 c = *(const unsigned char *)name++;
1804 if (c == '/' || c == '\0')
1805 return ERR_PTR(-EACCES);
1806 hash = partial_name_hash(c, hash);
1807 }
1808 this.hash = end_name_hash(hash);
1809 /*
1810 * See if the low-level filesystem might want
1811 * to use its own hash..
1812 */
1813 if (base->d_flags & DCACHE_OP_HASH) {
1814 int err = base->d_op->d_hash(base, base->d_inode, &this);
1815 if (err < 0)
1816 return ERR_PTR(err);
1817 }
1897 1818
1898 return __lookup_hash(&this, base, NULL); 1819 return __lookup_hash(&this, base, NULL);
1899} 1820}
@@ -1902,7 +1823,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
1902 struct path *path) 1823 struct path *path)
1903{ 1824{
1904 struct nameidata nd; 1825 struct nameidata nd;
1905 char *tmp = getname(name); 1826 char *tmp = getname_flags(name, flags);
1906 int err = PTR_ERR(tmp); 1827 int err = PTR_ERR(tmp);
1907 if (!IS_ERR(tmp)) { 1828 if (!IS_ERR(tmp)) {
1908 1829
@@ -1944,11 +1865,15 @@ static inline int check_sticky(struct inode *dir, struct inode *inode)
1944 1865
1945 if (!(dir->i_mode & S_ISVTX)) 1866 if (!(dir->i_mode & S_ISVTX))
1946 return 0; 1867 return 0;
1868 if (current_user_ns() != inode_userns(inode))
1869 goto other_userns;
1947 if (inode->i_uid == fsuid) 1870 if (inode->i_uid == fsuid)
1948 return 0; 1871 return 0;
1949 if (dir->i_uid == fsuid) 1872 if (dir->i_uid == fsuid)
1950 return 0; 1873 return 0;
1951 return !capable(CAP_FOWNER); 1874
1875other_userns:
1876 return !ns_capable(inode_userns(inode), CAP_FOWNER);
1952} 1877}
1953 1878
1954/* 1879/*
@@ -2082,12 +2007,16 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
2082 return error; 2007 return error;
2083} 2008}
2084 2009
2085int may_open(struct path *path, int acc_mode, int flag) 2010static int may_open(struct path *path, int acc_mode, int flag)
2086{ 2011{
2087 struct dentry *dentry = path->dentry; 2012 struct dentry *dentry = path->dentry;
2088 struct inode *inode = dentry->d_inode; 2013 struct inode *inode = dentry->d_inode;
2089 int error; 2014 int error;
2090 2015
2016 /* O_PATH? */
2017 if (!acc_mode)
2018 return 0;
2019
2091 if (!inode) 2020 if (!inode)
2092 return -ENOENT; 2021 return -ENOENT;
2093 2022
@@ -2124,7 +2053,7 @@ int may_open(struct path *path, int acc_mode, int flag)
2124 } 2053 }
2125 2054
2126 /* O_NOATIME can only be set by the owner or superuser */ 2055 /* O_NOATIME can only be set by the owner or superuser */
2127 if (flag & O_NOATIME && !is_owner_or_cap(inode)) 2056 if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2128 return -EPERM; 2057 return -EPERM;
2129 2058
2130 /* 2059 /*
@@ -2156,34 +2085,6 @@ static int handle_truncate(struct file *filp)
2156} 2085}
2157 2086
2158/* 2087/*
2159 * Be careful about ever adding any more callers of this
2160 * function. Its flags must be in the namei format, not
2161 * what get passed to sys_open().
2162 */
2163static int __open_namei_create(struct nameidata *nd, struct path *path,
2164 int open_flag, int mode)
2165{
2166 int error;
2167 struct dentry *dir = nd->path.dentry;
2168
2169 if (!IS_POSIXACL(dir->d_inode))
2170 mode &= ~current_umask();
2171 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
2172 if (error)
2173 goto out_unlock;
2174 error = vfs_create(dir->d_inode, path->dentry, mode, nd);
2175out_unlock:
2176 mutex_unlock(&dir->d_inode->i_mutex);
2177 dput(nd->path.dentry);
2178 nd->path.dentry = path->dentry;
2179
2180 if (error)
2181 return error;
2182 /* Don't check for write permission, don't truncate */
2183 return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
2184}
2185
2186/*
2187 * Note that while the flag value (low two bits) for sys_open means: 2088 * Note that while the flag value (low two bits) for sys_open means:
2188 * 00 - read-only 2089 * 00 - read-only
2189 * 01 - write-only 2090 * 01 - write-only
@@ -2207,128 +2108,115 @@ static inline int open_to_namei_flags(int flag)
2207 return flag; 2108 return flag;
2208} 2109}
2209 2110
2210static int open_will_truncate(int flag, struct inode *inode)
2211{
2212 /*
2213 * We'll never write to the fs underlying
2214 * a device file.
2215 */
2216 if (special_file(inode->i_mode))
2217 return 0;
2218 return (flag & O_TRUNC);
2219}
2220
2221static struct file *finish_open(struct nameidata *nd,
2222 int open_flag, int acc_mode)
2223{
2224 struct file *filp;
2225 int will_truncate;
2226 int error;
2227
2228 will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
2229 if (will_truncate) {
2230 error = mnt_want_write(nd->path.mnt);
2231 if (error)
2232 goto exit;
2233 }
2234 error = may_open(&nd->path, acc_mode, open_flag);
2235 if (error) {
2236 if (will_truncate)
2237 mnt_drop_write(nd->path.mnt);
2238 goto exit;
2239 }
2240 filp = nameidata_to_filp(nd);
2241 if (!IS_ERR(filp)) {
2242 error = ima_file_check(filp, acc_mode);
2243 if (error) {
2244 fput(filp);
2245 filp = ERR_PTR(error);
2246 }
2247 }
2248 if (!IS_ERR(filp)) {
2249 if (will_truncate) {
2250 error = handle_truncate(filp);
2251 if (error) {
2252 fput(filp);
2253 filp = ERR_PTR(error);
2254 }
2255 }
2256 }
2257 /*
2258 * It is now safe to drop the mnt write
2259 * because the filp has had a write taken
2260 * on its behalf.
2261 */
2262 if (will_truncate)
2263 mnt_drop_write(nd->path.mnt);
2264 path_put(&nd->path);
2265 return filp;
2266
2267exit:
2268 if (!IS_ERR(nd->intent.open.file))
2269 release_open_intent(nd);
2270 path_put(&nd->path);
2271 return ERR_PTR(error);
2272}
2273
2274/* 2111/*
2275 * Handle O_CREAT case for do_filp_open 2112 * Handle the last step of open()
2276 */ 2113 */
2277static struct file *do_last(struct nameidata *nd, struct path *path, 2114static struct file *do_last(struct nameidata *nd, struct path *path,
2278 int open_flag, int acc_mode, 2115 const struct open_flags *op, const char *pathname)
2279 int mode, const char *pathname)
2280{ 2116{
2281 struct dentry *dir = nd->path.dentry; 2117 struct dentry *dir = nd->path.dentry;
2118 struct dentry *dentry;
2119 int open_flag = op->open_flag;
2120 int will_truncate = open_flag & O_TRUNC;
2121 int want_write = 0;
2122 int acc_mode = op->acc_mode;
2282 struct file *filp; 2123 struct file *filp;
2283 int error = -EISDIR; 2124 int error;
2125
2126 nd->flags &= ~LOOKUP_PARENT;
2127 nd->flags |= op->intent;
2284 2128
2285 switch (nd->last_type) { 2129 switch (nd->last_type) {
2286 case LAST_DOTDOT: 2130 case LAST_DOTDOT:
2287 follow_dotdot(nd);
2288 dir = nd->path.dentry;
2289 case LAST_DOT: 2131 case LAST_DOT:
2290 if (need_reval_dot(dir)) { 2132 error = handle_dots(nd, nd->last_type);
2291 int status = d_revalidate(nd->path.dentry, nd); 2133 if (error)
2292 if (!status) 2134 return ERR_PTR(error);
2293 status = -ESTALE;
2294 if (status < 0) {
2295 error = status;
2296 goto exit;
2297 }
2298 }
2299 /* fallthrough */ 2135 /* fallthrough */
2300 case LAST_ROOT: 2136 case LAST_ROOT:
2301 goto exit; 2137 if (nd->flags & LOOKUP_RCU) {
2138 if (nameidata_drop_rcu_last(nd))
2139 return ERR_PTR(-ECHILD);
2140 }
2141 error = handle_reval_path(nd);
2142 if (error)
2143 goto exit;
2144 audit_inode(pathname, nd->path.dentry);
2145 if (open_flag & O_CREAT) {
2146 error = -EISDIR;
2147 goto exit;
2148 }
2149 goto ok;
2302 case LAST_BIND: 2150 case LAST_BIND:
2151 /* can't be RCU mode here */
2152 error = handle_reval_path(nd);
2153 if (error)
2154 goto exit;
2303 audit_inode(pathname, dir); 2155 audit_inode(pathname, dir);
2304 goto ok; 2156 goto ok;
2305 } 2157 }
2306 2158
2159 if (!(open_flag & O_CREAT)) {
2160 int symlink_ok = 0;
2161 if (nd->last.name[nd->last.len])
2162 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2163 if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
2164 symlink_ok = 1;
2165 /* we _can_ be in RCU mode here */
2166 error = walk_component(nd, path, &nd->last, LAST_NORM,
2167 !symlink_ok);
2168 if (error < 0)
2169 return ERR_PTR(error);
2170 if (error) /* symlink */
2171 return NULL;
2172 /* sayonara */
2173 if (nd->flags & LOOKUP_RCU) {
2174 if (nameidata_drop_rcu_last(nd))
2175 return ERR_PTR(-ECHILD);
2176 }
2177
2178 error = -ENOTDIR;
2179 if (nd->flags & LOOKUP_DIRECTORY) {
2180 if (!nd->inode->i_op->lookup)
2181 goto exit;
2182 }
2183 audit_inode(pathname, nd->path.dentry);
2184 goto ok;
2185 }
2186
2187 /* create side of things */
2188
2189 if (nd->flags & LOOKUP_RCU) {
2190 if (nameidata_drop_rcu_last(nd))
2191 return ERR_PTR(-ECHILD);
2192 }
2193
2194 audit_inode(pathname, dir);
2195 error = -EISDIR;
2307 /* trailing slashes? */ 2196 /* trailing slashes? */
2308 if (nd->last.name[nd->last.len]) 2197 if (nd->last.name[nd->last.len])
2309 goto exit; 2198 goto exit;
2310 2199
2311 mutex_lock(&dir->d_inode->i_mutex); 2200 mutex_lock(&dir->d_inode->i_mutex);
2312 2201
2313 path->dentry = lookup_hash(nd); 2202 dentry = lookup_hash(nd);
2314 path->mnt = nd->path.mnt; 2203 error = PTR_ERR(dentry);
2315 2204 if (IS_ERR(dentry)) {
2316 error = PTR_ERR(path->dentry);
2317 if (IS_ERR(path->dentry)) {
2318 mutex_unlock(&dir->d_inode->i_mutex); 2205 mutex_unlock(&dir->d_inode->i_mutex);
2319 goto exit; 2206 goto exit;
2320 } 2207 }
2321 2208
2322 if (IS_ERR(nd->intent.open.file)) { 2209 path->dentry = dentry;
2323 error = PTR_ERR(nd->intent.open.file); 2210 path->mnt = nd->path.mnt;
2324 goto exit_mutex_unlock;
2325 }
2326 2211
2327 /* Negative dentry, just create the file */ 2212 /* Negative dentry, just create the file */
2328 if (!path->dentry->d_inode) { 2213 if (!dentry->d_inode) {
2214 int mode = op->mode;
2215 if (!IS_POSIXACL(dir->d_inode))
2216 mode &= ~current_umask();
2329 /* 2217 /*
2330 * This write is needed to ensure that a 2218 * This write is needed to ensure that a
2331 * ro->rw transition does not occur between 2219 * rw->ro transition does not occur between
2332 * the time when the file is created and when 2220 * the time when the file is created and when
2333 * a permanent write count is taken through 2221 * a permanent write count is taken through
2334 * the 'struct file' in nameidata_to_filp(). 2222 * the 'struct file' in nameidata_to_filp().
@@ -2336,22 +2224,21 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2336 error = mnt_want_write(nd->path.mnt); 2224 error = mnt_want_write(nd->path.mnt);
2337 if (error) 2225 if (error)
2338 goto exit_mutex_unlock; 2226 goto exit_mutex_unlock;
2339 error = __open_namei_create(nd, path, open_flag, mode); 2227 want_write = 1;
2340 if (error) { 2228 /* Don't check for write permission, don't truncate */
2341 mnt_drop_write(nd->path.mnt); 2229 open_flag &= ~O_TRUNC;
2342 goto exit; 2230 will_truncate = 0;
2343 } 2231 acc_mode = MAY_OPEN;
2344 filp = nameidata_to_filp(nd); 2232 error = security_path_mknod(&nd->path, dentry, mode, 0);
2345 mnt_drop_write(nd->path.mnt); 2233 if (error)
2346 path_put(&nd->path); 2234 goto exit_mutex_unlock;
2347 if (!IS_ERR(filp)) { 2235 error = vfs_create(dir->d_inode, dentry, mode, nd);
2348 error = ima_file_check(filp, acc_mode); 2236 if (error)
2349 if (error) { 2237 goto exit_mutex_unlock;
2350 fput(filp); 2238 mutex_unlock(&dir->d_inode->i_mutex);
2351 filp = ERR_PTR(error); 2239 dput(nd->path.dentry);
2352 } 2240 nd->path.dentry = dentry;
2353 } 2241 goto common;
2354 return filp;
2355 } 2242 }
2356 2243
2357 /* 2244 /*
@@ -2381,7 +2268,40 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2381 if (S_ISDIR(nd->inode->i_mode)) 2268 if (S_ISDIR(nd->inode->i_mode))
2382 goto exit; 2269 goto exit;
2383ok: 2270ok:
2384 filp = finish_open(nd, open_flag, acc_mode); 2271 if (!S_ISREG(nd->inode->i_mode))
2272 will_truncate = 0;
2273
2274 if (will_truncate) {
2275 error = mnt_want_write(nd->path.mnt);
2276 if (error)
2277 goto exit;
2278 want_write = 1;
2279 }
2280common:
2281 error = may_open(&nd->path, acc_mode, open_flag);
2282 if (error)
2283 goto exit;
2284 filp = nameidata_to_filp(nd);
2285 if (!IS_ERR(filp)) {
2286 error = ima_file_check(filp, op->acc_mode);
2287 if (error) {
2288 fput(filp);
2289 filp = ERR_PTR(error);
2290 }
2291 }
2292 if (!IS_ERR(filp)) {
2293 if (will_truncate) {
2294 error = handle_truncate(filp);
2295 if (error) {
2296 fput(filp);
2297 filp = ERR_PTR(error);
2298 }
2299 }
2300 }
2301out:
2302 if (want_write)
2303 mnt_drop_write(nd->path.mnt);
2304 path_put(&nd->path);
2385 return filp; 2305 return filp;
2386 2306
2387exit_mutex_unlock: 2307exit_mutex_unlock:
@@ -2389,199 +2309,103 @@ exit_mutex_unlock:
2389exit_dput: 2309exit_dput:
2390 path_put_conditional(path, nd); 2310 path_put_conditional(path, nd);
2391exit: 2311exit:
2392 if (!IS_ERR(nd->intent.open.file)) 2312 filp = ERR_PTR(error);
2393 release_open_intent(nd); 2313 goto out;
2394 path_put(&nd->path);
2395 return ERR_PTR(error);
2396} 2314}
2397 2315
2398/* 2316static struct file *path_openat(int dfd, const char *pathname,
2399 * Note that the low bits of the passed in "open_flag" 2317 struct nameidata *nd, const struct open_flags *op, int flags)
2400 * are not the same as in the local variable "flag". See
2401 * open_to_namei_flags() for more details.
2402 */
2403struct file *do_filp_open(int dfd, const char *pathname,
2404 int open_flag, int mode, int acc_mode)
2405{ 2318{
2319 struct file *base = NULL;
2406 struct file *filp; 2320 struct file *filp;
2407 struct nameidata nd;
2408 int error;
2409 struct path path; 2321 struct path path;
2410 int count = 0; 2322 int error;
2411 int flag = open_to_namei_flags(open_flag);
2412 int flags;
2413
2414 if (!(open_flag & O_CREAT))
2415 mode = 0;
2416
2417 /* Must never be set by userspace */
2418 open_flag &= ~FMODE_NONOTIFY;
2419
2420 /*
2421 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
2422 * check for O_DSYNC if the need any syncing at all we enforce it's
2423 * always set instead of having to deal with possibly weird behaviour
2424 * for malicious applications setting only __O_SYNC.
2425 */
2426 if (open_flag & __O_SYNC)
2427 open_flag |= O_DSYNC;
2428
2429 if (!acc_mode)
2430 acc_mode = MAY_OPEN | ACC_MODE(open_flag);
2431
2432 /* O_TRUNC implies we need access checks for write permissions */
2433 if (open_flag & O_TRUNC)
2434 acc_mode |= MAY_WRITE;
2435
2436 /* Allow the LSM permission hook to distinguish append
2437 access from general write access. */
2438 if (open_flag & O_APPEND)
2439 acc_mode |= MAY_APPEND;
2440
2441 flags = LOOKUP_OPEN;
2442 if (open_flag & O_CREAT) {
2443 flags |= LOOKUP_CREATE;
2444 if (open_flag & O_EXCL)
2445 flags |= LOOKUP_EXCL;
2446 }
2447 if (open_flag & O_DIRECTORY)
2448 flags |= LOOKUP_DIRECTORY;
2449 if (!(open_flag & O_NOFOLLOW))
2450 flags |= LOOKUP_FOLLOW;
2451 2323
2452 filp = get_empty_filp(); 2324 filp = get_empty_filp();
2453 if (!filp) 2325 if (!filp)
2454 return ERR_PTR(-ENFILE); 2326 return ERR_PTR(-ENFILE);
2455 2327
2456 filp->f_flags = open_flag; 2328 filp->f_flags = op->open_flag;
2457 nd.intent.open.file = filp; 2329 nd->intent.open.file = filp;
2458 nd.intent.open.flags = flag; 2330 nd->intent.open.flags = open_to_namei_flags(op->open_flag);
2459 nd.intent.open.create_mode = mode; 2331 nd->intent.open.create_mode = op->mode;
2460
2461 if (open_flag & O_CREAT)
2462 goto creat;
2463 2332
2464 /* !O_CREAT, simple open */ 2333 error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
2465 error = do_path_lookup(dfd, pathname, flags, &nd);
2466 if (unlikely(error)) 2334 if (unlikely(error))
2467 goto out_filp; 2335 goto out_filp;
2468 error = -ELOOP;
2469 if (!(nd.flags & LOOKUP_FOLLOW)) {
2470 if (nd.inode->i_op->follow_link)
2471 goto out_path;
2472 }
2473 error = -ENOTDIR;
2474 if (nd.flags & LOOKUP_DIRECTORY) {
2475 if (!nd.inode->i_op->lookup)
2476 goto out_path;
2477 }
2478 audit_inode(pathname, nd.path.dentry);
2479 filp = finish_open(&nd, open_flag, acc_mode);
2480 return filp;
2481
2482creat:
2483 /* OK, have to create the file. Find the parent. */
2484 error = path_init_rcu(dfd, pathname,
2485 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2486 if (error)
2487 goto out_filp;
2488 error = path_walk_rcu(pathname, &nd);
2489 path_finish_rcu(&nd);
2490 if (unlikely(error == -ECHILD || error == -ESTALE)) {
2491 /* slower, locked walk */
2492 if (error == -ESTALE) {
2493reval:
2494 flags |= LOOKUP_REVAL;
2495 }
2496 error = path_init(dfd, pathname,
2497 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2498 if (error)
2499 goto out_filp;
2500 2336
2501 error = path_walk_simple(pathname, &nd); 2337 current->total_link_count = 0;
2502 } 2338 error = link_path_walk(pathname, nd);
2503 if (unlikely(error)) 2339 if (unlikely(error))
2504 goto out_filp; 2340 goto out_filp;
2505 if (unlikely(!audit_dummy_context()))
2506 audit_inode(pathname, nd.path.dentry);
2507 2341
2508 /* 2342 filp = do_last(nd, &path, op, pathname);
2509 * We have the parent and last component.
2510 */
2511 nd.flags = flags;
2512 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
2513 while (unlikely(!filp)) { /* trailing symlink */ 2343 while (unlikely(!filp)) { /* trailing symlink */
2514 struct path link = path; 2344 struct path link = path;
2515 struct inode *linki = link.dentry->d_inode;
2516 void *cookie; 2345 void *cookie;
2517 error = -ELOOP; 2346 if (!(nd->flags & LOOKUP_FOLLOW)) {
2518 if (!(nd.flags & LOOKUP_FOLLOW)) 2347 path_put_conditional(&path, nd);
2519 goto exit_dput; 2348 path_put(&nd->path);
2520 if (count++ == 32) 2349 filp = ERR_PTR(-ELOOP);
2521 goto exit_dput; 2350 break;
2522 /*
2523 * This is subtle. Instead of calling do_follow_link() we do
2524 * the thing by hands. The reason is that this way we have zero
2525 * link_count and path_walk() (called from ->follow_link)
2526 * honoring LOOKUP_PARENT. After that we have the parent and
2527 * last component, i.e. we are in the same situation as after
2528 * the first path_walk(). Well, almost - if the last component
2529 * is normal we get its copy stored in nd->last.name and we will
2530 * have to putname() it when we are done. Procfs-like symlinks
2531 * just set LAST_BIND.
2532 */
2533 nd.flags |= LOOKUP_PARENT;
2534 error = security_inode_follow_link(link.dentry, &nd);
2535 if (error)
2536 goto exit_dput;
2537 error = __do_follow_link(&link, &nd, &cookie);
2538 if (unlikely(error)) {
2539 if (!IS_ERR(cookie) && linki->i_op->put_link)
2540 linki->i_op->put_link(link.dentry, &nd, cookie);
2541 /* nd.path had been dropped */
2542 nd.path = link;
2543 goto out_path;
2544 } 2351 }
2545 nd.flags &= ~LOOKUP_PARENT; 2352 nd->flags |= LOOKUP_PARENT;
2546 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2353 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
2547 if (linki->i_op->put_link) 2354 error = follow_link(&link, nd, &cookie);
2548 linki->i_op->put_link(link.dentry, &nd, cookie); 2355 if (unlikely(error))
2549 path_put(&link); 2356 filp = ERR_PTR(error);
2357 else
2358 filp = do_last(nd, &path, op, pathname);
2359 put_link(nd, &link, cookie);
2550 } 2360 }
2551out: 2361out:
2552 if (nd.root.mnt) 2362 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
2553 path_put(&nd.root); 2363 path_put(&nd->root);
2554 if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) 2364 if (base)
2555 goto reval; 2365 fput(base);
2366 release_open_intent(nd);
2556 return filp; 2367 return filp;
2557 2368
2558exit_dput:
2559 path_put_conditional(&path, &nd);
2560out_path:
2561 path_put(&nd.path);
2562out_filp: 2369out_filp:
2563 if (!IS_ERR(nd.intent.open.file))
2564 release_open_intent(&nd);
2565 filp = ERR_PTR(error); 2370 filp = ERR_PTR(error);
2566 goto out; 2371 goto out;
2567} 2372}
2568 2373
2569/** 2374struct file *do_filp_open(int dfd, const char *pathname,
2570 * filp_open - open file and return file pointer 2375 const struct open_flags *op, int flags)
2571 *
2572 * @filename: path to open
2573 * @flags: open flags as per the open(2) second argument
2574 * @mode: mode for the new file if O_CREAT is set, else ignored
2575 *
2576 * This is the helper to open a file from kernelspace if you really
2577 * have to. But in generally you should not do this, so please move
2578 * along, nothing to see here..
2579 */
2580struct file *filp_open(const char *filename, int flags, int mode)
2581{ 2376{
2582 return do_filp_open(AT_FDCWD, filename, flags, mode, 0); 2377 struct nameidata nd;
2378 struct file *filp;
2379
2380 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
2381 if (unlikely(filp == ERR_PTR(-ECHILD)))
2382 filp = path_openat(dfd, pathname, &nd, op, flags);
2383 if (unlikely(filp == ERR_PTR(-ESTALE)))
2384 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
2385 return filp;
2386}
2387
2388struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
2389 const char *name, const struct open_flags *op, int flags)
2390{
2391 struct nameidata nd;
2392 struct file *file;
2393
2394 nd.root.mnt = mnt;
2395 nd.root.dentry = dentry;
2396
2397 flags |= LOOKUP_ROOT;
2398
2399 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
2400 return ERR_PTR(-ELOOP);
2401
2402 file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
2403 if (unlikely(file == ERR_PTR(-ECHILD)))
2404 file = path_openat(-1, name, &nd, op, flags);
2405 if (unlikely(file == ERR_PTR(-ESTALE)))
2406 file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
2407 return file;
2583} 2408}
2584EXPORT_SYMBOL(filp_open);
2585 2409
2586/** 2410/**
2587 * lookup_create - lookup a dentry, creating it if it doesn't exist 2411 * lookup_create - lookup a dentry, creating it if it doesn't exist
@@ -2643,7 +2467,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
2643 if (error) 2467 if (error)
2644 return error; 2468 return error;
2645 2469
2646 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) 2470 if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
2471 !ns_capable(inode_userns(dir), CAP_MKNOD))
2647 return -EPERM; 2472 return -EPERM;
2648 2473
2649 if (!dir->i_op->mknod) 2474 if (!dir->i_op->mknod)
@@ -3120,7 +2945,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
3120 return error; 2945 return error;
3121 2946
3122 mutex_lock(&inode->i_mutex); 2947 mutex_lock(&inode->i_mutex);
3123 error = dir->i_op->link(old_dentry, dir, new_dentry); 2948 /* Make sure we don't allow creating hardlink to an unlinked file */
2949 if (inode->i_nlink == 0)
2950 error = -ENOENT;
2951 else
2952 error = dir->i_op->link(old_dentry, dir, new_dentry);
3124 mutex_unlock(&inode->i_mutex); 2953 mutex_unlock(&inode->i_mutex);
3125 if (!error) 2954 if (!error)
3126 fsnotify_link(dir, inode, new_dentry); 2955 fsnotify_link(dir, inode, new_dentry);
@@ -3142,15 +2971,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
3142 struct dentry *new_dentry; 2971 struct dentry *new_dentry;
3143 struct nameidata nd; 2972 struct nameidata nd;
3144 struct path old_path; 2973 struct path old_path;
2974 int how = 0;
3145 int error; 2975 int error;
3146 char *to; 2976 char *to;
3147 2977
3148 if ((flags & ~AT_SYMLINK_FOLLOW) != 0) 2978 if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
3149 return -EINVAL; 2979 return -EINVAL;
2980 /*
2981 * To use null names we require CAP_DAC_READ_SEARCH
2982 * This ensures that not everyone will be able to create
2983 * handlink using the passed filedescriptor.
2984 */
2985 if (flags & AT_EMPTY_PATH) {
2986 if (!capable(CAP_DAC_READ_SEARCH))
2987 return -ENOENT;
2988 how = LOOKUP_EMPTY;
2989 }
2990
2991 if (flags & AT_SYMLINK_FOLLOW)
2992 how |= LOOKUP_FOLLOW;
3150 2993
3151 error = user_path_at(olddfd, oldname, 2994 error = user_path_at(olddfd, oldname, how, &old_path);
3152 flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
3153 &old_path);
3154 if (error) 2995 if (error)
3155 return error; 2996 return error;
3156 2997
@@ -3587,7 +3428,7 @@ EXPORT_SYMBOL(page_readlink);
3587EXPORT_SYMBOL(__page_symlink); 3428EXPORT_SYMBOL(__page_symlink);
3588EXPORT_SYMBOL(page_symlink); 3429EXPORT_SYMBOL(page_symlink);
3589EXPORT_SYMBOL(page_symlink_inode_operations); 3430EXPORT_SYMBOL(page_symlink_inode_operations);
3590EXPORT_SYMBOL(path_lookup); 3431EXPORT_SYMBOL(kern_path_parent);
3591EXPORT_SYMBOL(kern_path); 3432EXPORT_SYMBOL(kern_path);
3592EXPORT_SYMBOL(vfs_path_lookup); 3433EXPORT_SYMBOL(vfs_path_lookup);
3593EXPORT_SYMBOL(inode_permission); 3434EXPORT_SYMBOL(inode_permission);
diff --git a/fs/namespace.c b/fs/namespace.c
index 7b0b95371696..d99bcf59e4c2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -196,7 +196,7 @@ unsigned int mnt_get_count(struct vfsmount *mnt)
196#endif 196#endif
197} 197}
198 198
199struct vfsmount *alloc_vfsmnt(const char *name) 199static struct vfsmount *alloc_vfsmnt(const char *name)
200{ 200{
201 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 201 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
202 if (mnt) { 202 if (mnt) {
@@ -466,15 +466,7 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt)
466 br_write_unlock(vfsmount_lock); 466 br_write_unlock(vfsmount_lock);
467} 467}
468 468
469void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) 469static void free_vfsmnt(struct vfsmount *mnt)
470{
471 mnt->mnt_sb = sb;
472 mnt->mnt_root = dget(sb->s_root);
473}
474
475EXPORT_SYMBOL(simple_set_mnt);
476
477void free_vfsmnt(struct vfsmount *mnt)
478{ 470{
479 kfree(mnt->mnt_devname); 471 kfree(mnt->mnt_devname);
480 mnt_free_id(mnt); 472 mnt_free_id(mnt);
@@ -678,6 +670,36 @@ static struct vfsmount *skip_mnt_tree(struct vfsmount *p)
678 return p; 670 return p;
679} 671}
680 672
673struct vfsmount *
674vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
675{
676 struct vfsmount *mnt;
677 struct dentry *root;
678
679 if (!type)
680 return ERR_PTR(-ENODEV);
681
682 mnt = alloc_vfsmnt(name);
683 if (!mnt)
684 return ERR_PTR(-ENOMEM);
685
686 if (flags & MS_KERNMOUNT)
687 mnt->mnt_flags = MNT_INTERNAL;
688
689 root = mount_fs(type, flags, name, data);
690 if (IS_ERR(root)) {
691 free_vfsmnt(mnt);
692 return ERR_CAST(root);
693 }
694
695 mnt->mnt_root = root;
696 mnt->mnt_sb = root->d_sb;
697 mnt->mnt_mountpoint = mnt->mnt_root;
698 mnt->mnt_parent = mnt;
699 return mnt;
700}
701EXPORT_SYMBOL_GPL(vfs_kern_mount);
702
681static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, 703static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
682 int flag) 704 int flag)
683{ 705{
@@ -978,7 +1000,13 @@ static int show_vfsmnt(struct seq_file *m, void *v)
978 int err = 0; 1000 int err = 0;
979 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; 1001 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
980 1002
981 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 1003 if (mnt->mnt_sb->s_op->show_devname) {
1004 err = mnt->mnt_sb->s_op->show_devname(m, mnt);
1005 if (err)
1006 goto out;
1007 } else {
1008 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
1009 }
982 seq_putc(m, ' '); 1010 seq_putc(m, ' ');
983 seq_path(m, &mnt_path, " \t\n\\"); 1011 seq_path(m, &mnt_path, " \t\n\\");
984 seq_putc(m, ' '); 1012 seq_putc(m, ' ');
@@ -1013,7 +1041,12 @@ static int show_mountinfo(struct seq_file *m, void *v)
1013 1041
1014 seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, 1042 seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id,
1015 MAJOR(sb->s_dev), MINOR(sb->s_dev)); 1043 MAJOR(sb->s_dev), MINOR(sb->s_dev));
1016 seq_dentry(m, mnt->mnt_root, " \t\n\\"); 1044 if (sb->s_op->show_path)
1045 err = sb->s_op->show_path(m, mnt);
1046 else
1047 seq_dentry(m, mnt->mnt_root, " \t\n\\");
1048 if (err)
1049 goto out;
1017 seq_putc(m, ' '); 1050 seq_putc(m, ' ');
1018 seq_path_root(m, &mnt_path, &root, " \t\n\\"); 1051 seq_path_root(m, &mnt_path, &root, " \t\n\\");
1019 if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { 1052 if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) {
@@ -1044,7 +1077,12 @@ static int show_mountinfo(struct seq_file *m, void *v)
1044 seq_puts(m, " - "); 1077 seq_puts(m, " - ");
1045 show_type(m, sb); 1078 show_type(m, sb);
1046 seq_putc(m, ' '); 1079 seq_putc(m, ' ');
1047 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 1080 if (sb->s_op->show_devname)
1081 err = sb->s_op->show_devname(m, mnt);
1082 else
1083 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
1084 if (err)
1085 goto out;
1048 seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); 1086 seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
1049 err = show_sb_opts(m, sb); 1087 err = show_sb_opts(m, sb);
1050 if (err) 1088 if (err)
@@ -1070,11 +1108,15 @@ static int show_vfsstat(struct seq_file *m, void *v)
1070 int err = 0; 1108 int err = 0;
1071 1109
1072 /* device */ 1110 /* device */
1073 if (mnt->mnt_devname) { 1111 if (mnt->mnt_sb->s_op->show_devname) {
1074 seq_puts(m, "device "); 1112 err = mnt->mnt_sb->s_op->show_devname(m, mnt);
1075 mangle(m, mnt->mnt_devname); 1113 } else {
1076 } else 1114 if (mnt->mnt_devname) {
1077 seq_puts(m, "no device"); 1115 seq_puts(m, "device ");
1116 mangle(m, mnt->mnt_devname);
1117 } else
1118 seq_puts(m, "no device");
1119 }
1078 1120
1079 /* mount point */ 1121 /* mount point */
1080 seq_puts(m, " mounted on "); 1122 seq_puts(m, " mounted on ");
@@ -1088,7 +1130,8 @@ static int show_vfsstat(struct seq_file *m, void *v)
1088 /* optional statistics */ 1130 /* optional statistics */
1089 if (mnt->mnt_sb->s_op->show_stats) { 1131 if (mnt->mnt_sb->s_op->show_stats) {
1090 seq_putc(m, ' '); 1132 seq_putc(m, ' ');
1091 err = mnt->mnt_sb->s_op->show_stats(m, mnt); 1133 if (!err)
1134 err = mnt->mnt_sb->s_op->show_stats(m, mnt);
1092 } 1135 }
1093 1136
1094 seq_putc(m, '\n'); 1137 seq_putc(m, '\n');
@@ -1244,7 +1287,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
1244 */ 1287 */
1245 br_write_lock(vfsmount_lock); 1288 br_write_lock(vfsmount_lock);
1246 if (mnt_get_count(mnt) != 2) { 1289 if (mnt_get_count(mnt) != 2) {
1247 br_write_lock(vfsmount_lock); 1290 br_write_unlock(vfsmount_lock);
1248 return -EBUSY; 1291 return -EBUSY;
1249 } 1292 }
1250 br_write_unlock(vfsmount_lock); 1293 br_write_unlock(vfsmount_lock);
@@ -1604,9 +1647,35 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1604 return err; 1647 return err;
1605} 1648}
1606 1649
1650static int lock_mount(struct path *path)
1651{
1652 struct vfsmount *mnt;
1653retry:
1654 mutex_lock(&path->dentry->d_inode->i_mutex);
1655 if (unlikely(cant_mount(path->dentry))) {
1656 mutex_unlock(&path->dentry->d_inode->i_mutex);
1657 return -ENOENT;
1658 }
1659 down_write(&namespace_sem);
1660 mnt = lookup_mnt(path);
1661 if (likely(!mnt))
1662 return 0;
1663 up_write(&namespace_sem);
1664 mutex_unlock(&path->dentry->d_inode->i_mutex);
1665 path_put(path);
1666 path->mnt = mnt;
1667 path->dentry = dget(mnt->mnt_root);
1668 goto retry;
1669}
1670
1671static void unlock_mount(struct path *path)
1672{
1673 up_write(&namespace_sem);
1674 mutex_unlock(&path->dentry->d_inode->i_mutex);
1675}
1676
1607static int graft_tree(struct vfsmount *mnt, struct path *path) 1677static int graft_tree(struct vfsmount *mnt, struct path *path)
1608{ 1678{
1609 int err;
1610 if (mnt->mnt_sb->s_flags & MS_NOUSER) 1679 if (mnt->mnt_sb->s_flags & MS_NOUSER)
1611 return -EINVAL; 1680 return -EINVAL;
1612 1681
@@ -1614,16 +1683,10 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
1614 S_ISDIR(mnt->mnt_root->d_inode->i_mode)) 1683 S_ISDIR(mnt->mnt_root->d_inode->i_mode))
1615 return -ENOTDIR; 1684 return -ENOTDIR;
1616 1685
1617 err = -ENOENT; 1686 if (d_unlinked(path->dentry))
1618 mutex_lock(&path->dentry->d_inode->i_mutex); 1687 return -ENOENT;
1619 if (cant_mount(path->dentry))
1620 goto out_unlock;
1621 1688
1622 if (!d_unlinked(path->dentry)) 1689 return attach_recursive_mnt(mnt, path, NULL);
1623 err = attach_recursive_mnt(mnt, path, NULL);
1624out_unlock:
1625 mutex_unlock(&path->dentry->d_inode->i_mutex);
1626 return err;
1627} 1690}
1628 1691
1629/* 1692/*
@@ -1686,6 +1749,7 @@ static int do_change_type(struct path *path, int flag)
1686static int do_loopback(struct path *path, char *old_name, 1749static int do_loopback(struct path *path, char *old_name,
1687 int recurse) 1750 int recurse)
1688{ 1751{
1752 LIST_HEAD(umount_list);
1689 struct path old_path; 1753 struct path old_path;
1690 struct vfsmount *mnt = NULL; 1754 struct vfsmount *mnt = NULL;
1691 int err = mount_is_safe(path); 1755 int err = mount_is_safe(path);
@@ -1697,13 +1761,16 @@ static int do_loopback(struct path *path, char *old_name,
1697 if (err) 1761 if (err)
1698 return err; 1762 return err;
1699 1763
1700 down_write(&namespace_sem); 1764 err = lock_mount(path);
1765 if (err)
1766 goto out;
1767
1701 err = -EINVAL; 1768 err = -EINVAL;
1702 if (IS_MNT_UNBINDABLE(old_path.mnt)) 1769 if (IS_MNT_UNBINDABLE(old_path.mnt))
1703 goto out; 1770 goto out2;
1704 1771
1705 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1772 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
1706 goto out; 1773 goto out2;
1707 1774
1708 err = -ENOMEM; 1775 err = -ENOMEM;
1709 if (recurse) 1776 if (recurse)
@@ -1712,20 +1779,18 @@ static int do_loopback(struct path *path, char *old_name,
1712 mnt = clone_mnt(old_path.mnt, old_path.dentry, 0); 1779 mnt = clone_mnt(old_path.mnt, old_path.dentry, 0);
1713 1780
1714 if (!mnt) 1781 if (!mnt)
1715 goto out; 1782 goto out2;
1716 1783
1717 err = graft_tree(mnt, path); 1784 err = graft_tree(mnt, path);
1718 if (err) { 1785 if (err) {
1719 LIST_HEAD(umount_list);
1720
1721 br_write_lock(vfsmount_lock); 1786 br_write_lock(vfsmount_lock);
1722 umount_tree(mnt, 0, &umount_list); 1787 umount_tree(mnt, 0, &umount_list);
1723 br_write_unlock(vfsmount_lock); 1788 br_write_unlock(vfsmount_lock);
1724 release_mounts(&umount_list);
1725 } 1789 }
1726 1790out2:
1791 unlock_mount(path);
1792 release_mounts(&umount_list);
1727out: 1793out:
1728 up_write(&namespace_sem);
1729 path_put(&old_path); 1794 path_put(&old_path);
1730 return err; 1795 return err;
1731} 1796}
@@ -1767,6 +1832,10 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1767 if (path->dentry != path->mnt->mnt_root) 1832 if (path->dentry != path->mnt->mnt_root)
1768 return -EINVAL; 1833 return -EINVAL;
1769 1834
1835 err = security_sb_remount(sb, data);
1836 if (err)
1837 return err;
1838
1770 down_write(&sb->s_umount); 1839 down_write(&sb->s_umount);
1771 if (flags & MS_BIND) 1840 if (flags & MS_BIND)
1772 err = change_mount_flags(path->mnt, flags); 1841 err = change_mount_flags(path->mnt, flags);
@@ -1810,18 +1879,12 @@ static int do_move_mount(struct path *path, char *old_name)
1810 if (err) 1879 if (err)
1811 return err; 1880 return err;
1812 1881
1813 down_write(&namespace_sem); 1882 err = lock_mount(path);
1814 err = follow_down(path, true);
1815 if (err < 0) 1883 if (err < 0)
1816 goto out; 1884 goto out;
1817 1885
1818 err = -EINVAL; 1886 err = -EINVAL;
1819 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1887 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
1820 goto out;
1821
1822 err = -ENOENT;
1823 mutex_lock(&path->dentry->d_inode->i_mutex);
1824 if (cant_mount(path->dentry))
1825 goto out1; 1888 goto out1;
1826 1889
1827 if (d_unlinked(path->dentry)) 1890 if (d_unlinked(path->dentry))
@@ -1863,16 +1926,87 @@ static int do_move_mount(struct path *path, char *old_name)
1863 * automatically */ 1926 * automatically */
1864 list_del_init(&old_path.mnt->mnt_expire); 1927 list_del_init(&old_path.mnt->mnt_expire);
1865out1: 1928out1:
1866 mutex_unlock(&path->dentry->d_inode->i_mutex); 1929 unlock_mount(path);
1867out: 1930out:
1868 up_write(&namespace_sem);
1869 if (!err) 1931 if (!err)
1870 path_put(&parent_path); 1932 path_put(&parent_path);
1871 path_put(&old_path); 1933 path_put(&old_path);
1872 return err; 1934 return err;
1873} 1935}
1874 1936
1875static int do_add_mount(struct vfsmount *, struct path *, int); 1937static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
1938{
1939 int err;
1940 const char *subtype = strchr(fstype, '.');
1941 if (subtype) {
1942 subtype++;
1943 err = -EINVAL;
1944 if (!subtype[0])
1945 goto err;
1946 } else
1947 subtype = "";
1948
1949 mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
1950 err = -ENOMEM;
1951 if (!mnt->mnt_sb->s_subtype)
1952 goto err;
1953 return mnt;
1954
1955 err:
1956 mntput(mnt);
1957 return ERR_PTR(err);
1958}
1959
1960struct vfsmount *
1961do_kern_mount(const char *fstype, int flags, const char *name, void *data)
1962{
1963 struct file_system_type *type = get_fs_type(fstype);
1964 struct vfsmount *mnt;
1965 if (!type)
1966 return ERR_PTR(-ENODEV);
1967 mnt = vfs_kern_mount(type, flags, name, data);
1968 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
1969 !mnt->mnt_sb->s_subtype)
1970 mnt = fs_set_subtype(mnt, fstype);
1971 put_filesystem(type);
1972 return mnt;
1973}
1974EXPORT_SYMBOL_GPL(do_kern_mount);
1975
1976/*
1977 * add a mount into a namespace's mount tree
1978 */
1979static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
1980{
1981 int err;
1982
1983 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
1984
1985 err = lock_mount(path);
1986 if (err)
1987 return err;
1988
1989 err = -EINVAL;
1990 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
1991 goto unlock;
1992
1993 /* Refuse the same filesystem on the same mount point */
1994 err = -EBUSY;
1995 if (path->mnt->mnt_sb == newmnt->mnt_sb &&
1996 path->mnt->mnt_root == path->dentry)
1997 goto unlock;
1998
1999 err = -EINVAL;
2000 if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
2001 goto unlock;
2002
2003 newmnt->mnt_flags = mnt_flags;
2004 err = graft_tree(newmnt, path);
2005
2006unlock:
2007 unlock_mount(path);
2008 return err;
2009}
1876 2010
1877/* 2011/*
1878 * create a new mount for userspace and request it to be added into the 2012 * create a new mount for userspace and request it to be added into the
@@ -1932,43 +2066,6 @@ fail:
1932 return err; 2066 return err;
1933} 2067}
1934 2068
1935/*
1936 * add a mount into a namespace's mount tree
1937 */
1938static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
1939{
1940 int err;
1941
1942 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
1943
1944 down_write(&namespace_sem);
1945 /* Something was mounted here while we slept */
1946 err = follow_down(path, true);
1947 if (err < 0)
1948 goto unlock;
1949
1950 err = -EINVAL;
1951 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
1952 goto unlock;
1953
1954 /* Refuse the same filesystem on the same mount point */
1955 err = -EBUSY;
1956 if (path->mnt->mnt_sb == newmnt->mnt_sb &&
1957 path->mnt->mnt_root == path->dentry)
1958 goto unlock;
1959
1960 err = -EINVAL;
1961 if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
1962 goto unlock;
1963
1964 newmnt->mnt_flags = mnt_flags;
1965 err = graft_tree(newmnt, path);
1966
1967unlock:
1968 up_write(&namespace_sem);
1969 return err;
1970}
1971
1972/** 2069/**
1973 * mnt_set_expiry - Put a mount on an expiration list 2070 * mnt_set_expiry - Put a mount on an expiration list
1974 * @mnt: The mount to list. 2071 * @mnt: The mount to list.
@@ -2469,65 +2566,60 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2469 error = user_path_dir(new_root, &new); 2566 error = user_path_dir(new_root, &new);
2470 if (error) 2567 if (error)
2471 goto out0; 2568 goto out0;
2472 error = -EINVAL;
2473 if (!check_mnt(new.mnt))
2474 goto out1;
2475 2569
2476 error = user_path_dir(put_old, &old); 2570 error = user_path_dir(put_old, &old);
2477 if (error) 2571 if (error)
2478 goto out1; 2572 goto out1;
2479 2573
2480 error = security_sb_pivotroot(&old, &new); 2574 error = security_sb_pivotroot(&old, &new);
2481 if (error) { 2575 if (error)
2482 path_put(&old); 2576 goto out2;
2483 goto out1;
2484 }
2485 2577
2486 get_fs_root(current->fs, &root); 2578 get_fs_root(current->fs, &root);
2487 down_write(&namespace_sem); 2579 error = lock_mount(&old);
2488 mutex_lock(&old.dentry->d_inode->i_mutex); 2580 if (error)
2581 goto out3;
2582
2489 error = -EINVAL; 2583 error = -EINVAL;
2490 if (IS_MNT_SHARED(old.mnt) || 2584 if (IS_MNT_SHARED(old.mnt) ||
2491 IS_MNT_SHARED(new.mnt->mnt_parent) || 2585 IS_MNT_SHARED(new.mnt->mnt_parent) ||
2492 IS_MNT_SHARED(root.mnt->mnt_parent)) 2586 IS_MNT_SHARED(root.mnt->mnt_parent))
2493 goto out2; 2587 goto out4;
2494 if (!check_mnt(root.mnt)) 2588 if (!check_mnt(root.mnt) || !check_mnt(new.mnt))
2495 goto out2; 2589 goto out4;
2496 error = -ENOENT; 2590 error = -ENOENT;
2497 if (cant_mount(old.dentry))
2498 goto out2;
2499 if (d_unlinked(new.dentry)) 2591 if (d_unlinked(new.dentry))
2500 goto out2; 2592 goto out4;
2501 if (d_unlinked(old.dentry)) 2593 if (d_unlinked(old.dentry))
2502 goto out2; 2594 goto out4;
2503 error = -EBUSY; 2595 error = -EBUSY;
2504 if (new.mnt == root.mnt || 2596 if (new.mnt == root.mnt ||
2505 old.mnt == root.mnt) 2597 old.mnt == root.mnt)
2506 goto out2; /* loop, on the same file system */ 2598 goto out4; /* loop, on the same file system */
2507 error = -EINVAL; 2599 error = -EINVAL;
2508 if (root.mnt->mnt_root != root.dentry) 2600 if (root.mnt->mnt_root != root.dentry)
2509 goto out2; /* not a mountpoint */ 2601 goto out4; /* not a mountpoint */
2510 if (root.mnt->mnt_parent == root.mnt) 2602 if (root.mnt->mnt_parent == root.mnt)
2511 goto out2; /* not attached */ 2603 goto out4; /* not attached */
2512 if (new.mnt->mnt_root != new.dentry) 2604 if (new.mnt->mnt_root != new.dentry)
2513 goto out2; /* not a mountpoint */ 2605 goto out4; /* not a mountpoint */
2514 if (new.mnt->mnt_parent == new.mnt) 2606 if (new.mnt->mnt_parent == new.mnt)
2515 goto out2; /* not attached */ 2607 goto out4; /* not attached */
2516 /* make sure we can reach put_old from new_root */ 2608 /* make sure we can reach put_old from new_root */
2517 tmp = old.mnt; 2609 tmp = old.mnt;
2518 br_write_lock(vfsmount_lock);
2519 if (tmp != new.mnt) { 2610 if (tmp != new.mnt) {
2520 for (;;) { 2611 for (;;) {
2521 if (tmp->mnt_parent == tmp) 2612 if (tmp->mnt_parent == tmp)
2522 goto out3; /* already mounted on put_old */ 2613 goto out4; /* already mounted on put_old */
2523 if (tmp->mnt_parent == new.mnt) 2614 if (tmp->mnt_parent == new.mnt)
2524 break; 2615 break;
2525 tmp = tmp->mnt_parent; 2616 tmp = tmp->mnt_parent;
2526 } 2617 }
2527 if (!is_subdir(tmp->mnt_mountpoint, new.dentry)) 2618 if (!is_subdir(tmp->mnt_mountpoint, new.dentry))
2528 goto out3; 2619 goto out4;
2529 } else if (!is_subdir(old.dentry, new.dentry)) 2620 } else if (!is_subdir(old.dentry, new.dentry))
2530 goto out3; 2621 goto out4;
2622 br_write_lock(vfsmount_lock);
2531 detach_mnt(new.mnt, &parent_path); 2623 detach_mnt(new.mnt, &parent_path);
2532 detach_mnt(root.mnt, &root_parent); 2624 detach_mnt(root.mnt, &root_parent);
2533 /* mount old root on put_old */ 2625 /* mount old root on put_old */
@@ -2537,22 +2629,21 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2537 touch_mnt_namespace(current->nsproxy->mnt_ns); 2629 touch_mnt_namespace(current->nsproxy->mnt_ns);
2538 br_write_unlock(vfsmount_lock); 2630 br_write_unlock(vfsmount_lock);
2539 chroot_fs_refs(&root, &new); 2631 chroot_fs_refs(&root, &new);
2540
2541 error = 0; 2632 error = 0;
2542 path_put(&root_parent); 2633out4:
2543 path_put(&parent_path); 2634 unlock_mount(&old);
2544out2: 2635 if (!error) {
2545 mutex_unlock(&old.dentry->d_inode->i_mutex); 2636 path_put(&root_parent);
2546 up_write(&namespace_sem); 2637 path_put(&parent_path);
2638 }
2639out3:
2547 path_put(&root); 2640 path_put(&root);
2641out2:
2548 path_put(&old); 2642 path_put(&old);
2549out1: 2643out1:
2550 path_put(&new); 2644 path_put(&new);
2551out0: 2645out0:
2552 return error; 2646 return error;
2553out3:
2554 br_write_unlock(vfsmount_lock);
2555 goto out2;
2556} 2647}
2557 2648
2558static void __init init_mount_tree(void) 2649static void __init init_mount_tree(void)
@@ -2594,7 +2685,7 @@ void __init mnt_init(void)
2594 if (!mount_hashtable) 2685 if (!mount_hashtable)
2595 panic("Failed to allocate mount hash table\n"); 2686 panic("Failed to allocate mount hash table\n");
2596 2687
2597 printk("Mount-cache hash table entries: %lu\n", HASH_SIZE); 2688 printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE);
2598 2689
2599 for (u = 0; u < HASH_SIZE; u++) 2690 for (u = 0; u < HASH_SIZE; u++)
2600 INIT_LIST_HEAD(&mount_hashtable[u]); 2691 INIT_LIST_HEAD(&mount_hashtable[u]);
@@ -2627,3 +2718,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
2627 kfree(ns); 2718 kfree(ns);
2628} 2719}
2629EXPORT_SYMBOL(put_mnt_ns); 2720EXPORT_SYMBOL(put_mnt_ns);
2721
2722struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
2723{
2724 return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
2725}
2726EXPORT_SYMBOL_GPL(kern_mount_data);
diff --git a/fs/ncpfs/Makefile b/fs/ncpfs/Makefile
index 68ea095100a8..c66af563f2ce 100644
--- a/fs/ncpfs/Makefile
+++ b/fs/ncpfs/Makefile
@@ -11,6 +11,6 @@ ncpfs-$(CONFIG_NCPFS_EXTRAS) += symlink.o
11ncpfs-$(CONFIG_NCPFS_NFS_NS) += symlink.o 11ncpfs-$(CONFIG_NCPFS_NFS_NS) += symlink.o
12 12
13# If you want debugging output, please uncomment the following line 13# If you want debugging output, please uncomment the following line
14# EXTRA_CFLAGS += -DDEBUG_NCP=1 14# ccflags-y := -DDEBUG_NCP=1
15 15
16CFLAGS_ncplib_kernel.o := -finline-functions 16CFLAGS_ncplib_kernel.o := -finline-functions
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 00a1d1c3d3a4..0250e4ce4893 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -596,7 +596,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
596/* server->priv.data = NULL; */ 596/* server->priv.data = NULL; */
597 597
598 server->m = data; 598 server->m = data;
599 /* Althought anything producing this is buggy, it happens 599 /* Although anything producing this is buggy, it happens
600 now because of PATH_MAX changes.. */ 600 now because of PATH_MAX changes.. */
601 if (server->m.time_out < 1) { 601 if (server->m.time_out < 1) {
602 server->m.time_out = 10; 602 server->m.time_out = 10;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 89587573fe50..2f41dccea18e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -188,10 +188,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
188 rv = NFS4ERR_DELAY; 188 rv = NFS4ERR_DELAY;
189 list_del_init(&lo->plh_bulk_recall); 189 list_del_init(&lo->plh_bulk_recall);
190 spin_unlock(&ino->i_lock); 190 spin_unlock(&ino->i_lock);
191 pnfs_free_lseg_list(&free_me_list);
191 put_layout_hdr(lo); 192 put_layout_hdr(lo);
192 iput(ino); 193 iput(ino);
193 } 194 }
194 pnfs_free_lseg_list(&free_me_list);
195 return rv; 195 return rv;
196} 196}
197 197
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 14e0f9371d14..00ecf62ce7c1 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -241,7 +241,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
241 241
242 args->cbl_layout_type = ntohl(*p++); 242 args->cbl_layout_type = ntohl(*p++);
243 /* Depite the spec's xdr, iomode really belongs in the FILE switch, 243 /* Depite the spec's xdr, iomode really belongs in the FILE switch,
244 * as it is unuseable and ignored with the other types. 244 * as it is unusable and ignored with the other types.
245 */ 245 */
246 iomode = ntohl(*p++); 246 iomode = ntohl(*p++);
247 args->cbl_layoutchanged = ntohl(*p++); 247 args->cbl_layoutchanged = ntohl(*p++);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index bd3ca32879e7..139be9647d80 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -82,6 +82,11 @@ retry:
82#endif /* CONFIG_NFS_V4 */ 82#endif /* CONFIG_NFS_V4 */
83 83
84/* 84/*
85 * Turn off NFSv4 uid/gid mapping when using AUTH_SYS
86 */
87static int nfs4_disable_idmapping = 0;
88
89/*
85 * RPC cruft for NFS 90 * RPC cruft for NFS
86 */ 91 */
87static struct rpc_version *nfs_version[5] = { 92static struct rpc_version *nfs_version[5] = {
@@ -481,7 +486,12 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
481 * Look up a client by IP address and protocol version 486 * Look up a client by IP address and protocol version
482 * - creates a new record if one doesn't yet exist 487 * - creates a new record if one doesn't yet exist
483 */ 488 */
484static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) 489static struct nfs_client *
490nfs_get_client(const struct nfs_client_initdata *cl_init,
491 const struct rpc_timeout *timeparms,
492 const char *ip_addr,
493 rpc_authflavor_t authflavour,
494 int noresvport)
485{ 495{
486 struct nfs_client *clp, *new = NULL; 496 struct nfs_client *clp, *new = NULL;
487 int error; 497 int error;
@@ -512,6 +522,13 @@ install_client:
512 clp = new; 522 clp = new;
513 list_add(&clp->cl_share_link, &nfs_client_list); 523 list_add(&clp->cl_share_link, &nfs_client_list);
514 spin_unlock(&nfs_client_lock); 524 spin_unlock(&nfs_client_lock);
525
526 error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr,
527 authflavour, noresvport);
528 if (error < 0) {
529 nfs_put_client(clp);
530 return ERR_PTR(error);
531 }
515 dprintk("--> nfs_get_client() = %p [new]\n", clp); 532 dprintk("--> nfs_get_client() = %p [new]\n", clp);
516 return clp; 533 return clp;
517 534
@@ -767,9 +784,9 @@ static int nfs_init_server_rpcclient(struct nfs_server *server,
767/* 784/*
768 * Initialise an NFS2 or NFS3 client 785 * Initialise an NFS2 or NFS3 client
769 */ 786 */
770static int nfs_init_client(struct nfs_client *clp, 787int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms,
771 const struct rpc_timeout *timeparms, 788 const char *ip_addr, rpc_authflavor_t authflavour,
772 const struct nfs_parsed_mount_data *data) 789 int noresvport)
773{ 790{
774 int error; 791 int error;
775 792
@@ -784,7 +801,7 @@ static int nfs_init_client(struct nfs_client *clp,
784 * - RFC 2623, sec 2.3.2 801 * - RFC 2623, sec 2.3.2
785 */ 802 */
786 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 803 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
787 0, data->flags & NFS_MOUNT_NORESVPORT); 804 0, noresvport);
788 if (error < 0) 805 if (error < 0)
789 goto error; 806 goto error;
790 nfs_mark_client_ready(clp, NFS_CS_READY); 807 nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -820,19 +837,17 @@ static int nfs_init_server(struct nfs_server *server,
820 cl_init.rpc_ops = &nfs_v3_clientops; 837 cl_init.rpc_ops = &nfs_v3_clientops;
821#endif 838#endif
822 839
840 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
841 data->timeo, data->retrans);
842
823 /* Allocate or find a client reference we can use */ 843 /* Allocate or find a client reference we can use */
824 clp = nfs_get_client(&cl_init); 844 clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX,
845 data->flags & NFS_MOUNT_NORESVPORT);
825 if (IS_ERR(clp)) { 846 if (IS_ERR(clp)) {
826 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); 847 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
827 return PTR_ERR(clp); 848 return PTR_ERR(clp);
828 } 849 }
829 850
830 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
831 data->timeo, data->retrans);
832 error = nfs_init_client(clp, &timeparms, data);
833 if (error < 0)
834 goto error;
835
836 server->nfs_client = clp; 851 server->nfs_client = clp;
837 852
838 /* Initialise the client representation from the mount data */ 853 /* Initialise the client representation from the mount data */
@@ -1009,14 +1024,19 @@ static void nfs_server_insert_lists(struct nfs_server *server)
1009 spin_lock(&nfs_client_lock); 1024 spin_lock(&nfs_client_lock);
1010 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); 1025 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
1011 list_add_tail(&server->master_link, &nfs_volume_list); 1026 list_add_tail(&server->master_link, &nfs_volume_list);
1027 clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
1012 spin_unlock(&nfs_client_lock); 1028 spin_unlock(&nfs_client_lock);
1013 1029
1014} 1030}
1015 1031
1016static void nfs_server_remove_lists(struct nfs_server *server) 1032static void nfs_server_remove_lists(struct nfs_server *server)
1017{ 1033{
1034 struct nfs_client *clp = server->nfs_client;
1035
1018 spin_lock(&nfs_client_lock); 1036 spin_lock(&nfs_client_lock);
1019 list_del_rcu(&server->client_link); 1037 list_del_rcu(&server->client_link);
1038 if (clp && list_empty(&clp->cl_superblocks))
1039 set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
1020 list_del(&server->master_link); 1040 list_del(&server->master_link);
1021 spin_unlock(&nfs_client_lock); 1041 spin_unlock(&nfs_client_lock);
1022 1042
@@ -1307,11 +1327,11 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
1307/* 1327/*
1308 * Initialise an NFS4 client record 1328 * Initialise an NFS4 client record
1309 */ 1329 */
1310static int nfs4_init_client(struct nfs_client *clp, 1330int nfs4_init_client(struct nfs_client *clp,
1311 const struct rpc_timeout *timeparms, 1331 const struct rpc_timeout *timeparms,
1312 const char *ip_addr, 1332 const char *ip_addr,
1313 rpc_authflavor_t authflavour, 1333 rpc_authflavor_t authflavour,
1314 int flags) 1334 int noresvport)
1315{ 1335{
1316 int error; 1336 int error;
1317 1337
@@ -1325,7 +1345,7 @@ static int nfs4_init_client(struct nfs_client *clp,
1325 clp->rpc_ops = &nfs_v4_clientops; 1345 clp->rpc_ops = &nfs_v4_clientops;
1326 1346
1327 error = nfs_create_rpc_client(clp, timeparms, authflavour, 1347 error = nfs_create_rpc_client(clp, timeparms, authflavour,
1328 1, flags & NFS_MOUNT_NORESVPORT); 1348 1, noresvport);
1329 if (error < 0) 1349 if (error < 0)
1330 goto error; 1350 goto error;
1331 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); 1351 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
@@ -1378,27 +1398,71 @@ static int nfs4_set_client(struct nfs_server *server,
1378 dprintk("--> nfs4_set_client()\n"); 1398 dprintk("--> nfs4_set_client()\n");
1379 1399
1380 /* Allocate or find a client reference we can use */ 1400 /* Allocate or find a client reference we can use */
1381 clp = nfs_get_client(&cl_init); 1401 clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour,
1402 server->flags & NFS_MOUNT_NORESVPORT);
1382 if (IS_ERR(clp)) { 1403 if (IS_ERR(clp)) {
1383 error = PTR_ERR(clp); 1404 error = PTR_ERR(clp);
1384 goto error; 1405 goto error;
1385 } 1406 }
1386 error = nfs4_init_client(clp, timeparms, ip_addr, authflavour, 1407
1387 server->flags); 1408 /*
1388 if (error < 0) 1409 * Query for the lease time on clientid setup or renewal
1389 goto error_put; 1410 *
1411 * Note that this will be set on nfs_clients that were created
1412 * only for the DS role and did not set this bit, but now will
1413 * serve a dual role.
1414 */
1415 set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state);
1390 1416
1391 server->nfs_client = clp; 1417 server->nfs_client = clp;
1392 dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); 1418 dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
1393 return 0; 1419 return 0;
1394
1395error_put:
1396 nfs_put_client(clp);
1397error: 1420error:
1398 dprintk("<-- nfs4_set_client() = xerror %d\n", error); 1421 dprintk("<-- nfs4_set_client() = xerror %d\n", error);
1399 return error; 1422 return error;
1400} 1423}
1401 1424
1425/*
1426 * Set up a pNFS Data Server client.
1427 *
1428 * Return any existing nfs_client that matches server address,port,version
1429 * and minorversion.
1430 *
1431 * For a new nfs_client, use a soft mount (default), a low retrans and a
1432 * low timeout interval so that if a connection is lost, we retry through
1433 * the MDS.
1434 */
1435struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
1436 const struct sockaddr *ds_addr,
1437 int ds_addrlen, int ds_proto)
1438{
1439 struct nfs_client_initdata cl_init = {
1440 .addr = ds_addr,
1441 .addrlen = ds_addrlen,
1442 .rpc_ops = &nfs_v4_clientops,
1443 .proto = ds_proto,
1444 .minorversion = mds_clp->cl_minorversion,
1445 };
1446 struct rpc_timeout ds_timeout = {
1447 .to_initval = 15 * HZ,
1448 .to_maxval = 15 * HZ,
1449 .to_retries = 1,
1450 .to_exponential = 1,
1451 };
1452 struct nfs_client *clp;
1453
1454 /*
1455 * Set an authflavor equual to the MDS value. Use the MDS nfs_client
1456 * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
1457 * (section 13.1 RFC 5661).
1458 */
1459 clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
1460 mds_clp->cl_rpcclient->cl_auth->au_flavor, 0);
1461
1462 dprintk("<-- %s %p\n", __func__, clp);
1463 return clp;
1464}
1465EXPORT_SYMBOL(nfs4_set_ds_client);
1402 1466
1403/* 1467/*
1404 * Session has been established, and the client marked ready. 1468 * Session has been established, and the client marked ready.
@@ -1435,6 +1499,10 @@ static int nfs4_server_common_setup(struct nfs_server *server,
1435 BUG_ON(!server->nfs_client->rpc_ops); 1499 BUG_ON(!server->nfs_client->rpc_ops);
1436 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 1500 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1437 1501
1502 /* data servers support only a subset of NFSv4.1 */
1503 if (is_ds_only_client(server->nfs_client))
1504 return -EPROTONOSUPPORT;
1505
1438 fattr = nfs_alloc_fattr(); 1506 fattr = nfs_alloc_fattr();
1439 if (fattr == NULL) 1507 if (fattr == NULL)
1440 return -ENOMEM; 1508 return -ENOMEM;
@@ -1504,6 +1572,13 @@ static int nfs4_init_server(struct nfs_server *server,
1504 if (error < 0) 1572 if (error < 0)
1505 goto error; 1573 goto error;
1506 1574
1575 /*
1576 * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
1577 * authentication.
1578 */
1579 if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX)
1580 server->caps |= NFS_CAP_UIDGID_NOMAP;
1581
1507 if (data->rsize) 1582 if (data->rsize)
1508 server->rsize = nfs_block_size(data->rsize, NULL); 1583 server->rsize = nfs_block_size(data->rsize, NULL);
1509 if (data->wsize) 1584 if (data->wsize)
@@ -1921,3 +1996,7 @@ void nfs_fs_proc_exit(void)
1921} 1996}
1922 1997
1923#endif /* CONFIG_PROC_FS */ 1998#endif /* CONFIG_PROC_FS */
1999
2000module_param(nfs4_disable_idmapping, bool, 0644);
2001MODULE_PARM_DESC(nfs4_disable_idmapping,
2002 "Turn off NFSv4 idmapping when using 'sec=sys'");
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2c3eb33b904d..7237672216c8 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -44,6 +44,7 @@
44/* #define NFS_DEBUG_VERBOSE 1 */ 44/* #define NFS_DEBUG_VERBOSE 1 */
45 45
46static int nfs_opendir(struct inode *, struct file *); 46static int nfs_opendir(struct inode *, struct file *);
47static int nfs_closedir(struct inode *, struct file *);
47static int nfs_readdir(struct file *, void *, filldir_t); 48static int nfs_readdir(struct file *, void *, filldir_t);
48static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); 49static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *);
49static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *); 50static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *);
@@ -64,7 +65,7 @@ const struct file_operations nfs_dir_operations = {
64 .read = generic_read_dir, 65 .read = generic_read_dir,
65 .readdir = nfs_readdir, 66 .readdir = nfs_readdir,
66 .open = nfs_opendir, 67 .open = nfs_opendir,
67 .release = nfs_release, 68 .release = nfs_closedir,
68 .fsync = nfs_fsync_dir, 69 .fsync = nfs_fsync_dir,
69}; 70};
70 71
@@ -133,13 +134,35 @@ const struct inode_operations nfs4_dir_inode_operations = {
133 134
134#endif /* CONFIG_NFS_V4 */ 135#endif /* CONFIG_NFS_V4 */
135 136
137static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred *cred)
138{
139 struct nfs_open_dir_context *ctx;
140 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
141 if (ctx != NULL) {
142 ctx->duped = 0;
143 ctx->dir_cookie = 0;
144 ctx->dup_cookie = 0;
145 ctx->cred = get_rpccred(cred);
146 } else
147 ctx = ERR_PTR(-ENOMEM);
148 return ctx;
149}
150
151static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx)
152{
153 put_rpccred(ctx->cred);
154 kfree(ctx);
155}
156
136/* 157/*
137 * Open file 158 * Open file
138 */ 159 */
139static int 160static int
140nfs_opendir(struct inode *inode, struct file *filp) 161nfs_opendir(struct inode *inode, struct file *filp)
141{ 162{
142 int res; 163 int res = 0;
164 struct nfs_open_dir_context *ctx;
165 struct rpc_cred *cred;
143 166
144 dfprintk(FILE, "NFS: open dir(%s/%s)\n", 167 dfprintk(FILE, "NFS: open dir(%s/%s)\n",
145 filp->f_path.dentry->d_parent->d_name.name, 168 filp->f_path.dentry->d_parent->d_name.name,
@@ -147,8 +170,15 @@ nfs_opendir(struct inode *inode, struct file *filp)
147 170
148 nfs_inc_stats(inode, NFSIOS_VFSOPEN); 171 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
149 172
150 /* Call generic open code in order to cache credentials */ 173 cred = rpc_lookup_cred();
151 res = nfs_open(inode, filp); 174 if (IS_ERR(cred))
175 return PTR_ERR(cred);
176 ctx = alloc_nfs_open_dir_context(cred);
177 if (IS_ERR(ctx)) {
178 res = PTR_ERR(ctx);
179 goto out;
180 }
181 filp->private_data = ctx;
152 if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { 182 if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) {
153 /* This is a mountpoint, so d_revalidate will never 183 /* This is a mountpoint, so d_revalidate will never
154 * have been called, so we need to refresh the 184 * have been called, so we need to refresh the
@@ -156,9 +186,18 @@ nfs_opendir(struct inode *inode, struct file *filp)
156 */ 186 */
157 __nfs_revalidate_inode(NFS_SERVER(inode), inode); 187 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
158 } 188 }
189out:
190 put_rpccred(cred);
159 return res; 191 return res;
160} 192}
161 193
194static int
195nfs_closedir(struct inode *inode, struct file *filp)
196{
197 put_nfs_open_dir_context(filp->private_data);
198 return 0;
199}
200
162struct nfs_cache_array_entry { 201struct nfs_cache_array_entry {
163 u64 cookie; 202 u64 cookie;
164 u64 ino; 203 u64 ino;
@@ -284,19 +323,20 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
284{ 323{
285 loff_t diff = desc->file->f_pos - desc->current_index; 324 loff_t diff = desc->file->f_pos - desc->current_index;
286 unsigned int index; 325 unsigned int index;
326 struct nfs_open_dir_context *ctx = desc->file->private_data;
287 327
288 if (diff < 0) 328 if (diff < 0)
289 goto out_eof; 329 goto out_eof;
290 if (diff >= array->size) { 330 if (diff >= array->size) {
291 if (array->eof_index >= 0) 331 if (array->eof_index >= 0)
292 goto out_eof; 332 goto out_eof;
293 desc->current_index += array->size;
294 return -EAGAIN; 333 return -EAGAIN;
295 } 334 }
296 335
297 index = (unsigned int)diff; 336 index = (unsigned int)diff;
298 *desc->dir_cookie = array->array[index].cookie; 337 *desc->dir_cookie = array->array[index].cookie;
299 desc->cache_entry_index = index; 338 desc->cache_entry_index = index;
339 ctx->duped = 0;
300 return 0; 340 return 0;
301out_eof: 341out_eof:
302 desc->eof = 1; 342 desc->eof = 1;
@@ -307,10 +347,18 @@ static
307int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) 347int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
308{ 348{
309 int i; 349 int i;
350 loff_t new_pos;
310 int status = -EAGAIN; 351 int status = -EAGAIN;
352 struct nfs_open_dir_context *ctx = desc->file->private_data;
311 353
312 for (i = 0; i < array->size; i++) { 354 for (i = 0; i < array->size; i++) {
313 if (array->array[i].cookie == *desc->dir_cookie) { 355 if (array->array[i].cookie == *desc->dir_cookie) {
356 new_pos = desc->current_index + i;
357 if (new_pos < desc->file->f_pos) {
358 ctx->dup_cookie = *desc->dir_cookie;
359 ctx->duped = 1;
360 }
361 desc->file->f_pos = new_pos;
314 desc->cache_entry_index = i; 362 desc->cache_entry_index = i;
315 return 0; 363 return 0;
316 } 364 }
@@ -342,6 +390,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
342 390
343 if (status == -EAGAIN) { 391 if (status == -EAGAIN) {
344 desc->last_cookie = array->last_cookie; 392 desc->last_cookie = array->last_cookie;
393 desc->current_index += array->size;
345 desc->page_index++; 394 desc->page_index++;
346 } 395 }
347 nfs_readdir_release_array(desc->page); 396 nfs_readdir_release_array(desc->page);
@@ -354,7 +403,8 @@ static
354int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, 403int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
355 struct nfs_entry *entry, struct file *file, struct inode *inode) 404 struct nfs_entry *entry, struct file *file, struct inode *inode)
356{ 405{
357 struct rpc_cred *cred = nfs_file_cred(file); 406 struct nfs_open_dir_context *ctx = file->private_data;
407 struct rpc_cred *cred = ctx->cred;
358 unsigned long timestamp, gencount; 408 unsigned long timestamp, gencount;
359 int error; 409 int error;
360 410
@@ -693,6 +743,20 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
693 int i = 0; 743 int i = 0;
694 int res = 0; 744 int res = 0;
695 struct nfs_cache_array *array = NULL; 745 struct nfs_cache_array *array = NULL;
746 struct nfs_open_dir_context *ctx = file->private_data;
747
748 if (ctx->duped != 0 && ctx->dup_cookie == *desc->dir_cookie) {
749 if (printk_ratelimit()) {
750 pr_notice("NFS: directory %s/%s contains a readdir loop. "
751 "Please contact your server vendor. "
752 "Offending cookie: %llu\n",
753 file->f_dentry->d_parent->d_name.name,
754 file->f_dentry->d_name.name,
755 *desc->dir_cookie);
756 }
757 res = -ELOOP;
758 goto out;
759 }
696 760
697 array = nfs_readdir_get_array(desc->page); 761 array = nfs_readdir_get_array(desc->page);
698 if (IS_ERR(array)) { 762 if (IS_ERR(array)) {
@@ -785,6 +849,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
785 struct inode *inode = dentry->d_inode; 849 struct inode *inode = dentry->d_inode;
786 nfs_readdir_descriptor_t my_desc, 850 nfs_readdir_descriptor_t my_desc,
787 *desc = &my_desc; 851 *desc = &my_desc;
852 struct nfs_open_dir_context *dir_ctx = filp->private_data;
788 int res; 853 int res;
789 854
790 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 855 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
@@ -801,7 +866,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
801 memset(desc, 0, sizeof(*desc)); 866 memset(desc, 0, sizeof(*desc));
802 867
803 desc->file = filp; 868 desc->file = filp;
804 desc->dir_cookie = &nfs_file_open_context(filp)->dir_cookie; 869 desc->dir_cookie = &dir_ctx->dir_cookie;
805 desc->decode = NFS_PROTO(inode)->decode_dirent; 870 desc->decode = NFS_PROTO(inode)->decode_dirent;
806 desc->plus = NFS_USE_READDIRPLUS(inode); 871 desc->plus = NFS_USE_READDIRPLUS(inode);
807 872
@@ -853,6 +918,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
853{ 918{
854 struct dentry *dentry = filp->f_path.dentry; 919 struct dentry *dentry = filp->f_path.dentry;
855 struct inode *inode = dentry->d_inode; 920 struct inode *inode = dentry->d_inode;
921 struct nfs_open_dir_context *dir_ctx = filp->private_data;
856 922
857 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", 923 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n",
858 dentry->d_parent->d_name.name, 924 dentry->d_parent->d_name.name,
@@ -872,7 +938,8 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
872 } 938 }
873 if (offset != filp->f_pos) { 939 if (offset != filp->f_pos) {
874 filp->f_pos = offset; 940 filp->f_pos = offset;
875 nfs_file_open_context(filp)->dir_cookie = 0; 941 dir_ctx->dir_cookie = 0;
942 dir_ctx->duped = 0;
876 } 943 }
877out: 944out:
878 mutex_unlock(&inode->i_mutex); 945 mutex_unlock(&inode->i_mutex);
@@ -1068,7 +1135,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
1068 if (fhandle == NULL || fattr == NULL) 1135 if (fhandle == NULL || fattr == NULL)
1069 goto out_error; 1136 goto out_error;
1070 1137
1071 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1138 error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
1072 if (error) 1139 if (error)
1073 goto out_bad; 1140 goto out_bad;
1074 if (nfs_compare_fh(NFS_FH(inode), fhandle)) 1141 if (nfs_compare_fh(NFS_FH(inode), fhandle))
@@ -1169,11 +1236,23 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
1169 iput(inode); 1236 iput(inode);
1170} 1237}
1171 1238
1239static void nfs_d_release(struct dentry *dentry)
1240{
1241 /* free cached devname value, if it survived that far */
1242 if (unlikely(dentry->d_fsdata)) {
1243 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
1244 WARN_ON(1);
1245 else
1246 kfree(dentry->d_fsdata);
1247 }
1248}
1249
1172const struct dentry_operations nfs_dentry_operations = { 1250const struct dentry_operations nfs_dentry_operations = {
1173 .d_revalidate = nfs_lookup_revalidate, 1251 .d_revalidate = nfs_lookup_revalidate,
1174 .d_delete = nfs_dentry_delete, 1252 .d_delete = nfs_dentry_delete,
1175 .d_iput = nfs_dentry_iput, 1253 .d_iput = nfs_dentry_iput,
1176 .d_automount = nfs_d_automount, 1254 .d_automount = nfs_d_automount,
1255 .d_release = nfs_d_release,
1177}; 1256};
1178 1257
1179static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 1258static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -1212,7 +1291,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
1212 parent = dentry->d_parent; 1291 parent = dentry->d_parent;
1213 /* Protect against concurrent sillydeletes */ 1292 /* Protect against concurrent sillydeletes */
1214 nfs_block_sillyrename(parent); 1293 nfs_block_sillyrename(parent);
1215 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1294 error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
1216 if (error == -ENOENT) 1295 if (error == -ENOENT)
1217 goto no_entry; 1296 goto no_entry;
1218 if (error < 0) { 1297 if (error < 0) {
@@ -1248,6 +1327,7 @@ const struct dentry_operations nfs4_dentry_operations = {
1248 .d_delete = nfs_dentry_delete, 1327 .d_delete = nfs_dentry_delete,
1249 .d_iput = nfs_dentry_iput, 1328 .d_iput = nfs_dentry_iput,
1250 .d_automount = nfs_d_automount, 1329 .d_automount = nfs_d_automount,
1330 .d_release = nfs_d_release,
1251}; 1331};
1252 1332
1253/* 1333/*
@@ -1549,7 +1629,7 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
1549 if (dentry->d_inode) 1629 if (dentry->d_inode)
1550 goto out; 1630 goto out;
1551 if (fhandle->size == 0) { 1631 if (fhandle->size == 0) {
1552 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1632 error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
1553 if (error) 1633 if (error)
1554 goto out_error; 1634 goto out_error;
1555 } 1635 }
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 9943a75bb6d1..8eea25366717 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -45,6 +45,7 @@
45#include <linux/pagemap.h> 45#include <linux/pagemap.h>
46#include <linux/kref.h> 46#include <linux/kref.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/task_io_accounting_ops.h>
48 49
49#include <linux/nfs_fs.h> 50#include <linux/nfs_fs.h>
50#include <linux/nfs_page.h> 51#include <linux/nfs_page.h>
@@ -649,8 +650,7 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
649{ 650{
650 struct nfs_write_data *data = calldata; 651 struct nfs_write_data *data = calldata;
651 652
652 if (nfs_writeback_done(task, data) != 0) 653 nfs_writeback_done(task, data);
653 return;
654} 654}
655 655
656/* 656/*
@@ -938,6 +938,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
938 if (retval) 938 if (retval)
939 goto out; 939 goto out;
940 940
941 task_io_account_read(count);
942
941 retval = nfs_direct_read(iocb, iov, nr_segs, pos); 943 retval = nfs_direct_read(iocb, iov, nr_segs, pos);
942 if (retval > 0) 944 if (retval > 0)
943 iocb->ki_pos = pos + retval; 945 iocb->ki_pos = pos + retval;
@@ -999,6 +1001,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
999 if (retval) 1001 if (retval)
1000 goto out; 1002 goto out;
1001 1003
1004 task_io_account_write(count);
1005
1002 retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); 1006 retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
1003 1007
1004 if (retval > 0) 1008 if (retval > 0)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 7bf029ef4084..2f093ed16980 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -301,7 +301,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
301 * disk, but it retrieves and clears ctx->error after synching, despite 301 * disk, but it retrieves and clears ctx->error after synching, despite
302 * the two being set at the same time in nfs_context_set_write_error(). 302 * the two being set at the same time in nfs_context_set_write_error().
303 * This is because the former is used to notify the _next_ call to 303 * This is because the former is used to notify the _next_ call to
304 * nfs_file_write() that a write error occured, and hence cause it to 304 * nfs_file_write() that a write error occurred, and hence cause it to
305 * fall back to doing a synchronous write. 305 * fall back to doing a synchronous write.
306 */ 306 */
307static int 307static int
@@ -326,6 +326,9 @@ nfs_file_fsync(struct file *file, int datasync)
326 ret = xchg(&ctx->error, 0); 326 ret = xchg(&ctx->error, 0);
327 if (!ret && status < 0) 327 if (!ret && status < 0)
328 ret = status; 328 ret = status;
329 if (!ret && !datasync)
330 /* application has asked for meta-data sync */
331 ret = pnfs_layoutcommit_inode(inode, true);
329 return ret; 332 return ret;
330} 333}
331 334
@@ -387,10 +390,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
387 file->f_path.dentry->d_name.name, 390 file->f_path.dentry->d_name.name,
388 mapping->host->i_ino, len, (long long) pos); 391 mapping->host->i_ino, len, (long long) pos);
389 392
390 pnfs_update_layout(mapping->host,
391 nfs_file_open_context(file),
392 IOMODE_RW);
393
394start: 393start:
395 /* 394 /*
396 * Prevent starvation issues if someone is doing a consistency 395 * Prevent starvation issues if someone is doing a consistency
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b5ffe8fa291f..dcb61548887f 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -75,18 +75,25 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
75/* 75/*
76 * get an NFS2/NFS3 root dentry from the root filehandle 76 * get an NFS2/NFS3 root dentry from the root filehandle
77 */ 77 */
78struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh) 78struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
79 const char *devname)
79{ 80{
80 struct nfs_server *server = NFS_SB(sb); 81 struct nfs_server *server = NFS_SB(sb);
81 struct nfs_fsinfo fsinfo; 82 struct nfs_fsinfo fsinfo;
82 struct dentry *ret; 83 struct dentry *ret;
83 struct inode *inode; 84 struct inode *inode;
85 void *name = kstrdup(devname, GFP_KERNEL);
84 int error; 86 int error;
85 87
88 if (!name)
89 return ERR_PTR(-ENOMEM);
90
86 /* get the actual root for this mount */ 91 /* get the actual root for this mount */
87 fsinfo.fattr = nfs_alloc_fattr(); 92 fsinfo.fattr = nfs_alloc_fattr();
88 if (fsinfo.fattr == NULL) 93 if (fsinfo.fattr == NULL) {
94 kfree(name);
89 return ERR_PTR(-ENOMEM); 95 return ERR_PTR(-ENOMEM);
96 }
90 97
91 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); 98 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
92 if (error < 0) { 99 if (error < 0) {
@@ -119,7 +126,15 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
119 } 126 }
120 127
121 security_d_instantiate(ret, inode); 128 security_d_instantiate(ret, inode);
129 spin_lock(&ret->d_lock);
130 if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
131 ret->d_fsdata = name;
132 name = NULL;
133 }
134 spin_unlock(&ret->d_lock);
122out: 135out:
136 if (name)
137 kfree(name);
123 nfs_free_fattr(fsinfo.fattr); 138 nfs_free_fattr(fsinfo.fattr);
124 return ret; 139 return ret;
125} 140}
@@ -169,27 +184,35 @@ out:
169/* 184/*
170 * get an NFS4 root dentry from the root filehandle 185 * get an NFS4 root dentry from the root filehandle
171 */ 186 */
172struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) 187struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh,
188 const char *devname)
173{ 189{
174 struct nfs_server *server = NFS_SB(sb); 190 struct nfs_server *server = NFS_SB(sb);
175 struct nfs_fattr *fattr = NULL; 191 struct nfs_fattr *fattr = NULL;
176 struct dentry *ret; 192 struct dentry *ret;
177 struct inode *inode; 193 struct inode *inode;
194 void *name = kstrdup(devname, GFP_KERNEL);
178 int error; 195 int error;
179 196
180 dprintk("--> nfs4_get_root()\n"); 197 dprintk("--> nfs4_get_root()\n");
181 198
199 if (!name)
200 return ERR_PTR(-ENOMEM);
201
182 /* get the info about the server and filesystem */ 202 /* get the info about the server and filesystem */
183 error = nfs4_server_capabilities(server, mntfh); 203 error = nfs4_server_capabilities(server, mntfh);
184 if (error < 0) { 204 if (error < 0) {
185 dprintk("nfs_get_root: getcaps error = %d\n", 205 dprintk("nfs_get_root: getcaps error = %d\n",
186 -error); 206 -error);
207 kfree(name);
187 return ERR_PTR(error); 208 return ERR_PTR(error);
188 } 209 }
189 210
190 fattr = nfs_alloc_fattr(); 211 fattr = nfs_alloc_fattr();
191 if (fattr == NULL) 212 if (fattr == NULL) {
192 return ERR_PTR(-ENOMEM);; 213 kfree(name);
214 return ERR_PTR(-ENOMEM);
215 }
193 216
194 /* get the actual root for this mount */ 217 /* get the actual root for this mount */
195 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); 218 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
@@ -199,6 +222,10 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
199 goto out; 222 goto out;
200 } 223 }
201 224
225 if (fattr->valid & NFS_ATTR_FATTR_FSID &&
226 !nfs_fsid_equal(&server->fsid, &fattr->fsid))
227 memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
228
202 inode = nfs_fhget(sb, mntfh, fattr); 229 inode = nfs_fhget(sb, mntfh, fattr);
203 if (IS_ERR(inode)) { 230 if (IS_ERR(inode)) {
204 dprintk("nfs_get_root: get root inode failed\n"); 231 dprintk("nfs_get_root: get root inode failed\n");
@@ -223,8 +250,15 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
223 } 250 }
224 251
225 security_d_instantiate(ret, inode); 252 security_d_instantiate(ret, inode);
226 253 spin_lock(&ret->d_lock);
254 if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
255 ret->d_fsdata = name;
256 name = NULL;
257 }
258 spin_unlock(&ret->d_lock);
227out: 259out:
260 if (name)
261 kfree(name);
228 nfs_free_fattr(fattr); 262 nfs_free_fattr(fattr);
229 dprintk("<-- nfs4_get_root()\n"); 263 dprintk("<-- nfs4_get_root()\n");
230 return ret; 264 return ret;
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 18696882f1c6..79664a1025af 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -33,16 +33,41 @@
33 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 */ 35 */
36#include <linux/types.h>
37#include <linux/string.h>
38#include <linux/kernel.h>
39
40static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
41{
42 unsigned long val;
43 char buf[16];
44
45 if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf))
46 return 0;
47 memcpy(buf, name, namelen);
48 buf[namelen] = '\0';
49 if (strict_strtoul(buf, 0, &val) != 0)
50 return 0;
51 *res = val;
52 return 1;
53}
54
55static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
56{
57 return snprintf(buf, buflen, "%u", id);
58}
36 59
37#ifdef CONFIG_NFS_USE_NEW_IDMAPPER 60#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
38 61
39#include <linux/slab.h> 62#include <linux/slab.h>
40#include <linux/cred.h> 63#include <linux/cred.h>
64#include <linux/sunrpc/sched.h>
65#include <linux/nfs4.h>
66#include <linux/nfs_fs_sb.h>
41#include <linux/nfs_idmap.h> 67#include <linux/nfs_idmap.h>
42#include <linux/keyctl.h> 68#include <linux/keyctl.h>
43#include <linux/key-type.h> 69#include <linux/key-type.h>
44#include <linux/rcupdate.h> 70#include <linux/rcupdate.h>
45#include <linux/kernel.h>
46#include <linux/err.h> 71#include <linux/err.h>
47 72
48#include <keys/user-type.h> 73#include <keys/user-type.h>
@@ -219,23 +244,39 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen,
219 return ret; 244 return ret;
220} 245}
221 246
222int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 247int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
223{ 248{
249 if (nfs_map_string_to_numeric(name, namelen, uid))
250 return 0;
224 return nfs_idmap_lookup_id(name, namelen, "uid", uid); 251 return nfs_idmap_lookup_id(name, namelen, "uid", uid);
225} 252}
226 253
227int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid) 254int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
228{ 255{
256 if (nfs_map_string_to_numeric(name, namelen, gid))
257 return 0;
229 return nfs_idmap_lookup_id(name, namelen, "gid", gid); 258 return nfs_idmap_lookup_id(name, namelen, "gid", gid);
230} 259}
231 260
232int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 261int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
233{ 262{
234 return nfs_idmap_lookup_name(uid, "user", buf, buflen); 263 int ret = -EINVAL;
264
265 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
266 ret = nfs_idmap_lookup_name(uid, "user", buf, buflen);
267 if (ret < 0)
268 ret = nfs_map_numeric_to_string(uid, buf, buflen);
269 return ret;
235} 270}
236int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen) 271int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
237{ 272{
238 return nfs_idmap_lookup_name(gid, "group", buf, buflen); 273 int ret = -EINVAL;
274
275 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
276 ret = nfs_idmap_lookup_name(gid, "group", buf, buflen);
277 if (ret < 0)
278 ret = nfs_map_numeric_to_string(gid, buf, buflen);
279 return ret;
239} 280}
240 281
241#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */ 282#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
@@ -243,7 +284,6 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu
243#include <linux/module.h> 284#include <linux/module.h>
244#include <linux/mutex.h> 285#include <linux/mutex.h>
245#include <linux/init.h> 286#include <linux/init.h>
246#include <linux/types.h>
247#include <linux/slab.h> 287#include <linux/slab.h>
248#include <linux/socket.h> 288#include <linux/socket.h>
249#include <linux/in.h> 289#include <linux/in.h>
@@ -695,31 +735,45 @@ static unsigned int fnvhash32(const void *buf, size_t buflen)
695 return hash; 735 return hash;
696} 736}
697 737
698int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 738int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
699{ 739{
700 struct idmap *idmap = clp->cl_idmap; 740 struct idmap *idmap = server->nfs_client->cl_idmap;
701 741
742 if (nfs_map_string_to_numeric(name, namelen, uid))
743 return 0;
702 return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); 744 return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid);
703} 745}
704 746
705int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 747int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
706{ 748{
707 struct idmap *idmap = clp->cl_idmap; 749 struct idmap *idmap = server->nfs_client->cl_idmap;
708 750
751 if (nfs_map_string_to_numeric(name, namelen, uid))
752 return 0;
709 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); 753 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
710} 754}
711 755
712int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 756int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
713{ 757{
714 struct idmap *idmap = clp->cl_idmap; 758 struct idmap *idmap = server->nfs_client->cl_idmap;
759 int ret = -EINVAL;
715 760
716 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); 761 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
762 ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
763 if (ret < 0)
764 ret = nfs_map_numeric_to_string(uid, buf, buflen);
765 return ret;
717} 766}
718int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 767int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
719{ 768{
720 struct idmap *idmap = clp->cl_idmap; 769 struct idmap *idmap = server->nfs_client->cl_idmap;
770 int ret = -EINVAL;
721 771
722 return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); 772 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
773 ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
774 if (ret < 0)
775 ret = nfs_map_numeric_to_string(uid, buf, buflen);
776 return ret;
723} 777}
724 778
725#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ 779#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1cc600e77bb4..57bb31ad7a5e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -37,6 +37,7 @@
37#include <linux/inet.h> 37#include <linux/inet.h>
38#include <linux/nfs_xdr.h> 38#include <linux/nfs_xdr.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/compat.h>
40 41
41#include <asm/system.h> 42#include <asm/system.h>
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
@@ -89,7 +90,11 @@ int nfs_wait_bit_killable(void *word)
89 */ 90 */
90u64 nfs_compat_user_ino64(u64 fileid) 91u64 nfs_compat_user_ino64(u64 fileid)
91{ 92{
92 int ino; 93#ifdef CONFIG_COMPAT
94 compat_ulong_t ino;
95#else
96 unsigned long ino;
97#endif
93 98
94 if (enable_ino64) 99 if (enable_ino64)
95 return fileid; 100 return fileid;
@@ -249,7 +254,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
249 struct inode *inode = ERR_PTR(-ENOENT); 254 struct inode *inode = ERR_PTR(-ENOENT);
250 unsigned long hash; 255 unsigned long hash;
251 256
252 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) 257 nfs_attr_check_mountpoint(sb, fattr);
258
259 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0 && (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0)
253 goto out_no_inode; 260 goto out_no_inode;
254 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) 261 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
255 goto out_no_inode; 262 goto out_no_inode;
@@ -293,8 +300,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
293 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) 300 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
294 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 301 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
295 /* Deal with crossing mountpoints */ 302 /* Deal with crossing mountpoints */
296 if ((fattr->valid & NFS_ATTR_FATTR_FSID) 303 if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT ||
297 && !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { 304 fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
298 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) 305 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
299 inode->i_op = &nfs_referral_inode_operations; 306 inode->i_op = &nfs_referral_inode_operations;
300 else 307 else
@@ -634,7 +641,6 @@ struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cr
634 ctx->mode = f_mode; 641 ctx->mode = f_mode;
635 ctx->flags = 0; 642 ctx->flags = 0;
636 ctx->error = 0; 643 ctx->error = 0;
637 ctx->dir_cookie = 0;
638 nfs_init_lock_context(&ctx->lock_context); 644 nfs_init_lock_context(&ctx->lock_context);
639 ctx->lock_context.open_context = ctx; 645 ctx->lock_context.open_context = ctx;
640 INIT_LIST_HEAD(&ctx->list); 646 INIT_LIST_HEAD(&ctx->list);
@@ -1466,6 +1472,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
1466 nfsi->delegation_state = 0; 1472 nfsi->delegation_state = 0;
1467 init_rwsem(&nfsi->rwsem); 1473 init_rwsem(&nfsi->rwsem);
1468 nfsi->layout = NULL; 1474 nfsi->layout = NULL;
1475 atomic_set(&nfsi->commits_outstanding, 0);
1469#endif 1476#endif
1470} 1477}
1471 1478
@@ -1513,7 +1520,7 @@ static int nfsiod_start(void)
1513{ 1520{
1514 struct workqueue_struct *wq; 1521 struct workqueue_struct *wq;
1515 dprintk("RPC: creating workqueue nfsiod\n"); 1522 dprintk("RPC: creating workqueue nfsiod\n");
1516 wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0); 1523 wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0);
1517 if (wq == NULL) 1524 if (wq == NULL)
1518 return -ENOMEM; 1525 return -ENOMEM;
1519 nfsiod_workqueue = wq; 1526 nfsiod_workqueue = wq;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index cf9fdbdabc67..ce118ce885dd 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -39,6 +39,12 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
39 return 0; 39 return 0;
40} 40}
41 41
42static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)
43{
44 if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid))
45 fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT;
46}
47
42struct nfs_clone_mount { 48struct nfs_clone_mount {
43 const struct super_block *sb; 49 const struct super_block *sb;
44 const struct dentry *dentry; 50 const struct dentry *dentry;
@@ -148,6 +154,9 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
148 struct nfs_fattr *); 154 struct nfs_fattr *);
149extern void nfs_mark_client_ready(struct nfs_client *clp, int state); 155extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
150extern int nfs4_check_client_ready(struct nfs_client *clp); 156extern int nfs4_check_client_ready(struct nfs_client *clp);
157extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
158 const struct sockaddr *ds_addr,
159 int ds_addrlen, int ds_proto);
151#ifdef CONFIG_PROC_FS 160#ifdef CONFIG_PROC_FS
152extern int __init nfs_fs_proc_init(void); 161extern int __init nfs_fs_proc_init(void);
153extern void nfs_fs_proc_exit(void); 162extern void nfs_fs_proc_exit(void);
@@ -163,10 +172,10 @@ static inline void nfs_fs_proc_exit(void)
163 172
164/* nfs4namespace.c */ 173/* nfs4namespace.c */
165#ifdef CONFIG_NFS_V4 174#ifdef CONFIG_NFS_V4
166extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry); 175extern struct vfsmount *nfs_do_refmount(struct dentry *dentry);
167#else 176#else
168static inline 177static inline
169struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) 178struct vfsmount *nfs_do_refmount(struct dentry *dentry)
170{ 179{
171 return ERR_PTR(-ENOENT); 180 return ERR_PTR(-ENOENT);
172} 181}
@@ -211,10 +220,17 @@ extern const u32 nfs41_maxwrite_overhead;
211/* nfs4proc.c */ 220/* nfs4proc.c */
212#ifdef CONFIG_NFS_V4 221#ifdef CONFIG_NFS_V4
213extern struct rpc_procinfo nfs4_procedures[]; 222extern struct rpc_procinfo nfs4_procedures[];
223void nfs_fixup_secinfo_attributes(struct nfs_fattr *, struct nfs_fh *);
214#endif 224#endif
215 225
226extern int nfs4_init_ds_session(struct nfs_client *clp);
227
216/* proc.c */ 228/* proc.c */
217void nfs_close_context(struct nfs_open_context *ctx, int is_sync); 229void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
230extern int nfs_init_client(struct nfs_client *clp,
231 const struct rpc_timeout *timeparms,
232 const char *ip_addr, rpc_authflavor_t authflavour,
233 int noresvport);
218 234
219/* dir.c */ 235/* dir.c */
220extern int nfs_access_cache_shrinker(struct shrinker *shrink, 236extern int nfs_access_cache_shrinker(struct shrinker *shrink,
@@ -247,25 +263,45 @@ extern void nfs_sb_active(struct super_block *sb);
247extern void nfs_sb_deactive(struct super_block *sb); 263extern void nfs_sb_deactive(struct super_block *sb);
248 264
249/* namespace.c */ 265/* namespace.c */
250extern char *nfs_path(const char *base, 266extern char *nfs_path(char **p, struct dentry *dentry,
251 const struct dentry *droot,
252 const struct dentry *dentry,
253 char *buffer, ssize_t buflen); 267 char *buffer, ssize_t buflen);
254extern struct vfsmount *nfs_d_automount(struct path *path); 268extern struct vfsmount *nfs_d_automount(struct path *path);
255 269
256/* getroot.c */ 270/* getroot.c */
257extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); 271extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *,
272 const char *);
258#ifdef CONFIG_NFS_V4 273#ifdef CONFIG_NFS_V4
259extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *); 274extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *,
275 const char *);
260 276
261extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); 277extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
262#endif 278#endif
263 279
264/* read.c */ 280/* read.c */
281extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
282 const struct rpc_call_ops *call_ops);
265extern void nfs_read_prepare(struct rpc_task *task, void *calldata); 283extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
266 284
267/* write.c */ 285/* write.c */
286extern void nfs_commit_free(struct nfs_write_data *p);
287extern int nfs_initiate_write(struct nfs_write_data *data,
288 struct rpc_clnt *clnt,
289 const struct rpc_call_ops *call_ops,
290 int how);
268extern void nfs_write_prepare(struct rpc_task *task, void *calldata); 291extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
292extern int nfs_initiate_commit(struct nfs_write_data *data,
293 struct rpc_clnt *clnt,
294 const struct rpc_call_ops *call_ops,
295 int how);
296extern void nfs_init_commit(struct nfs_write_data *data,
297 struct list_head *head,
298 struct pnfs_layout_segment *lseg);
299void nfs_retry_commit(struct list_head *page_list,
300 struct pnfs_layout_segment *lseg);
301void nfs_commit_clear_lock(struct nfs_inode *nfsi);
302void nfs_commitdata_release(void *data);
303void nfs_commit_release_pages(struct nfs_write_data *data);
304
269#ifdef CONFIG_MIGRATION 305#ifdef CONFIG_MIGRATION
270extern int nfs_migrate_page(struct address_space *, 306extern int nfs_migrate_page(struct address_space *,
271 struct page *, struct page *); 307 struct page *, struct page *);
@@ -274,12 +310,21 @@ extern int nfs_migrate_page(struct address_space *,
274#endif 310#endif
275 311
276/* nfs4proc.c */ 312/* nfs4proc.c */
277extern int _nfs4_call_sync(struct nfs_server *server, 313extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
314extern int nfs4_init_client(struct nfs_client *clp,
315 const struct rpc_timeout *timeparms,
316 const char *ip_addr,
317 rpc_authflavor_t authflavour,
318 int noresvport);
319extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data);
320extern int _nfs4_call_sync(struct rpc_clnt *clnt,
321 struct nfs_server *server,
278 struct rpc_message *msg, 322 struct rpc_message *msg,
279 struct nfs4_sequence_args *args, 323 struct nfs4_sequence_args *args,
280 struct nfs4_sequence_res *res, 324 struct nfs4_sequence_res *res,
281 int cache_reply); 325 int cache_reply);
282extern int _nfs4_call_sync_session(struct nfs_server *server, 326extern int _nfs4_call_sync_session(struct rpc_clnt *clnt,
327 struct nfs_server *server,
283 struct rpc_message *msg, 328 struct rpc_message *msg,
284 struct nfs4_sequence_args *args, 329 struct nfs4_sequence_args *args,
285 struct nfs4_sequence_res *res, 330 struct nfs4_sequence_res *res,
@@ -288,12 +333,11 @@ extern int _nfs4_call_sync_session(struct nfs_server *server,
288/* 333/*
289 * Determine the device name as a string 334 * Determine the device name as a string
290 */ 335 */
291static inline char *nfs_devname(const struct vfsmount *mnt_parent, 336static inline char *nfs_devname(struct dentry *dentry,
292 const struct dentry *dentry,
293 char *buffer, ssize_t buflen) 337 char *buffer, ssize_t buflen)
294{ 338{
295 return nfs_path(mnt_parent->mnt_devname, mnt_parent->mnt_root, 339 char *dummy;
296 dentry, buffer, buflen); 340 return nfs_path(&dummy, dentry, buffer, buflen);
297} 341}
298 342
299/* 343/*
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index f32b8603dca8..89fc160fd5b0 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -15,6 +15,7 @@
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
17#include <linux/vfs.h> 17#include <linux/vfs.h>
18#include <linux/sunrpc/gss_api.h>
18#include "internal.h" 19#include "internal.h"
19 20
20#define NFSDBG_FACILITY NFSDBG_VFS 21#define NFSDBG_FACILITY NFSDBG_VFS
@@ -25,33 +26,31 @@ static LIST_HEAD(nfs_automount_list);
25static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts); 26static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts);
26int nfs_mountpoint_expiry_timeout = 500 * HZ; 27int nfs_mountpoint_expiry_timeout = 500 * HZ;
27 28
28static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, 29static struct vfsmount *nfs_do_submount(struct dentry *dentry,
29 const struct dentry *dentry,
30 struct nfs_fh *fh, 30 struct nfs_fh *fh,
31 struct nfs_fattr *fattr); 31 struct nfs_fattr *fattr,
32 rpc_authflavor_t authflavor);
32 33
33/* 34/*
34 * nfs_path - reconstruct the path given an arbitrary dentry 35 * nfs_path - reconstruct the path given an arbitrary dentry
35 * @base - arbitrary string to prepend to the path 36 * @base - used to return pointer to the end of devname part of path
36 * @droot - pointer to root dentry for mountpoint
37 * @dentry - pointer to dentry 37 * @dentry - pointer to dentry
38 * @buffer - result buffer 38 * @buffer - result buffer
39 * @buflen - length of buffer 39 * @buflen - length of buffer
40 * 40 *
41 * Helper function for constructing the path from the 41 * Helper function for constructing the server pathname
42 * root dentry to an arbitrary hashed dentry. 42 * by arbitrary hashed dentry.
43 * 43 *
44 * This is mainly for use in figuring out the path on the 44 * This is mainly for use in figuring out the path on the
45 * server side when automounting on top of an existing partition. 45 * server side when automounting on top of an existing partition
46 * and in generating /proc/mounts and friends.
46 */ 47 */
47char *nfs_path(const char *base, 48char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen)
48 const struct dentry *droot,
49 const struct dentry *dentry,
50 char *buffer, ssize_t buflen)
51{ 49{
52 char *end; 50 char *end;
53 int namelen; 51 int namelen;
54 unsigned seq; 52 unsigned seq;
53 const char *base;
55 54
56rename_retry: 55rename_retry:
57 end = buffer+buflen; 56 end = buffer+buflen;
@@ -60,7 +59,10 @@ rename_retry:
60 59
61 seq = read_seqbegin(&rename_lock); 60 seq = read_seqbegin(&rename_lock);
62 rcu_read_lock(); 61 rcu_read_lock();
63 while (!IS_ROOT(dentry) && dentry != droot) { 62 while (1) {
63 spin_lock(&dentry->d_lock);
64 if (IS_ROOT(dentry))
65 break;
64 namelen = dentry->d_name.len; 66 namelen = dentry->d_name.len;
65 buflen -= namelen + 1; 67 buflen -= namelen + 1;
66 if (buflen < 0) 68 if (buflen < 0)
@@ -68,27 +70,47 @@ rename_retry:
68 end -= namelen; 70 end -= namelen;
69 memcpy(end, dentry->d_name.name, namelen); 71 memcpy(end, dentry->d_name.name, namelen);
70 *--end = '/'; 72 *--end = '/';
73 spin_unlock(&dentry->d_lock);
71 dentry = dentry->d_parent; 74 dentry = dentry->d_parent;
72 } 75 }
73 rcu_read_unlock(); 76 if (read_seqretry(&rename_lock, seq)) {
74 if (read_seqretry(&rename_lock, seq)) 77 spin_unlock(&dentry->d_lock);
78 rcu_read_unlock();
75 goto rename_retry; 79 goto rename_retry;
80 }
76 if (*end != '/') { 81 if (*end != '/') {
77 if (--buflen < 0) 82 if (--buflen < 0) {
83 spin_unlock(&dentry->d_lock);
84 rcu_read_unlock();
78 goto Elong; 85 goto Elong;
86 }
79 *--end = '/'; 87 *--end = '/';
80 } 88 }
89 *p = end;
90 base = dentry->d_fsdata;
91 if (!base) {
92 spin_unlock(&dentry->d_lock);
93 rcu_read_unlock();
94 WARN_ON(1);
95 return end;
96 }
81 namelen = strlen(base); 97 namelen = strlen(base);
82 /* Strip off excess slashes in base string */ 98 /* Strip off excess slashes in base string */
83 while (namelen > 0 && base[namelen - 1] == '/') 99 while (namelen > 0 && base[namelen - 1] == '/')
84 namelen--; 100 namelen--;
85 buflen -= namelen; 101 buflen -= namelen;
86 if (buflen < 0) 102 if (buflen < 0) {
103 spin_unlock(&dentry->d_lock);
104 rcu_read_unlock();
87 goto Elong; 105 goto Elong;
106 }
88 end -= namelen; 107 end -= namelen;
89 memcpy(end, base, namelen); 108 memcpy(end, base, namelen);
109 spin_unlock(&dentry->d_lock);
110 rcu_read_unlock();
90 return end; 111 return end;
91Elong_unlock: 112Elong_unlock:
113 spin_unlock(&dentry->d_lock);
92 rcu_read_unlock(); 114 rcu_read_unlock();
93 if (read_seqretry(&rename_lock, seq)) 115 if (read_seqretry(&rename_lock, seq))
94 goto rename_retry; 116 goto rename_retry;
@@ -96,6 +118,99 @@ Elong:
96 return ERR_PTR(-ENAMETOOLONG); 118 return ERR_PTR(-ENAMETOOLONG);
97} 119}
98 120
121#ifdef CONFIG_NFS_V4
122static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors, struct inode *inode)
123{
124 struct gss_api_mech *mech;
125 struct xdr_netobj oid;
126 int i;
127 rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX;
128
129 for (i = 0; i < flavors->num_flavors; i++) {
130 struct nfs4_secinfo_flavor *flavor;
131 flavor = &flavors->flavors[i];
132
133 if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) {
134 pseudoflavor = flavor->flavor;
135 break;
136 } else if (flavor->flavor == RPC_AUTH_GSS) {
137 oid.len = flavor->gss.sec_oid4.len;
138 oid.data = flavor->gss.sec_oid4.data;
139 mech = gss_mech_get_by_OID(&oid);
140 if (!mech)
141 continue;
142 pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service);
143 gss_mech_put(mech);
144 break;
145 }
146 }
147
148 return pseudoflavor;
149}
150
151static int nfs_negotiate_security(const struct dentry *parent,
152 const struct dentry *dentry,
153 rpc_authflavor_t *flavor)
154{
155 struct page *page;
156 struct nfs4_secinfo_flavors *flavors;
157 int (*secinfo)(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
158 int ret = -EPERM;
159
160 secinfo = NFS_PROTO(parent->d_inode)->secinfo;
161 if (secinfo != NULL) {
162 page = alloc_page(GFP_KERNEL);
163 if (!page) {
164 ret = -ENOMEM;
165 goto out;
166 }
167 flavors = page_address(page);
168 ret = secinfo(parent->d_inode, &dentry->d_name, flavors);
169 *flavor = nfs_find_best_sec(flavors, dentry->d_inode);
170 put_page(page);
171 }
172
173out:
174 return ret;
175}
176
177static int nfs_lookup_with_sec(struct nfs_server *server, struct dentry *parent,
178 struct dentry *dentry, struct path *path,
179 struct nfs_fh *fh, struct nfs_fattr *fattr,
180 rpc_authflavor_t *flavor)
181{
182 struct rpc_clnt *clone;
183 struct rpc_auth *auth;
184 int err;
185
186 err = nfs_negotiate_security(parent, path->dentry, flavor);
187 if (err < 0)
188 goto out;
189 clone = rpc_clone_client(server->client);
190 auth = rpcauth_create(*flavor, clone);
191 if (!auth) {
192 err = -EIO;
193 goto out_shutdown;
194 }
195 err = server->nfs_client->rpc_ops->lookup(clone, parent->d_inode,
196 &path->dentry->d_name,
197 fh, fattr);
198out_shutdown:
199 rpc_shutdown_client(clone);
200out:
201 return err;
202}
203#else /* CONFIG_NFS_V4 */
204static inline int nfs_lookup_with_sec(struct nfs_server *server,
205 struct dentry *parent, struct dentry *dentry,
206 struct path *path, struct nfs_fh *fh,
207 struct nfs_fattr *fattr,
208 rpc_authflavor_t *flavor)
209{
210 return -EPERM;
211}
212#endif /* CONFIG_NFS_V4 */
213
99/* 214/*
100 * nfs_d_automount - Handle crossing a mountpoint on the server 215 * nfs_d_automount - Handle crossing a mountpoint on the server
101 * @path - The mountpoint 216 * @path - The mountpoint
@@ -116,6 +231,7 @@ struct vfsmount *nfs_d_automount(struct path *path)
116 struct nfs_fh *fh = NULL; 231 struct nfs_fh *fh = NULL;
117 struct nfs_fattr *fattr = NULL; 232 struct nfs_fattr *fattr = NULL;
118 int err; 233 int err;
234 rpc_authflavor_t flavor = RPC_AUTH_UNIX;
119 235
120 dprintk("--> nfs_d_automount()\n"); 236 dprintk("--> nfs_d_automount()\n");
121 237
@@ -133,9 +249,11 @@ struct vfsmount *nfs_d_automount(struct path *path)
133 249
134 /* Look it up again to get its attributes */ 250 /* Look it up again to get its attributes */
135 parent = dget_parent(path->dentry); 251 parent = dget_parent(path->dentry);
136 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, 252 err = server->nfs_client->rpc_ops->lookup(server->client, parent->d_inode,
137 &path->dentry->d_name, 253 &path->dentry->d_name,
138 fh, fattr); 254 fh, fattr);
255 if (err == -EPERM && NFS_PROTO(parent->d_inode)->secinfo != NULL)
256 err = nfs_lookup_with_sec(server, parent, path->dentry, path, fh, fattr, &flavor);
139 dput(parent); 257 dput(parent);
140 if (err != 0) { 258 if (err != 0) {
141 mnt = ERR_PTR(err); 259 mnt = ERR_PTR(err);
@@ -143,9 +261,9 @@ struct vfsmount *nfs_d_automount(struct path *path)
143 } 261 }
144 262
145 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) 263 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
146 mnt = nfs_do_refmount(path->mnt, path->dentry); 264 mnt = nfs_do_refmount(path->dentry);
147 else 265 else
148 mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr); 266 mnt = nfs_do_submount(path->dentry, fh, fattr, flavor);
149 if (IS_ERR(mnt)) 267 if (IS_ERR(mnt))
150 goto out; 268 goto out;
151 269
@@ -209,22 +327,23 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
209 327
210/** 328/**
211 * nfs_do_submount - set up mountpoint when crossing a filesystem boundary 329 * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
212 * @mnt_parent - mountpoint of parent directory
213 * @dentry - parent directory 330 * @dentry - parent directory
214 * @fh - filehandle for new root dentry 331 * @fh - filehandle for new root dentry
215 * @fattr - attributes for new root inode 332 * @fattr - attributes for new root inode
333 * @authflavor - security flavor to use when performing the mount
216 * 334 *
217 */ 335 */
218static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, 336static struct vfsmount *nfs_do_submount(struct dentry *dentry,
219 const struct dentry *dentry,
220 struct nfs_fh *fh, 337 struct nfs_fh *fh,
221 struct nfs_fattr *fattr) 338 struct nfs_fattr *fattr,
339 rpc_authflavor_t authflavor)
222{ 340{
223 struct nfs_clone_mount mountdata = { 341 struct nfs_clone_mount mountdata = {
224 .sb = mnt_parent->mnt_sb, 342 .sb = dentry->d_sb,
225 .dentry = dentry, 343 .dentry = dentry,
226 .fh = fh, 344 .fh = fh,
227 .fattr = fattr, 345 .fattr = fattr,
346 .authflavor = authflavor,
228 }; 347 };
229 struct vfsmount *mnt = ERR_PTR(-ENOMEM); 348 struct vfsmount *mnt = ERR_PTR(-ENOMEM);
230 char *page = (char *) __get_free_page(GFP_USER); 349 char *page = (char *) __get_free_page(GFP_USER);
@@ -237,11 +356,11 @@ static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
237 dentry->d_name.name); 356 dentry->d_name.name);
238 if (page == NULL) 357 if (page == NULL)
239 goto out; 358 goto out;
240 devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE); 359 devname = nfs_devname(dentry, page, PAGE_SIZE);
241 mnt = (struct vfsmount *)devname; 360 mnt = (struct vfsmount *)devname;
242 if (IS_ERR(devname)) 361 if (IS_ERR(devname))
243 goto free_page; 362 goto free_page;
244 mnt = nfs_do_clone_mount(NFS_SB(mnt_parent->mnt_sb), devname, &mountdata); 363 mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata);
245free_page: 364free_page:
246 free_page((unsigned long)page); 365 free_page((unsigned long)page);
247out: 366out:
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ce939c062a52..38053d823eb0 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -141,7 +141,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
141} 141}
142 142
143static int 143static int
144nfs3_proc_lookup(struct inode *dir, struct qstr *name, 144nfs3_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name,
145 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 145 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
146{ 146{
147 struct nfs3_diropargs arg = { 147 struct nfs3_diropargs arg = {
@@ -885,4 +885,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
885 .lock = nfs3_proc_lock, 885 .lock = nfs3_proc_lock,
886 .clear_acl_cache = nfs3_forget_cached_acls, 886 .clear_acl_cache = nfs3_forget_cached_acls,
887 .close_context = nfs_close_context, 887 .close_context = nfs_close_context,
888 .init_client = nfs_init_client,
888}; 889};
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 7a7474073148..e1c261ddd65d 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -57,7 +57,8 @@ enum nfs4_session_state {
57struct nfs4_minor_version_ops { 57struct nfs4_minor_version_ops {
58 u32 minor_version; 58 u32 minor_version;
59 59
60 int (*call_sync)(struct nfs_server *server, 60 int (*call_sync)(struct rpc_clnt *clnt,
61 struct nfs_server *server,
61 struct rpc_message *msg, 62 struct rpc_message *msg,
62 struct nfs4_sequence_args *args, 63 struct nfs4_sequence_args *args,
63 struct nfs4_sequence_res *res, 64 struct nfs4_sequence_res *res,
@@ -252,6 +253,9 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
252extern int nfs4_setup_sequence(const struct nfs_server *server, 253extern int nfs4_setup_sequence(const struct nfs_server *server,
253 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 254 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
254 int cache_reply, struct rpc_task *task); 255 int cache_reply, struct rpc_task *task);
256extern int nfs41_setup_sequence(struct nfs4_session *session,
257 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
258 int cache_reply, struct rpc_task *task);
255extern void nfs4_destroy_session(struct nfs4_session *session); 259extern void nfs4_destroy_session(struct nfs4_session *session);
256extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); 260extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
257extern int nfs4_proc_create_session(struct nfs_client *); 261extern int nfs4_proc_create_session(struct nfs_client *);
@@ -259,6 +263,21 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *);
259extern int nfs4_init_session(struct nfs_server *server); 263extern int nfs4_init_session(struct nfs_server *server);
260extern int nfs4_proc_get_lease_time(struct nfs_client *clp, 264extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
261 struct nfs_fsinfo *fsinfo); 265 struct nfs_fsinfo *fsinfo);
266extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
267 bool sync);
268
269static inline bool
270is_ds_only_client(struct nfs_client *clp)
271{
272 return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) ==
273 EXCHGID4_FLAG_USE_PNFS_DS;
274}
275
276static inline bool
277is_ds_client(struct nfs_client *clp)
278{
279 return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS;
280}
262#else /* CONFIG_NFS_v4_1 */ 281#else /* CONFIG_NFS_v4_1 */
263static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) 282static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
264{ 283{
@@ -276,6 +295,18 @@ static inline int nfs4_init_session(struct nfs_server *server)
276{ 295{
277 return 0; 296 return 0;
278} 297}
298
299static inline bool
300is_ds_only_client(struct nfs_client *clp)
301{
302 return false;
303}
304
305static inline bool
306is_ds_client(struct nfs_client *clp)
307{
308 return false;
309}
279#endif /* CONFIG_NFS_V4_1 */ 310#endif /* CONFIG_NFS_V4_1 */
280 311
281extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; 312extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
@@ -298,6 +329,11 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
298#if defined(CONFIG_NFS_V4_1) 329#if defined(CONFIG_NFS_V4_1)
299struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); 330struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
300struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); 331struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
332extern void nfs4_schedule_session_recovery(struct nfs4_session *);
333#else
334static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
335{
336}
301#endif /* CONFIG_NFS_V4_1 */ 337#endif /* CONFIG_NFS_V4_1 */
302 338
303extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); 339extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
@@ -307,10 +343,9 @@ extern void nfs4_put_open_state(struct nfs4_state *);
307extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t); 343extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t);
308extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t); 344extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t);
309extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); 345extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
310extern void nfs4_schedule_state_recovery(struct nfs_client *); 346extern void nfs4_schedule_lease_recovery(struct nfs_client *);
311extern void nfs4_schedule_state_manager(struct nfs_client *); 347extern void nfs4_schedule_state_manager(struct nfs_client *);
312extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); 348extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
313extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state);
314extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); 349extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
315extern void nfs41_handle_recall_slot(struct nfs_client *clp); 350extern void nfs41_handle_recall_slot(struct nfs_client *clp);
316extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 351extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 23f930caf1e2..6f8192f4cfc7 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -40,32 +40,370 @@ MODULE_LICENSE("GPL");
40MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>"); 40MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
41MODULE_DESCRIPTION("The NFSv4 file layout driver"); 41MODULE_DESCRIPTION("The NFSv4 file layout driver");
42 42
43static int 43#define FILELAYOUT_POLL_RETRY_MAX (15*HZ)
44filelayout_set_layoutdriver(struct nfs_server *nfss) 44
45{ 45static loff_t
46 int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client, 46filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
47 nfs4_fl_free_deviceid_callback); 47 loff_t offset)
48 if (status) { 48{
49 printk(KERN_WARNING "%s: deviceid cache could not be " 49 u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
50 "initialized\n", __func__); 50 u64 tmp;
51 return status; 51
52 offset -= flseg->pattern_offset;
53 tmp = offset;
54 do_div(tmp, stripe_width);
55
56 return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit);
57}
58
59/* This function is used by the layout driver to calculate the
60 * offset of the file on the dserver based on whether the
61 * layout type is STRIPE_DENSE or STRIPE_SPARSE
62 */
63static loff_t
64filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
65{
66 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
67
68 switch (flseg->stripe_type) {
69 case STRIPE_SPARSE:
70 return offset;
71
72 case STRIPE_DENSE:
73 return filelayout_get_dense_offset(flseg, offset);
74 }
75
76 BUG();
77}
78
79/* For data server errors we don't recover from */
80static void
81filelayout_set_lo_fail(struct pnfs_layout_segment *lseg)
82{
83 if (lseg->pls_range.iomode == IOMODE_RW) {
84 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
85 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
86 } else {
87 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
88 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
89 }
90}
91
92static int filelayout_async_handle_error(struct rpc_task *task,
93 struct nfs4_state *state,
94 struct nfs_client *clp,
95 int *reset)
96{
97 if (task->tk_status >= 0)
98 return 0;
99
100 *reset = 0;
101
102 switch (task->tk_status) {
103 case -NFS4ERR_BADSESSION:
104 case -NFS4ERR_BADSLOT:
105 case -NFS4ERR_BAD_HIGH_SLOT:
106 case -NFS4ERR_DEADSESSION:
107 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
108 case -NFS4ERR_SEQ_FALSE_RETRY:
109 case -NFS4ERR_SEQ_MISORDERED:
110 dprintk("%s ERROR %d, Reset session. Exchangeid "
111 "flags 0x%x\n", __func__, task->tk_status,
112 clp->cl_exchange_flags);
113 nfs4_schedule_session_recovery(clp->cl_session);
114 break;
115 case -NFS4ERR_DELAY:
116 case -NFS4ERR_GRACE:
117 case -EKEYEXPIRED:
118 rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
119 break;
120 default:
121 dprintk("%s DS error. Retry through MDS %d\n", __func__,
122 task->tk_status);
123 *reset = 1;
124 break;
125 }
126 task->tk_status = 0;
127 return -EAGAIN;
128}
129
130/* NFS_PROTO call done callback routines */
131
132static int filelayout_read_done_cb(struct rpc_task *task,
133 struct nfs_read_data *data)
134{
135 struct nfs_client *clp = data->ds_clp;
136 int reset = 0;
137
138 dprintk("%s DS read\n", __func__);
139
140 if (filelayout_async_handle_error(task, data->args.context->state,
141 data->ds_clp, &reset) == -EAGAIN) {
142 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
143 __func__, data->ds_clp, data->ds_clp->cl_session);
144 if (reset) {
145 filelayout_set_lo_fail(data->lseg);
146 nfs4_reset_read(task, data);
147 clp = NFS_SERVER(data->inode)->nfs_client;
148 }
149 nfs_restart_rpc(task, clp);
150 return -EAGAIN;
52 } 151 }
53 dprintk("%s: deviceid cache has been initialized successfully\n", 152
54 __func__);
55 return 0; 153 return 0;
56} 154}
57 155
58/* Clear out the layout by destroying its device list */ 156/*
59static int 157 * We reference the rpc_cred of the first WRITE that triggers the need for
60filelayout_clear_layoutdriver(struct nfs_server *nfss) 158 * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
159 * rfc5661 is not clear about which credential should be used.
160 */
161static void
162filelayout_set_layoutcommit(struct nfs_write_data *wdata)
61{ 163{
62 dprintk("--> %s\n", __func__); 164 if (FILELAYOUT_LSEG(wdata->lseg)->commit_through_mds ||
165 wdata->res.verf->committed == NFS_FILE_SYNC)
166 return;
167
168 pnfs_set_layoutcommit(wdata);
169 dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino,
170 (unsigned long) wdata->lseg->pls_end_pos);
171}
172
173/*
174 * Call ops for the async read/write cases
175 * In the case of dense layouts, the offset needs to be reset to its
176 * original value.
177 */
178static void filelayout_read_prepare(struct rpc_task *task, void *data)
179{
180 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
181
182 rdata->read_done_cb = filelayout_read_done_cb;
183
184 if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
185 &rdata->args.seq_args, &rdata->res.seq_res,
186 0, task))
187 return;
188
189 rpc_call_start(task);
190}
191
192static void filelayout_read_call_done(struct rpc_task *task, void *data)
193{
194 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
195
196 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
197
198 /* Note this may cause RPC to be resent */
199 rdata->mds_ops->rpc_call_done(task, data);
200}
201
202static void filelayout_read_release(void *data)
203{
204 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
205
206 rdata->mds_ops->rpc_release(data);
207}
208
209static int filelayout_write_done_cb(struct rpc_task *task,
210 struct nfs_write_data *data)
211{
212 int reset = 0;
213
214 if (filelayout_async_handle_error(task, data->args.context->state,
215 data->ds_clp, &reset) == -EAGAIN) {
216 struct nfs_client *clp;
217
218 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
219 __func__, data->ds_clp, data->ds_clp->cl_session);
220 if (reset) {
221 filelayout_set_lo_fail(data->lseg);
222 nfs4_reset_write(task, data);
223 clp = NFS_SERVER(data->inode)->nfs_client;
224 } else
225 clp = data->ds_clp;
226 nfs_restart_rpc(task, clp);
227 return -EAGAIN;
228 }
63 229
64 if (nfss->nfs_client->cl_devid_cache) 230 filelayout_set_layoutcommit(data);
65 pnfs_put_deviceid_cache(nfss->nfs_client);
66 return 0; 231 return 0;
67} 232}
68 233
234/* Fake up some data that will cause nfs_commit_release to retry the writes. */
235static void prepare_to_resend_writes(struct nfs_write_data *data)
236{
237 struct nfs_page *first = nfs_list_entry(data->pages.next);
238
239 data->task.tk_status = 0;
240 memcpy(data->verf.verifier, first->wb_verf.verifier,
241 sizeof(first->wb_verf.verifier));
242 data->verf.verifier[0]++; /* ensure verifier mismatch */
243}
244
245static int filelayout_commit_done_cb(struct rpc_task *task,
246 struct nfs_write_data *data)
247{
248 int reset = 0;
249
250 if (filelayout_async_handle_error(task, data->args.context->state,
251 data->ds_clp, &reset) == -EAGAIN) {
252 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
253 __func__, data->ds_clp, data->ds_clp->cl_session);
254 if (reset) {
255 prepare_to_resend_writes(data);
256 filelayout_set_lo_fail(data->lseg);
257 } else
258 nfs_restart_rpc(task, data->ds_clp);
259 return -EAGAIN;
260 }
261
262 return 0;
263}
264
265static void filelayout_write_prepare(struct rpc_task *task, void *data)
266{
267 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
268
269 if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
270 &wdata->args.seq_args, &wdata->res.seq_res,
271 0, task))
272 return;
273
274 rpc_call_start(task);
275}
276
277static void filelayout_write_call_done(struct rpc_task *task, void *data)
278{
279 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
280
281 /* Note this may cause RPC to be resent */
282 wdata->mds_ops->rpc_call_done(task, data);
283}
284
285static void filelayout_write_release(void *data)
286{
287 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
288
289 wdata->mds_ops->rpc_release(data);
290}
291
292static void filelayout_commit_release(void *data)
293{
294 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
295
296 nfs_commit_release_pages(wdata);
297 if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding))
298 nfs_commit_clear_lock(NFS_I(wdata->inode));
299 nfs_commitdata_release(wdata);
300}
301
302struct rpc_call_ops filelayout_read_call_ops = {
303 .rpc_call_prepare = filelayout_read_prepare,
304 .rpc_call_done = filelayout_read_call_done,
305 .rpc_release = filelayout_read_release,
306};
307
308struct rpc_call_ops filelayout_write_call_ops = {
309 .rpc_call_prepare = filelayout_write_prepare,
310 .rpc_call_done = filelayout_write_call_done,
311 .rpc_release = filelayout_write_release,
312};
313
314struct rpc_call_ops filelayout_commit_call_ops = {
315 .rpc_call_prepare = filelayout_write_prepare,
316 .rpc_call_done = filelayout_write_call_done,
317 .rpc_release = filelayout_commit_release,
318};
319
320static enum pnfs_try_status
321filelayout_read_pagelist(struct nfs_read_data *data)
322{
323 struct pnfs_layout_segment *lseg = data->lseg;
324 struct nfs4_pnfs_ds *ds;
325 loff_t offset = data->args.offset;
326 u32 j, idx;
327 struct nfs_fh *fh;
328 int status;
329
330 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
331 __func__, data->inode->i_ino,
332 data->args.pgbase, (size_t)data->args.count, offset);
333
334 /* Retrieve the correct rpc_client for the byte range */
335 j = nfs4_fl_calc_j_index(lseg, offset);
336 idx = nfs4_fl_calc_ds_index(lseg, j);
337 ds = nfs4_fl_prepare_ds(lseg, idx);
338 if (!ds) {
339 /* Either layout fh index faulty, or ds connect failed */
340 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
341 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
342 return PNFS_NOT_ATTEMPTED;
343 }
344 dprintk("%s USE DS:ip %x %hu\n", __func__,
345 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
346
347 /* No multipath support. Use first DS */
348 data->ds_clp = ds->ds_clp;
349 fh = nfs4_fl_select_ds_fh(lseg, j);
350 if (fh)
351 data->args.fh = fh;
352
353 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
354 data->mds_offset = offset;
355
356 /* Perform an asynchronous read to ds */
357 status = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient,
358 &filelayout_read_call_ops);
359 BUG_ON(status != 0);
360 return PNFS_ATTEMPTED;
361}
362
363/* Perform async writes. */
364static enum pnfs_try_status
365filelayout_write_pagelist(struct nfs_write_data *data, int sync)
366{
367 struct pnfs_layout_segment *lseg = data->lseg;
368 struct nfs4_pnfs_ds *ds;
369 loff_t offset = data->args.offset;
370 u32 j, idx;
371 struct nfs_fh *fh;
372 int status;
373
374 /* Retrieve the correct rpc_client for the byte range */
375 j = nfs4_fl_calc_j_index(lseg, offset);
376 idx = nfs4_fl_calc_ds_index(lseg, j);
377 ds = nfs4_fl_prepare_ds(lseg, idx);
378 if (!ds) {
379 printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
380 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
381 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
382 return PNFS_NOT_ATTEMPTED;
383 }
384 dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__,
385 data->inode->i_ino, sync, (size_t) data->args.count, offset,
386 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
387
388 data->write_done_cb = filelayout_write_done_cb;
389 data->ds_clp = ds->ds_clp;
390 fh = nfs4_fl_select_ds_fh(lseg, j);
391 if (fh)
392 data->args.fh = fh;
393 /*
394 * Get the file offset on the dserver. Set the write offset to
395 * this offset and save the original offset.
396 */
397 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
398 data->mds_offset = offset;
399
400 /* Perform an asynchronous write */
401 status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient,
402 &filelayout_write_call_ops, sync);
403 BUG_ON(status != 0);
404 return PNFS_ATTEMPTED;
405}
406
69/* 407/*
70 * filelayout_check_layout() 408 * filelayout_check_layout()
71 * 409 *
@@ -92,14 +430,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
92 goto out; 430 goto out;
93 } 431 }
94 432
95 if (fl->stripe_unit % PAGE_SIZE) { 433 if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) {
96 dprintk("%s Stripe unit (%u) not page aligned\n", 434 dprintk("%s Invalid stripe unit (%u)\n",
97 __func__, fl->stripe_unit); 435 __func__, fl->stripe_unit);
98 goto out; 436 goto out;
99 } 437 }
100 438
101 /* find and reference the deviceid */ 439 /* find and reference the deviceid */
102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); 440 dsaddr = nfs4_fl_find_get_deviceid(id);
103 if (dsaddr == NULL) { 441 if (dsaddr == NULL) {
104 dsaddr = get_device_info(lo->plh_inode, id); 442 dsaddr = get_device_info(lo->plh_inode, id);
105 if (dsaddr == NULL) 443 if (dsaddr == NULL)
@@ -134,7 +472,7 @@ out:
134 dprintk("--> %s returns %d\n", __func__, status); 472 dprintk("--> %s returns %d\n", __func__, status);
135 return status; 473 return status;
136out_put: 474out_put:
137 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid); 475 nfs4_fl_put_deviceid(dsaddr);
138 goto out; 476 goto out;
139} 477}
140 478
@@ -164,12 +502,33 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
164 struct nfs4_layoutget_res *lgr, 502 struct nfs4_layoutget_res *lgr,
165 struct nfs4_deviceid *id) 503 struct nfs4_deviceid *id)
166{ 504{
167 uint32_t *p = (uint32_t *)lgr->layout.buf; 505 struct xdr_stream stream;
506 struct xdr_buf buf = {
507 .pages = lgr->layoutp->pages,
508 .page_len = lgr->layoutp->len,
509 .buflen = lgr->layoutp->len,
510 .len = lgr->layoutp->len,
511 };
512 struct page *scratch;
513 __be32 *p;
168 uint32_t nfl_util; 514 uint32_t nfl_util;
169 int i; 515 int i;
170 516
171 dprintk("%s: set_layout_map Begin\n", __func__); 517 dprintk("%s: set_layout_map Begin\n", __func__);
172 518
519 scratch = alloc_page(GFP_KERNEL);
520 if (!scratch)
521 return -ENOMEM;
522
523 xdr_init_decode(&stream, &buf, NULL);
524 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
525
526 /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
527 * num_fh (4) */
528 p = xdr_inline_decode(&stream, NFS4_DEVICEID4_SIZE + 20);
529 if (unlikely(!p))
530 goto out_err;
531
173 memcpy(id, p, sizeof(*id)); 532 memcpy(id, p, sizeof(*id));
174 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); 533 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
175 print_deviceid(id); 534 print_deviceid(id);
@@ -191,32 +550,57 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
191 __func__, nfl_util, fl->num_fh, fl->first_stripe_index, 550 __func__, nfl_util, fl->num_fh, fl->first_stripe_index,
192 fl->pattern_offset); 551 fl->pattern_offset);
193 552
553 if (!fl->num_fh)
554 goto out_err;
555
194 fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), 556 fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *),
195 GFP_KERNEL); 557 GFP_KERNEL);
196 if (!fl->fh_array) 558 if (!fl->fh_array)
197 return -ENOMEM; 559 goto out_err;
198 560
199 for (i = 0; i < fl->num_fh; i++) { 561 for (i = 0; i < fl->num_fh; i++) {
200 /* Do we want to use a mempool here? */ 562 /* Do we want to use a mempool here? */
201 fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); 563 fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
202 if (!fl->fh_array[i]) { 564 if (!fl->fh_array[i])
203 filelayout_free_fh_array(fl); 565 goto out_err_free;
204 return -ENOMEM; 566
205 } 567 p = xdr_inline_decode(&stream, 4);
568 if (unlikely(!p))
569 goto out_err_free;
206 fl->fh_array[i]->size = be32_to_cpup(p++); 570 fl->fh_array[i]->size = be32_to_cpup(p++);
207 if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { 571 if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
208 printk(KERN_ERR "Too big fh %d received %d\n", 572 printk(KERN_ERR "Too big fh %d received %d\n",
209 i, fl->fh_array[i]->size); 573 i, fl->fh_array[i]->size);
210 filelayout_free_fh_array(fl); 574 goto out_err_free;
211 return -EIO;
212 } 575 }
576
577 p = xdr_inline_decode(&stream, fl->fh_array[i]->size);
578 if (unlikely(!p))
579 goto out_err_free;
213 memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); 580 memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
214 p += XDR_QUADLEN(fl->fh_array[i]->size);
215 dprintk("DEBUG: %s: fh len %d\n", __func__, 581 dprintk("DEBUG: %s: fh len %d\n", __func__,
216 fl->fh_array[i]->size); 582 fl->fh_array[i]->size);
217 } 583 }
218 584
585 __free_page(scratch);
219 return 0; 586 return 0;
587
588out_err_free:
589 filelayout_free_fh_array(fl);
590out_err:
591 __free_page(scratch);
592 return -EIO;
593}
594
595static void
596filelayout_free_lseg(struct pnfs_layout_segment *lseg)
597{
598 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
599
600 dprintk("--> %s\n", __func__);
601 nfs4_fl_put_deviceid(fl->dsaddr);
602 kfree(fl->commit_buckets);
603 _filelayout_free_lseg(fl);
220} 604}
221 605
222static struct pnfs_layout_segment * 606static struct pnfs_layout_segment *
@@ -237,29 +621,252 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
237 _filelayout_free_lseg(fl); 621 _filelayout_free_lseg(fl);
238 return NULL; 622 return NULL;
239 } 623 }
624
625 /* This assumes there is only one IOMODE_RW lseg. What
626 * we really want to do is have a layout_hdr level
627 * dictionary of <multipath_list4, fh> keys, each
628 * associated with a struct list_head, populated by calls
629 * to filelayout_write_pagelist().
630 * */
631 if ((!fl->commit_through_mds) && (lgr->range.iomode == IOMODE_RW)) {
632 int i;
633 int size = (fl->stripe_type == STRIPE_SPARSE) ?
634 fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
635
636 fl->commit_buckets = kcalloc(size, sizeof(struct list_head), GFP_KERNEL);
637 if (!fl->commit_buckets) {
638 filelayout_free_lseg(&fl->generic_hdr);
639 return NULL;
640 }
641 fl->number_of_buckets = size;
642 for (i = 0; i < size; i++)
643 INIT_LIST_HEAD(&fl->commit_buckets[i]);
644 }
240 return &fl->generic_hdr; 645 return &fl->generic_hdr;
241} 646}
242 647
243static void 648/*
244filelayout_free_lseg(struct pnfs_layout_segment *lseg) 649 * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
650 *
651 * return 1 : coalesce page
652 * return 0 : don't coalesce page
653 */
654int
655filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
656 struct nfs_page *req)
657{
658 u64 p_stripe, r_stripe;
659 u32 stripe_unit;
660
661 if (!pgio->pg_lseg)
662 return 1;
663 p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
664 r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
665 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
666
667 do_div(p_stripe, stripe_unit);
668 do_div(r_stripe, stripe_unit);
669
670 return (p_stripe == r_stripe);
671}
672
673static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg)
674{
675 return !FILELAYOUT_LSEG(lseg)->commit_through_mds;
676}
677
678static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
679{
680 if (fl->stripe_type == STRIPE_SPARSE)
681 return nfs4_fl_calc_ds_index(&fl->generic_hdr, j);
682 else
683 return j;
684}
685
686struct list_head *filelayout_choose_commit_list(struct nfs_page *req)
245{ 687{
246 struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode); 688 struct pnfs_layout_segment *lseg = req->wb_commit_lseg;
247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 689 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
690 u32 i, j;
691 struct list_head *list;
692
693 /* Note that we are calling nfs4_fl_calc_j_index on each page
694 * that ends up being committed to a data server. An attractive
695 * alternative is to add a field to nfs_write_data and nfs_page
696 * to store the value calculated in filelayout_write_pagelist
697 * and just use that here.
698 */
699 j = nfs4_fl_calc_j_index(lseg,
700 (loff_t)req->wb_index << PAGE_CACHE_SHIFT);
701 i = select_bucket_index(fl, j);
702 list = &fl->commit_buckets[i];
703 if (list_empty(list)) {
704 /* Non-empty buckets hold a reference on the lseg */
705 get_lseg(lseg);
706 }
707 return list;
708}
248 709
249 dprintk("--> %s\n", __func__); 710static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
250 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, 711{
251 &fl->dsaddr->deviceid); 712 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
252 _filelayout_free_lseg(fl); 713
714 if (flseg->stripe_type == STRIPE_SPARSE)
715 return i;
716 else
717 return nfs4_fl_calc_ds_index(lseg, i);
718}
719
720static struct nfs_fh *
721select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
722{
723 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
724
725 if (flseg->stripe_type == STRIPE_SPARSE) {
726 if (flseg->num_fh == 1)
727 i = 0;
728 else if (flseg->num_fh == 0)
729 /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
730 return NULL;
731 }
732 return flseg->fh_array[i];
733}
734
735static int filelayout_initiate_commit(struct nfs_write_data *data, int how)
736{
737 struct pnfs_layout_segment *lseg = data->lseg;
738 struct nfs4_pnfs_ds *ds;
739 u32 idx;
740 struct nfs_fh *fh;
741
742 idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
743 ds = nfs4_fl_prepare_ds(lseg, idx);
744 if (!ds) {
745 printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
746 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
747 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
748 prepare_to_resend_writes(data);
749 data->mds_ops->rpc_release(data);
750 return -EAGAIN;
751 }
752 dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how);
753 data->write_done_cb = filelayout_commit_done_cb;
754 data->ds_clp = ds->ds_clp;
755 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
756 if (fh)
757 data->args.fh = fh;
758 return nfs_initiate_commit(data, ds->ds_clp->cl_rpcclient,
759 &filelayout_commit_call_ops, how);
760}
761
762/*
763 * This is only useful while we are using whole file layouts.
764 */
765static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode)
766{
767 struct pnfs_layout_segment *lseg, *rv = NULL;
768
769 spin_lock(&inode->i_lock);
770 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list)
771 if (lseg->pls_range.iomode == IOMODE_RW)
772 rv = get_lseg(lseg);
773 spin_unlock(&inode->i_lock);
774 return rv;
775}
776
777static int alloc_ds_commits(struct inode *inode, struct list_head *list)
778{
779 struct pnfs_layout_segment *lseg;
780 struct nfs4_filelayout_segment *fl;
781 struct nfs_write_data *data;
782 int i, j;
783
784 /* Won't need this when non-whole file layout segments are supported
785 * instead we will use a pnfs_layout_hdr structure */
786 lseg = find_only_write_lseg(inode);
787 if (!lseg)
788 return 0;
789 fl = FILELAYOUT_LSEG(lseg);
790 for (i = 0; i < fl->number_of_buckets; i++) {
791 if (list_empty(&fl->commit_buckets[i]))
792 continue;
793 data = nfs_commitdata_alloc();
794 if (!data)
795 goto out_bad;
796 data->ds_commit_index = i;
797 data->lseg = lseg;
798 list_add(&data->pages, list);
799 }
800 put_lseg(lseg);
801 return 0;
802
803out_bad:
804 for (j = i; j < fl->number_of_buckets; j++) {
805 if (list_empty(&fl->commit_buckets[i]))
806 continue;
807 nfs_retry_commit(&fl->commit_buckets[i], lseg);
808 put_lseg(lseg); /* associated with emptying bucket */
809 }
810 put_lseg(lseg);
811 /* Caller will clean up entries put on list */
812 return -ENOMEM;
813}
814
815/* This follows nfs_commit_list pretty closely */
816static int
817filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
818 int how)
819{
820 struct nfs_write_data *data, *tmp;
821 LIST_HEAD(list);
822
823 if (!list_empty(mds_pages)) {
824 data = nfs_commitdata_alloc();
825 if (!data)
826 goto out_bad;
827 data->lseg = NULL;
828 list_add(&data->pages, &list);
829 }
830
831 if (alloc_ds_commits(inode, &list))
832 goto out_bad;
833
834 list_for_each_entry_safe(data, tmp, &list, pages) {
835 list_del_init(&data->pages);
836 atomic_inc(&NFS_I(inode)->commits_outstanding);
837 if (!data->lseg) {
838 nfs_init_commit(data, mds_pages, NULL);
839 nfs_initiate_commit(data, NFS_CLIENT(inode),
840 data->mds_ops, how);
841 } else {
842 nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index], data->lseg);
843 filelayout_initiate_commit(data, how);
844 }
845 }
846 return 0;
847 out_bad:
848 list_for_each_entry_safe(data, tmp, &list, pages) {
849 nfs_retry_commit(&data->pages, data->lseg);
850 list_del_init(&data->pages);
851 nfs_commit_free(data);
852 }
853 nfs_retry_commit(mds_pages, NULL);
854 nfs_commit_clear_lock(NFS_I(inode));
855 return -ENOMEM;
253} 856}
254 857
255static struct pnfs_layoutdriver_type filelayout_type = { 858static struct pnfs_layoutdriver_type filelayout_type = {
256 .id = LAYOUT_NFSV4_1_FILES, 859 .id = LAYOUT_NFSV4_1_FILES,
257 .name = "LAYOUT_NFSV4_1_FILES", 860 .name = "LAYOUT_NFSV4_1_FILES",
258 .owner = THIS_MODULE, 861 .owner = THIS_MODULE,
259 .set_layoutdriver = filelayout_set_layoutdriver, 862 .alloc_lseg = filelayout_alloc_lseg,
260 .clear_layoutdriver = filelayout_clear_layoutdriver, 863 .free_lseg = filelayout_free_lseg,
261 .alloc_lseg = filelayout_alloc_lseg, 864 .pg_test = filelayout_pg_test,
262 .free_lseg = filelayout_free_lseg, 865 .mark_pnfs_commit = filelayout_mark_pnfs_commit,
866 .choose_commit_list = filelayout_choose_commit_list,
867 .commit_pagelist = filelayout_commit_pagelist,
868 .read_pagelist = filelayout_read_pagelist,
869 .write_pagelist = filelayout_write_pagelist,
263}; 870};
264 871
265static int __init nfs4filelayout_init(void) 872static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index bbf60dd2ab9d..7c44579f5832 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -33,7 +33,7 @@
33#include "pnfs.h" 33#include "pnfs.h"
34 34
35/* 35/*
36 * Field testing shows we need to support upto 4096 stripe indices. 36 * Field testing shows we need to support up to 4096 stripe indices.
37 * We store each index as a u8 (u32 on the wire) to keep the memory footprint 37 * We store each index as a u8 (u32 on the wire) to keep the memory footprint
38 * reasonable. This in turn means we support a maximum of 256 38 * reasonable. This in turn means we support a maximum of 256
39 * RFC 5661 multipath_list4 structures. 39 * RFC 5661 multipath_list4 structures.
@@ -55,8 +55,14 @@ struct nfs4_pnfs_ds {
55 atomic_t ds_count; 55 atomic_t ds_count;
56}; 56};
57 57
58/* nfs4_file_layout_dsaddr flags */
59#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001
60
58struct nfs4_file_layout_dsaddr { 61struct nfs4_file_layout_dsaddr {
59 struct pnfs_deviceid_node deviceid; 62 struct hlist_node node;
63 struct nfs4_deviceid deviceid;
64 atomic_t ref;
65 unsigned long flags;
60 u32 stripe_count; 66 u32 stripe_count;
61 u8 *stripe_indices; 67 u8 *stripe_indices;
62 u32 ds_num; 68 u32 ds_num;
@@ -73,6 +79,8 @@ struct nfs4_filelayout_segment {
73 struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ 79 struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
74 unsigned int num_fh; 80 unsigned int num_fh;
75 struct nfs_fh **fh_array; 81 struct nfs_fh **fh_array;
82 struct list_head *commit_buckets; /* Sort commits to ds */
83 int number_of_buckets;
76}; 84};
77 85
78static inline struct nfs4_filelayout_segment * 86static inline struct nfs4_filelayout_segment *
@@ -83,11 +91,18 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
83 generic_hdr); 91 generic_hdr);
84} 92}
85 93
86extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *); 94extern struct nfs_fh *
95nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
96
87extern void print_ds(struct nfs4_pnfs_ds *ds); 97extern void print_ds(struct nfs4_pnfs_ds *ds);
88extern void print_deviceid(struct nfs4_deviceid *dev_id); 98extern void print_deviceid(struct nfs4_deviceid *dev_id);
99u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
100u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
101struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
102 u32 ds_idx);
89extern struct nfs4_file_layout_dsaddr * 103extern struct nfs4_file_layout_dsaddr *
90nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id); 104nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
105extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
91struct nfs4_file_layout_dsaddr * 106struct nfs4_file_layout_dsaddr *
92get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id); 107get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
93 108
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index f5c9b125e8cc..de5350f2b249 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -37,6 +37,30 @@
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD 37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38 38
39/* 39/*
40 * Device ID RCU cache. A device ID is unique per client ID and layout type.
41 */
42#define NFS4_FL_DEVICE_ID_HASH_BITS 5
43#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS)
44#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
45
46static inline u32
47nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
48{
49 unsigned char *cptr = (unsigned char *)id->data;
50 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
51 u32 x = 0;
52
53 while (nbytes--) {
54 x *= 37;
55 x += *cptr++;
56 }
57 return x & NFS4_FL_DEVICE_ID_HASH_MASK;
58}
59
60static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
61static DEFINE_SPINLOCK(filelayout_deviceid_lock);
62
63/*
40 * Data server cache 64 * Data server cache
41 * 65 *
42 * Data servers can be mapped to different device ids. 66 * Data servers can be mapped to different device ids.
@@ -104,6 +128,67 @@ _data_server_lookup_locked(u32 ip_addr, u32 port)
104 return NULL; 128 return NULL;
105} 129}
106 130
131/*
132 * Create an rpc connection to the nfs4_pnfs_ds data server
133 * Currently only support IPv4
134 */
135static int
136nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
137{
138 struct nfs_client *clp;
139 struct sockaddr_in sin;
140 int status = 0;
141
142 dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__,
143 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
144 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
145
146 sin.sin_family = AF_INET;
147 sin.sin_addr.s_addr = ds->ds_ip_addr;
148 sin.sin_port = ds->ds_port;
149
150 clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin,
151 sizeof(sin), IPPROTO_TCP);
152 if (IS_ERR(clp)) {
153 status = PTR_ERR(clp);
154 goto out;
155 }
156
157 if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) {
158 if (!is_ds_client(clp)) {
159 status = -ENODEV;
160 goto out_put;
161 }
162 ds->ds_clp = clp;
163 dprintk("%s [existing] ip=%x, port=%hu\n", __func__,
164 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
165 goto out;
166 }
167
168 /*
169 * Do not set NFS_CS_CHECK_LEASE_TIME instead set the DS lease to
170 * be equal to the MDS lease. Renewal is scheduled in create_session.
171 */
172 spin_lock(&mds_srv->nfs_client->cl_lock);
173 clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
174 spin_unlock(&mds_srv->nfs_client->cl_lock);
175 clp->cl_last_renewal = jiffies;
176
177 /* New nfs_client */
178 status = nfs4_init_ds_session(clp);
179 if (status)
180 goto out_put;
181
182 ds->ds_clp = clp;
183 dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr),
184 ntohs(ds->ds_port));
185out:
186 return status;
187out_put:
188 nfs_put_client(clp);
189 goto out;
190}
191
107static void 192static void
108destroy_ds(struct nfs4_pnfs_ds *ds) 193destroy_ds(struct nfs4_pnfs_ds *ds)
109{ 194{
@@ -122,7 +207,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
122 struct nfs4_pnfs_ds *ds; 207 struct nfs4_pnfs_ds *ds;
123 int i; 208 int i;
124 209
125 print_deviceid(&dsaddr->deviceid.de_id); 210 print_deviceid(&dsaddr->deviceid);
126 211
127 for (i = 0; i < dsaddr->ds_num; i++) { 212 for (i = 0; i < dsaddr->ds_num; i++) {
128 ds = dsaddr->ds_list[i]; 213 ds = dsaddr->ds_list[i];
@@ -139,15 +224,6 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
139 kfree(dsaddr); 224 kfree(dsaddr);
140} 225}
141 226
142void
143nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
144{
145 struct nfs4_file_layout_dsaddr *dsaddr =
146 container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
147
148 nfs4_fl_free_deviceid(dsaddr);
149}
150
151static struct nfs4_pnfs_ds * 227static struct nfs4_pnfs_ds *
152nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port) 228nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
153{ 229{
@@ -185,7 +261,7 @@ out:
185 * Currently only support ipv4, and one multi-path address. 261 * Currently only support ipv4, and one multi-path address.
186 */ 262 */
187static struct nfs4_pnfs_ds * 263static struct nfs4_pnfs_ds *
188decode_and_add_ds(__be32 **pp, struct inode *inode) 264decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode)
189{ 265{
190 struct nfs4_pnfs_ds *ds = NULL; 266 struct nfs4_pnfs_ds *ds = NULL;
191 char *buf; 267 char *buf;
@@ -193,25 +269,34 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
193 u32 ip_addr, port; 269 u32 ip_addr, port;
194 int nlen, rlen, i; 270 int nlen, rlen, i;
195 int tmp[2]; 271 int tmp[2];
196 __be32 *r_netid, *r_addr, *p = *pp; 272 __be32 *p;
197 273
198 /* r_netid */ 274 /* r_netid */
275 p = xdr_inline_decode(streamp, 4);
276 if (unlikely(!p))
277 goto out_err;
199 nlen = be32_to_cpup(p++); 278 nlen = be32_to_cpup(p++);
200 r_netid = p;
201 p += XDR_QUADLEN(nlen);
202 279
203 /* r_addr */ 280 p = xdr_inline_decode(streamp, nlen);
204 rlen = be32_to_cpup(p++); 281 if (unlikely(!p))
205 r_addr = p; 282 goto out_err;
206 p += XDR_QUADLEN(rlen);
207 *pp = p;
208 283
209 /* Check that netid is "tcp" */ 284 /* Check that netid is "tcp" */
210 if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) { 285 if (nlen != 3 || memcmp((char *)p, "tcp", 3)) {
211 dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__); 286 dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
212 goto out_err; 287 goto out_err;
213 } 288 }
214 289
290 /* r_addr */
291 p = xdr_inline_decode(streamp, 4);
292 if (unlikely(!p))
293 goto out_err;
294 rlen = be32_to_cpup(p);
295
296 p = xdr_inline_decode(streamp, rlen);
297 if (unlikely(!p))
298 goto out_err;
299
215 /* ipv6 length plus port is legal */ 300 /* ipv6 length plus port is legal */
216 if (rlen > INET6_ADDRSTRLEN + 8) { 301 if (rlen > INET6_ADDRSTRLEN + 8) {
217 dprintk("%s: Invalid address, length %d\n", __func__, 302 dprintk("%s: Invalid address, length %d\n", __func__,
@@ -219,8 +304,12 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
219 goto out_err; 304 goto out_err;
220 } 305 }
221 buf = kmalloc(rlen + 1, GFP_KERNEL); 306 buf = kmalloc(rlen + 1, GFP_KERNEL);
307 if (!buf) {
308 dprintk("%s: Not enough memory\n", __func__);
309 goto out_err;
310 }
222 buf[rlen] = '\0'; 311 buf[rlen] = '\0';
223 memcpy(buf, r_addr, rlen); 312 memcpy(buf, p, rlen);
224 313
225 /* replace the port dots with dashes for the in4_pton() delimiter*/ 314 /* replace the port dots with dashes for the in4_pton() delimiter*/
226 for (i = 0; i < 2; i++) { 315 for (i = 0; i < 2; i++) {
@@ -256,118 +345,191 @@ out_err:
256static struct nfs4_file_layout_dsaddr* 345static struct nfs4_file_layout_dsaddr*
257decode_device(struct inode *ino, struct pnfs_device *pdev) 346decode_device(struct inode *ino, struct pnfs_device *pdev)
258{ 347{
259 int i, dummy; 348 int i;
260 u32 cnt, num; 349 u32 cnt, num;
261 u8 *indexp; 350 u8 *indexp;
262 __be32 *p = (__be32 *)pdev->area, *indicesp; 351 __be32 *p;
263 struct nfs4_file_layout_dsaddr *dsaddr; 352 u8 *stripe_indices;
353 u8 max_stripe_index;
354 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
355 struct xdr_stream stream;
356 struct xdr_buf buf = {
357 .pages = pdev->pages,
358 .page_len = pdev->pglen,
359 .buflen = pdev->pglen,
360 .len = pdev->pglen,
361 };
362 struct page *scratch;
363
364 /* set up xdr stream */
365 scratch = alloc_page(GFP_KERNEL);
366 if (!scratch)
367 goto out_err;
368
369 xdr_init_decode(&stream, &buf, NULL);
370 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
264 371
265 /* Get the stripe count (number of stripe index) */ 372 /* Get the stripe count (number of stripe index) */
266 cnt = be32_to_cpup(p++); 373 p = xdr_inline_decode(&stream, 4);
374 if (unlikely(!p))
375 goto out_err_free_scratch;
376
377 cnt = be32_to_cpup(p);
267 dprintk("%s stripe count %d\n", __func__, cnt); 378 dprintk("%s stripe count %d\n", __func__, cnt);
268 if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { 379 if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
269 printk(KERN_WARNING "%s: stripe count %d greater than " 380 printk(KERN_WARNING "%s: stripe count %d greater than "
270 "supported maximum %d\n", __func__, 381 "supported maximum %d\n", __func__,
271 cnt, NFS4_PNFS_MAX_STRIPE_CNT); 382 cnt, NFS4_PNFS_MAX_STRIPE_CNT);
272 goto out_err; 383 goto out_err_free_scratch;
384 }
385
386 /* read stripe indices */
387 stripe_indices = kcalloc(cnt, sizeof(u8), GFP_KERNEL);
388 if (!stripe_indices)
389 goto out_err_free_scratch;
390
391 p = xdr_inline_decode(&stream, cnt << 2);
392 if (unlikely(!p))
393 goto out_err_free_stripe_indices;
394
395 indexp = &stripe_indices[0];
396 max_stripe_index = 0;
397 for (i = 0; i < cnt; i++) {
398 *indexp = be32_to_cpup(p++);
399 max_stripe_index = max(max_stripe_index, *indexp);
400 indexp++;
273 } 401 }
274 402
275 /* Check the multipath list count */ 403 /* Check the multipath list count */
276 indicesp = p; 404 p = xdr_inline_decode(&stream, 4);
277 p += XDR_QUADLEN(cnt << 2); 405 if (unlikely(!p))
278 num = be32_to_cpup(p++); 406 goto out_err_free_stripe_indices;
407
408 num = be32_to_cpup(p);
279 dprintk("%s ds_num %u\n", __func__, num); 409 dprintk("%s ds_num %u\n", __func__, num);
280 if (num > NFS4_PNFS_MAX_MULTI_CNT) { 410 if (num > NFS4_PNFS_MAX_MULTI_CNT) {
281 printk(KERN_WARNING "%s: multipath count %d greater than " 411 printk(KERN_WARNING "%s: multipath count %d greater than "
282 "supported maximum %d\n", __func__, 412 "supported maximum %d\n", __func__,
283 num, NFS4_PNFS_MAX_MULTI_CNT); 413 num, NFS4_PNFS_MAX_MULTI_CNT);
284 goto out_err; 414 goto out_err_free_stripe_indices;
415 }
416
417 /* validate stripe indices are all < num */
418 if (max_stripe_index >= num) {
419 printk(KERN_WARNING "%s: stripe index %u >= num ds %u\n",
420 __func__, max_stripe_index, num);
421 goto out_err_free_stripe_indices;
285 } 422 }
423
286 dsaddr = kzalloc(sizeof(*dsaddr) + 424 dsaddr = kzalloc(sizeof(*dsaddr) +
287 (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), 425 (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
288 GFP_KERNEL); 426 GFP_KERNEL);
289 if (!dsaddr) 427 if (!dsaddr)
290 goto out_err; 428 goto out_err_free_stripe_indices;
291
292 dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL);
293 if (!dsaddr->stripe_indices)
294 goto out_err_free;
295 429
296 dsaddr->stripe_count = cnt; 430 dsaddr->stripe_count = cnt;
431 dsaddr->stripe_indices = stripe_indices;
432 stripe_indices = NULL;
297 dsaddr->ds_num = num; 433 dsaddr->ds_num = num;
298 434
299 memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id)); 435 memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
300
301 /* Go back an read stripe indices */
302 p = indicesp;
303 indexp = &dsaddr->stripe_indices[0];
304 for (i = 0; i < dsaddr->stripe_count; i++) {
305 *indexp = be32_to_cpup(p++);
306 if (*indexp >= num)
307 goto out_err_free;
308 indexp++;
309 }
310 /* Skip already read multipath list count */
311 p++;
312 436
313 for (i = 0; i < dsaddr->ds_num; i++) { 437 for (i = 0; i < dsaddr->ds_num; i++) {
314 int j; 438 int j;
439 u32 mp_count;
315 440
316 dummy = be32_to_cpup(p++); /* multipath count */ 441 p = xdr_inline_decode(&stream, 4);
317 if (dummy > 1) { 442 if (unlikely(!p))
443 goto out_err_free_deviceid;
444
445 mp_count = be32_to_cpup(p); /* multipath count */
446 if (mp_count > 1) {
318 printk(KERN_WARNING 447 printk(KERN_WARNING
319 "%s: Multipath count %d not supported, " 448 "%s: Multipath count %d not supported, "
320 "skipping all greater than 1\n", __func__, 449 "skipping all greater than 1\n", __func__,
321 dummy); 450 mp_count);
322 } 451 }
323 for (j = 0; j < dummy; j++) { 452 for (j = 0; j < mp_count; j++) {
324 if (j == 0) { 453 if (j == 0) {
325 dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); 454 dsaddr->ds_list[i] = decode_and_add_ds(&stream,
455 ino);
326 if (dsaddr->ds_list[i] == NULL) 456 if (dsaddr->ds_list[i] == NULL)
327 goto out_err_free; 457 goto out_err_free_deviceid;
328 } else { 458 } else {
329 u32 len; 459 u32 len;
330 /* skip extra multipath */ 460 /* skip extra multipath */
331 len = be32_to_cpup(p++); 461
332 p += XDR_QUADLEN(len); 462 /* read len, skip */
333 len = be32_to_cpup(p++); 463 p = xdr_inline_decode(&stream, 4);
334 p += XDR_QUADLEN(len); 464 if (unlikely(!p))
335 continue; 465 goto out_err_free_deviceid;
466 len = be32_to_cpup(p);
467
468 p = xdr_inline_decode(&stream, len);
469 if (unlikely(!p))
470 goto out_err_free_deviceid;
471
472 /* read len, skip */
473 p = xdr_inline_decode(&stream, 4);
474 if (unlikely(!p))
475 goto out_err_free_deviceid;
476 len = be32_to_cpup(p);
477
478 p = xdr_inline_decode(&stream, len);
479 if (unlikely(!p))
480 goto out_err_free_deviceid;
336 } 481 }
337 } 482 }
338 } 483 }
484
485 __free_page(scratch);
339 return dsaddr; 486 return dsaddr;
340 487
341out_err_free: 488out_err_free_deviceid:
342 nfs4_fl_free_deviceid(dsaddr); 489 nfs4_fl_free_deviceid(dsaddr);
490 /* stripe_indicies was part of dsaddr */
491 goto out_err_free_scratch;
492out_err_free_stripe_indices:
493 kfree(stripe_indices);
494out_err_free_scratch:
495 __free_page(scratch);
343out_err: 496out_err:
344 dprintk("%s ERROR: returning NULL\n", __func__); 497 dprintk("%s ERROR: returning NULL\n", __func__);
345 return NULL; 498 return NULL;
346} 499}
347 500
348/* 501/*
349 * Decode the opaque device specified in 'dev' 502 * Decode the opaque device specified in 'dev' and add it to the cache of
350 * and add it to the list of available devices. 503 * available devices.
351 * If the deviceid is already cached, nfs4_add_deviceid will return
352 * a pointer to the cached struct and throw away the new.
353 */ 504 */
354static struct nfs4_file_layout_dsaddr* 505static struct nfs4_file_layout_dsaddr *
355decode_and_add_device(struct inode *inode, struct pnfs_device *dev) 506decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
356{ 507{
357 struct nfs4_file_layout_dsaddr *dsaddr; 508 struct nfs4_file_layout_dsaddr *d, *new;
358 struct pnfs_deviceid_node *d; 509 long hash;
359 510
360 dsaddr = decode_device(inode, dev); 511 new = decode_device(inode, dev);
361 if (!dsaddr) { 512 if (!new) {
362 printk(KERN_WARNING "%s: Could not decode or add device\n", 513 printk(KERN_WARNING "%s: Could not decode or add device\n",
363 __func__); 514 __func__);
364 return NULL; 515 return NULL;
365 } 516 }
366 517
367 d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, 518 spin_lock(&filelayout_deviceid_lock);
368 &dsaddr->deviceid); 519 d = nfs4_fl_find_get_deviceid(&new->deviceid);
520 if (d) {
521 spin_unlock(&filelayout_deviceid_lock);
522 nfs4_fl_free_deviceid(new);
523 return d;
524 }
369 525
370 return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); 526 INIT_HLIST_NODE(&new->node);
527 atomic_set(&new->ref, 1);
528 hash = nfs4_fl_deviceid_hash(&new->deviceid);
529 hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
530 spin_unlock(&filelayout_deviceid_lock);
531
532 return new;
371} 533}
372 534
373/* 535/*
@@ -409,11 +571,6 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
409 goto out_free; 571 goto out_free;
410 } 572 }
411 573
412 /* set pdev->area */
413 pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
414 if (!pdev->area)
415 goto out_free;
416
417 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); 574 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
418 pdev->layout_type = LAYOUT_NFSV4_1_FILES; 575 pdev->layout_type = LAYOUT_NFSV4_1_FILES;
419 pdev->pages = pages; 576 pdev->pages = pages;
@@ -432,8 +589,6 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
432 */ 589 */
433 dsaddr = decode_and_add_device(inode, pdev); 590 dsaddr = decode_and_add_device(inode, pdev);
434out_free: 591out_free:
435 if (pdev->area != NULL)
436 vunmap(pdev->area);
437 for (i = 0; i < max_pages; i++) 592 for (i = 0; i < max_pages; i++)
438 __free_page(pages[i]); 593 __free_page(pages[i]);
439 kfree(pages); 594 kfree(pages);
@@ -442,12 +597,123 @@ out_free:
442 return dsaddr; 597 return dsaddr;
443} 598}
444 599
600void
601nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
602{
603 if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
604 hlist_del_rcu(&dsaddr->node);
605 spin_unlock(&filelayout_deviceid_lock);
606
607 synchronize_rcu();
608 nfs4_fl_free_deviceid(dsaddr);
609 }
610}
611
445struct nfs4_file_layout_dsaddr * 612struct nfs4_file_layout_dsaddr *
446nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id) 613nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
447{ 614{
448 struct pnfs_deviceid_node *d; 615 struct nfs4_file_layout_dsaddr *d;
616 struct hlist_node *n;
617 long hash = nfs4_fl_deviceid_hash(id);
618
619
620 rcu_read_lock();
621 hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
622 if (!memcmp(&d->deviceid, id, sizeof(*id))) {
623 if (!atomic_inc_not_zero(&d->ref))
624 goto fail;
625 rcu_read_unlock();
626 return d;
627 }
628 }
629fail:
630 rcu_read_unlock();
631 return NULL;
632}
449 633
450 d = pnfs_find_get_deviceid(clp->cl_devid_cache, id); 634/*
451 return (d == NULL) ? NULL : 635 * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
452 container_of(d, struct nfs4_file_layout_dsaddr, deviceid); 636 * Then: ((res + fsi) % dsaddr->stripe_count)
637 */
638u32
639nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
640{
641 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
642 u64 tmp;
643
644 tmp = offset - flseg->pattern_offset;
645 do_div(tmp, flseg->stripe_unit);
646 tmp += flseg->first_stripe_index;
647 return do_div(tmp, flseg->dsaddr->stripe_count);
648}
649
650u32
651nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
652{
653 return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
654}
655
656struct nfs_fh *
657nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
658{
659 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
660 u32 i;
661
662 if (flseg->stripe_type == STRIPE_SPARSE) {
663 if (flseg->num_fh == 1)
664 i = 0;
665 else if (flseg->num_fh == 0)
666 /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
667 return NULL;
668 else
669 i = nfs4_fl_calc_ds_index(lseg, j);
670 } else
671 i = j;
672 return flseg->fh_array[i];
673}
674
675static void
676filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
677 int err, u32 ds_addr)
678{
679 u32 *p = (u32 *)&dsaddr->deviceid;
680
681 printk(KERN_ERR "NFS: data server %x connection error %d."
682 " Deviceid [%x%x%x%x] marked out of use.\n",
683 ds_addr, err, p[0], p[1], p[2], p[3]);
684
685 spin_lock(&filelayout_deviceid_lock);
686 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
687 spin_unlock(&filelayout_deviceid_lock);
688}
689
690struct nfs4_pnfs_ds *
691nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
692{
693 struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
694 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
695
696 if (ds == NULL) {
697 printk(KERN_ERR "%s: No data server for offset index %d\n",
698 __func__, ds_idx);
699 return NULL;
700 }
701
702 if (!ds->ds_clp) {
703 struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
704 int err;
705
706 if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) {
707 /* Already tried to connect, don't try again */
708 dprintk("%s Deviceid marked out of use\n", __func__);
709 return NULL;
710 }
711 err = nfs4_ds_connect(s, ds);
712 if (err) {
713 filelayout_mark_devid_negative(dsaddr, err,
714 ntohl(ds->ds_ip_addr));
715 return NULL;
716 }
717 }
718 return ds;
453} 719}
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 3c2a1724fbd2..bb80c49b6533 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -54,33 +54,29 @@ Elong:
54/* 54/*
55 * Determine the mount path as a string 55 * Determine the mount path as a string
56 */ 56 */
57static char *nfs4_path(const struct vfsmount *mnt_parent, 57static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen)
58 const struct dentry *dentry,
59 char *buffer, ssize_t buflen)
60{ 58{
61 const char *srvpath; 59 char *limit;
62 60 char *path = nfs_path(&limit, dentry, buffer, buflen);
63 srvpath = strchr(mnt_parent->mnt_devname, ':'); 61 if (!IS_ERR(path)) {
64 if (srvpath) 62 char *colon = strchr(path, ':');
65 srvpath++; 63 if (colon && colon < limit)
66 else 64 path = colon + 1;
67 srvpath = mnt_parent->mnt_devname; 65 }
68 66 return path;
69 return nfs_path(srvpath, mnt_parent->mnt_root, dentry, buffer, buflen);
70} 67}
71 68
72/* 69/*
73 * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we 70 * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we
74 * believe to be the server path to this dentry 71 * believe to be the server path to this dentry
75 */ 72 */
76static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, 73static int nfs4_validate_fspath(struct dentry *dentry,
77 const struct dentry *dentry,
78 const struct nfs4_fs_locations *locations, 74 const struct nfs4_fs_locations *locations,
79 char *page, char *page2) 75 char *page, char *page2)
80{ 76{
81 const char *path, *fs_path; 77 const char *path, *fs_path;
82 78
83 path = nfs4_path(mnt_parent, dentry, page, PAGE_SIZE); 79 path = nfs4_path(dentry, page, PAGE_SIZE);
84 if (IS_ERR(path)) 80 if (IS_ERR(path))
85 return PTR_ERR(path); 81 return PTR_ERR(path);
86 82
@@ -165,20 +161,18 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
165 161
166/** 162/**
167 * nfs_follow_referral - set up mountpoint when hitting a referral on moved error 163 * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
168 * @mnt_parent - mountpoint of parent directory
169 * @dentry - parent directory 164 * @dentry - parent directory
170 * @locations - array of NFSv4 server location information 165 * @locations - array of NFSv4 server location information
171 * 166 *
172 */ 167 */
173static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, 168static struct vfsmount *nfs_follow_referral(struct dentry *dentry,
174 const struct dentry *dentry,
175 const struct nfs4_fs_locations *locations) 169 const struct nfs4_fs_locations *locations)
176{ 170{
177 struct vfsmount *mnt = ERR_PTR(-ENOENT); 171 struct vfsmount *mnt = ERR_PTR(-ENOENT);
178 struct nfs_clone_mount mountdata = { 172 struct nfs_clone_mount mountdata = {
179 .sb = mnt_parent->mnt_sb, 173 .sb = dentry->d_sb,
180 .dentry = dentry, 174 .dentry = dentry,
181 .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, 175 .authflavor = NFS_SB(dentry->d_sb)->client->cl_auth->au_flavor,
182 }; 176 };
183 char *page = NULL, *page2 = NULL; 177 char *page = NULL, *page2 = NULL;
184 int loc, error; 178 int loc, error;
@@ -198,7 +192,7 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
198 goto out; 192 goto out;
199 193
200 /* Ensure fs path is a prefix of current dentry path */ 194 /* Ensure fs path is a prefix of current dentry path */
201 error = nfs4_validate_fspath(mnt_parent, dentry, locations, page, page2); 195 error = nfs4_validate_fspath(dentry, locations, page, page2);
202 if (error < 0) { 196 if (error < 0) {
203 mnt = ERR_PTR(error); 197 mnt = ERR_PTR(error);
204 goto out; 198 goto out;
@@ -225,11 +219,10 @@ out:
225 219
226/* 220/*
227 * nfs_do_refmount - handle crossing a referral on server 221 * nfs_do_refmount - handle crossing a referral on server
228 * @mnt_parent - mountpoint of referral
229 * @dentry - dentry of referral 222 * @dentry - dentry of referral
230 * 223 *
231 */ 224 */
232struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) 225struct vfsmount *nfs_do_refmount(struct dentry *dentry)
233{ 226{
234 struct vfsmount *mnt = ERR_PTR(-ENOMEM); 227 struct vfsmount *mnt = ERR_PTR(-ENOMEM);
235 struct dentry *parent; 228 struct dentry *parent;
@@ -262,7 +255,7 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr
262 fs_locations->fs_path.ncomponents <= 0) 255 fs_locations->fs_path.ncomponents <= 0)
263 goto out_free; 256 goto out_free;
264 257
265 mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations); 258 mnt = nfs_follow_referral(dentry, fs_locations);
266out_free: 259out_free:
267 __free_page(page); 260 __free_page(page);
268 kfree(fs_locations); 261 kfree(fs_locations);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 78936a8f40ab..9bf41eab3e46 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -41,6 +41,7 @@
41#include <linux/string.h> 41#include <linux/string.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/sunrpc/clnt.h> 43#include <linux/sunrpc/clnt.h>
44#include <linux/sunrpc/gss_api.h>
44#include <linux/nfs.h> 45#include <linux/nfs.h>
45#include <linux/nfs4.h> 46#include <linux/nfs4.h>
46#include <linux/nfs_fs.h> 47#include <linux/nfs_fs.h>
@@ -71,7 +72,9 @@ static int _nfs4_proc_open(struct nfs4_opendata *data);
71static int _nfs4_recover_proc_open(struct nfs4_opendata *data); 72static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
72static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 73static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
73static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 74static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
74static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 75static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir,
76 const struct qstr *name, struct nfs_fh *fhandle,
77 struct nfs_fattr *fattr);
75static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 78static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
76static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 79static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
77 struct nfs_fattr *fattr, struct iattr *sattr, 80 struct nfs_fattr *fattr, struct iattr *sattr,
@@ -85,6 +88,11 @@ static int nfs4_map_errors(int err)
85 switch (err) { 88 switch (err) {
86 case -NFS4ERR_RESOURCE: 89 case -NFS4ERR_RESOURCE:
87 return -EREMOTEIO; 90 return -EREMOTEIO;
91 case -NFS4ERR_WRONGSEC:
92 return -EPERM;
93 case -NFS4ERR_BADOWNER:
94 case -NFS4ERR_BADNAME:
95 return -EINVAL;
88 default: 96 default:
89 dprintk("%s could not handle NFSv4 error %d\n", 97 dprintk("%s could not handle NFSv4 error %d\n",
90 __func__, -err); 98 __func__, -err);
@@ -241,7 +249,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
241/* This is the error handling routine for processes that are allowed 249/* This is the error handling routine for processes that are allowed
242 * to sleep. 250 * to sleep.
243 */ 251 */
244static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception) 252static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
245{ 253{
246 struct nfs_client *clp = server->nfs_client; 254 struct nfs_client *clp = server->nfs_client;
247 struct nfs4_state *state = exception->state; 255 struct nfs4_state *state = exception->state;
@@ -256,12 +264,13 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
256 case -NFS4ERR_OPENMODE: 264 case -NFS4ERR_OPENMODE:
257 if (state == NULL) 265 if (state == NULL)
258 break; 266 break;
259 nfs4_state_mark_reclaim_nograce(clp, state); 267 nfs4_schedule_stateid_recovery(server, state);
260 goto do_state_recovery; 268 goto wait_on_recovery;
261 case -NFS4ERR_STALE_STATEID: 269 case -NFS4ERR_STALE_STATEID:
262 case -NFS4ERR_STALE_CLIENTID: 270 case -NFS4ERR_STALE_CLIENTID:
263 case -NFS4ERR_EXPIRED: 271 case -NFS4ERR_EXPIRED:
264 goto do_state_recovery; 272 nfs4_schedule_lease_recovery(clp);
273 goto wait_on_recovery;
265#if defined(CONFIG_NFS_V4_1) 274#if defined(CONFIG_NFS_V4_1)
266 case -NFS4ERR_BADSESSION: 275 case -NFS4ERR_BADSESSION:
267 case -NFS4ERR_BADSLOT: 276 case -NFS4ERR_BADSLOT:
@@ -272,7 +281,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
272 case -NFS4ERR_SEQ_MISORDERED: 281 case -NFS4ERR_SEQ_MISORDERED:
273 dprintk("%s ERROR: %d Reset session\n", __func__, 282 dprintk("%s ERROR: %d Reset session\n", __func__,
274 errorcode); 283 errorcode);
275 nfs4_schedule_state_recovery(clp); 284 nfs4_schedule_session_recovery(clp->cl_session);
276 exception->retry = 1; 285 exception->retry = 1;
277 break; 286 break;
278#endif /* defined(CONFIG_NFS_V4_1) */ 287#endif /* defined(CONFIG_NFS_V4_1) */
@@ -292,11 +301,23 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
292 break; 301 break;
293 case -NFS4ERR_OLD_STATEID: 302 case -NFS4ERR_OLD_STATEID:
294 exception->retry = 1; 303 exception->retry = 1;
304 break;
305 case -NFS4ERR_BADOWNER:
306 /* The following works around a Linux server bug! */
307 case -NFS4ERR_BADNAME:
308 if (server->caps & NFS_CAP_UIDGID_NOMAP) {
309 server->caps &= ~NFS_CAP_UIDGID_NOMAP;
310 exception->retry = 1;
311 printk(KERN_WARNING "NFS: v4 server %s "
312 "does not accept raw "
313 "uid/gids. "
314 "Reenabling the idmapper.\n",
315 server->nfs_client->cl_hostname);
316 }
295 } 317 }
296 /* We failed to handle the error */ 318 /* We failed to handle the error */
297 return nfs4_map_errors(ret); 319 return nfs4_map_errors(ret);
298do_state_recovery: 320wait_on_recovery:
299 nfs4_schedule_state_recovery(clp);
300 ret = nfs4_wait_clnt_recover(clp); 321 ret = nfs4_wait_clnt_recover(clp);
301 if (ret == 0) 322 if (ret == 0)
302 exception->retry = 1; 323 exception->retry = 1;
@@ -435,8 +456,8 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
435 clp = res->sr_session->clp; 456 clp = res->sr_session->clp;
436 do_renew_lease(clp, timestamp); 457 do_renew_lease(clp, timestamp);
437 /* Check sequence flags */ 458 /* Check sequence flags */
438 if (atomic_read(&clp->cl_count) > 1) 459 if (res->sr_status_flags != 0)
439 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); 460 nfs4_schedule_lease_recovery(clp);
440 break; 461 break;
441 case -NFS4ERR_DELAY: 462 case -NFS4ERR_DELAY:
442 /* The server detected a resend of the RPC call and 463 /* The server detected a resend of the RPC call and
@@ -505,7 +526,7 @@ out:
505 return ret_id; 526 return ret_id;
506} 527}
507 528
508static int nfs41_setup_sequence(struct nfs4_session *session, 529int nfs41_setup_sequence(struct nfs4_session *session,
509 struct nfs4_sequence_args *args, 530 struct nfs4_sequence_args *args,
510 struct nfs4_sequence_res *res, 531 struct nfs4_sequence_res *res,
511 int cache_reply, 532 int cache_reply,
@@ -571,6 +592,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
571 res->sr_status = 1; 592 res->sr_status = 1;
572 return 0; 593 return 0;
573} 594}
595EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
574 596
575int nfs4_setup_sequence(const struct nfs_server *server, 597int nfs4_setup_sequence(const struct nfs_server *server,
576 struct nfs4_sequence_args *args, 598 struct nfs4_sequence_args *args,
@@ -640,7 +662,8 @@ struct rpc_call_ops nfs41_call_priv_sync_ops = {
640 .rpc_call_done = nfs41_call_sync_done, 662 .rpc_call_done = nfs41_call_sync_done,
641}; 663};
642 664
643static int nfs4_call_sync_sequence(struct nfs_server *server, 665static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
666 struct nfs_server *server,
644 struct rpc_message *msg, 667 struct rpc_message *msg,
645 struct nfs4_sequence_args *args, 668 struct nfs4_sequence_args *args,
646 struct nfs4_sequence_res *res, 669 struct nfs4_sequence_res *res,
@@ -656,7 +679,7 @@ static int nfs4_call_sync_sequence(struct nfs_server *server,
656 .cache_reply = cache_reply, 679 .cache_reply = cache_reply,
657 }; 680 };
658 struct rpc_task_setup task_setup = { 681 struct rpc_task_setup task_setup = {
659 .rpc_client = server->client, 682 .rpc_client = clnt,
660 .rpc_message = msg, 683 .rpc_message = msg,
661 .callback_ops = &nfs41_call_sync_ops, 684 .callback_ops = &nfs41_call_sync_ops,
662 .callback_data = &data 685 .callback_data = &data
@@ -675,13 +698,14 @@ static int nfs4_call_sync_sequence(struct nfs_server *server,
675 return ret; 698 return ret;
676} 699}
677 700
678int _nfs4_call_sync_session(struct nfs_server *server, 701int _nfs4_call_sync_session(struct rpc_clnt *clnt,
702 struct nfs_server *server,
679 struct rpc_message *msg, 703 struct rpc_message *msg,
680 struct nfs4_sequence_args *args, 704 struct nfs4_sequence_args *args,
681 struct nfs4_sequence_res *res, 705 struct nfs4_sequence_res *res,
682 int cache_reply) 706 int cache_reply)
683{ 707{
684 return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0); 708 return nfs4_call_sync_sequence(clnt, server, msg, args, res, cache_reply, 0);
685} 709}
686 710
687#else 711#else
@@ -692,19 +716,28 @@ static int nfs4_sequence_done(struct rpc_task *task,
692} 716}
693#endif /* CONFIG_NFS_V4_1 */ 717#endif /* CONFIG_NFS_V4_1 */
694 718
695int _nfs4_call_sync(struct nfs_server *server, 719int _nfs4_call_sync(struct rpc_clnt *clnt,
720 struct nfs_server *server,
696 struct rpc_message *msg, 721 struct rpc_message *msg,
697 struct nfs4_sequence_args *args, 722 struct nfs4_sequence_args *args,
698 struct nfs4_sequence_res *res, 723 struct nfs4_sequence_res *res,
699 int cache_reply) 724 int cache_reply)
700{ 725{
701 args->sa_session = res->sr_session = NULL; 726 args->sa_session = res->sr_session = NULL;
702 return rpc_call_sync(server->client, msg, 0); 727 return rpc_call_sync(clnt, msg, 0);
703} 728}
704 729
705#define nfs4_call_sync(server, msg, args, res, cache_reply) \ 730static inline
706 (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \ 731int nfs4_call_sync(struct rpc_clnt *clnt,
707 &(res)->seq_res, (cache_reply)) 732 struct nfs_server *server,
733 struct rpc_message *msg,
734 struct nfs4_sequence_args *args,
735 struct nfs4_sequence_res *res,
736 int cache_reply)
737{
738 return server->nfs_client->cl_mvops->call_sync(clnt, server, msg,
739 args, res, cache_reply);
740}
708 741
709static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) 742static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
710{ 743{
@@ -1255,14 +1288,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
1255 case -NFS4ERR_BAD_HIGH_SLOT: 1288 case -NFS4ERR_BAD_HIGH_SLOT:
1256 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 1289 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
1257 case -NFS4ERR_DEADSESSION: 1290 case -NFS4ERR_DEADSESSION:
1258 nfs4_schedule_state_recovery( 1291 nfs4_schedule_session_recovery(server->nfs_client->cl_session);
1259 server->nfs_client);
1260 goto out; 1292 goto out;
1261 case -NFS4ERR_STALE_CLIENTID: 1293 case -NFS4ERR_STALE_CLIENTID:
1262 case -NFS4ERR_STALE_STATEID: 1294 case -NFS4ERR_STALE_STATEID:
1263 case -NFS4ERR_EXPIRED: 1295 case -NFS4ERR_EXPIRED:
1264 /* Don't recall a delegation if it was lost */ 1296 /* Don't recall a delegation if it was lost */
1265 nfs4_schedule_state_recovery(server->nfs_client); 1297 nfs4_schedule_lease_recovery(server->nfs_client);
1266 goto out; 1298 goto out;
1267 case -ERESTARTSYS: 1299 case -ERESTARTSYS:
1268 /* 1300 /*
@@ -1271,7 +1303,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
1271 */ 1303 */
1272 case -NFS4ERR_ADMIN_REVOKED: 1304 case -NFS4ERR_ADMIN_REVOKED:
1273 case -NFS4ERR_BAD_STATEID: 1305 case -NFS4ERR_BAD_STATEID:
1274 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 1306 nfs4_schedule_stateid_recovery(server, state);
1275 case -EKEYEXPIRED: 1307 case -EKEYEXPIRED:
1276 /* 1308 /*
1277 * User RPCSEC_GSS context has expired. 1309 * User RPCSEC_GSS context has expired.
@@ -1574,9 +1606,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1574 return 0; 1606 return 0;
1575} 1607}
1576 1608
1577static int nfs4_recover_expired_lease(struct nfs_server *server) 1609static int nfs4_client_recover_expired_lease(struct nfs_client *clp)
1578{ 1610{
1579 struct nfs_client *clp = server->nfs_client;
1580 unsigned int loop; 1611 unsigned int loop;
1581 int ret; 1612 int ret;
1582 1613
@@ -1587,12 +1618,17 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
1587 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && 1618 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1588 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) 1619 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
1589 break; 1620 break;
1590 nfs4_schedule_state_recovery(clp); 1621 nfs4_schedule_state_manager(clp);
1591 ret = -EIO; 1622 ret = -EIO;
1592 } 1623 }
1593 return ret; 1624 return ret;
1594} 1625}
1595 1626
1627static int nfs4_recover_expired_lease(struct nfs_server *server)
1628{
1629 return nfs4_client_recover_expired_lease(server->nfs_client);
1630}
1631
1596/* 1632/*
1597 * OPEN_EXPIRED: 1633 * OPEN_EXPIRED:
1598 * reclaim state on the server after a network partition. 1634 * reclaim state on the server after a network partition.
@@ -1811,7 +1847,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
1811 } else 1847 } else
1812 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); 1848 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
1813 1849
1814 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 1850 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
1815 if (status == 0 && state != NULL) 1851 if (status == 0 && state != NULL)
1816 renew_lease(server, timestamp); 1852 renew_lease(server, timestamp);
1817 return status; 1853 return status;
@@ -2070,7 +2106,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
2070 }; 2106 };
2071 int status; 2107 int status;
2072 2108
2073 status = nfs4_call_sync(server, &msg, &args, &res, 0); 2109 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2074 if (status == 0) { 2110 if (status == 0) {
2075 memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); 2111 memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
2076 server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| 2112 server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
@@ -2140,7 +2176,7 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
2140 }; 2176 };
2141 2177
2142 nfs_fattr_init(info->fattr); 2178 nfs_fattr_init(info->fattr);
2143 return nfs4_call_sync(server, &msg, &args, &res, 0); 2179 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2144} 2180}
2145 2181
2146static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, 2182static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -2156,15 +2192,41 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
2156 return err; 2192 return err;
2157} 2193}
2158 2194
2195static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
2196 struct nfs_fsinfo *info, rpc_authflavor_t flavor)
2197{
2198 struct rpc_auth *auth;
2199 int ret;
2200
2201 auth = rpcauth_create(flavor, server->client);
2202 if (!auth) {
2203 ret = -EIO;
2204 goto out;
2205 }
2206 ret = nfs4_lookup_root(server, fhandle, info);
2207out:
2208 return ret;
2209}
2210
2159/* 2211/*
2160 * get the file handle for the "/" directory on the server 2212 * get the file handle for the "/" directory on the server
2161 */ 2213 */
2162static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, 2214static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
2163 struct nfs_fsinfo *info) 2215 struct nfs_fsinfo *info)
2164{ 2216{
2165 int status; 2217 int i, len, status = 0;
2218 rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS + 2];
2166 2219
2167 status = nfs4_lookup_root(server, fhandle, info); 2220 flav_array[0] = RPC_AUTH_UNIX;
2221 len = gss_mech_list_pseudoflavors(&flav_array[1]);
2222 flav_array[1+len] = RPC_AUTH_NULL;
2223 len += 2;
2224
2225 for (i = 0; i < len; i++) {
2226 status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]);
2227 if (status != -EPERM)
2228 break;
2229 }
2168 if (status == 0) 2230 if (status == 0)
2169 status = nfs4_server_capabilities(server, fhandle); 2231 status = nfs4_server_capabilities(server, fhandle);
2170 if (status == 0) 2232 if (status == 0)
@@ -2229,7 +2291,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
2229 }; 2291 };
2230 2292
2231 nfs_fattr_init(fattr); 2293 nfs_fattr_init(fattr);
2232 return nfs4_call_sync(server, &msg, &args, &res, 0); 2294 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2233} 2295}
2234 2296
2235static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) 2297static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
@@ -2289,9 +2351,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
2289 return status; 2351 return status;
2290} 2352}
2291 2353
2292static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *dirfh, 2354static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server,
2293 const struct qstr *name, struct nfs_fh *fhandle, 2355 const struct nfs_fh *dirfh, const struct qstr *name,
2294 struct nfs_fattr *fattr) 2356 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
2295{ 2357{
2296 int status; 2358 int status;
2297 struct nfs4_lookup_arg args = { 2359 struct nfs4_lookup_arg args = {
@@ -2313,7 +2375,7 @@ static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *d
2313 nfs_fattr_init(fattr); 2375 nfs_fattr_init(fattr);
2314 2376
2315 dprintk("NFS call lookupfh %s\n", name->name); 2377 dprintk("NFS call lookupfh %s\n", name->name);
2316 status = nfs4_call_sync(server, &msg, &args, &res, 0); 2378 status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0);
2317 dprintk("NFS reply lookupfh: %d\n", status); 2379 dprintk("NFS reply lookupfh: %d\n", status);
2318 return status; 2380 return status;
2319} 2381}
@@ -2325,7 +2387,7 @@ static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh,
2325 struct nfs4_exception exception = { }; 2387 struct nfs4_exception exception = { };
2326 int err; 2388 int err;
2327 do { 2389 do {
2328 err = _nfs4_proc_lookupfh(server, dirfh, name, fhandle, fattr); 2390 err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr);
2329 /* FIXME: !!!! */ 2391 /* FIXME: !!!! */
2330 if (err == -NFS4ERR_MOVED) { 2392 if (err == -NFS4ERR_MOVED) {
2331 err = -EREMOTE; 2393 err = -EREMOTE;
@@ -2336,27 +2398,41 @@ static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh,
2336 return err; 2398 return err;
2337} 2399}
2338 2400
2339static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, 2401static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
2340 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 2402 const struct qstr *name, struct nfs_fh *fhandle,
2403 struct nfs_fattr *fattr)
2341{ 2404{
2342 int status; 2405 int status;
2343 2406
2344 dprintk("NFS call lookup %s\n", name->name); 2407 dprintk("NFS call lookup %s\n", name->name);
2345 status = _nfs4_proc_lookupfh(NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); 2408 status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr);
2346 if (status == -NFS4ERR_MOVED) 2409 if (status == -NFS4ERR_MOVED)
2347 status = nfs4_get_referral(dir, name, fattr, fhandle); 2410 status = nfs4_get_referral(dir, name, fattr, fhandle);
2348 dprintk("NFS reply lookup: %d\n", status); 2411 dprintk("NFS reply lookup: %d\n", status);
2349 return status; 2412 return status;
2350} 2413}
2351 2414
2352static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) 2415void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr, struct nfs_fh *fh)
2416{
2417 memset(fh, 0, sizeof(struct nfs_fh));
2418 fattr->fsid.major = 1;
2419 fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
2420 NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_FSID | NFS_ATTR_FATTR_MOUNTPOINT;
2421 fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
2422 fattr->nlink = 2;
2423}
2424
2425static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name,
2426 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
2353{ 2427{
2354 struct nfs4_exception exception = { }; 2428 struct nfs4_exception exception = { };
2355 int err; 2429 int err;
2356 do { 2430 do {
2357 err = nfs4_handle_exception(NFS_SERVER(dir), 2431 err = nfs4_handle_exception(NFS_SERVER(dir),
2358 _nfs4_proc_lookup(dir, name, fhandle, fattr), 2432 _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr),
2359 &exception); 2433 &exception);
2434 if (err == -EPERM)
2435 nfs_fixup_secinfo_attributes(fattr, fhandle);
2360 } while (exception.retry); 2436 } while (exception.retry);
2361 return err; 2437 return err;
2362} 2438}
@@ -2401,7 +2477,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
2401 if (res.fattr == NULL) 2477 if (res.fattr == NULL)
2402 return -ENOMEM; 2478 return -ENOMEM;
2403 2479
2404 status = nfs4_call_sync(server, &msg, &args, &res, 0); 2480 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2405 if (!status) { 2481 if (!status) {
2406 entry->mask = 0; 2482 entry->mask = 0;
2407 if (res.access & NFS4_ACCESS_READ) 2483 if (res.access & NFS4_ACCESS_READ)
@@ -2468,7 +2544,7 @@ static int _nfs4_proc_readlink(struct inode *inode, struct page *page,
2468 .rpc_resp = &res, 2544 .rpc_resp = &res,
2469 }; 2545 };
2470 2546
2471 return nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0); 2547 return nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0);
2472} 2548}
2473 2549
2474static int nfs4_proc_readlink(struct inode *inode, struct page *page, 2550static int nfs4_proc_readlink(struct inode *inode, struct page *page,
@@ -2557,7 +2633,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
2557 if (res.dir_attr == NULL) 2633 if (res.dir_attr == NULL)
2558 goto out; 2634 goto out;
2559 2635
2560 status = nfs4_call_sync(server, &msg, &args, &res, 1); 2636 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
2561 if (status == 0) { 2637 if (status == 0) {
2562 update_changeattr(dir, &res.cinfo); 2638 update_changeattr(dir, &res.cinfo);
2563 nfs_post_op_update_inode(dir, res.dir_attr); 2639 nfs_post_op_update_inode(dir, res.dir_attr);
@@ -2658,7 +2734,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
2658 if (res.old_fattr == NULL || res.new_fattr == NULL) 2734 if (res.old_fattr == NULL || res.new_fattr == NULL)
2659 goto out; 2735 goto out;
2660 2736
2661 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 2737 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
2662 if (!status) { 2738 if (!status) {
2663 update_changeattr(old_dir, &res.old_cinfo); 2739 update_changeattr(old_dir, &res.old_cinfo);
2664 nfs_post_op_update_inode(old_dir, res.old_fattr); 2740 nfs_post_op_update_inode(old_dir, res.old_fattr);
@@ -2709,7 +2785,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
2709 if (res.fattr == NULL || res.dir_attr == NULL) 2785 if (res.fattr == NULL || res.dir_attr == NULL)
2710 goto out; 2786 goto out;
2711 2787
2712 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 2788 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
2713 if (!status) { 2789 if (!status) {
2714 update_changeattr(dir, &res.cinfo); 2790 update_changeattr(dir, &res.cinfo);
2715 nfs_post_op_update_inode(dir, res.dir_attr); 2791 nfs_post_op_update_inode(dir, res.dir_attr);
@@ -2772,8 +2848,8 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
2772 2848
2773static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) 2849static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
2774{ 2850{
2775 int status = nfs4_call_sync(NFS_SERVER(dir), &data->msg, 2851 int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg,
2776 &data->arg, &data->res, 1); 2852 &data->arg.seq_args, &data->res.seq_res, 1);
2777 if (status == 0) { 2853 if (status == 0) {
2778 update_changeattr(dir, &data->res.dir_cinfo); 2854 update_changeattr(dir, &data->res.dir_cinfo);
2779 nfs_post_op_update_inode(dir, data->res.dir_fattr); 2855 nfs_post_op_update_inode(dir, data->res.dir_fattr);
@@ -2885,7 +2961,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2885 (unsigned long long)cookie); 2961 (unsigned long long)cookie);
2886 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); 2962 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args);
2887 res.pgbase = args.pgbase; 2963 res.pgbase = args.pgbase;
2888 status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0); 2964 status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0);
2889 if (status >= 0) { 2965 if (status >= 0) {
2890 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); 2966 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
2891 status += args.pgbase; 2967 status += args.pgbase;
@@ -2977,7 +3053,7 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
2977 }; 3053 };
2978 3054
2979 nfs_fattr_init(fsstat->fattr); 3055 nfs_fattr_init(fsstat->fattr);
2980 return nfs4_call_sync(server, &msg, &args, &res, 0); 3056 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2981} 3057}
2982 3058
2983static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) 3059static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat)
@@ -3008,7 +3084,7 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
3008 .rpc_resp = &res, 3084 .rpc_resp = &res,
3009 }; 3085 };
3010 3086
3011 return nfs4_call_sync(server, &msg, &args, &res, 0); 3087 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
3012} 3088}
3013 3089
3014static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) 3090static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
@@ -3053,7 +3129,7 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle
3053 } 3129 }
3054 3130
3055 nfs_fattr_init(pathconf->fattr); 3131 nfs_fattr_init(pathconf->fattr);
3056 return nfs4_call_sync(server, &msg, &args, &res, 0); 3132 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
3057} 3133}
3058 3134
3059static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, 3135static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -3070,15 +3146,10 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
3070 return err; 3146 return err;
3071} 3147}
3072 3148
3073static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) 3149static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
3074{ 3150{
3075 struct nfs_server *server = NFS_SERVER(data->inode); 3151 struct nfs_server *server = NFS_SERVER(data->inode);
3076 3152
3077 dprintk("--> %s\n", __func__);
3078
3079 if (!nfs4_sequence_done(task, &data->res.seq_res))
3080 return -EAGAIN;
3081
3082 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { 3153 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
3083 nfs_restart_rpc(task, server->nfs_client); 3154 nfs_restart_rpc(task, server->nfs_client);
3084 return -EAGAIN; 3155 return -EAGAIN;
@@ -3090,19 +3161,44 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
3090 return 0; 3161 return 0;
3091} 3162}
3092 3163
3164static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
3165{
3166
3167 dprintk("--> %s\n", __func__);
3168
3169 if (!nfs4_sequence_done(task, &data->res.seq_res))
3170 return -EAGAIN;
3171
3172 return data->read_done_cb(task, data);
3173}
3174
3093static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) 3175static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
3094{ 3176{
3095 data->timestamp = jiffies; 3177 data->timestamp = jiffies;
3178 data->read_done_cb = nfs4_read_done_cb;
3096 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; 3179 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
3097} 3180}
3098 3181
3099static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) 3182/* Reset the the nfs_read_data to send the read to the MDS. */
3183void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
3184{
3185 dprintk("%s Reset task for i/o through\n", __func__);
3186 put_lseg(data->lseg);
3187 data->lseg = NULL;
3188 /* offsets will differ in the dense stripe case */
3189 data->args.offset = data->mds_offset;
3190 data->ds_clp = NULL;
3191 data->args.fh = NFS_FH(data->inode);
3192 data->read_done_cb = nfs4_read_done_cb;
3193 task->tk_ops = data->mds_ops;
3194 rpc_task_reset_client(task, NFS_CLIENT(data->inode));
3195}
3196EXPORT_SYMBOL_GPL(nfs4_reset_read);
3197
3198static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
3100{ 3199{
3101 struct inode *inode = data->inode; 3200 struct inode *inode = data->inode;
3102 3201
3103 if (!nfs4_sequence_done(task, &data->res.seq_res))
3104 return -EAGAIN;
3105
3106 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { 3202 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
3107 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); 3203 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
3108 return -EAGAIN; 3204 return -EAGAIN;
@@ -3114,23 +3210,50 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
3114 return 0; 3210 return 0;
3115} 3211}
3116 3212
3213static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
3214{
3215 if (!nfs4_sequence_done(task, &data->res.seq_res))
3216 return -EAGAIN;
3217 return data->write_done_cb(task, data);
3218}
3219
3220/* Reset the the nfs_write_data to send the write to the MDS. */
3221void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data)
3222{
3223 dprintk("%s Reset task for i/o through\n", __func__);
3224 put_lseg(data->lseg);
3225 data->lseg = NULL;
3226 data->ds_clp = NULL;
3227 data->write_done_cb = nfs4_write_done_cb;
3228 data->args.fh = NFS_FH(data->inode);
3229 data->args.bitmask = data->res.server->cache_consistency_bitmask;
3230 data->args.offset = data->mds_offset;
3231 data->res.fattr = &data->fattr;
3232 task->tk_ops = data->mds_ops;
3233 rpc_task_reset_client(task, NFS_CLIENT(data->inode));
3234}
3235EXPORT_SYMBOL_GPL(nfs4_reset_write);
3236
3117static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) 3237static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
3118{ 3238{
3119 struct nfs_server *server = NFS_SERVER(data->inode); 3239 struct nfs_server *server = NFS_SERVER(data->inode);
3120 3240
3121 data->args.bitmask = server->cache_consistency_bitmask; 3241 if (data->lseg) {
3242 data->args.bitmask = NULL;
3243 data->res.fattr = NULL;
3244 } else
3245 data->args.bitmask = server->cache_consistency_bitmask;
3246 if (!data->write_done_cb)
3247 data->write_done_cb = nfs4_write_done_cb;
3122 data->res.server = server; 3248 data->res.server = server;
3123 data->timestamp = jiffies; 3249 data->timestamp = jiffies;
3124 3250
3125 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; 3251 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
3126} 3252}
3127 3253
3128static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) 3254static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *data)
3129{ 3255{
3130 struct inode *inode = data->inode; 3256 struct inode *inode = data->inode;
3131
3132 if (!nfs4_sequence_done(task, &data->res.seq_res))
3133 return -EAGAIN;
3134 3257
3135 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { 3258 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
3136 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); 3259 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
@@ -3140,11 +3263,24 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
3140 return 0; 3263 return 0;
3141} 3264}
3142 3265
3266static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
3267{
3268 if (!nfs4_sequence_done(task, &data->res.seq_res))
3269 return -EAGAIN;
3270 return data->write_done_cb(task, data);
3271}
3272
3143static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) 3273static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
3144{ 3274{
3145 struct nfs_server *server = NFS_SERVER(data->inode); 3275 struct nfs_server *server = NFS_SERVER(data->inode);
3146 3276
3147 data->args.bitmask = server->cache_consistency_bitmask; 3277 if (data->lseg) {
3278 data->args.bitmask = NULL;
3279 data->res.fattr = NULL;
3280 } else
3281 data->args.bitmask = server->cache_consistency_bitmask;
3282 if (!data->write_done_cb)
3283 data->write_done_cb = nfs4_commit_done_cb;
3148 data->res.server = server; 3284 data->res.server = server;
3149 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; 3285 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
3150} 3286}
@@ -3178,7 +3314,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
3178 if (task->tk_status < 0) { 3314 if (task->tk_status < 0) {
3179 /* Unless we're shutting down, schedule state recovery! */ 3315 /* Unless we're shutting down, schedule state recovery! */
3180 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) 3316 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0)
3181 nfs4_schedule_state_recovery(clp); 3317 nfs4_schedule_lease_recovery(clp);
3182 return; 3318 return;
3183 } 3319 }
3184 do_renew_lease(clp, timestamp); 3320 do_renew_lease(clp, timestamp);
@@ -3252,6 +3388,35 @@ static void buf_to_pages(const void *buf, size_t buflen,
3252 } 3388 }
3253} 3389}
3254 3390
3391static int buf_to_pages_noslab(const void *buf, size_t buflen,
3392 struct page **pages, unsigned int *pgbase)
3393{
3394 struct page *newpage, **spages;
3395 int rc = 0;
3396 size_t len;
3397 spages = pages;
3398
3399 do {
3400 len = min_t(size_t, PAGE_CACHE_SIZE, buflen);
3401 newpage = alloc_page(GFP_KERNEL);
3402
3403 if (newpage == NULL)
3404 goto unwind;
3405 memcpy(page_address(newpage), buf, len);
3406 buf += len;
3407 buflen -= len;
3408 *pages++ = newpage;
3409 rc++;
3410 } while (buflen != 0);
3411
3412 return rc;
3413
3414unwind:
3415 for(; rc > 0; rc--)
3416 __free_page(spages[rc-1]);
3417 return -ENOMEM;
3418}
3419
3255struct nfs4_cached_acl { 3420struct nfs4_cached_acl {
3256 int cached; 3421 int cached;
3257 size_t len; 3422 size_t len;
@@ -3353,7 +3518,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
3353 resp_buf = buf; 3518 resp_buf = buf;
3354 buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); 3519 buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
3355 } 3520 }
3356 ret = nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0); 3521 ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0);
3357 if (ret) 3522 if (ret)
3358 goto out_free; 3523 goto out_free;
3359 if (res.acl_len > args.acl_len) 3524 if (res.acl_len > args.acl_len)
@@ -3420,13 +3585,23 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
3420 .rpc_argp = &arg, 3585 .rpc_argp = &arg,
3421 .rpc_resp = &res, 3586 .rpc_resp = &res,
3422 }; 3587 };
3423 int ret; 3588 int ret, i;
3424 3589
3425 if (!nfs4_server_supports_acls(server)) 3590 if (!nfs4_server_supports_acls(server))
3426 return -EOPNOTSUPP; 3591 return -EOPNOTSUPP;
3592 i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
3593 if (i < 0)
3594 return i;
3427 nfs_inode_return_delegation(inode); 3595 nfs_inode_return_delegation(inode);
3428 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); 3596 ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
3429 ret = nfs4_call_sync(server, &msg, &arg, &res, 1); 3597
3598 /*
3599 * Free each page after tx, so the only ref left is
3600 * held by the network stack
3601 */
3602 for (; i > 0; i--)
3603 put_page(pages[i-1]);
3604
3430 /* 3605 /*
3431 * Acl update can result in inode attribute update. 3606 * Acl update can result in inode attribute update.
3432 * so mark the attribute cache invalid. 3607 * so mark the attribute cache invalid.
@@ -3464,12 +3639,13 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3464 case -NFS4ERR_OPENMODE: 3639 case -NFS4ERR_OPENMODE:
3465 if (state == NULL) 3640 if (state == NULL)
3466 break; 3641 break;
3467 nfs4_state_mark_reclaim_nograce(clp, state); 3642 nfs4_schedule_stateid_recovery(server, state);
3468 goto do_state_recovery; 3643 goto wait_on_recovery;
3469 case -NFS4ERR_STALE_STATEID: 3644 case -NFS4ERR_STALE_STATEID:
3470 case -NFS4ERR_STALE_CLIENTID: 3645 case -NFS4ERR_STALE_CLIENTID:
3471 case -NFS4ERR_EXPIRED: 3646 case -NFS4ERR_EXPIRED:
3472 goto do_state_recovery; 3647 nfs4_schedule_lease_recovery(clp);
3648 goto wait_on_recovery;
3473#if defined(CONFIG_NFS_V4_1) 3649#if defined(CONFIG_NFS_V4_1)
3474 case -NFS4ERR_BADSESSION: 3650 case -NFS4ERR_BADSESSION:
3475 case -NFS4ERR_BADSLOT: 3651 case -NFS4ERR_BADSLOT:
@@ -3480,7 +3656,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3480 case -NFS4ERR_SEQ_MISORDERED: 3656 case -NFS4ERR_SEQ_MISORDERED:
3481 dprintk("%s ERROR %d, Reset session\n", __func__, 3657 dprintk("%s ERROR %d, Reset session\n", __func__,
3482 task->tk_status); 3658 task->tk_status);
3483 nfs4_schedule_state_recovery(clp); 3659 nfs4_schedule_session_recovery(clp->cl_session);
3484 task->tk_status = 0; 3660 task->tk_status = 0;
3485 return -EAGAIN; 3661 return -EAGAIN;
3486#endif /* CONFIG_NFS_V4_1 */ 3662#endif /* CONFIG_NFS_V4_1 */
@@ -3497,9 +3673,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3497 } 3673 }
3498 task->tk_status = nfs4_map_errors(task->tk_status); 3674 task->tk_status = nfs4_map_errors(task->tk_status);
3499 return 0; 3675 return 0;
3500do_state_recovery: 3676wait_on_recovery:
3501 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); 3677 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
3502 nfs4_schedule_state_recovery(clp);
3503 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) 3678 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
3504 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); 3679 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
3505 task->tk_status = 0; 3680 task->tk_status = 0;
@@ -3781,7 +3956,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
3781 lsp = request->fl_u.nfs4_fl.owner; 3956 lsp = request->fl_u.nfs4_fl.owner;
3782 arg.lock_owner.id = lsp->ls_id.id; 3957 arg.lock_owner.id = lsp->ls_id.id;
3783 arg.lock_owner.s_dev = server->s_dev; 3958 arg.lock_owner.s_dev = server->s_dev;
3784 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 3959 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
3785 switch (status) { 3960 switch (status) {
3786 case 0: 3961 case 0:
3787 request->fl_type = F_UNLCK; 3962 request->fl_type = F_UNLCK;
@@ -4110,7 +4285,7 @@ static void nfs4_lock_release(void *calldata)
4110 task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, 4285 task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,
4111 data->arg.lock_seqid); 4286 data->arg.lock_seqid);
4112 if (!IS_ERR(task)) 4287 if (!IS_ERR(task))
4113 rpc_put_task(task); 4288 rpc_put_task_async(task);
4114 dprintk("%s: cancelling lock!\n", __func__); 4289 dprintk("%s: cancelling lock!\n", __func__);
4115 } else 4290 } else
4116 nfs_free_seqid(data->arg.lock_seqid); 4291 nfs_free_seqid(data->arg.lock_seqid);
@@ -4134,23 +4309,18 @@ static const struct rpc_call_ops nfs4_recover_lock_ops = {
4134 4309
4135static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) 4310static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
4136{ 4311{
4137 struct nfs_client *clp = server->nfs_client;
4138 struct nfs4_state *state = lsp->ls_state;
4139
4140 switch (error) { 4312 switch (error) {
4141 case -NFS4ERR_ADMIN_REVOKED: 4313 case -NFS4ERR_ADMIN_REVOKED:
4142 case -NFS4ERR_BAD_STATEID: 4314 case -NFS4ERR_BAD_STATEID:
4143 case -NFS4ERR_EXPIRED: 4315 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4144 if (new_lock_owner != 0 || 4316 if (new_lock_owner != 0 ||
4145 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) 4317 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
4146 nfs4_state_mark_reclaim_nograce(clp, state); 4318 nfs4_schedule_stateid_recovery(server, lsp->ls_state);
4147 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4148 break; 4319 break;
4149 case -NFS4ERR_STALE_STATEID: 4320 case -NFS4ERR_STALE_STATEID:
4150 if (new_lock_owner != 0 ||
4151 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
4152 nfs4_state_mark_reclaim_reboot(clp, state);
4153 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; 4321 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4322 case -NFS4ERR_EXPIRED:
4323 nfs4_schedule_lease_recovery(server->nfs_client);
4154 }; 4324 };
4155} 4325}
4156 4326
@@ -4366,12 +4536,14 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4366 case -NFS4ERR_EXPIRED: 4536 case -NFS4ERR_EXPIRED:
4367 case -NFS4ERR_STALE_CLIENTID: 4537 case -NFS4ERR_STALE_CLIENTID:
4368 case -NFS4ERR_STALE_STATEID: 4538 case -NFS4ERR_STALE_STATEID:
4539 nfs4_schedule_lease_recovery(server->nfs_client);
4540 goto out;
4369 case -NFS4ERR_BADSESSION: 4541 case -NFS4ERR_BADSESSION:
4370 case -NFS4ERR_BADSLOT: 4542 case -NFS4ERR_BADSLOT:
4371 case -NFS4ERR_BAD_HIGH_SLOT: 4543 case -NFS4ERR_BAD_HIGH_SLOT:
4372 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 4544 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
4373 case -NFS4ERR_DEADSESSION: 4545 case -NFS4ERR_DEADSESSION:
4374 nfs4_schedule_state_recovery(server->nfs_client); 4546 nfs4_schedule_session_recovery(server->nfs_client->cl_session);
4375 goto out; 4547 goto out;
4376 case -ERESTARTSYS: 4548 case -ERESTARTSYS:
4377 /* 4549 /*
@@ -4381,7 +4553,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4381 case -NFS4ERR_ADMIN_REVOKED: 4553 case -NFS4ERR_ADMIN_REVOKED:
4382 case -NFS4ERR_BAD_STATEID: 4554 case -NFS4ERR_BAD_STATEID:
4383 case -NFS4ERR_OPENMODE: 4555 case -NFS4ERR_OPENMODE:
4384 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 4556 nfs4_schedule_stateid_recovery(server, state);
4385 err = 0; 4557 err = 0;
4386 goto out; 4558 goto out;
4387 case -EKEYEXPIRED: 4559 case -EKEYEXPIRED:
@@ -4512,12 +4684,46 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
4512 nfs_fattr_init(&fs_locations->fattr); 4684 nfs_fattr_init(&fs_locations->fattr);
4513 fs_locations->server = server; 4685 fs_locations->server = server;
4514 fs_locations->nlocations = 0; 4686 fs_locations->nlocations = 0;
4515 status = nfs4_call_sync(server, &msg, &args, &res, 0); 4687 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
4516 nfs_fixup_referral_attributes(&fs_locations->fattr); 4688 nfs_fixup_referral_attributes(&fs_locations->fattr);
4517 dprintk("%s: returned status = %d\n", __func__, status); 4689 dprintk("%s: returned status = %d\n", __func__, status);
4518 return status; 4690 return status;
4519} 4691}
4520 4692
4693static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors)
4694{
4695 int status;
4696 struct nfs4_secinfo_arg args = {
4697 .dir_fh = NFS_FH(dir),
4698 .name = name,
4699 };
4700 struct nfs4_secinfo_res res = {
4701 .flavors = flavors,
4702 };
4703 struct rpc_message msg = {
4704 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO],
4705 .rpc_argp = &args,
4706 .rpc_resp = &res,
4707 };
4708
4709 dprintk("NFS call secinfo %s\n", name->name);
4710 status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0);
4711 dprintk("NFS reply secinfo: %d\n", status);
4712 return status;
4713}
4714
4715int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors)
4716{
4717 struct nfs4_exception exception = { };
4718 int err;
4719 do {
4720 err = nfs4_handle_exception(NFS_SERVER(dir),
4721 _nfs4_proc_secinfo(dir, name, flavors),
4722 &exception);
4723 } while (exception.retry);
4724 return err;
4725}
4726
4521#ifdef CONFIG_NFS_V4_1 4727#ifdef CONFIG_NFS_V4_1
4522/* 4728/*
4523 * Check the exchange flags returned by the server for invalid flags, having 4729 * Check the exchange flags returned by the server for invalid flags, having
@@ -4988,10 +5194,20 @@ int nfs4_proc_create_session(struct nfs_client *clp)
4988 int status; 5194 int status;
4989 unsigned *ptr; 5195 unsigned *ptr;
4990 struct nfs4_session *session = clp->cl_session; 5196 struct nfs4_session *session = clp->cl_session;
5197 long timeout = 0;
5198 int err;
4991 5199
4992 dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); 5200 dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
4993 5201
4994 status = _nfs4_proc_create_session(clp); 5202 do {
5203 status = _nfs4_proc_create_session(clp);
5204 if (status == -NFS4ERR_DELAY) {
5205 err = nfs4_delay(clp->cl_rpcclient, &timeout);
5206 if (err)
5207 status = err;
5208 }
5209 } while (status == -NFS4ERR_DELAY);
5210
4995 if (status) 5211 if (status)
4996 goto out; 5212 goto out;
4997 5213
@@ -5073,6 +5289,27 @@ int nfs4_init_session(struct nfs_server *server)
5073 return ret; 5289 return ret;
5074} 5290}
5075 5291
5292int nfs4_init_ds_session(struct nfs_client *clp)
5293{
5294 struct nfs4_session *session = clp->cl_session;
5295 int ret;
5296
5297 if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
5298 return 0;
5299
5300 ret = nfs4_client_recover_expired_lease(clp);
5301 if (!ret)
5302 /* Test for the DS role */
5303 if (!is_ds_client(clp))
5304 ret = -ENODEV;
5305 if (!ret)
5306 ret = nfs4_check_client_ready(clp);
5307 return ret;
5308
5309}
5310EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
5311
5312
5076/* 5313/*
5077 * Renew the cl_session lease. 5314 * Renew the cl_session lease.
5078 */ 5315 */
@@ -5100,7 +5337,7 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client
5100 rpc_delay(task, NFS4_POLL_RETRY_MAX); 5337 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5101 return -EAGAIN; 5338 return -EAGAIN;
5102 default: 5339 default:
5103 nfs4_schedule_state_recovery(clp); 5340 nfs4_schedule_lease_recovery(clp);
5104 } 5341 }
5105 return 0; 5342 return 0;
5106} 5343}
@@ -5187,7 +5424,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
5187 if (IS_ERR(task)) 5424 if (IS_ERR(task))
5188 ret = PTR_ERR(task); 5425 ret = PTR_ERR(task);
5189 else 5426 else
5190 rpc_put_task(task); 5427 rpc_put_task_async(task);
5191 dprintk("<-- %s status=%d\n", __func__, ret); 5428 dprintk("<-- %s status=%d\n", __func__, ret);
5192 return ret; 5429 return ret;
5193} 5430}
@@ -5203,8 +5440,13 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
5203 goto out; 5440 goto out;
5204 } 5441 }
5205 ret = rpc_wait_for_completion_task(task); 5442 ret = rpc_wait_for_completion_task(task);
5206 if (!ret) 5443 if (!ret) {
5444 struct nfs4_sequence_res *res = task->tk_msg.rpc_resp;
5445
5446 if (task->tk_status == 0)
5447 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
5207 ret = task->tk_status; 5448 ret = task->tk_status;
5449 }
5208 rpc_put_task(task); 5450 rpc_put_task(task);
5209out: 5451out:
5210 dprintk("<-- %s status=%d\n", __func__, ret); 5452 dprintk("<-- %s status=%d\n", __func__, ret);
@@ -5241,7 +5483,7 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
5241 rpc_delay(task, NFS4_POLL_RETRY_MAX); 5483 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5242 return -EAGAIN; 5484 return -EAGAIN;
5243 default: 5485 default:
5244 nfs4_schedule_state_recovery(clp); 5486 nfs4_schedule_lease_recovery(clp);
5245 } 5487 }
5246 return 0; 5488 return 0;
5247} 5489}
@@ -5309,6 +5551,9 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5309 status = PTR_ERR(task); 5551 status = PTR_ERR(task);
5310 goto out; 5552 goto out;
5311 } 5553 }
5554 status = nfs4_wait_for_completion_rpc_task(task);
5555 if (status == 0)
5556 status = task->tk_status;
5312 rpc_put_task(task); 5557 rpc_put_task(task);
5313 return 0; 5558 return 0;
5314out: 5559out:
@@ -5371,8 +5616,6 @@ static void nfs4_layoutget_release(void *calldata)
5371 struct nfs4_layoutget *lgp = calldata; 5616 struct nfs4_layoutget *lgp = calldata;
5372 5617
5373 dprintk("--> %s\n", __func__); 5618 dprintk("--> %s\n", __func__);
5374 if (lgp->res.layout.buf != NULL)
5375 free_page((unsigned long) lgp->res.layout.buf);
5376 put_nfs_open_context(lgp->args.ctx); 5619 put_nfs_open_context(lgp->args.ctx);
5377 kfree(calldata); 5620 kfree(calldata);
5378 dprintk("<-- %s\n", __func__); 5621 dprintk("<-- %s\n", __func__);
@@ -5404,12 +5647,7 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
5404 5647
5405 dprintk("--> %s\n", __func__); 5648 dprintk("--> %s\n", __func__);
5406 5649
5407 lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); 5650 lgp->res.layoutp = &lgp->args.layout;
5408 if (lgp->res.layout.buf == NULL) {
5409 nfs4_layoutget_release(lgp);
5410 return -ENOMEM;
5411 }
5412
5413 lgp->res.seq_res.sr_slot = NULL; 5651 lgp->res.seq_res.sr_slot = NULL;
5414 task = rpc_run_task(&task_setup_data); 5652 task = rpc_run_task(&task_setup_data);
5415 if (IS_ERR(task)) 5653 if (IS_ERR(task))
@@ -5441,7 +5679,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5441 int status; 5679 int status;
5442 5680
5443 dprintk("--> %s\n", __func__); 5681 dprintk("--> %s\n", __func__);
5444 status = nfs4_call_sync(server, &msg, &args, &res, 0); 5682 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
5445 dprintk("<-- %s status=%d\n", __func__, status); 5683 dprintk("<-- %s status=%d\n", __func__, status);
5446 5684
5447 return status; 5685 return status;
@@ -5461,6 +5699,100 @@ int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5461} 5699}
5462EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo); 5700EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
5463 5701
5702static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata)
5703{
5704 struct nfs4_layoutcommit_data *data = calldata;
5705 struct nfs_server *server = NFS_SERVER(data->args.inode);
5706
5707 if (nfs4_setup_sequence(server, &data->args.seq_args,
5708 &data->res.seq_res, 1, task))
5709 return;
5710 rpc_call_start(task);
5711}
5712
5713static void
5714nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
5715{
5716 struct nfs4_layoutcommit_data *data = calldata;
5717 struct nfs_server *server = NFS_SERVER(data->args.inode);
5718
5719 if (!nfs4_sequence_done(task, &data->res.seq_res))
5720 return;
5721
5722 switch (task->tk_status) { /* Just ignore these failures */
5723 case NFS4ERR_DELEG_REVOKED: /* layout was recalled */
5724 case NFS4ERR_BADIOMODE: /* no IOMODE_RW layout for range */
5725 case NFS4ERR_BADLAYOUT: /* no layout */
5726 case NFS4ERR_GRACE: /* loca_recalim always false */
5727 task->tk_status = 0;
5728 }
5729
5730 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
5731 nfs_restart_rpc(task, server->nfs_client);
5732 return;
5733 }
5734
5735 if (task->tk_status == 0)
5736 nfs_post_op_update_inode_force_wcc(data->args.inode,
5737 data->res.fattr);
5738}
5739
5740static void nfs4_layoutcommit_release(void *calldata)
5741{
5742 struct nfs4_layoutcommit_data *data = calldata;
5743
5744 /* Matched by references in pnfs_set_layoutcommit */
5745 put_lseg(data->lseg);
5746 put_rpccred(data->cred);
5747 kfree(data);
5748}
5749
5750static const struct rpc_call_ops nfs4_layoutcommit_ops = {
5751 .rpc_call_prepare = nfs4_layoutcommit_prepare,
5752 .rpc_call_done = nfs4_layoutcommit_done,
5753 .rpc_release = nfs4_layoutcommit_release,
5754};
5755
5756int
5757nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
5758{
5759 struct rpc_message msg = {
5760 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT],
5761 .rpc_argp = &data->args,
5762 .rpc_resp = &data->res,
5763 .rpc_cred = data->cred,
5764 };
5765 struct rpc_task_setup task_setup_data = {
5766 .task = &data->task,
5767 .rpc_client = NFS_CLIENT(data->args.inode),
5768 .rpc_message = &msg,
5769 .callback_ops = &nfs4_layoutcommit_ops,
5770 .callback_data = data,
5771 .flags = RPC_TASK_ASYNC,
5772 };
5773 struct rpc_task *task;
5774 int status = 0;
5775
5776 dprintk("NFS: %4d initiating layoutcommit call. sync %d "
5777 "lbw: %llu inode %lu\n",
5778 data->task.tk_pid, sync,
5779 data->args.lastbytewritten,
5780 data->args.inode->i_ino);
5781
5782 task = rpc_run_task(&task_setup_data);
5783 if (IS_ERR(task))
5784 return PTR_ERR(task);
5785 if (sync == false)
5786 goto out;
5787 status = nfs4_wait_for_completion_rpc_task(task);
5788 if (status != 0)
5789 goto out;
5790 status = task->tk_status;
5791out:
5792 dprintk("%s: status %d\n", __func__, status);
5793 rpc_put_task(task);
5794 return status;
5795}
5464#endif /* CONFIG_NFS_V4_1 */ 5796#endif /* CONFIG_NFS_V4_1 */
5465 5797
5466struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { 5798struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
@@ -5595,6 +5927,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
5595 .clear_acl_cache = nfs4_zap_acl_attr, 5927 .clear_acl_cache = nfs4_zap_acl_attr,
5596 .close_context = nfs4_close_context, 5928 .close_context = nfs4_close_context,
5597 .open_context = nfs4_atomic_open, 5929 .open_context = nfs4_atomic_open,
5930 .init_client = nfs4_init_client,
5931 .secinfo = nfs4_proc_secinfo,
5598}; 5932};
5599 5933
5600static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { 5934static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 402143d75fc5..df8e7f3ca56d 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -64,12 +64,8 @@ nfs4_renew_state(struct work_struct *work)
64 ops = clp->cl_mvops->state_renewal_ops; 64 ops = clp->cl_mvops->state_renewal_ops;
65 dprintk("%s: start\n", __func__); 65 dprintk("%s: start\n", __func__);
66 66
67 rcu_read_lock(); 67 if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state))
68 if (list_empty(&clp->cl_superblocks)) {
69 rcu_read_unlock();
70 goto out; 68 goto out;
71 }
72 rcu_read_unlock();
73 69
74 spin_lock(&clp->cl_lock); 70 spin_lock(&clp->cl_lock);
75 lease = clp->cl_lease_time; 71 lease = clp->cl_lease_time;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e6742b57a04c..a6804f704d9d 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -153,6 +153,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
153 int status; 153 int status;
154 struct nfs_fsinfo fsinfo; 154 struct nfs_fsinfo fsinfo;
155 155
156 if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
157 nfs4_schedule_state_renewal(clp);
158 return 0;
159 }
160
156 status = nfs4_proc_get_lease_time(clp, &fsinfo); 161 status = nfs4_proc_get_lease_time(clp, &fsinfo);
157 if (status == 0) { 162 if (status == 0) {
158 /* Update lease time and schedule renewal */ 163 /* Update lease time and schedule renewal */
@@ -585,7 +590,8 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner)
585 state->owner = owner; 590 state->owner = owner;
586 atomic_inc(&owner->so_count); 591 atomic_inc(&owner->so_count);
587 list_add(&state->inode_states, &nfsi->open_states); 592 list_add(&state->inode_states, &nfsi->open_states);
588 state->inode = igrab(inode); 593 ihold(inode);
594 state->inode = inode;
589 spin_unlock(&inode->i_lock); 595 spin_unlock(&inode->i_lock);
590 /* Note: The reclaim code dictates that we add stateless 596 /* Note: The reclaim code dictates that we add stateless
591 * and read-only stateids to the end of the list */ 597 * and read-only stateids to the end of the list */
@@ -1007,9 +1013,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
1007} 1013}
1008 1014
1009/* 1015/*
1010 * Schedule a state recovery attempt 1016 * Schedule a lease recovery attempt
1011 */ 1017 */
1012void nfs4_schedule_state_recovery(struct nfs_client *clp) 1018void nfs4_schedule_lease_recovery(struct nfs_client *clp)
1013{ 1019{
1014 if (!clp) 1020 if (!clp)
1015 return; 1021 return;
@@ -1018,7 +1024,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp)
1018 nfs4_schedule_state_manager(clp); 1024 nfs4_schedule_state_manager(clp);
1019} 1025}
1020 1026
1021int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) 1027static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
1022{ 1028{
1023 1029
1024 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); 1030 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -1032,7 +1038,7 @@ int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *st
1032 return 1; 1038 return 1;
1033} 1039}
1034 1040
1035int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) 1041static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
1036{ 1042{
1037 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); 1043 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
1038 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); 1044 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -1041,6 +1047,14 @@ int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *s
1041 return 1; 1047 return 1;
1042} 1048}
1043 1049
1050void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state)
1051{
1052 struct nfs_client *clp = server->nfs_client;
1053
1054 nfs4_state_mark_reclaim_nograce(clp, state);
1055 nfs4_schedule_state_manager(clp);
1056}
1057
1044static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) 1058static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
1045{ 1059{
1046 struct inode *inode = state->inode; 1060 struct inode *inode = state->inode;
@@ -1436,10 +1450,16 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
1436} 1450}
1437 1451
1438#ifdef CONFIG_NFS_V4_1 1452#ifdef CONFIG_NFS_V4_1
1453void nfs4_schedule_session_recovery(struct nfs4_session *session)
1454{
1455 nfs4_schedule_lease_recovery(session->clp);
1456}
1457EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
1458
1439void nfs41_handle_recall_slot(struct nfs_client *clp) 1459void nfs41_handle_recall_slot(struct nfs_client *clp)
1440{ 1460{
1441 set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); 1461 set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
1442 nfs4_schedule_state_recovery(clp); 1462 nfs4_schedule_state_manager(clp);
1443} 1463}
1444 1464
1445static void nfs4_reset_all_state(struct nfs_client *clp) 1465static void nfs4_reset_all_state(struct nfs_client *clp)
@@ -1447,7 +1467,7 @@ static void nfs4_reset_all_state(struct nfs_client *clp)
1447 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { 1467 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1448 clp->cl_boot_time = CURRENT_TIME; 1468 clp->cl_boot_time = CURRENT_TIME;
1449 nfs4_state_start_reclaim_nograce(clp); 1469 nfs4_state_start_reclaim_nograce(clp);
1450 nfs4_schedule_state_recovery(clp); 1470 nfs4_schedule_state_manager(clp);
1451 } 1471 }
1452} 1472}
1453 1473
@@ -1455,7 +1475,7 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp)
1455{ 1475{
1456 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { 1476 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1457 nfs4_state_start_reclaim_reboot(clp); 1477 nfs4_state_start_reclaim_reboot(clp);
1458 nfs4_schedule_state_recovery(clp); 1478 nfs4_schedule_state_manager(clp);
1459 } 1479 }
1460} 1480}
1461 1481
@@ -1475,7 +1495,7 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp)
1475{ 1495{
1476 nfs_expire_all_delegations(clp); 1496 nfs_expire_all_delegations(clp);
1477 if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) 1497 if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
1478 nfs4_schedule_state_recovery(clp); 1498 nfs4_schedule_state_manager(clp);
1479} 1499}
1480 1500
1481void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) 1501void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4e2c168b6ee9..dddfb5795d7b 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -46,6 +46,7 @@
46#include <linux/kdev_t.h> 46#include <linux/kdev_t.h>
47#include <linux/sunrpc/clnt.h> 47#include <linux/sunrpc/clnt.h>
48#include <linux/sunrpc/msg_prot.h> 48#include <linux/sunrpc/msg_prot.h>
49#include <linux/sunrpc/gss_api.h>
49#include <linux/nfs.h> 50#include <linux/nfs.h>
50#include <linux/nfs4.h> 51#include <linux/nfs4.h>
51#include <linux/nfs_fs.h> 52#include <linux/nfs_fs.h>
@@ -112,7 +113,7 @@ static int nfs4_stat_to_errno(int);
112#define encode_restorefh_maxsz (op_encode_hdr_maxsz) 113#define encode_restorefh_maxsz (op_encode_hdr_maxsz)
113#define decode_restorefh_maxsz (op_decode_hdr_maxsz) 114#define decode_restorefh_maxsz (op_decode_hdr_maxsz)
114#define encode_fsinfo_maxsz (encode_getattr_maxsz) 115#define encode_fsinfo_maxsz (encode_getattr_maxsz)
115#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) 116#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15)
116#define encode_renew_maxsz (op_encode_hdr_maxsz + 3) 117#define encode_renew_maxsz (op_encode_hdr_maxsz + 3)
117#define decode_renew_maxsz (op_decode_hdr_maxsz) 118#define decode_renew_maxsz (op_decode_hdr_maxsz)
118#define encode_setclientid_maxsz \ 119#define encode_setclientid_maxsz \
@@ -253,6 +254,8 @@ static int nfs4_stat_to_errno(int);
253 (encode_getattr_maxsz) 254 (encode_getattr_maxsz)
254#define decode_fs_locations_maxsz \ 255#define decode_fs_locations_maxsz \
255 (0) 256 (0)
257#define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz)
258#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 4 + (NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)))
256 259
257#if defined(CONFIG_NFS_V4_1) 260#if defined(CONFIG_NFS_V4_1)
258#define NFS4_MAX_MACHINE_NAME_LEN (64) 261#define NFS4_MAX_MACHINE_NAME_LEN (64)
@@ -324,6 +327,18 @@ static int nfs4_stat_to_errno(int);
324#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ 327#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
325 decode_stateid_maxsz + \ 328 decode_stateid_maxsz + \
326 XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) 329 XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
330#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \
331 2 /* offset */ + \
332 2 /* length */ + \
333 1 /* reclaim */ + \
334 encode_stateid_maxsz + \
335 1 /* new offset (true) */ + \
336 2 /* last byte written */ + \
337 1 /* nt_timechanged (false) */ + \
338 1 /* layoutupdate4 layout type */ + \
339 1 /* NULL filelayout layoutupdate4 payload */)
340#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
341
327#else /* CONFIG_NFS_V4_1 */ 342#else /* CONFIG_NFS_V4_1 */
328#define encode_sequence_maxsz 0 343#define encode_sequence_maxsz 0
329#define decode_sequence_maxsz 0 344#define decode_sequence_maxsz 0
@@ -676,6 +691,14 @@ static int nfs4_stat_to_errno(int);
676 decode_putfh_maxsz + \ 691 decode_putfh_maxsz + \
677 decode_lookup_maxsz + \ 692 decode_lookup_maxsz + \
678 decode_fs_locations_maxsz) 693 decode_fs_locations_maxsz)
694#define NFS4_enc_secinfo_sz (compound_encode_hdr_maxsz + \
695 encode_sequence_maxsz + \
696 encode_putfh_maxsz + \
697 encode_secinfo_maxsz)
698#define NFS4_dec_secinfo_sz (compound_decode_hdr_maxsz + \
699 decode_sequence_maxsz + \
700 decode_putfh_maxsz + \
701 decode_secinfo_maxsz)
679#if defined(CONFIG_NFS_V4_1) 702#if defined(CONFIG_NFS_V4_1)
680#define NFS4_enc_exchange_id_sz \ 703#define NFS4_enc_exchange_id_sz \
681 (compound_encode_hdr_maxsz + \ 704 (compound_encode_hdr_maxsz + \
@@ -727,6 +750,17 @@ static int nfs4_stat_to_errno(int);
727 decode_sequence_maxsz + \ 750 decode_sequence_maxsz + \
728 decode_putfh_maxsz + \ 751 decode_putfh_maxsz + \
729 decode_layoutget_maxsz) 752 decode_layoutget_maxsz)
753#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \
754 encode_sequence_maxsz +\
755 encode_putfh_maxsz + \
756 encode_layoutcommit_maxsz + \
757 encode_getattr_maxsz)
758#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \
759 decode_sequence_maxsz + \
760 decode_putfh_maxsz + \
761 decode_layoutcommit_maxsz + \
762 decode_getattr_maxsz)
763
730 764
731const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + 765const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
732 compound_encode_hdr_maxsz + 766 compound_encode_hdr_maxsz +
@@ -844,7 +878,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
844 if (iap->ia_valid & ATTR_MODE) 878 if (iap->ia_valid & ATTR_MODE)
845 len += 4; 879 len += 4;
846 if (iap->ia_valid & ATTR_UID) { 880 if (iap->ia_valid & ATTR_UID) {
847 owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ); 881 owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
848 if (owner_namelen < 0) { 882 if (owner_namelen < 0) {
849 dprintk("nfs: couldn't resolve uid %d to string\n", 883 dprintk("nfs: couldn't resolve uid %d to string\n",
850 iap->ia_uid); 884 iap->ia_uid);
@@ -856,7 +890,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
856 len += 4 + (XDR_QUADLEN(owner_namelen) << 2); 890 len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
857 } 891 }
858 if (iap->ia_valid & ATTR_GID) { 892 if (iap->ia_valid & ATTR_GID) {
859 owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ); 893 owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);
860 if (owner_grouplen < 0) { 894 if (owner_grouplen < 0) {
861 dprintk("nfs: couldn't resolve gid %d to string\n", 895 dprintk("nfs: couldn't resolve gid %d to string\n",
862 iap->ia_gid); 896 iap->ia_gid);
@@ -1384,7 +1418,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1384 hdr->replen += decode_putrootfh_maxsz; 1418 hdr->replen += decode_putrootfh_maxsz;
1385} 1419}
1386 1420
1387static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) 1421static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid)
1388{ 1422{
1389 nfs4_stateid stateid; 1423 nfs4_stateid stateid;
1390 __be32 *p; 1424 __be32 *p;
@@ -1392,6 +1426,8 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
1392 p = reserve_space(xdr, NFS4_STATEID_SIZE); 1426 p = reserve_space(xdr, NFS4_STATEID_SIZE);
1393 if (ctx->state != NULL) { 1427 if (ctx->state != NULL) {
1394 nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); 1428 nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
1429 if (zero_seqid)
1430 stateid.stateid.seqid = 0;
1395 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); 1431 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
1396 } else 1432 } else
1397 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); 1433 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
@@ -1404,7 +1440,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
1404 p = reserve_space(xdr, 4); 1440 p = reserve_space(xdr, 4);
1405 *p = cpu_to_be32(OP_READ); 1441 *p = cpu_to_be32(OP_READ);
1406 1442
1407 encode_stateid(xdr, args->context, args->lock_context); 1443 encode_stateid(xdr, args->context, args->lock_context,
1444 hdr->minorversion);
1408 1445
1409 p = reserve_space(xdr, 12); 1446 p = reserve_space(xdr, 12);
1410 p = xdr_encode_hyper(p, args->offset); 1447 p = xdr_encode_hyper(p, args->offset);
@@ -1592,7 +1629,8 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
1592 p = reserve_space(xdr, 4); 1629 p = reserve_space(xdr, 4);
1593 *p = cpu_to_be32(OP_WRITE); 1630 *p = cpu_to_be32(OP_WRITE);
1594 1631
1595 encode_stateid(xdr, args->context, args->lock_context); 1632 encode_stateid(xdr, args->context, args->lock_context,
1633 hdr->minorversion);
1596 1634
1597 p = reserve_space(xdr, 16); 1635 p = reserve_space(xdr, 16);
1598 p = xdr_encode_hyper(p, args->offset); 1636 p = xdr_encode_hyper(p, args->offset);
@@ -1616,6 +1654,18 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
1616 hdr->replen += decode_delegreturn_maxsz; 1654 hdr->replen += decode_delegreturn_maxsz;
1617} 1655}
1618 1656
1657static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1658{
1659 int len = name->len;
1660 __be32 *p;
1661
1662 p = reserve_space(xdr, 8 + len);
1663 *p++ = cpu_to_be32(OP_SECINFO);
1664 xdr_encode_opaque(p, name->name, len);
1665 hdr->nops++;
1666 hdr->replen += decode_secinfo_maxsz;
1667}
1668
1619#if defined(CONFIG_NFS_V4_1) 1669#if defined(CONFIG_NFS_V4_1)
1620/* NFSv4.1 operations */ 1670/* NFSv4.1 operations */
1621static void encode_exchange_id(struct xdr_stream *xdr, 1671static void encode_exchange_id(struct xdr_stream *xdr,
@@ -1660,7 +1710,7 @@ static void encode_create_session(struct xdr_stream *xdr,
1660 1710
1661 p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); 1711 p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12);
1662 *p++ = cpu_to_be32(OP_CREATE_SESSION); 1712 *p++ = cpu_to_be32(OP_CREATE_SESSION);
1663 p = xdr_encode_hyper(p, clp->cl_ex_clid); 1713 p = xdr_encode_hyper(p, clp->cl_clientid);
1664 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ 1714 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
1665 *p++ = cpu_to_be32(args->flags); /*flags */ 1715 *p++ = cpu_to_be32(args->flags); /*flags */
1666 1716
@@ -1812,6 +1862,34 @@ encode_layoutget(struct xdr_stream *xdr,
1812 hdr->nops++; 1862 hdr->nops++;
1813 hdr->replen += decode_layoutget_maxsz; 1863 hdr->replen += decode_layoutget_maxsz;
1814} 1864}
1865
1866static int
1867encode_layoutcommit(struct xdr_stream *xdr,
1868 const struct nfs4_layoutcommit_args *args,
1869 struct compound_hdr *hdr)
1870{
1871 __be32 *p;
1872
1873 dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
1874 NFS_SERVER(args->inode)->pnfs_curr_ld->id);
1875
1876 p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE);
1877 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
1878 /* Only whole file layouts */
1879 p = xdr_encode_hyper(p, 0); /* offset */
1880 p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */
1881 *p++ = cpu_to_be32(0); /* reclaim */
1882 p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
1883 *p++ = cpu_to_be32(1); /* newoffset = TRUE */
1884 p = xdr_encode_hyper(p, args->lastbytewritten);
1885 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
1886 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
1887 *p++ = cpu_to_be32(0); /* no file layout payload */
1888
1889 hdr->nops++;
1890 hdr->replen += decode_layoutcommit_maxsz;
1891 return 0;
1892}
1815#endif /* CONFIG_NFS_V4_1 */ 1893#endif /* CONFIG_NFS_V4_1 */
1816 1894
1817/* 1895/*
@@ -2271,7 +2349,8 @@ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
2271 encode_putfh(xdr, args->fh, &hdr); 2349 encode_putfh(xdr, args->fh, &hdr);
2272 encode_write(xdr, args, &hdr); 2350 encode_write(xdr, args, &hdr);
2273 req->rq_snd_buf.flags |= XDRBUF_WRITE; 2351 req->rq_snd_buf.flags |= XDRBUF_WRITE;
2274 encode_getfattr(xdr, args->bitmask, &hdr); 2352 if (args->bitmask)
2353 encode_getfattr(xdr, args->bitmask, &hdr);
2275 encode_nops(&hdr); 2354 encode_nops(&hdr);
2276} 2355}
2277 2356
@@ -2289,7 +2368,8 @@ static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
2289 encode_sequence(xdr, &args->seq_args, &hdr); 2368 encode_sequence(xdr, &args->seq_args, &hdr);
2290 encode_putfh(xdr, args->fh, &hdr); 2369 encode_putfh(xdr, args->fh, &hdr);
2291 encode_commit(xdr, args, &hdr); 2370 encode_commit(xdr, args, &hdr);
2292 encode_getfattr(xdr, args->bitmask, &hdr); 2371 if (args->bitmask)
2372 encode_getfattr(xdr, args->bitmask, &hdr);
2293 encode_nops(&hdr); 2373 encode_nops(&hdr);
2294} 2374}
2295 2375
@@ -2460,6 +2540,24 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
2460 encode_nops(&hdr); 2540 encode_nops(&hdr);
2461} 2541}
2462 2542
2543/*
2544 * Encode SECINFO request
2545 */
2546static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req,
2547 struct xdr_stream *xdr,
2548 struct nfs4_secinfo_arg *args)
2549{
2550 struct compound_hdr hdr = {
2551 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2552 };
2553
2554 encode_compound_hdr(xdr, req, &hdr);
2555 encode_sequence(xdr, &args->seq_args, &hdr);
2556 encode_putfh(xdr, args->dir_fh, &hdr);
2557 encode_secinfo(xdr, args->name, &hdr);
2558 encode_nops(&hdr);
2559}
2560
2463#if defined(CONFIG_NFS_V4_1) 2561#if defined(CONFIG_NFS_V4_1)
2464/* 2562/*
2465 * EXCHANGE_ID request 2563 * EXCHANGE_ID request
@@ -2599,8 +2697,32 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
2599 encode_sequence(xdr, &args->seq_args, &hdr); 2697 encode_sequence(xdr, &args->seq_args, &hdr);
2600 encode_putfh(xdr, NFS_FH(args->inode), &hdr); 2698 encode_putfh(xdr, NFS_FH(args->inode), &hdr);
2601 encode_layoutget(xdr, args, &hdr); 2699 encode_layoutget(xdr, args, &hdr);
2700
2701 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
2702 args->layout.pages, 0, args->layout.pglen);
2703
2602 encode_nops(&hdr); 2704 encode_nops(&hdr);
2603} 2705}
2706
2707/*
2708 * Encode LAYOUTCOMMIT request
2709 */
2710static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
2711 struct xdr_stream *xdr,
2712 struct nfs4_layoutcommit_args *args)
2713{
2714 struct compound_hdr hdr = {
2715 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2716 };
2717
2718 encode_compound_hdr(xdr, req, &hdr);
2719 encode_sequence(xdr, &args->seq_args, &hdr);
2720 encode_putfh(xdr, NFS_FH(args->inode), &hdr);
2721 encode_layoutcommit(xdr, args, &hdr);
2722 encode_getfattr(xdr, args->bitmask, &hdr);
2723 encode_nops(&hdr);
2724 return 0;
2725}
2604#endif /* CONFIG_NFS_V4_1 */ 2726#endif /* CONFIG_NFS_V4_1 */
2605 2727
2606static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) 2728static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -2920,6 +3042,7 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap)
2920 if (unlikely(!p)) 3042 if (unlikely(!p))
2921 goto out_overflow; 3043 goto out_overflow;
2922 bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; 3044 bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
3045 return -be32_to_cpup(p);
2923 } 3046 }
2924 return 0; 3047 return 0;
2925out_overflow: 3048out_overflow:
@@ -3382,7 +3505,7 @@ out_overflow:
3382} 3505}
3383 3506
3384static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, 3507static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3385 struct nfs_client *clp, uint32_t *uid, int may_sleep) 3508 const struct nfs_server *server, uint32_t *uid, int may_sleep)
3386{ 3509{
3387 uint32_t len; 3510 uint32_t len;
3388 __be32 *p; 3511 __be32 *p;
@@ -3402,7 +3525,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3402 if (!may_sleep) { 3525 if (!may_sleep) {
3403 /* do nothing */ 3526 /* do nothing */
3404 } else if (len < XDR_MAX_NETOBJ) { 3527 } else if (len < XDR_MAX_NETOBJ) {
3405 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) 3528 if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0)
3406 ret = NFS_ATTR_FATTR_OWNER; 3529 ret = NFS_ATTR_FATTR_OWNER;
3407 else 3530 else
3408 dprintk("%s: nfs_map_name_to_uid failed!\n", 3531 dprintk("%s: nfs_map_name_to_uid failed!\n",
@@ -3420,7 +3543,7 @@ out_overflow:
3420} 3543}
3421 3544
3422static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, 3545static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3423 struct nfs_client *clp, uint32_t *gid, int may_sleep) 3546 const struct nfs_server *server, uint32_t *gid, int may_sleep)
3424{ 3547{
3425 uint32_t len; 3548 uint32_t len;
3426 __be32 *p; 3549 __be32 *p;
@@ -3440,7 +3563,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3440 if (!may_sleep) { 3563 if (!may_sleep) {
3441 /* do nothing */ 3564 /* do nothing */
3442 } else if (len < XDR_MAX_NETOBJ) { 3565 } else if (len < XDR_MAX_NETOBJ) {
3443 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) 3566 if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0)
3444 ret = NFS_ATTR_FATTR_GROUP; 3567 ret = NFS_ATTR_FATTR_GROUP;
3445 else 3568 else
3446 dprintk("%s: nfs_map_group_to_gid failed!\n", 3569 dprintk("%s: nfs_map_group_to_gid failed!\n",
@@ -3907,6 +4030,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
3907 fattr->valid |= status; 4030 fattr->valid |= status;
3908 4031
3909 status = decode_attr_error(xdr, bitmap); 4032 status = decode_attr_error(xdr, bitmap);
4033 if (status == -NFS4ERR_WRONGSEC) {
4034 nfs_fixup_secinfo_attributes(fattr, fh);
4035 status = 0;
4036 }
3910 if (status < 0) 4037 if (status < 0)
3911 goto xdr_error; 4038 goto xdr_error;
3912 4039
@@ -3939,14 +4066,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
3939 goto xdr_error; 4066 goto xdr_error;
3940 fattr->valid |= status; 4067 fattr->valid |= status;
3941 4068
3942 status = decode_attr_owner(xdr, bitmap, server->nfs_client, 4069 status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep);
3943 &fattr->uid, may_sleep);
3944 if (status < 0) 4070 if (status < 0)
3945 goto xdr_error; 4071 goto xdr_error;
3946 fattr->valid |= status; 4072 fattr->valid |= status;
3947 4073
3948 status = decode_attr_group(xdr, bitmap, server->nfs_client, 4074 status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep);
3949 &fattr->gid, may_sleep);
3950 if (status < 0) 4075 if (status < 0)
3951 goto xdr_error; 4076 goto xdr_error;
3952 fattr->valid |= status; 4077 fattr->valid |= status;
@@ -4677,6 +4802,73 @@ static int decode_delegreturn(struct xdr_stream *xdr)
4677 return decode_op_hdr(xdr, OP_DELEGRETURN); 4802 return decode_op_hdr(xdr, OP_DELEGRETURN);
4678} 4803}
4679 4804
4805static int decode_secinfo_gss(struct xdr_stream *xdr, struct nfs4_secinfo_flavor *flavor)
4806{
4807 __be32 *p;
4808
4809 p = xdr_inline_decode(xdr, 4);
4810 if (unlikely(!p))
4811 goto out_overflow;
4812 flavor->gss.sec_oid4.len = be32_to_cpup(p);
4813 if (flavor->gss.sec_oid4.len > GSS_OID_MAX_LEN)
4814 goto out_err;
4815
4816 p = xdr_inline_decode(xdr, flavor->gss.sec_oid4.len);
4817 if (unlikely(!p))
4818 goto out_overflow;
4819 memcpy(flavor->gss.sec_oid4.data, p, flavor->gss.sec_oid4.len);
4820
4821 p = xdr_inline_decode(xdr, 8);
4822 if (unlikely(!p))
4823 goto out_overflow;
4824 flavor->gss.qop4 = be32_to_cpup(p++);
4825 flavor->gss.service = be32_to_cpup(p);
4826
4827 return 0;
4828
4829out_overflow:
4830 print_overflow_msg(__func__, xdr);
4831 return -EIO;
4832out_err:
4833 return -EINVAL;
4834}
4835
4836static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
4837{
4838 struct nfs4_secinfo_flavor *sec_flavor;
4839 int status;
4840 __be32 *p;
4841 int i;
4842
4843 status = decode_op_hdr(xdr, OP_SECINFO);
4844 p = xdr_inline_decode(xdr, 4);
4845 if (unlikely(!p))
4846 goto out_overflow;
4847 res->flavors->num_flavors = be32_to_cpup(p);
4848
4849 for (i = 0; i < res->flavors->num_flavors; i++) {
4850 sec_flavor = &res->flavors->flavors[i];
4851 if ((char *)&sec_flavor[1] - (char *)res > PAGE_SIZE)
4852 break;
4853
4854 p = xdr_inline_decode(xdr, 4);
4855 if (unlikely(!p))
4856 goto out_overflow;
4857 sec_flavor->flavor = be32_to_cpup(p);
4858
4859 if (sec_flavor->flavor == RPC_AUTH_GSS) {
4860 if (decode_secinfo_gss(xdr, sec_flavor))
4861 break;
4862 }
4863 }
4864
4865 return 0;
4866
4867out_overflow:
4868 print_overflow_msg(__func__, xdr);
4869 return -EIO;
4870}
4871
4680#if defined(CONFIG_NFS_V4_1) 4872#if defined(CONFIG_NFS_V4_1)
4681static int decode_exchange_id(struct xdr_stream *xdr, 4873static int decode_exchange_id(struct xdr_stream *xdr,
4682 struct nfs41_exchange_id_res *res) 4874 struct nfs41_exchange_id_res *res)
@@ -4694,7 +4886,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
4694 p = xdr_inline_decode(xdr, 8); 4886 p = xdr_inline_decode(xdr, 8);
4695 if (unlikely(!p)) 4887 if (unlikely(!p))
4696 goto out_overflow; 4888 goto out_overflow;
4697 xdr_decode_hyper(p, &clp->cl_ex_clid); 4889 xdr_decode_hyper(p, &clp->cl_clientid);
4698 p = xdr_inline_decode(xdr, 12); 4890 p = xdr_inline_decode(xdr, 12);
4699 if (unlikely(!p)) 4891 if (unlikely(!p))
4700 goto out_overflow; 4892 goto out_overflow;
@@ -4947,6 +5139,9 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
4947 __be32 *p; 5139 __be32 *p;
4948 int status; 5140 int status;
4949 u32 layout_count; 5141 u32 layout_count;
5142 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
5143 struct kvec *iov = rcvbuf->head;
5144 u32 hdrlen, recvd;
4950 5145
4951 status = decode_op_hdr(xdr, OP_LAYOUTGET); 5146 status = decode_op_hdr(xdr, OP_LAYOUTGET);
4952 if (status) 5147 if (status)
@@ -4963,17 +5158,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
4963 return -EINVAL; 5158 return -EINVAL;
4964 } 5159 }
4965 5160
4966 p = xdr_inline_decode(xdr, 24); 5161 p = xdr_inline_decode(xdr, 28);
4967 if (unlikely(!p)) 5162 if (unlikely(!p))
4968 goto out_overflow; 5163 goto out_overflow;
4969 p = xdr_decode_hyper(p, &res->range.offset); 5164 p = xdr_decode_hyper(p, &res->range.offset);
4970 p = xdr_decode_hyper(p, &res->range.length); 5165 p = xdr_decode_hyper(p, &res->range.length);
4971 res->range.iomode = be32_to_cpup(p++); 5166 res->range.iomode = be32_to_cpup(p++);
4972 res->type = be32_to_cpup(p++); 5167 res->type = be32_to_cpup(p++);
4973 5168 res->layoutp->len = be32_to_cpup(p);
4974 status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p);
4975 if (unlikely(status))
4976 return status;
4977 5169
4978 dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", 5170 dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
4979 __func__, 5171 __func__,
@@ -4981,12 +5173,18 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
4981 (unsigned long)res->range.length, 5173 (unsigned long)res->range.length,
4982 res->range.iomode, 5174 res->range.iomode,
4983 res->type, 5175 res->type,
4984 res->layout.len); 5176 res->layoutp->len);
5177
5178 hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base;
5179 recvd = req->rq_rcv_buf.len - hdrlen;
5180 if (res->layoutp->len > recvd) {
5181 dprintk("NFS: server cheating in layoutget reply: "
5182 "layout len %u > recvd %u\n",
5183 res->layoutp->len, recvd);
5184 return -EINVAL;
5185 }
4985 5186
4986 /* nfs4_proc_layoutget allocated a single page */ 5187 xdr_read_pages(xdr, res->layoutp->len);
4987 if (res->layout.len > PAGE_SIZE)
4988 return -ENOMEM;
4989 memcpy(res->layout.buf, p, res->layout.len);
4990 5188
4991 if (layout_count > 1) { 5189 if (layout_count > 1) {
4992 /* We only handle a length one array at the moment. Any 5190 /* We only handle a length one array at the moment. Any
@@ -5003,6 +5201,35 @@ out_overflow:
5003 print_overflow_msg(__func__, xdr); 5201 print_overflow_msg(__func__, xdr);
5004 return -EIO; 5202 return -EIO;
5005} 5203}
5204
5205static int decode_layoutcommit(struct xdr_stream *xdr,
5206 struct rpc_rqst *req,
5207 struct nfs4_layoutcommit_res *res)
5208{
5209 __be32 *p;
5210 __u32 sizechanged;
5211 int status;
5212
5213 status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
5214 if (status)
5215 return status;
5216
5217 p = xdr_inline_decode(xdr, 4);
5218 if (unlikely(!p))
5219 goto out_overflow;
5220 sizechanged = be32_to_cpup(p);
5221
5222 if (sizechanged) {
5223 /* throw away new size */
5224 p = xdr_inline_decode(xdr, 8);
5225 if (unlikely(!p))
5226 goto out_overflow;
5227 }
5228 return 0;
5229out_overflow:
5230 print_overflow_msg(__func__, xdr);
5231 return -EIO;
5232}
5006#endif /* CONFIG_NFS_V4_1 */ 5233#endif /* CONFIG_NFS_V4_1 */
5007 5234
5008/* 5235/*
@@ -5690,8 +5917,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5690 status = decode_write(xdr, res); 5917 status = decode_write(xdr, res);
5691 if (status) 5918 if (status)
5692 goto out; 5919 goto out;
5693 decode_getfattr(xdr, res->fattr, res->server, 5920 if (res->fattr)
5694 !RPC_IS_ASYNC(rqstp->rq_task)); 5921 decode_getfattr(xdr, res->fattr, res->server,
5922 !RPC_IS_ASYNC(rqstp->rq_task));
5695 if (!status) 5923 if (!status)
5696 status = res->count; 5924 status = res->count;
5697out: 5925out:
@@ -5719,8 +5947,9 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5719 status = decode_commit(xdr, res); 5947 status = decode_commit(xdr, res);
5720 if (status) 5948 if (status)
5721 goto out; 5949 goto out;
5722 decode_getfattr(xdr, res->fattr, res->server, 5950 if (res->fattr)
5723 !RPC_IS_ASYNC(rqstp->rq_task)); 5951 decode_getfattr(xdr, res->fattr, res->server,
5952 !RPC_IS_ASYNC(rqstp->rq_task));
5724out: 5953out:
5725 return status; 5954 return status;
5726} 5955}
@@ -5915,6 +6144,32 @@ out:
5915 return status; 6144 return status;
5916} 6145}
5917 6146
6147/*
6148 * Decode SECINFO response
6149 */
6150static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp,
6151 struct xdr_stream *xdr,
6152 struct nfs4_secinfo_res *res)
6153{
6154 struct compound_hdr hdr;
6155 int status;
6156
6157 status = decode_compound_hdr(xdr, &hdr);
6158 if (status)
6159 goto out;
6160 status = decode_sequence(xdr, &res->seq_res, rqstp);
6161 if (status)
6162 goto out;
6163 status = decode_putfh(xdr);
6164 if (status)
6165 goto out;
6166 status = decode_secinfo(xdr, res);
6167 if (status)
6168 goto out;
6169out:
6170 return status;
6171}
6172
5918#if defined(CONFIG_NFS_V4_1) 6173#if defined(CONFIG_NFS_V4_1)
5919/* 6174/*
5920 * Decode EXCHANGE_ID response 6175 * Decode EXCHANGE_ID response
@@ -6062,6 +6317,34 @@ static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp,
6062out: 6317out:
6063 return status; 6318 return status;
6064} 6319}
6320
6321/*
6322 * Decode LAYOUTCOMMIT response
6323 */
6324static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
6325 struct xdr_stream *xdr,
6326 struct nfs4_layoutcommit_res *res)
6327{
6328 struct compound_hdr hdr;
6329 int status;
6330
6331 status = decode_compound_hdr(xdr, &hdr);
6332 if (status)
6333 goto out;
6334 status = decode_sequence(xdr, &res->seq_res, rqstp);
6335 if (status)
6336 goto out;
6337 status = decode_putfh(xdr);
6338 if (status)
6339 goto out;
6340 status = decode_layoutcommit(xdr, rqstp, res);
6341 if (status)
6342 goto out;
6343 decode_getfattr(xdr, res->fattr, res->server,
6344 !RPC_IS_ASYNC(rqstp->rq_task));
6345out:
6346 return status;
6347}
6065#endif /* CONFIG_NFS_V4_1 */ 6348#endif /* CONFIG_NFS_V4_1 */
6066 6349
6067/** 6350/**
@@ -6167,8 +6450,6 @@ static struct {
6167 { NFS4ERR_DQUOT, -EDQUOT }, 6450 { NFS4ERR_DQUOT, -EDQUOT },
6168 { NFS4ERR_STALE, -ESTALE }, 6451 { NFS4ERR_STALE, -ESTALE },
6169 { NFS4ERR_BADHANDLE, -EBADHANDLE }, 6452 { NFS4ERR_BADHANDLE, -EBADHANDLE },
6170 { NFS4ERR_BADOWNER, -EINVAL },
6171 { NFS4ERR_BADNAME, -EINVAL },
6172 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, 6453 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
6173 { NFS4ERR_NOTSUPP, -ENOTSUPP }, 6454 { NFS4ERR_NOTSUPP, -ENOTSUPP },
6174 { NFS4ERR_TOOSMALL, -ETOOSMALL }, 6455 { NFS4ERR_TOOSMALL, -ETOOSMALL },
@@ -6178,10 +6459,6 @@ static struct {
6178 { NFS4ERR_SYMLINK, -ELOOP }, 6459 { NFS4ERR_SYMLINK, -ELOOP },
6179 { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, 6460 { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
6180 { NFS4ERR_DEADLOCK, -EDEADLK }, 6461 { NFS4ERR_DEADLOCK, -EDEADLK },
6181 { NFS4ERR_WRONGSEC, -EPERM }, /* FIXME: this needs
6182 * to be handled by a
6183 * middle-layer.
6184 */
6185 { -1, -EIO } 6462 { -1, -EIO }
6186}; 6463};
6187 6464
@@ -6256,6 +6533,7 @@ struct rpc_procinfo nfs4_procedures[] = {
6256 PROC(SETACL, enc_setacl, dec_setacl), 6533 PROC(SETACL, enc_setacl, dec_setacl),
6257 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), 6534 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
6258 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), 6535 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
6536 PROC(SECINFO, enc_secinfo, dec_secinfo),
6259#if defined(CONFIG_NFS_V4_1) 6537#if defined(CONFIG_NFS_V4_1)
6260 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), 6538 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
6261 PROC(CREATE_SESSION, enc_create_session, dec_create_session), 6539 PROC(CREATE_SESSION, enc_create_session, dec_create_session),
@@ -6265,6 +6543,7 @@ struct rpc_procinfo nfs4_procedures[] = {
6265 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), 6543 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
6266 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), 6544 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
6267 PROC(LAYOUTGET, enc_layoutget, dec_layoutget), 6545 PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
6546 PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
6268#endif /* CONFIG_NFS_V4_1 */ 6547#endif /* CONFIG_NFS_V4_1 */
6269}; 6548};
6270 6549
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 903908a20023..c541093a5bf2 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -86,11 +86,14 @@
86/* Default path we try to mount. "%s" gets replaced by our IP address */ 86/* Default path we try to mount. "%s" gets replaced by our IP address */
87#define NFS_ROOT "/tftpboot/%s" 87#define NFS_ROOT "/tftpboot/%s"
88 88
89/* Default NFSROOT mount options. */
90#define NFS_DEF_OPTIONS "udp"
91
89/* Parameters passed from the kernel command line */ 92/* Parameters passed from the kernel command line */
90static char nfs_root_parms[256] __initdata = ""; 93static char nfs_root_parms[256] __initdata = "";
91 94
92/* Text-based mount options passed to super.c */ 95/* Text-based mount options passed to super.c */
93static char nfs_root_options[256] __initdata = ""; 96static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS;
94 97
95/* Address of NFS server */ 98/* Address of NFS server */
96static __be32 servaddr __initdata = htonl(INADDR_NONE); 99static __be32 servaddr __initdata = htonl(INADDR_NONE);
@@ -160,8 +163,14 @@ static int __init root_nfs_copy(char *dest, const char *src,
160} 163}
161 164
162static int __init root_nfs_cat(char *dest, const char *src, 165static int __init root_nfs_cat(char *dest, const char *src,
163 const size_t destlen) 166 const size_t destlen)
164{ 167{
168 size_t len = strlen(dest);
169
170 if (len && dest[len - 1] != ',')
171 if (strlcat(dest, ",", destlen) > destlen)
172 return -1;
173
165 if (strlcat(dest, src, destlen) > destlen) 174 if (strlcat(dest, src, destlen) > destlen)
166 return -1; 175 return -1;
167 return 0; 176 return 0;
@@ -194,16 +203,6 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,
194 if (root_nfs_cat(nfs_root_options, incoming, 203 if (root_nfs_cat(nfs_root_options, incoming,
195 sizeof(nfs_root_options))) 204 sizeof(nfs_root_options)))
196 return -1; 205 return -1;
197
198 /*
199 * Possibly prepare for more options to be appended
200 */
201 if (nfs_root_options[0] != '\0' &&
202 nfs_root_options[strlen(nfs_root_options)] != ',')
203 if (root_nfs_cat(nfs_root_options, ",",
204 sizeof(nfs_root_options)))
205 return -1;
206
207 return 0; 206 return 0;
208} 207}
209 208
@@ -217,7 +216,7 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,
217 */ 216 */
218static int __init root_nfs_data(char *cmdline) 217static int __init root_nfs_data(char *cmdline)
219{ 218{
220 char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; 219 char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
221 int len, retval = -1; 220 int len, retval = -1;
222 char *tmp = NULL; 221 char *tmp = NULL;
223 const size_t tmplen = sizeof(nfs_export_path); 222 const size_t tmplen = sizeof(nfs_export_path);
@@ -244,9 +243,9 @@ static int __init root_nfs_data(char *cmdline)
244 * Append mandatory options for nfsroot so they override 243 * Append mandatory options for nfsroot so they override
245 * what has come before 244 * what has come before
246 */ 245 */
247 snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4", 246 snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4",
248 &servaddr); 247 &servaddr);
249 if (root_nfs_cat(nfs_root_options, addr_option, 248 if (root_nfs_cat(nfs_root_options, mand_options,
250 sizeof(nfs_root_options))) 249 sizeof(nfs_root_options)))
251 goto out_optionstoolong; 250 goto out_optionstoolong;
252 251
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e1164e3f9e69..c80add6e2213 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -20,6 +20,7 @@
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21 21
22#include "internal.h" 22#include "internal.h"
23#include "pnfs.h"
23 24
24static struct kmem_cache *nfs_page_cachep; 25static struct kmem_cache *nfs_page_cachep;
25 26
@@ -134,14 +135,14 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
134 nfs_unlock_request(req); 135 nfs_unlock_request(req);
135} 136}
136 137
137/** 138/*
138 * nfs_clear_request - Free up all resources allocated to the request 139 * nfs_clear_request - Free up all resources allocated to the request
139 * @req: 140 * @req:
140 * 141 *
141 * Release page and open context resources associated with a read/write 142 * Release page and open context resources associated with a read/write
142 * request after it has completed. 143 * request after it has completed.
143 */ 144 */
144void nfs_clear_request(struct nfs_page *req) 145static void nfs_clear_request(struct nfs_page *req)
145{ 146{
146 struct page *page = req->wb_page; 147 struct page *page = req->wb_page;
147 struct nfs_open_context *ctx = req->wb_context; 148 struct nfs_open_context *ctx = req->wb_context;
@@ -213,7 +214,7 @@ nfs_wait_on_request(struct nfs_page *req)
213 */ 214 */
214void nfs_pageio_init(struct nfs_pageio_descriptor *desc, 215void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
215 struct inode *inode, 216 struct inode *inode,
216 int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), 217 int (*doio)(struct nfs_pageio_descriptor *),
217 size_t bsize, 218 size_t bsize,
218 int io_flags) 219 int io_flags)
219{ 220{
@@ -222,10 +223,12 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
222 desc->pg_count = 0; 223 desc->pg_count = 0;
223 desc->pg_bsize = bsize; 224 desc->pg_bsize = bsize;
224 desc->pg_base = 0; 225 desc->pg_base = 0;
226 desc->pg_moreio = 0;
225 desc->pg_inode = inode; 227 desc->pg_inode = inode;
226 desc->pg_doio = doio; 228 desc->pg_doio = doio;
227 desc->pg_ioflags = io_flags; 229 desc->pg_ioflags = io_flags;
228 desc->pg_error = 0; 230 desc->pg_error = 0;
231 desc->pg_lseg = NULL;
229} 232}
230 233
231/** 234/**
@@ -240,7 +243,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
240 * Return 'true' if this is the case, else return 'false'. 243 * Return 'true' if this is the case, else return 'false'.
241 */ 244 */
242static int nfs_can_coalesce_requests(struct nfs_page *prev, 245static int nfs_can_coalesce_requests(struct nfs_page *prev,
243 struct nfs_page *req) 246 struct nfs_page *req,
247 struct nfs_pageio_descriptor *pgio)
244{ 248{
245 if (req->wb_context->cred != prev->wb_context->cred) 249 if (req->wb_context->cred != prev->wb_context->cred)
246 return 0; 250 return 0;
@@ -254,6 +258,12 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
254 return 0; 258 return 0;
255 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) 259 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
256 return 0; 260 return 0;
261 /*
262 * Non-whole file layouts need to check that req is inside of
263 * pgio->pg_lseg.
264 */
265 if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
266 return 0;
257 return 1; 267 return 1;
258} 268}
259 269
@@ -286,7 +296,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
286 if (newlen > desc->pg_bsize) 296 if (newlen > desc->pg_bsize)
287 return 0; 297 return 0;
288 prev = nfs_list_entry(desc->pg_list.prev); 298 prev = nfs_list_entry(desc->pg_list.prev);
289 if (!nfs_can_coalesce_requests(prev, req)) 299 if (!nfs_can_coalesce_requests(prev, req, desc))
290 return 0; 300 return 0;
291 } else 301 } else
292 desc->pg_base = req->wb_pgbase; 302 desc->pg_base = req->wb_pgbase;
@@ -302,12 +312,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
302static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) 312static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
303{ 313{
304 if (!list_empty(&desc->pg_list)) { 314 if (!list_empty(&desc->pg_list)) {
305 int error = desc->pg_doio(desc->pg_inode, 315 int error = desc->pg_doio(desc);
306 &desc->pg_list,
307 nfs_page_array_len(desc->pg_base,
308 desc->pg_count),
309 desc->pg_count,
310 desc->pg_ioflags);
311 if (error < 0) 316 if (error < 0)
312 desc->pg_error = error; 317 desc->pg_error = error;
313 else 318 else
@@ -331,9 +336,11 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
331 struct nfs_page *req) 336 struct nfs_page *req)
332{ 337{
333 while (!nfs_pageio_do_add_request(desc, req)) { 338 while (!nfs_pageio_do_add_request(desc, req)) {
339 desc->pg_moreio = 1;
334 nfs_pageio_doio(desc); 340 nfs_pageio_doio(desc);
335 if (desc->pg_error < 0) 341 if (desc->pg_error < 0)
336 return 0; 342 return 0;
343 desc->pg_moreio = 0;
337 } 344 }
338 return 1; 345 return 1;
339} 346}
@@ -391,6 +398,7 @@ int nfs_scan_list(struct nfs_inode *nfsi,
391 pgoff_t idx_end; 398 pgoff_t idx_end;
392 int found, i; 399 int found, i;
393 int res; 400 int res;
401 struct list_head *list;
394 402
395 res = 0; 403 res = 0;
396 if (npages == 0) 404 if (npages == 0)
@@ -411,10 +419,10 @@ int nfs_scan_list(struct nfs_inode *nfsi,
411 idx_start = req->wb_index + 1; 419 idx_start = req->wb_index + 1;
412 if (nfs_set_page_tag_locked(req)) { 420 if (nfs_set_page_tag_locked(req)) {
413 kref_get(&req->wb_kref); 421 kref_get(&req->wb_kref);
414 nfs_list_remove_request(req);
415 radix_tree_tag_clear(&nfsi->nfs_page_tree, 422 radix_tree_tag_clear(&nfsi->nfs_page_tree,
416 req->wb_index, tag); 423 req->wb_index, tag);
417 nfs_list_add_request(req, dst); 424 list = pnfs_choose_commit_list(req, dst);
425 nfs_list_add_request(req, list);
418 res++; 426 res++;
419 if (res == INT_MAX) 427 if (res == INT_MAX)
420 goto out; 428 goto out;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 1b1bc1a0fb0a..d9ab97269ce6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -30,6 +30,7 @@
30#include <linux/nfs_fs.h> 30#include <linux/nfs_fs.h>
31#include "internal.h" 31#include "internal.h"
32#include "pnfs.h" 32#include "pnfs.h"
33#include "iostat.h"
33 34
34#define NFSDBG_FACILITY NFSDBG_PNFS 35#define NFSDBG_FACILITY NFSDBG_PNFS
35 36
@@ -74,10 +75,8 @@ find_pnfs_driver(u32 id)
74void 75void
75unset_pnfs_layoutdriver(struct nfs_server *nfss) 76unset_pnfs_layoutdriver(struct nfs_server *nfss)
76{ 77{
77 if (nfss->pnfs_curr_ld) { 78 if (nfss->pnfs_curr_ld)
78 nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
79 module_put(nfss->pnfs_curr_ld->owner); 79 module_put(nfss->pnfs_curr_ld->owner);
80 }
81 nfss->pnfs_curr_ld = NULL; 80 nfss->pnfs_curr_ld = NULL;
82} 81}
83 82
@@ -115,13 +114,7 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
115 goto out_no_driver; 114 goto out_no_driver;
116 } 115 }
117 server->pnfs_curr_ld = ld_type; 116 server->pnfs_curr_ld = ld_type;
118 if (ld_type->set_layoutdriver(server)) { 117
119 printk(KERN_ERR
120 "%s: Error initializing mount point for layout driver %u.\n",
121 __func__, id);
122 module_put(ld_type->owner);
123 goto out_no_driver;
124 }
125 dprintk("%s: pNFS module for %u set\n", __func__, id); 118 dprintk("%s: pNFS module for %u set\n", __func__, id);
126 return; 119 return;
127 120
@@ -230,38 +223,43 @@ static void free_lseg(struct pnfs_layout_segment *lseg)
230 put_layout_hdr(NFS_I(ino)->layout); 223 put_layout_hdr(NFS_I(ino)->layout);
231} 224}
232 225
233/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg 226static void
234 * could sleep, so must be called outside of the lock. 227put_lseg_common(struct pnfs_layout_segment *lseg)
235 * Returns 1 if object was removed, otherwise return 0.
236 */
237static int
238put_lseg_locked(struct pnfs_layout_segment *lseg,
239 struct list_head *tmp_list)
240{ 228{
229 struct inode *inode = lseg->pls_layout->plh_inode;
230
231 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
232 list_del_init(&lseg->pls_list);
233 if (list_empty(&lseg->pls_layout->plh_segs)) {
234 set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
235 /* Matched by initial refcount set in alloc_init_layout_hdr */
236 put_layout_hdr_locked(lseg->pls_layout);
237 }
238 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
239}
240
241void
242put_lseg(struct pnfs_layout_segment *lseg)
243{
244 struct inode *inode;
245
246 if (!lseg)
247 return;
248
241 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 249 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
242 atomic_read(&lseg->pls_refcount), 250 atomic_read(&lseg->pls_refcount),
243 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 251 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
244 if (atomic_dec_and_test(&lseg->pls_refcount)) { 252 inode = lseg->pls_layout->plh_inode;
245 struct inode *ino = lseg->pls_layout->plh_inode; 253 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
254 LIST_HEAD(free_me);
246 255
247 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 256 put_lseg_common(lseg);
248 list_del(&lseg->pls_list); 257 list_add(&lseg->pls_list, &free_me);
249 if (list_empty(&lseg->pls_layout->plh_segs)) { 258 spin_unlock(&inode->i_lock);
250 struct nfs_client *clp; 259 pnfs_free_lseg_list(&free_me);
251
252 clp = NFS_SERVER(ino)->nfs_client;
253 spin_lock(&clp->cl_lock);
254 /* List does not take a reference, so no need for put here */
255 list_del_init(&lseg->pls_layout->plh_layouts);
256 spin_unlock(&clp->cl_lock);
257 clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
258 }
259 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
260 list_add(&lseg->pls_list, tmp_list);
261 return 1;
262 } 260 }
263 return 0;
264} 261}
262EXPORT_SYMBOL_GPL(put_lseg);
265 263
266static bool 264static bool
267should_free_lseg(u32 lseg_iomode, u32 recall_iomode) 265should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
@@ -281,7 +279,13 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
281 * list. It will now be removed when all 279 * list. It will now be removed when all
282 * outstanding io is finished. 280 * outstanding io is finished.
283 */ 281 */
284 rv = put_lseg_locked(lseg, tmp_list); 282 dprintk("%s: lseg %p ref %d\n", __func__, lseg,
283 atomic_read(&lseg->pls_refcount));
284 if (atomic_dec_and_test(&lseg->pls_refcount)) {
285 put_lseg_common(lseg);
286 list_add(&lseg->pls_list, tmp_list);
287 rv = 1;
288 }
285 } 289 }
286 return rv; 290 return rv;
287} 291}
@@ -299,6 +303,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
299 303
300 dprintk("%s:Begin lo %p\n", __func__, lo); 304 dprintk("%s:Begin lo %p\n", __func__, lo);
301 305
306 if (list_empty(&lo->plh_segs)) {
307 if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
308 put_layout_hdr_locked(lo);
309 return 0;
310 }
302 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 311 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
303 if (should_free_lseg(lseg->pls_range.iomode, iomode)) { 312 if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
304 dprintk("%s: freeing lseg %p iomode %d " 313 dprintk("%s: freeing lseg %p iomode %d "
@@ -312,11 +321,27 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
312 return invalid - removed; 321 return invalid - removed;
313} 322}
314 323
324/* note free_me must contain lsegs from a single layout_hdr */
315void 325void
316pnfs_free_lseg_list(struct list_head *free_me) 326pnfs_free_lseg_list(struct list_head *free_me)
317{ 327{
318 struct pnfs_layout_segment *lseg, *tmp; 328 struct pnfs_layout_segment *lseg, *tmp;
329 struct pnfs_layout_hdr *lo;
319 330
331 if (list_empty(free_me))
332 return;
333
334 lo = list_first_entry(free_me, struct pnfs_layout_segment,
335 pls_list)->pls_layout;
336
337 if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
338 struct nfs_client *clp;
339
340 clp = NFS_SERVER(lo->plh_inode)->nfs_client;
341 spin_lock(&clp->cl_lock);
342 list_del_init(&lo->plh_layouts);
343 spin_unlock(&clp->cl_lock);
344 }
320 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { 345 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
321 list_del(&lseg->pls_list); 346 list_del(&lseg->pls_list);
322 free_lseg(lseg); 347 free_lseg(lseg);
@@ -332,10 +357,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
332 spin_lock(&nfsi->vfs_inode.i_lock); 357 spin_lock(&nfsi->vfs_inode.i_lock);
333 lo = nfsi->layout; 358 lo = nfsi->layout;
334 if (lo) { 359 if (lo) {
335 set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags); 360 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
336 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); 361 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
337 /* Matched by refcount set to 1 in alloc_init_layout_hdr */
338 put_layout_hdr_locked(lo);
339 } 362 }
340 spin_unlock(&nfsi->vfs_inode.i_lock); 363 spin_unlock(&nfsi->vfs_inode.i_lock);
341 pnfs_free_lseg_list(&tmp_list); 364 pnfs_free_lseg_list(&tmp_list);
@@ -403,6 +426,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
403 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) 426 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
404 return true; 427 return true;
405 return lo->plh_block_lgets || 428 return lo->plh_block_lgets ||
429 test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
406 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 430 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
407 (list_empty(&lo->plh_segs) && 431 (list_empty(&lo->plh_segs) &&
408 (atomic_read(&lo->plh_outstanding) > lget)); 432 (atomic_read(&lo->plh_outstanding) > lget));
@@ -448,6 +472,9 @@ send_layoutget(struct pnfs_layout_hdr *lo,
448 struct nfs_server *server = NFS_SERVER(ino); 472 struct nfs_server *server = NFS_SERVER(ino);
449 struct nfs4_layoutget *lgp; 473 struct nfs4_layoutget *lgp;
450 struct pnfs_layout_segment *lseg = NULL; 474 struct pnfs_layout_segment *lseg = NULL;
475 struct page **pages = NULL;
476 int i;
477 u32 max_resp_sz, max_pages;
451 478
452 dprintk("--> %s\n", __func__); 479 dprintk("--> %s\n", __func__);
453 480
@@ -455,6 +482,21 @@ send_layoutget(struct pnfs_layout_hdr *lo,
455 lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); 482 lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
456 if (lgp == NULL) 483 if (lgp == NULL)
457 return NULL; 484 return NULL;
485
486 /* allocate pages for xdr post processing */
487 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
488 max_pages = max_resp_sz >> PAGE_SHIFT;
489
490 pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
491 if (!pages)
492 goto out_err_free;
493
494 for (i = 0; i < max_pages; i++) {
495 pages[i] = alloc_page(GFP_KERNEL);
496 if (!pages[i])
497 goto out_err_free;
498 }
499
458 lgp->args.minlength = NFS4_MAX_UINT64; 500 lgp->args.minlength = NFS4_MAX_UINT64;
459 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; 501 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
460 lgp->args.range.iomode = iomode; 502 lgp->args.range.iomode = iomode;
@@ -463,6 +505,8 @@ send_layoutget(struct pnfs_layout_hdr *lo,
463 lgp->args.type = server->pnfs_curr_ld->id; 505 lgp->args.type = server->pnfs_curr_ld->id;
464 lgp->args.inode = ino; 506 lgp->args.inode = ino;
465 lgp->args.ctx = get_nfs_open_context(ctx); 507 lgp->args.ctx = get_nfs_open_context(ctx);
508 lgp->args.layout.pages = pages;
509 lgp->args.layout.pglen = max_pages * PAGE_SIZE;
466 lgp->lsegpp = &lseg; 510 lgp->lsegpp = &lseg;
467 511
468 /* Synchronously retrieve layout information from server and 512 /* Synchronously retrieve layout information from server and
@@ -473,7 +517,26 @@ send_layoutget(struct pnfs_layout_hdr *lo,
473 /* remember that LAYOUTGET failed and suspend trying */ 517 /* remember that LAYOUTGET failed and suspend trying */
474 set_bit(lo_fail_bit(iomode), &lo->plh_flags); 518 set_bit(lo_fail_bit(iomode), &lo->plh_flags);
475 } 519 }
520
521 /* free xdr pages */
522 for (i = 0; i < max_pages; i++)
523 __free_page(pages[i]);
524 kfree(pages);
525
476 return lseg; 526 return lseg;
527
528out_err_free:
529 /* free any allocated xdr pages, lgp as it's not used */
530 if (pages) {
531 for (i = 0; i < max_pages; i++) {
532 if (!pages[i])
533 break;
534 __free_page(pages[i]);
535 }
536 kfree(pages);
537 }
538 kfree(lgp);
539 return NULL;
477} 540}
478 541
479bool pnfs_roc(struct inode *ino) 542bool pnfs_roc(struct inode *ino)
@@ -674,7 +737,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
674 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 737 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
675 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 738 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
676 is_matching_lseg(lseg, iomode)) { 739 is_matching_lseg(lseg, iomode)) {
677 ret = lseg; 740 ret = get_lseg(lseg);
678 break; 741 break;
679 } 742 }
680 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) 743 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
@@ -699,6 +762,7 @@ pnfs_update_layout(struct inode *ino,
699 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; 762 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
700 struct pnfs_layout_hdr *lo; 763 struct pnfs_layout_hdr *lo;
701 struct pnfs_layout_segment *lseg = NULL; 764 struct pnfs_layout_segment *lseg = NULL;
765 bool first = false;
702 766
703 if (!pnfs_enabled_sb(NFS_SERVER(ino))) 767 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
704 return NULL; 768 return NULL;
@@ -715,21 +779,25 @@ pnfs_update_layout(struct inode *ino,
715 dprintk("%s matches recall, use MDS\n", __func__); 779 dprintk("%s matches recall, use MDS\n", __func__);
716 goto out_unlock; 780 goto out_unlock;
717 } 781 }
718 /* Check to see if the layout for the given range already exists */
719 lseg = pnfs_find_lseg(lo, iomode);
720 if (lseg)
721 goto out_unlock;
722 782
723 /* if LAYOUTGET already failed once we don't try again */ 783 /* if LAYOUTGET already failed once we don't try again */
724 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) 784 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
725 goto out_unlock; 785 goto out_unlock;
726 786
787 /* Check to see if the layout for the given range already exists */
788 lseg = pnfs_find_lseg(lo, iomode);
789 if (lseg)
790 goto out_unlock;
791
727 if (pnfs_layoutgets_blocked(lo, NULL, 0)) 792 if (pnfs_layoutgets_blocked(lo, NULL, 0))
728 goto out_unlock; 793 goto out_unlock;
729 atomic_inc(&lo->plh_outstanding); 794 atomic_inc(&lo->plh_outstanding);
730 795
731 get_layout_hdr(lo); 796 get_layout_hdr(lo);
732 if (list_empty(&lo->plh_segs)) { 797 if (list_empty(&lo->plh_segs))
798 first = true;
799 spin_unlock(&ino->i_lock);
800 if (first) {
733 /* The lo must be on the clp list if there is any 801 /* The lo must be on the clp list if there is any
734 * chance of a CB_LAYOUTRECALL(FILE) coming in. 802 * chance of a CB_LAYOUTRECALL(FILE) coming in.
735 */ 803 */
@@ -738,24 +806,18 @@ pnfs_update_layout(struct inode *ino,
738 list_add_tail(&lo->plh_layouts, &clp->cl_layouts); 806 list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
739 spin_unlock(&clp->cl_lock); 807 spin_unlock(&clp->cl_lock);
740 } 808 }
741 spin_unlock(&ino->i_lock);
742 809
743 lseg = send_layoutget(lo, ctx, iomode); 810 lseg = send_layoutget(lo, ctx, iomode);
744 if (!lseg) { 811 if (!lseg && first) {
745 spin_lock(&ino->i_lock); 812 spin_lock(&clp->cl_lock);
746 if (list_empty(&lo->plh_segs)) { 813 list_del_init(&lo->plh_layouts);
747 spin_lock(&clp->cl_lock); 814 spin_unlock(&clp->cl_lock);
748 list_del_init(&lo->plh_layouts);
749 spin_unlock(&clp->cl_lock);
750 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
751 }
752 spin_unlock(&ino->i_lock);
753 } 815 }
754 atomic_dec(&lo->plh_outstanding); 816 atomic_dec(&lo->plh_outstanding);
755 put_layout_hdr(lo); 817 put_layout_hdr(lo);
756out: 818out:
757 dprintk("%s end, state 0x%lx lseg %p\n", __func__, 819 dprintk("%s end, state 0x%lx lseg %p\n", __func__,
758 nfsi->layout->plh_flags, lseg); 820 nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
759 return lseg; 821 return lseg;
760out_unlock: 822out_unlock:
761 spin_unlock(&ino->i_lock); 823 spin_unlock(&ino->i_lock);
@@ -808,7 +870,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
808 } 870 }
809 init_lseg(lo, lseg); 871 init_lseg(lo, lseg);
810 lseg->pls_range = res->range; 872 lseg->pls_range = res->range;
811 *lgp->lsegpp = lseg; 873 *lgp->lsegpp = get_lseg(lseg);
812 pnfs_insert_layout(lo, lseg); 874 pnfs_insert_layout(lo, lseg);
813 875
814 if (res->return_on_close) { 876 if (res->return_on_close) {
@@ -829,137 +891,199 @@ out_forget_reply:
829 goto out; 891 goto out;
830} 892}
831 893
894static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
895 struct nfs_page *prev,
896 struct nfs_page *req)
897{
898 if (pgio->pg_count == prev->wb_bytes) {
899 /* This is first coelesce call for a series of nfs_pages */
900 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
901 prev->wb_context,
902 IOMODE_READ);
903 }
904 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
905}
906
907void
908pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
909{
910 struct pnfs_layoutdriver_type *ld;
911
912 ld = NFS_SERVER(inode)->pnfs_curr_ld;
913 pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
914}
915
916static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
917 struct nfs_page *prev,
918 struct nfs_page *req)
919{
920 if (pgio->pg_count == prev->wb_bytes) {
921 /* This is first coelesce call for a series of nfs_pages */
922 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
923 prev->wb_context,
924 IOMODE_RW);
925 }
926 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
927}
928
929void
930pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
931{
932 struct pnfs_layoutdriver_type *ld;
933
934 ld = NFS_SERVER(inode)->pnfs_curr_ld;
935 pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
936}
937
938enum pnfs_try_status
939pnfs_try_to_write_data(struct nfs_write_data *wdata,
940 const struct rpc_call_ops *call_ops, int how)
941{
942 struct inode *inode = wdata->inode;
943 enum pnfs_try_status trypnfs;
944 struct nfs_server *nfss = NFS_SERVER(inode);
945
946 wdata->mds_ops = call_ops;
947
948 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
949 inode->i_ino, wdata->args.count, wdata->args.offset, how);
950
951 trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
952 if (trypnfs == PNFS_NOT_ATTEMPTED) {
953 put_lseg(wdata->lseg);
954 wdata->lseg = NULL;
955 } else
956 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
957
958 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
959 return trypnfs;
960}
961
832/* 962/*
833 * Device ID cache. Currently supports one layout type per struct nfs_client. 963 * Call the appropriate parallel I/O subsystem read function.
834 * Add layout type to the lookup key to expand to support multiple types.
835 */ 964 */
836int 965enum pnfs_try_status
837pnfs_alloc_init_deviceid_cache(struct nfs_client *clp, 966pnfs_try_to_read_data(struct nfs_read_data *rdata,
838 void (*free_callback)(struct pnfs_deviceid_node *)) 967 const struct rpc_call_ops *call_ops)
839{ 968{
840 struct pnfs_deviceid_cache *c; 969 struct inode *inode = rdata->inode;
970 struct nfs_server *nfss = NFS_SERVER(inode);
971 enum pnfs_try_status trypnfs;
841 972
842 c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL); 973 rdata->mds_ops = call_ops;
843 if (!c) 974
844 return -ENOMEM; 975 dprintk("%s: Reading ino:%lu %u@%llu\n",
845 spin_lock(&clp->cl_lock); 976 __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
846 if (clp->cl_devid_cache != NULL) { 977
847 atomic_inc(&clp->cl_devid_cache->dc_ref); 978 trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
848 dprintk("%s [kref [%d]]\n", __func__, 979 if (trypnfs == PNFS_NOT_ATTEMPTED) {
849 atomic_read(&clp->cl_devid_cache->dc_ref)); 980 put_lseg(rdata->lseg);
850 kfree(c); 981 rdata->lseg = NULL;
851 } else { 982 } else {
852 /* kzalloc initializes hlists */ 983 nfs_inc_stats(inode, NFSIOS_PNFS_READ);
853 spin_lock_init(&c->dc_lock);
854 atomic_set(&c->dc_ref, 1);
855 c->dc_free_callback = free_callback;
856 clp->cl_devid_cache = c;
857 dprintk("%s [new]\n", __func__);
858 } 984 }
859 spin_unlock(&clp->cl_lock); 985 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
860 return 0; 986 return trypnfs;
861} 987}
862EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
863 988
864/* 989/*
865 * Called from pnfs_layoutdriver_type->free_lseg 990 * Currently there is only one (whole file) write lseg.
866 * last layout segment reference frees deviceid
867 */ 991 */
868void 992static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode)
869pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
870 struct pnfs_deviceid_node *devid)
871{ 993{
872 struct nfs4_deviceid *id = &devid->de_id; 994 struct pnfs_layout_segment *lseg, *rv = NULL;
873 struct pnfs_deviceid_node *d;
874 struct hlist_node *n;
875 long h = nfs4_deviceid_hash(id);
876 995
877 dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); 996 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list)
878 if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock)) 997 if (lseg->pls_range.iomode == IOMODE_RW)
879 return; 998 rv = lseg;
999 return rv;
1000}
880 1001
881 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) 1002void
882 if (!memcmp(&d->de_id, id, sizeof(*id))) { 1003pnfs_set_layoutcommit(struct nfs_write_data *wdata)
883 hlist_del_rcu(&d->de_node); 1004{
884 spin_unlock(&c->dc_lock); 1005 struct nfs_inode *nfsi = NFS_I(wdata->inode);
885 synchronize_rcu(); 1006 loff_t end_pos = wdata->args.offset + wdata->res.count;
886 c->dc_free_callback(devid); 1007
887 return; 1008 spin_lock(&nfsi->vfs_inode.i_lock);
888 } 1009 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
889 spin_unlock(&c->dc_lock); 1010 /* references matched in nfs4_layoutcommit_release */
890 /* Why wasn't it found in the list? */ 1011 get_lseg(wdata->lseg);
891 BUG(); 1012 wdata->lseg->pls_lc_cred =
892} 1013 get_rpccred(wdata->args.context->state->owner->so_cred);
893EXPORT_SYMBOL_GPL(pnfs_put_deviceid); 1014 mark_inode_dirty_sync(wdata->inode);
894 1015 dprintk("%s: Set layoutcommit for inode %lu ",
895/* Find and reference a deviceid */ 1016 __func__, wdata->inode->i_ino);
896struct pnfs_deviceid_node *
897pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
898{
899 struct pnfs_deviceid_node *d;
900 struct hlist_node *n;
901 long hash = nfs4_deviceid_hash(id);
902
903 dprintk("--> %s hash %ld\n", __func__, hash);
904 rcu_read_lock();
905 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
906 if (!memcmp(&d->de_id, id, sizeof(*id))) {
907 if (!atomic_inc_not_zero(&d->de_ref)) {
908 goto fail;
909 } else {
910 rcu_read_unlock();
911 return d;
912 }
913 }
914 } 1017 }
915fail: 1018 if (end_pos > wdata->lseg->pls_end_pos)
916 rcu_read_unlock(); 1019 wdata->lseg->pls_end_pos = end_pos;
917 return NULL; 1020 spin_unlock(&nfsi->vfs_inode.i_lock);
918} 1021}
919EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid); 1022EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
920 1023
921/* 1024/*
922 * Add a deviceid to the cache. 1025 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
923 * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new 1026 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
1027 * data to disk to allow the server to recover the data if it crashes.
1028 * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
1029 * is off, and a COMMIT is sent to a data server, or
1030 * if WRITEs to a data server return NFS_DATA_SYNC.
924 */ 1031 */
925struct pnfs_deviceid_node * 1032int
926pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new) 1033pnfs_layoutcommit_inode(struct inode *inode, bool sync)
927{
928 struct pnfs_deviceid_node *d;
929 long hash = nfs4_deviceid_hash(&new->de_id);
930
931 dprintk("--> %s hash %ld\n", __func__, hash);
932 spin_lock(&c->dc_lock);
933 d = pnfs_find_get_deviceid(c, &new->de_id);
934 if (d) {
935 spin_unlock(&c->dc_lock);
936 dprintk("%s [discard]\n", __func__);
937 c->dc_free_callback(new);
938 return d;
939 }
940 INIT_HLIST_NODE(&new->de_node);
941 atomic_set(&new->de_ref, 1);
942 hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
943 spin_unlock(&c->dc_lock);
944 dprintk("%s [new]\n", __func__);
945 return new;
946}
947EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
948
949void
950pnfs_put_deviceid_cache(struct nfs_client *clp)
951{ 1034{
952 struct pnfs_deviceid_cache *local = clp->cl_devid_cache; 1035 struct nfs4_layoutcommit_data *data;
1036 struct nfs_inode *nfsi = NFS_I(inode);
1037 struct pnfs_layout_segment *lseg;
1038 struct rpc_cred *cred;
1039 loff_t end_pos;
1040 int status = 0;
953 1041
954 dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref)); 1042 dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
955 if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) { 1043
956 int i; 1044 if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
957 /* Verify cache is empty */ 1045 return 0;
958 for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) 1046
959 BUG_ON(!hlist_empty(&local->dc_deviceids[i])); 1047 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
960 clp->cl_devid_cache = NULL; 1048 data = kzalloc(sizeof(*data), GFP_NOFS);
961 spin_unlock(&clp->cl_lock); 1049 if (!data) {
962 kfree(local); 1050 mark_inode_dirty_sync(inode);
1051 status = -ENOMEM;
1052 goto out;
963 } 1053 }
1054
1055 spin_lock(&inode->i_lock);
1056 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1057 spin_unlock(&inode->i_lock);
1058 kfree(data);
1059 goto out;
1060 }
1061 /*
1062 * Currently only one (whole file) write lseg which is referenced
1063 * in pnfs_set_layoutcommit and will be found.
1064 */
1065 lseg = pnfs_list_write_lseg(inode);
1066
1067 end_pos = lseg->pls_end_pos;
1068 cred = lseg->pls_lc_cred;
1069 lseg->pls_end_pos = 0;
1070 lseg->pls_lc_cred = NULL;
1071
1072 memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data,
1073 sizeof(nfsi->layout->plh_stateid.data));
1074 spin_unlock(&inode->i_lock);
1075
1076 data->args.inode = inode;
1077 data->lseg = lseg;
1078 data->cred = cred;
1079 nfs_fattr_init(&data->fattr);
1080 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
1081 data->res.fattr = &data->fattr;
1082 data->args.lastbytewritten = end_pos - 1;
1083 data->res.server = NFS_SERVER(inode);
1084
1085 status = nfs4_proc_layoutcommit(data, sync);
1086out:
1087 dprintk("<-- %s status %d\n", __func__, status);
1088 return status;
964} 1089}
965EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e2612ea0cbed..bc4827202e7a 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,8 @@
30#ifndef FS_NFS_PNFS_H 30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H 31#define FS_NFS_PNFS_H
32 32
33#include <linux/nfs_page.h>
34
33enum { 35enum {
34 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ 36 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
35 NFS_LSEG_ROC, /* roc bit received from server */ 37 NFS_LSEG_ROC, /* roc bit received from server */
@@ -41,6 +43,13 @@ struct pnfs_layout_segment {
41 atomic_t pls_refcount; 43 atomic_t pls_refcount;
42 unsigned long pls_flags; 44 unsigned long pls_flags;
43 struct pnfs_layout_hdr *pls_layout; 45 struct pnfs_layout_hdr *pls_layout;
46 struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */
47 loff_t pls_end_pos; /* LAYOUTCOMMIT write end */
48};
49
50enum pnfs_try_status {
51 PNFS_ATTEMPTED = 0,
52 PNFS_NOT_ATTEMPTED = 1,
44}; 53};
45 54
46#ifdef CONFIG_NFS_V4_1 55#ifdef CONFIG_NFS_V4_1
@@ -61,10 +70,25 @@ struct pnfs_layoutdriver_type {
61 const u32 id; 70 const u32 id;
62 const char *name; 71 const char *name;
63 struct module *owner; 72 struct module *owner;
64 int (*set_layoutdriver) (struct nfs_server *);
65 int (*clear_layoutdriver) (struct nfs_server *);
66 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); 73 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
67 void (*free_lseg) (struct pnfs_layout_segment *lseg); 74 void (*free_lseg) (struct pnfs_layout_segment *lseg);
75
76 /* test for nfs page cache coalescing */
77 int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
78
79 /* Returns true if layoutdriver wants to divert this request to
80 * driver's commit routine.
81 */
82 bool (*mark_pnfs_commit)(struct pnfs_layout_segment *lseg);
83 struct list_head * (*choose_commit_list) (struct nfs_page *req);
84 int (*commit_pagelist)(struct inode *inode, struct list_head *mds_pages, int how);
85
86 /*
87 * Return PNFS_ATTEMPTED to indicate the layout code has attempted
88 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
89 */
90 enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
91 enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
68}; 92};
69 93
70struct pnfs_layout_hdr { 94struct pnfs_layout_hdr {
@@ -85,57 +109,10 @@ struct pnfs_device {
85 unsigned int layout_type; 109 unsigned int layout_type;
86 unsigned int mincount; 110 unsigned int mincount;
87 struct page **pages; 111 struct page **pages;
88 void *area;
89 unsigned int pgbase; 112 unsigned int pgbase;
90 unsigned int pglen; 113 unsigned int pglen;
91}; 114};
92 115
93/*
94 * Device ID RCU cache. A device ID is unique per client ID and layout type.
95 */
96#define NFS4_DEVICE_ID_HASH_BITS 5
97#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
98#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
99
100static inline u32
101nfs4_deviceid_hash(struct nfs4_deviceid *id)
102{
103 unsigned char *cptr = (unsigned char *)id->data;
104 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
105 u32 x = 0;
106
107 while (nbytes--) {
108 x *= 37;
109 x += *cptr++;
110 }
111 return x & NFS4_DEVICE_ID_HASH_MASK;
112}
113
114struct pnfs_deviceid_node {
115 struct hlist_node de_node;
116 struct nfs4_deviceid de_id;
117 atomic_t de_ref;
118};
119
120struct pnfs_deviceid_cache {
121 spinlock_t dc_lock;
122 atomic_t dc_ref;
123 void (*dc_free_callback)(struct pnfs_deviceid_node *);
124 struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
125};
126
127extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
128 void (*free_callback)(struct pnfs_deviceid_node *));
129extern void pnfs_put_deviceid_cache(struct nfs_client *);
130extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
131 struct pnfs_deviceid_cache *,
132 struct nfs4_deviceid *);
133extern struct pnfs_deviceid_node *pnfs_add_deviceid(
134 struct pnfs_deviceid_cache *,
135 struct pnfs_deviceid_node *);
136extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
137 struct pnfs_deviceid_node *devid);
138
139extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); 116extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
140extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); 117extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
141 118
@@ -146,11 +123,18 @@ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
146 123
147/* pnfs.c */ 124/* pnfs.c */
148void get_layout_hdr(struct pnfs_layout_hdr *lo); 125void get_layout_hdr(struct pnfs_layout_hdr *lo);
126void put_lseg(struct pnfs_layout_segment *lseg);
149struct pnfs_layout_segment * 127struct pnfs_layout_segment *
150pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 128pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
151 enum pnfs_iomode access_type); 129 enum pnfs_iomode access_type);
152void set_pnfs_layoutdriver(struct nfs_server *, u32 id); 130void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
153void unset_pnfs_layoutdriver(struct nfs_server *); 131void unset_pnfs_layoutdriver(struct nfs_server *);
132enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
133 const struct rpc_call_ops *, int);
134enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
135 const struct rpc_call_ops *);
136void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
137void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
154int pnfs_layout_process(struct nfs4_layoutget *lgp); 138int pnfs_layout_process(struct nfs4_layoutget *lgp);
155void pnfs_free_lseg_list(struct list_head *tmp_list); 139void pnfs_free_lseg_list(struct list_head *tmp_list);
156void pnfs_destroy_layout(struct nfs_inode *); 140void pnfs_destroy_layout(struct nfs_inode *);
@@ -169,7 +153,8 @@ bool pnfs_roc(struct inode *ino);
169void pnfs_roc_release(struct inode *ino); 153void pnfs_roc_release(struct inode *ino);
170void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 154void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
171bool pnfs_roc_drain(struct inode *ino, u32 *barrier); 155bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
172 156void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
157int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
173 158
174static inline int lo_fail_bit(u32 iomode) 159static inline int lo_fail_bit(u32 iomode)
175{ 160{
@@ -177,12 +162,67 @@ static inline int lo_fail_bit(u32 iomode)
177 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; 162 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
178} 163}
179 164
165static inline struct pnfs_layout_segment *
166get_lseg(struct pnfs_layout_segment *lseg)
167{
168 if (lseg) {
169 atomic_inc(&lseg->pls_refcount);
170 smp_mb__after_atomic_inc();
171 }
172 return lseg;
173}
174
180/* Return true if a layout driver is being used for this mountpoint */ 175/* Return true if a layout driver is being used for this mountpoint */
181static inline int pnfs_enabled_sb(struct nfs_server *nfss) 176static inline int pnfs_enabled_sb(struct nfs_server *nfss)
182{ 177{
183 return nfss->pnfs_curr_ld != NULL; 178 return nfss->pnfs_curr_ld != NULL;
184} 179}
185 180
181static inline void
182pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
183{
184 if (lseg) {
185 struct pnfs_layoutdriver_type *ld;
186
187 ld = NFS_SERVER(req->wb_page->mapping->host)->pnfs_curr_ld;
188 if (ld->mark_pnfs_commit && ld->mark_pnfs_commit(lseg)) {
189 set_bit(PG_PNFS_COMMIT, &req->wb_flags);
190 req->wb_commit_lseg = get_lseg(lseg);
191 }
192 }
193}
194
195static inline int
196pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
197{
198 if (!test_and_clear_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags))
199 return PNFS_NOT_ATTEMPTED;
200 return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how);
201}
202
203static inline struct list_head *
204pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds)
205{
206 struct list_head *rv;
207
208 if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) {
209 struct inode *inode = req->wb_commit_lseg->pls_layout->plh_inode;
210
211 set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags);
212 rv = NFS_SERVER(inode)->pnfs_curr_ld->choose_commit_list(req);
213 /* matched by ref taken when PG_PNFS_COMMIT is set */
214 put_lseg(req->wb_commit_lseg);
215 } else
216 rv = mds;
217 return rv;
218}
219
220static inline void pnfs_clear_request_commit(struct nfs_page *req)
221{
222 if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags))
223 put_lseg(req->wb_commit_lseg);
224}
225
186#else /* CONFIG_NFS_V4_1 */ 226#else /* CONFIG_NFS_V4_1 */
187 227
188static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) 228static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -194,12 +234,36 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
194} 234}
195 235
196static inline struct pnfs_layout_segment * 236static inline struct pnfs_layout_segment *
237get_lseg(struct pnfs_layout_segment *lseg)
238{
239 return NULL;
240}
241
242static inline void put_lseg(struct pnfs_layout_segment *lseg)
243{
244}
245
246static inline struct pnfs_layout_segment *
197pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 247pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
198 enum pnfs_iomode access_type) 248 enum pnfs_iomode access_type)
199{ 249{
200 return NULL; 250 return NULL;
201} 251}
202 252
253static inline enum pnfs_try_status
254pnfs_try_to_read_data(struct nfs_read_data *data,
255 const struct rpc_call_ops *call_ops)
256{
257 return PNFS_NOT_ATTEMPTED;
258}
259
260static inline enum pnfs_try_status
261pnfs_try_to_write_data(struct nfs_write_data *data,
262 const struct rpc_call_ops *call_ops, int how)
263{
264 return PNFS_NOT_ATTEMPTED;
265}
266
203static inline bool 267static inline bool
204pnfs_roc(struct inode *ino) 268pnfs_roc(struct inode *ino)
205{ 269{
@@ -230,6 +294,43 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
230{ 294{
231} 295}
232 296
297static inline void
298pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
299{
300 pgio->pg_test = NULL;
301}
302
303static inline void
304pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
305{
306 pgio->pg_test = NULL;
307}
308
309static inline void
310pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
311{
312}
313
314static inline int
315pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
316{
317 return PNFS_NOT_ATTEMPTED;
318}
319
320static inline struct list_head *
321pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds)
322{
323 return mds;
324}
325
326static inline void pnfs_clear_request_commit(struct nfs_page *req)
327{
328}
329
330static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
331{
332 return 0;
333}
233#endif /* CONFIG_NFS_V4_1 */ 334#endif /* CONFIG_NFS_V4_1 */
234 335
235#endif /* FS_NFS_PNFS_H */ 336#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 77d5e21c4ad6..ac40b8535d7e 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -177,7 +177,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
177} 177}
178 178
179static int 179static int
180nfs_proc_lookup(struct inode *dir, struct qstr *name, 180nfs_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name,
181 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 181 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
182{ 182{
183 struct nfs_diropargs arg = { 183 struct nfs_diropargs arg = {
@@ -741,4 +741,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
741 .lock = nfs_proc_lock, 741 .lock = nfs_proc_lock,
742 .lock_check_bounds = nfs_lock_check_bounds, 742 .lock_check_bounds = nfs_lock_check_bounds,
743 .close_context = nfs_close_context, 743 .close_context = nfs_close_context,
744 .init_client = nfs_init_client,
744}; 745};
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index aedcaa7f291f..7cded2b12a05 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -18,19 +18,20 @@
18#include <linux/sunrpc/clnt.h> 18#include <linux/sunrpc/clnt.h>
19#include <linux/nfs_fs.h> 19#include <linux/nfs_fs.h>
20#include <linux/nfs_page.h> 20#include <linux/nfs_page.h>
21#include <linux/module.h>
21 22
22#include <asm/system.h> 23#include <asm/system.h>
24#include "pnfs.h"
23 25
24#include "nfs4_fs.h" 26#include "nfs4_fs.h"
25#include "internal.h" 27#include "internal.h"
26#include "iostat.h" 28#include "iostat.h"
27#include "fscache.h" 29#include "fscache.h"
28#include "pnfs.h"
29 30
30#define NFSDBG_FACILITY NFSDBG_PAGECACHE 31#define NFSDBG_FACILITY NFSDBG_PAGECACHE
31 32
32static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int); 33static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc);
33static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int); 34static int nfs_pagein_one(struct nfs_pageio_descriptor *desc);
34static const struct rpc_call_ops nfs_read_partial_ops; 35static const struct rpc_call_ops nfs_read_partial_ops;
35static const struct rpc_call_ops nfs_read_full_ops; 36static const struct rpc_call_ops nfs_read_full_ops;
36 37
@@ -69,6 +70,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
69 70
70static void nfs_readdata_release(struct nfs_read_data *rdata) 71static void nfs_readdata_release(struct nfs_read_data *rdata)
71{ 72{
73 put_lseg(rdata->lseg);
72 put_nfs_open_context(rdata->args.context); 74 put_nfs_open_context(rdata->args.context);
73 nfs_readdata_free(rdata); 75 nfs_readdata_free(rdata);
74} 76}
@@ -114,14 +116,13 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
114int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, 116int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
115 struct page *page) 117 struct page *page)
116{ 118{
117 LIST_HEAD(one_request);
118 struct nfs_page *new; 119 struct nfs_page *new;
119 unsigned int len; 120 unsigned int len;
121 struct nfs_pageio_descriptor pgio;
120 122
121 len = nfs_page_length(page); 123 len = nfs_page_length(page);
122 if (len == 0) 124 if (len == 0)
123 return nfs_return_empty_page(page); 125 return nfs_return_empty_page(page);
124 pnfs_update_layout(inode, ctx, IOMODE_READ);
125 new = nfs_create_request(ctx, inode, page, 0, len); 126 new = nfs_create_request(ctx, inode, page, 0, len);
126 if (IS_ERR(new)) { 127 if (IS_ERR(new)) {
127 unlock_page(page); 128 unlock_page(page);
@@ -130,11 +131,14 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
130 if (len < PAGE_CACHE_SIZE) 131 if (len < PAGE_CACHE_SIZE)
131 zero_user_segment(page, len, PAGE_CACHE_SIZE); 132 zero_user_segment(page, len, PAGE_CACHE_SIZE);
132 133
133 nfs_list_add_request(new, &one_request); 134 nfs_pageio_init(&pgio, inode, NULL, 0, 0);
135 nfs_list_add_request(new, &pgio.pg_list);
136 pgio.pg_count = len;
137
134 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) 138 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
135 nfs_pagein_multi(inode, &one_request, 1, len, 0); 139 nfs_pagein_multi(&pgio);
136 else 140 else
137 nfs_pagein_one(inode, &one_request, 1, len, 0); 141 nfs_pagein_one(&pgio);
138 return 0; 142 return 0;
139} 143}
140 144
@@ -155,24 +159,20 @@ static void nfs_readpage_release(struct nfs_page *req)
155 nfs_release_request(req); 159 nfs_release_request(req);
156} 160}
157 161
158/* 162int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
159 * Set up the NFS read request struct 163 const struct rpc_call_ops *call_ops)
160 */
161static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
162 const struct rpc_call_ops *call_ops,
163 unsigned int count, unsigned int offset)
164{ 164{
165 struct inode *inode = req->wb_context->path.dentry->d_inode; 165 struct inode *inode = data->inode;
166 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; 166 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
167 struct rpc_task *task; 167 struct rpc_task *task;
168 struct rpc_message msg = { 168 struct rpc_message msg = {
169 .rpc_argp = &data->args, 169 .rpc_argp = &data->args,
170 .rpc_resp = &data->res, 170 .rpc_resp = &data->res,
171 .rpc_cred = req->wb_context->cred, 171 .rpc_cred = data->cred,
172 }; 172 };
173 struct rpc_task_setup task_setup_data = { 173 struct rpc_task_setup task_setup_data = {
174 .task = &data->task, 174 .task = &data->task,
175 .rpc_client = NFS_CLIENT(inode), 175 .rpc_client = clnt,
176 .rpc_message = &msg, 176 .rpc_message = &msg,
177 .callback_ops = call_ops, 177 .callback_ops = call_ops,
178 .callback_data = data, 178 .callback_data = data,
@@ -180,9 +180,39 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
180 .flags = RPC_TASK_ASYNC | swap_flags, 180 .flags = RPC_TASK_ASYNC | swap_flags,
181 }; 181 };
182 182
183 /* Set up the initial task struct. */
184 NFS_PROTO(inode)->read_setup(data, &msg);
185
186 dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ "
187 "offset %llu)\n",
188 data->task.tk_pid,
189 inode->i_sb->s_id,
190 (long long)NFS_FILEID(inode),
191 data->args.count,
192 (unsigned long long)data->args.offset);
193
194 task = rpc_run_task(&task_setup_data);
195 if (IS_ERR(task))
196 return PTR_ERR(task);
197 rpc_put_task(task);
198 return 0;
199}
200EXPORT_SYMBOL_GPL(nfs_initiate_read);
201
202/*
203 * Set up the NFS read request struct
204 */
205static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
206 const struct rpc_call_ops *call_ops,
207 unsigned int count, unsigned int offset,
208 struct pnfs_layout_segment *lseg)
209{
210 struct inode *inode = req->wb_context->path.dentry->d_inode;
211
183 data->req = req; 212 data->req = req;
184 data->inode = inode; 213 data->inode = inode;
185 data->cred = msg.rpc_cred; 214 data->cred = req->wb_context->cred;
215 data->lseg = get_lseg(lseg);
186 216
187 data->args.fh = NFS_FH(inode); 217 data->args.fh = NFS_FH(inode);
188 data->args.offset = req_offset(req) + offset; 218 data->args.offset = req_offset(req) + offset;
@@ -197,21 +227,11 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
197 data->res.eof = 0; 227 data->res.eof = 0;
198 nfs_fattr_init(&data->fattr); 228 nfs_fattr_init(&data->fattr);
199 229
200 /* Set up the initial task struct. */ 230 if (data->lseg &&
201 NFS_PROTO(inode)->read_setup(data, &msg); 231 (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED))
202 232 return 0;
203 dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
204 data->task.tk_pid,
205 inode->i_sb->s_id,
206 (long long)NFS_FILEID(inode),
207 count,
208 (unsigned long long)data->args.offset);
209 233
210 task = rpc_run_task(&task_setup_data); 234 return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
211 if (IS_ERR(task))
212 return PTR_ERR(task);
213 rpc_put_task(task);
214 return 0;
215} 235}
216 236
217static void 237static void
@@ -240,20 +260,21 @@ nfs_async_read_error(struct list_head *head)
240 * won't see the new data until our attribute cache is updated. This is more 260 * won't see the new data until our attribute cache is updated. This is more
241 * or less conventional NFS client behavior. 261 * or less conventional NFS client behavior.
242 */ 262 */
243static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) 263static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
244{ 264{
245 struct nfs_page *req = nfs_list_entry(head->next); 265 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
246 struct page *page = req->wb_page; 266 struct page *page = req->wb_page;
247 struct nfs_read_data *data; 267 struct nfs_read_data *data;
248 size_t rsize = NFS_SERVER(inode)->rsize, nbytes; 268 size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes;
249 unsigned int offset; 269 unsigned int offset;
250 int requests = 0; 270 int requests = 0;
251 int ret = 0; 271 int ret = 0;
272 struct pnfs_layout_segment *lseg;
252 LIST_HEAD(list); 273 LIST_HEAD(list);
253 274
254 nfs_list_remove_request(req); 275 nfs_list_remove_request(req);
255 276
256 nbytes = count; 277 nbytes = desc->pg_count;
257 do { 278 do {
258 size_t len = min(nbytes,rsize); 279 size_t len = min(nbytes,rsize);
259 280
@@ -266,9 +287,11 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
266 } while(nbytes != 0); 287 } while(nbytes != 0);
267 atomic_set(&req->wb_complete, requests); 288 atomic_set(&req->wb_complete, requests);
268 289
290 BUG_ON(desc->pg_lseg != NULL);
291 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
269 ClearPageError(page); 292 ClearPageError(page);
270 offset = 0; 293 offset = 0;
271 nbytes = count; 294 nbytes = desc->pg_count;
272 do { 295 do {
273 int ret2; 296 int ret2;
274 297
@@ -280,12 +303,14 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
280 if (nbytes < rsize) 303 if (nbytes < rsize)
281 rsize = nbytes; 304 rsize = nbytes;
282 ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, 305 ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
283 rsize, offset); 306 rsize, offset, lseg);
284 if (ret == 0) 307 if (ret == 0)
285 ret = ret2; 308 ret = ret2;
286 offset += rsize; 309 offset += rsize;
287 nbytes -= rsize; 310 nbytes -= rsize;
288 } while (nbytes != 0); 311 } while (nbytes != 0);
312 put_lseg(lseg);
313 desc->pg_lseg = NULL;
289 314
290 return ret; 315 return ret;
291 316
@@ -300,16 +325,21 @@ out_bad:
300 return -ENOMEM; 325 return -ENOMEM;
301} 326}
302 327
303static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) 328static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
304{ 329{
305 struct nfs_page *req; 330 struct nfs_page *req;
306 struct page **pages; 331 struct page **pages;
307 struct nfs_read_data *data; 332 struct nfs_read_data *data;
333 struct list_head *head = &desc->pg_list;
334 struct pnfs_layout_segment *lseg = desc->pg_lseg;
308 int ret = -ENOMEM; 335 int ret = -ENOMEM;
309 336
310 data = nfs_readdata_alloc(npages); 337 data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base,
311 if (!data) 338 desc->pg_count));
312 goto out_bad; 339 if (!data) {
340 nfs_async_read_error(head);
341 goto out;
342 }
313 343
314 pages = data->pagevec; 344 pages = data->pagevec;
315 while (!list_empty(head)) { 345 while (!list_empty(head)) {
@@ -320,10 +350,14 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
320 *pages++ = req->wb_page; 350 *pages++ = req->wb_page;
321 } 351 }
322 req = nfs_list_entry(data->pages.next); 352 req = nfs_list_entry(data->pages.next);
353 if ((!lseg) && list_is_singular(&data->pages))
354 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
323 355
324 return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); 356 ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
325out_bad: 357 0, lseg);
326 nfs_async_read_error(head); 358out:
359 put_lseg(lseg);
360 desc->pg_lseg = NULL;
327 return ret; 361 return ret;
328} 362}
329 363
@@ -366,6 +400,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
366 return; 400 return;
367 401
368 /* Yes, so retry the read at the end of the data */ 402 /* Yes, so retry the read at the end of the data */
403 data->mds_offset += resp->count;
369 argp->offset += resp->count; 404 argp->offset += resp->count;
370 argp->pgbase += resp->count; 405 argp->pgbase += resp->count;
371 argp->count -= resp->count; 406 argp->count -= resp->count;
@@ -625,7 +660,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
625 if (ret == 0) 660 if (ret == 0)
626 goto read_complete; /* all pages were read */ 661 goto read_complete; /* all pages were read */
627 662
628 pnfs_update_layout(inode, desc.ctx, IOMODE_READ); 663 pnfs_pageio_init_read(&pgio, inode);
629 if (rsize < PAGE_CACHE_SIZE) 664 if (rsize < PAGE_CACHE_SIZE)
630 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); 665 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
631 else 666 else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b68c8607770f..2b8e9a5e366a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -263,8 +263,11 @@ static match_table_t nfs_local_lock_tokens = {
263static void nfs_umount_begin(struct super_block *); 263static void nfs_umount_begin(struct super_block *);
264static int nfs_statfs(struct dentry *, struct kstatfs *); 264static int nfs_statfs(struct dentry *, struct kstatfs *);
265static int nfs_show_options(struct seq_file *, struct vfsmount *); 265static int nfs_show_options(struct seq_file *, struct vfsmount *);
266static int nfs_show_devname(struct seq_file *, struct vfsmount *);
267static int nfs_show_path(struct seq_file *, struct vfsmount *);
266static int nfs_show_stats(struct seq_file *, struct vfsmount *); 268static int nfs_show_stats(struct seq_file *, struct vfsmount *);
267static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); 269static struct dentry *nfs_fs_mount(struct file_system_type *,
270 int, const char *, void *);
268static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type, 271static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
269 int flags, const char *dev_name, void *raw_data); 272 int flags, const char *dev_name, void *raw_data);
270static void nfs_put_super(struct super_block *); 273static void nfs_put_super(struct super_block *);
@@ -274,7 +277,7 @@ static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
274static struct file_system_type nfs_fs_type = { 277static struct file_system_type nfs_fs_type = {
275 .owner = THIS_MODULE, 278 .owner = THIS_MODULE,
276 .name = "nfs", 279 .name = "nfs",
277 .get_sb = nfs_get_sb, 280 .mount = nfs_fs_mount,
278 .kill_sb = nfs_kill_super, 281 .kill_sb = nfs_kill_super,
279 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 282 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
280}; 283};
@@ -296,6 +299,8 @@ static const struct super_operations nfs_sops = {
296 .evict_inode = nfs_evict_inode, 299 .evict_inode = nfs_evict_inode,
297 .umount_begin = nfs_umount_begin, 300 .umount_begin = nfs_umount_begin,
298 .show_options = nfs_show_options, 301 .show_options = nfs_show_options,
302 .show_devname = nfs_show_devname,
303 .show_path = nfs_show_path,
299 .show_stats = nfs_show_stats, 304 .show_stats = nfs_show_stats,
300 .remount_fs = nfs_remount, 305 .remount_fs = nfs_remount,
301}; 306};
@@ -303,16 +308,16 @@ static const struct super_operations nfs_sops = {
303#ifdef CONFIG_NFS_V4 308#ifdef CONFIG_NFS_V4
304static int nfs4_validate_text_mount_data(void *options, 309static int nfs4_validate_text_mount_data(void *options,
305 struct nfs_parsed_mount_data *args, const char *dev_name); 310 struct nfs_parsed_mount_data *args, const char *dev_name);
306static int nfs4_try_mount(int flags, const char *dev_name, 311static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
307 struct nfs_parsed_mount_data *data, struct vfsmount *mnt); 312 struct nfs_parsed_mount_data *data);
308static int nfs4_get_sb(struct file_system_type *fs_type, 313static struct dentry *nfs4_mount(struct file_system_type *fs_type,
309 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 314 int flags, const char *dev_name, void *raw_data);
310static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type, 315static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type,
311 int flags, const char *dev_name, void *raw_data); 316 int flags, const char *dev_name, void *raw_data);
312static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type, 317static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type,
313 int flags, const char *dev_name, void *raw_data); 318 int flags, const char *dev_name, void *raw_data);
314static int nfs4_referral_get_sb(struct file_system_type *fs_type, 319static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
315 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 320 int flags, const char *dev_name, void *raw_data);
316static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type, 321static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type,
317 int flags, const char *dev_name, void *raw_data); 322 int flags, const char *dev_name, void *raw_data);
318static void nfs4_kill_super(struct super_block *sb); 323static void nfs4_kill_super(struct super_block *sb);
@@ -320,7 +325,7 @@ static void nfs4_kill_super(struct super_block *sb);
320static struct file_system_type nfs4_fs_type = { 325static struct file_system_type nfs4_fs_type = {
321 .owner = THIS_MODULE, 326 .owner = THIS_MODULE,
322 .name = "nfs4", 327 .name = "nfs4",
323 .get_sb = nfs4_get_sb, 328 .mount = nfs4_mount,
324 .kill_sb = nfs4_kill_super, 329 .kill_sb = nfs4_kill_super,
325 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 330 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
326}; 331};
@@ -352,7 +357,7 @@ static struct file_system_type nfs4_remote_referral_fs_type = {
352struct file_system_type nfs4_referral_fs_type = { 357struct file_system_type nfs4_referral_fs_type = {
353 .owner = THIS_MODULE, 358 .owner = THIS_MODULE,
354 .name = "nfs4", 359 .name = "nfs4",
355 .get_sb = nfs4_referral_get_sb, 360 .mount = nfs4_referral_mount,
356 .kill_sb = nfs4_kill_super, 361 .kill_sb = nfs4_kill_super,
357 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 362 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
358}; 363};
@@ -366,6 +371,8 @@ static const struct super_operations nfs4_sops = {
366 .evict_inode = nfs4_evict_inode, 371 .evict_inode = nfs4_evict_inode,
367 .umount_begin = nfs_umount_begin, 372 .umount_begin = nfs_umount_begin,
368 .show_options = nfs_show_options, 373 .show_options = nfs_show_options,
374 .show_devname = nfs_show_devname,
375 .show_path = nfs_show_path,
369 .show_stats = nfs_show_stats, 376 .show_stats = nfs_show_stats,
370 .remount_fs = nfs_remount, 377 .remount_fs = nfs_remount,
371}; 378};
@@ -726,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
726 return 0; 733 return 0;
727} 734}
728 735
736static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
737{
738 char *page = (char *) __get_free_page(GFP_KERNEL);
739 char *devname, *dummy;
740 int err = 0;
741 if (!page)
742 return -ENOMEM;
743 devname = nfs_path(&dummy, mnt->mnt_root, page, PAGE_SIZE);
744 if (IS_ERR(devname))
745 err = PTR_ERR(devname);
746 else
747 seq_escape(m, devname, " \t\n\\");
748 free_page((unsigned long)page);
749 return err;
750}
751
752static int nfs_show_path(struct seq_file *m, struct vfsmount *mnt)
753{
754 seq_puts(m, "/");
755 return 0;
756}
757
729/* 758/*
730 * Present statistical information for this VFS mountpoint 759 * Present statistical information for this VFS mountpoint
731 */ 760 */
@@ -979,6 +1008,27 @@ static int nfs_parse_security_flavors(char *value,
979 return 1; 1008 return 1;
980} 1009}
981 1010
1011static int nfs_get_option_str(substring_t args[], char **option)
1012{
1013 kfree(*option);
1014 *option = match_strdup(args);
1015 return !option;
1016}
1017
1018static int nfs_get_option_ul(substring_t args[], unsigned long *option)
1019{
1020 int rc;
1021 char *string;
1022
1023 string = match_strdup(args);
1024 if (string == NULL)
1025 return -ENOMEM;
1026 rc = strict_strtoul(string, 10, option);
1027 kfree(string);
1028
1029 return rc;
1030}
1031
982/* 1032/*
983 * Error-check and convert a string of mount options from user space into 1033 * Error-check and convert a string of mount options from user space into
984 * a data structure. The whole mount string is processed; bad options are 1034 * a data structure. The whole mount string is processed; bad options are
@@ -1127,155 +1177,82 @@ static int nfs_parse_mount_options(char *raw,
1127 * options that take numeric values 1177 * options that take numeric values
1128 */ 1178 */
1129 case Opt_port: 1179 case Opt_port:
1130 string = match_strdup(args); 1180 if (nfs_get_option_ul(args, &option) ||
1131 if (string == NULL) 1181 option > USHRT_MAX)
1132 goto out_nomem;
1133 rc = strict_strtoul(string, 10, &option);
1134 kfree(string);
1135 if (rc != 0 || option > USHRT_MAX)
1136 goto out_invalid_value; 1182 goto out_invalid_value;
1137 mnt->nfs_server.port = option; 1183 mnt->nfs_server.port = option;
1138 break; 1184 break;
1139 case Opt_rsize: 1185 case Opt_rsize:
1140 string = match_strdup(args); 1186 if (nfs_get_option_ul(args, &option))
1141 if (string == NULL)
1142 goto out_nomem;
1143 rc = strict_strtoul(string, 10, &option);
1144 kfree(string);
1145 if (rc != 0)
1146 goto out_invalid_value; 1187 goto out_invalid_value;
1147 mnt->rsize = option; 1188 mnt->rsize = option;
1148 break; 1189 break;
1149 case Opt_wsize: 1190 case Opt_wsize:
1150 string = match_strdup(args); 1191 if (nfs_get_option_ul(args, &option))
1151 if (string == NULL)
1152 goto out_nomem;
1153 rc = strict_strtoul(string, 10, &option);
1154 kfree(string);
1155 if (rc != 0)
1156 goto out_invalid_value; 1192 goto out_invalid_value;
1157 mnt->wsize = option; 1193 mnt->wsize = option;
1158 break; 1194 break;
1159 case Opt_bsize: 1195 case Opt_bsize:
1160 string = match_strdup(args); 1196 if (nfs_get_option_ul(args, &option))
1161 if (string == NULL)
1162 goto out_nomem;
1163 rc = strict_strtoul(string, 10, &option);
1164 kfree(string);
1165 if (rc != 0)
1166 goto out_invalid_value; 1197 goto out_invalid_value;
1167 mnt->bsize = option; 1198 mnt->bsize = option;
1168 break; 1199 break;
1169 case Opt_timeo: 1200 case Opt_timeo:
1170 string = match_strdup(args); 1201 if (nfs_get_option_ul(args, &option) || option == 0)
1171 if (string == NULL)
1172 goto out_nomem;
1173 rc = strict_strtoul(string, 10, &option);
1174 kfree(string);
1175 if (rc != 0 || option == 0)
1176 goto out_invalid_value; 1202 goto out_invalid_value;
1177 mnt->timeo = option; 1203 mnt->timeo = option;
1178 break; 1204 break;
1179 case Opt_retrans: 1205 case Opt_retrans:
1180 string = match_strdup(args); 1206 if (nfs_get_option_ul(args, &option) || option == 0)
1181 if (string == NULL)
1182 goto out_nomem;
1183 rc = strict_strtoul(string, 10, &option);
1184 kfree(string);
1185 if (rc != 0 || option == 0)
1186 goto out_invalid_value; 1207 goto out_invalid_value;
1187 mnt->retrans = option; 1208 mnt->retrans = option;
1188 break; 1209 break;
1189 case Opt_acregmin: 1210 case Opt_acregmin:
1190 string = match_strdup(args); 1211 if (nfs_get_option_ul(args, &option))
1191 if (string == NULL)
1192 goto out_nomem;
1193 rc = strict_strtoul(string, 10, &option);
1194 kfree(string);
1195 if (rc != 0)
1196 goto out_invalid_value; 1212 goto out_invalid_value;
1197 mnt->acregmin = option; 1213 mnt->acregmin = option;
1198 break; 1214 break;
1199 case Opt_acregmax: 1215 case Opt_acregmax:
1200 string = match_strdup(args); 1216 if (nfs_get_option_ul(args, &option))
1201 if (string == NULL)
1202 goto out_nomem;
1203 rc = strict_strtoul(string, 10, &option);
1204 kfree(string);
1205 if (rc != 0)
1206 goto out_invalid_value; 1217 goto out_invalid_value;
1207 mnt->acregmax = option; 1218 mnt->acregmax = option;
1208 break; 1219 break;
1209 case Opt_acdirmin: 1220 case Opt_acdirmin:
1210 string = match_strdup(args); 1221 if (nfs_get_option_ul(args, &option))
1211 if (string == NULL)
1212 goto out_nomem;
1213 rc = strict_strtoul(string, 10, &option);
1214 kfree(string);
1215 if (rc != 0)
1216 goto out_invalid_value; 1222 goto out_invalid_value;
1217 mnt->acdirmin = option; 1223 mnt->acdirmin = option;
1218 break; 1224 break;
1219 case Opt_acdirmax: 1225 case Opt_acdirmax:
1220 string = match_strdup(args); 1226 if (nfs_get_option_ul(args, &option))
1221 if (string == NULL)
1222 goto out_nomem;
1223 rc = strict_strtoul(string, 10, &option);
1224 kfree(string);
1225 if (rc != 0)
1226 goto out_invalid_value; 1227 goto out_invalid_value;
1227 mnt->acdirmax = option; 1228 mnt->acdirmax = option;
1228 break; 1229 break;
1229 case Opt_actimeo: 1230 case Opt_actimeo:
1230 string = match_strdup(args); 1231 if (nfs_get_option_ul(args, &option))
1231 if (string == NULL)
1232 goto out_nomem;
1233 rc = strict_strtoul(string, 10, &option);
1234 kfree(string);
1235 if (rc != 0)
1236 goto out_invalid_value; 1232 goto out_invalid_value;
1237 mnt->acregmin = mnt->acregmax = 1233 mnt->acregmin = mnt->acregmax =
1238 mnt->acdirmin = mnt->acdirmax = option; 1234 mnt->acdirmin = mnt->acdirmax = option;
1239 break; 1235 break;
1240 case Opt_namelen: 1236 case Opt_namelen:
1241 string = match_strdup(args); 1237 if (nfs_get_option_ul(args, &option))
1242 if (string == NULL)
1243 goto out_nomem;
1244 rc = strict_strtoul(string, 10, &option);
1245 kfree(string);
1246 if (rc != 0)
1247 goto out_invalid_value; 1238 goto out_invalid_value;
1248 mnt->namlen = option; 1239 mnt->namlen = option;
1249 break; 1240 break;
1250 case Opt_mountport: 1241 case Opt_mountport:
1251 string = match_strdup(args); 1242 if (nfs_get_option_ul(args, &option) ||
1252 if (string == NULL) 1243 option > USHRT_MAX)
1253 goto out_nomem;
1254 rc = strict_strtoul(string, 10, &option);
1255 kfree(string);
1256 if (rc != 0 || option > USHRT_MAX)
1257 goto out_invalid_value; 1244 goto out_invalid_value;
1258 mnt->mount_server.port = option; 1245 mnt->mount_server.port = option;
1259 break; 1246 break;
1260 case Opt_mountvers: 1247 case Opt_mountvers:
1261 string = match_strdup(args); 1248 if (nfs_get_option_ul(args, &option) ||
1262 if (string == NULL)
1263 goto out_nomem;
1264 rc = strict_strtoul(string, 10, &option);
1265 kfree(string);
1266 if (rc != 0 ||
1267 option < NFS_MNT_VERSION || 1249 option < NFS_MNT_VERSION ||
1268 option > NFS_MNT3_VERSION) 1250 option > NFS_MNT3_VERSION)
1269 goto out_invalid_value; 1251 goto out_invalid_value;
1270 mnt->mount_server.version = option; 1252 mnt->mount_server.version = option;
1271 break; 1253 break;
1272 case Opt_nfsvers: 1254 case Opt_nfsvers:
1273 string = match_strdup(args); 1255 if (nfs_get_option_ul(args, &option))
1274 if (string == NULL)
1275 goto out_nomem;
1276 rc = strict_strtoul(string, 10, &option);
1277 kfree(string);
1278 if (rc != 0)
1279 goto out_invalid_value; 1256 goto out_invalid_value;
1280 switch (option) { 1257 switch (option) {
1281 case NFS2_VERSION: 1258 case NFS2_VERSION:
@@ -1295,12 +1272,7 @@ static int nfs_parse_mount_options(char *raw,
1295 } 1272 }
1296 break; 1273 break;
1297 case Opt_minorversion: 1274 case Opt_minorversion:
1298 string = match_strdup(args); 1275 if (nfs_get_option_ul(args, &option))
1299 if (string == NULL)
1300 goto out_nomem;
1301 rc = strict_strtoul(string, 10, &option);
1302 kfree(string);
1303 if (rc != 0)
1304 goto out_invalid_value; 1276 goto out_invalid_value;
1305 if (option > NFS4_MAX_MINOR_VERSION) 1277 if (option > NFS4_MAX_MINOR_VERSION)
1306 goto out_invalid_value; 1278 goto out_invalid_value;
@@ -1336,21 +1308,18 @@ static int nfs_parse_mount_options(char *raw,
1336 case Opt_xprt_udp: 1308 case Opt_xprt_udp:
1337 mnt->flags &= ~NFS_MOUNT_TCP; 1309 mnt->flags &= ~NFS_MOUNT_TCP;
1338 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1310 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
1339 kfree(string);
1340 break; 1311 break;
1341 case Opt_xprt_tcp6: 1312 case Opt_xprt_tcp6:
1342 protofamily = AF_INET6; 1313 protofamily = AF_INET6;
1343 case Opt_xprt_tcp: 1314 case Opt_xprt_tcp:
1344 mnt->flags |= NFS_MOUNT_TCP; 1315 mnt->flags |= NFS_MOUNT_TCP;
1345 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1316 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1346 kfree(string);
1347 break; 1317 break;
1348 case Opt_xprt_rdma: 1318 case Opt_xprt_rdma:
1349 /* vector side protocols to TCP */ 1319 /* vector side protocols to TCP */
1350 mnt->flags |= NFS_MOUNT_TCP; 1320 mnt->flags |= NFS_MOUNT_TCP;
1351 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1321 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1352 xprt_load_transport(string); 1322 xprt_load_transport(string);
1353 kfree(string);
1354 break; 1323 break;
1355 default: 1324 default:
1356 dfprintk(MOUNT, "NFS: unrecognized " 1325 dfprintk(MOUNT, "NFS: unrecognized "
@@ -1358,6 +1327,7 @@ static int nfs_parse_mount_options(char *raw,
1358 kfree(string); 1327 kfree(string);
1359 return 0; 1328 return 0;
1360 } 1329 }
1330 kfree(string);
1361 break; 1331 break;
1362 case Opt_mountproto: 1332 case Opt_mountproto:
1363 string = match_strdup(args); 1333 string = match_strdup(args);
@@ -1400,18 +1370,13 @@ static int nfs_parse_mount_options(char *raw,
1400 goto out_invalid_address; 1370 goto out_invalid_address;
1401 break; 1371 break;
1402 case Opt_clientaddr: 1372 case Opt_clientaddr:
1403 string = match_strdup(args); 1373 if (nfs_get_option_str(args, &mnt->client_address))
1404 if (string == NULL)
1405 goto out_nomem; 1374 goto out_nomem;
1406 kfree(mnt->client_address);
1407 mnt->client_address = string;
1408 break; 1375 break;
1409 case Opt_mounthost: 1376 case Opt_mounthost:
1410 string = match_strdup(args); 1377 if (nfs_get_option_str(args,
1411 if (string == NULL) 1378 &mnt->mount_server.hostname))
1412 goto out_nomem; 1379 goto out_nomem;
1413 kfree(mnt->mount_server.hostname);
1414 mnt->mount_server.hostname = string;
1415 break; 1380 break;
1416 case Opt_mountaddr: 1381 case Opt_mountaddr:
1417 string = match_strdup(args); 1382 string = match_strdup(args);
@@ -1451,11 +1416,8 @@ static int nfs_parse_mount_options(char *raw,
1451 }; 1416 };
1452 break; 1417 break;
1453 case Opt_fscache_uniq: 1418 case Opt_fscache_uniq:
1454 string = match_strdup(args); 1419 if (nfs_get_option_str(args, &mnt->fscache_uniq))
1455 if (string == NULL)
1456 goto out_nomem; 1420 goto out_nomem;
1457 kfree(mnt->fscache_uniq);
1458 mnt->fscache_uniq = string;
1459 mnt->options |= NFS_OPTION_FSCACHE; 1421 mnt->options |= NFS_OPTION_FSCACHE;
1460 break; 1422 break;
1461 case Opt_local_lock: 1423 case Opt_local_lock:
@@ -1665,99 +1627,59 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1665 return nfs_walk_authlist(args, &request); 1627 return nfs_walk_authlist(args, &request);
1666} 1628}
1667 1629
1668static int nfs_parse_simple_hostname(const char *dev_name, 1630/*
1669 char **hostname, size_t maxnamlen, 1631 * Split "dev_name" into "hostname:export_path".
1670 char **export_path, size_t maxpathlen) 1632 *
1633 * The leftmost colon demarks the split between the server's hostname
1634 * and the export path. If the hostname starts with a left square
1635 * bracket, then it may contain colons.
1636 *
1637 * Note: caller frees hostname and export path, even on error.
1638 */
1639static int nfs_parse_devname(const char *dev_name,
1640 char **hostname, size_t maxnamlen,
1641 char **export_path, size_t maxpathlen)
1671{ 1642{
1672 size_t len; 1643 size_t len;
1673 char *colon, *comma; 1644 char *end;
1674
1675 colon = strchr(dev_name, ':');
1676 if (colon == NULL)
1677 goto out_bad_devname;
1678
1679 len = colon - dev_name;
1680 if (len > maxnamlen)
1681 goto out_hostname;
1682 1645
1683 /* N.B. caller will free nfs_server.hostname in all cases */ 1646 /* Is the host name protected with square brakcets? */
1684 *hostname = kstrndup(dev_name, len, GFP_KERNEL); 1647 if (*dev_name == '[') {
1685 if (!*hostname) 1648 end = strchr(++dev_name, ']');
1686 goto out_nomem; 1649 if (end == NULL || end[1] != ':')
1687
1688 /* kill possible hostname list: not supported */
1689 comma = strchr(*hostname, ',');
1690 if (comma != NULL) {
1691 if (comma == *hostname)
1692 goto out_bad_devname; 1650 goto out_bad_devname;
1693 *comma = '\0';
1694 }
1695 1651
1696 colon++; 1652 len = end - dev_name;
1697 len = strlen(colon); 1653 end++;
1698 if (len > maxpathlen) 1654 } else {
1699 goto out_path; 1655 char *comma;
1700 *export_path = kstrndup(colon, len, GFP_KERNEL);
1701 if (!*export_path)
1702 goto out_nomem;
1703
1704 dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
1705 return 0;
1706
1707out_bad_devname:
1708 dfprintk(MOUNT, "NFS: device name not in host:path format\n");
1709 return -EINVAL;
1710
1711out_nomem:
1712 dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
1713 return -ENOMEM;
1714
1715out_hostname:
1716 dfprintk(MOUNT, "NFS: server hostname too long\n");
1717 return -ENAMETOOLONG;
1718
1719out_path:
1720 dfprintk(MOUNT, "NFS: export pathname too long\n");
1721 return -ENAMETOOLONG;
1722}
1723
1724/*
1725 * Hostname has square brackets around it because it contains one or
1726 * more colons. We look for the first closing square bracket, and a
1727 * colon must follow it.
1728 */
1729static int nfs_parse_protected_hostname(const char *dev_name,
1730 char **hostname, size_t maxnamlen,
1731 char **export_path, size_t maxpathlen)
1732{
1733 size_t len;
1734 char *start, *end;
1735 1656
1736 start = (char *)(dev_name + 1); 1657 end = strchr(dev_name, ':');
1658 if (end == NULL)
1659 goto out_bad_devname;
1660 len = end - dev_name;
1737 1661
1738 end = strchr(start, ']'); 1662 /* kill possible hostname list: not supported */
1739 if (end == NULL) 1663 comma = strchr(dev_name, ',');
1740 goto out_bad_devname; 1664 if (comma != NULL && comma < end)
1741 if (*(end + 1) != ':') 1665 *comma = 0;
1742 goto out_bad_devname; 1666 }
1743 1667
1744 len = end - start;
1745 if (len > maxnamlen) 1668 if (len > maxnamlen)
1746 goto out_hostname; 1669 goto out_hostname;
1747 1670
1748 /* N.B. caller will free nfs_server.hostname in all cases */ 1671 /* N.B. caller will free nfs_server.hostname in all cases */
1749 *hostname = kstrndup(start, len, GFP_KERNEL); 1672 *hostname = kstrndup(dev_name, len, GFP_KERNEL);
1750 if (*hostname == NULL) 1673 if (*hostname == NULL)
1751 goto out_nomem; 1674 goto out_nomem;
1752 1675 len = strlen(++end);
1753 end += 2;
1754 len = strlen(end);
1755 if (len > maxpathlen) 1676 if (len > maxpathlen)
1756 goto out_path; 1677 goto out_path;
1757 *export_path = kstrndup(end, len, GFP_KERNEL); 1678 *export_path = kstrndup(end, len, GFP_KERNEL);
1758 if (!*export_path) 1679 if (!*export_path)
1759 goto out_nomem; 1680 goto out_nomem;
1760 1681
1682 dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
1761 return 0; 1683 return 0;
1762 1684
1763out_bad_devname: 1685out_bad_devname:
@@ -1778,29 +1700,6 @@ out_path:
1778} 1700}
1779 1701
1780/* 1702/*
1781 * Split "dev_name" into "hostname:export_path".
1782 *
1783 * The leftmost colon demarks the split between the server's hostname
1784 * and the export path. If the hostname starts with a left square
1785 * bracket, then it may contain colons.
1786 *
1787 * Note: caller frees hostname and export path, even on error.
1788 */
1789static int nfs_parse_devname(const char *dev_name,
1790 char **hostname, size_t maxnamlen,
1791 char **export_path, size_t maxpathlen)
1792{
1793 if (*dev_name == '[')
1794 return nfs_parse_protected_hostname(dev_name,
1795 hostname, maxnamlen,
1796 export_path, maxpathlen);
1797
1798 return nfs_parse_simple_hostname(dev_name,
1799 hostname, maxnamlen,
1800 export_path, maxpathlen);
1801}
1802
1803/*
1804 * Validate the NFS2/NFS3 mount data 1703 * Validate the NFS2/NFS3 mount data
1805 * - fills in the mount root filehandle 1704 * - fills in the mount root filehandle
1806 * 1705 *
@@ -2267,19 +2166,19 @@ static int nfs_bdi_register(struct nfs_server *server)
2267 return bdi_register_dev(&server->backing_dev_info, server->s_dev); 2166 return bdi_register_dev(&server->backing_dev_info, server->s_dev);
2268} 2167}
2269 2168
2270static int nfs_get_sb(struct file_system_type *fs_type, 2169static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
2271 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 2170 int flags, const char *dev_name, void *raw_data)
2272{ 2171{
2273 struct nfs_server *server = NULL; 2172 struct nfs_server *server = NULL;
2274 struct super_block *s; 2173 struct super_block *s;
2275 struct nfs_parsed_mount_data *data; 2174 struct nfs_parsed_mount_data *data;
2276 struct nfs_fh *mntfh; 2175 struct nfs_fh *mntfh;
2277 struct dentry *mntroot; 2176 struct dentry *mntroot = ERR_PTR(-ENOMEM);
2278 int (*compare_super)(struct super_block *, void *) = nfs_compare_super; 2177 int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
2279 struct nfs_sb_mountdata sb_mntdata = { 2178 struct nfs_sb_mountdata sb_mntdata = {
2280 .mntflags = flags, 2179 .mntflags = flags,
2281 }; 2180 };
2282 int error = -ENOMEM; 2181 int error;
2283 2182
2284 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); 2183 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
2285 mntfh = nfs_alloc_fhandle(); 2184 mntfh = nfs_alloc_fhandle();
@@ -2290,12 +2189,14 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2290 2189
2291 /* Validate the mount data */ 2190 /* Validate the mount data */
2292 error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); 2191 error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
2293 if (error < 0) 2192 if (error < 0) {
2193 mntroot = ERR_PTR(error);
2294 goto out; 2194 goto out;
2195 }
2295 2196
2296#ifdef CONFIG_NFS_V4 2197#ifdef CONFIG_NFS_V4
2297 if (data->version == 4) { 2198 if (data->version == 4) {
2298 error = nfs4_try_mount(flags, dev_name, data, mnt); 2199 mntroot = nfs4_try_mount(flags, dev_name, data);
2299 kfree(data->client_address); 2200 kfree(data->client_address);
2300 kfree(data->nfs_server.export_path); 2201 kfree(data->nfs_server.export_path);
2301 goto out; 2202 goto out;
@@ -2305,7 +2206,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2305 /* Get a volume representation */ 2206 /* Get a volume representation */
2306 server = nfs_create_server(data, mntfh); 2207 server = nfs_create_server(data, mntfh);
2307 if (IS_ERR(server)) { 2208 if (IS_ERR(server)) {
2308 error = PTR_ERR(server); 2209 mntroot = ERR_CAST(server);
2309 goto out; 2210 goto out;
2310 } 2211 }
2311 sb_mntdata.server = server; 2212 sb_mntdata.server = server;
@@ -2316,7 +2217,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2316 /* Get a superblock - note that we may end up sharing one that already exists */ 2217 /* Get a superblock - note that we may end up sharing one that already exists */
2317 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); 2218 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata);
2318 if (IS_ERR(s)) { 2219 if (IS_ERR(s)) {
2319 error = PTR_ERR(s); 2220 mntroot = ERR_CAST(s);
2320 goto out_err_nosb; 2221 goto out_err_nosb;
2321 } 2222 }
2322 2223
@@ -2325,8 +2226,10 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2325 server = NULL; 2226 server = NULL;
2326 } else { 2227 } else {
2327 error = nfs_bdi_register(server); 2228 error = nfs_bdi_register(server);
2328 if (error) 2229 if (error) {
2230 mntroot = ERR_PTR(error);
2329 goto error_splat_bdi; 2231 goto error_splat_bdi;
2232 }
2330 } 2233 }
2331 2234
2332 if (!s->s_root) { 2235 if (!s->s_root) {
@@ -2336,20 +2239,15 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2336 s, data ? data->fscache_uniq : NULL, NULL); 2239 s, data ? data->fscache_uniq : NULL, NULL);
2337 } 2240 }
2338 2241
2339 mntroot = nfs_get_root(s, mntfh); 2242 mntroot = nfs_get_root(s, mntfh, dev_name);
2340 if (IS_ERR(mntroot)) { 2243 if (IS_ERR(mntroot))
2341 error = PTR_ERR(mntroot);
2342 goto error_splat_super; 2244 goto error_splat_super;
2343 }
2344 2245
2345 error = security_sb_set_mnt_opts(s, &data->lsm_opts); 2246 error = security_sb_set_mnt_opts(s, &data->lsm_opts);
2346 if (error) 2247 if (error)
2347 goto error_splat_root; 2248 goto error_splat_root;
2348 2249
2349 s->s_flags |= MS_ACTIVE; 2250 s->s_flags |= MS_ACTIVE;
2350 mnt->mnt_sb = s;
2351 mnt->mnt_root = mntroot;
2352 error = 0;
2353 2251
2354out: 2252out:
2355 kfree(data->nfs_server.hostname); 2253 kfree(data->nfs_server.hostname);
@@ -2359,7 +2257,7 @@ out:
2359out_free_fh: 2257out_free_fh:
2360 nfs_free_fhandle(mntfh); 2258 nfs_free_fhandle(mntfh);
2361 kfree(data); 2259 kfree(data);
2362 return error; 2260 return mntroot;
2363 2261
2364out_err_nosb: 2262out_err_nosb:
2365 nfs_free_server(server); 2263 nfs_free_server(server);
@@ -2367,6 +2265,7 @@ out_err_nosb:
2367 2265
2368error_splat_root: 2266error_splat_root:
2369 dput(mntroot); 2267 dput(mntroot);
2268 mntroot = ERR_PTR(error);
2370error_splat_super: 2269error_splat_super:
2371 if (server && !s->s_root) 2270 if (server && !s->s_root)
2372 bdi_unregister(&server->backing_dev_info); 2271 bdi_unregister(&server->backing_dev_info);
@@ -2450,7 +2349,7 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
2450 nfs_fscache_get_super_cookie(s, NULL, data); 2349 nfs_fscache_get_super_cookie(s, NULL, data);
2451 } 2350 }
2452 2351
2453 mntroot = nfs_get_root(s, data->fh); 2352 mntroot = nfs_get_root(s, data->fh, dev_name);
2454 if (IS_ERR(mntroot)) { 2353 if (IS_ERR(mntroot)) {
2455 error = PTR_ERR(mntroot); 2354 error = PTR_ERR(mntroot);
2456 goto error_splat_super; 2355 goto error_splat_super;
@@ -2718,7 +2617,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
2718 s, data ? data->fscache_uniq : NULL, NULL); 2617 s, data ? data->fscache_uniq : NULL, NULL);
2719 } 2618 }
2720 2619
2721 mntroot = nfs4_get_root(s, mntfh); 2620 mntroot = nfs4_get_root(s, mntfh, dev_name);
2722 if (IS_ERR(mntroot)) { 2621 if (IS_ERR(mntroot)) {
2723 error = PTR_ERR(mntroot); 2622 error = PTR_ERR(mntroot);
2724 goto error_splat_super; 2623 goto error_splat_super;
@@ -2771,27 +2670,6 @@ static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type,
2771 return root_mnt; 2670 return root_mnt;
2772} 2671}
2773 2672
2774static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt)
2775{
2776 char *page = (char *) __get_free_page(GFP_KERNEL);
2777 char *devname, *tmp;
2778
2779 if (page == NULL)
2780 return;
2781 devname = nfs_path(path->mnt->mnt_devname,
2782 path->mnt->mnt_root, path->dentry,
2783 page, PAGE_SIZE);
2784 if (IS_ERR(devname))
2785 goto out_freepage;
2786 tmp = kstrdup(devname, GFP_KERNEL);
2787 if (tmp == NULL)
2788 goto out_freepage;
2789 kfree(mnt->mnt_devname);
2790 mnt->mnt_devname = tmp;
2791out_freepage:
2792 free_page((unsigned long)page);
2793}
2794
2795struct nfs_referral_count { 2673struct nfs_referral_count {
2796 struct list_head list; 2674 struct list_head list;
2797 const struct task_struct *task; 2675 const struct task_struct *task;
@@ -2858,17 +2736,18 @@ static void nfs_referral_loop_unprotect(void)
2858 kfree(p); 2736 kfree(p);
2859} 2737}
2860 2738
2861static int nfs_follow_remote_path(struct vfsmount *root_mnt, 2739static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
2862 const char *export_path, struct vfsmount *mnt_target) 2740 const char *export_path)
2863{ 2741{
2864 struct nameidata *nd = NULL; 2742 struct nameidata *nd = NULL;
2865 struct mnt_namespace *ns_private; 2743 struct mnt_namespace *ns_private;
2866 struct super_block *s; 2744 struct super_block *s;
2745 struct dentry *dentry;
2867 int ret; 2746 int ret;
2868 2747
2869 nd = kmalloc(sizeof(*nd), GFP_KERNEL); 2748 nd = kmalloc(sizeof(*nd), GFP_KERNEL);
2870 if (nd == NULL) 2749 if (nd == NULL)
2871 return -ENOMEM; 2750 return ERR_PTR(-ENOMEM);
2872 2751
2873 ns_private = create_mnt_ns(root_mnt); 2752 ns_private = create_mnt_ns(root_mnt);
2874 ret = PTR_ERR(ns_private); 2753 ret = PTR_ERR(ns_private);
@@ -2890,32 +2769,27 @@ static int nfs_follow_remote_path(struct vfsmount *root_mnt,
2890 2769
2891 s = nd->path.mnt->mnt_sb; 2770 s = nd->path.mnt->mnt_sb;
2892 atomic_inc(&s->s_active); 2771 atomic_inc(&s->s_active);
2893 mnt_target->mnt_sb = s; 2772 dentry = dget(nd->path.dentry);
2894 mnt_target->mnt_root = dget(nd->path.dentry);
2895
2896 /* Correct the device pathname */
2897 nfs_fix_devname(&nd->path, mnt_target);
2898 2773
2899 path_put(&nd->path); 2774 path_put(&nd->path);
2900 kfree(nd); 2775 kfree(nd);
2901 down_write(&s->s_umount); 2776 down_write(&s->s_umount);
2902 return 0; 2777 return dentry;
2903out_put_mnt_ns: 2778out_put_mnt_ns:
2904 put_mnt_ns(ns_private); 2779 put_mnt_ns(ns_private);
2905out_mntput: 2780out_mntput:
2906 mntput(root_mnt); 2781 mntput(root_mnt);
2907out_err: 2782out_err:
2908 kfree(nd); 2783 kfree(nd);
2909 return ret; 2784 return ERR_PTR(ret);
2910} 2785}
2911 2786
2912static int nfs4_try_mount(int flags, const char *dev_name, 2787static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
2913 struct nfs_parsed_mount_data *data, 2788 struct nfs_parsed_mount_data *data)
2914 struct vfsmount *mnt)
2915{ 2789{
2916 char *export_path; 2790 char *export_path;
2917 struct vfsmount *root_mnt; 2791 struct vfsmount *root_mnt;
2918 int error; 2792 struct dentry *res;
2919 2793
2920 dfprintk(MOUNT, "--> nfs4_try_mount()\n"); 2794 dfprintk(MOUNT, "--> nfs4_try_mount()\n");
2921 2795
@@ -2925,26 +2799,25 @@ static int nfs4_try_mount(int flags, const char *dev_name,
2925 data->nfs_server.hostname); 2799 data->nfs_server.hostname);
2926 data->nfs_server.export_path = export_path; 2800 data->nfs_server.export_path = export_path;
2927 2801
2928 error = PTR_ERR(root_mnt); 2802 res = ERR_CAST(root_mnt);
2929 if (IS_ERR(root_mnt)) 2803 if (!IS_ERR(root_mnt))
2930 goto out; 2804 res = nfs_follow_remote_path(root_mnt, export_path);
2931
2932 error = nfs_follow_remote_path(root_mnt, export_path, mnt);
2933 2805
2934out: 2806 dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n",
2935 dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error, 2807 IS_ERR(res) ? PTR_ERR(res) : 0,
2936 error != 0 ? " [error]" : ""); 2808 IS_ERR(res) ? " [error]" : "");
2937 return error; 2809 return res;
2938} 2810}
2939 2811
2940/* 2812/*
2941 * Get the superblock for an NFS4 mountpoint 2813 * Get the superblock for an NFS4 mountpoint
2942 */ 2814 */
2943static int nfs4_get_sb(struct file_system_type *fs_type, 2815static struct dentry *nfs4_mount(struct file_system_type *fs_type,
2944 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 2816 int flags, const char *dev_name, void *raw_data)
2945{ 2817{
2946 struct nfs_parsed_mount_data *data; 2818 struct nfs_parsed_mount_data *data;
2947 int error = -ENOMEM; 2819 int error = -ENOMEM;
2820 struct dentry *res = ERR_PTR(-ENOMEM);
2948 2821
2949 data = nfs_alloc_parsed_mount_data(4); 2822 data = nfs_alloc_parsed_mount_data(4);
2950 if (data == NULL) 2823 if (data == NULL)
@@ -2952,10 +2825,14 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2952 2825
2953 /* Validate the mount data */ 2826 /* Validate the mount data */
2954 error = nfs4_validate_mount_data(raw_data, data, dev_name); 2827 error = nfs4_validate_mount_data(raw_data, data, dev_name);
2955 if (error < 0) 2828 if (error < 0) {
2829 res = ERR_PTR(error);
2956 goto out; 2830 goto out;
2831 }
2957 2832
2958 error = nfs4_try_mount(flags, dev_name, data, mnt); 2833 res = nfs4_try_mount(flags, dev_name, data);
2834 if (IS_ERR(res))
2835 error = PTR_ERR(res);
2959 2836
2960out: 2837out:
2961 kfree(data->client_address); 2838 kfree(data->client_address);
@@ -2964,9 +2841,9 @@ out:
2964 kfree(data->fscache_uniq); 2841 kfree(data->fscache_uniq);
2965out_free_data: 2842out_free_data:
2966 kfree(data); 2843 kfree(data);
2967 dprintk("<-- nfs4_get_sb() = %d%s\n", error, 2844 dprintk("<-- nfs4_mount() = %d%s\n", error,
2968 error != 0 ? " [error]" : ""); 2845 error != 0 ? " [error]" : "");
2969 return error; 2846 return res;
2970} 2847}
2971 2848
2972static void nfs4_kill_super(struct super_block *sb) 2849static void nfs4_kill_super(struct super_block *sb)
@@ -3033,7 +2910,7 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
3033 nfs_fscache_get_super_cookie(s, NULL, data); 2910 nfs_fscache_get_super_cookie(s, NULL, data);
3034 } 2911 }
3035 2912
3036 mntroot = nfs4_get_root(s, data->fh); 2913 mntroot = nfs4_get_root(s, data->fh, dev_name);
3037 if (IS_ERR(mntroot)) { 2914 if (IS_ERR(mntroot)) {
3038 error = PTR_ERR(mntroot); 2915 error = PTR_ERR(mntroot);
3039 goto error_splat_super; 2916 goto error_splat_super;
@@ -3120,7 +2997,7 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
3120 nfs_fscache_get_super_cookie(s, NULL, data); 2997 nfs_fscache_get_super_cookie(s, NULL, data);
3121 } 2998 }
3122 2999
3123 mntroot = nfs4_get_root(s, mntfh); 3000 mntroot = nfs4_get_root(s, mntfh, dev_name);
3124 if (IS_ERR(mntroot)) { 3001 if (IS_ERR(mntroot)) {
3125 error = PTR_ERR(mntroot); 3002 error = PTR_ERR(mntroot);
3126 goto error_splat_super; 3003 goto error_splat_super;
@@ -3160,16 +3037,15 @@ error_splat_bdi:
3160/* 3037/*
3161 * Create an NFS4 server record on referral traversal 3038 * Create an NFS4 server record on referral traversal
3162 */ 3039 */
3163static int nfs4_referral_get_sb(struct file_system_type *fs_type, 3040static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
3164 int flags, const char *dev_name, void *raw_data, 3041 int flags, const char *dev_name, void *raw_data)
3165 struct vfsmount *mnt)
3166{ 3042{
3167 struct nfs_clone_mount *data = raw_data; 3043 struct nfs_clone_mount *data = raw_data;
3168 char *export_path; 3044 char *export_path;
3169 struct vfsmount *root_mnt; 3045 struct vfsmount *root_mnt;
3170 int error; 3046 struct dentry *res;
3171 3047
3172 dprintk("--> nfs4_referral_get_sb()\n"); 3048 dprintk("--> nfs4_referral_mount()\n");
3173 3049
3174 export_path = data->mnt_path; 3050 export_path = data->mnt_path;
3175 data->mnt_path = "/"; 3051 data->mnt_path = "/";
@@ -3178,15 +3054,13 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type,
3178 flags, data, data->hostname); 3054 flags, data, data->hostname);
3179 data->mnt_path = export_path; 3055 data->mnt_path = export_path;
3180 3056
3181 error = PTR_ERR(root_mnt); 3057 res = ERR_CAST(root_mnt);
3182 if (IS_ERR(root_mnt)) 3058 if (!IS_ERR(root_mnt))
3183 goto out; 3059 res = nfs_follow_remote_path(root_mnt, export_path);
3184 3060 dprintk("<-- nfs4_referral_mount() = %ld%s\n",
3185 error = nfs_follow_remote_path(root_mnt, export_path, mnt); 3061 IS_ERR(res) ? PTR_ERR(res) : 0,
3186out: 3062 IS_ERR(res) ? " [error]" : "");
3187 dprintk("<-- nfs4_referral_get_sb() = %d%s\n", error, 3063 return res;
3188 error != 0 ? " [error]" : "");
3189 return error;
3190} 3064}
3191 3065
3192#endif /* CONFIG_NFS_V4 */ 3066#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index e313a51acdd1..8d6864c2a5fa 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -148,6 +148,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
148 alias = d_lookup(parent, &data->args.name); 148 alias = d_lookup(parent, &data->args.name);
149 if (alias != NULL) { 149 if (alias != NULL) {
150 int ret = 0; 150 int ret = 0;
151 void *devname_garbage = NULL;
151 152
152 /* 153 /*
153 * Hey, we raced with lookup... See if we need to transfer 154 * Hey, we raced with lookup... See if we need to transfer
@@ -157,6 +158,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
157 spin_lock(&alias->d_lock); 158 spin_lock(&alias->d_lock);
158 if (alias->d_inode != NULL && 159 if (alias->d_inode != NULL &&
159 !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { 160 !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
161 devname_garbage = alias->d_fsdata;
160 alias->d_fsdata = data; 162 alias->d_fsdata = data;
161 alias->d_flags |= DCACHE_NFSFS_RENAMED; 163 alias->d_flags |= DCACHE_NFSFS_RENAMED;
162 ret = 1; 164 ret = 1;
@@ -164,6 +166,13 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
164 spin_unlock(&alias->d_lock); 166 spin_unlock(&alias->d_lock);
165 nfs_dec_sillycount(dir); 167 nfs_dec_sillycount(dir);
166 dput(alias); 168 dput(alias);
169 /*
170 * If we'd displaced old cached devname, free it. At that
171 * point dentry is definitely not a root, so we won't need
172 * that anymore.
173 */
174 if (devname_garbage)
175 kfree(devname_garbage);
167 return ret; 176 return ret;
168 } 177 }
169 data->dir = igrab(dir); 178 data->dir = igrab(dir);
@@ -180,7 +189,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
180 task_setup_data.rpc_client = NFS_CLIENT(dir); 189 task_setup_data.rpc_client = NFS_CLIENT(dir);
181 task = rpc_run_task(&task_setup_data); 190 task = rpc_run_task(&task_setup_data);
182 if (!IS_ERR(task)) 191 if (!IS_ERR(task))
183 rpc_put_task(task); 192 rpc_put_task_async(task);
184 return 1; 193 return 1;
185} 194}
186 195
@@ -252,6 +261,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
252{ 261{
253 struct nfs_unlinkdata *data; 262 struct nfs_unlinkdata *data;
254 int status = -ENOMEM; 263 int status = -ENOMEM;
264 void *devname_garbage = NULL;
255 265
256 data = kzalloc(sizeof(*data), GFP_KERNEL); 266 data = kzalloc(sizeof(*data), GFP_KERNEL);
257 if (data == NULL) 267 if (data == NULL)
@@ -269,8 +279,16 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
269 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) 279 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
270 goto out_unlock; 280 goto out_unlock;
271 dentry->d_flags |= DCACHE_NFSFS_RENAMED; 281 dentry->d_flags |= DCACHE_NFSFS_RENAMED;
282 devname_garbage = dentry->d_fsdata;
272 dentry->d_fsdata = data; 283 dentry->d_fsdata = data;
273 spin_unlock(&dentry->d_lock); 284 spin_unlock(&dentry->d_lock);
285 /*
286 * If we'd displaced old cached devname, free it. At that
287 * point dentry is definitely not a root, so we won't need
288 * that anymore.
289 */
290 if (devname_garbage)
291 kfree(devname_garbage);
274 return 0; 292 return 0;
275out_unlock: 293out_unlock:
276 spin_unlock(&dentry->d_lock); 294 spin_unlock(&dentry->d_lock);
@@ -299,6 +317,7 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
299 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 317 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
300 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; 318 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
301 data = dentry->d_fsdata; 319 data = dentry->d_fsdata;
320 dentry->d_fsdata = NULL;
302 } 321 }
303 spin_unlock(&dentry->d_lock); 322 spin_unlock(&dentry->d_lock);
304 323
@@ -315,6 +334,7 @@ nfs_cancel_async_unlink(struct dentry *dentry)
315 struct nfs_unlinkdata *data = dentry->d_fsdata; 334 struct nfs_unlinkdata *data = dentry->d_fsdata;
316 335
317 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; 336 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
337 dentry->d_fsdata = NULL;
318 spin_unlock(&dentry->d_lock); 338 spin_unlock(&dentry->d_lock);
319 nfs_free_unlinkdata(data); 339 nfs_free_unlinkdata(data);
320 return; 340 return;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c8278f4046cb..e4cbc11a74ab 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -28,6 +28,7 @@
28#include "iostat.h" 28#include "iostat.h"
29#include "nfs4_fs.h" 29#include "nfs4_fs.h"
30#include "fscache.h" 30#include "fscache.h"
31#include "pnfs.h"
31 32
32#define NFSDBG_FACILITY NFSDBG_PAGECACHE 33#define NFSDBG_FACILITY NFSDBG_PAGECACHE
33 34
@@ -58,6 +59,7 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
58 } 59 }
59 return p; 60 return p;
60} 61}
62EXPORT_SYMBOL_GPL(nfs_commitdata_alloc);
61 63
62void nfs_commit_free(struct nfs_write_data *p) 64void nfs_commit_free(struct nfs_write_data *p)
63{ 65{
@@ -65,6 +67,7 @@ void nfs_commit_free(struct nfs_write_data *p)
65 kfree(p->pagevec); 67 kfree(p->pagevec);
66 mempool_free(p, nfs_commit_mempool); 68 mempool_free(p, nfs_commit_mempool);
67} 69}
70EXPORT_SYMBOL_GPL(nfs_commit_free);
68 71
69struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) 72struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
70{ 73{
@@ -96,6 +99,7 @@ void nfs_writedata_free(struct nfs_write_data *p)
96 99
97static void nfs_writedata_release(struct nfs_write_data *wdata) 100static void nfs_writedata_release(struct nfs_write_data *wdata)
98{ 101{
102 put_lseg(wdata->lseg);
99 put_nfs_open_context(wdata->args.context); 103 put_nfs_open_context(wdata->args.context);
100 nfs_writedata_free(wdata); 104 nfs_writedata_free(wdata);
101} 105}
@@ -177,8 +181,8 @@ static int wb_priority(struct writeback_control *wbc)
177 if (wbc->for_reclaim) 181 if (wbc->for_reclaim)
178 return FLUSH_HIGHPRI | FLUSH_STABLE; 182 return FLUSH_HIGHPRI | FLUSH_STABLE;
179 if (wbc->for_kupdate || wbc->for_background) 183 if (wbc->for_kupdate || wbc->for_background)
180 return FLUSH_LOWPRI; 184 return FLUSH_LOWPRI | FLUSH_COND_STABLE;
181 return 0; 185 return FLUSH_COND_STABLE;
182} 186}
183 187
184/* 188/*
@@ -385,11 +389,8 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
385 spin_lock(&inode->i_lock); 389 spin_lock(&inode->i_lock);
386 error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); 390 error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
387 BUG_ON(error); 391 BUG_ON(error);
388 if (!nfsi->npages) { 392 if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE))
389 igrab(inode); 393 nfsi->change_attr++;
390 if (nfs_have_delegation(inode, FMODE_WRITE))
391 nfsi->change_attr++;
392 }
393 set_bit(PG_MAPPED, &req->wb_flags); 394 set_bit(PG_MAPPED, &req->wb_flags);
394 SetPagePrivate(req->wb_page); 395 SetPagePrivate(req->wb_page);
395 set_page_private(req->wb_page, (unsigned long)req); 396 set_page_private(req->wb_page, (unsigned long)req);
@@ -419,11 +420,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
419 clear_bit(PG_MAPPED, &req->wb_flags); 420 clear_bit(PG_MAPPED, &req->wb_flags);
420 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); 421 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
421 nfsi->npages--; 422 nfsi->npages--;
422 if (!nfsi->npages) { 423 spin_unlock(&inode->i_lock);
423 spin_unlock(&inode->i_lock);
424 iput(inode);
425 } else
426 spin_unlock(&inode->i_lock);
427 nfs_release_request(req); 424 nfs_release_request(req);
428} 425}
429 426
@@ -439,7 +436,7 @@ nfs_mark_request_dirty(struct nfs_page *req)
439 * Add a request to the inode's commit list. 436 * Add a request to the inode's commit list.
440 */ 437 */
441static void 438static void
442nfs_mark_request_commit(struct nfs_page *req) 439nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
443{ 440{
444 struct inode *inode = req->wb_context->path.dentry->d_inode; 441 struct inode *inode = req->wb_context->path.dentry->d_inode;
445 struct nfs_inode *nfsi = NFS_I(inode); 442 struct nfs_inode *nfsi = NFS_I(inode);
@@ -451,6 +448,7 @@ nfs_mark_request_commit(struct nfs_page *req)
451 NFS_PAGE_TAG_COMMIT); 448 NFS_PAGE_TAG_COMMIT);
452 nfsi->ncommit++; 449 nfsi->ncommit++;
453 spin_unlock(&inode->i_lock); 450 spin_unlock(&inode->i_lock);
451 pnfs_mark_request_commit(req, lseg);
454 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 452 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
455 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); 453 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
456 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 454 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
@@ -472,14 +470,18 @@ nfs_clear_request_commit(struct nfs_page *req)
472static inline 470static inline
473int nfs_write_need_commit(struct nfs_write_data *data) 471int nfs_write_need_commit(struct nfs_write_data *data)
474{ 472{
475 return data->verf.committed != NFS_FILE_SYNC; 473 if (data->verf.committed == NFS_DATA_SYNC)
474 return data->lseg == NULL;
475 else
476 return data->verf.committed != NFS_FILE_SYNC;
476} 477}
477 478
478static inline 479static inline
479int nfs_reschedule_unstable_write(struct nfs_page *req) 480int nfs_reschedule_unstable_write(struct nfs_page *req,
481 struct nfs_write_data *data)
480{ 482{
481 if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { 483 if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) {
482 nfs_mark_request_commit(req); 484 nfs_mark_request_commit(req, data->lseg);
483 return 1; 485 return 1;
484 } 486 }
485 if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) { 487 if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) {
@@ -490,7 +492,7 @@ int nfs_reschedule_unstable_write(struct nfs_page *req)
490} 492}
491#else 493#else
492static inline void 494static inline void
493nfs_mark_request_commit(struct nfs_page *req) 495nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
494{ 496{
495} 497}
496 498
@@ -507,7 +509,8 @@ int nfs_write_need_commit(struct nfs_write_data *data)
507} 509}
508 510
509static inline 511static inline
510int nfs_reschedule_unstable_write(struct nfs_page *req) 512int nfs_reschedule_unstable_write(struct nfs_page *req,
513 struct nfs_write_data *data)
511{ 514{
512 return 0; 515 return 0;
513} 516}
@@ -539,11 +542,15 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, u
539 if (!nfs_need_commit(nfsi)) 542 if (!nfs_need_commit(nfsi))
540 return 0; 543 return 0;
541 544
545 spin_lock(&inode->i_lock);
542 ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); 546 ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
543 if (ret > 0) 547 if (ret > 0)
544 nfsi->ncommit -= ret; 548 nfsi->ncommit -= ret;
549 spin_unlock(&inode->i_lock);
550
545 if (nfs_need_commit(NFS_I(inode))) 551 if (nfs_need_commit(NFS_I(inode)))
546 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 552 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
553
547 return ret; 554 return ret;
548} 555}
549#else 556#else
@@ -610,9 +617,11 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
610 } 617 }
611 618
612 if (nfs_clear_request_commit(req) && 619 if (nfs_clear_request_commit(req) &&
613 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, 620 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
614 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) 621 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) {
615 NFS_I(inode)->ncommit--; 622 NFS_I(inode)->ncommit--;
623 pnfs_clear_request_commit(req);
624 }
616 625
617 /* Okay, the request matches. Update the region */ 626 /* Okay, the request matches. Update the region */
618 if (offset < req->wb_offset) { 627 if (offset < req->wb_offset) {
@@ -760,11 +769,12 @@ int nfs_updatepage(struct file *file, struct page *page,
760 return status; 769 return status;
761} 770}
762 771
763static void nfs_writepage_release(struct nfs_page *req) 772static void nfs_writepage_release(struct nfs_page *req,
773 struct nfs_write_data *data)
764{ 774{
765 struct page *page = req->wb_page; 775 struct page *page = req->wb_page;
766 776
767 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) 777 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data))
768 nfs_inode_remove_request(req); 778 nfs_inode_remove_request(req);
769 nfs_clear_page_tag_locked(req); 779 nfs_clear_page_tag_locked(req);
770 nfs_end_page_writeback(page); 780 nfs_end_page_writeback(page);
@@ -781,25 +791,21 @@ static int flush_task_priority(int how)
781 return RPC_PRIORITY_NORMAL; 791 return RPC_PRIORITY_NORMAL;
782} 792}
783 793
784/* 794int nfs_initiate_write(struct nfs_write_data *data,
785 * Set up the argument/result storage required for the RPC call. 795 struct rpc_clnt *clnt,
786 */ 796 const struct rpc_call_ops *call_ops,
787static int nfs_write_rpcsetup(struct nfs_page *req, 797 int how)
788 struct nfs_write_data *data,
789 const struct rpc_call_ops *call_ops,
790 unsigned int count, unsigned int offset,
791 int how)
792{ 798{
793 struct inode *inode = req->wb_context->path.dentry->d_inode; 799 struct inode *inode = data->inode;
794 int priority = flush_task_priority(how); 800 int priority = flush_task_priority(how);
795 struct rpc_task *task; 801 struct rpc_task *task;
796 struct rpc_message msg = { 802 struct rpc_message msg = {
797 .rpc_argp = &data->args, 803 .rpc_argp = &data->args,
798 .rpc_resp = &data->res, 804 .rpc_resp = &data->res,
799 .rpc_cred = req->wb_context->cred, 805 .rpc_cred = data->cred,
800 }; 806 };
801 struct rpc_task_setup task_setup_data = { 807 struct rpc_task_setup task_setup_data = {
802 .rpc_client = NFS_CLIENT(inode), 808 .rpc_client = clnt,
803 .task = &data->task, 809 .task = &data->task,
804 .rpc_message = &msg, 810 .rpc_message = &msg,
805 .callback_ops = call_ops, 811 .callback_ops = call_ops,
@@ -810,12 +816,52 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
810 }; 816 };
811 int ret = 0; 817 int ret = 0;
812 818
819 /* Set up the initial task struct. */
820 NFS_PROTO(inode)->write_setup(data, &msg);
821
822 dprintk("NFS: %5u initiated write call "
823 "(req %s/%lld, %u bytes @ offset %llu)\n",
824 data->task.tk_pid,
825 inode->i_sb->s_id,
826 (long long)NFS_FILEID(inode),
827 data->args.count,
828 (unsigned long long)data->args.offset);
829
830 task = rpc_run_task(&task_setup_data);
831 if (IS_ERR(task)) {
832 ret = PTR_ERR(task);
833 goto out;
834 }
835 if (how & FLUSH_SYNC) {
836 ret = rpc_wait_for_completion_task(task);
837 if (ret == 0)
838 ret = task->tk_status;
839 }
840 rpc_put_task(task);
841out:
842 return ret;
843}
844EXPORT_SYMBOL_GPL(nfs_initiate_write);
845
846/*
847 * Set up the argument/result storage required for the RPC call.
848 */
849static int nfs_write_rpcsetup(struct nfs_page *req,
850 struct nfs_write_data *data,
851 const struct rpc_call_ops *call_ops,
852 unsigned int count, unsigned int offset,
853 struct pnfs_layout_segment *lseg,
854 int how)
855{
856 struct inode *inode = req->wb_context->path.dentry->d_inode;
857
813 /* Set up the RPC argument and reply structs 858 /* Set up the RPC argument and reply structs
814 * NB: take care not to mess about with data->commit et al. */ 859 * NB: take care not to mess about with data->commit et al. */
815 860
816 data->req = req; 861 data->req = req;
817 data->inode = inode = req->wb_context->path.dentry->d_inode; 862 data->inode = inode = req->wb_context->path.dentry->d_inode;
818 data->cred = msg.rpc_cred; 863 data->cred = req->wb_context->cred;
864 data->lseg = get_lseg(lseg);
819 865
820 data->args.fh = NFS_FH(inode); 866 data->args.fh = NFS_FH(inode);
821 data->args.offset = req_offset(req) + offset; 867 data->args.offset = req_offset(req) + offset;
@@ -825,7 +871,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
825 data->args.context = get_nfs_open_context(req->wb_context); 871 data->args.context = get_nfs_open_context(req->wb_context);
826 data->args.lock_context = req->wb_lock_context; 872 data->args.lock_context = req->wb_lock_context;
827 data->args.stable = NFS_UNSTABLE; 873 data->args.stable = NFS_UNSTABLE;
828 if (how & FLUSH_STABLE) { 874 if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
829 data->args.stable = NFS_DATA_SYNC; 875 data->args.stable = NFS_DATA_SYNC;
830 if (!nfs_need_commit(NFS_I(inode))) 876 if (!nfs_need_commit(NFS_I(inode)))
831 data->args.stable = NFS_FILE_SYNC; 877 data->args.stable = NFS_FILE_SYNC;
@@ -836,30 +882,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
836 data->res.verf = &data->verf; 882 data->res.verf = &data->verf;
837 nfs_fattr_init(&data->fattr); 883 nfs_fattr_init(&data->fattr);
838 884
839 /* Set up the initial task struct. */ 885 if (data->lseg &&
840 NFS_PROTO(inode)->write_setup(data, &msg); 886 (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
841 887 return 0;
842 dprintk("NFS: %5u initiated write call "
843 "(req %s/%lld, %u bytes @ offset %llu)\n",
844 data->task.tk_pid,
845 inode->i_sb->s_id,
846 (long long)NFS_FILEID(inode),
847 count,
848 (unsigned long long)data->args.offset);
849 888
850 task = rpc_run_task(&task_setup_data); 889 return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
851 if (IS_ERR(task)) {
852 ret = PTR_ERR(task);
853 goto out;
854 }
855 if (how & FLUSH_SYNC) {
856 ret = rpc_wait_for_completion_task(task);
857 if (ret == 0)
858 ret = task->tk_status;
859 }
860 rpc_put_task(task);
861out:
862 return ret;
863} 890}
864 891
865/* If a nfs_flush_* function fails, it should remove reqs from @head and 892/* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -879,20 +906,27 @@ static void nfs_redirty_request(struct nfs_page *req)
879 * Generate multiple small requests to write out a single 906 * Generate multiple small requests to write out a single
880 * contiguous dirty area on one page. 907 * contiguous dirty area on one page.
881 */ 908 */
882static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) 909static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
883{ 910{
884 struct nfs_page *req = nfs_list_entry(head->next); 911 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
885 struct page *page = req->wb_page; 912 struct page *page = req->wb_page;
886 struct nfs_write_data *data; 913 struct nfs_write_data *data;
887 size_t wsize = NFS_SERVER(inode)->wsize, nbytes; 914 size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes;
888 unsigned int offset; 915 unsigned int offset;
889 int requests = 0; 916 int requests = 0;
890 int ret = 0; 917 int ret = 0;
918 struct pnfs_layout_segment *lseg;
891 LIST_HEAD(list); 919 LIST_HEAD(list);
892 920
893 nfs_list_remove_request(req); 921 nfs_list_remove_request(req);
894 922
895 nbytes = count; 923 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
924 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit ||
925 desc->pg_count > wsize))
926 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
927
928
929 nbytes = desc->pg_count;
896 do { 930 do {
897 size_t len = min(nbytes, wsize); 931 size_t len = min(nbytes, wsize);
898 932
@@ -905,9 +939,11 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
905 } while (nbytes != 0); 939 } while (nbytes != 0);
906 atomic_set(&req->wb_complete, requests); 940 atomic_set(&req->wb_complete, requests);
907 941
942 BUG_ON(desc->pg_lseg);
943 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
908 ClearPageError(page); 944 ClearPageError(page);
909 offset = 0; 945 offset = 0;
910 nbytes = count; 946 nbytes = desc->pg_count;
911 do { 947 do {
912 int ret2; 948 int ret2;
913 949
@@ -919,13 +955,15 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
919 if (nbytes < wsize) 955 if (nbytes < wsize)
920 wsize = nbytes; 956 wsize = nbytes;
921 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, 957 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
922 wsize, offset, how); 958 wsize, offset, lseg, desc->pg_ioflags);
923 if (ret == 0) 959 if (ret == 0)
924 ret = ret2; 960 ret = ret2;
925 offset += wsize; 961 offset += wsize;
926 nbytes -= wsize; 962 nbytes -= wsize;
927 } while (nbytes != 0); 963 } while (nbytes != 0);
928 964
965 put_lseg(lseg);
966 desc->pg_lseg = NULL;
929 return ret; 967 return ret;
930 968
931out_bad: 969out_bad:
@@ -946,16 +984,26 @@ out_bad:
946 * This is the case if nfs_updatepage detects a conflicting request 984 * This is the case if nfs_updatepage detects a conflicting request
947 * that has been written but not committed. 985 * that has been written but not committed.
948 */ 986 */
949static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) 987static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
950{ 988{
951 struct nfs_page *req; 989 struct nfs_page *req;
952 struct page **pages; 990 struct page **pages;
953 struct nfs_write_data *data; 991 struct nfs_write_data *data;
992 struct list_head *head = &desc->pg_list;
993 struct pnfs_layout_segment *lseg = desc->pg_lseg;
994 int ret;
954 995
955 data = nfs_writedata_alloc(npages); 996 data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base,
956 if (!data) 997 desc->pg_count));
957 goto out_bad; 998 if (!data) {
958 999 while (!list_empty(head)) {
1000 req = nfs_list_entry(head->next);
1001 nfs_list_remove_request(req);
1002 nfs_redirty_request(req);
1003 }
1004 ret = -ENOMEM;
1005 goto out;
1006 }
959 pages = data->pagevec; 1007 pages = data->pagevec;
960 while (!list_empty(head)) { 1008 while (!list_empty(head)) {
961 req = nfs_list_entry(head->next); 1009 req = nfs_list_entry(head->next);
@@ -965,16 +1013,19 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
965 *pages++ = req->wb_page; 1013 *pages++ = req->wb_page;
966 } 1014 }
967 req = nfs_list_entry(data->pages.next); 1015 req = nfs_list_entry(data->pages.next);
1016 if ((!lseg) && list_is_singular(&data->pages))
1017 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
1018
1019 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
1020 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
1021 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
968 1022
969 /* Set up the argument struct */ 1023 /* Set up the argument struct */
970 return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); 1024 ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags);
971 out_bad: 1025out:
972 while (!list_empty(head)) { 1026 put_lseg(lseg); /* Cleans any gotten in ->pg_test */
973 req = nfs_list_entry(head->next); 1027 desc->pg_lseg = NULL;
974 nfs_list_remove_request(req); 1028 return ret;
975 nfs_redirty_request(req);
976 }
977 return -ENOMEM;
978} 1029}
979 1030
980static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, 1031static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
@@ -982,6 +1033,8 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
982{ 1033{
983 size_t wsize = NFS_SERVER(inode)->wsize; 1034 size_t wsize = NFS_SERVER(inode)->wsize;
984 1035
1036 pnfs_pageio_init_write(pgio, inode);
1037
985 if (wsize < PAGE_CACHE_SIZE) 1038 if (wsize < PAGE_CACHE_SIZE)
986 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); 1039 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
987 else 1040 else
@@ -1039,7 +1092,7 @@ static void nfs_writeback_release_partial(void *calldata)
1039 1092
1040out: 1093out:
1041 if (atomic_dec_and_test(&req->wb_complete)) 1094 if (atomic_dec_and_test(&req->wb_complete))
1042 nfs_writepage_release(req); 1095 nfs_writepage_release(req, data);
1043 nfs_writedata_release(calldata); 1096 nfs_writedata_release(calldata);
1044} 1097}
1045 1098
@@ -1106,7 +1159,7 @@ static void nfs_writeback_release_full(void *calldata)
1106 1159
1107 if (nfs_write_need_commit(data)) { 1160 if (nfs_write_need_commit(data)) {
1108 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); 1161 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
1109 nfs_mark_request_commit(req); 1162 nfs_mark_request_commit(req, data->lseg);
1110 dprintk(" marked for commit\n"); 1163 dprintk(" marked for commit\n");
1111 goto next; 1164 goto next;
1112 } 1165 }
@@ -1132,7 +1185,7 @@ static const struct rpc_call_ops nfs_write_full_ops = {
1132/* 1185/*
1133 * This function is called when the WRITE call is complete. 1186 * This function is called when the WRITE call is complete.
1134 */ 1187 */
1135int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) 1188void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1136{ 1189{
1137 struct nfs_writeargs *argp = &data->args; 1190 struct nfs_writeargs *argp = &data->args;
1138 struct nfs_writeres *resp = &data->res; 1191 struct nfs_writeres *resp = &data->res;
@@ -1151,7 +1204,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1151 */ 1204 */
1152 status = NFS_PROTO(data->inode)->write_done(task, data); 1205 status = NFS_PROTO(data->inode)->write_done(task, data);
1153 if (status != 0) 1206 if (status != 0)
1154 return status; 1207 return;
1155 nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); 1208 nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
1156 1209
1157#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1210#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -1166,6 +1219,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1166 */ 1219 */
1167 static unsigned long complain; 1220 static unsigned long complain;
1168 1221
1222 /* Note this will print the MDS for a DS write */
1169 if (time_before(complain, jiffies)) { 1223 if (time_before(complain, jiffies)) {
1170 dprintk("NFS: faulty NFS server %s:" 1224 dprintk("NFS: faulty NFS server %s:"
1171 " (committed = %d) != (stable = %d)\n", 1225 " (committed = %d) != (stable = %d)\n",
@@ -1186,6 +1240,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1186 /* Was this an NFSv2 write or an NFSv3 stable write? */ 1240 /* Was this an NFSv2 write or an NFSv3 stable write? */
1187 if (resp->verf->committed != NFS_UNSTABLE) { 1241 if (resp->verf->committed != NFS_UNSTABLE) {
1188 /* Resend from where the server left off */ 1242 /* Resend from where the server left off */
1243 data->mds_offset += resp->count;
1189 argp->offset += resp->count; 1244 argp->offset += resp->count;
1190 argp->pgbase += resp->count; 1245 argp->pgbase += resp->count;
1191 argp->count -= resp->count; 1246 argp->count -= resp->count;
@@ -1196,7 +1251,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1196 argp->stable = NFS_FILE_SYNC; 1251 argp->stable = NFS_FILE_SYNC;
1197 } 1252 }
1198 nfs_restart_rpc(task, server->nfs_client); 1253 nfs_restart_rpc(task, server->nfs_client);
1199 return -EAGAIN; 1254 return;
1200 } 1255 }
1201 if (time_before(complain, jiffies)) { 1256 if (time_before(complain, jiffies)) {
1202 printk(KERN_WARNING 1257 printk(KERN_WARNING
@@ -1207,64 +1262,89 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1207 /* Can't do anything about it except throw an error. */ 1262 /* Can't do anything about it except throw an error. */
1208 task->tk_status = -EIO; 1263 task->tk_status = -EIO;
1209 } 1264 }
1210 return 0; 1265 return;
1211} 1266}
1212 1267
1213 1268
1214#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1269#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
1215static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) 1270static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
1216{ 1271{
1272 int ret;
1273
1217 if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) 1274 if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
1218 return 1; 1275 return 1;
1219 if (may_wait && !out_of_line_wait_on_bit_lock(&nfsi->flags, 1276 if (!may_wait)
1220 NFS_INO_COMMIT, nfs_wait_bit_killable, 1277 return 0;
1221 TASK_KILLABLE)) 1278 ret = out_of_line_wait_on_bit_lock(&nfsi->flags,
1222 return 1; 1279 NFS_INO_COMMIT,
1223 return 0; 1280 nfs_wait_bit_killable,
1281 TASK_KILLABLE);
1282 return (ret < 0) ? ret : 1;
1224} 1283}
1225 1284
1226static void nfs_commit_clear_lock(struct nfs_inode *nfsi) 1285void nfs_commit_clear_lock(struct nfs_inode *nfsi)
1227{ 1286{
1228 clear_bit(NFS_INO_COMMIT, &nfsi->flags); 1287 clear_bit(NFS_INO_COMMIT, &nfsi->flags);
1229 smp_mb__after_clear_bit(); 1288 smp_mb__after_clear_bit();
1230 wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); 1289 wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
1231} 1290}
1291EXPORT_SYMBOL_GPL(nfs_commit_clear_lock);
1232 1292
1233 1293void nfs_commitdata_release(void *data)
1234static void nfs_commitdata_release(void *data)
1235{ 1294{
1236 struct nfs_write_data *wdata = data; 1295 struct nfs_write_data *wdata = data;
1237 1296
1297 put_lseg(wdata->lseg);
1238 put_nfs_open_context(wdata->args.context); 1298 put_nfs_open_context(wdata->args.context);
1239 nfs_commit_free(wdata); 1299 nfs_commit_free(wdata);
1240} 1300}
1301EXPORT_SYMBOL_GPL(nfs_commitdata_release);
1241 1302
1242/* 1303int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt,
1243 * Set up the argument/result storage required for the RPC call. 1304 const struct rpc_call_ops *call_ops,
1244 */ 1305 int how)
1245static int nfs_commit_rpcsetup(struct list_head *head,
1246 struct nfs_write_data *data,
1247 int how)
1248{ 1306{
1249 struct nfs_page *first = nfs_list_entry(head->next);
1250 struct inode *inode = first->wb_context->path.dentry->d_inode;
1251 int priority = flush_task_priority(how);
1252 struct rpc_task *task; 1307 struct rpc_task *task;
1308 int priority = flush_task_priority(how);
1253 struct rpc_message msg = { 1309 struct rpc_message msg = {
1254 .rpc_argp = &data->args, 1310 .rpc_argp = &data->args,
1255 .rpc_resp = &data->res, 1311 .rpc_resp = &data->res,
1256 .rpc_cred = first->wb_context->cred, 1312 .rpc_cred = data->cred,
1257 }; 1313 };
1258 struct rpc_task_setup task_setup_data = { 1314 struct rpc_task_setup task_setup_data = {
1259 .task = &data->task, 1315 .task = &data->task,
1260 .rpc_client = NFS_CLIENT(inode), 1316 .rpc_client = clnt,
1261 .rpc_message = &msg, 1317 .rpc_message = &msg,
1262 .callback_ops = &nfs_commit_ops, 1318 .callback_ops = call_ops,
1263 .callback_data = data, 1319 .callback_data = data,
1264 .workqueue = nfsiod_workqueue, 1320 .workqueue = nfsiod_workqueue,
1265 .flags = RPC_TASK_ASYNC, 1321 .flags = RPC_TASK_ASYNC,
1266 .priority = priority, 1322 .priority = priority,
1267 }; 1323 };
1324 /* Set up the initial task struct. */
1325 NFS_PROTO(data->inode)->commit_setup(data, &msg);
1326
1327 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
1328
1329 task = rpc_run_task(&task_setup_data);
1330 if (IS_ERR(task))
1331 return PTR_ERR(task);
1332 if (how & FLUSH_SYNC)
1333 rpc_wait_for_completion_task(task);
1334 rpc_put_task(task);
1335 return 0;
1336}
1337EXPORT_SYMBOL_GPL(nfs_initiate_commit);
1338
1339/*
1340 * Set up the argument/result storage required for the RPC call.
1341 */
1342void nfs_init_commit(struct nfs_write_data *data,
1343 struct list_head *head,
1344 struct pnfs_layout_segment *lseg)
1345{
1346 struct nfs_page *first = nfs_list_entry(head->next);
1347 struct inode *inode = first->wb_context->path.dentry->d_inode;
1268 1348
1269 /* Set up the RPC argument and reply structs 1349 /* Set up the RPC argument and reply structs
1270 * NB: take care not to mess about with data->commit et al. */ 1350 * NB: take care not to mess about with data->commit et al. */
@@ -1272,7 +1352,9 @@ static int nfs_commit_rpcsetup(struct list_head *head,
1272 list_splice_init(head, &data->pages); 1352 list_splice_init(head, &data->pages);
1273 1353
1274 data->inode = inode; 1354 data->inode = inode;
1275 data->cred = msg.rpc_cred; 1355 data->cred = first->wb_context->cred;
1356 data->lseg = lseg; /* reference transferred */
1357 data->mds_ops = &nfs_commit_ops;
1276 1358
1277 data->args.fh = NFS_FH(data->inode); 1359 data->args.fh = NFS_FH(data->inode);
1278 /* Note: we always request a commit of the entire inode */ 1360 /* Note: we always request a commit of the entire inode */
@@ -1283,18 +1365,25 @@ static int nfs_commit_rpcsetup(struct list_head *head,
1283 data->res.fattr = &data->fattr; 1365 data->res.fattr = &data->fattr;
1284 data->res.verf = &data->verf; 1366 data->res.verf = &data->verf;
1285 nfs_fattr_init(&data->fattr); 1367 nfs_fattr_init(&data->fattr);
1368}
1369EXPORT_SYMBOL_GPL(nfs_init_commit);
1286 1370
1287 /* Set up the initial task struct. */ 1371void nfs_retry_commit(struct list_head *page_list,
1288 NFS_PROTO(inode)->commit_setup(data, &msg); 1372 struct pnfs_layout_segment *lseg)
1289 1373{
1290 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); 1374 struct nfs_page *req;
1291 1375
1292 task = rpc_run_task(&task_setup_data); 1376 while (!list_empty(page_list)) {
1293 if (IS_ERR(task)) 1377 req = nfs_list_entry(page_list->next);
1294 return PTR_ERR(task); 1378 nfs_list_remove_request(req);
1295 rpc_put_task(task); 1379 nfs_mark_request_commit(req, lseg);
1296 return 0; 1380 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1381 dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
1382 BDI_RECLAIMABLE);
1383 nfs_clear_page_tag_locked(req);
1384 }
1297} 1385}
1386EXPORT_SYMBOL_GPL(nfs_retry_commit);
1298 1387
1299/* 1388/*
1300 * Commit dirty pages 1389 * Commit dirty pages
@@ -1303,7 +1392,6 @@ static int
1303nfs_commit_list(struct inode *inode, struct list_head *head, int how) 1392nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1304{ 1393{
1305 struct nfs_write_data *data; 1394 struct nfs_write_data *data;
1306 struct nfs_page *req;
1307 1395
1308 data = nfs_commitdata_alloc(); 1396 data = nfs_commitdata_alloc();
1309 1397
@@ -1311,17 +1399,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1311 goto out_bad; 1399 goto out_bad;
1312 1400
1313 /* Set up the argument struct */ 1401 /* Set up the argument struct */
1314 return nfs_commit_rpcsetup(head, data, how); 1402 nfs_init_commit(data, head, NULL);
1403 return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how);
1315 out_bad: 1404 out_bad:
1316 while (!list_empty(head)) { 1405 nfs_retry_commit(head, NULL);
1317 req = nfs_list_entry(head->next);
1318 nfs_list_remove_request(req);
1319 nfs_mark_request_commit(req);
1320 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1321 dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
1322 BDI_RECLAIMABLE);
1323 nfs_clear_page_tag_locked(req);
1324 }
1325 nfs_commit_clear_lock(NFS_I(inode)); 1406 nfs_commit_clear_lock(NFS_I(inode));
1326 return -ENOMEM; 1407 return -ENOMEM;
1327} 1408}
@@ -1341,10 +1422,9 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
1341 return; 1422 return;
1342} 1423}
1343 1424
1344static void nfs_commit_release(void *calldata) 1425void nfs_commit_release_pages(struct nfs_write_data *data)
1345{ 1426{
1346 struct nfs_write_data *data = calldata; 1427 struct nfs_page *req;
1347 struct nfs_page *req;
1348 int status = data->task.tk_status; 1428 int status = data->task.tk_status;
1349 1429
1350 while (!list_empty(&data->pages)) { 1430 while (!list_empty(&data->pages)) {
@@ -1378,6 +1458,14 @@ static void nfs_commit_release(void *calldata)
1378 next: 1458 next:
1379 nfs_clear_page_tag_locked(req); 1459 nfs_clear_page_tag_locked(req);
1380 } 1460 }
1461}
1462EXPORT_SYMBOL_GPL(nfs_commit_release_pages);
1463
1464static void nfs_commit_release(void *calldata)
1465{
1466 struct nfs_write_data *data = calldata;
1467
1468 nfs_commit_release_pages(data);
1381 nfs_commit_clear_lock(NFS_I(data->inode)); 1469 nfs_commit_clear_lock(NFS_I(data->inode));
1382 nfs_commitdata_release(calldata); 1470 nfs_commitdata_release(calldata);
1383} 1471}
@@ -1394,23 +1482,28 @@ int nfs_commit_inode(struct inode *inode, int how)
1394{ 1482{
1395 LIST_HEAD(head); 1483 LIST_HEAD(head);
1396 int may_wait = how & FLUSH_SYNC; 1484 int may_wait = how & FLUSH_SYNC;
1397 int res = 0; 1485 int res;
1398 1486
1399 if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) 1487 res = nfs_commit_set_lock(NFS_I(inode), may_wait);
1488 if (res <= 0)
1400 goto out_mark_dirty; 1489 goto out_mark_dirty;
1401 spin_lock(&inode->i_lock);
1402 res = nfs_scan_commit(inode, &head, 0, 0); 1490 res = nfs_scan_commit(inode, &head, 0, 0);
1403 spin_unlock(&inode->i_lock);
1404 if (res) { 1491 if (res) {
1405 int error = nfs_commit_list(inode, &head, how); 1492 int error;
1493
1494 error = pnfs_commit_list(inode, &head, how);
1495 if (error == PNFS_NOT_ATTEMPTED)
1496 error = nfs_commit_list(inode, &head, how);
1406 if (error < 0) 1497 if (error < 0)
1407 return error; 1498 return error;
1408 if (may_wait) 1499 if (!may_wait)
1409 wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
1410 nfs_wait_bit_killable,
1411 TASK_KILLABLE);
1412 else
1413 goto out_mark_dirty; 1500 goto out_mark_dirty;
1501 error = wait_on_bit(&NFS_I(inode)->flags,
1502 NFS_INO_COMMIT,
1503 nfs_wait_bit_killable,
1504 TASK_KILLABLE);
1505 if (error < 0)
1506 return error;
1414 } else 1507 } else
1415 nfs_commit_clear_lock(NFS_I(inode)); 1508 nfs_commit_clear_lock(NFS_I(inode));
1416 return res; 1509 return res;
@@ -1464,7 +1557,22 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
1464 1557
1465int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) 1558int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1466{ 1559{
1467 return nfs_commit_unstable_pages(inode, wbc); 1560 int ret;
1561
1562 ret = nfs_commit_unstable_pages(inode, wbc);
1563 if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) {
1564 int status;
1565 bool sync = true;
1566
1567 if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking ||
1568 wbc->for_background)
1569 sync = false;
1570
1571 status = pnfs_layoutcommit_inode(inode, sync);
1572 if (status < 0)
1573 return status;
1574 }
1575 return ret;
1468} 1576}
1469 1577
1470/* 1578/*
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 84c27d69d421..6940439bd609 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -117,7 +117,6 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
117 * invoked in contexts where a memory allocation failure is 117 * invoked in contexts where a memory allocation failure is
118 * fatal. Fortunately this fake ACL is small enough to 118 * fatal. Fortunately this fake ACL is small enough to
119 * construct on the stack. */ 119 * construct on the stack. */
120 memset(acl2, 0, sizeof(acl2));
121 posix_acl_init(acl2, 4); 120 posix_acl_init(acl2, 4);
122 121
123 /* Insert entries in canonical order: other orders seem 122 /* Insert entries in canonical order: other orders seem
@@ -174,7 +173,7 @@ xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem)
174 return -EINVAL; 173 return -EINVAL;
175 break; 174 break;
176 case ACL_MASK: 175 case ACL_MASK:
177 /* Solaris sometimes sets additonal bits in the mask */ 176 /* Solaris sometimes sets additional bits in the mask */
178 entry->e_perm &= S_IRWXO; 177 entry->e_perm &= S_IRWXO;
179 break; 178 break;
180 default: 179 default:
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index bf9cbd242ddd..124e8fcb0dd6 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -22,30 +22,17 @@
22 22
23static struct file *do_open(char *name, int flags) 23static struct file *do_open(char *name, int flags)
24{ 24{
25 struct nameidata nd;
26 struct vfsmount *mnt; 25 struct vfsmount *mnt;
27 int error; 26 struct file *file;
28 27
29 mnt = do_kern_mount("nfsd", 0, "nfsd", NULL); 28 mnt = do_kern_mount("nfsd", 0, "nfsd", NULL);
30 if (IS_ERR(mnt)) 29 if (IS_ERR(mnt))
31 return (struct file *)mnt; 30 return (struct file *)mnt;
32 31
33 error = vfs_path_lookup(mnt->mnt_root, mnt, name, 0, &nd); 32 file = file_open_root(mnt->mnt_root, mnt, name, flags);
34 mntput(mnt); /* drop do_kern_mount reference */
35 if (error)
36 return ERR_PTR(error);
37
38 if (flags == O_RDWR)
39 error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags);
40 else
41 error = may_open(&nd.path, MAY_WRITE, flags);
42 33
43 if (!error) 34 mntput(mnt); /* drop do_kern_mount reference */
44 return dentry_open(nd.path.dentry, nd.path.mnt, flags, 35 return file;
45 current_cred());
46
47 path_put(&nd.path);
48 return ERR_PTR(error);
49} 36}
50 37
51static struct { 38static struct {
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 8b31e5f8795d..ad000aeb21a2 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -299,7 +299,6 @@ svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)
299 299
300#define EXPORT_HASHBITS 8 300#define EXPORT_HASHBITS 8
301#define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) 301#define EXPORT_HASHMAX (1<< EXPORT_HASHBITS)
302#define EXPORT_HASHMASK (EXPORT_HASHMAX -1)
303 302
304static struct cache_head *export_table[EXPORT_HASHMAX]; 303static struct cache_head *export_table[EXPORT_HASHMAX];
305 304
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 0c6d81670137..7c831a2731fa 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -38,7 +38,6 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
38 exp_readlock(); 38 exp_readlock();
39 nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp); 39 nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp);
40 fh_put(&fh); 40 fh_put(&fh);
41 rqstp->rq_client = NULL;
42 exp_readunlock(); 41 exp_readunlock();
43 /* We return nlm error codes as nlm doesn't know 42 /* We return nlm error codes as nlm doesn't know
44 * about nfsd, but nfsd does know about nlm.. 43 * about nfsd, but nfsd does know about nlm..
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 7e84a852cdae..ad48faca20fc 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -702,7 +702,7 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p,
702 *p++ = htonl(resp->eof); 702 *p++ = htonl(resp->eof);
703 *p++ = htonl(resp->count); /* xdr opaque count */ 703 *p++ = htonl(resp->count); /* xdr opaque count */
704 xdr_ressize_check(rqstp, p); 704 xdr_ressize_check(rqstp, p);
705 /* now update rqstp->rq_res to reflect data aswell */ 705 /* now update rqstp->rq_res to reflect data as well */
706 rqstp->rq_res.page_len = resp->count; 706 rqstp->rq_res.page_len = resp->count;
707 if (resp->count & 3) { 707 if (resp->count & 3) {
708 /* need to pad the tail */ 708 /* need to pad the tail */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 3be975e18919..02eb4edf0ece 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -432,7 +432,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
432 * If the server returns different values for sessionID, slotID or 432 * If the server returns different values for sessionID, slotID or
433 * sequence number, the server is looney tunes. 433 * sequence number, the server is looney tunes.
434 */ 434 */
435 p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4); 435 p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4);
436 if (unlikely(p == NULL)) 436 if (unlikely(p == NULL))
437 goto out_overflow; 437 goto out_overflow;
438 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); 438 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
@@ -484,7 +484,7 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr,
484out: 484out:
485 return status; 485 return status;
486out_default: 486out_default:
487 return nfs_cb_stat_to_errno(status); 487 return nfs_cb_stat_to_errno(nfserr);
488} 488}
489 489
490/* 490/*
@@ -564,11 +564,9 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
564 if (unlikely(status)) 564 if (unlikely(status))
565 goto out; 565 goto out;
566 if (unlikely(nfserr != NFS4_OK)) 566 if (unlikely(nfserr != NFS4_OK))
567 goto out_default; 567 status = nfs_cb_stat_to_errno(nfserr);
568out: 568out:
569 return status; 569 return status;
570out_default:
571 return nfs_cb_stat_to_errno(status);
572} 570}
573 571
574/* 572/*
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 6d2c397d458b..55780a22fdbd 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -63,7 +63,6 @@ struct ent {
63 63
64#define ENT_HASHBITS 8 64#define ENT_HASHBITS 8
65#define ENT_HASHMAX (1 << ENT_HASHBITS) 65#define ENT_HASHMAX (1 << ENT_HASHBITS)
66#define ENT_HASHMASK (ENT_HASHMAX - 1)
67 66
68static void 67static void
69ent_init(struct cache_head *cnew, struct cache_head *citm) 68ent_init(struct cache_head *cnew, struct cache_head *citm)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index db52546143d1..5fcb1396a7e3 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -984,8 +984,8 @@ typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
984 void *); 984 void *);
985enum nfsd4_op_flags { 985enum nfsd4_op_flags {
986 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ 986 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
987 ALLOWED_ON_ABSENT_FS = 2 << 0, /* ops processed on absent fs */ 987 ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */
988 ALLOWED_AS_FIRST_OP = 3 << 0, /* ops reqired first in compound */ 988 ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */
989}; 989};
990 990
991struct nfsd4_operation { 991struct nfsd4_operation {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d98d0213285d..aa309aa93fe8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -148,7 +148,7 @@ static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
148/* hash table for nfs4_file */ 148/* hash table for nfs4_file */
149#define FILE_HASH_BITS 8 149#define FILE_HASH_BITS 8
150#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) 150#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
151#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) 151
152/* hash table for (open)nfs4_stateid */ 152/* hash table for (open)nfs4_stateid */
153#define STATEID_HASH_BITS 10 153#define STATEID_HASH_BITS 10
154#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) 154#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS)
@@ -230,9 +230,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
230 dp->dl_client = clp; 230 dp->dl_client = clp;
231 get_nfs4_file(fp); 231 get_nfs4_file(fp);
232 dp->dl_file = fp; 232 dp->dl_file = fp;
233 dp->dl_vfs_file = find_readable_file(fp);
234 get_file(dp->dl_vfs_file);
235 dp->dl_flock = NULL;
236 dp->dl_type = type; 233 dp->dl_type = type;
237 dp->dl_stateid.si_boot = boot_time; 234 dp->dl_stateid.si_boot = boot_time;
238 dp->dl_stateid.si_stateownerid = current_delegid++; 235 dp->dl_stateid.si_stateownerid = current_delegid++;
@@ -241,8 +238,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
241 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle); 238 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
242 dp->dl_time = 0; 239 dp->dl_time = 0;
243 atomic_set(&dp->dl_count, 1); 240 atomic_set(&dp->dl_count, 1);
244 list_add(&dp->dl_perfile, &fp->fi_delegations);
245 list_add(&dp->dl_perclnt, &clp->cl_delegations);
246 INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc); 241 INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
247 return dp; 242 return dp;
248} 243}
@@ -253,36 +248,30 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
253 if (atomic_dec_and_test(&dp->dl_count)) { 248 if (atomic_dec_and_test(&dp->dl_count)) {
254 dprintk("NFSD: freeing dp %p\n",dp); 249 dprintk("NFSD: freeing dp %p\n",dp);
255 put_nfs4_file(dp->dl_file); 250 put_nfs4_file(dp->dl_file);
256 fput(dp->dl_vfs_file);
257 kmem_cache_free(deleg_slab, dp); 251 kmem_cache_free(deleg_slab, dp);
258 num_delegations--; 252 num_delegations--;
259 } 253 }
260} 254}
261 255
262/* Remove the associated file_lock first, then remove the delegation. 256static void nfs4_put_deleg_lease(struct nfs4_file *fp)
263 * lease_modify() is called to remove the FS_LEASE file_lock from
264 * the i_flock list, eventually calling nfsd's lock_manager
265 * fl_release_callback.
266 */
267static void
268nfs4_close_delegation(struct nfs4_delegation *dp)
269{ 257{
270 dprintk("NFSD: close_delegation dp %p\n",dp); 258 if (atomic_dec_and_test(&fp->fi_delegees)) {
271 /* XXX: do we even need this check?: */ 259 vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease);
272 if (dp->dl_flock) 260 fp->fi_lease = NULL;
273 vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock); 261 fp->fi_deleg_file = NULL;
262 }
274} 263}
275 264
276/* Called under the state lock. */ 265/* Called under the state lock. */
277static void 266static void
278unhash_delegation(struct nfs4_delegation *dp) 267unhash_delegation(struct nfs4_delegation *dp)
279{ 268{
280 list_del_init(&dp->dl_perfile);
281 list_del_init(&dp->dl_perclnt); 269 list_del_init(&dp->dl_perclnt);
282 spin_lock(&recall_lock); 270 spin_lock(&recall_lock);
271 list_del_init(&dp->dl_perfile);
283 list_del_init(&dp->dl_recall_lru); 272 list_del_init(&dp->dl_recall_lru);
284 spin_unlock(&recall_lock); 273 spin_unlock(&recall_lock);
285 nfs4_close_delegation(dp); 274 nfs4_put_deleg_lease(dp->dl_file);
286 nfs4_put_delegation(dp); 275 nfs4_put_delegation(dp);
287} 276}
288 277
@@ -327,64 +316,6 @@ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
327static struct list_head client_lru; 316static struct list_head client_lru;
328static struct list_head close_lru; 317static struct list_head close_lru;
329 318
330static void unhash_generic_stateid(struct nfs4_stateid *stp)
331{
332 list_del(&stp->st_hash);
333 list_del(&stp->st_perfile);
334 list_del(&stp->st_perstateowner);
335}
336
337static void free_generic_stateid(struct nfs4_stateid *stp)
338{
339 put_nfs4_file(stp->st_file);
340 kmem_cache_free(stateid_slab, stp);
341}
342
343static void release_lock_stateid(struct nfs4_stateid *stp)
344{
345 struct file *file;
346
347 unhash_generic_stateid(stp);
348 file = find_any_file(stp->st_file);
349 if (file)
350 locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
351 free_generic_stateid(stp);
352}
353
354static void unhash_lockowner(struct nfs4_stateowner *sop)
355{
356 struct nfs4_stateid *stp;
357
358 list_del(&sop->so_idhash);
359 list_del(&sop->so_strhash);
360 list_del(&sop->so_perstateid);
361 while (!list_empty(&sop->so_stateids)) {
362 stp = list_first_entry(&sop->so_stateids,
363 struct nfs4_stateid, st_perstateowner);
364 release_lock_stateid(stp);
365 }
366}
367
368static void release_lockowner(struct nfs4_stateowner *sop)
369{
370 unhash_lockowner(sop);
371 nfs4_put_stateowner(sop);
372}
373
374static void
375release_stateid_lockowners(struct nfs4_stateid *open_stp)
376{
377 struct nfs4_stateowner *lock_sop;
378
379 while (!list_empty(&open_stp->st_lockowners)) {
380 lock_sop = list_entry(open_stp->st_lockowners.next,
381 struct nfs4_stateowner, so_perstateid);
382 /* list_del(&open_stp->st_lockowners); */
383 BUG_ON(lock_sop->so_is_open_owner);
384 release_lockowner(lock_sop);
385 }
386}
387
388/* 319/*
389 * We store the NONE, READ, WRITE, and BOTH bits separately in the 320 * We store the NONE, READ, WRITE, and BOTH bits separately in the
390 * st_{access,deny}_bmap field of the stateid, in order to track not 321 * st_{access,deny}_bmap field of the stateid, in order to track not
@@ -457,13 +388,74 @@ static int nfs4_access_bmap_to_omode(struct nfs4_stateid *stp)
457 return nfs4_access_to_omode(access); 388 return nfs4_access_to_omode(access);
458} 389}
459 390
460static void release_open_stateid(struct nfs4_stateid *stp) 391static void unhash_generic_stateid(struct nfs4_stateid *stp)
392{
393 list_del(&stp->st_hash);
394 list_del(&stp->st_perfile);
395 list_del(&stp->st_perstateowner);
396}
397
398static void free_generic_stateid(struct nfs4_stateid *stp)
399{
400 int oflag;
401
402 if (stp->st_access_bmap) {
403 oflag = nfs4_access_bmap_to_omode(stp);
404 nfs4_file_put_access(stp->st_file, oflag);
405 put_nfs4_file(stp->st_file);
406 }
407 kmem_cache_free(stateid_slab, stp);
408}
409
410static void release_lock_stateid(struct nfs4_stateid *stp)
411{
412 struct file *file;
413
414 unhash_generic_stateid(stp);
415 file = find_any_file(stp->st_file);
416 if (file)
417 locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
418 free_generic_stateid(stp);
419}
420
421static void unhash_lockowner(struct nfs4_stateowner *sop)
422{
423 struct nfs4_stateid *stp;
424
425 list_del(&sop->so_idhash);
426 list_del(&sop->so_strhash);
427 list_del(&sop->so_perstateid);
428 while (!list_empty(&sop->so_stateids)) {
429 stp = list_first_entry(&sop->so_stateids,
430 struct nfs4_stateid, st_perstateowner);
431 release_lock_stateid(stp);
432 }
433}
434
435static void release_lockowner(struct nfs4_stateowner *sop)
436{
437 unhash_lockowner(sop);
438 nfs4_put_stateowner(sop);
439}
440
441static void
442release_stateid_lockowners(struct nfs4_stateid *open_stp)
461{ 443{
462 int oflag = nfs4_access_bmap_to_omode(stp); 444 struct nfs4_stateowner *lock_sop;
463 445
446 while (!list_empty(&open_stp->st_lockowners)) {
447 lock_sop = list_entry(open_stp->st_lockowners.next,
448 struct nfs4_stateowner, so_perstateid);
449 /* list_del(&open_stp->st_lockowners); */
450 BUG_ON(lock_sop->so_is_open_owner);
451 release_lockowner(lock_sop);
452 }
453}
454
455static void release_open_stateid(struct nfs4_stateid *stp)
456{
464 unhash_generic_stateid(stp); 457 unhash_generic_stateid(stp);
465 release_stateid_lockowners(stp); 458 release_stateid_lockowners(stp);
466 nfs4_file_put_access(stp->st_file, oflag);
467 free_generic_stateid(stp); 459 free_generic_stateid(stp);
468} 460}
469 461
@@ -619,7 +611,8 @@ static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4
619 u32 maxrpc = nfsd_serv->sv_max_mesg; 611 u32 maxrpc = nfsd_serv->sv_max_mesg;
620 612
621 new->maxreqs = numslots; 613 new->maxreqs = numslots;
622 new->maxresp_cached = slotsize + NFSD_MIN_HDR_SEQ_SZ; 614 new->maxresp_cached = min_t(u32, req->maxresp_cached,
615 slotsize + NFSD_MIN_HDR_SEQ_SZ);
623 new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc); 616 new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc);
624 new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc); 617 new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc);
625 new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND); 618 new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
@@ -958,8 +951,6 @@ expire_client(struct nfs4_client *clp)
958 spin_lock(&recall_lock); 951 spin_lock(&recall_lock);
959 while (!list_empty(&clp->cl_delegations)) { 952 while (!list_empty(&clp->cl_delegations)) {
960 dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); 953 dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
961 dprintk("NFSD: expire client. dp %p, fp %p\n", dp,
962 dp->dl_flock);
963 list_del_init(&dp->dl_perclnt); 954 list_del_init(&dp->dl_perclnt);
964 list_move(&dp->dl_recall_lru, &reaplist); 955 list_move(&dp->dl_recall_lru, &reaplist);
965 } 956 }
@@ -2078,6 +2069,7 @@ alloc_init_file(struct inode *ino)
2078 fp->fi_inode = igrab(ino); 2069 fp->fi_inode = igrab(ino);
2079 fp->fi_id = current_fileid++; 2070 fp->fi_id = current_fileid++;
2080 fp->fi_had_conflict = false; 2071 fp->fi_had_conflict = false;
2072 fp->fi_lease = NULL;
2081 memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); 2073 memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
2082 memset(fp->fi_access, 0, sizeof(fp->fi_access)); 2074 memset(fp->fi_access, 0, sizeof(fp->fi_access));
2083 spin_lock(&recall_lock); 2075 spin_lock(&recall_lock);
@@ -2329,23 +2321,8 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
2329 nfs4_file_put_access(fp, O_RDONLY); 2321 nfs4_file_put_access(fp, O_RDONLY);
2330} 2322}
2331 2323
2332/* 2324static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
2333 * Spawn a thread to perform a recall on the delegation represented
2334 * by the lease (file_lock)
2335 *
2336 * Called from break_lease() with lock_flocks() held.
2337 * Note: we assume break_lease will only call this *once* for any given
2338 * lease.
2339 */
2340static
2341void nfsd_break_deleg_cb(struct file_lock *fl)
2342{ 2325{
2343 struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
2344
2345 dprintk("NFSD nfsd_break_deleg_cb: dp %p fl %p\n",dp,fl);
2346 if (!dp)
2347 return;
2348
2349 /* We're assuming the state code never drops its reference 2326 /* We're assuming the state code never drops its reference
2350 * without first removing the lease. Since we're in this lease 2327 * without first removing the lease. Since we're in this lease
2351 * callback (and since the lease code is serialized by the kernel 2328 * callback (and since the lease code is serialized by the kernel
@@ -2353,22 +2330,35 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2353 * it's safe to take a reference: */ 2330 * it's safe to take a reference: */
2354 atomic_inc(&dp->dl_count); 2331 atomic_inc(&dp->dl_count);
2355 2332
2356 spin_lock(&recall_lock);
2357 list_add_tail(&dp->dl_recall_lru, &del_recall_lru); 2333 list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
2358 spin_unlock(&recall_lock);
2359 2334
2360 /* only place dl_time is set. protected by lock_flocks*/ 2335 /* only place dl_time is set. protected by lock_flocks*/
2361 dp->dl_time = get_seconds(); 2336 dp->dl_time = get_seconds();
2362 2337
2338 nfsd4_cb_recall(dp);
2339}
2340
2341/* Called from break_lease() with lock_flocks() held. */
2342static void nfsd_break_deleg_cb(struct file_lock *fl)
2343{
2344 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
2345 struct nfs4_delegation *dp;
2346
2347 BUG_ON(!fp);
2348 /* We assume break_lease is only called once per lease: */
2349 BUG_ON(fp->fi_had_conflict);
2363 /* 2350 /*
2364 * We don't want the locks code to timeout the lease for us; 2351 * We don't want the locks code to timeout the lease for us;
2365 * we'll remove it ourself if the delegation isn't returned 2352 * we'll remove it ourself if a delegation isn't returned
2366 * in time. 2353 * in time:
2367 */ 2354 */
2368 fl->fl_break_time = 0; 2355 fl->fl_break_time = 0;
2369 2356
2370 dp->dl_file->fi_had_conflict = true; 2357 spin_lock(&recall_lock);
2371 nfsd4_cb_recall(dp); 2358 fp->fi_had_conflict = true;
2359 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
2360 nfsd_break_one_deleg(dp);
2361 spin_unlock(&recall_lock);
2372} 2362}
2373 2363
2374static 2364static
@@ -2461,10 +2451,13 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
2461{ 2451{
2462 struct nfs4_delegation *dp; 2452 struct nfs4_delegation *dp;
2463 2453
2464 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) { 2454 spin_lock(&recall_lock);
2465 if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) 2455 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
2456 if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) {
2457 spin_unlock(&recall_lock);
2466 return dp; 2458 return dp;
2467 } 2459 }
2460 spin_unlock(&recall_lock);
2468 return NULL; 2461 return NULL;
2469} 2462}
2470 2463
@@ -2641,6 +2634,66 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
2641 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; 2634 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
2642} 2635}
2643 2636
2637static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag)
2638{
2639 struct file_lock *fl;
2640
2641 fl = locks_alloc_lock();
2642 if (!fl)
2643 return NULL;
2644 locks_init_lock(fl);
2645 fl->fl_lmops = &nfsd_lease_mng_ops;
2646 fl->fl_flags = FL_LEASE;
2647 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
2648 fl->fl_end = OFFSET_MAX;
2649 fl->fl_owner = (fl_owner_t)(dp->dl_file);
2650 fl->fl_pid = current->tgid;
2651 return fl;
2652}
2653
2654static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
2655{
2656 struct nfs4_file *fp = dp->dl_file;
2657 struct file_lock *fl;
2658 int status;
2659
2660 fl = nfs4_alloc_init_lease(dp, flag);
2661 if (!fl)
2662 return -ENOMEM;
2663 fl->fl_file = find_readable_file(fp);
2664 list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
2665 status = vfs_setlease(fl->fl_file, fl->fl_type, &fl);
2666 if (status) {
2667 list_del_init(&dp->dl_perclnt);
2668 locks_free_lock(fl);
2669 return -ENOMEM;
2670 }
2671 fp->fi_lease = fl;
2672 fp->fi_deleg_file = fl->fl_file;
2673 get_file(fp->fi_deleg_file);
2674 atomic_set(&fp->fi_delegees, 1);
2675 list_add(&dp->dl_perfile, &fp->fi_delegations);
2676 return 0;
2677}
2678
2679static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
2680{
2681 struct nfs4_file *fp = dp->dl_file;
2682
2683 if (!fp->fi_lease)
2684 return nfs4_setlease(dp, flag);
2685 spin_lock(&recall_lock);
2686 if (fp->fi_had_conflict) {
2687 spin_unlock(&recall_lock);
2688 return -EAGAIN;
2689 }
2690 atomic_inc(&fp->fi_delegees);
2691 list_add(&dp->dl_perfile, &fp->fi_delegations);
2692 spin_unlock(&recall_lock);
2693 list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
2694 return 0;
2695}
2696
2644/* 2697/*
2645 * Attempt to hand out a delegation. 2698 * Attempt to hand out a delegation.
2646 */ 2699 */
@@ -2650,7 +2703,6 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2650 struct nfs4_delegation *dp; 2703 struct nfs4_delegation *dp;
2651 struct nfs4_stateowner *sop = stp->st_stateowner; 2704 struct nfs4_stateowner *sop = stp->st_stateowner;
2652 int cb_up; 2705 int cb_up;
2653 struct file_lock *fl;
2654 int status, flag = 0; 2706 int status, flag = 0;
2655 2707
2656 cb_up = nfsd4_cb_channel_good(sop->so_client); 2708 cb_up = nfsd4_cb_channel_good(sop->so_client);
@@ -2681,36 +2733,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2681 } 2733 }
2682 2734
2683 dp = alloc_init_deleg(sop->so_client, stp, fh, flag); 2735 dp = alloc_init_deleg(sop->so_client, stp, fh, flag);
2684 if (dp == NULL) { 2736 if (dp == NULL)
2685 flag = NFS4_OPEN_DELEGATE_NONE; 2737 goto out_no_deleg;
2686 goto out; 2738 status = nfs4_set_delegation(dp, flag);
2687 } 2739 if (status)
2688 status = -ENOMEM; 2740 goto out_free;
2689 fl = locks_alloc_lock();
2690 if (!fl)
2691 goto out;
2692 locks_init_lock(fl);
2693 fl->fl_lmops = &nfsd_lease_mng_ops;
2694 fl->fl_flags = FL_LEASE;
2695 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
2696 fl->fl_end = OFFSET_MAX;
2697 fl->fl_owner = (fl_owner_t)dp;
2698 fl->fl_file = find_readable_file(stp->st_file);
2699 BUG_ON(!fl->fl_file);
2700 fl->fl_pid = current->tgid;
2701 dp->dl_flock = fl;
2702
2703 /* vfs_setlease checks to see if delegation should be handed out.
2704 * the lock_manager callback fl_change is used
2705 */
2706 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
2707 dprintk("NFSD: setlease failed [%d], no delegation\n", status);
2708 dp->dl_flock = NULL;
2709 locks_free_lock(fl);
2710 unhash_delegation(dp);
2711 flag = NFS4_OPEN_DELEGATE_NONE;
2712 goto out;
2713 }
2714 2741
2715 memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); 2742 memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid));
2716 2743
@@ -2722,6 +2749,12 @@ out:
2722 && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) 2749 && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE)
2723 dprintk("NFSD: WARNING: refusing delegation reclaim\n"); 2750 dprintk("NFSD: WARNING: refusing delegation reclaim\n");
2724 open->op_delegate_type = flag; 2751 open->op_delegate_type = flag;
2752 return;
2753out_free:
2754 nfs4_put_delegation(dp);
2755out_no_deleg:
2756 flag = NFS4_OPEN_DELEGATE_NONE;
2757 goto out;
2725} 2758}
2726 2759
2727/* 2760/*
@@ -2916,8 +2949,6 @@ nfs4_laundromat(void)
2916 test_val = u; 2949 test_val = u;
2917 break; 2950 break;
2918 } 2951 }
2919 dprintk("NFSD: purging unused delegation dp %p, fp %p\n",
2920 dp, dp->dl_flock);
2921 list_move(&dp->dl_recall_lru, &reaplist); 2952 list_move(&dp->dl_recall_lru, &reaplist);
2922 } 2953 }
2923 spin_unlock(&recall_lock); 2954 spin_unlock(&recall_lock);
@@ -3027,7 +3058,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
3027 if (ONE_STATEID(stateid) && (flags & RD_STATE)) 3058 if (ONE_STATEID(stateid) && (flags & RD_STATE))
3028 return nfs_ok; 3059 return nfs_ok;
3029 else if (locks_in_grace()) { 3060 else if (locks_in_grace()) {
3030 /* Answer in remaining cases depends on existance of 3061 /* Answer in remaining cases depends on existence of
3031 * conflicting state; so we must wait out the grace period. */ 3062 * conflicting state; so we must wait out the grace period. */
3032 return nfserr_grace; 3063 return nfserr_grace;
3033 } else if (flags & WR_STATE) 3064 } else if (flags & WR_STATE)
@@ -3128,7 +3159,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
3128 goto out; 3159 goto out;
3129 renew_client(dp->dl_client); 3160 renew_client(dp->dl_client);
3130 if (filpp) { 3161 if (filpp) {
3131 *filpp = find_readable_file(dp->dl_file); 3162 *filpp = dp->dl_file->fi_deleg_file;
3132 BUG_ON(!*filpp); 3163 BUG_ON(!*filpp);
3133 } 3164 }
3134 } else { /* open or lock stateid */ 3165 } else { /* open or lock stateid */
@@ -3647,7 +3678,7 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid,
3647/* 3678/*
3648 * Alloc a lock owner structure. 3679 * Alloc a lock owner structure.
3649 * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has 3680 * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has
3650 * occured. 3681 * occurred.
3651 * 3682 *
3652 * strhashval = lock_ownerstr_hashval 3683 * strhashval = lock_ownerstr_hashval
3653 */ 3684 */
@@ -3708,6 +3739,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
3708 stp->st_stateid.si_stateownerid = sop->so_id; 3739 stp->st_stateid.si_stateownerid = sop->so_id;
3709 stp->st_stateid.si_fileid = fp->fi_id; 3740 stp->st_stateid.si_fileid = fp->fi_id;
3710 stp->st_stateid.si_generation = 0; 3741 stp->st_stateid.si_generation = 0;
3742 stp->st_access_bmap = 0;
3711 stp->st_deny_bmap = open_stp->st_deny_bmap; 3743 stp->st_deny_bmap = open_stp->st_deny_bmap;
3712 stp->st_openstp = open_stp; 3744 stp->st_openstp = open_stp;
3713 3745
@@ -3722,6 +3754,17 @@ check_lock_length(u64 offset, u64 length)
3722 LOFF_OVERFLOW(offset, length))); 3754 LOFF_OVERFLOW(offset, length)));
3723} 3755}
3724 3756
3757static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access)
3758{
3759 struct nfs4_file *fp = lock_stp->st_file;
3760 int oflag = nfs4_access_to_omode(access);
3761
3762 if (test_bit(access, &lock_stp->st_access_bmap))
3763 return;
3764 nfs4_file_get_access(fp, oflag);
3765 __set_bit(access, &lock_stp->st_access_bmap);
3766}
3767
3725/* 3768/*
3726 * LOCK operation 3769 * LOCK operation
3727 */ 3770 */
@@ -3738,7 +3781,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3738 struct file_lock conflock; 3781 struct file_lock conflock;
3739 __be32 status = 0; 3782 __be32 status = 0;
3740 unsigned int strhashval; 3783 unsigned int strhashval;
3741 unsigned int cmd;
3742 int err; 3784 int err;
3743 3785
3744 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", 3786 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
@@ -3820,22 +3862,18 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3820 switch (lock->lk_type) { 3862 switch (lock->lk_type) {
3821 case NFS4_READ_LT: 3863 case NFS4_READ_LT:
3822 case NFS4_READW_LT: 3864 case NFS4_READW_LT:
3823 if (find_readable_file(lock_stp->st_file)) { 3865 filp = find_readable_file(lock_stp->st_file);
3824 nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_READ); 3866 if (filp)
3825 filp = find_readable_file(lock_stp->st_file); 3867 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
3826 }
3827 file_lock.fl_type = F_RDLCK; 3868 file_lock.fl_type = F_RDLCK;
3828 cmd = F_SETLK; 3869 break;
3829 break;
3830 case NFS4_WRITE_LT: 3870 case NFS4_WRITE_LT:
3831 case NFS4_WRITEW_LT: 3871 case NFS4_WRITEW_LT:
3832 if (find_writeable_file(lock_stp->st_file)) { 3872 filp = find_writeable_file(lock_stp->st_file);
3833 nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_WRITE); 3873 if (filp)
3834 filp = find_writeable_file(lock_stp->st_file); 3874 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
3835 }
3836 file_lock.fl_type = F_WRLCK; 3875 file_lock.fl_type = F_WRLCK;
3837 cmd = F_SETLK; 3876 break;
3838 break;
3839 default: 3877 default:
3840 status = nfserr_inval; 3878 status = nfserr_inval;
3841 goto out; 3879 goto out;
@@ -3859,7 +3897,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3859 * Note: locks.c uses the BKL to protect the inode's lock list. 3897 * Note: locks.c uses the BKL to protect the inode's lock list.
3860 */ 3898 */
3861 3899
3862 err = vfs_lock_file(filp, cmd, &file_lock, &conflock); 3900 err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock);
3863 switch (-err) { 3901 switch (-err) {
3864 case 0: /* success! */ 3902 case 0: /* success! */
3865 update_stateid(&lock_stp->st_stateid); 3903 update_stateid(&lock_stp->st_stateid);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 956629b9cdc9..c6766af00d98 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -317,8 +317,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
317 READ_BUF(dummy32); 317 READ_BUF(dummy32);
318 len += (XDR_QUADLEN(dummy32) << 2); 318 len += (XDR_QUADLEN(dummy32) << 2);
319 READMEM(buf, dummy32); 319 READMEM(buf, dummy32);
320 if ((host_err = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) 320 if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
321 goto out_nfserr; 321 return status;
322 iattr->ia_valid |= ATTR_UID; 322 iattr->ia_valid |= ATTR_UID;
323 } 323 }
324 if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) { 324 if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) {
@@ -328,8 +328,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
328 READ_BUF(dummy32); 328 READ_BUF(dummy32);
329 len += (XDR_QUADLEN(dummy32) << 2); 329 len += (XDR_QUADLEN(dummy32) << 2);
330 READMEM(buf, dummy32); 330 READMEM(buf, dummy32);
331 if ((host_err = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) 331 if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
332 goto out_nfserr; 332 return status;
333 iattr->ia_valid |= ATTR_GID; 333 iattr->ia_valid |= ATTR_GID;
334 } 334 }
335 if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { 335 if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
@@ -1215,8 +1215,6 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1215 READ_BUF(4); 1215 READ_BUF(4);
1216 READ32(dummy); 1216 READ32(dummy);
1217 READ_BUF(dummy * 4); 1217 READ_BUF(dummy * 4);
1218 for (i = 0; i < dummy; ++i)
1219 READ32(dummy);
1220 break; 1218 break;
1221 case RPC_AUTH_GSS: 1219 case RPC_AUTH_GSS:
1222 dprintk("RPC_AUTH_GSS callback secflavor " 1220 dprintk("RPC_AUTH_GSS callback secflavor "
@@ -1232,7 +1230,6 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1232 READ_BUF(4); 1230 READ_BUF(4);
1233 READ32(dummy); 1231 READ32(dummy);
1234 READ_BUF(dummy); 1232 READ_BUF(dummy);
1235 p += XDR_QUADLEN(dummy);
1236 break; 1233 break;
1237 default: 1234 default:
1238 dprintk("Illegal callback secflavor\n"); 1235 dprintk("Illegal callback secflavor\n");
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 33b3e2b06779..1f5eae40f34e 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -12,13 +12,14 @@
12#include <linux/nfsd/syscall.h> 12#include <linux/nfsd/syscall.h>
13#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
14#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/gss_api.h>
15 16
16#include "idmap.h" 17#include "idmap.h"
17#include "nfsd.h" 18#include "nfsd.h"
18#include "cache.h" 19#include "cache.h"
19 20
20/* 21/*
21 * We have a single directory with 9 nodes in it. 22 * We have a single directory with several nodes in it.
22 */ 23 */
23enum { 24enum {
24 NFSD_Root = 1, 25 NFSD_Root = 1,
@@ -42,6 +43,7 @@ enum {
42 NFSD_Versions, 43 NFSD_Versions,
43 NFSD_Ports, 44 NFSD_Ports,
44 NFSD_MaxBlkSize, 45 NFSD_MaxBlkSize,
46 NFSD_SupportedEnctypes,
45 /* 47 /*
46 * The below MUST come last. Otherwise we leave a hole in nfsd_files[] 48 * The below MUST come last. Otherwise we leave a hole in nfsd_files[]
47 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops 49 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
@@ -187,6 +189,34 @@ static struct file_operations export_features_operations = {
187 .release = single_release, 189 .release = single_release,
188}; 190};
189 191
192#ifdef CONFIG_SUNRPC_GSS
193static int supported_enctypes_show(struct seq_file *m, void *v)
194{
195 struct gss_api_mech *k5mech;
196
197 k5mech = gss_mech_get_by_name("krb5");
198 if (k5mech == NULL)
199 goto out;
200 if (k5mech->gm_upcall_enctypes != NULL)
201 seq_printf(m, k5mech->gm_upcall_enctypes);
202 gss_mech_put(k5mech);
203out:
204 return 0;
205}
206
207static int supported_enctypes_open(struct inode *inode, struct file *file)
208{
209 return single_open(file, supported_enctypes_show, NULL);
210}
211
212static struct file_operations supported_enctypes_ops = {
213 .open = supported_enctypes_open,
214 .read = seq_read,
215 .llseek = seq_lseek,
216 .release = single_release,
217};
218#endif /* CONFIG_SUNRPC_GSS */
219
190extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); 220extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
191extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); 221extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
192 222
@@ -1397,6 +1427,9 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1397 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, 1427 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
1398 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, 1428 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
1399 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1429 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
1430#ifdef CONFIG_SUNRPC_GSS
1431 [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
1432#endif /* CONFIG_SUNRPC_GSS */
1400#ifdef CONFIG_NFSD_V4 1433#ifdef CONFIG_NFSD_V4
1401 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1434 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
1402 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1435 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 4ce005dbf3e6..65ec595e2226 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -451,7 +451,7 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p,
451 *p++ = htonl(resp->count); 451 *p++ = htonl(resp->count);
452 xdr_ressize_check(rqstp, p); 452 xdr_ressize_check(rqstp, p);
453 453
454 /* now update rqstp->rq_res to reflect data aswell */ 454 /* now update rqstp->rq_res to reflect data as well */
455 rqstp->rq_res.page_len = resp->count; 455 rqstp->rq_res.page_len = resp->count;
456 if (resp->count & 3) { 456 if (resp->count & 3) {
457 /* need to pad the tail */ 457 /* need to pad the tail */
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 3074656ba7bf..6bd2f3c21f2b 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -83,8 +83,6 @@ struct nfs4_delegation {
83 atomic_t dl_count; /* ref count */ 83 atomic_t dl_count; /* ref count */
84 struct nfs4_client *dl_client; 84 struct nfs4_client *dl_client;
85 struct nfs4_file *dl_file; 85 struct nfs4_file *dl_file;
86 struct file *dl_vfs_file;
87 struct file_lock *dl_flock;
88 u32 dl_type; 86 u32 dl_type;
89 time_t dl_time; 87 time_t dl_time;
90/* For recall: */ 88/* For recall: */
@@ -369,16 +367,15 @@ struct nfs4_file {
369 struct list_head fi_delegations; 367 struct list_head fi_delegations;
370 /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ 368 /* One each for O_RDONLY, O_WRONLY, O_RDWR: */
371 struct file * fi_fds[3]; 369 struct file * fi_fds[3];
372 /* One each for O_RDONLY, O_WRONLY: */
373 atomic_t fi_access[2];
374 /* 370 /*
375 * Each open stateid contributes 1 to either fi_readers or 371 * Each open or lock stateid contributes 1 to either
376 * fi_writers, or both, depending on the open mode. A 372 * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending
377 * delegation also takes an fi_readers reference. Lock 373 * on open or lock mode:
378 * stateid's take none.
379 */ 374 */
380 atomic_t fi_readers; 375 atomic_t fi_access[2];
381 atomic_t fi_writers; 376 struct file *fi_deleg_file;
377 struct file_lock *fi_lease;
378 atomic_t fi_delegees;
382 struct inode *fi_inode; 379 struct inode *fi_inode;
383 u32 fi_id; /* used with stateowner->so_id 380 u32 fi_id; /* used with stateowner->so_id
384 * for stateid_hashtbl hash */ 381 * for stateid_hashtbl hash */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 641117f2188d..2e1cebde90df 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -87,7 +87,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
87 .dentry = dget(dentry)}; 87 .dentry = dget(dentry)};
88 int err = 0; 88 int err = 0;
89 89
90 err = follow_down(&path, false); 90 err = follow_down(&path);
91 if (err < 0) 91 if (err < 0)
92 goto out; 92 goto out;
93 93
@@ -808,7 +808,7 @@ nfsd_get_raparms(dev_t dev, ino_t ino)
808 if (ra->p_count == 0) 808 if (ra->p_count == 0)
809 frap = rap; 809 frap = rap;
810 } 810 }
811 depth = nfsdstats.ra_size*11/10; 811 depth = nfsdstats.ra_size;
812 if (!frap) { 812 if (!frap) {
813 spin_unlock(&rab->pb_lock); 813 spin_unlock(&rab->pb_lock);
814 return NULL; 814 return NULL;
@@ -1744,6 +1744,11 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1744 host_err = nfsd_break_lease(odentry->d_inode); 1744 host_err = nfsd_break_lease(odentry->d_inode);
1745 if (host_err) 1745 if (host_err)
1746 goto out_drop_write; 1746 goto out_drop_write;
1747 if (ndentry->d_inode) {
1748 host_err = nfsd_break_lease(ndentry->d_inode);
1749 if (host_err)
1750 goto out_drop_write;
1751 }
1747 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1752 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1748 if (!host_err) { 1753 if (!host_err) {
1749 host_err = commit_metadata(tfhp); 1754 host_err = commit_metadata(tfhp);
@@ -1812,22 +1817,22 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1812 1817
1813 host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); 1818 host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
1814 if (host_err) 1819 if (host_err)
1815 goto out_nfserr; 1820 goto out_put;
1816 1821
1817 host_err = nfsd_break_lease(rdentry->d_inode); 1822 host_err = nfsd_break_lease(rdentry->d_inode);
1818 if (host_err) 1823 if (host_err)
1819 goto out_put; 1824 goto out_drop_write;
1820 if (type != S_IFDIR) 1825 if (type != S_IFDIR)
1821 host_err = vfs_unlink(dirp, rdentry); 1826 host_err = vfs_unlink(dirp, rdentry);
1822 else 1827 else
1823 host_err = vfs_rmdir(dirp, rdentry); 1828 host_err = vfs_rmdir(dirp, rdentry);
1824out_put:
1825 dput(rdentry);
1826
1827 if (!host_err) 1829 if (!host_err)
1828 host_err = commit_metadata(fhp); 1830 host_err = commit_metadata(fhp);
1829 1831out_drop_write:
1830 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1832 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1833out_put:
1834 dput(rdentry);
1835
1831out_nfserr: 1836out_nfserr:
1832 err = nfserrno(host_err); 1837 err = nfserrno(host_err);
1833out: 1838out:
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index d7fd696e595c..0a0a66d98cce 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -521,8 +521,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
521 group_offset, bitmap)) 521 group_offset, bitmap))
522 printk(KERN_WARNING "%s: entry number %llu already freed\n", 522 printk(KERN_WARNING "%s: entry number %llu already freed\n",
523 __func__, (unsigned long long)req->pr_entry_nr); 523 __func__, (unsigned long long)req->pr_entry_nr);
524 524 else
525 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); 525 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
526 526
527 kunmap(req->pr_bitmap_bh->b_page); 527 kunmap(req->pr_bitmap_bh->b_page);
528 kunmap(req->pr_desc_bh->b_page); 528 kunmap(req->pr_desc_bh->b_page);
@@ -558,8 +558,8 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
558 group_offset, bitmap)) 558 group_offset, bitmap))
559 printk(KERN_WARNING "%s: entry number %llu already freed\n", 559 printk(KERN_WARNING "%s: entry number %llu already freed\n",
560 __func__, (unsigned long long)req->pr_entry_nr); 560 __func__, (unsigned long long)req->pr_entry_nr);
561 561 else
562 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); 562 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
563 563
564 kunmap(req->pr_bitmap_bh->b_page); 564 kunmap(req->pr_bitmap_bh->b_page);
565 kunmap(req->pr_desc_bh->b_page); 565 kunmap(req->pr_desc_bh->b_page);
@@ -665,7 +665,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
665 for (j = i, n = 0; 665 for (j = i, n = 0;
666 (j < nitems) && nilfs_palloc_group_is_in(inode, group, 666 (j < nitems) && nilfs_palloc_group_is_in(inode, group,
667 entry_nrs[j]); 667 entry_nrs[j]);
668 j++, n++) { 668 j++) {
669 nilfs_palloc_group(inode, entry_nrs[j], &group_offset); 669 nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
670 if (!nilfs_clear_bit_atomic( 670 if (!nilfs_clear_bit_atomic(
671 nilfs_mdt_bgl_lock(inode, group), 671 nilfs_mdt_bgl_lock(inode, group),
@@ -674,6 +674,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
674 "%s: entry number %llu already freed\n", 674 "%s: entry number %llu already freed\n",
675 __func__, 675 __func__,
676 (unsigned long long)entry_nrs[j]); 676 (unsigned long long)entry_nrs[j]);
677 } else {
678 n++;
677 } 679 }
678 } 680 }
679 nilfs_palloc_group_desc_add_entries(inode, group, desc, n); 681 nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 9af34a7e6e13..f5fde36b9e28 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -74,7 +74,7 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
74 74
75#define nilfs_set_bit_atomic ext2_set_bit_atomic 75#define nilfs_set_bit_atomic ext2_set_bit_atomic
76#define nilfs_clear_bit_atomic ext2_clear_bit_atomic 76#define nilfs_clear_bit_atomic ext2_clear_bit_atomic
77#define nilfs_find_next_zero_bit ext2_find_next_zero_bit 77#define nilfs_find_next_zero_bit find_next_zero_bit_le
78 78
79/* 79/*
80 * persistent object allocator cache 80 * persistent object allocator cache
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 3ee67c67cc52..4723f04e9b12 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -25,7 +25,6 @@
25#include <linux/errno.h> 25#include <linux/errno.h>
26#include "nilfs.h" 26#include "nilfs.h"
27#include "bmap.h" 27#include "bmap.h"
28#include "sb.h"
29#include "btree.h" 28#include "btree.h"
30#include "direct.h" 29#include "direct.h"
31#include "btnode.h" 30#include "btnode.h"
@@ -425,17 +424,6 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
425/* 424/*
426 * Internal use only 425 * Internal use only
427 */ 426 */
428
429void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
430{
431 inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
432}
433
434void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
435{
436 inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
437}
438
439__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, 427__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
440 const struct buffer_head *bh) 428 const struct buffer_head *bh)
441{ 429{
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index bde1c0aa2e15..40d9f453d31c 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -240,9 +240,6 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
240__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); 240__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
241__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); 241__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
242 242
243void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
244void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
245
246 243
247/* Assume that bmap semaphore is locked. */ 244/* Assume that bmap semaphore is locked. */
248static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap) 245static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 388e9e8f5286..609cd223eea8 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -34,20 +34,10 @@
34#include "page.h" 34#include "page.h"
35#include "btnode.h" 35#include "btnode.h"
36 36
37
38void nilfs_btnode_cache_init_once(struct address_space *btnc)
39{
40 nilfs_mapping_init_once(btnc);
41}
42
43static const struct address_space_operations def_btnode_aops = {
44 .sync_page = block_sync_page,
45};
46
47void nilfs_btnode_cache_init(struct address_space *btnc, 37void nilfs_btnode_cache_init(struct address_space *btnc,
48 struct backing_dev_info *bdi) 38 struct backing_dev_info *bdi)
49{ 39{
50 nilfs_mapping_init(btnc, bdi, &def_btnode_aops); 40 nilfs_mapping_init(btnc, bdi);
51} 41}
52 42
53void nilfs_btnode_cache_clear(struct address_space *btnc) 43void nilfs_btnode_cache_clear(struct address_space *btnc)
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 79037494f1e0..1b8ebd888c28 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt {
37 struct buffer_head *newbh; 37 struct buffer_head *newbh;
38}; 38};
39 39
40void nilfs_btnode_cache_init_once(struct address_space *);
41void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); 40void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
42void nilfs_btnode_cache_clear(struct address_space *); 41void nilfs_btnode_cache_clear(struct address_space *);
43struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, 42struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 300c2bc00c3f..d451ae0e0bf3 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1174,7 +1174,7 @@ static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr)
1174 if (ret < 0) 1174 if (ret < 0)
1175 goto out; 1175 goto out;
1176 nilfs_btree_commit_insert(btree, path, level, key, ptr); 1176 nilfs_btree_commit_insert(btree, path, level, key, ptr);
1177 nilfs_bmap_add_blocks(btree, stats.bs_nblocks); 1177 nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks);
1178 1178
1179 out: 1179 out:
1180 nilfs_btree_free_path(path); 1180 nilfs_btree_free_path(path);
@@ -1511,7 +1511,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key)
1511 if (ret < 0) 1511 if (ret < 0)
1512 goto out; 1512 goto out;
1513 nilfs_btree_commit_delete(btree, path, level, dat); 1513 nilfs_btree_commit_delete(btree, path, level, dat);
1514 nilfs_bmap_sub_blocks(btree, stats.bs_nblocks); 1514 nilfs_inode_sub_blocks(btree->b_inode, stats.bs_nblocks);
1515 1515
1516out: 1516out:
1517 nilfs_btree_free_path(path); 1517 nilfs_btree_free_path(path);
@@ -1776,7 +1776,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
1776 return ret; 1776 return ret;
1777 nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n, 1777 nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n,
1778 di, ni, bh); 1778 di, ni, bh);
1779 nilfs_bmap_add_blocks(btree, stats.bs_nblocks); 1779 nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks);
1780 return 0; 1780 return 0;
1781} 1781}
1782 1782
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 9d45773b79e6..3a1923943b14 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -440,7 +440,6 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
440 nilfs_commit_chunk(page, mapping, from, to); 440 nilfs_commit_chunk(page, mapping, from, to);
441 nilfs_put_page(page); 441 nilfs_put_page(page);
442 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 442 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
443/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
444} 443}
445 444
446/* 445/*
@@ -531,7 +530,6 @@ got_it:
531 nilfs_set_de_type(de, inode); 530 nilfs_set_de_type(de, inode);
532 nilfs_commit_chunk(page, page->mapping, from, to); 531 nilfs_commit_chunk(page, page->mapping, from, to);
533 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 532 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
534/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
535 nilfs_mark_inode_dirty(dir); 533 nilfs_mark_inode_dirty(dir);
536 /* OFFSET_CACHE */ 534 /* OFFSET_CACHE */
537out_put: 535out_put:
@@ -579,7 +577,6 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
579 dir->inode = 0; 577 dir->inode = 0;
580 nilfs_commit_chunk(page, mapping, from, to); 578 nilfs_commit_chunk(page, mapping, from, to);
581 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 579 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
582/* NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
583out: 580out:
584 nilfs_put_page(page); 581 nilfs_put_page(page);
585 return err; 582 return err;
@@ -684,7 +681,7 @@ const struct file_operations nilfs_dir_operations = {
684 .readdir = nilfs_readdir, 681 .readdir = nilfs_readdir,
685 .unlocked_ioctl = nilfs_ioctl, 682 .unlocked_ioctl = nilfs_ioctl,
686#ifdef CONFIG_COMPAT 683#ifdef CONFIG_COMPAT
687 .compat_ioctl = nilfs_ioctl, 684 .compat_ioctl = nilfs_compat_ioctl,
688#endif /* CONFIG_COMPAT */ 685#endif /* CONFIG_COMPAT */
689 .fsync = nilfs_sync_file, 686 .fsync = nilfs_sync_file,
690 687
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 324d80c57518..82f4865e86dd 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -146,7 +146,7 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
146 if (NILFS_BMAP_USE_VBN(bmap)) 146 if (NILFS_BMAP_USE_VBN(bmap))
147 nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr); 147 nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr);
148 148
149 nilfs_bmap_add_blocks(bmap, 1); 149 nilfs_inode_add_blocks(bmap->b_inode, 1);
150 } 150 }
151 return ret; 151 return ret;
152} 152}
@@ -168,7 +168,7 @@ static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
168 if (!ret) { 168 if (!ret) {
169 nilfs_bmap_commit_end_ptr(bmap, &req, dat); 169 nilfs_bmap_commit_end_ptr(bmap, &req, dat);
170 nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR); 170 nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR);
171 nilfs_bmap_sub_blocks(bmap, 1); 171 nilfs_inode_sub_blocks(bmap->b_inode, 1);
172 } 172 }
173 return ret; 173 return ret;
174} 174}
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 2f560c9fb808..397e73258631 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -59,7 +59,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
59 struct nilfs_transaction_info ti; 59 struct nilfs_transaction_info ti;
60 int ret; 60 int ret;
61 61
62 if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs))) 62 if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info)))
63 return VM_FAULT_SIGBUS; /* -ENOSPC */ 63 return VM_FAULT_SIGBUS; /* -ENOSPC */
64 64
65 lock_page(page); 65 lock_page(page);
@@ -72,10 +72,9 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
72 /* 72 /*
73 * check to see if the page is mapped already (no holes) 73 * check to see if the page is mapped already (no holes)
74 */ 74 */
75 if (PageMappedToDisk(page)) { 75 if (PageMappedToDisk(page))
76 unlock_page(page);
77 goto mapped; 76 goto mapped;
78 } 77
79 if (page_has_buffers(page)) { 78 if (page_has_buffers(page)) {
80 struct buffer_head *bh, *head; 79 struct buffer_head *bh, *head;
81 int fully_mapped = 1; 80 int fully_mapped = 1;
@@ -90,7 +89,6 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
90 89
91 if (fully_mapped) { 90 if (fully_mapped) {
92 SetPageMappedToDisk(page); 91 SetPageMappedToDisk(page);
93 unlock_page(page);
94 goto mapped; 92 goto mapped;
95 } 93 }
96 } 94 }
@@ -105,16 +103,17 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
105 return VM_FAULT_SIGBUS; 103 return VM_FAULT_SIGBUS;
106 104
107 ret = block_page_mkwrite(vma, vmf, nilfs_get_block); 105 ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
108 if (unlikely(ret)) { 106 if (ret != VM_FAULT_LOCKED) {
109 nilfs_transaction_abort(inode->i_sb); 107 nilfs_transaction_abort(inode->i_sb);
110 return ret; 108 return ret;
111 } 109 }
110 nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits));
112 nilfs_transaction_commit(inode->i_sb); 111 nilfs_transaction_commit(inode->i_sb);
113 112
114 mapped: 113 mapped:
115 SetPageChecked(page); 114 SetPageChecked(page);
116 wait_on_page_writeback(page); 115 wait_on_page_writeback(page);
117 return 0; 116 return VM_FAULT_LOCKED;
118} 117}
119 118
120static const struct vm_operations_struct nilfs_file_vm_ops = { 119static const struct vm_operations_struct nilfs_file_vm_ops = {
@@ -142,7 +141,7 @@ const struct file_operations nilfs_file_operations = {
142 .aio_write = generic_file_aio_write, 141 .aio_write = generic_file_aio_write,
143 .unlocked_ioctl = nilfs_ioctl, 142 .unlocked_ioctl = nilfs_ioctl,
144#ifdef CONFIG_COMPAT 143#ifdef CONFIG_COMPAT
145 .compat_ioctl = nilfs_ioctl, 144 .compat_ioctl = nilfs_compat_ioctl,
146#endif /* CONFIG_COMPAT */ 145#endif /* CONFIG_COMPAT */
147 .mmap = nilfs_file_mmap, 146 .mmap = nilfs_file_mmap,
148 .open = generic_file_open, 147 .open = generic_file_open,
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index caf9a6a3fb54..1c2a3e23f8b2 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -49,7 +49,6 @@
49#include "ifile.h" 49#include "ifile.h"
50 50
51static const struct address_space_operations def_gcinode_aops = { 51static const struct address_space_operations def_gcinode_aops = {
52 .sync_page = block_sync_page,
53}; 52};
54 53
55/* 54/*
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2fd440d8d6b8..c0aa27490c02 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -41,6 +41,24 @@ struct nilfs_iget_args {
41 int for_gc; 41 int for_gc;
42}; 42};
43 43
44void nilfs_inode_add_blocks(struct inode *inode, int n)
45{
46 struct nilfs_root *root = NILFS_I(inode)->i_root;
47
48 inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
49 if (root)
50 atomic_add(n, &root->blocks_count);
51}
52
53void nilfs_inode_sub_blocks(struct inode *inode, int n)
54{
55 struct nilfs_root *root = NILFS_I(inode)->i_root;
56
57 inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
58 if (root)
59 atomic_sub(n, &root->blocks_count);
60}
61
44/** 62/**
45 * nilfs_get_block() - get a file block on the filesystem (callback function) 63 * nilfs_get_block() - get a file block on the filesystem (callback function)
46 * @inode - inode struct of the target file 64 * @inode - inode struct of the target file
@@ -262,7 +280,6 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
262const struct address_space_operations nilfs_aops = { 280const struct address_space_operations nilfs_aops = {
263 .writepage = nilfs_writepage, 281 .writepage = nilfs_writepage,
264 .readpage = nilfs_readpage, 282 .readpage = nilfs_readpage,
265 .sync_page = block_sync_page,
266 .writepages = nilfs_writepages, 283 .writepages = nilfs_writepages,
267 .set_page_dirty = nilfs_set_page_dirty, 284 .set_page_dirty = nilfs_set_page_dirty,
268 .readpages = nilfs_readpages, 285 .readpages = nilfs_readpages,
@@ -277,7 +294,7 @@ const struct address_space_operations nilfs_aops = {
277struct inode *nilfs_new_inode(struct inode *dir, int mode) 294struct inode *nilfs_new_inode(struct inode *dir, int mode)
278{ 295{
279 struct super_block *sb = dir->i_sb; 296 struct super_block *sb = dir->i_sb;
280 struct nilfs_sb_info *sbi = NILFS_SB(sb); 297 struct the_nilfs *nilfs = sb->s_fs_info;
281 struct inode *inode; 298 struct inode *inode;
282 struct nilfs_inode_info *ii; 299 struct nilfs_inode_info *ii;
283 struct nilfs_root *root; 300 struct nilfs_root *root;
@@ -315,19 +332,16 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
315 /* No lock is needed; iget() ensures it. */ 332 /* No lock is needed; iget() ensures it. */
316 } 333 }
317 334
318 ii->i_flags = NILFS_I(dir)->i_flags; 335 ii->i_flags = nilfs_mask_flags(
319 if (S_ISLNK(mode)) 336 mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED);
320 ii->i_flags &= ~(NILFS_IMMUTABLE_FL | NILFS_APPEND_FL);
321 if (!S_ISDIR(mode))
322 ii->i_flags &= ~NILFS_DIRSYNC_FL;
323 337
324 /* ii->i_file_acl = 0; */ 338 /* ii->i_file_acl = 0; */
325 /* ii->i_dir_acl = 0; */ 339 /* ii->i_dir_acl = 0; */
326 ii->i_dir_start_lookup = 0; 340 ii->i_dir_start_lookup = 0;
327 nilfs_set_inode_flags(inode); 341 nilfs_set_inode_flags(inode);
328 spin_lock(&sbi->s_next_gen_lock); 342 spin_lock(&nilfs->ns_next_gen_lock);
329 inode->i_generation = sbi->s_next_generation++; 343 inode->i_generation = nilfs->ns_next_generation++;
330 spin_unlock(&sbi->s_next_gen_lock); 344 spin_unlock(&nilfs->ns_next_gen_lock);
331 insert_inode_hash(inode); 345 insert_inode_hash(inode);
332 346
333 err = nilfs_init_acl(inode, dir); 347 err = nilfs_init_acl(inode, dir);
@@ -359,17 +373,15 @@ void nilfs_set_inode_flags(struct inode *inode)
359 373
360 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | 374 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
361 S_DIRSYNC); 375 S_DIRSYNC);
362 if (flags & NILFS_SYNC_FL) 376 if (flags & FS_SYNC_FL)
363 inode->i_flags |= S_SYNC; 377 inode->i_flags |= S_SYNC;
364 if (flags & NILFS_APPEND_FL) 378 if (flags & FS_APPEND_FL)
365 inode->i_flags |= S_APPEND; 379 inode->i_flags |= S_APPEND;
366 if (flags & NILFS_IMMUTABLE_FL) 380 if (flags & FS_IMMUTABLE_FL)
367 inode->i_flags |= S_IMMUTABLE; 381 inode->i_flags |= S_IMMUTABLE;
368#ifndef NILFS_ATIME_DISABLE 382 if (flags & FS_NOATIME_FL)
369 if (flags & NILFS_NOATIME_FL)
370#endif
371 inode->i_flags |= S_NOATIME; 383 inode->i_flags |= S_NOATIME;
372 if (flags & NILFS_DIRSYNC_FL) 384 if (flags & FS_DIRSYNC_FL)
373 inode->i_flags |= S_DIRSYNC; 385 inode->i_flags |= S_DIRSYNC;
374 mapping_set_gfp_mask(inode->i_mapping, 386 mapping_set_gfp_mask(inode->i_mapping,
375 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 387 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
@@ -420,7 +432,7 @@ static int __nilfs_read_inode(struct super_block *sb,
420 struct nilfs_root *root, unsigned long ino, 432 struct nilfs_root *root, unsigned long ino,
421 struct inode *inode) 433 struct inode *inode)
422{ 434{
423 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs; 435 struct the_nilfs *nilfs = sb->s_fs_info;
424 struct buffer_head *bh; 436 struct buffer_head *bh;
425 struct nilfs_inode *raw_inode; 437 struct nilfs_inode *raw_inode;
426 int err; 438 int err;
@@ -707,6 +719,7 @@ void nilfs_evict_inode(struct inode *inode)
707 struct nilfs_transaction_info ti; 719 struct nilfs_transaction_info ti;
708 struct super_block *sb = inode->i_sb; 720 struct super_block *sb = inode->i_sb;
709 struct nilfs_inode_info *ii = NILFS_I(inode); 721 struct nilfs_inode_info *ii = NILFS_I(inode);
722 int ret;
710 723
711 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { 724 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
712 if (inode->i_data.nrpages) 725 if (inode->i_data.nrpages)
@@ -725,8 +738,9 @@ void nilfs_evict_inode(struct inode *inode)
725 nilfs_mark_inode_dirty(inode); 738 nilfs_mark_inode_dirty(inode);
726 end_writeback(inode); 739 end_writeback(inode);
727 740
728 nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); 741 ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
729 atomic_dec(&ii->i_root->inodes_count); 742 if (!ret)
743 atomic_dec(&ii->i_root->inodes_count);
730 744
731 nilfs_clear_inode(inode); 745 nilfs_clear_inode(inode);
732 746
@@ -792,18 +806,18 @@ int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
792 806
793int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) 807int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
794{ 808{
795 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); 809 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
796 struct nilfs_inode_info *ii = NILFS_I(inode); 810 struct nilfs_inode_info *ii = NILFS_I(inode);
797 int err; 811 int err;
798 812
799 spin_lock(&sbi->s_inode_lock); 813 spin_lock(&nilfs->ns_inode_lock);
800 if (ii->i_bh == NULL) { 814 if (ii->i_bh == NULL) {
801 spin_unlock(&sbi->s_inode_lock); 815 spin_unlock(&nilfs->ns_inode_lock);
802 err = nilfs_ifile_get_inode_block(ii->i_root->ifile, 816 err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
803 inode->i_ino, pbh); 817 inode->i_ino, pbh);
804 if (unlikely(err)) 818 if (unlikely(err))
805 return err; 819 return err;
806 spin_lock(&sbi->s_inode_lock); 820 spin_lock(&nilfs->ns_inode_lock);
807 if (ii->i_bh == NULL) 821 if (ii->i_bh == NULL)
808 ii->i_bh = *pbh; 822 ii->i_bh = *pbh;
809 else { 823 else {
@@ -814,36 +828,36 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
814 *pbh = ii->i_bh; 828 *pbh = ii->i_bh;
815 829
816 get_bh(*pbh); 830 get_bh(*pbh);
817 spin_unlock(&sbi->s_inode_lock); 831 spin_unlock(&nilfs->ns_inode_lock);
818 return 0; 832 return 0;
819} 833}
820 834
821int nilfs_inode_dirty(struct inode *inode) 835int nilfs_inode_dirty(struct inode *inode)
822{ 836{
823 struct nilfs_inode_info *ii = NILFS_I(inode); 837 struct nilfs_inode_info *ii = NILFS_I(inode);
824 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); 838 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
825 int ret = 0; 839 int ret = 0;
826 840
827 if (!list_empty(&ii->i_dirty)) { 841 if (!list_empty(&ii->i_dirty)) {
828 spin_lock(&sbi->s_inode_lock); 842 spin_lock(&nilfs->ns_inode_lock);
829 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || 843 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
830 test_bit(NILFS_I_BUSY, &ii->i_state); 844 test_bit(NILFS_I_BUSY, &ii->i_state);
831 spin_unlock(&sbi->s_inode_lock); 845 spin_unlock(&nilfs->ns_inode_lock);
832 } 846 }
833 return ret; 847 return ret;
834} 848}
835 849
836int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) 850int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
837{ 851{
838 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
839 struct nilfs_inode_info *ii = NILFS_I(inode); 852 struct nilfs_inode_info *ii = NILFS_I(inode);
853 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
840 854
841 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); 855 atomic_add(nr_dirty, &nilfs->ns_ndirtyblks);
842 856
843 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) 857 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
844 return 0; 858 return 0;
845 859
846 spin_lock(&sbi->s_inode_lock); 860 spin_lock(&nilfs->ns_inode_lock);
847 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && 861 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
848 !test_bit(NILFS_I_BUSY, &ii->i_state)) { 862 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
849 /* Because this routine may race with nilfs_dispose_list(), 863 /* Because this routine may race with nilfs_dispose_list(),
@@ -851,18 +865,18 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
851 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { 865 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
852 /* This will happen when somebody is freeing 866 /* This will happen when somebody is freeing
853 this inode. */ 867 this inode. */
854 nilfs_warning(sbi->s_super, __func__, 868 nilfs_warning(inode->i_sb, __func__,
855 "cannot get inode (ino=%lu)\n", 869 "cannot get inode (ino=%lu)\n",
856 inode->i_ino); 870 inode->i_ino);
857 spin_unlock(&sbi->s_inode_lock); 871 spin_unlock(&nilfs->ns_inode_lock);
858 return -EINVAL; /* NILFS_I_DIRTY may remain for 872 return -EINVAL; /* NILFS_I_DIRTY may remain for
859 freeing inode */ 873 freeing inode */
860 } 874 }
861 list_del(&ii->i_dirty); 875 list_del(&ii->i_dirty);
862 list_add_tail(&ii->i_dirty, &sbi->s_dirty_files); 876 list_add_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
863 set_bit(NILFS_I_QUEUED, &ii->i_state); 877 set_bit(NILFS_I_QUEUED, &ii->i_state);
864 } 878 }
865 spin_unlock(&sbi->s_inode_lock); 879 spin_unlock(&nilfs->ns_inode_lock);
866 return 0; 880 return 0;
867} 881}
868 882
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 496738963fdb..f2469ba6246b 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -26,7 +26,9 @@
26#include <linux/capability.h> /* capable() */ 26#include <linux/capability.h> /* capable() */
27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ 27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
28#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
29#include <linux/compat.h> /* compat_ptr() */
29#include <linux/mount.h> /* mnt_want_write(), mnt_drop_write() */ 30#include <linux/mount.h> /* mnt_want_write(), mnt_drop_write() */
31#include <linux/buffer_head.h>
30#include <linux/nilfs2_fs.h> 32#include <linux/nilfs2_fs.h>
31#include "nilfs.h" 33#include "nilfs.h"
32#include "segment.h" 34#include "segment.h"
@@ -97,11 +99,74 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
97 return ret; 99 return ret;
98} 100}
99 101
102static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp)
103{
104 unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE;
105
106 return put_user(flags, (int __user *)argp);
107}
108
109static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
110 void __user *argp)
111{
112 struct nilfs_transaction_info ti;
113 unsigned int flags, oldflags;
114 int ret;
115
116 if (!inode_owner_or_capable(inode))
117 return -EACCES;
118
119 if (get_user(flags, (int __user *)argp))
120 return -EFAULT;
121
122 ret = mnt_want_write(filp->f_path.mnt);
123 if (ret)
124 return ret;
125
126 flags = nilfs_mask_flags(inode->i_mode, flags);
127
128 mutex_lock(&inode->i_mutex);
129
130 oldflags = NILFS_I(inode)->i_flags;
131
132 /*
133 * The IMMUTABLE and APPEND_ONLY flags can only be changed by the
134 * relevant capability.
135 */
136 ret = -EPERM;
137 if (((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) &&
138 !capable(CAP_LINUX_IMMUTABLE))
139 goto out;
140
141 ret = nilfs_transaction_begin(inode->i_sb, &ti, 0);
142 if (ret)
143 goto out;
144
145 NILFS_I(inode)->i_flags = (oldflags & ~FS_FL_USER_MODIFIABLE) |
146 (flags & FS_FL_USER_MODIFIABLE);
147
148 nilfs_set_inode_flags(inode);
149 inode->i_ctime = CURRENT_TIME;
150 if (IS_SYNC(inode))
151 nilfs_set_transaction_flag(NILFS_TI_SYNC);
152
153 nilfs_mark_inode_dirty(inode);
154 ret = nilfs_transaction_commit(inode->i_sb);
155out:
156 mutex_unlock(&inode->i_mutex);
157 mnt_drop_write(filp->f_path.mnt);
158 return ret;
159}
160
161static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp)
162{
163 return put_user(inode->i_generation, (int __user *)argp);
164}
165
100static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, 166static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
101 unsigned int cmd, void __user *argp) 167 unsigned int cmd, void __user *argp)
102{ 168{
103 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 169 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
104 struct inode *cpfile = nilfs->ns_cpfile;
105 struct nilfs_transaction_info ti; 170 struct nilfs_transaction_info ti;
106 struct nilfs_cpmode cpmode; 171 struct nilfs_cpmode cpmode;
107 int ret; 172 int ret;
@@ -121,7 +186,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
121 186
122 nilfs_transaction_begin(inode->i_sb, &ti, 0); 187 nilfs_transaction_begin(inode->i_sb, &ti, 0);
123 ret = nilfs_cpfile_change_cpmode( 188 ret = nilfs_cpfile_change_cpmode(
124 cpfile, cpmode.cm_cno, cpmode.cm_mode); 189 nilfs->ns_cpfile, cpmode.cm_cno, cpmode.cm_mode);
125 if (unlikely(ret < 0)) 190 if (unlikely(ret < 0))
126 nilfs_transaction_abort(inode->i_sb); 191 nilfs_transaction_abort(inode->i_sb);
127 else 192 else
@@ -137,7 +202,7 @@ static int
137nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, 202nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
138 unsigned int cmd, void __user *argp) 203 unsigned int cmd, void __user *argp)
139{ 204{
140 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile; 205 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
141 struct nilfs_transaction_info ti; 206 struct nilfs_transaction_info ti;
142 __u64 cno; 207 __u64 cno;
143 int ret; 208 int ret;
@@ -154,7 +219,7 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
154 goto out; 219 goto out;
155 220
156 nilfs_transaction_begin(inode->i_sb, &ti, 0); 221 nilfs_transaction_begin(inode->i_sb, &ti, 0);
157 ret = nilfs_cpfile_delete_checkpoint(cpfile, cno); 222 ret = nilfs_cpfile_delete_checkpoint(nilfs->ns_cpfile, cno);
158 if (unlikely(ret < 0)) 223 if (unlikely(ret < 0))
159 nilfs_transaction_abort(inode->i_sb); 224 nilfs_transaction_abort(inode->i_sb);
160 else 225 else
@@ -180,7 +245,7 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
180static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, 245static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
181 unsigned int cmd, void __user *argp) 246 unsigned int cmd, void __user *argp)
182{ 247{
183 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 248 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
184 struct nilfs_cpstat cpstat; 249 struct nilfs_cpstat cpstat;
185 int ret; 250 int ret;
186 251
@@ -211,7 +276,7 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
211static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, 276static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
212 unsigned int cmd, void __user *argp) 277 unsigned int cmd, void __user *argp)
213{ 278{
214 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 279 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
215 struct nilfs_sustat sustat; 280 struct nilfs_sustat sustat;
216 int ret; 281 int ret;
217 282
@@ -267,7 +332,7 @@ nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
267static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, 332static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
268 unsigned int cmd, void __user *argp) 333 unsigned int cmd, void __user *argp)
269{ 334{
270 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 335 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
271 struct nilfs_argv argv; 336 struct nilfs_argv argv;
272 int ret; 337 int ret;
273 338
@@ -336,7 +401,7 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
336 struct nilfs_argv *argv, void *buf) 401 struct nilfs_argv *argv, void *buf)
337{ 402{
338 size_t nmembs = argv->v_nmembs; 403 size_t nmembs = argv->v_nmembs;
339 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs; 404 struct the_nilfs *nilfs = sb->s_fs_info;
340 struct inode *inode; 405 struct inode *inode;
341 struct nilfs_vdesc *vdesc; 406 struct nilfs_vdesc *vdesc;
342 struct buffer_head *bh, *n; 407 struct buffer_head *bh, *n;
@@ -550,7 +615,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
550 ret = PTR_ERR(kbufs[4]); 615 ret = PTR_ERR(kbufs[4]);
551 goto out; 616 goto out;
552 } 617 }
553 nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 618 nilfs = inode->i_sb->s_fs_info;
554 619
555 for (n = 0; n < 4; n++) { 620 for (n = 0; n < 4; n++) {
556 ret = -EINVAL; 621 ret = -EINVAL;
@@ -623,7 +688,7 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
623 return ret; 688 return ret;
624 689
625 if (argp != NULL) { 690 if (argp != NULL) {
626 nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 691 nilfs = inode->i_sb->s_fs_info;
627 down_read(&nilfs->ns_segctor_sem); 692 down_read(&nilfs->ns_segctor_sem);
628 cno = nilfs->ns_cno - 1; 693 cno = nilfs->ns_cno - 1;
629 up_read(&nilfs->ns_segctor_sem); 694 up_read(&nilfs->ns_segctor_sem);
@@ -641,7 +706,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
641 void *, size_t, size_t)) 706 void *, size_t, size_t))
642 707
643{ 708{
644 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 709 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
645 struct nilfs_argv argv; 710 struct nilfs_argv argv;
646 int ret; 711 int ret;
647 712
@@ -666,6 +731,12 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
666 void __user *argp = (void __user *)arg; 731 void __user *argp = (void __user *)arg;
667 732
668 switch (cmd) { 733 switch (cmd) {
734 case FS_IOC_GETFLAGS:
735 return nilfs_ioctl_getflags(inode, argp);
736 case FS_IOC_SETFLAGS:
737 return nilfs_ioctl_setflags(inode, filp, argp);
738 case FS_IOC_GETVERSION:
739 return nilfs_ioctl_getversion(inode, argp);
669 case NILFS_IOCTL_CHANGE_CPMODE: 740 case NILFS_IOCTL_CHANGE_CPMODE:
670 return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp); 741 return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp);
671 case NILFS_IOCTL_DELETE_CHECKPOINT: 742 case NILFS_IOCTL_DELETE_CHECKPOINT:
@@ -696,3 +767,23 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
696 return -ENOTTY; 767 return -ENOTTY;
697 } 768 }
698} 769}
770
771#ifdef CONFIG_COMPAT
772long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
773{
774 switch (cmd) {
775 case FS_IOC32_GETFLAGS:
776 cmd = FS_IOC_GETFLAGS;
777 break;
778 case FS_IOC32_SETFLAGS:
779 cmd = FS_IOC_SETFLAGS;
780 break;
781 case FS_IOC32_GETVERSION:
782 cmd = FS_IOC_GETVERSION;
783 break;
784 default:
785 return -ENOIOCTLCMD;
786 }
787 return nilfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
788}
789#endif
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 6a0e2a189f60..a649b05f7069 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -399,7 +399,6 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
399 399
400static const struct address_space_operations def_mdt_aops = { 400static const struct address_space_operations def_mdt_aops = {
401 .writepage = nilfs_mdt_write_page, 401 .writepage = nilfs_mdt_write_page,
402 .sync_page = block_sync_page,
403}; 402};
404 403
405static const struct inode_operations def_mdt_iops; 404static const struct inode_operations def_mdt_iops;
@@ -438,10 +437,6 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
438 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); 437 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
439} 438}
440 439
441static const struct address_space_operations shadow_map_aops = {
442 .sync_page = block_sync_page,
443};
444
445/** 440/**
446 * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file 441 * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file
447 * @inode: inode of the metadata file 442 * @inode: inode of the metadata file
@@ -454,10 +449,10 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
454 struct backing_dev_info *bdi = inode->i_sb->s_bdi; 449 struct backing_dev_info *bdi = inode->i_sb->s_bdi;
455 450
456 INIT_LIST_HEAD(&shadow->frozen_buffers); 451 INIT_LIST_HEAD(&shadow->frozen_buffers);
457 nilfs_mapping_init_once(&shadow->frozen_data); 452 address_space_init_once(&shadow->frozen_data);
458 nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops); 453 nilfs_mapping_init(&shadow->frozen_data, bdi);
459 nilfs_mapping_init_once(&shadow->frozen_btnodes); 454 address_space_init_once(&shadow->frozen_btnodes);
460 nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops); 455 nilfs_mapping_init(&shadow->frozen_btnodes, bdi);
461 mi->mi_shadow = shadow; 456 mi->mi_shadow = shadow;
462 return 0; 457 return 0;
463} 458}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index b13734bf3521..ed68563ec708 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -66,7 +66,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
66 66
67static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode) 67static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
68{ 68{
69 return NILFS_SB(inode->i_sb)->s_nilfs; 69 return inode->i_sb->s_fs_info;
70} 70}
71 71
72/* Default GFP flags using highmem */ 72/* Default GFP flags using highmem */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 98034271cd02..546849b3e88f 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -397,7 +397,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
397 new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page); 397 new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
398 if (!new_de) 398 if (!new_de)
399 goto out_dir; 399 goto out_dir;
400 inc_nlink(old_inode);
401 nilfs_set_link(new_dir, new_de, new_page, old_inode); 400 nilfs_set_link(new_dir, new_de, new_page, old_inode);
402 nilfs_mark_inode_dirty(new_dir); 401 nilfs_mark_inode_dirty(new_dir);
403 new_inode->i_ctime = CURRENT_TIME; 402 new_inode->i_ctime = CURRENT_TIME;
@@ -411,13 +410,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
411 if (new_dir->i_nlink >= NILFS_LINK_MAX) 410 if (new_dir->i_nlink >= NILFS_LINK_MAX)
412 goto out_dir; 411 goto out_dir;
413 } 412 }
414 inc_nlink(old_inode);
415 err = nilfs_add_link(new_dentry, old_inode); 413 err = nilfs_add_link(new_dentry, old_inode);
416 if (err) { 414 if (err)
417 drop_nlink(old_inode);
418 nilfs_mark_inode_dirty(old_inode);
419 goto out_dir; 415 goto out_dir;
420 }
421 if (dir_de) { 416 if (dir_de) {
422 inc_nlink(new_dir); 417 inc_nlink(new_dir);
423 nilfs_mark_inode_dirty(new_dir); 418 nilfs_mark_inode_dirty(new_dir);
@@ -431,7 +426,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
431 old_inode->i_ctime = CURRENT_TIME; 426 old_inode->i_ctime = CURRENT_TIME;
432 427
433 nilfs_delete_entry(old_de, old_page); 428 nilfs_delete_entry(old_de, old_page);
434 drop_nlink(old_inode);
435 429
436 if (dir_de) { 430 if (dir_de) {
437 nilfs_set_link(old_inode, dir_de, dir_page, new_dir); 431 nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
@@ -488,7 +482,7 @@ static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno,
488 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO) 482 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO)
489 return ERR_PTR(-ESTALE); 483 return ERR_PTR(-ESTALE);
490 484
491 root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno); 485 root = nilfs_lookup_root(sb->s_fs_info, cno);
492 if (!root) 486 if (!root)
493 return ERR_PTR(-ESTALE); 487 return ERR_PTR(-ESTALE);
494 488
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 777e8fd04304..a8dd344303cb 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -30,7 +30,6 @@
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/nilfs2_fs.h> 31#include <linux/nilfs2_fs.h>
32#include "the_nilfs.h" 32#include "the_nilfs.h"
33#include "sb.h"
34#include "bmap.h" 33#include "bmap.h"
35 34
36/* 35/*
@@ -115,19 +114,19 @@ enum {
115 * Macros to check inode numbers 114 * Macros to check inode numbers
116 */ 115 */
117#define NILFS_MDT_INO_BITS \ 116#define NILFS_MDT_INO_BITS \
118 ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \ 117 ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \
119 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \ 118 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \
120 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO)) 119 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO))
121 120
122#define NILFS_SYS_INO_BITS \ 121#define NILFS_SYS_INO_BITS \
123 ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS) 122 ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
124 123
125#define NILFS_FIRST_INO(sb) (NILFS_SB(sb)->s_nilfs->ns_first_ino) 124#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino)
126 125
127#define NILFS_MDT_INODE(sb, ino) \ 126#define NILFS_MDT_INODE(sb, ino) \
128 ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino)))) 127 ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino))))
129#define NILFS_VALID_INODE(sb, ino) \ 128#define NILFS_VALID_INODE(sb, ino) \
130 ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino)))) 129 ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino))))
131 130
132/** 131/**
133 * struct nilfs_transaction_info: context information for synchronization 132 * struct nilfs_transaction_info: context information for synchronization
@@ -212,6 +211,23 @@ static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
212 211
213#define NILFS_ATIME_DISABLE 212#define NILFS_ATIME_DISABLE
214 213
214/* Flags that should be inherited by new inodes from their parent. */
215#define NILFS_FL_INHERITED \
216 (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | FS_SYNC_FL | \
217 FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL |\
218 FS_COMPRBLK_FL | FS_NOCOMP_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL)
219
220/* Mask out flags that are inappropriate for the given type of inode. */
221static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags)
222{
223 if (S_ISDIR(mode))
224 return flags;
225 else if (S_ISREG(mode))
226 return flags & ~(FS_DIRSYNC_FL | FS_TOPDIR_FL);
227 else
228 return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
229}
230
215/* dir.c */ 231/* dir.c */
216extern int nilfs_add_link(struct dentry *, struct inode *); 232extern int nilfs_add_link(struct dentry *, struct inode *);
217extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *); 233extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
@@ -229,10 +245,13 @@ extern int nilfs_sync_file(struct file *, int);
229 245
230/* ioctl.c */ 246/* ioctl.c */
231long nilfs_ioctl(struct file *, unsigned int, unsigned long); 247long nilfs_ioctl(struct file *, unsigned int, unsigned long);
248long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
232int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *, 249int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *,
233 void **); 250 void **);
234 251
235/* inode.c */ 252/* inode.c */
253void nilfs_inode_add_blocks(struct inode *inode, int n);
254void nilfs_inode_sub_blocks(struct inode *inode, int n);
236extern struct inode *nilfs_new_inode(struct inode *, int); 255extern struct inode *nilfs_new_inode(struct inode *, int);
237extern void nilfs_free_inode(struct inode *); 256extern void nilfs_free_inode(struct inode *);
238extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int); 257extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
@@ -266,7 +285,7 @@ extern void nilfs_destroy_inode(struct inode *);
266extern void nilfs_error(struct super_block *, const char *, const char *, ...) 285extern void nilfs_error(struct super_block *, const char *, const char *, ...)
267 __attribute__ ((format (printf, 3, 4))); 286 __attribute__ ((format (printf, 3, 4)));
268extern void nilfs_warning(struct super_block *, const char *, const char *, ...) 287extern void nilfs_warning(struct super_block *, const char *, const char *, ...)
269 __attribute__ ((format (printf, 3, 4))); 288 __attribute__ ((format (printf, 3, 4)));
270extern struct nilfs_super_block * 289extern struct nilfs_super_block *
271nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); 290nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
272extern int nilfs_store_magic_and_option(struct super_block *, 291extern int nilfs_store_magic_and_option(struct super_block *,
@@ -275,11 +294,11 @@ extern int nilfs_check_feature_compatibility(struct super_block *,
275 struct nilfs_super_block *); 294 struct nilfs_super_block *);
276extern void nilfs_set_log_cursor(struct nilfs_super_block *, 295extern void nilfs_set_log_cursor(struct nilfs_super_block *,
277 struct the_nilfs *); 296 struct the_nilfs *);
278extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *, 297struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
279 int flip); 298 int flip);
280extern int nilfs_commit_super(struct nilfs_sb_info *, int); 299int nilfs_commit_super(struct super_block *sb, int flag);
281extern int nilfs_cleanup_super(struct nilfs_sb_info *); 300int nilfs_cleanup_super(struct super_block *sb);
282int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt, 301int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
283 struct nilfs_root **root); 302 struct nilfs_root **root);
284int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno); 303int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
285 304
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 0c432416cfef..1168059c7efd 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -492,29 +492,15 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
492 return nc; 492 return nc;
493} 493}
494 494
495void nilfs_mapping_init_once(struct address_space *mapping)
496{
497 memset(mapping, 0, sizeof(*mapping));
498 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
499 spin_lock_init(&mapping->tree_lock);
500 INIT_LIST_HEAD(&mapping->private_list);
501 spin_lock_init(&mapping->private_lock);
502
503 spin_lock_init(&mapping->i_mmap_lock);
504 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
505 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
506}
507
508void nilfs_mapping_init(struct address_space *mapping, 495void nilfs_mapping_init(struct address_space *mapping,
509 struct backing_dev_info *bdi, 496 struct backing_dev_info *bdi)
510 const struct address_space_operations *aops)
511{ 497{
512 mapping->host = NULL; 498 mapping->host = NULL;
513 mapping->flags = 0; 499 mapping->flags = 0;
514 mapping_set_gfp_mask(mapping, GFP_NOFS); 500 mapping_set_gfp_mask(mapping, GFP_NOFS);
515 mapping->assoc_mapping = NULL; 501 mapping->assoc_mapping = NULL;
516 mapping->backing_dev_info = bdi; 502 mapping->backing_dev_info = bdi;
517 mapping->a_ops = aops; 503 mapping->a_ops = &empty_aops;
518} 504}
519 505
520/* 506/*
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 622df27cd891..f06b79ad7493 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -61,10 +61,8 @@ void nilfs_free_private_page(struct page *);
61int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); 61int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
62void nilfs_copy_back_pages(struct address_space *, struct address_space *); 62void nilfs_copy_back_pages(struct address_space *, struct address_space *);
63void nilfs_clear_dirty_pages(struct address_space *); 63void nilfs_clear_dirty_pages(struct address_space *);
64void nilfs_mapping_init_once(struct address_space *mapping);
65void nilfs_mapping_init(struct address_space *mapping, 64void nilfs_mapping_init(struct address_space *mapping,
66 struct backing_dev_info *bdi, 65 struct backing_dev_info *bdi);
67 const struct address_space_operations *aops);
68unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); 66unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
69unsigned long nilfs_find_uncommitted_extent(struct inode *inode, 67unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
70 sector_t start_blk, 68 sector_t start_blk,
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 3dfcd3b7d389..ba4a64518f38 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -425,7 +425,7 @@ void nilfs_dispose_segment_list(struct list_head *head)
425} 425}
426 426
427static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, 427static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
428 struct nilfs_sb_info *sbi, 428 struct super_block *sb,
429 struct nilfs_recovery_info *ri) 429 struct nilfs_recovery_info *ri)
430{ 430{
431 struct list_head *head = &ri->ri_used_segments; 431 struct list_head *head = &ri->ri_used_segments;
@@ -501,7 +501,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
501} 501}
502 502
503static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, 503static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
504 struct nilfs_sb_info *sbi, 504 struct super_block *sb,
505 struct nilfs_root *root, 505 struct nilfs_root *root,
506 struct list_head *head, 506 struct list_head *head,
507 unsigned long *nr_salvaged_blocks) 507 unsigned long *nr_salvaged_blocks)
@@ -514,7 +514,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
514 int err = 0, err2 = 0; 514 int err = 0, err2 = 0;
515 515
516 list_for_each_entry_safe(rb, n, head, list) { 516 list_for_each_entry_safe(rb, n, head, list) {
517 inode = nilfs_iget(sbi->s_super, root, rb->ino); 517 inode = nilfs_iget(sb, root, rb->ino);
518 if (IS_ERR(inode)) { 518 if (IS_ERR(inode)) {
519 err = PTR_ERR(inode); 519 err = PTR_ERR(inode);
520 inode = NULL; 520 inode = NULL;
@@ -572,11 +572,11 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
572 * nilfs_do_roll_forward - salvage logical segments newer than the latest 572 * nilfs_do_roll_forward - salvage logical segments newer than the latest
573 * checkpoint 573 * checkpoint
574 * @nilfs: nilfs object 574 * @nilfs: nilfs object
575 * @sbi: nilfs_sb_info 575 * @sb: super block instance
576 * @ri: pointer to a nilfs_recovery_info 576 * @ri: pointer to a nilfs_recovery_info
577 */ 577 */
578static int nilfs_do_roll_forward(struct the_nilfs *nilfs, 578static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
579 struct nilfs_sb_info *sbi, 579 struct super_block *sb,
580 struct nilfs_root *root, 580 struct nilfs_root *root,
581 struct nilfs_recovery_info *ri) 581 struct nilfs_recovery_info *ri)
582{ 582{
@@ -648,7 +648,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
648 goto failed; 648 goto failed;
649 if (flags & NILFS_SS_LOGEND) { 649 if (flags & NILFS_SS_LOGEND) {
650 err = nilfs_recover_dsync_blocks( 650 err = nilfs_recover_dsync_blocks(
651 nilfs, sbi, root, &dsync_blocks, 651 nilfs, sb, root, &dsync_blocks,
652 &nsalvaged_blocks); 652 &nsalvaged_blocks);
653 if (unlikely(err)) 653 if (unlikely(err))
654 goto failed; 654 goto failed;
@@ -681,7 +681,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
681 681
682 if (nsalvaged_blocks) { 682 if (nsalvaged_blocks) {
683 printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n", 683 printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n",
684 sbi->s_super->s_id, nsalvaged_blocks); 684 sb->s_id, nsalvaged_blocks);
685 ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; 685 ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
686 } 686 }
687 out: 687 out:
@@ -695,7 +695,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
695 printk(KERN_ERR 695 printk(KERN_ERR
696 "NILFS (device %s): Error roll-forwarding " 696 "NILFS (device %s): Error roll-forwarding "
697 "(err=%d, pseg block=%llu). ", 697 "(err=%d, pseg block=%llu). ",
698 sbi->s_super->s_id, err, (unsigned long long)pseg_start); 698 sb->s_id, err, (unsigned long long)pseg_start);
699 goto out; 699 goto out;
700} 700}
701 701
@@ -724,7 +724,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
724/** 724/**
725 * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint 725 * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint
726 * @nilfs: nilfs object 726 * @nilfs: nilfs object
727 * @sbi: nilfs_sb_info 727 * @sb: super block instance
728 * @ri: pointer to a nilfs_recovery_info struct to store search results. 728 * @ri: pointer to a nilfs_recovery_info struct to store search results.
729 * 729 *
730 * Return Value: On success, 0 is returned. On error, one of the following 730 * Return Value: On success, 0 is returned. On error, one of the following
@@ -741,7 +741,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
741 * %-ENOMEM - Insufficient memory available. 741 * %-ENOMEM - Insufficient memory available.
742 */ 742 */
743int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, 743int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
744 struct nilfs_sb_info *sbi, 744 struct super_block *sb,
745 struct nilfs_recovery_info *ri) 745 struct nilfs_recovery_info *ri)
746{ 746{
747 struct nilfs_root *root; 747 struct nilfs_root *root;
@@ -750,32 +750,32 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
750 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) 750 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
751 return 0; 751 return 0;
752 752
753 err = nilfs_attach_checkpoint(sbi, ri->ri_cno, true, &root); 753 err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root);
754 if (unlikely(err)) { 754 if (unlikely(err)) {
755 printk(KERN_ERR 755 printk(KERN_ERR
756 "NILFS: error loading the latest checkpoint.\n"); 756 "NILFS: error loading the latest checkpoint.\n");
757 return err; 757 return err;
758 } 758 }
759 759
760 err = nilfs_do_roll_forward(nilfs, sbi, root, ri); 760 err = nilfs_do_roll_forward(nilfs, sb, root, ri);
761 if (unlikely(err)) 761 if (unlikely(err))
762 goto failed; 762 goto failed;
763 763
764 if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { 764 if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
765 err = nilfs_prepare_segment_for_recovery(nilfs, sbi, ri); 765 err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri);
766 if (unlikely(err)) { 766 if (unlikely(err)) {
767 printk(KERN_ERR "NILFS: Error preparing segments for " 767 printk(KERN_ERR "NILFS: Error preparing segments for "
768 "recovery.\n"); 768 "recovery.\n");
769 goto failed; 769 goto failed;
770 } 770 }
771 771
772 err = nilfs_attach_segment_constructor(sbi, root); 772 err = nilfs_attach_log_writer(sb, root);
773 if (unlikely(err)) 773 if (unlikely(err))
774 goto failed; 774 goto failed;
775 775
776 set_nilfs_discontinued(nilfs); 776 set_nilfs_discontinued(nilfs);
777 err = nilfs_construct_segment(sbi->s_super); 777 err = nilfs_construct_segment(sb);
778 nilfs_detach_segment_constructor(sbi); 778 nilfs_detach_log_writer(sb);
779 779
780 if (unlikely(err)) { 780 if (unlikely(err)) {
781 printk(KERN_ERR "NILFS: Oops! recovery failed. " 781 printk(KERN_ERR "NILFS: Oops! recovery failed. "
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
deleted file mode 100644
index 7a17715f215f..000000000000
--- a/fs/nilfs2/sb.h
+++ /dev/null
@@ -1,85 +0,0 @@
1/*
2 * sb.h - NILFS on-memory super block structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#ifndef _NILFS_SB
25#define _NILFS_SB
26
27#include <linux/types.h>
28#include <linux/fs.h>
29
30struct the_nilfs;
31struct nilfs_sc_info;
32
33/*
34 * NILFS super-block data in memory
35 */
36struct nilfs_sb_info {
37 /* Mount options */
38 unsigned long s_mount_opt;
39 uid_t s_resuid;
40 gid_t s_resgid;
41
42 unsigned long s_interval; /* construction interval */
43 unsigned long s_watermark; /* threshold of data amount
44 for the segment construction */
45
46 /* Fundamental members */
47 struct super_block *s_super; /* reverse pointer to super_block */
48 struct the_nilfs *s_nilfs;
49
50 /* Segment constructor */
51 struct list_head s_dirty_files; /* dirty files list */
52 struct nilfs_sc_info *s_sc_info; /* segment constructor info */
53 spinlock_t s_inode_lock; /* Lock for the nilfs inode.
54 It covers s_dirty_files list */
55
56 /* Inode allocator */
57 spinlock_t s_next_gen_lock;
58 u32 s_next_generation;
59};
60
61static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb)
62{
63 return sb->s_fs_info;
64}
65
66static inline struct nilfs_sc_info *NILFS_SC(struct nilfs_sb_info *sbi)
67{
68 return sbi->s_sc_info;
69}
70
71/*
72 * Bit operations for the mount option
73 */
74#define nilfs_clear_opt(sbi, opt) \
75 do { (sbi)->s_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
76#define nilfs_set_opt(sbi, opt) \
77 do { (sbi)->s_mount_opt |= NILFS_MOUNT_##opt; } while (0)
78#define nilfs_test_opt(sbi, opt) ((sbi)->s_mount_opt & NILFS_MOUNT_##opt)
79#define nilfs_write_opt(sbi, mask, opt) \
80 do { (sbi)->s_mount_opt = \
81 (((sbi)->s_mount_opt & ~NILFS_MOUNT_##mask) | \
82 NILFS_MOUNT_##opt); \
83 } while (0)
84
85#endif /* _NILFS_SB */
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 0f83e93935b2..2853ff20f85a 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -509,7 +509,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
509 * Last BIO is always sent through the following 509 * Last BIO is always sent through the following
510 * submission. 510 * submission.
511 */ 511 */
512 rw |= REQ_SYNC | REQ_UNPLUG; 512 rw |= REQ_SYNC;
513 res = nilfs_segbuf_submit_bio(segbuf, &wi, rw); 513 res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
514 } 514 }
515 515
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 55ebae5c7f39..afe4f2183454 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -104,8 +104,7 @@ struct nilfs_sc_operations {
104static void nilfs_segctor_start_timer(struct nilfs_sc_info *); 104static void nilfs_segctor_start_timer(struct nilfs_sc_info *);
105static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int); 105static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int);
106static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *); 106static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *);
107static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *, 107static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int);
108 int);
109 108
110#define nilfs_cnt32_gt(a, b) \ 109#define nilfs_cnt32_gt(a, b) \
111 (typecheck(__u32, a) && typecheck(__u32, b) && \ 110 (typecheck(__u32, a) && typecheck(__u32, b) && \
@@ -182,7 +181,6 @@ int nilfs_transaction_begin(struct super_block *sb,
182 struct nilfs_transaction_info *ti, 181 struct nilfs_transaction_info *ti,
183 int vacancy_check) 182 int vacancy_check)
184{ 183{
185 struct nilfs_sb_info *sbi;
186 struct the_nilfs *nilfs; 184 struct the_nilfs *nilfs;
187 int ret = nilfs_prepare_segment_lock(ti); 185 int ret = nilfs_prepare_segment_lock(ti);
188 186
@@ -193,8 +191,7 @@ int nilfs_transaction_begin(struct super_block *sb,
193 191
194 vfs_check_frozen(sb, SB_FREEZE_WRITE); 192 vfs_check_frozen(sb, SB_FREEZE_WRITE);
195 193
196 sbi = NILFS_SB(sb); 194 nilfs = sb->s_fs_info;
197 nilfs = sbi->s_nilfs;
198 down_read(&nilfs->ns_segctor_sem); 195 down_read(&nilfs->ns_segctor_sem);
199 if (vacancy_check && nilfs_near_disk_full(nilfs)) { 196 if (vacancy_check && nilfs_near_disk_full(nilfs)) {
200 up_read(&nilfs->ns_segctor_sem); 197 up_read(&nilfs->ns_segctor_sem);
@@ -225,8 +222,7 @@ int nilfs_transaction_begin(struct super_block *sb,
225int nilfs_transaction_commit(struct super_block *sb) 222int nilfs_transaction_commit(struct super_block *sb)
226{ 223{
227 struct nilfs_transaction_info *ti = current->journal_info; 224 struct nilfs_transaction_info *ti = current->journal_info;
228 struct nilfs_sb_info *sbi; 225 struct the_nilfs *nilfs = sb->s_fs_info;
229 struct nilfs_sc_info *sci;
230 int err = 0; 226 int err = 0;
231 227
232 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); 228 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
@@ -235,16 +231,15 @@ int nilfs_transaction_commit(struct super_block *sb)
235 ti->ti_count--; 231 ti->ti_count--;
236 return 0; 232 return 0;
237 } 233 }
238 sbi = NILFS_SB(sb); 234 if (nilfs->ns_writer) {
239 sci = NILFS_SC(sbi); 235 struct nilfs_sc_info *sci = nilfs->ns_writer;
240 if (sci != NULL) { 236
241 if (ti->ti_flags & NILFS_TI_COMMIT) 237 if (ti->ti_flags & NILFS_TI_COMMIT)
242 nilfs_segctor_start_timer(sci); 238 nilfs_segctor_start_timer(sci);
243 if (atomic_read(&sbi->s_nilfs->ns_ndirtyblks) > 239 if (atomic_read(&nilfs->ns_ndirtyblks) > sci->sc_watermark)
244 sci->sc_watermark)
245 nilfs_segctor_do_flush(sci, 0); 240 nilfs_segctor_do_flush(sci, 0);
246 } 241 }
247 up_read(&sbi->s_nilfs->ns_segctor_sem); 242 up_read(&nilfs->ns_segctor_sem);
248 current->journal_info = ti->ti_save; 243 current->journal_info = ti->ti_save;
249 244
250 if (ti->ti_flags & NILFS_TI_SYNC) 245 if (ti->ti_flags & NILFS_TI_SYNC)
@@ -257,13 +252,14 @@ int nilfs_transaction_commit(struct super_block *sb)
257void nilfs_transaction_abort(struct super_block *sb) 252void nilfs_transaction_abort(struct super_block *sb)
258{ 253{
259 struct nilfs_transaction_info *ti = current->journal_info; 254 struct nilfs_transaction_info *ti = current->journal_info;
255 struct the_nilfs *nilfs = sb->s_fs_info;
260 256
261 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); 257 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
262 if (ti->ti_count > 0) { 258 if (ti->ti_count > 0) {
263 ti->ti_count--; 259 ti->ti_count--;
264 return; 260 return;
265 } 261 }
266 up_read(&NILFS_SB(sb)->s_nilfs->ns_segctor_sem); 262 up_read(&nilfs->ns_segctor_sem);
267 263
268 current->journal_info = ti->ti_save; 264 current->journal_info = ti->ti_save;
269 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) 265 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
@@ -272,9 +268,8 @@ void nilfs_transaction_abort(struct super_block *sb)
272 268
273void nilfs_relax_pressure_in_lock(struct super_block *sb) 269void nilfs_relax_pressure_in_lock(struct super_block *sb)
274{ 270{
275 struct nilfs_sb_info *sbi = NILFS_SB(sb); 271 struct the_nilfs *nilfs = sb->s_fs_info;
276 struct nilfs_sc_info *sci = NILFS_SC(sbi); 272 struct nilfs_sc_info *sci = nilfs->ns_writer;
277 struct the_nilfs *nilfs = sbi->s_nilfs;
278 273
279 if (!sci || !sci->sc_flush_request) 274 if (!sci || !sci->sc_flush_request)
280 return; 275 return;
@@ -294,11 +289,13 @@ void nilfs_relax_pressure_in_lock(struct super_block *sb)
294 downgrade_write(&nilfs->ns_segctor_sem); 289 downgrade_write(&nilfs->ns_segctor_sem);
295} 290}
296 291
297static void nilfs_transaction_lock(struct nilfs_sb_info *sbi, 292static void nilfs_transaction_lock(struct super_block *sb,
298 struct nilfs_transaction_info *ti, 293 struct nilfs_transaction_info *ti,
299 int gcflag) 294 int gcflag)
300{ 295{
301 struct nilfs_transaction_info *cur_ti = current->journal_info; 296 struct nilfs_transaction_info *cur_ti = current->journal_info;
297 struct the_nilfs *nilfs = sb->s_fs_info;
298 struct nilfs_sc_info *sci = nilfs->ns_writer;
302 299
303 WARN_ON(cur_ti); 300 WARN_ON(cur_ti);
304 ti->ti_flags = NILFS_TI_WRITER; 301 ti->ti_flags = NILFS_TI_WRITER;
@@ -309,30 +306,31 @@ static void nilfs_transaction_lock(struct nilfs_sb_info *sbi,
309 current->journal_info = ti; 306 current->journal_info = ti;
310 307
311 for (;;) { 308 for (;;) {
312 down_write(&sbi->s_nilfs->ns_segctor_sem); 309 down_write(&nilfs->ns_segctor_sem);
313 if (!test_bit(NILFS_SC_PRIOR_FLUSH, &NILFS_SC(sbi)->sc_flags)) 310 if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags))
314 break; 311 break;
315 312
316 nilfs_segctor_do_immediate_flush(NILFS_SC(sbi)); 313 nilfs_segctor_do_immediate_flush(sci);
317 314
318 up_write(&sbi->s_nilfs->ns_segctor_sem); 315 up_write(&nilfs->ns_segctor_sem);
319 yield(); 316 yield();
320 } 317 }
321 if (gcflag) 318 if (gcflag)
322 ti->ti_flags |= NILFS_TI_GC; 319 ti->ti_flags |= NILFS_TI_GC;
323} 320}
324 321
325static void nilfs_transaction_unlock(struct nilfs_sb_info *sbi) 322static void nilfs_transaction_unlock(struct super_block *sb)
326{ 323{
327 struct nilfs_transaction_info *ti = current->journal_info; 324 struct nilfs_transaction_info *ti = current->journal_info;
325 struct the_nilfs *nilfs = sb->s_fs_info;
328 326
329 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); 327 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
330 BUG_ON(ti->ti_count > 0); 328 BUG_ON(ti->ti_count > 0);
331 329
332 up_write(&sbi->s_nilfs->ns_segctor_sem); 330 up_write(&nilfs->ns_segctor_sem);
333 current->journal_info = ti->ti_save; 331 current->journal_info = ti->ti_save;
334 if (!list_empty(&ti->ti_garbage)) 332 if (!list_empty(&ti->ti_garbage))
335 nilfs_dispose_list(sbi, &ti->ti_garbage, 0); 333 nilfs_dispose_list(nilfs, &ti->ti_garbage, 0);
336} 334}
337 335
338static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, 336static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -430,7 +428,8 @@ static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci,
430 nilfs_segctor_map_segsum_entry( 428 nilfs_segctor_map_segsum_entry(
431 sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo)); 429 sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo));
432 430
433 if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) 431 if (NILFS_I(inode)->i_root &&
432 !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
434 set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); 433 set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
435 /* skip finfo */ 434 /* skip finfo */
436} 435}
@@ -713,7 +712,7 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
713 } 712 }
714} 713}
715 714
716static void nilfs_dispose_list(struct nilfs_sb_info *sbi, 715static void nilfs_dispose_list(struct the_nilfs *nilfs,
717 struct list_head *head, int force) 716 struct list_head *head, int force)
718{ 717{
719 struct nilfs_inode_info *ii, *n; 718 struct nilfs_inode_info *ii, *n;
@@ -721,7 +720,7 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
721 unsigned nv = 0; 720 unsigned nv = 0;
722 721
723 while (!list_empty(head)) { 722 while (!list_empty(head)) {
724 spin_lock(&sbi->s_inode_lock); 723 spin_lock(&nilfs->ns_inode_lock);
725 list_for_each_entry_safe(ii, n, head, i_dirty) { 724 list_for_each_entry_safe(ii, n, head, i_dirty) {
726 list_del_init(&ii->i_dirty); 725 list_del_init(&ii->i_dirty);
727 if (force) { 726 if (force) {
@@ -732,14 +731,14 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
732 } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) { 731 } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) {
733 set_bit(NILFS_I_QUEUED, &ii->i_state); 732 set_bit(NILFS_I_QUEUED, &ii->i_state);
734 list_add_tail(&ii->i_dirty, 733 list_add_tail(&ii->i_dirty,
735 &sbi->s_dirty_files); 734 &nilfs->ns_dirty_files);
736 continue; 735 continue;
737 } 736 }
738 ivec[nv++] = ii; 737 ivec[nv++] = ii;
739 if (nv == SC_N_INODEVEC) 738 if (nv == SC_N_INODEVEC)
740 break; 739 break;
741 } 740 }
742 spin_unlock(&sbi->s_inode_lock); 741 spin_unlock(&nilfs->ns_inode_lock);
743 742
744 for (pii = ivec; nv > 0; pii++, nv--) 743 for (pii = ivec; nv > 0; pii++, nv--)
745 iput(&(*pii)->vfs_inode); 744 iput(&(*pii)->vfs_inode);
@@ -772,24 +771,23 @@ static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
772 771
773static int nilfs_segctor_confirm(struct nilfs_sc_info *sci) 772static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
774{ 773{
775 struct nilfs_sb_info *sbi = sci->sc_sbi; 774 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
776 int ret = 0; 775 int ret = 0;
777 776
778 if (nilfs_test_metadata_dirty(sbi->s_nilfs, sci->sc_root)) 777 if (nilfs_test_metadata_dirty(nilfs, sci->sc_root))
779 set_bit(NILFS_SC_DIRTY, &sci->sc_flags); 778 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
780 779
781 spin_lock(&sbi->s_inode_lock); 780 spin_lock(&nilfs->ns_inode_lock);
782 if (list_empty(&sbi->s_dirty_files) && nilfs_segctor_clean(sci)) 781 if (list_empty(&nilfs->ns_dirty_files) && nilfs_segctor_clean(sci))
783 ret++; 782 ret++;
784 783
785 spin_unlock(&sbi->s_inode_lock); 784 spin_unlock(&nilfs->ns_inode_lock);
786 return ret; 785 return ret;
787} 786}
788 787
789static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) 788static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
790{ 789{
791 struct nilfs_sb_info *sbi = sci->sc_sbi; 790 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
792 struct the_nilfs *nilfs = sbi->s_nilfs;
793 791
794 nilfs_mdt_clear_dirty(sci->sc_root->ifile); 792 nilfs_mdt_clear_dirty(sci->sc_root->ifile);
795 nilfs_mdt_clear_dirty(nilfs->ns_cpfile); 793 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
@@ -799,7 +797,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
799 797
800static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) 798static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
801{ 799{
802 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 800 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
803 struct buffer_head *bh_cp; 801 struct buffer_head *bh_cp;
804 struct nilfs_checkpoint *raw_cp; 802 struct nilfs_checkpoint *raw_cp;
805 int err; 803 int err;
@@ -823,8 +821,7 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
823 821
824static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) 822static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
825{ 823{
826 struct nilfs_sb_info *sbi = sci->sc_sbi; 824 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
827 struct the_nilfs *nilfs = sbi->s_nilfs;
828 struct buffer_head *bh_cp; 825 struct buffer_head *bh_cp;
829 struct nilfs_checkpoint *raw_cp; 826 struct nilfs_checkpoint *raw_cp;
830 int err; 827 int err;
@@ -1048,8 +1045,7 @@ static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci,
1048 1045
1049static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) 1046static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1050{ 1047{
1051 struct nilfs_sb_info *sbi = sci->sc_sbi; 1048 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
1052 struct the_nilfs *nilfs = sbi->s_nilfs;
1053 struct list_head *head; 1049 struct list_head *head;
1054 struct nilfs_inode_info *ii; 1050 struct nilfs_inode_info *ii;
1055 size_t ndone; 1051 size_t ndone;
@@ -1858,7 +1854,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1858{ 1854{
1859 struct nilfs_segment_buffer *segbuf; 1855 struct nilfs_segment_buffer *segbuf;
1860 struct page *bd_page = NULL, *fs_page = NULL; 1856 struct page *bd_page = NULL, *fs_page = NULL;
1861 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 1857 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
1862 int update_sr = false; 1858 int update_sr = false;
1863 1859
1864 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { 1860 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
@@ -1962,30 +1958,30 @@ static int nilfs_segctor_wait(struct nilfs_sc_info *sci)
1962 return ret; 1958 return ret;
1963} 1959}
1964 1960
1965static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci, 1961static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
1966 struct nilfs_sb_info *sbi) 1962 struct the_nilfs *nilfs)
1967{ 1963{
1968 struct nilfs_inode_info *ii, *n; 1964 struct nilfs_inode_info *ii, *n;
1969 struct inode *ifile = sci->sc_root->ifile; 1965 struct inode *ifile = sci->sc_root->ifile;
1970 1966
1971 spin_lock(&sbi->s_inode_lock); 1967 spin_lock(&nilfs->ns_inode_lock);
1972 retry: 1968 retry:
1973 list_for_each_entry_safe(ii, n, &sbi->s_dirty_files, i_dirty) { 1969 list_for_each_entry_safe(ii, n, &nilfs->ns_dirty_files, i_dirty) {
1974 if (!ii->i_bh) { 1970 if (!ii->i_bh) {
1975 struct buffer_head *ibh; 1971 struct buffer_head *ibh;
1976 int err; 1972 int err;
1977 1973
1978 spin_unlock(&sbi->s_inode_lock); 1974 spin_unlock(&nilfs->ns_inode_lock);
1979 err = nilfs_ifile_get_inode_block( 1975 err = nilfs_ifile_get_inode_block(
1980 ifile, ii->vfs_inode.i_ino, &ibh); 1976 ifile, ii->vfs_inode.i_ino, &ibh);
1981 if (unlikely(err)) { 1977 if (unlikely(err)) {
1982 nilfs_warning(sbi->s_super, __func__, 1978 nilfs_warning(sci->sc_super, __func__,
1983 "failed to get inode block.\n"); 1979 "failed to get inode block.\n");
1984 return err; 1980 return err;
1985 } 1981 }
1986 nilfs_mdt_mark_buffer_dirty(ibh); 1982 nilfs_mdt_mark_buffer_dirty(ibh);
1987 nilfs_mdt_mark_dirty(ifile); 1983 nilfs_mdt_mark_dirty(ifile);
1988 spin_lock(&sbi->s_inode_lock); 1984 spin_lock(&nilfs->ns_inode_lock);
1989 if (likely(!ii->i_bh)) 1985 if (likely(!ii->i_bh))
1990 ii->i_bh = ibh; 1986 ii->i_bh = ibh;
1991 else 1987 else
@@ -1998,18 +1994,18 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
1998 list_del(&ii->i_dirty); 1994 list_del(&ii->i_dirty);
1999 list_add_tail(&ii->i_dirty, &sci->sc_dirty_files); 1995 list_add_tail(&ii->i_dirty, &sci->sc_dirty_files);
2000 } 1996 }
2001 spin_unlock(&sbi->s_inode_lock); 1997 spin_unlock(&nilfs->ns_inode_lock);
2002 1998
2003 return 0; 1999 return 0;
2004} 2000}
2005 2001
2006static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci, 2002static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
2007 struct nilfs_sb_info *sbi) 2003 struct the_nilfs *nilfs)
2008{ 2004{
2009 struct nilfs_transaction_info *ti = current->journal_info; 2005 struct nilfs_transaction_info *ti = current->journal_info;
2010 struct nilfs_inode_info *ii, *n; 2006 struct nilfs_inode_info *ii, *n;
2011 2007
2012 spin_lock(&sbi->s_inode_lock); 2008 spin_lock(&nilfs->ns_inode_lock);
2013 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { 2009 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
2014 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) || 2010 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
2015 test_bit(NILFS_I_DIRTY, &ii->i_state)) 2011 test_bit(NILFS_I_DIRTY, &ii->i_state))
@@ -2021,7 +2017,7 @@ static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2021 list_del(&ii->i_dirty); 2017 list_del(&ii->i_dirty);
2022 list_add_tail(&ii->i_dirty, &ti->ti_garbage); 2018 list_add_tail(&ii->i_dirty, &ti->ti_garbage);
2023 } 2019 }
2024 spin_unlock(&sbi->s_inode_lock); 2020 spin_unlock(&nilfs->ns_inode_lock);
2025} 2021}
2026 2022
2027/* 2023/*
@@ -2029,15 +2025,14 @@ static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2029 */ 2025 */
2030static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) 2026static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2031{ 2027{
2032 struct nilfs_sb_info *sbi = sci->sc_sbi; 2028 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
2033 struct the_nilfs *nilfs = sbi->s_nilfs;
2034 struct page *failed_page; 2029 struct page *failed_page;
2035 int err; 2030 int err;
2036 2031
2037 sci->sc_stage.scnt = NILFS_ST_INIT; 2032 sci->sc_stage.scnt = NILFS_ST_INIT;
2038 sci->sc_cno = nilfs->ns_cno; 2033 sci->sc_cno = nilfs->ns_cno;
2039 2034
2040 err = nilfs_segctor_check_in_files(sci, sbi); 2035 err = nilfs_segctor_collect_dirty_files(sci, nilfs);
2041 if (unlikely(err)) 2036 if (unlikely(err))
2042 goto out; 2037 goto out;
2043 2038
@@ -2115,7 +2110,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2115 } while (sci->sc_stage.scnt != NILFS_ST_DONE); 2110 } while (sci->sc_stage.scnt != NILFS_ST_DONE);
2116 2111
2117 out: 2112 out:
2118 nilfs_segctor_check_out_files(sci, sbi); 2113 nilfs_segctor_drop_written_files(sci, nilfs);
2119 return err; 2114 return err;
2120 2115
2121 failed_to_write: 2116 failed_to_write:
@@ -2168,8 +2163,8 @@ static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
2168 */ 2163 */
2169void nilfs_flush_segment(struct super_block *sb, ino_t ino) 2164void nilfs_flush_segment(struct super_block *sb, ino_t ino)
2170{ 2165{
2171 struct nilfs_sb_info *sbi = NILFS_SB(sb); 2166 struct the_nilfs *nilfs = sb->s_fs_info;
2172 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2167 struct nilfs_sc_info *sci = nilfs->ns_writer;
2173 2168
2174 if (!sci || nilfs_doing_construction()) 2169 if (!sci || nilfs_doing_construction())
2175 return; 2170 return;
@@ -2258,8 +2253,8 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
2258 */ 2253 */
2259int nilfs_construct_segment(struct super_block *sb) 2254int nilfs_construct_segment(struct super_block *sb)
2260{ 2255{
2261 struct nilfs_sb_info *sbi = NILFS_SB(sb); 2256 struct the_nilfs *nilfs = sb->s_fs_info;
2262 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2257 struct nilfs_sc_info *sci = nilfs->ns_writer;
2263 struct nilfs_transaction_info *ti; 2258 struct nilfs_transaction_info *ti;
2264 int err; 2259 int err;
2265 2260
@@ -2296,8 +2291,8 @@ int nilfs_construct_segment(struct super_block *sb)
2296int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, 2291int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2297 loff_t start, loff_t end) 2292 loff_t start, loff_t end)
2298{ 2293{
2299 struct nilfs_sb_info *sbi = NILFS_SB(sb); 2294 struct the_nilfs *nilfs = sb->s_fs_info;
2300 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2295 struct nilfs_sc_info *sci = nilfs->ns_writer;
2301 struct nilfs_inode_info *ii; 2296 struct nilfs_inode_info *ii;
2302 struct nilfs_transaction_info ti; 2297 struct nilfs_transaction_info ti;
2303 int err = 0; 2298 int err = 0;
@@ -2305,33 +2300,33 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2305 if (!sci) 2300 if (!sci)
2306 return -EROFS; 2301 return -EROFS;
2307 2302
2308 nilfs_transaction_lock(sbi, &ti, 0); 2303 nilfs_transaction_lock(sb, &ti, 0);
2309 2304
2310 ii = NILFS_I(inode); 2305 ii = NILFS_I(inode);
2311 if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) || 2306 if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) ||
2312 nilfs_test_opt(sbi, STRICT_ORDER) || 2307 nilfs_test_opt(nilfs, STRICT_ORDER) ||
2313 test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || 2308 test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
2314 nilfs_discontinued(sbi->s_nilfs)) { 2309 nilfs_discontinued(nilfs)) {
2315 nilfs_transaction_unlock(sbi); 2310 nilfs_transaction_unlock(sb);
2316 err = nilfs_segctor_sync(sci); 2311 err = nilfs_segctor_sync(sci);
2317 return err; 2312 return err;
2318 } 2313 }
2319 2314
2320 spin_lock(&sbi->s_inode_lock); 2315 spin_lock(&nilfs->ns_inode_lock);
2321 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && 2316 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
2322 !test_bit(NILFS_I_BUSY, &ii->i_state)) { 2317 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
2323 spin_unlock(&sbi->s_inode_lock); 2318 spin_unlock(&nilfs->ns_inode_lock);
2324 nilfs_transaction_unlock(sbi); 2319 nilfs_transaction_unlock(sb);
2325 return 0; 2320 return 0;
2326 } 2321 }
2327 spin_unlock(&sbi->s_inode_lock); 2322 spin_unlock(&nilfs->ns_inode_lock);
2328 sci->sc_dsync_inode = ii; 2323 sci->sc_dsync_inode = ii;
2329 sci->sc_dsync_start = start; 2324 sci->sc_dsync_start = start;
2330 sci->sc_dsync_end = end; 2325 sci->sc_dsync_end = end;
2331 2326
2332 err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC); 2327 err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC);
2333 2328
2334 nilfs_transaction_unlock(sbi); 2329 nilfs_transaction_unlock(sb);
2335 return err; 2330 return err;
2336} 2331}
2337 2332
@@ -2387,8 +2382,7 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
2387 */ 2382 */
2388static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) 2383static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2389{ 2384{
2390 struct nilfs_sb_info *sbi = sci->sc_sbi; 2385 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
2391 struct the_nilfs *nilfs = sbi->s_nilfs;
2392 struct nilfs_super_block **sbp; 2386 struct nilfs_super_block **sbp;
2393 int err = 0; 2387 int err = 0;
2394 2388
@@ -2406,11 +2400,12 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2406 nilfs_discontinued(nilfs)) { 2400 nilfs_discontinued(nilfs)) {
2407 down_write(&nilfs->ns_sem); 2401 down_write(&nilfs->ns_sem);
2408 err = -EIO; 2402 err = -EIO;
2409 sbp = nilfs_prepare_super(sbi, 2403 sbp = nilfs_prepare_super(sci->sc_super,
2410 nilfs_sb_will_flip(nilfs)); 2404 nilfs_sb_will_flip(nilfs));
2411 if (likely(sbp)) { 2405 if (likely(sbp)) {
2412 nilfs_set_log_cursor(sbp[0], nilfs); 2406 nilfs_set_log_cursor(sbp[0], nilfs);
2413 err = nilfs_commit_super(sbi, NILFS_SB_COMMIT); 2407 err = nilfs_commit_super(sci->sc_super,
2408 NILFS_SB_COMMIT);
2414 } 2409 }
2415 up_write(&nilfs->ns_sem); 2410 up_write(&nilfs->ns_sem);
2416 } 2411 }
@@ -2442,16 +2437,15 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
2442int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, 2437int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2443 void **kbufs) 2438 void **kbufs)
2444{ 2439{
2445 struct nilfs_sb_info *sbi = NILFS_SB(sb); 2440 struct the_nilfs *nilfs = sb->s_fs_info;
2446 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2441 struct nilfs_sc_info *sci = nilfs->ns_writer;
2447 struct the_nilfs *nilfs = sbi->s_nilfs;
2448 struct nilfs_transaction_info ti; 2442 struct nilfs_transaction_info ti;
2449 int err; 2443 int err;
2450 2444
2451 if (unlikely(!sci)) 2445 if (unlikely(!sci))
2452 return -EROFS; 2446 return -EROFS;
2453 2447
2454 nilfs_transaction_lock(sbi, &ti, 1); 2448 nilfs_transaction_lock(sb, &ti, 1);
2455 2449
2456 err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat); 2450 err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);
2457 if (unlikely(err)) 2451 if (unlikely(err))
@@ -2479,14 +2473,14 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2479 set_current_state(TASK_INTERRUPTIBLE); 2473 set_current_state(TASK_INTERRUPTIBLE);
2480 schedule_timeout(sci->sc_interval); 2474 schedule_timeout(sci->sc_interval);
2481 } 2475 }
2482 if (nilfs_test_opt(sbi, DISCARD)) { 2476 if (nilfs_test_opt(nilfs, DISCARD)) {
2483 int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs, 2477 int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
2484 sci->sc_nfreesegs); 2478 sci->sc_nfreesegs);
2485 if (ret) { 2479 if (ret) {
2486 printk(KERN_WARNING 2480 printk(KERN_WARNING
2487 "NILFS warning: error %d on discard request, " 2481 "NILFS warning: error %d on discard request, "
2488 "turning discards off for the device\n", ret); 2482 "turning discards off for the device\n", ret);
2489 nilfs_clear_opt(sbi, DISCARD); 2483 nilfs_clear_opt(nilfs, DISCARD);
2490 } 2484 }
2491 } 2485 }
2492 2486
@@ -2494,16 +2488,15 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2494 sci->sc_freesegs = NULL; 2488 sci->sc_freesegs = NULL;
2495 sci->sc_nfreesegs = 0; 2489 sci->sc_nfreesegs = 0;
2496 nilfs_mdt_clear_shadow_map(nilfs->ns_dat); 2490 nilfs_mdt_clear_shadow_map(nilfs->ns_dat);
2497 nilfs_transaction_unlock(sbi); 2491 nilfs_transaction_unlock(sb);
2498 return err; 2492 return err;
2499} 2493}
2500 2494
2501static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode) 2495static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2502{ 2496{
2503 struct nilfs_sb_info *sbi = sci->sc_sbi;
2504 struct nilfs_transaction_info ti; 2497 struct nilfs_transaction_info ti;
2505 2498
2506 nilfs_transaction_lock(sbi, &ti, 0); 2499 nilfs_transaction_lock(sci->sc_super, &ti, 0);
2507 nilfs_segctor_construct(sci, mode); 2500 nilfs_segctor_construct(sci, mode);
2508 2501
2509 /* 2502 /*
@@ -2514,7 +2507,7 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2514 if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) 2507 if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags))
2515 nilfs_segctor_start_timer(sci); 2508 nilfs_segctor_start_timer(sci);
2516 2509
2517 nilfs_transaction_unlock(sbi); 2510 nilfs_transaction_unlock(sci->sc_super);
2518} 2511}
2519 2512
2520static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) 2513static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
@@ -2560,7 +2553,7 @@ static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
2560static int nilfs_segctor_thread(void *arg) 2553static int nilfs_segctor_thread(void *arg)
2561{ 2554{
2562 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; 2555 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2563 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 2556 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
2564 int timeout = 0; 2557 int timeout = 0;
2565 2558
2566 sci->sc_timer.data = (unsigned long)current; 2559 sci->sc_timer.data = (unsigned long)current;
@@ -2671,17 +2664,17 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2671/* 2664/*
2672 * Setup & clean-up functions 2665 * Setup & clean-up functions
2673 */ 2666 */
2674static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi, 2667static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
2675 struct nilfs_root *root) 2668 struct nilfs_root *root)
2676{ 2669{
2670 struct the_nilfs *nilfs = sb->s_fs_info;
2677 struct nilfs_sc_info *sci; 2671 struct nilfs_sc_info *sci;
2678 2672
2679 sci = kzalloc(sizeof(*sci), GFP_KERNEL); 2673 sci = kzalloc(sizeof(*sci), GFP_KERNEL);
2680 if (!sci) 2674 if (!sci)
2681 return NULL; 2675 return NULL;
2682 2676
2683 sci->sc_sbi = sbi; 2677 sci->sc_super = sb;
2684 sci->sc_super = sbi->s_super;
2685 2678
2686 nilfs_get_root(root); 2679 nilfs_get_root(root);
2687 sci->sc_root = root; 2680 sci->sc_root = root;
@@ -2701,10 +2694,10 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi,
2701 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; 2694 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
2702 sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK; 2695 sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK;
2703 2696
2704 if (sbi->s_interval) 2697 if (nilfs->ns_interval)
2705 sci->sc_interval = sbi->s_interval; 2698 sci->sc_interval = nilfs->ns_interval;
2706 if (sbi->s_watermark) 2699 if (nilfs->ns_watermark)
2707 sci->sc_watermark = sbi->s_watermark; 2700 sci->sc_watermark = nilfs->ns_watermark;
2708 return sci; 2701 return sci;
2709} 2702}
2710 2703
@@ -2715,12 +2708,11 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2715 /* The segctord thread was stopped and its timer was removed. 2708 /* The segctord thread was stopped and its timer was removed.
2716 But some tasks remain. */ 2709 But some tasks remain. */
2717 do { 2710 do {
2718 struct nilfs_sb_info *sbi = sci->sc_sbi;
2719 struct nilfs_transaction_info ti; 2711 struct nilfs_transaction_info ti;
2720 2712
2721 nilfs_transaction_lock(sbi, &ti, 0); 2713 nilfs_transaction_lock(sci->sc_super, &ti, 0);
2722 ret = nilfs_segctor_construct(sci, SC_LSEG_SR); 2714 ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
2723 nilfs_transaction_unlock(sbi); 2715 nilfs_transaction_unlock(sci->sc_super);
2724 2716
2725 } while (ret && retrycount-- > 0); 2717 } while (ret && retrycount-- > 0);
2726} 2718}
@@ -2735,10 +2727,10 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2735 */ 2727 */
2736static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) 2728static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2737{ 2729{
2738 struct nilfs_sb_info *sbi = sci->sc_sbi; 2730 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
2739 int flag; 2731 int flag;
2740 2732
2741 up_write(&sbi->s_nilfs->ns_segctor_sem); 2733 up_write(&nilfs->ns_segctor_sem);
2742 2734
2743 spin_lock(&sci->sc_state_lock); 2735 spin_lock(&sci->sc_state_lock);
2744 nilfs_segctor_kill_thread(sci); 2736 nilfs_segctor_kill_thread(sci);
@@ -2752,9 +2744,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2752 WARN_ON(!list_empty(&sci->sc_copied_buffers)); 2744 WARN_ON(!list_empty(&sci->sc_copied_buffers));
2753 2745
2754 if (!list_empty(&sci->sc_dirty_files)) { 2746 if (!list_empty(&sci->sc_dirty_files)) {
2755 nilfs_warning(sbi->s_super, __func__, 2747 nilfs_warning(sci->sc_super, __func__,
2756 "dirty file(s) after the final construction\n"); 2748 "dirty file(s) after the final construction\n");
2757 nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1); 2749 nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
2758 } 2750 }
2759 2751
2760 WARN_ON(!list_empty(&sci->sc_segbufs)); 2752 WARN_ON(!list_empty(&sci->sc_segbufs));
@@ -2762,79 +2754,78 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2762 2754
2763 nilfs_put_root(sci->sc_root); 2755 nilfs_put_root(sci->sc_root);
2764 2756
2765 down_write(&sbi->s_nilfs->ns_segctor_sem); 2757 down_write(&nilfs->ns_segctor_sem);
2766 2758
2767 del_timer_sync(&sci->sc_timer); 2759 del_timer_sync(&sci->sc_timer);
2768 kfree(sci); 2760 kfree(sci);
2769} 2761}
2770 2762
2771/** 2763/**
2772 * nilfs_attach_segment_constructor - attach a segment constructor 2764 * nilfs_attach_log_writer - attach log writer
2773 * @sbi: nilfs_sb_info 2765 * @sb: super block instance
2774 * @root: root object of the current filesystem tree 2766 * @root: root object of the current filesystem tree
2775 * 2767 *
2776 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, 2768 * This allocates a log writer object, initializes it, and starts the
2777 * initializes it, and starts the segment constructor. 2769 * log writer.
2778 * 2770 *
2779 * Return Value: On success, 0 is returned. On error, one of the following 2771 * Return Value: On success, 0 is returned. On error, one of the following
2780 * negative error code is returned. 2772 * negative error code is returned.
2781 * 2773 *
2782 * %-ENOMEM - Insufficient memory available. 2774 * %-ENOMEM - Insufficient memory available.
2783 */ 2775 */
2784int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi, 2776int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
2785 struct nilfs_root *root)
2786{ 2777{
2778 struct the_nilfs *nilfs = sb->s_fs_info;
2787 int err; 2779 int err;
2788 2780
2789 if (NILFS_SC(sbi)) { 2781 if (nilfs->ns_writer) {
2790 /* 2782 /*
2791 * This happens if the filesystem was remounted 2783 * This happens if the filesystem was remounted
2792 * read/write after nilfs_error degenerated it into a 2784 * read/write after nilfs_error degenerated it into a
2793 * read-only mount. 2785 * read-only mount.
2794 */ 2786 */
2795 nilfs_detach_segment_constructor(sbi); 2787 nilfs_detach_log_writer(sb);
2796 } 2788 }
2797 2789
2798 sbi->s_sc_info = nilfs_segctor_new(sbi, root); 2790 nilfs->ns_writer = nilfs_segctor_new(sb, root);
2799 if (!sbi->s_sc_info) 2791 if (!nilfs->ns_writer)
2800 return -ENOMEM; 2792 return -ENOMEM;
2801 2793
2802 err = nilfs_segctor_start_thread(NILFS_SC(sbi)); 2794 err = nilfs_segctor_start_thread(nilfs->ns_writer);
2803 if (err) { 2795 if (err) {
2804 kfree(sbi->s_sc_info); 2796 kfree(nilfs->ns_writer);
2805 sbi->s_sc_info = NULL; 2797 nilfs->ns_writer = NULL;
2806 } 2798 }
2807 return err; 2799 return err;
2808} 2800}
2809 2801
2810/** 2802/**
2811 * nilfs_detach_segment_constructor - destroy the segment constructor 2803 * nilfs_detach_log_writer - destroy log writer
2812 * @sbi: nilfs_sb_info 2804 * @sb: super block instance
2813 * 2805 *
2814 * nilfs_detach_segment_constructor() kills the segment constructor daemon, 2806 * This kills log writer daemon, frees the log writer object, and
2815 * frees the struct nilfs_sc_info, and destroy the dirty file list. 2807 * destroys list of dirty files.
2816 */ 2808 */
2817void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi) 2809void nilfs_detach_log_writer(struct super_block *sb)
2818{ 2810{
2819 struct the_nilfs *nilfs = sbi->s_nilfs; 2811 struct the_nilfs *nilfs = sb->s_fs_info;
2820 LIST_HEAD(garbage_list); 2812 LIST_HEAD(garbage_list);
2821 2813
2822 down_write(&nilfs->ns_segctor_sem); 2814 down_write(&nilfs->ns_segctor_sem);
2823 if (NILFS_SC(sbi)) { 2815 if (nilfs->ns_writer) {
2824 nilfs_segctor_destroy(NILFS_SC(sbi)); 2816 nilfs_segctor_destroy(nilfs->ns_writer);
2825 sbi->s_sc_info = NULL; 2817 nilfs->ns_writer = NULL;
2826 } 2818 }
2827 2819
2828 /* Force to free the list of dirty files */ 2820 /* Force to free the list of dirty files */
2829 spin_lock(&sbi->s_inode_lock); 2821 spin_lock(&nilfs->ns_inode_lock);
2830 if (!list_empty(&sbi->s_dirty_files)) { 2822 if (!list_empty(&nilfs->ns_dirty_files)) {
2831 list_splice_init(&sbi->s_dirty_files, &garbage_list); 2823 list_splice_init(&nilfs->ns_dirty_files, &garbage_list);
2832 nilfs_warning(sbi->s_super, __func__, 2824 nilfs_warning(sb, __func__,
2833 "Non empty dirty list after the last " 2825 "Hit dirty file after stopped log writer\n");
2834 "segment construction\n"); 2826 }
2835 } 2827 spin_unlock(&nilfs->ns_inode_lock);
2836 spin_unlock(&sbi->s_inode_lock);
2837 up_write(&nilfs->ns_segctor_sem); 2828 up_write(&nilfs->ns_segctor_sem);
2838 2829
2839 nilfs_dispose_list(sbi, &garbage_list, 1); 2830 nilfs_dispose_list(nilfs, &garbage_list, 1);
2840} 2831}
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index cd8056e7cbed..6c02a86745fb 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -27,7 +27,7 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h> 29#include <linux/nilfs2_fs.h>
30#include "sb.h" 30#include "nilfs.h"
31 31
32struct nilfs_root; 32struct nilfs_root;
33 33
@@ -88,7 +88,6 @@ struct nilfs_segsum_pointer {
88/** 88/**
89 * struct nilfs_sc_info - Segment constructor information 89 * struct nilfs_sc_info - Segment constructor information
90 * @sc_super: Back pointer to super_block struct 90 * @sc_super: Back pointer to super_block struct
91 * @sc_sbi: Back pointer to nilfs_sb_info struct
92 * @sc_root: root object of the current filesystem tree 91 * @sc_root: root object of the current filesystem tree
93 * @sc_nblk_inc: Block count of current generation 92 * @sc_nblk_inc: Block count of current generation
94 * @sc_dirty_files: List of files to be written 93 * @sc_dirty_files: List of files to be written
@@ -131,7 +130,6 @@ struct nilfs_segsum_pointer {
131 */ 130 */
132struct nilfs_sc_info { 131struct nilfs_sc_info {
133 struct super_block *sc_super; 132 struct super_block *sc_super;
134 struct nilfs_sb_info *sc_sbi;
135 struct nilfs_root *sc_root; 133 struct nilfs_root *sc_root;
136 134
137 unsigned long sc_nblk_inc; 135 unsigned long sc_nblk_inc;
@@ -235,18 +233,16 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
235extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, 233extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
236 void **); 234 void **);
237 235
238int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi, 236int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root);
239 struct nilfs_root *root); 237void nilfs_detach_log_writer(struct super_block *sb);
240extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
241 238
242/* recovery.c */ 239/* recovery.c */
243extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t, 240extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t,
244 struct buffer_head **, int); 241 struct buffer_head **, int);
245extern int nilfs_search_super_root(struct the_nilfs *, 242extern int nilfs_search_super_root(struct the_nilfs *,
246 struct nilfs_recovery_info *); 243 struct nilfs_recovery_info *);
247extern int nilfs_salvage_orphan_logs(struct the_nilfs *, 244int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, struct super_block *sb,
248 struct nilfs_sb_info *, 245 struct nilfs_recovery_info *ri);
249 struct nilfs_recovery_info *);
250extern void nilfs_dispose_segment_list(struct list_head *); 246extern void nilfs_dispose_segment_list(struct list_head *);
251 247
252#endif /* _NILFS_SEGMENT_H */ 248#endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 58fd707174e1..062cca065195 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -43,7 +43,6 @@
43#include <linux/init.h> 43#include <linux/init.h>
44#include <linux/blkdev.h> 44#include <linux/blkdev.h>
45#include <linux/parser.h> 45#include <linux/parser.h>
46#include <linux/random.h>
47#include <linux/crc32.h> 46#include <linux/crc32.h>
48#include <linux/vfs.h> 47#include <linux/vfs.h>
49#include <linux/writeback.h> 48#include <linux/writeback.h>
@@ -72,23 +71,23 @@ struct kmem_cache *nilfs_transaction_cachep;
72struct kmem_cache *nilfs_segbuf_cachep; 71struct kmem_cache *nilfs_segbuf_cachep;
73struct kmem_cache *nilfs_btree_path_cache; 72struct kmem_cache *nilfs_btree_path_cache;
74 73
75static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount); 74static int nilfs_setup_super(struct super_block *sb, int is_mount);
76static int nilfs_remount(struct super_block *sb, int *flags, char *data); 75static int nilfs_remount(struct super_block *sb, int *flags, char *data);
77 76
78static void nilfs_set_error(struct nilfs_sb_info *sbi) 77static void nilfs_set_error(struct super_block *sb)
79{ 78{
80 struct the_nilfs *nilfs = sbi->s_nilfs; 79 struct the_nilfs *nilfs = sb->s_fs_info;
81 struct nilfs_super_block **sbp; 80 struct nilfs_super_block **sbp;
82 81
83 down_write(&nilfs->ns_sem); 82 down_write(&nilfs->ns_sem);
84 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { 83 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
85 nilfs->ns_mount_state |= NILFS_ERROR_FS; 84 nilfs->ns_mount_state |= NILFS_ERROR_FS;
86 sbp = nilfs_prepare_super(sbi, 0); 85 sbp = nilfs_prepare_super(sb, 0);
87 if (likely(sbp)) { 86 if (likely(sbp)) {
88 sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS); 87 sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
89 if (sbp[1]) 88 if (sbp[1])
90 sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS); 89 sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
91 nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); 90 nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL);
92 } 91 }
93 } 92 }
94 up_write(&nilfs->ns_sem); 93 up_write(&nilfs->ns_sem);
@@ -109,7 +108,7 @@ static void nilfs_set_error(struct nilfs_sb_info *sbi)
109void nilfs_error(struct super_block *sb, const char *function, 108void nilfs_error(struct super_block *sb, const char *function,
110 const char *fmt, ...) 109 const char *fmt, ...)
111{ 110{
112 struct nilfs_sb_info *sbi = NILFS_SB(sb); 111 struct the_nilfs *nilfs = sb->s_fs_info;
113 struct va_format vaf; 112 struct va_format vaf;
114 va_list args; 113 va_list args;
115 114
@@ -124,15 +123,15 @@ void nilfs_error(struct super_block *sb, const char *function,
124 va_end(args); 123 va_end(args);
125 124
126 if (!(sb->s_flags & MS_RDONLY)) { 125 if (!(sb->s_flags & MS_RDONLY)) {
127 nilfs_set_error(sbi); 126 nilfs_set_error(sb);
128 127
129 if (nilfs_test_opt(sbi, ERRORS_RO)) { 128 if (nilfs_test_opt(nilfs, ERRORS_RO)) {
130 printk(KERN_CRIT "Remounting filesystem read-only\n"); 129 printk(KERN_CRIT "Remounting filesystem read-only\n");
131 sb->s_flags |= MS_RDONLY; 130 sb->s_flags |= MS_RDONLY;
132 } 131 }
133 } 132 }
134 133
135 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 134 if (nilfs_test_opt(nilfs, ERRORS_PANIC))
136 panic("NILFS (device %s): panic forced after error\n", 135 panic("NILFS (device %s): panic forced after error\n",
137 sb->s_id); 136 sb->s_id);
138} 137}
@@ -189,14 +188,14 @@ void nilfs_destroy_inode(struct inode *inode)
189 call_rcu(&inode->i_rcu, nilfs_i_callback); 188 call_rcu(&inode->i_rcu, nilfs_i_callback);
190} 189}
191 190
192static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag) 191static int nilfs_sync_super(struct super_block *sb, int flag)
193{ 192{
194 struct the_nilfs *nilfs = sbi->s_nilfs; 193 struct the_nilfs *nilfs = sb->s_fs_info;
195 int err; 194 int err;
196 195
197 retry: 196 retry:
198 set_buffer_dirty(nilfs->ns_sbh[0]); 197 set_buffer_dirty(nilfs->ns_sbh[0]);
199 if (nilfs_test_opt(sbi, BARRIER)) { 198 if (nilfs_test_opt(nilfs, BARRIER)) {
200 err = __sync_dirty_buffer(nilfs->ns_sbh[0], 199 err = __sync_dirty_buffer(nilfs->ns_sbh[0],
201 WRITE_SYNC | WRITE_FLUSH_FUA); 200 WRITE_SYNC | WRITE_FLUSH_FUA);
202 } else { 201 } else {
@@ -263,10 +262,10 @@ void nilfs_set_log_cursor(struct nilfs_super_block *sbp,
263 spin_unlock(&nilfs->ns_last_segment_lock); 262 spin_unlock(&nilfs->ns_last_segment_lock);
264} 263}
265 264
266struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi, 265struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
267 int flip) 266 int flip)
268{ 267{
269 struct the_nilfs *nilfs = sbi->s_nilfs; 268 struct the_nilfs *nilfs = sb->s_fs_info;
270 struct nilfs_super_block **sbp = nilfs->ns_sbp; 269 struct nilfs_super_block **sbp = nilfs->ns_sbp;
271 270
272 /* nilfs->ns_sem must be locked by the caller. */ 271 /* nilfs->ns_sem must be locked by the caller. */
@@ -276,7 +275,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi,
276 memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); 275 memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
277 } else { 276 } else {
278 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", 277 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
279 sbi->s_super->s_id); 278 sb->s_id);
280 return NULL; 279 return NULL;
281 } 280 }
282 } else if (sbp[1] && 281 } else if (sbp[1] &&
@@ -290,9 +289,9 @@ struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi,
290 return sbp; 289 return sbp;
291} 290}
292 291
293int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag) 292int nilfs_commit_super(struct super_block *sb, int flag)
294{ 293{
295 struct the_nilfs *nilfs = sbi->s_nilfs; 294 struct the_nilfs *nilfs = sb->s_fs_info;
296 struct nilfs_super_block **sbp = nilfs->ns_sbp; 295 struct nilfs_super_block **sbp = nilfs->ns_sbp;
297 time_t t; 296 time_t t;
298 297
@@ -312,27 +311,28 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag)
312 nilfs->ns_sbsize)); 311 nilfs->ns_sbsize));
313 } 312 }
314 clear_nilfs_sb_dirty(nilfs); 313 clear_nilfs_sb_dirty(nilfs);
315 return nilfs_sync_super(sbi, flag); 314 return nilfs_sync_super(sb, flag);
316} 315}
317 316
318/** 317/**
319 * nilfs_cleanup_super() - write filesystem state for cleanup 318 * nilfs_cleanup_super() - write filesystem state for cleanup
320 * @sbi: nilfs_sb_info to be unmounted or degraded to read-only 319 * @sb: super block instance to be unmounted or degraded to read-only
321 * 320 *
322 * This function restores state flags in the on-disk super block. 321 * This function restores state flags in the on-disk super block.
323 * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the 322 * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the
324 * filesystem was not clean previously. 323 * filesystem was not clean previously.
325 */ 324 */
326int nilfs_cleanup_super(struct nilfs_sb_info *sbi) 325int nilfs_cleanup_super(struct super_block *sb)
327{ 326{
327 struct the_nilfs *nilfs = sb->s_fs_info;
328 struct nilfs_super_block **sbp; 328 struct nilfs_super_block **sbp;
329 int flag = NILFS_SB_COMMIT; 329 int flag = NILFS_SB_COMMIT;
330 int ret = -EIO; 330 int ret = -EIO;
331 331
332 sbp = nilfs_prepare_super(sbi, 0); 332 sbp = nilfs_prepare_super(sb, 0);
333 if (sbp) { 333 if (sbp) {
334 sbp[0]->s_state = cpu_to_le16(sbi->s_nilfs->ns_mount_state); 334 sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
335 nilfs_set_log_cursor(sbp[0], sbi->s_nilfs); 335 nilfs_set_log_cursor(sbp[0], nilfs);
336 if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) { 336 if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) {
337 /* 337 /*
338 * make the "clean" flag also to the opposite 338 * make the "clean" flag also to the opposite
@@ -342,21 +342,20 @@ int nilfs_cleanup_super(struct nilfs_sb_info *sbi)
342 sbp[1]->s_state = sbp[0]->s_state; 342 sbp[1]->s_state = sbp[0]->s_state;
343 flag = NILFS_SB_COMMIT_ALL; 343 flag = NILFS_SB_COMMIT_ALL;
344 } 344 }
345 ret = nilfs_commit_super(sbi, flag); 345 ret = nilfs_commit_super(sb, flag);
346 } 346 }
347 return ret; 347 return ret;
348} 348}
349 349
350static void nilfs_put_super(struct super_block *sb) 350static void nilfs_put_super(struct super_block *sb)
351{ 351{
352 struct nilfs_sb_info *sbi = NILFS_SB(sb); 352 struct the_nilfs *nilfs = sb->s_fs_info;
353 struct the_nilfs *nilfs = sbi->s_nilfs;
354 353
355 nilfs_detach_segment_constructor(sbi); 354 nilfs_detach_log_writer(sb);
356 355
357 if (!(sb->s_flags & MS_RDONLY)) { 356 if (!(sb->s_flags & MS_RDONLY)) {
358 down_write(&nilfs->ns_sem); 357 down_write(&nilfs->ns_sem);
359 nilfs_cleanup_super(sbi); 358 nilfs_cleanup_super(sb);
360 up_write(&nilfs->ns_sem); 359 up_write(&nilfs->ns_sem);
361 } 360 }
362 361
@@ -365,15 +364,12 @@ static void nilfs_put_super(struct super_block *sb)
365 iput(nilfs->ns_dat); 364 iput(nilfs->ns_dat);
366 365
367 destroy_nilfs(nilfs); 366 destroy_nilfs(nilfs);
368 sbi->s_super = NULL;
369 sb->s_fs_info = NULL; 367 sb->s_fs_info = NULL;
370 kfree(sbi);
371} 368}
372 369
373static int nilfs_sync_fs(struct super_block *sb, int wait) 370static int nilfs_sync_fs(struct super_block *sb, int wait)
374{ 371{
375 struct nilfs_sb_info *sbi = NILFS_SB(sb); 372 struct the_nilfs *nilfs = sb->s_fs_info;
376 struct the_nilfs *nilfs = sbi->s_nilfs;
377 struct nilfs_super_block **sbp; 373 struct nilfs_super_block **sbp;
378 int err = 0; 374 int err = 0;
379 375
@@ -383,10 +379,10 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
383 379
384 down_write(&nilfs->ns_sem); 380 down_write(&nilfs->ns_sem);
385 if (nilfs_sb_dirty(nilfs)) { 381 if (nilfs_sb_dirty(nilfs)) {
386 sbp = nilfs_prepare_super(sbi, nilfs_sb_will_flip(nilfs)); 382 sbp = nilfs_prepare_super(sb, nilfs_sb_will_flip(nilfs));
387 if (likely(sbp)) { 383 if (likely(sbp)) {
388 nilfs_set_log_cursor(sbp[0], nilfs); 384 nilfs_set_log_cursor(sbp[0], nilfs);
389 nilfs_commit_super(sbi, NILFS_SB_COMMIT); 385 nilfs_commit_super(sb, NILFS_SB_COMMIT);
390 } 386 }
391 } 387 }
392 up_write(&nilfs->ns_sem); 388 up_write(&nilfs->ns_sem);
@@ -394,10 +390,10 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
394 return err; 390 return err;
395} 391}
396 392
397int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt, 393int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
398 struct nilfs_root **rootp) 394 struct nilfs_root **rootp)
399{ 395{
400 struct the_nilfs *nilfs = sbi->s_nilfs; 396 struct the_nilfs *nilfs = sb->s_fs_info;
401 struct nilfs_root *root; 397 struct nilfs_root *root;
402 struct nilfs_checkpoint *raw_cp; 398 struct nilfs_checkpoint *raw_cp;
403 struct buffer_head *bh_cp; 399 struct buffer_head *bh_cp;
@@ -426,7 +422,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
426 goto failed; 422 goto failed;
427 } 423 }
428 424
429 err = nilfs_ifile_read(sbi->s_super, root, nilfs->ns_inode_size, 425 err = nilfs_ifile_read(sb, root, nilfs->ns_inode_size,
430 &raw_cp->cp_ifile_inode, &root->ifile); 426 &raw_cp->cp_ifile_inode, &root->ifile);
431 if (err) 427 if (err)
432 goto failed_bh; 428 goto failed_bh;
@@ -450,8 +446,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
450 446
451static int nilfs_freeze(struct super_block *sb) 447static int nilfs_freeze(struct super_block *sb)
452{ 448{
453 struct nilfs_sb_info *sbi = NILFS_SB(sb); 449 struct the_nilfs *nilfs = sb->s_fs_info;
454 struct the_nilfs *nilfs = sbi->s_nilfs;
455 int err; 450 int err;
456 451
457 if (sb->s_flags & MS_RDONLY) 452 if (sb->s_flags & MS_RDONLY)
@@ -459,21 +454,20 @@ static int nilfs_freeze(struct super_block *sb)
459 454
460 /* Mark super block clean */ 455 /* Mark super block clean */
461 down_write(&nilfs->ns_sem); 456 down_write(&nilfs->ns_sem);
462 err = nilfs_cleanup_super(sbi); 457 err = nilfs_cleanup_super(sb);
463 up_write(&nilfs->ns_sem); 458 up_write(&nilfs->ns_sem);
464 return err; 459 return err;
465} 460}
466 461
467static int nilfs_unfreeze(struct super_block *sb) 462static int nilfs_unfreeze(struct super_block *sb)
468{ 463{
469 struct nilfs_sb_info *sbi = NILFS_SB(sb); 464 struct the_nilfs *nilfs = sb->s_fs_info;
470 struct the_nilfs *nilfs = sbi->s_nilfs;
471 465
472 if (sb->s_flags & MS_RDONLY) 466 if (sb->s_flags & MS_RDONLY)
473 return 0; 467 return 0;
474 468
475 down_write(&nilfs->ns_sem); 469 down_write(&nilfs->ns_sem);
476 nilfs_setup_super(sbi, false); 470 nilfs_setup_super(sb, false);
477 up_write(&nilfs->ns_sem); 471 up_write(&nilfs->ns_sem);
478 return 0; 472 return 0;
479} 473}
@@ -530,22 +524,22 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
530static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) 524static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
531{ 525{
532 struct super_block *sb = vfs->mnt_sb; 526 struct super_block *sb = vfs->mnt_sb;
533 struct nilfs_sb_info *sbi = NILFS_SB(sb); 527 struct the_nilfs *nilfs = sb->s_fs_info;
534 struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root; 528 struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root;
535 529
536 if (!nilfs_test_opt(sbi, BARRIER)) 530 if (!nilfs_test_opt(nilfs, BARRIER))
537 seq_puts(seq, ",nobarrier"); 531 seq_puts(seq, ",nobarrier");
538 if (root->cno != NILFS_CPTREE_CURRENT_CNO) 532 if (root->cno != NILFS_CPTREE_CURRENT_CNO)
539 seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno); 533 seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno);
540 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 534 if (nilfs_test_opt(nilfs, ERRORS_PANIC))
541 seq_puts(seq, ",errors=panic"); 535 seq_puts(seq, ",errors=panic");
542 if (nilfs_test_opt(sbi, ERRORS_CONT)) 536 if (nilfs_test_opt(nilfs, ERRORS_CONT))
543 seq_puts(seq, ",errors=continue"); 537 seq_puts(seq, ",errors=continue");
544 if (nilfs_test_opt(sbi, STRICT_ORDER)) 538 if (nilfs_test_opt(nilfs, STRICT_ORDER))
545 seq_puts(seq, ",order=strict"); 539 seq_puts(seq, ",order=strict");
546 if (nilfs_test_opt(sbi, NORECOVERY)) 540 if (nilfs_test_opt(nilfs, NORECOVERY))
547 seq_puts(seq, ",norecovery"); 541 seq_puts(seq, ",norecovery");
548 if (nilfs_test_opt(sbi, DISCARD)) 542 if (nilfs_test_opt(nilfs, DISCARD))
549 seq_puts(seq, ",discard"); 543 seq_puts(seq, ",discard");
550 544
551 return 0; 545 return 0;
@@ -594,7 +588,7 @@ static match_table_t tokens = {
594 588
595static int parse_options(char *options, struct super_block *sb, int is_remount) 589static int parse_options(char *options, struct super_block *sb, int is_remount)
596{ 590{
597 struct nilfs_sb_info *sbi = NILFS_SB(sb); 591 struct the_nilfs *nilfs = sb->s_fs_info;
598 char *p; 592 char *p;
599 substring_t args[MAX_OPT_ARGS]; 593 substring_t args[MAX_OPT_ARGS];
600 594
@@ -609,29 +603,29 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
609 token = match_token(p, tokens, args); 603 token = match_token(p, tokens, args);
610 switch (token) { 604 switch (token) {
611 case Opt_barrier: 605 case Opt_barrier:
612 nilfs_set_opt(sbi, BARRIER); 606 nilfs_set_opt(nilfs, BARRIER);
613 break; 607 break;
614 case Opt_nobarrier: 608 case Opt_nobarrier:
615 nilfs_clear_opt(sbi, BARRIER); 609 nilfs_clear_opt(nilfs, BARRIER);
616 break; 610 break;
617 case Opt_order: 611 case Opt_order:
618 if (strcmp(args[0].from, "relaxed") == 0) 612 if (strcmp(args[0].from, "relaxed") == 0)
619 /* Ordered data semantics */ 613 /* Ordered data semantics */
620 nilfs_clear_opt(sbi, STRICT_ORDER); 614 nilfs_clear_opt(nilfs, STRICT_ORDER);
621 else if (strcmp(args[0].from, "strict") == 0) 615 else if (strcmp(args[0].from, "strict") == 0)
622 /* Strict in-order semantics */ 616 /* Strict in-order semantics */
623 nilfs_set_opt(sbi, STRICT_ORDER); 617 nilfs_set_opt(nilfs, STRICT_ORDER);
624 else 618 else
625 return 0; 619 return 0;
626 break; 620 break;
627 case Opt_err_panic: 621 case Opt_err_panic:
628 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC); 622 nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_PANIC);
629 break; 623 break;
630 case Opt_err_ro: 624 case Opt_err_ro:
631 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO); 625 nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_RO);
632 break; 626 break;
633 case Opt_err_cont: 627 case Opt_err_cont:
634 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT); 628 nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_CONT);
635 break; 629 break;
636 case Opt_snapshot: 630 case Opt_snapshot:
637 if (is_remount) { 631 if (is_remount) {
@@ -642,13 +636,13 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
642 } 636 }
643 break; 637 break;
644 case Opt_norecovery: 638 case Opt_norecovery:
645 nilfs_set_opt(sbi, NORECOVERY); 639 nilfs_set_opt(nilfs, NORECOVERY);
646 break; 640 break;
647 case Opt_discard: 641 case Opt_discard:
648 nilfs_set_opt(sbi, DISCARD); 642 nilfs_set_opt(nilfs, DISCARD);
649 break; 643 break;
650 case Opt_nodiscard: 644 case Opt_nodiscard:
651 nilfs_clear_opt(sbi, DISCARD); 645 nilfs_clear_opt(nilfs, DISCARD);
652 break; 646 break;
653 default: 647 default:
654 printk(KERN_ERR 648 printk(KERN_ERR
@@ -660,22 +654,24 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
660} 654}
661 655
662static inline void 656static inline void
663nilfs_set_default_options(struct nilfs_sb_info *sbi, 657nilfs_set_default_options(struct super_block *sb,
664 struct nilfs_super_block *sbp) 658 struct nilfs_super_block *sbp)
665{ 659{
666 sbi->s_mount_opt = 660 struct the_nilfs *nilfs = sb->s_fs_info;
661
662 nilfs->ns_mount_opt =
667 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; 663 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
668} 664}
669 665
670static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount) 666static int nilfs_setup_super(struct super_block *sb, int is_mount)
671{ 667{
672 struct the_nilfs *nilfs = sbi->s_nilfs; 668 struct the_nilfs *nilfs = sb->s_fs_info;
673 struct nilfs_super_block **sbp; 669 struct nilfs_super_block **sbp;
674 int max_mnt_count; 670 int max_mnt_count;
675 int mnt_count; 671 int mnt_count;
676 672
677 /* nilfs->ns_sem must be locked by the caller. */ 673 /* nilfs->ns_sem must be locked by the caller. */
678 sbp = nilfs_prepare_super(sbi, 0); 674 sbp = nilfs_prepare_super(sb, 0);
679 if (!sbp) 675 if (!sbp)
680 return -EIO; 676 return -EIO;
681 677
@@ -706,7 +702,7 @@ skip_mount_setup:
706 /* synchronize sbp[1] with sbp[0] */ 702 /* synchronize sbp[1] with sbp[0] */
707 if (sbp[1]) 703 if (sbp[1])
708 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 704 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
709 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); 705 return nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL);
710} 706}
711 707
712struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, 708struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
@@ -727,7 +723,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
727 struct nilfs_super_block *sbp, 723 struct nilfs_super_block *sbp,
728 char *data) 724 char *data)
729{ 725{
730 struct nilfs_sb_info *sbi = NILFS_SB(sb); 726 struct the_nilfs *nilfs = sb->s_fs_info;
731 727
732 sb->s_magic = le16_to_cpu(sbp->s_magic); 728 sb->s_magic = le16_to_cpu(sbp->s_magic);
733 729
@@ -736,12 +732,12 @@ int nilfs_store_magic_and_option(struct super_block *sb,
736 sb->s_flags |= MS_NOATIME; 732 sb->s_flags |= MS_NOATIME;
737#endif 733#endif
738 734
739 nilfs_set_default_options(sbi, sbp); 735 nilfs_set_default_options(sb, sbp);
740 736
741 sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid); 737 nilfs->ns_resuid = le16_to_cpu(sbp->s_def_resuid);
742 sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid); 738 nilfs->ns_resgid = le16_to_cpu(sbp->s_def_resgid);
743 sbi->s_interval = le32_to_cpu(sbp->s_c_interval); 739 nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval);
744 sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max); 740 nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max);
745 741
746 return !parse_options(data, sb, 0) ? -EINVAL : 0 ; 742 return !parse_options(data, sb, 0) ? -EINVAL : 0 ;
747} 743}
@@ -822,7 +818,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
822static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, 818static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
823 struct dentry **root_dentry) 819 struct dentry **root_dentry)
824{ 820{
825 struct the_nilfs *nilfs = NILFS_SB(s)->s_nilfs; 821 struct the_nilfs *nilfs = s->s_fs_info;
826 struct nilfs_root *root; 822 struct nilfs_root *root;
827 int ret; 823 int ret;
828 824
@@ -840,7 +836,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
840 goto out; 836 goto out;
841 } 837 }
842 838
843 ret = nilfs_attach_checkpoint(NILFS_SB(s), cno, false, &root); 839 ret = nilfs_attach_checkpoint(s, cno, false, &root);
844 if (ret) { 840 if (ret) {
845 printk(KERN_ERR "NILFS: error loading snapshot " 841 printk(KERN_ERR "NILFS: error loading snapshot "
846 "(checkpoint number=%llu).\n", 842 "(checkpoint number=%llu).\n",
@@ -874,7 +870,7 @@ static int nilfs_try_to_shrink_tree(struct dentry *root_dentry)
874 870
875int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) 871int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
876{ 872{
877 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs; 873 struct the_nilfs *nilfs = sb->s_fs_info;
878 struct nilfs_root *root; 874 struct nilfs_root *root;
879 struct inode *inode; 875 struct inode *inode;
880 struct dentry *dentry; 876 struct dentry *dentry;
@@ -887,7 +883,7 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
887 return true; /* protect recent checkpoints */ 883 return true; /* protect recent checkpoints */
888 884
889 ret = false; 885 ret = false;
890 root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno); 886 root = nilfs_lookup_root(nilfs, cno);
891 if (root) { 887 if (root) {
892 inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO); 888 inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO);
893 if (inode) { 889 if (inode) {
@@ -917,43 +913,21 @@ static int
917nilfs_fill_super(struct super_block *sb, void *data, int silent) 913nilfs_fill_super(struct super_block *sb, void *data, int silent)
918{ 914{
919 struct the_nilfs *nilfs; 915 struct the_nilfs *nilfs;
920 struct nilfs_sb_info *sbi;
921 struct nilfs_root *fsroot; 916 struct nilfs_root *fsroot;
922 struct backing_dev_info *bdi; 917 struct backing_dev_info *bdi;
923 __u64 cno; 918 __u64 cno;
924 int err; 919 int err;
925 920
926 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 921 nilfs = alloc_nilfs(sb->s_bdev);
927 if (!sbi) 922 if (!nilfs)
928 return -ENOMEM; 923 return -ENOMEM;
929 924
930 sb->s_fs_info = sbi; 925 sb->s_fs_info = nilfs;
931 sbi->s_super = sb;
932
933 nilfs = alloc_nilfs(sb->s_bdev);
934 if (!nilfs) {
935 err = -ENOMEM;
936 goto failed_sbi;
937 }
938 sbi->s_nilfs = nilfs;
939 926
940 err = init_nilfs(nilfs, sbi, (char *)data); 927 err = init_nilfs(nilfs, sb, (char *)data);
941 if (err) 928 if (err)
942 goto failed_nilfs; 929 goto failed_nilfs;
943 930
944 spin_lock_init(&sbi->s_inode_lock);
945 INIT_LIST_HEAD(&sbi->s_dirty_files);
946
947 /*
948 * Following initialization is overlapped because
949 * nilfs_sb_info structure has been cleared at the beginning.
950 * But we reserve them to keep our interest and make ready
951 * for the future change.
952 */
953 get_random_bytes(&sbi->s_next_generation,
954 sizeof(sbi->s_next_generation));
955 spin_lock_init(&sbi->s_next_gen_lock);
956
957 sb->s_op = &nilfs_sops; 931 sb->s_op = &nilfs_sops;
958 sb->s_export_op = &nilfs_export_ops; 932 sb->s_export_op = &nilfs_export_ops;
959 sb->s_root = NULL; 933 sb->s_root = NULL;
@@ -962,12 +936,12 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
962 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; 936 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
963 sb->s_bdi = bdi ? : &default_backing_dev_info; 937 sb->s_bdi = bdi ? : &default_backing_dev_info;
964 938
965 err = load_nilfs(nilfs, sbi); 939 err = load_nilfs(nilfs, sb);
966 if (err) 940 if (err)
967 goto failed_nilfs; 941 goto failed_nilfs;
968 942
969 cno = nilfs_last_cno(nilfs); 943 cno = nilfs_last_cno(nilfs);
970 err = nilfs_attach_checkpoint(sbi, cno, true, &fsroot); 944 err = nilfs_attach_checkpoint(sb, cno, true, &fsroot);
971 if (err) { 945 if (err) {
972 printk(KERN_ERR "NILFS: error loading last checkpoint " 946 printk(KERN_ERR "NILFS: error loading last checkpoint "
973 "(checkpoint number=%llu).\n", (unsigned long long)cno); 947 "(checkpoint number=%llu).\n", (unsigned long long)cno);
@@ -975,7 +949,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
975 } 949 }
976 950
977 if (!(sb->s_flags & MS_RDONLY)) { 951 if (!(sb->s_flags & MS_RDONLY)) {
978 err = nilfs_attach_segment_constructor(sbi, fsroot); 952 err = nilfs_attach_log_writer(sb, fsroot);
979 if (err) 953 if (err)
980 goto failed_checkpoint; 954 goto failed_checkpoint;
981 } 955 }
@@ -988,14 +962,14 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
988 962
989 if (!(sb->s_flags & MS_RDONLY)) { 963 if (!(sb->s_flags & MS_RDONLY)) {
990 down_write(&nilfs->ns_sem); 964 down_write(&nilfs->ns_sem);
991 nilfs_setup_super(sbi, true); 965 nilfs_setup_super(sb, true);
992 up_write(&nilfs->ns_sem); 966 up_write(&nilfs->ns_sem);
993 } 967 }
994 968
995 return 0; 969 return 0;
996 970
997 failed_segctor: 971 failed_segctor:
998 nilfs_detach_segment_constructor(sbi); 972 nilfs_detach_log_writer(sb);
999 973
1000 failed_checkpoint: 974 failed_checkpoint:
1001 nilfs_put_root(fsroot); 975 nilfs_put_root(fsroot);
@@ -1007,23 +981,18 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
1007 981
1008 failed_nilfs: 982 failed_nilfs:
1009 destroy_nilfs(nilfs); 983 destroy_nilfs(nilfs);
1010
1011 failed_sbi:
1012 sb->s_fs_info = NULL;
1013 kfree(sbi);
1014 return err; 984 return err;
1015} 985}
1016 986
1017static int nilfs_remount(struct super_block *sb, int *flags, char *data) 987static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1018{ 988{
1019 struct nilfs_sb_info *sbi = NILFS_SB(sb); 989 struct the_nilfs *nilfs = sb->s_fs_info;
1020 struct the_nilfs *nilfs = sbi->s_nilfs;
1021 unsigned long old_sb_flags; 990 unsigned long old_sb_flags;
1022 unsigned long old_mount_opt; 991 unsigned long old_mount_opt;
1023 int err; 992 int err;
1024 993
1025 old_sb_flags = sb->s_flags; 994 old_sb_flags = sb->s_flags;
1026 old_mount_opt = sbi->s_mount_opt; 995 old_mount_opt = nilfs->ns_mount_opt;
1027 996
1028 if (!parse_options(data, sb, 1)) { 997 if (!parse_options(data, sb, 1)) {
1029 err = -EINVAL; 998 err = -EINVAL;
@@ -1043,8 +1012,8 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1043 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 1012 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
1044 goto out; 1013 goto out;
1045 if (*flags & MS_RDONLY) { 1014 if (*flags & MS_RDONLY) {
1046 /* Shutting down the segment constructor */ 1015 /* Shutting down log writer */
1047 nilfs_detach_segment_constructor(sbi); 1016 nilfs_detach_log_writer(sb);
1048 sb->s_flags |= MS_RDONLY; 1017 sb->s_flags |= MS_RDONLY;
1049 1018
1050 /* 1019 /*
@@ -1052,7 +1021,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1052 * the RDONLY flag and then mark the partition as valid again. 1021 * the RDONLY flag and then mark the partition as valid again.
1053 */ 1022 */
1054 down_write(&nilfs->ns_sem); 1023 down_write(&nilfs->ns_sem);
1055 nilfs_cleanup_super(sbi); 1024 nilfs_cleanup_super(sb);
1056 up_write(&nilfs->ns_sem); 1025 up_write(&nilfs->ns_sem);
1057 } else { 1026 } else {
1058 __u64 features; 1027 __u64 features;
@@ -1079,12 +1048,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1079 sb->s_flags &= ~MS_RDONLY; 1048 sb->s_flags &= ~MS_RDONLY;
1080 1049
1081 root = NILFS_I(sb->s_root->d_inode)->i_root; 1050 root = NILFS_I(sb->s_root->d_inode)->i_root;
1082 err = nilfs_attach_segment_constructor(sbi, root); 1051 err = nilfs_attach_log_writer(sb, root);
1083 if (err) 1052 if (err)
1084 goto restore_opts; 1053 goto restore_opts;
1085 1054
1086 down_write(&nilfs->ns_sem); 1055 down_write(&nilfs->ns_sem);
1087 nilfs_setup_super(sbi, true); 1056 nilfs_setup_super(sb, true);
1088 up_write(&nilfs->ns_sem); 1057 up_write(&nilfs->ns_sem);
1089 } 1058 }
1090 out: 1059 out:
@@ -1092,13 +1061,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1092 1061
1093 restore_opts: 1062 restore_opts:
1094 sb->s_flags = old_sb_flags; 1063 sb->s_flags = old_sb_flags;
1095 sbi->s_mount_opt = old_mount_opt; 1064 nilfs->ns_mount_opt = old_mount_opt;
1096 return err; 1065 return err;
1097} 1066}
1098 1067
1099struct nilfs_super_data { 1068struct nilfs_super_data {
1100 struct block_device *bdev; 1069 struct block_device *bdev;
1101 struct nilfs_sb_info *sbi;
1102 __u64 cno; 1070 __u64 cno;
1103 int flags; 1071 int flags;
1104}; 1072};
@@ -1279,7 +1247,7 @@ static void nilfs_inode_init_once(void *obj)
1279#ifdef CONFIG_NILFS_XATTR 1247#ifdef CONFIG_NILFS_XATTR
1280 init_rwsem(&ii->xattr_sem); 1248 init_rwsem(&ii->xattr_sem);
1281#endif 1249#endif
1282 nilfs_btnode_cache_init_once(&ii->i_btnode_cache); 1250 address_space_init_once(&ii->i_btnode_cache);
1283 ii->i_bmap = &ii->i_bmap_data; 1251 ii->i_bmap = &ii->i_bmap_data;
1284 inode_init_once(&ii->vfs_inode); 1252 inode_init_once(&ii->vfs_inode);
1285} 1253}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ad4ac607cf57..d2acd1a651f3 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -25,6 +25,7 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/random.h>
28#include <linux/crc32.h> 29#include <linux/crc32.h>
29#include "nilfs.h" 30#include "nilfs.h"
30#include "segment.h" 31#include "segment.h"
@@ -75,7 +76,10 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
75 nilfs->ns_bdev = bdev; 76 nilfs->ns_bdev = bdev;
76 atomic_set(&nilfs->ns_ndirtyblks, 0); 77 atomic_set(&nilfs->ns_ndirtyblks, 0);
77 init_rwsem(&nilfs->ns_sem); 78 init_rwsem(&nilfs->ns_sem);
79 INIT_LIST_HEAD(&nilfs->ns_dirty_files);
78 INIT_LIST_HEAD(&nilfs->ns_gc_inodes); 80 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
81 spin_lock_init(&nilfs->ns_inode_lock);
82 spin_lock_init(&nilfs->ns_next_gen_lock);
79 spin_lock_init(&nilfs->ns_last_segment_lock); 83 spin_lock_init(&nilfs->ns_last_segment_lock);
80 nilfs->ns_cptree = RB_ROOT; 84 nilfs->ns_cptree = RB_ROOT;
81 spin_lock_init(&nilfs->ns_cptree_lock); 85 spin_lock_init(&nilfs->ns_cptree_lock);
@@ -197,16 +201,16 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
197/** 201/**
198 * load_nilfs - load and recover the nilfs 202 * load_nilfs - load and recover the nilfs
199 * @nilfs: the_nilfs structure to be released 203 * @nilfs: the_nilfs structure to be released
200 * @sbi: nilfs_sb_info used to recover past segment 204 * @sb: super block isntance used to recover past segment
201 * 205 *
202 * load_nilfs() searches and load the latest super root, 206 * load_nilfs() searches and load the latest super root,
203 * attaches the last segment, and does recovery if needed. 207 * attaches the last segment, and does recovery if needed.
204 * The caller must call this exclusively for simultaneous mounts. 208 * The caller must call this exclusively for simultaneous mounts.
205 */ 209 */
206int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) 210int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
207{ 211{
208 struct nilfs_recovery_info ri; 212 struct nilfs_recovery_info ri;
209 unsigned int s_flags = sbi->s_super->s_flags; 213 unsigned int s_flags = sb->s_flags;
210 int really_read_only = bdev_read_only(nilfs->ns_bdev); 214 int really_read_only = bdev_read_only(nilfs->ns_bdev);
211 int valid_fs = nilfs_valid_fs(nilfs); 215 int valid_fs = nilfs_valid_fs(nilfs);
212 int err; 216 int err;
@@ -271,7 +275,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
271 goto scan_error; 275 goto scan_error;
272 } 276 }
273 277
274 err = nilfs_load_super_root(nilfs, sbi->s_super, ri.ri_super_root); 278 err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root);
275 if (unlikely(err)) { 279 if (unlikely(err)) {
276 printk(KERN_ERR "NILFS: error loading super root.\n"); 280 printk(KERN_ERR "NILFS: error loading super root.\n");
277 goto failed; 281 goto failed;
@@ -283,7 +287,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
283 if (s_flags & MS_RDONLY) { 287 if (s_flags & MS_RDONLY) {
284 __u64 features; 288 __u64 features;
285 289
286 if (nilfs_test_opt(sbi, NORECOVERY)) { 290 if (nilfs_test_opt(nilfs, NORECOVERY)) {
287 printk(KERN_INFO "NILFS: norecovery option specified. " 291 printk(KERN_INFO "NILFS: norecovery option specified. "
288 "skipping roll-forward recovery\n"); 292 "skipping roll-forward recovery\n");
289 goto skip_recovery; 293 goto skip_recovery;
@@ -304,21 +308,21 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
304 err = -EROFS; 308 err = -EROFS;
305 goto failed_unload; 309 goto failed_unload;
306 } 310 }
307 sbi->s_super->s_flags &= ~MS_RDONLY; 311 sb->s_flags &= ~MS_RDONLY;
308 } else if (nilfs_test_opt(sbi, NORECOVERY)) { 312 } else if (nilfs_test_opt(nilfs, NORECOVERY)) {
309 printk(KERN_ERR "NILFS: recovery cancelled because norecovery " 313 printk(KERN_ERR "NILFS: recovery cancelled because norecovery "
310 "option was specified for a read/write mount\n"); 314 "option was specified for a read/write mount\n");
311 err = -EINVAL; 315 err = -EINVAL;
312 goto failed_unload; 316 goto failed_unload;
313 } 317 }
314 318
315 err = nilfs_salvage_orphan_logs(nilfs, sbi, &ri); 319 err = nilfs_salvage_orphan_logs(nilfs, sb, &ri);
316 if (err) 320 if (err)
317 goto failed_unload; 321 goto failed_unload;
318 322
319 down_write(&nilfs->ns_sem); 323 down_write(&nilfs->ns_sem);
320 nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */ 324 nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */
321 err = nilfs_cleanup_super(sbi); 325 err = nilfs_cleanup_super(sb);
322 up_write(&nilfs->ns_sem); 326 up_write(&nilfs->ns_sem);
323 327
324 if (err) { 328 if (err) {
@@ -330,7 +334,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
330 334
331 skip_recovery: 335 skip_recovery:
332 nilfs_clear_recovery_info(&ri); 336 nilfs_clear_recovery_info(&ri);
333 sbi->s_super->s_flags = s_flags; 337 sb->s_flags = s_flags;
334 return 0; 338 return 0;
335 339
336 scan_error: 340 scan_error:
@@ -344,7 +348,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
344 348
345 failed: 349 failed:
346 nilfs_clear_recovery_info(&ri); 350 nilfs_clear_recovery_info(&ri);
347 sbi->s_super->s_flags = s_flags; 351 sb->s_flags = s_flags;
348 return err; 352 return err;
349} 353}
350 354
@@ -475,10 +479,13 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
475 return -EIO; 479 return -EIO;
476 } 480 }
477 printk(KERN_WARNING 481 printk(KERN_WARNING
478 "NILFS warning: unable to read primary superblock\n"); 482 "NILFS warning: unable to read primary superblock "
479 } else if (!sbp[1]) 483 "(blocksize = %d)\n", blocksize);
484 } else if (!sbp[1]) {
480 printk(KERN_WARNING 485 printk(KERN_WARNING
481 "NILFS warning: unable to read secondary superblock\n"); 486 "NILFS warning: unable to read secondary superblock "
487 "(blocksize = %d)\n", blocksize);
488 }
482 489
483 /* 490 /*
484 * Compare two super blocks and set 1 in swp if the secondary 491 * Compare two super blocks and set 1 in swp if the secondary
@@ -505,7 +512,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
505 512
506 if (!valid[!swp]) 513 if (!valid[!swp])
507 printk(KERN_WARNING "NILFS warning: broken superblock. " 514 printk(KERN_WARNING "NILFS warning: broken superblock. "
508 "using spare superblock.\n"); 515 "using spare superblock (blocksize = %d).\n", blocksize);
509 if (swp) 516 if (swp)
510 nilfs_swap_super_block(nilfs); 517 nilfs_swap_super_block(nilfs);
511 518
@@ -519,7 +526,6 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
519/** 526/**
520 * init_nilfs - initialize a NILFS instance. 527 * init_nilfs - initialize a NILFS instance.
521 * @nilfs: the_nilfs structure 528 * @nilfs: the_nilfs structure
522 * @sbi: nilfs_sb_info
523 * @sb: super block 529 * @sb: super block
524 * @data: mount options 530 * @data: mount options
525 * 531 *
@@ -530,9 +536,8 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
530 * Return Value: On success, 0 is returned. On error, a negative error 536 * Return Value: On success, 0 is returned. On error, a negative error
531 * code is returned. 537 * code is returned.
532 */ 538 */
533int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) 539int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
534{ 540{
535 struct super_block *sb = sbi->s_super;
536 struct nilfs_super_block *sbp; 541 struct nilfs_super_block *sbp;
537 int blocksize; 542 int blocksize;
538 int err; 543 int err;
@@ -588,6 +593,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
588 nilfs->ns_blocksize_bits = sb->s_blocksize_bits; 593 nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
589 nilfs->ns_blocksize = blocksize; 594 nilfs->ns_blocksize = blocksize;
590 595
596 get_random_bytes(&nilfs->ns_next_generation,
597 sizeof(nilfs->ns_next_generation));
598
591 err = nilfs_store_disk_layout(nilfs, sbp); 599 err = nilfs_store_disk_layout(nilfs, sbp);
592 if (err) 600 if (err)
593 goto failed_sbh; 601 goto failed_sbh;
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index fd85e4c05c6b..f4968145c2a3 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -31,7 +31,8 @@
31#include <linux/blkdev.h> 31#include <linux/blkdev.h>
32#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include "sb.h" 34
35struct nilfs_sc_info;
35 36
36/* the_nilfs struct */ 37/* the_nilfs struct */
37enum { 38enum {
@@ -65,13 +66,23 @@ enum {
65 * @ns_last_cno: checkpoint number of the latest segment 66 * @ns_last_cno: checkpoint number of the latest segment
66 * @ns_prot_seq: least sequence number of segments which must not be reclaimed 67 * @ns_prot_seq: least sequence number of segments which must not be reclaimed
67 * @ns_prev_seq: base sequence number used to decide if advance log cursor 68 * @ns_prev_seq: base sequence number used to decide if advance log cursor
68 * @ns_segctor_sem: segment constructor semaphore 69 * @ns_writer: log writer
70 * @ns_segctor_sem: semaphore protecting log write
69 * @ns_dat: DAT file inode 71 * @ns_dat: DAT file inode
70 * @ns_cpfile: checkpoint file inode 72 * @ns_cpfile: checkpoint file inode
71 * @ns_sufile: segusage file inode 73 * @ns_sufile: segusage file inode
72 * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root) 74 * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root)
73 * @ns_cptree_lock: lock protecting @ns_cptree 75 * @ns_cptree_lock: lock protecting @ns_cptree
76 * @ns_dirty_files: list of dirty files
77 * @ns_inode_lock: lock protecting @ns_dirty_files
74 * @ns_gc_inodes: dummy inodes to keep live blocks 78 * @ns_gc_inodes: dummy inodes to keep live blocks
79 * @ns_next_generation: next generation number for inodes
80 * @ns_next_gen_lock: lock protecting @ns_next_generation
81 * @ns_mount_opt: mount options
82 * @ns_resuid: uid for reserved blocks
83 * @ns_resgid: gid for reserved blocks
84 * @ns_interval: checkpoint creation interval
85 * @ns_watermark: watermark for the number of dirty buffers
75 * @ns_blocksize_bits: bit length of block size 86 * @ns_blocksize_bits: bit length of block size
76 * @ns_blocksize: block size 87 * @ns_blocksize: block size
77 * @ns_nsegments: number of segments in filesystem 88 * @ns_nsegments: number of segments in filesystem
@@ -131,6 +142,7 @@ struct the_nilfs {
131 u64 ns_prot_seq; 142 u64 ns_prot_seq;
132 u64 ns_prev_seq; 143 u64 ns_prev_seq;
133 144
145 struct nilfs_sc_info *ns_writer;
134 struct rw_semaphore ns_segctor_sem; 146 struct rw_semaphore ns_segctor_sem;
135 147
136 /* 148 /*
@@ -145,9 +157,25 @@ struct the_nilfs {
145 struct rb_root ns_cptree; 157 struct rb_root ns_cptree;
146 spinlock_t ns_cptree_lock; 158 spinlock_t ns_cptree_lock;
147 159
160 /* Dirty inode list */
161 struct list_head ns_dirty_files;
162 spinlock_t ns_inode_lock;
163
148 /* GC inode list */ 164 /* GC inode list */
149 struct list_head ns_gc_inodes; 165 struct list_head ns_gc_inodes;
150 166
167 /* Inode allocator */
168 u32 ns_next_generation;
169 spinlock_t ns_next_gen_lock;
170
171 /* Mount options */
172 unsigned long ns_mount_opt;
173
174 uid_t ns_resuid;
175 gid_t ns_resgid;
176 unsigned long ns_interval;
177 unsigned long ns_watermark;
178
151 /* Disk layout information (static) */ 179 /* Disk layout information (static) */
152 unsigned int ns_blocksize_bits; 180 unsigned int ns_blocksize_bits;
153 unsigned int ns_blocksize; 181 unsigned int ns_blocksize;
@@ -180,6 +208,20 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
180THE_NILFS_FNS(GC_RUNNING, gc_running) 208THE_NILFS_FNS(GC_RUNNING, gc_running)
181THE_NILFS_FNS(SB_DIRTY, sb_dirty) 209THE_NILFS_FNS(SB_DIRTY, sb_dirty)
182 210
211/*
212 * Mount option operations
213 */
214#define nilfs_clear_opt(nilfs, opt) \
215 do { (nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
216#define nilfs_set_opt(nilfs, opt) \
217 do { (nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt; } while (0)
218#define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt)
219#define nilfs_write_opt(nilfs, mask, opt) \
220 do { (nilfs)->ns_mount_opt = \
221 (((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) | \
222 NILFS_MOUNT_##opt); \
223 } while (0)
224
183/** 225/**
184 * struct nilfs_root - nilfs root object 226 * struct nilfs_root - nilfs root object
185 * @cno: checkpoint number 227 * @cno: checkpoint number
@@ -224,15 +266,14 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
224void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 266void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
225struct the_nilfs *alloc_nilfs(struct block_device *bdev); 267struct the_nilfs *alloc_nilfs(struct block_device *bdev);
226void destroy_nilfs(struct the_nilfs *nilfs); 268void destroy_nilfs(struct the_nilfs *nilfs);
227int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 269int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data);
228int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 270int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb);
229int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); 271int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
230int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 272int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
231struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno); 273struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno);
232struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs, 274struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs,
233 __u64 cno); 275 __u64 cno);
234void nilfs_put_root(struct nilfs_root *root); 276void nilfs_put_root(struct nilfs_root *root);
235struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
236int nilfs_near_disk_full(struct the_nilfs *); 277int nilfs_near_disk_full(struct the_nilfs *);
237void nilfs_fall_back_super_block(struct the_nilfs *); 278void nilfs_fall_back_super_block(struct the_nilfs *);
238void nilfs_swap_super_block(struct the_nilfs *); 279void nilfs_swap_super_block(struct the_nilfs *);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 8b61220cffc5..9fde1c00a296 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -164,7 +164,7 @@ static int process_access_response(struct fsnotify_group *group,
164 fd, response); 164 fd, response);
165 /* 165 /*
166 * make sure the response is valid, if invalid we do nothing and either 166 * make sure the response is valid, if invalid we do nothing and either
167 * userspace can send a valid responce or we will clean it up after the 167 * userspace can send a valid response or we will clean it up after the
168 * timeout 168 * timeout
169 */ 169 */
170 switch (response) { 170 switch (response) {
@@ -876,7 +876,7 @@ SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark);
876#endif 876#endif
877 877
878/* 878/*
879 * fanotify_user_setup - Our initialization function. Note that we cannnot return 879 * fanotify_user_setup - Our initialization function. Note that we cannot return
880 * error because we have compiled-in VFS hooks. So an (unlikely) failure here 880 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
881 * must result in panic(). 881 * must result in panic().
882 */ 882 */
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 4c29fcf557d1..07ea8d3e6ea2 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -22,13 +22,14 @@
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/spinlock.h> 24#include <linux/spinlock.h>
25#include <linux/writeback.h> /* for inode_lock */
26 25
27#include <asm/atomic.h> 26#include <asm/atomic.h>
28 27
29#include <linux/fsnotify_backend.h> 28#include <linux/fsnotify_backend.h>
30#include "fsnotify.h" 29#include "fsnotify.h"
31 30
31#include "../internal.h"
32
32/* 33/*
33 * Recalculate the mask of events relevant to a given inode locked. 34 * Recalculate the mask of events relevant to a given inode locked.
34 */ 35 */
@@ -237,15 +238,14 @@ out:
237 * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. 238 * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
238 * @list: list of inodes being unmounted (sb->s_inodes) 239 * @list: list of inodes being unmounted (sb->s_inodes)
239 * 240 *
240 * Called with inode_lock held, protecting the unmounting super block's list 241 * Called during unmount with no locks held, so needs to be safe against
241 * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. 242 * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block.
242 * We temporarily drop inode_lock, however, and CAN block.
243 */ 243 */
244void fsnotify_unmount_inodes(struct list_head *list) 244void fsnotify_unmount_inodes(struct list_head *list)
245{ 245{
246 struct inode *inode, *next_i, *need_iput = NULL; 246 struct inode *inode, *next_i, *need_iput = NULL;
247 247
248 spin_lock(&inode_lock); 248 spin_lock(&inode_sb_list_lock);
249 list_for_each_entry_safe(inode, next_i, list, i_sb_list) { 249 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
250 struct inode *need_iput_tmp; 250 struct inode *need_iput_tmp;
251 251
@@ -254,8 +254,11 @@ void fsnotify_unmount_inodes(struct list_head *list)
254 * I_WILL_FREE, or I_NEW which is fine because by that point 254 * I_WILL_FREE, or I_NEW which is fine because by that point
255 * the inode cannot have any associated watches. 255 * the inode cannot have any associated watches.
256 */ 256 */
257 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 257 spin_lock(&inode->i_lock);
258 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
259 spin_unlock(&inode->i_lock);
258 continue; 260 continue;
261 }
259 262
260 /* 263 /*
261 * If i_count is zero, the inode cannot have any watches and 264 * If i_count is zero, the inode cannot have any watches and
@@ -263,8 +266,10 @@ void fsnotify_unmount_inodes(struct list_head *list)
263 * evict all inodes with zero i_count from icache which is 266 * evict all inodes with zero i_count from icache which is
264 * unnecessarily violent and may in fact be illegal to do. 267 * unnecessarily violent and may in fact be illegal to do.
265 */ 268 */
266 if (!atomic_read(&inode->i_count)) 269 if (!atomic_read(&inode->i_count)) {
270 spin_unlock(&inode->i_lock);
267 continue; 271 continue;
272 }
268 273
269 need_iput_tmp = need_iput; 274 need_iput_tmp = need_iput;
270 need_iput = NULL; 275 need_iput = NULL;
@@ -274,22 +279,25 @@ void fsnotify_unmount_inodes(struct list_head *list)
274 __iget(inode); 279 __iget(inode);
275 else 280 else
276 need_iput_tmp = NULL; 281 need_iput_tmp = NULL;
282 spin_unlock(&inode->i_lock);
277 283
278 /* In case the dropping of a reference would nuke next_i. */ 284 /* In case the dropping of a reference would nuke next_i. */
279 if ((&next_i->i_sb_list != list) && 285 if ((&next_i->i_sb_list != list) &&
280 atomic_read(&next_i->i_count) && 286 atomic_read(&next_i->i_count)) {
281 !(next_i->i_state & (I_FREEING | I_WILL_FREE))) { 287 spin_lock(&next_i->i_lock);
282 __iget(next_i); 288 if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
283 need_iput = next_i; 289 __iget(next_i);
290 need_iput = next_i;
291 }
292 spin_unlock(&next_i->i_lock);
284 } 293 }
285 294
286 /* 295 /*
287 * We can safely drop inode_lock here because we hold 296 * We can safely drop inode_sb_list_lock here because we hold
288 * references on both inode and next_i. Also no new inodes 297 * references on both inode and next_i. Also no new inodes
289 * will be added since the umount has begun. Finally, 298 * will be added since the umount has begun.
290 * iprune_mutex keeps shrink_icache_memory() away.
291 */ 299 */
292 spin_unlock(&inode_lock); 300 spin_unlock(&inode_sb_list_lock);
293 301
294 if (need_iput_tmp) 302 if (need_iput_tmp)
295 iput(need_iput_tmp); 303 iput(need_iput_tmp);
@@ -301,7 +309,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
301 309
302 iput(inode); 310 iput(inode);
303 311
304 spin_lock(&inode_lock); 312 spin_lock(&inode_sb_list_lock);
305 } 313 }
306 spin_unlock(&inode_lock); 314 spin_unlock(&inode_sb_list_lock);
307} 315}
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index a91b69a6a291..e3cbd746f64a 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -194,10 +194,11 @@ static int idr_callback(int id, void *p, void *data)
194 194
195static void inotify_free_group_priv(struct fsnotify_group *group) 195static void inotify_free_group_priv(struct fsnotify_group *group)
196{ 196{
197 /* ideally the idr is empty and we won't hit the BUG in teh callback */ 197 /* ideally the idr is empty and we won't hit the BUG in the callback */
198 idr_for_each(&group->inotify_data.idr, idr_callback, group); 198 idr_for_each(&group->inotify_data.idr, idr_callback, group);
199 idr_remove_all(&group->inotify_data.idr); 199 idr_remove_all(&group->inotify_data.idr);
200 idr_destroy(&group->inotify_data.idr); 200 idr_destroy(&group->inotify_data.idr);
201 atomic_dec(&group->inotify_data.user->inotify_devs);
201 free_uid(group->inotify_data.user); 202 free_uid(group->inotify_data.user);
202} 203}
203 204
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 4cd5d5d78f9f..8445fbc8985c 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -290,7 +290,6 @@ static int inotify_fasync(int fd, struct file *file, int on)
290static int inotify_release(struct inode *ignored, struct file *file) 290static int inotify_release(struct inode *ignored, struct file *file)
291{ 291{
292 struct fsnotify_group *group = file->private_data; 292 struct fsnotify_group *group = file->private_data;
293 struct user_struct *user = group->inotify_data.user;
294 293
295 pr_debug("%s: group=%p\n", __func__, group); 294 pr_debug("%s: group=%p\n", __func__, group);
296 295
@@ -299,8 +298,6 @@ static int inotify_release(struct inode *ignored, struct file *file)
299 /* free this group, matching get was inotify_init->fsnotify_obtain_group */ 298 /* free this group, matching get was inotify_init->fsnotify_obtain_group */
300 fsnotify_put_group(group); 299 fsnotify_put_group(group);
301 300
302 atomic_dec(&user->inotify_devs);
303
304 return 0; 301 return 0;
305} 302}
306 303
@@ -697,7 +694,7 @@ retry:
697 return ret; 694 return ret;
698} 695}
699 696
700static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events) 697static struct fsnotify_group *inotify_new_group(unsigned int max_events)
701{ 698{
702 struct fsnotify_group *group; 699 struct fsnotify_group *group;
703 700
@@ -710,8 +707,14 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign
710 spin_lock_init(&group->inotify_data.idr_lock); 707 spin_lock_init(&group->inotify_data.idr_lock);
711 idr_init(&group->inotify_data.idr); 708 idr_init(&group->inotify_data.idr);
712 group->inotify_data.last_wd = 0; 709 group->inotify_data.last_wd = 0;
713 group->inotify_data.user = user;
714 group->inotify_data.fa = NULL; 710 group->inotify_data.fa = NULL;
711 group->inotify_data.user = get_current_user();
712
713 if (atomic_inc_return(&group->inotify_data.user->inotify_devs) >
714 inotify_max_user_instances) {
715 fsnotify_put_group(group);
716 return ERR_PTR(-EMFILE);
717 }
715 718
716 return group; 719 return group;
717} 720}
@@ -721,7 +724,6 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign
721SYSCALL_DEFINE1(inotify_init1, int, flags) 724SYSCALL_DEFINE1(inotify_init1, int, flags)
722{ 725{
723 struct fsnotify_group *group; 726 struct fsnotify_group *group;
724 struct user_struct *user;
725 int ret; 727 int ret;
726 728
727 /* Check the IN_* constants for consistency. */ 729 /* Check the IN_* constants for consistency. */
@@ -731,31 +733,16 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
731 if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) 733 if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
732 return -EINVAL; 734 return -EINVAL;
733 735
734 user = get_current_user();
735 if (unlikely(atomic_read(&user->inotify_devs) >=
736 inotify_max_user_instances)) {
737 ret = -EMFILE;
738 goto out_free_uid;
739 }
740
741 /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ 736 /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */
742 group = inotify_new_group(user, inotify_max_queued_events); 737 group = inotify_new_group(inotify_max_queued_events);
743 if (IS_ERR(group)) { 738 if (IS_ERR(group))
744 ret = PTR_ERR(group); 739 return PTR_ERR(group);
745 goto out_free_uid;
746 }
747
748 atomic_inc(&user->inotify_devs);
749 740
750 ret = anon_inode_getfd("inotify", &inotify_fops, group, 741 ret = anon_inode_getfd("inotify", &inotify_fops, group,
751 O_RDONLY | flags); 742 O_RDONLY | flags);
752 if (ret >= 0) 743 if (ret < 0)
753 return ret; 744 fsnotify_put_group(group);
754 745
755 fsnotify_put_group(group);
756 atomic_dec(&user->inotify_devs);
757out_free_uid:
758 free_uid(user);
759 return ret; 746 return ret;
760} 747}
761 748
@@ -841,7 +828,7 @@ out:
841} 828}
842 829
843/* 830/*
844 * inotify_user_setup - Our initialization function. Note that we cannnot return 831 * inotify_user_setup - Our initialization function. Note that we cannot return
845 * error because we have compiled-in VFS hooks. So an (unlikely) failure here 832 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
846 * must result in panic(). 833 * must result in panic().
847 */ 834 */
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 325185e514bb..252ab1f6452b 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -24,7 +24,7 @@
24 * referencing this object. The object typically will live inside the kernel 24 * referencing this object. The object typically will live inside the kernel
25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task 25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
26 * which can find this object holding the appropriete locks, can take a reference 26 * which can find this object holding the appropriete locks, can take a reference
27 * and the object itself is guarenteed to survive until the reference is dropped. 27 * and the object itself is guaranteed to survive until the reference is dropped.
28 * 28 *
29 * LOCKING: 29 * LOCKING:
30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST 30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST
@@ -91,7 +91,6 @@
91#include <linux/slab.h> 91#include <linux/slab.h>
92#include <linux/spinlock.h> 92#include <linux/spinlock.h>
93#include <linux/srcu.h> 93#include <linux/srcu.h>
94#include <linux/writeback.h> /* for inode_lock */
95 94
96#include <asm/atomic.h> 95#include <asm/atomic.h>
97 96
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 85eebff6d0d7..e86577d6c5c3 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -23,7 +23,6 @@
23#include <linux/mount.h> 23#include <linux/mount.h>
24#include <linux/mutex.h> 24#include <linux/mutex.h>
25#include <linux/spinlock.h> 25#include <linux/spinlock.h>
26#include <linux/writeback.h> /* for inode_lock */
27 26
28#include <asm/atomic.h> 27#include <asm/atomic.h>
29 28
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 4ff028fcfd6e..30206b238433 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -2,18 +2,13 @@
2 2
3obj-$(CONFIG_NTFS_FS) += ntfs.o 3obj-$(CONFIG_NTFS_FS) += ntfs.o
4 4
5ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ 5ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ 6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
7 unistr.o upcase.o 7 unistr.o upcase.o
8 8
9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.30\" 9ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
10 10
11ifeq ($(CONFIG_NTFS_DEBUG),y) 11ccflags-y := -DNTFS_VERSION=\"2.1.30\"
12EXTRA_CFLAGS += -DDEBUG 12ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG
13endif 13ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW
14 14
15ifeq ($(CONFIG_NTFS_RW),y)
16EXTRA_CFLAGS += -DNTFS_RW
17
18ntfs-objs += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
19endif
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index c3c2c7ac9020..0b1e885b8cf8 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1543,8 +1543,6 @@ err_out:
1543 */ 1543 */
1544const struct address_space_operations ntfs_aops = { 1544const struct address_space_operations ntfs_aops = {
1545 .readpage = ntfs_readpage, /* Fill page with data. */ 1545 .readpage = ntfs_readpage, /* Fill page with data. */
1546 .sync_page = block_sync_page, /* Currently, just unplugs the
1547 disk request queue. */
1548#ifdef NTFS_RW 1546#ifdef NTFS_RW
1549 .writepage = ntfs_writepage, /* Write dirty page to disk. */ 1547 .writepage = ntfs_writepage, /* Write dirty page to disk. */
1550#endif /* NTFS_RW */ 1548#endif /* NTFS_RW */
@@ -1560,8 +1558,6 @@ const struct address_space_operations ntfs_aops = {
1560 */ 1558 */
1561const struct address_space_operations ntfs_mst_aops = { 1559const struct address_space_operations ntfs_mst_aops = {
1562 .readpage = ntfs_readpage, /* Fill page with data. */ 1560 .readpage = ntfs_readpage, /* Fill page with data. */
1563 .sync_page = block_sync_page, /* Currently, just unplugs the
1564 disk request queue. */
1565#ifdef NTFS_RW 1561#ifdef NTFS_RW
1566 .writepage = ntfs_writepage, /* Write dirty page to disk. */ 1562 .writepage = ntfs_writepage, /* Write dirty page to disk. */
1567 .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty 1563 .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index f5094ee224c1..f14fde2b03d6 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -197,7 +197,7 @@ err_out:
197 } else if (ctx_needs_reset) { 197 } else if (ctx_needs_reset) {
198 /* 198 /*
199 * If there is no attribute list, restoring the search context 199 * If there is no attribute list, restoring the search context
200 * is acomplished simply by copying the saved context back over 200 * is accomplished simply by copying the saved context back over
201 * the caller supplied context. If there is an attribute list, 201 * the caller supplied context. If there is an attribute list,
202 * things are more complicated as we need to deal with mapping 202 * things are more complicated as we need to deal with mapping
203 * of mft records and resulting potential changes in pointers. 203 * of mft records and resulting potential changes in pointers.
@@ -1181,7 +1181,7 @@ not_found:
1181 * for, i.e. if one wants to add the attribute to the mft record this is the 1181 * for, i.e. if one wants to add the attribute to the mft record this is the
1182 * correct place to insert its attribute list entry into. 1182 * correct place to insert its attribute list entry into.
1183 * 1183 *
1184 * When -errno != -ENOENT, an error occured during the lookup. @ctx->attr is 1184 * When -errno != -ENOENT, an error occurred during the lookup. @ctx->attr is
1185 * then undefined and in particular you should not rely on it not changing. 1185 * then undefined and in particular you should not rely on it not changing.
1186 */ 1186 */
1187int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, 1187int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 6551c7cbad92..ee4144ce5d7c 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -501,7 +501,7 @@ int ntfs_read_compressed_block(struct page *page)
501 VCN start_vcn = (((s64)index << PAGE_CACHE_SHIFT) & ~cb_size_mask) >> 501 VCN start_vcn = (((s64)index << PAGE_CACHE_SHIFT) & ~cb_size_mask) >>
502 vol->cluster_size_bits; 502 vol->cluster_size_bits;
503 /* 503 /*
504 * The first vcn after the last wanted vcn (minumum alignment is again 504 * The first vcn after the last wanted vcn (minimum alignment is again
505 * PAGE_CACHE_SIZE. 505 * PAGE_CACHE_SIZE.
506 */ 506 */
507 VCN end_vcn = ((((s64)(index + 1UL) << PAGE_CACHE_SHIFT) + cb_size - 1) 507 VCN end_vcn = ((((s64)(index + 1UL) << PAGE_CACHE_SHIFT) + cb_size - 1)
@@ -698,8 +698,7 @@ lock_retry_remap:
698 "uptodate! Unplugging the disk queue " 698 "uptodate! Unplugging the disk queue "
699 "and rescheduling."); 699 "and rescheduling.");
700 get_bh(tbh); 700 get_bh(tbh);
701 blk_run_address_space(mapping); 701 io_schedule();
702 schedule();
703 put_bh(tbh); 702 put_bh(tbh);
704 if (unlikely(!buffer_uptodate(tbh))) 703 if (unlikely(!buffer_uptodate(tbh)))
705 goto read_err; 704 goto read_err;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index a627ed82c0a3..c05d6dcf77a4 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -54,7 +54,7 @@
54 * 54 *
55 * Return 1 if the attributes match and 0 if not. 55 * Return 1 if the attributes match and 0 if not.
56 * 56 *
57 * NOTE: This function runs with the inode_lock spin lock held so it is not 57 * NOTE: This function runs with the inode->i_lock spin lock held so it is not
58 * allowed to sleep. 58 * allowed to sleep.
59 */ 59 */
60int ntfs_test_inode(struct inode *vi, ntfs_attr *na) 60int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
@@ -98,7 +98,7 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
98 * 98 *
99 * Return 0 on success and -errno on error. 99 * Return 0 on success and -errno on error.
100 * 100 *
101 * NOTE: This function runs with the inode_lock spin lock held so it is not 101 * NOTE: This function runs with the inode->i_lock spin lock held so it is not
102 * allowed to sleep. (Hence the GFP_ATOMIC allocation.) 102 * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
103 */ 103 */
104static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na) 104static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
@@ -622,7 +622,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
622 */ 622 */
623 /* Everyone gets all permissions. */ 623 /* Everyone gets all permissions. */
624 vi->i_mode |= S_IRWXUGO; 624 vi->i_mode |= S_IRWXUGO;
625 /* If read-only, noone gets write permissions. */ 625 /* If read-only, no one gets write permissions. */
626 if (IS_RDONLY(vi)) 626 if (IS_RDONLY(vi))
627 vi->i_mode &= ~S_IWUGO; 627 vi->i_mode &= ~S_IWUGO;
628 if (m->flags & MFT_RECORD_IS_DIRECTORY) { 628 if (m->flags & MFT_RECORD_IS_DIRECTORY) {
@@ -2529,7 +2529,7 @@ retry_truncate:
2529 * specifies that the behaviour is unspecified thus we do not 2529 * specifies that the behaviour is unspecified thus we do not
2530 * have to do anything. This means that in our implementation 2530 * have to do anything. This means that in our implementation
2531 * in the rare case that the file is mmap()ped and a write 2531 * in the rare case that the file is mmap()ped and a write
2532 * occured into the mmap()ped region just beyond the file size 2532 * occurred into the mmap()ped region just beyond the file size
2533 * and writepage has not yet been called to write out the page 2533 * and writepage has not yet been called to write out the page
2534 * (which would clear the area beyond the file size) and we now 2534 * (which would clear the area beyond the file size) and we now
2535 * extend the file size to incorporate this dirty region 2535 * extend the file size to incorporate this dirty region
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 8b2549f672bf..faece7190866 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -286,7 +286,7 @@ typedef le16 MFT_RECORD_FLAGS;
286 * fragmented. Volume free space includes the empty part of the mft zone and 286 * fragmented. Volume free space includes the empty part of the mft zone and
287 * when the volume's free 88% are used up, the mft zone is shrunk by a factor 287 * when the volume's free 88% are used up, the mft zone is shrunk by a factor
288 * of 2, thus making more space available for more files/data. This process is 288 * of 2, thus making more space available for more files/data. This process is
289 * repeated everytime there is no more free space except for the mft zone until 289 * repeated every time there is no more free space except for the mft zone until
290 * there really is no more free space. 290 * there really is no more free space.
291 */ 291 */
292 292
@@ -1657,13 +1657,13 @@ typedef enum {
1657 * pointed to by the Owner field was provided by a defaulting mechanism 1657 * pointed to by the Owner field was provided by a defaulting mechanism
1658 * rather than explicitly provided by the original provider of the 1658 * rather than explicitly provided by the original provider of the
1659 * security descriptor. This may affect the treatment of the SID with 1659 * security descriptor. This may affect the treatment of the SID with
1660 * respect to inheritence of an owner. 1660 * respect to inheritance of an owner.
1661 * 1661 *
1662 * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in 1662 * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in
1663 * the Group field was provided by a defaulting mechanism rather than 1663 * the Group field was provided by a defaulting mechanism rather than
1664 * explicitly provided by the original provider of the security 1664 * explicitly provided by the original provider of the security
1665 * descriptor. This may affect the treatment of the SID with respect to 1665 * descriptor. This may affect the treatment of the SID with respect to
1666 * inheritence of a primary group. 1666 * inheritance of a primary group.
1667 * 1667 *
1668 * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security 1668 * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security
1669 * descriptor contains a discretionary ACL. If this flag is set and the 1669 * descriptor contains a discretionary ACL. If this flag is set and the
@@ -1674,7 +1674,7 @@ typedef enum {
1674 * pointed to by the Dacl field was provided by a defaulting mechanism 1674 * pointed to by the Dacl field was provided by a defaulting mechanism
1675 * rather than explicitly provided by the original provider of the 1675 * rather than explicitly provided by the original provider of the
1676 * security descriptor. This may affect the treatment of the ACL with 1676 * security descriptor. This may affect the treatment of the ACL with
1677 * respect to inheritence of an ACL. This flag is ignored if the 1677 * respect to inheritance of an ACL. This flag is ignored if the
1678 * DaclPresent flag is not set. 1678 * DaclPresent flag is not set.
1679 * 1679 *
1680 * SE_SACL_PRESENT - This boolean flag, when set, indicates that the security 1680 * SE_SACL_PRESENT - This boolean flag, when set, indicates that the security
@@ -1686,7 +1686,7 @@ typedef enum {
1686 * pointed to by the Sacl field was provided by a defaulting mechanism 1686 * pointed to by the Sacl field was provided by a defaulting mechanism
1687 * rather than explicitly provided by the original provider of the 1687 * rather than explicitly provided by the original provider of the
1688 * security descriptor. This may affect the treatment of the ACL with 1688 * security descriptor. This may affect the treatment of the ACL with
1689 * respect to inheritence of an ACL. This flag is ignored if the 1689 * respect to inheritance of an ACL. This flag is ignored if the
1690 * SaclPresent flag is not set. 1690 * SaclPresent flag is not set.
1691 * 1691 *
1692 * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security 1692 * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security
@@ -2283,7 +2283,7 @@ typedef struct {
2283 // the key_length is zero, then the vcn immediately 2283 // the key_length is zero, then the vcn immediately
2284 // follows the INDEX_ENTRY_HEADER. Regardless of 2284 // follows the INDEX_ENTRY_HEADER. Regardless of
2285 // key_length, the address of the 8-byte boundary 2285 // key_length, the address of the 8-byte boundary
2286 // alligned vcn of INDEX_ENTRY{_HEADER} *ie is given by 2286 // aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by
2287 // (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN), 2287 // (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN),
2288 // where sizeof(VCN) can be hardcoded as 8 if wanted. */ 2288 // where sizeof(VCN) can be hardcoded as 8 if wanted. */
2289} __attribute__ ((__packed__)) INDEX_ENTRY; 2289} __attribute__ ((__packed__)) INDEX_ENTRY;
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 4dadcdf3d451..c71de292c5ad 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -669,7 +669,7 @@ err_out:
669 * of cases where we think that a volume is dirty when in fact it is clean. 669 * of cases where we think that a volume is dirty when in fact it is clean.
670 * This should only affect volumes that have not been shutdown cleanly but did 670 * This should only affect volumes that have not been shutdown cleanly but did
671 * not have any pending, non-check-pointed i/o, i.e. they were completely idle 671 * not have any pending, non-check-pointed i/o, i.e. they were completely idle
672 * at least for the five seconds preceeding the unclean shutdown. 672 * at least for the five seconds preceding the unclean shutdown.
673 * 673 *
674 * This function assumes that the $LogFile journal has already been consistency 674 * This function assumes that the $LogFile journal has already been consistency
675 * checked by a call to ntfs_check_logfile() and in particular if the $LogFile 675 * checked by a call to ntfs_check_logfile() and in particular if the $LogFile
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
index b5a6f08bd35c..aa2b6ac3f0a4 100644
--- a/fs/ntfs/logfile.h
+++ b/fs/ntfs/logfile.h
@@ -222,7 +222,7 @@ typedef struct {
222/* 24*/ sle64 file_size; /* Usable byte size of the log file. If the 222/* 24*/ sle64 file_size; /* Usable byte size of the log file. If the
223 restart_area_offset + the offset of the 223 restart_area_offset + the offset of the
224 file_size are > 510 then corruption has 224 file_size are > 510 then corruption has
225 occured. This is the very first check when 225 occurred. This is the very first check when
226 starting with the restart_area as if it 226 starting with the restart_area as if it
227 fails it means that some of the above values 227 fails it means that some of the above values
228 will be corrupted by the multi sector 228 will be corrupted by the multi sector
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 326e7475a22a..382857f9c7db 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -73,7 +73,7 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
73 if (index > end_index || (i_size & ~PAGE_CACHE_MASK) < ofs + 73 if (index > end_index || (i_size & ~PAGE_CACHE_MASK) < ofs +
74 vol->mft_record_size) { 74 vol->mft_record_size) {
75 page = ERR_PTR(-ENOENT); 75 page = ERR_PTR(-ENOENT);
76 ntfs_error(vol->sb, "Attemt to read mft record 0x%lx, " 76 ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, "
77 "which is beyond the end of the mft. " 77 "which is beyond the end of the mft. "
78 "This is probably a bug in the ntfs " 78 "This is probably a bug in the ntfs "
79 "driver.", ni->mft_no); 79 "driver.", ni->mft_no);
@@ -1442,7 +1442,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
1442 // Note: It will need to be a special mft record and if none of 1442 // Note: It will need to be a special mft record and if none of
1443 // those are available it gets rather complicated... 1443 // those are available it gets rather complicated...
1444 ntfs_error(vol->sb, "Not enough space in this mft record to " 1444 ntfs_error(vol->sb, "Not enough space in this mft record to "
1445 "accomodate extended mft bitmap attribute " 1445 "accommodate extended mft bitmap attribute "
1446 "extent. Cannot handle this yet."); 1446 "extent. Cannot handle this yet.");
1447 ret = -EOPNOTSUPP; 1447 ret = -EOPNOTSUPP;
1448 goto undo_alloc; 1448 goto undo_alloc;
@@ -1879,7 +1879,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
1879 // and we would then need to update all references to this mft 1879 // and we would then need to update all references to this mft
1880 // record appropriately. This is rather complicated... 1880 // record appropriately. This is rather complicated...
1881 ntfs_error(vol->sb, "Not enough space in this mft record to " 1881 ntfs_error(vol->sb, "Not enough space in this mft record to "
1882 "accomodate extended mft data attribute " 1882 "accommodate extended mft data attribute "
1883 "extent. Cannot handle this yet."); 1883 "extent. Cannot handle this yet.");
1884 ret = -EOPNOTSUPP; 1884 ret = -EOPNOTSUPP;
1885 goto undo_alloc; 1885 goto undo_alloc;
@@ -2357,7 +2357,7 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
2357 } 2357 }
2358#ifdef DEBUG 2358#ifdef DEBUG
2359 read_lock_irqsave(&mftbmp_ni->size_lock, flags); 2359 read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2360 ntfs_debug("Status of mftbmp after initialized extention: " 2360 ntfs_debug("Status of mftbmp after initialized extension: "
2361 "allocated_size 0x%llx, data_size 0x%llx, " 2361 "allocated_size 0x%llx, data_size 0x%llx, "
2362 "initialized_size 0x%llx.", 2362 "initialized_size 0x%llx.",
2363 (long long)mftbmp_ni->allocated_size, 2363 (long long)mftbmp_ni->allocated_size,
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index 56a9a6d25a2a..eac7d6788a10 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -1243,7 +1243,7 @@ err_out:
1243 * write. 1243 * write.
1244 * 1244 *
1245 * This is used when building the mapping pairs array of a runlist to compress 1245 * This is used when building the mapping pairs array of a runlist to compress
1246 * a given logical cluster number (lcn) or a specific run length to the minumum 1246 * a given logical cluster number (lcn) or a specific run length to the minimum
1247 * size possible. 1247 * size possible.
1248 * 1248 *
1249 * Return the number of bytes written on success. On error, i.e. the 1249 * Return the number of bytes written on success. On error, i.e. the
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 29099a07b9fe..b52706da4645 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -458,7 +458,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
458 * the volume on boot and updates them. 458 * the volume on boot and updates them.
459 * 459 *
460 * When remounting read-only, mark the volume clean if no volume errors 460 * When remounting read-only, mark the volume clean if no volume errors
461 * have occured. 461 * have occurred.
462 */ 462 */
463 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 463 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
464 static const char *es = ". Cannot remount read-write."; 464 static const char *es = ". Cannot remount read-write.";
@@ -1269,7 +1269,7 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
1269 "hibernated on the volume."); 1269 "hibernated on the volume.");
1270 return 0; 1270 return 0;
1271 } 1271 }
1272 /* A real error occured. */ 1272 /* A real error occurred. */
1273 ntfs_error(vol->sb, "Failed to find inode number for " 1273 ntfs_error(vol->sb, "Failed to find inode number for "
1274 "hiberfil.sys."); 1274 "hiberfil.sys.");
1275 return ret; 1275 return ret;
@@ -1370,7 +1370,7 @@ static bool load_and_init_quota(ntfs_volume *vol)
1370 NVolSetQuotaOutOfDate(vol); 1370 NVolSetQuotaOutOfDate(vol);
1371 return true; 1371 return true;
1372 } 1372 }
1373 /* A real error occured. */ 1373 /* A real error occurred. */
1374 ntfs_error(vol->sb, "Failed to find inode number for $Quota."); 1374 ntfs_error(vol->sb, "Failed to find inode number for $Quota.");
1375 return false; 1375 return false;
1376 } 1376 }
@@ -1454,7 +1454,7 @@ not_enabled:
1454 NVolSetUsnJrnlStamped(vol); 1454 NVolSetUsnJrnlStamped(vol);
1455 return true; 1455 return true;
1456 } 1456 }
1457 /* A real error occured. */ 1457 /* A real error occurred. */
1458 ntfs_error(vol->sb, "Failed to find inode number for " 1458 ntfs_error(vol->sb, "Failed to find inode number for "
1459 "$UsnJrnl."); 1459 "$UsnJrnl.");
1460 return false; 1460 return false;
@@ -2292,7 +2292,7 @@ static void ntfs_put_super(struct super_block *sb)
2292 ntfs_commit_inode(vol->mft_ino); 2292 ntfs_commit_inode(vol->mft_ino);
2293 2293
2294 /* 2294 /*
2295 * If a read-write mount and no volume errors have occured, mark the 2295 * If a read-write mount and no volume errors have occurred, mark the
2296 * volume clean. Also, re-commit all affected inodes. 2296 * volume clean. Also, re-commit all affected inodes.
2297 */ 2297 */
2298 if (!(sb->s_flags & MS_RDONLY)) { 2298 if (!(sb->s_flags & MS_RDONLY)) {
@@ -2496,7 +2496,7 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2496 if (vol->nr_clusters & 63) 2496 if (vol->nr_clusters & 63)
2497 nr_free += 64 - (vol->nr_clusters & 63); 2497 nr_free += 64 - (vol->nr_clusters & 63);
2498 up_read(&vol->lcnbmp_lock); 2498 up_read(&vol->lcnbmp_lock);
2499 /* If errors occured we may well have gone below zero, fix this. */ 2499 /* If errors occurred we may well have gone below zero, fix this. */
2500 if (nr_free < 0) 2500 if (nr_free < 0)
2501 nr_free = 0; 2501 nr_free = 0;
2502 ntfs_debug("Exiting."); 2502 ntfs_debug("Exiting.");
@@ -2561,7 +2561,7 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2561 } 2561 }
2562 ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.", 2562 ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.",
2563 index - 1); 2563 index - 1);
2564 /* If errors occured we may well have gone below zero, fix this. */ 2564 /* If errors occurred we may well have gone below zero, fix this. */
2565 if (nr_free < 0) 2565 if (nr_free < 0)
2566 nr_free = 0; 2566 nr_free = 0;
2567 ntfs_debug("Exiting."); 2567 ntfs_debug("Exiting.");
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 07d9fd854350..d8a0313e99e6 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -1,6 +1,6 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1ccflags-y := -Ifs/ocfs2
2 2
3EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES 3ccflags-y += -DCATCH_BH_JBD_RACES
4 4
5obj-$(CONFIG_OCFS2_FS) += \ 5obj-$(CONFIG_OCFS2_FS) += \
6 ocfs2.o \ 6 ocfs2.o \
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 704f6b1742f3..e913ad130fdd 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -24,7 +24,6 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/string.h> 25#include <linux/string.h>
26 26
27#define MLOG_MASK_PREFIX ML_INODE
28#include <cluster/masklog.h> 27#include <cluster/masklog.h>
29 28
30#include "ocfs2.h" 29#include "ocfs2.h"
@@ -497,7 +496,7 @@ static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name,
497 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 496 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
498 return -EOPNOTSUPP; 497 return -EOPNOTSUPP;
499 498
500 if (!is_owner_or_cap(inode)) 499 if (!inode_owner_or_capable(inode))
501 return -EPERM; 500 return -EPERM;
502 501
503 if (value) { 502 if (value) {
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index e4984e259cb6..48aa9c7401c7 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -30,7 +30,6 @@
30#include <linux/swap.h> 30#include <linux/swap.h>
31#include <linux/quotaops.h> 31#include <linux/quotaops.h>
32 32
33#define MLOG_MASK_PREFIX ML_DISK_ALLOC
34#include <cluster/masklog.h> 33#include <cluster/masklog.h>
35 34
36#include "ocfs2.h" 35#include "ocfs2.h"
@@ -50,6 +49,7 @@
50#include "uptodate.h" 49#include "uptodate.h"
51#include "xattr.h" 50#include "xattr.h"
52#include "refcounttree.h" 51#include "refcounttree.h"
52#include "ocfs2_trace.h"
53 53
54#include "buffer_head_io.h" 54#include "buffer_head_io.h"
55 55
@@ -886,8 +886,7 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
886 struct ocfs2_extent_block *eb = 886 struct ocfs2_extent_block *eb =
887 (struct ocfs2_extent_block *)bh->b_data; 887 (struct ocfs2_extent_block *)bh->b_data;
888 888
889 mlog(0, "Validating extent block %llu\n", 889 trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr);
890 (unsigned long long)bh->b_blocknr);
891 890
892 BUG_ON(!buffer_uptodate(bh)); 891 BUG_ON(!buffer_uptodate(bh));
893 892
@@ -965,8 +964,6 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
965 struct buffer_head *eb_bh = NULL; 964 struct buffer_head *eb_bh = NULL;
966 u64 last_eb_blk = 0; 965 u64 last_eb_blk = 0;
967 966
968 mlog_entry_void();
969
970 el = et->et_root_el; 967 el = et->et_root_el;
971 last_eb_blk = ocfs2_et_get_last_eb_blk(et); 968 last_eb_blk = ocfs2_et_get_last_eb_blk(et);
972 969
@@ -987,7 +984,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
987bail: 984bail:
988 brelse(eb_bh); 985 brelse(eb_bh);
989 986
990 mlog_exit(retval); 987 trace_ocfs2_num_free_extents(retval);
991 return retval; 988 return retval;
992} 989}
993 990
@@ -1010,8 +1007,6 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1010 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); 1007 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1011 struct ocfs2_extent_block *eb; 1008 struct ocfs2_extent_block *eb;
1012 1009
1013 mlog_entry_void();
1014
1015 count = 0; 1010 count = 0;
1016 while (count < wanted) { 1011 while (count < wanted) {
1017 status = ocfs2_claim_metadata(handle, 1012 status = ocfs2_claim_metadata(handle,
@@ -1074,8 +1069,8 @@ bail:
1074 brelse(bhs[i]); 1069 brelse(bhs[i]);
1075 bhs[i] = NULL; 1070 bhs[i] = NULL;
1076 } 1071 }
1072 mlog_errno(status);
1077 } 1073 }
1078 mlog_exit(status);
1079 return status; 1074 return status;
1080} 1075}
1081 1076
@@ -1173,8 +1168,6 @@ static int ocfs2_add_branch(handle_t *handle,
1173 struct ocfs2_extent_list *el; 1168 struct ocfs2_extent_list *el;
1174 u32 new_cpos, root_end; 1169 u32 new_cpos, root_end;
1175 1170
1176 mlog_entry_void();
1177
1178 BUG_ON(!last_eb_bh || !*last_eb_bh); 1171 BUG_ON(!last_eb_bh || !*last_eb_bh);
1179 1172
1180 if (eb_bh) { 1173 if (eb_bh) {
@@ -1200,8 +1193,11 @@ static int ocfs2_add_branch(handle_t *handle,
1200 * from new_cpos). 1193 * from new_cpos).
1201 */ 1194 */
1202 if (root_end > new_cpos) { 1195 if (root_end > new_cpos) {
1203 mlog(0, "adjust the cluster end from %u to %u\n", 1196 trace_ocfs2_adjust_rightmost_branch(
1204 root_end, new_cpos); 1197 (unsigned long long)
1198 ocfs2_metadata_cache_owner(et->et_ci),
1199 root_end, new_cpos);
1200
1205 status = ocfs2_adjust_rightmost_branch(handle, et); 1201 status = ocfs2_adjust_rightmost_branch(handle, et);
1206 if (status) { 1202 if (status) {
1207 mlog_errno(status); 1203 mlog_errno(status);
@@ -1332,7 +1328,6 @@ bail:
1332 kfree(new_eb_bhs); 1328 kfree(new_eb_bhs);
1333 } 1329 }
1334 1330
1335 mlog_exit(status);
1336 return status; 1331 return status;
1337} 1332}
1338 1333
@@ -1353,8 +1348,6 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1353 struct ocfs2_extent_list *root_el; 1348 struct ocfs2_extent_list *root_el;
1354 struct ocfs2_extent_list *eb_el; 1349 struct ocfs2_extent_list *eb_el;
1355 1350
1356 mlog_entry_void();
1357
1358 status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, 1351 status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
1359 &new_eb_bh); 1352 &new_eb_bh);
1360 if (status < 0) { 1353 if (status < 0) {
@@ -1415,7 +1408,6 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1415bail: 1408bail:
1416 brelse(new_eb_bh); 1409 brelse(new_eb_bh);
1417 1410
1418 mlog_exit(status);
1419 return status; 1411 return status;
1420} 1412}
1421 1413
@@ -1446,8 +1438,6 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1446 struct buffer_head *bh = NULL; 1438 struct buffer_head *bh = NULL;
1447 struct buffer_head *lowest_bh = NULL; 1439 struct buffer_head *lowest_bh = NULL;
1448 1440
1449 mlog_entry_void();
1450
1451 *target_bh = NULL; 1441 *target_bh = NULL;
1452 1442
1453 el = et->et_root_el; 1443 el = et->et_root_el;
@@ -1503,7 +1493,6 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1503bail: 1493bail:
1504 brelse(bh); 1494 brelse(bh);
1505 1495
1506 mlog_exit(status);
1507 return status; 1496 return status;
1508} 1497}
1509 1498
@@ -1540,7 +1529,10 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
1540 * another tree level */ 1529 * another tree level */
1541 if (shift) { 1530 if (shift) {
1542 BUG_ON(bh); 1531 BUG_ON(bh);
1543 mlog(0, "need to shift tree depth (current = %d)\n", depth); 1532 trace_ocfs2_grow_tree(
1533 (unsigned long long)
1534 ocfs2_metadata_cache_owner(et->et_ci),
1535 depth);
1544 1536
1545 /* ocfs2_shift_tree_depth will return us a buffer with 1537 /* ocfs2_shift_tree_depth will return us a buffer with
1546 * the new extent block (so we can pass that to 1538 * the new extent block (so we can pass that to
@@ -1570,7 +1562,6 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
1570 1562
1571 /* call ocfs2_add_branch to add the final part of the tree with 1563 /* call ocfs2_add_branch to add the final part of the tree with
1572 * the new data. */ 1564 * the new data. */
1573 mlog(0, "add branch. bh = %p\n", bh);
1574 ret = ocfs2_add_branch(handle, et, bh, last_eb_bh, 1565 ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
1575 meta_ac); 1566 meta_ac);
1576 if (ret < 0) { 1567 if (ret < 0) {
@@ -1645,8 +1636,9 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1645 } 1636 }
1646 insert_index = i; 1637 insert_index = i;
1647 1638
1648 mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n", 1639 trace_ocfs2_rotate_leaf(insert_cpos, insert_index,
1649 insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count)); 1640 has_empty, next_free,
1641 le16_to_cpu(el->l_count));
1650 1642
1651 BUG_ON(insert_index < 0); 1643 BUG_ON(insert_index < 0);
1652 BUG_ON(insert_index >= le16_to_cpu(el->l_count)); 1644 BUG_ON(insert_index >= le16_to_cpu(el->l_count));
@@ -2059,7 +2051,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2059 left_el = path_leaf_el(left_path); 2051 left_el = path_leaf_el(left_path);
2060 right_el = path_leaf_el(right_path); 2052 right_el = path_leaf_el(right_path);
2061 for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { 2053 for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
2062 mlog(0, "Adjust records at index %u\n", i); 2054 trace_ocfs2_complete_edge_insert(i);
2063 2055
2064 /* 2056 /*
2065 * One nice property of knowing that all of these 2057 * One nice property of knowing that all of these
@@ -2389,7 +2381,9 @@ static int ocfs2_rotate_tree_right(handle_t *handle,
2389 goto out; 2381 goto out;
2390 } 2382 }
2391 2383
2392 mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos); 2384 trace_ocfs2_rotate_tree_right(
2385 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2386 insert_cpos, cpos);
2393 2387
2394 /* 2388 /*
2395 * What we want to do here is: 2389 * What we want to do here is:
@@ -2418,8 +2412,10 @@ static int ocfs2_rotate_tree_right(handle_t *handle,
2418 * rotating subtrees. 2412 * rotating subtrees.
2419 */ 2413 */
2420 while (cpos && insert_cpos <= cpos) { 2414 while (cpos && insert_cpos <= cpos) {
2421 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", 2415 trace_ocfs2_rotate_tree_right(
2422 insert_cpos, cpos); 2416 (unsigned long long)
2417 ocfs2_metadata_cache_owner(et->et_ci),
2418 insert_cpos, cpos);
2423 2419
2424 ret = ocfs2_find_path(et->et_ci, left_path, cpos); 2420 ret = ocfs2_find_path(et->et_ci, left_path, cpos);
2425 if (ret) { 2421 if (ret) {
@@ -2461,10 +2457,10 @@ static int ocfs2_rotate_tree_right(handle_t *handle,
2461 2457
2462 start = ocfs2_find_subtree_root(et, left_path, right_path); 2458 start = ocfs2_find_subtree_root(et, left_path, right_path);
2463 2459
2464 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 2460 trace_ocfs2_rotate_subtree(start,
2465 start, 2461 (unsigned long long)
2466 (unsigned long long) right_path->p_node[start].bh->b_blocknr, 2462 right_path->p_node[start].bh->b_blocknr,
2467 right_path->p_tree_depth); 2463 right_path->p_tree_depth);
2468 2464
2469 ret = ocfs2_extend_rotate_transaction(handle, start, 2465 ret = ocfs2_extend_rotate_transaction(handle, start,
2470 orig_credits, right_path); 2466 orig_credits, right_path);
@@ -2964,8 +2960,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
2964 subtree_root = ocfs2_find_subtree_root(et, left_path, 2960 subtree_root = ocfs2_find_subtree_root(et, left_path,
2965 right_path); 2961 right_path);
2966 2962
2967 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 2963 trace_ocfs2_rotate_subtree(subtree_root,
2968 subtree_root,
2969 (unsigned long long) 2964 (unsigned long long)
2970 right_path->p_node[subtree_root].bh->b_blocknr, 2965 right_path->p_node[subtree_root].bh->b_blocknr,
2971 right_path->p_tree_depth); 2966 right_path->p_tree_depth);
@@ -3989,9 +3984,11 @@ static int ocfs2_append_rec_to_path(handle_t *handle,
3989 goto out; 3984 goto out;
3990 } 3985 }
3991 3986
3992 mlog(0, "Append may need a left path update. cpos: %u, " 3987 trace_ocfs2_append_rec_to_path(
3993 "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), 3988 (unsigned long long)
3994 left_cpos); 3989 ocfs2_metadata_cache_owner(et->et_ci),
3990 le32_to_cpu(insert_rec->e_cpos),
3991 left_cpos);
3995 3992
3996 /* 3993 /*
3997 * No need to worry if the append is already in the 3994 * No need to worry if the append is already in the
@@ -4522,7 +4519,7 @@ set_tail_append:
4522} 4519}
4523 4520
4524/* 4521/*
4525 * Helper function called at the begining of an insert. 4522 * Helper function called at the beginning of an insert.
4526 * 4523 *
4527 * This computes a few things that are commonly used in the process of 4524 * This computes a few things that are commonly used in the process of
4528 * inserting into the btree: 4525 * inserting into the btree:
@@ -4562,7 +4559,7 @@ static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
4562 ocfs2_et_get_last_eb_blk(et), 4559 ocfs2_et_get_last_eb_blk(et),
4563 &bh); 4560 &bh);
4564 if (ret) { 4561 if (ret) {
4565 mlog_exit(ret); 4562 mlog_errno(ret);
4566 goto out; 4563 goto out;
4567 } 4564 }
4568 eb = (struct ocfs2_extent_block *) bh->b_data; 4565 eb = (struct ocfs2_extent_block *) bh->b_data;
@@ -4678,9 +4675,9 @@ int ocfs2_insert_extent(handle_t *handle,
4678 struct ocfs2_insert_type insert = {0, }; 4675 struct ocfs2_insert_type insert = {0, };
4679 struct ocfs2_extent_rec rec; 4676 struct ocfs2_extent_rec rec;
4680 4677
4681 mlog(0, "add %u clusters at position %u to owner %llu\n", 4678 trace_ocfs2_insert_extent_start(
4682 new_clusters, cpos, 4679 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4683 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); 4680 cpos, new_clusters);
4684 4681
4685 memset(&rec, 0, sizeof(rec)); 4682 memset(&rec, 0, sizeof(rec));
4686 rec.e_cpos = cpu_to_le32(cpos); 4683 rec.e_cpos = cpu_to_le32(cpos);
@@ -4700,11 +4697,9 @@ int ocfs2_insert_extent(handle_t *handle,
4700 goto bail; 4697 goto bail;
4701 } 4698 }
4702 4699
4703 mlog(0, "Insert.appending: %u, Insert.Contig: %u, " 4700 trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig,
4704 "Insert.contig_index: %d, Insert.free_records: %d, " 4701 insert.ins_contig_index, free_records,
4705 "Insert.tree_depth: %d\n", 4702 insert.ins_tree_depth);
4706 insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4707 free_records, insert.ins_tree_depth);
4708 4703
4709 if (insert.ins_contig == CONTIG_NONE && free_records == 0) { 4704 if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4710 status = ocfs2_grow_tree(handle, et, 4705 status = ocfs2_grow_tree(handle, et,
@@ -4726,7 +4721,6 @@ int ocfs2_insert_extent(handle_t *handle,
4726bail: 4721bail:
4727 brelse(last_eb_bh); 4722 brelse(last_eb_bh);
4728 4723
4729 mlog_exit(status);
4730 return status; 4724 return status;
4731} 4725}
4732 4726
@@ -4746,7 +4740,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4746 struct ocfs2_alloc_context *meta_ac, 4740 struct ocfs2_alloc_context *meta_ac,
4747 enum ocfs2_alloc_restarted *reason_ret) 4741 enum ocfs2_alloc_restarted *reason_ret)
4748{ 4742{
4749 int status = 0; 4743 int status = 0, err = 0;
4750 int free_extents; 4744 int free_extents;
4751 enum ocfs2_alloc_restarted reason = RESTART_NONE; 4745 enum ocfs2_alloc_restarted reason = RESTART_NONE;
4752 u32 bit_off, num_bits; 4746 u32 bit_off, num_bits;
@@ -4773,14 +4767,14 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4773 * 2) we are so fragmented, we've needed to add metadata too 4767 * 2) we are so fragmented, we've needed to add metadata too
4774 * many times. */ 4768 * many times. */
4775 if (!free_extents && !meta_ac) { 4769 if (!free_extents && !meta_ac) {
4776 mlog(0, "we haven't reserved any metadata!\n"); 4770 err = -1;
4777 status = -EAGAIN; 4771 status = -EAGAIN;
4778 reason = RESTART_META; 4772 reason = RESTART_META;
4779 goto leave; 4773 goto leave;
4780 } else if ((!free_extents) 4774 } else if ((!free_extents)
4781 && (ocfs2_alloc_context_bits_left(meta_ac) 4775 && (ocfs2_alloc_context_bits_left(meta_ac)
4782 < ocfs2_extend_meta_needed(et->et_root_el))) { 4776 < ocfs2_extend_meta_needed(et->et_root_el))) {
4783 mlog(0, "filesystem is really fragmented...\n"); 4777 err = -2;
4784 status = -EAGAIN; 4778 status = -EAGAIN;
4785 reason = RESTART_META; 4779 reason = RESTART_META;
4786 goto leave; 4780 goto leave;
@@ -4805,9 +4799,9 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4805 } 4799 }
4806 4800
4807 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 4801 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4808 mlog(0, "Allocating %u clusters at block %u for owner %llu\n", 4802 trace_ocfs2_add_clusters_in_btree(
4809 num_bits, bit_off, 4803 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4810 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); 4804 bit_off, num_bits);
4811 status = ocfs2_insert_extent(handle, et, *logical_offset, block, 4805 status = ocfs2_insert_extent(handle, et, *logical_offset, block,
4812 num_bits, flags, meta_ac); 4806 num_bits, flags, meta_ac);
4813 if (status < 0) { 4807 if (status < 0) {
@@ -4821,16 +4815,15 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4821 *logical_offset += num_bits; 4815 *logical_offset += num_bits;
4822 4816
4823 if (clusters_to_add) { 4817 if (clusters_to_add) {
4824 mlog(0, "need to alloc once more, wanted = %u\n", 4818 err = clusters_to_add;
4825 clusters_to_add);
4826 status = -EAGAIN; 4819 status = -EAGAIN;
4827 reason = RESTART_TRANS; 4820 reason = RESTART_TRANS;
4828 } 4821 }
4829 4822
4830leave: 4823leave:
4831 mlog_exit(status);
4832 if (reason_ret) 4824 if (reason_ret)
4833 *reason_ret = reason; 4825 *reason_ret = reason;
4826 trace_ocfs2_add_clusters_in_btree_ret(status, reason, err);
4834 return status; 4827 return status;
4835} 4828}
4836 4829
@@ -5039,7 +5032,7 @@ int ocfs2_split_extent(handle_t *handle,
5039 ocfs2_et_get_last_eb_blk(et), 5032 ocfs2_et_get_last_eb_blk(et),
5040 &last_eb_bh); 5033 &last_eb_bh);
5041 if (ret) { 5034 if (ret) {
5042 mlog_exit(ret); 5035 mlog_errno(ret);
5043 goto out; 5036 goto out;
5044 } 5037 }
5045 5038
@@ -5056,9 +5049,9 @@ int ocfs2_split_extent(handle_t *handle,
5056 5049
5057 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]); 5050 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
5058 5051
5059 mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n", 5052 trace_ocfs2_split_extent(split_index, ctxt.c_contig_type,
5060 split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent, 5053 ctxt.c_has_empty_extent,
5061 ctxt.c_split_covers_rec); 5054 ctxt.c_split_covers_rec);
5062 5055
5063 if (ctxt.c_contig_type == CONTIG_NONE) { 5056 if (ctxt.c_contig_type == CONTIG_NONE) {
5064 if (ctxt.c_split_covers_rec) 5057 if (ctxt.c_split_covers_rec)
@@ -5192,8 +5185,9 @@ int ocfs2_mark_extent_written(struct inode *inode,
5192{ 5185{
5193 int ret; 5186 int ret;
5194 5187
5195 mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n", 5188 trace_ocfs2_mark_extent_written(
5196 inode->i_ino, cpos, len, phys); 5189 (unsigned long long)OCFS2_I(inode)->ip_blkno,
5190 cpos, len, phys);
5197 5191
5198 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { 5192 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5199 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " 5193 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
@@ -5512,11 +5506,10 @@ int ocfs2_remove_extent(handle_t *handle,
5512 5506
5513 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range); 5507 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5514 5508
5515 mlog(0, "Owner %llu, remove (cpos %u, len %u). Existing index %d " 5509 trace_ocfs2_remove_extent(
5516 "(cpos %u, len %u)\n", 5510 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5517 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 5511 cpos, len, index, le32_to_cpu(rec->e_cpos),
5518 cpos, len, index, 5512 ocfs2_rec_clusters(el, rec));
5519 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5520 5513
5521 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { 5514 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5522 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc, 5515 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
@@ -5795,9 +5788,6 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5795 struct ocfs2_dinode *di; 5788 struct ocfs2_dinode *di;
5796 struct ocfs2_truncate_log *tl; 5789 struct ocfs2_truncate_log *tl;
5797 5790
5798 mlog_entry("start_blk = %llu, num_clusters = %u\n",
5799 (unsigned long long)start_blk, num_clusters);
5800
5801 BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 5791 BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5802 5792
5803 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); 5793 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
@@ -5834,10 +5824,9 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5834 goto bail; 5824 goto bail;
5835 } 5825 }
5836 5826
5837 mlog(0, "Log truncate of %u clusters starting at cluster %u to " 5827 trace_ocfs2_truncate_log_append(
5838 "%llu (index = %d)\n", num_clusters, start_cluster, 5828 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index,
5839 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index); 5829 start_cluster, num_clusters);
5840
5841 if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { 5830 if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5842 /* 5831 /*
5843 * Move index back to the record we are coalescing with. 5832 * Move index back to the record we are coalescing with.
@@ -5846,9 +5835,10 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5846 index--; 5835 index--;
5847 5836
5848 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); 5837 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5849 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n", 5838 trace_ocfs2_truncate_log_append(
5850 index, le32_to_cpu(tl->tl_recs[index].t_start), 5839 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5851 num_clusters); 5840 index, le32_to_cpu(tl->tl_recs[index].t_start),
5841 num_clusters);
5852 } else { 5842 } else {
5853 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); 5843 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5854 tl->tl_used = cpu_to_le16(index + 1); 5844 tl->tl_used = cpu_to_le16(index + 1);
@@ -5859,7 +5849,6 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5859 5849
5860 osb->truncated_clusters += num_clusters; 5850 osb->truncated_clusters += num_clusters;
5861bail: 5851bail:
5862 mlog_exit(status);
5863 return status; 5852 return status;
5864} 5853}
5865 5854
@@ -5878,8 +5867,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5878 struct inode *tl_inode = osb->osb_tl_inode; 5867 struct inode *tl_inode = osb->osb_tl_inode;
5879 struct buffer_head *tl_bh = osb->osb_tl_bh; 5868 struct buffer_head *tl_bh = osb->osb_tl_bh;
5880 5869
5881 mlog_entry_void();
5882
5883 di = (struct ocfs2_dinode *) tl_bh->b_data; 5870 di = (struct ocfs2_dinode *) tl_bh->b_data;
5884 tl = &di->id2.i_dealloc; 5871 tl = &di->id2.i_dealloc;
5885 i = le16_to_cpu(tl->tl_used) - 1; 5872 i = le16_to_cpu(tl->tl_used) - 1;
@@ -5915,8 +5902,9 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5915 /* if start_blk is not set, we ignore the record as 5902 /* if start_blk is not set, we ignore the record as
5916 * invalid. */ 5903 * invalid. */
5917 if (start_blk) { 5904 if (start_blk) {
5918 mlog(0, "free record %d, start = %u, clusters = %u\n", 5905 trace_ocfs2_replay_truncate_records(
5919 i, le32_to_cpu(rec.t_start), num_clusters); 5906 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5907 i, le32_to_cpu(rec.t_start), num_clusters);
5920 5908
5921 status = ocfs2_free_clusters(handle, data_alloc_inode, 5909 status = ocfs2_free_clusters(handle, data_alloc_inode,
5922 data_alloc_bh, start_blk, 5910 data_alloc_bh, start_blk,
@@ -5932,7 +5920,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5932 osb->truncated_clusters = 0; 5920 osb->truncated_clusters = 0;
5933 5921
5934bail: 5922bail:
5935 mlog_exit(status);
5936 return status; 5923 return status;
5937} 5924}
5938 5925
@@ -5949,8 +5936,6 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5949 struct ocfs2_dinode *di; 5936 struct ocfs2_dinode *di;
5950 struct ocfs2_truncate_log *tl; 5937 struct ocfs2_truncate_log *tl;
5951 5938
5952 mlog_entry_void();
5953
5954 BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 5939 BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5955 5940
5956 di = (struct ocfs2_dinode *) tl_bh->b_data; 5941 di = (struct ocfs2_dinode *) tl_bh->b_data;
@@ -5962,8 +5947,9 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5962 5947
5963 tl = &di->id2.i_dealloc; 5948 tl = &di->id2.i_dealloc;
5964 num_to_flush = le16_to_cpu(tl->tl_used); 5949 num_to_flush = le16_to_cpu(tl->tl_used);
5965 mlog(0, "Flush %u records from truncate log #%llu\n", 5950 trace_ocfs2_flush_truncate_log(
5966 num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); 5951 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5952 num_to_flush);
5967 if (!num_to_flush) { 5953 if (!num_to_flush) {
5968 status = 0; 5954 status = 0;
5969 goto out; 5955 goto out;
@@ -6009,7 +5995,6 @@ out_mutex:
6009 iput(data_alloc_inode); 5995 iput(data_alloc_inode);
6010 5996
6011out: 5997out:
6012 mlog_exit(status);
6013 return status; 5998 return status;
6014} 5999}
6015 6000
@@ -6032,15 +6017,11 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
6032 container_of(work, struct ocfs2_super, 6017 container_of(work, struct ocfs2_super,
6033 osb_truncate_log_wq.work); 6018 osb_truncate_log_wq.work);
6034 6019
6035 mlog_entry_void();
6036
6037 status = ocfs2_flush_truncate_log(osb); 6020 status = ocfs2_flush_truncate_log(osb);
6038 if (status < 0) 6021 if (status < 0)
6039 mlog_errno(status); 6022 mlog_errno(status);
6040 else 6023 else
6041 ocfs2_init_steal_slots(osb); 6024 ocfs2_init_steal_slots(osb);
6042
6043 mlog_exit(status);
6044} 6025}
6045 6026
6046#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) 6027#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
@@ -6086,7 +6067,6 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
6086 *tl_inode = inode; 6067 *tl_inode = inode;
6087 *tl_bh = bh; 6068 *tl_bh = bh;
6088bail: 6069bail:
6089 mlog_exit(status);
6090 return status; 6070 return status;
6091} 6071}
6092 6072
@@ -6106,7 +6086,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
6106 6086
6107 *tl_copy = NULL; 6087 *tl_copy = NULL;
6108 6088
6109 mlog(0, "recover truncate log from slot %d\n", slot_num); 6089 trace_ocfs2_begin_truncate_log_recovery(slot_num);
6110 6090
6111 status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); 6091 status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
6112 if (status < 0) { 6092 if (status < 0) {
@@ -6123,8 +6103,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
6123 6103
6124 tl = &di->id2.i_dealloc; 6104 tl = &di->id2.i_dealloc;
6125 if (le16_to_cpu(tl->tl_used)) { 6105 if (le16_to_cpu(tl->tl_used)) {
6126 mlog(0, "We'll have %u logs to recover\n", 6106 trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used));
6127 le16_to_cpu(tl->tl_used));
6128 6107
6129 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); 6108 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
6130 if (!(*tl_copy)) { 6109 if (!(*tl_copy)) {
@@ -6157,9 +6136,9 @@ bail:
6157 if (status < 0 && (*tl_copy)) { 6136 if (status < 0 && (*tl_copy)) {
6158 kfree(*tl_copy); 6137 kfree(*tl_copy);
6159 *tl_copy = NULL; 6138 *tl_copy = NULL;
6139 mlog_errno(status);
6160 } 6140 }
6161 6141
6162 mlog_exit(status);
6163 return status; 6142 return status;
6164} 6143}
6165 6144
@@ -6174,8 +6153,6 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
6174 struct inode *tl_inode = osb->osb_tl_inode; 6153 struct inode *tl_inode = osb->osb_tl_inode;
6175 struct ocfs2_truncate_log *tl; 6154 struct ocfs2_truncate_log *tl;
6176 6155
6177 mlog_entry_void();
6178
6179 if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { 6156 if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
6180 mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); 6157 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
6181 return -EINVAL; 6158 return -EINVAL;
@@ -6183,8 +6160,9 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
6183 6160
6184 tl = &tl_copy->id2.i_dealloc; 6161 tl = &tl_copy->id2.i_dealloc;
6185 num_recs = le16_to_cpu(tl->tl_used); 6162 num_recs = le16_to_cpu(tl->tl_used);
6186 mlog(0, "cleanup %u records from %llu\n", num_recs, 6163 trace_ocfs2_complete_truncate_log_recovery(
6187 (unsigned long long)le64_to_cpu(tl_copy->i_blkno)); 6164 (unsigned long long)le64_to_cpu(tl_copy->i_blkno),
6165 num_recs);
6188 6166
6189 mutex_lock(&tl_inode->i_mutex); 6167 mutex_lock(&tl_inode->i_mutex);
6190 for(i = 0; i < num_recs; i++) { 6168 for(i = 0; i < num_recs; i++) {
@@ -6219,7 +6197,6 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
6219bail_up: 6197bail_up:
6220 mutex_unlock(&tl_inode->i_mutex); 6198 mutex_unlock(&tl_inode->i_mutex);
6221 6199
6222 mlog_exit(status);
6223 return status; 6200 return status;
6224} 6201}
6225 6202
@@ -6228,8 +6205,6 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
6228 int status; 6205 int status;
6229 struct inode *tl_inode = osb->osb_tl_inode; 6206 struct inode *tl_inode = osb->osb_tl_inode;
6230 6207
6231 mlog_entry_void();
6232
6233 if (tl_inode) { 6208 if (tl_inode) {
6234 cancel_delayed_work(&osb->osb_truncate_log_wq); 6209 cancel_delayed_work(&osb->osb_truncate_log_wq);
6235 flush_workqueue(ocfs2_wq); 6210 flush_workqueue(ocfs2_wq);
@@ -6241,8 +6216,6 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
6241 brelse(osb->osb_tl_bh); 6216 brelse(osb->osb_tl_bh);
6242 iput(osb->osb_tl_inode); 6217 iput(osb->osb_tl_inode);
6243 } 6218 }
6244
6245 mlog_exit_void();
6246} 6219}
6247 6220
6248int ocfs2_truncate_log_init(struct ocfs2_super *osb) 6221int ocfs2_truncate_log_init(struct ocfs2_super *osb)
@@ -6251,8 +6224,6 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6251 struct inode *tl_inode = NULL; 6224 struct inode *tl_inode = NULL;
6252 struct buffer_head *tl_bh = NULL; 6225 struct buffer_head *tl_bh = NULL;
6253 6226
6254 mlog_entry_void();
6255
6256 status = ocfs2_get_truncate_log_info(osb, 6227 status = ocfs2_get_truncate_log_info(osb,
6257 osb->slot_num, 6228 osb->slot_num,
6258 &tl_inode, 6229 &tl_inode,
@@ -6268,7 +6239,6 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6268 osb->osb_tl_bh = tl_bh; 6239 osb->osb_tl_bh = tl_bh;
6269 osb->osb_tl_inode = tl_inode; 6240 osb->osb_tl_inode = tl_inode;
6270 6241
6271 mlog_exit(status);
6272 return status; 6242 return status;
6273} 6243}
6274 6244
@@ -6350,8 +6320,8 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6350 else 6320 else
6351 bg_blkno = ocfs2_which_suballoc_group(head->free_blk, 6321 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6352 head->free_bit); 6322 head->free_bit);
6353 mlog(0, "Free bit: (bit %u, blkno %llu)\n", 6323 trace_ocfs2_free_cached_blocks(
6354 head->free_bit, (unsigned long long)head->free_blk); 6324 (unsigned long long)head->free_blk, head->free_bit);
6355 6325
6356 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, 6326 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
6357 head->free_bit, bg_blkno, 1); 6327 head->free_bit, bg_blkno, 1);
@@ -6404,8 +6374,7 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6404 return ret; 6374 return ret;
6405 } 6375 }
6406 6376
6407 mlog(0, "Insert clusters: (bit %u, blk %llu)\n", 6377 trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit);
6408 bit, (unsigned long long)blkno);
6409 6378
6410 item->free_blk = blkno; 6379 item->free_blk = blkno;
6411 item->free_bit = bit; 6380 item->free_bit = bit;
@@ -6480,8 +6449,8 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
6480 fl = ctxt->c_first_suballocator; 6449 fl = ctxt->c_first_suballocator;
6481 6450
6482 if (fl->f_first) { 6451 if (fl->f_first) {
6483 mlog(0, "Free items: (type %u, slot %d)\n", 6452 trace_ocfs2_run_deallocs(fl->f_inode_type,
6484 fl->f_inode_type, fl->f_slot); 6453 fl->f_slot);
6485 ret2 = ocfs2_free_cached_blocks(osb, 6454 ret2 = ocfs2_free_cached_blocks(osb,
6486 fl->f_inode_type, 6455 fl->f_inode_type,
6487 fl->f_slot, 6456 fl->f_slot,
@@ -6558,8 +6527,9 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6558 goto out; 6527 goto out;
6559 } 6528 }
6560 6529
6561 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", 6530 trace_ocfs2_cache_block_dealloc(type, slot,
6562 type, slot, bit, (unsigned long long)blkno); 6531 (unsigned long long)suballoc,
6532 (unsigned long long)blkno, bit);
6563 6533
6564 item->free_bg = suballoc; 6534 item->free_bg = suballoc;
6565 item->free_blk = blkno; 6535 item->free_blk = blkno;
@@ -7005,8 +6975,6 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
7005 struct ocfs2_extent_tree et; 6975 struct ocfs2_extent_tree et;
7006 struct ocfs2_cached_dealloc_ctxt dealloc; 6976 struct ocfs2_cached_dealloc_ctxt dealloc;
7007 6977
7008 mlog_entry_void();
7009
7010 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); 6978 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7011 ocfs2_init_dealloc_ctxt(&dealloc); 6979 ocfs2_init_dealloc_ctxt(&dealloc);
7012 6980
@@ -7041,8 +7009,11 @@ start:
7041 goto bail; 7009 goto bail;
7042 } 7010 }
7043 7011
7044 mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n", 7012 trace_ocfs2_commit_truncate(
7045 OCFS2_I(inode)->ip_clusters, path->p_tree_depth); 7013 (unsigned long long)OCFS2_I(inode)->ip_blkno,
7014 new_highest_cpos,
7015 OCFS2_I(inode)->ip_clusters,
7016 path->p_tree_depth);
7046 7017
7047 /* 7018 /*
7048 * By now, el will point to the extent list on the bottom most 7019 * By now, el will point to the extent list on the bottom most
@@ -7136,7 +7107,6 @@ bail:
7136 7107
7137 ocfs2_free_path(path); 7108 ocfs2_free_path(path);
7138 7109
7139 mlog_exit(status);
7140 return status; 7110 return status;
7141} 7111}
7142 7112
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1fbb0e20131b..ac97bca282d2 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -29,7 +29,6 @@
29#include <linux/mpage.h> 29#include <linux/mpage.h>
30#include <linux/quotaops.h> 30#include <linux/quotaops.h>
31 31
32#define MLOG_MASK_PREFIX ML_FILE_IO
33#include <cluster/masklog.h> 32#include <cluster/masklog.h>
34 33
35#include "ocfs2.h" 34#include "ocfs2.h"
@@ -45,6 +44,7 @@
45#include "super.h" 44#include "super.h"
46#include "symlink.h" 45#include "symlink.h"
47#include "refcounttree.h" 46#include "refcounttree.h"
47#include "ocfs2_trace.h"
48 48
49#include "buffer_head_io.h" 49#include "buffer_head_io.h"
50 50
@@ -59,8 +59,9 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
59 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 59 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
60 void *kaddr; 60 void *kaddr;
61 61
62 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 62 trace_ocfs2_symlink_get_block(
63 (unsigned long long)iblock, bh_result, create); 63 (unsigned long long)OCFS2_I(inode)->ip_blkno,
64 (unsigned long long)iblock, bh_result, create);
64 65
65 BUG_ON(ocfs2_inode_is_fast_symlink(inode)); 66 BUG_ON(ocfs2_inode_is_fast_symlink(inode));
66 67
@@ -123,7 +124,6 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
123bail: 124bail:
124 brelse(bh); 125 brelse(bh);
125 126
126 mlog_exit(err);
127 return err; 127 return err;
128} 128}
129 129
@@ -136,8 +136,8 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
136 u64 p_blkno, count, past_eof; 136 u64 p_blkno, count, past_eof;
137 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 137 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
138 138
139 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 139 trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno,
140 (unsigned long long)iblock, bh_result, create); 140 (unsigned long long)iblock, bh_result, create);
141 141
142 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) 142 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
143 mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", 143 mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
@@ -199,8 +199,9 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
199 } 199 }
200 200
201 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 201 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
202 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, 202
203 (unsigned long long)past_eof); 203 trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
204 (unsigned long long)past_eof);
204 if (create && (iblock >= past_eof)) 205 if (create && (iblock >= past_eof))
205 set_buffer_new(bh_result); 206 set_buffer_new(bh_result);
206 207
@@ -208,7 +209,6 @@ bail:
208 if (err < 0) 209 if (err < 0)
209 err = -EIO; 210 err = -EIO;
210 211
211 mlog_exit(err);
212 return err; 212 return err;
213} 213}
214 214
@@ -278,7 +278,8 @@ static int ocfs2_readpage(struct file *file, struct page *page)
278 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; 278 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
279 int ret, unlock = 1; 279 int ret, unlock = 1;
280 280
281 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); 281 trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
282 (page ? page->index : 0));
282 283
283 ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page); 284 ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
284 if (ret != 0) { 285 if (ret != 0) {
@@ -323,7 +324,6 @@ out_inode_unlock:
323out: 324out:
324 if (unlock) 325 if (unlock)
325 unlock_page(page); 326 unlock_page(page);
326 mlog_exit(ret);
327 return ret; 327 return ret;
328} 328}
329 329
@@ -396,15 +396,11 @@ out_unlock:
396 */ 396 */
397static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) 397static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
398{ 398{
399 int ret; 399 trace_ocfs2_writepage(
400 400 (unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno,
401 mlog_entry("(0x%p)\n", page); 401 page->index);
402
403 ret = block_write_full_page(page, ocfs2_get_block, wbc);
404 402
405 mlog_exit(ret); 403 return block_write_full_page(page, ocfs2_get_block, wbc);
406
407 return ret;
408} 404}
409 405
410/* Taken from ext3. We don't necessarily need the full blown 406/* Taken from ext3. We don't necessarily need the full blown
@@ -450,7 +446,8 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
450 int err = 0; 446 int err = 0;
451 struct inode *inode = mapping->host; 447 struct inode *inode = mapping->host;
452 448
453 mlog_entry("(block = %llu)\n", (unsigned long long)block); 449 trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno,
450 (unsigned long long)block);
454 451
455 /* We don't need to lock journal system files, since they aren't 452 /* We don't need to lock journal system files, since they aren't
456 * accessed concurrently from multiple nodes. 453 * accessed concurrently from multiple nodes.
@@ -484,8 +481,6 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
484bail: 481bail:
485 status = err ? 0 : p_blkno; 482 status = err ? 0 : p_blkno;
486 483
487 mlog_exit((int)status);
488
489 return status; 484 return status;
490} 485}
491 486
@@ -616,9 +611,6 @@ static ssize_t ocfs2_direct_IO(int rw,
616{ 611{
617 struct file *file = iocb->ki_filp; 612 struct file *file = iocb->ki_filp;
618 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 613 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
619 int ret;
620
621 mlog_entry_void();
622 614
623 /* 615 /*
624 * Fallback to buffered I/O if we see an inode without 616 * Fallback to buffered I/O if we see an inode without
@@ -631,13 +623,10 @@ static ssize_t ocfs2_direct_IO(int rw,
631 if (i_size_read(inode) <= offset) 623 if (i_size_read(inode) <= offset)
632 return 0; 624 return 0;
633 625
634 ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, 626 return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
635 iov, offset, nr_segs, 627 iov, offset, nr_segs,
636 ocfs2_direct_IO_get_blocks, 628 ocfs2_direct_IO_get_blocks,
637 ocfs2_dio_end_io, NULL, 0); 629 ocfs2_dio_end_io, NULL, 0);
638
639 mlog_exit(ret);
640 return ret;
641} 630}
642 631
643static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, 632static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
@@ -1026,6 +1015,12 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
1026 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, 1015 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
1027 &cluster_start, &cluster_end); 1016 &cluster_start, &cluster_end);
1028 1017
1018 /* treat the write as new if the a hole/lseek spanned across
1019 * the page boundary.
1020 */
1021 new = new | ((i_size_read(inode) <= page_offset(page)) &&
1022 (page_offset(page) <= user_pos));
1023
1029 if (page == wc->w_target_page) { 1024 if (page == wc->w_target_page) {
1030 map_from = user_pos & (PAGE_CACHE_SIZE - 1); 1025 map_from = user_pos & (PAGE_CACHE_SIZE - 1);
1031 map_to = map_from + user_len; 1026 map_to = map_from + user_len;
@@ -1534,9 +1529,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
1534 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1529 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1535 struct ocfs2_dinode *di = NULL; 1530 struct ocfs2_dinode *di = NULL;
1536 1531
1537 mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n", 1532 trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno,
1538 (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos, 1533 len, (unsigned long long)pos,
1539 oi->ip_dyn_features); 1534 oi->ip_dyn_features);
1540 1535
1541 /* 1536 /*
1542 * Handle inodes which already have inline data 1st. 1537 * Handle inodes which already have inline data 1st.
@@ -1739,6 +1734,13 @@ try_again:
1739 1734
1740 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; 1735 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1741 1736
1737 trace_ocfs2_write_begin_nolock(
1738 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1739 (long long)i_size_read(inode),
1740 le32_to_cpu(di->i_clusters),
1741 pos, len, flags, mmap_page,
1742 clusters_to_alloc, extents_to_split);
1743
1742 /* 1744 /*
1743 * We set w_target_from, w_target_to here so that 1745 * We set w_target_from, w_target_to here so that
1744 * ocfs2_write_end() knows which range in the target page to 1746 * ocfs2_write_end() knows which range in the target page to
@@ -1751,12 +1753,6 @@ try_again:
1751 * ocfs2_lock_allocators(). It greatly over-estimates 1753 * ocfs2_lock_allocators(). It greatly over-estimates
1752 * the work to be done. 1754 * the work to be done.
1753 */ 1755 */
1754 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
1755 " clusters_to_add = %u, extents_to_split = %u\n",
1756 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1757 (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
1758 clusters_to_alloc, extents_to_split);
1759
1760 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), 1756 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
1761 wc->w_di_bh); 1757 wc->w_di_bh);
1762 ret = ocfs2_lock_allocators(inode, &et, 1758 ret = ocfs2_lock_allocators(inode, &et,
@@ -1938,8 +1934,8 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
1938 memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied); 1934 memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
1939 kunmap_atomic(kaddr, KM_USER0); 1935 kunmap_atomic(kaddr, KM_USER0);
1940 1936
1941 mlog(0, "Data written to inode at offset %llu. " 1937 trace_ocfs2_write_end_inline(
1942 "id_count = %u, copied = %u, i_dyn_features = 0x%x\n", 1938 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1943 (unsigned long long)pos, *copied, 1939 (unsigned long long)pos, *copied,
1944 le16_to_cpu(di->id2.i_data.id_count), 1940 le16_to_cpu(di->id2.i_data.id_count),
1945 le16_to_cpu(di->i_dyn_features)); 1941 le16_to_cpu(di->i_dyn_features));
@@ -2043,7 +2039,6 @@ const struct address_space_operations ocfs2_aops = {
2043 .write_begin = ocfs2_write_begin, 2039 .write_begin = ocfs2_write_begin,
2044 .write_end = ocfs2_write_end, 2040 .write_end = ocfs2_write_end,
2045 .bmap = ocfs2_bmap, 2041 .bmap = ocfs2_bmap,
2046 .sync_page = block_sync_page,
2047 .direct_IO = ocfs2_direct_IO, 2042 .direct_IO = ocfs2_direct_IO,
2048 .invalidatepage = ocfs2_invalidatepage, 2043 .invalidatepage = ocfs2_invalidatepage,
2049 .releasepage = ocfs2_releasepage, 2044 .releasepage = ocfs2_releasepage,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index eceb456037c1..75cf3ad987a6 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -71,7 +71,7 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
71 71
72/* 72/*
73 * Using a named enum representing lock types in terms of #N bit stored in 73 * Using a named enum representing lock types in terms of #N bit stored in
74 * iocb->private, which is going to be used for communication bewteen 74 * iocb->private, which is going to be used for communication between
75 * ocfs2_dio_end_io() and ocfs2_file_aio_write/read(). 75 * ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
76 */ 76 */
77enum ocfs2_iocb_lock_bits { 77enum ocfs2_iocb_lock_bits {
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index f9d5d3ffc75a..5d18ad10c27f 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -35,8 +35,8 @@
35#include "inode.h" 35#include "inode.h"
36#include "journal.h" 36#include "journal.h"
37#include "uptodate.h" 37#include "uptodate.h"
38
39#include "buffer_head_io.h" 38#include "buffer_head_io.h"
39#include "ocfs2_trace.h"
40 40
41/* 41/*
42 * Bits on bh->b_state used by ocfs2. 42 * Bits on bh->b_state used by ocfs2.
@@ -55,8 +55,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
55{ 55{
56 int ret = 0; 56 int ret = 0;
57 57
58 mlog_entry("(bh->b_blocknr = %llu, ci=%p)\n", 58 trace_ocfs2_write_block((unsigned long long)bh->b_blocknr, ci);
59 (unsigned long long)bh->b_blocknr, ci);
60 59
61 BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO); 60 BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
62 BUG_ON(buffer_jbd(bh)); 61 BUG_ON(buffer_jbd(bh));
@@ -66,6 +65,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
66 * can get modified during recovery even if read-only. */ 65 * can get modified during recovery even if read-only. */
67 if (ocfs2_is_hard_readonly(osb)) { 66 if (ocfs2_is_hard_readonly(osb)) {
68 ret = -EROFS; 67 ret = -EROFS;
68 mlog_errno(ret);
69 goto out; 69 goto out;
70 } 70 }
71 71
@@ -91,11 +91,11 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
91 * uptodate. */ 91 * uptodate. */
92 ret = -EIO; 92 ret = -EIO;
93 put_bh(bh); 93 put_bh(bh);
94 mlog_errno(ret);
94 } 95 }
95 96
96 ocfs2_metadata_cache_io_unlock(ci); 97 ocfs2_metadata_cache_io_unlock(ci);
97out: 98out:
98 mlog_exit(ret);
99 return ret; 99 return ret;
100} 100}
101 101
@@ -106,10 +106,10 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
106 unsigned int i; 106 unsigned int i;
107 struct buffer_head *bh; 107 struct buffer_head *bh;
108 108
109 if (!nr) { 109 trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
110 mlog(ML_BH_IO, "No buffers will be read!\n"); 110
111 if (!nr)
111 goto bail; 112 goto bail;
112 }
113 113
114 for (i = 0 ; i < nr ; i++) { 114 for (i = 0 ; i < nr ; i++) {
115 if (bhs[i] == NULL) { 115 if (bhs[i] == NULL) {
@@ -123,10 +123,8 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
123 bh = bhs[i]; 123 bh = bhs[i];
124 124
125 if (buffer_jbd(bh)) { 125 if (buffer_jbd(bh)) {
126 mlog(ML_BH_IO, 126 trace_ocfs2_read_blocks_sync_jbd(
127 "trying to sync read a jbd " 127 (unsigned long long)bh->b_blocknr);
128 "managed bh (blocknr = %llu), skipping\n",
129 (unsigned long long)bh->b_blocknr);
130 continue; 128 continue;
131 } 129 }
132 130
@@ -186,8 +184,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
186 struct buffer_head *bh; 184 struct buffer_head *bh;
187 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 185 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
188 186
189 mlog_entry("(ci=%p, block=(%llu), nr=(%d), flags=%d)\n", 187 trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
190 ci, (unsigned long long)block, nr, flags);
191 188
192 BUG_ON(!ci); 189 BUG_ON(!ci);
193 BUG_ON((flags & OCFS2_BH_READAHEAD) && 190 BUG_ON((flags & OCFS2_BH_READAHEAD) &&
@@ -207,7 +204,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
207 } 204 }
208 205
209 if (nr == 0) { 206 if (nr == 0) {
210 mlog(ML_BH_IO, "No buffers will be read!\n");
211 status = 0; 207 status = 0;
212 goto bail; 208 goto bail;
213 } 209 }
@@ -251,8 +247,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
251 */ 247 */
252 248
253 if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) { 249 if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
254 mlog(ML_UPTODATE, 250 trace_ocfs2_read_blocks_from_disk(
255 "bh (%llu), owner %llu not uptodate\n",
256 (unsigned long long)bh->b_blocknr, 251 (unsigned long long)bh->b_blocknr,
257 (unsigned long long)ocfs2_metadata_cache_owner(ci)); 252 (unsigned long long)ocfs2_metadata_cache_owner(ci));
258 /* We're using ignore_cache here to say 253 /* We're using ignore_cache here to say
@@ -260,11 +255,10 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
260 ignore_cache = 1; 255 ignore_cache = 1;
261 } 256 }
262 257
258 trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr,
259 ignore_cache, buffer_jbd(bh), buffer_dirty(bh));
260
263 if (buffer_jbd(bh)) { 261 if (buffer_jbd(bh)) {
264 if (ignore_cache)
265 mlog(ML_BH_IO, "trying to sync read a jbd "
266 "managed bh (blocknr = %llu)\n",
267 (unsigned long long)bh->b_blocknr);
268 continue; 262 continue;
269 } 263 }
270 264
@@ -272,9 +266,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
272 if (buffer_dirty(bh)) { 266 if (buffer_dirty(bh)) {
273 /* This should probably be a BUG, or 267 /* This should probably be a BUG, or
274 * at least return an error. */ 268 * at least return an error. */
275 mlog(ML_BH_IO, "asking me to sync read a dirty "
276 "buffer! (blocknr = %llu)\n",
277 (unsigned long long)bh->b_blocknr);
278 continue; 269 continue;
279 } 270 }
280 271
@@ -367,14 +358,11 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
367 } 358 }
368 ocfs2_metadata_cache_io_unlock(ci); 359 ocfs2_metadata_cache_io_unlock(ci);
369 360
370 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 361 trace_ocfs2_read_blocks_end((unsigned long long)block, nr,
371 (unsigned long long)block, nr, 362 flags, ignore_cache);
372 ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
373 flags);
374 363
375bail: 364bail:
376 365
377 mlog_exit(status);
378 return status; 366 return status;
379} 367}
380 368
@@ -408,13 +396,12 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
408 int ret = 0; 396 int ret = 0;
409 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 397 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
410 398
411 mlog_entry_void();
412
413 BUG_ON(buffer_jbd(bh)); 399 BUG_ON(buffer_jbd(bh));
414 ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr); 400 ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
415 401
416 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) { 402 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
417 ret = -EROFS; 403 ret = -EROFS;
404 mlog_errno(ret);
418 goto out; 405 goto out;
419 } 406 }
420 407
@@ -434,9 +421,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
434 if (!buffer_uptodate(bh)) { 421 if (!buffer_uptodate(bh)) {
435 ret = -EIO; 422 ret = -EIO;
436 put_bh(bh); 423 put_bh(bh);
424 mlog_errno(ret);
437 } 425 }
438 426
439out: 427out:
440 mlog_exit(ret);
441 return ret; 428 return ret;
442} 429}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index b108e863d8f6..643720209a98 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -367,11 +367,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
367static void o2hb_wait_on_io(struct o2hb_region *reg, 367static void o2hb_wait_on_io(struct o2hb_region *reg,
368 struct o2hb_bio_wait_ctxt *wc) 368 struct o2hb_bio_wait_ctxt *wc)
369{ 369{
370 struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
371
372 blk_run_address_space(mapping);
373 o2hb_bio_wait_dec(wc, 1); 370 o2hb_bio_wait_dec(wc, 1);
374
375 wait_for_completion(&wc->wc_io_complete); 371 wait_for_completion(&wc->wc_io_complete);
376} 372}
377 373
@@ -1658,8 +1654,6 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
1658 struct o2hb_disk_slot *slot; 1654 struct o2hb_disk_slot *slot;
1659 struct o2hb_disk_heartbeat_block *hb_block; 1655 struct o2hb_disk_heartbeat_block *hb_block;
1660 1656
1661 mlog_entry_void();
1662
1663 ret = o2hb_read_slots(reg, reg->hr_blocks); 1657 ret = o2hb_read_slots(reg, reg->hr_blocks);
1664 if (ret) { 1658 if (ret) {
1665 mlog_errno(ret); 1659 mlog_errno(ret);
@@ -1681,7 +1675,6 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
1681 } 1675 }
1682 1676
1683out: 1677out:
1684 mlog_exit(ret);
1685 return ret; 1678 return ret;
1686} 1679}
1687 1680
@@ -2282,7 +2275,7 @@ void o2hb_free_hb_set(struct config_group *group)
2282 kfree(hs); 2275 kfree(hs);
2283} 2276}
2284 2277
2285/* hb callback registration and issueing */ 2278/* hb callback registration and issuing */
2286 2279
2287static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type) 2280static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type)
2288{ 2281{
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 6c61771469af..07ac24fd9252 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -30,7 +30,7 @@
30 30
31struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK); 31struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
32EXPORT_SYMBOL_GPL(mlog_and_bits); 32EXPORT_SYMBOL_GPL(mlog_and_bits);
33struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK); 33struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(0);
34EXPORT_SYMBOL_GPL(mlog_not_bits); 34EXPORT_SYMBOL_GPL(mlog_not_bits);
35 35
36static ssize_t mlog_mask_show(u64 mask, char *buf) 36static ssize_t mlog_mask_show(u64 mask, char *buf)
@@ -80,8 +80,6 @@ struct mlog_attribute {
80} 80}
81 81
82static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { 82static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
83 define_mask(ENTRY),
84 define_mask(EXIT),
85 define_mask(TCP), 83 define_mask(TCP),
86 define_mask(MSG), 84 define_mask(MSG),
87 define_mask(SOCKET), 85 define_mask(SOCKET),
@@ -93,27 +91,11 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
93 define_mask(DLM_THREAD), 91 define_mask(DLM_THREAD),
94 define_mask(DLM_MASTER), 92 define_mask(DLM_MASTER),
95 define_mask(DLM_RECOVERY), 93 define_mask(DLM_RECOVERY),
96 define_mask(AIO),
97 define_mask(JOURNAL),
98 define_mask(DISK_ALLOC),
99 define_mask(SUPER),
100 define_mask(FILE_IO),
101 define_mask(EXTENT_MAP),
102 define_mask(DLM_GLUE), 94 define_mask(DLM_GLUE),
103 define_mask(BH_IO),
104 define_mask(UPTODATE),
105 define_mask(NAMEI),
106 define_mask(INODE),
107 define_mask(VOTE), 95 define_mask(VOTE),
108 define_mask(DCACHE),
109 define_mask(CONN), 96 define_mask(CONN),
110 define_mask(QUORUM), 97 define_mask(QUORUM),
111 define_mask(EXPORT),
112 define_mask(XATTR),
113 define_mask(QUOTA),
114 define_mask(REFCOUNT),
115 define_mask(BASTS), 98 define_mask(BASTS),
116 define_mask(RESERVATIONS),
117 define_mask(CLUSTER), 99 define_mask(CLUSTER),
118 define_mask(ERROR), 100 define_mask(ERROR),
119 define_mask(NOTICE), 101 define_mask(NOTICE),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 34d6544357d9..baa2b9ef7eef 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -82,41 +82,23 @@
82 82
83/* bits that are frequently given and infrequently matched in the low word */ 83/* bits that are frequently given and infrequently matched in the low word */
84/* NOTE: If you add a flag, you need to also update masklog.c! */ 84/* NOTE: If you add a flag, you need to also update masklog.c! */
85#define ML_ENTRY 0x0000000000000001ULL /* func call entry */ 85#define ML_TCP 0x0000000000000001ULL /* net cluster/tcp.c */
86#define ML_EXIT 0x0000000000000002ULL /* func call exit */ 86#define ML_MSG 0x0000000000000002ULL /* net network messages */
87#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */ 87#define ML_SOCKET 0x0000000000000004ULL /* net socket lifetime */
88#define ML_MSG 0x0000000000000008ULL /* net network messages */ 88#define ML_HEARTBEAT 0x0000000000000008ULL /* hb all heartbeat tracking */
89#define ML_SOCKET 0x0000000000000010ULL /* net socket lifetime */ 89#define ML_HB_BIO 0x0000000000000010ULL /* hb io tracing */
90#define ML_HEARTBEAT 0x0000000000000020ULL /* hb all heartbeat tracking */ 90#define ML_DLMFS 0x0000000000000020ULL /* dlm user dlmfs */
91#define ML_HB_BIO 0x0000000000000040ULL /* hb io tracing */ 91#define ML_DLM 0x0000000000000040ULL /* dlm general debugging */
92#define ML_DLMFS 0x0000000000000080ULL /* dlm user dlmfs */ 92#define ML_DLM_DOMAIN 0x0000000000000080ULL /* dlm domain debugging */
93#define ML_DLM 0x0000000000000100ULL /* dlm general debugging */ 93#define ML_DLM_THREAD 0x0000000000000100ULL /* dlm domain thread */
94#define ML_DLM_DOMAIN 0x0000000000000200ULL /* dlm domain debugging */ 94#define ML_DLM_MASTER 0x0000000000000200ULL /* dlm master functions */
95#define ML_DLM_THREAD 0x0000000000000400ULL /* dlm domain thread */ 95#define ML_DLM_RECOVERY 0x0000000000000400ULL /* dlm master functions */
96#define ML_DLM_MASTER 0x0000000000000800ULL /* dlm master functions */ 96#define ML_DLM_GLUE 0x0000000000000800ULL /* ocfs2 dlm glue layer */
97#define ML_DLM_RECOVERY 0x0000000000001000ULL /* dlm master functions */ 97#define ML_VOTE 0x0000000000001000ULL /* ocfs2 node messaging */
98#define ML_AIO 0x0000000000002000ULL /* ocfs2 aio read and write */ 98#define ML_CONN 0x0000000000002000ULL /* net connection management */
99#define ML_JOURNAL 0x0000000000004000ULL /* ocfs2 journalling functions */ 99#define ML_QUORUM 0x0000000000004000ULL /* net connection quorum */
100#define ML_DISK_ALLOC 0x0000000000008000ULL /* ocfs2 disk allocation */ 100#define ML_BASTS 0x0000000000008000ULL /* dlmglue asts and basts */
101#define ML_SUPER 0x0000000000010000ULL /* ocfs2 mount / umount */ 101#define ML_CLUSTER 0x0000000000010000ULL /* cluster stack */
102#define ML_FILE_IO 0x0000000000020000ULL /* ocfs2 file I/O */
103#define ML_EXTENT_MAP 0x0000000000040000ULL /* ocfs2 extent map caching */
104#define ML_DLM_GLUE 0x0000000000080000ULL /* ocfs2 dlm glue layer */
105#define ML_BH_IO 0x0000000000100000ULL /* ocfs2 buffer I/O */
106#define ML_UPTODATE 0x0000000000200000ULL /* ocfs2 caching sequence #'s */
107#define ML_NAMEI 0x0000000000400000ULL /* ocfs2 directory / namespace */
108#define ML_INODE 0x0000000000800000ULL /* ocfs2 inode manipulation */
109#define ML_VOTE 0x0000000001000000ULL /* ocfs2 node messaging */
110#define ML_DCACHE 0x0000000002000000ULL /* ocfs2 dcache operations */
111#define ML_CONN 0x0000000004000000ULL /* net connection management */
112#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */
113#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
117#define ML_BASTS 0x0000000100000000ULL /* dlmglue asts and basts */
118#define ML_RESERVATIONS 0x0000000200000000ULL /* ocfs2 alloc reservations */
119#define ML_CLUSTER 0x0000000400000000ULL /* cluster stack */
120 102
121/* bits that are infrequently given and frequently matched in the high word */ 103/* bits that are infrequently given and frequently matched in the high word */
122#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */ 104#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */
@@ -124,7 +106,6 @@
124#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */ 106#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */
125 107
126#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) 108#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
127#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
128#ifndef MLOG_MASK_PREFIX 109#ifndef MLOG_MASK_PREFIX
129#define MLOG_MASK_PREFIX 0 110#define MLOG_MASK_PREFIX 0
130#endif 111#endif
@@ -222,58 +203,6 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
222 mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ 203 mlog(ML_ERROR, "status = %lld\n", (long long)_st); \
223} while (0) 204} while (0)
224 205
225#if defined(CONFIG_OCFS2_DEBUG_MASKLOG)
226#define mlog_entry(fmt, args...) do { \
227 mlog(ML_ENTRY, "ENTRY:" fmt , ##args); \
228} while (0)
229
230#define mlog_entry_void() do { \
231 mlog(ML_ENTRY, "ENTRY:\n"); \
232} while (0)
233
234/*
235 * We disable this for sparse.
236 */
237#if !defined(__CHECKER__)
238#define mlog_exit(st) do { \
239 if (__builtin_types_compatible_p(typeof(st), unsigned long)) \
240 mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st)); \
241 else if (__builtin_types_compatible_p(typeof(st), signed long)) \
242 mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st)); \
243 else if (__builtin_types_compatible_p(typeof(st), unsigned int) \
244 || __builtin_types_compatible_p(typeof(st), unsigned short) \
245 || __builtin_types_compatible_p(typeof(st), unsigned char)) \
246 mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st)); \
247 else if (__builtin_types_compatible_p(typeof(st), signed int) \
248 || __builtin_types_compatible_p(typeof(st), signed short) \
249 || __builtin_types_compatible_p(typeof(st), signed char)) \
250 mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st)); \
251 else if (__builtin_types_compatible_p(typeof(st), long long)) \
252 mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \
253 else \
254 mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st)); \
255} while (0)
256#else
257#define mlog_exit(st) do { \
258 mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \
259} while (0)
260#endif
261
262#define mlog_exit_ptr(ptr) do { \
263 mlog(ML_EXIT, "EXIT: %p\n", ptr); \
264} while (0)
265
266#define mlog_exit_void() do { \
267 mlog(ML_EXIT, "EXIT\n"); \
268} while (0)
269#else
270#define mlog_entry(...) do { } while (0)
271#define mlog_entry_void(...) do { } while (0)
272#define mlog_exit(...) do { } while (0)
273#define mlog_exit_ptr(...) do { } while (0)
274#define mlog_exit_void(...) do { } while (0)
275#endif /* defined(CONFIG_OCFS2_DEBUG_MASKLOG) */
276
277#define mlog_bug_on_msg(cond, fmt, args...) do { \ 206#define mlog_bug_on_msg(cond, fmt, args...) do { \
278 if (cond) { \ 207 if (cond) { \
279 mlog(ML_ERROR, "bug expression: " #cond "\n"); \ 208 mlog(ML_ERROR, "bug expression: " #cond "\n"); \
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index a87366750f23..8f9cea1597af 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -89,7 +89,7 @@ static void o2quo_fence_self(void)
89 }; 89 };
90} 90}
91 91
92/* Indicate that a timeout occured on a hearbeat region write. The 92/* Indicate that a timeout occurred on a hearbeat region write. The
93 * other nodes in the cluster may consider us dead at that time so we 93 * other nodes in the cluster may consider us dead at that time so we
94 * want to "fence" ourselves so that we don't scribble on the disk 94 * want to "fence" ourselves so that we don't scribble on the disk
95 * after they think they've recovered us. This can't solve all 95 * after they think they've recovered us. This can't solve all
@@ -261,7 +261,7 @@ void o2quo_hb_still_up(u8 node)
261 spin_unlock(&qs->qs_lock); 261 spin_unlock(&qs->qs_lock);
262} 262}
263 263
264/* This is analagous to hb_up. as a node's connection comes up we delay the 264/* This is analogous to hb_up. as a node's connection comes up we delay the
265 * quorum decision until we see it heartbeating. the hold will be droped in 265 * quorum decision until we see it heartbeating. the hold will be droped in
266 * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if 266 * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
267 * it's already heartbeating we we might be dropping a hold that conn_up got. 267 * it's already heartbeating we we might be dropping a hold that conn_up got.
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 3b11cb1e38fc..db5ee4b4f47a 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -210,10 +210,6 @@ static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc)
210 sc->sc_tv_func_stop = ktime_get(); 210 sc->sc_tv_func_stop = ktime_get();
211} 211}
212 212
213static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
214{
215 return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
216}
217#else /* CONFIG_DEBUG_FS */ 213#else /* CONFIG_DEBUG_FS */
218# define o2net_init_nst(a, b, c, d, e) 214# define o2net_init_nst(a, b, c, d, e)
219# define o2net_set_nst_sock_time(a) 215# define o2net_set_nst_sock_time(a)
@@ -227,10 +223,14 @@ static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
227# define o2net_set_advance_stop_time(a) 223# define o2net_set_advance_stop_time(a)
228# define o2net_set_func_start_time(a) 224# define o2net_set_func_start_time(a)
229# define o2net_set_func_stop_time(a) 225# define o2net_set_func_stop_time(a)
230# define o2net_get_func_run_time(a) (ktime_t)0
231#endif /* CONFIG_DEBUG_FS */ 226#endif /* CONFIG_DEBUG_FS */
232 227
233#ifdef CONFIG_OCFS2_FS_STATS 228#ifdef CONFIG_OCFS2_FS_STATS
229static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
230{
231 return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
232}
233
234static void o2net_update_send_stats(struct o2net_send_tracking *nst, 234static void o2net_update_send_stats(struct o2net_send_tracking *nst,
235 struct o2net_sock_container *sc) 235 struct o2net_sock_container *sc)
236{ 236{
@@ -565,7 +565,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
565 * the work queue actually being up. */ 565 * the work queue actually being up. */
566 if (!valid && o2net_wq) { 566 if (!valid && o2net_wq) {
567 unsigned long delay; 567 unsigned long delay;
568 /* delay if we're withing a RECONNECT_DELAY of the 568 /* delay if we're within a RECONNECT_DELAY of the
569 * last attempt */ 569 * last attempt */
570 delay = (nn->nn_last_connect_attempt + 570 delay = (nn->nn_last_connect_attempt +
571 msecs_to_jiffies(o2net_reconnect_delay())) 571 msecs_to_jiffies(o2net_reconnect_delay()))
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 6d80ecc7834f..e5ba34818332 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -28,7 +28,6 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30 30
31#define MLOG_MASK_PREFIX ML_DCACHE
32#include <cluster/masklog.h> 31#include <cluster/masklog.h>
33 32
34#include "ocfs2.h" 33#include "ocfs2.h"
@@ -39,6 +38,7 @@
39#include "file.h" 38#include "file.h"
40#include "inode.h" 39#include "inode.h"
41#include "super.h" 40#include "super.h"
41#include "ocfs2_trace.h"
42 42
43void ocfs2_dentry_attach_gen(struct dentry *dentry) 43void ocfs2_dentry_attach_gen(struct dentry *dentry)
44{ 44{
@@ -56,14 +56,14 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
56 int ret = 0; /* if all else fails, just return false */ 56 int ret = 0; /* if all else fails, just return false */
57 struct ocfs2_super *osb; 57 struct ocfs2_super *osb;
58 58
59 if (nd->flags & LOOKUP_RCU) 59 if (nd && nd->flags & LOOKUP_RCU)
60 return -ECHILD; 60 return -ECHILD;
61 61
62 inode = dentry->d_inode; 62 inode = dentry->d_inode;
63 osb = OCFS2_SB(dentry->d_sb); 63 osb = OCFS2_SB(dentry->d_sb);
64 64
65 mlog_entry("(0x%p, '%.*s')\n", dentry, 65 trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len,
66 dentry->d_name.len, dentry->d_name.name); 66 dentry->d_name.name);
67 67
68 /* For a negative dentry - 68 /* For a negative dentry -
69 * check the generation number of the parent and compare with the 69 * check the generation number of the parent and compare with the
@@ -73,9 +73,10 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
73 unsigned long gen = (unsigned long) dentry->d_fsdata; 73 unsigned long gen = (unsigned long) dentry->d_fsdata;
74 unsigned long pgen = 74 unsigned long pgen =
75 OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; 75 OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
76 mlog(0, "negative dentry: %.*s parent gen: %lu " 76
77 "dentry gen: %lu\n", 77 trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len,
78 dentry->d_name.len, dentry->d_name.name, pgen, gen); 78 dentry->d_name.name,
79 pgen, gen);
79 if (gen != pgen) 80 if (gen != pgen)
80 goto bail; 81 goto bail;
81 goto valid; 82 goto valid;
@@ -90,8 +91,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
90 /* did we or someone else delete this inode? */ 91 /* did we or someone else delete this inode? */
91 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 92 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
92 spin_unlock(&OCFS2_I(inode)->ip_lock); 93 spin_unlock(&OCFS2_I(inode)->ip_lock);
93 mlog(0, "inode (%llu) deleted, returning false\n", 94 trace_ocfs2_dentry_revalidate_delete(
94 (unsigned long long)OCFS2_I(inode)->ip_blkno); 95 (unsigned long long)OCFS2_I(inode)->ip_blkno);
95 goto bail; 96 goto bail;
96 } 97 }
97 spin_unlock(&OCFS2_I(inode)->ip_lock); 98 spin_unlock(&OCFS2_I(inode)->ip_lock);
@@ -101,10 +102,9 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
101 * inode nlink hits zero, it never goes back. 102 * inode nlink hits zero, it never goes back.
102 */ 103 */
103 if (inode->i_nlink == 0) { 104 if (inode->i_nlink == 0) {
104 mlog(0, "Inode %llu orphaned, returning false " 105 trace_ocfs2_dentry_revalidate_orphaned(
105 "dir = %d\n", 106 (unsigned long long)OCFS2_I(inode)->ip_blkno,
106 (unsigned long long)OCFS2_I(inode)->ip_blkno, 107 S_ISDIR(inode->i_mode));
107 S_ISDIR(inode->i_mode));
108 goto bail; 108 goto bail;
109 } 109 }
110 110
@@ -113,9 +113,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
113 * redo it. 113 * redo it.
114 */ 114 */
115 if (!dentry->d_fsdata) { 115 if (!dentry->d_fsdata) {
116 mlog(0, "Inode %llu doesn't have dentry lock, " 116 trace_ocfs2_dentry_revalidate_nofsdata(
117 "returning false\n", 117 (unsigned long long)OCFS2_I(inode)->ip_blkno);
118 (unsigned long long)OCFS2_I(inode)->ip_blkno);
119 goto bail; 118 goto bail;
120 } 119 }
121 120
@@ -123,8 +122,7 @@ valid:
123 ret = 1; 122 ret = 1;
124 123
125bail: 124bail:
126 mlog_exit(ret); 125 trace_ocfs2_dentry_revalidate_ret(ret);
127
128 return ret; 126 return ret;
129} 127}
130 128
@@ -181,8 +179,8 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
181 179
182 spin_lock(&dentry->d_lock); 180 spin_lock(&dentry->d_lock);
183 if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { 181 if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
184 mlog(0, "dentry found: %.*s\n", 182 trace_ocfs2_find_local_alias(dentry->d_name.len,
185 dentry->d_name.len, dentry->d_name.name); 183 dentry->d_name.name);
186 184
187 dget_dlock(dentry); 185 dget_dlock(dentry);
188 spin_unlock(&dentry->d_lock); 186 spin_unlock(&dentry->d_lock);
@@ -240,9 +238,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
240 struct dentry *alias; 238 struct dentry *alias;
241 struct ocfs2_dentry_lock *dl = dentry->d_fsdata; 239 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
242 240
243 mlog(0, "Attach \"%.*s\", parent %llu, fsdata: %p\n", 241 trace_ocfs2_dentry_attach_lock(dentry->d_name.len, dentry->d_name.name,
244 dentry->d_name.len, dentry->d_name.name, 242 (unsigned long long)parent_blkno, dl);
245 (unsigned long long)parent_blkno, dl);
246 243
247 /* 244 /*
248 * Negative dentry. We ignore these for now. 245 * Negative dentry. We ignore these for now.
@@ -292,7 +289,9 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
292 (unsigned long long)parent_blkno, 289 (unsigned long long)parent_blkno,
293 (unsigned long long)dl->dl_parent_blkno); 290 (unsigned long long)dl->dl_parent_blkno);
294 291
295 mlog(0, "Found: %s\n", dl->dl_lockres.l_name); 292 trace_ocfs2_dentry_attach_lock_found(dl->dl_lockres.l_name,
293 (unsigned long long)parent_blkno,
294 (unsigned long long)OCFS2_I(inode)->ip_blkno);
296 295
297 goto out_attach; 296 goto out_attach;
298 } 297 }
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index d417b3f9b0c7..9fe5b8fd658f 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -43,7 +43,6 @@
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/sort.h> 44#include <linux/sort.h>
45 45
46#define MLOG_MASK_PREFIX ML_NAMEI
47#include <cluster/masklog.h> 46#include <cluster/masklog.h>
48 47
49#include "ocfs2.h" 48#include "ocfs2.h"
@@ -61,6 +60,7 @@
61#include "super.h" 60#include "super.h"
62#include "sysfile.h" 61#include "sysfile.h"
63#include "uptodate.h" 62#include "uptodate.h"
63#include "ocfs2_trace.h"
64 64
65#include "buffer_head_io.h" 65#include "buffer_head_io.h"
66 66
@@ -322,21 +322,23 @@ static int ocfs2_check_dir_entry(struct inode * dir,
322 const char *error_msg = NULL; 322 const char *error_msg = NULL;
323 const int rlen = le16_to_cpu(de->rec_len); 323 const int rlen = le16_to_cpu(de->rec_len);
324 324
325 if (rlen < OCFS2_DIR_REC_LEN(1)) 325 if (unlikely(rlen < OCFS2_DIR_REC_LEN(1)))
326 error_msg = "rec_len is smaller than minimal"; 326 error_msg = "rec_len is smaller than minimal";
327 else if (rlen % 4 != 0) 327 else if (unlikely(rlen % 4 != 0))
328 error_msg = "rec_len % 4 != 0"; 328 error_msg = "rec_len % 4 != 0";
329 else if (rlen < OCFS2_DIR_REC_LEN(de->name_len)) 329 else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len)))
330 error_msg = "rec_len is too small for name_len"; 330 error_msg = "rec_len is too small for name_len";
331 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) 331 else if (unlikely(
332 ((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))
332 error_msg = "directory entry across blocks"; 333 error_msg = "directory entry across blocks";
333 334
334 if (error_msg != NULL) 335 if (unlikely(error_msg != NULL))
335 mlog(ML_ERROR, "bad entry in directory #%llu: %s - " 336 mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
336 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n", 337 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
337 (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg, 338 (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
338 offset, (unsigned long long)le64_to_cpu(de->inode), rlen, 339 offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
339 de->name_len); 340 de->name_len);
341
340 return error_msg == NULL ? 1 : 0; 342 return error_msg == NULL ? 1 : 0;
341} 343}
342 344
@@ -354,7 +356,7 @@ static inline int ocfs2_match(int len,
354/* 356/*
355 * Returns 0 if not found, -1 on failure, and 1 on success 357 * Returns 0 if not found, -1 on failure, and 1 on success
356 */ 358 */
357static int inline ocfs2_search_dirblock(struct buffer_head *bh, 359static inline int ocfs2_search_dirblock(struct buffer_head *bh,
358 struct inode *dir, 360 struct inode *dir,
359 const char *name, int namelen, 361 const char *name, int namelen,
360 unsigned long offset, 362 unsigned long offset,
@@ -367,8 +369,6 @@ static int inline ocfs2_search_dirblock(struct buffer_head *bh,
367 int de_len; 369 int de_len;
368 int ret = 0; 370 int ret = 0;
369 371
370 mlog_entry_void();
371
372 de_buf = first_de; 372 de_buf = first_de;
373 dlimit = de_buf + bytes; 373 dlimit = de_buf + bytes;
374 374
@@ -402,7 +402,7 @@ static int inline ocfs2_search_dirblock(struct buffer_head *bh,
402 } 402 }
403 403
404bail: 404bail:
405 mlog_exit(ret); 405 trace_ocfs2_search_dirblock(ret);
406 return ret; 406 return ret;
407} 407}
408 408
@@ -447,8 +447,7 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
447 * We don't validate dirents here, that's handled 447 * We don't validate dirents here, that's handled
448 * in-place when the code walks them. 448 * in-place when the code walks them.
449 */ 449 */
450 mlog(0, "Validating dirblock %llu\n", 450 trace_ocfs2_validate_dir_block((unsigned long long)bh->b_blocknr);
451 (unsigned long long)bh->b_blocknr);
452 451
453 BUG_ON(!buffer_uptodate(bh)); 452 BUG_ON(!buffer_uptodate(bh));
454 453
@@ -706,8 +705,6 @@ static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
706 int num = 0; 705 int num = 0;
707 int nblocks, i, err; 706 int nblocks, i, err;
708 707
709 mlog_entry_void();
710
711 sb = dir->i_sb; 708 sb = dir->i_sb;
712 709
713 nblocks = i_size_read(dir) >> sb->s_blocksize_bits; 710 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
@@ -788,7 +785,7 @@ cleanup_and_exit:
788 for (; ra_ptr < ra_max; ra_ptr++) 785 for (; ra_ptr < ra_max; ra_ptr++)
789 brelse(bh_use[ra_ptr]); 786 brelse(bh_use[ra_ptr]);
790 787
791 mlog_exit_ptr(ret); 788 trace_ocfs2_find_entry_el(ret);
792 return ret; 789 return ret;
793} 790}
794 791
@@ -950,11 +947,9 @@ static int ocfs2_dx_dir_search(const char *name, int namelen,
950 goto out; 947 goto out;
951 } 948 }
952 949
953 mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x " 950 trace_ocfs2_dx_dir_search((unsigned long long)OCFS2_I(dir)->ip_blkno,
954 "returns: %llu\n", 951 namelen, name, hinfo->major_hash,
955 (unsigned long long)OCFS2_I(dir)->ip_blkno, 952 hinfo->minor_hash, (unsigned long long)phys);
956 namelen, name, hinfo->major_hash, hinfo->minor_hash,
957 (unsigned long long)phys);
958 953
959 ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh); 954 ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
960 if (ret) { 955 if (ret) {
@@ -964,9 +959,9 @@ static int ocfs2_dx_dir_search(const char *name, int namelen,
964 959
965 dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data; 960 dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
966 961
967 mlog(0, "leaf info: num_used: %d, count: %d\n", 962 trace_ocfs2_dx_dir_search_leaf_info(
968 le16_to_cpu(dx_leaf->dl_list.de_num_used), 963 le16_to_cpu(dx_leaf->dl_list.de_num_used),
969 le16_to_cpu(dx_leaf->dl_list.de_count)); 964 le16_to_cpu(dx_leaf->dl_list.de_count));
970 965
971 entry_list = &dx_leaf->dl_list; 966 entry_list = &dx_leaf->dl_list;
972 967
@@ -1166,8 +1161,6 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1166 int i, status = -ENOENT; 1161 int i, status = -ENOENT;
1167 ocfs2_journal_access_func access = ocfs2_journal_access_db; 1162 ocfs2_journal_access_func access = ocfs2_journal_access_db;
1168 1163
1169 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
1170
1171 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1164 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1172 access = ocfs2_journal_access_di; 1165 access = ocfs2_journal_access_di;
1173 1166
@@ -1202,7 +1195,6 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1202 de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len)); 1195 de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
1203 } 1196 }
1204bail: 1197bail:
1205 mlog_exit(status);
1206 return status; 1198 return status;
1207} 1199}
1208 1200
@@ -1348,8 +1340,8 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
1348 } 1340 }
1349 } 1341 }
1350 1342
1351 mlog(0, "Dir %llu: delete entry at index: %d\n", 1343 trace_ocfs2_delete_entry_dx((unsigned long long)OCFS2_I(dir)->ip_blkno,
1352 (unsigned long long)OCFS2_I(dir)->ip_blkno, index); 1344 index);
1353 1345
1354 ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry, 1346 ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
1355 leaf_bh, leaf_bh->b_data, leaf_bh->b_size); 1347 leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
@@ -1632,8 +1624,6 @@ int __ocfs2_add_entry(handle_t *handle,
1632 struct buffer_head *insert_bh = lookup->dl_leaf_bh; 1624 struct buffer_head *insert_bh = lookup->dl_leaf_bh;
1633 char *data_start = insert_bh->b_data; 1625 char *data_start = insert_bh->b_data;
1634 1626
1635 mlog_entry_void();
1636
1637 if (!namelen) 1627 if (!namelen)
1638 return -EINVAL; 1628 return -EINVAL;
1639 1629
@@ -1765,8 +1755,9 @@ int __ocfs2_add_entry(handle_t *handle,
1765 * from ever getting here. */ 1755 * from ever getting here. */
1766 retval = -ENOSPC; 1756 retval = -ENOSPC;
1767bail: 1757bail:
1758 if (retval)
1759 mlog_errno(retval);
1768 1760
1769 mlog_exit(retval);
1770 return retval; 1761 return retval;
1771} 1762}
1772 1763
@@ -2028,8 +2019,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
2028 struct inode *inode = filp->f_path.dentry->d_inode; 2019 struct inode *inode = filp->f_path.dentry->d_inode;
2029 int lock_level = 0; 2020 int lock_level = 0;
2030 2021
2031 mlog_entry("dirino=%llu\n", 2022 trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
2032 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2033 2023
2034 error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2024 error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2035 if (lock_level && error >= 0) { 2025 if (lock_level && error >= 0) {
@@ -2051,9 +2041,10 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
2051 dirent, filldir, NULL); 2041 dirent, filldir, NULL);
2052 2042
2053 ocfs2_inode_unlock(inode, lock_level); 2043 ocfs2_inode_unlock(inode, lock_level);
2044 if (error)
2045 mlog_errno(error);
2054 2046
2055bail_nolock: 2047bail_nolock:
2056 mlog_exit(error);
2057 2048
2058 return error; 2049 return error;
2059} 2050}
@@ -2069,8 +2060,8 @@ int ocfs2_find_files_on_disk(const char *name,
2069{ 2060{
2070 int status = -ENOENT; 2061 int status = -ENOENT;
2071 2062
2072 mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno, 2063 trace_ocfs2_find_files_on_disk(namelen, name, blkno,
2073 (unsigned long long)OCFS2_I(inode)->ip_blkno); 2064 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2074 2065
2075 status = ocfs2_find_entry(name, namelen, inode, lookup); 2066 status = ocfs2_find_entry(name, namelen, inode, lookup);
2076 if (status) 2067 if (status)
@@ -2114,8 +2105,8 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
2114 int ret; 2105 int ret;
2115 struct ocfs2_dir_lookup_result lookup = { NULL, }; 2106 struct ocfs2_dir_lookup_result lookup = { NULL, };
2116 2107
2117 mlog_entry("dir %llu, name '%.*s'\n", 2108 trace_ocfs2_check_dir_for_entry(
2118 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); 2109 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
2119 2110
2120 ret = -EEXIST; 2111 ret = -EEXIST;
2121 if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) 2112 if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
@@ -2125,7 +2116,8 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
2125bail: 2116bail:
2126 ocfs2_free_dir_lookup_result(&lookup); 2117 ocfs2_free_dir_lookup_result(&lookup);
2127 2118
2128 mlog_exit(ret); 2119 if (ret)
2120 mlog_errno(ret);
2129 return ret; 2121 return ret;
2130} 2122}
2131 2123
@@ -2324,8 +2316,6 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
2324 struct buffer_head *new_bh = NULL; 2316 struct buffer_head *new_bh = NULL;
2325 struct ocfs2_dir_entry *de; 2317 struct ocfs2_dir_entry *de;
2326 2318
2327 mlog_entry_void();
2328
2329 if (ocfs2_new_dir_wants_trailer(inode)) 2319 if (ocfs2_new_dir_wants_trailer(inode))
2330 size = ocfs2_dir_trailer_blk_off(parent->i_sb); 2320 size = ocfs2_dir_trailer_blk_off(parent->i_sb);
2331 2321
@@ -2380,7 +2370,6 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
2380bail: 2370bail:
2381 brelse(new_bh); 2371 brelse(new_bh);
2382 2372
2383 mlog_exit(status);
2384 return status; 2373 return status;
2385} 2374}
2386 2375
@@ -2409,9 +2398,9 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2409 goto out; 2398 goto out;
2410 } 2399 }
2411 2400
2412 mlog(0, "Dir %llu, attach new index block: %llu\n", 2401 trace_ocfs2_dx_dir_attach_index(
2413 (unsigned long long)OCFS2_I(dir)->ip_blkno, 2402 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2414 (unsigned long long)dr_blkno); 2403 (unsigned long long)dr_blkno);
2415 2404
2416 dx_root_bh = sb_getblk(osb->sb, dr_blkno); 2405 dx_root_bh = sb_getblk(osb->sb, dr_blkno);
2417 if (dx_root_bh == NULL) { 2406 if (dx_root_bh == NULL) {
@@ -2511,11 +2500,10 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
2511 dx_leaf->dl_list.de_count = 2500 dx_leaf->dl_list.de_count =
2512 cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb)); 2501 cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
2513 2502
2514 mlog(0, 2503 trace_ocfs2_dx_dir_format_cluster(
2515 "Dir %llu, format dx_leaf: %llu, entry count: %u\n", 2504 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2516 (unsigned long long)OCFS2_I(dir)->ip_blkno, 2505 (unsigned long long)bh->b_blocknr,
2517 (unsigned long long)bh->b_blocknr, 2506 le16_to_cpu(dx_leaf->dl_list.de_count));
2518 le16_to_cpu(dx_leaf->dl_list.de_count));
2519 2507
2520 ocfs2_journal_dirty(handle, bh); 2508 ocfs2_journal_dirty(handle, bh);
2521 } 2509 }
@@ -2759,12 +2747,11 @@ static void ocfs2_dx_dir_index_root_block(struct inode *dir,
2759 2747
2760 ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo); 2748 ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
2761 2749
2762 mlog(0, 2750 trace_ocfs2_dx_dir_index_root_block(
2763 "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n", 2751 (unsigned long long)dir->i_ino,
2764 (unsigned long long)dir->i_ino, hinfo.major_hash, 2752 hinfo.major_hash, hinfo.minor_hash,
2765 hinfo.minor_hash, 2753 de->name_len, de->name,
2766 le16_to_cpu(dx_root->dr_entries.de_num_used), 2754 le16_to_cpu(dx_root->dr_entries.de_num_used));
2767 de->name_len, de->name);
2768 2755
2769 ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo, 2756 ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
2770 dirent_blk); 2757 dirent_blk);
@@ -3235,7 +3222,6 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
3235bail: 3222bail:
3236 if (did_quota && status < 0) 3223 if (did_quota && status < 0)
3237 dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); 3224 dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
3238 mlog_exit(status);
3239 return status; 3225 return status;
3240} 3226}
3241 3227
@@ -3270,8 +3256,6 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
3270 struct ocfs2_extent_tree et; 3256 struct ocfs2_extent_tree et;
3271 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; 3257 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
3272 3258
3273 mlog_entry_void();
3274
3275 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 3259 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
3276 /* 3260 /*
3277 * This would be a code error as an inline directory should 3261 * This would be a code error as an inline directory should
@@ -3320,8 +3304,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
3320 down_write(&OCFS2_I(dir)->ip_alloc_sem); 3304 down_write(&OCFS2_I(dir)->ip_alloc_sem);
3321 drop_alloc_sem = 1; 3305 drop_alloc_sem = 1;
3322 dir_i_size = i_size_read(dir); 3306 dir_i_size = i_size_read(dir);
3323 mlog(0, "extending dir %llu (i_size = %lld)\n", 3307 trace_ocfs2_extend_dir((unsigned long long)OCFS2_I(dir)->ip_blkno,
3324 (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size); 3308 dir_i_size);
3325 3309
3326 /* dir->i_size is always block aligned. */ 3310 /* dir->i_size is always block aligned. */
3327 spin_lock(&OCFS2_I(dir)->ip_lock); 3311 spin_lock(&OCFS2_I(dir)->ip_lock);
@@ -3436,7 +3420,6 @@ bail:
3436 3420
3437 brelse(new_bh); 3421 brelse(new_bh);
3438 3422
3439 mlog_exit(status);
3440 return status; 3423 return status;
3441} 3424}
3442 3425
@@ -3583,8 +3566,9 @@ next:
3583 status = 0; 3566 status = 0;
3584bail: 3567bail:
3585 brelse(bh); 3568 brelse(bh);
3569 if (status)
3570 mlog_errno(status);
3586 3571
3587 mlog_exit(status);
3588 return status; 3572 return status;
3589} 3573}
3590 3574
@@ -3815,9 +3799,9 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3815 struct ocfs2_dx_root_block *dx_root; 3799 struct ocfs2_dx_root_block *dx_root;
3816 struct ocfs2_dx_leaf *tmp_dx_leaf = NULL; 3800 struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
3817 3801
3818 mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n", 3802 trace_ocfs2_dx_dir_rebalance((unsigned long long)OCFS2_I(dir)->ip_blkno,
3819 (unsigned long long)OCFS2_I(dir)->ip_blkno, 3803 (unsigned long long)leaf_blkno,
3820 (unsigned long long)leaf_blkno, insert_hash); 3804 insert_hash);
3821 3805
3822 ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); 3806 ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
3823 3807
@@ -3897,8 +3881,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3897 goto out_commit; 3881 goto out_commit;
3898 } 3882 }
3899 3883
3900 mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n", 3884 trace_ocfs2_dx_dir_rebalance_split(leaf_cpos, split_hash, insert_hash);
3901 leaf_cpos, split_hash, insert_hash);
3902 3885
3903 /* 3886 /*
3904 * We have to carefully order operations here. There are items 3887 * We have to carefully order operations here. There are items
@@ -4355,8 +4338,8 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
4355 unsigned int blocks_wanted = 1; 4338 unsigned int blocks_wanted = 1;
4356 struct buffer_head *bh = NULL; 4339 struct buffer_head *bh = NULL;
4357 4340
4358 mlog(0, "getting ready to insert namelen %d into dir %llu\n", 4341 trace_ocfs2_prepare_dir_for_insert(
4359 namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno); 4342 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen);
4360 4343
4361 if (!namelen) { 4344 if (!namelen) {
4362 ret = -EINVAL; 4345 ret = -EINVAL;
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index dcebf0d920fa..c8a044efbb15 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,4 +1,4 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1ccflags-y := -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o 3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
4 4
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 9f30491e5e88..29a886d1e82c 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -128,8 +128,8 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
128 128
129 assert_spin_locked(&res->spinlock); 129 assert_spin_locked(&res->spinlock);
130 130
131 mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n", 131 mlog(0, "type=%d, convert_type=%d, new convert_type=%d\n",
132 lock->ml.type, lock->ml.convert_type, type); 132 lock->ml.type, lock->ml.convert_type, type);
133 133
134 spin_lock(&lock->spinlock); 134 spin_lock(&lock->spinlock);
135 135
@@ -353,7 +353,7 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
353 struct kvec vec[2]; 353 struct kvec vec[2];
354 size_t veclen = 1; 354 size_t veclen = 1;
355 355
356 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); 356 mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
357 357
358 memset(&convert, 0, sizeof(struct dlm_convert_lock)); 358 memset(&convert, 0, sizeof(struct dlm_convert_lock));
359 convert.node_idx = dlm->node_num; 359 convert.node_idx = dlm->node_num;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 7e38a072d720..7540a492eaba 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -188,7 +188,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
188 struct hlist_head *bucket; 188 struct hlist_head *bucket;
189 struct hlist_node *list; 189 struct hlist_node *list;
190 190
191 mlog_entry("%.*s\n", len, name); 191 mlog(0, "%.*s\n", len, name);
192 192
193 assert_spin_locked(&dlm->spinlock); 193 assert_spin_locked(&dlm->spinlock);
194 194
@@ -222,7 +222,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
222{ 222{
223 struct dlm_lock_resource *res = NULL; 223 struct dlm_lock_resource *res = NULL;
224 224
225 mlog_entry("%.*s\n", len, name); 225 mlog(0, "%.*s\n", len, name);
226 226
227 assert_spin_locked(&dlm->spinlock); 227 assert_spin_locked(&dlm->spinlock);
228 228
@@ -531,7 +531,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
531 unsigned int node; 531 unsigned int node;
532 struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf; 532 struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
533 533
534 mlog_entry("%p %u %p", msg, len, data); 534 mlog(0, "%p %u %p", msg, len, data);
535 535
536 if (!dlm_grab(dlm)) 536 if (!dlm_grab(dlm))
537 return 0; 537 return 0;
@@ -926,9 +926,10 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
926} 926}
927 927
928static int dlm_match_regions(struct dlm_ctxt *dlm, 928static int dlm_match_regions(struct dlm_ctxt *dlm,
929 struct dlm_query_region *qr) 929 struct dlm_query_region *qr,
930 char *local, int locallen)
930{ 931{
931 char *local = NULL, *remote = qr->qr_regions; 932 char *remote = qr->qr_regions;
932 char *l, *r; 933 char *l, *r;
933 int localnr, i, j, foundit; 934 int localnr, i, j, foundit;
934 int status = 0; 935 int status = 0;
@@ -957,13 +958,8 @@ static int dlm_match_regions(struct dlm_ctxt *dlm,
957 r += O2HB_MAX_REGION_NAME_LEN; 958 r += O2HB_MAX_REGION_NAME_LEN;
958 } 959 }
959 960
960 local = kmalloc(sizeof(qr->qr_regions), GFP_ATOMIC); 961 localnr = min(O2NM_MAX_REGIONS, locallen/O2HB_MAX_REGION_NAME_LEN);
961 if (!local) { 962 localnr = o2hb_get_all_regions(local, (u8)localnr);
962 status = -ENOMEM;
963 goto bail;
964 }
965
966 localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS);
967 963
968 /* compare local regions with remote */ 964 /* compare local regions with remote */
969 l = local; 965 l = local;
@@ -1012,8 +1008,6 @@ static int dlm_match_regions(struct dlm_ctxt *dlm,
1012 } 1008 }
1013 1009
1014bail: 1010bail:
1015 kfree(local);
1016
1017 return status; 1011 return status;
1018} 1012}
1019 1013
@@ -1075,6 +1069,7 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1075{ 1069{
1076 struct dlm_query_region *qr; 1070 struct dlm_query_region *qr;
1077 struct dlm_ctxt *dlm = NULL; 1071 struct dlm_ctxt *dlm = NULL;
1072 char *local = NULL;
1078 int status = 0; 1073 int status = 0;
1079 int locked = 0; 1074 int locked = 0;
1080 1075
@@ -1083,6 +1078,13 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1083 mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node, 1078 mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node,
1084 qr->qr_domain); 1079 qr->qr_domain);
1085 1080
1081 /* buffer used in dlm_mast_regions() */
1082 local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
1083 if (!local) {
1084 status = -ENOMEM;
1085 goto bail;
1086 }
1087
1086 status = -EINVAL; 1088 status = -EINVAL;
1087 1089
1088 spin_lock(&dlm_domain_lock); 1090 spin_lock(&dlm_domain_lock);
@@ -1112,13 +1114,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1112 goto bail; 1114 goto bail;
1113 } 1115 }
1114 1116
1115 status = dlm_match_regions(dlm, qr); 1117 status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions));
1116 1118
1117bail: 1119bail:
1118 if (locked) 1120 if (locked)
1119 spin_unlock(&dlm->spinlock); 1121 spin_unlock(&dlm->spinlock);
1120 spin_unlock(&dlm_domain_lock); 1122 spin_unlock(&dlm_domain_lock);
1121 1123
1124 kfree(local);
1125
1122 return status; 1126 return status;
1123} 1127}
1124 1128
@@ -1553,7 +1557,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
1553 struct domain_join_ctxt *ctxt; 1557 struct domain_join_ctxt *ctxt;
1554 enum dlm_query_join_response_code response = JOIN_DISALLOW; 1558 enum dlm_query_join_response_code response = JOIN_DISALLOW;
1555 1559
1556 mlog_entry("%p", dlm); 1560 mlog(0, "%p", dlm);
1557 1561
1558 ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); 1562 ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
1559 if (!ctxt) { 1563 if (!ctxt) {
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 7009292aac5a..8d39e0fd66f7 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -128,7 +128,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
128 int call_ast = 0, kick_thread = 0; 128 int call_ast = 0, kick_thread = 0;
129 enum dlm_status status = DLM_NORMAL; 129 enum dlm_status status = DLM_NORMAL;
130 130
131 mlog_entry("type=%d\n", lock->ml.type); 131 mlog(0, "type=%d\n", lock->ml.type);
132 132
133 spin_lock(&res->spinlock); 133 spin_lock(&res->spinlock);
134 /* if called from dlm_create_lock_handler, need to 134 /* if called from dlm_create_lock_handler, need to
@@ -227,8 +227,8 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
227 enum dlm_status status = DLM_DENIED; 227 enum dlm_status status = DLM_DENIED;
228 int lockres_changed = 1; 228 int lockres_changed = 1;
229 229
230 mlog_entry("type=%d\n", lock->ml.type); 230 mlog(0, "type=%d, lockres %.*s, flags = 0x%x\n",
231 mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len, 231 lock->ml.type, res->lockname.len,
232 res->lockname.name, flags); 232 res->lockname.name, flags);
233 233
234 spin_lock(&res->spinlock); 234 spin_lock(&res->spinlock);
@@ -308,8 +308,6 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
308 int tmpret, status = 0; 308 int tmpret, status = 0;
309 enum dlm_status ret; 309 enum dlm_status ret;
310 310
311 mlog_entry_void();
312
313 memset(&create, 0, sizeof(create)); 311 memset(&create, 0, sizeof(create));
314 create.node_idx = dlm->node_num; 312 create.node_idx = dlm->node_num;
315 create.requested_type = lock->ml.type; 313 create.requested_type = lock->ml.type;
@@ -477,8 +475,6 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data,
477 475
478 BUG_ON(!dlm); 476 BUG_ON(!dlm);
479 477
480 mlog_entry_void();
481
482 if (!dlm_grab(dlm)) 478 if (!dlm_grab(dlm))
483 return DLM_REJECTED; 479 return DLM_REJECTED;
484 480
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 59f0f6bdfc62..fede57ed005f 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -426,8 +426,6 @@ static void dlm_mle_release(struct kref *kref)
426 struct dlm_master_list_entry *mle; 426 struct dlm_master_list_entry *mle;
427 struct dlm_ctxt *dlm; 427 struct dlm_ctxt *dlm;
428 428
429 mlog_entry_void();
430
431 mle = container_of(kref, struct dlm_master_list_entry, mle_refs); 429 mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
432 dlm = mle->dlm; 430 dlm = mle->dlm;
433 431
@@ -810,7 +808,7 @@ lookup:
810 dlm_mle_detach_hb_events(dlm, mle); 808 dlm_mle_detach_hb_events(dlm, mle);
811 dlm_put_mle(mle); 809 dlm_put_mle(mle);
812 mle = NULL; 810 mle = NULL;
813 /* this is lame, but we cant wait on either 811 /* this is lame, but we can't wait on either
814 * the mle or lockres waitqueue here */ 812 * the mle or lockres waitqueue here */
815 if (mig) 813 if (mig)
816 msleep(100); 814 msleep(100);
@@ -845,7 +843,7 @@ lookup:
845 843
846 /* finally add the lockres to its hash bucket */ 844 /* finally add the lockres to its hash bucket */
847 __dlm_insert_lockres(dlm, res); 845 __dlm_insert_lockres(dlm, res);
848 /* since this lockres is new it doesnt not require the spinlock */ 846 /* since this lockres is new it doesn't not require the spinlock */
849 dlm_lockres_grab_inflight_ref_new(dlm, res); 847 dlm_lockres_grab_inflight_ref_new(dlm, res);
850 848
851 /* if this node does not become the master make sure to drop 849 /* if this node does not become the master make sure to drop
@@ -3120,8 +3118,6 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3120 3118
3121 *oldmle = NULL; 3119 *oldmle = NULL;
3122 3120
3123 mlog_entry_void();
3124
3125 assert_spin_locked(&dlm->spinlock); 3121 assert_spin_locked(&dlm->spinlock);
3126 assert_spin_locked(&dlm->master_lock); 3122 assert_spin_locked(&dlm->master_lock);
3127 3123
@@ -3261,7 +3257,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
3261 struct hlist_node *list; 3257 struct hlist_node *list;
3262 unsigned int i; 3258 unsigned int i;
3263 3259
3264 mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); 3260 mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
3265top: 3261top:
3266 assert_spin_locked(&dlm->spinlock); 3262 assert_spin_locked(&dlm->spinlock);
3267 3263
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index aaaffbcbe916..f1beb6fc254d 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -727,7 +727,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
727 if (destroy) 727 if (destroy)
728 dlm_destroy_recovery_area(dlm, dead_node); 728 dlm_destroy_recovery_area(dlm, dead_node);
729 729
730 mlog_exit(status);
731 return status; 730 return status;
732} 731}
733 732
@@ -1496,9 +1495,9 @@ leave:
1496 kfree(buf); 1495 kfree(buf);
1497 if (item) 1496 if (item)
1498 kfree(item); 1497 kfree(item);
1498 mlog_errno(ret);
1499 } 1499 }
1500 1500
1501 mlog_exit(ret);
1502 return ret; 1501 return ret;
1503} 1502}
1504 1503
@@ -1567,7 +1566,6 @@ leave:
1567 dlm_lockres_put(res); 1566 dlm_lockres_put(res);
1568 } 1567 }
1569 kfree(data); 1568 kfree(data);
1570 mlog_exit(ret);
1571} 1569}
1572 1570
1573 1571
@@ -1986,7 +1984,6 @@ leave:
1986 dlm_lock_put(newlock); 1984 dlm_lock_put(newlock);
1987 } 1985 }
1988 1986
1989 mlog_exit(ret);
1990 return ret; 1987 return ret;
1991} 1988}
1992 1989
@@ -2083,8 +2080,6 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2083 struct hlist_head *bucket; 2080 struct hlist_head *bucket;
2084 struct dlm_lock_resource *res, *next; 2081 struct dlm_lock_resource *res, *next;
2085 2082
2086 mlog_entry_void();
2087
2088 assert_spin_locked(&dlm->spinlock); 2083 assert_spin_locked(&dlm->spinlock);
2089 2084
2090 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { 2085 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
@@ -2607,8 +2602,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
2607 int nodenum; 2602 int nodenum;
2608 int status; 2603 int status;
2609 2604
2610 mlog_entry("%u\n", dead_node);
2611
2612 mlog(0, "%s: dead node is %u\n", dlm->name, dead_node); 2605 mlog(0, "%s: dead node is %u\n", dlm->name, dead_node);
2613 2606
2614 spin_lock(&dlm->spinlock); 2607 spin_lock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 817287c6a6db..850aa7e87537 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -317,7 +317,7 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
317 struct kvec vec[2]; 317 struct kvec vec[2];
318 size_t veclen = 1; 318 size_t veclen = 1;
319 319
320 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); 320 mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
321 321
322 if (owner == dlm->node_num) { 322 if (owner == dlm->node_num) {
323 /* ended up trying to contact ourself. this means 323 /* ended up trying to contact ourself. this means
@@ -588,8 +588,6 @@ enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb,
588 struct dlm_lock *lock = NULL; 588 struct dlm_lock *lock = NULL;
589 int call_ast, is_master; 589 int call_ast, is_master;
590 590
591 mlog_entry_void();
592
593 if (!lksb) { 591 if (!lksb) {
594 dlm_error(DLM_BADARGS); 592 dlm_error(DLM_BADARGS);
595 return DLM_BADARGS; 593 return DLM_BADARGS;
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index df69b4856d0d..f14be89a6701 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -1,4 +1,4 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1ccflags-y := -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o 3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
4 4
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e8d94d722ecb..7642d7ca73e5 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -64,7 +64,7 @@ struct ocfs2_mask_waiter {
64 unsigned long mw_mask; 64 unsigned long mw_mask;
65 unsigned long mw_goal; 65 unsigned long mw_goal;
66#ifdef CONFIG_OCFS2_FS_STATS 66#ifdef CONFIG_OCFS2_FS_STATS
67 unsigned long long mw_lock_start; 67 ktime_t mw_lock_start;
68#endif 68#endif
69}; 69};
70 70
@@ -397,8 +397,6 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
397{ 397{
398 int len; 398 int len;
399 399
400 mlog_entry_void();
401
402 BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); 400 BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
403 401
404 len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x", 402 len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
@@ -408,8 +406,6 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
408 BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1)); 406 BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
409 407
410 mlog(0, "built lock resource with name: %s\n", name); 408 mlog(0, "built lock resource with name: %s\n", name);
411
412 mlog_exit_void();
413} 409}
414 410
415static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock); 411static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
@@ -435,44 +431,41 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
435#ifdef CONFIG_OCFS2_FS_STATS 431#ifdef CONFIG_OCFS2_FS_STATS
436static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) 432static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
437{ 433{
438 res->l_lock_num_prmode = 0;
439 res->l_lock_num_prmode_failed = 0;
440 res->l_lock_total_prmode = 0;
441 res->l_lock_max_prmode = 0;
442 res->l_lock_num_exmode = 0;
443 res->l_lock_num_exmode_failed = 0;
444 res->l_lock_total_exmode = 0;
445 res->l_lock_max_exmode = 0;
446 res->l_lock_refresh = 0; 434 res->l_lock_refresh = 0;
435 memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats));
436 memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats));
447} 437}
448 438
449static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level, 439static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
450 struct ocfs2_mask_waiter *mw, int ret) 440 struct ocfs2_mask_waiter *mw, int ret)
451{ 441{
452 unsigned long long *num, *sum; 442 u32 usec;
453 unsigned int *max, *failed; 443 ktime_t kt;
454 struct timespec ts = current_kernel_time(); 444 struct ocfs2_lock_stats *stats;
455 unsigned long long time = timespec_to_ns(&ts) - mw->mw_lock_start; 445
456 446 if (level == LKM_PRMODE)
457 if (level == LKM_PRMODE) { 447 stats = &res->l_lock_prmode;
458 num = &res->l_lock_num_prmode; 448 else if (level == LKM_EXMODE)
459 sum = &res->l_lock_total_prmode; 449 stats = &res->l_lock_exmode;
460 max = &res->l_lock_max_prmode; 450 else
461 failed = &res->l_lock_num_prmode_failed;
462 } else if (level == LKM_EXMODE) {
463 num = &res->l_lock_num_exmode;
464 sum = &res->l_lock_total_exmode;
465 max = &res->l_lock_max_exmode;
466 failed = &res->l_lock_num_exmode_failed;
467 } else
468 return; 451 return;
469 452
470 (*num)++; 453 kt = ktime_sub(ktime_get(), mw->mw_lock_start);
471 (*sum) += time; 454 usec = ktime_to_us(kt);
472 if (time > *max) 455
473 *max = time; 456 stats->ls_gets++;
457 stats->ls_total += ktime_to_ns(kt);
458 /* overflow */
459 if (unlikely(stats->ls_gets) == 0) {
460 stats->ls_gets++;
461 stats->ls_total = ktime_to_ns(kt);
462 }
463
464 if (stats->ls_max < usec)
465 stats->ls_max = usec;
466
474 if (ret) 467 if (ret)
475 (*failed)++; 468 stats->ls_fail++;
476} 469}
477 470
478static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) 471static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
@@ -482,8 +475,7 @@ static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
482 475
483static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) 476static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
484{ 477{
485 struct timespec ts = current_kernel_time(); 478 mw->mw_lock_start = ktime_get();
486 mw->mw_lock_start = timespec_to_ns(&ts);
487} 479}
488#else 480#else
489static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) 481static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
@@ -729,8 +721,6 @@ void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
729 721
730void ocfs2_lock_res_free(struct ocfs2_lock_res *res) 722void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
731{ 723{
732 mlog_entry_void();
733
734 if (!(res->l_flags & OCFS2_LOCK_INITIALIZED)) 724 if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
735 return; 725 return;
736 726
@@ -756,14 +746,11 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
756 memset(&res->l_lksb, 0, sizeof(res->l_lksb)); 746 memset(&res->l_lksb, 0, sizeof(res->l_lksb));
757 747
758 res->l_flags = 0UL; 748 res->l_flags = 0UL;
759 mlog_exit_void();
760} 749}
761 750
762static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, 751static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
763 int level) 752 int level)
764{ 753{
765 mlog_entry_void();
766
767 BUG_ON(!lockres); 754 BUG_ON(!lockres);
768 755
769 switch(level) { 756 switch(level) {
@@ -776,15 +763,11 @@ static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
776 default: 763 default:
777 BUG(); 764 BUG();
778 } 765 }
779
780 mlog_exit_void();
781} 766}
782 767
783static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, 768static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
784 int level) 769 int level)
785{ 770{
786 mlog_entry_void();
787
788 BUG_ON(!lockres); 771 BUG_ON(!lockres);
789 772
790 switch(level) { 773 switch(level) {
@@ -799,7 +782,6 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
799 default: 782 default:
800 BUG(); 783 BUG();
801 } 784 }
802 mlog_exit_void();
803} 785}
804 786
805/* WARNING: This function lives in a world where the only three lock 787/* WARNING: This function lives in a world where the only three lock
@@ -846,8 +828,6 @@ static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
846 828
847static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres) 829static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
848{ 830{
849 mlog_entry_void();
850
851 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); 831 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
852 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); 832 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
853 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); 833 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
@@ -860,14 +840,10 @@ static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res
860 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); 840 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
861 } 841 }
862 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 842 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
863
864 mlog_exit_void();
865} 843}
866 844
867static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres) 845static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
868{ 846{
869 mlog_entry_void();
870
871 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); 847 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
872 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); 848 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
873 849
@@ -889,14 +865,10 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
889 lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); 865 lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
890 866
891 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 867 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
892
893 mlog_exit_void();
894} 868}
895 869
896static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres) 870static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
897{ 871{
898 mlog_entry_void();
899
900 BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY))); 872 BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
901 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); 873 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
902 874
@@ -908,15 +880,12 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
908 lockres->l_level = lockres->l_requested; 880 lockres->l_level = lockres->l_requested;
909 lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED); 881 lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
910 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 882 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
911
912 mlog_exit_void();
913} 883}
914 884
915static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, 885static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
916 int level) 886 int level)
917{ 887{
918 int needs_downconvert = 0; 888 int needs_downconvert = 0;
919 mlog_entry_void();
920 889
921 assert_spin_locked(&lockres->l_lock); 890 assert_spin_locked(&lockres->l_lock);
922 891
@@ -938,8 +907,7 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
938 907
939 if (needs_downconvert) 908 if (needs_downconvert)
940 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); 909 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
941 910 mlog(0, "needs_downconvert = %d\n", needs_downconvert);
942 mlog_exit(needs_downconvert);
943 return needs_downconvert; 911 return needs_downconvert;
944} 912}
945 913
@@ -1151,8 +1119,6 @@ static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
1151 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); 1119 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1152 unsigned long flags; 1120 unsigned long flags;
1153 1121
1154 mlog_entry_void();
1155
1156 mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n", 1122 mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
1157 lockres->l_name, lockres->l_unlock_action); 1123 lockres->l_name, lockres->l_unlock_action);
1158 1124
@@ -1162,7 +1128,6 @@ static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
1162 "unlock_action %d\n", error, lockres->l_name, 1128 "unlock_action %d\n", error, lockres->l_name,
1163 lockres->l_unlock_action); 1129 lockres->l_unlock_action);
1164 spin_unlock_irqrestore(&lockres->l_lock, flags); 1130 spin_unlock_irqrestore(&lockres->l_lock, flags);
1165 mlog_exit_void();
1166 return; 1131 return;
1167 } 1132 }
1168 1133
@@ -1186,8 +1151,6 @@ static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
1186 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; 1151 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
1187 wake_up(&lockres->l_event); 1152 wake_up(&lockres->l_event);
1188 spin_unlock_irqrestore(&lockres->l_lock, flags); 1153 spin_unlock_irqrestore(&lockres->l_lock, flags);
1189
1190 mlog_exit_void();
1191} 1154}
1192 1155
1193/* 1156/*
@@ -1233,7 +1196,6 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
1233{ 1196{
1234 unsigned long flags; 1197 unsigned long flags;
1235 1198
1236 mlog_entry_void();
1237 spin_lock_irqsave(&lockres->l_lock, flags); 1199 spin_lock_irqsave(&lockres->l_lock, flags);
1238 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 1200 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
1239 lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); 1201 lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
@@ -1244,7 +1206,6 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
1244 spin_unlock_irqrestore(&lockres->l_lock, flags); 1206 spin_unlock_irqrestore(&lockres->l_lock, flags);
1245 1207
1246 wake_up(&lockres->l_event); 1208 wake_up(&lockres->l_event);
1247 mlog_exit_void();
1248} 1209}
1249 1210
1250/* Note: If we detect another process working on the lock (i.e., 1211/* Note: If we detect another process working on the lock (i.e.,
@@ -1260,8 +1221,6 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
1260 unsigned long flags; 1221 unsigned long flags;
1261 unsigned int gen; 1222 unsigned int gen;
1262 1223
1263 mlog_entry_void();
1264
1265 mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level, 1224 mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
1266 dlm_flags); 1225 dlm_flags);
1267 1226
@@ -1293,7 +1252,6 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
1293 mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name); 1252 mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);
1294 1253
1295bail: 1254bail:
1296 mlog_exit(ret);
1297 return ret; 1255 return ret;
1298} 1256}
1299 1257
@@ -1416,8 +1374,6 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
1416 unsigned int gen; 1374 unsigned int gen;
1417 int noqueue_attempted = 0; 1375 int noqueue_attempted = 0;
1418 1376
1419 mlog_entry_void();
1420
1421 ocfs2_init_mask_waiter(&mw); 1377 ocfs2_init_mask_waiter(&mw);
1422 1378
1423 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) 1379 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
@@ -1583,7 +1539,6 @@ out:
1583 caller_ip); 1539 caller_ip);
1584 } 1540 }
1585#endif 1541#endif
1586 mlog_exit(ret);
1587 return ret; 1542 return ret;
1588} 1543}
1589 1544
@@ -1605,7 +1560,6 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
1605{ 1560{
1606 unsigned long flags; 1561 unsigned long flags;
1607 1562
1608 mlog_entry_void();
1609 spin_lock_irqsave(&lockres->l_lock, flags); 1563 spin_lock_irqsave(&lockres->l_lock, flags);
1610 ocfs2_dec_holders(lockres, level); 1564 ocfs2_dec_holders(lockres, level);
1611 ocfs2_downconvert_on_unlock(osb, lockres); 1565 ocfs2_downconvert_on_unlock(osb, lockres);
@@ -1614,7 +1568,6 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
1614 if (lockres->l_lockdep_map.key != NULL) 1568 if (lockres->l_lockdep_map.key != NULL)
1615 rwsem_release(&lockres->l_lockdep_map, 1, caller_ip); 1569 rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
1616#endif 1570#endif
1617 mlog_exit_void();
1618} 1571}
1619 1572
1620static int ocfs2_create_new_lock(struct ocfs2_super *osb, 1573static int ocfs2_create_new_lock(struct ocfs2_super *osb,
@@ -1648,8 +1601,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1648 BUG_ON(!inode); 1601 BUG_ON(!inode);
1649 BUG_ON(!ocfs2_inode_is_new(inode)); 1602 BUG_ON(!ocfs2_inode_is_new(inode));
1650 1603
1651 mlog_entry_void();
1652
1653 mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); 1604 mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
1654 1605
1655 /* NOTE: That we don't increment any of the holder counts, nor 1606 /* NOTE: That we don't increment any of the holder counts, nor
@@ -1683,7 +1634,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1683 } 1634 }
1684 1635
1685bail: 1636bail:
1686 mlog_exit(ret);
1687 return ret; 1637 return ret;
1688} 1638}
1689 1639
@@ -1695,16 +1645,12 @@ int ocfs2_rw_lock(struct inode *inode, int write)
1695 1645
1696 BUG_ON(!inode); 1646 BUG_ON(!inode);
1697 1647
1698 mlog_entry_void();
1699
1700 mlog(0, "inode %llu take %s RW lock\n", 1648 mlog(0, "inode %llu take %s RW lock\n",
1701 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1649 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1702 write ? "EXMODE" : "PRMODE"); 1650 write ? "EXMODE" : "PRMODE");
1703 1651
1704 if (ocfs2_mount_local(osb)) { 1652 if (ocfs2_mount_local(osb))
1705 mlog_exit(0);
1706 return 0; 1653 return 0;
1707 }
1708 1654
1709 lockres = &OCFS2_I(inode)->ip_rw_lockres; 1655 lockres = &OCFS2_I(inode)->ip_rw_lockres;
1710 1656
@@ -1715,7 +1661,6 @@ int ocfs2_rw_lock(struct inode *inode, int write)
1715 if (status < 0) 1661 if (status < 0)
1716 mlog_errno(status); 1662 mlog_errno(status);
1717 1663
1718 mlog_exit(status);
1719 return status; 1664 return status;
1720} 1665}
1721 1666
@@ -1725,16 +1670,12 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
1725 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; 1670 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
1726 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1671 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1727 1672
1728 mlog_entry_void();
1729
1730 mlog(0, "inode %llu drop %s RW lock\n", 1673 mlog(0, "inode %llu drop %s RW lock\n",
1731 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1674 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1732 write ? "EXMODE" : "PRMODE"); 1675 write ? "EXMODE" : "PRMODE");
1733 1676
1734 if (!ocfs2_mount_local(osb)) 1677 if (!ocfs2_mount_local(osb))
1735 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); 1678 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1736
1737 mlog_exit_void();
1738} 1679}
1739 1680
1740/* 1681/*
@@ -1748,8 +1689,6 @@ int ocfs2_open_lock(struct inode *inode)
1748 1689
1749 BUG_ON(!inode); 1690 BUG_ON(!inode);
1750 1691
1751 mlog_entry_void();
1752
1753 mlog(0, "inode %llu take PRMODE open lock\n", 1692 mlog(0, "inode %llu take PRMODE open lock\n",
1754 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1693 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1755 1694
@@ -1764,7 +1703,6 @@ int ocfs2_open_lock(struct inode *inode)
1764 mlog_errno(status); 1703 mlog_errno(status);
1765 1704
1766out: 1705out:
1767 mlog_exit(status);
1768 return status; 1706 return status;
1769} 1707}
1770 1708
@@ -1776,8 +1714,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
1776 1714
1777 BUG_ON(!inode); 1715 BUG_ON(!inode);
1778 1716
1779 mlog_entry_void();
1780
1781 mlog(0, "inode %llu try to take %s open lock\n", 1717 mlog(0, "inode %llu try to take %s open lock\n",
1782 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1718 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1783 write ? "EXMODE" : "PRMODE"); 1719 write ? "EXMODE" : "PRMODE");
@@ -1799,7 +1735,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
1799 level, DLM_LKF_NOQUEUE, 0); 1735 level, DLM_LKF_NOQUEUE, 0);
1800 1736
1801out: 1737out:
1802 mlog_exit(status);
1803 return status; 1738 return status;
1804} 1739}
1805 1740
@@ -1811,8 +1746,6 @@ void ocfs2_open_unlock(struct inode *inode)
1811 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres; 1746 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
1812 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1747 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1813 1748
1814 mlog_entry_void();
1815
1816 mlog(0, "inode %llu drop open lock\n", 1749 mlog(0, "inode %llu drop open lock\n",
1817 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1750 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1818 1751
@@ -1827,7 +1760,7 @@ void ocfs2_open_unlock(struct inode *inode)
1827 DLM_LOCK_EX); 1760 DLM_LOCK_EX);
1828 1761
1829out: 1762out:
1830 mlog_exit_void(); 1763 return;
1831} 1764}
1832 1765
1833static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres, 1766static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
@@ -2043,8 +1976,6 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
2043{ 1976{
2044 int kick = 0; 1977 int kick = 0;
2045 1978
2046 mlog_entry_void();
2047
2048 /* If we know that another node is waiting on our lock, kick 1979 /* If we know that another node is waiting on our lock, kick
2049 * the downconvert thread * pre-emptively when we reach a release 1980 * the downconvert thread * pre-emptively when we reach a release
2050 * condition. */ 1981 * condition. */
@@ -2065,8 +1996,6 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
2065 1996
2066 if (kick) 1997 if (kick)
2067 ocfs2_wake_downconvert_thread(osb); 1998 ocfs2_wake_downconvert_thread(osb);
2068
2069 mlog_exit_void();
2070} 1999}
2071 2000
2072#define OCFS2_SEC_BITS 34 2001#define OCFS2_SEC_BITS 34
@@ -2095,8 +2024,6 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
2095 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; 2024 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
2096 struct ocfs2_meta_lvb *lvb; 2025 struct ocfs2_meta_lvb *lvb;
2097 2026
2098 mlog_entry_void();
2099
2100 lvb = ocfs2_dlm_lvb(&lockres->l_lksb); 2027 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
2101 2028
2102 /* 2029 /*
@@ -2128,8 +2055,6 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
2128 2055
2129out: 2056out:
2130 mlog_meta_lvb(0, lockres); 2057 mlog_meta_lvb(0, lockres);
2131
2132 mlog_exit_void();
2133} 2058}
2134 2059
2135static void ocfs2_unpack_timespec(struct timespec *spec, 2060static void ocfs2_unpack_timespec(struct timespec *spec,
@@ -2145,8 +2070,6 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
2145 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; 2070 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
2146 struct ocfs2_meta_lvb *lvb; 2071 struct ocfs2_meta_lvb *lvb;
2147 2072
2148 mlog_entry_void();
2149
2150 mlog_meta_lvb(0, lockres); 2073 mlog_meta_lvb(0, lockres);
2151 2074
2152 lvb = ocfs2_dlm_lvb(&lockres->l_lksb); 2075 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
@@ -2177,8 +2100,6 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
2177 ocfs2_unpack_timespec(&inode->i_ctime, 2100 ocfs2_unpack_timespec(&inode->i_ctime,
2178 be64_to_cpu(lvb->lvb_ictime_packed)); 2101 be64_to_cpu(lvb->lvb_ictime_packed));
2179 spin_unlock(&oi->ip_lock); 2102 spin_unlock(&oi->ip_lock);
2180
2181 mlog_exit_void();
2182} 2103}
2183 2104
2184static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, 2105static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
@@ -2205,8 +2126,6 @@ static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
2205 unsigned long flags; 2126 unsigned long flags;
2206 int status = 0; 2127 int status = 0;
2207 2128
2208 mlog_entry_void();
2209
2210refresh_check: 2129refresh_check:
2211 spin_lock_irqsave(&lockres->l_lock, flags); 2130 spin_lock_irqsave(&lockres->l_lock, flags);
2212 if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) { 2131 if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
@@ -2227,7 +2146,7 @@ refresh_check:
2227 2146
2228 status = 1; 2147 status = 1;
2229bail: 2148bail:
2230 mlog_exit(status); 2149 mlog(0, "status %d\n", status);
2231 return status; 2150 return status;
2232} 2151}
2233 2152
@@ -2237,7 +2156,6 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
2237 int status) 2156 int status)
2238{ 2157{
2239 unsigned long flags; 2158 unsigned long flags;
2240 mlog_entry_void();
2241 2159
2242 spin_lock_irqsave(&lockres->l_lock, flags); 2160 spin_lock_irqsave(&lockres->l_lock, flags);
2243 lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING); 2161 lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
@@ -2246,8 +2164,6 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
2246 spin_unlock_irqrestore(&lockres->l_lock, flags); 2164 spin_unlock_irqrestore(&lockres->l_lock, flags);
2247 2165
2248 wake_up(&lockres->l_event); 2166 wake_up(&lockres->l_event);
2249
2250 mlog_exit_void();
2251} 2167}
2252 2168
2253/* may or may not return a bh if it went to disk. */ 2169/* may or may not return a bh if it went to disk. */
@@ -2260,8 +2176,6 @@ static int ocfs2_inode_lock_update(struct inode *inode,
2260 struct ocfs2_dinode *fe; 2176 struct ocfs2_dinode *fe;
2261 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2177 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2262 2178
2263 mlog_entry_void();
2264
2265 if (ocfs2_mount_local(osb)) 2179 if (ocfs2_mount_local(osb))
2266 goto bail; 2180 goto bail;
2267 2181
@@ -2330,7 +2244,6 @@ static int ocfs2_inode_lock_update(struct inode *inode,
2330bail_refresh: 2244bail_refresh:
2331 ocfs2_complete_lock_res_refresh(lockres, status); 2245 ocfs2_complete_lock_res_refresh(lockres, status);
2332bail: 2246bail:
2333 mlog_exit(status);
2334 return status; 2247 return status;
2335} 2248}
2336 2249
@@ -2374,8 +2287,6 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
2374 2287
2375 BUG_ON(!inode); 2288 BUG_ON(!inode);
2376 2289
2377 mlog_entry_void();
2378
2379 mlog(0, "inode %llu, take %s META lock\n", 2290 mlog(0, "inode %llu, take %s META lock\n",
2380 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2291 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2381 ex ? "EXMODE" : "PRMODE"); 2292 ex ? "EXMODE" : "PRMODE");
@@ -2467,7 +2378,6 @@ bail:
2467 if (local_bh) 2378 if (local_bh)
2468 brelse(local_bh); 2379 brelse(local_bh);
2469 2380
2470 mlog_exit(status);
2471 return status; 2381 return status;
2472} 2382}
2473 2383
@@ -2517,7 +2427,6 @@ int ocfs2_inode_lock_atime(struct inode *inode,
2517{ 2427{
2518 int ret; 2428 int ret;
2519 2429
2520 mlog_entry_void();
2521 ret = ocfs2_inode_lock(inode, NULL, 0); 2430 ret = ocfs2_inode_lock(inode, NULL, 0);
2522 if (ret < 0) { 2431 if (ret < 0) {
2523 mlog_errno(ret); 2432 mlog_errno(ret);
@@ -2545,7 +2454,6 @@ int ocfs2_inode_lock_atime(struct inode *inode,
2545 } else 2454 } else
2546 *level = 0; 2455 *level = 0;
2547 2456
2548 mlog_exit(ret);
2549 return ret; 2457 return ret;
2550} 2458}
2551 2459
@@ -2556,8 +2464,6 @@ void ocfs2_inode_unlock(struct inode *inode,
2556 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres; 2464 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
2557 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2465 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2558 2466
2559 mlog_entry_void();
2560
2561 mlog(0, "inode %llu drop %s META lock\n", 2467 mlog(0, "inode %llu drop %s META lock\n",
2562 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2468 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2563 ex ? "EXMODE" : "PRMODE"); 2469 ex ? "EXMODE" : "PRMODE");
@@ -2565,8 +2471,6 @@ void ocfs2_inode_unlock(struct inode *inode,
2565 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) && 2471 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
2566 !ocfs2_mount_local(osb)) 2472 !ocfs2_mount_local(osb))
2567 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); 2473 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
2568
2569 mlog_exit_void();
2570} 2474}
2571 2475
2572int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno) 2476int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
@@ -2617,8 +2521,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
2617 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; 2521 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2618 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; 2522 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
2619 2523
2620 mlog_entry_void();
2621
2622 if (ocfs2_is_hard_readonly(osb)) 2524 if (ocfs2_is_hard_readonly(osb))
2623 return -EROFS; 2525 return -EROFS;
2624 2526
@@ -2650,7 +2552,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
2650 ocfs2_track_lock_refresh(lockres); 2552 ocfs2_track_lock_refresh(lockres);
2651 } 2553 }
2652bail: 2554bail:
2653 mlog_exit(status);
2654 return status; 2555 return status;
2655} 2556}
2656 2557
@@ -2869,8 +2770,15 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
2869 return iter; 2770 return iter;
2870} 2771}
2871 2772
2872/* So that debugfs.ocfs2 can determine which format is being used */ 2773/*
2873#define OCFS2_DLM_DEBUG_STR_VERSION 2 2774 * Version is used by debugfs.ocfs2 to determine the format being used
2775 *
2776 * New in version 2
2777 * - Lock stats printed
2778 * New in version 3
2779 * - Max time in lock stats is in usecs (instead of nsecs)
2780 */
2781#define OCFS2_DLM_DEBUG_STR_VERSION 3
2874static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) 2782static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
2875{ 2783{
2876 int i; 2784 int i;
@@ -2912,18 +2820,18 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
2912 seq_printf(m, "0x%x\t", lvb[i]); 2820 seq_printf(m, "0x%x\t", lvb[i]);
2913 2821
2914#ifdef CONFIG_OCFS2_FS_STATS 2822#ifdef CONFIG_OCFS2_FS_STATS
2915# define lock_num_prmode(_l) (_l)->l_lock_num_prmode 2823# define lock_num_prmode(_l) ((_l)->l_lock_prmode.ls_gets)
2916# define lock_num_exmode(_l) (_l)->l_lock_num_exmode 2824# define lock_num_exmode(_l) ((_l)->l_lock_exmode.ls_gets)
2917# define lock_num_prmode_failed(_l) (_l)->l_lock_num_prmode_failed 2825# define lock_num_prmode_failed(_l) ((_l)->l_lock_prmode.ls_fail)
2918# define lock_num_exmode_failed(_l) (_l)->l_lock_num_exmode_failed 2826# define lock_num_exmode_failed(_l) ((_l)->l_lock_exmode.ls_fail)
2919# define lock_total_prmode(_l) (_l)->l_lock_total_prmode 2827# define lock_total_prmode(_l) ((_l)->l_lock_prmode.ls_total)
2920# define lock_total_exmode(_l) (_l)->l_lock_total_exmode 2828# define lock_total_exmode(_l) ((_l)->l_lock_exmode.ls_total)
2921# define lock_max_prmode(_l) (_l)->l_lock_max_prmode 2829# define lock_max_prmode(_l) ((_l)->l_lock_prmode.ls_max)
2922# define lock_max_exmode(_l) (_l)->l_lock_max_exmode 2830# define lock_max_exmode(_l) ((_l)->l_lock_exmode.ls_max)
2923# define lock_refresh(_l) (_l)->l_lock_refresh 2831# define lock_refresh(_l) ((_l)->l_lock_refresh)
2924#else 2832#else
2925# define lock_num_prmode(_l) (0ULL) 2833# define lock_num_prmode(_l) (0)
2926# define lock_num_exmode(_l) (0ULL) 2834# define lock_num_exmode(_l) (0)
2927# define lock_num_prmode_failed(_l) (0) 2835# define lock_num_prmode_failed(_l) (0)
2928# define lock_num_exmode_failed(_l) (0) 2836# define lock_num_exmode_failed(_l) (0)
2929# define lock_total_prmode(_l) (0ULL) 2837# define lock_total_prmode(_l) (0ULL)
@@ -2933,8 +2841,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
2933# define lock_refresh(_l) (0) 2841# define lock_refresh(_l) (0)
2934#endif 2842#endif
2935 /* The following seq_print was added in version 2 of this output */ 2843 /* The following seq_print was added in version 2 of this output */
2936 seq_printf(m, "%llu\t" 2844 seq_printf(m, "%u\t"
2937 "%llu\t" 2845 "%u\t"
2938 "%u\t" 2846 "%u\t"
2939 "%u\t" 2847 "%u\t"
2940 "%llu\t" 2848 "%llu\t"
@@ -3054,8 +2962,6 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
3054 int status = 0; 2962 int status = 0;
3055 struct ocfs2_cluster_connection *conn = NULL; 2963 struct ocfs2_cluster_connection *conn = NULL;
3056 2964
3057 mlog_entry_void();
3058
3059 if (ocfs2_mount_local(osb)) { 2965 if (ocfs2_mount_local(osb)) {
3060 osb->node_num = 0; 2966 osb->node_num = 0;
3061 goto local; 2967 goto local;
@@ -3112,15 +3018,12 @@ bail:
3112 kthread_stop(osb->dc_task); 3018 kthread_stop(osb->dc_task);
3113 } 3019 }
3114 3020
3115 mlog_exit(status);
3116 return status; 3021 return status;
3117} 3022}
3118 3023
3119void ocfs2_dlm_shutdown(struct ocfs2_super *osb, 3024void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
3120 int hangup_pending) 3025 int hangup_pending)
3121{ 3026{
3122 mlog_entry_void();
3123
3124 ocfs2_drop_osb_locks(osb); 3027 ocfs2_drop_osb_locks(osb);
3125 3028
3126 /* 3029 /*
@@ -3143,8 +3046,6 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
3143 osb->cconn = NULL; 3046 osb->cconn = NULL;
3144 3047
3145 ocfs2_dlm_shutdown_debug(osb); 3048 ocfs2_dlm_shutdown_debug(osb);
3146
3147 mlog_exit_void();
3148} 3049}
3149 3050
3150static int ocfs2_drop_lock(struct ocfs2_super *osb, 3051static int ocfs2_drop_lock(struct ocfs2_super *osb,
@@ -3226,7 +3127,6 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
3226 3127
3227 ocfs2_wait_on_busy_lock(lockres); 3128 ocfs2_wait_on_busy_lock(lockres);
3228out: 3129out:
3229 mlog_exit(0);
3230 return 0; 3130 return 0;
3231} 3131}
3232 3132
@@ -3284,8 +3184,6 @@ int ocfs2_drop_inode_locks(struct inode *inode)
3284{ 3184{
3285 int status, err; 3185 int status, err;
3286 3186
3287 mlog_entry_void();
3288
3289 /* No need to call ocfs2_mark_lockres_freeing here - 3187 /* No need to call ocfs2_mark_lockres_freeing here -
3290 * ocfs2_clear_inode has done it for us. */ 3188 * ocfs2_clear_inode has done it for us. */
3291 3189
@@ -3310,7 +3208,6 @@ int ocfs2_drop_inode_locks(struct inode *inode)
3310 if (err < 0 && !status) 3208 if (err < 0 && !status)
3311 status = err; 3209 status = err;
3312 3210
3313 mlog_exit(status);
3314 return status; 3211 return status;
3315} 3212}
3316 3213
@@ -3352,8 +3249,6 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3352 int ret; 3249 int ret;
3353 u32 dlm_flags = DLM_LKF_CONVERT; 3250 u32 dlm_flags = DLM_LKF_CONVERT;
3354 3251
3355 mlog_entry_void();
3356
3357 mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name, 3252 mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
3358 lockres->l_level, new_level); 3253 lockres->l_level, new_level);
3359 3254
@@ -3375,7 +3270,6 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3375 3270
3376 ret = 0; 3271 ret = 0;
3377bail: 3272bail:
3378 mlog_exit(ret);
3379 return ret; 3273 return ret;
3380} 3274}
3381 3275
@@ -3385,8 +3279,6 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
3385{ 3279{
3386 assert_spin_locked(&lockres->l_lock); 3280 assert_spin_locked(&lockres->l_lock);
3387 3281
3388 mlog_entry_void();
3389
3390 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { 3282 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
3391 /* If we're already trying to cancel a lock conversion 3283 /* If we're already trying to cancel a lock conversion
3392 * then just drop the spinlock and allow the caller to 3284 * then just drop the spinlock and allow the caller to
@@ -3416,8 +3308,6 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
3416{ 3308{
3417 int ret; 3309 int ret;
3418 3310
3419 mlog_entry_void();
3420
3421 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, 3311 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
3422 DLM_LKF_CANCEL); 3312 DLM_LKF_CANCEL);
3423 if (ret) { 3313 if (ret) {
@@ -3427,7 +3317,6 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
3427 3317
3428 mlog(ML_BASTS, "lockres %s\n", lockres->l_name); 3318 mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
3429 3319
3430 mlog_exit(ret);
3431 return ret; 3320 return ret;
3432} 3321}
3433 3322
@@ -3443,8 +3332,6 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
3443 int set_lvb = 0; 3332 int set_lvb = 0;
3444 unsigned int gen; 3333 unsigned int gen;
3445 3334
3446 mlog_entry_void();
3447
3448 spin_lock_irqsave(&lockres->l_lock, flags); 3335 spin_lock_irqsave(&lockres->l_lock, flags);
3449 3336
3450recheck: 3337recheck:
@@ -3619,14 +3506,14 @@ downconvert:
3619 gen); 3506 gen);
3620 3507
3621leave: 3508leave:
3622 mlog_exit(ret); 3509 if (ret)
3510 mlog_errno(ret);
3623 return ret; 3511 return ret;
3624 3512
3625leave_requeue: 3513leave_requeue:
3626 spin_unlock_irqrestore(&lockres->l_lock, flags); 3514 spin_unlock_irqrestore(&lockres->l_lock, flags);
3627 ctl->requeue = 1; 3515 ctl->requeue = 1;
3628 3516
3629 mlog_exit(0);
3630 return 0; 3517 return 0;
3631} 3518}
3632 3519
@@ -3859,8 +3746,6 @@ static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
3859 struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb, 3746 struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
3860 oinfo->dqi_gi.dqi_type); 3747 oinfo->dqi_gi.dqi_type);
3861 3748
3862 mlog_entry_void();
3863
3864 lvb = ocfs2_dlm_lvb(&lockres->l_lksb); 3749 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
3865 lvb->lvb_version = OCFS2_QINFO_LVB_VERSION; 3750 lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
3866 lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace); 3751 lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
@@ -3869,8 +3754,6 @@ static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
3869 lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks); 3754 lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
3870 lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk); 3755 lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
3871 lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry); 3756 lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
3872
3873 mlog_exit_void();
3874} 3757}
3875 3758
3876void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex) 3759void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
@@ -3879,10 +3762,8 @@ void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
3879 struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb); 3762 struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
3880 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; 3763 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3881 3764
3882 mlog_entry_void();
3883 if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) 3765 if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
3884 ocfs2_cluster_unlock(osb, lockres, level); 3766 ocfs2_cluster_unlock(osb, lockres, level);
3885 mlog_exit_void();
3886} 3767}
3887 3768
3888static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo) 3769static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
@@ -3937,8 +3818,6 @@ int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
3937 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; 3818 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3938 int status = 0; 3819 int status = 0;
3939 3820
3940 mlog_entry_void();
3941
3942 /* On RO devices, locking really isn't needed... */ 3821 /* On RO devices, locking really isn't needed... */
3943 if (ocfs2_is_hard_readonly(osb)) { 3822 if (ocfs2_is_hard_readonly(osb)) {
3944 if (ex) 3823 if (ex)
@@ -3961,7 +3840,6 @@ int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
3961 ocfs2_qinfo_unlock(oinfo, ex); 3840 ocfs2_qinfo_unlock(oinfo, ex);
3962 ocfs2_complete_lock_res_refresh(lockres, status); 3841 ocfs2_complete_lock_res_refresh(lockres, status);
3963bail: 3842bail:
3964 mlog_exit(status);
3965 return status; 3843 return status;
3966} 3844}
3967 3845
@@ -4007,8 +3885,6 @@ static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
4007 * considered valid until we remove the OCFS2_LOCK_QUEUED 3885 * considered valid until we remove the OCFS2_LOCK_QUEUED
4008 * flag. */ 3886 * flag. */
4009 3887
4010 mlog_entry_void();
4011
4012 BUG_ON(!lockres); 3888 BUG_ON(!lockres);
4013 BUG_ON(!lockres->l_ops); 3889 BUG_ON(!lockres->l_ops);
4014 3890
@@ -4042,15 +3918,11 @@ unqueue:
4042 if (ctl.unblock_action != UNBLOCK_CONTINUE 3918 if (ctl.unblock_action != UNBLOCK_CONTINUE
4043 && lockres->l_ops->post_unlock) 3919 && lockres->l_ops->post_unlock)
4044 lockres->l_ops->post_unlock(osb, lockres); 3920 lockres->l_ops->post_unlock(osb, lockres);
4045
4046 mlog_exit_void();
4047} 3921}
4048 3922
4049static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, 3923static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
4050 struct ocfs2_lock_res *lockres) 3924 struct ocfs2_lock_res *lockres)
4051{ 3925{
4052 mlog_entry_void();
4053
4054 assert_spin_locked(&lockres->l_lock); 3926 assert_spin_locked(&lockres->l_lock);
4055 3927
4056 if (lockres->l_flags & OCFS2_LOCK_FREEING) { 3928 if (lockres->l_flags & OCFS2_LOCK_FREEING) {
@@ -4071,8 +3943,6 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
4071 osb->blocked_lock_count++; 3943 osb->blocked_lock_count++;
4072 } 3944 }
4073 spin_unlock(&osb->dc_task_lock); 3945 spin_unlock(&osb->dc_task_lock);
4074
4075 mlog_exit_void();
4076} 3946}
4077 3947
4078static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb) 3948static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
@@ -4080,8 +3950,6 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
4080 unsigned long processed; 3950 unsigned long processed;
4081 struct ocfs2_lock_res *lockres; 3951 struct ocfs2_lock_res *lockres;
4082 3952
4083 mlog_entry_void();
4084
4085 spin_lock(&osb->dc_task_lock); 3953 spin_lock(&osb->dc_task_lock);
4086 /* grab this early so we know to try again if a state change and 3954 /* grab this early so we know to try again if a state change and
4087 * wake happens part-way through our work */ 3955 * wake happens part-way through our work */
@@ -4105,8 +3973,6 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
4105 spin_lock(&osb->dc_task_lock); 3973 spin_lock(&osb->dc_task_lock);
4106 } 3974 }
4107 spin_unlock(&osb->dc_task_lock); 3975 spin_unlock(&osb->dc_task_lock);
4108
4109 mlog_exit_void();
4110} 3976}
4111 3977
4112static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb) 3978static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 5dbc3062b4fd..745db42528d5 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -26,7 +26,6 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28 28
29#define MLOG_MASK_PREFIX ML_EXPORT
30#include <cluster/masklog.h> 29#include <cluster/masklog.h>
31 30
32#include "ocfs2.h" 31#include "ocfs2.h"
@@ -40,6 +39,7 @@
40 39
41#include "buffer_head_io.h" 40#include "buffer_head_io.h"
42#include "suballoc.h" 41#include "suballoc.h"
42#include "ocfs2_trace.h"
43 43
44struct ocfs2_inode_handle 44struct ocfs2_inode_handle
45{ 45{
@@ -56,10 +56,9 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
56 int status, set; 56 int status, set;
57 struct dentry *result; 57 struct dentry *result;
58 58
59 mlog_entry("(0x%p, 0x%p)\n", sb, handle); 59 trace_ocfs2_get_dentry_begin(sb, handle, (unsigned long long)blkno);
60 60
61 if (blkno == 0) { 61 if (blkno == 0) {
62 mlog(0, "nfs wants inode with blkno: 0\n");
63 result = ERR_PTR(-ESTALE); 62 result = ERR_PTR(-ESTALE);
64 goto bail; 63 goto bail;
65 } 64 }
@@ -83,6 +82,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
83 } 82 }
84 83
85 status = ocfs2_test_inode_bit(osb, blkno, &set); 84 status = ocfs2_test_inode_bit(osb, blkno, &set);
85 trace_ocfs2_get_dentry_test_bit(status, set);
86 if (status < 0) { 86 if (status < 0) {
87 if (status == -EINVAL) { 87 if (status == -EINVAL) {
88 /* 88 /*
@@ -90,18 +90,14 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
90 * as an inode, we return -ESTALE to be 90 * as an inode, we return -ESTALE to be
91 * nice 91 * nice
92 */ 92 */
93 mlog(0, "test inode bit failed %d\n", status);
94 status = -ESTALE; 93 status = -ESTALE;
95 } else { 94 } else
96 mlog(ML_ERROR, "test inode bit failed %d\n", status); 95 mlog(ML_ERROR, "test inode bit failed %d\n", status);
97 }
98 goto unlock_nfs_sync; 96 goto unlock_nfs_sync;
99 } 97 }
100 98
101 /* If the inode allocator bit is clear, this inode must be stale */ 99 /* If the inode allocator bit is clear, this inode must be stale */
102 if (!set) { 100 if (!set) {
103 mlog(0, "inode %llu suballoc bit is clear\n",
104 (unsigned long long)blkno);
105 status = -ESTALE; 101 status = -ESTALE;
106 goto unlock_nfs_sync; 102 goto unlock_nfs_sync;
107 } 103 }
@@ -114,8 +110,8 @@ unlock_nfs_sync:
114check_err: 110check_err:
115 if (status < 0) { 111 if (status < 0) {
116 if (status == -ESTALE) { 112 if (status == -ESTALE) {
117 mlog(0, "stale inode ino: %llu generation: %u\n", 113 trace_ocfs2_get_dentry_stale((unsigned long long)blkno,
118 (unsigned long long)blkno, handle->ih_generation); 114 handle->ih_generation);
119 } 115 }
120 result = ERR_PTR(status); 116 result = ERR_PTR(status);
121 goto bail; 117 goto bail;
@@ -130,8 +126,9 @@ check_err:
130check_gen: 126check_gen:
131 if (handle->ih_generation != inode->i_generation) { 127 if (handle->ih_generation != inode->i_generation) {
132 iput(inode); 128 iput(inode);
133 mlog(0, "stale inode ino: %llu generation: %u\n", 129 trace_ocfs2_get_dentry_generation((unsigned long long)blkno,
134 (unsigned long long)blkno, handle->ih_generation); 130 handle->ih_generation,
131 inode->i_generation);
135 result = ERR_PTR(-ESTALE); 132 result = ERR_PTR(-ESTALE);
136 goto bail; 133 goto bail;
137 } 134 }
@@ -141,7 +138,7 @@ check_gen:
141 mlog_errno(PTR_ERR(result)); 138 mlog_errno(PTR_ERR(result));
142 139
143bail: 140bail:
144 mlog_exit_ptr(result); 141 trace_ocfs2_get_dentry_end(result);
145 return result; 142 return result;
146} 143}
147 144
@@ -152,11 +149,8 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
152 struct dentry *parent; 149 struct dentry *parent;
153 struct inode *dir = child->d_inode; 150 struct inode *dir = child->d_inode;
154 151
155 mlog_entry("(0x%p, '%.*s')\n", child, 152 trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name,
156 child->d_name.len, child->d_name.name); 153 (unsigned long long)OCFS2_I(dir)->ip_blkno);
157
158 mlog(0, "find parent of directory %llu\n",
159 (unsigned long long)OCFS2_I(dir)->ip_blkno);
160 154
161 status = ocfs2_inode_lock(dir, NULL, 0); 155 status = ocfs2_inode_lock(dir, NULL, 0);
162 if (status < 0) { 156 if (status < 0) {
@@ -178,7 +172,7 @@ bail_unlock:
178 ocfs2_inode_unlock(dir, 0); 172 ocfs2_inode_unlock(dir, 0);
179 173
180bail: 174bail:
181 mlog_exit_ptr(parent); 175 trace_ocfs2_get_parent_end(parent);
182 176
183 return parent; 177 return parent;
184} 178}
@@ -193,12 +187,16 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
193 u32 generation; 187 u32 generation;
194 __le32 *fh = (__force __le32 *) fh_in; 188 __le32 *fh = (__force __le32 *) fh_in;
195 189
196 mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry, 190 trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len,
197 dentry->d_name.len, dentry->d_name.name, 191 dentry->d_name.name,
198 fh, len, connectable); 192 fh, len, connectable);
199 193
200 if (len < 3 || (connectable && len < 6)) { 194 if (connectable && (len < 6)) {
201 mlog(ML_ERROR, "fh buffer is too small for encoding\n"); 195 *max_len = 6;
196 type = 255;
197 goto bail;
198 } else if (len < 3) {
199 *max_len = 3;
202 type = 255; 200 type = 255;
203 goto bail; 201 goto bail;
204 } 202 }
@@ -206,8 +204,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
206 blkno = OCFS2_I(inode)->ip_blkno; 204 blkno = OCFS2_I(inode)->ip_blkno;
207 generation = inode->i_generation; 205 generation = inode->i_generation;
208 206
209 mlog(0, "Encoding fh: blkno: %llu, generation: %u\n", 207 trace_ocfs2_encode_fh_self((unsigned long long)blkno, generation);
210 (unsigned long long)blkno, generation);
211 208
212 len = 3; 209 len = 3;
213 fh[0] = cpu_to_le32((u32)(blkno >> 32)); 210 fh[0] = cpu_to_le32((u32)(blkno >> 32));
@@ -232,14 +229,14 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
232 len = 6; 229 len = 6;
233 type = 2; 230 type = 2;
234 231
235 mlog(0, "Encoding parent: blkno: %llu, generation: %u\n", 232 trace_ocfs2_encode_fh_parent((unsigned long long)blkno,
236 (unsigned long long)blkno, generation); 233 generation);
237 } 234 }
238 235
239 *max_len = len; 236 *max_len = len;
240 237
241bail: 238bail:
242 mlog_exit(type); 239 trace_ocfs2_encode_fh_type(type);
243 return type; 240 return type;
244} 241}
245 242
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 09e3fdfa6d33..23457b491e8c 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -28,7 +28,6 @@
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/fiemap.h> 29#include <linux/fiemap.h>
30 30
31#define MLOG_MASK_PREFIX ML_EXTENT_MAP
32#include <cluster/masklog.h> 31#include <cluster/masklog.h>
33 32
34#include "ocfs2.h" 33#include "ocfs2.h"
@@ -39,6 +38,7 @@
39#include "inode.h" 38#include "inode.h"
40#include "super.h" 39#include "super.h"
41#include "symlink.h" 40#include "symlink.h"
41#include "ocfs2_trace.h"
42 42
43#include "buffer_head_io.h" 43#include "buffer_head_io.h"
44 44
@@ -841,10 +841,9 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
841 u64 p_block, p_count; 841 u64 p_block, p_count;
842 int i, count, done = 0; 842 int i, count, done = 0;
843 843
844 mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, " 844 trace_ocfs2_read_virt_blocks(
845 "flags = %x, validate = %p)\n", 845 inode, (unsigned long long)v_block, nr, bhs, flags,
846 inode, (unsigned long long)v_block, nr, bhs, flags, 846 validate);
847 validate);
848 847
849 if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >= 848 if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
850 i_size_read(inode)) { 849 i_size_read(inode)) {
@@ -897,7 +896,6 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
897 } 896 }
898 897
899out: 898out:
900 mlog_exit(rc);
901 return rc; 899 return rc;
902} 900}
903 901
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a6651956482e..41565ae52856 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -38,7 +38,6 @@
38#include <linux/quotaops.h> 38#include <linux/quotaops.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40 40
41#define MLOG_MASK_PREFIX ML_INODE
42#include <cluster/masklog.h> 41#include <cluster/masklog.h>
43 42
44#include "ocfs2.h" 43#include "ocfs2.h"
@@ -61,6 +60,7 @@
61#include "acl.h" 60#include "acl.h"
62#include "quota.h" 61#include "quota.h"
63#include "refcounttree.h" 62#include "refcounttree.h"
63#include "ocfs2_trace.h"
64 64
65#include "buffer_head_io.h" 65#include "buffer_head_io.h"
66 66
@@ -99,8 +99,10 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
99 int mode = file->f_flags; 99 int mode = file->f_flags;
100 struct ocfs2_inode_info *oi = OCFS2_I(inode); 100 struct ocfs2_inode_info *oi = OCFS2_I(inode);
101 101
102 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 102 trace_ocfs2_file_open(inode, file, file->f_path.dentry,
103 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); 103 (unsigned long long)OCFS2_I(inode)->ip_blkno,
104 file->f_path.dentry->d_name.len,
105 file->f_path.dentry->d_name.name, mode);
104 106
105 if (file->f_mode & FMODE_WRITE) 107 if (file->f_mode & FMODE_WRITE)
106 dquot_initialize(inode); 108 dquot_initialize(inode);
@@ -135,7 +137,6 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
135 } 137 }
136 138
137leave: 139leave:
138 mlog_exit(status);
139 return status; 140 return status;
140} 141}
141 142
@@ -143,19 +144,19 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
143{ 144{
144 struct ocfs2_inode_info *oi = OCFS2_I(inode); 145 struct ocfs2_inode_info *oi = OCFS2_I(inode);
145 146
146 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
147 file->f_path.dentry->d_name.len,
148 file->f_path.dentry->d_name.name);
149
150 spin_lock(&oi->ip_lock); 147 spin_lock(&oi->ip_lock);
151 if (!--oi->ip_open_count) 148 if (!--oi->ip_open_count)
152 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 149 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
150
151 trace_ocfs2_file_release(inode, file, file->f_path.dentry,
152 oi->ip_blkno,
153 file->f_path.dentry->d_name.len,
154 file->f_path.dentry->d_name.name,
155 oi->ip_open_count);
153 spin_unlock(&oi->ip_lock); 156 spin_unlock(&oi->ip_lock);
154 157
155 ocfs2_free_file_private(inode, file); 158 ocfs2_free_file_private(inode, file);
156 159
157 mlog_exit(0);
158
159 return 0; 160 return 0;
160} 161}
161 162
@@ -177,9 +178,11 @@ static int ocfs2_sync_file(struct file *file, int datasync)
177 struct inode *inode = file->f_mapping->host; 178 struct inode *inode = file->f_mapping->host;
178 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 179 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
179 180
180 mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync, 181 trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
181 file->f_path.dentry, file->f_path.dentry->d_name.len, 182 OCFS2_I(inode)->ip_blkno,
182 file->f_path.dentry->d_name.name); 183 file->f_path.dentry->d_name.len,
184 file->f_path.dentry->d_name.name,
185 (unsigned long long)datasync);
183 186
184 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { 187 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
185 /* 188 /*
@@ -195,7 +198,8 @@ static int ocfs2_sync_file(struct file *file, int datasync)
195 err = jbd2_journal_force_commit(journal); 198 err = jbd2_journal_force_commit(journal);
196 199
197bail: 200bail:
198 mlog_exit(err); 201 if (err)
202 mlog_errno(err);
199 203
200 return (err < 0) ? -EIO : 0; 204 return (err < 0) ? -EIO : 0;
201} 205}
@@ -251,8 +255,6 @@ int ocfs2_update_inode_atime(struct inode *inode,
251 handle_t *handle; 255 handle_t *handle;
252 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; 256 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
253 257
254 mlog_entry_void();
255
256 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 258 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
257 if (IS_ERR(handle)) { 259 if (IS_ERR(handle)) {
258 ret = PTR_ERR(handle); 260 ret = PTR_ERR(handle);
@@ -280,7 +282,6 @@ int ocfs2_update_inode_atime(struct inode *inode,
280out_commit: 282out_commit:
281 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 283 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
282out: 284out:
283 mlog_exit(ret);
284 return ret; 285 return ret;
285} 286}
286 287
@@ -291,7 +292,6 @@ static int ocfs2_set_inode_size(handle_t *handle,
291{ 292{
292 int status; 293 int status;
293 294
294 mlog_entry_void();
295 i_size_write(inode, new_i_size); 295 i_size_write(inode, new_i_size);
296 inode->i_blocks = ocfs2_inode_sector_count(inode); 296 inode->i_blocks = ocfs2_inode_sector_count(inode);
297 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 297 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -303,7 +303,6 @@ static int ocfs2_set_inode_size(handle_t *handle,
303 } 303 }
304 304
305bail: 305bail:
306 mlog_exit(status);
307 return status; 306 return status;
308} 307}
309 308
@@ -375,8 +374,6 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
375 struct ocfs2_dinode *di; 374 struct ocfs2_dinode *di;
376 u64 cluster_bytes; 375 u64 cluster_bytes;
377 376
378 mlog_entry_void();
379
380 /* 377 /*
381 * We need to CoW the cluster contains the offset if it is reflinked 378 * We need to CoW the cluster contains the offset if it is reflinked
382 * since we will call ocfs2_zero_range_for_truncate later which will 379 * since we will call ocfs2_zero_range_for_truncate later which will
@@ -429,8 +426,6 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
429out_commit: 426out_commit:
430 ocfs2_commit_trans(osb, handle); 427 ocfs2_commit_trans(osb, handle);
431out: 428out:
432
433 mlog_exit(status);
434 return status; 429 return status;
435} 430}
436 431
@@ -442,14 +437,14 @@ static int ocfs2_truncate_file(struct inode *inode,
442 struct ocfs2_dinode *fe = NULL; 437 struct ocfs2_dinode *fe = NULL;
443 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 438 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
444 439
445 mlog_entry("(inode = %llu, new_i_size = %llu\n",
446 (unsigned long long)OCFS2_I(inode)->ip_blkno,
447 (unsigned long long)new_i_size);
448
449 /* We trust di_bh because it comes from ocfs2_inode_lock(), which 440 /* We trust di_bh because it comes from ocfs2_inode_lock(), which
450 * already validated it */ 441 * already validated it */
451 fe = (struct ocfs2_dinode *) di_bh->b_data; 442 fe = (struct ocfs2_dinode *) di_bh->b_data;
452 443
444 trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
445 (unsigned long long)le64_to_cpu(fe->i_size),
446 (unsigned long long)new_i_size);
447
453 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 448 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
454 "Inode %llu, inode i_size = %lld != di " 449 "Inode %llu, inode i_size = %lld != di "
455 "i_size = %llu, i_flags = 0x%x\n", 450 "i_size = %llu, i_flags = 0x%x\n",
@@ -459,19 +454,14 @@ static int ocfs2_truncate_file(struct inode *inode,
459 le32_to_cpu(fe->i_flags)); 454 le32_to_cpu(fe->i_flags));
460 455
461 if (new_i_size > le64_to_cpu(fe->i_size)) { 456 if (new_i_size > le64_to_cpu(fe->i_size)) {
462 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", 457 trace_ocfs2_truncate_file_error(
463 (unsigned long long)le64_to_cpu(fe->i_size), 458 (unsigned long long)le64_to_cpu(fe->i_size),
464 (unsigned long long)new_i_size); 459 (unsigned long long)new_i_size);
465 status = -EINVAL; 460 status = -EINVAL;
466 mlog_errno(status); 461 mlog_errno(status);
467 goto bail; 462 goto bail;
468 } 463 }
469 464
470 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
471 (unsigned long long)le64_to_cpu(fe->i_blkno),
472 (unsigned long long)le64_to_cpu(fe->i_size),
473 (unsigned long long)new_i_size);
474
475 /* lets handle the simple truncate cases before doing any more 465 /* lets handle the simple truncate cases before doing any more
476 * cluster locking. */ 466 * cluster locking. */
477 if (new_i_size == le64_to_cpu(fe->i_size)) 467 if (new_i_size == le64_to_cpu(fe->i_size))
@@ -525,7 +515,6 @@ bail:
525 if (!status && OCFS2_I(inode)->ip_clusters == 0) 515 if (!status && OCFS2_I(inode)->ip_clusters == 0)
526 status = ocfs2_try_remove_refcount_tree(inode, di_bh); 516 status = ocfs2_try_remove_refcount_tree(inode, di_bh);
527 517
528 mlog_exit(status);
529 return status; 518 return status;
530} 519}
531 520
@@ -578,8 +567,6 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
578 struct ocfs2_extent_tree et; 567 struct ocfs2_extent_tree et;
579 int did_quota = 0; 568 int did_quota = 0;
580 569
581 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
582
583 /* 570 /*
584 * This function only exists for file systems which don't 571 * This function only exists for file systems which don't
585 * support holes. 572 * support holes.
@@ -596,11 +583,6 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
596restart_all: 583restart_all:
597 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 584 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
598 585
599 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
600 "clusters_to_add = %u\n",
601 (unsigned long long)OCFS2_I(inode)->ip_blkno,
602 (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
603 clusters_to_add);
604 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh); 586 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
605 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, 587 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
606 &data_ac, &meta_ac); 588 &data_ac, &meta_ac);
@@ -620,6 +602,12 @@ restart_all:
620 } 602 }
621 603
622restarted_transaction: 604restarted_transaction:
605 trace_ocfs2_extend_allocation(
606 (unsigned long long)OCFS2_I(inode)->ip_blkno,
607 (unsigned long long)i_size_read(inode),
608 le32_to_cpu(fe->i_clusters), clusters_to_add,
609 why, restart_func);
610
623 status = dquot_alloc_space_nodirty(inode, 611 status = dquot_alloc_space_nodirty(inode,
624 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 612 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
625 if (status) 613 if (status)
@@ -666,13 +654,11 @@ restarted_transaction:
666 654
667 if (why != RESTART_NONE && clusters_to_add) { 655 if (why != RESTART_NONE && clusters_to_add) {
668 if (why == RESTART_META) { 656 if (why == RESTART_META) {
669 mlog(0, "restarting function.\n");
670 restart_func = 1; 657 restart_func = 1;
671 status = 0; 658 status = 0;
672 } else { 659 } else {
673 BUG_ON(why != RESTART_TRANS); 660 BUG_ON(why != RESTART_TRANS);
674 661
675 mlog(0, "restarting transaction.\n");
676 /* TODO: This can be more intelligent. */ 662 /* TODO: This can be more intelligent. */
677 credits = ocfs2_calc_extend_credits(osb->sb, 663 credits = ocfs2_calc_extend_credits(osb->sb,
678 &fe->id2.i_list, 664 &fe->id2.i_list,
@@ -689,11 +675,11 @@ restarted_transaction:
689 } 675 }
690 } 676 }
691 677
692 mlog(0, "fe: i_clusters = %u, i_size=%llu\n", 678 trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
693 le32_to_cpu(fe->i_clusters), 679 le32_to_cpu(fe->i_clusters),
694 (unsigned long long)le64_to_cpu(fe->i_size)); 680 (unsigned long long)le64_to_cpu(fe->i_size),
695 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 681 OCFS2_I(inode)->ip_clusters,
696 OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode)); 682 (unsigned long long)i_size_read(inode));
697 683
698leave: 684leave:
699 if (status < 0 && did_quota) 685 if (status < 0 && did_quota)
@@ -718,7 +704,6 @@ leave:
718 brelse(bh); 704 brelse(bh);
719 bh = NULL; 705 bh = NULL;
720 706
721 mlog_exit(status);
722 return status; 707 return status;
723} 708}
724 709
@@ -785,10 +770,11 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
785 if (!zero_to) 770 if (!zero_to)
786 zero_to = PAGE_CACHE_SIZE; 771 zero_to = PAGE_CACHE_SIZE;
787 772
788 mlog(0, 773 trace_ocfs2_write_zero_page(
789 "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n", 774 (unsigned long long)OCFS2_I(inode)->ip_blkno,
790 (unsigned long long)abs_from, (unsigned long long)abs_to, 775 (unsigned long long)abs_from,
791 index, zero_from, zero_to); 776 (unsigned long long)abs_to,
777 index, zero_from, zero_to);
792 778
793 /* We know that zero_from is block aligned */ 779 /* We know that zero_from is block aligned */
794 for (block_start = zero_from; block_start < zero_to; 780 for (block_start = zero_from; block_start < zero_to;
@@ -928,9 +914,10 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
928 u64 next_pos; 914 u64 next_pos;
929 u64 zero_pos = range_start; 915 u64 zero_pos = range_start;
930 916
931 mlog(0, "range_start = %llu, range_end = %llu\n", 917 trace_ocfs2_zero_extend_range(
932 (unsigned long long)range_start, 918 (unsigned long long)OCFS2_I(inode)->ip_blkno,
933 (unsigned long long)range_end); 919 (unsigned long long)range_start,
920 (unsigned long long)range_end);
934 BUG_ON(range_start >= range_end); 921 BUG_ON(range_start >= range_end);
935 922
936 while (zero_pos < range_end) { 923 while (zero_pos < range_end) {
@@ -962,9 +949,9 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
962 struct super_block *sb = inode->i_sb; 949 struct super_block *sb = inode->i_sb;
963 950
964 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 951 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
965 mlog(0, "zero_start %llu for i_size %llu\n", 952 trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
966 (unsigned long long)zero_start, 953 (unsigned long long)zero_start,
967 (unsigned long long)i_size_read(inode)); 954 (unsigned long long)i_size_read(inode));
968 while (zero_start < zero_to_size) { 955 while (zero_start < zero_to_size) {
969 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start, 956 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
970 zero_to_size, 957 zero_to_size,
@@ -1113,30 +1100,20 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1113 struct dquot *transfer_to[MAXQUOTAS] = { }; 1100 struct dquot *transfer_to[MAXQUOTAS] = { };
1114 int qtype; 1101 int qtype;
1115 1102
1116 mlog_entry("(0x%p, '%.*s')\n", dentry, 1103 trace_ocfs2_setattr(inode, dentry,
1117 dentry->d_name.len, dentry->d_name.name); 1104 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1105 dentry->d_name.len, dentry->d_name.name,
1106 attr->ia_valid, attr->ia_mode,
1107 attr->ia_uid, attr->ia_gid);
1118 1108
1119 /* ensuring we don't even attempt to truncate a symlink */ 1109 /* ensuring we don't even attempt to truncate a symlink */
1120 if (S_ISLNK(inode->i_mode)) 1110 if (S_ISLNK(inode->i_mode))
1121 attr->ia_valid &= ~ATTR_SIZE; 1111 attr->ia_valid &= ~ATTR_SIZE;
1122 1112
1123 if (attr->ia_valid & ATTR_MODE)
1124 mlog(0, "mode change: %d\n", attr->ia_mode);
1125 if (attr->ia_valid & ATTR_UID)
1126 mlog(0, "uid change: %d\n", attr->ia_uid);
1127 if (attr->ia_valid & ATTR_GID)
1128 mlog(0, "gid change: %d\n", attr->ia_gid);
1129 if (attr->ia_valid & ATTR_SIZE)
1130 mlog(0, "size change...\n");
1131 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
1132 mlog(0, "time change...\n");
1133
1134#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ 1113#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
1135 | ATTR_GID | ATTR_UID | ATTR_MODE) 1114 | ATTR_GID | ATTR_UID | ATTR_MODE)
1136 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { 1115 if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
1137 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
1138 return 0; 1116 return 0;
1139 }
1140 1117
1141 status = inode_change_ok(inode, attr); 1118 status = inode_change_ok(inode, attr);
1142 if (status) 1119 if (status)
@@ -1274,7 +1251,6 @@ bail:
1274 mlog_errno(status); 1251 mlog_errno(status);
1275 } 1252 }
1276 1253
1277 mlog_exit(status);
1278 return status; 1254 return status;
1279} 1255}
1280 1256
@@ -1287,8 +1263,6 @@ int ocfs2_getattr(struct vfsmount *mnt,
1287 struct ocfs2_super *osb = sb->s_fs_info; 1263 struct ocfs2_super *osb = sb->s_fs_info;
1288 int err; 1264 int err;
1289 1265
1290 mlog_entry_void();
1291
1292 err = ocfs2_inode_revalidate(dentry); 1266 err = ocfs2_inode_revalidate(dentry);
1293 if (err) { 1267 if (err) {
1294 if (err != -ENOENT) 1268 if (err != -ENOENT)
@@ -1302,8 +1276,6 @@ int ocfs2_getattr(struct vfsmount *mnt,
1302 stat->blksize = osb->s_clustersize; 1276 stat->blksize = osb->s_clustersize;
1303 1277
1304bail: 1278bail:
1305 mlog_exit(err);
1306
1307 return err; 1279 return err;
1308} 1280}
1309 1281
@@ -1314,8 +1286,6 @@ int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
1314 if (flags & IPERM_FLAG_RCU) 1286 if (flags & IPERM_FLAG_RCU)
1315 return -ECHILD; 1287 return -ECHILD;
1316 1288
1317 mlog_entry_void();
1318
1319 ret = ocfs2_inode_lock(inode, NULL, 0); 1289 ret = ocfs2_inode_lock(inode, NULL, 0);
1320 if (ret) { 1290 if (ret) {
1321 if (ret != -ENOENT) 1291 if (ret != -ENOENT)
@@ -1327,7 +1297,6 @@ int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
1327 1297
1328 ocfs2_inode_unlock(inode, 0); 1298 ocfs2_inode_unlock(inode, 0);
1329out: 1299out:
1330 mlog_exit(ret);
1331 return ret; 1300 return ret;
1332} 1301}
1333 1302
@@ -1339,8 +1308,9 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1339 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1308 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1340 struct ocfs2_dinode *di; 1309 struct ocfs2_dinode *di;
1341 1310
1342 mlog_entry("(Inode %llu, mode 0%o)\n", 1311 trace_ocfs2_write_remove_suid(
1343 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode); 1312 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1313 inode->i_mode);
1344 1314
1345 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1315 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1346 if (IS_ERR(handle)) { 1316 if (IS_ERR(handle)) {
@@ -1368,7 +1338,6 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1368out_trans: 1338out_trans:
1369 ocfs2_commit_trans(osb, handle); 1339 ocfs2_commit_trans(osb, handle);
1370out: 1340out:
1371 mlog_exit(ret);
1372 return ret; 1341 return ret;
1373} 1342}
1374 1343
@@ -1547,8 +1516,9 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
1547 * partial clusters here. There's no need to worry about 1516 * partial clusters here. There's no need to worry about
1548 * physical allocation - the zeroing code knows to skip holes. 1517 * physical allocation - the zeroing code knows to skip holes.
1549 */ 1518 */
1550 mlog(0, "byte start: %llu, end: %llu\n", 1519 trace_ocfs2_zero_partial_clusters(
1551 (unsigned long long)start, (unsigned long long)end); 1520 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1521 (unsigned long long)start, (unsigned long long)end);
1552 1522
1553 /* 1523 /*
1554 * If both edges are on a cluster boundary then there's no 1524 * If both edges are on a cluster boundary then there's no
@@ -1572,8 +1542,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
1572 if (tmpend > end) 1542 if (tmpend > end)
1573 tmpend = end; 1543 tmpend = end;
1574 1544
1575 mlog(0, "1st range: start: %llu, tmpend: %llu\n", 1545 trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
1576 (unsigned long long)start, (unsigned long long)tmpend); 1546 (unsigned long long)tmpend);
1577 1547
1578 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); 1548 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
1579 if (ret) 1549 if (ret)
@@ -1587,8 +1557,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
1587 */ 1557 */
1588 start = end & ~(osb->s_clustersize - 1); 1558 start = end & ~(osb->s_clustersize - 1);
1589 1559
1590 mlog(0, "2nd range: start: %llu, end: %llu\n", 1560 trace_ocfs2_zero_partial_clusters_range2(
1591 (unsigned long long)start, (unsigned long long)end); 1561 (unsigned long long)start, (unsigned long long)end);
1592 1562
1593 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); 1563 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1594 if (ret) 1564 if (ret)
@@ -1688,6 +1658,11 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1688 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); 1658 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1689 ocfs2_init_dealloc_ctxt(&dealloc); 1659 ocfs2_init_dealloc_ctxt(&dealloc);
1690 1660
1661 trace_ocfs2_remove_inode_range(
1662 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1663 (unsigned long long)byte_start,
1664 (unsigned long long)byte_len);
1665
1691 if (byte_len == 0) 1666 if (byte_len == 0)
1692 return 0; 1667 return 0;
1693 1668
@@ -1734,11 +1709,6 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1734 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits; 1709 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1735 cluster_in_el = trunc_end; 1710 cluster_in_el = trunc_end;
1736 1711
1737 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
1738 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1739 (unsigned long long)byte_start,
1740 (unsigned long long)byte_len, trunc_start, trunc_end);
1741
1742 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); 1712 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1743 if (ret) { 1713 if (ret) {
1744 mlog_errno(ret); 1714 mlog_errno(ret);
@@ -2093,7 +2063,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2093 int ret = 0, meta_level = 0; 2063 int ret = 0, meta_level = 0;
2094 struct dentry *dentry = file->f_path.dentry; 2064 struct dentry *dentry = file->f_path.dentry;
2095 struct inode *inode = dentry->d_inode; 2065 struct inode *inode = dentry->d_inode;
2096 loff_t saved_pos, end; 2066 loff_t saved_pos = 0, end;
2097 2067
2098 /* 2068 /*
2099 * We start with a read level meta lock and only jump to an ex 2069 * We start with a read level meta lock and only jump to an ex
@@ -2132,12 +2102,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2132 2102
2133 /* work on a copy of ppos until we're sure that we won't have 2103 /* work on a copy of ppos until we're sure that we won't have
2134 * to recalculate it due to relocking. */ 2104 * to recalculate it due to relocking. */
2135 if (appending) { 2105 if (appending)
2136 saved_pos = i_size_read(inode); 2106 saved_pos = i_size_read(inode);
2137 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); 2107 else
2138 } else {
2139 saved_pos = *ppos; 2108 saved_pos = *ppos;
2140 }
2141 2109
2142 end = saved_pos + count; 2110 end = saved_pos + count;
2143 2111
@@ -2208,6 +2176,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2208 *ppos = saved_pos; 2176 *ppos = saved_pos;
2209 2177
2210out_unlock: 2178out_unlock:
2179 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
2180 saved_pos, appending, count,
2181 direct_io, has_refcount);
2182
2211 if (meta_level >= 0) 2183 if (meta_level >= 0)
2212 ocfs2_inode_unlock(inode, meta_level); 2184 ocfs2_inode_unlock(inode, meta_level);
2213 2185
@@ -2233,10 +2205,11 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2233 int full_coherency = !(osb->s_mount_opt & 2205 int full_coherency = !(osb->s_mount_opt &
2234 OCFS2_MOUNT_COHERENCY_BUFFERED); 2206 OCFS2_MOUNT_COHERENCY_BUFFERED);
2235 2207
2236 mlog_entry("(0x%p, %u, '%.*s')\n", file, 2208 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
2237 (unsigned int)nr_segs, 2209 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2238 file->f_path.dentry->d_name.len, 2210 file->f_path.dentry->d_name.len,
2239 file->f_path.dentry->d_name.name); 2211 file->f_path.dentry->d_name.name,
2212 (unsigned int)nr_segs);
2240 2213
2241 if (iocb->ki_left == 0) 2214 if (iocb->ki_left == 0)
2242 return 0; 2215 return 0;
@@ -2402,7 +2375,6 @@ out_sems:
2402 2375
2403 if (written) 2376 if (written)
2404 ret = written; 2377 ret = written;
2405 mlog_exit(ret);
2406 return ret; 2378 return ret;
2407} 2379}
2408 2380
@@ -2438,10 +2410,11 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
2438 .u.file = out, 2410 .u.file = out,
2439 }; 2411 };
2440 2412
2441 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, 2413
2442 (unsigned int)len, 2414 trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
2443 out->f_path.dentry->d_name.len, 2415 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2444 out->f_path.dentry->d_name.name); 2416 out->f_path.dentry->d_name.len,
2417 out->f_path.dentry->d_name.name, len);
2445 2418
2446 if (pipe->inode) 2419 if (pipe->inode)
2447 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); 2420 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
@@ -2485,7 +2458,6 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
2485 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 2458 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
2486 } 2459 }
2487 2460
2488 mlog_exit(ret);
2489 return ret; 2461 return ret;
2490} 2462}
2491 2463
@@ -2498,10 +2470,10 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
2498 int ret = 0, lock_level = 0; 2470 int ret = 0, lock_level = 0;
2499 struct inode *inode = in->f_path.dentry->d_inode; 2471 struct inode *inode = in->f_path.dentry->d_inode;
2500 2472
2501 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, 2473 trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
2502 (unsigned int)len, 2474 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2503 in->f_path.dentry->d_name.len, 2475 in->f_path.dentry->d_name.len,
2504 in->f_path.dentry->d_name.name); 2476 in->f_path.dentry->d_name.name, len);
2505 2477
2506 /* 2478 /*
2507 * See the comment in ocfs2_file_aio_read() 2479 * See the comment in ocfs2_file_aio_read()
@@ -2516,7 +2488,6 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
2516 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 2488 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
2517 2489
2518bail: 2490bail:
2519 mlog_exit(ret);
2520 return ret; 2491 return ret;
2521} 2492}
2522 2493
@@ -2529,10 +2500,11 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2529 struct file *filp = iocb->ki_filp; 2500 struct file *filp = iocb->ki_filp;
2530 struct inode *inode = filp->f_path.dentry->d_inode; 2501 struct inode *inode = filp->f_path.dentry->d_inode;
2531 2502
2532 mlog_entry("(0x%p, %u, '%.*s')\n", filp, 2503 trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
2533 (unsigned int)nr_segs, 2504 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2534 filp->f_path.dentry->d_name.len, 2505 filp->f_path.dentry->d_name.len,
2535 filp->f_path.dentry->d_name.name); 2506 filp->f_path.dentry->d_name.name, nr_segs);
2507
2536 2508
2537 if (!inode) { 2509 if (!inode) {
2538 ret = -EINVAL; 2510 ret = -EINVAL;
@@ -2578,8 +2550,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2578 ocfs2_inode_unlock(inode, lock_level); 2550 ocfs2_inode_unlock(inode, lock_level);
2579 2551
2580 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 2552 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
2581 if (ret == -EINVAL) 2553 trace_generic_file_aio_read_ret(ret);
2582 mlog(0, "generic_file_aio_read returned -EINVAL\n");
2583 2554
2584 /* buffered aio wouldn't have proper lock coverage today */ 2555 /* buffered aio wouldn't have proper lock coverage today */
2585 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 2556 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
@@ -2597,7 +2568,6 @@ bail:
2597 } 2568 }
2598 if (rw_level != -1) 2569 if (rw_level != -1)
2599 ocfs2_rw_unlock(inode, rw_level); 2570 ocfs2_rw_unlock(inode, rw_level);
2600 mlog_exit(ret);
2601 2571
2602 return ret; 2572 return ret;
2603} 2573}
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 1aa863dd901f..d8208b20dc53 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -28,7 +28,6 @@
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30 30
31#define MLOG_MASK_PREFIX ML_SUPER
32#include <cluster/masklog.h> 31#include <cluster/masklog.h>
33 32
34#include "ocfs2.h" 33#include "ocfs2.h"
@@ -37,6 +36,7 @@
37#include "heartbeat.h" 36#include "heartbeat.h"
38#include "inode.h" 37#include "inode.h"
39#include "journal.h" 38#include "journal.h"
39#include "ocfs2_trace.h"
40 40
41#include "buffer_head_io.h" 41#include "buffer_head_io.h"
42 42
@@ -66,7 +66,7 @@ void ocfs2_do_node_down(int node_num, void *data)
66 66
67 BUG_ON(osb->node_num == node_num); 67 BUG_ON(osb->node_num == node_num);
68 68
69 mlog(0, "ocfs2: node down event for %d\n", node_num); 69 trace_ocfs2_do_node_down(node_num);
70 70
71 if (!osb->cconn) { 71 if (!osb->cconn) {
72 /* 72 /*
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 4068c6c4c6f6..b4c8bb6b8d28 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -31,7 +31,6 @@
31 31
32#include <asm/byteorder.h> 32#include <asm/byteorder.h>
33 33
34#define MLOG_MASK_PREFIX ML_INODE
35#include <cluster/masklog.h> 34#include <cluster/masklog.h>
36 35
37#include "ocfs2.h" 36#include "ocfs2.h"
@@ -53,6 +52,7 @@
53#include "uptodate.h" 52#include "uptodate.h"
54#include "xattr.h" 53#include "xattr.h"
55#include "refcounttree.h" 54#include "refcounttree.h"
55#include "ocfs2_trace.h"
56 56
57#include "buffer_head_io.h" 57#include "buffer_head_io.h"
58 58
@@ -131,7 +131,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
131 struct super_block *sb = osb->sb; 131 struct super_block *sb = osb->sb;
132 struct ocfs2_find_inode_args args; 132 struct ocfs2_find_inode_args args;
133 133
134 mlog_entry("(blkno = %llu)\n", (unsigned long long)blkno); 134 trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
135 sysfile_type);
135 136
136 /* Ok. By now we've either got the offsets passed to us by the 137 /* Ok. By now we've either got the offsets passed to us by the
137 * caller, or we just pulled them off the bh. Lets do some 138 * caller, or we just pulled them off the bh. Lets do some
@@ -152,16 +153,16 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
152 /* inode was *not* in the inode cache. 2.6.x requires 153 /* inode was *not* in the inode cache. 2.6.x requires
153 * us to do our own read_inode call and unlock it 154 * us to do our own read_inode call and unlock it
154 * afterwards. */ 155 * afterwards. */
155 if (inode && inode->i_state & I_NEW) {
156 mlog(0, "Inode was not in inode cache, reading it.\n");
157 ocfs2_read_locked_inode(inode, &args);
158 unlock_new_inode(inode);
159 }
160 if (inode == NULL) { 156 if (inode == NULL) {
161 inode = ERR_PTR(-ENOMEM); 157 inode = ERR_PTR(-ENOMEM);
162 mlog_errno(PTR_ERR(inode)); 158 mlog_errno(PTR_ERR(inode));
163 goto bail; 159 goto bail;
164 } 160 }
161 trace_ocfs2_iget5_locked(inode->i_state);
162 if (inode->i_state & I_NEW) {
163 ocfs2_read_locked_inode(inode, &args);
164 unlock_new_inode(inode);
165 }
165 if (is_bad_inode(inode)) { 166 if (is_bad_inode(inode)) {
166 iput(inode); 167 iput(inode);
167 inode = ERR_PTR(-ESTALE); 168 inode = ERR_PTR(-ESTALE);
@@ -170,9 +171,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
170 171
171bail: 172bail:
172 if (!IS_ERR(inode)) { 173 if (!IS_ERR(inode)) {
173 mlog(0, "returning inode with number %llu\n", 174 trace_ocfs2_iget_end(inode,
174 (unsigned long long)OCFS2_I(inode)->ip_blkno); 175 (unsigned long long)OCFS2_I(inode)->ip_blkno);
175 mlog_exit_ptr(inode);
176 } 176 }
177 177
178 return inode; 178 return inode;
@@ -192,18 +192,17 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
192 struct ocfs2_inode_info *oi = OCFS2_I(inode); 192 struct ocfs2_inode_info *oi = OCFS2_I(inode);
193 int ret = 0; 193 int ret = 0;
194 194
195 mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);
196
197 args = opaque; 195 args = opaque;
198 196
199 mlog_bug_on_msg(!inode, "No inode in find actor!\n"); 197 mlog_bug_on_msg(!inode, "No inode in find actor!\n");
200 198
199 trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno);
200
201 if (oi->ip_blkno != args->fi_blkno) 201 if (oi->ip_blkno != args->fi_blkno)
202 goto bail; 202 goto bail;
203 203
204 ret = 1; 204 ret = 1;
205bail: 205bail:
206 mlog_exit(ret);
207 return ret; 206 return ret;
208} 207}
209 208
@@ -218,8 +217,6 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
218 static struct lock_class_key ocfs2_quota_ip_alloc_sem_key, 217 static struct lock_class_key ocfs2_quota_ip_alloc_sem_key,
219 ocfs2_file_ip_alloc_sem_key; 218 ocfs2_file_ip_alloc_sem_key;
220 219
221 mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
222
223 inode->i_ino = args->fi_ino; 220 inode->i_ino = args->fi_ino;
224 OCFS2_I(inode)->ip_blkno = args->fi_blkno; 221 OCFS2_I(inode)->ip_blkno = args->fi_blkno;
225 if (args->fi_sysfile_type != 0) 222 if (args->fi_sysfile_type != 0)
@@ -235,7 +232,6 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
235 lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, 232 lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
236 &ocfs2_file_ip_alloc_sem_key); 233 &ocfs2_file_ip_alloc_sem_key);
237 234
238 mlog_exit(0);
239 return 0; 235 return 0;
240} 236}
241 237
@@ -246,9 +242,6 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
246 struct ocfs2_super *osb; 242 struct ocfs2_super *osb;
247 int use_plocks = 1; 243 int use_plocks = 1;
248 244
249 mlog_entry("(0x%p, size:%llu)\n", inode,
250 (unsigned long long)le64_to_cpu(fe->i_size));
251
252 sb = inode->i_sb; 245 sb = inode->i_sb;
253 osb = OCFS2_SB(sb); 246 osb = OCFS2_SB(sb);
254 247
@@ -300,20 +293,20 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
300 293
301 inode->i_nlink = ocfs2_read_links_count(fe); 294 inode->i_nlink = ocfs2_read_links_count(fe);
302 295
296 trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno,
297 le32_to_cpu(fe->i_flags));
303 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { 298 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
304 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; 299 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
305 inode->i_flags |= S_NOQUOTA; 300 inode->i_flags |= S_NOQUOTA;
306 } 301 }
307 302
308 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { 303 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
309 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 304 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
310 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
311 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { 305 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
312 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 306 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
313 } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) { 307 } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
314 inode->i_flags |= S_NOQUOTA; 308 inode->i_flags |= S_NOQUOTA;
315 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { 309 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
316 mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
317 /* we can't actually hit this as read_inode can't 310 /* we can't actually hit this as read_inode can't
318 * handle superblocks today ;-) */ 311 * handle superblocks today ;-) */
319 BUG(); 312 BUG();
@@ -381,7 +374,6 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
381 if (S_ISDIR(inode->i_mode)) 374 if (S_ISDIR(inode->i_mode))
382 ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv, 375 ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
383 OCFS2_RESV_FLAG_DIR); 376 OCFS2_RESV_FLAG_DIR);
384 mlog_exit_void();
385} 377}
386 378
387static int ocfs2_read_locked_inode(struct inode *inode, 379static int ocfs2_read_locked_inode(struct inode *inode,
@@ -394,8 +386,6 @@ static int ocfs2_read_locked_inode(struct inode *inode,
394 int status, can_lock; 386 int status, can_lock;
395 u32 generation = 0; 387 u32 generation = 0;
396 388
397 mlog_entry("(0x%p, 0x%p)\n", inode, args);
398
399 status = -EINVAL; 389 status = -EINVAL;
400 if (inode == NULL || inode->i_sb == NULL) { 390 if (inode == NULL || inode->i_sb == NULL) {
401 mlog(ML_ERROR, "bad inode\n"); 391 mlog(ML_ERROR, "bad inode\n");
@@ -443,6 +433,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
443 && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) 433 && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
444 && !ocfs2_mount_local(osb); 434 && !ocfs2_mount_local(osb);
445 435
436 trace_ocfs2_read_locked_inode(
437 (unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock);
438
446 /* 439 /*
447 * To maintain backwards compatibility with older versions of 440 * To maintain backwards compatibility with older versions of
448 * ocfs2-tools, we still store the generation value for system 441 * ocfs2-tools, we still store the generation value for system
@@ -534,7 +527,6 @@ bail:
534 if (args && bh) 527 if (args && bh)
535 brelse(bh); 528 brelse(bh);
536 529
537 mlog_exit(status);
538 return status; 530 return status;
539} 531}
540 532
@@ -551,8 +543,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
551 struct ocfs2_dinode *fe; 543 struct ocfs2_dinode *fe;
552 handle_t *handle = NULL; 544 handle_t *handle = NULL;
553 545
554 mlog_entry_void();
555
556 fe = (struct ocfs2_dinode *) fe_bh->b_data; 546 fe = (struct ocfs2_dinode *) fe_bh->b_data;
557 547
558 /* 548 /*
@@ -600,7 +590,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
600out: 590out:
601 if (handle) 591 if (handle)
602 ocfs2_commit_trans(osb, handle); 592 ocfs2_commit_trans(osb, handle);
603 mlog_exit(status);
604 return status; 593 return status;
605} 594}
606 595
@@ -696,8 +685,6 @@ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
696 685
697 spin_lock(&osb->osb_lock); 686 spin_lock(&osb->osb_lock);
698 if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) { 687 if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) {
699 mlog(0, "Recovery is happening on orphan dir %d, will skip "
700 "this inode\n", slot);
701 ret = -EDEADLK; 688 ret = -EDEADLK;
702 goto out; 689 goto out;
703 } 690 }
@@ -706,6 +693,7 @@ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
706 osb->osb_orphan_wipes[slot]++; 693 osb->osb_orphan_wipes[slot]++;
707out: 694out:
708 spin_unlock(&osb->osb_lock); 695 spin_unlock(&osb->osb_lock);
696 trace_ocfs2_check_orphan_recovery_state(slot, ret);
709 return ret; 697 return ret;
710} 698}
711 699
@@ -816,6 +804,10 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
816 struct ocfs2_inode_info *oi = OCFS2_I(inode); 804 struct ocfs2_inode_info *oi = OCFS2_I(inode);
817 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 805 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
818 806
807 trace_ocfs2_inode_is_valid_to_delete(current, osb->dc_task,
808 (unsigned long long)oi->ip_blkno,
809 oi->ip_flags);
810
819 /* We shouldn't be getting here for the root directory 811 /* We shouldn't be getting here for the root directory
820 * inode.. */ 812 * inode.. */
821 if (inode == osb->root_inode) { 813 if (inode == osb->root_inode) {
@@ -828,11 +820,8 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
828 * have to skip deleting this guy. That's OK though because 820 * have to skip deleting this guy. That's OK though because
829 * the node who's doing the actual deleting should handle it 821 * the node who's doing the actual deleting should handle it
830 * anyway. */ 822 * anyway. */
831 if (current == osb->dc_task) { 823 if (current == osb->dc_task)
832 mlog(0, "Skipping delete of %lu because we're currently "
833 "in downconvert\n", inode->i_ino);
834 goto bail; 824 goto bail;
835 }
836 825
837 spin_lock(&oi->ip_lock); 826 spin_lock(&oi->ip_lock);
838 /* OCFS2 *never* deletes system files. This should technically 827 /* OCFS2 *never* deletes system files. This should technically
@@ -846,12 +835,9 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
846 835
847 /* If we have allowd wipe of this inode for another node, it 836 /* If we have allowd wipe of this inode for another node, it
848 * will be marked here so we can safely skip it. Recovery will 837 * will be marked here so we can safely skip it. Recovery will
849 * cleanup any inodes we might inadvertantly skip here. */ 838 * cleanup any inodes we might inadvertently skip here. */
850 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) { 839 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE)
851 mlog(0, "Skipping delete of %lu because another node "
852 "has done this for us.\n", inode->i_ino);
853 goto bail_unlock; 840 goto bail_unlock;
854 }
855 841
856 ret = 1; 842 ret = 1;
857bail_unlock: 843bail_unlock:
@@ -868,28 +854,27 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
868 struct buffer_head *di_bh, 854 struct buffer_head *di_bh,
869 int *wipe) 855 int *wipe)
870{ 856{
871 int status = 0; 857 int status = 0, reason = 0;
872 struct ocfs2_inode_info *oi = OCFS2_I(inode); 858 struct ocfs2_inode_info *oi = OCFS2_I(inode);
873 struct ocfs2_dinode *di; 859 struct ocfs2_dinode *di;
874 860
875 *wipe = 0; 861 *wipe = 0;
876 862
863 trace_ocfs2_query_inode_wipe_begin((unsigned long long)oi->ip_blkno,
864 inode->i_nlink);
865
877 /* While we were waiting for the cluster lock in 866 /* While we were waiting for the cluster lock in
878 * ocfs2_delete_inode, another node might have asked to delete 867 * ocfs2_delete_inode, another node might have asked to delete
879 * the inode. Recheck our flags to catch this. */ 868 * the inode. Recheck our flags to catch this. */
880 if (!ocfs2_inode_is_valid_to_delete(inode)) { 869 if (!ocfs2_inode_is_valid_to_delete(inode)) {
881 mlog(0, "Skipping delete of %llu because flags changed\n", 870 reason = 1;
882 (unsigned long long)oi->ip_blkno);
883 goto bail; 871 goto bail;
884 } 872 }
885 873
886 /* Now that we have an up to date inode, we can double check 874 /* Now that we have an up to date inode, we can double check
887 * the link count. */ 875 * the link count. */
888 if (inode->i_nlink) { 876 if (inode->i_nlink)
889 mlog(0, "Skipping delete of %llu because nlink = %u\n",
890 (unsigned long long)oi->ip_blkno, inode->i_nlink);
891 goto bail; 877 goto bail;
892 }
893 878
894 /* Do some basic inode verification... */ 879 /* Do some basic inode verification... */
895 di = (struct ocfs2_dinode *) di_bh->b_data; 880 di = (struct ocfs2_dinode *) di_bh->b_data;
@@ -904,9 +889,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
904 * ORPHANED_FL not. 889 * ORPHANED_FL not.
905 */ 890 */
906 if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) { 891 if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
907 mlog(0, "Reflinked inode %llu is no longer orphaned. " 892 reason = 2;
908 "it shouldn't be deleted\n",
909 (unsigned long long)oi->ip_blkno);
910 goto bail; 893 goto bail;
911 } 894 }
912 895
@@ -934,7 +917,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
934 * the inode open lock in ocfs2_read_locked_inode(). When we 917 * the inode open lock in ocfs2_read_locked_inode(). When we
935 * get to ->delete_inode(), each node tries to convert it's 918 * get to ->delete_inode(), each node tries to convert it's
936 * lock to an exclusive. Trylocks are serialized by the inode 919 * lock to an exclusive. Trylocks are serialized by the inode
937 * meta data lock. If the upconvert suceeds, we know the inode 920 * meta data lock. If the upconvert succeeds, we know the inode
938 * is no longer live and can be deleted. 921 * is no longer live and can be deleted.
939 * 922 *
940 * Though we call this with the meta data lock held, the 923 * Though we call this with the meta data lock held, the
@@ -943,8 +926,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
943 status = ocfs2_try_open_lock(inode, 1); 926 status = ocfs2_try_open_lock(inode, 1);
944 if (status == -EAGAIN) { 927 if (status == -EAGAIN) {
945 status = 0; 928 status = 0;
946 mlog(0, "Skipping delete of %llu because it is in use on " 929 reason = 3;
947 "other nodes\n", (unsigned long long)oi->ip_blkno);
948 goto bail; 930 goto bail;
949 } 931 }
950 if (status < 0) { 932 if (status < 0) {
@@ -953,11 +935,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
953 } 935 }
954 936
955 *wipe = 1; 937 *wipe = 1;
956 mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n", 938 trace_ocfs2_query_inode_wipe_succ(le16_to_cpu(di->i_orphaned_slot));
957 (unsigned long long)oi->ip_blkno,
958 le16_to_cpu(di->i_orphaned_slot));
959 939
960bail: 940bail:
941 trace_ocfs2_query_inode_wipe_end(status, reason);
961 return status; 942 return status;
962} 943}
963 944
@@ -967,8 +948,8 @@ bail:
967static void ocfs2_cleanup_delete_inode(struct inode *inode, 948static void ocfs2_cleanup_delete_inode(struct inode *inode,
968 int sync_data) 949 int sync_data)
969{ 950{
970 mlog(0, "Cleanup inode %llu, sync = %d\n", 951 trace_ocfs2_cleanup_delete_inode(
971 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); 952 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
972 if (sync_data) 953 if (sync_data)
973 write_inode_now(inode, 1); 954 write_inode_now(inode, 1);
974 truncate_inode_pages(&inode->i_data, 0); 955 truncate_inode_pages(&inode->i_data, 0);
@@ -980,15 +961,15 @@ static void ocfs2_delete_inode(struct inode *inode)
980 sigset_t oldset; 961 sigset_t oldset;
981 struct buffer_head *di_bh = NULL; 962 struct buffer_head *di_bh = NULL;
982 963
983 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 964 trace_ocfs2_delete_inode(inode->i_ino,
965 (unsigned long long)OCFS2_I(inode)->ip_blkno,
966 is_bad_inode(inode));
984 967
985 /* When we fail in read_inode() we mark inode as bad. The second test 968 /* When we fail in read_inode() we mark inode as bad. The second test
986 * catches the case when inode allocation fails before allocating 969 * catches the case when inode allocation fails before allocating
987 * a block for inode. */ 970 * a block for inode. */
988 if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) { 971 if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno)
989 mlog(0, "Skipping delete of bad inode\n");
990 goto bail; 972 goto bail;
991 }
992 973
993 dquot_initialize(inode); 974 dquot_initialize(inode);
994 975
@@ -1080,7 +1061,7 @@ bail_unlock_nfs_sync:
1080bail_unblock: 1061bail_unblock:
1081 ocfs2_unblock_signals(&oldset); 1062 ocfs2_unblock_signals(&oldset);
1082bail: 1063bail:
1083 mlog_exit_void(); 1064 return;
1084} 1065}
1085 1066
1086static void ocfs2_clear_inode(struct inode *inode) 1067static void ocfs2_clear_inode(struct inode *inode)
@@ -1088,11 +1069,9 @@ static void ocfs2_clear_inode(struct inode *inode)
1088 int status; 1069 int status;
1089 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1070 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1090 1071
1091 mlog_entry_void();
1092
1093 end_writeback(inode); 1072 end_writeback(inode);
1094 mlog(0, "Clearing inode: %llu, nlink = %u\n", 1073 trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
1095 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink); 1074 inode->i_nlink);
1096 1075
1097 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 1076 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
1098 "Inode=%lu\n", inode->i_ino); 1077 "Inode=%lu\n", inode->i_ino);
@@ -1181,8 +1160,6 @@ static void ocfs2_clear_inode(struct inode *inode)
1181 */ 1160 */
1182 jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, 1161 jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
1183 &oi->ip_jinode); 1162 &oi->ip_jinode);
1184
1185 mlog_exit_void();
1186} 1163}
1187 1164
1188void ocfs2_evict_inode(struct inode *inode) 1165void ocfs2_evict_inode(struct inode *inode)
@@ -1204,17 +1181,14 @@ int ocfs2_drop_inode(struct inode *inode)
1204 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1181 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1205 int res; 1182 int res;
1206 1183
1207 mlog_entry_void(); 1184 trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
1208 1185 inode->i_nlink, oi->ip_flags);
1209 mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
1210 (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
1211 1186
1212 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) 1187 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
1213 res = 1; 1188 res = 1;
1214 else 1189 else
1215 res = generic_drop_inode(inode); 1190 res = generic_drop_inode(inode);
1216 1191
1217 mlog_exit_void();
1218 return res; 1192 return res;
1219} 1193}
1220 1194
@@ -1226,11 +1200,11 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
1226 struct inode *inode = dentry->d_inode; 1200 struct inode *inode = dentry->d_inode;
1227 int status = 0; 1201 int status = 0;
1228 1202
1229 mlog_entry("(inode = 0x%p, ino = %llu)\n", inode, 1203 trace_ocfs2_inode_revalidate(inode,
1230 inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL); 1204 inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL,
1205 inode ? (unsigned long long)OCFS2_I(inode)->ip_flags : 0);
1231 1206
1232 if (!inode) { 1207 if (!inode) {
1233 mlog(0, "eep, no inode!\n");
1234 status = -ENOENT; 1208 status = -ENOENT;
1235 goto bail; 1209 goto bail;
1236 } 1210 }
@@ -1238,7 +1212,6 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
1238 spin_lock(&OCFS2_I(inode)->ip_lock); 1212 spin_lock(&OCFS2_I(inode)->ip_lock);
1239 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 1213 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
1240 spin_unlock(&OCFS2_I(inode)->ip_lock); 1214 spin_unlock(&OCFS2_I(inode)->ip_lock);
1241 mlog(0, "inode deleted!\n");
1242 status = -ENOENT; 1215 status = -ENOENT;
1243 goto bail; 1216 goto bail;
1244 } 1217 }
@@ -1254,8 +1227,6 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
1254 } 1227 }
1255 ocfs2_inode_unlock(inode, 0); 1228 ocfs2_inode_unlock(inode, 0);
1256bail: 1229bail:
1257 mlog_exit(status);
1258
1259 return status; 1230 return status;
1260} 1231}
1261 1232
@@ -1271,8 +1242,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1271 int status; 1242 int status;
1272 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 1243 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
1273 1244
1274 mlog_entry("(inode %llu)\n", 1245 trace_ocfs2_mark_inode_dirty((unsigned long long)OCFS2_I(inode)->ip_blkno);
1275 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1276 1246
1277 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 1247 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
1278 OCFS2_JOURNAL_ACCESS_WRITE); 1248 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1302,7 +1272,6 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1302 1272
1303 ocfs2_journal_dirty(handle, bh); 1273 ocfs2_journal_dirty(handle, bh);
1304leave: 1274leave:
1305 mlog_exit(status);
1306 return status; 1275 return status;
1307} 1276}
1308 1277
@@ -1345,8 +1314,7 @@ int ocfs2_validate_inode_block(struct super_block *sb,
1345 int rc; 1314 int rc;
1346 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 1315 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
1347 1316
1348 mlog(0, "Validating dinode %llu\n", 1317 trace_ocfs2_validate_inode_block((unsigned long long)bh->b_blocknr);
1349 (unsigned long long)bh->b_blocknr);
1350 1318
1351 BUG_ON(!buffer_uptodate(bh)); 1319 BUG_ON(!buffer_uptodate(bh));
1352 1320
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7a4868196152..8f13c5989eae 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -9,7 +9,6 @@
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/compat.h> 10#include <linux/compat.h>
11 11
12#define MLOG_MASK_PREFIX ML_INODE
13#include <cluster/masklog.h> 12#include <cluster/masklog.h>
14 13
15#include "ocfs2.h" 14#include "ocfs2.h"
@@ -46,6 +45,22 @@ static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
46#define o2info_set_request_error(a, b) \ 45#define o2info_set_request_error(a, b) \
47 __o2info_set_request_error((struct ocfs2_info_request *)&(a), b) 46 __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
48 47
48static inline void __o2info_set_request_filled(struct ocfs2_info_request *req)
49{
50 req->ir_flags |= OCFS2_INFO_FL_FILLED;
51}
52
53#define o2info_set_request_filled(a) \
54 __o2info_set_request_filled((struct ocfs2_info_request *)&(a))
55
56static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req)
57{
58 req->ir_flags &= ~OCFS2_INFO_FL_FILLED;
59}
60
61#define o2info_clear_request_filled(a) \
62 __o2info_clear_request_filled((struct ocfs2_info_request *)&(a))
63
49static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) 64static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
50{ 65{
51 int status; 66 int status;
@@ -59,7 +74,6 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
59 *flags = OCFS2_I(inode)->ip_attr; 74 *flags = OCFS2_I(inode)->ip_attr;
60 ocfs2_inode_unlock(inode, 0); 75 ocfs2_inode_unlock(inode, 0);
61 76
62 mlog_exit(status);
63 return status; 77 return status;
64} 78}
65 79
@@ -82,7 +96,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
82 } 96 }
83 97
84 status = -EACCES; 98 status = -EACCES;
85 if (!is_owner_or_cap(inode)) 99 if (!inode_owner_or_capable(inode))
86 goto bail_unlock; 100 goto bail_unlock;
87 101
88 if (!S_ISDIR(inode->i_mode)) 102 if (!S_ISDIR(inode->i_mode))
@@ -125,7 +139,6 @@ bail:
125 139
126 brelse(bh); 140 brelse(bh);
127 141
128 mlog_exit(status);
129 return status; 142 return status;
130} 143}
131 144
@@ -139,7 +152,8 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
139 goto bail; 152 goto bail;
140 153
141 oib.ib_blocksize = inode->i_sb->s_blocksize; 154 oib.ib_blocksize = inode->i_sb->s_blocksize;
142 oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED; 155
156 o2info_set_request_filled(oib);
143 157
144 if (o2info_to_user(oib, req)) 158 if (o2info_to_user(oib, req))
145 goto bail; 159 goto bail;
@@ -163,7 +177,8 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
163 goto bail; 177 goto bail;
164 178
165 oic.ic_clustersize = osb->s_clustersize; 179 oic.ic_clustersize = osb->s_clustersize;
166 oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED; 180
181 o2info_set_request_filled(oic);
167 182
168 if (o2info_to_user(oic, req)) 183 if (o2info_to_user(oic, req))
169 goto bail; 184 goto bail;
@@ -187,7 +202,8 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
187 goto bail; 202 goto bail;
188 203
189 oim.im_max_slots = osb->max_slots; 204 oim.im_max_slots = osb->max_slots;
190 oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED; 205
206 o2info_set_request_filled(oim);
191 207
192 if (o2info_to_user(oim, req)) 208 if (o2info_to_user(oim, req))
193 goto bail; 209 goto bail;
@@ -211,7 +227,8 @@ int ocfs2_info_handle_label(struct inode *inode,
211 goto bail; 227 goto bail;
212 228
213 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); 229 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
214 oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED; 230
231 o2info_set_request_filled(oil);
215 232
216 if (o2info_to_user(oil, req)) 233 if (o2info_to_user(oil, req))
217 goto bail; 234 goto bail;
@@ -235,7 +252,8 @@ int ocfs2_info_handle_uuid(struct inode *inode,
235 goto bail; 252 goto bail;
236 253
237 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); 254 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
238 oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED; 255
256 o2info_set_request_filled(oiu);
239 257
240 if (o2info_to_user(oiu, req)) 258 if (o2info_to_user(oiu, req))
241 goto bail; 259 goto bail;
@@ -261,7 +279,8 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
261 oif.if_compat_features = osb->s_feature_compat; 279 oif.if_compat_features = osb->s_feature_compat;
262 oif.if_incompat_features = osb->s_feature_incompat; 280 oif.if_incompat_features = osb->s_feature_incompat;
263 oif.if_ro_compat_features = osb->s_feature_ro_compat; 281 oif.if_ro_compat_features = osb->s_feature_ro_compat;
264 oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED; 282
283 o2info_set_request_filled(oif);
265 284
266 if (o2info_to_user(oif, req)) 285 if (o2info_to_user(oif, req))
267 goto bail; 286 goto bail;
@@ -286,7 +305,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
286 305
287 oij.ij_journal_size = osb->journal->j_inode->i_size; 306 oij.ij_journal_size = osb->journal->j_inode->i_size;
288 307
289 oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED; 308 o2info_set_request_filled(oij);
290 309
291 if (o2info_to_user(oij, req)) 310 if (o2info_to_user(oij, req))
292 goto bail; 311 goto bail;
@@ -308,7 +327,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
308 if (o2info_from_user(oir, req)) 327 if (o2info_from_user(oir, req))
309 goto bail; 328 goto bail;
310 329
311 oir.ir_flags &= ~OCFS2_INFO_FL_FILLED; 330 o2info_clear_request_filled(oir);
312 331
313 if (o2info_to_user(oir, req)) 332 if (o2info_to_user(oir, req))
314 goto bail; 333 goto bail;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index faa2303dbf0a..b141a44605ca 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -31,7 +31,6 @@
31#include <linux/time.h> 31#include <linux/time.h>
32#include <linux/random.h> 32#include <linux/random.h>
33 33
34#define MLOG_MASK_PREFIX ML_JOURNAL
35#include <cluster/masklog.h> 34#include <cluster/masklog.h>
36 35
37#include "ocfs2.h" 36#include "ocfs2.h"
@@ -52,6 +51,7 @@
52#include "quota.h" 51#include "quota.h"
53 52
54#include "buffer_head_io.h" 53#include "buffer_head_io.h"
54#include "ocfs2_trace.h"
55 55
56DEFINE_SPINLOCK(trans_inc_lock); 56DEFINE_SPINLOCK(trans_inc_lock);
57 57
@@ -303,16 +303,15 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
303 unsigned int flushed; 303 unsigned int flushed;
304 struct ocfs2_journal *journal = NULL; 304 struct ocfs2_journal *journal = NULL;
305 305
306 mlog_entry_void();
307
308 journal = osb->journal; 306 journal = osb->journal;
309 307
310 /* Flush all pending commits and checkpoint the journal. */ 308 /* Flush all pending commits and checkpoint the journal. */
311 down_write(&journal->j_trans_barrier); 309 down_write(&journal->j_trans_barrier);
312 310
313 if (atomic_read(&journal->j_num_trans) == 0) { 311 flushed = atomic_read(&journal->j_num_trans);
312 trace_ocfs2_commit_cache_begin(flushed);
313 if (flushed == 0) {
314 up_write(&journal->j_trans_barrier); 314 up_write(&journal->j_trans_barrier);
315 mlog(0, "No transactions for me to flush!\n");
316 goto finally; 315 goto finally;
317 } 316 }
318 317
@@ -331,13 +330,11 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
331 atomic_set(&journal->j_num_trans, 0); 330 atomic_set(&journal->j_num_trans, 0);
332 up_write(&journal->j_trans_barrier); 331 up_write(&journal->j_trans_barrier);
333 332
334 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n", 333 trace_ocfs2_commit_cache_end(journal->j_trans_id, flushed);
335 journal->j_trans_id, flushed);
336 334
337 ocfs2_wake_downconvert_thread(osb); 335 ocfs2_wake_downconvert_thread(osb);
338 wake_up(&journal->j_checkpointed); 336 wake_up(&journal->j_checkpointed);
339finally: 337finally:
340 mlog_exit(status);
341 return status; 338 return status;
342} 339}
343 340
@@ -425,9 +422,8 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
425 return 0; 422 return 0;
426 423
427 old_nblocks = handle->h_buffer_credits; 424 old_nblocks = handle->h_buffer_credits;
428 mlog_entry_void();
429 425
430 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); 426 trace_ocfs2_extend_trans(old_nblocks, nblocks);
431 427
432#ifdef CONFIG_OCFS2_DEBUG_FS 428#ifdef CONFIG_OCFS2_DEBUG_FS
433 status = 1; 429 status = 1;
@@ -440,9 +436,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
440#endif 436#endif
441 437
442 if (status > 0) { 438 if (status > 0) {
443 mlog(0, 439 trace_ocfs2_extend_trans_restart(old_nblocks + nblocks);
444 "jbd2_journal_extend failed, trying "
445 "jbd2_journal_restart\n");
446 status = jbd2_journal_restart(handle, 440 status = jbd2_journal_restart(handle,
447 old_nblocks + nblocks); 441 old_nblocks + nblocks);
448 if (status < 0) { 442 if (status < 0) {
@@ -453,8 +447,6 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
453 447
454 status = 0; 448 status = 0;
455bail: 449bail:
456
457 mlog_exit(status);
458 return status; 450 return status;
459} 451}
460 452
@@ -622,12 +614,9 @@ static int __ocfs2_journal_access(handle_t *handle,
622 BUG_ON(!handle); 614 BUG_ON(!handle);
623 BUG_ON(!bh); 615 BUG_ON(!bh);
624 616
625 mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %zu\n", 617 trace_ocfs2_journal_access(
626 (unsigned long long)bh->b_blocknr, type, 618 (unsigned long long)ocfs2_metadata_cache_owner(ci),
627 (type == OCFS2_JOURNAL_ACCESS_CREATE) ? 619 (unsigned long long)bh->b_blocknr, type, bh->b_size);
628 "OCFS2_JOURNAL_ACCESS_CREATE" :
629 "OCFS2_JOURNAL_ACCESS_WRITE",
630 bh->b_size);
631 620
632 /* we can safely remove this assertion after testing. */ 621 /* we can safely remove this assertion after testing. */
633 if (!buffer_uptodate(bh)) { 622 if (!buffer_uptodate(bh)) {
@@ -668,7 +657,6 @@ static int __ocfs2_journal_access(handle_t *handle,
668 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", 657 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
669 status, type); 658 status, type);
670 659
671 mlog_exit(status);
672 return status; 660 return status;
673} 661}
674 662
@@ -737,13 +725,10 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
737{ 725{
738 int status; 726 int status;
739 727
740 mlog_entry("(bh->b_blocknr=%llu)\n", 728 trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr);
741 (unsigned long long)bh->b_blocknr);
742 729
743 status = jbd2_journal_dirty_metadata(handle, bh); 730 status = jbd2_journal_dirty_metadata(handle, bh);
744 BUG_ON(status); 731 BUG_ON(status);
745
746 mlog_exit_void();
747} 732}
748 733
749#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) 734#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
@@ -775,8 +760,6 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
775 struct ocfs2_super *osb; 760 struct ocfs2_super *osb;
776 int inode_lock = 0; 761 int inode_lock = 0;
777 762
778 mlog_entry_void();
779
780 BUG_ON(!journal); 763 BUG_ON(!journal);
781 764
782 osb = journal->j_osb; 765 osb = journal->j_osb;
@@ -820,10 +803,9 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
820 goto done; 803 goto done;
821 } 804 }
822 805
823 mlog(0, "inode->i_size = %lld\n", inode->i_size); 806 trace_ocfs2_journal_init(inode->i_size,
824 mlog(0, "inode->i_blocks = %llu\n", 807 (unsigned long long)inode->i_blocks,
825 (unsigned long long)inode->i_blocks); 808 OCFS2_I(inode)->ip_clusters);
826 mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
827 809
828 /* call the kernels journal init function now */ 810 /* call the kernels journal init function now */
829 j_journal = jbd2_journal_init_inode(inode); 811 j_journal = jbd2_journal_init_inode(inode);
@@ -833,8 +815,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
833 goto done; 815 goto done;
834 } 816 }
835 817
836 mlog(0, "Returned from jbd2_journal_init_inode\n"); 818 trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen);
837 mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
838 819
839 *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & 820 *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
840 OCFS2_JOURNAL_DIRTY_FL); 821 OCFS2_JOURNAL_DIRTY_FL);
@@ -859,7 +840,6 @@ done:
859 } 840 }
860 } 841 }
861 842
862 mlog_exit(status);
863 return status; 843 return status;
864} 844}
865 845
@@ -882,8 +862,6 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
882 struct buffer_head *bh = journal->j_bh; 862 struct buffer_head *bh = journal->j_bh;
883 struct ocfs2_dinode *fe; 863 struct ocfs2_dinode *fe;
884 864
885 mlog_entry_void();
886
887 fe = (struct ocfs2_dinode *)bh->b_data; 865 fe = (struct ocfs2_dinode *)bh->b_data;
888 866
889 /* The journal bh on the osb always comes from ocfs2_journal_init() 867 /* The journal bh on the osb always comes from ocfs2_journal_init()
@@ -906,7 +884,6 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
906 if (status < 0) 884 if (status < 0)
907 mlog_errno(status); 885 mlog_errno(status);
908 886
909 mlog_exit(status);
910 return status; 887 return status;
911} 888}
912 889
@@ -921,8 +898,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
921 struct inode *inode = NULL; 898 struct inode *inode = NULL;
922 int num_running_trans = 0; 899 int num_running_trans = 0;
923 900
924 mlog_entry_void();
925
926 BUG_ON(!osb); 901 BUG_ON(!osb);
927 902
928 journal = osb->journal; 903 journal = osb->journal;
@@ -939,10 +914,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
939 BUG(); 914 BUG();
940 915
941 num_running_trans = atomic_read(&(osb->journal->j_num_trans)); 916 num_running_trans = atomic_read(&(osb->journal->j_num_trans));
942 if (num_running_trans > 0) 917 trace_ocfs2_journal_shutdown(num_running_trans);
943 mlog(0, "Shutting down journal: must wait on %d "
944 "running transactions!\n",
945 num_running_trans);
946 918
947 /* Do a commit_cache here. It will flush our journal, *and* 919 /* Do a commit_cache here. It will flush our journal, *and*
948 * release any locks that are still held. 920 * release any locks that are still held.
@@ -955,7 +927,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
955 * completely destroy the journal. */ 927 * completely destroy the journal. */
956 if (osb->commit_task) { 928 if (osb->commit_task) {
957 /* Wait for the commit thread */ 929 /* Wait for the commit thread */
958 mlog(0, "Waiting for ocfs2commit to exit....\n"); 930 trace_ocfs2_journal_shutdown_wait(osb->commit_task);
959 kthread_stop(osb->commit_task); 931 kthread_stop(osb->commit_task);
960 osb->commit_task = NULL; 932 osb->commit_task = NULL;
961 } 933 }
@@ -998,7 +970,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
998done: 970done:
999 if (inode) 971 if (inode)
1000 iput(inode); 972 iput(inode);
1001 mlog_exit_void();
1002} 973}
1003 974
1004static void ocfs2_clear_journal_error(struct super_block *sb, 975static void ocfs2_clear_journal_error(struct super_block *sb,
@@ -1024,8 +995,6 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
1024 int status = 0; 995 int status = 0;
1025 struct ocfs2_super *osb; 996 struct ocfs2_super *osb;
1026 997
1027 mlog_entry_void();
1028
1029 BUG_ON(!journal); 998 BUG_ON(!journal);
1030 999
1031 osb = journal->j_osb; 1000 osb = journal->j_osb;
@@ -1059,7 +1028,6 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
1059 osb->commit_task = NULL; 1028 osb->commit_task = NULL;
1060 1029
1061done: 1030done:
1062 mlog_exit(status);
1063 return status; 1031 return status;
1064} 1032}
1065 1033
@@ -1070,8 +1038,6 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
1070{ 1038{
1071 int status; 1039 int status;
1072 1040
1073 mlog_entry_void();
1074
1075 BUG_ON(!journal); 1041 BUG_ON(!journal);
1076 1042
1077 status = jbd2_journal_wipe(journal->j_journal, full); 1043 status = jbd2_journal_wipe(journal->j_journal, full);
@@ -1085,7 +1051,6 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
1085 mlog_errno(status); 1051 mlog_errno(status);
1086 1052
1087bail: 1053bail:
1088 mlog_exit(status);
1089 return status; 1054 return status;
1090} 1055}
1091 1056
@@ -1124,8 +1089,6 @@ static int ocfs2_force_read_journal(struct inode *inode)
1124#define CONCURRENT_JOURNAL_FILL 32ULL 1089#define CONCURRENT_JOURNAL_FILL 32ULL
1125 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; 1090 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
1126 1091
1127 mlog_entry_void();
1128
1129 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); 1092 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
1130 1093
1131 num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size); 1094 num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
@@ -1161,7 +1124,6 @@ static int ocfs2_force_read_journal(struct inode *inode)
1161bail: 1124bail:
1162 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) 1125 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
1163 brelse(bhs[i]); 1126 brelse(bhs[i]);
1164 mlog_exit(status);
1165 return status; 1127 return status;
1166} 1128}
1167 1129
@@ -1185,7 +1147,7 @@ struct ocfs2_la_recovery_item {
1185 */ 1147 */
1186void ocfs2_complete_recovery(struct work_struct *work) 1148void ocfs2_complete_recovery(struct work_struct *work)
1187{ 1149{
1188 int ret; 1150 int ret = 0;
1189 struct ocfs2_journal *journal = 1151 struct ocfs2_journal *journal =
1190 container_of(work, struct ocfs2_journal, j_recovery_work); 1152 container_of(work, struct ocfs2_journal, j_recovery_work);
1191 struct ocfs2_super *osb = journal->j_osb; 1153 struct ocfs2_super *osb = journal->j_osb;
@@ -1194,9 +1156,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
1194 struct ocfs2_quota_recovery *qrec; 1156 struct ocfs2_quota_recovery *qrec;
1195 LIST_HEAD(tmp_la_list); 1157 LIST_HEAD(tmp_la_list);
1196 1158
1197 mlog_entry_void(); 1159 trace_ocfs2_complete_recovery(
1198 1160 (unsigned long long)OCFS2_I(journal->j_inode)->ip_blkno);
1199 mlog(0, "completing recovery from keventd\n");
1200 1161
1201 spin_lock(&journal->j_lock); 1162 spin_lock(&journal->j_lock);
1202 list_splice_init(&journal->j_la_cleanups, &tmp_la_list); 1163 list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
@@ -1205,15 +1166,18 @@ void ocfs2_complete_recovery(struct work_struct *work)
1205 list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) { 1166 list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) {
1206 list_del_init(&item->lri_list); 1167 list_del_init(&item->lri_list);
1207 1168
1208 mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
1209
1210 ocfs2_wait_on_quotas(osb); 1169 ocfs2_wait_on_quotas(osb);
1211 1170
1212 la_dinode = item->lri_la_dinode; 1171 la_dinode = item->lri_la_dinode;
1213 if (la_dinode) { 1172 tl_dinode = item->lri_tl_dinode;
1214 mlog(0, "Clean up local alloc %llu\n", 1173 qrec = item->lri_qrec;
1215 (unsigned long long)le64_to_cpu(la_dinode->i_blkno)); 1174
1175 trace_ocfs2_complete_recovery_slot(item->lri_slot,
1176 la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0,
1177 tl_dinode ? le64_to_cpu(tl_dinode->i_blkno) : 0,
1178 qrec);
1216 1179
1180 if (la_dinode) {
1217 ret = ocfs2_complete_local_alloc_recovery(osb, 1181 ret = ocfs2_complete_local_alloc_recovery(osb,
1218 la_dinode); 1182 la_dinode);
1219 if (ret < 0) 1183 if (ret < 0)
@@ -1222,11 +1186,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
1222 kfree(la_dinode); 1186 kfree(la_dinode);
1223 } 1187 }
1224 1188
1225 tl_dinode = item->lri_tl_dinode;
1226 if (tl_dinode) { 1189 if (tl_dinode) {
1227 mlog(0, "Clean up truncate log %llu\n",
1228 (unsigned long long)le64_to_cpu(tl_dinode->i_blkno));
1229
1230 ret = ocfs2_complete_truncate_log_recovery(osb, 1190 ret = ocfs2_complete_truncate_log_recovery(osb,
1231 tl_dinode); 1191 tl_dinode);
1232 if (ret < 0) 1192 if (ret < 0)
@@ -1239,9 +1199,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
1239 if (ret < 0) 1199 if (ret < 0)
1240 mlog_errno(ret); 1200 mlog_errno(ret);
1241 1201
1242 qrec = item->lri_qrec;
1243 if (qrec) { 1202 if (qrec) {
1244 mlog(0, "Recovering quota files");
1245 ret = ocfs2_finish_quota_recovery(osb, qrec, 1203 ret = ocfs2_finish_quota_recovery(osb, qrec,
1246 item->lri_slot); 1204 item->lri_slot);
1247 if (ret < 0) 1205 if (ret < 0)
@@ -1252,8 +1210,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
1252 kfree(item); 1210 kfree(item);
1253 } 1211 }
1254 1212
1255 mlog(0, "Recovery completion\n"); 1213 trace_ocfs2_complete_recovery_end(ret);
1256 mlog_exit_void();
1257} 1214}
1258 1215
1259/* NOTE: This function always eats your references to la_dinode and 1216/* NOTE: This function always eats your references to la_dinode and
@@ -1339,8 +1296,6 @@ static int __ocfs2_recovery_thread(void *arg)
1339 int rm_quota_used = 0, i; 1296 int rm_quota_used = 0, i;
1340 struct ocfs2_quota_recovery *qrec; 1297 struct ocfs2_quota_recovery *qrec;
1341 1298
1342 mlog_entry_void();
1343
1344 status = ocfs2_wait_on_mount(osb); 1299 status = ocfs2_wait_on_mount(osb);
1345 if (status < 0) { 1300 if (status < 0) {
1346 goto bail; 1301 goto bail;
@@ -1372,15 +1327,12 @@ restart:
1372 * clear it until ocfs2_recover_node() has succeeded. */ 1327 * clear it until ocfs2_recover_node() has succeeded. */
1373 node_num = rm->rm_entries[0]; 1328 node_num = rm->rm_entries[0];
1374 spin_unlock(&osb->osb_lock); 1329 spin_unlock(&osb->osb_lock);
1375 mlog(0, "checking node %d\n", node_num);
1376 slot_num = ocfs2_node_num_to_slot(osb, node_num); 1330 slot_num = ocfs2_node_num_to_slot(osb, node_num);
1331 trace_ocfs2_recovery_thread_node(node_num, slot_num);
1377 if (slot_num == -ENOENT) { 1332 if (slot_num == -ENOENT) {
1378 status = 0; 1333 status = 0;
1379 mlog(0, "no slot for this node, so no recovery"
1380 "required.\n");
1381 goto skip_recovery; 1334 goto skip_recovery;
1382 } 1335 }
1383 mlog(0, "node %d was using slot %d\n", node_num, slot_num);
1384 1336
1385 /* It is a bit subtle with quota recovery. We cannot do it 1337 /* It is a bit subtle with quota recovery. We cannot do it
1386 * immediately because we have to obtain cluster locks from 1338 * immediately because we have to obtain cluster locks from
@@ -1407,7 +1359,7 @@ skip_recovery:
1407 spin_lock(&osb->osb_lock); 1359 spin_lock(&osb->osb_lock);
1408 } 1360 }
1409 spin_unlock(&osb->osb_lock); 1361 spin_unlock(&osb->osb_lock);
1410 mlog(0, "All nodes recovered\n"); 1362 trace_ocfs2_recovery_thread_end(status);
1411 1363
1412 /* Refresh all journal recovery generations from disk */ 1364 /* Refresh all journal recovery generations from disk */
1413 status = ocfs2_check_journals_nolocks(osb); 1365 status = ocfs2_check_journals_nolocks(osb);
@@ -1416,7 +1368,7 @@ skip_recovery:
1416 mlog_errno(status); 1368 mlog_errno(status);
1417 1369
1418 /* Now it is right time to recover quotas... We have to do this under 1370 /* Now it is right time to recover quotas... We have to do this under
1419 * superblock lock so that noone can start using the slot (and crash) 1371 * superblock lock so that no one can start using the slot (and crash)
1420 * before we recover it */ 1372 * before we recover it */
1421 for (i = 0; i < rm_quota_used; i++) { 1373 for (i = 0; i < rm_quota_used; i++) {
1422 qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); 1374 qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
@@ -1451,7 +1403,6 @@ bail:
1451 if (rm_quota) 1403 if (rm_quota)
1452 kfree(rm_quota); 1404 kfree(rm_quota);
1453 1405
1454 mlog_exit(status);
1455 /* no one is callint kthread_stop() for us so the kthread() api 1406 /* no one is callint kthread_stop() for us so the kthread() api
1456 * requires that we call do_exit(). And it isn't exported, but 1407 * requires that we call do_exit(). And it isn't exported, but
1457 * complete_and_exit() seems to be a minimal wrapper around it. */ 1408 * complete_and_exit() seems to be a minimal wrapper around it. */
@@ -1461,19 +1412,15 @@ bail:
1461 1412
1462void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) 1413void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
1463{ 1414{
1464 mlog_entry("(node_num=%d, osb->node_num = %d)\n",
1465 node_num, osb->node_num);
1466
1467 mutex_lock(&osb->recovery_lock); 1415 mutex_lock(&osb->recovery_lock);
1468 if (osb->disable_recovery)
1469 goto out;
1470 1416
1471 /* People waiting on recovery will wait on 1417 trace_ocfs2_recovery_thread(node_num, osb->node_num,
1472 * the recovery map to empty. */ 1418 osb->disable_recovery, osb->recovery_thread_task,
1473 if (ocfs2_recovery_map_set(osb, node_num)) 1419 osb->disable_recovery ?
1474 mlog(0, "node %d already in recovery map.\n", node_num); 1420 -1 : ocfs2_recovery_map_set(osb, node_num));
1475 1421
1476 mlog(0, "starting recovery thread...\n"); 1422 if (osb->disable_recovery)
1423 goto out;
1477 1424
1478 if (osb->recovery_thread_task) 1425 if (osb->recovery_thread_task)
1479 goto out; 1426 goto out;
@@ -1488,8 +1435,6 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
1488out: 1435out:
1489 mutex_unlock(&osb->recovery_lock); 1436 mutex_unlock(&osb->recovery_lock);
1490 wake_up(&osb->recovery_event); 1437 wake_up(&osb->recovery_event);
1491
1492 mlog_exit_void();
1493} 1438}
1494 1439
1495static int ocfs2_read_journal_inode(struct ocfs2_super *osb, 1440static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
@@ -1563,7 +1508,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1563 * If not, it needs recovery. 1508 * If not, it needs recovery.
1564 */ 1509 */
1565 if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) { 1510 if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) {
1566 mlog(0, "Slot %u already recovered (old/new=%u/%u)\n", slot_num, 1511 trace_ocfs2_replay_journal_recovered(slot_num,
1567 osb->slot_recovery_generations[slot_num], slot_reco_gen); 1512 osb->slot_recovery_generations[slot_num], slot_reco_gen);
1568 osb->slot_recovery_generations[slot_num] = slot_reco_gen; 1513 osb->slot_recovery_generations[slot_num] = slot_reco_gen;
1569 status = -EBUSY; 1514 status = -EBUSY;
@@ -1574,7 +1519,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1574 1519
1575 status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); 1520 status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
1576 if (status < 0) { 1521 if (status < 0) {
1577 mlog(0, "status returned from ocfs2_inode_lock=%d\n", status); 1522 trace_ocfs2_replay_journal_lock_err(status);
1578 if (status != -ERESTARTSYS) 1523 if (status != -ERESTARTSYS)
1579 mlog(ML_ERROR, "Could not lock journal!\n"); 1524 mlog(ML_ERROR, "Could not lock journal!\n");
1580 goto done; 1525 goto done;
@@ -1587,7 +1532,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1587 slot_reco_gen = ocfs2_get_recovery_generation(fe); 1532 slot_reco_gen = ocfs2_get_recovery_generation(fe);
1588 1533
1589 if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) { 1534 if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
1590 mlog(0, "No recovery required for node %d\n", node_num); 1535 trace_ocfs2_replay_journal_skip(node_num);
1591 /* Refresh recovery generation for the slot */ 1536 /* Refresh recovery generation for the slot */
1592 osb->slot_recovery_generations[slot_num] = slot_reco_gen; 1537 osb->slot_recovery_generations[slot_num] = slot_reco_gen;
1593 goto done; 1538 goto done;
@@ -1608,7 +1553,6 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1608 goto done; 1553 goto done;
1609 } 1554 }
1610 1555
1611 mlog(0, "calling journal_init_inode\n");
1612 journal = jbd2_journal_init_inode(inode); 1556 journal = jbd2_journal_init_inode(inode);
1613 if (journal == NULL) { 1557 if (journal == NULL) {
1614 mlog(ML_ERROR, "Linux journal layer error\n"); 1558 mlog(ML_ERROR, "Linux journal layer error\n");
@@ -1628,7 +1572,6 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1628 ocfs2_clear_journal_error(osb->sb, journal, slot_num); 1572 ocfs2_clear_journal_error(osb->sb, journal, slot_num);
1629 1573
1630 /* wipe the journal */ 1574 /* wipe the journal */
1631 mlog(0, "flushing the journal.\n");
1632 jbd2_journal_lock_updates(journal); 1575 jbd2_journal_lock_updates(journal);
1633 status = jbd2_journal_flush(journal); 1576 status = jbd2_journal_flush(journal);
1634 jbd2_journal_unlock_updates(journal); 1577 jbd2_journal_unlock_updates(journal);
@@ -1665,7 +1608,6 @@ done:
1665 1608
1666 brelse(bh); 1609 brelse(bh);
1667 1610
1668 mlog_exit(status);
1669 return status; 1611 return status;
1670} 1612}
1671 1613
@@ -1688,8 +1630,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1688 struct ocfs2_dinode *la_copy = NULL; 1630 struct ocfs2_dinode *la_copy = NULL;
1689 struct ocfs2_dinode *tl_copy = NULL; 1631 struct ocfs2_dinode *tl_copy = NULL;
1690 1632
1691 mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n", 1633 trace_ocfs2_recover_node(node_num, slot_num, osb->node_num);
1692 node_num, slot_num, osb->node_num);
1693 1634
1694 /* Should not ever be called to recover ourselves -- in that 1635 /* Should not ever be called to recover ourselves -- in that
1695 * case we should've called ocfs2_journal_load instead. */ 1636 * case we should've called ocfs2_journal_load instead. */
@@ -1698,9 +1639,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1698 status = ocfs2_replay_journal(osb, node_num, slot_num); 1639 status = ocfs2_replay_journal(osb, node_num, slot_num);
1699 if (status < 0) { 1640 if (status < 0) {
1700 if (status == -EBUSY) { 1641 if (status == -EBUSY) {
1701 mlog(0, "Skipping recovery for slot %u (node %u) " 1642 trace_ocfs2_recover_node_skip(slot_num, node_num);
1702 "as another node has recovered it\n", slot_num,
1703 node_num);
1704 status = 0; 1643 status = 0;
1705 goto done; 1644 goto done;
1706 } 1645 }
@@ -1735,7 +1674,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1735 status = 0; 1674 status = 0;
1736done: 1675done:
1737 1676
1738 mlog_exit(status);
1739 return status; 1677 return status;
1740} 1678}
1741 1679
@@ -1808,8 +1746,8 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1808 spin_lock(&osb->osb_lock); 1746 spin_lock(&osb->osb_lock);
1809 osb->slot_recovery_generations[i] = gen; 1747 osb->slot_recovery_generations[i] = gen;
1810 1748
1811 mlog(0, "Slot %u recovery generation is %u\n", i, 1749 trace_ocfs2_mark_dead_nodes(i,
1812 osb->slot_recovery_generations[i]); 1750 osb->slot_recovery_generations[i]);
1813 1751
1814 if (i == osb->slot_num) { 1752 if (i == osb->slot_num) {
1815 spin_unlock(&osb->osb_lock); 1753 spin_unlock(&osb->osb_lock);
@@ -1845,7 +1783,6 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1845 1783
1846 status = 0; 1784 status = 0;
1847bail: 1785bail:
1848 mlog_exit(status);
1849 return status; 1786 return status;
1850} 1787}
1851 1788
@@ -1884,11 +1821,12 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1884 1821
1885 os = &osb->osb_orphan_scan; 1822 os = &osb->osb_orphan_scan;
1886 1823
1887 mlog(0, "Begin orphan scan\n");
1888
1889 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) 1824 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
1890 goto out; 1825 goto out;
1891 1826
1827 trace_ocfs2_queue_orphan_scan_begin(os->os_count, os->os_seqno,
1828 atomic_read(&os->os_state));
1829
1892 status = ocfs2_orphan_scan_lock(osb, &seqno); 1830 status = ocfs2_orphan_scan_lock(osb, &seqno);
1893 if (status < 0) { 1831 if (status < 0) {
1894 if (status != -EAGAIN) 1832 if (status != -EAGAIN)
@@ -1918,7 +1856,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1918unlock: 1856unlock:
1919 ocfs2_orphan_scan_unlock(osb, seqno); 1857 ocfs2_orphan_scan_unlock(osb, seqno);
1920out: 1858out:
1921 mlog(0, "Orphan scan completed\n"); 1859 trace_ocfs2_queue_orphan_scan_end(os->os_count, os->os_seqno,
1860 atomic_read(&os->os_state));
1922 return; 1861 return;
1923} 1862}
1924 1863
@@ -2002,8 +1941,7 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
2002 if (IS_ERR(iter)) 1941 if (IS_ERR(iter))
2003 return 0; 1942 return 0;
2004 1943
2005 mlog(0, "queue orphan %llu\n", 1944 trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
2006 (unsigned long long)OCFS2_I(iter)->ip_blkno);
2007 /* No locking is required for the next_orphan queue as there 1945 /* No locking is required for the next_orphan queue as there
2008 * is only ever a single process doing orphan recovery. */ 1946 * is only ever a single process doing orphan recovery. */
2009 OCFS2_I(iter)->ip_next_orphan = p->head; 1947 OCFS2_I(iter)->ip_next_orphan = p->head;
@@ -2119,7 +2057,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2119 struct inode *iter; 2057 struct inode *iter;
2120 struct ocfs2_inode_info *oi; 2058 struct ocfs2_inode_info *oi;
2121 2059
2122 mlog(0, "Recover inodes from orphan dir in slot %d\n", slot); 2060 trace_ocfs2_recover_orphans(slot);
2123 2061
2124 ocfs2_mark_recovering_orphan_dir(osb, slot); 2062 ocfs2_mark_recovering_orphan_dir(osb, slot);
2125 ret = ocfs2_queue_orphans(osb, slot, &inode); 2063 ret = ocfs2_queue_orphans(osb, slot, &inode);
@@ -2132,7 +2070,8 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2132 2070
2133 while (inode) { 2071 while (inode) {
2134 oi = OCFS2_I(inode); 2072 oi = OCFS2_I(inode);
2135 mlog(0, "iput orphan %llu\n", (unsigned long long)oi->ip_blkno); 2073 trace_ocfs2_recover_orphans_iput(
2074 (unsigned long long)oi->ip_blkno);
2136 2075
2137 iter = oi->ip_next_orphan; 2076 iter = oi->ip_next_orphan;
2138 2077
@@ -2170,6 +2109,7 @@ static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
2170 * MOUNTED flag, but this is set right before 2109 * MOUNTED flag, but this is set right before
2171 * dismount_volume() so we can trust it. */ 2110 * dismount_volume() so we can trust it. */
2172 if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) { 2111 if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
2112 trace_ocfs2_wait_on_mount(VOLUME_DISABLED);
2173 mlog(0, "mount error, exiting!\n"); 2113 mlog(0, "mount error, exiting!\n");
2174 return -EBUSY; 2114 return -EBUSY;
2175 } 2115 }
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 43e56b97f9c0..68cf2f6d3c6a 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -215,7 +215,7 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
215 /* WARNING: This only kicks off a single 215 /* WARNING: This only kicks off a single
216 * checkpoint. If someone races you and adds more 216 * checkpoint. If someone races you and adds more
217 * metadata to the journal, you won't know, and will 217 * metadata to the journal, you won't know, and will
218 * wind up waiting *alot* longer than necessary. Right 218 * wind up waiting *a lot* longer than necessary. Right
219 * now we only use this in clear_inode so that's 219 * now we only use this in clear_inode so that's
220 * OK. */ 220 * OK. */
221 ocfs2_start_checkpoint(osb); 221 ocfs2_start_checkpoint(osb);
@@ -405,9 +405,9 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
405 ocfs2_quota_trans_credits(sb); 405 ocfs2_quota_trans_credits(sb);
406} 406}
407 407
408/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 408/* data block for new dir/symlink, allocation of directory block, dx_root
409 * bitmap block for the new bit) dx_root update for free list */ 409 * update for free list */
410#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1) 410#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1)
411 411
412static inline int ocfs2_add_dir_index_credits(struct super_block *sb) 412static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
413{ 413{
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ec6adbf8f551..210c35237548 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -29,7 +29,6 @@
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/bitops.h> 30#include <linux/bitops.h>
31 31
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h> 32#include <cluster/masklog.h>
34 33
35#include "ocfs2.h" 34#include "ocfs2.h"
@@ -43,6 +42,7 @@
43#include "suballoc.h" 42#include "suballoc.h"
44#include "super.h" 43#include "super.h"
45#include "sysfile.h" 44#include "sysfile.h"
45#include "ocfs2_trace.h"
46 46
47#include "buffer_head_io.h" 47#include "buffer_head_io.h"
48 48
@@ -201,8 +201,7 @@ void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
201 la_max_mb = ocfs2_clusters_to_megabytes(sb, 201 la_max_mb = ocfs2_clusters_to_megabytes(sb,
202 ocfs2_local_alloc_size(sb) * 8); 202 ocfs2_local_alloc_size(sb) * 8);
203 203
204 mlog(0, "requested: %dM, max: %uM, default: %uM\n", 204 trace_ocfs2_la_set_sizes(requested_mb, la_max_mb, la_default_mb);
205 requested_mb, la_max_mb, la_default_mb);
206 205
207 if (requested_mb == -1) { 206 if (requested_mb == -1) {
208 /* No user request - use defaults */ 207 /* No user request - use defaults */
@@ -276,8 +275,8 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
276 275
277 ret = 1; 276 ret = 1;
278bail: 277bail:
279 mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n", 278 trace_ocfs2_alloc_should_use_local(
280 osb->local_alloc_state, (unsigned long long)bits, la_bits, ret); 279 (unsigned long long)bits, osb->local_alloc_state, la_bits, ret);
281 spin_unlock(&osb->osb_lock); 280 spin_unlock(&osb->osb_lock);
282 return ret; 281 return ret;
283} 282}
@@ -291,8 +290,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
291 struct inode *inode = NULL; 290 struct inode *inode = NULL;
292 struct ocfs2_local_alloc *la; 291 struct ocfs2_local_alloc *la;
293 292
294 mlog_entry_void();
295
296 if (osb->local_alloc_bits == 0) 293 if (osb->local_alloc_bits == 0)
297 goto bail; 294 goto bail;
298 295
@@ -364,9 +361,10 @@ bail:
364 if (inode) 361 if (inode)
365 iput(inode); 362 iput(inode);
366 363
367 mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits); 364 trace_ocfs2_load_local_alloc(osb->local_alloc_bits);
368 365
369 mlog_exit(status); 366 if (status)
367 mlog_errno(status);
370 return status; 368 return status;
371} 369}
372 370
@@ -388,8 +386,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
388 struct ocfs2_dinode *alloc_copy = NULL; 386 struct ocfs2_dinode *alloc_copy = NULL;
389 struct ocfs2_dinode *alloc = NULL; 387 struct ocfs2_dinode *alloc = NULL;
390 388
391 mlog_entry_void();
392
393 cancel_delayed_work(&osb->la_enable_wq); 389 cancel_delayed_work(&osb->la_enable_wq);
394 flush_workqueue(ocfs2_wq); 390 flush_workqueue(ocfs2_wq);
395 391
@@ -482,8 +478,6 @@ out:
482 478
483 if (alloc_copy) 479 if (alloc_copy)
484 kfree(alloc_copy); 480 kfree(alloc_copy);
485
486 mlog_exit_void();
487} 481}
488 482
489/* 483/*
@@ -502,7 +496,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
502 struct inode *inode = NULL; 496 struct inode *inode = NULL;
503 struct ocfs2_dinode *alloc; 497 struct ocfs2_dinode *alloc;
504 498
505 mlog_entry("(slot_num = %d)\n", slot_num); 499 trace_ocfs2_begin_local_alloc_recovery(slot_num);
506 500
507 *alloc_copy = NULL; 501 *alloc_copy = NULL;
508 502
@@ -552,7 +546,8 @@ bail:
552 iput(inode); 546 iput(inode);
553 } 547 }
554 548
555 mlog_exit(status); 549 if (status)
550 mlog_errno(status);
556 return status; 551 return status;
557} 552}
558 553
@@ -570,8 +565,6 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
570 struct buffer_head *main_bm_bh = NULL; 565 struct buffer_head *main_bm_bh = NULL;
571 struct inode *main_bm_inode; 566 struct inode *main_bm_inode;
572 567
573 mlog_entry_void();
574
575 main_bm_inode = ocfs2_get_system_file_inode(osb, 568 main_bm_inode = ocfs2_get_system_file_inode(osb,
576 GLOBAL_BITMAP_SYSTEM_INODE, 569 GLOBAL_BITMAP_SYSTEM_INODE,
577 OCFS2_INVALID_SLOT); 570 OCFS2_INVALID_SLOT);
@@ -620,7 +613,8 @@ out_mutex:
620out: 613out:
621 if (!status) 614 if (!status)
622 ocfs2_init_steal_slots(osb); 615 ocfs2_init_steal_slots(osb);
623 mlog_exit(status); 616 if (status)
617 mlog_errno(status);
624 return status; 618 return status;
625} 619}
626 620
@@ -640,8 +634,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
640 struct inode *local_alloc_inode; 634 struct inode *local_alloc_inode;
641 unsigned int free_bits; 635 unsigned int free_bits;
642 636
643 mlog_entry_void();
644
645 BUG_ON(!ac); 637 BUG_ON(!ac);
646 638
647 local_alloc_inode = 639 local_alloc_inode =
@@ -712,10 +704,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
712 goto bail; 704 goto bail;
713 } 705 }
714 706
715 if (ac->ac_max_block)
716 mlog(0, "Calling in_range for max block %llu\n",
717 (unsigned long long)ac->ac_max_block);
718
719 ac->ac_inode = local_alloc_inode; 707 ac->ac_inode = local_alloc_inode;
720 /* We should never use localalloc from another slot */ 708 /* We should never use localalloc from another slot */
721 ac->ac_alloc_slot = osb->slot_num; 709 ac->ac_alloc_slot = osb->slot_num;
@@ -729,10 +717,12 @@ bail:
729 iput(local_alloc_inode); 717 iput(local_alloc_inode);
730 } 718 }
731 719
732 mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num, 720 trace_ocfs2_reserve_local_alloc_bits(
733 status); 721 (unsigned long long)ac->ac_max_block,
722 bits_wanted, osb->slot_num, status);
734 723
735 mlog_exit(status); 724 if (status)
725 mlog_errno(status);
736 return status; 726 return status;
737} 727}
738 728
@@ -749,7 +739,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
749 struct ocfs2_dinode *alloc; 739 struct ocfs2_dinode *alloc;
750 struct ocfs2_local_alloc *la; 740 struct ocfs2_local_alloc *la;
751 741
752 mlog_entry_void();
753 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); 742 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
754 743
755 local_alloc_inode = ac->ac_inode; 744 local_alloc_inode = ac->ac_inode;
@@ -788,7 +777,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
788 ocfs2_journal_dirty(handle, osb->local_alloc_bh); 777 ocfs2_journal_dirty(handle, osb->local_alloc_bh);
789 778
790bail: 779bail:
791 mlog_exit(status); 780 if (status)
781 mlog_errno(status);
792 return status; 782 return status;
793} 783}
794 784
@@ -799,13 +789,11 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
799 u32 count = 0; 789 u32 count = 0;
800 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); 790 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
801 791
802 mlog_entry_void();
803
804 buffer = la->la_bitmap; 792 buffer = la->la_bitmap;
805 for (i = 0; i < le16_to_cpu(la->la_size); i++) 793 for (i = 0; i < le16_to_cpu(la->la_size); i++)
806 count += hweight8(buffer[i]); 794 count += hweight8(buffer[i]);
807 795
808 mlog_exit(count); 796 trace_ocfs2_local_alloc_count_bits(count);
809 return count; 797 return count;
810} 798}
811 799
@@ -820,10 +808,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
820 void *bitmap = NULL; 808 void *bitmap = NULL;
821 struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap; 809 struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
822 810
823 mlog_entry("(numbits wanted = %u)\n", *numbits);
824
825 if (!alloc->id1.bitmap1.i_total) { 811 if (!alloc->id1.bitmap1.i_total) {
826 mlog(0, "No bits in my window!\n");
827 bitoff = -1; 812 bitoff = -1;
828 goto bail; 813 goto bail;
829 } 814 }
@@ -883,8 +868,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
883 } 868 }
884 } 869 }
885 870
886 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff, 871 trace_ocfs2_local_alloc_find_clear_bits_search_bitmap(bitoff, numfound);
887 numfound);
888 872
889 if (numfound == *numbits) 873 if (numfound == *numbits)
890 bitoff = startoff - numfound; 874 bitoff = startoff - numfound;
@@ -895,7 +879,10 @@ bail:
895 if (local_resv) 879 if (local_resv)
896 ocfs2_resv_discard(resmap, resv); 880 ocfs2_resv_discard(resmap, resv);
897 881
898 mlog_exit(bitoff); 882 trace_ocfs2_local_alloc_find_clear_bits(*numbits,
883 le32_to_cpu(alloc->id1.bitmap1.i_total),
884 bitoff, numfound);
885
899 return bitoff; 886 return bitoff;
900} 887}
901 888
@@ -903,15 +890,12 @@ static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
903{ 890{
904 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); 891 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
905 int i; 892 int i;
906 mlog_entry_void();
907 893
908 alloc->id1.bitmap1.i_total = 0; 894 alloc->id1.bitmap1.i_total = 0;
909 alloc->id1.bitmap1.i_used = 0; 895 alloc->id1.bitmap1.i_used = 0;
910 la->la_bm_off = 0; 896 la->la_bm_off = 0;
911 for(i = 0; i < le16_to_cpu(la->la_size); i++) 897 for(i = 0; i < le16_to_cpu(la->la_size); i++)
912 la->la_bitmap[i] = 0; 898 la->la_bitmap[i] = 0;
913
914 mlog_exit_void();
915} 899}
916 900
917#if 0 901#if 0
@@ -952,18 +936,16 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
952 void *bitmap; 936 void *bitmap;
953 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); 937 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
954 938
955 mlog_entry("total = %u, used = %u\n", 939 trace_ocfs2_sync_local_to_main(
956 le32_to_cpu(alloc->id1.bitmap1.i_total), 940 le32_to_cpu(alloc->id1.bitmap1.i_total),
957 le32_to_cpu(alloc->id1.bitmap1.i_used)); 941 le32_to_cpu(alloc->id1.bitmap1.i_used));
958 942
959 if (!alloc->id1.bitmap1.i_total) { 943 if (!alloc->id1.bitmap1.i_total) {
960 mlog(0, "nothing to sync!\n");
961 goto bail; 944 goto bail;
962 } 945 }
963 946
964 if (le32_to_cpu(alloc->id1.bitmap1.i_used) == 947 if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
965 le32_to_cpu(alloc->id1.bitmap1.i_total)) { 948 le32_to_cpu(alloc->id1.bitmap1.i_total)) {
966 mlog(0, "all bits were taken!\n");
967 goto bail; 949 goto bail;
968 } 950 }
969 951
@@ -985,8 +967,7 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
985 ocfs2_clusters_to_blocks(osb->sb, 967 ocfs2_clusters_to_blocks(osb->sb,
986 start - count); 968 start - count);
987 969
988 mlog(0, "freeing %u bits starting at local alloc bit " 970 trace_ocfs2_sync_local_to_main_free(
989 "%u (la_start_blk = %llu, blkno = %llu)\n",
990 count, start - count, 971 count, start - count,
991 (unsigned long long)la_start_blk, 972 (unsigned long long)la_start_blk,
992 (unsigned long long)blkno); 973 (unsigned long long)blkno);
@@ -1007,7 +988,8 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
1007 } 988 }
1008 989
1009bail: 990bail:
1010 mlog_exit(status); 991 if (status)
992 mlog_errno(status);
1011 return status; 993 return status;
1012} 994}
1013 995
@@ -1132,7 +1114,8 @@ bail:
1132 *ac = NULL; 1114 *ac = NULL;
1133 } 1115 }
1134 1116
1135 mlog_exit(status); 1117 if (status)
1118 mlog_errno(status);
1136 return status; 1119 return status;
1137} 1120}
1138 1121
@@ -1148,17 +1131,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
1148 struct ocfs2_dinode *alloc = NULL; 1131 struct ocfs2_dinode *alloc = NULL;
1149 struct ocfs2_local_alloc *la; 1132 struct ocfs2_local_alloc *la;
1150 1133
1151 mlog_entry_void();
1152
1153 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; 1134 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
1154 la = OCFS2_LOCAL_ALLOC(alloc); 1135 la = OCFS2_LOCAL_ALLOC(alloc);
1155 1136
1156 if (alloc->id1.bitmap1.i_total) 1137 trace_ocfs2_local_alloc_new_window(
1157 mlog(0, "asking me to alloc a new window over a non-empty " 1138 le32_to_cpu(alloc->id1.bitmap1.i_total),
1158 "one\n"); 1139 osb->local_alloc_bits);
1159
1160 mlog(0, "Allocating %u clusters for a new window.\n",
1161 osb->local_alloc_bits);
1162 1140
1163 /* Instruct the allocation code to try the most recently used 1141 /* Instruct the allocation code to try the most recently used
1164 * cluster group. We'll re-record the group used this pass 1142 * cluster group. We'll re-record the group used this pass
@@ -1220,13 +1198,13 @@ retry_enospc:
1220 ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count, 1198 ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
1221 OCFS2_LOCAL_ALLOC(alloc)->la_bitmap); 1199 OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
1222 1200
1223 mlog(0, "New window allocated:\n"); 1201 trace_ocfs2_local_alloc_new_window_result(
1224 mlog(0, "window la_bm_off = %u\n", 1202 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off,
1225 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); 1203 le32_to_cpu(alloc->id1.bitmap1.i_total));
1226 mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
1227 1204
1228bail: 1205bail:
1229 mlog_exit(status); 1206 if (status)
1207 mlog_errno(status);
1230 return status; 1208 return status;
1231} 1209}
1232 1210
@@ -1243,8 +1221,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1243 struct ocfs2_dinode *alloc_copy = NULL; 1221 struct ocfs2_dinode *alloc_copy = NULL;
1244 struct ocfs2_alloc_context *ac = NULL; 1222 struct ocfs2_alloc_context *ac = NULL;
1245 1223
1246 mlog_entry_void();
1247
1248 ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE); 1224 ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
1249 1225
1250 /* This will lock the main bitmap for us. */ 1226 /* This will lock the main bitmap for us. */
@@ -1324,7 +1300,8 @@ bail:
1324 if (ac) 1300 if (ac)
1325 ocfs2_free_alloc_context(ac); 1301 ocfs2_free_alloc_context(ac);
1326 1302
1327 mlog_exit(status); 1303 if (status)
1304 mlog_errno(status);
1328 return status; 1305 return status;
1329} 1306}
1330 1307
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index b5cb3ede9408..e57c804069ea 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -26,7 +26,6 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/fcntl.h> 27#include <linux/fcntl.h>
28 28
29#define MLOG_MASK_PREFIX ML_INODE
30#include <cluster/masklog.h> 29#include <cluster/masklog.h>
31 30
32#include "ocfs2.h" 31#include "ocfs2.h"
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 7e32db9c2c99..3e9393ca39eb 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -31,7 +31,6 @@
31#include <linux/signal.h> 31#include <linux/signal.h>
32#include <linux/rbtree.h> 32#include <linux/rbtree.h>
33 33
34#define MLOG_MASK_PREFIX ML_FILE_IO
35#include <cluster/masklog.h> 34#include <cluster/masklog.h>
36 35
37#include "ocfs2.h" 36#include "ocfs2.h"
@@ -42,6 +41,7 @@
42#include "inode.h" 41#include "inode.h"
43#include "mmap.h" 42#include "mmap.h"
44#include "super.h" 43#include "super.h"
44#include "ocfs2_trace.h"
45 45
46 46
47static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) 47static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
@@ -49,13 +49,12 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
49 sigset_t oldset; 49 sigset_t oldset;
50 int ret; 50 int ret;
51 51
52 mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
53
54 ocfs2_block_signals(&oldset); 52 ocfs2_block_signals(&oldset);
55 ret = filemap_fault(area, vmf); 53 ret = filemap_fault(area, vmf);
56 ocfs2_unblock_signals(&oldset); 54 ocfs2_unblock_signals(&oldset);
57 55
58 mlog_exit_ptr(vmf->page); 56 trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno,
57 area, vmf->page, vmf->pgoff);
59 return ret; 58 return ret;
60} 59}
61 60
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 849fb4a2e814..e5d738cd9cc0 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -42,7 +42,6 @@
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44 44
45#define MLOG_MASK_PREFIX ML_NAMEI
46#include <cluster/masklog.h> 45#include <cluster/masklog.h>
47 46
48#include "ocfs2.h" 47#include "ocfs2.h"
@@ -63,6 +62,7 @@
63#include "uptodate.h" 62#include "uptodate.h"
64#include "xattr.h" 63#include "xattr.h"
65#include "acl.h" 64#include "acl.h"
65#include "ocfs2_trace.h"
66 66
67#include "buffer_head_io.h" 67#include "buffer_head_io.h"
68 68
@@ -106,17 +106,15 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
106 struct dentry *ret; 106 struct dentry *ret;
107 struct ocfs2_inode_info *oi; 107 struct ocfs2_inode_info *oi;
108 108
109 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, 109 trace_ocfs2_lookup(dir, dentry, dentry->d_name.len,
110 dentry->d_name.len, dentry->d_name.name); 110 dentry->d_name.name,
111 (unsigned long long)OCFS2_I(dir)->ip_blkno, 0);
111 112
112 if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) { 113 if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
113 ret = ERR_PTR(-ENAMETOOLONG); 114 ret = ERR_PTR(-ENAMETOOLONG);
114 goto bail; 115 goto bail;
115 } 116 }
116 117
117 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
118 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
119
120 status = ocfs2_inode_lock_nested(dir, NULL, 0, OI_LS_PARENT); 118 status = ocfs2_inode_lock_nested(dir, NULL, 0, OI_LS_PARENT);
121 if (status < 0) { 119 if (status < 0) {
122 if (status != -ENOENT) 120 if (status != -ENOENT)
@@ -182,7 +180,7 @@ bail_unlock:
182 180
183bail: 181bail:
184 182
185 mlog_exit_ptr(ret); 183 trace_ocfs2_lookup_ret(ret);
186 184
187 return ret; 185 return ret;
188} 186}
@@ -235,9 +233,9 @@ static int ocfs2_mknod(struct inode *dir,
235 sigset_t oldset; 233 sigset_t oldset;
236 int did_block_signals = 0; 234 int did_block_signals = 0;
237 235
238 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 236 trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
239 (unsigned long)dev, dentry->d_name.len, 237 (unsigned long long)OCFS2_I(dir)->ip_blkno,
240 dentry->d_name.name); 238 (unsigned long)dev, mode);
241 239
242 dquot_initialize(dir); 240 dquot_initialize(dir);
243 241
@@ -293,7 +291,7 @@ static int ocfs2_mknod(struct inode *dir,
293 } 291 }
294 292
295 /* get security xattr */ 293 /* get security xattr */
296 status = ocfs2_init_security_get(inode, dir, &si); 294 status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
297 if (status) { 295 if (status) {
298 if (status == -EOPNOTSUPP) 296 if (status == -EOPNOTSUPP)
299 si.enable = 0; 297 si.enable = 0;
@@ -354,10 +352,6 @@ static int ocfs2_mknod(struct inode *dir,
354 goto leave; 352 goto leave;
355 did_quota_inode = 1; 353 did_quota_inode = 1;
356 354
357 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
358 inode->i_mode, (unsigned long)dev, dentry->d_name.len,
359 dentry->d_name.name);
360
361 /* do the real work now. */ 355 /* do the real work now. */
362 status = ocfs2_mknod_locked(osb, dir, inode, dev, 356 status = ocfs2_mknod_locked(osb, dir, inode, dev,
363 &new_fe_bh, parent_fe_bh, handle, 357 &new_fe_bh, parent_fe_bh, handle,
@@ -436,9 +430,6 @@ leave:
436 if (did_block_signals) 430 if (did_block_signals)
437 ocfs2_unblock_signals(&oldset); 431 ocfs2_unblock_signals(&oldset);
438 432
439 if (status == -ENOSPC)
440 mlog(0, "Disk is full\n");
441
442 brelse(new_fe_bh); 433 brelse(new_fe_bh);
443 brelse(parent_fe_bh); 434 brelse(parent_fe_bh);
444 kfree(si.name); 435 kfree(si.name);
@@ -466,7 +457,8 @@ leave:
466 iput(inode); 457 iput(inode);
467 } 458 }
468 459
469 mlog_exit(status); 460 if (status)
461 mlog_errno(status);
470 462
471 return status; 463 return status;
472} 464}
@@ -577,7 +569,8 @@ leave:
577 } 569 }
578 } 570 }
579 571
580 mlog_exit(status); 572 if (status)
573 mlog_errno(status);
581 return status; 574 return status;
582} 575}
583 576
@@ -615,10 +608,11 @@ static int ocfs2_mkdir(struct inode *dir,
615{ 608{
616 int ret; 609 int ret;
617 610
618 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, 611 trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name,
619 dentry->d_name.len, dentry->d_name.name); 612 OCFS2_I(dir)->ip_blkno, mode);
620 ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0); 613 ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
621 mlog_exit(ret); 614 if (ret)
615 mlog_errno(ret);
622 616
623 return ret; 617 return ret;
624} 618}
@@ -630,10 +624,11 @@ static int ocfs2_create(struct inode *dir,
630{ 624{
631 int ret; 625 int ret;
632 626
633 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, 627 trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name,
634 dentry->d_name.len, dentry->d_name.name); 628 (unsigned long long)OCFS2_I(dir)->ip_blkno, mode);
635 ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0); 629 ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
636 mlog_exit(ret); 630 if (ret)
631 mlog_errno(ret);
637 632
638 return ret; 633 return ret;
639} 634}
@@ -652,9 +647,9 @@ static int ocfs2_link(struct dentry *old_dentry,
652 struct ocfs2_dir_lookup_result lookup = { NULL, }; 647 struct ocfs2_dir_lookup_result lookup = { NULL, };
653 sigset_t oldset; 648 sigset_t oldset;
654 649
655 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, 650 trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno,
656 old_dentry->d_name.len, old_dentry->d_name.name, 651 old_dentry->d_name.len, old_dentry->d_name.name,
657 dentry->d_name.len, dentry->d_name.name); 652 dentry->d_name.len, dentry->d_name.name);
658 653
659 if (S_ISDIR(inode->i_mode)) 654 if (S_ISDIR(inode->i_mode))
660 return -EPERM; 655 return -EPERM;
@@ -757,7 +752,8 @@ out:
757 752
758 ocfs2_free_dir_lookup_result(&lookup); 753 ocfs2_free_dir_lookup_result(&lookup);
759 754
760 mlog_exit(err); 755 if (err)
756 mlog_errno(err);
761 757
762 return err; 758 return err;
763} 759}
@@ -809,19 +805,17 @@ static int ocfs2_unlink(struct inode *dir,
809 struct ocfs2_dir_lookup_result lookup = { NULL, }; 805 struct ocfs2_dir_lookup_result lookup = { NULL, };
810 struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; 806 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
811 807
812 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, 808 trace_ocfs2_unlink(dir, dentry, dentry->d_name.len,
813 dentry->d_name.len, dentry->d_name.name); 809 dentry->d_name.name,
810 (unsigned long long)OCFS2_I(dir)->ip_blkno,
811 (unsigned long long)OCFS2_I(inode)->ip_blkno);
814 812
815 dquot_initialize(dir); 813 dquot_initialize(dir);
816 814
817 BUG_ON(dentry->d_parent->d_inode != dir); 815 BUG_ON(dentry->d_parent->d_inode != dir);
818 816
819 mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); 817 if (inode == osb->root_inode)
820
821 if (inode == osb->root_inode) {
822 mlog(0, "Cannot delete the root directory\n");
823 return -EPERM; 818 return -EPERM;
824 }
825 819
826 status = ocfs2_inode_lock_nested(dir, &parent_node_bh, 1, 820 status = ocfs2_inode_lock_nested(dir, &parent_node_bh, 1,
827 OI_LS_PARENT); 821 OI_LS_PARENT);
@@ -843,9 +837,10 @@ static int ocfs2_unlink(struct inode *dir,
843 if (OCFS2_I(inode)->ip_blkno != blkno) { 837 if (OCFS2_I(inode)->ip_blkno != blkno) {
844 status = -ENOENT; 838 status = -ENOENT;
845 839
846 mlog(0, "ip_blkno %llu != dirent blkno %llu ip_flags = %x\n", 840 trace_ocfs2_unlink_noent(
847 (unsigned long long)OCFS2_I(inode)->ip_blkno, 841 (unsigned long long)OCFS2_I(inode)->ip_blkno,
848 (unsigned long long)blkno, OCFS2_I(inode)->ip_flags); 842 (unsigned long long)blkno,
843 OCFS2_I(inode)->ip_flags);
849 goto leave; 844 goto leave;
850 } 845 }
851 846
@@ -954,7 +949,8 @@ leave:
954 ocfs2_free_dir_lookup_result(&orphan_insert); 949 ocfs2_free_dir_lookup_result(&orphan_insert);
955 ocfs2_free_dir_lookup_result(&lookup); 950 ocfs2_free_dir_lookup_result(&lookup);
956 951
957 mlog_exit(status); 952 if (status)
953 mlog_errno(status);
958 954
959 return status; 955 return status;
960} 956}
@@ -975,9 +971,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
975 struct buffer_head **tmpbh; 971 struct buffer_head **tmpbh;
976 struct inode *tmpinode; 972 struct inode *tmpinode;
977 973
978 mlog_entry("(inode1 = %llu, inode2 = %llu)\n", 974 trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
979 (unsigned long long)oi1->ip_blkno, 975 (unsigned long long)oi2->ip_blkno);
980 (unsigned long long)oi2->ip_blkno);
981 976
982 if (*bh1) 977 if (*bh1)
983 *bh1 = NULL; 978 *bh1 = NULL;
@@ -988,7 +983,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
988 if (oi1->ip_blkno != oi2->ip_blkno) { 983 if (oi1->ip_blkno != oi2->ip_blkno) {
989 if (oi1->ip_blkno < oi2->ip_blkno) { 984 if (oi1->ip_blkno < oi2->ip_blkno) {
990 /* switch id1 and id2 around */ 985 /* switch id1 and id2 around */
991 mlog(0, "switching them around...\n");
992 tmpbh = bh2; 986 tmpbh = bh2;
993 bh2 = bh1; 987 bh2 = bh1;
994 bh1 = tmpbh; 988 bh1 = tmpbh;
@@ -1024,8 +1018,13 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1024 mlog_errno(status); 1018 mlog_errno(status);
1025 } 1019 }
1026 1020
1021 trace_ocfs2_double_lock_end(
1022 (unsigned long long)OCFS2_I(inode1)->ip_blkno,
1023 (unsigned long long)OCFS2_I(inode2)->ip_blkno);
1024
1027bail: 1025bail:
1028 mlog_exit(status); 1026 if (status)
1027 mlog_errno(status);
1029 return status; 1028 return status;
1030} 1029}
1031 1030
@@ -1067,10 +1066,9 @@ static int ocfs2_rename(struct inode *old_dir,
1067 /* At some point it might be nice to break this function up a 1066 /* At some point it might be nice to break this function up a
1068 * bit. */ 1067 * bit. */
1069 1068
1070 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n", 1069 trace_ocfs2_rename(old_dir, old_dentry, new_dir, new_dentry,
1071 old_dir, old_dentry, new_dir, new_dentry, 1070 old_dentry->d_name.len, old_dentry->d_name.name,
1072 old_dentry->d_name.len, old_dentry->d_name.name, 1071 new_dentry->d_name.len, new_dentry->d_name.name);
1073 new_dentry->d_name.len, new_dentry->d_name.name);
1074 1072
1075 dquot_initialize(old_dir); 1073 dquot_initialize(old_dir);
1076 dquot_initialize(new_dir); 1074 dquot_initialize(new_dir);
@@ -1227,16 +1225,15 @@ static int ocfs2_rename(struct inode *old_dir,
1227 if (!new_inode) { 1225 if (!new_inode) {
1228 status = -EACCES; 1226 status = -EACCES;
1229 1227
1230 mlog(0, "We found an inode for name %.*s but VFS " 1228 trace_ocfs2_rename_target_exists(new_dentry->d_name.len,
1231 "didn't give us one.\n", new_dentry->d_name.len, 1229 new_dentry->d_name.name);
1232 new_dentry->d_name.name);
1233 goto bail; 1230 goto bail;
1234 } 1231 }
1235 1232
1236 if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) { 1233 if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
1237 status = -EACCES; 1234 status = -EACCES;
1238 1235
1239 mlog(0, "Inode %llu and dir %llu disagree. flags = %x\n", 1236 trace_ocfs2_rename_disagree(
1240 (unsigned long long)OCFS2_I(new_inode)->ip_blkno, 1237 (unsigned long long)OCFS2_I(new_inode)->ip_blkno,
1241 (unsigned long long)newfe_blkno, 1238 (unsigned long long)newfe_blkno,
1242 OCFS2_I(new_inode)->ip_flags); 1239 OCFS2_I(new_inode)->ip_flags);
@@ -1259,8 +1256,7 @@ static int ocfs2_rename(struct inode *old_dir,
1259 1256
1260 newfe = (struct ocfs2_dinode *) newfe_bh->b_data; 1257 newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
1261 1258
1262 mlog(0, "aha rename over existing... new_blkno=%llu " 1259 trace_ocfs2_rename_over_existing(
1263 "newfebh=%p bhblocknr=%llu\n",
1264 (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? 1260 (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
1265 (unsigned long long)newfe_bh->b_blocknr : 0ULL); 1261 (unsigned long long)newfe_bh->b_blocknr : 0ULL);
1266 1262
@@ -1476,7 +1472,8 @@ bail:
1476 brelse(old_dir_bh); 1472 brelse(old_dir_bh);
1477 brelse(new_dir_bh); 1473 brelse(new_dir_bh);
1478 1474
1479 mlog_exit(status); 1475 if (status)
1476 mlog_errno(status);
1480 1477
1481 return status; 1478 return status;
1482} 1479}
@@ -1501,9 +1498,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1501 * write i_size + 1 bytes. */ 1498 * write i_size + 1 bytes. */
1502 blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 1499 blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
1503 1500
1504 mlog_entry("i_blocks = %llu, i_size = %llu, blocks = %d\n", 1501 trace_ocfs2_create_symlink_data((unsigned long long)inode->i_blocks,
1505 (unsigned long long)inode->i_blocks, 1502 i_size_read(inode), blocks);
1506 i_size_read(inode), blocks);
1507 1503
1508 /* Sanity check -- make sure we're going to fit. */ 1504 /* Sanity check -- make sure we're going to fit. */
1509 if (bytes_left > 1505 if (bytes_left >
@@ -1579,7 +1575,8 @@ bail:
1579 kfree(bhs); 1575 kfree(bhs);
1580 } 1576 }
1581 1577
1582 mlog_exit(status); 1578 if (status)
1579 mlog_errno(status);
1583 return status; 1580 return status;
1584} 1581}
1585 1582
@@ -1610,8 +1607,8 @@ static int ocfs2_symlink(struct inode *dir,
1610 sigset_t oldset; 1607 sigset_t oldset;
1611 int did_block_signals = 0; 1608 int did_block_signals = 0;
1612 1609
1613 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1610 trace_ocfs2_symlink_begin(dir, dentry, symname,
1614 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1611 dentry->d_name.len, dentry->d_name.name);
1615 1612
1616 dquot_initialize(dir); 1613 dquot_initialize(dir);
1617 1614
@@ -1665,7 +1662,7 @@ static int ocfs2_symlink(struct inode *dir,
1665 } 1662 }
1666 1663
1667 /* get security xattr */ 1664 /* get security xattr */
1668 status = ocfs2_init_security_get(inode, dir, &si); 1665 status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
1669 if (status) { 1666 if (status) {
1670 if (status == -EOPNOTSUPP) 1667 if (status == -EOPNOTSUPP)
1671 si.enable = 0; 1668 si.enable = 0;
@@ -1713,9 +1710,10 @@ static int ocfs2_symlink(struct inode *dir,
1713 goto bail; 1710 goto bail;
1714 did_quota_inode = 1; 1711 did_quota_inode = 1;
1715 1712
1716 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, 1713 trace_ocfs2_symlink_create(dir, dentry, dentry->d_name.len,
1717 inode->i_mode, dentry->d_name.len, 1714 dentry->d_name.name,
1718 dentry->d_name.name); 1715 (unsigned long long)OCFS2_I(dir)->ip_blkno,
1716 inode->i_mode);
1719 1717
1720 status = ocfs2_mknod_locked(osb, dir, inode, 1718 status = ocfs2_mknod_locked(osb, dir, inode,
1721 0, &new_fe_bh, parent_fe_bh, handle, 1719 0, &new_fe_bh, parent_fe_bh, handle,
@@ -1835,7 +1833,8 @@ bail:
1835 iput(inode); 1833 iput(inode);
1836 } 1834 }
1837 1835
1838 mlog_exit(status); 1836 if (status)
1837 mlog_errno(status);
1839 1838
1840 return status; 1839 return status;
1841} 1840}
@@ -1844,8 +1843,6 @@ static int ocfs2_blkno_stringify(u64 blkno, char *name)
1844{ 1843{
1845 int status, namelen; 1844 int status, namelen;
1846 1845
1847 mlog_entry_void();
1848
1849 namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx", 1846 namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx",
1850 (long long)blkno); 1847 (long long)blkno);
1851 if (namelen <= 0) { 1848 if (namelen <= 0) {
@@ -1862,12 +1859,12 @@ static int ocfs2_blkno_stringify(u64 blkno, char *name)
1862 goto bail; 1859 goto bail;
1863 } 1860 }
1864 1861
1865 mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name, 1862 trace_ocfs2_blkno_stringify(blkno, name, namelen);
1866 namelen);
1867 1863
1868 status = 0; 1864 status = 0;
1869bail: 1865bail:
1870 mlog_exit(status); 1866 if (status < 0)
1867 mlog_errno(status);
1871 return status; 1868 return status;
1872} 1869}
1873 1870
@@ -1980,7 +1977,8 @@ out:
1980 iput(orphan_dir_inode); 1977 iput(orphan_dir_inode);
1981 } 1978 }
1982 1979
1983 mlog_exit(ret); 1980 if (ret)
1981 mlog_errno(ret);
1984 return ret; 1982 return ret;
1985} 1983}
1986 1984
@@ -1997,7 +1995,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1997 struct ocfs2_dinode *orphan_fe; 1995 struct ocfs2_dinode *orphan_fe;
1998 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 1996 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1999 1997
2000 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1998 trace_ocfs2_orphan_add_begin(
1999 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2001 2000
2002 status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh); 2001 status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
2003 if (status < 0) { 2002 if (status < 0) {
@@ -2056,13 +2055,14 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2056 2055
2057 ocfs2_journal_dirty(handle, fe_bh); 2056 ocfs2_journal_dirty(handle, fe_bh);
2058 2057
2059 mlog(0, "Inode %llu orphaned in slot %d\n", 2058 trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
2060 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 2059 osb->slot_num);
2061 2060
2062leave: 2061leave:
2063 brelse(orphan_dir_bh); 2062 brelse(orphan_dir_bh);
2064 2063
2065 mlog_exit(status); 2064 if (status)
2065 mlog_errno(status);
2066 return status; 2066 return status;
2067} 2067}
2068 2068
@@ -2078,17 +2078,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2078 int status = 0; 2078 int status = 0;
2079 struct ocfs2_dir_lookup_result lookup = { NULL, }; 2079 struct ocfs2_dir_lookup_result lookup = { NULL, };
2080 2080
2081 mlog_entry_void();
2082
2083 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); 2081 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
2084 if (status < 0) { 2082 if (status < 0) {
2085 mlog_errno(status); 2083 mlog_errno(status);
2086 goto leave; 2084 goto leave;
2087 } 2085 }
2088 2086
2089 mlog(0, "removing '%s' from orphan dir %llu (namelen=%d)\n", 2087 trace_ocfs2_orphan_del(
2090 name, (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, 2088 (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
2091 OCFS2_ORPHAN_NAMELEN); 2089 name, OCFS2_ORPHAN_NAMELEN);
2092 2090
2093 /* find it's spot in the orphan directory */ 2091 /* find it's spot in the orphan directory */
2094 status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode, 2092 status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
@@ -2124,12 +2122,13 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2124leave: 2122leave:
2125 ocfs2_free_dir_lookup_result(&lookup); 2123 ocfs2_free_dir_lookup_result(&lookup);
2126 2124
2127 mlog_exit(status); 2125 if (status)
2126 mlog_errno(status);
2128 return status; 2127 return status;
2129} 2128}
2130 2129
2131/** 2130/**
2132 * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to recieve a newly 2131 * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to receive a newly
2133 * allocated file. This is different from the typical 'add to orphan dir' 2132 * allocated file. This is different from the typical 'add to orphan dir'
2134 * operation in that the inode does not yet exist. This is a problem because 2133 * operation in that the inode does not yet exist. This is a problem because
2135 * the orphan dir stringifies the inode block number to come up with it's 2134 * the orphan dir stringifies the inode block number to come up with it's
@@ -2321,9 +2320,6 @@ leave:
2321 iput(orphan_dir); 2320 iput(orphan_dir);
2322 } 2321 }
2323 2322
2324 if (status == -ENOSPC)
2325 mlog(0, "Disk is full\n");
2326
2327 if ((status < 0) && inode) { 2323 if ((status < 0) && inode) {
2328 clear_nlink(inode); 2324 clear_nlink(inode);
2329 iput(inode); 2325 iput(inode);
@@ -2358,8 +2354,10 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2358 struct buffer_head *di_bh = NULL; 2354 struct buffer_head *di_bh = NULL;
2359 struct ocfs2_dir_lookup_result lookup = { NULL, }; 2355 struct ocfs2_dir_lookup_result lookup = { NULL, };
2360 2356
2361 mlog_entry("(0x%p, 0x%p, %.*s')\n", dir, dentry, 2357 trace_ocfs2_mv_orphaned_inode_to_new(dir, dentry,
2362 dentry->d_name.len, dentry->d_name.name); 2358 dentry->d_name.len, dentry->d_name.name,
2359 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2360 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2363 2361
2364 status = ocfs2_inode_lock(dir, &parent_di_bh, 1); 2362 status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
2365 if (status < 0) { 2363 if (status < 0) {
@@ -2476,7 +2474,8 @@ leave:
2476 2474
2477 ocfs2_free_dir_lookup_result(&lookup); 2475 ocfs2_free_dir_lookup_result(&lookup);
2478 2476
2479 mlog_exit(status); 2477 if (status)
2478 mlog_errno(status);
2480 2479
2481 return status; 2480 return status;
2482} 2481}
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 51cd6898e7f1..409285854f64 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -147,6 +147,17 @@ struct ocfs2_lock_res_ops;
147 147
148typedef void (*ocfs2_lock_callback)(int status, unsigned long data); 148typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
149 149
150#ifdef CONFIG_OCFS2_FS_STATS
151struct ocfs2_lock_stats {
152 u64 ls_total; /* Total wait in NSEC */
153 u32 ls_gets; /* Num acquires */
154 u32 ls_fail; /* Num failed acquires */
155
156 /* Storing max wait in usecs saves 24 bytes per inode */
157 u32 ls_max; /* Max wait in USEC */
158};
159#endif
160
150struct ocfs2_lock_res { 161struct ocfs2_lock_res {
151 void *l_priv; 162 void *l_priv;
152 struct ocfs2_lock_res_ops *l_ops; 163 struct ocfs2_lock_res_ops *l_ops;
@@ -182,15 +193,9 @@ struct ocfs2_lock_res {
182 struct list_head l_debug_list; 193 struct list_head l_debug_list;
183 194
184#ifdef CONFIG_OCFS2_FS_STATS 195#ifdef CONFIG_OCFS2_FS_STATS
185 unsigned long long l_lock_num_prmode; /* PR acquires */ 196 struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */
186 unsigned long long l_lock_num_exmode; /* EX acquires */ 197 u32 l_lock_refresh; /* Disk refreshes */
187 unsigned int l_lock_num_prmode_failed; /* Failed PR gets */ 198 struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */
188 unsigned int l_lock_num_exmode_failed; /* Failed EX gets */
189 unsigned long long l_lock_total_prmode; /* Tot wait for PR */
190 unsigned long long l_lock_total_exmode; /* Tot wait for EX */
191 unsigned int l_lock_max_prmode; /* Max wait for PR */
192 unsigned int l_lock_max_exmode; /* Max wait for EX */
193 unsigned int l_lock_refresh; /* Disk refreshes */
194#endif 199#endif
195#ifdef CONFIG_DEBUG_LOCK_ALLOC 200#ifdef CONFIG_DEBUG_LOCK_ALLOC
196 struct lockdep_map l_lockdep_map; 201 struct lockdep_map l_lockdep_map;
@@ -831,18 +836,18 @@ static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
831 836
832static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) 837static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
833{ 838{
834 ext2_set_bit(bit, bitmap); 839 __test_and_set_bit_le(bit, bitmap);
835} 840}
836#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) 841#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
837 842
838static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) 843static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
839{ 844{
840 ext2_clear_bit(bit, bitmap); 845 __test_and_clear_bit_le(bit, bitmap);
841} 846}
842#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) 847#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
843 848
844#define ocfs2_test_bit ext2_test_bit 849#define ocfs2_test_bit test_bit_le
845#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 850#define ocfs2_find_next_zero_bit find_next_zero_bit_le
846#define ocfs2_find_next_bit ext2_find_next_bit 851#define ocfs2_find_next_bit find_next_bit_le
847#endif /* OCFS2_H */ 852#endif /* OCFS2_H */
848 853
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index bf2e7764920e..b68f87a83924 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -441,7 +441,7 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
441struct ocfs2_block_check { 441struct ocfs2_block_check {
442/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */ 442/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */
443 __le16 bc_ecc; /* Single-error-correction parity vector. 443 __le16 bc_ecc; /* Single-error-correction parity vector.
444 This is a simple Hamming code dependant 444 This is a simple Hamming code dependent
445 on the blocksize. OCFS2's maximum 445 on the blocksize. OCFS2's maximum
446 blocksize, 4K, requires 16 parity bits, 446 blocksize, 4K, requires 16 parity bits,
447 so we fit in __le16. */ 447 so we fit in __le16. */
@@ -750,7 +750,7 @@ struct ocfs2_dinode {
750 after an unclean 750 after an unclean
751 shutdown */ 751 shutdown */
752 } journal1; 752 } journal1;
753 } id1; /* Inode type dependant 1 */ 753 } id1; /* Inode type dependent 1 */
754/*C0*/ union { 754/*C0*/ union {
755 struct ocfs2_super_block i_super; 755 struct ocfs2_super_block i_super;
756 struct ocfs2_local_alloc i_lab; 756 struct ocfs2_local_alloc i_lab;
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
new file mode 100644
index 000000000000..a1dae5bb54ac
--- /dev/null
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -0,0 +1,2739 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM ocfs2
3
4#if !defined(_TRACE_OCFS2_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_OCFS2_H
6
7#include <linux/tracepoint.h>
8
9DECLARE_EVENT_CLASS(ocfs2__int,
10 TP_PROTO(int num),
11 TP_ARGS(num),
12 TP_STRUCT__entry(
13 __field(int, num)
14 ),
15 TP_fast_assign(
16 __entry->num = num;
17 ),
18 TP_printk("%d", __entry->num)
19);
20
21#define DEFINE_OCFS2_INT_EVENT(name) \
22DEFINE_EVENT(ocfs2__int, name, \
23 TP_PROTO(int num), \
24 TP_ARGS(num))
25
26DECLARE_EVENT_CLASS(ocfs2__uint,
27 TP_PROTO(unsigned int num),
28 TP_ARGS(num),
29 TP_STRUCT__entry(
30 __field( unsigned int, num )
31 ),
32 TP_fast_assign(
33 __entry->num = num;
34 ),
35 TP_printk("%u", __entry->num)
36);
37
38#define DEFINE_OCFS2_UINT_EVENT(name) \
39DEFINE_EVENT(ocfs2__uint, name, \
40 TP_PROTO(unsigned int num), \
41 TP_ARGS(num))
42
43DECLARE_EVENT_CLASS(ocfs2__ull,
44 TP_PROTO(unsigned long long blkno),
45 TP_ARGS(blkno),
46 TP_STRUCT__entry(
47 __field(unsigned long long, blkno)
48 ),
49 TP_fast_assign(
50 __entry->blkno = blkno;
51 ),
52 TP_printk("%llu", __entry->blkno)
53);
54
55#define DEFINE_OCFS2_ULL_EVENT(name) \
56DEFINE_EVENT(ocfs2__ull, name, \
57 TP_PROTO(unsigned long long num), \
58 TP_ARGS(num))
59
60DECLARE_EVENT_CLASS(ocfs2__pointer,
61 TP_PROTO(void *pointer),
62 TP_ARGS(pointer),
63 TP_STRUCT__entry(
64 __field(void *, pointer)
65 ),
66 TP_fast_assign(
67 __entry->pointer = pointer;
68 ),
69 TP_printk("%p", __entry->pointer)
70);
71
72#define DEFINE_OCFS2_POINTER_EVENT(name) \
73DEFINE_EVENT(ocfs2__pointer, name, \
74 TP_PROTO(void *pointer), \
75 TP_ARGS(pointer))
76
77DECLARE_EVENT_CLASS(ocfs2__string,
78 TP_PROTO(const char *name),
79 TP_ARGS(name),
80 TP_STRUCT__entry(
81 __string(name,name)
82 ),
83 TP_fast_assign(
84 __assign_str(name, name);
85 ),
86 TP_printk("%s", __get_str(name))
87);
88
89#define DEFINE_OCFS2_STRING_EVENT(name) \
90DEFINE_EVENT(ocfs2__string, name, \
91 TP_PROTO(const char *name), \
92 TP_ARGS(name))
93
94DECLARE_EVENT_CLASS(ocfs2__int_int,
95 TP_PROTO(int value1, int value2),
96 TP_ARGS(value1, value2),
97 TP_STRUCT__entry(
98 __field(int, value1)
99 __field(int, value2)
100 ),
101 TP_fast_assign(
102 __entry->value1 = value1;
103 __entry->value2 = value2;
104 ),
105 TP_printk("%d %d", __entry->value1, __entry->value2)
106);
107
108#define DEFINE_OCFS2_INT_INT_EVENT(name) \
109DEFINE_EVENT(ocfs2__int_int, name, \
110 TP_PROTO(int val1, int val2), \
111 TP_ARGS(val1, val2))
112
113DECLARE_EVENT_CLASS(ocfs2__uint_int,
114 TP_PROTO(unsigned int value1, int value2),
115 TP_ARGS(value1, value2),
116 TP_STRUCT__entry(
117 __field(unsigned int, value1)
118 __field(int, value2)
119 ),
120 TP_fast_assign(
121 __entry->value1 = value1;
122 __entry->value2 = value2;
123 ),
124 TP_printk("%u %d", __entry->value1, __entry->value2)
125);
126
127#define DEFINE_OCFS2_UINT_INT_EVENT(name) \
128DEFINE_EVENT(ocfs2__uint_int, name, \
129 TP_PROTO(unsigned int val1, int val2), \
130 TP_ARGS(val1, val2))
131
132DECLARE_EVENT_CLASS(ocfs2__uint_uint,
133 TP_PROTO(unsigned int value1, unsigned int value2),
134 TP_ARGS(value1, value2),
135 TP_STRUCT__entry(
136 __field(unsigned int, value1)
137 __field(unsigned int, value2)
138 ),
139 TP_fast_assign(
140 __entry->value1 = value1;
141 __entry->value2 = value2;
142 ),
143 TP_printk("%u %u", __entry->value1, __entry->value2)
144);
145
146#define DEFINE_OCFS2_UINT_UINT_EVENT(name) \
147DEFINE_EVENT(ocfs2__uint_uint, name, \
148 TP_PROTO(unsigned int val1, unsigned int val2), \
149 TP_ARGS(val1, val2))
150
151DECLARE_EVENT_CLASS(ocfs2__ull_uint,
152 TP_PROTO(unsigned long long value1, unsigned int value2),
153 TP_ARGS(value1, value2),
154 TP_STRUCT__entry(
155 __field(unsigned long long, value1)
156 __field(unsigned int, value2)
157 ),
158 TP_fast_assign(
159 __entry->value1 = value1;
160 __entry->value2 = value2;
161 ),
162 TP_printk("%llu %u", __entry->value1, __entry->value2)
163);
164
165#define DEFINE_OCFS2_ULL_UINT_EVENT(name) \
166DEFINE_EVENT(ocfs2__ull_uint, name, \
167 TP_PROTO(unsigned long long val1, unsigned int val2), \
168 TP_ARGS(val1, val2))
169
170DECLARE_EVENT_CLASS(ocfs2__ull_int,
171 TP_PROTO(unsigned long long value1, int value2),
172 TP_ARGS(value1, value2),
173 TP_STRUCT__entry(
174 __field(unsigned long long, value1)
175 __field(int, value2)
176 ),
177 TP_fast_assign(
178 __entry->value1 = value1;
179 __entry->value2 = value2;
180 ),
181 TP_printk("%llu %d", __entry->value1, __entry->value2)
182);
183
184#define DEFINE_OCFS2_ULL_INT_EVENT(name) \
185DEFINE_EVENT(ocfs2__ull_int, name, \
186 TP_PROTO(unsigned long long val1, int val2), \
187 TP_ARGS(val1, val2))
188
189DECLARE_EVENT_CLASS(ocfs2__ull_ull,
190 TP_PROTO(unsigned long long value1, unsigned long long value2),
191 TP_ARGS(value1, value2),
192 TP_STRUCT__entry(
193 __field(unsigned long long, value1)
194 __field(unsigned long long, value2)
195 ),
196 TP_fast_assign(
197 __entry->value1 = value1;
198 __entry->value2 = value2;
199 ),
200 TP_printk("%llu %llu", __entry->value1, __entry->value2)
201);
202
203#define DEFINE_OCFS2_ULL_ULL_EVENT(name) \
204DEFINE_EVENT(ocfs2__ull_ull, name, \
205 TP_PROTO(unsigned long long val1, unsigned long long val2), \
206 TP_ARGS(val1, val2))
207
208DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint,
209 TP_PROTO(unsigned long long value1,
210 unsigned long long value2, unsigned int value3),
211 TP_ARGS(value1, value2, value3),
212 TP_STRUCT__entry(
213 __field(unsigned long long, value1)
214 __field(unsigned long long, value2)
215 __field(unsigned int, value3)
216 ),
217 TP_fast_assign(
218 __entry->value1 = value1;
219 __entry->value2 = value2;
220 __entry->value3 = value3;
221 ),
222 TP_printk("%llu %llu %u",
223 __entry->value1, __entry->value2, __entry->value3)
224);
225
226#define DEFINE_OCFS2_ULL_ULL_UINT_EVENT(name) \
227DEFINE_EVENT(ocfs2__ull_ull_uint, name, \
228 TP_PROTO(unsigned long long val1, \
229 unsigned long long val2, unsigned int val3), \
230 TP_ARGS(val1, val2, val3))
231
232DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint,
233 TP_PROTO(unsigned long long value1,
234 unsigned int value2, unsigned int value3),
235 TP_ARGS(value1, value2, value3),
236 TP_STRUCT__entry(
237 __field(unsigned long long, value1)
238 __field(unsigned int, value2)
239 __field(unsigned int, value3)
240 ),
241 TP_fast_assign(
242 __entry->value1 = value1;
243 __entry->value2 = value2;
244 __entry->value3 = value3;
245 ),
246 TP_printk("%llu %u %u", __entry->value1,
247 __entry->value2, __entry->value3)
248);
249
250#define DEFINE_OCFS2_ULL_UINT_UINT_EVENT(name) \
251DEFINE_EVENT(ocfs2__ull_uint_uint, name, \
252 TP_PROTO(unsigned long long val1, \
253 unsigned int val2, unsigned int val3), \
254 TP_ARGS(val1, val2, val3))
255
256DECLARE_EVENT_CLASS(ocfs2__uint_uint_uint,
257 TP_PROTO(unsigned int value1, unsigned int value2,
258 unsigned int value3),
259 TP_ARGS(value1, value2, value3),
260 TP_STRUCT__entry(
261 __field( unsigned int, value1 )
262 __field( unsigned int, value2 )
263 __field( unsigned int, value3 )
264 ),
265 TP_fast_assign(
266 __entry->value1 = value1;
267 __entry->value2 = value2;
268 __entry->value3 = value3;
269 ),
270 TP_printk("%u %u %u", __entry->value1, __entry->value2, __entry->value3)
271);
272
273#define DEFINE_OCFS2_UINT_UINT_UINT_EVENT(name) \
274DEFINE_EVENT(ocfs2__uint_uint_uint, name, \
275 TP_PROTO(unsigned int value1, unsigned int value2, \
276 unsigned int value3), \
277 TP_ARGS(value1, value2, value3))
278
279DECLARE_EVENT_CLASS(ocfs2__ull_ull_ull,
280 TP_PROTO(unsigned long long value1,
281 unsigned long long value2, unsigned long long value3),
282 TP_ARGS(value1, value2, value3),
283 TP_STRUCT__entry(
284 __field(unsigned long long, value1)
285 __field(unsigned long long, value2)
286 __field(unsigned long long, value3)
287 ),
288 TP_fast_assign(
289 __entry->value1 = value1;
290 __entry->value2 = value2;
291 __entry->value3 = value3;
292 ),
293 TP_printk("%llu %llu %llu",
294 __entry->value1, __entry->value2, __entry->value3)
295);
296
297#define DEFINE_OCFS2_ULL_ULL_ULL_EVENT(name) \
298DEFINE_EVENT(ocfs2__ull_ull_ull, name, \
299 TP_PROTO(unsigned long long value1, unsigned long long value2, \
300 unsigned long long value3), \
301 TP_ARGS(value1, value2, value3))
302
303DECLARE_EVENT_CLASS(ocfs2__ull_int_int_int,
304 TP_PROTO(unsigned long long ull, int value1, int value2, int value3),
305 TP_ARGS(ull, value1, value2, value3),
306 TP_STRUCT__entry(
307 __field( unsigned long long, ull )
308 __field( int, value1 )
309 __field( int, value2 )
310 __field( int, value3 )
311 ),
312 TP_fast_assign(
313 __entry->ull = ull;
314 __entry->value1 = value1;
315 __entry->value2 = value2;
316 __entry->value3 = value3;
317 ),
318 TP_printk("%llu %d %d %d",
319 __entry->ull, __entry->value1,
320 __entry->value2, __entry->value3)
321);
322
323#define DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(name) \
324DEFINE_EVENT(ocfs2__ull_int_int_int, name, \
325 TP_PROTO(unsigned long long ull, int value1, \
326 int value2, int value3), \
327 TP_ARGS(ull, value1, value2, value3))
328
329DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint_uint,
330 TP_PROTO(unsigned long long ull, unsigned int value1,
331 unsigned int value2, unsigned int value3),
332 TP_ARGS(ull, value1, value2, value3),
333 TP_STRUCT__entry(
334 __field(unsigned long long, ull)
335 __field(unsigned int, value1)
336 __field(unsigned int, value2)
337 __field(unsigned int, value3)
338 ),
339 TP_fast_assign(
340 __entry->ull = ull;
341 __entry->value1 = value1;
342 __entry->value2 = value2;
343 __entry->value3 = value3;
344 ),
345 TP_printk("%llu %u %u %u",
346 __entry->ull, __entry->value1,
347 __entry->value2, __entry->value3)
348);
349
350#define DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(name) \
351DEFINE_EVENT(ocfs2__ull_uint_uint_uint, name, \
352 TP_PROTO(unsigned long long ull, unsigned int value1, \
353 unsigned int value2, unsigned int value3), \
354 TP_ARGS(ull, value1, value2, value3))
355
356DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint_uint,
357 TP_PROTO(unsigned long long value1, unsigned long long value2,
358 unsigned int value3, unsigned int value4),
359 TP_ARGS(value1, value2, value3, value4),
360 TP_STRUCT__entry(
361 __field(unsigned long long, value1)
362 __field(unsigned long long, value2)
363 __field(unsigned int, value3)
364 __field(unsigned int, value4)
365 ),
366 TP_fast_assign(
367 __entry->value1 = value1;
368 __entry->value2 = value2;
369 __entry->value3 = value3;
370 __entry->value4 = value4;
371 ),
372 TP_printk("%llu %llu %u %u",
373 __entry->value1, __entry->value2,
374 __entry->value3, __entry->value4)
375);
376
377#define DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(name) \
378DEFINE_EVENT(ocfs2__ull_ull_uint_uint, name, \
379 TP_PROTO(unsigned long long ull, unsigned long long ull1, \
380 unsigned int value2, unsigned int value3), \
381 TP_ARGS(ull, ull1, value2, value3))
382
383/* Trace events for fs/ocfs2/alloc.c. */
384DECLARE_EVENT_CLASS(ocfs2__btree_ops,
385 TP_PROTO(unsigned long long owner,\
386 unsigned int value1, unsigned int value2),
387 TP_ARGS(owner, value1, value2),
388 TP_STRUCT__entry(
389 __field(unsigned long long, owner)
390 __field(unsigned int, value1)
391 __field(unsigned int, value2)
392 ),
393 TP_fast_assign(
394 __entry->owner = owner;
395 __entry->value1 = value1;
396 __entry->value2 = value2;
397 ),
398 TP_printk("%llu %u %u",
399 __entry->owner, __entry->value1, __entry->value2)
400);
401
402#define DEFINE_OCFS2_BTREE_EVENT(name) \
403DEFINE_EVENT(ocfs2__btree_ops, name, \
404 TP_PROTO(unsigned long long owner, \
405 unsigned int value1, unsigned int value2), \
406 TP_ARGS(owner, value1, value2))
407
408DEFINE_OCFS2_BTREE_EVENT(ocfs2_adjust_rightmost_branch);
409
410DEFINE_OCFS2_BTREE_EVENT(ocfs2_rotate_tree_right);
411
412DEFINE_OCFS2_BTREE_EVENT(ocfs2_append_rec_to_path);
413
414DEFINE_OCFS2_BTREE_EVENT(ocfs2_insert_extent_start);
415
416DEFINE_OCFS2_BTREE_EVENT(ocfs2_add_clusters_in_btree);
417
418DEFINE_OCFS2_INT_EVENT(ocfs2_num_free_extents);
419
420DEFINE_OCFS2_INT_EVENT(ocfs2_complete_edge_insert);
421
422TRACE_EVENT(ocfs2_grow_tree,
423 TP_PROTO(unsigned long long owner, int depth),
424 TP_ARGS(owner, depth),
425 TP_STRUCT__entry(
426 __field(unsigned long long, owner)
427 __field(int, depth)
428 ),
429 TP_fast_assign(
430 __entry->owner = owner;
431 __entry->depth = depth;
432 ),
433 TP_printk("%llu %d", __entry->owner, __entry->depth)
434);
435
436TRACE_EVENT(ocfs2_rotate_subtree,
437 TP_PROTO(int subtree_root, unsigned long long blkno,
438 int depth),
439 TP_ARGS(subtree_root, blkno, depth),
440 TP_STRUCT__entry(
441 __field(int, subtree_root)
442 __field(unsigned long long, blkno)
443 __field(int, depth)
444 ),
445 TP_fast_assign(
446 __entry->subtree_root = subtree_root;
447 __entry->blkno = blkno;
448 __entry->depth = depth;
449 ),
450 TP_printk("%d %llu %d", __entry->subtree_root,
451 __entry->blkno, __entry->depth)
452);
453
454TRACE_EVENT(ocfs2_insert_extent,
455 TP_PROTO(unsigned int ins_appending, unsigned int ins_contig,
456 int ins_contig_index, int free_records, int ins_tree_depth),
457 TP_ARGS(ins_appending, ins_contig, ins_contig_index, free_records,
458 ins_tree_depth),
459 TP_STRUCT__entry(
460 __field(unsigned int, ins_appending)
461 __field(unsigned int, ins_contig)
462 __field(int, ins_contig_index)
463 __field(int, free_records)
464 __field(int, ins_tree_depth)
465 ),
466 TP_fast_assign(
467 __entry->ins_appending = ins_appending;
468 __entry->ins_contig = ins_contig;
469 __entry->ins_contig_index = ins_contig_index;
470 __entry->free_records = free_records;
471 __entry->ins_tree_depth = ins_tree_depth;
472 ),
473 TP_printk("%u %u %d %d %d",
474 __entry->ins_appending, __entry->ins_contig,
475 __entry->ins_contig_index, __entry->free_records,
476 __entry->ins_tree_depth)
477);
478
479TRACE_EVENT(ocfs2_split_extent,
480 TP_PROTO(int split_index, unsigned int c_contig_type,
481 unsigned int c_has_empty_extent,
482 unsigned int c_split_covers_rec),
483 TP_ARGS(split_index, c_contig_type,
484 c_has_empty_extent, c_split_covers_rec),
485 TP_STRUCT__entry(
486 __field(int, split_index)
487 __field(unsigned int, c_contig_type)
488 __field(unsigned int, c_has_empty_extent)
489 __field(unsigned int, c_split_covers_rec)
490 ),
491 TP_fast_assign(
492 __entry->split_index = split_index;
493 __entry->c_contig_type = c_contig_type;
494 __entry->c_has_empty_extent = c_has_empty_extent;
495 __entry->c_split_covers_rec = c_split_covers_rec;
496 ),
497 TP_printk("%d %u %u %u", __entry->split_index, __entry->c_contig_type,
498 __entry->c_has_empty_extent, __entry->c_split_covers_rec)
499);
500
501TRACE_EVENT(ocfs2_remove_extent,
502 TP_PROTO(unsigned long long owner, unsigned int cpos,
503 unsigned int len, int index,
504 unsigned int e_cpos, unsigned int clusters),
505 TP_ARGS(owner, cpos, len, index, e_cpos, clusters),
506 TP_STRUCT__entry(
507 __field(unsigned long long, owner)
508 __field(unsigned int, cpos)
509 __field(unsigned int, len)
510 __field(int, index)
511 __field(unsigned int, e_cpos)
512 __field(unsigned int, clusters)
513 ),
514 TP_fast_assign(
515 __entry->owner = owner;
516 __entry->cpos = cpos;
517 __entry->len = len;
518 __entry->index = index;
519 __entry->e_cpos = e_cpos;
520 __entry->clusters = clusters;
521 ),
522 TP_printk("%llu %u %u %d %u %u",
523 __entry->owner, __entry->cpos, __entry->len, __entry->index,
524 __entry->e_cpos, __entry->clusters)
525);
526
527TRACE_EVENT(ocfs2_commit_truncate,
528 TP_PROTO(unsigned long long ino, unsigned int new_cpos,
529 unsigned int clusters, unsigned int depth),
530 TP_ARGS(ino, new_cpos, clusters, depth),
531 TP_STRUCT__entry(
532 __field(unsigned long long, ino)
533 __field(unsigned int, new_cpos)
534 __field(unsigned int, clusters)
535 __field(unsigned int, depth)
536 ),
537 TP_fast_assign(
538 __entry->ino = ino;
539 __entry->new_cpos = new_cpos;
540 __entry->clusters = clusters;
541 __entry->depth = depth;
542 ),
543 TP_printk("%llu %u %u %u",
544 __entry->ino, __entry->new_cpos,
545 __entry->clusters, __entry->depth)
546);
547
548TRACE_EVENT(ocfs2_validate_extent_block,
549 TP_PROTO(unsigned long long blkno),
550 TP_ARGS(blkno),
551 TP_STRUCT__entry(
552 __field(unsigned long long, blkno)
553 ),
554 TP_fast_assign(
555 __entry->blkno = blkno;
556 ),
557 TP_printk("%llu ", __entry->blkno)
558);
559
560TRACE_EVENT(ocfs2_rotate_leaf,
561 TP_PROTO(unsigned int insert_cpos, int insert_index,
562 int has_empty, int next_free,
563 unsigned int l_count),
564 TP_ARGS(insert_cpos, insert_index, has_empty,
565 next_free, l_count),
566 TP_STRUCT__entry(
567 __field(unsigned int, insert_cpos)
568 __field(int, insert_index)
569 __field(int, has_empty)
570 __field(int, next_free)
571 __field(unsigned int, l_count)
572 ),
573 TP_fast_assign(
574 __entry->insert_cpos = insert_cpos;
575 __entry->insert_index = insert_index;
576 __entry->has_empty = has_empty;
577 __entry->next_free = next_free;
578 __entry->l_count = l_count;
579 ),
580 TP_printk("%u %d %d %d %u", __entry->insert_cpos,
581 __entry->insert_index, __entry->has_empty,
582 __entry->next_free, __entry->l_count)
583);
584
585TRACE_EVENT(ocfs2_add_clusters_in_btree_ret,
586 TP_PROTO(int status, int reason, int err),
587 TP_ARGS(status, reason, err),
588 TP_STRUCT__entry(
589 __field(int, status)
590 __field(int, reason)
591 __field(int, err)
592 ),
593 TP_fast_assign(
594 __entry->status = status;
595 __entry->reason = reason;
596 __entry->err = err;
597 ),
598 TP_printk("%d %d %d", __entry->status,
599 __entry->reason, __entry->err)
600);
601
602TRACE_EVENT(ocfs2_mark_extent_written,
603 TP_PROTO(unsigned long long owner, unsigned int cpos,
604 unsigned int len, unsigned int phys),
605 TP_ARGS(owner, cpos, len, phys),
606 TP_STRUCT__entry(
607 __field(unsigned long long, owner)
608 __field(unsigned int, cpos)
609 __field(unsigned int, len)
610 __field(unsigned int, phys)
611 ),
612 TP_fast_assign(
613 __entry->owner = owner;
614 __entry->cpos = cpos;
615 __entry->len = len;
616 __entry->phys = phys;
617 ),
618 TP_printk("%llu %u %u %u",
619 __entry->owner, __entry->cpos,
620 __entry->len, __entry->phys)
621);
622
623DECLARE_EVENT_CLASS(ocfs2__truncate_log_ops,
624 TP_PROTO(unsigned long long blkno, int index,
625 unsigned int start, unsigned int num),
626 TP_ARGS(blkno, index, start, num),
627 TP_STRUCT__entry(
628 __field(unsigned long long, blkno)
629 __field(int, index)
630 __field(unsigned int, start)
631 __field(unsigned int, num)
632 ),
633 TP_fast_assign(
634 __entry->blkno = blkno;
635 __entry->index = index;
636 __entry->start = start;
637 __entry->num = num;
638 ),
639 TP_printk("%llu %d %u %u",
640 __entry->blkno, __entry->index,
641 __entry->start, __entry->num)
642);
643
644#define DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(name) \
645DEFINE_EVENT(ocfs2__truncate_log_ops, name, \
646 TP_PROTO(unsigned long long blkno, int index, \
647 unsigned int start, unsigned int num), \
648 TP_ARGS(blkno, index, start, num))
649
650DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_truncate_log_append);
651
652DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_replay_truncate_records);
653
654DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_flush_truncate_log);
655
656DEFINE_OCFS2_INT_EVENT(ocfs2_begin_truncate_log_recovery);
657
658DEFINE_OCFS2_INT_EVENT(ocfs2_truncate_log_recovery_num);
659
660DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_complete_truncate_log_recovery);
661
662DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_free_cached_blocks);
663
664DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_cache_cluster_dealloc);
665
666DEFINE_OCFS2_INT_INT_EVENT(ocfs2_run_deallocs);
667
668TRACE_EVENT(ocfs2_cache_block_dealloc,
669 TP_PROTO(int type, int slot, unsigned long long suballoc,
670 unsigned long long blkno, unsigned int bit),
671 TP_ARGS(type, slot, suballoc, blkno, bit),
672 TP_STRUCT__entry(
673 __field(int, type)
674 __field(int, slot)
675 __field(unsigned long long, suballoc)
676 __field(unsigned long long, blkno)
677 __field(unsigned int, bit)
678 ),
679 TP_fast_assign(
680 __entry->type = type;
681 __entry->slot = slot;
682 __entry->suballoc = suballoc;
683 __entry->blkno = blkno;
684 __entry->bit = bit;
685 ),
686 TP_printk("%d %d %llu %llu %u",
687 __entry->type, __entry->slot, __entry->suballoc,
688 __entry->blkno, __entry->bit)
689);
690
691/* End of trace events for fs/ocfs2/alloc.c. */
692
693/* Trace events for fs/ocfs2/localalloc.c. */
694
695DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_la_set_sizes);
696
697DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_alloc_should_use_local);
698
699DEFINE_OCFS2_INT_EVENT(ocfs2_load_local_alloc);
700
701DEFINE_OCFS2_INT_EVENT(ocfs2_begin_local_alloc_recovery);
702
703DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_reserve_local_alloc_bits);
704
705DEFINE_OCFS2_UINT_EVENT(ocfs2_local_alloc_count_bits);
706
707DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits_search_bitmap);
708
709DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits);
710
711DEFINE_OCFS2_INT_INT_EVENT(ocfs2_sync_local_to_main);
712
713TRACE_EVENT(ocfs2_sync_local_to_main_free,
714 TP_PROTO(int count, int bit, unsigned long long start_blk,
715 unsigned long long blkno),
716 TP_ARGS(count, bit, start_blk, blkno),
717 TP_STRUCT__entry(
718 __field(int, count)
719 __field(int, bit)
720 __field(unsigned long long, start_blk)
721 __field(unsigned long long, blkno)
722 ),
723 TP_fast_assign(
724 __entry->count = count;
725 __entry->bit = bit;
726 __entry->start_blk = start_blk;
727 __entry->blkno = blkno;
728 ),
729 TP_printk("%d %d %llu %llu",
730 __entry->count, __entry->bit, __entry->start_blk,
731 __entry->blkno)
732);
733
734DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_new_window);
735
736DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_local_alloc_new_window_result);
737
738/* End of trace events for fs/ocfs2/localalloc.c. */
739
/* Trace events for fs/ocfs2/resize.c. */

DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_update_last_group_and_inode);

DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_group_extend);

DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_group_add);

/* End of trace events for fs/ocfs2/resize.c. */
749
750/* Trace events for fs/ocfs2/suballoc.c. */
751
752DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_group_descriptor);
753
754DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_contig);
755
756DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_discontig);
757
758DEFINE_OCFS2_ULL_EVENT(ocfs2_block_group_alloc);
759
760DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_nospc);
761
762DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_no_new_group);
763
764DEFINE_OCFS2_ULL_EVENT(ocfs2_reserve_new_inode_new_group);
765
766DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_set_bits);
767
768TRACE_EVENT(ocfs2_relink_block_group,
769 TP_PROTO(unsigned long long i_blkno, unsigned int chain,
770 unsigned long long bg_blkno,
771 unsigned long long prev_blkno),
772 TP_ARGS(i_blkno, chain, bg_blkno, prev_blkno),
773 TP_STRUCT__entry(
774 __field(unsigned long long, i_blkno)
775 __field(unsigned int, chain)
776 __field(unsigned long long, bg_blkno)
777 __field(unsigned long long, prev_blkno)
778 ),
779 TP_fast_assign(
780 __entry->i_blkno = i_blkno;
781 __entry->chain = chain;
782 __entry->bg_blkno = bg_blkno;
783 __entry->prev_blkno = prev_blkno;
784 ),
785 TP_printk("%llu %u %llu %llu",
786 __entry->i_blkno, __entry->chain, __entry->bg_blkno,
787 __entry->prev_blkno)
788);
789
790DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_cluster_group_search_wrong_max_bits);
791
792DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cluster_group_search_max_block);
793
794DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_block_group_search_max_block);
795
796DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_search_chain_begin);
797
798DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_succ);
799
800DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_end);
801
802DEFINE_OCFS2_UINT_EVENT(ocfs2_claim_suballoc_bits);
803
804DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_claim_new_inode_at_loc);
805
806DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_clear_bits);
807
808TRACE_EVENT(ocfs2_free_suballoc_bits,
809 TP_PROTO(unsigned long long inode, unsigned long long group,
810 unsigned int start_bit, unsigned int count),
811 TP_ARGS(inode, group, start_bit, count),
812 TP_STRUCT__entry(
813 __field(unsigned long long, inode)
814 __field(unsigned long long, group)
815 __field(unsigned int, start_bit)
816 __field(unsigned int, count)
817 ),
818 TP_fast_assign(
819 __entry->inode = inode;
820 __entry->group = group;
821 __entry->start_bit = start_bit;
822 __entry->count = count;
823 ),
824 TP_printk("%llu %llu %u %u", __entry->inode, __entry->group,
825 __entry->start_bit, __entry->count)
826);
827
828TRACE_EVENT(ocfs2_free_clusters,
829 TP_PROTO(unsigned long long bg_blkno, unsigned long long start_blk,
830 unsigned int start_bit, unsigned int count),
831 TP_ARGS(bg_blkno, start_blk, start_bit, count),
832 TP_STRUCT__entry(
833 __field(unsigned long long, bg_blkno)
834 __field(unsigned long long, start_blk)
835 __field(unsigned int, start_bit)
836 __field(unsigned int, count)
837 ),
838 TP_fast_assign(
839 __entry->bg_blkno = bg_blkno;
840 __entry->start_blk = start_blk;
841 __entry->start_bit = start_bit;
842 __entry->count = count;
843 ),
844 TP_printk("%llu %llu %u %u", __entry->bg_blkno, __entry->start_blk,
845 __entry->start_bit, __entry->count)
846);
847
848DEFINE_OCFS2_ULL_EVENT(ocfs2_get_suballoc_slot_bit);
849
850DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_test_suballoc_bit);
851
852DEFINE_OCFS2_ULL_EVENT(ocfs2_test_inode_bit);
853
854/* End of trace events for fs/ocfs2/suballoc.c. */
855
856/* Trace events for fs/ocfs2/refcounttree.c. */
857
858DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_refcount_block);
859
860DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_refcount_trees);
861
862DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree);
863
864DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree_blkno);
865
866DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_change_refcount_rec);
867
868DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_expand_inline_ref_root);
869
870DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_divide_leaf_refcount_block);
871
872DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_new_leaf_refcount_block);
873
874DECLARE_EVENT_CLASS(ocfs2__refcount_tree_ops,
875 TP_PROTO(unsigned long long blkno, int index,
876 unsigned long long cpos,
877 unsigned int clusters, unsigned int refcount),
878 TP_ARGS(blkno, index, cpos, clusters, refcount),
879 TP_STRUCT__entry(
880 __field(unsigned long long, blkno)
881 __field(int, index)
882 __field(unsigned long long, cpos)
883 __field(unsigned int, clusters)
884 __field(unsigned int, refcount)
885 ),
886 TP_fast_assign(
887 __entry->blkno = blkno;
888 __entry->index = index;
889 __entry->cpos = cpos;
890 __entry->clusters = clusters;
891 __entry->refcount = refcount;
892 ),
893 TP_printk("%llu %d %llu %u %u", __entry->blkno, __entry->index,
894 __entry->cpos, __entry->clusters, __entry->refcount)
895);
896
897#define DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(name) \
898DEFINE_EVENT(ocfs2__refcount_tree_ops, name, \
899 TP_PROTO(unsigned long long blkno, int index, \
900 unsigned long long cpos, \
901 unsigned int count, unsigned int refcount), \
902 TP_ARGS(blkno, index, cpos, count, refcount))
903
904DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_insert_refcount_rec);
905
906TRACE_EVENT(ocfs2_split_refcount_rec,
907 TP_PROTO(unsigned long long cpos,
908 unsigned int clusters, unsigned int refcount,
909 unsigned long long split_cpos,
910 unsigned int split_clusters, unsigned int split_refcount),
911 TP_ARGS(cpos, clusters, refcount,
912 split_cpos, split_clusters, split_refcount),
913 TP_STRUCT__entry(
914 __field(unsigned long long, cpos)
915 __field(unsigned int, clusters)
916 __field(unsigned int, refcount)
917 __field(unsigned long long, split_cpos)
918 __field(unsigned int, split_clusters)
919 __field(unsigned int, split_refcount)
920 ),
921 TP_fast_assign(
922 __entry->cpos = cpos;
923 __entry->clusters = clusters;
924 __entry->refcount = refcount;
925 __entry->split_cpos = split_cpos;
926 __entry->split_clusters = split_clusters;
927 __entry->split_refcount = split_refcount;
928 ),
929 TP_printk("%llu %u %u %llu %u %u",
930 __entry->cpos, __entry->clusters, __entry->refcount,
931 __entry->split_cpos, __entry->split_clusters,
932 __entry->split_refcount)
933);
934
935DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_split_refcount_rec_insert);
936
937DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_increase_refcount_begin);
938
939DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_change);
940
941DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_increase_refcount_insert);
942
943DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_split);
944
945DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_remove_refcount_extent);
946
947DEFINE_OCFS2_ULL_EVENT(ocfs2_restore_refcount_block);
948
949DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_decrease_refcount_rec);
950
951TRACE_EVENT(ocfs2_decrease_refcount,
952 TP_PROTO(unsigned long long owner,
953 unsigned long long cpos,
954 unsigned int len, int delete),
955 TP_ARGS(owner, cpos, len, delete),
956 TP_STRUCT__entry(
957 __field(unsigned long long, owner)
958 __field(unsigned long long, cpos)
959 __field(unsigned int, len)
960 __field(int, delete)
961 ),
962 TP_fast_assign(
963 __entry->owner = owner;
964 __entry->cpos = cpos;
965 __entry->len = len;
966 __entry->delete = delete;
967 ),
968 TP_printk("%llu %llu %u %d",
969 __entry->owner, __entry->cpos, __entry->len, __entry->delete)
970);
971
972DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_mark_extent_refcounted);
973
974DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_calc_refcount_meta_credits);
975
976TRACE_EVENT(ocfs2_calc_refcount_meta_credits_iterate,
977 TP_PROTO(int recs_add, unsigned long long cpos,
978 unsigned int clusters, unsigned long long r_cpos,
979 unsigned int r_clusters, unsigned int refcount, int index),
980 TP_ARGS(recs_add, cpos, clusters, r_cpos, r_clusters, refcount, index),
981 TP_STRUCT__entry(
982 __field(int, recs_add)
983 __field(unsigned long long, cpos)
984 __field(unsigned int, clusters)
985 __field(unsigned long long, r_cpos)
986 __field(unsigned int, r_clusters)
987 __field(unsigned int, refcount)
988 __field(int, index)
989 ),
990 TP_fast_assign(
991 __entry->recs_add = recs_add;
992 __entry->cpos = cpos;
993 __entry->clusters = clusters;
994 __entry->r_cpos = r_cpos;
995 __entry->r_clusters = r_clusters;
996 __entry->refcount = refcount;
997 __entry->index = index;
998 ),
999 TP_printk("%d %llu %u %llu %u %u %d",
1000 __entry->recs_add, __entry->cpos, __entry->clusters,
1001 __entry->r_cpos, __entry->r_clusters,
1002 __entry->refcount, __entry->index)
1003);
1004
1005DEFINE_OCFS2_INT_INT_EVENT(ocfs2_add_refcount_flag);
1006
1007DEFINE_OCFS2_INT_INT_EVENT(ocfs2_prepare_refcount_change_for_del);
1008
1009DEFINE_OCFS2_INT_INT_EVENT(ocfs2_lock_refcount_allocators);
1010
1011DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_page);
1012
1013DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_jbd);
1014
1015TRACE_EVENT(ocfs2_clear_ext_refcount,
1016 TP_PROTO(unsigned long long ino, unsigned int cpos,
1017 unsigned int len, unsigned int p_cluster,
1018 unsigned int ext_flags),
1019 TP_ARGS(ino, cpos, len, p_cluster, ext_flags),
1020 TP_STRUCT__entry(
1021 __field(unsigned long long, ino)
1022 __field(unsigned int, cpos)
1023 __field(unsigned int, len)
1024 __field(unsigned int, p_cluster)
1025 __field(unsigned int, ext_flags)
1026 ),
1027 TP_fast_assign(
1028 __entry->ino = ino;
1029 __entry->cpos = cpos;
1030 __entry->len = len;
1031 __entry->p_cluster = p_cluster;
1032 __entry->ext_flags = ext_flags;
1033 ),
1034 TP_printk("%llu %u %u %u %u",
1035 __entry->ino, __entry->cpos, __entry->len,
1036 __entry->p_cluster, __entry->ext_flags)
1037);
1038
1039TRACE_EVENT(ocfs2_replace_clusters,
1040 TP_PROTO(unsigned long long ino, unsigned int cpos,
1041 unsigned int old, unsigned int new, unsigned int len,
1042 unsigned int ext_flags),
1043 TP_ARGS(ino, cpos, old, new, len, ext_flags),
1044 TP_STRUCT__entry(
1045 __field(unsigned long long, ino)
1046 __field(unsigned int, cpos)
1047 __field(unsigned int, old)
1048 __field(unsigned int, new)
1049 __field(unsigned int, len)
1050 __field(unsigned int, ext_flags)
1051 ),
1052 TP_fast_assign(
1053 __entry->ino = ino;
1054 __entry->cpos = cpos;
1055 __entry->old = old;
1056 __entry->new = new;
1057 __entry->len = len;
1058 __entry->ext_flags = ext_flags;
1059 ),
1060 TP_printk("%llu %u %u %u %u %u",
1061 __entry->ino, __entry->cpos, __entry->old, __entry->new,
1062 __entry->len, __entry->ext_flags)
1063);
1064
1065DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_make_clusters_writable);
1066
1067TRACE_EVENT(ocfs2_refcount_cow_hunk,
1068 TP_PROTO(unsigned long long ino, unsigned int cpos,
1069 unsigned int write_len, unsigned int max_cpos,
1070 unsigned int cow_start, unsigned int cow_len),
1071 TP_ARGS(ino, cpos, write_len, max_cpos, cow_start, cow_len),
1072 TP_STRUCT__entry(
1073 __field(unsigned long long, ino)
1074 __field(unsigned int, cpos)
1075 __field(unsigned int, write_len)
1076 __field(unsigned int, max_cpos)
1077 __field(unsigned int, cow_start)
1078 __field(unsigned int, cow_len)
1079 ),
1080 TP_fast_assign(
1081 __entry->ino = ino;
1082 __entry->cpos = cpos;
1083 __entry->write_len = write_len;
1084 __entry->max_cpos = max_cpos;
1085 __entry->cow_start = cow_start;
1086 __entry->cow_len = cow_len;
1087 ),
1088 TP_printk("%llu %u %u %u %u %u",
1089 __entry->ino, __entry->cpos, __entry->write_len,
1090 __entry->max_cpos, __entry->cow_start, __entry->cow_len)
1091);
1092
1093/* End of trace events for fs/ocfs2/refcounttree.c. */
1094
1095/* Trace events for fs/ocfs2/aops.c. */
1096
1097DECLARE_EVENT_CLASS(ocfs2__get_block,
1098 TP_PROTO(unsigned long long ino, unsigned long long iblock,
1099 void *bh_result, int create),
1100 TP_ARGS(ino, iblock, bh_result, create),
1101 TP_STRUCT__entry(
1102 __field(unsigned long long, ino)
1103 __field(unsigned long long, iblock)
1104 __field(void *, bh_result)
1105 __field(int, create)
1106 ),
1107 TP_fast_assign(
1108 __entry->ino = ino;
1109 __entry->iblock = iblock;
1110 __entry->bh_result = bh_result;
1111 __entry->create = create;
1112 ),
1113 TP_printk("%llu %llu %p %d",
1114 __entry->ino, __entry->iblock,
1115 __entry->bh_result, __entry->create)
1116);
1117
1118#define DEFINE_OCFS2_GET_BLOCK_EVENT(name) \
1119DEFINE_EVENT(ocfs2__get_block, name, \
1120 TP_PROTO(unsigned long long ino, unsigned long long iblock, \
1121 void *bh_result, int create), \
1122 TP_ARGS(ino, iblock, bh_result, create))
1123
1124DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_symlink_get_block);
1125
1126DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_get_block);
1127
1128DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end);
1129
1130DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage);
1131
1132DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage);
1133
1134DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap);
1135
1136TRACE_EVENT(ocfs2_try_to_write_inline_data,
1137 TP_PROTO(unsigned long long ino, unsigned int len,
1138 unsigned long long pos, unsigned int flags),
1139 TP_ARGS(ino, len, pos, flags),
1140 TP_STRUCT__entry(
1141 __field(unsigned long long, ino)
1142 __field(unsigned int, len)
1143 __field(unsigned long long, pos)
1144 __field(unsigned int, flags)
1145 ),
1146 TP_fast_assign(
1147 __entry->ino = ino;
1148 __entry->len = len;
1149 __entry->pos = pos;
1150 __entry->flags = flags;
1151 ),
1152 TP_printk("%llu %u %llu 0x%x",
1153 __entry->ino, __entry->len, __entry->pos, __entry->flags)
1154);
1155
1156TRACE_EVENT(ocfs2_write_begin_nolock,
1157 TP_PROTO(unsigned long long ino,
1158 long long i_size, unsigned int i_clusters,
1159 unsigned long long pos, unsigned int len,
1160 unsigned int flags, void *page,
1161 unsigned int clusters, unsigned int extents_to_split),
1162 TP_ARGS(ino, i_size, i_clusters, pos, len, flags,
1163 page, clusters, extents_to_split),
1164 TP_STRUCT__entry(
1165 __field(unsigned long long, ino)
1166 __field(long long, i_size)
1167 __field(unsigned int, i_clusters)
1168 __field(unsigned long long, pos)
1169 __field(unsigned int, len)
1170 __field(unsigned int, flags)
1171 __field(void *, page)
1172 __field(unsigned int, clusters)
1173 __field(unsigned int, extents_to_split)
1174 ),
1175 TP_fast_assign(
1176 __entry->ino = ino;
1177 __entry->i_size = i_size;
1178 __entry->i_clusters = i_clusters;
1179 __entry->pos = pos;
1180 __entry->len = len;
1181 __entry->flags = flags;
1182 __entry->page = page;
1183 __entry->clusters = clusters;
1184 __entry->extents_to_split = extents_to_split;
1185 ),
1186 TP_printk("%llu %lld %u %llu %u %u %p %u %u",
1187 __entry->ino, __entry->i_size, __entry->i_clusters,
1188 __entry->pos, __entry->len,
1189 __entry->flags, __entry->page, __entry->clusters,
1190 __entry->extents_to_split)
1191);
1192
1193TRACE_EVENT(ocfs2_write_end_inline,
1194 TP_PROTO(unsigned long long ino,
1195 unsigned long long pos, unsigned int copied,
1196 unsigned int id_count, unsigned int features),
1197 TP_ARGS(ino, pos, copied, id_count, features),
1198 TP_STRUCT__entry(
1199 __field(unsigned long long, ino)
1200 __field(unsigned long long, pos)
1201 __field(unsigned int, copied)
1202 __field(unsigned int, id_count)
1203 __field(unsigned int, features)
1204 ),
1205 TP_fast_assign(
1206 __entry->ino = ino;
1207 __entry->pos = pos;
1208 __entry->copied = copied;
1209 __entry->id_count = id_count;
1210 __entry->features = features;
1211 ),
1212 TP_printk("%llu %llu %u %u %u",
1213 __entry->ino, __entry->pos, __entry->copied,
1214 __entry->id_count, __entry->features)
1215);
1216
1217/* End of trace events for fs/ocfs2/aops.c. */
1218
1219/* Trace events for fs/ocfs2/mmap.c. */
1220
1221TRACE_EVENT(ocfs2_fault,
1222 TP_PROTO(unsigned long long ino,
1223 void *area, void *page, unsigned long pgoff),
1224 TP_ARGS(ino, area, page, pgoff),
1225 TP_STRUCT__entry(
1226 __field(unsigned long long, ino)
1227 __field(void *, area)
1228 __field(void *, page)
1229 __field(unsigned long, pgoff)
1230 ),
1231 TP_fast_assign(
1232 __entry->ino = ino;
1233 __entry->area = area;
1234 __entry->page = page;
1235 __entry->pgoff = pgoff;
1236 ),
1237 TP_printk("%llu %p %p %lu",
1238 __entry->ino, __entry->area, __entry->page, __entry->pgoff)
1239);
1240
1241/* End of trace events for fs/ocfs2/mmap.c. */
1242
1243/* Trace events for fs/ocfs2/file.c. */
1244
1245DECLARE_EVENT_CLASS(ocfs2__file_ops,
1246 TP_PROTO(void *inode, void *file, void *dentry,
1247 unsigned long long ino,
1248 unsigned int d_len, const unsigned char *d_name,
1249 unsigned long long para),
1250 TP_ARGS(inode, file, dentry, ino, d_len, d_name, para),
1251 TP_STRUCT__entry(
1252 __field(void *, inode)
1253 __field(void *, file)
1254 __field(void *, dentry)
1255 __field(unsigned long long, ino)
1256 __field(unsigned int, d_len)
1257 __string(d_name, d_name)
1258 __field(unsigned long long, para)
1259 ),
1260 TP_fast_assign(
1261 __entry->inode = inode;
1262 __entry->file = file;
1263 __entry->dentry = dentry;
1264 __entry->ino = ino;
1265 __entry->d_len = d_len;
1266 __assign_str(d_name, d_name);
1267 __entry->para = para;
1268 ),
1269 TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file,
1270 __entry->dentry, __entry->ino, __entry->para,
1271 __entry->d_len, __get_str(d_name))
1272);
1273
1274#define DEFINE_OCFS2_FILE_OPS(name) \
1275DEFINE_EVENT(ocfs2__file_ops, name, \
1276TP_PROTO(void *inode, void *file, void *dentry, \
1277 unsigned long long ino, \
1278 unsigned int d_len, const unsigned char *d_name, \
1279 unsigned long long mode), \
1280 TP_ARGS(inode, file, dentry, ino, d_len, d_name, mode))
1281
1282DEFINE_OCFS2_FILE_OPS(ocfs2_file_open);
1283
1284DEFINE_OCFS2_FILE_OPS(ocfs2_file_release);
1285
1286DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file);
1287
1288DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write);
1289
1290DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write);
1291
1292DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read);
1293
1294DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read);
1295
1296DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file);
1297
1298DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_truncate_file_error);
1299
1300TRACE_EVENT(ocfs2_extend_allocation,
1301 TP_PROTO(unsigned long long ip_blkno, unsigned long long size,
1302 unsigned int clusters, unsigned int clusters_to_add,
1303 int why, int restart_func),
1304 TP_ARGS(ip_blkno, size, clusters, clusters_to_add, why, restart_func),
1305 TP_STRUCT__entry(
1306 __field(unsigned long long, ip_blkno)
1307 __field(unsigned long long, size)
1308 __field(unsigned int, clusters)
1309 __field(unsigned int, clusters_to_add)
1310 __field(int, why)
1311 __field(int, restart_func)
1312 ),
1313 TP_fast_assign(
1314 __entry->ip_blkno = ip_blkno;
1315 __entry->size = size;
1316 __entry->clusters = clusters;
1317 __entry->clusters_to_add = clusters_to_add;
1318 __entry->why = why;
1319 __entry->restart_func = restart_func;
1320 ),
1321 TP_printk("%llu %llu %u %u %d %d",
1322 __entry->ip_blkno, __entry->size, __entry->clusters,
1323 __entry->clusters_to_add, __entry->why, __entry->restart_func)
1324);
1325
1326TRACE_EVENT(ocfs2_extend_allocation_end,
1327 TP_PROTO(unsigned long long ino,
1328 unsigned int di_clusters, unsigned long long di_size,
1329 unsigned int ip_clusters, unsigned long long i_size),
1330 TP_ARGS(ino, di_clusters, di_size, ip_clusters, i_size),
1331 TP_STRUCT__entry(
1332 __field(unsigned long long, ino)
1333 __field(unsigned int, di_clusters)
1334 __field(unsigned long long, di_size)
1335 __field(unsigned int, ip_clusters)
1336 __field(unsigned long long, i_size)
1337 ),
1338 TP_fast_assign(
1339 __entry->ino = ino;
1340 __entry->di_clusters = di_clusters;
1341 __entry->di_size = di_size;
1342 __entry->ip_clusters = ip_clusters;
1343 __entry->i_size = i_size;
1344 ),
1345 TP_printk("%llu %u %llu %u %llu", __entry->ino, __entry->di_clusters,
1346 __entry->di_size, __entry->ip_clusters, __entry->i_size)
1347);
1348
1349TRACE_EVENT(ocfs2_write_zero_page,
1350 TP_PROTO(unsigned long long ino,
1351 unsigned long long abs_from, unsigned long long abs_to,
1352 unsigned long index, unsigned int zero_from,
1353 unsigned int zero_to),
1354 TP_ARGS(ino, abs_from, abs_to, index, zero_from, zero_to),
1355 TP_STRUCT__entry(
1356 __field(unsigned long long, ino)
1357 __field(unsigned long long, abs_from)
1358 __field(unsigned long long, abs_to)
1359 __field(unsigned long, index)
1360 __field(unsigned int, zero_from)
1361 __field(unsigned int, zero_to)
1362 ),
1363 TP_fast_assign(
1364 __entry->ino = ino;
1365 __entry->abs_from = abs_from;
1366 __entry->abs_to = abs_to;
1367 __entry->index = index;
1368 __entry->zero_from = zero_from;
1369 __entry->zero_to = zero_to;
1370 ),
1371 TP_printk("%llu %llu %llu %lu %u %u", __entry->ino,
1372 __entry->abs_from, __entry->abs_to,
1373 __entry->index, __entry->zero_from, __entry->zero_to)
1374);
1375
1376DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend_range);
1377
1378DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend);
1379
1380TRACE_EVENT(ocfs2_setattr,
1381 TP_PROTO(void *inode, void *dentry,
1382 unsigned long long ino,
1383 unsigned int d_len, const unsigned char *d_name,
1384 unsigned int ia_valid, unsigned int ia_mode,
1385 unsigned int ia_uid, unsigned int ia_gid),
1386 TP_ARGS(inode, dentry, ino, d_len, d_name,
1387 ia_valid, ia_mode, ia_uid, ia_gid),
1388 TP_STRUCT__entry(
1389 __field(void *, inode)
1390 __field(void *, dentry)
1391 __field(unsigned long long, ino)
1392 __field(unsigned int, d_len)
1393 __string(d_name, d_name)
1394 __field(unsigned int, ia_valid)
1395 __field(unsigned int, ia_mode)
1396 __field(unsigned int, ia_uid)
1397 __field(unsigned int, ia_gid)
1398 ),
1399 TP_fast_assign(
1400 __entry->inode = inode;
1401 __entry->dentry = dentry;
1402 __entry->ino = ino;
1403 __entry->d_len = d_len;
1404 __assign_str(d_name, d_name);
1405 __entry->ia_valid = ia_valid;
1406 __entry->ia_mode = ia_mode;
1407 __entry->ia_uid = ia_uid;
1408 __entry->ia_gid = ia_gid;
1409 ),
1410 TP_printk("%p %p %llu %.*s %u %u %u %u", __entry->inode,
1411 __entry->dentry, __entry->ino, __entry->d_len,
1412 __get_str(d_name), __entry->ia_valid, __entry->ia_mode,
1413 __entry->ia_uid, __entry->ia_gid)
1414);
1415
1416DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_write_remove_suid);
1417
1418DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_partial_clusters);
1419
1420DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range1);
1421
1422DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range2);
1423
1424DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
1425
1426TRACE_EVENT(ocfs2_prepare_inode_for_write,
1427 TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
1428 int appending, unsigned long count,
1429 int *direct_io, int *has_refcount),
1430 TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
1431 TP_STRUCT__entry(
1432 __field(unsigned long long, ino)
1433 __field(unsigned long long, saved_pos)
1434 __field(int, appending)
1435 __field(unsigned long, count)
1436 __field(int, direct_io)
1437 __field(int, has_refcount)
1438 ),
1439 TP_fast_assign(
1440 __entry->ino = ino;
1441 __entry->saved_pos = saved_pos;
1442 __entry->appending = appending;
1443 __entry->count = count;
1444 __entry->direct_io = direct_io ? *direct_io : -1;
1445 __entry->has_refcount = has_refcount ? *has_refcount : -1;
1446 ),
1447 TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
1448 __entry->saved_pos, __entry->appending, __entry->count,
1449 __entry->direct_io, __entry->has_refcount)
1450);
1451
1452DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
1453
1454/* End of trace events for fs/ocfs2/file.c. */
1455
1456/* Trace events for fs/ocfs2/inode.c. */
1457
1458TRACE_EVENT(ocfs2_iget_begin,
1459 TP_PROTO(unsigned long long ino, unsigned int flags, int sysfile_type),
1460 TP_ARGS(ino, flags, sysfile_type),
1461 TP_STRUCT__entry(
1462 __field(unsigned long long, ino)
1463 __field(unsigned int, flags)
1464 __field(int, sysfile_type)
1465 ),
1466 TP_fast_assign(
1467 __entry->ino = ino;
1468 __entry->flags = flags;
1469 __entry->sysfile_type = sysfile_type;
1470 ),
1471 TP_printk("%llu %u %d", __entry->ino,
1472 __entry->flags, __entry->sysfile_type)
1473);
1474
1475DEFINE_OCFS2_ULL_EVENT(ocfs2_iget5_locked);
1476
1477TRACE_EVENT(ocfs2_iget_end,
1478 TP_PROTO(void *inode, unsigned long long ino),
1479 TP_ARGS(inode, ino),
1480 TP_STRUCT__entry(
1481 __field(void *, inode)
1482 __field(unsigned long long, ino)
1483 ),
1484 TP_fast_assign(
1485 __entry->inode = inode;
1486 __entry->ino = ino;
1487 ),
1488 TP_printk("%p %llu", __entry->inode, __entry->ino)
1489);
1490
1491TRACE_EVENT(ocfs2_find_actor,
1492 TP_PROTO(void *inode, unsigned long long ino,
1493 void *args, unsigned long long fi_blkno),
1494 TP_ARGS(inode, ino, args, fi_blkno),
1495 TP_STRUCT__entry(
1496 __field(void *, inode)
1497 __field(unsigned long long, ino)
1498 __field(void *, args)
1499 __field(unsigned long long, fi_blkno)
1500 ),
1501 TP_fast_assign(
1502 __entry->inode = inode;
1503 __entry->ino = ino;
1504 __entry->args = args;
1505 __entry->fi_blkno = fi_blkno;
1506 ),
1507 TP_printk("%p %llu %p %llu", __entry->inode, __entry->ino,
1508 __entry->args, __entry->fi_blkno)
1509);
1510
1511DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_populate_inode);
1512
1513DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode);
1514
1515DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state);
1516
1517DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block);
1518
1519TRACE_EVENT(ocfs2_inode_is_valid_to_delete,
1520 TP_PROTO(void *task, void *dc_task, unsigned long long ino,
1521 unsigned int flags),
1522 TP_ARGS(task, dc_task, ino, flags),
1523 TP_STRUCT__entry(
1524 __field(void *, task)
1525 __field(void *, dc_task)
1526 __field(unsigned long long, ino)
1527 __field(unsigned int, flags)
1528 ),
1529 TP_fast_assign(
1530 __entry->task = task;
1531 __entry->dc_task = dc_task;
1532 __entry->ino = ino;
1533 __entry->flags = flags;
1534 ),
1535 TP_printk("%p %p %llu %u", __entry->task, __entry->dc_task,
1536 __entry->ino, __entry->flags)
1537);
1538
1539DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_query_inode_wipe_begin);
1540
1541DEFINE_OCFS2_UINT_EVENT(ocfs2_query_inode_wipe_succ);
1542
1543DEFINE_OCFS2_INT_INT_EVENT(ocfs2_query_inode_wipe_end);
1544
1545DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_cleanup_delete_inode);
1546
1547DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode);
1548
1549DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode);
1550
1551DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode);
1552
1553TRACE_EVENT(ocfs2_inode_revalidate,
1554 TP_PROTO(void *inode, unsigned long long ino,
1555 unsigned int flags),
1556 TP_ARGS(inode, ino, flags),
1557 TP_STRUCT__entry(
1558 __field(void *, inode)
1559 __field(unsigned long long, ino)
1560 __field(unsigned int, flags)
1561 ),
1562 TP_fast_assign(
1563 __entry->inode = inode;
1564 __entry->ino = ino;
1565 __entry->flags = flags;
1566 ),
1567 TP_printk("%p %llu %u", __entry->inode, __entry->ino, __entry->flags)
1568);
1569
1570DEFINE_OCFS2_ULL_EVENT(ocfs2_mark_inode_dirty);
1571
1572/* End of trace events for fs/ocfs2/inode.c. */
1573
1574/* Trace events for fs/ocfs2/extent_map.c. */
1575
1576TRACE_EVENT(ocfs2_read_virt_blocks,
1577 TP_PROTO(void *inode, unsigned long long vblock, int nr,
1578 void *bhs, unsigned int flags, void *validate),
1579 TP_ARGS(inode, vblock, nr, bhs, flags, validate),
1580 TP_STRUCT__entry(
1581 __field(void *, inode)
1582 __field(unsigned long long, vblock)
1583 __field(int, nr)
1584 __field(void *, bhs)
1585 __field(unsigned int, flags)
1586 __field(void *, validate)
1587 ),
1588 TP_fast_assign(
1589 __entry->inode = inode;
1590 __entry->vblock = vblock;
1591 __entry->nr = nr;
1592 __entry->bhs = bhs;
1593 __entry->flags = flags;
1594 __entry->validate = validate;
1595 ),
1596 TP_printk("%p %llu %d %p %x %p", __entry->inode, __entry->vblock,
1597 __entry->nr, __entry->bhs, __entry->flags, __entry->validate)
1598);
1599
1600/* End of trace events for fs/ocfs2/extent_map.c. */
1601
/* Trace events for fs/ocfs2/slot_map.c. */

DEFINE_OCFS2_UINT_EVENT(ocfs2_refresh_slot_info);

DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers);

DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers_block);

DEFINE_OCFS2_INT_EVENT(ocfs2_find_slot);

/* End of trace events for fs/ocfs2/slot_map.c. */
1613
/* Trace events for fs/ocfs2/heartbeat.c. */

DEFINE_OCFS2_INT_EVENT(ocfs2_do_node_down);

/* End of trace events for fs/ocfs2/heartbeat.c. */
1619
1620/* Trace events for fs/ocfs2/super.c. */
1621
1622TRACE_EVENT(ocfs2_remount,
1623 TP_PROTO(unsigned long s_flags, unsigned long osb_flags, int flags),
1624 TP_ARGS(s_flags, osb_flags, flags),
1625 TP_STRUCT__entry(
1626 __field(unsigned long, s_flags)
1627 __field(unsigned long, osb_flags)
1628 __field(int, flags)
1629 ),
1630 TP_fast_assign(
1631 __entry->s_flags = s_flags;
1632 __entry->osb_flags = osb_flags;
1633 __entry->flags = flags;
1634 ),
1635 TP_printk("%lu %lu %d", __entry->s_flags,
1636 __entry->osb_flags, __entry->flags)
1637);
1638
1639TRACE_EVENT(ocfs2_fill_super,
1640 TP_PROTO(void *sb, void *data, int silent),
1641 TP_ARGS(sb, data, silent),
1642 TP_STRUCT__entry(
1643 __field(void *, sb)
1644 __field(void *, data)
1645 __field(int, silent)
1646 ),
1647 TP_fast_assign(
1648 __entry->sb = sb;
1649 __entry->data = data;
1650 __entry->silent = silent;
1651 ),
1652 TP_printk("%p %p %d", __entry->sb,
1653 __entry->data, __entry->silent)
1654);
1655
1656TRACE_EVENT(ocfs2_parse_options,
1657 TP_PROTO(int is_remount, char *options),
1658 TP_ARGS(is_remount, options),
1659 TP_STRUCT__entry(
1660 __field(int, is_remount)
1661 __string(options, options)
1662 ),
1663 TP_fast_assign(
1664 __entry->is_remount = is_remount;
1665 __assign_str(options, options);
1666 ),
1667 TP_printk("%d %s", __entry->is_remount, __get_str(options))
1668);
1669
1670DEFINE_OCFS2_POINTER_EVENT(ocfs2_put_super);
1671
1672TRACE_EVENT(ocfs2_statfs,
1673 TP_PROTO(void *sb, void *buf),
1674 TP_ARGS(sb, buf),
1675 TP_STRUCT__entry(
1676 __field(void *, sb)
1677 __field(void *, buf)
1678 ),
1679 TP_fast_assign(
1680 __entry->sb = sb;
1681 __entry->buf = buf;
1682 ),
1683 TP_printk("%p %p", __entry->sb, __entry->buf)
1684);
1685
1686DEFINE_OCFS2_POINTER_EVENT(ocfs2_dismount_volume);
1687
/* Records key volume identity at superblock initialization: label, uuid
 * string, root/system directory block numbers and the cluster-size bits. */
TRACE_EVENT(ocfs2_initialize_super,
	TP_PROTO(char *label, char *uuid_str, unsigned long long root_dir,
		 unsigned long long system_dir, int cluster_bits),
	TP_ARGS(label, uuid_str, root_dir, system_dir, cluster_bits),
	TP_STRUCT__entry(
		__string(label, label)
		__string(uuid_str, uuid_str)
		__field(unsigned long long, root_dir)
		__field(unsigned long long, system_dir)
		__field(int, cluster_bits)
	),
	TP_fast_assign(
		__assign_str(label, label);
		__assign_str(uuid_str, uuid_str);
		__entry->root_dir = root_dir;
		__entry->system_dir = system_dir;
		__entry->cluster_bits = cluster_bits;
	),
	TP_printk("%s %s %llu %llu %d", __get_str(label), __get_str(uuid_str),
		  __entry->root_dir, __entry->system_dir, __entry->cluster_bits)
);
1709
1710/* End of trace events for fs/ocfs2/super.c. */
1711
1712/* Trace events for fs/ocfs2/xattr.c. */
1713
1714DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_xattr_block);
1715
1716DEFINE_OCFS2_UINT_EVENT(ocfs2_xattr_extend_allocation);
1717
/* Records xattr set-context setup: attribute name plus the metadata,
 * cluster and journal-credit counts reserved for the operation. */
TRACE_EVENT(ocfs2_init_xattr_set_ctxt,
	TP_PROTO(const char *name, int meta, int clusters, int credits),
	TP_ARGS(name, meta, clusters, credits),
	TP_STRUCT__entry(
		__string(name, name)
		__field(int, meta)
		__field(int, clusters)
		__field(int, credits)
	),
	TP_fast_assign(
		__assign_str(name, name);
		__entry->meta = meta;
		__entry->clusters = clusters;
		__entry->credits = credits;
	),
	TP_printk("%s %d %d %d", __get_str(name), __entry->meta,
		  __entry->clusters, __entry->credits)
);
1736
/* Shared event class for the xattr lookup paths: inode, attribute name
 * and index, name hash, the on-disk location searched and the entry
 * index hit. Instantiated via DEFINE_OCFS2_XATTR_FIND_EVENT below. */
DECLARE_EVENT_CLASS(ocfs2__xattr_find,
	TP_PROTO(unsigned long long ino, const char *name, int name_index,
		 unsigned int hash, unsigned long long location,
		 int xe_index),
	TP_ARGS(ino, name, name_index, hash, location, xe_index),
	TP_STRUCT__entry(
		__field(unsigned long long, ino)
		__string(name, name)
		__field(int, name_index)
		__field(unsigned int, hash)
		__field(unsigned long long, location)
		__field(int, xe_index)
	),
	TP_fast_assign(
		__entry->ino = ino;
		__assign_str(name, name);
		__entry->name_index = name_index;
		__entry->hash = hash;
		__entry->location = location;
		__entry->xe_index = xe_index;
	),
	TP_printk("%llu %s %d %u %llu %d", __entry->ino, __get_str(name),
		  __entry->name_index, __entry->hash, __entry->location,
		  __entry->xe_index)
);

/* Stamp out a concrete event from the ocfs2__xattr_find class. */
#define DEFINE_OCFS2_XATTR_FIND_EVENT(name) \
DEFINE_EVENT(ocfs2__xattr_find, name, \
TP_PROTO(unsigned long long ino, const char *name, int name_index, \
	 unsigned int hash, unsigned long long bucket, \
	 int xe_index), \
	TP_ARGS(ino, name, name_index, hash, bucket, xe_index))
1769
1770DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_bucket_find);
1771
1772DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find);
1773
1774DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find_rec);
1775
1776DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_iterate_xattr_buckets);
1777
1778DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_iterate_xattr_bucket);
1779
1780DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cp_xattr_block_to_bucket_begin);
1781
1782DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cp_xattr_block_to_bucket_end);
1783
1784DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block_begin);
1785
1786DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block);
1787
1788DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_defrag_xattr_bucket);
1789
1790DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_bucket_cross_cluster);
1791
1792DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_divide_xattr_bucket_begin);
1793
1794DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_divide_xattr_bucket_move);
1795
1796DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_cp_xattr_bucket);
1797
1798DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_buckets);
1799
1800DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_adjust_xattr_cross_cluster);
1801
1802DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_begin);
1803
1804DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_add_new_xattr_cluster);
1805
1806DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_insert);
1807
1808DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_extend_xattr_bucket);
1809
1810DEFINE_OCFS2_ULL_EVENT(ocfs2_add_new_xattr_bucket);
1811
1812DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_xattr_bucket_value_truncate);
1813
1814DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_rm_xattr_cluster);
1815
1816DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_header);
1817
1818DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_create_empty_xattr_block);
1819
1820DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_bucket);
1821
1822DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_index_block);
1823
1824DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_xattr_bucket_value_refcount);
1825
1826DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_reflink_xattr_buckets);
1827
1828DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_rec);
1829
1830/* End of trace events for fs/ocfs2/xattr.c. */
1831
1832/* Trace events for fs/ocfs2/reservations.c. */
1833
1834DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_insert);
1835
1836DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_begin);
1837
1838DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_end);
1839
/* Records the state of a reservation-window search: current reservation
 * range, the goal bit, the wanted length, and whether the tree is empty. */
TRACE_EVENT(ocfs2_resv_find_window_begin,
	TP_PROTO(unsigned int r_start, unsigned int r_end, unsigned int goal,
		 unsigned int wanted, int empty_root),
	TP_ARGS(r_start, r_end, goal, wanted, empty_root),
	TP_STRUCT__entry(
		__field(unsigned int, r_start)
		__field(unsigned int, r_end)
		__field(unsigned int, goal)
		__field(unsigned int, wanted)
		__field(int, empty_root)
	),
	TP_fast_assign(
		__entry->r_start = r_start;
		__entry->r_end = r_end;
		__entry->goal = goal;
		__entry->wanted = wanted;
		__entry->empty_root = empty_root;
	),
	TP_printk("%u %u %u %u %d", __entry->r_start, __entry->r_end,
		  __entry->goal, __entry->wanted, __entry->empty_root)
);
1861
1862DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_find_window_prev);
1863
1864DEFINE_OCFS2_INT_INT_EVENT(ocfs2_resv_find_window_next);
1865
1866DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cannibalize_resv_begin);
1867
1868TRACE_EVENT(ocfs2_cannibalize_resv_end,
1869 TP_PROTO(unsigned int start, unsigned int end, unsigned int len,
1870 unsigned int last_start, unsigned int last_len),
1871 TP_ARGS(start, end, len, last_start, last_len),
1872 TP_STRUCT__entry(
1873 __field(unsigned int, start)
1874 __field(unsigned int, end)
1875 __field(unsigned int, len)
1876 __field(unsigned int, last_start)
1877 __field(unsigned int, last_len)
1878 ),
1879 TP_fast_assign(
1880 __entry->start = start;
1881 __entry->end = end;
1882 __entry->len = len;
1883 __entry->last_start = last_start;
1884 __entry->last_len = last_len;
1885 ),
1886 TP_printk("%u %u %u %u %u", __entry->start, __entry->end,
1887 __entry->len, __entry->last_start, __entry->last_len)
1888);
1889
1890DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_resv_bits);
1891
/* Records a claim against the reservation map: the claimed range
 * (cstart/cend/clen), the owning reservation's range (r_*), and the
 * previously-recorded last allocation (last_*). */
TRACE_EVENT(ocfs2_resmap_claimed_bits_begin,
	TP_PROTO(unsigned int cstart, unsigned int cend, unsigned int clen,
		 unsigned int r_start, unsigned int r_end, unsigned int r_len,
		 unsigned int last_start, unsigned int last_len),
	TP_ARGS(cstart, cend, clen, r_start, r_end,
		r_len, last_start, last_len),
	TP_STRUCT__entry(
		__field(unsigned int, cstart)
		__field(unsigned int, cend)
		__field(unsigned int, clen)
		__field(unsigned int, r_start)
		__field(unsigned int, r_end)
		__field(unsigned int, r_len)
		__field(unsigned int, last_start)
		__field(unsigned int, last_len)
	),
	TP_fast_assign(
		__entry->cstart = cstart;
		__entry->cend = cend;
		__entry->clen = clen;
		__entry->r_start = r_start;
		__entry->r_end = r_end;
		__entry->r_len = r_len;
		__entry->last_start = last_start;
		__entry->last_len = last_len;
	),
	TP_printk("%u %u %u %u %u %u %u %u",
		  __entry->cstart, __entry->cend, __entry->clen,
		  __entry->r_start, __entry->r_end, __entry->r_len,
		  __entry->last_start, __entry->last_len)
);
1923
1924TRACE_EVENT(ocfs2_resmap_claimed_bits_end,
1925 TP_PROTO(unsigned int start, unsigned int end, unsigned int len,
1926 unsigned int last_start, unsigned int last_len),
1927 TP_ARGS(start, end, len, last_start, last_len),
1928 TP_STRUCT__entry(
1929 __field(unsigned int, start)
1930 __field(unsigned int, end)
1931 __field(unsigned int, len)
1932 __field(unsigned int, last_start)
1933 __field(unsigned int, last_len)
1934 ),
1935 TP_fast_assign(
1936 __entry->start = start;
1937 __entry->end = end;
1938 __entry->len = len;
1939 __entry->last_start = last_start;
1940 __entry->last_len = last_len;
1941 ),
1942 TP_printk("%u %u %u %u %u", __entry->start, __entry->end,
1943 __entry->len, __entry->last_start, __entry->last_len)
1944);
1945
1946/* End of trace events for fs/ocfs2/reservations.c. */
1947
1948/* Trace events for fs/ocfs2/quota_local.c. */
1949
1950DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_recover_local_quota_file);
1951
1952DEFINE_OCFS2_INT_EVENT(ocfs2_finish_quota_recovery);
1953
1954DEFINE_OCFS2_ULL_ULL_UINT_EVENT(olq_set_dquot);
1955
1956/* End of trace events for fs/ocfs2/quota_local.c. */
1957
1958/* Trace events for fs/ocfs2/quota_global.c. */
1959
1960DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_quota_block);
1961
/* Records a dquot sync: quota id, current space usage, and the pending
 * space/inode deltas being applied. */
TRACE_EVENT(ocfs2_sync_dquot,
	TP_PROTO(unsigned int dq_id, long long dqb_curspace,
		 long long spacechange, long long curinodes,
		 long long inodechange),
	TP_ARGS(dq_id, dqb_curspace, spacechange, curinodes, inodechange),
	TP_STRUCT__entry(
		__field(unsigned int, dq_id)
		__field(long long, dqb_curspace)
		__field(long long, spacechange)
		__field(long long, curinodes)
		__field(long long, inodechange)
	),
	TP_fast_assign(
		__entry->dq_id = dq_id;
		__entry->dqb_curspace = dqb_curspace;
		__entry->spacechange = spacechange;
		__entry->curinodes = curinodes;
		__entry->inodechange = inodechange;
	),
	TP_printk("%u %lld %lld %lld %lld", __entry->dq_id,
		  __entry->dqb_curspace, __entry->spacechange,
		  __entry->curinodes, __entry->inodechange)
);
1985
/* Records the per-dquot sync helper: quota id, the dquot's own type,
 * the type being synced, and the superblock id string. */
TRACE_EVENT(ocfs2_sync_dquot_helper,
	TP_PROTO(unsigned int dq_id, unsigned int dq_type, unsigned long type,
		 const char *s_id),
	TP_ARGS(dq_id, dq_type, type, s_id),

	TP_STRUCT__entry(
		__field(unsigned int, dq_id)
		__field(unsigned int, dq_type)
		__field(unsigned long, type)
		__string(s_id, s_id)
	),
	TP_fast_assign(
		__entry->dq_id = dq_id;
		__entry->dq_type = dq_type;
		__entry->type = type;
		__assign_str(s_id, s_id);
	),
	TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type,
		  __entry->type, __get_str(s_id))
);
2006
2007DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_write_dquot);
2008
2009DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot);
2010
2011DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot);
2012
2013DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty);
2014
2015/* End of trace events for fs/ocfs2/quota_global.c. */
2016
2017/* Trace events for fs/ocfs2/dir.c. */
2018DEFINE_OCFS2_INT_EVENT(ocfs2_search_dirblock);
2019
2020DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_dir_block);
2021
2022DEFINE_OCFS2_POINTER_EVENT(ocfs2_find_entry_el);
2023
2024TRACE_EVENT(ocfs2_dx_dir_search,
2025 TP_PROTO(unsigned long long ino, int namelen, const char *name,
2026 unsigned int major_hash, unsigned int minor_hash,
2027 unsigned long long blkno),
2028 TP_ARGS(ino, namelen, name, major_hash, minor_hash, blkno),
2029 TP_STRUCT__entry(
2030 __field(unsigned long long, ino)
2031 __field(int, namelen)
2032 __string(name, name)
2033 __field(unsigned int, major_hash)
2034 __field(unsigned int,minor_hash)
2035 __field(unsigned long long, blkno)
2036 ),
2037 TP_fast_assign(
2038 __entry->ino = ino;
2039 __entry->namelen = namelen;
2040 __assign_str(name, name);
2041 __entry->major_hash = major_hash;
2042 __entry->minor_hash = minor_hash;
2043 __entry->blkno = blkno;
2044 ),
2045 TP_printk("%llu %.*s %u %u %llu", __entry->ino,
2046 __entry->namelen, __get_str(name),
2047 __entry->major_hash, __entry->minor_hash, __entry->blkno)
2048);
2049
2050DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_dx_dir_search_leaf_info);
2051
2052DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_delete_entry_dx);
2053
2054DEFINE_OCFS2_ULL_EVENT(ocfs2_readdir);
2055
/* Records a directory entry lookup on disk: the searched name (length +
 * string), the caller's blkno out-pointer, and the directory inode. */
TRACE_EVENT(ocfs2_find_files_on_disk,
	TP_PROTO(int namelen, const char *name, void *blkno,
		 unsigned long long dir),
	TP_ARGS(namelen, name, blkno, dir),
	TP_STRUCT__entry(
		__field(int, namelen)
		__string(name, name)
		__field(void *, blkno)
		__field(unsigned long long, dir)
	),
	TP_fast_assign(
		__entry->namelen = namelen;
		__assign_str(name, name);
		__entry->blkno = blkno;
		__entry->dir = dir;
	),
	TP_printk("%.*s %p %llu", __entry->namelen, __get_str(name),
		  __entry->blkno, __entry->dir)
);
2075
2076TRACE_EVENT(ocfs2_check_dir_for_entry,
2077 TP_PROTO(unsigned long long dir, int namelen, const char *name),
2078 TP_ARGS(dir, namelen, name),
2079 TP_STRUCT__entry(
2080 __field(unsigned long long, dir)
2081 __field(int, namelen)
2082 __string(name, name)
2083 ),
2084 TP_fast_assign(
2085 __entry->dir = dir;
2086 __entry->namelen = namelen;
2087 __assign_str(name, name);
2088 ),
2089 TP_printk("%llu %.*s", __entry->dir,
2090 __entry->namelen, __get_str(name))
2091);
2092
2093DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_dx_dir_attach_index);
2094
2095DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_format_cluster);
2096
2097TRACE_EVENT(ocfs2_dx_dir_index_root_block,
2098 TP_PROTO(unsigned long long dir,
2099 unsigned int major_hash, unsigned int minor_hash,
2100 int namelen, const char *name, unsigned int num_used),
2101 TP_ARGS(dir, major_hash, minor_hash, namelen, name, num_used),
2102 TP_STRUCT__entry(
2103 __field(unsigned long long, dir)
2104 __field(unsigned int, major_hash)
2105 __field(unsigned int, minor_hash)
2106 __field(int, namelen)
2107 __string(name, name)
2108 __field(unsigned int, num_used)
2109 ),
2110 TP_fast_assign(
2111 __entry->dir = dir;
2112 __entry->major_hash = major_hash;
2113 __entry->minor_hash = minor_hash;
2114 __entry->namelen = namelen;
2115 __assign_str(name, name);
2116 __entry->num_used = num_used;
2117 ),
2118 TP_printk("%llu %x %x %.*s %u", __entry->dir,
2119 __entry->major_hash, __entry->minor_hash,
2120 __entry->namelen, __get_str(name), __entry->num_used)
2121);
2122
2123DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_extend_dir);
2124
2125DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_rebalance);
2126
2127DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_dx_dir_rebalance_split);
2128
2129DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_prepare_dir_for_insert);
2130
2131/* End of trace events for fs/ocfs2/dir.c. */
2132
2133/* Trace events for fs/ocfs2/namei.c. */
2134
/* Shared event class for the namei.c dentry operations: parent dir and
 * dentry pointers, the entry name, the directory's block number and one
 * operation-specific extra value. Instantiated via
 * DEFINE_OCFS2_DENTRY_OPS below. */
DECLARE_EVENT_CLASS(ocfs2__dentry_ops,
	TP_PROTO(void *dir, void *dentry, int name_len, const char *name,
		 unsigned long long dir_blkno, unsigned long long extra),
	TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra),
	TP_STRUCT__entry(
		__field(void *, dir)
		__field(void *, dentry)
		__field(int, name_len)
		__string(name, name)
		__field(unsigned long long, dir_blkno)
		__field(unsigned long long, extra)
	),
	TP_fast_assign(
		__entry->dir = dir;
		__entry->dentry = dentry;
		__entry->name_len = name_len;
		__assign_str(name, name);
		__entry->dir_blkno = dir_blkno;
		__entry->extra = extra;
	),
	TP_printk("%p %p %.*s %llu %llu", __entry->dir, __entry->dentry,
		  __entry->name_len, __get_str(name),
		  __entry->dir_blkno, __entry->extra)
);

/* Stamp out a concrete event from the ocfs2__dentry_ops class. */
#define DEFINE_OCFS2_DENTRY_OPS(name) \
DEFINE_EVENT(ocfs2__dentry_ops, name, \
TP_PROTO(void *dir, void *dentry, int name_len, const char *name, \
	 unsigned long long dir_blkno, unsigned long long extra), \
	TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra))
2165
2166DEFINE_OCFS2_DENTRY_OPS(ocfs2_lookup);
2167
2168DEFINE_OCFS2_DENTRY_OPS(ocfs2_mkdir);
2169
2170DEFINE_OCFS2_DENTRY_OPS(ocfs2_create);
2171
2172DEFINE_OCFS2_DENTRY_OPS(ocfs2_unlink);
2173
2174DEFINE_OCFS2_DENTRY_OPS(ocfs2_symlink_create);
2175
2176DEFINE_OCFS2_DENTRY_OPS(ocfs2_mv_orphaned_inode_to_new);
2177
2178DEFINE_OCFS2_POINTER_EVENT(ocfs2_lookup_ret);
2179
/* Like the ocfs2__dentry_ops class but with device number and mode
 * instead of a single extra value, for node creation. */
TRACE_EVENT(ocfs2_mknod,
	TP_PROTO(void *dir, void *dentry, int name_len, const char *name,
		 unsigned long long dir_blkno, unsigned long dev, int mode),
	TP_ARGS(dir, dentry, name_len, name, dir_blkno, dev, mode),
	TP_STRUCT__entry(
		__field(void *, dir)
		__field(void *, dentry)
		__field(int, name_len)
		__string(name, name)
		__field(unsigned long long, dir_blkno)
		__field(unsigned long, dev)
		__field(int, mode)
	),
	TP_fast_assign(
		__entry->dir = dir;
		__entry->dentry = dentry;
		__entry->name_len = name_len;
		__assign_str(name, name);
		__entry->dir_blkno = dir_blkno;
		__entry->dev = dev;
		__entry->mode = mode;
	),
	TP_printk("%p %p %.*s %llu %lu %d", __entry->dir, __entry->dentry,
		  __entry->name_len, __get_str(name),
		  __entry->dir_blkno, __entry->dev, __entry->mode)
);
2206
2207TRACE_EVENT(ocfs2_link,
2208 TP_PROTO(unsigned long long ino, int old_len, const char *old_name,
2209 int name_len, const char *name),
2210 TP_ARGS(ino, old_len, old_name, name_len, name),
2211 TP_STRUCT__entry(
2212 __field(unsigned long long, ino)
2213 __field(int, old_len)
2214 __string(old_name, old_name)
2215 __field(int, name_len)
2216 __string(name, name)
2217 ),
2218 TP_fast_assign(
2219 __entry->ino = ino;
2220 __entry->old_len = old_len;
2221 __assign_str(old_name, old_name);
2222 __entry->name_len = name_len;
2223 __assign_str(name, name);
2224 ),
2225 TP_printk("%llu %.*s %.*s", __entry->ino,
2226 __entry->old_len, __get_str(old_name),
2227 __entry->name_len, __get_str(name))
2228);
2229
2230DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_unlink_noent);
2231
2232DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock);
2233
2234DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock_end);
2235
/* Records a rename: old/new parent dir and dentry pointers plus both
 * names (each with an explicit length, printed via %.*s). */
TRACE_EVENT(ocfs2_rename,
	TP_PROTO(void *old_dir, void *old_dentry,
		 void *new_dir, void *new_dentry,
		 int old_len, const char *old_name,
		 int new_len, const char *new_name),
	TP_ARGS(old_dir, old_dentry, new_dir, new_dentry,
		old_len, old_name, new_len, new_name),
	TP_STRUCT__entry(
		__field(void *, old_dir)
		__field(void *, old_dentry)
		__field(void *, new_dir)
		__field(void *, new_dentry)
		__field(int, old_len)
		__string(old_name, old_name)
		__field(int, new_len)
		__string(new_name, new_name)
	),
	TP_fast_assign(
		__entry->old_dir = old_dir;
		__entry->old_dentry = old_dentry;
		__entry->new_dir = new_dir;
		__entry->new_dentry = new_dentry;
		__entry->old_len = old_len;
		__assign_str(old_name, old_name);
		__entry->new_len = new_len;
		__assign_str(new_name, new_name);
	),
	TP_printk("%p %p %p %p %.*s %.*s",
		  __entry->old_dir, __entry->old_dentry,
		  __entry->new_dir, __entry->new_dentry,
		  __entry->old_len, __get_str(old_name),
		  __entry->new_len, __get_str(new_name))
);
2269
2270TRACE_EVENT(ocfs2_rename_target_exists,
2271 TP_PROTO(int new_len, const char *new_name),
2272 TP_ARGS(new_len, new_name),
2273 TP_STRUCT__entry(
2274 __field(int, new_len)
2275 __string(new_name, new_name)
2276 ),
2277 TP_fast_assign(
2278 __entry->new_len = new_len;
2279 __assign_str(new_name, new_name);
2280 ),
2281 TP_printk("%.*s", __entry->new_len, __get_str(new_name))
2282);
2283
2284DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_rename_disagree);
2285
2286TRACE_EVENT(ocfs2_rename_over_existing,
2287 TP_PROTO(unsigned long long new_blkno, void *new_bh,
2288 unsigned long long newdi_blkno),
2289 TP_ARGS(new_blkno, new_bh, newdi_blkno),
2290 TP_STRUCT__entry(
2291 __field(unsigned long long, new_blkno)
2292 __field(void *, new_bh)
2293 __field(unsigned long long, newdi_blkno)
2294 ),
2295 TP_fast_assign(
2296 __entry->new_blkno = new_blkno;
2297 __entry->new_bh = new_bh;
2298 __entry->newdi_blkno = newdi_blkno;
2299 ),
2300 TP_printk("%llu %p %llu", __entry->new_blkno, __entry->new_bh,
2301 __entry->newdi_blkno)
2302);
2303
2304DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_create_symlink_data);
2305
/* Records symlink creation: parent dir and dentry pointers, the link
 * target string (stored as a raw pointer, not copied — only valid while
 * the caller's buffer lives), and the new entry's name. */
TRACE_EVENT(ocfs2_symlink_begin,
	TP_PROTO(void *dir, void *dentry, const char *symname,
		 int len, const char *name),
	TP_ARGS(dir, dentry, symname, len, name),
	TP_STRUCT__entry(
		__field(void *, dir)
		__field(void *, dentry)
		__field(const char *, symname)
		__field(int, len)
		__string(name, name)
	),
	TP_fast_assign(
		__entry->dir = dir;
		__entry->dentry = dentry;
		__entry->symname = symname;
		__entry->len = len;
		__assign_str(name, name);
	),
	TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry,
		  __entry->symname, __entry->len, __get_str(name))
);
2327
/* Records a block number together with its generated string form and
 * that string's length. */
TRACE_EVENT(ocfs2_blkno_stringify,
	TP_PROTO(unsigned long long blkno, const char *name, int namelen),
	TP_ARGS(blkno, name, namelen),
	TP_STRUCT__entry(
		__field(unsigned long long, blkno)
		__string(name, name)
		__field(int, namelen)
	),
	TP_fast_assign(
		__entry->blkno = blkno;
		__assign_str(name, name);
		__entry->namelen = namelen;
	),
	TP_printk("%llu %s %d", __entry->blkno, __get_str(name),
		  __entry->namelen)
);
2344
2345DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_add_begin);
2346
2347DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_orphan_add_end);
2348
2349TRACE_EVENT(ocfs2_orphan_del,
2350 TP_PROTO(unsigned long long dir, const char *name, int namelen),
2351 TP_ARGS(dir, name, namelen),
2352 TP_STRUCT__entry(
2353 __field(unsigned long long, dir)
2354 __string(name, name)
2355 __field(int, namelen)
2356 ),
2357 TP_fast_assign(
2358 __entry->dir = dir;
2359 __assign_str(name, name);
2360 __entry->namelen = namelen;
2361 ),
2362 TP_printk("%llu %s %d", __entry->dir, __get_str(name),
2363 __entry->namelen)
2364);
2365
2366/* End of trace events for fs/ocfs2/namei.c. */
2367
2368/* Trace events for fs/ocfs2/dcache.c. */
2369
2370TRACE_EVENT(ocfs2_dentry_revalidate,
2371 TP_PROTO(void *dentry, int len, const char *name),
2372 TP_ARGS(dentry, len, name),
2373 TP_STRUCT__entry(
2374 __field(void *, dentry)
2375 __field(int, len)
2376 __string(name, name)
2377 ),
2378 TP_fast_assign(
2379 __entry->dentry = dentry;
2380 __entry->len = len;
2381 __assign_str(name, name);
2382 ),
2383 TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name))
2384);
2385
2386TRACE_EVENT(ocfs2_dentry_revalidate_negative,
2387 TP_PROTO(int len, const char *name, unsigned long pgen,
2388 unsigned long gen),
2389 TP_ARGS(len, name, pgen, gen),
2390 TP_STRUCT__entry(
2391 __field(int, len)
2392 __string(name, name)
2393 __field(unsigned long, pgen)
2394 __field(unsigned long, gen)
2395 ),
2396 TP_fast_assign(
2397 __entry->len = len;
2398 __assign_str(name, name);
2399 __entry->pgen = pgen;
2400 __entry->gen = gen;
2401 ),
2402 TP_printk("%.*s %lu %lu", __entry->len, __get_str(name),
2403 __entry->pgen, __entry->gen)
2404);
2405
2406DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_delete);
2407
2408DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_dentry_revalidate_orphaned);
2409
2410DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_nofsdata);
2411
2412DEFINE_OCFS2_INT_EVENT(ocfs2_dentry_revalidate_ret);
2413
2414TRACE_EVENT(ocfs2_find_local_alias,
2415 TP_PROTO(int len, const char *name),
2416 TP_ARGS(len, name),
2417 TP_STRUCT__entry(
2418 __field(int, len)
2419 __string(name, name)
2420 ),
2421 TP_fast_assign(
2422 __entry->len = len;
2423 __assign_str(name, name);
2424 ),
2425 TP_printk("%.*s", __entry->len, __get_str(name))
2426);
2427
/* Records dentry lock attachment: entry name, parent inode number and
 * the dentry's fsdata pointer. */
TRACE_EVENT(ocfs2_dentry_attach_lock,
	TP_PROTO(int len, const char *name,
		 unsigned long long parent, void *fsdata),
	TP_ARGS(len, name, parent, fsdata),
	TP_STRUCT__entry(
		__field(int, len)
		__string(name, name)
		__field(unsigned long long, parent)
		__field(void *, fsdata)
	),
	TP_fast_assign(
		__entry->len = len;
		__assign_str(name, name);
		__entry->parent = parent;
		__entry->fsdata = fsdata;
	),
	TP_printk("%.*s %llu %p", __entry->len, __get_str(name),
		  __entry->parent, __entry->fsdata)
);
2447
2448TRACE_EVENT(ocfs2_dentry_attach_lock_found,
2449 TP_PROTO(const char *name, unsigned long long parent,
2450 unsigned long long ino),
2451 TP_ARGS(name, parent, ino),
2452 TP_STRUCT__entry(
2453 __string(name, name)
2454 __field(unsigned long long, parent)
2455 __field(unsigned long long, ino)
2456 ),
2457 TP_fast_assign(
2458 __assign_str(name, name);
2459 __entry->parent = parent;
2460 __entry->ino = ino;
2461 ),
2462 TP_printk("%s %llu %llu", __get_str(name), __entry->parent, __entry->ino)
2463);
2464/* End of trace events for fs/ocfs2/dcache.c. */
2465
2466/* Trace events for fs/ocfs2/export.c. */
2467
2468TRACE_EVENT(ocfs2_get_dentry_begin,
2469 TP_PROTO(void *sb, void *handle, unsigned long long blkno),
2470 TP_ARGS(sb, handle, blkno),
2471 TP_STRUCT__entry(
2472 __field(void *, sb)
2473 __field(void *, handle)
2474 __field(unsigned long long, blkno)
2475 ),
2476 TP_fast_assign(
2477 __entry->sb = sb;
2478 __entry->handle = handle;
2479 __entry->blkno = blkno;
2480 ),
2481 TP_printk("%p %p %llu", __entry->sb, __entry->handle, __entry->blkno)
2482);
2483
2484DEFINE_OCFS2_INT_INT_EVENT(ocfs2_get_dentry_test_bit);
2485
2486DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_get_dentry_stale);
2487
2488DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_get_dentry_generation);
2489
2490DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_dentry_end);
2491
2492TRACE_EVENT(ocfs2_get_parent,
2493 TP_PROTO(void *child, int len, const char *name,
2494 unsigned long long ino),
2495 TP_ARGS(child, len, name, ino),
2496 TP_STRUCT__entry(
2497 __field(void *, child)
2498 __field(int, len)
2499 __string(name, name)
2500 __field(unsigned long long, ino)
2501 ),
2502 TP_fast_assign(
2503 __entry->child = child;
2504 __entry->len = len;
2505 __assign_str(name, name);
2506 __entry->ino = ino;
2507 ),
2508 TP_printk("%p %.*s %llu", __entry->child, __entry->len,
2509 __get_str(name), __entry->ino)
2510);
2511
2512DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_parent_end);
2513
/* Records NFS file-handle encoding: dentry pointer and name, the handle
 * buffer pointer and length, and the 'connectable' flag. */
TRACE_EVENT(ocfs2_encode_fh_begin,
	TP_PROTO(void *dentry, int name_len, const char *name,
		 void *fh, int len, int connectable),
	TP_ARGS(dentry, name_len, name, fh, len, connectable),
	TP_STRUCT__entry(
		__field(void *, dentry)
		__field(int, name_len)
		__string(name, name)
		__field(void *, fh)
		__field(int, len)
		__field(int, connectable)
	),
	TP_fast_assign(
		__entry->dentry = dentry;
		__entry->name_len = name_len;
		__assign_str(name, name);
		__entry->fh = fh;
		__entry->len = len;
		__entry->connectable = connectable;
	),
	TP_printk("%p %.*s %p %d %d", __entry->dentry, __entry->name_len,
		  __get_str(name), __entry->fh, __entry->len,
		  __entry->connectable)
);
2538
2539DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_self);
2540
2541DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_parent);
2542
2543DEFINE_OCFS2_INT_EVENT(ocfs2_encode_fh_type);
2544
2545/* End of trace events for fs/ocfs2/export.c. */
2546
2547/* Trace events for fs/ocfs2/journal.c. */
2548
2549DEFINE_OCFS2_UINT_EVENT(ocfs2_commit_cache_begin);
2550
2551DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end);
2552
2553DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
2554
2555DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
2556
2557DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access);
2558
2559DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty);
2560
2561DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_journal_init);
2562
2563DEFINE_OCFS2_UINT_EVENT(ocfs2_journal_init_maxlen);
2564
2565DEFINE_OCFS2_INT_EVENT(ocfs2_journal_shutdown);
2566
2567DEFINE_OCFS2_POINTER_EVENT(ocfs2_journal_shutdown_wait);
2568
2569DEFINE_OCFS2_ULL_EVENT(ocfs2_complete_recovery);
2570
2571DEFINE_OCFS2_INT_EVENT(ocfs2_complete_recovery_end);
2572
/* Records per-slot recovery completion: slot number, local-alloc and
 * truncate-log inode numbers, and the quota recovery record pointer. */
TRACE_EVENT(ocfs2_complete_recovery_slot,
	TP_PROTO(int slot, unsigned long long la_ino,
		 unsigned long long tl_ino, void *qrec),
	TP_ARGS(slot, la_ino, tl_ino, qrec),
	TP_STRUCT__entry(
		__field(int, slot)
		__field(unsigned long long, la_ino)
		__field(unsigned long long, tl_ino)
		__field(void *, qrec)
	),
	TP_fast_assign(
		__entry->slot = slot;
		__entry->la_ino = la_ino;
		__entry->tl_ino = tl_ino;
		__entry->qrec = qrec;
	),
	TP_printk("%d %llu %llu %p", __entry->slot, __entry->la_ino,
		  __entry->tl_ino, __entry->qrec)
);
2592
2593DEFINE_OCFS2_INT_INT_EVENT(ocfs2_recovery_thread_node);
2594
2595DEFINE_OCFS2_INT_EVENT(ocfs2_recovery_thread_end);
2596
2597TRACE_EVENT(ocfs2_recovery_thread,
2598 TP_PROTO(int node_num, int osb_node_num, int disable,
2599 void *recovery_thread, int map_set),
2600 TP_ARGS(node_num, osb_node_num, disable, recovery_thread, map_set),
2601 TP_STRUCT__entry(
2602 __field(int, node_num)
2603 __field(int, osb_node_num)
2604 __field(int,disable)
2605 __field(void *, recovery_thread)
2606 __field(int,map_set)
2607 ),
2608 TP_fast_assign(
2609 __entry->node_num = node_num;
2610 __entry->osb_node_num = osb_node_num;
2611 __entry->disable = disable;
2612 __entry->recovery_thread = recovery_thread;
2613 __entry->map_set = map_set;
2614 ),
2615 TP_printk("%d %d %d %p %d", __entry->node_num,
2616 __entry->osb_node_num, __entry->disable,
2617 __entry->recovery_thread, __entry->map_set)
2618);
2619
2620DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_replay_journal_recovered);
2621
2622DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_lock_err);
2623
2624DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_skip);
2625
2626DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_recover_node);
2627
2628DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_recover_node_skip);
2629
2630DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_mark_dead_nodes);
2631
2632DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_begin);
2633
2634DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_end);
2635
2636DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_filldir);
2637
2638DEFINE_OCFS2_INT_EVENT(ocfs2_recover_orphans);
2639
2640DEFINE_OCFS2_ULL_EVENT(ocfs2_recover_orphans_iput);
2641
2642DEFINE_OCFS2_INT_EVENT(ocfs2_wait_on_mount);
2643
2644/* End of trace events for fs/ocfs2/journal.c. */
2645
2646/* Trace events for fs/ocfs2/buffer_head_io.c. */
2647
2648DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_read_blocks_sync);
2649
2650DEFINE_OCFS2_ULL_EVENT(ocfs2_read_blocks_sync_jbd);
2651
2652DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_read_blocks_from_disk);
2653
2654DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_bh);
2655
2656DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_end);
2657
2658TRACE_EVENT(ocfs2_write_block,
2659 TP_PROTO(unsigned long long block, void *ci),
2660 TP_ARGS(block, ci),
2661 TP_STRUCT__entry(
2662 __field(unsigned long long, block)
2663 __field(void *, ci)
2664 ),
2665 TP_fast_assign(
2666 __entry->block = block;
2667 __entry->ci = ci;
2668 ),
2669 TP_printk("%llu %p", __entry->block, __entry->ci)
2670);
2671
/* Records the start of a multi-block read: caching info pointer, first
 * block number, number of blocks and the read flags. */
TRACE_EVENT(ocfs2_read_blocks_begin,
	TP_PROTO(void *ci, unsigned long long block,
		 unsigned int nr, int flags),
	TP_ARGS(ci, block, nr, flags),
	TP_STRUCT__entry(
		__field(void *, ci)
		__field(unsigned long long, block)
		__field(unsigned int, nr)
		__field(int, flags)
	),
	TP_fast_assign(
		__entry->ci = ci;
		__entry->block = block;
		__entry->nr = nr;
		__entry->flags = flags;
	),
	TP_printk("%p %llu %u %d", __entry->ci, __entry->block,
		  __entry->nr, __entry->flags)
);
2691
2692/* End of trace events for fs/ocfs2/buffer_head_io.c. */
2693
2694/* Trace events for fs/ocfs2/uptodate.c. */
2695
2696DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_copied_metadata_tree);
2697
2698DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_metadata_cache_purge);
2699
2700DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_buffer_cached_begin);
2701
2702TRACE_EVENT(ocfs2_buffer_cached_end,
2703 TP_PROTO(int index, void *item),
2704 TP_ARGS(index, item),
2705 TP_STRUCT__entry(
2706 __field(int, index)
2707 __field(void *, item)
2708 ),
2709 TP_fast_assign(
2710 __entry->index = index;
2711 __entry->item = item;
2712 ),
2713 TP_printk("%d %p", __entry->index, __entry->item)
2714);
2715
2716DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_append_cache_array);
2717
2718DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_insert_cache_tree);
2719
2720DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_expand_cache);
2721
2722DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_set_buffer_uptodate);
2723
2724DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_set_buffer_uptodate_begin);
2725
2726DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_remove_metadata_array);
2727
2728DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_remove_metadata_tree);
2729
2730DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_remove_block_from_cache);
2731
2732/* End of trace events for fs/ocfs2/uptodate.c. */
2733#endif /* _TRACE_OCFS2_H */
2734
2735/* This part must be outside protection */
2736#undef TRACE_INCLUDE_PATH
2737#define TRACE_INCLUDE_PATH .
2738#define TRACE_INCLUDE_FILE ocfs2_trace
2739#include <trace/define_trace.h>
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 196fcb52d95d..d5ab56cbe5c5 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -114,7 +114,4 @@ int ocfs2_local_write_dquot(struct dquot *dquot);
114extern const struct dquot_operations ocfs2_quota_operations; 114extern const struct dquot_operations ocfs2_quota_operations;
115extern struct quota_format_type ocfs2_quota_format; 115extern struct quota_format_type ocfs2_quota_format;
116 116
117int ocfs2_quota_setup(void);
118void ocfs2_quota_shutdown(void);
119
120#endif /* _OCFS2_QUOTA_H */ 117#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 4607923eb24c..92fcd575775a 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -11,7 +11,6 @@
11#include <linux/writeback.h> 11#include <linux/writeback.h>
12#include <linux/workqueue.h> 12#include <linux/workqueue.h>
13 13
14#define MLOG_MASK_PREFIX ML_QUOTA
15#include <cluster/masklog.h> 14#include <cluster/masklog.h>
16 15
17#include "ocfs2_fs.h" 16#include "ocfs2_fs.h"
@@ -27,6 +26,7 @@
27#include "super.h" 26#include "super.h"
28#include "buffer_head_io.h" 27#include "buffer_head_io.h"
29#include "quota.h" 28#include "quota.h"
29#include "ocfs2_trace.h"
30 30
31/* 31/*
32 * Locking of quotas with OCFS2 is rather complex. Here are rules that 32 * Locking of quotas with OCFS2 is rather complex. Here are rules that
@@ -63,8 +63,6 @@
63 * write to gf 63 * write to gf
64 */ 64 */
65 65
66static struct workqueue_struct *ocfs2_quota_wq = NULL;
67
68static void qsync_work_fn(struct work_struct *work); 66static void qsync_work_fn(struct work_struct *work);
69 67
70static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp) 68static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
@@ -132,8 +130,7 @@ int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh)
132 struct ocfs2_disk_dqtrailer *dqt = 130 struct ocfs2_disk_dqtrailer *dqt =
133 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data); 131 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
134 132
135 mlog(0, "Validating quota block %llu\n", 133 trace_ocfs2_validate_quota_block((unsigned long long)bh->b_blocknr);
136 (unsigned long long)bh->b_blocknr);
137 134
138 BUG_ON(!buffer_uptodate(bh)); 135 BUG_ON(!buffer_uptodate(bh));
139 136
@@ -343,8 +340,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
343 u64 pcount; 340 u64 pcount;
344 int status; 341 int status;
345 342
346 mlog_entry_void();
347
348 /* Read global header */ 343 /* Read global header */
349 gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], 344 gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
350 OCFS2_INVALID_SLOT); 345 OCFS2_INVALID_SLOT);
@@ -400,11 +395,12 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
400 OCFS2_QBLK_RESERVED_SPACE; 395 OCFS2_QBLK_RESERVED_SPACE;
401 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); 396 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
402 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); 397 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
403 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, 398 schedule_delayed_work(&oinfo->dqi_sync_work,
404 msecs_to_jiffies(oinfo->dqi_syncms)); 399 msecs_to_jiffies(oinfo->dqi_syncms));
405 400
406out_err: 401out_err:
407 mlog_exit(status); 402 if (status)
403 mlog_errno(status);
408 return status; 404 return status;
409out_unlock: 405out_unlock:
410 ocfs2_unlock_global_qf(oinfo, 0); 406 ocfs2_unlock_global_qf(oinfo, 0);
@@ -510,9 +506,10 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
510 olditime = dquot->dq_dqb.dqb_itime; 506 olditime = dquot->dq_dqb.dqb_itime;
511 oldbtime = dquot->dq_dqb.dqb_btime; 507 oldbtime = dquot->dq_dqb.dqb_btime;
512 ocfs2_global_disk2memdqb(dquot, &dqblk); 508 ocfs2_global_disk2memdqb(dquot, &dqblk);
513 mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n", 509 trace_ocfs2_sync_dquot(dquot->dq_id, dquot->dq_dqb.dqb_curspace,
514 dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange, 510 (long long)spacechange,
515 dquot->dq_dqb.dqb_curinodes, (long long)inodechange); 511 dquot->dq_dqb.dqb_curinodes,
512 (long long)inodechange);
516 if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags)) 513 if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
517 dquot->dq_dqb.dqb_curspace += spacechange; 514 dquot->dq_dqb.dqb_curspace += spacechange;
518 if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags)) 515 if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
@@ -559,7 +556,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
559 spin_unlock(&dq_data_lock); 556 spin_unlock(&dq_data_lock);
560 err = ocfs2_qinfo_lock(info, freeing); 557 err = ocfs2_qinfo_lock(info, freeing);
561 if (err < 0) { 558 if (err < 0) {
562 mlog(ML_ERROR, "Failed to lock quota info, loosing quota write" 559 mlog(ML_ERROR, "Failed to lock quota info, losing quota write"
563 " (type=%d, id=%u)\n", dquot->dq_type, 560 " (type=%d, id=%u)\n", dquot->dq_type,
564 (unsigned)dquot->dq_id); 561 (unsigned)dquot->dq_id);
565 goto out; 562 goto out;
@@ -596,8 +593,8 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
596 struct ocfs2_super *osb = OCFS2_SB(sb); 593 struct ocfs2_super *osb = OCFS2_SB(sb);
597 int status = 0; 594 int status = 0;
598 595
599 mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id, 596 trace_ocfs2_sync_dquot_helper(dquot->dq_id, dquot->dq_type,
600 dquot->dq_type, type, sb->s_id); 597 type, sb->s_id);
601 if (type != dquot->dq_type) 598 if (type != dquot->dq_type)
602 goto out; 599 goto out;
603 status = ocfs2_lock_global_qf(oinfo, 1); 600 status = ocfs2_lock_global_qf(oinfo, 1);
@@ -623,7 +620,6 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
623out_ilock: 620out_ilock:
624 ocfs2_unlock_global_qf(oinfo, 1); 621 ocfs2_unlock_global_qf(oinfo, 1);
625out: 622out:
626 mlog_exit(status);
627 return status; 623 return status;
628} 624}
629 625
@@ -635,8 +631,8 @@ static void qsync_work_fn(struct work_struct *work)
635 struct super_block *sb = oinfo->dqi_gqinode->i_sb; 631 struct super_block *sb = oinfo->dqi_gqinode->i_sb;
636 632
637 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); 633 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
638 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, 634 schedule_delayed_work(&oinfo->dqi_sync_work,
639 msecs_to_jiffies(oinfo->dqi_syncms)); 635 msecs_to_jiffies(oinfo->dqi_syncms));
640} 636}
641 637
642/* 638/*
@@ -649,7 +645,7 @@ static int ocfs2_write_dquot(struct dquot *dquot)
649 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); 645 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
650 int status = 0; 646 int status = 0;
651 647
652 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 648 trace_ocfs2_write_dquot(dquot->dq_id, dquot->dq_type);
653 649
654 handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS); 650 handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
655 if (IS_ERR(handle)) { 651 if (IS_ERR(handle)) {
@@ -662,7 +658,6 @@ static int ocfs2_write_dquot(struct dquot *dquot)
662 mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); 658 mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
663 ocfs2_commit_trans(osb, handle); 659 ocfs2_commit_trans(osb, handle);
664out: 660out:
665 mlog_exit(status);
666 return status; 661 return status;
667} 662}
668 663
@@ -688,7 +683,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
688 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); 683 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
689 int status = 0; 684 int status = 0;
690 685
691 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 686 trace_ocfs2_release_dquot(dquot->dq_id, dquot->dq_type);
692 687
693 mutex_lock(&dquot->dq_lock); 688 mutex_lock(&dquot->dq_lock);
694 /* Check whether we are not racing with some other dqget() */ 689 /* Check whether we are not racing with some other dqget() */
@@ -724,7 +719,8 @@ out_ilock:
724 ocfs2_unlock_global_qf(oinfo, 1); 719 ocfs2_unlock_global_qf(oinfo, 1);
725out: 720out:
726 mutex_unlock(&dquot->dq_lock); 721 mutex_unlock(&dquot->dq_lock);
727 mlog_exit(status); 722 if (status)
723 mlog_errno(status);
728 return status; 724 return status;
729} 725}
730 726
@@ -745,7 +741,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
745 int need_alloc = ocfs2_global_qinit_alloc(sb, type); 741 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
746 handle_t *handle; 742 handle_t *handle;
747 743
748 mlog_entry("id=%u, type=%d", dquot->dq_id, type); 744 trace_ocfs2_acquire_dquot(dquot->dq_id, type);
749 mutex_lock(&dquot->dq_lock); 745 mutex_lock(&dquot->dq_lock);
750 /* 746 /*
751 * We need an exclusive lock, because we're going to update use count 747 * We need an exclusive lock, because we're going to update use count
@@ -811,7 +807,8 @@ out_dq:
811 set_bit(DQ_ACTIVE_B, &dquot->dq_flags); 807 set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
812out: 808out:
813 mutex_unlock(&dquot->dq_lock); 809 mutex_unlock(&dquot->dq_lock);
814 mlog_exit(status); 810 if (status)
811 mlog_errno(status);
815 return status; 812 return status;
816} 813}
817 814
@@ -831,7 +828,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
831 handle_t *handle; 828 handle_t *handle;
832 struct ocfs2_super *osb = OCFS2_SB(sb); 829 struct ocfs2_super *osb = OCFS2_SB(sb);
833 830
834 mlog_entry("id=%u, type=%d", dquot->dq_id, type); 831 trace_ocfs2_mark_dquot_dirty(dquot->dq_id, type);
835 832
836 /* In case user set some limits, sync dquot immediately to global 833 /* In case user set some limits, sync dquot immediately to global
837 * quota file so that information propagates quicker */ 834 * quota file so that information propagates quicker */
@@ -868,7 +865,8 @@ out_dlock:
868out_ilock: 865out_ilock:
869 ocfs2_unlock_global_qf(oinfo, 1); 866 ocfs2_unlock_global_qf(oinfo, 1);
870out: 867out:
871 mlog_exit(status); 868 if (status)
869 mlog_errno(status);
872 return status; 870 return status;
873} 871}
874 872
@@ -879,8 +877,6 @@ static int ocfs2_write_info(struct super_block *sb, int type)
879 int status = 0; 877 int status = 0;
880 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; 878 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
881 879
882 mlog_entry_void();
883
884 status = ocfs2_lock_global_qf(oinfo, 1); 880 status = ocfs2_lock_global_qf(oinfo, 1);
885 if (status < 0) 881 if (status < 0)
886 goto out; 882 goto out;
@@ -895,7 +891,8 @@ static int ocfs2_write_info(struct super_block *sb, int type)
895out_ilock: 891out_ilock:
896 ocfs2_unlock_global_qf(oinfo, 1); 892 ocfs2_unlock_global_qf(oinfo, 1);
897out: 893out:
898 mlog_exit(status); 894 if (status)
895 mlog_errno(status);
899 return status; 896 return status;
900} 897}
901 898
@@ -923,20 +920,3 @@ const struct dquot_operations ocfs2_quota_operations = {
923 .alloc_dquot = ocfs2_alloc_dquot, 920 .alloc_dquot = ocfs2_alloc_dquot,
924 .destroy_dquot = ocfs2_destroy_dquot, 921 .destroy_dquot = ocfs2_destroy_dquot,
925}; 922};
926
927int ocfs2_quota_setup(void)
928{
929 ocfs2_quota_wq = create_workqueue("o2quot");
930 if (!ocfs2_quota_wq)
931 return -ENOMEM;
932 return 0;
933}
934
935void ocfs2_quota_shutdown(void)
936{
937 if (ocfs2_quota_wq) {
938 flush_workqueue(ocfs2_quota_wq);
939 destroy_workqueue(ocfs2_quota_wq);
940 ocfs2_quota_wq = NULL;
941 }
942}
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index dc78764ccc4c..dc8007fc9247 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -8,7 +8,6 @@
8#include <linux/quotaops.h> 8#include <linux/quotaops.h>
9#include <linux/module.h> 9#include <linux/module.h>
10 10
11#define MLOG_MASK_PREFIX ML_QUOTA
12#include <cluster/masklog.h> 11#include <cluster/masklog.h>
13 12
14#include "ocfs2_fs.h" 13#include "ocfs2_fs.h"
@@ -23,6 +22,7 @@
23#include "quota.h" 22#include "quota.h"
24#include "uptodate.h" 23#include "uptodate.h"
25#include "super.h" 24#include "super.h"
25#include "ocfs2_trace.h"
26 26
27/* Number of local quota structures per block */ 27/* Number of local quota structures per block */
28static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) 28static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -475,7 +475,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
475 struct ocfs2_recovery_chunk *rchunk, *next; 475 struct ocfs2_recovery_chunk *rchunk, *next;
476 qsize_t spacechange, inodechange; 476 qsize_t spacechange, inodechange;
477 477
478 mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type); 478 trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type);
479 479
480 list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) { 480 list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
481 chunk = rchunk->rc_chunk; 481 chunk = rchunk->rc_chunk;
@@ -575,7 +575,8 @@ out_put_bh:
575 } 575 }
576 if (status < 0) 576 if (status < 0)
577 free_recovery_list(&(rec->r_list[type])); 577 free_recovery_list(&(rec->r_list[type]));
578 mlog_exit(status); 578 if (status)
579 mlog_errno(status);
579 return status; 580 return status;
580} 581}
581 582
@@ -600,7 +601,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
600 for (type = 0; type < MAXQUOTAS; type++) { 601 for (type = 0; type < MAXQUOTAS; type++) {
601 if (list_empty(&(rec->r_list[type]))) 602 if (list_empty(&(rec->r_list[type])))
602 continue; 603 continue;
603 mlog(0, "Recovering quota in slot %d\n", slot_num); 604 trace_ocfs2_finish_quota_recovery(slot_num);
604 lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num); 605 lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
605 if (!lqinode) { 606 if (!lqinode) {
606 status = -ENOENT; 607 status = -ENOENT;
@@ -882,9 +883,10 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
882 dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes - 883 dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
883 od->dq_originodes); 884 od->dq_originodes);
884 spin_unlock(&dq_data_lock); 885 spin_unlock(&dq_data_lock);
885 mlog(0, "Writing local dquot %u space %lld inodes %lld\n", 886 trace_olq_set_dquot(
886 od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod), 887 (unsigned long long)le64_to_cpu(dqblk->dqb_spacemod),
887 (long long)le64_to_cpu(dqblk->dqb_inodemod)); 888 (unsigned long long)le64_to_cpu(dqblk->dqb_inodemod),
889 od->dq_dquot.dq_id);
888} 890}
889 891
890/* Write dquot to local quota file */ 892/* Write dquot to local quota file */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index b5f9160e93e9..5d32749c896d 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -16,7 +16,6 @@
16 */ 16 */
17 17
18#include <linux/sort.h> 18#include <linux/sort.h>
19#define MLOG_MASK_PREFIX ML_REFCOUNT
20#include <cluster/masklog.h> 19#include <cluster/masklog.h>
21#include "ocfs2.h" 20#include "ocfs2.h"
22#include "inode.h" 21#include "inode.h"
@@ -34,6 +33,7 @@
34#include "aops.h" 33#include "aops.h"
35#include "xattr.h" 34#include "xattr.h"
36#include "namei.h" 35#include "namei.h"
36#include "ocfs2_trace.h"
37 37
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
@@ -84,8 +84,7 @@ static int ocfs2_validate_refcount_block(struct super_block *sb,
84 struct ocfs2_refcount_block *rb = 84 struct ocfs2_refcount_block *rb =
85 (struct ocfs2_refcount_block *)bh->b_data; 85 (struct ocfs2_refcount_block *)bh->b_data;
86 86
87 mlog(0, "Validating refcount block %llu\n", 87 trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr);
88 (unsigned long long)bh->b_blocknr);
89 88
90 BUG_ON(!buffer_uptodate(bh)); 89 BUG_ON(!buffer_uptodate(bh));
91 90
@@ -545,8 +544,8 @@ void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
545 while ((node = rb_last(root)) != NULL) { 544 while ((node = rb_last(root)) != NULL) {
546 tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node); 545 tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
547 546
548 mlog(0, "Purge tree %llu\n", 547 trace_ocfs2_purge_refcount_trees(
549 (unsigned long long) tree->rf_blkno); 548 (unsigned long long) tree->rf_blkno);
550 549
551 rb_erase(&tree->rf_node, root); 550 rb_erase(&tree->rf_node, root);
552 ocfs2_free_refcount_tree(tree); 551 ocfs2_free_refcount_tree(tree);
@@ -575,7 +574,8 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
575 574
576 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); 575 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
577 576
578 mlog(0, "create tree for inode %lu\n", inode->i_ino); 577 trace_ocfs2_create_refcount_tree(
578 (unsigned long long)OCFS2_I(inode)->ip_blkno);
579 579
580 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 580 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
581 if (ret) { 581 if (ret) {
@@ -646,8 +646,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
646 di->i_refcount_loc = cpu_to_le64(first_blkno); 646 di->i_refcount_loc = cpu_to_le64(first_blkno);
647 spin_unlock(&oi->ip_lock); 647 spin_unlock(&oi->ip_lock);
648 648
649 mlog(0, "created tree for inode %lu, refblock %llu\n", 649 trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno);
650 inode->i_ino, (unsigned long long)first_blkno);
651 650
652 ocfs2_journal_dirty(handle, di_bh); 651 ocfs2_journal_dirty(handle, di_bh);
653 652
@@ -1256,8 +1255,9 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
1256 goto out; 1255 goto out;
1257 } 1256 }
1258 1257
1259 mlog(0, "change index %d, old count %u, change %d\n", index, 1258 trace_ocfs2_change_refcount_rec(
1260 le32_to_cpu(rec->r_refcount), change); 1259 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1260 index, le32_to_cpu(rec->r_refcount), change);
1261 le32_add_cpu(&rec->r_refcount, change); 1261 le32_add_cpu(&rec->r_refcount, change);
1262 1262
1263 if (!rec->r_refcount) { 1263 if (!rec->r_refcount) {
@@ -1353,8 +1353,8 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1353 1353
1354 ocfs2_journal_dirty(handle, ref_root_bh); 1354 ocfs2_journal_dirty(handle, ref_root_bh);
1355 1355
1356 mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno, 1356 trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno,
1357 le16_to_cpu(new_rb->rf_records.rl_used)); 1357 le16_to_cpu(new_rb->rf_records.rl_used));
1358 1358
1359 *ref_leaf_bh = new_bh; 1359 *ref_leaf_bh = new_bh;
1360 new_bh = NULL; 1360 new_bh = NULL;
@@ -1466,9 +1466,9 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
1466 (struct ocfs2_refcount_block *)new_bh->b_data; 1466 (struct ocfs2_refcount_block *)new_bh->b_data;
1467 struct ocfs2_refcount_list *new_rl = &new_rb->rf_records; 1467 struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
1468 1468
1469 mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n", 1469 trace_ocfs2_divide_leaf_refcount_block(
1470 (unsigned long long)ref_leaf_bh->b_blocknr, 1470 (unsigned long long)ref_leaf_bh->b_blocknr,
1471 le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used)); 1471 le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used));
1472 1472
1473 /* 1473 /*
1474 * XXX: Improvement later. 1474 * XXX: Improvement later.
@@ -1601,8 +1601,8 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1601 1601
1602 ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh); 1602 ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
1603 1603
1604 mlog(0, "insert new leaf block %llu at %u\n", 1604 trace_ocfs2_new_leaf_refcount_block(
1605 (unsigned long long)new_bh->b_blocknr, new_cpos); 1605 (unsigned long long)new_bh->b_blocknr, new_cpos);
1606 1606
1607 /* Insert the new leaf block with the specific offset cpos. */ 1607 /* Insert the new leaf block with the specific offset cpos. */
1608 ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr, 1608 ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
@@ -1794,11 +1794,10 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
1794 (le16_to_cpu(rf_list->rl_used) - index) * 1794 (le16_to_cpu(rf_list->rl_used) - index) *
1795 sizeof(struct ocfs2_refcount_rec)); 1795 sizeof(struct ocfs2_refcount_rec));
1796 1796
1797 mlog(0, "insert refcount record start %llu, len %u, count %u " 1797 trace_ocfs2_insert_refcount_rec(
1798 "to leaf block %llu at index %d\n", 1798 (unsigned long long)ref_leaf_bh->b_blocknr, index,
1799 (unsigned long long)le64_to_cpu(rec->r_cpos), 1799 (unsigned long long)le64_to_cpu(rec->r_cpos),
1800 le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount), 1800 le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount));
1801 (unsigned long long)ref_leaf_bh->b_blocknr, index);
1802 1801
1803 rf_list->rl_recs[index] = *rec; 1802 rf_list->rl_recs[index] = *rec;
1804 1803
@@ -1850,10 +1849,12 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1850 1849
1851 BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); 1850 BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1852 1851
1853 mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n", 1852 trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos),
1854 le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters), 1853 le32_to_cpu(orig_rec->r_clusters),
1855 le64_to_cpu(split_rec->r_cpos), 1854 le32_to_cpu(orig_rec->r_refcount),
1856 le32_to_cpu(split_rec->r_clusters)); 1855 le64_to_cpu(split_rec->r_cpos),
1856 le32_to_cpu(split_rec->r_clusters),
1857 le32_to_cpu(split_rec->r_refcount));
1857 1858
1858 /* 1859 /*
1859 * If we just need to split the header or tail clusters, 1860 * If we just need to split the header or tail clusters,
@@ -1967,12 +1968,11 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1967 1968
1968 if (split_rec->r_refcount) { 1969 if (split_rec->r_refcount) {
1969 rf_list->rl_recs[index] = *split_rec; 1970 rf_list->rl_recs[index] = *split_rec;
1970 mlog(0, "insert refcount record start %llu, len %u, count %u " 1971 trace_ocfs2_split_refcount_rec_insert(
1971 "to leaf block %llu at index %d\n", 1972 (unsigned long long)ref_leaf_bh->b_blocknr, index,
1972 (unsigned long long)le64_to_cpu(split_rec->r_cpos), 1973 (unsigned long long)le64_to_cpu(split_rec->r_cpos),
1973 le32_to_cpu(split_rec->r_clusters), 1974 le32_to_cpu(split_rec->r_clusters),
1974 le32_to_cpu(split_rec->r_refcount), 1975 le32_to_cpu(split_rec->r_refcount));
1975 (unsigned long long)ref_leaf_bh->b_blocknr, index);
1976 1976
1977 if (merge) 1977 if (merge)
1978 ocfs2_refcount_rec_merge(rb, index); 1978 ocfs2_refcount_rec_merge(rb, index);
@@ -1997,7 +1997,7 @@ static int __ocfs2_increase_refcount(handle_t *handle,
1997 struct ocfs2_refcount_rec rec; 1997 struct ocfs2_refcount_rec rec;
1998 unsigned int set_len = 0; 1998 unsigned int set_len = 0;
1999 1999
2000 mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n", 2000 trace_ocfs2_increase_refcount_begin(
2001 (unsigned long long)ocfs2_metadata_cache_owner(ci), 2001 (unsigned long long)ocfs2_metadata_cache_owner(ci),
2002 (unsigned long long)cpos, len); 2002 (unsigned long long)cpos, len);
2003 2003
@@ -2024,9 +2024,9 @@ static int __ocfs2_increase_refcount(handle_t *handle,
2024 */ 2024 */
2025 if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos && 2025 if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
2026 set_len <= len) { 2026 set_len <= len) {
2027 mlog(0, "increase refcount rec, start %llu, len %u, " 2027 trace_ocfs2_increase_refcount_change(
2028 "count %u\n", (unsigned long long)cpos, set_len, 2028 (unsigned long long)cpos, set_len,
2029 le32_to_cpu(rec.r_refcount)); 2029 le32_to_cpu(rec.r_refcount));
2030 ret = ocfs2_change_refcount_rec(handle, ci, 2030 ret = ocfs2_change_refcount_rec(handle, ci,
2031 ref_leaf_bh, index, 2031 ref_leaf_bh, index,
2032 merge, 1); 2032 merge, 1);
@@ -2037,7 +2037,7 @@ static int __ocfs2_increase_refcount(handle_t *handle,
2037 } else if (!rec.r_refcount) { 2037 } else if (!rec.r_refcount) {
2038 rec.r_refcount = cpu_to_le32(1); 2038 rec.r_refcount = cpu_to_le32(1);
2039 2039
2040 mlog(0, "insert refcount rec, start %llu, len %u\n", 2040 trace_ocfs2_increase_refcount_insert(
2041 (unsigned long long)le64_to_cpu(rec.r_cpos), 2041 (unsigned long long)le64_to_cpu(rec.r_cpos),
2042 set_len); 2042 set_len);
2043 ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh, 2043 ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
@@ -2055,8 +2055,7 @@ static int __ocfs2_increase_refcount(handle_t *handle,
2055 rec.r_clusters = cpu_to_le32(set_len); 2055 rec.r_clusters = cpu_to_le32(set_len);
2056 le32_add_cpu(&rec.r_refcount, 1); 2056 le32_add_cpu(&rec.r_refcount, 1);
2057 2057
2058 mlog(0, "split refcount rec, start %llu, " 2058 trace_ocfs2_increase_refcount_split(
2059 "len %u, count %u\n",
2060 (unsigned long long)le64_to_cpu(rec.r_cpos), 2059 (unsigned long long)le64_to_cpu(rec.r_cpos),
2061 set_len, le32_to_cpu(rec.r_refcount)); 2060 set_len, le32_to_cpu(rec.r_refcount));
2062 ret = ocfs2_split_refcount_rec(handle, ci, 2061 ret = ocfs2_split_refcount_rec(handle, ci,
@@ -2095,6 +2094,11 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
2095 2094
2096 BUG_ON(rb->rf_records.rl_used); 2095 BUG_ON(rb->rf_records.rl_used);
2097 2096
2097 trace_ocfs2_remove_refcount_extent(
2098 (unsigned long long)ocfs2_metadata_cache_owner(ci),
2099 (unsigned long long)ref_leaf_bh->b_blocknr,
2100 le32_to_cpu(rb->rf_cpos));
2101
2098 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); 2102 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2099 ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos), 2103 ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
2100 1, meta_ac, dealloc); 2104 1, meta_ac, dealloc);
@@ -2137,7 +2141,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
2137 if (!rb->rf_list.l_next_free_rec) { 2141 if (!rb->rf_list.l_next_free_rec) {
2138 BUG_ON(rb->rf_clusters); 2142 BUG_ON(rb->rf_clusters);
2139 2143
2140 mlog(0, "reset refcount tree root %llu to be a record block.\n", 2144 trace_ocfs2_restore_refcount_block(
2141 (unsigned long long)ref_root_bh->b_blocknr); 2145 (unsigned long long)ref_root_bh->b_blocknr);
2142 2146
2143 rb->rf_flags = 0; 2147 rb->rf_flags = 0;
@@ -2184,6 +2188,10 @@ static int ocfs2_decrease_refcount_rec(handle_t *handle,
2184 BUG_ON(cpos + len > 2188 BUG_ON(cpos + len >
2185 le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters)); 2189 le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
2186 2190
2191 trace_ocfs2_decrease_refcount_rec(
2192 (unsigned long long)ocfs2_metadata_cache_owner(ci),
2193 (unsigned long long)cpos, len);
2194
2187 if (cpos == le64_to_cpu(rec->r_cpos) && 2195 if (cpos == le64_to_cpu(rec->r_cpos) &&
2188 len == le32_to_cpu(rec->r_clusters)) 2196 len == le32_to_cpu(rec->r_clusters))
2189 ret = ocfs2_change_refcount_rec(handle, ci, 2197 ret = ocfs2_change_refcount_rec(handle, ci,
@@ -2195,12 +2203,6 @@ static int ocfs2_decrease_refcount_rec(handle_t *handle,
2195 2203
2196 le32_add_cpu(&split.r_refcount, -1); 2204 le32_add_cpu(&split.r_refcount, -1);
2197 2205
2198 mlog(0, "split refcount rec, start %llu, "
2199 "len %u, count %u, original start %llu, len %u\n",
2200 (unsigned long long)le64_to_cpu(split.r_cpos),
2201 len, le32_to_cpu(split.r_refcount),
2202 (unsigned long long)le64_to_cpu(rec->r_cpos),
2203 le32_to_cpu(rec->r_clusters));
2204 ret = ocfs2_split_refcount_rec(handle, ci, 2206 ret = ocfs2_split_refcount_rec(handle, ci,
2205 ref_root_bh, ref_leaf_bh, 2207 ref_root_bh, ref_leaf_bh,
2206 &split, index, 1, 2208 &split, index, 1,
@@ -2239,10 +2241,9 @@ static int __ocfs2_decrease_refcount(handle_t *handle,
2239 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 2241 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2240 struct buffer_head *ref_leaf_bh = NULL; 2242 struct buffer_head *ref_leaf_bh = NULL;
2241 2243
2242 mlog(0, "Tree owner %llu, decrease refcount start %llu, " 2244 trace_ocfs2_decrease_refcount(
2243 "len %u, delete %u\n", 2245 (unsigned long long)ocfs2_metadata_cache_owner(ci),
2244 (unsigned long long)ocfs2_metadata_cache_owner(ci), 2246 (unsigned long long)cpos, len, delete);
2245 (unsigned long long)cpos, len, delete);
2246 2247
2247 while (len) { 2248 while (len) {
2248 ret = ocfs2_get_refcount_rec(ci, ref_root_bh, 2249 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
@@ -2352,8 +2353,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode,
2352{ 2353{
2353 int ret; 2354 int ret;
2354 2355
2355 mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n", 2356 trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno,
2356 inode->i_ino, cpos, len, phys); 2357 cpos, len, phys);
2357 2358
2358 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 2359 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2359 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " 2360 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
@@ -2392,8 +2393,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2392 struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL; 2393 struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
2393 u32 len; 2394 u32 len;
2394 2395
2395 mlog(0, "start_cpos %llu, clusters %u\n",
2396 (unsigned long long)start_cpos, clusters);
2397 while (clusters) { 2396 while (clusters) {
2398 ret = ocfs2_get_refcount_rec(ci, ref_root_bh, 2397 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2399 cpos, clusters, &rec, 2398 cpos, clusters, &rec,
@@ -2427,12 +2426,11 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2427 2426
2428 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 2427 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2429 2428
2430 mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu," 2429 trace_ocfs2_calc_refcount_meta_credits_iterate(
2431 "rec->r_clusters %u, rec->r_refcount %u, index %d\n", 2430 recs_add, (unsigned long long)cpos, clusters,
2432 recs_add, (unsigned long long)cpos, clusters, 2431 (unsigned long long)le64_to_cpu(rec.r_cpos),
2433 (unsigned long long)le64_to_cpu(rec.r_cpos), 2432 le32_to_cpu(rec.r_clusters),
2434 le32_to_cpu(rec.r_clusters), 2433 le32_to_cpu(rec.r_refcount), index);
2435 le32_to_cpu(rec.r_refcount), index);
2436 2434
2437 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + 2435 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
2438 le32_to_cpu(rec.r_clusters)) - cpos; 2436 le32_to_cpu(rec.r_clusters)) - cpos;
@@ -2488,7 +2486,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2488 if (!ref_blocks) 2486 if (!ref_blocks)
2489 goto out; 2487 goto out;
2490 2488
2491 mlog(0, "we need ref_blocks %d\n", ref_blocks);
2492 *meta_add += ref_blocks; 2489 *meta_add += ref_blocks;
2493 *credits += ref_blocks; 2490 *credits += ref_blocks;
2494 2491
@@ -2514,6 +2511,10 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2514 } 2511 }
2515 2512
2516out: 2513out:
2514
2515 trace_ocfs2_calc_refcount_meta_credits(
2516 (unsigned long long)start_cpos, clusters,
2517 *meta_add, *credits);
2517 brelse(ref_leaf_bh); 2518 brelse(ref_leaf_bh);
2518 brelse(prev_bh); 2519 brelse(prev_bh);
2519 return ret; 2520 return ret;
@@ -2578,8 +2579,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2578 goto out; 2579 goto out;
2579 } 2580 }
2580 2581
2581 mlog(0, "reserve new metadata %d blocks, credits = %d\n", 2582 trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits);
2582 *ref_blocks, *credits);
2583 2583
2584out: 2584out:
2585 brelse(ref_root_bh); 2585 brelse(ref_root_bh);
@@ -2886,8 +2886,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,
2886 goto out; 2886 goto out;
2887 } 2887 }
2888 2888
2889 mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n", 2889 trace_ocfs2_lock_refcount_allocators(meta_add, *credits);
2890 meta_add, num_clusters, *credits);
2891 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add, 2890 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
2892 meta_ac); 2891 meta_ac);
2893 if (ret) { 2892 if (ret) {
@@ -2937,8 +2936,8 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2937 loff_t offset, end, map_end; 2936 loff_t offset, end, map_end;
2938 struct address_space *mapping = context->inode->i_mapping; 2937 struct address_space *mapping = context->inode->i_mapping;
2939 2938
2940 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, 2939 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
2941 new_cluster, new_len, cpos); 2940 new_cluster, new_len);
2942 2941
2943 readahead_pages = 2942 readahead_pages =
2944 (ocfs2_cow_contig_clusters(sb) << 2943 (ocfs2_cow_contig_clusters(sb) <<
@@ -3031,8 +3030,8 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3031 struct buffer_head *old_bh = NULL; 3030 struct buffer_head *old_bh = NULL;
3032 struct buffer_head *new_bh = NULL; 3031 struct buffer_head *new_bh = NULL;
3033 3032
3034 mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster, 3033 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
3035 new_cluster, new_len); 3034 new_cluster, new_len);
3036 3035
3037 for (i = 0; i < blocks; i++, old_block++, new_block++) { 3036 for (i = 0; i < blocks; i++, old_block++, new_block++) {
3038 new_bh = sb_getblk(osb->sb, new_block); 3037 new_bh = sb_getblk(osb->sb, new_block);
@@ -3085,8 +3084,8 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
3085 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); 3084 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
3086 u64 ino = ocfs2_metadata_cache_owner(et->et_ci); 3085 u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
3087 3086
3088 mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n", 3087 trace_ocfs2_clear_ext_refcount((unsigned long long)ino,
3089 (unsigned long long)ino, cpos, len, p_cluster, ext_flags); 3088 cpos, len, p_cluster, ext_flags);
3090 3089
3091 memset(&replace_rec, 0, sizeof(replace_rec)); 3090 memset(&replace_rec, 0, sizeof(replace_rec));
3092 replace_rec.e_cpos = cpu_to_le32(cpos); 3091 replace_rec.e_cpos = cpu_to_le32(cpos);
@@ -3141,8 +3140,8 @@ static int ocfs2_replace_clusters(handle_t *handle,
3141 struct ocfs2_caching_info *ci = context->data_et.et_ci; 3140 struct ocfs2_caching_info *ci = context->data_et.et_ci;
3142 u64 ino = ocfs2_metadata_cache_owner(ci); 3141 u64 ino = ocfs2_metadata_cache_owner(ci);
3143 3142
3144 mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n", 3143 trace_ocfs2_replace_clusters((unsigned long long)ino,
3145 (unsigned long long)ino, cpos, old, new, len, ext_flags); 3144 cpos, old, new, len, ext_flags);
3146 3145
3147 /*If the old clusters is unwritten, no need to duplicate. */ 3146 /*If the old clusters is unwritten, no need to duplicate. */
3148 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { 3147 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
@@ -3228,7 +3227,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3228 u32 num_clusters, unsigned int e_flags) 3227 u32 num_clusters, unsigned int e_flags)
3229{ 3228{
3230 int ret, delete, index, credits = 0; 3229 int ret, delete, index, credits = 0;
3231 u32 new_bit, new_len; 3230 u32 new_bit, new_len, orig_num_clusters;
3232 unsigned int set_len; 3231 unsigned int set_len;
3233 struct ocfs2_super *osb = OCFS2_SB(sb); 3232 struct ocfs2_super *osb = OCFS2_SB(sb);
3234 handle_t *handle; 3233 handle_t *handle;
@@ -3236,8 +3235,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3236 struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci; 3235 struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
3237 struct ocfs2_refcount_rec rec; 3236 struct ocfs2_refcount_rec rec;
3238 3237
3239 mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n", 3238 trace_ocfs2_make_clusters_writable(cpos, p_cluster,
3240 cpos, p_cluster, num_clusters, e_flags); 3239 num_clusters, e_flags);
3241 3240
3242 ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters, 3241 ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
3243 &context->data_et, 3242 &context->data_et,
@@ -3261,6 +3260,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3261 goto out; 3260 goto out;
3262 } 3261 }
3263 3262
3263 orig_num_clusters = num_clusters;
3264
3264 while (num_clusters) { 3265 while (num_clusters) {
3265 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, 3266 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
3266 p_cluster, num_clusters, 3267 p_cluster, num_clusters,
@@ -3348,7 +3349,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3348 * in write-back mode. 3349 * in write-back mode.
3349 */ 3350 */
3350 if (context->get_clusters == ocfs2_di_get_clusters) { 3351 if (context->get_clusters == ocfs2_di_get_clusters) {
3351 ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters); 3352 ret = ocfs2_cow_sync_writeback(sb, context, cpos,
3353 orig_num_clusters);
3352 if (ret) 3354 if (ret)
3353 mlog_errno(ret); 3355 mlog_errno(ret);
3354 } 3356 }
@@ -3472,9 +3474,9 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3472 goto out; 3474 goto out;
3473 } 3475 }
3474 3476
3475 mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, " 3477 trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno,
3476 "cow_len %u\n", inode->i_ino, 3478 cpos, write_len, max_cpos,
3477 cpos, write_len, cow_start, cow_len); 3479 cow_start, cow_len);
3478 3480
3479 BUG_ON(cow_len == 0); 3481 BUG_ON(cow_len == 0);
3480 3482
@@ -3753,8 +3755,7 @@ int ocfs2_add_refcount_flag(struct inode *inode,
3753 goto out; 3755 goto out;
3754 } 3756 }
3755 3757
3756 mlog(0, "reserve new metadata %d, credits = %d\n", 3758 trace_ocfs2_add_refcount_flag(ref_blocks, credits);
3757 ref_blocks, credits);
3758 3759
3759 if (ref_blocks) { 3760 if (ref_blocks) {
3760 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), 3761 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
@@ -4325,7 +4326,8 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4325 4326
4326 /* If the security isn't preserved, we need to re-initialize them. */ 4327 /* If the security isn't preserved, we need to re-initialize them. */
4327 if (!preserve) { 4328 if (!preserve) {
4328 error = ocfs2_init_security_and_acl(dir, new_orphan_inode); 4329 error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
4330 &new_dentry->d_name);
4329 if (error) 4331 if (error)
4330 mlog_errno(error); 4332 mlog_errno(error);
4331 } 4333 }
@@ -4376,7 +4378,7 @@ static int ocfs2_user_path_parent(const char __user *path,
4376 if (IS_ERR(s)) 4378 if (IS_ERR(s))
4377 return PTR_ERR(s); 4379 return PTR_ERR(s);
4378 4380
4379 error = path_lookup(s, LOOKUP_PARENT, nd); 4381 error = kern_path_parent(s, nd);
4380 if (error) 4382 if (error)
4381 putname(s); 4383 putname(s);
4382 else 4384 else
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 3e78db361bc7..41ffd36c689c 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -30,10 +30,10 @@
30#include <linux/bitops.h> 30#include <linux/bitops.h>
31#include <linux/list.h> 31#include <linux/list.h>
32 32
33#define MLOG_MASK_PREFIX ML_RESERVATIONS
34#include <cluster/masklog.h> 33#include <cluster/masklog.h>
35 34
36#include "ocfs2.h" 35#include "ocfs2.h"
36#include "ocfs2_trace.h"
37 37
38#ifdef CONFIG_OCFS2_DEBUG_FS 38#ifdef CONFIG_OCFS2_DEBUG_FS
39#define OCFS2_CHECK_RESERVATIONS 39#define OCFS2_CHECK_RESERVATIONS
@@ -321,8 +321,7 @@ static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
321 321
322 assert_spin_locked(&resv_lock); 322 assert_spin_locked(&resv_lock);
323 323
324 mlog(0, "Insert reservation start: %u len: %u\n", new->r_start, 324 trace_ocfs2_resv_insert(new->r_start, new->r_len);
325 new->r_len);
326 325
327 while (*p) { 326 while (*p) {
328 parent = *p; 327 parent = *p;
@@ -423,8 +422,8 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
423 unsigned int best_start, best_len = 0; 422 unsigned int best_start, best_len = 0;
424 int offset, start, found; 423 int offset, start, found;
425 424
426 mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n", 425 trace_ocfs2_resmap_find_free_bits_begin(search_start, search_len,
427 wanted, search_start, search_len, resmap->m_bitmap_len); 426 wanted, resmap->m_bitmap_len);
428 427
429 found = best_start = best_len = 0; 428 found = best_start = best_len = 0;
430 429
@@ -463,7 +462,7 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
463 *rlen = best_len; 462 *rlen = best_len;
464 *rstart = best_start; 463 *rstart = best_start;
465 464
466 mlog(0, "Found start: %u len: %u\n", best_start, best_len); 465 trace_ocfs2_resmap_find_free_bits_end(best_start, best_len);
467 466
468 return *rlen; 467 return *rlen;
469} 468}
@@ -487,9 +486,8 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
487 * - our window should be last in all reservations 486 * - our window should be last in all reservations
488 * - need to make sure we don't go past end of bitmap 487 * - need to make sure we don't go past end of bitmap
489 */ 488 */
490 489 trace_ocfs2_resv_find_window_begin(resv->r_start, ocfs2_resv_end(resv),
491 mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n", 490 goal, wanted, RB_EMPTY_ROOT(root));
492 resv->r_start, ocfs2_resv_end(resv), goal, wanted);
493 491
494 assert_spin_locked(&resv_lock); 492 assert_spin_locked(&resv_lock);
495 493
@@ -498,9 +496,6 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
498 * Easiest case - empty tree. We can just take 496 * Easiest case - empty tree. We can just take
499 * whatever window of free bits we want. 497 * whatever window of free bits we want.
500 */ 498 */
501
502 mlog(0, "Empty root\n");
503
504 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal, 499 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
505 resmap->m_bitmap_len - goal, 500 resmap->m_bitmap_len - goal,
506 &cstart, &clen); 501 &cstart, &clen);
@@ -524,8 +519,6 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
524 prev_resv = ocfs2_find_resv_lhs(resmap, goal); 519 prev_resv = ocfs2_find_resv_lhs(resmap, goal);
525 520
526 if (prev_resv == NULL) { 521 if (prev_resv == NULL) {
527 mlog(0, "Goal on LHS of leftmost window\n");
528
529 /* 522 /*
530 * A NULL here means that the search code couldn't 523 * A NULL here means that the search code couldn't
531 * find a window that starts before goal. 524 * find a window that starts before goal.
@@ -570,13 +563,15 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
570 next_resv = NULL; 563 next_resv = NULL;
571 } 564 }
572 565
566 trace_ocfs2_resv_find_window_prev(prev_resv->r_start,
567 ocfs2_resv_end(prev_resv));
568
573 prev = &prev_resv->r_node; 569 prev = &prev_resv->r_node;
574 570
575 /* Now we do a linear search for a window, starting at 'prev_rsv' */ 571 /* Now we do a linear search for a window, starting at 'prev_rsv' */
576 while (1) { 572 while (1) {
577 next = rb_next(prev); 573 next = rb_next(prev);
578 if (next) { 574 if (next) {
579 mlog(0, "One more resv found in linear search\n");
580 next_resv = rb_entry(next, 575 next_resv = rb_entry(next,
581 struct ocfs2_alloc_reservation, 576 struct ocfs2_alloc_reservation,
582 r_node); 577 r_node);
@@ -585,7 +580,6 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
585 gap_end = next_resv->r_start - 1; 580 gap_end = next_resv->r_start - 1;
586 gap_len = gap_end - gap_start + 1; 581 gap_len = gap_end - gap_start + 1;
587 } else { 582 } else {
588 mlog(0, "No next node\n");
589 /* 583 /*
590 * We're at the rightmost edge of the 584 * We're at the rightmost edge of the
591 * tree. See if a reservation between this 585 * tree. See if a reservation between this
@@ -596,6 +590,8 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
596 gap_end = resmap->m_bitmap_len - 1; 590 gap_end = resmap->m_bitmap_len - 1;
597 } 591 }
598 592
593 trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1,
594 next ? ocfs2_resv_end(next_resv) : -1);
599 /* 595 /*
600 * No need to check this gap if we have already found 596 * No need to check this gap if we have already found
601 * a larger region of free bits. 597 * a larger region of free bits.
@@ -654,8 +650,9 @@ static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
654 lru_resv = list_first_entry(&resmap->m_lru, 650 lru_resv = list_first_entry(&resmap->m_lru,
655 struct ocfs2_alloc_reservation, r_lru); 651 struct ocfs2_alloc_reservation, r_lru);
656 652
657 mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start, 653 trace_ocfs2_cannibalize_resv_begin(lru_resv->r_start,
658 lru_resv->r_len, ocfs2_resv_end(lru_resv)); 654 lru_resv->r_len,
655 ocfs2_resv_end(lru_resv));
659 656
660 /* 657 /*
661 * Cannibalize (some or all) of the target reservation and 658 * Cannibalize (some or all) of the target reservation and
@@ -684,10 +681,9 @@ static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
684 resv->r_len = shrink; 681 resv->r_len = shrink;
685 } 682 }
686 683
687 mlog(0, "Reservation now looks like: r_start: %u r_end: %u " 684 trace_ocfs2_cannibalize_resv_end(resv->r_start, ocfs2_resv_end(resv),
688 "r_len: %u r_last_start: %u r_last_len: %u\n", 685 resv->r_len, resv->r_last_start,
689 resv->r_start, ocfs2_resv_end(resv), resv->r_len, 686 resv->r_last_len);
690 resv->r_last_start, resv->r_last_len);
691 687
692 ocfs2_resv_insert(resmap, resv); 688 ocfs2_resv_insert(resmap, resv);
693} 689}
@@ -748,7 +744,6 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
748 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen) 744 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
749 wanted = *clen; 745 wanted = *clen;
750 746
751 mlog(0, "empty reservation, find new window\n");
752 /* 747 /*
753 * Try to get a window here. If it works, we must fall 748 * Try to get a window here. If it works, we must fall
754 * through and test the bitmap . This avoids some 749 * through and test the bitmap . This avoids some
@@ -757,6 +752,7 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
757 * that inode. 752 * that inode.
758 */ 753 */
759 ocfs2_resv_find_window(resmap, resv, wanted); 754 ocfs2_resv_find_window(resmap, resv, wanted);
755 trace_ocfs2_resmap_resv_bits(resv->r_start, resv->r_len);
760 } 756 }
761 757
762 BUG_ON(ocfs2_resv_empty(resv)); 758 BUG_ON(ocfs2_resv_empty(resv));
@@ -813,10 +809,10 @@ void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
813 809
814 spin_lock(&resv_lock); 810 spin_lock(&resv_lock);
815 811
816 mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u " 812 trace_ocfs2_resmap_claimed_bits_begin(cstart, cend, clen, resv->r_start,
817 "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n", 813 ocfs2_resv_end(resv), resv->r_len,
818 cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv), 814 resv->r_last_start,
819 resv->r_len, resv->r_last_start, resv->r_last_len); 815 resv->r_last_len);
820 816
821 BUG_ON(cstart < resv->r_start); 817 BUG_ON(cstart < resv->r_start);
822 BUG_ON(cstart > ocfs2_resv_end(resv)); 818 BUG_ON(cstart > ocfs2_resv_end(resv));
@@ -833,10 +829,9 @@ void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
833 if (!ocfs2_resv_empty(resv)) 829 if (!ocfs2_resv_empty(resv))
834 ocfs2_resv_mark_lru(resmap, resv); 830 ocfs2_resv_mark_lru(resmap, resv);
835 831
836 mlog(0, "Reservation now looks like: r_start: %u r_end: %u " 832 trace_ocfs2_resmap_claimed_bits_end(resv->r_start, ocfs2_resv_end(resv),
837 "r_len: %u r_last_start: %u r_last_len: %u\n", 833 resv->r_len, resv->r_last_start,
838 resv->r_start, ocfs2_resv_end(resv), resv->r_len, 834 resv->r_last_len);
839 resv->r_last_start, resv->r_last_len);
840 835
841 ocfs2_check_resmap(resmap); 836 ocfs2_check_resmap(resmap);
842 837
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
index 1e49cc29d06c..42c2b804f3fd 100644
--- a/fs/ocfs2/reservations.h
+++ b/fs/ocfs2/reservations.h
@@ -29,7 +29,7 @@
29struct ocfs2_alloc_reservation { 29struct ocfs2_alloc_reservation {
30 struct rb_node r_node; 30 struct rb_node r_node;
31 31
32 unsigned int r_start; /* Begining of current window */ 32 unsigned int r_start; /* Beginning of current window */
33 unsigned int r_len; /* Length of the window */ 33 unsigned int r_len; /* Length of the window */
34 34
35 unsigned int r_last_len; /* Length of most recent alloc */ 35 unsigned int r_last_len; /* Length of most recent alloc */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index dacd553d8617..ec55add7604a 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -27,7 +27,6 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/types.h> 28#include <linux/types.h>
29 29
30#define MLOG_MASK_PREFIX ML_DISK_ALLOC
31#include <cluster/masklog.h> 30#include <cluster/masklog.h>
32 31
33#include "ocfs2.h" 32#include "ocfs2.h"
@@ -39,6 +38,7 @@
39#include "super.h" 38#include "super.h"
40#include "sysfile.h" 39#include "sysfile.h"
41#include "uptodate.h" 40#include "uptodate.h"
41#include "ocfs2_trace.h"
42 42
43#include "buffer_head_io.h" 43#include "buffer_head_io.h"
44#include "suballoc.h" 44#include "suballoc.h"
@@ -82,7 +82,6 @@ static u16 ocfs2_calc_new_backup_super(struct inode *inode,
82 backups++; 82 backups++;
83 } 83 }
84 84
85 mlog_exit_void();
86 return backups; 85 return backups;
87} 86}
88 87
@@ -103,8 +102,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
103 u16 cl_bpc = le16_to_cpu(cl->cl_bpc); 102 u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
104 u16 cl_cpg = le16_to_cpu(cl->cl_cpg); 103 u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
105 104
106 mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n", 105 trace_ocfs2_update_last_group_and_inode(new_clusters,
107 new_clusters, first_new_cluster); 106 first_new_cluster);
108 107
109 ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode), 108 ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode),
110 group_bh, OCFS2_JOURNAL_ACCESS_WRITE); 109 group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -176,7 +175,8 @@ out_rollback:
176 le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits); 175 le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
177 } 176 }
178out: 177out:
179 mlog_exit(ret); 178 if (ret)
179 mlog_errno(ret);
180 return ret; 180 return ret;
181} 181}
182 182
@@ -281,8 +281,6 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
281 u32 first_new_cluster; 281 u32 first_new_cluster;
282 u64 lgd_blkno; 282 u64 lgd_blkno;
283 283
284 mlog_entry_void();
285
286 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 284 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
287 return -EROFS; 285 return -EROFS;
288 286
@@ -342,7 +340,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
342 goto out_unlock; 340 goto out_unlock;
343 } 341 }
344 342
345 mlog(0, "extend the last group at %llu, new clusters = %d\n", 343
344 trace_ocfs2_group_extend(
346 (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters); 345 (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
347 346
348 handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS); 347 handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
@@ -377,7 +376,6 @@ out_mutex:
377 iput(main_bm_inode); 376 iput(main_bm_inode);
378 377
379out: 378out:
380 mlog_exit_void();
381 return ret; 379 return ret;
382} 380}
383 381
@@ -472,8 +470,6 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
472 struct ocfs2_chain_rec *cr; 470 struct ocfs2_chain_rec *cr;
473 u16 cl_bpc; 471 u16 cl_bpc;
474 472
475 mlog_entry_void();
476
477 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 473 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
478 return -EROFS; 474 return -EROFS;
479 475
@@ -520,8 +516,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
520 goto out_unlock; 516 goto out_unlock;
521 } 517 }
522 518
523 mlog(0, "Add a new group %llu in chain = %u, length = %u\n", 519 trace_ocfs2_group_add((unsigned long long)input->group,
524 (unsigned long long)input->group, input->chain, input->clusters); 520 input->chain, input->clusters, input->frees);
525 521
526 handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS); 522 handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
527 if (IS_ERR(handle)) { 523 if (IS_ERR(handle)) {
@@ -589,6 +585,5 @@ out_mutex:
589 iput(main_bm_inode); 585 iput(main_bm_inode);
590 586
591out: 587out:
592 mlog_exit_void();
593 return ret; 588 return ret;
594} 589}
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index ab4e0172cc1d..26fc0014d509 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -27,7 +27,6 @@
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29 29
30#define MLOG_MASK_PREFIX ML_SUPER
31#include <cluster/masklog.h> 30#include <cluster/masklog.h>
32 31
33#include "ocfs2.h" 32#include "ocfs2.h"
@@ -39,6 +38,7 @@
39#include "slot_map.h" 38#include "slot_map.h"
40#include "super.h" 39#include "super.h"
41#include "sysfile.h" 40#include "sysfile.h"
41#include "ocfs2_trace.h"
42 42
43#include "buffer_head_io.h" 43#include "buffer_head_io.h"
44 44
@@ -142,8 +142,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
142 BUG_ON(si->si_blocks == 0); 142 BUG_ON(si->si_blocks == 0);
143 BUG_ON(si->si_bh == NULL); 143 BUG_ON(si->si_bh == NULL);
144 144
145 mlog(0, "Refreshing slot map, reading %u block(s)\n", 145 trace_ocfs2_refresh_slot_info(si->si_blocks);
146 si->si_blocks);
147 146
148 /* 147 /*
149 * We pass -1 as blocknr because we expect all of si->si_bh to 148 * We pass -1 as blocknr because we expect all of si->si_bh to
@@ -381,8 +380,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
381 /* The size checks above should ensure this */ 380 /* The size checks above should ensure this */
382 BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks); 381 BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
383 382
384 mlog(0, "Slot map needs %u buffers for %llu bytes\n", 383 trace_ocfs2_map_slot_buffers(bytes, si->si_blocks);
385 si->si_blocks, bytes);
386 384
387 si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, 385 si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks,
388 GFP_KERNEL); 386 GFP_KERNEL);
@@ -400,8 +398,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
400 goto bail; 398 goto bail;
401 } 399 }
402 400
403 mlog(0, "Reading slot map block %u at %llu\n", i, 401 trace_ocfs2_map_slot_buffers_block((unsigned long long)blkno, i);
404 (unsigned long long)blkno);
405 402
406 bh = NULL; /* Acquire a fresh bh */ 403 bh = NULL; /* Acquire a fresh bh */
407 status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno, 404 status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno,
@@ -475,8 +472,6 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
475 int slot; 472 int slot;
476 struct ocfs2_slot_info *si; 473 struct ocfs2_slot_info *si;
477 474
478 mlog_entry_void();
479
480 si = osb->slot_info; 475 si = osb->slot_info;
481 476
482 spin_lock(&osb->osb_lock); 477 spin_lock(&osb->osb_lock);
@@ -505,14 +500,13 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
505 osb->slot_num = slot; 500 osb->slot_num = slot;
506 spin_unlock(&osb->osb_lock); 501 spin_unlock(&osb->osb_lock);
507 502
508 mlog(0, "taking node slot %d\n", osb->slot_num); 503 trace_ocfs2_find_slot(osb->slot_num);
509 504
510 status = ocfs2_update_disk_slot(osb, si, osb->slot_num); 505 status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
511 if (status < 0) 506 if (status < 0)
512 mlog_errno(status); 507 mlog_errno(status);
513 508
514bail: 509bail:
515 mlog_exit(status);
516 return status; 510 return status;
517} 511}
518 512
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 8ce7398ae1d2..1ec56fdb8d0d 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -126,7 +126,7 @@ struct ocfs2_stack_operations {
126 * 126 *
127 * ->connect() must not return until it is guaranteed that 127 * ->connect() must not return until it is guaranteed that
128 * 128 *
129 * - Node down notifications for the filesystem will be recieved 129 * - Node down notifications for the filesystem will be received
130 * and passed to conn->cc_recovery_handler(). 130 * and passed to conn->cc_recovery_handler().
131 * - Locking requests for the filesystem will be processed. 131 * - Locking requests for the filesystem will be processed.
132 */ 132 */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 71998d4d61d5..ba5d97e4a73e 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -29,7 +29,6 @@
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31 31
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h> 32#include <cluster/masklog.h>
34 33
35#include "ocfs2.h" 34#include "ocfs2.h"
@@ -44,6 +43,7 @@
44#include "super.h" 43#include "super.h"
45#include "sysfile.h" 44#include "sysfile.h"
46#include "uptodate.h" 45#include "uptodate.h"
46#include "ocfs2_trace.h"
47 47
48#include "buffer_head_io.h" 48#include "buffer_head_io.h"
49 49
@@ -308,8 +308,8 @@ static int ocfs2_validate_group_descriptor(struct super_block *sb,
308 int rc; 308 int rc;
309 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 309 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
310 310
311 mlog(0, "Validating group descriptor %llu\n", 311 trace_ocfs2_validate_group_descriptor(
312 (unsigned long long)bh->b_blocknr); 312 (unsigned long long)bh->b_blocknr);
313 313
314 BUG_ON(!buffer_uptodate(bh)); 314 BUG_ON(!buffer_uptodate(bh));
315 315
@@ -389,8 +389,6 @@ static int ocfs2_block_group_fill(handle_t *handle,
389 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 389 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
390 struct super_block * sb = alloc_inode->i_sb; 390 struct super_block * sb = alloc_inode->i_sb;
391 391
392 mlog_entry_void();
393
394 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { 392 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
395 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " 393 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
396 "b_blocknr (%llu)", 394 "b_blocknr (%llu)",
@@ -436,7 +434,8 @@ static int ocfs2_block_group_fill(handle_t *handle,
436 * allocation time. */ 434 * allocation time. */
437 435
438bail: 436bail:
439 mlog_exit(status); 437 if (status)
438 mlog_errno(status);
440 return status; 439 return status;
441} 440}
442 441
@@ -477,8 +476,8 @@ ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
477 476
478 /* setup the group */ 477 /* setup the group */
479 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); 478 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
480 mlog(0, "new descriptor, record %u, at block %llu\n", 479 trace_ocfs2_block_group_alloc_contig(
481 alloc_rec, (unsigned long long)bg_blkno); 480 (unsigned long long)bg_blkno, alloc_rec);
482 481
483 bg_bh = sb_getblk(osb->sb, bg_blkno); 482 bg_bh = sb_getblk(osb->sb, bg_blkno);
484 if (!bg_bh) { 483 if (!bg_bh) {
@@ -657,8 +656,8 @@ ocfs2_block_group_alloc_discontig(handle_t *handle,
657 656
658 /* setup the group */ 657 /* setup the group */
659 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); 658 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
660 mlog(0, "new descriptor, record %u, at block %llu\n", 659 trace_ocfs2_block_group_alloc_discontig(
661 alloc_rec, (unsigned long long)bg_blkno); 660 (unsigned long long)bg_blkno, alloc_rec);
662 661
663 bg_bh = sb_getblk(osb->sb, bg_blkno); 662 bg_bh = sb_getblk(osb->sb, bg_blkno);
664 if (!bg_bh) { 663 if (!bg_bh) {
@@ -707,8 +706,6 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
707 706
708 BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode)); 707 BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
709 708
710 mlog_entry_void();
711
712 cl = &fe->id2.i_chain; 709 cl = &fe->id2.i_chain;
713 status = ocfs2_reserve_clusters_with_limit(osb, 710 status = ocfs2_reserve_clusters_with_limit(osb,
714 le16_to_cpu(cl->cl_cpg), 711 le16_to_cpu(cl->cl_cpg),
@@ -730,8 +727,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
730 } 727 }
731 728
732 if (last_alloc_group && *last_alloc_group != 0) { 729 if (last_alloc_group && *last_alloc_group != 0) {
733 mlog(0, "use old allocation group %llu for block group alloc\n", 730 trace_ocfs2_block_group_alloc(
734 (unsigned long long)*last_alloc_group); 731 (unsigned long long)*last_alloc_group);
735 ac->ac_last_group = *last_alloc_group; 732 ac->ac_last_group = *last_alloc_group;
736 } 733 }
737 734
@@ -796,7 +793,8 @@ bail:
796 793
797 brelse(bg_bh); 794 brelse(bg_bh);
798 795
799 mlog_exit(status); 796 if (status)
797 mlog_errno(status);
800 return status; 798 return status;
801} 799}
802 800
@@ -814,8 +812,6 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
814 struct ocfs2_dinode *fe; 812 struct ocfs2_dinode *fe;
815 u32 free_bits; 813 u32 free_bits;
816 814
817 mlog_entry_void();
818
819 alloc_inode = ocfs2_get_system_file_inode(osb, type, slot); 815 alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
820 if (!alloc_inode) { 816 if (!alloc_inode) {
821 mlog_errno(-EINVAL); 817 mlog_errno(-EINVAL);
@@ -855,16 +851,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
855 if (bits_wanted > free_bits) { 851 if (bits_wanted > free_bits) {
856 /* cluster bitmap never grows */ 852 /* cluster bitmap never grows */
857 if (ocfs2_is_cluster_bitmap(alloc_inode)) { 853 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
858 mlog(0, "Disk Full: wanted=%u, free_bits=%u\n", 854 trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
859 bits_wanted, free_bits); 855 free_bits);
860 status = -ENOSPC; 856 status = -ENOSPC;
861 goto bail; 857 goto bail;
862 } 858 }
863 859
864 if (!(flags & ALLOC_NEW_GROUP)) { 860 if (!(flags & ALLOC_NEW_GROUP)) {
865 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, " 861 trace_ocfs2_reserve_suballoc_bits_no_new_group(
866 "and we don't alloc a new group for it.\n", 862 slot, bits_wanted, free_bits);
867 slot, bits_wanted, free_bits);
868 status = -ENOSPC; 863 status = -ENOSPC;
869 goto bail; 864 goto bail;
870 } 865 }
@@ -890,7 +885,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
890bail: 885bail:
891 brelse(bh); 886 brelse(bh);
892 887
893 mlog_exit(status); 888 if (status)
889 mlog_errno(status);
894 return status; 890 return status;
895} 891}
896 892
@@ -1052,7 +1048,8 @@ bail:
1052 *ac = NULL; 1048 *ac = NULL;
1053 } 1049 }
1054 1050
1055 mlog_exit(status); 1051 if (status)
1052 mlog_errno(status);
1056 return status; 1053 return status;
1057} 1054}
1058 1055
@@ -1119,8 +1116,8 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
1119 spin_lock(&osb->osb_lock); 1116 spin_lock(&osb->osb_lock);
1120 osb->osb_inode_alloc_group = alloc_group; 1117 osb->osb_inode_alloc_group = alloc_group;
1121 spin_unlock(&osb->osb_lock); 1118 spin_unlock(&osb->osb_lock);
1122 mlog(0, "after reservation, new allocation group is " 1119 trace_ocfs2_reserve_new_inode_new_group(
1123 "%llu\n", (unsigned long long)alloc_group); 1120 (unsigned long long)alloc_group);
1124 1121
1125 /* 1122 /*
1126 * Some inodes must be freed by us, so try to allocate 1123 * Some inodes must be freed by us, so try to allocate
@@ -1152,7 +1149,8 @@ bail:
1152 *ac = NULL; 1149 *ac = NULL;
1153 } 1150 }
1154 1151
1155 mlog_exit(status); 1152 if (status)
1153 mlog_errno(status);
1156 return status; 1154 return status;
1157} 1155}
1158 1156
@@ -1189,8 +1187,6 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
1189{ 1187{
1190 int status; 1188 int status;
1191 1189
1192 mlog_entry_void();
1193
1194 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 1190 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1195 if (!(*ac)) { 1191 if (!(*ac)) {
1196 status = -ENOMEM; 1192 status = -ENOMEM;
@@ -1229,7 +1225,8 @@ bail:
1229 *ac = NULL; 1225 *ac = NULL;
1230 } 1226 }
1231 1227
1232 mlog_exit(status); 1228 if (status)
1229 mlog_errno(status);
1233 return status; 1230 return status;
1234} 1231}
1235 1232
@@ -1357,15 +1354,12 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1357 void *bitmap = bg->bg_bitmap; 1354 void *bitmap = bg->bg_bitmap;
1358 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; 1355 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1359 1356
1360 mlog_entry_void();
1361
1362 /* All callers get the descriptor via 1357 /* All callers get the descriptor via
1363 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ 1358 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
1364 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 1359 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1365 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); 1360 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1366 1361
1367 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, 1362 trace_ocfs2_block_group_set_bits(bit_off, num_bits);
1368 num_bits);
1369 1363
1370 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1364 if (ocfs2_is_cluster_bitmap(alloc_inode))
1371 journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 1365 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
@@ -1394,7 +1388,8 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1394 ocfs2_journal_dirty(handle, group_bh); 1388 ocfs2_journal_dirty(handle, group_bh);
1395 1389
1396bail: 1390bail:
1397 mlog_exit(status); 1391 if (status)
1392 mlog_errno(status);
1398 return status; 1393 return status;
1399} 1394}
1400 1395
@@ -1437,10 +1432,10 @@ static int ocfs2_relink_block_group(handle_t *handle,
1437 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 1432 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1438 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg)); 1433 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1439 1434
1440 mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n", 1435 trace_ocfs2_relink_block_group(
1441 (unsigned long long)le64_to_cpu(fe->i_blkno), chain, 1436 (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1442 (unsigned long long)le64_to_cpu(bg->bg_blkno), 1437 (unsigned long long)le64_to_cpu(bg->bg_blkno),
1443 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); 1438 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1444 1439
1445 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); 1440 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
1446 bg_ptr = le64_to_cpu(bg->bg_next_group); 1441 bg_ptr = le64_to_cpu(bg->bg_next_group);
@@ -1484,7 +1479,8 @@ out_rollback:
1484 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); 1479 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1485 } 1480 }
1486 1481
1487 mlog_exit(status); 1482 if (status)
1483 mlog_errno(status);
1488 return status; 1484 return status;
1489} 1485}
1490 1486
@@ -1515,7 +1511,7 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1515 max_bits = le16_to_cpu(gd->bg_bits); 1511 max_bits = le16_to_cpu(gd->bg_bits);
1516 1512
1517 /* Tail groups in cluster bitmaps which aren't cpg 1513 /* Tail groups in cluster bitmaps which aren't cpg
1518 * aligned are prone to partial extention by a failed 1514 * aligned are prone to partial extension by a failed
1519 * fs resize. If the file system resize never got to 1515 * fs resize. If the file system resize never got to
1520 * update the dinode cluster count, then we don't want 1516 * update the dinode cluster count, then we don't want
1521 * to trust any clusters past it, regardless of what 1517 * to trust any clusters past it, regardless of what
@@ -1525,10 +1521,10 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1525 if ((gd_cluster_off + max_bits) > 1521 if ((gd_cluster_off + max_bits) >
1526 OCFS2_I(inode)->ip_clusters) { 1522 OCFS2_I(inode)->ip_clusters) {
1527 max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off; 1523 max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1528 mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n", 1524 trace_ocfs2_cluster_group_search_wrong_max_bits(
1529 (unsigned long long)le64_to_cpu(gd->bg_blkno), 1525 (unsigned long long)le64_to_cpu(gd->bg_blkno),
1530 le16_to_cpu(gd->bg_bits), 1526 le16_to_cpu(gd->bg_bits),
1531 OCFS2_I(inode)->ip_clusters, max_bits); 1527 OCFS2_I(inode)->ip_clusters, max_bits);
1532 } 1528 }
1533 1529
1534 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1530 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
@@ -1542,9 +1538,9 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1542 gd_cluster_off + 1538 gd_cluster_off +
1543 res->sr_bit_offset + 1539 res->sr_bit_offset +
1544 res->sr_bits); 1540 res->sr_bits);
1545 mlog(0, "Checking %llu against %llu\n", 1541 trace_ocfs2_cluster_group_search_max_block(
1546 (unsigned long long)blkoff, 1542 (unsigned long long)blkoff,
1547 (unsigned long long)max_block); 1543 (unsigned long long)max_block);
1548 if (blkoff > max_block) 1544 if (blkoff > max_block)
1549 return -ENOSPC; 1545 return -ENOSPC;
1550 } 1546 }
@@ -1588,9 +1584,9 @@ static int ocfs2_block_group_search(struct inode *inode,
1588 if (!ret && max_block) { 1584 if (!ret && max_block) {
1589 blkoff = le64_to_cpu(bg->bg_blkno) + 1585 blkoff = le64_to_cpu(bg->bg_blkno) +
1590 res->sr_bit_offset + res->sr_bits; 1586 res->sr_bit_offset + res->sr_bits;
1591 mlog(0, "Checking %llu against %llu\n", 1587 trace_ocfs2_block_group_search_max_block(
1592 (unsigned long long)blkoff, 1588 (unsigned long long)blkoff,
1593 (unsigned long long)max_block); 1589 (unsigned long long)max_block);
1594 if (blkoff > max_block) 1590 if (blkoff > max_block)
1595 ret = -ENOSPC; 1591 ret = -ENOSPC;
1596 } 1592 }
@@ -1756,9 +1752,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1756 struct ocfs2_group_desc *bg; 1752 struct ocfs2_group_desc *bg;
1757 1753
1758 chain = ac->ac_chain; 1754 chain = ac->ac_chain;
1759 mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n", 1755 trace_ocfs2_search_chain_begin(
1760 bits_wanted, chain, 1756 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
1761 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); 1757 bits_wanted, chain);
1762 1758
1763 status = ocfs2_read_group_descriptor(alloc_inode, fe, 1759 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1764 le64_to_cpu(cl->cl_recs[chain].c_blkno), 1760 le64_to_cpu(cl->cl_recs[chain].c_blkno),
@@ -1799,8 +1795,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1799 goto bail; 1795 goto bail;
1800 } 1796 }
1801 1797
1802 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", 1798 trace_ocfs2_search_chain_succ(
1803 res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno)); 1799 (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);
1804 1800
1805 res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno); 1801 res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1806 1802
@@ -1861,8 +1857,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1861 goto bail; 1857 goto bail;
1862 } 1858 }
1863 1859
1864 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, 1860 trace_ocfs2_search_chain_end(
1865 (unsigned long long)le64_to_cpu(fe->i_blkno)); 1861 (unsigned long long)le64_to_cpu(fe->i_blkno),
1862 res->sr_bits);
1866 1863
1867out_loc_only: 1864out_loc_only:
1868 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1865 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
@@ -1870,7 +1867,8 @@ bail:
1870 brelse(group_bh); 1867 brelse(group_bh);
1871 brelse(prev_group_bh); 1868 brelse(prev_group_bh);
1872 1869
1873 mlog_exit(status); 1870 if (status)
1871 mlog_errno(status);
1874 return status; 1872 return status;
1875} 1873}
1876 1874
@@ -1888,8 +1886,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1888 struct ocfs2_chain_list *cl; 1886 struct ocfs2_chain_list *cl;
1889 struct ocfs2_dinode *fe; 1887 struct ocfs2_dinode *fe;
1890 1888
1891 mlog_entry_void();
1892
1893 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); 1889 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1894 BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given)); 1890 BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1895 BUG_ON(!ac->ac_bh); 1891 BUG_ON(!ac->ac_bh);
@@ -1945,8 +1941,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1945 goto bail; 1941 goto bail;
1946 } 1942 }
1947 1943
1948 mlog(0, "Search of victim chain %u came up with nothing, " 1944 trace_ocfs2_claim_suballoc_bits(victim);
1949 "trying all chains now.\n", victim);
1950 1945
1951 /* If we didn't pick a good victim, then just default to 1946 /* If we didn't pick a good victim, then just default to
1952 * searching each chain in order. Don't allow chain relinking 1947 * searching each chain in order. Don't allow chain relinking
@@ -1984,7 +1979,8 @@ set_hint:
1984 } 1979 }
1985 1980
1986bail: 1981bail:
1987 mlog_exit(status); 1982 if (status)
1983 mlog_errno(status);
1988 return status; 1984 return status;
1989} 1985}
1990 1986
@@ -2021,7 +2017,8 @@ int ocfs2_claim_metadata(handle_t *handle,
2021 *num_bits = res.sr_bits; 2017 *num_bits = res.sr_bits;
2022 status = 0; 2018 status = 0;
2023bail: 2019bail:
2024 mlog_exit(status); 2020 if (status)
2021 mlog_errno(status);
2025 return status; 2022 return status;
2026} 2023}
2027 2024
@@ -2172,8 +2169,8 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2172 goto out; 2169 goto out;
2173 } 2170 }
2174 2171
2175 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, 2172 trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
2176 (unsigned long long)di_blkno); 2173 res->sr_bits);
2177 2174
2178 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); 2175 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2179 2176
@@ -2201,8 +2198,6 @@ int ocfs2_claim_new_inode(handle_t *handle,
2201 int status; 2198 int status;
2202 struct ocfs2_suballoc_result res; 2199 struct ocfs2_suballoc_result res;
2203 2200
2204 mlog_entry_void();
2205
2206 BUG_ON(!ac); 2201 BUG_ON(!ac);
2207 BUG_ON(ac->ac_bits_given != 0); 2202 BUG_ON(ac->ac_bits_given != 0);
2208 BUG_ON(ac->ac_bits_wanted != 1); 2203 BUG_ON(ac->ac_bits_wanted != 1);
@@ -2230,7 +2225,8 @@ int ocfs2_claim_new_inode(handle_t *handle,
2230 ocfs2_save_inode_ac_group(dir, ac); 2225 ocfs2_save_inode_ac_group(dir, ac);
2231 status = 0; 2226 status = 0;
2232bail: 2227bail:
2233 mlog_exit(status); 2228 if (status)
2229 mlog_errno(status);
2234 return status; 2230 return status;
2235} 2231}
2236 2232
@@ -2307,8 +2303,6 @@ int __ocfs2_claim_clusters(handle_t *handle,
2307 struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; 2303 struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2308 struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb); 2304 struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
2309 2305
2310 mlog_entry_void();
2311
2312 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); 2306 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
2313 2307
2314 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL 2308 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
@@ -2363,7 +2357,8 @@ int __ocfs2_claim_clusters(handle_t *handle,
2363 ac->ac_bits_given += *num_clusters; 2357 ac->ac_bits_given += *num_clusters;
2364 2358
2365bail: 2359bail:
2366 mlog_exit(status); 2360 if (status)
2361 mlog_errno(status);
2367 return status; 2362 return status;
2368} 2363}
2369 2364
@@ -2392,13 +2387,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
2392 unsigned int tmp; 2387 unsigned int tmp;
2393 struct ocfs2_group_desc *undo_bg = NULL; 2388 struct ocfs2_group_desc *undo_bg = NULL;
2394 2389
2395 mlog_entry_void();
2396
2397 /* The caller got this descriptor from 2390 /* The caller got this descriptor from
2398 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ 2391 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
2399 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 2392 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
2400 2393
2401 mlog(0, "off = %u, num = %u\n", bit_off, num_bits); 2394 trace_ocfs2_block_group_clear_bits(bit_off, num_bits);
2402 2395
2403 BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode)); 2396 BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2404 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 2397 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
@@ -2463,19 +2456,18 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
2463 struct buffer_head *group_bh = NULL; 2456 struct buffer_head *group_bh = NULL;
2464 struct ocfs2_group_desc *group; 2457 struct ocfs2_group_desc *group;
2465 2458
2466 mlog_entry_void();
2467
2468 /* The alloc_bh comes from ocfs2_free_dinode() or 2459 /* The alloc_bh comes from ocfs2_free_dinode() or
2469 * ocfs2_free_clusters(). The callers have all locked the 2460 * ocfs2_free_clusters(). The callers have all locked the
2470 * allocator and gotten alloc_bh from the lock call. This 2461 * allocator and gotten alloc_bh from the lock call. This
2471 * validates the dinode buffer. Any corruption that has happended 2462 * validates the dinode buffer. Any corruption that has happened
2472 * is a code bug. */ 2463 * is a code bug. */
2473 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 2464 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2474 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); 2465 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2475 2466
2476 mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n", 2467 trace_ocfs2_free_suballoc_bits(
2477 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, 2468 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
2478 (unsigned long long)bg_blkno, start_bit); 2469 (unsigned long long)bg_blkno,
2470 start_bit, count);
2479 2471
2480 status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno, 2472 status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2481 &group_bh); 2473 &group_bh);
@@ -2511,7 +2503,8 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
2511bail: 2503bail:
2512 brelse(group_bh); 2504 brelse(group_bh);
2513 2505
2514 mlog_exit(status); 2506 if (status)
2507 mlog_errno(status);
2515 return status; 2508 return status;
2516} 2509}
2517 2510
@@ -2556,11 +2549,8 @@ static int _ocfs2_free_clusters(handle_t *handle,
2556 2549
2557 /* You can't ever have a contiguous set of clusters 2550 /* You can't ever have a contiguous set of clusters
2558 * bigger than a block group bitmap so we never have to worry 2551 * bigger than a block group bitmap so we never have to worry
2559 * about looping on them. */ 2552 * about looping on them.
2560 2553 * This is expensive. We can safely remove once this stuff has
2561 mlog_entry_void();
2562
2563 /* This is expensive. We can safely remove once this stuff has
2564 * gotten tested really well. */ 2554 * gotten tested really well. */
2565 BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); 2555 BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
2566 2556
@@ -2569,10 +2559,9 @@ static int _ocfs2_free_clusters(handle_t *handle,
2569 ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, 2559 ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2570 &bg_start_bit); 2560 &bg_start_bit);
2571 2561
2572 mlog(0, "want to free %u clusters starting at block %llu\n", 2562 trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
2573 num_clusters, (unsigned long long)start_blk); 2563 (unsigned long long)start_blk,
2574 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n", 2564 bg_start_bit, num_clusters);
2575 (unsigned long long)bg_blkno, bg_start_bit);
2576 2565
2577 status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 2566 status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2578 bg_start_bit, bg_blkno, 2567 bg_start_bit, bg_blkno,
@@ -2586,7 +2575,8 @@ static int _ocfs2_free_clusters(handle_t *handle,
2586 num_clusters); 2575 num_clusters);
2587 2576
2588out: 2577out:
2589 mlog_exit(status); 2578 if (status)
2579 mlog_errno(status);
2590 return status; 2580 return status;
2591} 2581}
2592 2582
@@ -2756,7 +2746,7 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2756 struct buffer_head *inode_bh = NULL; 2746 struct buffer_head *inode_bh = NULL;
2757 struct ocfs2_dinode *inode_fe; 2747 struct ocfs2_dinode *inode_fe;
2758 2748
2759 mlog_entry("blkno: %llu\n", (unsigned long long)blkno); 2749 trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
2760 2750
2761 /* dirty read disk */ 2751 /* dirty read disk */
2762 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); 2752 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
@@ -2793,7 +2783,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2793bail: 2783bail:
2794 brelse(inode_bh); 2784 brelse(inode_bh);
2795 2785
2796 mlog_exit(status); 2786 if (status)
2787 mlog_errno(status);
2797 return status; 2788 return status;
2798} 2789}
2799 2790
@@ -2816,8 +2807,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2816 u64 bg_blkno; 2807 u64 bg_blkno;
2817 int status; 2808 int status;
2818 2809
2819 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno, 2810 trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
2820 (unsigned int)bit); 2811 (unsigned int)bit);
2821 2812
2822 alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data; 2813 alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2823 if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) { 2814 if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
@@ -2844,7 +2835,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2844bail: 2835bail:
2845 brelse(group_bh); 2836 brelse(group_bh);
2846 2837
2847 mlog_exit(status); 2838 if (status)
2839 mlog_errno(status);
2848 return status; 2840 return status;
2849} 2841}
2850 2842
@@ -2869,7 +2861,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2869 struct inode *inode_alloc_inode; 2861 struct inode *inode_alloc_inode;
2870 struct buffer_head *alloc_bh = NULL; 2862 struct buffer_head *alloc_bh = NULL;
2871 2863
2872 mlog_entry("blkno: %llu", (unsigned long long)blkno); 2864 trace_ocfs2_test_inode_bit((unsigned long long)blkno);
2873 2865
2874 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, 2866 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2875 &group_blkno, &suballoc_bit); 2867 &group_blkno, &suballoc_bit);
@@ -2910,6 +2902,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2910 iput(inode_alloc_inode); 2902 iput(inode_alloc_inode);
2911 brelse(alloc_bh); 2903 brelse(alloc_bh);
2912bail: 2904bail:
2913 mlog_exit(status); 2905 if (status)
2906 mlog_errno(status);
2914 return status; 2907 return status;
2915} 2908}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 38f986d2447e..5a521c748859 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -42,7 +42,9 @@
42#include <linux/seq_file.h> 42#include <linux/seq_file.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44 44
45#define MLOG_MASK_PREFIX ML_SUPER 45#define CREATE_TRACE_POINTS
46#include "ocfs2_trace.h"
47
46#include <cluster/masklog.h> 48#include <cluster/masklog.h>
47 49
48#include "ocfs2.h" 50#include "ocfs2.h"
@@ -76,7 +78,7 @@ static struct kmem_cache *ocfs2_inode_cachep = NULL;
76struct kmem_cache *ocfs2_dquot_cachep; 78struct kmem_cache *ocfs2_dquot_cachep;
77struct kmem_cache *ocfs2_qf_chunk_cachep; 79struct kmem_cache *ocfs2_qf_chunk_cachep;
78 80
79/* OCFS2 needs to schedule several differnt types of work which 81/* OCFS2 needs to schedule several different types of work which
80 * require cluster locking, disk I/O, recovery waits, etc. Since these 82 * require cluster locking, disk I/O, recovery waits, etc. Since these
81 * types of work tend to be heavy we avoid using the kernel events 83 * types of work tend to be heavy we avoid using the kernel events
82 * workqueue and schedule on our own. */ 84 * workqueue and schedule on our own. */
@@ -441,8 +443,6 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
441 int status = 0; 443 int status = 0;
442 int i; 444 int i;
443 445
444 mlog_entry_void();
445
446 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0); 446 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
447 if (IS_ERR(new)) { 447 if (IS_ERR(new)) {
448 status = PTR_ERR(new); 448 status = PTR_ERR(new);
@@ -478,7 +478,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
478 } 478 }
479 479
480bail: 480bail:
481 mlog_exit(status); 481 if (status)
482 mlog_errno(status);
482 return status; 483 return status;
483} 484}
484 485
@@ -488,8 +489,6 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
488 int status = 0; 489 int status = 0;
489 int i; 490 int i;
490 491
491 mlog_entry_void();
492
493 for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; 492 for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
494 i < NUM_SYSTEM_INODES; 493 i < NUM_SYSTEM_INODES;
495 i++) { 494 i++) {
@@ -508,7 +507,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
508 } 507 }
509 508
510bail: 509bail:
511 mlog_exit(status); 510 if (status)
511 mlog_errno(status);
512 return status; 512 return status;
513} 513}
514 514
@@ -517,8 +517,6 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
517 int i; 517 int i;
518 struct inode *inode; 518 struct inode *inode;
519 519
520 mlog_entry_void();
521
522 for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) { 520 for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
523 inode = osb->global_system_inodes[i]; 521 inode = osb->global_system_inodes[i];
524 if (inode) { 522 if (inode) {
@@ -540,7 +538,7 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
540 } 538 }
541 539
542 if (!osb->local_system_inodes) 540 if (!osb->local_system_inodes)
543 goto out; 541 return;
544 542
545 for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) { 543 for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
546 if (osb->local_system_inodes[i]) { 544 if (osb->local_system_inodes[i]) {
@@ -551,9 +549,6 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
551 549
552 kfree(osb->local_system_inodes); 550 kfree(osb->local_system_inodes);
553 osb->local_system_inodes = NULL; 551 osb->local_system_inodes = NULL;
554
555out:
556 mlog_exit(0);
557} 552}
558 553
559/* We're allocating fs objects, use GFP_NOFS */ 554/* We're allocating fs objects, use GFP_NOFS */
@@ -684,12 +679,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
684 } 679 }
685 680
686 if (*flags & MS_RDONLY) { 681 if (*flags & MS_RDONLY) {
687 mlog(0, "Going to ro mode.\n");
688 sb->s_flags |= MS_RDONLY; 682 sb->s_flags |= MS_RDONLY;
689 osb->osb_flags |= OCFS2_OSB_SOFT_RO; 683 osb->osb_flags |= OCFS2_OSB_SOFT_RO;
690 } else { 684 } else {
691 mlog(0, "Making ro filesystem writeable.\n");
692
693 if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { 685 if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
694 mlog(ML_ERROR, "Cannot remount RDWR " 686 mlog(ML_ERROR, "Cannot remount RDWR "
695 "filesystem due to previous errors.\n"); 687 "filesystem due to previous errors.\n");
@@ -707,6 +699,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
707 sb->s_flags &= ~MS_RDONLY; 699 sb->s_flags &= ~MS_RDONLY;
708 osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; 700 osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
709 } 701 }
702 trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags);
710unlock_osb: 703unlock_osb:
711 spin_unlock(&osb->osb_lock); 704 spin_unlock(&osb->osb_lock);
712 /* Enable quota accounting after remounting RW */ 705 /* Enable quota accounting after remounting RW */
@@ -1032,7 +1025,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1032 char nodestr[8]; 1025 char nodestr[8];
1033 struct ocfs2_blockcheck_stats stats; 1026 struct ocfs2_blockcheck_stats stats;
1034 1027
1035 mlog_entry("%p, %p, %i", sb, data, silent); 1028 trace_ocfs2_fill_super(sb, data, silent);
1036 1029
1037 if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { 1030 if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
1038 status = -EINVAL; 1031 status = -EINVAL;
@@ -1208,7 +1201,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1208 mlog_errno(status); 1201 mlog_errno(status);
1209 atomic_set(&osb->vol_state, VOLUME_DISABLED); 1202 atomic_set(&osb->vol_state, VOLUME_DISABLED);
1210 wake_up(&osb->osb_mount_event); 1203 wake_up(&osb->osb_mount_event);
1211 mlog_exit(status);
1212 return status; 1204 return status;
1213 } 1205 }
1214 } 1206 }
@@ -1222,7 +1214,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1222 /* Start this when the mount is almost sure of being successful */ 1214 /* Start this when the mount is almost sure of being successful */
1223 ocfs2_orphan_scan_start(osb); 1215 ocfs2_orphan_scan_start(osb);
1224 1216
1225 mlog_exit(status);
1226 return status; 1217 return status;
1227 1218
1228read_super_error: 1219read_super_error:
@@ -1237,7 +1228,8 @@ read_super_error:
1237 ocfs2_dismount_volume(sb, 1); 1228 ocfs2_dismount_volume(sb, 1);
1238 } 1229 }
1239 1230
1240 mlog_exit(status); 1231 if (status)
1232 mlog_errno(status);
1241 return status; 1233 return status;
1242} 1234}
1243 1235
@@ -1316,12 +1308,11 @@ static int ocfs2_parse_options(struct super_block *sb,
1316 struct mount_options *mopt, 1308 struct mount_options *mopt,
1317 int is_remount) 1309 int is_remount)
1318{ 1310{
1319 int status; 1311 int status, user_stack = 0;
1320 char *p; 1312 char *p;
1321 u32 tmp; 1313 u32 tmp;
1322 1314
1323 mlog_entry("remount: %d, options: \"%s\"\n", is_remount, 1315 trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
1324 options ? options : "(none)");
1325 1316
1326 mopt->commit_interval = 0; 1317 mopt->commit_interval = 0;
1327 mopt->mount_opt = OCFS2_MOUNT_NOINTR; 1318 mopt->mount_opt = OCFS2_MOUNT_NOINTR;
@@ -1459,6 +1450,15 @@ static int ocfs2_parse_options(struct super_block *sb,
1459 memcpy(mopt->cluster_stack, args[0].from, 1450 memcpy(mopt->cluster_stack, args[0].from,
1460 OCFS2_STACK_LABEL_LEN); 1451 OCFS2_STACK_LABEL_LEN);
1461 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; 1452 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
1453 /*
1454 * Open code the memcmp here as we don't have
1455 * an osb to pass to
1456 * ocfs2_userspace_stack().
1457 */
1458 if (memcmp(mopt->cluster_stack,
1459 OCFS2_CLASSIC_CLUSTER_STACK,
1460 OCFS2_STACK_LABEL_LEN))
1461 user_stack = 1;
1462 break; 1462 break;
1463 case Opt_inode64: 1463 case Opt_inode64:
1464 mopt->mount_opt |= OCFS2_MOUNT_INODE64; 1464 mopt->mount_opt |= OCFS2_MOUNT_INODE64;
@@ -1514,19 +1514,21 @@ static int ocfs2_parse_options(struct super_block *sb,
1514 } 1514 }
1515 } 1515 }
1516 1516
1517 /* Ensure only one heartbeat mode */ 1517 if (user_stack == 0) {
1518 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | 1518 /* Ensure only one heartbeat mode */
1519 OCFS2_MOUNT_HB_NONE); 1519 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL |
1520 if (hweight32(tmp) != 1) { 1520 OCFS2_MOUNT_HB_GLOBAL |
1521 mlog(ML_ERROR, "Invalid heartbeat mount options\n"); 1521 OCFS2_MOUNT_HB_NONE);
1522 status = 0; 1522 if (hweight32(tmp) != 1) {
1523 goto bail; 1523 mlog(ML_ERROR, "Invalid heartbeat mount options\n");
1524 status = 0;
1525 goto bail;
1526 }
1524 } 1527 }
1525 1528
1526 status = 1; 1529 status = 1;
1527 1530
1528bail: 1531bail:
1529 mlog_exit(status);
1530 return status; 1532 return status;
1531} 1533}
1532 1534
@@ -1617,8 +1619,6 @@ static int __init ocfs2_init(void)
1617{ 1619{
1618 int status; 1620 int status;
1619 1621
1620 mlog_entry_void();
1621
1622 ocfs2_print_version(); 1622 ocfs2_print_version();
1623 1623
1624 status = init_ocfs2_uptodate_cache(); 1624 status = init_ocfs2_uptodate_cache();
@@ -1645,22 +1645,16 @@ static int __init ocfs2_init(void)
1645 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1645 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1646 } 1646 }
1647 1647
1648 status = ocfs2_quota_setup();
1649 if (status)
1650 goto leave;
1651
1652 ocfs2_set_locking_protocol(); 1648 ocfs2_set_locking_protocol();
1653 1649
1654 status = register_quota_format(&ocfs2_quota_format); 1650 status = register_quota_format(&ocfs2_quota_format);
1655leave: 1651leave:
1656 if (status < 0) { 1652 if (status < 0) {
1657 ocfs2_quota_shutdown();
1658 ocfs2_free_mem_caches(); 1653 ocfs2_free_mem_caches();
1659 exit_ocfs2_uptodate_cache(); 1654 exit_ocfs2_uptodate_cache();
1655 mlog_errno(status);
1660 } 1656 }
1661 1657
1662 mlog_exit(status);
1663
1664 if (status >= 0) { 1658 if (status >= 0) {
1665 return register_filesystem(&ocfs2_fs_type); 1659 return register_filesystem(&ocfs2_fs_type);
1666 } else 1660 } else
@@ -1669,10 +1663,6 @@ leave:
1669 1663
1670static void __exit ocfs2_exit(void) 1664static void __exit ocfs2_exit(void)
1671{ 1665{
1672 mlog_entry_void();
1673
1674 ocfs2_quota_shutdown();
1675
1676 if (ocfs2_wq) { 1666 if (ocfs2_wq) {
1677 flush_workqueue(ocfs2_wq); 1667 flush_workqueue(ocfs2_wq);
1678 destroy_workqueue(ocfs2_wq); 1668 destroy_workqueue(ocfs2_wq);
@@ -1687,18 +1677,14 @@ static void __exit ocfs2_exit(void)
1687 unregister_filesystem(&ocfs2_fs_type); 1677 unregister_filesystem(&ocfs2_fs_type);
1688 1678
1689 exit_ocfs2_uptodate_cache(); 1679 exit_ocfs2_uptodate_cache();
1690
1691 mlog_exit_void();
1692} 1680}
1693 1681
1694static void ocfs2_put_super(struct super_block *sb) 1682static void ocfs2_put_super(struct super_block *sb)
1695{ 1683{
1696 mlog_entry("(0x%p)\n", sb); 1684 trace_ocfs2_put_super(sb);
1697 1685
1698 ocfs2_sync_blockdev(sb); 1686 ocfs2_sync_blockdev(sb);
1699 ocfs2_dismount_volume(sb, 0); 1687 ocfs2_dismount_volume(sb, 0);
1700
1701 mlog_exit_void();
1702} 1688}
1703 1689
1704static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) 1690static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1710,7 +1696,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
1710 struct buffer_head *bh = NULL; 1696 struct buffer_head *bh = NULL;
1711 struct inode *inode = NULL; 1697 struct inode *inode = NULL;
1712 1698
1713 mlog_entry("(%p, %p)\n", dentry->d_sb, buf); 1699 trace_ocfs2_statfs(dentry->d_sb, buf);
1714 1700
1715 osb = OCFS2_SB(dentry->d_sb); 1701 osb = OCFS2_SB(dentry->d_sb);
1716 1702
@@ -1757,7 +1743,8 @@ bail:
1757 if (inode) 1743 if (inode)
1758 iput(inode); 1744 iput(inode);
1759 1745
1760 mlog_exit(status); 1746 if (status)
1747 mlog_errno(status);
1761 1748
1762 return status; 1749 return status;
1763} 1750}
@@ -1877,8 +1864,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
1877 int unlock_super = 0; 1864 int unlock_super = 0;
1878 struct ocfs2_super *osb = OCFS2_SB(sb); 1865 struct ocfs2_super *osb = OCFS2_SB(sb);
1879 1866
1880 mlog_entry_void();
1881
1882 if (ocfs2_is_hard_readonly(osb)) 1867 if (ocfs2_is_hard_readonly(osb))
1883 goto leave; 1868 goto leave;
1884 1869
@@ -1923,7 +1908,6 @@ leave:
1923 if (unlock_super) 1908 if (unlock_super)
1924 ocfs2_super_unlock(osb, 1); 1909 ocfs2_super_unlock(osb, 1);
1925 1910
1926 mlog_exit(status);
1927 return status; 1911 return status;
1928} 1912}
1929 1913
@@ -1933,7 +1917,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1933 struct ocfs2_super *osb = NULL; 1917 struct ocfs2_super *osb = NULL;
1934 char nodestr[8]; 1918 char nodestr[8];
1935 1919
1936 mlog_entry("(0x%p)\n", sb); 1920 trace_ocfs2_dismount_volume(sb);
1937 1921
1938 BUG_ON(!sb); 1922 BUG_ON(!sb);
1939 osb = OCFS2_SB(sb); 1923 osb = OCFS2_SB(sb);
@@ -2085,8 +2069,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2085 struct ocfs2_super *osb; 2069 struct ocfs2_super *osb;
2086 u64 total_blocks; 2070 u64 total_blocks;
2087 2071
2088 mlog_entry_void();
2089
2090 osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL); 2072 osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL);
2091 if (!osb) { 2073 if (!osb) {
2092 status = -ENOMEM; 2074 status = -ENOMEM;
@@ -2150,7 +2132,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2150 status = -EINVAL; 2132 status = -EINVAL;
2151 goto bail; 2133 goto bail;
2152 } 2134 }
2153 mlog(0, "max_slots for this device: %u\n", osb->max_slots);
2154 2135
2155 ocfs2_orphan_scan_init(osb); 2136 ocfs2_orphan_scan_init(osb);
2156 2137
@@ -2289,7 +2270,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2289 osb->s_clustersize_bits = 2270 osb->s_clustersize_bits =
2290 le32_to_cpu(di->id2.i_super.s_clustersize_bits); 2271 le32_to_cpu(di->id2.i_super.s_clustersize_bits);
2291 osb->s_clustersize = 1 << osb->s_clustersize_bits; 2272 osb->s_clustersize = 1 << osb->s_clustersize_bits;
2292 mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits);
2293 2273
2294 if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE || 2274 if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE ||
2295 osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) { 2275 osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
@@ -2328,11 +2308,10 @@ static int ocfs2_initialize_super(struct super_block *sb,
2328 le64_to_cpu(di->id2.i_super.s_first_cluster_group); 2308 le64_to_cpu(di->id2.i_super.s_first_cluster_group);
2329 osb->fs_generation = le32_to_cpu(di->i_fs_generation); 2309 osb->fs_generation = le32_to_cpu(di->i_fs_generation);
2330 osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash); 2310 osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
2331 mlog(0, "vol_label: %s\n", osb->vol_label); 2311 trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str,
2332 mlog(0, "uuid: %s\n", osb->uuid_str); 2312 (unsigned long long)osb->root_blkno,
2333 mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n", 2313 (unsigned long long)osb->system_dir_blkno,
2334 (unsigned long long)osb->root_blkno, 2314 osb->s_clustersize_bits);
2335 (unsigned long long)osb->system_dir_blkno);
2336 2315
2337 osb->osb_dlm_debug = ocfs2_new_dlm_debug(); 2316 osb->osb_dlm_debug = ocfs2_new_dlm_debug();
2338 if (!osb->osb_dlm_debug) { 2317 if (!osb->osb_dlm_debug) {
@@ -2375,7 +2354,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2375 } 2354 }
2376 2355
2377bail: 2356bail:
2378 mlog_exit(status);
2379 return status; 2357 return status;
2380} 2358}
2381 2359
@@ -2391,8 +2369,6 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
2391{ 2369{
2392 int status = -EAGAIN; 2370 int status = -EAGAIN;
2393 2371
2394 mlog_entry_void();
2395
2396 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, 2372 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
2397 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { 2373 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
2398 /* We have to do a raw check of the feature here */ 2374 /* We have to do a raw check of the feature here */
@@ -2447,7 +2423,8 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
2447 } 2423 }
2448 2424
2449out: 2425out:
2450 mlog_exit(status); 2426 if (status && status != -EAGAIN)
2427 mlog_errno(status);
2451 return status; 2428 return status;
2452} 2429}
2453 2430
@@ -2460,8 +2437,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2460 * recover 2437 * recover
2461 * ourselves. */ 2438 * ourselves. */
2462 2439
2463 mlog_entry_void();
2464
2465 /* Init our journal object. */ 2440 /* Init our journal object. */
2466 status = ocfs2_journal_init(osb->journal, &dirty); 2441 status = ocfs2_journal_init(osb->journal, &dirty);
2467 if (status < 0) { 2442 if (status < 0) {
@@ -2511,8 +2486,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2511 * ourselves as mounted. */ 2486 * ourselves as mounted. */
2512 } 2487 }
2513 2488
2514 mlog(0, "Journal loaded.\n");
2515
2516 status = ocfs2_load_local_alloc(osb); 2489 status = ocfs2_load_local_alloc(osb);
2517 if (status < 0) { 2490 if (status < 0) {
2518 mlog_errno(status); 2491 mlog_errno(status);
@@ -2544,7 +2517,8 @@ finally:
2544 if (local_alloc) 2517 if (local_alloc)
2545 kfree(local_alloc); 2518 kfree(local_alloc);
2546 2519
2547 mlog_exit(status); 2520 if (status)
2521 mlog_errno(status);
2548 return status; 2522 return status;
2549} 2523}
2550 2524
@@ -2556,8 +2530,6 @@ finally:
2556 */ 2530 */
2557static void ocfs2_delete_osb(struct ocfs2_super *osb) 2531static void ocfs2_delete_osb(struct ocfs2_super *osb)
2558{ 2532{
2559 mlog_entry_void();
2560
2561 /* This function assumes that the caller has the main osb resource */ 2533 /* This function assumes that the caller has the main osb resource */
2562 2534
2563 ocfs2_free_slot_info(osb); 2535 ocfs2_free_slot_info(osb);
@@ -2575,8 +2547,6 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
2575 kfree(osb->uuid_str); 2547 kfree(osb->uuid_str);
2576 ocfs2_put_dlm_debug(osb->osb_dlm_debug); 2548 ocfs2_put_dlm_debug(osb->osb_dlm_debug);
2577 memset(osb, 0, sizeof(struct ocfs2_super)); 2549 memset(osb, 0, sizeof(struct ocfs2_super));
2578
2579 mlog_exit_void();
2580} 2550}
2581 2551
2582/* Put OCFS2 into a readonly state, or (if the user specifies it), 2552/* Put OCFS2 into a readonly state, or (if the user specifies it),
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 9975457c981f..5d22872e2bb3 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -40,7 +40,6 @@
40#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/namei.h> 41#include <linux/namei.h>
42 42
43#define MLOG_MASK_PREFIX ML_NAMEI
44#include <cluster/masklog.h> 43#include <cluster/masklog.h>
45 44
46#include "ocfs2.h" 45#include "ocfs2.h"
@@ -62,8 +61,6 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
62 char *link = NULL; 61 char *link = NULL;
63 struct ocfs2_dinode *fe; 62 struct ocfs2_dinode *fe;
64 63
65 mlog_entry_void();
66
67 status = ocfs2_read_inode_block(inode, bh); 64 status = ocfs2_read_inode_block(inode, bh);
68 if (status < 0) { 65 if (status < 0) {
69 mlog_errno(status); 66 mlog_errno(status);
@@ -74,7 +71,6 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
74 fe = (struct ocfs2_dinode *) (*bh)->b_data; 71 fe = (struct ocfs2_dinode *) (*bh)->b_data;
75 link = (char *) fe->id2.i_symlink; 72 link = (char *) fe->id2.i_symlink;
76bail: 73bail:
77 mlog_exit(status);
78 74
79 return link; 75 return link;
80} 76}
@@ -88,8 +84,6 @@ static int ocfs2_readlink(struct dentry *dentry,
88 struct buffer_head *bh = NULL; 84 struct buffer_head *bh = NULL;
89 struct inode *inode = dentry->d_inode; 85 struct inode *inode = dentry->d_inode;
90 86
91 mlog_entry_void();
92
93 link = ocfs2_fast_symlink_getlink(inode, &bh); 87 link = ocfs2_fast_symlink_getlink(inode, &bh);
94 if (IS_ERR(link)) { 88 if (IS_ERR(link)) {
95 ret = PTR_ERR(link); 89 ret = PTR_ERR(link);
@@ -104,7 +98,8 @@ static int ocfs2_readlink(struct dentry *dentry,
104 98
105 brelse(bh); 99 brelse(bh);
106out: 100out:
107 mlog_exit(ret); 101 if (ret < 0)
102 mlog_errno(ret);
108 return ret; 103 return ret;
109} 104}
110 105
@@ -117,8 +112,6 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
117 struct inode *inode = dentry->d_inode; 112 struct inode *inode = dentry->d_inode;
118 struct buffer_head *bh = NULL; 113 struct buffer_head *bh = NULL;
119 114
120 mlog_entry_void();
121
122 BUG_ON(!ocfs2_inode_is_fast_symlink(inode)); 115 BUG_ON(!ocfs2_inode_is_fast_symlink(inode));
123 target = ocfs2_fast_symlink_getlink(inode, &bh); 116 target = ocfs2_fast_symlink_getlink(inode, &bh);
124 if (IS_ERR(target)) { 117 if (IS_ERR(target)) {
@@ -142,7 +135,8 @@ bail:
142 nd_set_link(nd, status ? ERR_PTR(status) : link); 135 nd_set_link(nd, status ? ERR_PTR(status) : link);
143 brelse(bh); 136 brelse(bh);
144 137
145 mlog_exit(status); 138 if (status)
139 mlog_errno(status);
146 return NULL; 140 return NULL;
147} 141}
148 142
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 902efb23b6a6..3d635f4bbb20 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -27,7 +27,6 @@
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29 29
30#define MLOG_MASK_PREFIX ML_INODE
31#include <cluster/masklog.h> 30#include <cluster/masklog.h>
32 31
33#include "ocfs2.h" 32#include "ocfs2.h"
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index a0a120e82b97..52eaf33d346f 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -54,14 +54,13 @@
54#include <linux/buffer_head.h> 54#include <linux/buffer_head.h>
55#include <linux/rbtree.h> 55#include <linux/rbtree.h>
56 56
57#define MLOG_MASK_PREFIX ML_UPTODATE
58
59#include <cluster/masklog.h> 57#include <cluster/masklog.h>
60 58
61#include "ocfs2.h" 59#include "ocfs2.h"
62 60
63#include "inode.h" 61#include "inode.h"
64#include "uptodate.h" 62#include "uptodate.h"
63#include "ocfs2_trace.h"
65 64
66struct ocfs2_meta_cache_item { 65struct ocfs2_meta_cache_item {
67 struct rb_node c_node; 66 struct rb_node c_node;
@@ -152,8 +151,8 @@ static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
152 while ((node = rb_last(root)) != NULL) { 151 while ((node = rb_last(root)) != NULL) {
153 item = rb_entry(node, struct ocfs2_meta_cache_item, c_node); 152 item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
154 153
155 mlog(0, "Purge item %llu\n", 154 trace_ocfs2_purge_copied_metadata_tree(
156 (unsigned long long) item->c_block); 155 (unsigned long long) item->c_block);
157 156
158 rb_erase(&item->c_node, root); 157 rb_erase(&item->c_node, root);
159 kmem_cache_free(ocfs2_uptodate_cachep, item); 158 kmem_cache_free(ocfs2_uptodate_cachep, item);
@@ -180,9 +179,9 @@ void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci)
180 tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE); 179 tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE);
181 to_purge = ci->ci_num_cached; 180 to_purge = ci->ci_num_cached;
182 181
183 mlog(0, "Purge %u %s items from Owner %llu\n", to_purge, 182 trace_ocfs2_metadata_cache_purge(
184 tree ? "array" : "tree", 183 (unsigned long long)ocfs2_metadata_cache_owner(ci),
185 (unsigned long long)ocfs2_metadata_cache_owner(ci)); 184 to_purge, tree);
186 185
187 /* If we're a tree, save off the root so that we can safely 186 /* If we're a tree, save off the root so that we can safely
188 * initialize the cache. We do the work to free tree members 187 * initialize the cache. We do the work to free tree members
@@ -249,10 +248,10 @@ static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
249 248
250 ocfs2_metadata_cache_lock(ci); 249 ocfs2_metadata_cache_lock(ci);
251 250
252 mlog(0, "Owner %llu, query block %llu (inline = %u)\n", 251 trace_ocfs2_buffer_cached_begin(
253 (unsigned long long)ocfs2_metadata_cache_owner(ci), 252 (unsigned long long)ocfs2_metadata_cache_owner(ci),
254 (unsigned long long) bh->b_blocknr, 253 (unsigned long long) bh->b_blocknr,
255 !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE)); 254 !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));
256 255
257 if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) 256 if (ci->ci_flags & OCFS2_CACHE_FL_INLINE)
258 index = ocfs2_search_cache_array(ci, bh->b_blocknr); 257 index = ocfs2_search_cache_array(ci, bh->b_blocknr);
@@ -261,7 +260,7 @@ static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
261 260
262 ocfs2_metadata_cache_unlock(ci); 261 ocfs2_metadata_cache_unlock(ci);
263 262
264 mlog(0, "index = %d, item = %p\n", index, item); 263 trace_ocfs2_buffer_cached_end(index, item);
265 264
266 return (index != -1) || (item != NULL); 265 return (index != -1) || (item != NULL);
267} 266}
@@ -306,8 +305,9 @@ static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
306{ 305{
307 BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY); 306 BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY);
308 307
309 mlog(0, "block %llu takes position %u\n", (unsigned long long) block, 308 trace_ocfs2_append_cache_array(
310 ci->ci_num_cached); 309 (unsigned long long)ocfs2_metadata_cache_owner(ci),
310 (unsigned long long)block, ci->ci_num_cached);
311 311
312 ci->ci_cache.ci_array[ci->ci_num_cached] = block; 312 ci->ci_cache.ci_array[ci->ci_num_cached] = block;
313 ci->ci_num_cached++; 313 ci->ci_num_cached++;
@@ -324,8 +324,9 @@ static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
324 struct rb_node **p = &ci->ci_cache.ci_tree.rb_node; 324 struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
325 struct ocfs2_meta_cache_item *tmp; 325 struct ocfs2_meta_cache_item *tmp;
326 326
327 mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block, 327 trace_ocfs2_insert_cache_tree(
328 ci->ci_num_cached); 328 (unsigned long long)ocfs2_metadata_cache_owner(ci),
329 (unsigned long long)block, ci->ci_num_cached);
329 330
330 while(*p) { 331 while(*p) {
331 parent = *p; 332 parent = *p;
@@ -389,9 +390,9 @@ static void ocfs2_expand_cache(struct ocfs2_caching_info *ci,
389 tree[i] = NULL; 390 tree[i] = NULL;
390 } 391 }
391 392
392 mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n", 393 trace_ocfs2_expand_cache(
393 (unsigned long long)ocfs2_metadata_cache_owner(ci), 394 (unsigned long long)ocfs2_metadata_cache_owner(ci),
394 ci->ci_flags, ci->ci_num_cached); 395 ci->ci_flags, ci->ci_num_cached);
395} 396}
396 397
397/* Slow path function - memory allocation is necessary. See the 398/* Slow path function - memory allocation is necessary. See the
@@ -405,9 +406,9 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
405 struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] = 406 struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] =
406 { NULL, }; 407 { NULL, };
407 408
408 mlog(0, "Owner %llu, block %llu, expand = %d\n", 409 trace_ocfs2_set_buffer_uptodate(
409 (unsigned long long)ocfs2_metadata_cache_owner(ci), 410 (unsigned long long)ocfs2_metadata_cache_owner(ci),
410 (unsigned long long)block, expand_tree); 411 (unsigned long long)block, expand_tree);
411 412
412 new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS); 413 new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
413 if (!new) { 414 if (!new) {
@@ -433,7 +434,6 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
433 434
434 ocfs2_metadata_cache_lock(ci); 435 ocfs2_metadata_cache_lock(ci);
435 if (ocfs2_insert_can_use_array(ci)) { 436 if (ocfs2_insert_can_use_array(ci)) {
436 mlog(0, "Someone cleared the tree underneath us\n");
437 /* Ok, items were removed from the cache in between 437 /* Ok, items were removed from the cache in between
438 * locks. Detect this and revert back to the fast path */ 438 * locks. Detect this and revert back to the fast path */
439 ocfs2_append_cache_array(ci, block); 439 ocfs2_append_cache_array(ci, block);
@@ -490,9 +490,9 @@ void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
490 if (ocfs2_buffer_cached(ci, bh)) 490 if (ocfs2_buffer_cached(ci, bh))
491 return; 491 return;
492 492
493 mlog(0, "Owner %llu, inserting block %llu\n", 493 trace_ocfs2_set_buffer_uptodate_begin(
494 (unsigned long long)ocfs2_metadata_cache_owner(ci), 494 (unsigned long long)ocfs2_metadata_cache_owner(ci),
495 (unsigned long long)bh->b_blocknr); 495 (unsigned long long)bh->b_blocknr);
496 496
497 /* No need to recheck under spinlock - insertion is guarded by 497 /* No need to recheck under spinlock - insertion is guarded by
498 * co_io_lock() */ 498 * co_io_lock() */
@@ -542,8 +542,9 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
542 BUG_ON(index >= ci->ci_num_cached); 542 BUG_ON(index >= ci->ci_num_cached);
543 BUG_ON(!ci->ci_num_cached); 543 BUG_ON(!ci->ci_num_cached);
544 544
545 mlog(0, "remove index %d (num_cached = %u\n", index, 545 trace_ocfs2_remove_metadata_array(
546 ci->ci_num_cached); 546 (unsigned long long)ocfs2_metadata_cache_owner(ci),
547 index, ci->ci_num_cached);
547 548
548 ci->ci_num_cached--; 549 ci->ci_num_cached--;
549 550
@@ -559,8 +560,9 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
559static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, 560static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
560 struct ocfs2_meta_cache_item *item) 561 struct ocfs2_meta_cache_item *item)
561{ 562{
562 mlog(0, "remove block %llu from tree\n", 563 trace_ocfs2_remove_metadata_tree(
563 (unsigned long long) item->c_block); 564 (unsigned long long)ocfs2_metadata_cache_owner(ci),
565 (unsigned long long)item->c_block);
564 566
565 rb_erase(&item->c_node, &ci->ci_cache.ci_tree); 567 rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
566 ci->ci_num_cached--; 568 ci->ci_num_cached--;
@@ -573,10 +575,10 @@ static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci,
573 struct ocfs2_meta_cache_item *item = NULL; 575 struct ocfs2_meta_cache_item *item = NULL;
574 576
575 ocfs2_metadata_cache_lock(ci); 577 ocfs2_metadata_cache_lock(ci);
576 mlog(0, "Owner %llu, remove %llu, items = %u, array = %u\n", 578 trace_ocfs2_remove_block_from_cache(
577 (unsigned long long)ocfs2_metadata_cache_owner(ci), 579 (unsigned long long)ocfs2_metadata_cache_owner(ci),
578 (unsigned long long) block, ci->ci_num_cached, 580 (unsigned long long) block, ci->ci_num_cached,
579 ci->ci_flags & OCFS2_CACHE_FL_INLINE); 581 ci->ci_flags);
580 582
581 if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) { 583 if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
582 index = ocfs2_search_cache_array(ci, block); 584 index = ocfs2_search_cache_array(ci, block);
@@ -626,9 +628,6 @@ int __init init_ocfs2_uptodate_cache(void)
626 if (!ocfs2_uptodate_cachep) 628 if (!ocfs2_uptodate_cachep)
627 return -ENOMEM; 629 return -ENOMEM;
628 630
629 mlog(0, "%u inlined cache items per inode.\n",
630 OCFS2_CACHE_INFO_MAX_ARRAY);
631
632 return 0; 631 return 0;
633} 632}
634 633
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 67cd43914641..81ecf9c0bf0a 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -37,7 +37,6 @@
37#include <linux/string.h> 37#include <linux/string.h>
38#include <linux/security.h> 38#include <linux/security.h>
39 39
40#define MLOG_MASK_PREFIX ML_XATTR
41#include <cluster/masklog.h> 40#include <cluster/masklog.h>
42 41
43#include "ocfs2.h" 42#include "ocfs2.h"
@@ -57,6 +56,7 @@
57#include "xattr.h" 56#include "xattr.h"
58#include "refcounttree.h" 57#include "refcounttree.h"
59#include "acl.h" 58#include "acl.h"
59#include "ocfs2_trace.h"
60 60
61struct ocfs2_xattr_def_value_root { 61struct ocfs2_xattr_def_value_root {
62 struct ocfs2_xattr_value_root xv; 62 struct ocfs2_xattr_value_root xv;
@@ -474,8 +474,7 @@ static int ocfs2_validate_xattr_block(struct super_block *sb,
474 struct ocfs2_xattr_block *xb = 474 struct ocfs2_xattr_block *xb =
475 (struct ocfs2_xattr_block *)bh->b_data; 475 (struct ocfs2_xattr_block *)bh->b_data;
476 476
477 mlog(0, "Validating xattr block %llu\n", 477 trace_ocfs2_validate_xattr_block((unsigned long long)bh->b_blocknr);
478 (unsigned long long)bh->b_blocknr);
479 478
480 BUG_ON(!buffer_uptodate(bh)); 479 BUG_ON(!buffer_uptodate(bh));
481 480
@@ -715,11 +714,11 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
715 u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); 714 u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
716 struct ocfs2_extent_tree et; 715 struct ocfs2_extent_tree et;
717 716
718 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
719
720 ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); 717 ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
721 718
722 while (clusters_to_add) { 719 while (clusters_to_add) {
720 trace_ocfs2_xattr_extend_allocation(clusters_to_add);
721
723 status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, 722 status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
724 OCFS2_JOURNAL_ACCESS_WRITE); 723 OCFS2_JOURNAL_ACCESS_WRITE);
725 if (status < 0) { 724 if (status < 0) {
@@ -754,8 +753,6 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
754 */ 753 */
755 BUG_ON(why == RESTART_META); 754 BUG_ON(why == RESTART_META);
756 755
757 mlog(0, "restarting xattr value extension for %u"
758 " clusters,.\n", clusters_to_add);
759 credits = ocfs2_calc_extend_credits(inode->i_sb, 756 credits = ocfs2_calc_extend_credits(inode->i_sb,
760 &vb->vb_xv->xr_list, 757 &vb->vb_xv->xr_list,
761 clusters_to_add); 758 clusters_to_add);
@@ -3246,8 +3243,8 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
3246 } 3243 }
3247 3244
3248 meta_add += extra_meta; 3245 meta_add += extra_meta;
3249 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " 3246 trace_ocfs2_init_xattr_set_ctxt(xi->xi_name, meta_add,
3250 "credits = %d\n", xi->xi_name, meta_add, clusters_add, *credits); 3247 clusters_add, *credits);
3251 3248
3252 if (meta_add) { 3249 if (meta_add) {
3253 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, 3250 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
@@ -3557,7 +3554,7 @@ int ocfs2_xattr_set(struct inode *inode,
3557 down_write(&OCFS2_I(inode)->ip_xattr_sem); 3554 down_write(&OCFS2_I(inode)->ip_xattr_sem);
3558 /* 3555 /*
3559 * Scan inode and external block to find the same name 3556 * Scan inode and external block to find the same name
3560 * extended attribute and collect search infomation. 3557 * extended attribute and collect search information.
3561 */ 3558 */
3562 ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis); 3559 ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
3563 if (ret) 3560 if (ret)
@@ -3581,7 +3578,7 @@ int ocfs2_xattr_set(struct inode *inode,
3581 goto cleanup; 3578 goto cleanup;
3582 } 3579 }
3583 3580
3584 /* Check whether the value is refcounted and do some prepartion. */ 3581 /* Check whether the value is refcounted and do some preparation. */
3585 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL && 3582 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL &&
3586 (!xis.not_found || !xbs.not_found)) { 3583 (!xis.not_found || !xbs.not_found)) {
3587 ret = ocfs2_prepare_refcount_xattr(inode, di, &xi, 3584 ret = ocfs2_prepare_refcount_xattr(inode, di, &xi,
@@ -3887,8 +3884,10 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
3887 3884
3888 if (found) { 3885 if (found) {
3889 xs->here = &xs->header->xh_entries[index]; 3886 xs->here = &xs->header->xh_entries[index];
3890 mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name, 3887 trace_ocfs2_xattr_bucket_find(OCFS2_I(inode)->ip_blkno,
3891 (unsigned long long)bucket_blkno(xs->bucket), index); 3888 name, name_index, name_hash,
3889 (unsigned long long)bucket_blkno(xs->bucket),
3890 index);
3892 } else 3891 } else
3893 ret = -ENODATA; 3892 ret = -ENODATA;
3894 3893
@@ -3915,8 +3914,10 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
3915 if (le16_to_cpu(el->l_next_free_rec) == 0) 3914 if (le16_to_cpu(el->l_next_free_rec) == 0)
3916 return -ENODATA; 3915 return -ENODATA;
3917 3916
3918 mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n", 3917 trace_ocfs2_xattr_index_block_find(OCFS2_I(inode)->ip_blkno,
3919 name, name_hash, name_index); 3918 name, name_index, name_hash,
3919 (unsigned long long)root_bh->b_blocknr,
3920 -1);
3920 3921
3921 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash, 3922 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash,
3922 &num_clusters, el); 3923 &num_clusters, el);
@@ -3927,9 +3928,10 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
3927 3928
3928 BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash); 3929 BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
3929 3930
3930 mlog(0, "find xattr extent rec %u clusters from %llu, the first hash " 3931 trace_ocfs2_xattr_index_block_find_rec(OCFS2_I(inode)->ip_blkno,
3931 "in the rec is %u\n", num_clusters, (unsigned long long)p_blkno, 3932 name, name_index, first_hash,
3932 first_hash); 3933 (unsigned long long)p_blkno,
3934 num_clusters);
3933 3935
3934 ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash, 3936 ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
3935 p_blkno, first_hash, num_clusters, xs); 3937 p_blkno, first_hash, num_clusters, xs);
@@ -3955,8 +3957,9 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
3955 return -ENOMEM; 3957 return -ENOMEM;
3956 } 3958 }
3957 3959
3958 mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n", 3960 trace_ocfs2_iterate_xattr_buckets(
3959 clusters, (unsigned long long)blkno); 3961 (unsigned long long)OCFS2_I(inode)->ip_blkno,
3962 (unsigned long long)blkno, clusters);
3960 3963
3961 for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) { 3964 for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
3962 ret = ocfs2_read_xattr_bucket(bucket, blkno); 3965 ret = ocfs2_read_xattr_bucket(bucket, blkno);
@@ -3972,8 +3975,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
3972 if (i == 0) 3975 if (i == 0)
3973 num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets); 3976 num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
3974 3977
3975 mlog(0, "iterating xattr bucket %llu, first hash %u\n", 3978 trace_ocfs2_iterate_xattr_bucket((unsigned long long)blkno,
3976 (unsigned long long)blkno,
3977 le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash)); 3979 le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
3978 if (func) { 3980 if (func) {
3979 ret = func(inode, bucket, para); 3981 ret = func(inode, bucket, para);
@@ -4173,9 +4175,9 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
4173 char *src = xb_bh->b_data; 4175 char *src = xb_bh->b_data;
4174 char *target = bucket_block(bucket, blks - 1); 4176 char *target = bucket_block(bucket, blks - 1);
4175 4177
4176 mlog(0, "cp xattr from block %llu to bucket %llu\n", 4178 trace_ocfs2_cp_xattr_block_to_bucket_begin(
4177 (unsigned long long)xb_bh->b_blocknr, 4179 (unsigned long long)xb_bh->b_blocknr,
4178 (unsigned long long)bucket_blkno(bucket)); 4180 (unsigned long long)bucket_blkno(bucket));
4179 4181
4180 for (i = 0; i < blks; i++) 4182 for (i = 0; i < blks; i++)
4181 memset(bucket_block(bucket, i), 0, blocksize); 4183 memset(bucket_block(bucket, i), 0, blocksize);
@@ -4211,8 +4213,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
4211 for (i = 0; i < count; i++) 4213 for (i = 0; i < count; i++)
4212 le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change); 4214 le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change);
4213 4215
4214 mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n", 4216 trace_ocfs2_cp_xattr_block_to_bucket_end(offset, size, off_change);
4215 offset, size, off_change);
4216 4217
4217 sort(target + offset, count, sizeof(struct ocfs2_xattr_entry), 4218 sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
4218 cmp_xe, swap_xe); 4219 cmp_xe, swap_xe);
@@ -4261,8 +4262,8 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
4261 struct ocfs2_xattr_tree_root *xr; 4262 struct ocfs2_xattr_tree_root *xr;
4262 u16 xb_flags = le16_to_cpu(xb->xb_flags); 4263 u16 xb_flags = le16_to_cpu(xb->xb_flags);
4263 4264
4264 mlog(0, "create xattr index block for %llu\n", 4265 trace_ocfs2_xattr_create_index_block_begin(
4265 (unsigned long long)xb_bh->b_blocknr); 4266 (unsigned long long)xb_bh->b_blocknr);
4266 4267
4267 BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); 4268 BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
4268 BUG_ON(!xs->bucket); 4269 BUG_ON(!xs->bucket);
@@ -4295,8 +4296,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
4295 */ 4296 */
4296 blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); 4297 blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
4297 4298
4298 mlog(0, "allocate 1 cluster from %llu to xattr block\n", 4299 trace_ocfs2_xattr_create_index_block((unsigned long long)blkno);
4299 (unsigned long long)blkno);
4300 4300
4301 ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); 4301 ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
4302 if (ret) { 4302 if (ret) {
@@ -4400,8 +4400,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
4400 entries = (char *)xh->xh_entries; 4400 entries = (char *)xh->xh_entries;
4401 xh_free_start = le16_to_cpu(xh->xh_free_start); 4401 xh_free_start = le16_to_cpu(xh->xh_free_start);
4402 4402
4403 mlog(0, "adjust xattr bucket in %llu, count = %u, " 4403 trace_ocfs2_defrag_xattr_bucket(
4404 "xh_free_start = %u, xh_name_value_len = %u.\n",
4405 (unsigned long long)blkno, le16_to_cpu(xh->xh_count), 4404 (unsigned long long)blkno, le16_to_cpu(xh->xh_count),
4406 xh_free_start, le16_to_cpu(xh->xh_name_value_len)); 4405 xh_free_start, le16_to_cpu(xh->xh_name_value_len));
4407 4406
@@ -4503,8 +4502,9 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
4503 BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets); 4502 BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
4504 BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize); 4503 BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
4505 4504
4506 mlog(0, "move half of xattrs in cluster %llu to %llu\n", 4505 trace_ocfs2_mv_xattr_bucket_cross_cluster(
4507 (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno); 4506 (unsigned long long)last_cluster_blkno,
4507 (unsigned long long)new_blkno);
4508 4508
4509 ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first), 4509 ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
4510 last_cluster_blkno, new_blkno, 4510 last_cluster_blkno, new_blkno,
@@ -4614,8 +4614,8 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4614 struct ocfs2_xattr_entry *xe; 4614 struct ocfs2_xattr_entry *xe;
4615 int blocksize = inode->i_sb->s_blocksize; 4615 int blocksize = inode->i_sb->s_blocksize;
4616 4616
4617 mlog(0, "move some of xattrs from bucket %llu to %llu\n", 4617 trace_ocfs2_divide_xattr_bucket_begin((unsigned long long)blk,
4618 (unsigned long long)blk, (unsigned long long)new_blk); 4618 (unsigned long long)new_blk);
4619 4619
4620 s_bucket = ocfs2_xattr_bucket_new(inode); 4620 s_bucket = ocfs2_xattr_bucket_new(inode);
4621 t_bucket = ocfs2_xattr_bucket_new(inode); 4621 t_bucket = ocfs2_xattr_bucket_new(inode);
@@ -4714,9 +4714,9 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4714 */ 4714 */
4715 xe = &xh->xh_entries[start]; 4715 xe = &xh->xh_entries[start];
4716 len = sizeof(struct ocfs2_xattr_entry) * (count - start); 4716 len = sizeof(struct ocfs2_xattr_entry) * (count - start);
4717 mlog(0, "mv xattr entry len %d from %d to %d\n", len, 4717 trace_ocfs2_divide_xattr_bucket_move(len,
4718 (int)((char *)xe - (char *)xh), 4718 (int)((char *)xe - (char *)xh),
4719 (int)((char *)xh->xh_entries - (char *)xh)); 4719 (int)((char *)xh->xh_entries - (char *)xh));
4720 memmove((char *)xh->xh_entries, (char *)xe, len); 4720 memmove((char *)xh->xh_entries, (char *)xe, len);
4721 xe = &xh->xh_entries[count - start]; 4721 xe = &xh->xh_entries[count - start];
4722 len = sizeof(struct ocfs2_xattr_entry) * start; 4722 len = sizeof(struct ocfs2_xattr_entry) * start;
@@ -4788,9 +4788,9 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
4788 4788
4789 BUG_ON(s_blkno == t_blkno); 4789 BUG_ON(s_blkno == t_blkno);
4790 4790
4791 mlog(0, "cp bucket %llu to %llu, target is %d\n", 4791 trace_ocfs2_cp_xattr_bucket((unsigned long long)s_blkno,
4792 (unsigned long long)s_blkno, (unsigned long long)t_blkno, 4792 (unsigned long long)t_blkno,
4793 t_is_new); 4793 t_is_new);
4794 4794
4795 s_bucket = ocfs2_xattr_bucket_new(inode); 4795 s_bucket = ocfs2_xattr_bucket_new(inode);
4796 t_bucket = ocfs2_xattr_bucket_new(inode); 4796 t_bucket = ocfs2_xattr_bucket_new(inode);
@@ -4862,8 +4862,8 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
4862 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); 4862 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
4863 struct ocfs2_xattr_bucket *old_first, *new_first; 4863 struct ocfs2_xattr_bucket *old_first, *new_first;
4864 4864
4865 mlog(0, "mv xattrs from cluster %llu to %llu\n", 4865 trace_ocfs2_mv_xattr_buckets((unsigned long long)last_blk,
4866 (unsigned long long)last_blk, (unsigned long long)to_blk); 4866 (unsigned long long)to_blk);
4867 4867
4868 BUG_ON(start_bucket >= num_buckets); 4868 BUG_ON(start_bucket >= num_buckets);
4869 if (start_bucket) { 4869 if (start_bucket) {
@@ -5013,9 +5013,9 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
5013{ 5013{
5014 int ret; 5014 int ret;
5015 5015
5016 mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n", 5016 trace_ocfs2_adjust_xattr_cross_cluster(
5017 (unsigned long long)bucket_blkno(first), prev_clusters, 5017 (unsigned long long)bucket_blkno(first),
5018 (unsigned long long)new_blk); 5018 (unsigned long long)new_blk, prev_clusters);
5019 5019
5020 if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) { 5020 if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
5021 ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, 5021 ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
@@ -5088,10 +5088,10 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
5088 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 5088 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5089 struct ocfs2_extent_tree et; 5089 struct ocfs2_extent_tree et;
5090 5090
5091 mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, " 5091 trace_ocfs2_add_new_xattr_cluster_begin(
5092 "previous xattr blkno = %llu\n", 5092 (unsigned long long)OCFS2_I(inode)->ip_blkno,
5093 (unsigned long long)OCFS2_I(inode)->ip_blkno, 5093 (unsigned long long)bucket_blkno(first),
5094 prev_cpos, (unsigned long long)bucket_blkno(first)); 5094 prev_cpos, prev_clusters);
5095 5095
5096 ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh); 5096 ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
5097 5097
@@ -5113,8 +5113,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
5113 BUG_ON(num_bits > clusters_to_add); 5113 BUG_ON(num_bits > clusters_to_add);
5114 5114
5115 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 5115 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
5116 mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n", 5116 trace_ocfs2_add_new_xattr_cluster((unsigned long long)block, num_bits);
5117 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
5118 5117
5119 if (bucket_blkno(first) + (prev_clusters * bpc) == block && 5118 if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
5120 (prev_clusters + num_bits) << osb->s_clustersize_bits <= 5119 (prev_clusters + num_bits) << osb->s_clustersize_bits <=
@@ -5130,8 +5129,6 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
5130 */ 5129 */
5131 v_start = prev_cpos + prev_clusters; 5130 v_start = prev_cpos + prev_clusters;
5132 *num_clusters = prev_clusters + num_bits; 5131 *num_clusters = prev_clusters + num_bits;
5133 mlog(0, "Add contiguous %u clusters to previous extent rec.\n",
5134 num_bits);
5135 } else { 5132 } else {
5136 ret = ocfs2_adjust_xattr_cross_cluster(inode, 5133 ret = ocfs2_adjust_xattr_cross_cluster(inode,
5137 handle, 5134 handle,
@@ -5147,8 +5144,8 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
5147 } 5144 }
5148 } 5145 }
5149 5146
5150 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", 5147 trace_ocfs2_add_new_xattr_cluster_insert((unsigned long long)block,
5151 num_bits, (unsigned long long)block, v_start); 5148 v_start, num_bits);
5152 ret = ocfs2_insert_extent(handle, &et, v_start, block, 5149 ret = ocfs2_insert_extent(handle, &et, v_start, block,
5153 num_bits, 0, ctxt->meta_ac); 5150 num_bits, 0, ctxt->meta_ac);
5154 if (ret < 0) { 5151 if (ret < 0) {
@@ -5183,9 +5180,9 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
5183 u64 end_blk; 5180 u64 end_blk;
5184 u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets); 5181 u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
5185 5182
5186 mlog(0, "extend xattr bucket in %llu, xattr extend rec starting " 5183 trace_ocfs2_extend_xattr_bucket((unsigned long long)target_blk,
5187 "from %llu, len = %u\n", (unsigned long long)target_blk, 5184 (unsigned long long)bucket_blkno(first),
5188 (unsigned long long)bucket_blkno(first), num_clusters); 5185 num_clusters, new_bucket);
5189 5186
5190 /* The extent must have room for an additional bucket */ 5187 /* The extent must have room for an additional bucket */
5191 BUG_ON(new_bucket >= 5188 BUG_ON(new_bucket >=
@@ -5265,8 +5262,8 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
5265 /* The bucket at the front of the extent */ 5262 /* The bucket at the front of the extent */
5266 struct ocfs2_xattr_bucket *first; 5263 struct ocfs2_xattr_bucket *first;
5267 5264
5268 mlog(0, "Add new xattr bucket starting from %llu\n", 5265 trace_ocfs2_add_new_xattr_bucket(
5269 (unsigned long long)bucket_blkno(target)); 5266 (unsigned long long)bucket_blkno(target));
5270 5267
5271 /* The first bucket of the original extent */ 5268 /* The first bucket of the original extent */
5272 first = ocfs2_xattr_bucket_new(inode); 5269 first = ocfs2_xattr_bucket_new(inode);
@@ -5382,8 +5379,8 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
5382 * modified something. We have to assume they did, and dirty 5379 * modified something. We have to assume they did, and dirty
5383 * the whole bucket. This leaves us in a consistent state. 5380 * the whole bucket. This leaves us in a consistent state.
5384 */ 5381 */
5385 mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n", 5382 trace_ocfs2_xattr_bucket_value_truncate(
5386 xe_off, (unsigned long long)bucket_blkno(bucket), len); 5383 (unsigned long long)bucket_blkno(bucket), xe_off, len);
5387 ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt); 5384 ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
5388 if (ret) { 5385 if (ret) {
5389 mlog_errno(ret); 5386 mlog_errno(ret);
@@ -5433,8 +5430,9 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
5433 5430
5434 ocfs2_init_dealloc_ctxt(&dealloc); 5431 ocfs2_init_dealloc_ctxt(&dealloc);
5435 5432
5436 mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n", 5433 trace_ocfs2_rm_xattr_cluster(
5437 cpos, len, (unsigned long long)blkno); 5434 (unsigned long long)OCFS2_I(inode)->ip_blkno,
5435 (unsigned long long)blkno, cpos, len);
5438 5436
5439 ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno, 5437 ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno,
5440 len); 5438 len);
@@ -5538,7 +5536,7 @@ static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
5538 int ret; 5536 int ret;
5539 struct ocfs2_xa_loc loc; 5537 struct ocfs2_xa_loc loc;
5540 5538
5541 mlog_entry("Set xattr %s in xattr bucket\n", xi->xi_name); 5539 trace_ocfs2_xattr_set_entry_bucket(xi->xi_name);
5542 5540
5543 ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket, 5541 ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
5544 xs->not_found ? NULL : xs->here); 5542 xs->not_found ? NULL : xs->here);
@@ -5570,7 +5568,6 @@ static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
5570 5568
5571 5569
5572out: 5570out:
5573 mlog_exit(ret);
5574 return ret; 5571 return ret;
5575} 5572}
5576 5573
@@ -5581,7 +5578,7 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
5581{ 5578{
5582 int ret; 5579 int ret;
5583 5580
5584 mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name); 5581 trace_ocfs2_xattr_set_entry_index_block(xi->xi_name);
5585 5582
5586 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt); 5583 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
5587 if (!ret) 5584 if (!ret)
@@ -5637,7 +5634,6 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
5637 mlog_errno(ret); 5634 mlog_errno(ret);
5638 5635
5639out: 5636out:
5640 mlog_exit(ret);
5641 return ret; 5637 return ret;
5642} 5638}
5643 5639
@@ -6041,9 +6037,9 @@ static int ocfs2_xattr_bucket_value_refcount(struct inode *inode,
6041 if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb))) 6037 if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)))
6042 p = &refcount; 6038 p = &refcount;
6043 6039
6044 mlog(0, "refcount bucket %llu, count = %u\n", 6040 trace_ocfs2_xattr_bucket_value_refcount(
6045 (unsigned long long)bucket_blkno(bucket), 6041 (unsigned long long)bucket_blkno(bucket),
6046 le16_to_cpu(xh->xh_count)); 6042 le16_to_cpu(xh->xh_count));
6047 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { 6043 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
6048 xe = &xh->xh_entries[i]; 6044 xe = &xh->xh_entries[i];
6049 6045
@@ -6339,8 +6335,8 @@ static int ocfs2_reflink_xattr_header(handle_t *handle,
6339 u32 clusters, cpos, p_cluster, num_clusters; 6335 u32 clusters, cpos, p_cluster, num_clusters;
6340 unsigned int ext_flags = 0; 6336 unsigned int ext_flags = 0;
6341 6337
6342 mlog(0, "reflink xattr in container %llu, count = %u\n", 6338 trace_ocfs2_reflink_xattr_header((unsigned long long)old_bh->b_blocknr,
6343 (unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count)); 6339 le16_to_cpu(xh->xh_count));
6344 6340
6345 last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)]; 6341 last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)];
6346 for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) { 6342 for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) {
@@ -6540,8 +6536,8 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6540 goto out; 6536 goto out;
6541 } 6537 }
6542 6538
6543 mlog(0, "create new xattr block for inode %llu, index = %d\n", 6539 trace_ocfs2_create_empty_xattr_block(
6544 (unsigned long long)fe_bh->b_blocknr, indexed); 6540 (unsigned long long)fe_bh->b_blocknr, indexed);
6545 ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed, 6541 ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed,
6546 ret_bh); 6542 ret_bh);
6547 if (ret) 6543 if (ret)
@@ -6952,8 +6948,8 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
6952 if (ret) 6948 if (ret)
6953 mlog_errno(ret); 6949 mlog_errno(ret);
6954 6950
6955 mlog(0, "insert new xattr extent rec start %llu len %u to %u\n", 6951 trace_ocfs2_reflink_xattr_buckets((unsigned long long)new_blkno,
6956 (unsigned long long)new_blkno, num_clusters, reflink_cpos); 6952 num_clusters, reflink_cpos);
6957 6953
6958 len -= num_clusters; 6954 len -= num_clusters;
6959 blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); 6955 blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
@@ -6982,8 +6978,7 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
6982 struct ocfs2_alloc_context *data_ac = NULL; 6978 struct ocfs2_alloc_context *data_ac = NULL;
6983 struct ocfs2_extent_tree et; 6979 struct ocfs2_extent_tree et;
6984 6980
6985 mlog(0, "reflink xattr buckets %llu len %u\n", 6981 trace_ocfs2_reflink_xattr_rec((unsigned long long)blkno, len);
6986 (unsigned long long)blkno, len);
6987 6982
6988 ocfs2_init_xattr_tree_extent_tree(&et, 6983 ocfs2_init_xattr_tree_extent_tree(&et,
6989 INODE_CACHE(args->reflink->new_inode), 6984 INODE_CACHE(args->reflink->new_inode),
@@ -7185,7 +7180,8 @@ out:
 7185 * must not hold any lock except i_mutex. 7180 * must not hold any lock except i_mutex.
7186 */ 7181 */
7187int ocfs2_init_security_and_acl(struct inode *dir, 7182int ocfs2_init_security_and_acl(struct inode *dir,
7188 struct inode *inode) 7183 struct inode *inode,
7184 const struct qstr *qstr)
7189{ 7185{
7190 int ret = 0; 7186 int ret = 0;
7191 struct buffer_head *dir_bh = NULL; 7187 struct buffer_head *dir_bh = NULL;
@@ -7193,7 +7189,7 @@ int ocfs2_init_security_and_acl(struct inode *dir,
7193 .enable = 1, 7189 .enable = 1,
7194 }; 7190 };
7195 7191
7196 ret = ocfs2_init_security_get(inode, dir, &si); 7192 ret = ocfs2_init_security_get(inode, dir, qstr, &si);
7197 if (!ret) { 7193 if (!ret) {
7198 ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, 7194 ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
7199 si.name, si.value, si.value_len, 7195 si.name, si.value, si.value_len,
@@ -7261,13 +7257,14 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
7261 7257
7262int ocfs2_init_security_get(struct inode *inode, 7258int ocfs2_init_security_get(struct inode *inode,
7263 struct inode *dir, 7259 struct inode *dir,
7260 const struct qstr *qstr,
7264 struct ocfs2_security_xattr_info *si) 7261 struct ocfs2_security_xattr_info *si)
7265{ 7262{
7266 /* check whether ocfs2 support feature xattr */ 7263 /* check whether ocfs2 support feature xattr */
7267 if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) 7264 if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
7268 return -EOPNOTSUPP; 7265 return -EOPNOTSUPP;
7269 return security_inode_init_security(inode, dir, &si->name, &si->value, 7266 return security_inode_init_security(inode, dir, qstr, &si->name,
7270 &si->value_len); 7267 &si->value, &si->value_len);
7271} 7268}
7272 7269
7273int ocfs2_init_security_set(handle_t *handle, 7270int ocfs2_init_security_set(handle_t *handle,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index aa64bb37a65b..d63cfb72316b 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -57,6 +57,7 @@ int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
57 struct ocfs2_dinode *di); 57 struct ocfs2_dinode *di);
58int ocfs2_xattr_remove(struct inode *, struct buffer_head *); 58int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
59int ocfs2_init_security_get(struct inode *, struct inode *, 59int ocfs2_init_security_get(struct inode *, struct inode *,
60 const struct qstr *,
60 struct ocfs2_security_xattr_info *); 61 struct ocfs2_security_xattr_info *);
61int ocfs2_init_security_set(handle_t *, struct inode *, 62int ocfs2_init_security_set(handle_t *, struct inode *,
62 struct buffer_head *, 63 struct buffer_head *,
@@ -94,5 +95,6 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
94 struct buffer_head *new_bh, 95 struct buffer_head *new_bh,
95 bool preserve_security); 96 bool preserve_security);
96int ocfs2_init_security_and_acl(struct inode *dir, 97int ocfs2_init_security_and_acl(struct inode *dir,
97 struct inode *inode); 98 struct inode *inode,
99 const struct qstr *qstr);
98#endif /* OCFS2_XATTR_H */ 100#endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 393f3f659da7..de4ff29f1e05 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -235,33 +235,22 @@ static int omfs_dir_is_empty(struct inode *inode)
235 return *ptr != ~0; 235 return *ptr != ~0;
236} 236}
237 237
238static int omfs_unlink(struct inode *dir, struct dentry *dentry) 238static int omfs_remove(struct inode *dir, struct dentry *dentry)
239{ 239{
240 int ret;
241 struct inode *inode = dentry->d_inode; 240 struct inode *inode = dentry->d_inode;
241 int ret;
242
243 if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode))
244 return -ENOTEMPTY;
242 245
243 ret = omfs_delete_entry(dentry); 246 ret = omfs_delete_entry(dentry);
244 if (ret) 247 if (ret)
245 goto end_unlink; 248 return ret;
246 249
247 inode_dec_link_count(inode); 250 clear_nlink(inode);
251 mark_inode_dirty(inode);
248 mark_inode_dirty(dir); 252 mark_inode_dirty(dir);
249 253 return 0;
250end_unlink:
251 return ret;
252}
253
254static int omfs_rmdir(struct inode *dir, struct dentry *dentry)
255{
256 int err = -ENOTEMPTY;
257 struct inode *inode = dentry->d_inode;
258
259 if (omfs_dir_is_empty(inode)) {
260 err = omfs_unlink(dir, dentry);
261 if (!err)
262 inode_dec_link_count(inode);
263 }
264 return err;
265} 254}
266 255
267static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode) 256static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode)
@@ -372,9 +361,10 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
372 361
373 res = filldir(dirent, oi->i_name, strnlen(oi->i_name, 362 res = filldir(dirent, oi->i_name, strnlen(oi->i_name,
374 OMFS_NAMELEN), filp->f_pos, self, d_type); 363 OMFS_NAMELEN), filp->f_pos, self, d_type);
375 if (res == 0)
376 filp->f_pos++;
377 brelse(bh); 364 brelse(bh);
365 if (res < 0)
366 break;
367 filp->f_pos++;
378 } 368 }
379out: 369out:
380 return res; 370 return res;
@@ -385,44 +375,28 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
385{ 375{
386 struct inode *new_inode = new_dentry->d_inode; 376 struct inode *new_inode = new_dentry->d_inode;
387 struct inode *old_inode = old_dentry->d_inode; 377 struct inode *old_inode = old_dentry->d_inode;
388 struct buffer_head *bh;
389 int is_dir;
390 int err; 378 int err;
391 379
392 is_dir = S_ISDIR(old_inode->i_mode);
393
394 if (new_inode) { 380 if (new_inode) {
395 /* overwriting existing file/dir */ 381 /* overwriting existing file/dir */
396 err = -ENOTEMPTY; 382 err = omfs_remove(new_dir, new_dentry);
397 if (is_dir && !omfs_dir_is_empty(new_inode))
398 goto out;
399
400 err = -ENOENT;
401 bh = omfs_find_entry(new_dir, new_dentry->d_name.name,
402 new_dentry->d_name.len);
403 if (IS_ERR(bh))
404 goto out;
405 brelse(bh);
406
407 err = omfs_unlink(new_dir, new_dentry);
408 if (err) 383 if (err)
409 goto out; 384 goto out;
410 } 385 }
411 386
412 /* since omfs locates files by name, we need to unlink _before_ 387 /* since omfs locates files by name, we need to unlink _before_
413 * adding the new link or we won't find the old one */ 388 * adding the new link or we won't find the old one */
414 inode_inc_link_count(old_inode); 389 err = omfs_delete_entry(old_dentry);
415 err = omfs_unlink(old_dir, old_dentry); 390 if (err)
416 if (err) {
417 inode_dec_link_count(old_inode);
418 goto out; 391 goto out;
419 }
420 392
393 mark_inode_dirty(old_dir);
421 err = omfs_add_link(new_dentry, old_inode); 394 err = omfs_add_link(new_dentry, old_inode);
422 if (err) 395 if (err)
423 goto out; 396 goto out;
424 397
425 old_inode->i_ctime = CURRENT_TIME_SEC; 398 old_inode->i_ctime = CURRENT_TIME_SEC;
399 mark_inode_dirty(old_inode);
426out: 400out:
427 return err; 401 return err;
428} 402}
@@ -488,8 +462,8 @@ const struct inode_operations omfs_dir_inops = {
488 .mkdir = omfs_mkdir, 462 .mkdir = omfs_mkdir,
489 .rename = omfs_rename, 463 .rename = omfs_rename,
490 .create = omfs_create, 464 .create = omfs_create,
491 .unlink = omfs_unlink, 465 .unlink = omfs_remove,
492 .rmdir = omfs_rmdir, 466 .rmdir = omfs_remove,
493}; 467};
494 468
495const struct file_operations omfs_dir_operations = { 469const struct file_operations omfs_dir_operations = {
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 8a6d34fa668a..d738a7e493dd 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -372,7 +372,6 @@ const struct address_space_operations omfs_aops = {
372 .readpages = omfs_readpages, 372 .readpages = omfs_readpages,
373 .writepage = omfs_writepage, 373 .writepage = omfs_writepage,
374 .writepages = omfs_writepages, 374 .writepages = omfs_writepages,
375 .sync_page = block_sync_page,
376 .write_begin = omfs_write_begin, 375 .write_begin = omfs_write_begin,
377 .write_end = generic_write_end, 376 .write_end = generic_write_end,
378 .bmap = omfs_bmap, 377 .bmap = omfs_bmap,
diff --git a/fs/open.c b/fs/open.c
index e52389e1f05b..b52cf013ffa1 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -233,6 +233,14 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
233 233
234 if (!(file->f_mode & FMODE_WRITE)) 234 if (!(file->f_mode & FMODE_WRITE))
235 return -EBADF; 235 return -EBADF;
236
237 /* It's not possible punch hole on append only file */
238 if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode))
239 return -EPERM;
240
241 if (IS_IMMUTABLE(inode))
242 return -EPERM;
243
236 /* 244 /*
237 * Revalidate the write permissions, in case security policy has 245 * Revalidate the write permissions, in case security policy has
238 * changed since the files were opened. 246 * changed since the files were opened.
@@ -565,13 +573,15 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
565{ 573{
566 struct path path; 574 struct path path;
567 int error = -EINVAL; 575 int error = -EINVAL;
568 int follow; 576 int lookup_flags;
569 577
570 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) 578 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
571 goto out; 579 goto out;
572 580
573 follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; 581 lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
574 error = user_path_at(dfd, filename, follow, &path); 582 if (flag & AT_EMPTY_PATH)
583 lookup_flags |= LOOKUP_EMPTY;
584 error = user_path_at(dfd, filename, lookup_flags, &path);
575 if (error) 585 if (error)
576 goto out; 586 goto out;
577 error = mnt_want_write(path.mnt); 587 error = mnt_want_write(path.mnt);
@@ -661,11 +671,16 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
661 int (*open)(struct inode *, struct file *), 671 int (*open)(struct inode *, struct file *),
662 const struct cred *cred) 672 const struct cred *cred)
663{ 673{
674 static const struct file_operations empty_fops = {};
664 struct inode *inode; 675 struct inode *inode;
665 int error; 676 int error;
666 677
667 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | 678 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
668 FMODE_PREAD | FMODE_PWRITE; 679 FMODE_PREAD | FMODE_PWRITE;
680
681 if (unlikely(f->f_flags & O_PATH))
682 f->f_mode = FMODE_PATH;
683
669 inode = dentry->d_inode; 684 inode = dentry->d_inode;
670 if (f->f_mode & FMODE_WRITE) { 685 if (f->f_mode & FMODE_WRITE) {
671 error = __get_file_write_access(inode, mnt); 686 error = __get_file_write_access(inode, mnt);
@@ -679,9 +694,15 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
679 f->f_path.dentry = dentry; 694 f->f_path.dentry = dentry;
680 f->f_path.mnt = mnt; 695 f->f_path.mnt = mnt;
681 f->f_pos = 0; 696 f->f_pos = 0;
682 f->f_op = fops_get(inode->i_fop);
683 file_sb_list_add(f, inode->i_sb); 697 file_sb_list_add(f, inode->i_sb);
684 698
699 if (unlikely(f->f_mode & FMODE_PATH)) {
700 f->f_op = &empty_fops;
701 return f;
702 }
703
704 f->f_op = fops_get(inode->i_fop);
705
685 error = security_dentry_open(f, cred); 706 error = security_dentry_open(f, cred);
686 if (error) 707 if (error)
687 goto cleanup_all; 708 goto cleanup_all;
@@ -693,7 +714,8 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
693 if (error) 714 if (error)
694 goto cleanup_all; 715 goto cleanup_all;
695 } 716 }
696 ima_counts_get(f); 717 if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
718 i_readcount_inc(inode);
697 719
698 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); 720 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
699 721
@@ -790,6 +812,8 @@ struct file *nameidata_to_filp(struct nameidata *nd)
790 812
791 /* Pick up the filp from the open intent */ 813 /* Pick up the filp from the open intent */
792 filp = nd->intent.open.file; 814 filp = nd->intent.open.file;
815 nd->intent.open.file = NULL;
816
793 /* Has the filesystem initialised the file for us? */ 817 /* Has the filesystem initialised the file for us? */
794 if (filp->f_path.dentry == NULL) { 818 if (filp->f_path.dentry == NULL) {
795 path_get(&nd->path); 819 path_get(&nd->path);
@@ -811,17 +835,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
811 835
812 validate_creds(cred); 836 validate_creds(cred);
813 837
814 /* 838 /* We must always pass in a valid mount pointer. */
815 * We must always pass in a valid mount pointer. Historically 839 BUG_ON(!mnt);
816 * callers got away with not passing it, but we must enforce this at
817 * the earliest possible point now to avoid strange problems deep in the
818 * filesystem stack.
819 */
820 if (!mnt) {
821 printk(KERN_WARNING "%s called with NULL vfsmount\n", __func__);
822 dump_stack();
823 return ERR_PTR(-EINVAL);
824 }
825 840
826 error = -ENFILE; 841 error = -ENFILE;
827 f = get_empty_filp(); 842 f = get_empty_filp();
@@ -880,15 +895,110 @@ void fd_install(unsigned int fd, struct file *file)
880 895
881EXPORT_SYMBOL(fd_install); 896EXPORT_SYMBOL(fd_install);
882 897
898static inline int build_open_flags(int flags, int mode, struct open_flags *op)
899{
900 int lookup_flags = 0;
901 int acc_mode;
902
903 if (!(flags & O_CREAT))
904 mode = 0;
905 op->mode = mode;
906
907 /* Must never be set by userspace */
908 flags &= ~FMODE_NONOTIFY;
909
910 /*
911 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
 912 * check for O_DSYNC if they need any syncing at all we enforce it's
913 * always set instead of having to deal with possibly weird behaviour
914 * for malicious applications setting only __O_SYNC.
915 */
916 if (flags & __O_SYNC)
917 flags |= O_DSYNC;
918
919 /*
 920 * If we have O_PATH in the open flag, then we
921 * cannot have anything other than the below set of flags
922 */
923 if (flags & O_PATH) {
924 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
925 acc_mode = 0;
926 } else {
927 acc_mode = MAY_OPEN | ACC_MODE(flags);
928 }
929
930 op->open_flag = flags;
931
932 /* O_TRUNC implies we need access checks for write permissions */
933 if (flags & O_TRUNC)
934 acc_mode |= MAY_WRITE;
935
936 /* Allow the LSM permission hook to distinguish append
937 access from general write access. */
938 if (flags & O_APPEND)
939 acc_mode |= MAY_APPEND;
940
941 op->acc_mode = acc_mode;
942
943 op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
944
945 if (flags & O_CREAT) {
946 op->intent |= LOOKUP_CREATE;
947 if (flags & O_EXCL)
948 op->intent |= LOOKUP_EXCL;
949 }
950
951 if (flags & O_DIRECTORY)
952 lookup_flags |= LOOKUP_DIRECTORY;
953 if (!(flags & O_NOFOLLOW))
954 lookup_flags |= LOOKUP_FOLLOW;
955 return lookup_flags;
956}
957
958/**
959 * filp_open - open file and return file pointer
960 *
961 * @filename: path to open
962 * @flags: open flags as per the open(2) second argument
963 * @mode: mode for the new file if O_CREAT is set, else ignored
964 *
965 * This is the helper to open a file from kernelspace if you really
 966 * have to. But in general you should not do this, so please move
967 * along, nothing to see here..
968 */
969struct file *filp_open(const char *filename, int flags, int mode)
970{
971 struct open_flags op;
972 int lookup = build_open_flags(flags, mode, &op);
973 return do_filp_open(AT_FDCWD, filename, &op, lookup);
974}
975EXPORT_SYMBOL(filp_open);
976
977struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
978 const char *filename, int flags)
979{
980 struct open_flags op;
981 int lookup = build_open_flags(flags, 0, &op);
982 if (flags & O_CREAT)
983 return ERR_PTR(-EINVAL);
984 if (!filename && (flags & O_DIRECTORY))
985 if (!dentry->d_inode->i_op->lookup)
986 return ERR_PTR(-ENOTDIR);
987 return do_file_open_root(dentry, mnt, filename, &op, lookup);
988}
989EXPORT_SYMBOL(file_open_root);
990
883long do_sys_open(int dfd, const char __user *filename, int flags, int mode) 991long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
884{ 992{
993 struct open_flags op;
994 int lookup = build_open_flags(flags, mode, &op);
885 char *tmp = getname(filename); 995 char *tmp = getname(filename);
886 int fd = PTR_ERR(tmp); 996 int fd = PTR_ERR(tmp);
887 997
888 if (!IS_ERR(tmp)) { 998 if (!IS_ERR(tmp)) {
889 fd = get_unused_fd_flags(flags); 999 fd = get_unused_fd_flags(flags);
890 if (fd >= 0) { 1000 if (fd >= 0) {
891 struct file *f = do_filp_open(dfd, tmp, flags, mode, 0); 1001 struct file *f = do_filp_open(dfd, tmp, &op, lookup);
892 if (IS_ERR(f)) { 1002 if (IS_ERR(f)) {
893 put_unused_fd(fd); 1003 put_unused_fd(fd);
894 fd = PTR_ERR(f); 1004 fd = PTR_ERR(f);
@@ -958,8 +1068,10 @@ int filp_close(struct file *filp, fl_owner_t id)
958 if (filp->f_op && filp->f_op->flush) 1068 if (filp->f_op && filp->f_op->flush)
959 retval = filp->f_op->flush(filp, id); 1069 retval = filp->f_op->flush(filp, id);
960 1070
961 dnotify_flush(filp, id); 1071 if (likely(!(filp->f_mode & FMODE_PATH))) {
962 locks_remove_posix(filp, id); 1072 dnotify_flush(filp, id);
1073 locks_remove_posix(filp, id);
1074 }
963 fput(filp); 1075 fput(filp);
964 return retval; 1076 return retval;
965} 1077}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 9c21119512b9..d545e97d99c3 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -290,7 +290,8 @@ ssize_t part_inflight_show(struct device *dev,
290{ 290{
291 struct hd_struct *p = dev_to_part(dev); 291 struct hd_struct *p = dev_to_part(dev);
292 292
293 return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]); 293 return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
294 atomic_read(&p->in_flight[1]));
294} 295}
295 296
296#ifdef CONFIG_FAIL_MAKE_REQUEST 297#ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -499,7 +500,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
499 /* everything is up and running, commence */ 500 /* everything is up and running, commence */
500 rcu_assign_pointer(ptbl->part[partno], p); 501 rcu_assign_pointer(ptbl->part[partno], p);
501 502
502 /* suppress uevent if the disk supresses it */ 503 /* suppress uevent if the disk suppresses it */
503 if (!dev_get_uevent_suppress(ddev)) 504 if (!dev_get_uevent_suppress(ddev))
504 kobject_uevent(&pdev->kobj, KOBJ_ADD); 505 kobject_uevent(&pdev->kobj, KOBJ_ADD);
505 506
@@ -584,7 +585,7 @@ rescan:
584 /* 585 /*
585 * If any partition code tried to read beyond EOD, try 586 * If any partition code tried to read beyond EOD, try
586 * unlocking native capacity even if partition table is 587 * unlocking native capacity even if partition table is
587 * sucessfully read as we could be missing some partitions. 588 * successfully read as we could be missing some partitions.
588 */ 589 */
589 if (state->access_beyond_eod) { 590 if (state->access_beyond_eod) {
590 printk(KERN_WARNING 591 printk(KERN_WARNING
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 789c625c7aa5..ce4f62440425 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -251,6 +251,11 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
251 } 251 }
252 252
253 vm->vblk_size = get_unaligned_be32(data + 0x08); 253 vm->vblk_size = get_unaligned_be32(data + 0x08);
254 if (vm->vblk_size == 0) {
255 ldm_error ("Illegal VBLK size");
256 return false;
257 }
258
254 vm->vblk_offset = get_unaligned_be32(data + 0x0C); 259 vm->vblk_offset = get_unaligned_be32(data + 0x0C);
255 vm->last_vblk_seq = get_unaligned_be32(data + 0x04); 260 vm->last_vblk_seq = get_unaligned_be32(data + 0x04);
256 261
@@ -1294,6 +1299,11 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags)
1294 1299
1295 BUG_ON (!data || !frags); 1300 BUG_ON (!data || !frags);
1296 1301
1302 if (size < 2 * VBLK_SIZE_HEAD) {
 1303 ldm_error("Value of size is too small.");
1304 return false;
1305 }
1306
1297 group = get_unaligned_be32(data + 0x08); 1307 group = get_unaligned_be32(data + 0x08);
1298 rec = get_unaligned_be16(data + 0x0C); 1308 rec = get_unaligned_be16(data + 0x0C);
1299 num = get_unaligned_be16(data + 0x0E); 1309 num = get_unaligned_be16(data + 0x0E);
@@ -1301,6 +1311,10 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags)
1301 ldm_error ("A VBLK claims to have %d parts.", num); 1311 ldm_error ("A VBLK claims to have %d parts.", num);
1302 return false; 1312 return false;
1303 } 1313 }
1314 if (rec >= num) {
1315 ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num);
1316 return false;
1317 }
1304 1318
1305 list_for_each (item, frags) { 1319 list_for_each (item, frags) {
1306 f = list_entry (item, struct frag, list); 1320 f = list_entry (item, struct frag, list);
@@ -1329,10 +1343,9 @@ found:
1329 1343
1330 f->map |= (1 << rec); 1344 f->map |= (1 << rec);
1331 1345
1332 if (num > 0) { 1346 data += VBLK_SIZE_HEAD;
1333 data += VBLK_SIZE_HEAD; 1347 size -= VBLK_SIZE_HEAD;
1334 size -= VBLK_SIZE_HEAD; 1348
1335 }
1336 memcpy (f->data+rec*(size-VBLK_SIZE_HEAD)+VBLK_SIZE_HEAD, data, size); 1349 memcpy (f->data+rec*(size-VBLK_SIZE_HEAD)+VBLK_SIZE_HEAD, data, size);
1337 1350
1338 return true; 1351 return true;
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index 68d6a216ee79..11f688bd76c5 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -29,10 +29,9 @@ static inline void mac_fix_string(char *stg, int len)
29 29
30int mac_partition(struct parsed_partitions *state) 30int mac_partition(struct parsed_partitions *state)
31{ 31{
32 int slot = 1;
33 Sector sect; 32 Sector sect;
34 unsigned char *data; 33 unsigned char *data;
35 int blk, blocks_in_map; 34 int slot, blocks_in_map;
36 unsigned secsize; 35 unsigned secsize;
37#ifdef CONFIG_PPC_PMAC 36#ifdef CONFIG_PPC_PMAC
38 int found_root = 0; 37 int found_root = 0;
@@ -59,10 +58,14 @@ int mac_partition(struct parsed_partitions *state)
59 put_dev_sector(sect); 58 put_dev_sector(sect);
60 return 0; /* not a MacOS disk */ 59 return 0; /* not a MacOS disk */
61 } 60 }
62 strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
63 blocks_in_map = be32_to_cpu(part->map_count); 61 blocks_in_map = be32_to_cpu(part->map_count);
64 for (blk = 1; blk <= blocks_in_map; ++blk) { 62 if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) {
65 int pos = blk * secsize; 63 put_dev_sector(sect);
64 return 0;
65 }
66 strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
67 for (slot = 1; slot <= blocks_in_map; ++slot) {
68 int pos = slot * secsize;
66 put_dev_sector(sect); 69 put_dev_sector(sect);
67 data = read_part_sector(state, pos/512, &sect); 70 data = read_part_sector(state, pos/512, &sect);
68 if (!data) 71 if (!data)
@@ -113,13 +116,11 @@ int mac_partition(struct parsed_partitions *state)
113 } 116 }
114 117
115 if (goodness > found_root_goodness) { 118 if (goodness > found_root_goodness) {
116 found_root = blk; 119 found_root = slot;
117 found_root_goodness = goodness; 120 found_root_goodness = goodness;
118 } 121 }
119 } 122 }
120#endif /* CONFIG_PPC_PMAC */ 123#endif /* CONFIG_PPC_PMAC */
121
122 ++slot;
123 } 124 }
124#ifdef CONFIG_PPC_PMAC 125#ifdef CONFIG_PPC_PMAC
125 if (found_root_goodness) 126 if (found_root_goodness)
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index 48cec7cbca17..764b86a01965 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,10 +10,13 @@
10#include "check.h" 10#include "check.h"
11#include "osf.h" 11#include "osf.h"
12 12
13#define MAX_OSF_PARTITIONS 18
14
13int osf_partition(struct parsed_partitions *state) 15int osf_partition(struct parsed_partitions *state)
14{ 16{
15 int i; 17 int i;
16 int slot = 1; 18 int slot = 1;
19 unsigned int npartitions;
17 Sector sect; 20 Sector sect;
18 unsigned char *data; 21 unsigned char *data;
19 struct disklabel { 22 struct disklabel {
@@ -45,7 +48,7 @@ int osf_partition(struct parsed_partitions *state)
45 u8 p_fstype; 48 u8 p_fstype;
46 u8 p_frag; 49 u8 p_frag;
47 __le16 p_cpg; 50 __le16 p_cpg;
48 } d_partitions[8]; 51 } d_partitions[MAX_OSF_PARTITIONS];
49 } * label; 52 } * label;
50 struct d_partition * partition; 53 struct d_partition * partition;
51 54
@@ -63,7 +66,12 @@ int osf_partition(struct parsed_partitions *state)
63 put_dev_sector(sect); 66 put_dev_sector(sect);
64 return 0; 67 return 0;
65 } 68 }
66 for (i = 0 ; i < le16_to_cpu(label->d_npartitions); i++, partition++) { 69 npartitions = le16_to_cpu(label->d_npartitions);
70 if (npartitions > MAX_OSF_PARTITIONS) {
71 put_dev_sector(sect);
72 return 0;
73 }
74 for (i = 0 ; i < npartitions; i++, partition++) {
67 if (slot == state->limit) 75 if (slot == state->limit)
68 break; 76 break;
69 if (le32_to_cpu(partition->p_size)) 77 if (le32_to_cpu(partition->p_size))
diff --git a/fs/proc/array.c b/fs/proc/array.c
index df2b703b9d0f..5e4f776b0917 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -353,9 +353,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
353 task_cap(m, task); 353 task_cap(m, task);
354 task_cpus_allowed(m, task); 354 task_cpus_allowed(m, task);
355 cpuset_task_status_allowed(m, task); 355 cpuset_task_status_allowed(m, task);
356#if defined(CONFIG_S390)
357 task_show_regs(m, task);
358#endif
359 task_context_switch_counts(m, task); 356 task_context_switch_counts(m, task);
360 return 0; 357 return 0;
361} 358}
@@ -492,8 +489,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
492 vsize, 489 vsize,
493 mm ? get_mm_rss(mm) : 0, 490 mm ? get_mm_rss(mm) : 0,
494 rsslim, 491 rsslim,
495 mm ? mm->start_code : 0, 492 mm ? (permitted ? mm->start_code : 1) : 0,
496 mm ? mm->end_code : 0, 493 mm ? (permitted ? mm->end_code : 1) : 0,
497 (permitted && mm) ? mm->start_stack : 0, 494 (permitted && mm) ? mm->start_stack : 0,
498 esp, 495 esp,
499 eip, 496 eip,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9d096e82b201..dfa532730e55 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -191,17 +191,20 @@ static int proc_root_link(struct inode *inode, struct path *path)
191 return result; 191 return result;
192} 192}
193 193
194/* 194static struct mm_struct *__check_mem_permission(struct task_struct *task)
195 * Return zero if current may access user memory in @task, -error if not.
196 */
197static int check_mem_permission(struct task_struct *task)
198{ 195{
196 struct mm_struct *mm;
197
198 mm = get_task_mm(task);
199 if (!mm)
200 return ERR_PTR(-EINVAL);
201
199 /* 202 /*
200 * A task can always look at itself, in case it chooses 203 * A task can always look at itself, in case it chooses
201 * to use system calls instead of load instructions. 204 * to use system calls instead of load instructions.
202 */ 205 */
203 if (task == current) 206 if (task == current)
204 return 0; 207 return mm;
205 208
206 /* 209 /*
207 * If current is actively ptrace'ing, and would also be 210 * If current is actively ptrace'ing, and would also be
@@ -213,27 +216,53 @@ static int check_mem_permission(struct task_struct *task)
213 match = (tracehook_tracer_task(task) == current); 216 match = (tracehook_tracer_task(task) == current);
214 rcu_read_unlock(); 217 rcu_read_unlock();
215 if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH)) 218 if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
216 return 0; 219 return mm;
217 } 220 }
218 221
219 /* 222 /*
220 * Noone else is allowed. 223 * No one else is allowed.
224 */
225 mmput(mm);
226 return ERR_PTR(-EPERM);
227}
228
229/*
230 * If current may access user memory in @task return a reference to the
231 * corresponding mm, otherwise ERR_PTR.
232 */
233static struct mm_struct *check_mem_permission(struct task_struct *task)
234{
235 struct mm_struct *mm;
236 int err;
237
238 /*
239 * Avoid racing if task exec's as we might get a new mm but validate
240 * against old credentials.
221 */ 241 */
222 return -EPERM; 242 err = mutex_lock_killable(&task->signal->cred_guard_mutex);
243 if (err)
244 return ERR_PTR(err);
245
246 mm = __check_mem_permission(task);
247 mutex_unlock(&task->signal->cred_guard_mutex);
248
249 return mm;
223} 250}
224 251
225struct mm_struct *mm_for_maps(struct task_struct *task) 252struct mm_struct *mm_for_maps(struct task_struct *task)
226{ 253{
227 struct mm_struct *mm; 254 struct mm_struct *mm;
255 int err;
228 256
229 if (mutex_lock_killable(&task->signal->cred_guard_mutex)) 257 err = mutex_lock_killable(&task->signal->cred_guard_mutex);
230 return NULL; 258 if (err)
259 return ERR_PTR(err);
231 260
232 mm = get_task_mm(task); 261 mm = get_task_mm(task);
233 if (mm && mm != current->mm && 262 if (mm && mm != current->mm &&
234 !ptrace_may_access(task, PTRACE_MODE_READ)) { 263 !ptrace_may_access(task, PTRACE_MODE_READ)) {
235 mmput(mm); 264 mmput(mm);
236 mm = NULL; 265 mm = ERR_PTR(-EACCES);
237 } 266 }
238 mutex_unlock(&task->signal->cred_guard_mutex); 267 mutex_unlock(&task->signal->cred_guard_mutex);
239 268
@@ -279,9 +308,9 @@ out:
279 308
280static int proc_pid_auxv(struct task_struct *task, char *buffer) 309static int proc_pid_auxv(struct task_struct *task, char *buffer)
281{ 310{
282 int res = 0; 311 struct mm_struct *mm = mm_for_maps(task);
283 struct mm_struct *mm = get_task_mm(task); 312 int res = PTR_ERR(mm);
284 if (mm) { 313 if (mm && !IS_ERR(mm)) {
285 unsigned int nwords = 0; 314 unsigned int nwords = 0;
286 do { 315 do {
287 nwords += 2; 316 nwords += 2;
@@ -318,6 +347,23 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
318} 347}
319#endif /* CONFIG_KALLSYMS */ 348#endif /* CONFIG_KALLSYMS */
320 349
350static int lock_trace(struct task_struct *task)
351{
352 int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
353 if (err)
354 return err;
355 if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
356 mutex_unlock(&task->signal->cred_guard_mutex);
357 return -EPERM;
358 }
359 return 0;
360}
361
362static void unlock_trace(struct task_struct *task)
363{
364 mutex_unlock(&task->signal->cred_guard_mutex);
365}
366
321#ifdef CONFIG_STACKTRACE 367#ifdef CONFIG_STACKTRACE
322 368
323#define MAX_STACK_TRACE_DEPTH 64 369#define MAX_STACK_TRACE_DEPTH 64
@@ -327,6 +373,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
327{ 373{
328 struct stack_trace trace; 374 struct stack_trace trace;
329 unsigned long *entries; 375 unsigned long *entries;
376 int err;
330 int i; 377 int i;
331 378
332 entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); 379 entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
@@ -337,15 +384,20 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
337 trace.max_entries = MAX_STACK_TRACE_DEPTH; 384 trace.max_entries = MAX_STACK_TRACE_DEPTH;
338 trace.entries = entries; 385 trace.entries = entries;
339 trace.skip = 0; 386 trace.skip = 0;
340 save_stack_trace_tsk(task, &trace);
341 387
342 for (i = 0; i < trace.nr_entries; i++) { 388 err = lock_trace(task);
343 seq_printf(m, "[<%p>] %pS\n", 389 if (!err) {
344 (void *)entries[i], (void *)entries[i]); 390 save_stack_trace_tsk(task, &trace);
391
392 for (i = 0; i < trace.nr_entries; i++) {
393 seq_printf(m, "[<%pK>] %pS\n",
394 (void *)entries[i], (void *)entries[i]);
395 }
396 unlock_trace(task);
345 } 397 }
346 kfree(entries); 398 kfree(entries);
347 399
348 return 0; 400 return err;
349} 401}
350#endif 402#endif
351 403
@@ -508,18 +560,22 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
508{ 560{
509 long nr; 561 long nr;
510 unsigned long args[6], sp, pc; 562 unsigned long args[6], sp, pc;
563 int res = lock_trace(task);
564 if (res)
565 return res;
511 566
512 if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) 567 if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
513 return sprintf(buffer, "running\n"); 568 res = sprintf(buffer, "running\n");
514 569 else if (nr < 0)
515 if (nr < 0) 570 res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
516 return sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); 571 else
517 572 res = sprintf(buffer,
518 return sprintf(buffer,
519 "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", 573 "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
520 nr, 574 nr,
521 args[0], args[1], args[2], args[3], args[4], args[5], 575 args[0], args[1], args[2], args[3], args[4], args[5],
522 sp, pc); 576 sp, pc);
577 unlock_trace(task);
578 return res;
523} 579}
524#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ 580#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
525 581
@@ -775,18 +831,14 @@ static ssize_t mem_read(struct file * file, char __user * buf,
775 if (!task) 831 if (!task)
776 goto out_no_task; 832 goto out_no_task;
777 833
778 if (check_mem_permission(task))
779 goto out;
780
781 ret = -ENOMEM; 834 ret = -ENOMEM;
782 page = (char *)__get_free_page(GFP_TEMPORARY); 835 page = (char *)__get_free_page(GFP_TEMPORARY);
783 if (!page) 836 if (!page)
784 goto out; 837 goto out;
785 838
786 ret = 0; 839 mm = check_mem_permission(task);
787 840 ret = PTR_ERR(mm);
788 mm = get_task_mm(task); 841 if (IS_ERR(mm))
789 if (!mm)
790 goto out_free; 842 goto out_free;
791 843
792 ret = -EIO; 844 ret = -EIO;
@@ -800,8 +852,8 @@ static ssize_t mem_read(struct file * file, char __user * buf,
800 int this_len, retval; 852 int this_len, retval;
801 853
802 this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; 854 this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
803 retval = access_process_vm(task, src, page, this_len, 0); 855 retval = access_remote_vm(mm, src, page, this_len, 0);
804 if (!retval || check_mem_permission(task)) { 856 if (!retval) {
805 if (!ret) 857 if (!ret)
806 ret = -EIO; 858 ret = -EIO;
807 break; 859 break;
@@ -829,10 +881,6 @@ out_no_task:
829 return ret; 881 return ret;
830} 882}
831 883
832#define mem_write NULL
833
834#ifndef mem_write
835/* This is a security hazard */
836static ssize_t mem_write(struct file * file, const char __user *buf, 884static ssize_t mem_write(struct file * file, const char __user *buf,
837 size_t count, loff_t *ppos) 885 size_t count, loff_t *ppos)
838{ 886{
@@ -840,18 +888,25 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
840 char *page; 888 char *page;
841 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 889 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
842 unsigned long dst = *ppos; 890 unsigned long dst = *ppos;
891 struct mm_struct *mm;
843 892
844 copied = -ESRCH; 893 copied = -ESRCH;
845 if (!task) 894 if (!task)
846 goto out_no_task; 895 goto out_no_task;
847 896
848 if (check_mem_permission(task)) 897 mm = check_mem_permission(task);
849 goto out; 898 copied = PTR_ERR(mm);
899 if (IS_ERR(mm))
900 goto out_task;
901
902 copied = -EIO;
903 if (file->private_data != (void *)((long)current->self_exec_id))
904 goto out_mm;
850 905
851 copied = -ENOMEM; 906 copied = -ENOMEM;
852 page = (char *)__get_free_page(GFP_TEMPORARY); 907 page = (char *)__get_free_page(GFP_TEMPORARY);
853 if (!page) 908 if (!page)
854 goto out; 909 goto out_mm;
855 910
856 copied = 0; 911 copied = 0;
857 while (count > 0) { 912 while (count > 0) {
@@ -862,7 +917,7 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
862 copied = -EFAULT; 917 copied = -EFAULT;
863 break; 918 break;
864 } 919 }
865 retval = access_process_vm(task, dst, page, this_len, 1); 920 retval = access_remote_vm(mm, dst, page, this_len, 1);
866 if (!retval) { 921 if (!retval) {
867 if (!copied) 922 if (!copied)
868 copied = -EIO; 923 copied = -EIO;
@@ -875,12 +930,13 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
875 } 930 }
876 *ppos = dst; 931 *ppos = dst;
877 free_page((unsigned long) page); 932 free_page((unsigned long) page);
878out: 933out_mm:
934 mmput(mm);
935out_task:
879 put_task_struct(task); 936 put_task_struct(task);
880out_no_task: 937out_no_task:
881 return copied; 938 return copied;
882} 939}
883#endif
884 940
885loff_t mem_lseek(struct file *file, loff_t offset, int orig) 941loff_t mem_lseek(struct file *file, loff_t offset, int orig)
886{ 942{
@@ -917,20 +973,18 @@ static ssize_t environ_read(struct file *file, char __user *buf,
917 if (!task) 973 if (!task)
918 goto out_no_task; 974 goto out_no_task;
919 975
920 if (!ptrace_may_access(task, PTRACE_MODE_READ))
921 goto out;
922
923 ret = -ENOMEM; 976 ret = -ENOMEM;
924 page = (char *)__get_free_page(GFP_TEMPORARY); 977 page = (char *)__get_free_page(GFP_TEMPORARY);
925 if (!page) 978 if (!page)
926 goto out; 979 goto out;
927 980
928 ret = 0;
929 981
930 mm = get_task_mm(task); 982 mm = mm_for_maps(task);
931 if (!mm) 983 ret = PTR_ERR(mm);
984 if (!mm || IS_ERR(mm))
932 goto out_free; 985 goto out_free;
933 986
987 ret = 0;
934 while (count > 0) { 988 while (count > 0) {
935 int this_len, retval, max_len; 989 int this_len, retval, max_len;
936 990
@@ -2620,35 +2674,6 @@ static const struct pid_entry proc_base_stuff[] = {
2620 &proc_self_inode_operations, NULL, {}), 2674 &proc_self_inode_operations, NULL, {}),
2621}; 2675};
2622 2676
2623/*
2624 * Exceptional case: normally we are not allowed to unhash a busy
2625 * directory. In this case, however, we can do it - no aliasing problems
2626 * due to the way we treat inodes.
2627 */
2628static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
2629{
2630 struct inode *inode;
2631 struct task_struct *task;
2632
2633 if (nd->flags & LOOKUP_RCU)
2634 return -ECHILD;
2635
2636 inode = dentry->d_inode;
2637 task = get_proc_task(inode);
2638 if (task) {
2639 put_task_struct(task);
2640 return 1;
2641 }
2642 d_drop(dentry);
2643 return 0;
2644}
2645
2646static const struct dentry_operations proc_base_dentry_operations =
2647{
2648 .d_revalidate = proc_base_revalidate,
2649 .d_delete = pid_delete_dentry,
2650};
2651
2652static struct dentry *proc_base_instantiate(struct inode *dir, 2677static struct dentry *proc_base_instantiate(struct inode *dir,
2653 struct dentry *dentry, struct task_struct *task, const void *ptr) 2678 struct dentry *dentry, struct task_struct *task, const void *ptr)
2654{ 2679{
@@ -2685,7 +2710,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2685 if (p->fop) 2710 if (p->fop)
2686 inode->i_fop = p->fop; 2711 inode->i_fop = p->fop;
2687 ei->op = p->op; 2712 ei->op = p->op;
2688 d_set_d_op(dentry, &proc_base_dentry_operations);
2689 d_add(dentry, inode); 2713 d_add(dentry, inode);
2690 error = NULL; 2714 error = NULL;
2691out: 2715out:
@@ -2778,8 +2802,12 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
2778static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, 2802static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
2779 struct pid *pid, struct task_struct *task) 2803 struct pid *pid, struct task_struct *task)
2780{ 2804{
2781 seq_printf(m, "%08x\n", task->personality); 2805 int err = lock_trace(task);
2782 return 0; 2806 if (!err) {
2807 seq_printf(m, "%08x\n", task->personality);
2808 unlock_trace(task);
2809 }
2810 return err;
2783} 2811}
2784 2812
2785/* 2813/*
@@ -2798,7 +2826,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2798 REG("environ", S_IRUSR, proc_environ_operations), 2826 REG("environ", S_IRUSR, proc_environ_operations),
2799 INF("auxv", S_IRUSR, proc_pid_auxv), 2827 INF("auxv", S_IRUSR, proc_pid_auxv),
2800 ONE("status", S_IRUGO, proc_pid_status), 2828 ONE("status", S_IRUGO, proc_pid_status),
2801 ONE("personality", S_IRUSR, proc_pid_personality), 2829 ONE("personality", S_IRUGO, proc_pid_personality),
2802 INF("limits", S_IRUGO, proc_pid_limits), 2830 INF("limits", S_IRUGO, proc_pid_limits),
2803#ifdef CONFIG_SCHED_DEBUG 2831#ifdef CONFIG_SCHED_DEBUG
2804 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2832 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
@@ -2808,7 +2836,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2808#endif 2836#endif
2809 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2837 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2810#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2838#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2811 INF("syscall", S_IRUSR, proc_pid_syscall), 2839 INF("syscall", S_IRUGO, proc_pid_syscall),
2812#endif 2840#endif
2813 INF("cmdline", S_IRUGO, proc_pid_cmdline), 2841 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2814 ONE("stat", S_IRUGO, proc_tgid_stat), 2842 ONE("stat", S_IRUGO, proc_tgid_stat),
@@ -2827,7 +2855,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2827#ifdef CONFIG_PROC_PAGE_MONITOR 2855#ifdef CONFIG_PROC_PAGE_MONITOR
2828 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 2856 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2829 REG("smaps", S_IRUGO, proc_smaps_operations), 2857 REG("smaps", S_IRUGO, proc_smaps_operations),
2830 REG("pagemap", S_IRUSR, proc_pagemap_operations), 2858 REG("pagemap", S_IRUGO, proc_pagemap_operations),
2831#endif 2859#endif
2832#ifdef CONFIG_SECURITY 2860#ifdef CONFIG_SECURITY
2833 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 2861 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -2836,7 +2864,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2836 INF("wchan", S_IRUGO, proc_pid_wchan), 2864 INF("wchan", S_IRUGO, proc_pid_wchan),
2837#endif 2865#endif
2838#ifdef CONFIG_STACKTRACE 2866#ifdef CONFIG_STACKTRACE
2839 ONE("stack", S_IRUSR, proc_pid_stack), 2867 ONE("stack", S_IRUGO, proc_pid_stack),
2840#endif 2868#endif
2841#ifdef CONFIG_SCHEDSTATS 2869#ifdef CONFIG_SCHEDSTATS
2842 INF("schedstat", S_IRUGO, proc_pid_schedstat), 2870 INF("schedstat", S_IRUGO, proc_pid_schedstat),
@@ -3096,11 +3124,16 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi
3096/* for the /proc/ directory itself, after non-process stuff has been done */ 3124/* for the /proc/ directory itself, after non-process stuff has been done */
3097int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) 3125int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
3098{ 3126{
3099 unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; 3127 unsigned int nr;
3100 struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode); 3128 struct task_struct *reaper;
3101 struct tgid_iter iter; 3129 struct tgid_iter iter;
3102 struct pid_namespace *ns; 3130 struct pid_namespace *ns;
3103 3131
3132 if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
3133 goto out_no_task;
3134 nr = filp->f_pos - FIRST_PROCESS_ENTRY;
3135
3136 reaper = get_proc_task(filp->f_path.dentry->d_inode);
3104 if (!reaper) 3137 if (!reaper)
3105 goto out_no_task; 3138 goto out_no_task;
3106 3139
@@ -3138,14 +3171,14 @@ static const struct pid_entry tid_base_stuff[] = {
3138 REG("environ", S_IRUSR, proc_environ_operations), 3171 REG("environ", S_IRUSR, proc_environ_operations),
3139 INF("auxv", S_IRUSR, proc_pid_auxv), 3172 INF("auxv", S_IRUSR, proc_pid_auxv),
3140 ONE("status", S_IRUGO, proc_pid_status), 3173 ONE("status", S_IRUGO, proc_pid_status),
3141 ONE("personality", S_IRUSR, proc_pid_personality), 3174 ONE("personality", S_IRUGO, proc_pid_personality),
3142 INF("limits", S_IRUGO, proc_pid_limits), 3175 INF("limits", S_IRUGO, proc_pid_limits),
3143#ifdef CONFIG_SCHED_DEBUG 3176#ifdef CONFIG_SCHED_DEBUG
3144 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 3177 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
3145#endif 3178#endif
3146 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 3179 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
3147#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 3180#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
3148 INF("syscall", S_IRUSR, proc_pid_syscall), 3181 INF("syscall", S_IRUGO, proc_pid_syscall),
3149#endif 3182#endif
3150 INF("cmdline", S_IRUGO, proc_pid_cmdline), 3183 INF("cmdline", S_IRUGO, proc_pid_cmdline),
3151 ONE("stat", S_IRUGO, proc_tid_stat), 3184 ONE("stat", S_IRUGO, proc_tid_stat),
@@ -3163,7 +3196,7 @@ static const struct pid_entry tid_base_stuff[] = {
3163#ifdef CONFIG_PROC_PAGE_MONITOR 3196#ifdef CONFIG_PROC_PAGE_MONITOR
3164 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 3197 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
3165 REG("smaps", S_IRUGO, proc_smaps_operations), 3198 REG("smaps", S_IRUGO, proc_smaps_operations),
3166 REG("pagemap", S_IRUSR, proc_pagemap_operations), 3199 REG("pagemap", S_IRUGO, proc_pagemap_operations),
3167#endif 3200#endif
3168#ifdef CONFIG_SECURITY 3201#ifdef CONFIG_SECURITY
3169 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 3202 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -3172,7 +3205,7 @@ static const struct pid_entry tid_base_stuff[] = {
3172 INF("wchan", S_IRUGO, proc_pid_wchan), 3205 INF("wchan", S_IRUGO, proc_pid_wchan),
3173#endif 3206#endif
3174#ifdef CONFIG_STACKTRACE 3207#ifdef CONFIG_STACKTRACE
3175 ONE("stack", S_IRUSR, proc_pid_stack), 3208 ONE("stack", S_IRUGO, proc_pid_stack),
3176#endif 3209#endif
3177#ifdef CONFIG_SCHEDSTATS 3210#ifdef CONFIG_SCHEDSTATS
3178 INF("schedstat", S_IRUGO, proc_pid_schedstat), 3211 INF("schedstat", S_IRUGO, proc_pid_schedstat),
@@ -3191,7 +3224,7 @@ static const struct pid_entry tid_base_stuff[] = {
3191 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 3224 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
3192#ifdef CONFIG_AUDITSYSCALL 3225#ifdef CONFIG_AUDITSYSCALL
3193 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 3226 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
3194 REG("sessionid", S_IRUSR, proc_sessionid_operations), 3227 REG("sessionid", S_IRUGO, proc_sessionid_operations),
3195#endif 3228#endif
3196#ifdef CONFIG_FAULT_INJECTION 3229#ifdef CONFIG_FAULT_INJECTION
3197 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), 3230 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 01e07f2a188f..f1281339b6fa 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -28,7 +28,7 @@
28 28
29DEFINE_SPINLOCK(proc_subdir_lock); 29DEFINE_SPINLOCK(proc_subdir_lock);
30 30
31static int proc_match(int len, const char *name, struct proc_dir_entry *de) 31static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
32{ 32{
33 if (de->namelen != len) 33 if (de->namelen != len)
34 return 0; 34 return 0;
@@ -303,7 +303,7 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
303{ 303{
304 const char *cp = name, *next; 304 const char *cp = name, *next;
305 struct proc_dir_entry *de; 305 struct proc_dir_entry *de;
306 int len; 306 unsigned int len;
307 307
308 de = *ret; 308 de = *ret;
309 if (!de) 309 if (!de)
@@ -602,7 +602,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
602{ 602{
603 struct proc_dir_entry *ent = NULL; 603 struct proc_dir_entry *ent = NULL;
604 const char *fn = name; 604 const char *fn = name;
605 int len; 605 unsigned int len;
606 606
607 /* make sure name is valid */ 607 /* make sure name is valid */
608 if (!name || !strlen(name)) goto out; 608 if (!name || !strlen(name)) goto out;
@@ -786,7 +786,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
786 struct proc_dir_entry **p; 786 struct proc_dir_entry **p;
787 struct proc_dir_entry *de = NULL; 787 struct proc_dir_entry *de = NULL;
788 const char *fn = name; 788 const char *fn = name;
789 int len; 789 unsigned int len;
790 790
791 spin_lock(&proc_subdir_lock); 791 spin_lock(&proc_subdir_lock);
792 if (__xlate_proc_name(name, &parent, &fn) != 0) { 792 if (__xlate_proc_name(name, &parent, &fn) != 0) {
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 176ce4cda68a..d15aa1b1cc8f 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -27,6 +27,7 @@
27static void proc_evict_inode(struct inode *inode) 27static void proc_evict_inode(struct inode *inode)
28{ 28{
29 struct proc_dir_entry *de; 29 struct proc_dir_entry *de;
30 struct ctl_table_header *head;
30 31
31 truncate_inode_pages(&inode->i_data, 0); 32 truncate_inode_pages(&inode->i_data, 0);
32 end_writeback(inode); 33 end_writeback(inode);
@@ -38,12 +39,13 @@ static void proc_evict_inode(struct inode *inode)
38 de = PROC_I(inode)->pde; 39 de = PROC_I(inode)->pde;
39 if (de) 40 if (de)
40 pde_put(de); 41 pde_put(de);
41 if (PROC_I(inode)->sysctl) 42 head = PROC_I(inode)->sysctl;
42 sysctl_head_put(PROC_I(inode)->sysctl); 43 if (head) {
44 rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);
45 sysctl_head_put(head);
46 }
43} 47}
44 48
45struct vfsmount *proc_mnt;
46
47static struct kmem_cache * proc_inode_cachep; 49static struct kmem_cache * proc_inode_cachep;
48 50
49static struct inode *proc_alloc_inode(struct super_block *sb) 51static struct inode *proc_alloc_inode(struct super_block *sb)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9ad561ded409..c03e8d3a3a5b 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -107,7 +107,6 @@ static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
107} 107}
108void pde_put(struct proc_dir_entry *pde); 108void pde_put(struct proc_dir_entry *pde);
109 109
110extern struct vfsmount *proc_mnt;
111int proc_fill_super(struct super_block *); 110int proc_fill_super(struct super_block *);
112struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); 111struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
113 112
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index d9396a4fc7ff..927cbd115e53 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -233,7 +233,7 @@ void __init proc_device_tree_init(void)
233 return; 233 return;
234 root = of_find_node_by_path("/"); 234 root = of_find_node_by_path("/");
235 if (root == NULL) { 235 if (root == NULL) {
236 printk(KERN_ERR "/proc/device-tree: can't find root\n"); 236 pr_debug("/proc/device-tree: can't find root\n");
237 return; 237 return;
238 } 238 }
239 proc_device_tree_add_node(root, proc_device_tree); 239 proc_device_tree_add_node(root, proc_device_tree);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 09a1f92a34ef..f50133c11c24 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -32,7 +32,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
32 ei->sysctl_entry = table; 32 ei->sysctl_entry = table;
33 33
34 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 34 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
35 inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
36 inode->i_mode = table->mode; 35 inode->i_mode = table->mode;
37 if (!table->child) { 36 if (!table->child) {
38 inode->i_mode |= S_IFREG; 37 inode->i_mode |= S_IFREG;
@@ -408,15 +407,18 @@ static int proc_sys_compare(const struct dentry *parent,
408 const struct dentry *dentry, const struct inode *inode, 407 const struct dentry *dentry, const struct inode *inode,
409 unsigned int len, const char *str, const struct qstr *name) 408 unsigned int len, const char *str, const struct qstr *name)
410{ 409{
410 struct ctl_table_header *head;
411 /* Although proc doesn't have negative dentries, rcu-walk means 411 /* Although proc doesn't have negative dentries, rcu-walk means
412 * that inode here can be NULL */ 412 * that inode here can be NULL */
413 /* AV: can it, indeed? */
413 if (!inode) 414 if (!inode)
414 return 0; 415 return 1;
415 if (name->len != len) 416 if (name->len != len)
416 return 1; 417 return 1;
417 if (memcmp(name->name, str, len)) 418 if (memcmp(name->name, str, len))
418 return 1; 419 return 1;
419 return !sysctl_is_seen(PROC_I(inode)->sysctl); 420 head = rcu_dereference(PROC_I(inode)->sysctl);
421 return !head || !sysctl_is_seen(head);
420} 422}
421 423
422static const struct dentry_operations proc_sys_dentry_operations = { 424static const struct dentry_operations proc_sys_dentry_operations = {
diff --git a/fs/proc/root.c b/fs/proc/root.c
index ef9fa8e24ad6..a9000e9cfee5 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -43,17 +43,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
43 struct pid_namespace *ns; 43 struct pid_namespace *ns;
44 struct proc_inode *ei; 44 struct proc_inode *ei;
45 45
46 if (proc_mnt) {
47 /* Seed the root directory with a pid so it doesn't need
48 * to be special in base.c. I would do this earlier but
49 * the only task alive when /proc is mounted the first time
50 * is the init_task and it doesn't have any pids.
51 */
52 ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode);
53 if (!ei->pid)
54 ei->pid = find_get_pid(1);
55 }
56
57 if (flags & MS_KERNMOUNT) 46 if (flags & MS_KERNMOUNT)
58 ns = (struct pid_namespace *)data; 47 ns = (struct pid_namespace *)data;
59 else 48 else
@@ -71,16 +60,16 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
71 return ERR_PTR(err); 60 return ERR_PTR(err);
72 } 61 }
73 62
74 ei = PROC_I(sb->s_root->d_inode);
75 if (!ei->pid) {
76 rcu_read_lock();
77 ei->pid = get_pid(find_pid_ns(1, ns));
78 rcu_read_unlock();
79 }
80
81 sb->s_flags |= MS_ACTIVE; 63 sb->s_flags |= MS_ACTIVE;
82 } 64 }
83 65
66 ei = PROC_I(sb->s_root->d_inode);
67 if (!ei->pid) {
68 rcu_read_lock();
69 ei->pid = get_pid(find_pid_ns(1, ns));
70 rcu_read_unlock();
71 }
72
84 return dget(sb->s_root); 73 return dget(sb->s_root);
85} 74}
86 75
@@ -101,19 +90,20 @@ static struct file_system_type proc_fs_type = {
101 90
102void __init proc_root_init(void) 91void __init proc_root_init(void)
103{ 92{
93 struct vfsmount *mnt;
104 int err; 94 int err;
105 95
106 proc_init_inodecache(); 96 proc_init_inodecache();
107 err = register_filesystem(&proc_fs_type); 97 err = register_filesystem(&proc_fs_type);
108 if (err) 98 if (err)
109 return; 99 return;
110 proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); 100 mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
111 if (IS_ERR(proc_mnt)) { 101 if (IS_ERR(mnt)) {
112 unregister_filesystem(&proc_fs_type); 102 unregister_filesystem(&proc_fs_type);
113 return; 103 return;
114 } 104 }
115 105
116 init_pid_ns.proc_mnt = proc_mnt; 106 init_pid_ns.proc_mnt = mnt;
117 proc_symlink("mounts", NULL, "self/mounts"); 107 proc_symlink("mounts", NULL, "self/mounts");
118 108
119 proc_net_init(); 109 proc_net_init();
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 60b914860f81..2e7addfd9803 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,5 +1,6 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/hugetlb.h> 2#include <linux/hugetlb.h>
3#include <linux/huge_mm.h>
3#include <linux/mount.h> 4#include <linux/mount.h>
4#include <linux/seq_file.h> 5#include <linux/seq_file.h>
5#include <linux/highmem.h> 6#include <linux/highmem.h>
@@ -7,6 +8,7 @@
7#include <linux/slab.h> 8#include <linux/slab.h>
8#include <linux/pagemap.h> 9#include <linux/pagemap.h>
9#include <linux/mempolicy.h> 10#include <linux/mempolicy.h>
11#include <linux/rmap.h>
10#include <linux/swap.h> 12#include <linux/swap.h>
11#include <linux/swapops.h> 13#include <linux/swapops.h>
12 14
@@ -119,14 +121,14 @@ static void *m_start(struct seq_file *m, loff_t *pos)
119 121
120 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 122 priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
121 if (!priv->task) 123 if (!priv->task)
122 return NULL; 124 return ERR_PTR(-ESRCH);
123 125
124 mm = mm_for_maps(priv->task); 126 mm = mm_for_maps(priv->task);
125 if (!mm) 127 if (!mm || IS_ERR(mm))
126 return NULL; 128 return mm;
127 down_read(&mm->mmap_sem); 129 down_read(&mm->mmap_sem);
128 130
129 tail_vma = get_gate_vma(priv->task); 131 tail_vma = get_gate_vma(priv->task->mm);
130 priv->tail_vma = tail_vma; 132 priv->tail_vma = tail_vma;
131 133
132 /* Start with last addr hint */ 134 /* Start with last addr hint */
@@ -180,7 +182,8 @@ static void m_stop(struct seq_file *m, void *v)
180 struct proc_maps_private *priv = m->private; 182 struct proc_maps_private *priv = m->private;
181 struct vm_area_struct *vma = v; 183 struct vm_area_struct *vma = v;
182 184
183 vma_stop(priv, vma); 185 if (!IS_ERR(vma))
186 vma_stop(priv, vma);
184 if (priv->task) 187 if (priv->task)
185 put_task_struct(priv->task); 188 put_task_struct(priv->task);
186} 189}
@@ -249,8 +252,8 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
249 const char *name = arch_vma_name(vma); 252 const char *name = arch_vma_name(vma);
250 if (!name) { 253 if (!name) {
251 if (mm) { 254 if (mm) {
252 if (vma->vm_start <= mm->start_brk && 255 if (vma->vm_start <= mm->brk &&
253 vma->vm_end >= mm->brk) { 256 vma->vm_end >= mm->start_brk) {
254 name = "[heap]"; 257 name = "[heap]";
255 } else if (vma->vm_start <= mm->start_stack && 258 } else if (vma->vm_start <= mm->start_stack &&
256 vma->vm_end >= mm->start_stack) { 259 vma->vm_end >= mm->start_stack) {
@@ -277,7 +280,8 @@ static int show_map(struct seq_file *m, void *v)
277 show_map_vma(m, vma); 280 show_map_vma(m, vma);
278 281
279 if (m->count < m->size) /* vma is copied successfully */ 282 if (m->count < m->size) /* vma is copied successfully */
280 m->version = (vma != get_gate_vma(task))? vma->vm_start: 0; 283 m->version = (vma != get_gate_vma(task->mm))
284 ? vma->vm_start : 0;
281 return 0; 285 return 0;
282} 286}
283 287
@@ -329,58 +333,86 @@ struct mem_size_stats {
329 unsigned long private_dirty; 333 unsigned long private_dirty;
330 unsigned long referenced; 334 unsigned long referenced;
331 unsigned long anonymous; 335 unsigned long anonymous;
336 unsigned long anonymous_thp;
332 unsigned long swap; 337 unsigned long swap;
333 u64 pss; 338 u64 pss;
334}; 339};
335 340
336static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 341
337 struct mm_walk *walk) 342static void smaps_pte_entry(pte_t ptent, unsigned long addr,
343 unsigned long ptent_size, struct mm_walk *walk)
338{ 344{
339 struct mem_size_stats *mss = walk->private; 345 struct mem_size_stats *mss = walk->private;
340 struct vm_area_struct *vma = mss->vma; 346 struct vm_area_struct *vma = mss->vma;
341 pte_t *pte, ptent;
342 spinlock_t *ptl;
343 struct page *page; 347 struct page *page;
344 int mapcount; 348 int mapcount;
345 349
346 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 350 if (is_swap_pte(ptent)) {
347 for (; addr != end; pte++, addr += PAGE_SIZE) { 351 mss->swap += ptent_size;
348 ptent = *pte; 352 return;
349 353 }
350 if (is_swap_pte(ptent)) {
351 mss->swap += PAGE_SIZE;
352 continue;
353 }
354 354
355 if (!pte_present(ptent)) 355 if (!pte_present(ptent))
356 continue; 356 return;
357
358 page = vm_normal_page(vma, addr, ptent);
359 if (!page)
360 return;
361
362 if (PageAnon(page))
363 mss->anonymous += ptent_size;
364
365 mss->resident += ptent_size;
366 /* Accumulate the size in pages that have been accessed. */
367 if (pte_young(ptent) || PageReferenced(page))
368 mss->referenced += ptent_size;
369 mapcount = page_mapcount(page);
370 if (mapcount >= 2) {
371 if (pte_dirty(ptent) || PageDirty(page))
372 mss->shared_dirty += ptent_size;
373 else
374 mss->shared_clean += ptent_size;
375 mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
376 } else {
377 if (pte_dirty(ptent) || PageDirty(page))
378 mss->private_dirty += ptent_size;
379 else
380 mss->private_clean += ptent_size;
381 mss->pss += (ptent_size << PSS_SHIFT);
382 }
383}
357 384
358 page = vm_normal_page(vma, addr, ptent); 385static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
359 if (!page) 386 struct mm_walk *walk)
360 continue; 387{
388 struct mem_size_stats *mss = walk->private;
389 struct vm_area_struct *vma = mss->vma;
390 pte_t *pte;
391 spinlock_t *ptl;
361 392
362 if (PageAnon(page)) 393 spin_lock(&walk->mm->page_table_lock);
363 mss->anonymous += PAGE_SIZE; 394 if (pmd_trans_huge(*pmd)) {
364 395 if (pmd_trans_splitting(*pmd)) {
365 mss->resident += PAGE_SIZE; 396 spin_unlock(&walk->mm->page_table_lock);
366 /* Accumulate the size in pages that have been accessed. */ 397 wait_split_huge_page(vma->anon_vma, pmd);
367 if (pte_young(ptent) || PageReferenced(page))
368 mss->referenced += PAGE_SIZE;
369 mapcount = page_mapcount(page);
370 if (mapcount >= 2) {
371 if (pte_dirty(ptent) || PageDirty(page))
372 mss->shared_dirty += PAGE_SIZE;
373 else
374 mss->shared_clean += PAGE_SIZE;
375 mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
376 } else { 398 } else {
377 if (pte_dirty(ptent) || PageDirty(page)) 399 smaps_pte_entry(*(pte_t *)pmd, addr,
378 mss->private_dirty += PAGE_SIZE; 400 HPAGE_PMD_SIZE, walk);
379 else 401 spin_unlock(&walk->mm->page_table_lock);
380 mss->private_clean += PAGE_SIZE; 402 mss->anonymous_thp += HPAGE_PMD_SIZE;
381 mss->pss += (PAGE_SIZE << PSS_SHIFT); 403 return 0;
382 } 404 }
405 } else {
406 spin_unlock(&walk->mm->page_table_lock);
383 } 407 }
408 /*
409 * The mmap_sem held all the way back in m_start() is what
410 * keeps khugepaged out of here and from collapsing things
411 * in here.
412 */
413 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
414 for (; addr != end; pte++, addr += PAGE_SIZE)
415 smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
384 pte_unmap_unlock(pte - 1, ptl); 416 pte_unmap_unlock(pte - 1, ptl);
385 cond_resched(); 417 cond_resched();
386 return 0; 418 return 0;
@@ -416,6 +448,7 @@ static int show_smap(struct seq_file *m, void *v)
416 "Private_Dirty: %8lu kB\n" 448 "Private_Dirty: %8lu kB\n"
417 "Referenced: %8lu kB\n" 449 "Referenced: %8lu kB\n"
418 "Anonymous: %8lu kB\n" 450 "Anonymous: %8lu kB\n"
451 "AnonHugePages: %8lu kB\n"
419 "Swap: %8lu kB\n" 452 "Swap: %8lu kB\n"
420 "KernelPageSize: %8lu kB\n" 453 "KernelPageSize: %8lu kB\n"
421 "MMUPageSize: %8lu kB\n" 454 "MMUPageSize: %8lu kB\n"
@@ -429,6 +462,7 @@ static int show_smap(struct seq_file *m, void *v)
429 mss.private_dirty >> 10, 462 mss.private_dirty >> 10,
430 mss.referenced >> 10, 463 mss.referenced >> 10,
431 mss.anonymous >> 10, 464 mss.anonymous >> 10,
465 mss.anonymous_thp >> 10,
432 mss.swap >> 10, 466 mss.swap >> 10,
433 vma_kernel_pagesize(vma) >> 10, 467 vma_kernel_pagesize(vma) >> 10,
434 vma_mmu_pagesize(vma) >> 10, 468 vma_mmu_pagesize(vma) >> 10,
@@ -436,7 +470,8 @@ static int show_smap(struct seq_file *m, void *v)
436 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); 470 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
437 471
438 if (m->count < m->size) /* vma is copied successfully */ 472 if (m->count < m->size) /* vma is copied successfully */
439 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; 473 m->version = (vma != get_gate_vma(task->mm))
474 ? vma->vm_start : 0;
440 return 0; 475 return 0;
441} 476}
442 477
@@ -467,6 +502,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
467 spinlock_t *ptl; 502 spinlock_t *ptl;
468 struct page *page; 503 struct page *page;
469 504
505 split_huge_page_pmd(walk->mm, pmd);
506
470 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 507 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
471 for (; addr != end; pte++, addr += PAGE_SIZE) { 508 for (; addr != end; pte++, addr += PAGE_SIZE) {
472 ptent = *pte; 509 ptent = *pte;
@@ -623,6 +660,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
623 pte_t *pte; 660 pte_t *pte;
624 int err = 0; 661 int err = 0;
625 662
663 split_huge_page_pmd(walk->mm, pmd);
664
626 /* find the first VMA at or above 'addr' */ 665 /* find the first VMA at or above 'addr' */
627 vma = find_vma(walk->mm, addr); 666 vma = find_vma(walk->mm, addr);
628 for (; addr != end; addr += PAGE_SIZE) { 667 for (; addr != end; addr += PAGE_SIZE) {
@@ -728,8 +767,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
728 if (!task) 767 if (!task)
729 goto out; 768 goto out;
730 769
731 ret = -EACCES; 770 mm = mm_for_maps(task);
732 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 771 ret = PTR_ERR(mm);
772 if (!mm || IS_ERR(mm))
733 goto out_task; 773 goto out_task;
734 774
735 ret = -EINVAL; 775 ret = -EINVAL;
@@ -742,10 +782,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
742 if (!count) 782 if (!count)
743 goto out_task; 783 goto out_task;
744 784
745 mm = get_task_mm(task);
746 if (!mm)
747 goto out_task;
748
749 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 785 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
750 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); 786 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
751 ret = -ENOMEM; 787 ret = -ENOMEM;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index b535d3e5d5f1..980de547c070 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -199,13 +199,13 @@ static void *m_start(struct seq_file *m, loff_t *pos)
199 /* pin the task and mm whilst we play with them */ 199 /* pin the task and mm whilst we play with them */
200 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 200 priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
201 if (!priv->task) 201 if (!priv->task)
202 return NULL; 202 return ERR_PTR(-ESRCH);
203 203
204 mm = mm_for_maps(priv->task); 204 mm = mm_for_maps(priv->task);
205 if (!mm) { 205 if (!mm || IS_ERR(mm)) {
206 put_task_struct(priv->task); 206 put_task_struct(priv->task);
207 priv->task = NULL; 207 priv->task = NULL;
208 return NULL; 208 return mm;
209 } 209 }
210 down_read(&mm->mmap_sem); 210 down_read(&mm->mmap_sem);
211 211
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
new file mode 100644
index 000000000000..8007ae7c0d8c
--- /dev/null
+++ b/fs/pstore/Kconfig
@@ -0,0 +1,13 @@
1config PSTORE
2 bool "Persistent store support"
3 default n
4 help
5 This option enables generic access to platform level
6 persistent storage via "pstore" filesystem that can
7 be mounted as /dev/pstore. Only useful if you have
8 a platform level driver that registers with pstore to
9 provide the data, so you probably should just go say "Y"
10 (or "M") to a platform specific persistent store driver
11 (e.g. ACPI_APEI on X86) which will select this for you.
12 If you don't have a platform persistent store driver,
13 say N.
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
new file mode 100644
index 000000000000..760f4bce7d1d
--- /dev/null
+++ b/fs/pstore/Makefile
@@ -0,0 +1,7 @@
1#
2# Makefile for the linux pstorefs routines.
3#
4
5obj-y += pstore.o
6
7pstore-objs += inode.o platform.o
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
new file mode 100644
index 000000000000..977ed2723845
--- /dev/null
+++ b/fs/pstore/inode.c
@@ -0,0 +1,311 @@
1/*
2 * Persistent Storage - ramfs parts.
3 *
4 * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/module.h>
21#include <linux/fs.h>
22#include <linux/fsnotify.h>
23#include <linux/pagemap.h>
24#include <linux/highmem.h>
25#include <linux/time.h>
26#include <linux/init.h>
27#include <linux/string.h>
28#include <linux/mount.h>
29#include <linux/ramfs.h>
30#include <linux/parser.h>
31#include <linux/sched.h>
32#include <linux/magic.h>
33#include <linux/pstore.h>
34#include <linux/slab.h>
35#include <linux/uaccess.h>
36
37#include "internal.h"
38
39#define PSTORE_NAMELEN 64
40
41struct pstore_private {
42 u64 id;
43 int (*erase)(u64);
44 ssize_t size;
45 char data[];
46};
47
48static int pstore_file_open(struct inode *inode, struct file *file)
49{
50 file->private_data = inode->i_private;
51 return 0;
52}
53
54static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
55 size_t count, loff_t *ppos)
56{
57 struct pstore_private *ps = file->private_data;
58
59 return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size);
60}
61
62static const struct file_operations pstore_file_operations = {
63 .open = pstore_file_open,
64 .read = pstore_file_read,
65 .llseek = default_llseek,
66};
67
68/*
69 * When a file is unlinked from our file system we call the
70 * platform driver to erase the record from persistent store.
71 */
72static int pstore_unlink(struct inode *dir, struct dentry *dentry)
73{
74 struct pstore_private *p = dentry->d_inode->i_private;
75
76 p->erase(p->id);
77
78 return simple_unlink(dir, dentry);
79}
80
81static void pstore_evict_inode(struct inode *inode)
82{
83 end_writeback(inode);
84 kfree(inode->i_private);
85}
86
87static const struct inode_operations pstore_dir_inode_operations = {
88 .lookup = simple_lookup,
89 .unlink = pstore_unlink,
90};
91
92static struct inode *pstore_get_inode(struct super_block *sb,
93 const struct inode *dir, int mode, dev_t dev)
94{
95 struct inode *inode = new_inode(sb);
96
97 if (inode) {
98 inode->i_ino = get_next_ino();
99 inode->i_uid = inode->i_gid = 0;
100 inode->i_mode = mode;
101 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
102 switch (mode & S_IFMT) {
103 case S_IFREG:
104 inode->i_fop = &pstore_file_operations;
105 break;
106 case S_IFDIR:
107 inode->i_op = &pstore_dir_inode_operations;
108 inode->i_fop = &simple_dir_operations;
109 inc_nlink(inode);
110 break;
111 }
112 }
113 return inode;
114}
115
116enum {
117 Opt_kmsg_bytes, Opt_err
118};
119
120static const match_table_t tokens = {
121 {Opt_kmsg_bytes, "kmsg_bytes=%u"},
122 {Opt_err, NULL}
123};
124
125static void parse_options(char *options)
126{
127 char *p;
128 substring_t args[MAX_OPT_ARGS];
129 int option;
130
131 if (!options)
132 return;
133
134 while ((p = strsep(&options, ",")) != NULL) {
135 int token;
136
137 if (!*p)
138 continue;
139
140 token = match_token(p, tokens, args);
141 switch (token) {
142 case Opt_kmsg_bytes:
143 if (!match_int(&args[0], &option))
144 pstore_set_kmsg_bytes(option);
145 break;
146 }
147 }
148}
149
150static int pstore_remount(struct super_block *sb, int *flags, char *data)
151{
152 parse_options(data);
153
154 return 0;
155}
156
157static const struct super_operations pstore_ops = {
158 .statfs = simple_statfs,
159 .drop_inode = generic_delete_inode,
160 .evict_inode = pstore_evict_inode,
161 .remount_fs = pstore_remount,
162 .show_options = generic_show_options,
163};
164
165static struct super_block *pstore_sb;
166
167int pstore_is_mounted(void)
168{
169 return pstore_sb != NULL;
170}
171
172/*
173 * Make a regular file in the root directory of our file system.
174 * Load it up with "size" bytes of data from "buf".
175 * Set the mtime & ctime to the date that this record was originally stored.
176 */
177int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
178 char *data, size_t size,
179 struct timespec time, int (*erase)(u64))
180{
181 struct dentry *root = pstore_sb->s_root;
182 struct dentry *dentry;
183 struct inode *inode;
184 int rc;
185 char name[PSTORE_NAMELEN];
186 struct pstore_private *private;
187
188 rc = -ENOMEM;
189 inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0);
190 if (!inode)
191 goto fail;
192 private = kmalloc(sizeof *private + size, GFP_KERNEL);
193 if (!private)
194 goto fail_alloc;
195 private->id = id;
196 private->erase = erase;
197
198 switch (type) {
199 case PSTORE_TYPE_DMESG:
200 sprintf(name, "dmesg-%s-%lld", psname, id);
201 break;
202 case PSTORE_TYPE_MCE:
203 sprintf(name, "mce-%s-%lld", psname, id);
204 break;
205 case PSTORE_TYPE_UNKNOWN:
206 sprintf(name, "unknown-%s-%lld", psname, id);
207 break;
208 default:
209 sprintf(name, "type%d-%s-%lld", type, psname, id);
210 break;
211 }
212
213 mutex_lock(&root->d_inode->i_mutex);
214
215 rc = -ENOSPC;
216 dentry = d_alloc_name(root, name);
217 if (IS_ERR(dentry))
218 goto fail_lockedalloc;
219
220 memcpy(private->data, data, size);
221 inode->i_size = private->size = size;
222
223 inode->i_private = private;
224
225 if (time.tv_sec)
226 inode->i_mtime = inode->i_ctime = time;
227
228 d_add(dentry, inode);
229
230 mutex_unlock(&root->d_inode->i_mutex);
231
232 return 0;
233
234fail_lockedalloc:
235 mutex_unlock(&root->d_inode->i_mutex);
236 kfree(private);
237fail_alloc:
238 iput(inode);
239
240fail:
241 return rc;
242}
243
244int pstore_fill_super(struct super_block *sb, void *data, int silent)
245{
246 struct inode *inode = NULL;
247 struct dentry *root;
248 int err;
249
250 save_mount_options(sb, data);
251
252 pstore_sb = sb;
253
254 sb->s_maxbytes = MAX_LFS_FILESIZE;
255 sb->s_blocksize = PAGE_CACHE_SIZE;
256 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
257 sb->s_magic = PSTOREFS_MAGIC;
258 sb->s_op = &pstore_ops;
259 sb->s_time_gran = 1;
260
261 parse_options(data);
262
263 inode = pstore_get_inode(sb, NULL, S_IFDIR | 0755, 0);
264 if (!inode) {
265 err = -ENOMEM;
266 goto fail;
267 }
268 /* override ramfs "dir" options so we catch unlink(2) */
269 inode->i_op = &pstore_dir_inode_operations;
270
271 root = d_alloc_root(inode);
272 sb->s_root = root;
273 if (!root) {
274 err = -ENOMEM;
275 goto fail;
276 }
277
278 pstore_get_records();
279
280 return 0;
281fail:
282 iput(inode);
283 return err;
284}
285
286static struct dentry *pstore_mount(struct file_system_type *fs_type,
287 int flags, const char *dev_name, void *data)
288{
289 return mount_single(fs_type, flags, data, pstore_fill_super);
290}
291
292static void pstore_kill_sb(struct super_block *sb)
293{
294 kill_litter_super(sb);
295 pstore_sb = NULL;
296}
297
298static struct file_system_type pstore_fs_type = {
299 .name = "pstore",
300 .mount = pstore_mount,
301 .kill_sb = pstore_kill_sb,
302};
303
304static int __init init_pstore_fs(void)
305{
306 return register_filesystem(&pstore_fs_type);
307}
308module_init(init_pstore_fs)
309
310MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
311MODULE_LICENSE("GPL");
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
new file mode 100644
index 000000000000..8c9f23eb1645
--- /dev/null
+++ b/fs/pstore/internal.h
@@ -0,0 +1,6 @@
1extern void pstore_set_kmsg_bytes(int);
2extern void pstore_get_records(void);
3extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
4 char *data, size_t size,
5 struct timespec time, int (*erase)(u64));
6extern int pstore_is_mounted(void);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
new file mode 100644
index 000000000000..f835a25625ff
--- /dev/null
+++ b/fs/pstore/platform.c
@@ -0,0 +1,201 @@
1/*
2 * Persistent Storage - platform driver interface parts.
3 *
4 * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/atomic.h>
21#include <linux/types.h>
22#include <linux/errno.h>
23#include <linux/init.h>
24#include <linux/kmsg_dump.h>
25#include <linux/module.h>
26#include <linux/pstore.h>
27#include <linux/string.h>
28#include <linux/slab.h>
29#include <linux/uaccess.h>
30
31#include "internal.h"
32
33/*
34 * pstore_lock just protects "psinfo" during
35 * calls to pstore_register()
36 */
37static DEFINE_SPINLOCK(pstore_lock);
38static struct pstore_info *psinfo;
39
40/* How much of the console log to snapshot */
41static unsigned long kmsg_bytes = 10240;
42
43void pstore_set_kmsg_bytes(int bytes)
44{
45 kmsg_bytes = bytes;
46}
47
48/* Tag each group of saved records with a sequence number */
49static int oopscount;
50
51static char *reason_str[] = {
52 "Oops", "Panic", "Kexec", "Restart", "Halt", "Poweroff", "Emergency"
53};
54
55/*
56 * callback from kmsg_dump. (s2,l2) has the most recently
57 * written bytes, older bytes are in (s1,l1). Save as much
58 * as we can from the end of the buffer.
59 */
60static void pstore_dump(struct kmsg_dumper *dumper,
61 enum kmsg_dump_reason reason,
62 const char *s1, unsigned long l1,
63 const char *s2, unsigned long l2)
64{
65 unsigned long s1_start, s2_start;
66 unsigned long l1_cpy, l2_cpy;
67 unsigned long size, total = 0;
68 char *dst, *why;
69 u64 id;
70 int hsize, part = 1;
71
72 if (reason < ARRAY_SIZE(reason_str))
73 why = reason_str[reason];
74 else
75 why = "Unknown";
76
77 mutex_lock(&psinfo->buf_mutex);
78 oopscount++;
79 while (total < kmsg_bytes) {
80 dst = psinfo->buf;
81 hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part++);
82 size = psinfo->bufsize - hsize;
83 dst += hsize;
84
85 l2_cpy = min(l2, size);
86 l1_cpy = min(l1, size - l2_cpy);
87
88 if (l1_cpy + l2_cpy == 0)
89 break;
90
91 s2_start = l2 - l2_cpy;
92 s1_start = l1 - l1_cpy;
93
94 memcpy(dst, s1 + s1_start, l1_cpy);
95 memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
96
97 id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy);
98 if (reason == KMSG_DUMP_OOPS && pstore_is_mounted())
99 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id,
100 psinfo->buf, hsize + l1_cpy + l2_cpy,
101 CURRENT_TIME, psinfo->erase);
102 l1 -= l1_cpy;
103 l2 -= l2_cpy;
104 total += l1_cpy + l2_cpy;
105 }
106 mutex_unlock(&psinfo->buf_mutex);
107}
108
109static struct kmsg_dumper pstore_dumper = {
110 .dump = pstore_dump,
111};
112
113/*
114 * platform specific persistent storage driver registers with
115 * us here. If pstore is already mounted, call the platform
116 * read function right away to populate the file system. If not
117 * then the pstore mount code will call us later to fill out
118 * the file system.
119 *
120 * Register with kmsg_dump to save last part of console log on panic.
121 */
122int pstore_register(struct pstore_info *psi)
123{
124 struct module *owner = psi->owner;
125
126 spin_lock(&pstore_lock);
127 if (psinfo) {
128 spin_unlock(&pstore_lock);
129 return -EBUSY;
130 }
131 psinfo = psi;
132 spin_unlock(&pstore_lock);
133
134 if (owner && !try_module_get(owner)) {
135 psinfo = NULL;
136 return -EINVAL;
137 }
138
139 if (pstore_is_mounted())
140 pstore_get_records();
141
142 kmsg_dump_register(&pstore_dumper);
143
144 return 0;
145}
146EXPORT_SYMBOL_GPL(pstore_register);
147
148/*
149 * Read all the records from the persistent store. Create and
150 * file files in our filesystem.
151 */
152void pstore_get_records(void)
153{
154 struct pstore_info *psi = psinfo;
155 size_t size;
156 u64 id;
157 enum pstore_type_id type;
158 struct timespec time;
159 int failed = 0;
160
161 if (!psi)
162 return;
163
164 mutex_lock(&psinfo->buf_mutex);
165 while ((size = psi->read(&id, &type, &time)) > 0) {
166 if (pstore_mkfile(type, psi->name, id, psi->buf, size,
167 time, psi->erase))
168 failed++;
169 }
170 mutex_unlock(&psinfo->buf_mutex);
171
172 if (failed)
173 printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n",
174 failed, psi->name);
175}
176
177/*
178 * Call platform driver to write a record to the
179 * persistent store.
180 */
181int pstore_write(enum pstore_type_id type, char *buf, size_t size)
182{
183 u64 id;
184
185 if (!psinfo)
186 return -ENODEV;
187
188 if (size > psinfo->bufsize)
189 return -EFBIG;
190
191 mutex_lock(&psinfo->buf_mutex);
192 memcpy(psinfo->buf, buf, size);
193 id = psinfo->write(type, size);
194 if (pstore_is_mounted())
195 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf,
196 size, CURRENT_TIME, psinfo->erase);
197 mutex_unlock(&psinfo->buf_mutex);
198
199 return 0;
200}
201EXPORT_SYMBOL_GPL(pstore_write);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index e63b4171d583..2b0646613f5a 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -335,7 +335,6 @@ static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
335static const struct address_space_operations qnx4_aops = { 335static const struct address_space_operations qnx4_aops = {
336 .readpage = qnx4_readpage, 336 .readpage = qnx4_readpage,
337 .writepage = qnx4_writepage, 337 .writepage = qnx4_writepage,
338 .sync_page = block_sync_page,
339 .write_begin = qnx4_write_begin, 338 .write_begin = qnx4_write_begin,
340 .write_end = generic_write_end, 339 .write_end = generic_write_end,
341 .bmap = qnx4_bmap 340 .bmap = qnx4_bmap
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index a2a622e079f0..d3c032f5fa0a 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -76,7 +76,7 @@
76#include <linux/buffer_head.h> 76#include <linux/buffer_head.h>
77#include <linux/capability.h> 77#include <linux/capability.h>
78#include <linux/quotaops.h> 78#include <linux/quotaops.h>
79#include <linux/writeback.h> /* for inode_lock, oddly enough.. */ 79#include "../internal.h" /* ugh */
80 80
81#include <asm/uaccess.h> 81#include <asm/uaccess.h>
82 82
@@ -442,7 +442,7 @@ EXPORT_SYMBOL(dquot_acquire);
442 */ 442 */
443int dquot_commit(struct dquot *dquot) 443int dquot_commit(struct dquot *dquot)
444{ 444{
445 int ret = 0, ret2 = 0; 445 int ret = 0;
446 struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); 446 struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
447 447
448 mutex_lock(&dqopt->dqio_mutex); 448 mutex_lock(&dqopt->dqio_mutex);
@@ -454,15 +454,10 @@ int dquot_commit(struct dquot *dquot)
454 spin_unlock(&dq_list_lock); 454 spin_unlock(&dq_list_lock);
455 /* Inactive dquot can be only if there was error during read/init 455 /* Inactive dquot can be only if there was error during read/init
456 * => we have better not writing it */ 456 * => we have better not writing it */
457 if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { 457 if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
458 ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot); 458 ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot);
459 if (info_dirty(&dqopt->info[dquot->dq_type])) { 459 else
460 ret2 = dqopt->ops[dquot->dq_type]->write_file_info( 460 ret = -EIO;
461 dquot->dq_sb, dquot->dq_type);
462 }
463 if (ret >= 0)
464 ret = ret2;
465 }
466out_sem: 461out_sem:
467 mutex_unlock(&dqopt->dqio_mutex); 462 mutex_unlock(&dqopt->dqio_mutex);
468 return ret; 463 return ret;
@@ -900,33 +895,38 @@ static void add_dquot_ref(struct super_block *sb, int type)
900 int reserved = 0; 895 int reserved = 0;
901#endif 896#endif
902 897
903 spin_lock(&inode_lock); 898 spin_lock(&inode_sb_list_lock);
904 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 899 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
905 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 900 spin_lock(&inode->i_lock);
901 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
902 !atomic_read(&inode->i_writecount) ||
903 !dqinit_needed(inode, type)) {
904 spin_unlock(&inode->i_lock);
906 continue; 905 continue;
906 }
907#ifdef CONFIG_QUOTA_DEBUG 907#ifdef CONFIG_QUOTA_DEBUG
908 if (unlikely(inode_get_rsv_space(inode) > 0)) 908 if (unlikely(inode_get_rsv_space(inode) > 0))
909 reserved = 1; 909 reserved = 1;
910#endif 910#endif
911 if (!atomic_read(&inode->i_writecount))
912 continue;
913 if (!dqinit_needed(inode, type))
914 continue;
915
916 __iget(inode); 911 __iget(inode);
917 spin_unlock(&inode_lock); 912 spin_unlock(&inode->i_lock);
913 spin_unlock(&inode_sb_list_lock);
918 914
919 iput(old_inode); 915 iput(old_inode);
920 __dquot_initialize(inode, type); 916 __dquot_initialize(inode, type);
921 /* We hold a reference to 'inode' so it couldn't have been 917
922 * removed from s_inodes list while we dropped the inode_lock. 918 /*
923 * We cannot iput the inode now as we can be holding the last 919 * We hold a reference to 'inode' so it couldn't have been
924 * reference and we cannot iput it under inode_lock. So we 920 * removed from s_inodes list while we dropped the
925 * keep the reference and iput it later. */ 921 * inode_sb_list_lock We cannot iput the inode now as we can be
922 * holding the last reference and we cannot iput it under
923 * inode_sb_list_lock. So we keep the reference and iput it
924 * later.
925 */
926 old_inode = inode; 926 old_inode = inode;
927 spin_lock(&inode_lock); 927 spin_lock(&inode_sb_list_lock);
928 } 928 }
929 spin_unlock(&inode_lock); 929 spin_unlock(&inode_sb_list_lock);
930 iput(old_inode); 930 iput(old_inode);
931 931
932#ifdef CONFIG_QUOTA_DEBUG 932#ifdef CONFIG_QUOTA_DEBUG
@@ -951,7 +951,7 @@ static inline int dqput_blocks(struct dquot *dquot)
951 951
952/* 952/*
953 * Remove references to dquots from inode and add dquot to list for freeing 953 * Remove references to dquots from inode and add dquot to list for freeing
954 * if we have the last referece to dquot 954 * if we have the last reference to dquot
955 * We can't race with anybody because we hold dqptr_sem for writing... 955 * We can't race with anybody because we hold dqptr_sem for writing...
956 */ 956 */
957static int remove_inode_dquot_ref(struct inode *inode, int type, 957static int remove_inode_dquot_ref(struct inode *inode, int type,
@@ -1007,7 +1007,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
1007 struct inode *inode; 1007 struct inode *inode;
1008 int reserved = 0; 1008 int reserved = 0;
1009 1009
1010 spin_lock(&inode_lock); 1010 spin_lock(&inode_sb_list_lock);
1011 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1011 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1012 /* 1012 /*
1013 * We have to scan also I_NEW inodes because they can already 1013 * We have to scan also I_NEW inodes because they can already
@@ -1021,7 +1021,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
1021 remove_inode_dquot_ref(inode, type, tofree_head); 1021 remove_inode_dquot_ref(inode, type, tofree_head);
1022 } 1022 }
1023 } 1023 }
1024 spin_unlock(&inode_lock); 1024 spin_unlock(&inode_sb_list_lock);
1025#ifdef CONFIG_QUOTA_DEBUG 1025#ifdef CONFIG_QUOTA_DEBUG
1026 if (reserved) { 1026 if (reserved) {
1027 printk(KERN_WARNING "VFS (%s): Writes happened after quota" 1027 printk(KERN_WARNING "VFS (%s): Writes happened after quota"
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 65444d29406b..f1ab3604db5a 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -112,7 +112,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
112 if (!info->dqi_priv) { 112 if (!info->dqi_priv) {
113 printk(KERN_WARNING 113 printk(KERN_WARNING
114 "Not enough memory for quota information structure.\n"); 114 "Not enough memory for quota information structure.\n");
115 return -1; 115 return -ENOMEM;
116 } 116 }
117 qinfo = info->dqi_priv; 117 qinfo = info->dqi_priv;
118 if (version == 0) { 118 if (version == 0) {
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 9eead2c796b7..fbb0b478a346 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -112,6 +112,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
112 SetPageDirty(page); 112 SetPageDirty(page);
113 113
114 unlock_page(page); 114 unlock_page(page);
115 put_page(page);
115 } 116 }
116 117
117 return 0; 118 return 0;
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 792b3cb2cd18..3c3b00165114 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -31,9 +31,7 @@ endif
31# and causing a panic. Since this behavior only affects ppc32, this ifeq 31# and causing a panic. Since this behavior only affects ppc32, this ifeq
32# will work around it. If any other architecture displays this behavior, 32# will work around it. If any other architecture displays this behavior,
33# add it here. 33# add it here.
34ifeq ($(CONFIG_PPC32),y) 34ccflags-$(CONFIG_PPC32) := $(call cc-ifversion, -lt, 0400, -O1)
35EXTRA_CFLAGS := $(call cc-ifversion, -lt, 0400, -O1)
36endif
37 35
38TAGS: 36TAGS:
39 etags *.c 37 etags *.c
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0bae036831e2..4fd5bb33dbb5 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1593,8 +1593,13 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1593 struct inode *inode = dentry->d_inode; 1593 struct inode *inode = dentry->d_inode;
1594 int maxlen = *lenp; 1594 int maxlen = *lenp;
1595 1595
1596 if (maxlen < 3) 1596 if (need_parent && (maxlen < 5)) {
1597 *lenp = 5;
1597 return 255; 1598 return 255;
1599 } else if (maxlen < 3) {
1600 *lenp = 3;
1601 return 255;
1602 }
1598 1603
1599 data[0] = inode->i_ino; 1604 data[0] = inode->i_ino;
1600 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); 1605 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
@@ -3212,7 +3217,6 @@ const struct address_space_operations reiserfs_address_space_operations = {
3212 .readpages = reiserfs_readpages, 3217 .readpages = reiserfs_readpages,
3213 .releasepage = reiserfs_releasepage, 3218 .releasepage = reiserfs_releasepage,
3214 .invalidatepage = reiserfs_invalidatepage, 3219 .invalidatepage = reiserfs_invalidatepage,
3215 .sync_page = block_sync_page,
3216 .write_begin = reiserfs_write_begin, 3220 .write_begin = reiserfs_write_begin,
3217 .write_end = reiserfs_write_end, 3221 .write_end = reiserfs_write_end,
3218 .bmap = reiserfs_aop_bmap, 3222 .bmap = reiserfs_aop_bmap,
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 79265fdc317a..4e153051bc75 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -59,7 +59,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
59 if (err) 59 if (err)
60 break; 60 break;
61 61
62 if (!is_owner_or_cap(inode)) { 62 if (!inode_owner_or_capable(inode)) {
63 err = -EPERM; 63 err = -EPERM;
64 goto setflags_out; 64 goto setflags_out;
65 } 65 }
@@ -103,7 +103,7 @@ setflags_out:
103 err = put_user(inode->i_generation, (int __user *)arg); 103 err = put_user(inode->i_generation, (int __user *)arg);
104 break; 104 break;
105 case REISERFS_IOC_SETVERSION: 105 case REISERFS_IOC_SETVERSION:
106 if (!is_owner_or_cap(inode)) { 106 if (!inode_owner_or_capable(inode)) {
107 err = -EPERM; 107 err = -EPERM;
108 break; 108 break;
109 } 109 }
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 3eea859e6990..c5e82ece7c6c 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1,7 +1,7 @@
1/* 1/*
2** Write ahead logging implementation copyright Chris Mason 2000 2** Write ahead logging implementation copyright Chris Mason 2000
3** 3**
4** The background commits make this code very interelated, and 4** The background commits make this code very interrelated, and
5** overly complex. I need to rethink things a bit....The major players: 5** overly complex. I need to rethink things a bit....The major players:
6** 6**
7** journal_begin -- call with the number of blocks you expect to log. 7** journal_begin -- call with the number of blocks you expect to log.
@@ -2725,7 +2725,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2725 REISERFS_DISK_OFFSET_IN_BYTES / 2725 REISERFS_DISK_OFFSET_IN_BYTES /
2726 sb->s_blocksize + 2); 2726 sb->s_blocksize + 2);
2727 2727
2728 /* Sanity check to see is the standard journal fitting withing first bitmap 2728 /* Sanity check to see is the standard journal fitting within first bitmap
2729 (actual for small blocksizes) */ 2729 (actual for small blocksizes) */
2730 if (!SB_ONDISK_JOURNAL_DEVICE(sb) && 2730 if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
2731 (SB_JOURNAL_1st_RESERVED_BLOCK(sb) + 2731 (SB_JOURNAL_1st_RESERVED_BLOCK(sb) +
@@ -2876,7 +2876,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2876 reiserfs_mounted_fs_count++; 2876 reiserfs_mounted_fs_count++;
2877 if (reiserfs_mounted_fs_count <= 1) { 2877 if (reiserfs_mounted_fs_count <= 1) {
2878 reiserfs_write_unlock(sb); 2878 reiserfs_write_unlock(sb);
2879 commit_wq = create_workqueue("reiserfs"); 2879 commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0);
2880 reiserfs_write_lock(sb); 2880 reiserfs_write_lock(sb);
2881 } 2881 }
2882 2882
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
index b87aa2c1afc1..7df1ce48203a 100644
--- a/fs/reiserfs/lock.c
+++ b/fs/reiserfs/lock.c
@@ -15,7 +15,7 @@
15 * for this mutex, no need for a system wide mutex facility. 15 * for this mutex, no need for a system wide mutex facility.
16 * 16 *
17 * Also this lock is often released before a call that could block because 17 * Also this lock is often released before a call that could block because
18 * reiserfs performances were partialy based on the release while schedule() 18 * reiserfs performances were partially based on the release while schedule()
19 * property of the Bkl. 19 * property of the Bkl.
20 */ 20 */
21void reiserfs_write_lock(struct super_block *s) 21void reiserfs_write_lock(struct super_block *s)
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ba5f51ec3458..118662690cdf 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -593,7 +593,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
593 new_inode_init(inode, dir, mode); 593 new_inode_init(inode, dir, mode);
594 594
595 jbegin_count += reiserfs_cache_default_acl(dir); 595 jbegin_count += reiserfs_cache_default_acl(dir);
596 retval = reiserfs_security_init(dir, inode, &security); 596 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
597 if (retval < 0) { 597 if (retval < 0) {
598 drop_new_inode(inode); 598 drop_new_inode(inode);
599 return retval; 599 return retval;
@@ -667,7 +667,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
667 new_inode_init(inode, dir, mode); 667 new_inode_init(inode, dir, mode);
668 668
669 jbegin_count += reiserfs_cache_default_acl(dir); 669 jbegin_count += reiserfs_cache_default_acl(dir);
670 retval = reiserfs_security_init(dir, inode, &security); 670 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
671 if (retval < 0) { 671 if (retval < 0) {
672 drop_new_inode(inode); 672 drop_new_inode(inode);
673 return retval; 673 return retval;
@@ -747,7 +747,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
747 new_inode_init(inode, dir, mode); 747 new_inode_init(inode, dir, mode);
748 748
749 jbegin_count += reiserfs_cache_default_acl(dir); 749 jbegin_count += reiserfs_cache_default_acl(dir);
750 retval = reiserfs_security_init(dir, inode, &security); 750 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
751 if (retval < 0) { 751 if (retval < 0) {
752 drop_new_inode(inode); 752 drop_new_inode(inode);
753 return retval; 753 return retval;
@@ -771,7 +771,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
771 EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, 771 EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
772 dentry, inode, &security); 772 dentry, inode, &security);
773 if (retval) { 773 if (retval) {
774 dir->i_nlink--; 774 DEC_DIR_INODE_NLINK(dir)
775 goto out_failed; 775 goto out_failed;
776 } 776 }
777 777
@@ -1032,7 +1032,8 @@ static int reiserfs_symlink(struct inode *parent_dir,
1032 } 1032 }
1033 new_inode_init(inode, parent_dir, mode); 1033 new_inode_init(inode, parent_dir, mode);
1034 1034
1035 retval = reiserfs_security_init(parent_dir, inode, &security); 1035 retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name,
1036 &security);
1036 if (retval < 0) { 1037 if (retval < 0) {
1037 drop_new_inode(inode); 1038 drop_new_inode(inode);
1038 return retval; 1039 return retval;
@@ -1122,10 +1123,6 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1122 reiserfs_write_unlock(dir->i_sb); 1123 reiserfs_write_unlock(dir->i_sb);
1123 return -EMLINK; 1124 return -EMLINK;
1124 } 1125 }
1125 if (inode->i_nlink == 0) {
1126 reiserfs_write_unlock(dir->i_sb);
1127 return -ENOENT;
1128 }
1129 1126
1130 /* inc before scheduling so reiserfs_unlink knows we are here */ 1127 /* inc before scheduling so reiserfs_unlink knows we are here */
1131 inc_nlink(inode); 1128 inc_nlink(inode);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 0aab04f46827..b216ff6be1c9 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -393,7 +393,7 @@ void add_save_link(struct reiserfs_transaction_handle *th,
393 /* body of "save" link */ 393 /* body of "save" link */
394 link = INODE_PKEY(inode)->k_dir_id; 394 link = INODE_PKEY(inode)->k_dir_id;
395 395
396 /* put "save" link inot tree, don't charge quota to anyone */ 396 /* put "save" link into tree, don't charge quota to anyone */
397 retval = 397 retval =
398 reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link); 398 reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link);
399 if (retval) { 399 if (retval) {
@@ -2104,7 +2104,7 @@ out:
2104 2104
2105/* Read data from quotafile - avoid pagecache and such because we cannot afford 2105/* Read data from quotafile - avoid pagecache and such because we cannot afford
2106 * acquiring the locks... As quota files are never truncated and quota code 2106 * acquiring the locks... As quota files are never truncated and quota code
2107 * itself serializes the operations (and noone else should touch the files) 2107 * itself serializes the operations (and no one else should touch the files)
2108 * we don't have to be afraid of races */ 2108 * we don't have to be afraid of races */
2109static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, 2109static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
2110 size_t len, loff_t off) 2110 size_t len, loff_t off)
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 3cfb2e933644..47d2a4498b03 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -396,7 +396,7 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n)
396 struct address_space *mapping = dir->i_mapping; 396 struct address_space *mapping = dir->i_mapping;
397 struct page *page; 397 struct page *page;
398 /* We can deadlock if we try to free dentries, 398 /* We can deadlock if we try to free dentries,
399 and an unlink/rmdir has just occured - GFP_NOFS avoids this */ 399 and an unlink/rmdir has just occurred - GFP_NOFS avoids this */
400 mapping_set_gfp_mask(mapping, GFP_NOFS); 400 mapping_set_gfp_mask(mapping, GFP_NOFS);
401 page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL); 401 page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL);
402 if (!IS_ERR(page)) { 402 if (!IS_ERR(page)) {
@@ -978,8 +978,6 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
978 978
979static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) 979static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
980{ 980{
981 if (nd->flags & LOOKUP_RCU)
982 return -ECHILD;
983 return -EPERM; 981 return -EPERM;
984} 982}
985 983
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 90d2fcb67a31..3dc38f1206fc 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -26,7 +26,7 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value,
26 size_t jcreate_blocks; 26 size_t jcreate_blocks;
27 if (!reiserfs_posixacl(inode->i_sb)) 27 if (!reiserfs_posixacl(inode->i_sb))
28 return -EOPNOTSUPP; 28 return -EOPNOTSUPP;
29 if (!is_owner_or_cap(inode)) 29 if (!inode_owner_or_capable(inode))
30 return -EPERM; 30 return -EPERM;
31 31
32 if (value) { 32 if (value) {
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 237c6928d3c6..ef66c18a9332 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -54,6 +54,7 @@ static size_t security_list(struct dentry *dentry, char *list, size_t list_len,
54 * of blocks needed for the transaction. If successful, reiserfs_security 54 * of blocks needed for the transaction. If successful, reiserfs_security
55 * must be released using reiserfs_security_free when the caller is done. */ 55 * must be released using reiserfs_security_free when the caller is done. */
56int reiserfs_security_init(struct inode *dir, struct inode *inode, 56int reiserfs_security_init(struct inode *dir, struct inode *inode,
57 const struct qstr *qstr,
57 struct reiserfs_security_handle *sec) 58 struct reiserfs_security_handle *sec)
58{ 59{
59 int blocks = 0; 60 int blocks = 0;
@@ -65,7 +66,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
65 if (IS_PRIVATE(dir)) 66 if (IS_PRIVATE(dir))
66 return 0; 67 return 0;
67 68
68 error = security_inode_init_security(inode, dir, &sec->name, 69 error = security_inode_init_security(inode, dir, qstr, &sec->name,
69 &sec->value, &sec->length); 70 &sec->value, &sec->length);
70 if (error) { 71 if (error) {
71 if (error == -EOPNOTSUPP) 72 if (error == -EOPNOTSUPP)
diff --git a/fs/select.c b/fs/select.c
index e56560d2b08a..d33418fdc858 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -517,9 +517,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
517 * Update: ERESTARTSYS breaks at least the xview clock binary, so 517 * Update: ERESTARTSYS breaks at least the xview clock binary, so
518 * I'm trying ERESTARTNOHAND which restart only when you want to. 518 * I'm trying ERESTARTNOHAND which restart only when you want to.
519 */ 519 */
520#define MAX_SELECT_SECONDS \
521 ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
522
523int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, 520int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
524 fd_set __user *exp, struct timespec *end_time) 521 fd_set __user *exp, struct timespec *end_time)
525{ 522{
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index aa68a8a31518..efc309fa3035 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -5,12 +5,12 @@ config SQUASHFS
5 help 5 help
6 Saying Y here includes support for SquashFS 4.0 (a Compressed 6 Saying Y here includes support for SquashFS 4.0 (a Compressed
7 Read-Only File System). Squashfs is a highly compressed read-only 7 Read-Only File System). Squashfs is a highly compressed read-only
8 filesystem for Linux. It uses zlib/lzo compression to compress both 8 filesystem for Linux. It uses zlib, lzo or xz compression to
9 files, inodes and directories. Inodes in the system are very small 9 compress both files, inodes and directories. Inodes in the system
10 and all blocks are packed to minimise data overhead. Block sizes 10 are very small and all blocks are packed to minimise data overhead.
11 greater than 4K are supported up to a maximum of 1 Mbytes (default 11 Block sizes greater than 4K are supported up to a maximum of 1 Mbytes
12 block size 128K). SquashFS 4.0 supports 64 bit filesystems and files 12 (default block size 128K). SquashFS 4.0 supports 64 bit filesystems
13 (larger than 4GB), full uid/gid information, hard links and 13 and files (larger than 4GB), full uid/gid information, hard links and
14 timestamps. 14 timestamps.
15 15
16 Squashfs is intended for general read-only filesystem use, for 16 Squashfs is intended for general read-only filesystem use, for
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 26b15ae34d6f..c37b520132ff 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -104,7 +104,7 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
104 entry = &cache->entry[i]; 104 entry = &cache->entry[i];
105 105
106 /* 106 /*
107 * Initialise choosen cache entry, and fill it in from 107 * Initialise chosen cache entry, and fill it in from
108 * disk. 108 * disk.
109 */ 109 */
110 cache->unused--; 110 cache->unused--;
@@ -286,7 +286,7 @@ cleanup:
286 286
287 287
288/* 288/*
289 * Copy upto length bytes from cache entry to buffer starting at offset bytes 289 * Copy up to length bytes from cache entry to buffer starting at offset bytes
290 * into the cache entry. If there's not length bytes then copy the number of 290 * into the cache entry. If there's not length bytes then copy the number of
291 * bytes available. In all cases return the number of bytes copied. 291 * bytes available. In all cases return the number of bytes copied.
292 */ 292 */
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index a5940e54c4dd..e921bd213738 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -23,6 +23,7 @@
23 23
24#include <linux/types.h> 24#include <linux/types.h>
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/slab.h>
26#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
27 28
28#include "squashfs_fs.h" 29#include "squashfs_fs.h"
@@ -74,3 +75,36 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
74 75
75 return decompressor[i]; 76 return decompressor[i];
76} 77}
78
79
80void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags)
81{
82 struct squashfs_sb_info *msblk = sb->s_fs_info;
83 void *strm, *buffer = NULL;
84 int length = 0;
85
86 /*
87 * Read decompressor specific options from file system if present
88 */
89 if (SQUASHFS_COMP_OPTS(flags)) {
90 buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
91 if (buffer == NULL)
92 return ERR_PTR(-ENOMEM);
93
94 length = squashfs_read_data(sb, &buffer,
95 sizeof(struct squashfs_super_block), 0, NULL,
96 PAGE_CACHE_SIZE, 1);
97
98 if (length < 0) {
99 strm = ERR_PTR(length);
100 goto finished;
101 }
102 }
103
104 strm = msblk->decompressor->init(msblk, buffer, length);
105
106finished:
107 kfree(buffer);
108
109 return strm;
110}
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 3b305a70f7aa..099745ad5691 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -24,7 +24,7 @@
24 */ 24 */
25 25
26struct squashfs_decompressor { 26struct squashfs_decompressor {
27 void *(*init)(struct squashfs_sb_info *); 27 void *(*init)(struct squashfs_sb_info *, void *, int);
28 void (*free)(void *); 28 void (*free)(void *);
29 int (*decompress)(struct squashfs_sb_info *, void **, 29 int (*decompress)(struct squashfs_sb_info *, void **,
30 struct buffer_head **, int, int, int, int, int); 30 struct buffer_head **, int, int, int, int, int);
@@ -33,11 +33,6 @@ struct squashfs_decompressor {
33 int supported; 33 int supported;
34}; 34};
35 35
36static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk)
37{
38 return msblk->decompressor->init(msblk);
39}
40
41static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk, 36static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
42 void *s) 37 void *s)
43{ 38{
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 0dc340aa2be9..3f79cd1d0c19 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -172,6 +172,11 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
172 length += sizeof(dirh); 172 length += sizeof(dirh);
173 173
174 dir_count = le32_to_cpu(dirh.count) + 1; 174 dir_count = le32_to_cpu(dirh.count) + 1;
175
176 /* dir_count should never be larger than 256 */
177 if (dir_count > 256)
178 goto failed_read;
179
175 while (dir_count--) { 180 while (dir_count--) {
176 /* 181 /*
177 * Read directory entry. 182 * Read directory entry.
@@ -183,6 +188,10 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
183 188
184 size = le16_to_cpu(dire->size) + 1; 189 size = le16_to_cpu(dire->size) + 1;
185 190
191 /* size should never be larger than SQUASHFS_NAME_LEN */
192 if (size > SQUASHFS_NAME_LEN)
193 goto failed_read;
194
186 err = squashfs_read_metadata(inode->i_sb, dire->name, 195 err = squashfs_read_metadata(inode->i_sb, dire->name,
187 &block, &offset, size); 196 &block, &offset, size);
188 if (err < 0) 197 if (err < 0)
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 7da759e34c52..00f4dfc5f088 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -37,7 +37,7 @@ struct squashfs_lzo {
37 void *output; 37 void *output;
38}; 38};
39 39
40static void *lzo_init(struct squashfs_sb_info *msblk) 40static void *lzo_init(struct squashfs_sb_info *msblk, void *buff, int len)
41{ 41{
42 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); 42 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
43 43
@@ -58,7 +58,7 @@ failed2:
58failed: 58failed:
59 ERROR("Failed to allocate lzo workspace\n"); 59 ERROR("Failed to allocate lzo workspace\n");
60 kfree(stream); 60 kfree(stream);
61 return NULL; 61 return ERR_PTR(-ENOMEM);
62} 62}
63 63
64 64
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 7a9464d08cf6..5d922a6701ab 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -176,6 +176,11 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
176 length += sizeof(dirh); 176 length += sizeof(dirh);
177 177
178 dir_count = le32_to_cpu(dirh.count) + 1; 178 dir_count = le32_to_cpu(dirh.count) + 1;
179
180 /* dir_count should never be larger than 256 */
181 if (dir_count > 256)
182 goto data_error;
183
179 while (dir_count--) { 184 while (dir_count--) {
180 /* 185 /*
181 * Read directory entry. 186 * Read directory entry.
@@ -187,6 +192,10 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
187 192
188 size = le16_to_cpu(dire->size) + 1; 193 size = le16_to_cpu(dire->size) + 1;
189 194
195 /* size should never be larger than SQUASHFS_NAME_LEN */
196 if (size > SQUASHFS_NAME_LEN)
197 goto data_error;
198
190 err = squashfs_read_metadata(dir->i_sb, dire->name, 199 err = squashfs_read_metadata(dir->i_sb, dire->name,
191 &block, &offset, size); 200 &block, &offset, size);
192 if (err < 0) 201 if (err < 0)
@@ -228,6 +237,9 @@ exit_lookup:
228 d_add(dentry, inode); 237 d_add(dentry, inode);
229 return ERR_PTR(0); 238 return ERR_PTR(0);
230 239
240data_error:
241 err = -EIO;
242
231read_failure: 243read_failure:
232 ERROR("Unable to read directory block [%llx:%x]\n", 244 ERROR("Unable to read directory block [%llx:%x]\n",
233 squashfs_i(dir)->start + msblk->directory_table, 245 squashfs_i(dir)->start + msblk->directory_table,
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index ba729d808876..1f2e608b8785 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -48,6 +48,7 @@ extern int squashfs_read_table(struct super_block *, void *, u64, int);
48 48
49/* decompressor.c */ 49/* decompressor.c */
50extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); 50extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
51extern void *squashfs_decompressor_init(struct super_block *, unsigned short);
51 52
52/* export.c */ 53/* export.c */
53extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, 54extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 39533feffd6d..4582c568ef4d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -57,6 +57,7 @@
57#define SQUASHFS_ALWAYS_FRAG 5 57#define SQUASHFS_ALWAYS_FRAG 5
58#define SQUASHFS_DUPLICATE 6 58#define SQUASHFS_DUPLICATE 6
59#define SQUASHFS_EXPORT 7 59#define SQUASHFS_EXPORT 7
60#define SQUASHFS_COMP_OPT 10
60 61
61#define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1) 62#define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1)
62 63
@@ -81,6 +82,9 @@
81#define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \ 82#define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \
82 SQUASHFS_EXPORT) 83 SQUASHFS_EXPORT)
83 84
85#define SQUASHFS_COMP_OPTS(flags) SQUASHFS_BIT(flags, \
86 SQUASHFS_COMP_OPT)
87
84/* Max number of types and file types */ 88/* Max number of types and file types */
85#define SQUASHFS_DIR_TYPE 1 89#define SQUASHFS_DIR_TYPE 1
86#define SQUASHFS_REG_TYPE 2 90#define SQUASHFS_REG_TYPE 2
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 20700b9f2b4c..5c8184c061a4 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -199,10 +199,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
199 199
200 err = -ENOMEM; 200 err = -ENOMEM;
201 201
202 msblk->stream = squashfs_decompressor_init(msblk);
203 if (msblk->stream == NULL)
204 goto failed_mount;
205
206 msblk->block_cache = squashfs_cache_init("metadata", 202 msblk->block_cache = squashfs_cache_init("metadata",
207 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); 203 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
208 if (msblk->block_cache == NULL) 204 if (msblk->block_cache == NULL)
@@ -215,6 +211,13 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
215 goto failed_mount; 211 goto failed_mount;
216 } 212 }
217 213
214 msblk->stream = squashfs_decompressor_init(sb, flags);
215 if (IS_ERR(msblk->stream)) {
216 err = PTR_ERR(msblk->stream);
217 msblk->stream = NULL;
218 goto failed_mount;
219 }
220
218 /* Allocate and read id index table */ 221 /* Allocate and read id index table */
219 msblk->id_table = squashfs_read_id_index_table(sb, 222 msblk->id_table = squashfs_read_id_index_table(sb,
220 le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids)); 223 le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids));
@@ -370,8 +373,8 @@ static void squashfs_put_super(struct super_block *sb)
370} 373}
371 374
372 375
373static struct dentry *squashfs_mount(struct file_system_type *fs_type, int flags, 376static struct dentry *squashfs_mount(struct file_system_type *fs_type,
374 const char *dev_name, void *data) 377 int flags, const char *dev_name, void *data)
375{ 378{
376 return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super); 379 return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super);
377} 380}
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index c4eb40018256..aa47a286d1f8 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -26,10 +26,10 @@
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/xz.h> 28#include <linux/xz.h>
29#include <linux/bitops.h>
29 30
30#include "squashfs_fs.h" 31#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 32#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h" 33#include "squashfs.h"
34#include "decompressor.h" 34#include "decompressor.h"
35 35
@@ -38,24 +38,57 @@ struct squashfs_xz {
38 struct xz_buf buf; 38 struct xz_buf buf;
39}; 39};
40 40
41static void *squashfs_xz_init(struct squashfs_sb_info *msblk) 41struct comp_opts {
42 __le32 dictionary_size;
43 __le32 flags;
44};
45
46static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff,
47 int len)
42{ 48{
43 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); 49 struct comp_opts *comp_opts = buff;
50 struct squashfs_xz *stream;
51 int dict_size = msblk->block_size;
52 int err, n;
53
54 if (comp_opts) {
55 /* check compressor options are the expected length */
56 if (len < sizeof(*comp_opts)) {
57 err = -EIO;
58 goto failed;
59 }
44 60
45 struct squashfs_xz *stream = kmalloc(sizeof(*stream), GFP_KERNEL); 61 dict_size = le32_to_cpu(comp_opts->dictionary_size);
46 if (stream == NULL) 62
63 /* the dictionary size should be 2^n or 2^n+2^(n+1) */
64 n = ffs(dict_size) - 1;
65 if (dict_size != (1 << n) && dict_size != (1 << n) +
66 (1 << (n + 1))) {
67 err = -EIO;
68 goto failed;
69 }
70 }
71
72 dict_size = max_t(int, dict_size, SQUASHFS_METADATA_SIZE);
73
74 stream = kmalloc(sizeof(*stream), GFP_KERNEL);
75 if (stream == NULL) {
76 err = -ENOMEM;
47 goto failed; 77 goto failed;
78 }
48 79
49 stream->state = xz_dec_init(XZ_PREALLOC, block_size); 80 stream->state = xz_dec_init(XZ_PREALLOC, dict_size);
50 if (stream->state == NULL) 81 if (stream->state == NULL) {
82 kfree(stream);
83 err = -ENOMEM;
51 goto failed; 84 goto failed;
85 }
52 86
53 return stream; 87 return stream;
54 88
55failed: 89failed:
56 ERROR("Failed to allocate xz workspace\n"); 90 ERROR("Failed to initialise xz decompressor\n");
57 kfree(stream); 91 return ERR_PTR(err);
58 return NULL;
59} 92}
60 93
61 94
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 4661ae2b1cec..517688b32ffa 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -26,19 +26,19 @@
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/zlib.h> 28#include <linux/zlib.h>
29#include <linux/vmalloc.h>
29 30
30#include "squashfs_fs.h" 31#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 32#include "squashfs_fs_sb.h"
32#include "squashfs.h" 33#include "squashfs.h"
33#include "decompressor.h" 34#include "decompressor.h"
34 35
35static void *zlib_init(struct squashfs_sb_info *dummy) 36static void *zlib_init(struct squashfs_sb_info *dummy, void *buff, int len)
36{ 37{
37 z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL); 38 z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
38 if (stream == NULL) 39 if (stream == NULL)
39 goto failed; 40 goto failed;
40 stream->workspace = kmalloc(zlib_inflate_workspacesize(), 41 stream->workspace = vmalloc(zlib_inflate_workspacesize());
41 GFP_KERNEL);
42 if (stream->workspace == NULL) 42 if (stream->workspace == NULL)
43 goto failed; 43 goto failed;
44 44
@@ -47,7 +47,7 @@ static void *zlib_init(struct squashfs_sb_info *dummy)
47failed: 47failed:
48 ERROR("Failed to allocate zlib workspace\n"); 48 ERROR("Failed to allocate zlib workspace\n");
49 kfree(stream); 49 kfree(stream);
50 return NULL; 50 return ERR_PTR(-ENOMEM);
51} 51}
52 52
53 53
@@ -56,7 +56,7 @@ static void zlib_free(void *strm)
56 z_stream *stream = strm; 56 z_stream *stream = strm;
57 57
58 if (stream) 58 if (stream)
59 kfree(stream->workspace); 59 vfree(stream->workspace);
60 kfree(stream); 60 kfree(stream);
61} 61}
62 62
diff --git a/fs/stat.c b/fs/stat.c
index d5c61cf2b703..961039121cb8 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,13 +75,16 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
75 int error = -EINVAL; 75 int error = -EINVAL;
76 int lookup_flags = 0; 76 int lookup_flags = 0;
77 77
78 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0) 78 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
79 AT_EMPTY_PATH)) != 0)
79 goto out; 80 goto out;
80 81
81 if (!(flag & AT_SYMLINK_NOFOLLOW)) 82 if (!(flag & AT_SYMLINK_NOFOLLOW))
82 lookup_flags |= LOOKUP_FOLLOW; 83 lookup_flags |= LOOKUP_FOLLOW;
83 if (flag & AT_NO_AUTOMOUNT) 84 if (flag & AT_NO_AUTOMOUNT)
84 lookup_flags |= LOOKUP_NO_AUTOMOUNT; 85 lookup_flags |= LOOKUP_NO_AUTOMOUNT;
86 if (flag & AT_EMPTY_PATH)
87 lookup_flags |= LOOKUP_EMPTY;
85 88
86 error = user_path_at(dfd, filename, lookup_flags, &path); 89 error = user_path_at(dfd, filename, lookup_flags, &path);
87 if (error) 90 if (error)
@@ -297,7 +300,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
297 if (bufsiz <= 0) 300 if (bufsiz <= 0)
298 return -EINVAL; 301 return -EINVAL;
299 302
300 error = user_path_at(dfd, pathname, 0, &path); 303 error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path);
301 if (!error) { 304 if (!error) {
302 struct inode *inode = path.dentry->d_inode; 305 struct inode *inode = path.dentry->d_inode;
303 306
diff --git a/fs/statfs.c b/fs/statfs.c
index 30ea8c8a996b..8244924dec55 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -73,149 +73,135 @@ int vfs_statfs(struct path *path, struct kstatfs *buf)
73} 73}
74EXPORT_SYMBOL(vfs_statfs); 74EXPORT_SYMBOL(vfs_statfs);
75 75
76static int do_statfs_native(struct path *path, struct statfs *buf) 76int user_statfs(const char __user *pathname, struct kstatfs *st)
77{ 77{
78 struct kstatfs st; 78 struct path path;
79 int retval; 79 int error = user_path(pathname, &path);
80 if (!error) {
81 error = vfs_statfs(&path, st);
82 path_put(&path);
83 }
84 return error;
85}
80 86
81 retval = vfs_statfs(path, &st); 87int fd_statfs(int fd, struct kstatfs *st)
82 if (retval) 88{
83 return retval; 89 struct file *file = fget(fd);
90 int error = -EBADF;
91 if (file) {
92 error = vfs_statfs(&file->f_path, st);
93 fput(file);
94 }
95 return error;
96}
84 97
85 if (sizeof(*buf) == sizeof(st)) 98static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
86 memcpy(buf, &st, sizeof(st)); 99{
100 struct statfs buf;
101
102 if (sizeof(buf) == sizeof(*st))
103 memcpy(&buf, st, sizeof(*st));
87 else { 104 else {
88 if (sizeof buf->f_blocks == 4) { 105 if (sizeof buf.f_blocks == 4) {
89 if ((st.f_blocks | st.f_bfree | st.f_bavail | 106 if ((st->f_blocks | st->f_bfree | st->f_bavail |
90 st.f_bsize | st.f_frsize) & 107 st->f_bsize | st->f_frsize) &
91 0xffffffff00000000ULL) 108 0xffffffff00000000ULL)
92 return -EOVERFLOW; 109 return -EOVERFLOW;
93 /* 110 /*
94 * f_files and f_ffree may be -1; it's okay to stuff 111 * f_files and f_ffree may be -1; it's okay to stuff
95 * that into 32 bits 112 * that into 32 bits
96 */ 113 */
97 if (st.f_files != -1 && 114 if (st->f_files != -1 &&
98 (st.f_files & 0xffffffff00000000ULL)) 115 (st->f_files & 0xffffffff00000000ULL))
99 return -EOVERFLOW; 116 return -EOVERFLOW;
100 if (st.f_ffree != -1 && 117 if (st->f_ffree != -1 &&
101 (st.f_ffree & 0xffffffff00000000ULL)) 118 (st->f_ffree & 0xffffffff00000000ULL))
102 return -EOVERFLOW; 119 return -EOVERFLOW;
103 } 120 }
104 121
105 buf->f_type = st.f_type; 122 buf.f_type = st->f_type;
106 buf->f_bsize = st.f_bsize; 123 buf.f_bsize = st->f_bsize;
107 buf->f_blocks = st.f_blocks; 124 buf.f_blocks = st->f_blocks;
108 buf->f_bfree = st.f_bfree; 125 buf.f_bfree = st->f_bfree;
109 buf->f_bavail = st.f_bavail; 126 buf.f_bavail = st->f_bavail;
110 buf->f_files = st.f_files; 127 buf.f_files = st->f_files;
111 buf->f_ffree = st.f_ffree; 128 buf.f_ffree = st->f_ffree;
112 buf->f_fsid = st.f_fsid; 129 buf.f_fsid = st->f_fsid;
113 buf->f_namelen = st.f_namelen; 130 buf.f_namelen = st->f_namelen;
114 buf->f_frsize = st.f_frsize; 131 buf.f_frsize = st->f_frsize;
115 buf->f_flags = st.f_flags; 132 buf.f_flags = st->f_flags;
116 memset(buf->f_spare, 0, sizeof(buf->f_spare)); 133 memset(buf.f_spare, 0, sizeof(buf.f_spare));
117 } 134 }
135 if (copy_to_user(p, &buf, sizeof(buf)))
136 return -EFAULT;
118 return 0; 137 return 0;
119} 138}
120 139
121static int do_statfs64(struct path *path, struct statfs64 *buf) 140static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
122{ 141{
123 struct kstatfs st; 142 struct statfs64 buf;
124 int retval; 143 if (sizeof(buf) == sizeof(*st))
125 144 memcpy(&buf, st, sizeof(*st));
126 retval = vfs_statfs(path, &st);
127 if (retval)
128 return retval;
129
130 if (sizeof(*buf) == sizeof(st))
131 memcpy(buf, &st, sizeof(st));
132 else { 145 else {
133 buf->f_type = st.f_type; 146 buf.f_type = st->f_type;
134 buf->f_bsize = st.f_bsize; 147 buf.f_bsize = st->f_bsize;
135 buf->f_blocks = st.f_blocks; 148 buf.f_blocks = st->f_blocks;
136 buf->f_bfree = st.f_bfree; 149 buf.f_bfree = st->f_bfree;
137 buf->f_bavail = st.f_bavail; 150 buf.f_bavail = st->f_bavail;
138 buf->f_files = st.f_files; 151 buf.f_files = st->f_files;
139 buf->f_ffree = st.f_ffree; 152 buf.f_ffree = st->f_ffree;
140 buf->f_fsid = st.f_fsid; 153 buf.f_fsid = st->f_fsid;
141 buf->f_namelen = st.f_namelen; 154 buf.f_namelen = st->f_namelen;
142 buf->f_frsize = st.f_frsize; 155 buf.f_frsize = st->f_frsize;
143 buf->f_flags = st.f_flags; 156 buf.f_flags = st->f_flags;
144 memset(buf->f_spare, 0, sizeof(buf->f_spare)); 157 memset(buf.f_spare, 0, sizeof(buf.f_spare));
145 } 158 }
159 if (copy_to_user(p, &buf, sizeof(buf)))
160 return -EFAULT;
146 return 0; 161 return 0;
147} 162}
148 163
149SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) 164SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
150{ 165{
151 struct path path; 166 struct kstatfs st;
152 int error; 167 int error = user_statfs(pathname, &st);
153 168 if (!error)
154 error = user_path(pathname, &path); 169 error = do_statfs_native(&st, buf);
155 if (!error) {
156 struct statfs tmp;
157 error = do_statfs_native(&path, &tmp);
158 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
159 error = -EFAULT;
160 path_put(&path);
161 }
162 return error; 170 return error;
163} 171}
164 172
165SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) 173SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
166{ 174{
167 struct path path; 175 struct kstatfs st;
168 long error; 176 int error;
169
170 if (sz != sizeof(*buf)) 177 if (sz != sizeof(*buf))
171 return -EINVAL; 178 return -EINVAL;
172 error = user_path(pathname, &path); 179 error = user_statfs(pathname, &st);
173 if (!error) { 180 if (!error)
174 struct statfs64 tmp; 181 error = do_statfs64(&st, buf);
175 error = do_statfs64(&path, &tmp);
176 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
177 error = -EFAULT;
178 path_put(&path);
179 }
180 return error; 182 return error;
181} 183}
182 184
183SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) 185SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
184{ 186{
185 struct file *file; 187 struct kstatfs st;
186 struct statfs tmp; 188 int error = fd_statfs(fd, &st);
187 int error; 189 if (!error)
188 190 error = do_statfs_native(&st, buf);
189 error = -EBADF;
190 file = fget(fd);
191 if (!file)
192 goto out;
193 error = do_statfs_native(&file->f_path, &tmp);
194 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
195 error = -EFAULT;
196 fput(file);
197out:
198 return error; 191 return error;
199} 192}
200 193
201SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) 194SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
202{ 195{
203 struct file *file; 196 struct kstatfs st;
204 struct statfs64 tmp;
205 int error; 197 int error;
206 198
207 if (sz != sizeof(*buf)) 199 if (sz != sizeof(*buf))
208 return -EINVAL; 200 return -EINVAL;
209 201
210 error = -EBADF; 202 error = fd_statfs(fd, &st);
211 file = fget(fd); 203 if (!error)
212 if (!file) 204 error = do_statfs64(&st, buf);
213 goto out;
214 error = do_statfs64(&file->f_path, &tmp);
215 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
216 error = -EFAULT;
217 fput(file);
218out:
219 return error; 205 return error;
220} 206}
221 207
diff --git a/fs/super.c b/fs/super.c
index 74e149efed81..8a06881b1920 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -71,6 +71,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
71#else 71#else
72 INIT_LIST_HEAD(&s->s_files); 72 INIT_LIST_HEAD(&s->s_files);
73#endif 73#endif
74 s->s_bdi = &default_backing_dev_info;
74 INIT_LIST_HEAD(&s->s_instances); 75 INIT_LIST_HEAD(&s->s_instances);
75 INIT_HLIST_BL_HEAD(&s->s_anon); 76 INIT_HLIST_BL_HEAD(&s->s_anon);
76 INIT_LIST_HEAD(&s->s_inodes); 77 INIT_LIST_HEAD(&s->s_inodes);
@@ -177,6 +178,11 @@ void deactivate_locked_super(struct super_block *s)
177 struct file_system_type *fs = s->s_type; 178 struct file_system_type *fs = s->s_type;
178 if (atomic_dec_and_test(&s->s_active)) { 179 if (atomic_dec_and_test(&s->s_active)) {
179 fs->kill_sb(s); 180 fs->kill_sb(s);
181 /*
182 * We need to call rcu_barrier so all the delayed rcu free
183 * inodes are flushed before we release the fs module.
184 */
185 rcu_barrier();
180 put_filesystem(fs); 186 put_filesystem(fs);
181 put_super(s); 187 put_super(s);
182 } else { 188 } else {
@@ -838,23 +844,6 @@ error:
838} 844}
839EXPORT_SYMBOL(mount_bdev); 845EXPORT_SYMBOL(mount_bdev);
840 846
841int get_sb_bdev(struct file_system_type *fs_type,
842 int flags, const char *dev_name, void *data,
843 int (*fill_super)(struct super_block *, void *, int),
844 struct vfsmount *mnt)
845{
846 struct dentry *root;
847
848 root = mount_bdev(fs_type, flags, dev_name, data, fill_super);
849 if (IS_ERR(root))
850 return PTR_ERR(root);
851 mnt->mnt_root = root;
852 mnt->mnt_sb = root->d_sb;
853 return 0;
854}
855
856EXPORT_SYMBOL(get_sb_bdev);
857
858void kill_block_super(struct super_block *sb) 847void kill_block_super(struct super_block *sb)
859{ 848{
860 struct block_device *bdev = sb->s_bdev; 849 struct block_device *bdev = sb->s_bdev;
@@ -892,22 +881,6 @@ struct dentry *mount_nodev(struct file_system_type *fs_type,
892} 881}
893EXPORT_SYMBOL(mount_nodev); 882EXPORT_SYMBOL(mount_nodev);
894 883
895int get_sb_nodev(struct file_system_type *fs_type,
896 int flags, void *data,
897 int (*fill_super)(struct super_block *, void *, int),
898 struct vfsmount *mnt)
899{
900 struct dentry *root;
901
902 root = mount_nodev(fs_type, flags, data, fill_super);
903 if (IS_ERR(root))
904 return PTR_ERR(root);
905 mnt->mnt_root = root;
906 mnt->mnt_sb = root->d_sb;
907 return 0;
908}
909EXPORT_SYMBOL(get_sb_nodev);
910
911static int compare_single(struct super_block *s, void *p) 884static int compare_single(struct super_block *s, void *p)
912{ 885{
913 return 1; 886 return 1;
@@ -938,69 +911,36 @@ struct dentry *mount_single(struct file_system_type *fs_type,
938} 911}
939EXPORT_SYMBOL(mount_single); 912EXPORT_SYMBOL(mount_single);
940 913
941int get_sb_single(struct file_system_type *fs_type, 914struct dentry *
942 int flags, void *data, 915mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
943 int (*fill_super)(struct super_block *, void *, int),
944 struct vfsmount *mnt)
945{
946 struct dentry *root;
947 root = mount_single(fs_type, flags, data, fill_super);
948 if (IS_ERR(root))
949 return PTR_ERR(root);
950 mnt->mnt_root = root;
951 mnt->mnt_sb = root->d_sb;
952 return 0;
953}
954
955EXPORT_SYMBOL(get_sb_single);
956
957struct vfsmount *
958vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
959{ 916{
960 struct vfsmount *mnt;
961 struct dentry *root; 917 struct dentry *root;
918 struct super_block *sb;
962 char *secdata = NULL; 919 char *secdata = NULL;
963 int error; 920 int error = -ENOMEM;
964
965 if (!type)
966 return ERR_PTR(-ENODEV);
967
968 error = -ENOMEM;
969 mnt = alloc_vfsmnt(name);
970 if (!mnt)
971 goto out;
972
973 if (flags & MS_KERNMOUNT)
974 mnt->mnt_flags = MNT_INTERNAL;
975 921
976 if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { 922 if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
977 secdata = alloc_secdata(); 923 secdata = alloc_secdata();
978 if (!secdata) 924 if (!secdata)
979 goto out_mnt; 925 goto out;
980 926
981 error = security_sb_copy_data(data, secdata); 927 error = security_sb_copy_data(data, secdata);
982 if (error) 928 if (error)
983 goto out_free_secdata; 929 goto out_free_secdata;
984 } 930 }
985 931
986 if (type->mount) { 932 root = type->mount(type, flags, name, data);
987 root = type->mount(type, flags, name, data); 933 if (IS_ERR(root)) {
988 if (IS_ERR(root)) { 934 error = PTR_ERR(root);
989 error = PTR_ERR(root); 935 goto out_free_secdata;
990 goto out_free_secdata;
991 }
992 mnt->mnt_root = root;
993 mnt->mnt_sb = root->d_sb;
994 } else {
995 error = type->get_sb(type, flags, name, data, mnt);
996 if (error < 0)
997 goto out_free_secdata;
998 } 936 }
999 BUG_ON(!mnt->mnt_sb); 937 sb = root->d_sb;
1000 WARN_ON(!mnt->mnt_sb->s_bdi); 938 BUG_ON(!sb);
1001 mnt->mnt_sb->s_flags |= MS_BORN; 939 WARN_ON(!sb->s_bdi);
940 WARN_ON(sb->s_bdi == &default_backing_dev_info);
941 sb->s_flags |= MS_BORN;
1002 942
1003 error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); 943 error = security_sb_kern_mount(sb, flags, secdata);
1004 if (error) 944 if (error)
1005 goto out_sb; 945 goto out_sb;
1006 946
@@ -1011,27 +951,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
1011 * violate this rule. This warning should be either removed or 951 * violate this rule. This warning should be either removed or
1012 * converted to a BUG() in 2.6.34. 952 * converted to a BUG() in 2.6.34.
1013 */ 953 */
1014 WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " 954 WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
1015 "negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes); 955 "negative value (%lld)\n", type->name, sb->s_maxbytes);
1016 956
1017 mnt->mnt_mountpoint = mnt->mnt_root; 957 up_write(&sb->s_umount);
1018 mnt->mnt_parent = mnt;
1019 up_write(&mnt->mnt_sb->s_umount);
1020 free_secdata(secdata); 958 free_secdata(secdata);
1021 return mnt; 959 return root;
1022out_sb: 960out_sb:
1023 dput(mnt->mnt_root); 961 dput(root);
1024 deactivate_locked_super(mnt->mnt_sb); 962 deactivate_locked_super(sb);
1025out_free_secdata: 963out_free_secdata:
1026 free_secdata(secdata); 964 free_secdata(secdata);
1027out_mnt:
1028 free_vfsmnt(mnt);
1029out: 965out:
1030 return ERR_PTR(error); 966 return ERR_PTR(error);
1031} 967}
1032 968
1033EXPORT_SYMBOL_GPL(vfs_kern_mount);
1034
1035/** 969/**
1036 * freeze_super - lock the filesystem and force it into a consistent state 970 * freeze_super - lock the filesystem and force it into a consistent state
1037 * @sb: the super to lock 971 * @sb: the super to lock
@@ -1121,49 +1055,3 @@ out:
1121 return 0; 1055 return 0;
1122} 1056}
1123EXPORT_SYMBOL(thaw_super); 1057EXPORT_SYMBOL(thaw_super);
1124
1125static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
1126{
1127 int err;
1128 const char *subtype = strchr(fstype, '.');
1129 if (subtype) {
1130 subtype++;
1131 err = -EINVAL;
1132 if (!subtype[0])
1133 goto err;
1134 } else
1135 subtype = "";
1136
1137 mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
1138 err = -ENOMEM;
1139 if (!mnt->mnt_sb->s_subtype)
1140 goto err;
1141 return mnt;
1142
1143 err:
1144 mntput(mnt);
1145 return ERR_PTR(err);
1146}
1147
1148struct vfsmount *
1149do_kern_mount(const char *fstype, int flags, const char *name, void *data)
1150{
1151 struct file_system_type *type = get_fs_type(fstype);
1152 struct vfsmount *mnt;
1153 if (!type)
1154 return ERR_PTR(-ENODEV);
1155 mnt = vfs_kern_mount(type, flags, name, data);
1156 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
1157 !mnt->mnt_sb->s_subtype)
1158 mnt = fs_set_subtype(mnt, fstype);
1159 put_filesystem(type);
1160 return mnt;
1161}
1162EXPORT_SYMBOL_GPL(do_kern_mount);
1163
1164struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
1165{
1166 return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
1167}
1168
1169EXPORT_SYMBOL_GPL(kern_mount_data);
diff --git a/fs/sync.c b/fs/sync.c
index ba76b9623e7e..c38ec163da6c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -7,6 +7,7 @@
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/namei.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <linux/writeback.h> 12#include <linux/writeback.h>
12#include <linux/syscalls.h> 13#include <linux/syscalls.h>
@@ -33,7 +34,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
33 * This should be safe, as we require bdi backing to actually 34 * This should be safe, as we require bdi backing to actually
34 * write out data in the first place 35 * write out data in the first place
35 */ 36 */
36 if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info) 37 if (sb->s_bdi == &noop_backing_dev_info)
37 return 0; 38 return 0;
38 39
39 if (sb->s_qcop && sb->s_qcop->quota_sync) 40 if (sb->s_qcop && sb->s_qcop->quota_sync)
@@ -79,7 +80,7 @@ EXPORT_SYMBOL_GPL(sync_filesystem);
79 80
80static void sync_one_sb(struct super_block *sb, void *arg) 81static void sync_one_sb(struct super_block *sb, void *arg)
81{ 82{
82 if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi) 83 if (!(sb->s_flags & MS_RDONLY))
83 __sync_filesystem(sb, *(int *)arg); 84 __sync_filesystem(sb, *(int *)arg);
84} 85}
85/* 86/*
@@ -128,6 +129,29 @@ void emergency_sync(void)
128 } 129 }
129} 130}
130 131
132/*
133 * sync a single super
134 */
135SYSCALL_DEFINE1(syncfs, int, fd)
136{
137 struct file *file;
138 struct super_block *sb;
139 int ret;
140 int fput_needed;
141
142 file = fget_light(fd, &fput_needed);
143 if (!file)
144 return -EBADF;
145 sb = file->f_dentry->d_sb;
146
147 down_read(&sb->s_umount);
148 ret = sync_filesystem(sb);
149 up_read(&sb->s_umount);
150
151 fput_light(file, fput_needed);
152 return ret;
153}
154
131/** 155/**
132 * vfs_fsync_range - helper to sync a range of data & metadata to disk 156 * vfs_fsync_range - helper to sync a range of data & metadata to disk
133 * @file: file to sync 157 * @file: file to sync
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 9ca66276315e..fa8d43c92bb8 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -488,7 +488,6 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
488const struct address_space_operations sysv_aops = { 488const struct address_space_operations sysv_aops = {
489 .readpage = sysv_readpage, 489 .readpage = sysv_readpage,
490 .writepage = sysv_writepage, 490 .writepage = sysv_writepage,
491 .sync_page = block_sync_page,
492 .write_begin = sysv_write_begin, 491 .write_begin = sysv_write_begin,
493 .write_end = generic_write_end, 492 .write_end = generic_write_end,
494 .bmap = sysv_bmap 493 .bmap = sysv_bmap
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index b427b1208c26..e474fbcf8bde 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -245,7 +245,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
245 new_de = sysv_find_entry(new_dentry, &new_page); 245 new_de = sysv_find_entry(new_dentry, &new_page);
246 if (!new_de) 246 if (!new_de)
247 goto out_dir; 247 goto out_dir;
248 inode_inc_link_count(old_inode);
249 sysv_set_link(new_de, new_page, old_inode); 248 sysv_set_link(new_de, new_page, old_inode);
250 new_inode->i_ctime = CURRENT_TIME_SEC; 249 new_inode->i_ctime = CURRENT_TIME_SEC;
251 if (dir_de) 250 if (dir_de)
@@ -257,18 +256,15 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
257 if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max) 256 if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max)
258 goto out_dir; 257 goto out_dir;
259 } 258 }
260 inode_inc_link_count(old_inode);
261 err = sysv_add_link(new_dentry, old_inode); 259 err = sysv_add_link(new_dentry, old_inode);
262 if (err) { 260 if (err)
263 inode_dec_link_count(old_inode);
264 goto out_dir; 261 goto out_dir;
265 }
266 if (dir_de) 262 if (dir_de)
267 inode_inc_link_count(new_dir); 263 inode_inc_link_count(new_dir);
268 } 264 }
269 265
270 sysv_delete_entry(old_de, old_page); 266 sysv_delete_entry(old_de, old_page);
271 inode_dec_link_count(old_inode); 267 mark_inode_dirty(old_inode);
272 268
273 if (dir_de) { 269 if (dir_de) {
274 sysv_set_link(dir_de, dir_page, new_dir); 270 sysv_set_link(dir_de, dir_page, new_dir);
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 830e3f76f442..f8b0160da2da 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -44,29 +44,17 @@ config UBIFS_FS_ZLIB
44 44
45# Debugging-related stuff 45# Debugging-related stuff
46config UBIFS_FS_DEBUG 46config UBIFS_FS_DEBUG
47 bool "Enable debugging" 47 bool "Enable debugging support"
48 depends on UBIFS_FS 48 depends on UBIFS_FS
49 select DEBUG_FS 49 select DEBUG_FS
50 select KALLSYMS_ALL 50 select KALLSYMS
51 help 51 help
52 This option enables UBIFS debugging. 52 This option enables UBIFS debugging support. It makes sure various
53 53 assertions, self-checks, debugging messages and test modes are compiled
54config UBIFS_FS_DEBUG_MSG_LVL 54 in (this all is compiled out otherwise). Assertions are light-weight
55 int "Default message level (0 = no extra messages, 3 = lots)" 55 and this option also enables them. Self-checks, debugging messages and
56 depends on UBIFS_FS_DEBUG 56 test modes are switched off by default. Thus, it is safe and actually
57 default "0" 57 recommended to have debugging support enabled, and it should not slow
58 help 58 down UBIFS. You can then further enable / disable individual debugging
59 This controls the amount of debugging messages produced by UBIFS. 59 features using UBIFS module parameters and the corresponding sysfs
60 If reporting bugs, please try to have available a full dump of the 60 interfaces.
61 messages at level 1 while the misbehaviour was occurring. Level 2
62 may become necessary if level 1 messages were not enough to find the
63 bug. Generally Level 3 should be avoided.
64
65config UBIFS_FS_DEBUG_CHKS
66 bool "Enable extra checks"
67 depends on UBIFS_FS_DEBUG
68 help
69 If extra checks are enabled UBIFS will check the consistency of its
70 internal data structures during operation. However, UBIFS performance
71 is dramatically slower when this option is selected especially if the
72 file system is large.
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index c8ff0d1ae5d3..8b3a7da531eb 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -147,7 +147,7 @@ static int make_free_space(struct ubifs_info *c)
147 if (liab2 < liab1) 147 if (liab2 < liab1)
148 return -EAGAIN; 148 return -EAGAIN;
149 149
150 dbg_budg("new liability %lld (not shrinked)", liab2); 150 dbg_budg("new liability %lld (not shrunk)", liab2);
151 151
152 /* Liability did not shrink again, try GC */ 152 /* Liability did not shrink again, try GC */
153 dbg_budg("Run GC"); 153 dbg_budg("Run GC");
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 02429d81ca33..1bd01ded7123 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -48,6 +48,56 @@
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include "ubifs.h" 49#include "ubifs.h"
50 50
51/*
52 * nothing_to_commit - check if there is nothing to commit.
53 * @c: UBIFS file-system description object
54 *
55 * This is a helper function which checks if there is anything to commit. It is
56 * used as an optimization to avoid starting the commit if it is not really
57 * necessary. Indeed, the commit operation always assumes flash I/O (e.g.,
58 * writing the commit start node to the log), and it is better to avoid doing
59 * this unnecessarily. E.g., 'ubifs_sync_fs()' runs the commit, but if there is
60 * nothing to commit, it is more optimal to avoid any flash I/O.
61 *
62 * This function has to be called with @c->commit_sem locked for writing -
63 * this function does not take LPT/TNC locks because the @c->commit_sem
64 * guarantees that we have exclusive access to the TNC and LPT data structures.
65 *
66 * This function returns %1 if there is nothing to commit and %0 otherwise.
67 */
68static int nothing_to_commit(struct ubifs_info *c)
69{
70 /*
71 * During mounting or remounting from R/O mode to R/W mode we may
72 * commit for various recovery-related reasons.
73 */
74 if (c->mounting || c->remounting_rw)
75 return 0;
76
77 /*
78 * If the root TNC node is dirty, we definitely have something to
79 * commit.
80 */
81 if (c->zroot.znode && test_bit(DIRTY_ZNODE, &c->zroot.znode->flags))
82 return 0;
83
84 /*
85 * Even though the TNC is clean, the LPT tree may have dirty nodes. For
86 * example, this may happen if the budgeting subsystem invoked GC to
87 * make some free space, and the GC found an LEB with only dirty and
88 * free space. In this case GC would just change the lprops of this
89 * LEB (by turning all space into free space) and unmap it.
90 */
91 if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags))
92 return 0;
93
94 ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0);
95 ubifs_assert(c->dirty_pn_cnt == 0);
96 ubifs_assert(c->dirty_nn_cnt == 0);
97
98 return 1;
99}
100
51/** 101/**
52 * do_commit - commit the journal. 102 * do_commit - commit the journal.
53 * @c: UBIFS file-system description object 103 * @c: UBIFS file-system description object
@@ -70,6 +120,12 @@ static int do_commit(struct ubifs_info *c)
70 goto out_up; 120 goto out_up;
71 } 121 }
72 122
123 if (nothing_to_commit(c)) {
124 up_write(&c->commit_sem);
125 err = 0;
126 goto out_cancel;
127 }
128
73 /* Sync all write buffers (necessary for recovery) */ 129 /* Sync all write buffers (necessary for recovery) */
74 for (i = 0; i < c->jhead_cnt; i++) { 130 for (i = 0; i < c->jhead_cnt; i++) {
75 err = ubifs_wbuf_sync(&c->jheads[i].wbuf); 131 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
@@ -162,12 +218,12 @@ static int do_commit(struct ubifs_info *c)
162 if (err) 218 if (err)
163 goto out; 219 goto out;
164 220
221out_cancel:
165 spin_lock(&c->cs_lock); 222 spin_lock(&c->cs_lock);
166 c->cmt_state = COMMIT_RESTING; 223 c->cmt_state = COMMIT_RESTING;
167 wake_up(&c->cmt_wq); 224 wake_up(&c->cmt_wq);
168 dbg_cmt("commit end"); 225 dbg_cmt("commit end");
169 spin_unlock(&c->cs_lock); 226 spin_unlock(&c->cs_lock);
170
171 return 0; 227 return 0;
172 228
173out_up: 229out_up:
@@ -521,7 +577,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
521 size_t sz; 577 size_t sz;
522 578
523 if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX)) 579 if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX))
524 goto out; 580 return 0;
525 581
526 INIT_LIST_HEAD(&list); 582 INIT_LIST_HEAD(&list);
527 583
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 0bee4dbffc31..004d3745dc45 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -43,8 +43,8 @@ DEFINE_SPINLOCK(dbg_lock);
43static char dbg_key_buf0[128]; 43static char dbg_key_buf0[128];
44static char dbg_key_buf1[128]; 44static char dbg_key_buf1[128];
45 45
46unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT; 46unsigned int ubifs_msg_flags;
47unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT; 47unsigned int ubifs_chk_flags;
48unsigned int ubifs_tst_flags; 48unsigned int ubifs_tst_flags;
49 49
50module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR); 50module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR);
@@ -810,16 +810,24 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
810{ 810{
811 struct ubifs_scan_leb *sleb; 811 struct ubifs_scan_leb *sleb;
812 struct ubifs_scan_node *snod; 812 struct ubifs_scan_node *snod;
813 void *buf;
813 814
814 if (dbg_failure_mode) 815 if (dbg_failure_mode)
815 return; 816 return;
816 817
817 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", 818 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
818 current->pid, lnum); 819 current->pid, lnum);
819 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); 820
821 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
822 if (!buf) {
823 ubifs_err("cannot allocate memory for dumping LEB %d", lnum);
824 return;
825 }
826
827 sleb = ubifs_scan(c, lnum, 0, buf, 0);
820 if (IS_ERR(sleb)) { 828 if (IS_ERR(sleb)) {
821 ubifs_err("scan error %d", (int)PTR_ERR(sleb)); 829 ubifs_err("scan error %d", (int)PTR_ERR(sleb));
822 return; 830 goto out;
823 } 831 }
824 832
825 printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, 833 printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum,
@@ -835,6 +843,9 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
835 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", 843 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
836 current->pid, lnum); 844 current->pid, lnum);
837 ubifs_scan_destroy(sleb); 845 ubifs_scan_destroy(sleb);
846
847out:
848 vfree(buf);
838 return; 849 return;
839} 850}
840 851
@@ -961,11 +972,39 @@ void dbg_dump_index(struct ubifs_info *c)
961void dbg_save_space_info(struct ubifs_info *c) 972void dbg_save_space_info(struct ubifs_info *c)
962{ 973{
963 struct ubifs_debug_info *d = c->dbg; 974 struct ubifs_debug_info *d = c->dbg;
964 975 int freeable_cnt;
965 ubifs_get_lp_stats(c, &d->saved_lst);
966 976
967 spin_lock(&c->space_lock); 977 spin_lock(&c->space_lock);
978 memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats));
979
980 /*
981 * We use a dirty hack here and zero out @c->freeable_cnt, because it
982 * affects the free space calculations, and UBIFS might not know about
983 * all freeable eraseblocks. Indeed, we know about freeable eraseblocks
984 * only when we read their lprops, and we do this only lazily, upon the
985 * need. So at any given point of time @c->freeable_cnt might be not
986 * exactly accurate.
987 *
988 * Just one example about the issue we hit when we did not zero
989 * @c->freeable_cnt.
990 * 1. The file-system is mounted R/O, c->freeable_cnt is %0. We save the
991 * amount of free space in @d->saved_free
992 * 2. We re-mount R/W, which makes UBIFS to read the "lsave"
993 * information from flash, where we cache LEBs from various
994 * categories ('ubifs_remount_fs()' -> 'ubifs_lpt_init()'
995 * -> 'lpt_init_wr()' -> 'read_lsave()' -> 'ubifs_lpt_lookup()'
996 * -> 'ubifs_get_pnode()' -> 'update_cats()'
997 * -> 'ubifs_add_to_cat()').
998 * 3. Lsave contains a freeable eraseblock, and @c->freeable_cnt
999 * becomes %1.
1000 * 4. We calculate the amount of free space when the re-mount is
1001 * finished in 'dbg_check_space_info()' and it does not match
1002 * @d->saved_free.
1003 */
1004 freeable_cnt = c->freeable_cnt;
1005 c->freeable_cnt = 0;
968 d->saved_free = ubifs_get_free_space_nolock(c); 1006 d->saved_free = ubifs_get_free_space_nolock(c);
1007 c->freeable_cnt = freeable_cnt;
969 spin_unlock(&c->space_lock); 1008 spin_unlock(&c->space_lock);
970} 1009}
971 1010
@@ -982,12 +1021,15 @@ int dbg_check_space_info(struct ubifs_info *c)
982{ 1021{
983 struct ubifs_debug_info *d = c->dbg; 1022 struct ubifs_debug_info *d = c->dbg;
984 struct ubifs_lp_stats lst; 1023 struct ubifs_lp_stats lst;
985 long long avail, free; 1024 long long free;
1025 int freeable_cnt;
986 1026
987 spin_lock(&c->space_lock); 1027 spin_lock(&c->space_lock);
988 avail = ubifs_calc_available(c, c->min_idx_lebs); 1028 freeable_cnt = c->freeable_cnt;
1029 c->freeable_cnt = 0;
1030 free = ubifs_get_free_space_nolock(c);
1031 c->freeable_cnt = freeable_cnt;
989 spin_unlock(&c->space_lock); 1032 spin_unlock(&c->space_lock);
990 free = ubifs_get_free_space(c);
991 1033
992 if (free != d->saved_free) { 1034 if (free != d->saved_free) {
993 ubifs_err("free space changed from %lld to %lld", 1035 ubifs_err("free space changed from %lld to %lld",
@@ -2690,16 +2732,8 @@ int ubifs_debugging_init(struct ubifs_info *c)
2690 if (!c->dbg) 2732 if (!c->dbg)
2691 return -ENOMEM; 2733 return -ENOMEM;
2692 2734
2693 c->dbg->buf = vmalloc(c->leb_size);
2694 if (!c->dbg->buf)
2695 goto out;
2696
2697 failure_mode_init(c); 2735 failure_mode_init(c);
2698 return 0; 2736 return 0;
2699
2700out:
2701 kfree(c->dbg);
2702 return -ENOMEM;
2703} 2737}
2704 2738
2705/** 2739/**
@@ -2709,7 +2743,6 @@ out:
2709void ubifs_debugging_exit(struct ubifs_info *c) 2743void ubifs_debugging_exit(struct ubifs_info *c)
2710{ 2744{
2711 failure_mode_exit(c); 2745 failure_mode_exit(c);
2712 vfree(c->dbg->buf);
2713 kfree(c->dbg); 2746 kfree(c->dbg);
2714} 2747}
2715 2748
@@ -2804,40 +2837,38 @@ int dbg_debugfs_init_fs(struct ubifs_info *c)
2804 struct ubifs_debug_info *d = c->dbg; 2837 struct ubifs_debug_info *d = c->dbg;
2805 2838
2806 sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); 2839 sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
2807 d->dfs_dir = debugfs_create_dir(d->dfs_dir_name, dfs_rootdir); 2840 fname = d->dfs_dir_name;
2808 if (IS_ERR(d->dfs_dir)) { 2841 dent = debugfs_create_dir(fname, dfs_rootdir);
2809 err = PTR_ERR(d->dfs_dir); 2842 if (IS_ERR_OR_NULL(dent))
2810 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2811 d->dfs_dir_name, err);
2812 goto out; 2843 goto out;
2813 } 2844 d->dfs_dir = dent;
2814 2845
2815 fname = "dump_lprops"; 2846 fname = "dump_lprops";
2816 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); 2847 dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
2817 if (IS_ERR(dent)) 2848 if (IS_ERR_OR_NULL(dent))
2818 goto out_remove; 2849 goto out_remove;
2819 d->dfs_dump_lprops = dent; 2850 d->dfs_dump_lprops = dent;
2820 2851
2821 fname = "dump_budg"; 2852 fname = "dump_budg";
2822 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); 2853 dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
2823 if (IS_ERR(dent)) 2854 if (IS_ERR_OR_NULL(dent))
2824 goto out_remove; 2855 goto out_remove;
2825 d->dfs_dump_budg = dent; 2856 d->dfs_dump_budg = dent;
2826 2857
2827 fname = "dump_tnc"; 2858 fname = "dump_tnc";
2828 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); 2859 dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
2829 if (IS_ERR(dent)) 2860 if (IS_ERR_OR_NULL(dent))
2830 goto out_remove; 2861 goto out_remove;
2831 d->dfs_dump_tnc = dent; 2862 d->dfs_dump_tnc = dent;
2832 2863
2833 return 0; 2864 return 0;
2834 2865
2835out_remove: 2866out_remove:
2836 err = PTR_ERR(dent);
2837 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2838 fname, err);
2839 debugfs_remove_recursive(d->dfs_dir); 2867 debugfs_remove_recursive(d->dfs_dir);
2840out: 2868out:
2869 err = dent ? PTR_ERR(dent) : -ENODEV;
2870 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2871 fname, err);
2841 return err; 2872 return err;
2842} 2873}
2843 2874
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 69ebe4729151..e6493cac193d 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -23,11 +23,16 @@
23#ifndef __UBIFS_DEBUG_H__ 23#ifndef __UBIFS_DEBUG_H__
24#define __UBIFS_DEBUG_H__ 24#define __UBIFS_DEBUG_H__
25 25
26/* Checking helper functions */
27typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
28 struct ubifs_zbranch *zbr, void *priv);
29typedef int (*dbg_znode_callback)(struct ubifs_info *c,
30 struct ubifs_znode *znode, void *priv);
31
26#ifdef CONFIG_UBIFS_FS_DEBUG 32#ifdef CONFIG_UBIFS_FS_DEBUG
27 33
28/** 34/**
29 * ubifs_debug_info - per-FS debugging information. 35 * ubifs_debug_info - per-FS debugging information.
30 * @buf: a buffer of LEB size, used for various purposes
31 * @old_zroot: old index root - used by 'dbg_check_old_index()' 36 * @old_zroot: old index root - used by 'dbg_check_old_index()'
32 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' 37 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
33 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' 38 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
@@ -54,7 +59,6 @@
54 * dfs_dump_tnc: "dump TNC" debugfs knob 59 * dfs_dump_tnc: "dump TNC" debugfs knob
55 */ 60 */
56struct ubifs_debug_info { 61struct ubifs_debug_info {
57 void *buf;
58 struct ubifs_zbranch old_zroot; 62 struct ubifs_zbranch old_zroot;
59 int old_zroot_level; 63 int old_zroot_level;
60 unsigned long long old_zroot_sqnum; 64 unsigned long long old_zroot_sqnum;
@@ -173,7 +177,7 @@ const char *dbg_key_str1(const struct ubifs_info *c,
173#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) 177#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
174 178
175/* 179/*
176 * Debugging message type flags (must match msg_type_names in debug.c). 180 * Debugging message type flags.
177 * 181 *
178 * UBIFS_MSG_GEN: general messages 182 * UBIFS_MSG_GEN: general messages
179 * UBIFS_MSG_JNL: journal messages 183 * UBIFS_MSG_JNL: journal messages
@@ -205,14 +209,8 @@ enum {
205 UBIFS_MSG_RCVRY = 0x1000, 209 UBIFS_MSG_RCVRY = 0x1000,
206}; 210};
207 211
208/* Debugging message type flags for each default debug message level */
209#define UBIFS_MSG_LVL_0 0
210#define UBIFS_MSG_LVL_1 0x1
211#define UBIFS_MSG_LVL_2 0x7f
212#define UBIFS_MSG_LVL_3 0xffff
213
214/* 212/*
215 * Debugging check flags (must match chk_names in debug.c). 213 * Debugging check flags.
216 * 214 *
217 * UBIFS_CHK_GEN: general checks 215 * UBIFS_CHK_GEN: general checks
218 * UBIFS_CHK_TNC: check TNC 216 * UBIFS_CHK_TNC: check TNC
@@ -233,7 +231,7 @@ enum {
233}; 231};
234 232
235/* 233/*
236 * Special testing flags (must match tst_names in debug.c). 234 * Special testing flags.
237 * 235 *
238 * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method 236 * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method
239 * UBIFS_TST_RCVRY: failure mode for recovery testing 237 * UBIFS_TST_RCVRY: failure mode for recovery testing
@@ -243,22 +241,6 @@ enum {
243 UBIFS_TST_RCVRY = 0x4, 241 UBIFS_TST_RCVRY = 0x4,
244}; 242};
245 243
246#if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1
247#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1
248#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2
249#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2
250#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3
251#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3
252#else
253#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0
254#endif
255
256#ifdef CONFIG_UBIFS_FS_DEBUG_CHKS
257#define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff
258#else
259#define UBIFS_CHK_FLAGS_DEFAULT 0
260#endif
261
262extern spinlock_t dbg_lock; 244extern spinlock_t dbg_lock;
263 245
264extern unsigned int ubifs_msg_flags; 246extern unsigned int ubifs_msg_flags;
@@ -294,11 +276,6 @@ void dbg_dump_tnc(struct ubifs_info *c);
294void dbg_dump_index(struct ubifs_info *c); 276void dbg_dump_index(struct ubifs_info *c);
295void dbg_dump_lpt_lebs(const struct ubifs_info *c); 277void dbg_dump_lpt_lebs(const struct ubifs_info *c);
296 278
297/* Checking helper functions */
298typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
299 struct ubifs_zbranch *zbr, void *priv);
300typedef int (*dbg_znode_callback)(struct ubifs_info *c,
301 struct ubifs_znode *znode, void *priv);
302int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, 279int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
303 dbg_znode_callback znode_cb, void *priv); 280 dbg_znode_callback znode_cb, void *priv);
304 281
@@ -319,7 +296,6 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size);
319int dbg_check_filesystem(struct ubifs_info *c); 296int dbg_check_filesystem(struct ubifs_info *c);
320void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, 297void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
321 int add_pos); 298 int add_pos);
322int dbg_check_lprops(struct ubifs_info *c);
323int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, 299int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
324 int row, int col); 300 int row, int col);
325int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, 301int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
@@ -425,58 +401,94 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
425#define DBGKEY(key) ((char *)(key)) 401#define DBGKEY(key) ((char *)(key))
426#define DBGKEY1(key) ((char *)(key)) 402#define DBGKEY1(key) ((char *)(key))
427 403
428#define ubifs_debugging_init(c) 0 404static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; }
429#define ubifs_debugging_exit(c) ({}) 405static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; }
430 406static inline const char *dbg_ntype(int type) { return ""; }
431#define dbg_ntype(type) "" 407static inline const char *dbg_cstate(int cmt_state) { return ""; }
432#define dbg_cstate(cmt_state) "" 408static inline const char *dbg_jhead(int jhead) { return ""; }
433#define dbg_jhead(jhead) "" 409static inline const char *
434#define dbg_get_key_dump(c, key) ({}) 410dbg_get_key_dump(const struct ubifs_info *c,
435#define dbg_dump_inode(c, inode) ({}) 411 const union ubifs_key *key) { return ""; }
436#define dbg_dump_node(c, node) ({}) 412static inline void dbg_dump_inode(const struct ubifs_info *c,
437#define dbg_dump_lpt_node(c, node, lnum, offs) ({}) 413 const struct inode *inode) { return; }
438#define dbg_dump_budget_req(req) ({}) 414static inline void dbg_dump_node(const struct ubifs_info *c,
439#define dbg_dump_lstats(lst) ({}) 415 const void *node) { return; }
440#define dbg_dump_budg(c) ({}) 416static inline void dbg_dump_lpt_node(const struct ubifs_info *c,
441#define dbg_dump_lprop(c, lp) ({}) 417 void *node, int lnum,
442#define dbg_dump_lprops(c) ({}) 418 int offs) { return; }
443#define dbg_dump_lpt_info(c) ({}) 419static inline void
444#define dbg_dump_leb(c, lnum) ({}) 420dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; }
445#define dbg_dump_znode(c, znode) ({}) 421static inline void
446#define dbg_dump_heap(c, heap, cat) ({}) 422dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; }
447#define dbg_dump_pnode(c, pnode, parent, iip) ({}) 423static inline void dbg_dump_budg(struct ubifs_info *c) { return; }
448#define dbg_dump_tnc(c) ({}) 424static inline void dbg_dump_lprop(const struct ubifs_info *c,
449#define dbg_dump_index(c) ({}) 425 const struct ubifs_lprops *lp) { return; }
450#define dbg_dump_lpt_lebs(c) ({}) 426static inline void dbg_dump_lprops(struct ubifs_info *c) { return; }
451 427static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; }
452#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 428static inline void dbg_dump_leb(const struct ubifs_info *c,
453#define dbg_old_index_check_init(c, zroot) 0 429 int lnum) { return; }
454#define dbg_save_space_info(c) ({}) 430static inline void
455#define dbg_check_space_info(c) 0 431dbg_dump_znode(const struct ubifs_info *c,
456#define dbg_check_old_index(c, zroot) 0 432 const struct ubifs_znode *znode) { return; }
457#define dbg_check_cats(c) 0 433static inline void dbg_dump_heap(struct ubifs_info *c,
458#define dbg_check_ltab(c) 0 434 struct ubifs_lpt_heap *heap,
459#define dbg_chk_lpt_free_spc(c) 0 435 int cat) { return; }
460#define dbg_chk_lpt_sz(c, action, len) 0 436static inline void dbg_dump_pnode(struct ubifs_info *c,
461#define dbg_check_synced_i_size(inode) 0 437 struct ubifs_pnode *pnode,
462#define dbg_check_dir_size(c, dir) 0 438 struct ubifs_nnode *parent,
463#define dbg_check_tnc(c, x) 0 439 int iip) { return; }
464#define dbg_check_idx_size(c, idx_size) 0 440static inline void dbg_dump_tnc(struct ubifs_info *c) { return; }
465#define dbg_check_filesystem(c) 0 441static inline void dbg_dump_index(struct ubifs_info *c) { return; }
466#define dbg_check_heap(c, heap, cat, add_pos) ({}) 442static inline void dbg_dump_lpt_lebs(const struct ubifs_info *c) { return; }
467#define dbg_check_lprops(c) 0 443
468#define dbg_check_lpt_nodes(c, cnode, row, col) 0 444static inline int dbg_walk_index(struct ubifs_info *c,
469#define dbg_check_inode_size(c, inode, size) 0 445 dbg_leaf_callback leaf_cb,
470#define dbg_check_data_nodes_order(c, head) 0 446 dbg_znode_callback znode_cb,
471#define dbg_check_nondata_nodes_order(c, head) 0 447 void *priv) { return 0; }
472#define dbg_force_in_the_gaps_enabled 0 448static inline void dbg_save_space_info(struct ubifs_info *c) { return; }
473#define dbg_force_in_the_gaps() 0 449static inline int dbg_check_space_info(struct ubifs_info *c) { return 0; }
474#define dbg_failure_mode 0 450static inline int dbg_check_lprops(struct ubifs_info *c) { return 0; }
475 451static inline int
476#define dbg_debugfs_init() 0 452dbg_old_index_check_init(struct ubifs_info *c,
477#define dbg_debugfs_exit() 453 struct ubifs_zbranch *zroot) { return 0; }
478#define dbg_debugfs_init_fs(c) 0 454static inline int
479#define dbg_debugfs_exit_fs(c) 0 455dbg_check_old_index(struct ubifs_info *c,
456 struct ubifs_zbranch *zroot) { return 0; }
457static inline int dbg_check_cats(struct ubifs_info *c) { return 0; }
458static inline int dbg_check_ltab(struct ubifs_info *c) { return 0; }
459static inline int dbg_chk_lpt_free_spc(struct ubifs_info *c) { return 0; }
460static inline int dbg_chk_lpt_sz(struct ubifs_info *c,
461 int action, int len) { return 0; }
462static inline int dbg_check_synced_i_size(struct inode *inode) { return 0; }
463static inline int dbg_check_dir_size(struct ubifs_info *c,
464 const struct inode *dir) { return 0; }
465static inline int dbg_check_tnc(struct ubifs_info *c, int extra) { return 0; }
466static inline int dbg_check_idx_size(struct ubifs_info *c,
467 long long idx_size) { return 0; }
468static inline int dbg_check_filesystem(struct ubifs_info *c) { return 0; }
469static inline void dbg_check_heap(struct ubifs_info *c,
470 struct ubifs_lpt_heap *heap,
471 int cat, int add_pos) { return; }
472static inline int dbg_check_lpt_nodes(struct ubifs_info *c,
473 struct ubifs_cnode *cnode, int row, int col) { return 0; }
474static inline int dbg_check_inode_size(struct ubifs_info *c,
475 const struct inode *inode,
476 loff_t size) { return 0; }
477static inline int
478dbg_check_data_nodes_order(struct ubifs_info *c,
479 struct list_head *head) { return 0; }
480static inline int
481dbg_check_nondata_nodes_order(struct ubifs_info *c,
482 struct list_head *head) { return 0; }
483
484static inline int dbg_force_in_the_gaps(void) { return 0; }
485#define dbg_force_in_the_gaps_enabled 0
486#define dbg_failure_mode 0
487
488static inline int dbg_debugfs_init(void) { return 0; }
489static inline void dbg_debugfs_exit(void) { return; }
490static inline int dbg_debugfs_init_fs(struct ubifs_info *c) { return 0; }
491static inline int dbg_debugfs_exit_fs(struct ubifs_info *c) { return 0; }
480 492
481#endif /* !CONFIG_UBIFS_FS_DEBUG */ 493#endif /* !CONFIG_UBIFS_FS_DEBUG */
482#endif /* !__UBIFS_DEBUG_H__ */ 494#endif /* !__UBIFS_DEBUG_H__ */
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 14f64b689d7f..7217d67a80a6 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -522,24 +522,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
522 ubifs_assert(mutex_is_locked(&dir->i_mutex)); 522 ubifs_assert(mutex_is_locked(&dir->i_mutex));
523 ubifs_assert(mutex_is_locked(&inode->i_mutex)); 523 ubifs_assert(mutex_is_locked(&inode->i_mutex));
524 524
525 /*
526 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
527 * otherwise has the potential to corrupt the orphan inode list.
528 *
529 * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and
530 * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not
531 * lock 'dirA->i_mutex', so this is possible. Both of the functions
532 * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes
533 * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this
534 * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA'
535 * to the list of orphans. After this, 'vfs_link()' will link
536 * 'dirB/fileB' to 'inodeA'. This is a problem because, for example,
537 * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode
538 * to the list of orphans.
539 */
540 if (inode->i_nlink == 0)
541 return -ENOENT;
542
543 err = dbg_check_synced_i_size(inode); 525 err = dbg_check_synced_i_size(inode);
544 if (err) 526 if (err)
545 return err; 527 return err;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index d77db7e36484..b286db79c686 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -448,10 +448,12 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) { 448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
449 /* 449 /*
450 * We change whole page so no need to load it. But we 450 * We change whole page so no need to load it. But we
451 * have to set the @PG_checked flag to make the further 451 * do not know whether this page exists on the media or
452 * code know that the page is new. This might be not 452 * not, so we assume the latter because it requires
453 * true, but it is better to budget more than to read 453 * larger budget. The assumption is that it is better
454 * the page from the media. 454 * to budget a bit more than to read the page from the
455 * media. Thus, we are setting the @PG_checked flag
456 * here.
455 */ 457 */
456 SetPageChecked(page); 458 SetPageChecked(page);
457 skipped_read = 1; 459 skipped_read = 1;
@@ -559,6 +561,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
559 dbg_gen("copied %d instead of %d, read page and repeat", 561 dbg_gen("copied %d instead of %d, read page and repeat",
560 copied, len); 562 copied, len);
561 cancel_budget(c, page, ui, appending); 563 cancel_budget(c, page, ui, appending);
564 ClearPageChecked(page);
562 565
563 /* 566 /*
564 * Return 0 to force VFS to repeat the whole operation, or the 567 * Return 0 to force VFS to repeat the whole operation, or the
@@ -1309,6 +1312,9 @@ int ubifs_fsync(struct file *file, int datasync)
1309 1312
1310 dbg_gen("syncing inode %lu", inode->i_ino); 1313 dbg_gen("syncing inode %lu", inode->i_ino);
1311 1314
1315 if (inode->i_sb->s_flags & MS_RDONLY)
1316 return 0;
1317
1312 /* 1318 /*
1313 * VFS has already synchronized dirty pages for this inode. Synchronize 1319 * VFS has already synchronized dirty pages for this inode. Synchronize
1314 * the inode unless this is a 'datasync()' call. 1320 * the inode unless this is a 'datasync()' call.
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index d82173182eeb..dfd168b7807e 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -31,6 +31,26 @@
31 * buffer is full or when it is not used for some time (by timer). This is 31 * buffer is full or when it is not used for some time (by timer). This is
32 * similar to the mechanism is used by JFFS2. 32 * similar to the mechanism is used by JFFS2.
33 * 33 *
34 * UBIFS distinguishes between minimum write size (@c->min_io_size) and maximum
35 * write size (@c->max_write_size). The latter is the maximum amount of bytes
36 * the underlying flash is able to program at a time, and writing in
37 * @c->max_write_size units should presumably be faster. Obviously,
38 * @c->min_io_size <= @c->max_write_size. Write-buffers are of
39 * @c->max_write_size bytes in size for maximum performance. However, when a
40 * write-buffer is flushed, only the portion of it (aligned to @c->min_io_size
41 * boundary) which contains data is written, not the whole write-buffer,
42 * because this is more space-efficient.
43 *
44 * This optimization adds few complications to the code. Indeed, on the one
45 * hand, we want to write in optimal @c->max_write_size bytes chunks, which
46 * also means aligning writes at the @c->max_write_size bytes offsets. On the
47 * other hand, we do not want to waste space when synchronizing the write
48 * buffer, so during synchronization we writes in smaller chunks. And this makes
49 * the next write offset to be not aligned to @c->max_write_size bytes. So the
50 * have to make sure that the write-buffer offset (@wbuf->offs) becomes aligned
51 * to @c->max_write_size bytes again. We do this by temporarily shrinking
52 * write-buffer size (@wbuf->size).
53 *
34 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by 54 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
35 * mutexes defined inside these objects. Since sometimes upper-level code 55 * mutexes defined inside these objects. Since sometimes upper-level code
36 * has to lock the write-buffer (e.g. journal space reservation code), many 56 * has to lock the write-buffer (e.g. journal space reservation code), many
@@ -46,8 +66,8 @@
46 * UBIFS uses padding when it pads to the next min. I/O unit. In this case it 66 * UBIFS uses padding when it pads to the next min. I/O unit. In this case it
47 * uses padding nodes or padding bytes, if the padding node does not fit. 67 * uses padding nodes or padding bytes, if the padding node does not fit.
48 * 68 *
49 * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes 69 * All UBIFS nodes are protected by CRC checksums and UBIFS checks CRC when
50 * every time they are read from the flash media. 70 * they are read from the flash media.
51 */ 71 */
52 72
53#include <linux/crc32.h> 73#include <linux/crc32.h>
@@ -88,8 +108,12 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
88 * This function may skip data nodes CRC checking if @c->no_chk_data_crc is 108 * This function may skip data nodes CRC checking if @c->no_chk_data_crc is
89 * true, which is controlled by corresponding UBIFS mount option. However, if 109 * true, which is controlled by corresponding UBIFS mount option. However, if
90 * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is 110 * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is
91 * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is 111 * checked. Similarly, if @c->mounting or @c->remounting_rw is true (we are
92 * ignored and CRC is checked. 112 * mounting or re-mounting to R/W mode), @c->no_chk_data_crc is ignored and CRC
113 * is checked. This is because during mounting or re-mounting from R/O mode to
114 * R/W mode we may read journal nodes (when replying the journal or doing the
115 * recovery) and the journal nodes may potentially be corrupted, so checking is
116 * required.
93 * 117 *
94 * This function returns zero in case of success and %-EUCLEAN in case of bad 118 * This function returns zero in case of success and %-EUCLEAN in case of bad
95 * CRC or magic. 119 * CRC or magic.
@@ -131,8 +155,8 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
131 node_len > c->ranges[type].max_len) 155 node_len > c->ranges[type].max_len)
132 goto out_len; 156 goto out_len;
133 157
134 if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc && 158 if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->mounting &&
135 c->no_chk_data_crc) 159 !c->remounting_rw && c->no_chk_data_crc)
136 return 0; 160 return 0;
137 161
138 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); 162 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
@@ -343,11 +367,17 @@ static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
343 * 367 *
344 * This function synchronizes write-buffer @buf and returns zero in case of 368 * This function synchronizes write-buffer @buf and returns zero in case of
345 * success or a negative error code in case of failure. 369 * success or a negative error code in case of failure.
370 *
371 * Note, although write-buffers are of @c->max_write_size, this function does
372 * not necessarily writes all @c->max_write_size bytes to the flash. Instead,
373 * if the write-buffer is only partially filled with data, only the used part
374 * of the write-buffer (aligned on @c->min_io_size boundary) is synchronized.
375 * This way we waste less space.
346 */ 376 */
347int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) 377int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
348{ 378{
349 struct ubifs_info *c = wbuf->c; 379 struct ubifs_info *c = wbuf->c;
350 int err, dirt; 380 int err, dirt, sync_len;
351 381
352 cancel_wbuf_timer_nolock(wbuf); 382 cancel_wbuf_timer_nolock(wbuf);
353 if (!wbuf->used || wbuf->lnum == -1) 383 if (!wbuf->used || wbuf->lnum == -1)
@@ -357,27 +387,53 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
357 dbg_io("LEB %d:%d, %d bytes, jhead %s", 387 dbg_io("LEB %d:%d, %d bytes, jhead %s",
358 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); 388 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
359 ubifs_assert(!(wbuf->avail & 7)); 389 ubifs_assert(!(wbuf->avail & 7));
360 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); 390 ubifs_assert(wbuf->offs + wbuf->size <= c->leb_size);
391 ubifs_assert(wbuf->size >= c->min_io_size);
392 ubifs_assert(wbuf->size <= c->max_write_size);
393 ubifs_assert(wbuf->size % c->min_io_size == 0);
361 ubifs_assert(!c->ro_media && !c->ro_mount); 394 ubifs_assert(!c->ro_media && !c->ro_mount);
395 if (c->leb_size - wbuf->offs >= c->max_write_size)
396 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
362 397
363 if (c->ro_error) 398 if (c->ro_error)
364 return -EROFS; 399 return -EROFS;
365 400
366 ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail); 401 /*
402 * Do not write whole write buffer but write only the minimum necessary
403 * amount of min. I/O units.
404 */
405 sync_len = ALIGN(wbuf->used, c->min_io_size);
406 dirt = sync_len - wbuf->used;
407 if (dirt)
408 ubifs_pad(c, wbuf->buf + wbuf->used, dirt);
367 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, 409 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
368 c->min_io_size, wbuf->dtype); 410 sync_len, wbuf->dtype);
369 if (err) { 411 if (err) {
370 ubifs_err("cannot write %d bytes to LEB %d:%d", 412 ubifs_err("cannot write %d bytes to LEB %d:%d",
371 c->min_io_size, wbuf->lnum, wbuf->offs); 413 sync_len, wbuf->lnum, wbuf->offs);
372 dbg_dump_stack(); 414 dbg_dump_stack();
373 return err; 415 return err;
374 } 416 }
375 417
376 dirt = wbuf->avail;
377
378 spin_lock(&wbuf->lock); 418 spin_lock(&wbuf->lock);
379 wbuf->offs += c->min_io_size; 419 wbuf->offs += sync_len;
380 wbuf->avail = c->min_io_size; 420 /*
421 * Now @wbuf->offs is not necessarily aligned to @c->max_write_size.
422 * But our goal is to optimize writes and make sure we write in
423 * @c->max_write_size chunks and to @c->max_write_size-aligned offset.
424 * Thus, if @wbuf->offs is not aligned to @c->max_write_size now, make
425 * sure that @wbuf->offs + @wbuf->size is aligned to
426 * @c->max_write_size. This way we make sure that after next
427 * write-buffer flush we are again at the optimal offset (aligned to
428 * @c->max_write_size).
429 */
430 if (c->leb_size - wbuf->offs < c->max_write_size)
431 wbuf->size = c->leb_size - wbuf->offs;
432 else if (wbuf->offs & (c->max_write_size - 1))
433 wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs;
434 else
435 wbuf->size = c->max_write_size;
436 wbuf->avail = wbuf->size;
381 wbuf->used = 0; 437 wbuf->used = 0;
382 wbuf->next_ino = 0; 438 wbuf->next_ino = 0;
383 spin_unlock(&wbuf->lock); 439 spin_unlock(&wbuf->lock);
@@ -420,7 +476,13 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
420 spin_lock(&wbuf->lock); 476 spin_lock(&wbuf->lock);
421 wbuf->lnum = lnum; 477 wbuf->lnum = lnum;
422 wbuf->offs = offs; 478 wbuf->offs = offs;
423 wbuf->avail = c->min_io_size; 479 if (c->leb_size - wbuf->offs < c->max_write_size)
480 wbuf->size = c->leb_size - wbuf->offs;
481 else if (wbuf->offs & (c->max_write_size - 1))
482 wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs;
483 else
484 wbuf->size = c->max_write_size;
485 wbuf->avail = wbuf->size;
424 wbuf->used = 0; 486 wbuf->used = 0;
425 spin_unlock(&wbuf->lock); 487 spin_unlock(&wbuf->lock);
426 wbuf->dtype = dtype; 488 wbuf->dtype = dtype;
@@ -500,8 +562,9 @@ out_timers:
500 * 562 *
501 * This function writes data to flash via write-buffer @wbuf. This means that 563 * This function writes data to flash via write-buffer @wbuf. This means that
502 * the last piece of the node won't reach the flash media immediately if it 564 * the last piece of the node won't reach the flash media immediately if it
503 * does not take whole minimal I/O unit. Instead, the node will sit in RAM 565 * does not take whole max. write unit (@c->max_write_size). Instead, the node
504 * until the write-buffer is synchronized (e.g., by timer). 566 * will sit in RAM until the write-buffer is synchronized (e.g., by timer, or
567 * because more data are appended to the write-buffer).
505 * 568 *
506 * This function returns zero in case of success and a negative error code in 569 * This function returns zero in case of success and a negative error code in
507 * case of failure. If the node cannot be written because there is no more 570 * case of failure. If the node cannot be written because there is no more
@@ -518,9 +581,14 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
518 ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); 581 ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
519 ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); 582 ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
520 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); 583 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
521 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size); 584 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= wbuf->size);
585 ubifs_assert(wbuf->size >= c->min_io_size);
586 ubifs_assert(wbuf->size <= c->max_write_size);
587 ubifs_assert(wbuf->size % c->min_io_size == 0);
522 ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); 588 ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
523 ubifs_assert(!c->ro_media && !c->ro_mount); 589 ubifs_assert(!c->ro_media && !c->ro_mount);
590 if (c->leb_size - wbuf->offs >= c->max_write_size)
591 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
524 592
525 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { 593 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
526 err = -ENOSPC; 594 err = -ENOSPC;
@@ -543,14 +611,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
543 dbg_io("flush jhead %s wbuf to LEB %d:%d", 611 dbg_io("flush jhead %s wbuf to LEB %d:%d",
544 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); 612 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
545 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, 613 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
546 wbuf->offs, c->min_io_size, 614 wbuf->offs, wbuf->size,
547 wbuf->dtype); 615 wbuf->dtype);
548 if (err) 616 if (err)
549 goto out; 617 goto out;
550 618
551 spin_lock(&wbuf->lock); 619 spin_lock(&wbuf->lock);
552 wbuf->offs += c->min_io_size; 620 wbuf->offs += wbuf->size;
553 wbuf->avail = c->min_io_size; 621 if (c->leb_size - wbuf->offs >= c->max_write_size)
622 wbuf->size = c->max_write_size;
623 else
624 wbuf->size = c->leb_size - wbuf->offs;
625 wbuf->avail = wbuf->size;
554 wbuf->used = 0; 626 wbuf->used = 0;
555 wbuf->next_ino = 0; 627 wbuf->next_ino = 0;
556 spin_unlock(&wbuf->lock); 628 spin_unlock(&wbuf->lock);
@@ -564,33 +636,57 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
564 goto exit; 636 goto exit;
565 } 637 }
566 638
567 /* 639 offs = wbuf->offs;
568 * The node is large enough and does not fit entirely within current 640 written = 0;
569 * minimal I/O unit. We have to fill and flush write-buffer and switch
570 * to the next min. I/O unit.
571 */
572 dbg_io("flush jhead %s wbuf to LEB %d:%d",
573 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
574 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
575 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
576 c->min_io_size, wbuf->dtype);
577 if (err)
578 goto out;
579 641
580 offs = wbuf->offs + c->min_io_size; 642 if (wbuf->used) {
581 len -= wbuf->avail; 643 /*
582 aligned_len -= wbuf->avail; 644 * The node is large enough and does not fit entirely within
583 written = wbuf->avail; 645 * current available space. We have to fill and flush
646 * write-buffer and switch to the next max. write unit.
647 */
648 dbg_io("flush jhead %s wbuf to LEB %d:%d",
649 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
650 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
651 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
652 wbuf->size, wbuf->dtype);
653 if (err)
654 goto out;
655
656 offs += wbuf->size;
657 len -= wbuf->avail;
658 aligned_len -= wbuf->avail;
659 written += wbuf->avail;
660 } else if (wbuf->offs & (c->max_write_size - 1)) {
661 /*
662 * The write-buffer offset is not aligned to
663 * @c->max_write_size and @wbuf->size is less than
664 * @c->max_write_size. Write @wbuf->size bytes to make sure the
665 * following writes are done in optimal @c->max_write_size
666 * chunks.
667 */
668 dbg_io("write %d bytes to LEB %d:%d",
669 wbuf->size, wbuf->lnum, wbuf->offs);
670 err = ubi_leb_write(c->ubi, wbuf->lnum, buf, wbuf->offs,
671 wbuf->size, wbuf->dtype);
672 if (err)
673 goto out;
674
675 offs += wbuf->size;
676 len -= wbuf->size;
677 aligned_len -= wbuf->size;
678 written += wbuf->size;
679 }
584 680
585 /* 681 /*
586 * The remaining data may take more whole min. I/O units, so write the 682 * The remaining data may take more whole max. write units, so write the
587 * remains multiple to min. I/O unit size directly to the flash media. 683 * remains multiple to max. write unit size directly to the flash media.
588 * We align node length to 8-byte boundary because we anyway flash wbuf 684 * We align node length to 8-byte boundary because we anyway flash wbuf
589 * if the remaining space is less than 8 bytes. 685 * if the remaining space is less than 8 bytes.
590 */ 686 */
591 n = aligned_len >> c->min_io_shift; 687 n = aligned_len >> c->max_write_shift;
592 if (n) { 688 if (n) {
593 n <<= c->min_io_shift; 689 n <<= c->max_write_shift;
594 dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs); 690 dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs);
595 err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n, 691 err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n,
596 wbuf->dtype); 692 wbuf->dtype);
@@ -606,14 +702,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
606 if (aligned_len) 702 if (aligned_len)
607 /* 703 /*
608 * And now we have what's left and what does not take whole 704 * And now we have what's left and what does not take whole
609 * min. I/O unit, so write it to the write-buffer and we are 705 * max. write unit, so write it to the write-buffer and we are
610 * done. 706 * done.
611 */ 707 */
612 memcpy(wbuf->buf, buf + written, len); 708 memcpy(wbuf->buf, buf + written, len);
613 709
614 wbuf->offs = offs; 710 wbuf->offs = offs;
711 if (c->leb_size - wbuf->offs >= c->max_write_size)
712 wbuf->size = c->max_write_size;
713 else
714 wbuf->size = c->leb_size - wbuf->offs;
715 wbuf->avail = wbuf->size - aligned_len;
615 wbuf->used = aligned_len; 716 wbuf->used = aligned_len;
616 wbuf->avail = c->min_io_size - aligned_len;
617 wbuf->next_ino = 0; 717 wbuf->next_ino = 0;
618 spin_unlock(&wbuf->lock); 718 spin_unlock(&wbuf->lock);
619 719
@@ -837,11 +937,11 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
837{ 937{
838 size_t size; 938 size_t size;
839 939
840 wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL); 940 wbuf->buf = kmalloc(c->max_write_size, GFP_KERNEL);
841 if (!wbuf->buf) 941 if (!wbuf->buf)
842 return -ENOMEM; 942 return -ENOMEM;
843 943
844 size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); 944 size = (c->max_write_size / UBIFS_CH_SZ + 1) * sizeof(ino_t);
845 wbuf->inodes = kmalloc(size, GFP_KERNEL); 945 wbuf->inodes = kmalloc(size, GFP_KERNEL);
846 if (!wbuf->inodes) { 946 if (!wbuf->inodes) {
847 kfree(wbuf->buf); 947 kfree(wbuf->buf);
@@ -851,7 +951,14 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
851 951
852 wbuf->used = 0; 952 wbuf->used = 0;
853 wbuf->lnum = wbuf->offs = -1; 953 wbuf->lnum = wbuf->offs = -1;
854 wbuf->avail = c->min_io_size; 954 /*
955 * If the LEB starts at the max. write size aligned address, then
956 * write-buffer size has to be set to @c->max_write_size. Otherwise,
957 * set it to something smaller so that it ends at the closest max.
958 * write size boundary.
959 */
960 size = c->max_write_size - (c->leb_start % c->max_write_size);
961 wbuf->avail = wbuf->size = size;
855 wbuf->dtype = UBI_UNKNOWN; 962 wbuf->dtype = UBI_UNKNOWN;
856 wbuf->sync_callback = NULL; 963 wbuf->sync_callback = NULL;
857 mutex_init(&wbuf->io_mutex); 964 mutex_init(&wbuf->io_mutex);
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 8aacd64957a2..548acf494afd 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -160,7 +160,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
160 if (IS_RDONLY(inode)) 160 if (IS_RDONLY(inode))
161 return -EROFS; 161 return -EROFS;
162 162
163 if (!is_owner_or_cap(inode)) 163 if (!inode_owner_or_capable(inode))
164 return -EACCES; 164 return -EACCES;
165 165
166 if (get_user(flags, (int __user *) arg)) 166 if (get_user(flags, (int __user *) arg))
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 914f1bd89e57..aed25e864227 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -690,7 +690,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
690{ 690{
691 struct ubifs_data_node *data; 691 struct ubifs_data_node *data;
692 int err, lnum, offs, compr_type, out_len; 692 int err, lnum, offs, compr_type, out_len;
693 int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR; 693 int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
694 struct ubifs_inode *ui = ubifs_inode(inode); 694 struct ubifs_inode *ui = ubifs_inode(inode);
695 695
696 dbg_jnl("ino %lu, blk %u, len %d, key %s", 696 dbg_jnl("ino %lu, blk %u, len %d, key %s",
@@ -698,9 +698,19 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
698 DBGKEY(key)); 698 DBGKEY(key));
699 ubifs_assert(len <= UBIFS_BLOCK_SIZE); 699 ubifs_assert(len <= UBIFS_BLOCK_SIZE);
700 700
701 data = kmalloc(dlen, GFP_NOFS); 701 data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
702 if (!data) 702 if (!data) {
703 return -ENOMEM; 703 /*
704 * Fall-back to the write reserve buffer. Note, we might be
705 * currently on the memory reclaim path, when the kernel is
706 * trying to free some memory by writing out dirty pages. The
707 * write reserve buffer helps us to guarantee that we are
708 * always able to write the data.
709 */
710 allocated = 0;
711 mutex_lock(&c->write_reserve_mutex);
712 data = c->write_reserve_buf;
713 }
704 714
705 data->ch.node_type = UBIFS_DATA_NODE; 715 data->ch.node_type = UBIFS_DATA_NODE;
706 key_write(c, key, &data->key); 716 key_write(c, key, &data->key);
@@ -736,7 +746,10 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
736 goto out_ro; 746 goto out_ro;
737 747
738 finish_reservation(c); 748 finish_reservation(c);
739 kfree(data); 749 if (!allocated)
750 mutex_unlock(&c->write_reserve_mutex);
751 else
752 kfree(data);
740 return 0; 753 return 0;
741 754
742out_release: 755out_release:
@@ -745,7 +758,10 @@ out_ro:
745 ubifs_ro_mode(c, err); 758 ubifs_ro_mode(c, err);
746 finish_reservation(c); 759 finish_reservation(c);
747out_free: 760out_free:
748 kfree(data); 761 if (!allocated)
762 mutex_unlock(&c->write_reserve_mutex);
763 else
764 kfree(data);
749 return err; 765 return err;
750} 766}
751 767
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 4d4ca388889b..0ee0847f2421 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1035,7 +1035,8 @@ static int scan_check_cb(struct ubifs_info *c,
1035 struct ubifs_scan_leb *sleb; 1035 struct ubifs_scan_leb *sleb;
1036 struct ubifs_scan_node *snod; 1036 struct ubifs_scan_node *snod;
1037 struct ubifs_lp_stats *lst = &data->lst; 1037 struct ubifs_lp_stats *lst = &data->lst;
1038 int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty; 1038 int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
1039 void *buf = NULL;
1039 1040
1040 cat = lp->flags & LPROPS_CAT_MASK; 1041 cat = lp->flags & LPROPS_CAT_MASK;
1041 if (cat != LPROPS_UNCAT) { 1042 if (cat != LPROPS_UNCAT) {
@@ -1093,7 +1094,13 @@ static int scan_check_cb(struct ubifs_info *c,
1093 } 1094 }
1094 } 1095 }
1095 1096
1096 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); 1097 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
1098 if (!buf) {
1099 ubifs_err("cannot allocate memory to scan LEB %d", lnum);
1100 goto out;
1101 }
1102
1103 sleb = ubifs_scan(c, lnum, 0, buf, 0);
1097 if (IS_ERR(sleb)) { 1104 if (IS_ERR(sleb)) {
1098 /* 1105 /*
1099 * After an unclean unmount, empty and freeable LEBs 1106 * After an unclean unmount, empty and freeable LEBs
@@ -1105,7 +1112,8 @@ static int scan_check_cb(struct ubifs_info *c,
1105 lst->empty_lebs += 1; 1112 lst->empty_lebs += 1;
1106 lst->total_free += c->leb_size; 1113 lst->total_free += c->leb_size;
1107 lst->total_dark += ubifs_calc_dark(c, c->leb_size); 1114 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1108 return LPT_SCAN_CONTINUE; 1115 ret = LPT_SCAN_CONTINUE;
1116 goto exit;
1109 } 1117 }
1110 1118
1111 if (lp->free + lp->dirty == c->leb_size && 1119 if (lp->free + lp->dirty == c->leb_size &&
@@ -1115,10 +1123,12 @@ static int scan_check_cb(struct ubifs_info *c,
1115 lst->total_free += lp->free; 1123 lst->total_free += lp->free;
1116 lst->total_dirty += lp->dirty; 1124 lst->total_dirty += lp->dirty;
1117 lst->total_dark += ubifs_calc_dark(c, c->leb_size); 1125 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1118 return LPT_SCAN_CONTINUE; 1126 ret = LPT_SCAN_CONTINUE;
1127 goto exit;
1119 } 1128 }
1120 data->err = PTR_ERR(sleb); 1129 data->err = PTR_ERR(sleb);
1121 return LPT_SCAN_STOP; 1130 ret = LPT_SCAN_STOP;
1131 goto exit;
1122 } 1132 }
1123 1133
1124 is_idx = -1; 1134 is_idx = -1;
@@ -1236,7 +1246,10 @@ static int scan_check_cb(struct ubifs_info *c,
1236 } 1246 }
1237 1247
1238 ubifs_scan_destroy(sleb); 1248 ubifs_scan_destroy(sleb);
1239 return LPT_SCAN_CONTINUE; 1249 ret = LPT_SCAN_CONTINUE;
1250exit:
1251 vfree(buf);
1252 return ret;
1240 1253
1241out_print: 1254out_print:
1242 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " 1255 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
@@ -1246,6 +1259,7 @@ out_print:
1246out_destroy: 1259out_destroy:
1247 ubifs_scan_destroy(sleb); 1260 ubifs_scan_destroy(sleb);
1248out: 1261out:
1262 vfree(buf);
1249 data->err = -EINVAL; 1263 data->err = -EINVAL;
1250 return LPT_SCAN_STOP; 1264 return LPT_SCAN_STOP;
1251} 1265}
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 72775d35b99e..ef5155e109a2 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1270,10 +1270,9 @@ static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
1270 lnum = branch->lnum; 1270 lnum = branch->lnum;
1271 offs = branch->offs; 1271 offs = branch->offs;
1272 pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS); 1272 pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
1273 if (!pnode) { 1273 if (!pnode)
1274 err = -ENOMEM; 1274 return -ENOMEM;
1275 goto out; 1275
1276 }
1277 if (lnum == 0) { 1276 if (lnum == 0) {
1278 /* 1277 /*
1279 * This pnode was not written which just means that the LEB 1278 * This pnode was not written which just means that the LEB
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 5c90dec5db0b..0c9c69bd983a 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1628,29 +1628,35 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
1628{ 1628{
1629 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; 1629 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
1630 int ret; 1630 int ret;
1631 void *buf = c->dbg->buf; 1631 void *buf, *p;
1632 1632
1633 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) 1633 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1634 return 0; 1634 return 0;
1635 1635
1636 buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
1637 if (!buf) {
1638 ubifs_err("cannot allocate memory for ltab checking");
1639 return 0;
1640 }
1641
1636 dbg_lp("LEB %d", lnum); 1642 dbg_lp("LEB %d", lnum);
1637 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); 1643 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
1638 if (err) { 1644 if (err) {
1639 dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err); 1645 dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err);
1640 return err; 1646 goto out;
1641 } 1647 }
1642 while (1) { 1648 while (1) {
1643 if (!is_a_node(c, buf, len)) { 1649 if (!is_a_node(c, p, len)) {
1644 int i, pad_len; 1650 int i, pad_len;
1645 1651
1646 pad_len = get_pad_len(c, buf, len); 1652 pad_len = get_pad_len(c, p, len);
1647 if (pad_len) { 1653 if (pad_len) {
1648 buf += pad_len; 1654 p += pad_len;
1649 len -= pad_len; 1655 len -= pad_len;
1650 dirty += pad_len; 1656 dirty += pad_len;
1651 continue; 1657 continue;
1652 } 1658 }
1653 if (!dbg_is_all_ff(buf, len)) { 1659 if (!dbg_is_all_ff(p, len)) {
1654 dbg_msg("invalid empty space in LEB %d at %d", 1660 dbg_msg("invalid empty space in LEB %d at %d",
1655 lnum, c->leb_size - len); 1661 lnum, c->leb_size - len);
1656 err = -EINVAL; 1662 err = -EINVAL;
@@ -1668,16 +1674,21 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
1668 lnum, dirty, c->ltab[i].dirty); 1674 lnum, dirty, c->ltab[i].dirty);
1669 err = -EINVAL; 1675 err = -EINVAL;
1670 } 1676 }
1671 return err; 1677 goto out;
1672 } 1678 }
1673 node_type = get_lpt_node_type(c, buf, &node_num); 1679 node_type = get_lpt_node_type(c, p, &node_num);
1674 node_len = get_lpt_node_len(c, node_type); 1680 node_len = get_lpt_node_len(c, node_type);
1675 ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len); 1681 ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len);
1676 if (ret == 1) 1682 if (ret == 1)
1677 dirty += node_len; 1683 dirty += node_len;
1678 buf += node_len; 1684 p += node_len;
1679 len -= node_len; 1685 len -= node_len;
1680 } 1686 }
1687
1688 err = 0;
1689out:
1690 vfree(buf);
1691 return err;
1681} 1692}
1682 1693
1683/** 1694/**
@@ -1870,25 +1881,31 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1870static void dump_lpt_leb(const struct ubifs_info *c, int lnum) 1881static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1871{ 1882{
1872 int err, len = c->leb_size, node_type, node_num, node_len, offs; 1883 int err, len = c->leb_size, node_type, node_num, node_len, offs;
1873 void *buf = c->dbg->buf; 1884 void *buf, *p;
1874 1885
1875 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", 1886 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
1876 current->pid, lnum); 1887 current->pid, lnum);
1888 buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
1889 if (!buf) {
1890 ubifs_err("cannot allocate memory to dump LPT");
1891 return;
1892 }
1893
1877 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); 1894 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
1878 if (err) { 1895 if (err) {
1879 ubifs_err("cannot read LEB %d, error %d", lnum, err); 1896 ubifs_err("cannot read LEB %d, error %d", lnum, err);
1880 return; 1897 goto out;
1881 } 1898 }
1882 while (1) { 1899 while (1) {
1883 offs = c->leb_size - len; 1900 offs = c->leb_size - len;
1884 if (!is_a_node(c, buf, len)) { 1901 if (!is_a_node(c, p, len)) {
1885 int pad_len; 1902 int pad_len;
1886 1903
1887 pad_len = get_pad_len(c, buf, len); 1904 pad_len = get_pad_len(c, p, len);
1888 if (pad_len) { 1905 if (pad_len) {
1889 printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n", 1906 printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n",
1890 lnum, offs, pad_len); 1907 lnum, offs, pad_len);
1891 buf += pad_len; 1908 p += pad_len;
1892 len -= pad_len; 1909 len -= pad_len;
1893 continue; 1910 continue;
1894 } 1911 }
@@ -1898,7 +1915,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1898 break; 1915 break;
1899 } 1916 }
1900 1917
1901 node_type = get_lpt_node_type(c, buf, &node_num); 1918 node_type = get_lpt_node_type(c, p, &node_num);
1902 switch (node_type) { 1919 switch (node_type) {
1903 case UBIFS_LPT_PNODE: 1920 case UBIFS_LPT_PNODE:
1904 { 1921 {
@@ -1923,7 +1940,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1923 else 1940 else
1924 printk(KERN_DEBUG "LEB %d:%d, nnode, ", 1941 printk(KERN_DEBUG "LEB %d:%d, nnode, ",
1925 lnum, offs); 1942 lnum, offs);
1926 err = ubifs_unpack_nnode(c, buf, &nnode); 1943 err = ubifs_unpack_nnode(c, p, &nnode);
1927 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 1944 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1928 printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum, 1945 printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
1929 nnode.nbranch[i].offs); 1946 nnode.nbranch[i].offs);
@@ -1944,15 +1961,18 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1944 break; 1961 break;
1945 default: 1962 default:
1946 ubifs_err("LPT node type %d not recognized", node_type); 1963 ubifs_err("LPT node type %d not recognized", node_type);
1947 return; 1964 goto out;
1948 } 1965 }
1949 1966
1950 buf += node_len; 1967 p += node_len;
1951 len -= node_len; 1968 len -= node_len;
1952 } 1969 }
1953 1970
1954 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", 1971 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
1955 current->pid, lnum); 1972 current->pid, lnum);
1973out:
1974 vfree(buf);
1975 return;
1956} 1976}
1957 1977
1958/** 1978/**
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 82009c74b6a3..09df318e368f 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -892,15 +892,22 @@ static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb)
892static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) 892static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
893{ 893{
894 int lnum, err = 0; 894 int lnum, err = 0;
895 void *buf;
895 896
896 /* Check no-orphans flag and skip this if no orphans */ 897 /* Check no-orphans flag and skip this if no orphans */
897 if (c->no_orphs) 898 if (c->no_orphs)
898 return 0; 899 return 0;
899 900
901 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
902 if (!buf) {
903 ubifs_err("cannot allocate memory to check orphans");
904 return 0;
905 }
906
900 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { 907 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
901 struct ubifs_scan_leb *sleb; 908 struct ubifs_scan_leb *sleb;
902 909
903 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); 910 sleb = ubifs_scan(c, lnum, 0, buf, 0);
904 if (IS_ERR(sleb)) { 911 if (IS_ERR(sleb)) {
905 err = PTR_ERR(sleb); 912 err = PTR_ERR(sleb);
906 break; 913 break;
@@ -912,6 +919,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
912 break; 919 break;
913 } 920 }
914 921
922 vfree(buf);
915 return err; 923 return err;
916} 924}
917 925
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 77e9b874b6c2..936f2cbfe6b6 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -28,6 +28,23 @@
28 * UBIFS always cleans away all remnants of an unclean un-mount, so that 28 * UBIFS always cleans away all remnants of an unclean un-mount, so that
29 * errors do not accumulate. However UBIFS defers recovery if it is mounted 29 * errors do not accumulate. However UBIFS defers recovery if it is mounted
30 * read-only, and the flash is not modified in that case. 30 * read-only, and the flash is not modified in that case.
31 *
32 * The general UBIFS approach to the recovery is that it recovers from
33 * corruptions which could be caused by power cuts, but it refuses to recover
34 * from corruption caused by other reasons. And UBIFS tries to distinguish
35 * between these 2 reasons of corruptions and silently recover in the former
36 * case and loudly complain in the latter case.
37 *
38 * UBIFS writes only to erased LEBs, so it writes only to the flash space
39 * containing only 0xFFs. UBIFS also always writes strictly from the beginning
40 * of the LEB to the end. And UBIFS assumes that the underlying flash media
41 * writes in @c->max_write_size bytes at a time.
42 *
43 * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min.
44 * I/O unit corresponding to offset X to contain corrupted data, all the
45 * following min. I/O units have to contain empty space (all 0xFFs). If this is
46 * not true, the corruption cannot be the result of a power cut, and UBIFS
47 * refuses to mount.
31 */ 48 */
32 49
33#include <linux/crc32.h> 50#include <linux/crc32.h>
@@ -362,8 +379,9 @@ int ubifs_write_rcvrd_mst_node(struct ubifs_info *c)
362 * @offs: offset to check 379 * @offs: offset to check
363 * 380 *
364 * This function returns %1 if @offs was in the last write to the LEB whose data 381 * This function returns %1 if @offs was in the last write to the LEB whose data
365 * is in @buf, otherwise %0 is returned. The determination is made by checking 382 * is in @buf, otherwise %0 is returned. The determination is made by checking
366 * for subsequent empty space starting from the next @c->min_io_size boundary. 383 * for subsequent empty space starting from the next @c->max_write_size
384 * boundary.
367 */ 385 */
368static int is_last_write(const struct ubifs_info *c, void *buf, int offs) 386static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
369{ 387{
@@ -371,10 +389,10 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
371 uint8_t *p; 389 uint8_t *p;
372 390
373 /* 391 /*
374 * Round up to the next @c->min_io_size boundary i.e. @offs is in the 392 * Round up to the next @c->max_write_size boundary i.e. @offs is in
375 * last wbuf written. After that should be empty space. 393 * the last wbuf written. After that should be empty space.
376 */ 394 */
377 empty_offs = ALIGN(offs + 1, c->min_io_size); 395 empty_offs = ALIGN(offs + 1, c->max_write_size);
378 check_len = c->leb_size - empty_offs; 396 check_len = c->leb_size - empty_offs;
379 p = buf + empty_offs - offs; 397 p = buf + empty_offs - offs;
380 return is_empty(p, check_len); 398 return is_empty(p, check_len);
@@ -429,7 +447,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
429 int skip, dlen = le32_to_cpu(ch->len); 447 int skip, dlen = le32_to_cpu(ch->len);
430 448
431 /* Check for empty space after the corrupt node's common header */ 449 /* Check for empty space after the corrupt node's common header */
432 skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs; 450 skip = ALIGN(offs + UBIFS_CH_SZ, c->max_write_size) - offs;
433 if (is_empty(buf + skip, len - skip)) 451 if (is_empty(buf + skip, len - skip))
434 return 1; 452 return 1;
435 /* 453 /*
@@ -441,7 +459,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
441 return 0; 459 return 0;
442 } 460 }
443 /* Now we know the corrupt node's length we can skip over it */ 461 /* Now we know the corrupt node's length we can skip over it */
444 skip = ALIGN(offs + dlen, c->min_io_size) - offs; 462 skip = ALIGN(offs + dlen, c->max_write_size) - offs;
445 /* After which there should be empty space */ 463 /* After which there should be empty space */
446 if (is_empty(buf + skip, len - skip)) 464 if (is_empty(buf + skip, len - skip))
447 return 1; 465 return 1;
@@ -671,10 +689,14 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
671 } else { 689 } else {
672 int corruption = first_non_ff(buf, len); 690 int corruption = first_non_ff(buf, len);
673 691
692 /*
693 * See header comment for this file for more
694 * explanations about the reasons we have this check.
695 */
674 ubifs_err("corrupt empty space LEB %d:%d, corruption " 696 ubifs_err("corrupt empty space LEB %d:%d, corruption "
675 "starts at %d", lnum, offs, corruption); 697 "starts at %d", lnum, offs, corruption);
676 /* Make sure we dump interesting non-0xFF data */ 698 /* Make sure we dump interesting non-0xFF data */
677 offs = corruption; 699 offs += corruption;
678 buf += corruption; 700 buf += corruption;
679 goto corrupted; 701 goto corrupted;
680 } 702 }
@@ -836,12 +858,8 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
836static int recover_head(const struct ubifs_info *c, int lnum, int offs, 858static int recover_head(const struct ubifs_info *c, int lnum, int offs,
837 void *sbuf) 859 void *sbuf)
838{ 860{
839 int len, err; 861 int len = c->max_write_size, err;
840 862
841 if (c->min_io_size > 1)
842 len = c->min_io_size;
843 else
844 len = 512;
845 if (offs + len > c->leb_size) 863 if (offs + len > c->leb_size)
846 len = c->leb_size - offs; 864 len = c->leb_size - offs;
847 865
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 3e1ee57dbeaa..36216b46f772 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -328,7 +328,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
328 if (!quiet) 328 if (!quiet)
329 ubifs_err("empty space starts at non-aligned offset %d", 329 ubifs_err("empty space starts at non-aligned offset %d",
330 offs); 330 offs);
331 goto corrupted;; 331 goto corrupted;
332 } 332 }
333 333
334 ubifs_end_scan(c, sleb, lnum, offs); 334 ubifs_end_scan(c, sleb, lnum, offs);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 6e11c2975dcf..c75f6133206c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -512,9 +512,12 @@ static int init_constants_early(struct ubifs_info *c)
512 512
513 c->leb_cnt = c->vi.size; 513 c->leb_cnt = c->vi.size;
514 c->leb_size = c->vi.usable_leb_size; 514 c->leb_size = c->vi.usable_leb_size;
515 c->leb_start = c->di.leb_start;
515 c->half_leb_size = c->leb_size / 2; 516 c->half_leb_size = c->leb_size / 2;
516 c->min_io_size = c->di.min_io_size; 517 c->min_io_size = c->di.min_io_size;
517 c->min_io_shift = fls(c->min_io_size) - 1; 518 c->min_io_shift = fls(c->min_io_size) - 1;
519 c->max_write_size = c->di.max_write_size;
520 c->max_write_shift = fls(c->max_write_size) - 1;
518 521
519 if (c->leb_size < UBIFS_MIN_LEB_SZ) { 522 if (c->leb_size < UBIFS_MIN_LEB_SZ) {
520 ubifs_err("too small LEBs (%d bytes), min. is %d bytes", 523 ubifs_err("too small LEBs (%d bytes), min. is %d bytes",
@@ -534,6 +537,18 @@ static int init_constants_early(struct ubifs_info *c)
534 } 537 }
535 538
536 /* 539 /*
540 * Maximum write size has to be greater or equivalent to min. I/O
541 * size, and be multiple of min. I/O size.
542 */
543 if (c->max_write_size < c->min_io_size ||
544 c->max_write_size % c->min_io_size ||
545 !is_power_of_2(c->max_write_size)) {
546 ubifs_err("bad write buffer size %d for %d min. I/O unit",
547 c->max_write_size, c->min_io_size);
548 return -EINVAL;
549 }
550
551 /*
537 * UBIFS aligns all node to 8-byte boundary, so to make function in 552 * UBIFS aligns all node to 8-byte boundary, so to make function in
538 * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is 553 * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is
539 * less than 8. 554 * less than 8.
@@ -541,6 +556,10 @@ static int init_constants_early(struct ubifs_info *c)
541 if (c->min_io_size < 8) { 556 if (c->min_io_size < 8) {
542 c->min_io_size = 8; 557 c->min_io_size = 8;
543 c->min_io_shift = 3; 558 c->min_io_shift = 3;
559 if (c->max_write_size < c->min_io_size) {
560 c->max_write_size = c->min_io_size;
561 c->max_write_shift = c->min_io_shift;
562 }
544 } 563 }
545 564
546 c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size); 565 c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size);
@@ -1202,11 +1221,14 @@ static int mount_ubifs(struct ubifs_info *c)
1202 if (c->bulk_read == 1) 1221 if (c->bulk_read == 1)
1203 bu_init(c); 1222 bu_init(c);
1204 1223
1205 /* 1224 if (!c->ro_mount) {
1206 * We have to check all CRCs, even for data nodes, when we mount the FS 1225 c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ,
1207 * (specifically, when we are replaying). 1226 GFP_KERNEL);
1208 */ 1227 if (!c->write_reserve_buf)
1209 c->always_chk_crc = 1; 1228 goto out_free;
1229 }
1230
1231 c->mounting = 1;
1210 1232
1211 err = ubifs_read_superblock(c); 1233 err = ubifs_read_superblock(c);
1212 if (err) 1234 if (err)
@@ -1382,7 +1404,7 @@ static int mount_ubifs(struct ubifs_info *c)
1382 if (err) 1404 if (err)
1383 goto out_infos; 1405 goto out_infos;
1384 1406
1385 c->always_chk_crc = 0; 1407 c->mounting = 0;
1386 1408
1387 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", 1409 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
1388 c->vi.ubi_num, c->vi.vol_id, c->vi.name); 1410 c->vi.ubi_num, c->vi.vol_id, c->vi.name);
@@ -1403,6 +1425,7 @@ static int mount_ubifs(struct ubifs_info *c)
1403 1425
1404 dbg_msg("compiled on: " __DATE__ " at " __TIME__); 1426 dbg_msg("compiled on: " __DATE__ " at " __TIME__);
1405 dbg_msg("min. I/O unit size: %d bytes", c->min_io_size); 1427 dbg_msg("min. I/O unit size: %d bytes", c->min_io_size);
1428 dbg_msg("max. write size: %d bytes", c->max_write_size);
1406 dbg_msg("LEB size: %d bytes (%d KiB)", 1429 dbg_msg("LEB size: %d bytes (%d KiB)",
1407 c->leb_size, c->leb_size >> 10); 1430 c->leb_size, c->leb_size >> 10);
1408 dbg_msg("data journal heads: %d", 1431 dbg_msg("data journal heads: %d",
@@ -1432,9 +1455,9 @@ static int mount_ubifs(struct ubifs_info *c)
1432 UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); 1455 UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);
1433 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", 1456 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu",
1434 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); 1457 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
1435 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu", 1458 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d",
1436 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, 1459 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
1437 UBIFS_MAX_DENT_NODE_SZ); 1460 UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout));
1438 dbg_msg("dead watermark: %d", c->dead_wm); 1461 dbg_msg("dead watermark: %d", c->dead_wm);
1439 dbg_msg("dark watermark: %d", c->dark_wm); 1462 dbg_msg("dark watermark: %d", c->dark_wm);
1440 dbg_msg("LEB overhead: %d", c->leb_overhead); 1463 dbg_msg("LEB overhead: %d", c->leb_overhead);
@@ -1474,6 +1497,7 @@ out_wbufs:
1474out_cbuf: 1497out_cbuf:
1475 kfree(c->cbuf); 1498 kfree(c->cbuf);
1476out_free: 1499out_free:
1500 kfree(c->write_reserve_buf);
1477 kfree(c->bu.buf); 1501 kfree(c->bu.buf);
1478 vfree(c->ileb_buf); 1502 vfree(c->ileb_buf);
1479 vfree(c->sbuf); 1503 vfree(c->sbuf);
@@ -1512,6 +1536,7 @@ static void ubifs_umount(struct ubifs_info *c)
1512 kfree(c->cbuf); 1536 kfree(c->cbuf);
1513 kfree(c->rcvrd_mst_node); 1537 kfree(c->rcvrd_mst_node);
1514 kfree(c->mst_node); 1538 kfree(c->mst_node);
1539 kfree(c->write_reserve_buf);
1515 kfree(c->bu.buf); 1540 kfree(c->bu.buf);
1516 vfree(c->ileb_buf); 1541 vfree(c->ileb_buf);
1517 vfree(c->sbuf); 1542 vfree(c->sbuf);
@@ -1543,7 +1568,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1543 mutex_lock(&c->umount_mutex); 1568 mutex_lock(&c->umount_mutex);
1544 dbg_save_space_info(c); 1569 dbg_save_space_info(c);
1545 c->remounting_rw = 1; 1570 c->remounting_rw = 1;
1546 c->always_chk_crc = 1; 1571 c->ro_mount = 0;
1547 1572
1548 err = check_free_space(c); 1573 err = check_free_space(c);
1549 if (err) 1574 if (err)
@@ -1598,6 +1623,10 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1598 goto out; 1623 goto out;
1599 } 1624 }
1600 1625
1626 c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL);
1627 if (!c->write_reserve_buf)
1628 goto out;
1629
1601 err = ubifs_lpt_init(c, 0, 1); 1630 err = ubifs_lpt_init(c, 0, 1);
1602 if (err) 1631 if (err)
1603 goto out; 1632 goto out;
@@ -1648,14 +1677,13 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1648 } 1677 }
1649 1678
1650 dbg_gen("re-mounted read-write"); 1679 dbg_gen("re-mounted read-write");
1651 c->ro_mount = 0;
1652 c->remounting_rw = 0; 1680 c->remounting_rw = 0;
1653 c->always_chk_crc = 0;
1654 err = dbg_check_space_info(c); 1681 err = dbg_check_space_info(c);
1655 mutex_unlock(&c->umount_mutex); 1682 mutex_unlock(&c->umount_mutex);
1656 return err; 1683 return err;
1657 1684
1658out: 1685out:
1686 c->ro_mount = 1;
1659 vfree(c->orph_buf); 1687 vfree(c->orph_buf);
1660 c->orph_buf = NULL; 1688 c->orph_buf = NULL;
1661 if (c->bgt) { 1689 if (c->bgt) {
@@ -1663,11 +1691,12 @@ out:
1663 c->bgt = NULL; 1691 c->bgt = NULL;
1664 } 1692 }
1665 free_wbufs(c); 1693 free_wbufs(c);
1694 kfree(c->write_reserve_buf);
1695 c->write_reserve_buf = NULL;
1666 vfree(c->ileb_buf); 1696 vfree(c->ileb_buf);
1667 c->ileb_buf = NULL; 1697 c->ileb_buf = NULL;
1668 ubifs_lpt_free(c, 1); 1698 ubifs_lpt_free(c, 1);
1669 c->remounting_rw = 0; 1699 c->remounting_rw = 0;
1670 c->always_chk_crc = 0;
1671 mutex_unlock(&c->umount_mutex); 1700 mutex_unlock(&c->umount_mutex);
1672 return err; 1701 return err;
1673} 1702}
@@ -1707,6 +1736,8 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1707 free_wbufs(c); 1736 free_wbufs(c);
1708 vfree(c->orph_buf); 1737 vfree(c->orph_buf);
1709 c->orph_buf = NULL; 1738 c->orph_buf = NULL;
1739 kfree(c->write_reserve_buf);
1740 c->write_reserve_buf = NULL;
1710 vfree(c->ileb_buf); 1741 vfree(c->ileb_buf);
1711 c->ileb_buf = NULL; 1742 c->ileb_buf = NULL;
1712 ubifs_lpt_free(c, 1); 1743 ubifs_lpt_free(c, 1);
@@ -1937,6 +1968,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1937 mutex_init(&c->mst_mutex); 1968 mutex_init(&c->mst_mutex);
1938 mutex_init(&c->umount_mutex); 1969 mutex_init(&c->umount_mutex);
1939 mutex_init(&c->bu_mutex); 1970 mutex_init(&c->bu_mutex);
1971 mutex_init(&c->write_reserve_mutex);
1940 init_waitqueue_head(&c->cmt_wq); 1972 init_waitqueue_head(&c->cmt_wq);
1941 c->buds = RB_ROOT; 1973 c->buds = RB_ROOT;
1942 c->old_idx = RB_ROOT; 1974 c->old_idx = RB_ROOT;
@@ -1954,6 +1986,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1954 INIT_LIST_HEAD(&c->old_buds); 1986 INIT_LIST_HEAD(&c->old_buds);
1955 INIT_LIST_HEAD(&c->orph_list); 1987 INIT_LIST_HEAD(&c->orph_list);
1956 INIT_LIST_HEAD(&c->orph_new); 1988 INIT_LIST_HEAD(&c->orph_new);
1989 c->no_chk_data_crc = 1;
1957 1990
1958 c->vfs_sb = sb; 1991 c->vfs_sb = sb;
1959 c->highest_inum = UBIFS_FIRST_INO; 1992 c->highest_inum = UBIFS_FIRST_INO;
@@ -1979,7 +2012,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1979 */ 2012 */
1980 c->bdi.name = "ubifs", 2013 c->bdi.name = "ubifs",
1981 c->bdi.capabilities = BDI_CAP_MAP_COPY; 2014 c->bdi.capabilities = BDI_CAP_MAP_COPY;
1982 c->bdi.unplug_io_fn = default_unplug_io_fn;
1983 err = bdi_init(&c->bdi); 2015 err = bdi_init(&c->bdi);
1984 if (err) 2016 if (err)
1985 goto out_close; 2017 goto out_close;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index ad9cf0133622..de485979ca39 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -447,8 +447,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
447 * 447 *
448 * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc 448 * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc
449 * is true (it is controlled by corresponding mount option). However, if 449 * is true (it is controlled by corresponding mount option). However, if
450 * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always 450 * @c->mounting or @c->remounting_rw is true (we are mounting or re-mounting to
451 * checked. 451 * R/W mode), @c->no_chk_data_crc is ignored and CRC is checked. This is
452 * because during mounting or re-mounting from R/O mode to R/W mode we may read
453 * journal nodes (when replying the journal or doing the recovery) and the
454 * journal nodes may potentially be corrupted, so checking is required.
452 */ 455 */
453static int try_read_node(const struct ubifs_info *c, void *buf, int type, 456static int try_read_node(const struct ubifs_info *c, void *buf, int type,
454 int len, int lnum, int offs) 457 int len, int lnum, int offs)
@@ -476,7 +479,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
476 if (node_len != len) 479 if (node_len != len)
477 return 0; 480 return 0;
478 481
479 if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc) 482 if (type == UBIFS_DATA_NODE && c->no_chk_data_crc && !c->mounting &&
483 !c->remounting_rw)
480 return 1; 484 return 1;
481 485
482 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); 486 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 381d6b207a52..8c40ad3c6721 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -151,6 +151,12 @@
151 */ 151 */
152#define WORST_COMPR_FACTOR 2 152#define WORST_COMPR_FACTOR 2
153 153
154/*
155 * How much memory is needed for a buffer where we comress a data node.
156 */
157#define COMPRESSED_DATA_NODE_BUF_SZ \
158 (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR)
159
154/* Maximum expected tree height for use by bottom_up_buf */ 160/* Maximum expected tree height for use by bottom_up_buf */
155#define BOTTOM_UP_HEIGHT 64 161#define BOTTOM_UP_HEIGHT 64
156 162
@@ -646,6 +652,7 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
646 * @offs: write-buffer offset in this logical eraseblock 652 * @offs: write-buffer offset in this logical eraseblock
647 * @avail: number of bytes available in the write-buffer 653 * @avail: number of bytes available in the write-buffer
648 * @used: number of used bytes in the write-buffer 654 * @used: number of used bytes in the write-buffer
655 * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range)
649 * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, 656 * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM,
650 * %UBI_UNKNOWN) 657 * %UBI_UNKNOWN)
651 * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep 658 * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep
@@ -680,6 +687,7 @@ struct ubifs_wbuf {
680 int offs; 687 int offs;
681 int avail; 688 int avail;
682 int used; 689 int used;
690 int size;
683 int dtype; 691 int dtype;
684 int jhead; 692 int jhead;
685 int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); 693 int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
@@ -1003,6 +1011,11 @@ struct ubifs_debug_info;
1003 * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu 1011 * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu
1004 * @bu: pre-allocated bulk-read information 1012 * @bu: pre-allocated bulk-read information
1005 * 1013 *
1014 * @write_reserve_mutex: protects @write_reserve_buf
1015 * @write_reserve_buf: on the write path we allocate memory, which might
1016 * sometimes be unavailable, in which case we use this
1017 * write reserve buffer
1018 *
1006 * @log_lebs: number of logical eraseblocks in the log 1019 * @log_lebs: number of logical eraseblocks in the log
1007 * @log_bytes: log size in bytes 1020 * @log_bytes: log size in bytes
1008 * @log_last: last LEB of the log 1021 * @log_last: last LEB of the log
@@ -1024,7 +1037,12 @@ struct ubifs_debug_info;
1024 * 1037 *
1025 * @min_io_size: minimal input/output unit size 1038 * @min_io_size: minimal input/output unit size
1026 * @min_io_shift: number of bits in @min_io_size minus one 1039 * @min_io_shift: number of bits in @min_io_size minus one
1040 * @max_write_size: maximum amount of bytes the underlying flash can write at a
1041 * time (MTD write buffer size)
1042 * @max_write_shift: number of bits in @max_write_size minus one
1027 * @leb_size: logical eraseblock size in bytes 1043 * @leb_size: logical eraseblock size in bytes
1044 * @leb_start: starting offset of logical eraseblocks within physical
1045 * eraseblocks
1028 * @half_leb_size: half LEB size 1046 * @half_leb_size: half LEB size
1029 * @idx_leb_size: how many bytes of an LEB are effectively available when it is 1047 * @idx_leb_size: how many bytes of an LEB are effectively available when it is
1030 * used to store indexing nodes (@leb_size - @max_idx_node_sz) 1048 * used to store indexing nodes (@leb_size - @max_idx_node_sz)
@@ -1166,22 +1184,21 @@ struct ubifs_debug_info;
1166 * @rp_uid: reserved pool user ID 1184 * @rp_uid: reserved pool user ID
1167 * @rp_gid: reserved pool group ID 1185 * @rp_gid: reserved pool group ID
1168 * 1186 *
1169 * @empty: if the UBI device is empty 1187 * @empty: %1 if the UBI device is empty
1188 * @need_recovery: %1 if the file-system needs recovery
1189 * @replaying: %1 during journal replay
1190 * @mounting: %1 while mounting
1191 * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
1170 * @replay_tree: temporary tree used during journal replay 1192 * @replay_tree: temporary tree used during journal replay
1171 * @replay_list: temporary list used during journal replay 1193 * @replay_list: temporary list used during journal replay
1172 * @replay_buds: list of buds to replay 1194 * @replay_buds: list of buds to replay
1173 * @cs_sqnum: sequence number of first node in the log (commit start node) 1195 * @cs_sqnum: sequence number of first node in the log (commit start node)
1174 * @replay_sqnum: sequence number of node currently being replayed 1196 * @replay_sqnum: sequence number of node currently being replayed
1175 * @need_recovery: file-system needs recovery
1176 * @replaying: set to %1 during journal replay
1177 * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W 1197 * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W
1178 * mode 1198 * mode
1179 * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted 1199 * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted
1180 * FS to R/W mode 1200 * FS to R/W mode
1181 * @size_tree: inode size information for recovery 1201 * @size_tree: inode size information for recovery
1182 * @remounting_rw: set while re-mounting from R/O mode to R/W mode
1183 * @always_chk_crc: always check CRCs (while mounting and remounting to R/W
1184 * mode)
1185 * @mount_opts: UBIFS-specific mount options 1202 * @mount_opts: UBIFS-specific mount options
1186 * 1203 *
1187 * @dbg: debugging-related information 1204 * @dbg: debugging-related information
@@ -1250,6 +1267,9 @@ struct ubifs_info {
1250 struct mutex bu_mutex; 1267 struct mutex bu_mutex;
1251 struct bu_info bu; 1268 struct bu_info bu;
1252 1269
1270 struct mutex write_reserve_mutex;
1271 void *write_reserve_buf;
1272
1253 int log_lebs; 1273 int log_lebs;
1254 long long log_bytes; 1274 long long log_bytes;
1255 int log_last; 1275 int log_last;
@@ -1271,7 +1291,10 @@ struct ubifs_info {
1271 1291
1272 int min_io_size; 1292 int min_io_size;
1273 int min_io_shift; 1293 int min_io_shift;
1294 int max_write_size;
1295 int max_write_shift;
1274 int leb_size; 1296 int leb_size;
1297 int leb_start;
1275 int half_leb_size; 1298 int half_leb_size;
1276 int idx_leb_size; 1299 int idx_leb_size;
1277 int leb_cnt; 1300 int leb_cnt;
@@ -1402,19 +1425,19 @@ struct ubifs_info {
1402 gid_t rp_gid; 1425 gid_t rp_gid;
1403 1426
1404 /* The below fields are used only during mounting and re-mounting */ 1427 /* The below fields are used only during mounting and re-mounting */
1405 int empty; 1428 unsigned int empty:1;
1429 unsigned int need_recovery:1;
1430 unsigned int replaying:1;
1431 unsigned int mounting:1;
1432 unsigned int remounting_rw:1;
1406 struct rb_root replay_tree; 1433 struct rb_root replay_tree;
1407 struct list_head replay_list; 1434 struct list_head replay_list;
1408 struct list_head replay_buds; 1435 struct list_head replay_buds;
1409 unsigned long long cs_sqnum; 1436 unsigned long long cs_sqnum;
1410 unsigned long long replay_sqnum; 1437 unsigned long long replay_sqnum;
1411 int need_recovery;
1412 int replaying;
1413 struct list_head unclean_leb_list; 1438 struct list_head unclean_leb_list;
1414 struct ubifs_mst_node *rcvrd_mst_node; 1439 struct ubifs_mst_node *rcvrd_mst_node;
1415 struct rb_root size_tree; 1440 struct rb_root size_tree;
1416 int remounting_rw;
1417 int always_chk_crc;
1418 struct ubifs_mount_opts mount_opts; 1441 struct ubifs_mount_opts mount_opts;
1419 1442
1420#ifdef CONFIG_UBIFS_FS_DEBUG 1443#ifdef CONFIG_UBIFS_FS_DEBUG
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index c74400f88fe0..3299f469e712 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -56,6 +56,7 @@
56 */ 56 */
57 57
58#include "ubifs.h" 58#include "ubifs.h"
59#include <linux/fs.h>
59#include <linux/slab.h> 60#include <linux/slab.h>
60#include <linux/xattr.h> 61#include <linux/xattr.h>
61#include <linux/posix_acl_xattr.h> 62#include <linux/posix_acl_xattr.h>
@@ -80,7 +81,6 @@ enum {
80}; 81};
81 82
82static const struct inode_operations none_inode_operations; 83static const struct inode_operations none_inode_operations;
83static const struct address_space_operations none_address_operations;
84static const struct file_operations none_file_operations; 84static const struct file_operations none_file_operations;
85 85
86/** 86/**
@@ -130,7 +130,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
130 } 130 }
131 131
132 /* Re-define all operations to be "nothing" */ 132 /* Re-define all operations to be "nothing" */
133 inode->i_mapping->a_ops = &none_address_operations; 133 inode->i_mapping->a_ops = &empty_aops;
134 inode->i_op = &none_inode_operations; 134 inode->i_op = &none_inode_operations;
135 inode->i_fop = &none_file_operations; 135 inode->i_fop = &none_file_operations;
136 136
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 306ee39ef2c3..95518a9f589e 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -27,11 +27,10 @@
27#include "udf_i.h" 27#include "udf_i.h"
28#include "udf_sb.h" 28#include "udf_sb.h"
29 29
30#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr) 30#define udf_clear_bit __test_and_clear_bit_le
31#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr) 31#define udf_set_bit __test_and_set_bit_le
32#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr) 32#define udf_test_bit test_bit_le
33#define udf_find_next_one_bit(addr, size, offset) \ 33#define udf_find_next_one_bit find_next_bit_le
34 ext2_find_next_bit(addr, size, offset)
35 34
36static int read_block_bitmap(struct super_block *sb, 35static int read_block_bitmap(struct super_block *sb,
37 struct udf_bitmap *bitmap, unsigned int block, 36 struct udf_bitmap *bitmap, unsigned int block,
@@ -297,7 +296,7 @@ repeat:
297 break; 296 break;
298 } 297 }
299 } else { 298 } else {
300 bit = udf_find_next_one_bit((char *)bh->b_data, 299 bit = udf_find_next_one_bit(bh->b_data,
301 sb->s_blocksize << 3, 300 sb->s_blocksize << 3,
302 group_start << 3); 301 group_start << 3);
303 if (bit < sb->s_blocksize << 3) 302 if (bit < sb->s_blocksize << 3)
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 89c78486cbbe..2a346bb1d9f5 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -98,7 +98,6 @@ static int udf_adinicb_write_end(struct file *file,
98const struct address_space_operations udf_adinicb_aops = { 98const struct address_space_operations udf_adinicb_aops = {
99 .readpage = udf_adinicb_readpage, 99 .readpage = udf_adinicb_readpage,
100 .writepage = udf_adinicb_writepage, 100 .writepage = udf_adinicb_writepage,
101 .sync_page = block_sync_page,
102 .write_begin = simple_write_begin, 101 .write_begin = simple_write_begin,
103 .write_end = udf_adinicb_write_end, 102 .write_end = udf_adinicb_write_end,
104}; 103};
@@ -123,8 +122,8 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
123 if (inode->i_sb->s_blocksize < 122 if (inode->i_sb->s_blocksize <
124 (udf_file_entry_alloc_offset(inode) + 123 (udf_file_entry_alloc_offset(inode) +
125 pos + count)) { 124 pos + count)) {
126 udf_expand_file_adinicb(inode, pos + count, &err); 125 err = udf_expand_file_adinicb(inode);
127 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 126 if (err) {
128 udf_debug("udf_expand_adinicb: err=%d\n", err); 127 udf_debug("udf_expand_adinicb: err=%d\n", err);
129 up_write(&iinfo->i_data_sem); 128 up_write(&iinfo->i_data_sem);
130 return err; 129 return err;
@@ -237,7 +236,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
237 236
238 if ((attr->ia_valid & ATTR_SIZE) && 237 if ((attr->ia_valid & ATTR_SIZE) &&
239 attr->ia_size != i_size_read(inode)) { 238 attr->ia_size != i_size_read(inode)) {
240 error = vmtruncate(inode, attr->ia_size); 239 error = udf_setsize(inode, attr->ia_size);
241 if (error) 240 if (error)
242 return error; 241 return error;
243 } 242 }
@@ -249,5 +248,4 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
249 248
250const struct inode_operations udf_file_inode_operations = { 249const struct inode_operations udf_file_inode_operations = {
251 .setattr = udf_setattr, 250 .setattr = udf_setattr,
252 .truncate = udf_truncate,
253}; 251};
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index c6a2e782b97b..1d1358ed80c1 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -73,14 +73,12 @@ void udf_evict_inode(struct inode *inode)
73 struct udf_inode_info *iinfo = UDF_I(inode); 73 struct udf_inode_info *iinfo = UDF_I(inode);
74 int want_delete = 0; 74 int want_delete = 0;
75 75
76 truncate_inode_pages(&inode->i_data, 0);
77
78 if (!inode->i_nlink && !is_bad_inode(inode)) { 76 if (!inode->i_nlink && !is_bad_inode(inode)) {
79 want_delete = 1; 77 want_delete = 1;
80 inode->i_size = 0; 78 udf_setsize(inode, 0);
81 udf_truncate(inode);
82 udf_update_inode(inode, IS_SYNC(inode)); 79 udf_update_inode(inode, IS_SYNC(inode));
83 } 80 } else
81 truncate_inode_pages(&inode->i_data, 0);
84 invalidate_inode_buffers(inode); 82 invalidate_inode_buffers(inode);
85 end_writeback(inode); 83 end_writeback(inode);
86 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && 84 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
@@ -117,9 +115,18 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
117 115
118 ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block); 116 ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block);
119 if (unlikely(ret)) { 117 if (unlikely(ret)) {
120 loff_t isize = mapping->host->i_size; 118 struct inode *inode = mapping->host;
121 if (pos + len > isize) 119 struct udf_inode_info *iinfo = UDF_I(inode);
122 vmtruncate(mapping->host, isize); 120 loff_t isize = inode->i_size;
121
122 if (pos + len > isize) {
123 truncate_pagecache(inode, pos + len, isize);
124 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
125 down_write(&iinfo->i_data_sem);
126 udf_truncate_extents(inode);
127 up_write(&iinfo->i_data_sem);
128 }
129 }
123 } 130 }
124 131
125 return ret; 132 return ret;
@@ -133,36 +140,36 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
133const struct address_space_operations udf_aops = { 140const struct address_space_operations udf_aops = {
134 .readpage = udf_readpage, 141 .readpage = udf_readpage,
135 .writepage = udf_writepage, 142 .writepage = udf_writepage,
136 .sync_page = block_sync_page,
137 .write_begin = udf_write_begin, 143 .write_begin = udf_write_begin,
138 .write_end = generic_write_end, 144 .write_end = generic_write_end,
139 .bmap = udf_bmap, 145 .bmap = udf_bmap,
140}; 146};
141 147
142void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err) 148int udf_expand_file_adinicb(struct inode *inode)
143{ 149{
144 struct page *page; 150 struct page *page;
145 char *kaddr; 151 char *kaddr;
146 struct udf_inode_info *iinfo = UDF_I(inode); 152 struct udf_inode_info *iinfo = UDF_I(inode);
153 int err;
147 struct writeback_control udf_wbc = { 154 struct writeback_control udf_wbc = {
148 .sync_mode = WB_SYNC_NONE, 155 .sync_mode = WB_SYNC_NONE,
149 .nr_to_write = 1, 156 .nr_to_write = 1,
150 }; 157 };
151 158
152 /* from now on we have normal address_space methods */
153 inode->i_data.a_ops = &udf_aops;
154
155 if (!iinfo->i_lenAlloc) { 159 if (!iinfo->i_lenAlloc) {
156 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) 160 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
157 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; 161 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
158 else 162 else
159 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; 163 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
164 /* from now on we have normal address_space methods */
165 inode->i_data.a_ops = &udf_aops;
160 mark_inode_dirty(inode); 166 mark_inode_dirty(inode);
161 return; 167 return 0;
162 } 168 }
163 169
164 page = grab_cache_page(inode->i_mapping, 0); 170 page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
165 BUG_ON(!PageLocked(page)); 171 if (!page)
172 return -ENOMEM;
166 173
167 if (!PageUptodate(page)) { 174 if (!PageUptodate(page)) {
168 kaddr = kmap(page); 175 kaddr = kmap(page);
@@ -181,11 +188,24 @@ void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err)
181 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; 188 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
182 else 189 else
183 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; 190 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
184 191 /* from now on we have normal address_space methods */
185 inode->i_data.a_ops->writepage(page, &udf_wbc); 192 inode->i_data.a_ops = &udf_aops;
193 err = inode->i_data.a_ops->writepage(page, &udf_wbc);
194 if (err) {
195 /* Restore everything back so that we don't lose data... */
196 lock_page(page);
197 kaddr = kmap(page);
198 memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr,
199 inode->i_size);
200 kunmap(page);
201 unlock_page(page);
202 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
203 inode->i_data.a_ops = &udf_adinicb_aops;
204 }
186 page_cache_release(page); 205 page_cache_release(page);
187
188 mark_inode_dirty(inode); 206 mark_inode_dirty(inode);
207
208 return err;
189} 209}
190 210
191struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, 211struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
@@ -348,8 +368,10 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block,
348} 368}
349 369
350/* Extend the file by 'blocks' blocks, return the number of extents added */ 370/* Extend the file by 'blocks' blocks, return the number of extents added */
351int udf_extend_file(struct inode *inode, struct extent_position *last_pos, 371static int udf_do_extend_file(struct inode *inode,
352 struct kernel_long_ad *last_ext, sector_t blocks) 372 struct extent_position *last_pos,
373 struct kernel_long_ad *last_ext,
374 sector_t blocks)
353{ 375{
354 sector_t add; 376 sector_t add;
355 int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); 377 int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
@@ -357,6 +379,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
357 struct kernel_lb_addr prealloc_loc = {}; 379 struct kernel_lb_addr prealloc_loc = {};
358 int prealloc_len = 0; 380 int prealloc_len = 0;
359 struct udf_inode_info *iinfo; 381 struct udf_inode_info *iinfo;
382 int err;
360 383
361 /* The previous extent is fake and we should not extend by anything 384 /* The previous extent is fake and we should not extend by anything
362 * - there's nothing to do... */ 385 * - there's nothing to do... */
@@ -422,26 +445,29 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
422 /* Create enough extents to cover the whole hole */ 445 /* Create enough extents to cover the whole hole */
423 while (blocks > add) { 446 while (blocks > add) {
424 blocks -= add; 447 blocks -= add;
425 if (udf_add_aext(inode, last_pos, &last_ext->extLocation, 448 err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
426 last_ext->extLength, 1) == -1) 449 last_ext->extLength, 1);
427 return -1; 450 if (err)
451 return err;
428 count++; 452 count++;
429 } 453 }
430 if (blocks) { 454 if (blocks) {
431 last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | 455 last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
432 (blocks << sb->s_blocksize_bits); 456 (blocks << sb->s_blocksize_bits);
433 if (udf_add_aext(inode, last_pos, &last_ext->extLocation, 457 err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
434 last_ext->extLength, 1) == -1) 458 last_ext->extLength, 1);
435 return -1; 459 if (err)
460 return err;
436 count++; 461 count++;
437 } 462 }
438 463
439out: 464out:
440 /* Do we have some preallocated blocks saved? */ 465 /* Do we have some preallocated blocks saved? */
441 if (prealloc_len) { 466 if (prealloc_len) {
442 if (udf_add_aext(inode, last_pos, &prealloc_loc, 467 err = udf_add_aext(inode, last_pos, &prealloc_loc,
443 prealloc_len, 1) == -1) 468 prealloc_len, 1);
444 return -1; 469 if (err)
470 return err;
445 last_ext->extLocation = prealloc_loc; 471 last_ext->extLocation = prealloc_loc;
446 last_ext->extLength = prealloc_len; 472 last_ext->extLength = prealloc_len;
447 count++; 473 count++;
@@ -453,11 +479,68 @@ out:
453 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 479 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
454 last_pos->offset -= sizeof(struct long_ad); 480 last_pos->offset -= sizeof(struct long_ad);
455 else 481 else
456 return -1; 482 return -EIO;
457 483
458 return count; 484 return count;
459} 485}
460 486
487static int udf_extend_file(struct inode *inode, loff_t newsize)
488{
489
490 struct extent_position epos;
491 struct kernel_lb_addr eloc;
492 uint32_t elen;
493 int8_t etype;
494 struct super_block *sb = inode->i_sb;
495 sector_t first_block = newsize >> sb->s_blocksize_bits, offset;
496 int adsize;
497 struct udf_inode_info *iinfo = UDF_I(inode);
498 struct kernel_long_ad extent;
499 int err;
500
501 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
502 adsize = sizeof(struct short_ad);
503 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
504 adsize = sizeof(struct long_ad);
505 else
506 BUG();
507
508 etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
509
510 /* File has extent covering the new size (could happen when extending
511 * inside a block)? */
512 if (etype != -1)
513 return 0;
514 if (newsize & (sb->s_blocksize - 1))
515 offset++;
516 /* Extended file just to the boundary of the last file block? */
517 if (offset == 0)
518 return 0;
519
520 /* Truncate is extending the file by 'offset' blocks */
521 if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) ||
522 (epos.bh && epos.offset == sizeof(struct allocExtDesc))) {
523 /* File has no extents at all or has empty last
524 * indirect extent! Create a fake extent... */
525 extent.extLocation.logicalBlockNum = 0;
526 extent.extLocation.partitionReferenceNum = 0;
527 extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
528 } else {
529 epos.offset -= adsize;
530 etype = udf_next_aext(inode, &epos, &extent.extLocation,
531 &extent.extLength, 0);
532 extent.extLength |= etype << 30;
533 }
534 err = udf_do_extend_file(inode, &epos, &extent, offset);
535 if (err < 0)
536 goto out;
537 err = 0;
538 iinfo->i_lenExtents = newsize;
539out:
540 brelse(epos.bh);
541 return err;
542}
543
461static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, 544static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
462 int *err, sector_t *phys, int *new) 545 int *err, sector_t *phys, int *new)
463{ 546{
@@ -540,7 +623,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
540 elen = EXT_RECORDED_ALLOCATED | 623 elen = EXT_RECORDED_ALLOCATED |
541 ((elen + inode->i_sb->s_blocksize - 1) & 624 ((elen + inode->i_sb->s_blocksize - 1) &
542 ~(inode->i_sb->s_blocksize - 1)); 625 ~(inode->i_sb->s_blocksize - 1));
543 etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1); 626 udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
544 } 627 }
545 brelse(prev_epos.bh); 628 brelse(prev_epos.bh);
546 brelse(cur_epos.bh); 629 brelse(cur_epos.bh);
@@ -564,19 +647,17 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
564 memset(&laarr[0].extLocation, 0x00, 647 memset(&laarr[0].extLocation, 0x00,
565 sizeof(struct kernel_lb_addr)); 648 sizeof(struct kernel_lb_addr));
566 laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; 649 laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
567 /* Will udf_extend_file() create real extent from 650 /* Will udf_do_extend_file() create real extent from
568 a fake one? */ 651 a fake one? */
569 startnum = (offset > 0); 652 startnum = (offset > 0);
570 } 653 }
571 /* Create extents for the hole between EOF and offset */ 654 /* Create extents for the hole between EOF and offset */
572 ret = udf_extend_file(inode, &prev_epos, laarr, offset); 655 ret = udf_do_extend_file(inode, &prev_epos, laarr, offset);
573 if (ret == -1) { 656 if (ret < 0) {
574 brelse(prev_epos.bh); 657 brelse(prev_epos.bh);
575 brelse(cur_epos.bh); 658 brelse(cur_epos.bh);
576 brelse(next_epos.bh); 659 brelse(next_epos.bh);
577 /* We don't really know the error here so we just make 660 *err = ret;
578 * something up */
579 *err = -ENOSPC;
580 return NULL; 661 return NULL;
581 } 662 }
582 c = 0; 663 c = 0;
@@ -1005,52 +1086,66 @@ struct buffer_head *udf_bread(struct inode *inode, int block,
1005 return NULL; 1086 return NULL;
1006} 1087}
1007 1088
1008void udf_truncate(struct inode *inode) 1089int udf_setsize(struct inode *inode, loff_t newsize)
1009{ 1090{
1010 int offset;
1011 int err; 1091 int err;
1012 struct udf_inode_info *iinfo; 1092 struct udf_inode_info *iinfo;
1093 int bsize = 1 << inode->i_blkbits;
1013 1094
1014 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1095 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1015 S_ISLNK(inode->i_mode))) 1096 S_ISLNK(inode->i_mode)))
1016 return; 1097 return -EINVAL;
1017 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 1098 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1018 return; 1099 return -EPERM;
1019 1100
1020 iinfo = UDF_I(inode); 1101 iinfo = UDF_I(inode);
1021 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1102 if (newsize > inode->i_size) {
1022 down_write(&iinfo->i_data_sem); 1103 down_write(&iinfo->i_data_sem);
1023 if (inode->i_sb->s_blocksize < 1104 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1024 (udf_file_entry_alloc_offset(inode) + 1105 if (bsize <
1025 inode->i_size)) { 1106 (udf_file_entry_alloc_offset(inode) + newsize)) {
1026 udf_expand_file_adinicb(inode, inode->i_size, &err); 1107 err = udf_expand_file_adinicb(inode);
1027 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1108 if (err) {
1028 inode->i_size = iinfo->i_lenAlloc; 1109 up_write(&iinfo->i_data_sem);
1029 up_write(&iinfo->i_data_sem); 1110 return err;
1030 return; 1111 }
1031 } else 1112 } else
1032 udf_truncate_extents(inode); 1113 iinfo->i_lenAlloc = newsize;
1033 } else { 1114 }
1034 offset = inode->i_size & (inode->i_sb->s_blocksize - 1); 1115 err = udf_extend_file(inode, newsize);
1035 memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset, 1116 if (err) {
1036 0x00, inode->i_sb->s_blocksize - 1117 up_write(&iinfo->i_data_sem);
1037 offset - udf_file_entry_alloc_offset(inode)); 1118 return err;
1038 iinfo->i_lenAlloc = inode->i_size;
1039 } 1119 }
1120 truncate_setsize(inode, newsize);
1040 up_write(&iinfo->i_data_sem); 1121 up_write(&iinfo->i_data_sem);
1041 } else { 1122 } else {
1042 block_truncate_page(inode->i_mapping, inode->i_size, 1123 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1043 udf_get_block); 1124 down_write(&iinfo->i_data_sem);
1125 memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize,
1126 0x00, bsize - newsize -
1127 udf_file_entry_alloc_offset(inode));
1128 iinfo->i_lenAlloc = newsize;
1129 truncate_setsize(inode, newsize);
1130 up_write(&iinfo->i_data_sem);
1131 goto update_time;
1132 }
1133 err = block_truncate_page(inode->i_mapping, newsize,
1134 udf_get_block);
1135 if (err)
1136 return err;
1044 down_write(&iinfo->i_data_sem); 1137 down_write(&iinfo->i_data_sem);
1138 truncate_setsize(inode, newsize);
1045 udf_truncate_extents(inode); 1139 udf_truncate_extents(inode);
1046 up_write(&iinfo->i_data_sem); 1140 up_write(&iinfo->i_data_sem);
1047 } 1141 }
1048 1142update_time:
1049 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); 1143 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
1050 if (IS_SYNC(inode)) 1144 if (IS_SYNC(inode))
1051 udf_sync_inode(inode); 1145 udf_sync_inode(inode);
1052 else 1146 else
1053 mark_inode_dirty(inode); 1147 mark_inode_dirty(inode);
1148 return 0;
1054} 1149}
1055 1150
1056static void __udf_read_inode(struct inode *inode) 1151static void __udf_read_inode(struct inode *inode)
@@ -1637,14 +1732,13 @@ struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
1637 return NULL; 1732 return NULL;
1638} 1733}
1639 1734
1640int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, 1735int udf_add_aext(struct inode *inode, struct extent_position *epos,
1641 struct kernel_lb_addr *eloc, uint32_t elen, int inc) 1736 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
1642{ 1737{
1643 int adsize; 1738 int adsize;
1644 struct short_ad *sad = NULL; 1739 struct short_ad *sad = NULL;
1645 struct long_ad *lad = NULL; 1740 struct long_ad *lad = NULL;
1646 struct allocExtDesc *aed; 1741 struct allocExtDesc *aed;
1647 int8_t etype;
1648 uint8_t *ptr; 1742 uint8_t *ptr;
1649 struct udf_inode_info *iinfo = UDF_I(inode); 1743 struct udf_inode_info *iinfo = UDF_I(inode);
1650 1744
@@ -1660,7 +1754,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1660 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 1754 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
1661 adsize = sizeof(struct long_ad); 1755 adsize = sizeof(struct long_ad);
1662 else 1756 else
1663 return -1; 1757 return -EIO;
1664 1758
1665 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { 1759 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
1666 unsigned char *sptr, *dptr; 1760 unsigned char *sptr, *dptr;
@@ -1672,12 +1766,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1672 obloc.partitionReferenceNum, 1766 obloc.partitionReferenceNum,
1673 obloc.logicalBlockNum, &err); 1767 obloc.logicalBlockNum, &err);
1674 if (!epos->block.logicalBlockNum) 1768 if (!epos->block.logicalBlockNum)
1675 return -1; 1769 return -ENOSPC;
1676 nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, 1770 nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
1677 &epos->block, 1771 &epos->block,
1678 0)); 1772 0));
1679 if (!nbh) 1773 if (!nbh)
1680 return -1; 1774 return -EIO;
1681 lock_buffer(nbh); 1775 lock_buffer(nbh);
1682 memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize); 1776 memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize);
1683 set_buffer_uptodate(nbh); 1777 set_buffer_uptodate(nbh);
@@ -1746,7 +1840,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1746 epos->bh = nbh; 1840 epos->bh = nbh;
1747 } 1841 }
1748 1842
1749 etype = udf_write_aext(inode, epos, eloc, elen, inc); 1843 udf_write_aext(inode, epos, eloc, elen, inc);
1750 1844
1751 if (!epos->bh) { 1845 if (!epos->bh) {
1752 iinfo->i_lenAlloc += adsize; 1846 iinfo->i_lenAlloc += adsize;
@@ -1764,11 +1858,11 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1764 mark_buffer_dirty_inode(epos->bh, inode); 1858 mark_buffer_dirty_inode(epos->bh, inode);
1765 } 1859 }
1766 1860
1767 return etype; 1861 return 0;
1768} 1862}
1769 1863
1770int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, 1864void udf_write_aext(struct inode *inode, struct extent_position *epos,
1771 struct kernel_lb_addr *eloc, uint32_t elen, int inc) 1865 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
1772{ 1866{
1773 int adsize; 1867 int adsize;
1774 uint8_t *ptr; 1868 uint8_t *ptr;
@@ -1798,7 +1892,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
1798 adsize = sizeof(struct long_ad); 1892 adsize = sizeof(struct long_ad);
1799 break; 1893 break;
1800 default: 1894 default:
1801 return -1; 1895 return;
1802 } 1896 }
1803 1897
1804 if (epos->bh) { 1898 if (epos->bh) {
@@ -1817,8 +1911,6 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
1817 1911
1818 if (inc) 1912 if (inc)
1819 epos->offset += adsize; 1913 epos->offset += adsize;
1820
1821 return (elen >> 30);
1822} 1914}
1823 1915
1824int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, 1916int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 2be0f9eb86d2..f1dce848ef96 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -32,6 +32,8 @@
32#include <linux/crc-itu-t.h> 32#include <linux/crc-itu-t.h>
33#include <linux/exportfs.h> 33#include <linux/exportfs.h>
34 34
35enum { UDF_MAX_LINKS = 0xffff };
36
35static inline int udf_match(int len1, const unsigned char *name1, int len2, 37static inline int udf_match(int len1, const unsigned char *name1, int len2,
36 const unsigned char *name2) 38 const unsigned char *name2)
37{ 39{
@@ -650,7 +652,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
650 struct udf_inode_info *iinfo; 652 struct udf_inode_info *iinfo;
651 653
652 err = -EMLINK; 654 err = -EMLINK;
653 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 655 if (dir->i_nlink >= UDF_MAX_LINKS)
654 goto out; 656 goto out;
655 657
656 err = -EIO; 658 err = -EIO;
@@ -1034,9 +1036,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1034 struct fileIdentDesc cfi, *fi; 1036 struct fileIdentDesc cfi, *fi;
1035 int err; 1037 int err;
1036 1038
1037 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1039 if (inode->i_nlink >= UDF_MAX_LINKS)
1038 return -EMLINK; 1040 return -EMLINK;
1039 }
1040 1041
1041 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 1042 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1042 if (!fi) { 1043 if (!fi) {
@@ -1131,9 +1132,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1131 goto end_rename; 1132 goto end_rename;
1132 1133
1133 retval = -EMLINK; 1134 retval = -EMLINK;
1134 if (!new_inode && 1135 if (!new_inode && new_dir->i_nlink >= UDF_MAX_LINKS)
1135 new_dir->i_nlink >=
1136 (256 << sizeof(new_dir->i_nlink)) - 1)
1137 goto end_rename; 1136 goto end_rename;
1138 } 1137 }
1139 if (!nfi) { 1138 if (!nfi) {
@@ -1287,8 +1286,13 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
1287 struct fid *fid = (struct fid *)fh; 1286 struct fid *fid = (struct fid *)fh;
1288 int type = FILEID_UDF_WITHOUT_PARENT; 1287 int type = FILEID_UDF_WITHOUT_PARENT;
1289 1288
1290 if (len < 3 || (connectable && len < 5)) 1289 if (connectable && (len < 5)) {
1290 *lenp = 5;
1291 return 255; 1291 return 255;
1292 } else if (len < 3) {
1293 *lenp = 3;
1294 return 255;
1295 }
1292 1296
1293 *lenp = 3; 1297 *lenp = 3;
1294 fid->udf.block = location.logicalBlockNum; 1298 fid->udf.block = location.logicalBlockNum;
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 225527cdc885..8424308db4b4 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -197,6 +197,11 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
197 mark_buffer_dirty_inode(epos->bh, inode); 197 mark_buffer_dirty_inode(epos->bh, inode);
198} 198}
199 199
200/*
201 * Truncate extents of inode to inode->i_size. This function can be used only
202 * for making file shorter. For making file longer, udf_extend_file() has to
203 * be used.
204 */
200void udf_truncate_extents(struct inode *inode) 205void udf_truncate_extents(struct inode *inode)
201{ 206{
202 struct extent_position epos; 207 struct extent_position epos;
@@ -219,96 +224,65 @@ void udf_truncate_extents(struct inode *inode)
219 etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); 224 etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
220 byte_offset = (offset << sb->s_blocksize_bits) + 225 byte_offset = (offset << sb->s_blocksize_bits) +
221 (inode->i_size & (sb->s_blocksize - 1)); 226 (inode->i_size & (sb->s_blocksize - 1));
222 if (etype != -1) { 227 if (etype == -1) {
223 epos.offset -= adsize; 228 /* We should extend the file? */
224 extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); 229 WARN_ON(byte_offset);
225 epos.offset += adsize; 230 return;
226 if (byte_offset) 231 }
227 lenalloc = epos.offset; 232 epos.offset -= adsize;
228 else 233 extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
229 lenalloc = epos.offset - adsize; 234 epos.offset += adsize;
230 235 if (byte_offset)
231 if (!epos.bh) 236 lenalloc = epos.offset;
232 lenalloc -= udf_file_entry_alloc_offset(inode); 237 else
233 else 238 lenalloc = epos.offset - adsize;
234 lenalloc -= sizeof(struct allocExtDesc);
235
236 while ((etype = udf_current_aext(inode, &epos, &eloc,
237 &elen, 0)) != -1) {
238 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
239 udf_write_aext(inode, &epos, &neloc, nelen, 0);
240 if (indirect_ext_len) {
241 /* We managed to free all extents in the
242 * indirect extent - free it too */
243 BUG_ON(!epos.bh);
244 udf_free_blocks(sb, inode, &epos.block,
245 0, indirect_ext_len);
246 } else if (!epos.bh) {
247 iinfo->i_lenAlloc = lenalloc;
248 mark_inode_dirty(inode);
249 } else
250 udf_update_alloc_ext_desc(inode,
251 &epos, lenalloc);
252 brelse(epos.bh);
253 epos.offset = sizeof(struct allocExtDesc);
254 epos.block = eloc;
255 epos.bh = udf_tread(sb,
256 udf_get_lb_pblock(sb, &eloc, 0));
257 if (elen)
258 indirect_ext_len =
259 (elen + sb->s_blocksize - 1) >>
260 sb->s_blocksize_bits;
261 else
262 indirect_ext_len = 1;
263 } else {
264 extent_trunc(inode, &epos, &eloc, etype,
265 elen, 0);
266 epos.offset += adsize;
267 }
268 }
269 239
270 if (indirect_ext_len) { 240 if (!epos.bh)
271 BUG_ON(!epos.bh); 241 lenalloc -= udf_file_entry_alloc_offset(inode);
272 udf_free_blocks(sb, inode, &epos.block, 0, 242 else
273 indirect_ext_len); 243 lenalloc -= sizeof(struct allocExtDesc);
274 } else if (!epos.bh) {
275 iinfo->i_lenAlloc = lenalloc;
276 mark_inode_dirty(inode);
277 } else
278 udf_update_alloc_ext_desc(inode, &epos, lenalloc);
279 } else if (inode->i_size) {
280 if (byte_offset) {
281 struct kernel_long_ad extent;
282 244
283 /* 245 while ((etype = udf_current_aext(inode, &epos, &eloc,
284 * OK, there is not extent covering inode->i_size and 246 &elen, 0)) != -1) {
285 * no extent above inode->i_size => truncate is 247 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
286 * extending the file by 'offset' blocks. 248 udf_write_aext(inode, &epos, &neloc, nelen, 0);
287 */ 249 if (indirect_ext_len) {
288 if ((!epos.bh && 250 /* We managed to free all extents in the
289 epos.offset == 251 * indirect extent - free it too */
290 udf_file_entry_alloc_offset(inode)) || 252 BUG_ON(!epos.bh);
291 (epos.bh && epos.offset == 253 udf_free_blocks(sb, inode, &epos.block,
292 sizeof(struct allocExtDesc))) { 254 0, indirect_ext_len);
293 /* File has no extents at all or has empty last 255 } else if (!epos.bh) {
294 * indirect extent! Create a fake extent... */ 256 iinfo->i_lenAlloc = lenalloc;
295 extent.extLocation.logicalBlockNum = 0; 257 mark_inode_dirty(inode);
296 extent.extLocation.partitionReferenceNum = 0; 258 } else
297 extent.extLength = 259 udf_update_alloc_ext_desc(inode,
298 EXT_NOT_RECORDED_NOT_ALLOCATED; 260 &epos, lenalloc);
299 } else { 261 brelse(epos.bh);
300 epos.offset -= adsize; 262 epos.offset = sizeof(struct allocExtDesc);
301 etype = udf_next_aext(inode, &epos, 263 epos.block = eloc;
302 &extent.extLocation, 264 epos.bh = udf_tread(sb,
303 &extent.extLength, 0); 265 udf_get_lb_pblock(sb, &eloc, 0));
304 extent.extLength |= etype << 30; 266 if (elen)
305 } 267 indirect_ext_len =
306 udf_extend_file(inode, &epos, &extent, 268 (elen + sb->s_blocksize - 1) >>
307 offset + 269 sb->s_blocksize_bits;
308 ((inode->i_size & 270 else
309 (sb->s_blocksize - 1)) != 0)); 271 indirect_ext_len = 1;
272 } else {
273 extent_trunc(inode, &epos, &eloc, etype, elen, 0);
274 epos.offset += adsize;
310 } 275 }
311 } 276 }
277
278 if (indirect_ext_len) {
279 BUG_ON(!epos.bh);
280 udf_free_blocks(sb, inode, &epos.block, 0, indirect_ext_len);
281 } else if (!epos.bh) {
282 iinfo->i_lenAlloc = lenalloc;
283 mark_inode_dirty(inode);
284 } else
285 udf_update_alloc_ext_desc(inode, &epos, lenalloc);
312 iinfo->i_lenExtents = inode->i_size; 286 iinfo->i_lenExtents = inode->i_size;
313 287
314 brelse(epos.bh); 288 brelse(epos.bh);
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index eba48209f9f3..dbd52d4b5eed 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -136,22 +136,20 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
136extern long udf_ioctl(struct file *, unsigned int, unsigned long); 136extern long udf_ioctl(struct file *, unsigned int, unsigned long);
137/* inode.c */ 137/* inode.c */
138extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 138extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
139extern void udf_expand_file_adinicb(struct inode *, int, int *); 139extern int udf_expand_file_adinicb(struct inode *);
140extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); 140extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
141extern struct buffer_head *udf_bread(struct inode *, int, int, int *); 141extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
142extern void udf_truncate(struct inode *); 142extern int udf_setsize(struct inode *, loff_t);
143extern void udf_read_inode(struct inode *); 143extern void udf_read_inode(struct inode *);
144extern void udf_evict_inode(struct inode *); 144extern void udf_evict_inode(struct inode *);
145extern int udf_write_inode(struct inode *, struct writeback_control *wbc); 145extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
146extern long udf_block_map(struct inode *, sector_t); 146extern long udf_block_map(struct inode *, sector_t);
147extern int udf_extend_file(struct inode *, struct extent_position *,
148 struct kernel_long_ad *, sector_t);
149extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, 147extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
150 struct kernel_lb_addr *, uint32_t *, sector_t *); 148 struct kernel_lb_addr *, uint32_t *, sector_t *);
151extern int8_t udf_add_aext(struct inode *, struct extent_position *, 149extern int udf_add_aext(struct inode *, struct extent_position *,
150 struct kernel_lb_addr *, uint32_t, int);
151extern void udf_write_aext(struct inode *, struct extent_position *,
152 struct kernel_lb_addr *, uint32_t, int); 152 struct kernel_lb_addr *, uint32_t, int);
153extern int8_t udf_write_aext(struct inode *, struct extent_position *,
154 struct kernel_lb_addr *, uint32_t, int);
155extern int8_t udf_delete_aext(struct inode *, struct extent_position, 153extern int8_t udf_delete_aext(struct inode *, struct extent_position,
156 struct kernel_lb_addr, uint32_t); 154 struct kernel_lb_addr, uint32_t);
157extern int8_t udf_next_aext(struct inode *, struct extent_position *, 155extern int8_t udf_next_aext(struct inode *, struct extent_position *,
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index 30c8f223253d..e4f10a40768a 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -1,7 +1,6 @@
1config UFS_FS 1config UFS_FS
2 tristate "UFS file system support (read only)" 2 tristate "UFS file system support (read only)"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # probably fixable
5 help 4 help
6 BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, 5 BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
7 OpenBSD and NeXTstep) use a file system called UFS. Some System V 6 OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 2b251f2093af..e765743cf9f3 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -34,7 +34,6 @@
34#include <linux/stat.h> 34#include <linux/stat.h>
35#include <linux/string.h> 35#include <linux/string.h>
36#include <linux/mm.h> 36#include <linux/mm.h>
37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
39#include <linux/writeback.h> 38#include <linux/writeback.h>
40 39
@@ -43,7 +42,7 @@
43#include "swab.h" 42#include "swab.h"
44#include "util.h" 43#include "util.h"
45 44
46static u64 ufs_frag_map(struct inode *inode, sector_t frag); 45static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock);
47 46
48static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) 47static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4])
49{ 48{
@@ -79,10 +78,10 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
79 78
80/* 79/*
81 * Returns the location of the fragment from 80 * Returns the location of the fragment from
82 * the begining of the filesystem. 81 * the beginning of the filesystem.
83 */ 82 */
84 83
85static u64 ufs_frag_map(struct inode *inode, sector_t frag) 84static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock)
86{ 85{
87 struct ufs_inode_info *ufsi = UFS_I(inode); 86 struct ufs_inode_info *ufsi = UFS_I(inode);
88 struct super_block *sb = inode->i_sb; 87 struct super_block *sb = inode->i_sb;
@@ -107,7 +106,8 @@ static u64 ufs_frag_map(struct inode *inode, sector_t frag)
107 106
108 p = offsets; 107 p = offsets;
109 108
110 lock_kernel(); 109 if (needs_lock)
110 lock_ufs(sb);
111 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) 111 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
112 goto ufs2; 112 goto ufs2;
113 113
@@ -152,7 +152,8 @@ ufs2:
152 ret = temp + (u64) (frag & uspi->s_fpbmask); 152 ret = temp + (u64) (frag & uspi->s_fpbmask);
153 153
154out: 154out:
155 unlock_kernel(); 155 if (needs_lock)
156 unlock_ufs(sb);
156 return ret; 157 return ret;
157} 158}
158 159
@@ -415,14 +416,16 @@ out:
415int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) 416int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
416{ 417{
417 struct super_block * sb = inode->i_sb; 418 struct super_block * sb = inode->i_sb;
418 struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi; 419 struct ufs_sb_info * sbi = UFS_SB(sb);
420 struct ufs_sb_private_info * uspi = sbi->s_uspi;
419 struct buffer_head * bh; 421 struct buffer_head * bh;
420 int ret, err, new; 422 int ret, err, new;
421 unsigned long ptr,phys; 423 unsigned long ptr,phys;
422 u64 phys64 = 0; 424 u64 phys64 = 0;
425 bool needs_lock = (sbi->mutex_owner != current);
423 426
424 if (!create) { 427 if (!create) {
425 phys64 = ufs_frag_map(inode, fragment); 428 phys64 = ufs_frag_map(inode, fragment, needs_lock);
426 UFSD("phys64 = %llu\n", (unsigned long long)phys64); 429 UFSD("phys64 = %llu\n", (unsigned long long)phys64);
427 if (phys64) 430 if (phys64)
428 map_bh(bh_result, sb, phys64); 431 map_bh(bh_result, sb, phys64);
@@ -436,7 +439,8 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head
436 ret = 0; 439 ret = 0;
437 bh = NULL; 440 bh = NULL;
438 441
439 lock_kernel(); 442 if (needs_lock)
443 lock_ufs(sb);
440 444
441 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); 445 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
442 if (fragment > 446 if (fragment >
@@ -498,7 +502,9 @@ out:
498 set_buffer_new(bh_result); 502 set_buffer_new(bh_result);
499 map_bh(bh_result, sb, phys); 503 map_bh(bh_result, sb, phys);
500abort: 504abort:
501 unlock_kernel(); 505 if (needs_lock)
506 unlock_ufs(sb);
507
502 return err; 508 return err;
503 509
504abort_too_big: 510abort_too_big:
@@ -506,48 +512,6 @@ abort_too_big:
506 goto abort; 512 goto abort;
507} 513}
508 514
509static struct buffer_head *ufs_getfrag(struct inode *inode,
510 unsigned int fragment,
511 int create, int *err)
512{
513 struct buffer_head dummy;
514 int error;
515
516 dummy.b_state = 0;
517 dummy.b_blocknr = -1000;
518 error = ufs_getfrag_block(inode, fragment, &dummy, create);
519 *err = error;
520 if (!error && buffer_mapped(&dummy)) {
521 struct buffer_head *bh;
522 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
523 if (buffer_new(&dummy)) {
524 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
525 set_buffer_uptodate(bh);
526 mark_buffer_dirty(bh);
527 }
528 return bh;
529 }
530 return NULL;
531}
532
533struct buffer_head * ufs_bread (struct inode * inode, unsigned fragment,
534 int create, int * err)
535{
536 struct buffer_head * bh;
537
538 UFSD("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment);
539 bh = ufs_getfrag (inode, fragment, create, err);
540 if (!bh || buffer_uptodate(bh))
541 return bh;
542 ll_rw_block (READ, 1, &bh);
543 wait_on_buffer (bh);
544 if (buffer_uptodate(bh))
545 return bh;
546 brelse (bh);
547 *err = -EIO;
548 return NULL;
549}
550
551static int ufs_writepage(struct page *page, struct writeback_control *wbc) 515static int ufs_writepage(struct page *page, struct writeback_control *wbc)
552{ 516{
553 return block_write_full_page(page,ufs_getfrag_block,wbc); 517 return block_write_full_page(page,ufs_getfrag_block,wbc);
@@ -588,7 +552,6 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
588const struct address_space_operations ufs_aops = { 552const struct address_space_operations ufs_aops = {
589 .readpage = ufs_readpage, 553 .readpage = ufs_readpage,
590 .writepage = ufs_writepage, 554 .writepage = ufs_writepage,
591 .sync_page = block_sync_page,
592 .write_begin = ufs_write_begin, 555 .write_begin = ufs_write_begin,
593 .write_end = generic_write_end, 556 .write_end = generic_write_end,
594 .bmap = ufs_bmap 557 .bmap = ufs_bmap
@@ -900,9 +863,9 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
900int ufs_write_inode(struct inode *inode, struct writeback_control *wbc) 863int ufs_write_inode(struct inode *inode, struct writeback_control *wbc)
901{ 864{
902 int ret; 865 int ret;
903 lock_kernel(); 866 lock_ufs(inode->i_sb);
904 ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 867 ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
905 unlock_kernel(); 868 unlock_ufs(inode->i_sb);
906 return ret; 869 return ret;
907} 870}
908 871
@@ -922,22 +885,22 @@ void ufs_evict_inode(struct inode * inode)
922 if (want_delete) { 885 if (want_delete) {
923 loff_t old_i_size; 886 loff_t old_i_size;
924 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ 887 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
925 lock_kernel(); 888 lock_ufs(inode->i_sb);
926 mark_inode_dirty(inode); 889 mark_inode_dirty(inode);
927 ufs_update_inode(inode, IS_SYNC(inode)); 890 ufs_update_inode(inode, IS_SYNC(inode));
928 old_i_size = inode->i_size; 891 old_i_size = inode->i_size;
929 inode->i_size = 0; 892 inode->i_size = 0;
930 if (inode->i_blocks && ufs_truncate(inode, old_i_size)) 893 if (inode->i_blocks && ufs_truncate(inode, old_i_size))
931 ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); 894 ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n");
932 unlock_kernel(); 895 unlock_ufs(inode->i_sb);
933 } 896 }
934 897
935 invalidate_inode_buffers(inode); 898 invalidate_inode_buffers(inode);
936 end_writeback(inode); 899 end_writeback(inode);
937 900
938 if (want_delete) { 901 if (want_delete) {
939 lock_kernel(); 902 lock_ufs(inode->i_sb);
940 ufs_free_inode (inode); 903 ufs_free_inode (inode);
941 unlock_kernel(); 904 unlock_ufs(inode->i_sb);
942 } 905 }
943} 906}
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 12f39b9e4437..29309e25417f 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/time.h> 30#include <linux/time.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/smp_lock.h>
33 32
34#include "ufs_fs.h" 33#include "ufs_fs.h"
35#include "ufs.h" 34#include "ufs.h"
@@ -55,16 +54,16 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
55 if (dentry->d_name.len > UFS_MAXNAMLEN) 54 if (dentry->d_name.len > UFS_MAXNAMLEN)
56 return ERR_PTR(-ENAMETOOLONG); 55 return ERR_PTR(-ENAMETOOLONG);
57 56
58 lock_kernel(); 57 lock_ufs(dir->i_sb);
59 ino = ufs_inode_by_name(dir, &dentry->d_name); 58 ino = ufs_inode_by_name(dir, &dentry->d_name);
60 if (ino) { 59 if (ino) {
61 inode = ufs_iget(dir->i_sb, ino); 60 inode = ufs_iget(dir->i_sb, ino);
62 if (IS_ERR(inode)) { 61 if (IS_ERR(inode)) {
63 unlock_kernel(); 62 unlock_ufs(dir->i_sb);
64 return ERR_CAST(inode); 63 return ERR_CAST(inode);
65 } 64 }
66 } 65 }
67 unlock_kernel(); 66 unlock_ufs(dir->i_sb);
68 d_add(dentry, inode); 67 d_add(dentry, inode);
69 return NULL; 68 return NULL;
70} 69}
@@ -93,9 +92,9 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
93 inode->i_fop = &ufs_file_operations; 92 inode->i_fop = &ufs_file_operations;
94 inode->i_mapping->a_ops = &ufs_aops; 93 inode->i_mapping->a_ops = &ufs_aops;
95 mark_inode_dirty(inode); 94 mark_inode_dirty(inode);
96 lock_kernel(); 95 lock_ufs(dir->i_sb);
97 err = ufs_add_nondir(dentry, inode); 96 err = ufs_add_nondir(dentry, inode);
98 unlock_kernel(); 97 unlock_ufs(dir->i_sb);
99 } 98 }
100 UFSD("END: err=%d\n", err); 99 UFSD("END: err=%d\n", err);
101 return err; 100 return err;
@@ -115,9 +114,9 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
115 init_special_inode(inode, mode, rdev); 114 init_special_inode(inode, mode, rdev);
116 ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev); 115 ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev);
117 mark_inode_dirty(inode); 116 mark_inode_dirty(inode);
118 lock_kernel(); 117 lock_ufs(dir->i_sb);
119 err = ufs_add_nondir(dentry, inode); 118 err = ufs_add_nondir(dentry, inode);
120 unlock_kernel(); 119 unlock_ufs(dir->i_sb);
121 } 120 }
122 return err; 121 return err;
123} 122}
@@ -133,7 +132,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
133 if (l > sb->s_blocksize) 132 if (l > sb->s_blocksize)
134 goto out_notlocked; 133 goto out_notlocked;
135 134
136 lock_kernel(); 135 lock_ufs(dir->i_sb);
137 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 136 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
138 err = PTR_ERR(inode); 137 err = PTR_ERR(inode);
139 if (IS_ERR(inode)) 138 if (IS_ERR(inode))
@@ -156,7 +155,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
156 155
157 err = ufs_add_nondir(dentry, inode); 156 err = ufs_add_nondir(dentry, inode);
158out: 157out:
159 unlock_kernel(); 158 unlock_ufs(dir->i_sb);
160out_notlocked: 159out_notlocked:
161 return err; 160 return err;
162 161
@@ -172,9 +171,9 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
172 struct inode *inode = old_dentry->d_inode; 171 struct inode *inode = old_dentry->d_inode;
173 int error; 172 int error;
174 173
175 lock_kernel(); 174 lock_ufs(dir->i_sb);
176 if (inode->i_nlink >= UFS_LINK_MAX) { 175 if (inode->i_nlink >= UFS_LINK_MAX) {
177 unlock_kernel(); 176 unlock_ufs(dir->i_sb);
178 return -EMLINK; 177 return -EMLINK;
179 } 178 }
180 179
@@ -183,7 +182,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
183 ihold(inode); 182 ihold(inode);
184 183
185 error = ufs_add_nondir(dentry, inode); 184 error = ufs_add_nondir(dentry, inode);
186 unlock_kernel(); 185 unlock_ufs(dir->i_sb);
187 return error; 186 return error;
188} 187}
189 188
@@ -195,7 +194,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
195 if (dir->i_nlink >= UFS_LINK_MAX) 194 if (dir->i_nlink >= UFS_LINK_MAX)
196 goto out; 195 goto out;
197 196
198 lock_kernel(); 197 lock_ufs(dir->i_sb);
199 inode_inc_link_count(dir); 198 inode_inc_link_count(dir);
200 199
201 inode = ufs_new_inode(dir, S_IFDIR|mode); 200 inode = ufs_new_inode(dir, S_IFDIR|mode);
@@ -216,7 +215,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
216 err = ufs_add_link(dentry, inode); 215 err = ufs_add_link(dentry, inode);
217 if (err) 216 if (err)
218 goto out_fail; 217 goto out_fail;
219 unlock_kernel(); 218 unlock_ufs(dir->i_sb);
220 219
221 d_instantiate(dentry, inode); 220 d_instantiate(dentry, inode);
222out: 221out:
@@ -228,7 +227,7 @@ out_fail:
228 iput (inode); 227 iput (inode);
229out_dir: 228out_dir:
230 inode_dec_link_count(dir); 229 inode_dec_link_count(dir);
231 unlock_kernel(); 230 unlock_ufs(dir->i_sb);
232 goto out; 231 goto out;
233} 232}
234 233
@@ -259,7 +258,7 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
259 struct inode * inode = dentry->d_inode; 258 struct inode * inode = dentry->d_inode;
260 int err= -ENOTEMPTY; 259 int err= -ENOTEMPTY;
261 260
262 lock_kernel(); 261 lock_ufs(dir->i_sb);
263 if (ufs_empty_dir (inode)) { 262 if (ufs_empty_dir (inode)) {
264 err = ufs_unlink(dir, dentry); 263 err = ufs_unlink(dir, dentry);
265 if (!err) { 264 if (!err) {
@@ -268,7 +267,7 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
268 inode_dec_link_count(dir); 267 inode_dec_link_count(dir);
269 } 268 }
270 } 269 }
271 unlock_kernel(); 270 unlock_ufs(dir->i_sb);
272 return err; 271 return err;
273} 272}
274 273
@@ -306,7 +305,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
306 new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); 305 new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page);
307 if (!new_de) 306 if (!new_de)
308 goto out_dir; 307 goto out_dir;
309 inode_inc_link_count(old_inode);
310 ufs_set_link(new_dir, new_de, new_page, old_inode); 308 ufs_set_link(new_dir, new_de, new_page, old_inode);
311 new_inode->i_ctime = CURRENT_TIME_SEC; 309 new_inode->i_ctime = CURRENT_TIME_SEC;
312 if (dir_de) 310 if (dir_de)
@@ -318,12 +316,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
318 if (new_dir->i_nlink >= UFS_LINK_MAX) 316 if (new_dir->i_nlink >= UFS_LINK_MAX)
319 goto out_dir; 317 goto out_dir;
320 } 318 }
321 inode_inc_link_count(old_inode);
322 err = ufs_add_link(new_dentry, old_inode); 319 err = ufs_add_link(new_dentry, old_inode);
323 if (err) { 320 if (err)
324 inode_dec_link_count(old_inode);
325 goto out_dir; 321 goto out_dir;
326 }
327 if (dir_de) 322 if (dir_de)
328 inode_inc_link_count(new_dir); 323 inode_inc_link_count(new_dir);
329 } 324 }
@@ -331,12 +326,11 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
331 /* 326 /*
332 * Like most other Unix systems, set the ctime for inodes on a 327 * Like most other Unix systems, set the ctime for inodes on a
333 * rename. 328 * rename.
334 * inode_dec_link_count() will mark the inode dirty.
335 */ 329 */
336 old_inode->i_ctime = CURRENT_TIME_SEC; 330 old_inode->i_ctime = CURRENT_TIME_SEC;
337 331
338 ufs_delete_entry(old_dir, old_de, old_page); 332 ufs_delete_entry(old_dir, old_de, old_page);
339 inode_dec_link_count(old_inode); 333 mark_inode_dirty(old_inode);
340 334
341 if (dir_de) { 335 if (dir_de) {
342 ufs_set_link(old_inode, dir_de, dir_page, new_dir); 336 ufs_set_link(old_inode, dir_de, dir_page, new_dir);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 2c61ac5d4e48..3915ade6f9a8 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -84,7 +84,6 @@
84#include <linux/blkdev.h> 84#include <linux/blkdev.h>
85#include <linux/init.h> 85#include <linux/init.h>
86#include <linux/parser.h> 86#include <linux/parser.h>
87#include <linux/smp_lock.h>
88#include <linux/buffer_head.h> 87#include <linux/buffer_head.h>
89#include <linux/vfs.h> 88#include <linux/vfs.h>
90#include <linux/log2.h> 89#include <linux/log2.h>
@@ -96,6 +95,26 @@
96#include "swab.h" 95#include "swab.h"
97#include "util.h" 96#include "util.h"
98 97
98void lock_ufs(struct super_block *sb)
99{
100#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
101 struct ufs_sb_info *sbi = UFS_SB(sb);
102
103 mutex_lock(&sbi->mutex);
104 sbi->mutex_owner = current;
105#endif
106}
107
108void unlock_ufs(struct super_block *sb)
109{
110#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
111 struct ufs_sb_info *sbi = UFS_SB(sb);
112
113 sbi->mutex_owner = NULL;
114 mutex_unlock(&sbi->mutex);
115#endif
116}
117
99static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) 118static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
100{ 119{
101 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; 120 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
@@ -313,7 +332,6 @@ void ufs_panic (struct super_block * sb, const char * function,
313 struct ufs_super_block_first * usb1; 332 struct ufs_super_block_first * usb1;
314 va_list args; 333 va_list args;
315 334
316 lock_kernel();
317 uspi = UFS_SB(sb)->s_uspi; 335 uspi = UFS_SB(sb)->s_uspi;
318 usb1 = ubh_get_usb_first(uspi); 336 usb1 = ubh_get_usb_first(uspi);
319 337
@@ -465,9 +483,9 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
465} 483}
466 484
467/* 485/*
468 * Diffrent types of UFS hold fs_cstotal in different 486 * Different types of UFS hold fs_cstotal in different
469 * places, and use diffrent data structure for it. 487 * places, and use different data structure for it.
470 * To make things simplier we just copy fs_cstotal to ufs_sb_private_info 488 * To make things simpler we just copy fs_cstotal to ufs_sb_private_info
471 */ 489 */
472static void ufs_setup_cstotal(struct super_block *sb) 490static void ufs_setup_cstotal(struct super_block *sb)
473{ 491{
@@ -521,7 +539,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
521 */ 539 */
522 size = uspi->s_cssize; 540 size = uspi->s_cssize;
523 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; 541 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
524 base = space = kmalloc(size, GFP_KERNEL); 542 base = space = kmalloc(size, GFP_NOFS);
525 if (!base) 543 if (!base)
526 goto failed; 544 goto failed;
527 sbi->s_csp = (struct ufs_csum *)space; 545 sbi->s_csp = (struct ufs_csum *)space;
@@ -546,7 +564,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
546 * Read cylinder group (we read only first fragment from block 564 * Read cylinder group (we read only first fragment from block
547 * at this time) and prepare internal data structures for cg caching. 565 * at this time) and prepare internal data structures for cg caching.
548 */ 566 */
549 if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_KERNEL))) 567 if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_NOFS)))
550 goto failed; 568 goto failed;
551 for (i = 0; i < uspi->s_ncg; i++) 569 for (i = 0; i < uspi->s_ncg; i++)
552 sbi->s_ucg[i] = NULL; 570 sbi->s_ucg[i] = NULL;
@@ -564,7 +582,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
564 ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data); 582 ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data);
565 } 583 }
566 for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { 584 for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) {
567 if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_KERNEL))) 585 if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_NOFS)))
568 goto failed; 586 goto failed;
569 sbi->s_cgno[i] = UFS_CGNO_EMPTY; 587 sbi->s_cgno[i] = UFS_CGNO_EMPTY;
570 } 588 }
@@ -646,8 +664,6 @@ static void ufs_put_super_internal(struct super_block *sb)
646 664
647 UFSD("ENTER\n"); 665 UFSD("ENTER\n");
648 666
649 lock_kernel();
650
651 ufs_put_cstotal(sb); 667 ufs_put_cstotal(sb);
652 size = uspi->s_cssize; 668 size = uspi->s_cssize;
653 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; 669 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -676,8 +692,6 @@ static void ufs_put_super_internal(struct super_block *sb)
676 kfree (sbi->s_ucg); 692 kfree (sbi->s_ucg);
677 kfree (base); 693 kfree (base);
678 694
679 unlock_kernel();
680
681 UFSD("EXIT\n"); 695 UFSD("EXIT\n");
682} 696}
683 697
@@ -696,8 +710,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
696 unsigned maxsymlen; 710 unsigned maxsymlen;
697 int ret = -EINVAL; 711 int ret = -EINVAL;
698 712
699 lock_kernel();
700
701 uspi = NULL; 713 uspi = NULL;
702 ubh = NULL; 714 ubh = NULL;
703 flags = 0; 715 flags = 0;
@@ -718,6 +730,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
718 goto failed; 730 goto failed;
719 } 731 }
720#endif 732#endif
733 mutex_init(&sbi->mutex);
721 /* 734 /*
722 * Set default mount options 735 * Set default mount options
723 * Parse mount options 736 * Parse mount options
@@ -1165,7 +1178,6 @@ magic_found:
1165 goto failed; 1178 goto failed;
1166 1179
1167 UFSD("EXIT\n"); 1180 UFSD("EXIT\n");
1168 unlock_kernel();
1169 return 0; 1181 return 0;
1170 1182
1171dalloc_failed: 1183dalloc_failed:
@@ -1177,12 +1189,10 @@ failed:
1177 kfree(sbi); 1189 kfree(sbi);
1178 sb->s_fs_info = NULL; 1190 sb->s_fs_info = NULL;
1179 UFSD("EXIT (FAILED)\n"); 1191 UFSD("EXIT (FAILED)\n");
1180 unlock_kernel();
1181 return ret; 1192 return ret;
1182 1193
1183failed_nomem: 1194failed_nomem:
1184 UFSD("EXIT (NOMEM)\n"); 1195 UFSD("EXIT (NOMEM)\n");
1185 unlock_kernel();
1186 return -ENOMEM; 1196 return -ENOMEM;
1187} 1197}
1188 1198
@@ -1193,8 +1203,8 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
1193 struct ufs_super_block_third * usb3; 1203 struct ufs_super_block_third * usb3;
1194 unsigned flags; 1204 unsigned flags;
1195 1205
1206 lock_ufs(sb);
1196 lock_super(sb); 1207 lock_super(sb);
1197 lock_kernel();
1198 1208
1199 UFSD("ENTER\n"); 1209 UFSD("ENTER\n");
1200 1210
@@ -1213,8 +1223,8 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
1213 sb->s_dirt = 0; 1223 sb->s_dirt = 0;
1214 1224
1215 UFSD("EXIT\n"); 1225 UFSD("EXIT\n");
1216 unlock_kernel();
1217 unlock_super(sb); 1226 unlock_super(sb);
1227 unlock_ufs(sb);
1218 1228
1219 return 0; 1229 return 0;
1220} 1230}
@@ -1256,7 +1266,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1256 unsigned new_mount_opt, ufstype; 1266 unsigned new_mount_opt, ufstype;
1257 unsigned flags; 1267 unsigned flags;
1258 1268
1259 lock_kernel(); 1269 lock_ufs(sb);
1260 lock_super(sb); 1270 lock_super(sb);
1261 uspi = UFS_SB(sb)->s_uspi; 1271 uspi = UFS_SB(sb)->s_uspi;
1262 flags = UFS_SB(sb)->s_flags; 1272 flags = UFS_SB(sb)->s_flags;
@@ -1272,7 +1282,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1272 ufs_set_opt (new_mount_opt, ONERROR_LOCK); 1282 ufs_set_opt (new_mount_opt, ONERROR_LOCK);
1273 if (!ufs_parse_options (data, &new_mount_opt)) { 1283 if (!ufs_parse_options (data, &new_mount_opt)) {
1274 unlock_super(sb); 1284 unlock_super(sb);
1275 unlock_kernel(); 1285 unlock_ufs(sb);
1276 return -EINVAL; 1286 return -EINVAL;
1277 } 1287 }
1278 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { 1288 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
@@ -1280,14 +1290,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1280 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { 1290 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
1281 printk("ufstype can't be changed during remount\n"); 1291 printk("ufstype can't be changed during remount\n");
1282 unlock_super(sb); 1292 unlock_super(sb);
1283 unlock_kernel(); 1293 unlock_ufs(sb);
1284 return -EINVAL; 1294 return -EINVAL;
1285 } 1295 }
1286 1296
1287 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1297 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1288 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1298 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1289 unlock_super(sb); 1299 unlock_super(sb);
1290 unlock_kernel(); 1300 unlock_ufs(sb);
1291 return 0; 1301 return 0;
1292 } 1302 }
1293 1303
@@ -1313,7 +1323,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1313 printk("ufs was compiled with read-only support, " 1323 printk("ufs was compiled with read-only support, "
1314 "can't be mounted as read-write\n"); 1324 "can't be mounted as read-write\n");
1315 unlock_super(sb); 1325 unlock_super(sb);
1316 unlock_kernel(); 1326 unlock_ufs(sb);
1317 return -EINVAL; 1327 return -EINVAL;
1318#else 1328#else
1319 if (ufstype != UFS_MOUNT_UFSTYPE_SUN && 1329 if (ufstype != UFS_MOUNT_UFSTYPE_SUN &&
@@ -1323,13 +1333,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1323 ufstype != UFS_MOUNT_UFSTYPE_UFS2) { 1333 ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
1324 printk("this ufstype is read-only supported\n"); 1334 printk("this ufstype is read-only supported\n");
1325 unlock_super(sb); 1335 unlock_super(sb);
1326 unlock_kernel(); 1336 unlock_ufs(sb);
1327 return -EINVAL; 1337 return -EINVAL;
1328 } 1338 }
1329 if (!ufs_read_cylinder_structures(sb)) { 1339 if (!ufs_read_cylinder_structures(sb)) {
1330 printk("failed during remounting\n"); 1340 printk("failed during remounting\n");
1331 unlock_super(sb); 1341 unlock_super(sb);
1332 unlock_kernel(); 1342 unlock_ufs(sb);
1333 return -EPERM; 1343 return -EPERM;
1334 } 1344 }
1335 sb->s_flags &= ~MS_RDONLY; 1345 sb->s_flags &= ~MS_RDONLY;
@@ -1337,7 +1347,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1337 } 1347 }
1338 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1348 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1339 unlock_super(sb); 1349 unlock_super(sb);
1340 unlock_kernel(); 1350 unlock_ufs(sb);
1341 return 0; 1351 return 0;
1342} 1352}
1343 1353
@@ -1371,7 +1381,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1371 struct ufs_super_block_third *usb3; 1381 struct ufs_super_block_third *usb3;
1372 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 1382 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
1373 1383
1374 lock_kernel(); 1384 lock_ufs(sb);
1375 1385
1376 usb1 = ubh_get_usb_first(uspi); 1386 usb1 = ubh_get_usb_first(uspi);
1377 usb2 = ubh_get_usb_second(uspi); 1387 usb2 = ubh_get_usb_second(uspi);
@@ -1395,7 +1405,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1395 buf->f_fsid.val[0] = (u32)id; 1405 buf->f_fsid.val[0] = (u32)id;
1396 buf->f_fsid.val[1] = (u32)(id >> 32); 1406 buf->f_fsid.val[1] = (u32)(id >> 32);
1397 1407
1398 unlock_kernel(); 1408 unlock_ufs(sb);
1399 1409
1400 return 0; 1410 return 0;
1401} 1411}
@@ -1405,7 +1415,7 @@ static struct kmem_cache * ufs_inode_cachep;
1405static struct inode *ufs_alloc_inode(struct super_block *sb) 1415static struct inode *ufs_alloc_inode(struct super_block *sb)
1406{ 1416{
1407 struct ufs_inode_info *ei; 1417 struct ufs_inode_info *ei;
1408 ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_KERNEL); 1418 ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
1409 if (!ei) 1419 if (!ei)
1410 return NULL; 1420 return NULL;
1411 ei->vfs_inode.i_version = 1; 1421 ei->vfs_inode.i_version = 1;
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index a58f9155fc9a..5f821dbc0579 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -40,7 +40,6 @@
40#include <linux/time.h> 40#include <linux/time.h>
41#include <linux/stat.h> 41#include <linux/stat.h>
42#include <linux/string.h> 42#include <linux/string.h>
43#include <linux/smp_lock.h>
44#include <linux/buffer_head.h> 43#include <linux/buffer_head.h>
45#include <linux/blkdev.h> 44#include <linux/blkdev.h>
46#include <linux/sched.h> 45#include <linux/sched.h>
@@ -467,7 +466,6 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
467 466
468 block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); 467 block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block);
469 468
470 lock_kernel();
471 while (1) { 469 while (1) {
472 retry = ufs_trunc_direct(inode); 470 retry = ufs_trunc_direct(inode);
473 retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK, 471 retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK,
@@ -481,13 +479,11 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
481 break; 479 break;
482 if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) 480 if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
483 ufs_sync_inode (inode); 481 ufs_sync_inode (inode);
484 blk_run_address_space(inode->i_mapping);
485 yield(); 482 yield();
486 } 483 }
487 484
488 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 485 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
489 ufsi->i_lastfrag = DIRECT_FRAGMENT; 486 ufsi->i_lastfrag = DIRECT_FRAGMENT;
490 unlock_kernel();
491 mark_inode_dirty(inode); 487 mark_inode_dirty(inode);
492out: 488out:
493 UFSD("EXIT: err %d\n", err); 489 UFSD("EXIT: err %d\n", err);
@@ -510,7 +506,9 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
510 /* XXX(truncate): truncate_setsize should be called last */ 506 /* XXX(truncate): truncate_setsize should be called last */
511 truncate_setsize(inode, attr->ia_size); 507 truncate_setsize(inode, attr->ia_size);
512 508
509 lock_ufs(inode->i_sb);
513 error = ufs_truncate(inode, old_i_size); 510 error = ufs_truncate(inode, old_i_size);
511 unlock_ufs(inode->i_sb);
514 if (error) 512 if (error)
515 return error; 513 return error;
516 } 514 }
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index c08782e1b48a..5be2755dd715 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -18,6 +18,8 @@ struct ufs_sb_info {
18 unsigned s_cgno[UFS_MAX_GROUP_LOADED]; 18 unsigned s_cgno[UFS_MAX_GROUP_LOADED];
19 unsigned short s_cg_loaded; 19 unsigned short s_cg_loaded;
20 unsigned s_mount_opt; 20 unsigned s_mount_opt;
21 struct mutex mutex;
22 struct task_struct *mutex_owner;
21}; 23};
22 24
23struct ufs_inode_info { 25struct ufs_inode_info {
@@ -109,7 +111,6 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long);
109extern int ufs_write_inode (struct inode *, struct writeback_control *); 111extern int ufs_write_inode (struct inode *, struct writeback_control *);
110extern int ufs_sync_inode (struct inode *); 112extern int ufs_sync_inode (struct inode *);
111extern void ufs_evict_inode (struct inode *); 113extern void ufs_evict_inode (struct inode *);
112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
113extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create); 114extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create);
114 115
115/* namei.c */ 116/* namei.c */
@@ -154,4 +155,7 @@ static inline u32 ufs_dtogd(struct ufs_sb_private_info * uspi, u64 b)
154 return do_div(b, uspi->s_fpg); 155 return do_div(b, uspi->s_fpg);
155} 156}
156 157
158extern void lock_ufs(struct super_block *sb);
159extern void unlock_ufs(struct super_block *sb);
160
157#endif /* _UFS_UFS_H */ 161#endif /* _UFS_UFS_H */
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index d2c36d53fe66..95425b59ce0a 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -27,7 +27,7 @@ struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi,
27 if (count > UFS_MAXFRAG) 27 if (count > UFS_MAXFRAG)
28 return NULL; 28 return NULL;
29 ubh = (struct ufs_buffer_head *) 29 ubh = (struct ufs_buffer_head *)
30 kmalloc (sizeof (struct ufs_buffer_head), GFP_KERNEL); 30 kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS);
31 if (!ubh) 31 if (!ubh)
32 return NULL; 32 return NULL;
33 ubh->fragment = fragment; 33 ubh->fragment = fragment;
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 9f8775ce381c..954175928240 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -408,7 +408,7 @@ static inline unsigned _ubh_find_next_zero_bit_(
408 for (;;) { 408 for (;;) {
409 count = min_t(unsigned int, size + offset, uspi->s_bpf); 409 count = min_t(unsigned int, size + offset, uspi->s_bpf);
410 size -= count - offset; 410 size -= count - offset;
411 pos = ext2_find_next_zero_bit (ubh->bh[base]->b_data, count, offset); 411 pos = find_next_zero_bit_le(ubh->bh[base]->b_data, count, offset);
412 if (pos < count || !size) 412 if (pos < count || !size)
413 break; 413 break;
414 base++; 414 base++;
diff --git a/fs/utimes.c b/fs/utimes.c
index 179b58690657..ba653f3dc1bc 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -95,7 +95,7 @@ static int utimes_common(struct path *path, struct timespec *times)
95 if (IS_IMMUTABLE(inode)) 95 if (IS_IMMUTABLE(inode))
96 goto mnt_drop_write_and_out; 96 goto mnt_drop_write_and_out;
97 97
98 if (!is_owner_or_cap(inode)) { 98 if (!inode_owner_or_capable(inode)) {
99 error = inode_permission(inode, MAY_WRITE); 99 error = inode_permission(inode, MAY_WRITE);
100 if (error) 100 if (error)
101 goto mnt_drop_write_and_out; 101 goto mnt_drop_write_and_out;
diff --git a/fs/xattr.c b/fs/xattr.c
index 01bb8135e14a..a19acdb81cd1 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -59,7 +59,7 @@ xattr_permission(struct inode *inode, const char *name, int mask)
59 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) 59 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
60 return -EPERM; 60 return -EPERM;
61 if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && 61 if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
62 (mask & MAY_WRITE) && !is_owner_or_cap(inode)) 62 (mask & MAY_WRITE) && !inode_owner_or_capable(inode))
63 return -EPERM; 63 return -EPERM;
64 } 64 }
65 65
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index faca44997099..284a7c89697e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,14 +16,11 @@
16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17# 17#
18 18
19EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 19ccflags-y := -I$(src) -I$(src)/linux-2.6
20ccflags-$(CONFIG_XFS_DEBUG) += -g
20 21
21XFS_LINUX := linux-2.6 22XFS_LINUX := linux-2.6
22 23
23ifeq ($(CONFIG_XFS_DEBUG),y)
24 EXTRA_CFLAGS += -g
25endif
26
27obj-$(CONFIG_XFS_FS) += xfs.o 24obj-$(CONFIG_XFS_FS) += xfs.o
28 25
29xfs-y += linux-2.6/xfs_trace.o 26xfs-y += linux-2.6/xfs_trace.o
@@ -105,11 +102,10 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
105 xfs_globals.o \ 102 xfs_globals.o \
106 xfs_ioctl.o \ 103 xfs_ioctl.o \
107 xfs_iops.o \ 104 xfs_iops.o \
105 xfs_message.o \
108 xfs_super.o \ 106 xfs_super.o \
109 xfs_sync.o \ 107 xfs_sync.o \
110 xfs_xattr.o) 108 xfs_xattr.o)
111 109
112# Objects in support/ 110# Objects in support/
113xfs-y += $(addprefix support/, \ 111xfs-y += support/uuid.o
114 debug.o \
115 uuid.o)
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 666c9db48eb6..a907de565db3 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -23,6 +23,7 @@
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include "time.h" 24#include "time.h"
25#include "kmem.h" 25#include "kmem.h"
26#include "xfs_message.h"
26 27
27/* 28/*
28 * Greedy allocation. May fail and may return vmalloced memory. 29 * Greedy allocation. May fail and may return vmalloced memory.
@@ -56,8 +57,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
56 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 57 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
57 return ptr; 58 return ptr;
58 if (!(++retries % 100)) 59 if (!(++retries % 100))
59 printk(KERN_ERR "XFS: possible memory allocation " 60 xfs_err(NULL,
60 "deadlock in %s (mode:0x%x)\n", 61 "possible memory allocation deadlock in %s (mode:0x%x)",
61 __func__, lflags); 62 __func__, lflags);
62 congestion_wait(BLK_RW_ASYNC, HZ/50); 63 congestion_wait(BLK_RW_ASYNC, HZ/50);
63 } while (1); 64 } while (1);
@@ -112,8 +113,8 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
112 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 113 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
113 return ptr; 114 return ptr;
114 if (!(++retries % 100)) 115 if (!(++retries % 100))
115 printk(KERN_ERR "XFS: possible memory allocation " 116 xfs_err(NULL,
116 "deadlock in %s (mode:0x%x)\n", 117 "possible memory allocation deadlock in %s (mode:0x%x)",
117 __func__, lflags); 118 __func__, lflags);
118 congestion_wait(BLK_RW_ASYNC, HZ/50); 119 congestion_wait(BLK_RW_ASYNC, HZ/50);
119 } while (1); 120 } while (1);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index ec7bbb5645b6..79ce38be15a1 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -413,8 +413,7 @@ xfs_submit_ioend_bio(
413 if (xfs_ioend_new_eof(ioend)) 413 if (xfs_ioend_new_eof(ioend))
414 xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); 414 xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
415 415
416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
417 WRITE_SYNC_PLUG : WRITE, bio);
418} 417}
419 418
420STATIC struct bio * 419STATIC struct bio *
@@ -854,7 +853,7 @@ xfs_aops_discard_page(
854 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 853 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
855 goto out_invalidate; 854 goto out_invalidate;
856 855
857 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 856 xfs_alert(ip->i_mount,
858 "page discard on page %p, inode 0x%llx, offset %llu.", 857 "page discard on page %p, inode 0x%llx, offset %llu.",
859 page, ip->i_ino, offset); 858 page, ip->i_ino, offset);
860 859
@@ -872,7 +871,7 @@ xfs_aops_discard_page(
872 if (error) { 871 if (error) {
873 /* something screwed, just bail */ 872 /* something screwed, just bail */
874 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 873 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
875 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 874 xfs_alert(ip->i_mount,
876 "page discard unable to remove delalloc mapping."); 875 "page discard unable to remove delalloc mapping.");
877 } 876 }
878 break; 877 break;
@@ -1296,7 +1295,7 @@ xfs_get_blocks_direct(
1296 * If the private argument is non-NULL __xfs_get_blocks signals us that we 1295 * If the private argument is non-NULL __xfs_get_blocks signals us that we
1297 * need to issue a transaction to convert the range from unwritten to written 1296 * need to issue a transaction to convert the range from unwritten to written
1298 * extents. In case this is regular synchronous I/O we just call xfs_end_io 1297 * extents. In case this is regular synchronous I/O we just call xfs_end_io
1299 * to do this and we are done. But in case this was a successfull AIO 1298 * to do this and we are done. But in case this was a successful AIO
1300 * request this handler is called from interrupt context, from which we 1299 * request this handler is called from interrupt context, from which we
1301 * can't start transactions. In that case offload the I/O completion to 1300 * can't start transactions. In that case offload the I/O completion to
1302 * the workqueues we also use for buffered I/O completion. 1301 * the workqueues we also use for buffered I/O completion.
@@ -1411,7 +1410,7 @@ xfs_vm_write_failed(
1411 if (error) { 1410 if (error) {
1412 /* something screwed, just bail */ 1411 /* something screwed, just bail */
1413 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 1412 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1414 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 1413 xfs_alert(ip->i_mount,
1415 "xfs_vm_write_failed: unable to clean up ino %lld", 1414 "xfs_vm_write_failed: unable to clean up ino %lld",
1416 ip->i_ino); 1415 ip->i_ino);
1417 } 1416 }
@@ -1495,7 +1494,6 @@ const struct address_space_operations xfs_address_space_operations = {
1495 .readpages = xfs_vm_readpages, 1494 .readpages = xfs_vm_readpages,
1496 .writepage = xfs_vm_writepage, 1495 .writepage = xfs_vm_writepage,
1497 .writepages = xfs_vm_writepages, 1496 .writepages = xfs_vm_writepages,
1498 .sync_page = block_sync_page,
1499 .releasepage = xfs_vm_releasepage, 1497 .releasepage = xfs_vm_releasepage,
1500 .invalidatepage = xfs_vm_invalidatepage, 1498 .invalidatepage = xfs_vm_invalidatepage,
1501 .write_begin = xfs_vm_write_begin, 1499 .write_begin = xfs_vm_write_begin,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ac1c7e8378dd..9ef9ed2cfe2e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -94,75 +94,6 @@ xfs_buf_vmap_len(
94} 94}
95 95
96/* 96/*
97 * Page Region interfaces.
98 *
99 * For pages in filesystems where the blocksize is smaller than the
100 * pagesize, we use the page->private field (long) to hold a bitmap
101 * of uptodate regions within the page.
102 *
103 * Each such region is "bytes per page / bits per long" bytes long.
104 *
105 * NBPPR == number-of-bytes-per-page-region
106 * BTOPR == bytes-to-page-region (rounded up)
107 * BTOPRT == bytes-to-page-region-truncated (rounded down)
108 */
109#if (BITS_PER_LONG == 32)
110#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */
111#elif (BITS_PER_LONG == 64)
112#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
113#else
114#error BITS_PER_LONG must be 32 or 64
115#endif
116#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
117#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
118#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
119
120STATIC unsigned long
121page_region_mask(
122 size_t offset,
123 size_t length)
124{
125 unsigned long mask;
126 int first, final;
127
128 first = BTOPR(offset);
129 final = BTOPRT(offset + length - 1);
130 first = min(first, final);
131
132 mask = ~0UL;
133 mask <<= BITS_PER_LONG - (final - first);
134 mask >>= BITS_PER_LONG - (final);
135
136 ASSERT(offset + length <= PAGE_CACHE_SIZE);
137 ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
138
139 return mask;
140}
141
142STATIC void
143set_page_region(
144 struct page *page,
145 size_t offset,
146 size_t length)
147{
148 set_page_private(page,
149 page_private(page) | page_region_mask(offset, length));
150 if (page_private(page) == ~0UL)
151 SetPageUptodate(page);
152}
153
154STATIC int
155test_page_region(
156 struct page *page,
157 size_t offset,
158 size_t length)
159{
160 unsigned long mask = page_region_mask(offset, length);
161
162 return (mask && (page_private(page) & mask) == mask);
163}
164
165/*
166 * xfs_buf_lru_add - add a buffer to the LRU. 97 * xfs_buf_lru_add - add a buffer to the LRU.
167 * 98 *
168 * The LRU takes a new reference to the buffer so that it will only be freed 99 * The LRU takes a new reference to the buffer so that it will only be freed
@@ -189,7 +120,7 @@ xfs_buf_lru_add(
189 * The unlocked check is safe here because it only occurs when there are not 120 * The unlocked check is safe here because it only occurs when there are not
190 * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there 121 * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
191 * to optimise the shrinker removing the buffer from the LRU and calling 122 * to optimise the shrinker removing the buffer from the LRU and calling
192 * xfs_buf_free(). i.e. it removes an unneccessary round trip on the 123 * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
193 * bt_lru_lock. 124 * bt_lru_lock.
194 */ 125 */
195STATIC void 126STATIC void
@@ -332,7 +263,7 @@ xfs_buf_free(
332 263
333 ASSERT(list_empty(&bp->b_lru)); 264 ASSERT(list_empty(&bp->b_lru));
334 265
335 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 266 if (bp->b_flags & _XBF_PAGES) {
336 uint i; 267 uint i;
337 268
338 if (xfs_buf_is_vmapped(bp)) 269 if (xfs_buf_is_vmapped(bp))
@@ -342,56 +273,77 @@ xfs_buf_free(
342 for (i = 0; i < bp->b_page_count; i++) { 273 for (i = 0; i < bp->b_page_count; i++) {
343 struct page *page = bp->b_pages[i]; 274 struct page *page = bp->b_pages[i];
344 275
345 if (bp->b_flags & _XBF_PAGE_CACHE) 276 __free_page(page);
346 ASSERT(!PagePrivate(page));
347 page_cache_release(page);
348 } 277 }
349 } 278 } else if (bp->b_flags & _XBF_KMEM)
279 kmem_free(bp->b_addr);
350 _xfs_buf_free_pages(bp); 280 _xfs_buf_free_pages(bp);
351 xfs_buf_deallocate(bp); 281 xfs_buf_deallocate(bp);
352} 282}
353 283
354/* 284/*
355 * Finds all pages for buffer in question and builds it's page list. 285 * Allocates all the pages for buffer in question and builds it's page list.
356 */ 286 */
357STATIC int 287STATIC int
358_xfs_buf_lookup_pages( 288xfs_buf_allocate_memory(
359 xfs_buf_t *bp, 289 xfs_buf_t *bp,
360 uint flags) 290 uint flags)
361{ 291{
362 struct address_space *mapping = bp->b_target->bt_mapping;
363 size_t blocksize = bp->b_target->bt_bsize;
364 size_t size = bp->b_count_desired; 292 size_t size = bp->b_count_desired;
365 size_t nbytes, offset; 293 size_t nbytes, offset;
366 gfp_t gfp_mask = xb_to_gfp(flags); 294 gfp_t gfp_mask = xb_to_gfp(flags);
367 unsigned short page_count, i; 295 unsigned short page_count, i;
368 pgoff_t first;
369 xfs_off_t end; 296 xfs_off_t end;
370 int error; 297 int error;
371 298
299 /*
300 * for buffers that are contained within a single page, just allocate
301 * the memory from the heap - there's no need for the complexity of
302 * page arrays to keep allocation down to order 0.
303 */
304 if (bp->b_buffer_length < PAGE_SIZE) {
305 bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
306 if (!bp->b_addr) {
307 /* low memory - use alloc_page loop instead */
308 goto use_alloc_page;
309 }
310
311 if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
312 PAGE_MASK) !=
313 ((unsigned long)bp->b_addr & PAGE_MASK)) {
314 /* b_addr spans two pages - use alloc_page instead */
315 kmem_free(bp->b_addr);
316 bp->b_addr = NULL;
317 goto use_alloc_page;
318 }
319 bp->b_offset = offset_in_page(bp->b_addr);
320 bp->b_pages = bp->b_page_array;
321 bp->b_pages[0] = virt_to_page(bp->b_addr);
322 bp->b_page_count = 1;
323 bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
324 return 0;
325 }
326
327use_alloc_page:
372 end = bp->b_file_offset + bp->b_buffer_length; 328 end = bp->b_file_offset + bp->b_buffer_length;
373 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); 329 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
374
375 error = _xfs_buf_get_pages(bp, page_count, flags); 330 error = _xfs_buf_get_pages(bp, page_count, flags);
376 if (unlikely(error)) 331 if (unlikely(error))
377 return error; 332 return error;
378 bp->b_flags |= _XBF_PAGE_CACHE;
379 333
380 offset = bp->b_offset; 334 offset = bp->b_offset;
381 first = bp->b_file_offset >> PAGE_CACHE_SHIFT; 335 bp->b_flags |= _XBF_PAGES;
382 336
383 for (i = 0; i < bp->b_page_count; i++) { 337 for (i = 0; i < bp->b_page_count; i++) {
384 struct page *page; 338 struct page *page;
385 uint retries = 0; 339 uint retries = 0;
386 340retry:
387 retry: 341 page = alloc_page(gfp_mask);
388 page = find_or_create_page(mapping, first + i, gfp_mask);
389 if (unlikely(page == NULL)) { 342 if (unlikely(page == NULL)) {
390 if (flags & XBF_READ_AHEAD) { 343 if (flags & XBF_READ_AHEAD) {
391 bp->b_page_count = i; 344 bp->b_page_count = i;
392 for (i = 0; i < bp->b_page_count; i++) 345 error = ENOMEM;
393 unlock_page(bp->b_pages[i]); 346 goto out_free_pages;
394 return -ENOMEM;
395 } 347 }
396 348
397 /* 349 /*
@@ -401,9 +353,8 @@ _xfs_buf_lookup_pages(
401 * handle buffer allocation failures we can't do much. 353 * handle buffer allocation failures we can't do much.
402 */ 354 */
403 if (!(++retries % 100)) 355 if (!(++retries % 100))
404 printk(KERN_ERR 356 xfs_err(NULL,
405 "XFS: possible memory allocation " 357 "possible memory allocation deadlock in %s (mode:0x%x)",
406 "deadlock in %s (mode:0x%x)\n",
407 __func__, gfp_mask); 358 __func__, gfp_mask);
408 359
409 XFS_STATS_INC(xb_page_retries); 360 XFS_STATS_INC(xb_page_retries);
@@ -413,52 +364,44 @@ _xfs_buf_lookup_pages(
413 364
414 XFS_STATS_INC(xb_page_found); 365 XFS_STATS_INC(xb_page_found);
415 366
416 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); 367 nbytes = min_t(size_t, size, PAGE_SIZE - offset);
417 size -= nbytes; 368 size -= nbytes;
418
419 ASSERT(!PagePrivate(page));
420 if (!PageUptodate(page)) {
421 page_count--;
422 if (blocksize >= PAGE_CACHE_SIZE) {
423 if (flags & XBF_READ)
424 bp->b_flags |= _XBF_PAGE_LOCKED;
425 } else if (!PagePrivate(page)) {
426 if (test_page_region(page, offset, nbytes))
427 page_count++;
428 }
429 }
430
431 bp->b_pages[i] = page; 369 bp->b_pages[i] = page;
432 offset = 0; 370 offset = 0;
433 } 371 }
372 return 0;
434 373
435 if (!(bp->b_flags & _XBF_PAGE_LOCKED)) { 374out_free_pages:
436 for (i = 0; i < bp->b_page_count; i++) 375 for (i = 0; i < bp->b_page_count; i++)
437 unlock_page(bp->b_pages[i]); 376 __free_page(bp->b_pages[i]);
438 }
439
440 if (page_count == bp->b_page_count)
441 bp->b_flags |= XBF_DONE;
442
443 return error; 377 return error;
444} 378}
445 379
446/* 380/*
447 * Map buffer into kernel address-space if nessecary. 381 * Map buffer into kernel address-space if necessary.
448 */ 382 */
449STATIC int 383STATIC int
450_xfs_buf_map_pages( 384_xfs_buf_map_pages(
451 xfs_buf_t *bp, 385 xfs_buf_t *bp,
452 uint flags) 386 uint flags)
453{ 387{
454 /* A single page buffer is always mappable */ 388 ASSERT(bp->b_flags & _XBF_PAGES);
455 if (bp->b_page_count == 1) { 389 if (bp->b_page_count == 1) {
390 /* A single page buffer is always mappable */
456 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 391 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
457 bp->b_flags |= XBF_MAPPED; 392 bp->b_flags |= XBF_MAPPED;
458 } else if (flags & XBF_MAPPED) { 393 } else if (flags & XBF_MAPPED) {
459 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 394 int retried = 0;
460 -1, PAGE_KERNEL); 395
461 if (unlikely(bp->b_addr == NULL)) 396 do {
397 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
398 -1, PAGE_KERNEL);
399 if (bp->b_addr)
400 break;
401 vm_unmap_aliases();
402 } while (retried++ <= 1);
403
404 if (!bp->b_addr)
462 return -ENOMEM; 405 return -ENOMEM;
463 bp->b_addr += bp->b_offset; 406 bp->b_addr += bp->b_offset;
464 bp->b_flags |= XBF_MAPPED; 407 bp->b_flags |= XBF_MAPPED;
@@ -569,9 +512,14 @@ found:
569 } 512 }
570 } 513 }
571 514
515 /*
516 * if the buffer is stale, clear all the external state associated with
517 * it. We need to keep flags such as how we allocated the buffer memory
518 * intact here.
519 */
572 if (bp->b_flags & XBF_STALE) { 520 if (bp->b_flags & XBF_STALE) {
573 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 521 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
574 bp->b_flags &= XBF_MAPPED; 522 bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
575 } 523 }
576 524
577 trace_xfs_buf_find(bp, flags, _RET_IP_); 525 trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -592,7 +540,7 @@ xfs_buf_get(
592 xfs_buf_flags_t flags) 540 xfs_buf_flags_t flags)
593{ 541{
594 xfs_buf_t *bp, *new_bp; 542 xfs_buf_t *bp, *new_bp;
595 int error = 0, i; 543 int error = 0;
596 544
597 new_bp = xfs_buf_allocate(flags); 545 new_bp = xfs_buf_allocate(flags);
598 if (unlikely(!new_bp)) 546 if (unlikely(!new_bp))
@@ -600,7 +548,7 @@ xfs_buf_get(
600 548
601 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); 549 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
602 if (bp == new_bp) { 550 if (bp == new_bp) {
603 error = _xfs_buf_lookup_pages(bp, flags); 551 error = xfs_buf_allocate_memory(bp, flags);
604 if (error) 552 if (error)
605 goto no_buffer; 553 goto no_buffer;
606 } else { 554 } else {
@@ -609,14 +557,11 @@ xfs_buf_get(
609 return NULL; 557 return NULL;
610 } 558 }
611 559
612 for (i = 0; i < bp->b_page_count; i++)
613 mark_page_accessed(bp->b_pages[i]);
614
615 if (!(bp->b_flags & XBF_MAPPED)) { 560 if (!(bp->b_flags & XBF_MAPPED)) {
616 error = _xfs_buf_map_pages(bp, flags); 561 error = _xfs_buf_map_pages(bp, flags);
617 if (unlikely(error)) { 562 if (unlikely(error)) {
618 printk(KERN_WARNING "%s: failed to map pages\n", 563 xfs_warn(target->bt_mount,
619 __func__); 564 "%s: failed to map pages\n", __func__);
620 goto no_buffer; 565 goto no_buffer;
621 } 566 }
622 } 567 }
@@ -710,10 +655,7 @@ xfs_buf_readahead(
710 xfs_off_t ioff, 655 xfs_off_t ioff,
711 size_t isize) 656 size_t isize)
712{ 657{
713 struct backing_dev_info *bdi; 658 if (bdi_read_congested(target->bt_bdi))
714
715 bdi = target->bt_mapping->backing_dev_info;
716 if (bdi_read_congested(bdi))
717 return; 659 return;
718 660
719 xfs_buf_read(target, ioff, isize, 661 xfs_buf_read(target, ioff, isize,
@@ -791,10 +733,10 @@ xfs_buf_associate_memory(
791 size_t buflen; 733 size_t buflen;
792 int page_count; 734 int page_count;
793 735
794 pageaddr = (unsigned long)mem & PAGE_CACHE_MASK; 736 pageaddr = (unsigned long)mem & PAGE_MASK;
795 offset = (unsigned long)mem - pageaddr; 737 offset = (unsigned long)mem - pageaddr;
796 buflen = PAGE_CACHE_ALIGN(len + offset); 738 buflen = PAGE_ALIGN(len + offset);
797 page_count = buflen >> PAGE_CACHE_SHIFT; 739 page_count = buflen >> PAGE_SHIFT;
798 740
799 /* Free any previous set of page pointers */ 741 /* Free any previous set of page pointers */
800 if (bp->b_pages) 742 if (bp->b_pages)
@@ -811,13 +753,12 @@ xfs_buf_associate_memory(
811 753
812 for (i = 0; i < bp->b_page_count; i++) { 754 for (i = 0; i < bp->b_page_count; i++) {
813 bp->b_pages[i] = mem_to_page((void *)pageaddr); 755 bp->b_pages[i] = mem_to_page((void *)pageaddr);
814 pageaddr += PAGE_CACHE_SIZE; 756 pageaddr += PAGE_SIZE;
815 } 757 }
816 758
817 bp->b_count_desired = len; 759 bp->b_count_desired = len;
818 bp->b_buffer_length = buflen; 760 bp->b_buffer_length = buflen;
819 bp->b_flags |= XBF_MAPPED; 761 bp->b_flags |= XBF_MAPPED;
820 bp->b_flags &= ~_XBF_PAGE_LOCKED;
821 762
822 return 0; 763 return 0;
823} 764}
@@ -850,8 +791,8 @@ xfs_buf_get_uncached(
850 791
851 error = _xfs_buf_map_pages(bp, XBF_MAPPED); 792 error = _xfs_buf_map_pages(bp, XBF_MAPPED);
852 if (unlikely(error)) { 793 if (unlikely(error)) {
853 printk(KERN_WARNING "%s: failed to map pages\n", 794 xfs_warn(target->bt_mount,
854 __func__); 795 "%s: failed to map pages\n", __func__);
855 goto fail_free_mem; 796 goto fail_free_mem;
856 } 797 }
857 798
@@ -924,20 +865,7 @@ xfs_buf_rele(
924 865
925 866
926/* 867/*
927 * Mutual exclusion on buffers. Locking model: 868 * Lock a buffer object, if it is not already locked.
928 *
929 * Buffers associated with inodes for which buffer locking
930 * is not enabled are not protected by semaphores, and are
931 * assumed to be exclusively owned by the caller. There is a
932 * spinlock in the buffer, used by the caller when concurrent
933 * access is possible.
934 */
935
936/*
937 * Locks a buffer object, if it is not already locked. Note that this in
938 * no way locks the underlying pages, so it is only useful for
939 * synchronizing concurrent use of buffer objects, not for synchronizing
940 * independent access to the underlying pages.
941 * 869 *
942 * If we come across a stale, pinned, locked buffer, we know that we are 870 * If we come across a stale, pinned, locked buffer, we know that we are
943 * being asked to lock a buffer that has been reallocated. Because it is 871 * being asked to lock a buffer that has been reallocated. Because it is
@@ -971,10 +899,7 @@ xfs_buf_lock_value(
971} 899}
972 900
973/* 901/*
974 * Locks a buffer object. 902 * Lock a buffer object.
975 * Note that this in no way locks the underlying pages, so it is only
976 * useful for synchronizing concurrent use of buffer objects, not for
977 * synchronizing independent access to the underlying pages.
978 * 903 *
979 * If we come across a stale, pinned, locked buffer, we know that we 904 * If we come across a stale, pinned, locked buffer, we know that we
980 * are being asked to lock a buffer that has been reallocated. Because 905 * are being asked to lock a buffer that has been reallocated. Because
@@ -990,8 +915,6 @@ xfs_buf_lock(
990 915
991 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 916 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
992 xfs_log_force(bp->b_target->bt_mount, 0); 917 xfs_log_force(bp->b_target->bt_mount, 0);
993 if (atomic_read(&bp->b_io_remaining))
994 blk_run_address_space(bp->b_target->bt_mapping);
995 down(&bp->b_sema); 918 down(&bp->b_sema);
996 XB_SET_OWNER(bp); 919 XB_SET_OWNER(bp);
997 920
@@ -1035,9 +958,7 @@ xfs_buf_wait_unpin(
1035 set_current_state(TASK_UNINTERRUPTIBLE); 958 set_current_state(TASK_UNINTERRUPTIBLE);
1036 if (atomic_read(&bp->b_pin_count) == 0) 959 if (atomic_read(&bp->b_pin_count) == 0)
1037 break; 960 break;
1038 if (atomic_read(&bp->b_io_remaining)) 961 io_schedule();
1039 blk_run_address_space(bp->b_target->bt_mapping);
1040 schedule();
1041 } 962 }
1042 remove_wait_queue(&bp->b_waiters, &wait); 963 remove_wait_queue(&bp->b_waiters, &wait);
1043 set_current_state(TASK_RUNNING); 964 set_current_state(TASK_RUNNING);
@@ -1249,10 +1170,8 @@ _xfs_buf_ioend(
1249 xfs_buf_t *bp, 1170 xfs_buf_t *bp,
1250 int schedule) 1171 int schedule)
1251{ 1172{
1252 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { 1173 if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1253 bp->b_flags &= ~_XBF_PAGE_LOCKED;
1254 xfs_buf_ioend(bp, schedule); 1174 xfs_buf_ioend(bp, schedule);
1255 }
1256} 1175}
1257 1176
1258STATIC void 1177STATIC void
@@ -1261,35 +1180,12 @@ xfs_buf_bio_end_io(
1261 int error) 1180 int error)
1262{ 1181{
1263 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; 1182 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
1264 unsigned int blocksize = bp->b_target->bt_bsize;
1265 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1266 1183
1267 xfs_buf_ioerror(bp, -error); 1184 xfs_buf_ioerror(bp, -error);
1268 1185
1269 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1186 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1270 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1187 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1271 1188
1272 do {
1273 struct page *page = bvec->bv_page;
1274
1275 ASSERT(!PagePrivate(page));
1276 if (unlikely(bp->b_error)) {
1277 if (bp->b_flags & XBF_READ)
1278 ClearPageUptodate(page);
1279 } else if (blocksize >= PAGE_CACHE_SIZE) {
1280 SetPageUptodate(page);
1281 } else if (!PagePrivate(page) &&
1282 (bp->b_flags & _XBF_PAGE_CACHE)) {
1283 set_page_region(page, bvec->bv_offset, bvec->bv_len);
1284 }
1285
1286 if (--bvec >= bio->bi_io_vec)
1287 prefetchw(&bvec->bv_page->flags);
1288
1289 if (bp->b_flags & _XBF_PAGE_LOCKED)
1290 unlock_page(page);
1291 } while (bvec >= bio->bi_io_vec);
1292
1293 _xfs_buf_ioend(bp, 1); 1189 _xfs_buf_ioend(bp, 1);
1294 bio_put(bio); 1190 bio_put(bio);
1295} 1191}
@@ -1303,7 +1199,6 @@ _xfs_buf_ioapply(
1303 int offset = bp->b_offset; 1199 int offset = bp->b_offset;
1304 int size = bp->b_count_desired; 1200 int size = bp->b_count_desired;
1305 sector_t sector = bp->b_bn; 1201 sector_t sector = bp->b_bn;
1306 unsigned int blocksize = bp->b_target->bt_bsize;
1307 1202
1308 total_nr_pages = bp->b_page_count; 1203 total_nr_pages = bp->b_page_count;
1309 map_i = 0; 1204 map_i = 0;
@@ -1324,29 +1219,6 @@ _xfs_buf_ioapply(
1324 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; 1219 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
1325 } 1220 }
1326 1221
1327 /* Special code path for reading a sub page size buffer in --
1328 * we populate up the whole page, and hence the other metadata
1329 * in the same page. This optimization is only valid when the
1330 * filesystem block size is not smaller than the page size.
1331 */
1332 if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
1333 ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
1334 (XBF_READ|_XBF_PAGE_LOCKED)) &&
1335 (blocksize >= PAGE_CACHE_SIZE)) {
1336 bio = bio_alloc(GFP_NOIO, 1);
1337
1338 bio->bi_bdev = bp->b_target->bt_bdev;
1339 bio->bi_sector = sector - (offset >> BBSHIFT);
1340 bio->bi_end_io = xfs_buf_bio_end_io;
1341 bio->bi_private = bp;
1342
1343 bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
1344 size = 0;
1345
1346 atomic_inc(&bp->b_io_remaining);
1347
1348 goto submit_io;
1349 }
1350 1222
1351next_chunk: 1223next_chunk:
1352 atomic_inc(&bp->b_io_remaining); 1224 atomic_inc(&bp->b_io_remaining);
@@ -1360,8 +1232,9 @@ next_chunk:
1360 bio->bi_end_io = xfs_buf_bio_end_io; 1232 bio->bi_end_io = xfs_buf_bio_end_io;
1361 bio->bi_private = bp; 1233 bio->bi_private = bp;
1362 1234
1235
1363 for (; size && nr_pages; nr_pages--, map_i++) { 1236 for (; size && nr_pages; nr_pages--, map_i++) {
1364 int rbytes, nbytes = PAGE_CACHE_SIZE - offset; 1237 int rbytes, nbytes = PAGE_SIZE - offset;
1365 1238
1366 if (nbytes > size) 1239 if (nbytes > size)
1367 nbytes = size; 1240 nbytes = size;
@@ -1376,7 +1249,6 @@ next_chunk:
1376 total_nr_pages--; 1249 total_nr_pages--;
1377 } 1250 }
1378 1251
1379submit_io:
1380 if (likely(bio->bi_size)) { 1252 if (likely(bio->bi_size)) {
1381 if (xfs_buf_is_vmapped(bp)) { 1253 if (xfs_buf_is_vmapped(bp)) {
1382 flush_kernel_vmap_range(bp->b_addr, 1254 flush_kernel_vmap_range(bp->b_addr,
@@ -1386,18 +1258,7 @@ submit_io:
1386 if (size) 1258 if (size)
1387 goto next_chunk; 1259 goto next_chunk;
1388 } else { 1260 } else {
1389 /*
1390 * if we get here, no pages were added to the bio. However,
1391 * we can't just error out here - if the pages are locked then
1392 * we have to unlock them otherwise we can hang on a later
1393 * access to the page.
1394 */
1395 xfs_buf_ioerror(bp, EIO); 1261 xfs_buf_ioerror(bp, EIO);
1396 if (bp->b_flags & _XBF_PAGE_LOCKED) {
1397 int i;
1398 for (i = 0; i < bp->b_page_count; i++)
1399 unlock_page(bp->b_pages[i]);
1400 }
1401 bio_put(bio); 1262 bio_put(bio);
1402 } 1263 }
1403} 1264}
@@ -1442,8 +1303,6 @@ xfs_buf_iowait(
1442{ 1303{
1443 trace_xfs_buf_iowait(bp, _RET_IP_); 1304 trace_xfs_buf_iowait(bp, _RET_IP_);
1444 1305
1445 if (atomic_read(&bp->b_io_remaining))
1446 blk_run_address_space(bp->b_target->bt_mapping);
1447 wait_for_completion(&bp->b_iowait); 1306 wait_for_completion(&bp->b_iowait);
1448 1307
1449 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1308 trace_xfs_buf_iowait_done(bp, _RET_IP_);
@@ -1461,8 +1320,8 @@ xfs_buf_offset(
1461 return XFS_BUF_PTR(bp) + offset; 1320 return XFS_BUF_PTR(bp) + offset;
1462 1321
1463 offset += bp->b_offset; 1322 offset += bp->b_offset;
1464 page = bp->b_pages[offset >> PAGE_CACHE_SHIFT]; 1323 page = bp->b_pages[offset >> PAGE_SHIFT];
1465 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1)); 1324 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
1466} 1325}
1467 1326
1468/* 1327/*
@@ -1484,9 +1343,9 @@ xfs_buf_iomove(
1484 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; 1343 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
1485 cpoff = xfs_buf_poff(boff + bp->b_offset); 1344 cpoff = xfs_buf_poff(boff + bp->b_offset);
1486 csize = min_t(size_t, 1345 csize = min_t(size_t,
1487 PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); 1346 PAGE_SIZE-cpoff, bp->b_count_desired-boff);
1488 1347
1489 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); 1348 ASSERT(((csize + cpoff) <= PAGE_SIZE));
1490 1349
1491 switch (mode) { 1350 switch (mode) {
1492 case XBRW_ZERO: 1351 case XBRW_ZERO:
@@ -1599,7 +1458,6 @@ xfs_free_buftarg(
1599 xfs_flush_buftarg(btp, 1); 1458 xfs_flush_buftarg(btp, 1);
1600 if (mp->m_flags & XFS_MOUNT_BARRIER) 1459 if (mp->m_flags & XFS_MOUNT_BARRIER)
1601 xfs_blkdev_issue_flush(btp); 1460 xfs_blkdev_issue_flush(btp);
1602 iput(btp->bt_mapping->host);
1603 1461
1604 kthread_stop(btp->bt_task); 1462 kthread_stop(btp->bt_task);
1605 kmem_free(btp); 1463 kmem_free(btp);
@@ -1617,21 +1475,12 @@ xfs_setsize_buftarg_flags(
1617 btp->bt_smask = sectorsize - 1; 1475 btp->bt_smask = sectorsize - 1;
1618 1476
1619 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1477 if (set_blocksize(btp->bt_bdev, sectorsize)) {
1620 printk(KERN_WARNING 1478 xfs_warn(btp->bt_mount,
1621 "XFS: Cannot set_blocksize to %u on device %s\n", 1479 "Cannot set_blocksize to %u on device %s\n",
1622 sectorsize, XFS_BUFTARG_NAME(btp)); 1480 sectorsize, XFS_BUFTARG_NAME(btp));
1623 return EINVAL; 1481 return EINVAL;
1624 } 1482 }
1625 1483
1626 if (verbose &&
1627 (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
1628 printk(KERN_WARNING
1629 "XFS: %u byte sectors in use on device %s. "
1630 "This is suboptimal; %u or greater is ideal.\n",
1631 sectorsize, XFS_BUFTARG_NAME(btp),
1632 (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
1633 }
1634
1635 return 0; 1484 return 0;
1636} 1485}
1637 1486
@@ -1646,7 +1495,7 @@ xfs_setsize_buftarg_early(
1646 struct block_device *bdev) 1495 struct block_device *bdev)
1647{ 1496{
1648 return xfs_setsize_buftarg_flags(btp, 1497 return xfs_setsize_buftarg_flags(btp,
1649 PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0); 1498 PAGE_SIZE, bdev_logical_block_size(bdev), 0);
1650} 1499}
1651 1500
1652int 1501int
@@ -1659,41 +1508,6 @@ xfs_setsize_buftarg(
1659} 1508}
1660 1509
1661STATIC int 1510STATIC int
1662xfs_mapping_buftarg(
1663 xfs_buftarg_t *btp,
1664 struct block_device *bdev)
1665{
1666 struct backing_dev_info *bdi;
1667 struct inode *inode;
1668 struct address_space *mapping;
1669 static const struct address_space_operations mapping_aops = {
1670 .sync_page = block_sync_page,
1671 .migratepage = fail_migrate_page,
1672 };
1673
1674 inode = new_inode(bdev->bd_inode->i_sb);
1675 if (!inode) {
1676 printk(KERN_WARNING
1677 "XFS: Cannot allocate mapping inode for device %s\n",
1678 XFS_BUFTARG_NAME(btp));
1679 return ENOMEM;
1680 }
1681 inode->i_ino = get_next_ino();
1682 inode->i_mode = S_IFBLK;
1683 inode->i_bdev = bdev;
1684 inode->i_rdev = bdev->bd_dev;
1685 bdi = blk_get_backing_dev_info(bdev);
1686 if (!bdi)
1687 bdi = &default_backing_dev_info;
1688 mapping = &inode->i_data;
1689 mapping->a_ops = &mapping_aops;
1690 mapping->backing_dev_info = bdi;
1691 mapping_set_gfp_mask(mapping, GFP_NOFS);
1692 btp->bt_mapping = mapping;
1693 return 0;
1694}
1695
1696STATIC int
1697xfs_alloc_delwrite_queue( 1511xfs_alloc_delwrite_queue(
1698 xfs_buftarg_t *btp, 1512 xfs_buftarg_t *btp,
1699 const char *fsname) 1513 const char *fsname)
@@ -1721,12 +1535,14 @@ xfs_alloc_buftarg(
1721 btp->bt_mount = mp; 1535 btp->bt_mount = mp;
1722 btp->bt_dev = bdev->bd_dev; 1536 btp->bt_dev = bdev->bd_dev;
1723 btp->bt_bdev = bdev; 1537 btp->bt_bdev = bdev;
1538 btp->bt_bdi = blk_get_backing_dev_info(bdev);
1539 if (!btp->bt_bdi)
1540 goto error;
1541
1724 INIT_LIST_HEAD(&btp->bt_lru); 1542 INIT_LIST_HEAD(&btp->bt_lru);
1725 spin_lock_init(&btp->bt_lru_lock); 1543 spin_lock_init(&btp->bt_lru_lock);
1726 if (xfs_setsize_buftarg_early(btp, bdev)) 1544 if (xfs_setsize_buftarg_early(btp, bdev))
1727 goto error; 1545 goto error;
1728 if (xfs_mapping_buftarg(btp, bdev))
1729 goto error;
1730 if (xfs_alloc_delwrite_queue(btp, fsname)) 1546 if (xfs_alloc_delwrite_queue(btp, fsname))
1731 goto error; 1547 goto error;
1732 btp->bt_shrinker.shrink = xfs_buftarg_shrink; 1548 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
@@ -1923,8 +1739,8 @@ xfsbufd(
1923 do { 1739 do {
1924 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); 1740 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
1925 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); 1741 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
1926 int count = 0;
1927 struct list_head tmp; 1742 struct list_head tmp;
1743 struct blk_plug plug;
1928 1744
1929 if (unlikely(freezing(current))) { 1745 if (unlikely(freezing(current))) {
1930 set_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1746 set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
@@ -1940,16 +1756,15 @@ xfsbufd(
1940 1756
1941 xfs_buf_delwri_split(target, &tmp, age); 1757 xfs_buf_delwri_split(target, &tmp, age);
1942 list_sort(NULL, &tmp, xfs_buf_cmp); 1758 list_sort(NULL, &tmp, xfs_buf_cmp);
1759
1760 blk_start_plug(&plug);
1943 while (!list_empty(&tmp)) { 1761 while (!list_empty(&tmp)) {
1944 struct xfs_buf *bp; 1762 struct xfs_buf *bp;
1945 bp = list_first_entry(&tmp, struct xfs_buf, b_list); 1763 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1946 list_del_init(&bp->b_list); 1764 list_del_init(&bp->b_list);
1947 xfs_bdstrat_cb(bp); 1765 xfs_bdstrat_cb(bp);
1948 count++;
1949 } 1766 }
1950 if (count) 1767 blk_finish_plug(&plug);
1951 blk_run_address_space(target->bt_mapping);
1952
1953 } while (!kthread_should_stop()); 1768 } while (!kthread_should_stop());
1954 1769
1955 return 0; 1770 return 0;
@@ -1969,6 +1784,7 @@ xfs_flush_buftarg(
1969 int pincount = 0; 1784 int pincount = 0;
1970 LIST_HEAD(tmp_list); 1785 LIST_HEAD(tmp_list);
1971 LIST_HEAD(wait_list); 1786 LIST_HEAD(wait_list);
1787 struct blk_plug plug;
1972 1788
1973 xfs_buf_runall_queues(xfsconvertd_workqueue); 1789 xfs_buf_runall_queues(xfsconvertd_workqueue);
1974 xfs_buf_runall_queues(xfsdatad_workqueue); 1790 xfs_buf_runall_queues(xfsdatad_workqueue);
@@ -1983,6 +1799,8 @@ xfs_flush_buftarg(
1983 * we do that after issuing all the IO. 1799 * we do that after issuing all the IO.
1984 */ 1800 */
1985 list_sort(NULL, &tmp_list, xfs_buf_cmp); 1801 list_sort(NULL, &tmp_list, xfs_buf_cmp);
1802
1803 blk_start_plug(&plug);
1986 while (!list_empty(&tmp_list)) { 1804 while (!list_empty(&tmp_list)) {
1987 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); 1805 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
1988 ASSERT(target == bp->b_target); 1806 ASSERT(target == bp->b_target);
@@ -1993,10 +1811,10 @@ xfs_flush_buftarg(
1993 } 1811 }
1994 xfs_bdstrat_cb(bp); 1812 xfs_bdstrat_cb(bp);
1995 } 1813 }
1814 blk_finish_plug(&plug);
1996 1815
1997 if (wait) { 1816 if (wait) {
1998 /* Expedite and wait for IO to complete. */ 1817 /* Wait for IO to complete. */
1999 blk_run_address_space(target->bt_mapping);
2000 while (!list_empty(&wait_list)) { 1818 while (!list_empty(&wait_list)) {
2001 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 1819 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
2002 1820
@@ -2022,11 +1840,12 @@ xfs_buf_init(void)
2022 if (!xfslogd_workqueue) 1840 if (!xfslogd_workqueue)
2023 goto out_free_buf_zone; 1841 goto out_free_buf_zone;
2024 1842
2025 xfsdatad_workqueue = create_workqueue("xfsdatad"); 1843 xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
2026 if (!xfsdatad_workqueue) 1844 if (!xfsdatad_workqueue)
2027 goto out_destroy_xfslogd_workqueue; 1845 goto out_destroy_xfslogd_workqueue;
2028 1846
2029 xfsconvertd_workqueue = create_workqueue("xfsconvertd"); 1847 xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
1848 WQ_MEM_RECLAIM, 1);
2030 if (!xfsconvertd_workqueue) 1849 if (!xfsconvertd_workqueue)
2031 goto out_destroy_xfsdatad_workqueue; 1850 goto out_destroy_xfsdatad_workqueue;
2032 1851
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index cbe65950e524..a9a1c4512645 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -61,30 +61,11 @@ typedef enum {
61#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */ 61#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */
62 62
63/* flags used only internally */ 63/* flags used only internally */
64#define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */
65#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */ 64#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */
66#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */ 65#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */
66#define _XBF_KMEM (1 << 20)/* backed by heap memory */
67#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */ 67#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */
68 68
69/*
70 * Special flag for supporting metadata blocks smaller than a FSB.
71 *
72 * In this case we can have multiple xfs_buf_t on a single page and
73 * need to lock out concurrent xfs_buf_t readers as they only
74 * serialise access to the buffer.
75 *
76 * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
77 * between reads of the page. Hence we can have one thread read the
78 * page and modify it, but then race with another thread that thinks
79 * the page is not up-to-date and hence reads it again.
80 *
81 * The result is that the first modifcation to the page is lost.
82 * This sort of AGF/AGI reading race can happen when unlinking inodes
83 * that require truncation and results in the AGI unlinked list
84 * modifications being lost.
85 */
86#define _XBF_PAGE_LOCKED (1 << 22)
87
88typedef unsigned int xfs_buf_flags_t; 69typedef unsigned int xfs_buf_flags_t;
89 70
90#define XFS_BUF_FLAGS \ 71#define XFS_BUF_FLAGS \
@@ -100,12 +81,10 @@ typedef unsigned int xfs_buf_flags_t;
100 { XBF_LOCK, "LOCK" }, /* should never be set */\ 81 { XBF_LOCK, "LOCK" }, /* should never be set */\
101 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ 82 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\
102 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ 83 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\
103 { _XBF_PAGE_CACHE, "PAGE_CACHE" }, \
104 { _XBF_PAGES, "PAGES" }, \ 84 { _XBF_PAGES, "PAGES" }, \
105 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ 85 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
106 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 86 { _XBF_KMEM, "KMEM" }, \
107 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" } 87 { _XBF_DELWRI_Q, "DELWRI_Q" }
108
109 88
110typedef enum { 89typedef enum {
111 XBT_FORCE_SLEEP = 0, 90 XBT_FORCE_SLEEP = 0,
@@ -120,7 +99,7 @@ typedef struct xfs_bufhash {
120typedef struct xfs_buftarg { 99typedef struct xfs_buftarg {
121 dev_t bt_dev; 100 dev_t bt_dev;
122 struct block_device *bt_bdev; 101 struct block_device *bt_bdev;
123 struct address_space *bt_mapping; 102 struct backing_dev_info *bt_bdi;
124 struct xfs_mount *bt_mount; 103 struct xfs_mount *bt_mount;
125 unsigned int bt_bsize; 104 unsigned int bt_bsize;
126 unsigned int bt_sshift; 105 unsigned int bt_sshift;
@@ -139,17 +118,6 @@ typedef struct xfs_buftarg {
139 unsigned int bt_lru_nr; 118 unsigned int bt_lru_nr;
140} xfs_buftarg_t; 119} xfs_buftarg_t;
141 120
142/*
143 * xfs_buf_t: Buffer structure for pagecache-based buffers
144 *
145 * This buffer structure is used by the pagecache buffer management routines
146 * to refer to an assembly of pages forming a logical buffer.
147 *
148 * The buffer structure is used on a temporary basis only, and discarded when
149 * released. The real data storage is recorded in the pagecache. Buffers are
150 * hashed to the block device on which the file system resides.
151 */
152
153struct xfs_buf; 121struct xfs_buf;
154typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 122typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
155 123
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
index 05201ae719e5..d61611c88012 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -152,6 +152,8 @@ xfs_ioc_trim(
152 152
153 if (!capable(CAP_SYS_ADMIN)) 153 if (!capable(CAP_SYS_ADMIN))
154 return -XFS_ERROR(EPERM); 154 return -XFS_ERROR(EPERM);
155 if (!blk_queue_discard(q))
156 return -XFS_ERROR(EOPNOTSUPP);
155 if (copy_from_user(&range, urange, sizeof(range))) 157 if (copy_from_user(&range, urange, sizeof(range)))
156 return -XFS_ERROR(EFAULT); 158 return -XFS_ERROR(EFAULT);
157 159
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index fc0114da7fdd..f4f878fc0083 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -89,8 +89,10 @@ xfs_fs_encode_fh(
89 * seven combinations work. The real answer is "don't use v2". 89 * seven combinations work. The real answer is "don't use v2".
90 */ 90 */
91 len = xfs_fileid_length(fileid_type); 91 len = xfs_fileid_length(fileid_type);
92 if (*max_len < len) 92 if (*max_len < len) {
93 *max_len = len;
93 return 255; 94 return 255;
95 }
94 *max_len = len; 96 *max_len = len;
95 97
96 switch (fileid_type) { 98 switch (fileid_type) {
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index a55c1b46b219..f4213ba1ff85 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -381,7 +381,7 @@ xfs_aio_write_isize_update(
381 381
382/* 382/*
383 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then 383 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
384 * part of the I/O may have been written to disk before the error occured. In 384 * part of the I/O may have been written to disk before the error occurred. In
385 * this case the on-disk file size may have been adjusted beyond the in-memory 385 * this case the on-disk file size may have been adjusted beyond the in-memory
386 * file size and now needs to be truncated back. 386 * file size and now needs to be truncated back.
387 */ 387 */
@@ -896,6 +896,7 @@ xfs_file_fallocate(
896 xfs_flock64_t bf; 896 xfs_flock64_t bf;
897 xfs_inode_t *ip = XFS_I(inode); 897 xfs_inode_t *ip = XFS_I(inode);
898 int cmd = XFS_IOC_RESVSP; 898 int cmd = XFS_IOC_RESVSP;
899 int attr_flags = XFS_ATTR_NOLOCK;
899 900
900 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 901 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
901 return -EOPNOTSUPP; 902 return -EOPNOTSUPP;
@@ -918,7 +919,10 @@ xfs_file_fallocate(
918 goto out_unlock; 919 goto out_unlock;
919 } 920 }
920 921
921 error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK); 922 if (file->f_flags & O_DSYNC)
923 attr_flags |= XFS_ATTR_SYNC;
924
925 error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
922 if (error) 926 if (error)
923 goto out_unlock; 927 goto out_unlock;
924 928
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index f5e2a19e0f8e..acca2c5ca3fa 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -624,6 +624,10 @@ xfs_ioc_space(
624 624
625 if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) 625 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
626 attr_flags |= XFS_ATTR_NONBLOCK; 626 attr_flags |= XFS_ATTR_NONBLOCK;
627
628 if (filp->f_flags & O_DSYNC)
629 attr_flags |= XFS_ATTR_SYNC;
630
627 if (ioflags & IO_INVIS) 631 if (ioflags & IO_INVIS)
628 attr_flags |= XFS_ATTR_DMI; 632 attr_flags |= XFS_ATTR_DMI;
629 633
@@ -695,14 +699,19 @@ xfs_ioc_fsgeometry_v1(
695 xfs_mount_t *mp, 699 xfs_mount_t *mp,
696 void __user *arg) 700 void __user *arg)
697{ 701{
698 xfs_fsop_geom_v1_t fsgeo; 702 xfs_fsop_geom_t fsgeo;
699 int error; 703 int error;
700 704
701 error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); 705 error = xfs_fs_geometry(mp, &fsgeo, 3);
702 if (error) 706 if (error)
703 return -error; 707 return -error;
704 708
705 if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) 709 /*
710 * Caller should have passed an argument of type
711 * xfs_fsop_geom_v1_t. This is a proper subset of the
712 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
713 */
714 if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
706 return -XFS_ERROR(EFAULT); 715 return -XFS_ERROR(EFAULT);
707 return 0; 716 return 0;
708} 717}
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index bd5727852fd6..dd21784525a8 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -70,7 +70,7 @@ xfs_synchronize_times(
70 70
71/* 71/*
72 * If the linux inode is valid, mark it dirty. 72 * If the linux inode is valid, mark it dirty.
73 * Used when commiting a dirty inode into a transaction so that 73 * Used when committing a dirty inode into a transaction so that
74 * the inode will get written back by the linux code 74 * the inode will get written back by the linux code
75 */ 75 */
76void 76void
@@ -102,7 +102,8 @@ xfs_mark_inode_dirty(
102STATIC int 102STATIC int
103xfs_init_security( 103xfs_init_security(
104 struct inode *inode, 104 struct inode *inode,
105 struct inode *dir) 105 struct inode *dir,
106 const struct qstr *qstr)
106{ 107{
107 struct xfs_inode *ip = XFS_I(inode); 108 struct xfs_inode *ip = XFS_I(inode);
108 size_t length; 109 size_t length;
@@ -110,7 +111,7 @@ xfs_init_security(
110 unsigned char *name; 111 unsigned char *name;
111 int error; 112 int error;
112 113
113 error = security_inode_init_security(inode, dir, (char **)&name, 114 error = security_inode_init_security(inode, dir, qstr, (char **)&name,
114 &value, &length); 115 &value, &length);
115 if (error) { 116 if (error) {
116 if (error == -EOPNOTSUPP) 117 if (error == -EOPNOTSUPP)
@@ -194,7 +195,7 @@ xfs_vn_mknod(
194 195
195 inode = VFS_I(ip); 196 inode = VFS_I(ip);
196 197
197 error = xfs_init_security(inode, dir); 198 error = xfs_init_security(inode, dir, &dentry->d_name);
198 if (unlikely(error)) 199 if (unlikely(error))
199 goto out_cleanup_inode; 200 goto out_cleanup_inode;
200 201
@@ -367,7 +368,7 @@ xfs_vn_symlink(
367 368
368 inode = VFS_I(cip); 369 inode = VFS_I(cip);
369 370
370 error = xfs_init_security(inode, dir); 371 error = xfs_init_security(inode, dir, &dentry->d_name);
371 if (unlikely(error)) 372 if (unlikely(error))
372 goto out_cleanup_inode; 373 goto out_cleanup_inode;
373 374
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 096494997747..244be9cbfe78 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -39,7 +39,6 @@
39#include <mrlock.h> 39#include <mrlock.h>
40#include <time.h> 40#include <time.h>
41 41
42#include <support/debug.h>
43#include <support/uuid.h> 42#include <support/uuid.h>
44 43
45#include <linux/semaphore.h> 44#include <linux/semaphore.h>
@@ -86,6 +85,7 @@
86#include <xfs_aops.h> 85#include <xfs_aops.h>
87#include <xfs_super.h> 86#include <xfs_super.h>
88#include <xfs_buf.h> 87#include <xfs_buf.h>
88#include <xfs_message.h>
89 89
90/* 90/*
91 * Feature macros (disable/enable) 91 * Feature macros (disable/enable)
@@ -280,4 +280,25 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
280#define __arch_pack 280#define __arch_pack
281#endif 281#endif
282 282
283#define ASSERT_ALWAYS(expr) \
284 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
285
286#ifndef DEBUG
287#define ASSERT(expr) ((void)0)
288
289#ifndef STATIC
290# define STATIC static noinline
291#endif
292
293#else /* DEBUG */
294
295#define ASSERT(expr) \
296 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
297
298#ifndef STATIC
299# define STATIC noinline
300#endif
301
302#endif /* DEBUG */
303
283#endif /* __XFS_LINUX__ */ 304#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
new file mode 100644
index 000000000000..3ca795609113
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -0,0 +1,124 @@
1/*
2 * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h"
27
28/*
29 * XFS logging functions
30 */
31static void
32__xfs_printk(
33 const char *level,
34 const struct xfs_mount *mp,
35 struct va_format *vaf)
36{
37 if (mp && mp->m_fsname)
38 printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
39 printk("%sXFS: %pV\n", level, vaf);
40}
41
42void xfs_printk(
43 const char *level,
44 const struct xfs_mount *mp,
45 const char *fmt, ...)
46{
47 struct va_format vaf;
48 va_list args;
49
50 va_start(args, fmt);
51
52 vaf.fmt = fmt;
53 vaf.va = &args;
54
55 __xfs_printk(level, mp, &vaf);
56 va_end(args);
57}
58
59#define define_xfs_printk_level(func, kern_level) \
60void func(const struct xfs_mount *mp, const char *fmt, ...) \
61{ \
62 struct va_format vaf; \
63 va_list args; \
64 \
65 va_start(args, fmt); \
66 \
67 vaf.fmt = fmt; \
68 vaf.va = &args; \
69 \
70 __xfs_printk(kern_level, mp, &vaf); \
71 va_end(args); \
72} \
73
74define_xfs_printk_level(xfs_emerg, KERN_EMERG);
75define_xfs_printk_level(xfs_alert, KERN_ALERT);
76define_xfs_printk_level(xfs_crit, KERN_CRIT);
77define_xfs_printk_level(xfs_err, KERN_ERR);
78define_xfs_printk_level(xfs_warn, KERN_WARNING);
79define_xfs_printk_level(xfs_notice, KERN_NOTICE);
80define_xfs_printk_level(xfs_info, KERN_INFO);
81#ifdef DEBUG
82define_xfs_printk_level(xfs_debug, KERN_DEBUG);
83#endif
84
85void
86xfs_alert_tag(
87 const struct xfs_mount *mp,
88 int panic_tag,
89 const char *fmt, ...)
90{
91 struct va_format vaf;
92 va_list args;
93 int do_panic = 0;
94
95 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
96 xfs_printk(KERN_ALERT, mp,
97 "XFS: Transforming an alert into a BUG.");
98 do_panic = 1;
99 }
100
101 va_start(args, fmt);
102
103 vaf.fmt = fmt;
104 vaf.va = &args;
105
106 __xfs_printk(KERN_ALERT, mp, &vaf);
107 va_end(args);
108
109 BUG_ON(do_panic);
110}
111
112void
113assfail(char *expr, char *file, int line)
114{
115 xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
116 expr, file, line);
117 BUG();
118}
119
120void
121xfs_hex_dump(void *p, int length)
122{
123 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
124}
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
new file mode 100644
index 000000000000..f1b3fc1b6c4e
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -0,0 +1,40 @@
1#ifndef __XFS_MESSAGE_H
2#define __XFS_MESSAGE_H 1
3
4struct xfs_mount;
5
6extern void xfs_printk(const char *level, const struct xfs_mount *mp,
7 const char *fmt, ...)
8 __attribute__ ((format (printf, 3, 4)));
9extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
10 __attribute__ ((format (printf, 2, 3)));
11extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
12 __attribute__ ((format (printf, 2, 3)));
13extern void xfs_alert_tag(const struct xfs_mount *mp, int tag,
14 const char *fmt, ...)
15 __attribute__ ((format (printf, 3, 4)));
16extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...)
17 __attribute__ ((format (printf, 2, 3)));
18extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...)
19 __attribute__ ((format (printf, 2, 3)));
20extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...)
21 __attribute__ ((format (printf, 2, 3)));
22extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...)
23 __attribute__ ((format (printf, 2, 3)));
24extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
25 __attribute__ ((format (printf, 2, 3)));
26
27#ifdef DEBUG
28extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
29 __attribute__ ((format (printf, 2, 3)));
30#else
31static inline void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
32{
33}
34#endif
35
36extern void assfail(char *expr, char *f, int l);
37
38extern void xfs_hex_dump(void *p, int length);
39
40#endif /* __XFS_MESSAGE_H */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 9731898083ae..b38e58d02299 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -173,6 +173,15 @@ xfs_parseargs(
173 __uint8_t iosizelog = 0; 173 __uint8_t iosizelog = 0;
174 174
175 /* 175 /*
176 * set up the mount name first so all the errors will refer to the
177 * correct device.
178 */
179 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
180 if (!mp->m_fsname)
181 return ENOMEM;
182 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
183
184 /*
176 * Copy binary VFS mount flags we are interested in. 185 * Copy binary VFS mount flags we are interested in.
177 */ 186 */
178 if (sb->s_flags & MS_RDONLY) 187 if (sb->s_flags & MS_RDONLY)
@@ -189,6 +198,7 @@ xfs_parseargs(
189 mp->m_flags |= XFS_MOUNT_BARRIER; 198 mp->m_flags |= XFS_MOUNT_BARRIER;
190 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; 199 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
191 mp->m_flags |= XFS_MOUNT_SMALL_INUMS; 200 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
201 mp->m_flags |= XFS_MOUNT_DELAYLOG;
192 202
193 /* 203 /*
194 * These can be overridden by the mount option parsing. 204 * These can be overridden by the mount option parsing.
@@ -207,24 +217,21 @@ xfs_parseargs(
207 217
208 if (!strcmp(this_char, MNTOPT_LOGBUFS)) { 218 if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
209 if (!value || !*value) { 219 if (!value || !*value) {
210 cmn_err(CE_WARN, 220 xfs_warn(mp, "%s option requires an argument",
211 "XFS: %s option requires an argument",
212 this_char); 221 this_char);
213 return EINVAL; 222 return EINVAL;
214 } 223 }
215 mp->m_logbufs = simple_strtoul(value, &eov, 10); 224 mp->m_logbufs = simple_strtoul(value, &eov, 10);
216 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { 225 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
217 if (!value || !*value) { 226 if (!value || !*value) {
218 cmn_err(CE_WARN, 227 xfs_warn(mp, "%s option requires an argument",
219 "XFS: %s option requires an argument",
220 this_char); 228 this_char);
221 return EINVAL; 229 return EINVAL;
222 } 230 }
223 mp->m_logbsize = suffix_strtoul(value, &eov, 10); 231 mp->m_logbsize = suffix_strtoul(value, &eov, 10);
224 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { 232 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
225 if (!value || !*value) { 233 if (!value || !*value) {
226 cmn_err(CE_WARN, 234 xfs_warn(mp, "%s option requires an argument",
227 "XFS: %s option requires an argument",
228 this_char); 235 this_char);
229 return EINVAL; 236 return EINVAL;
230 } 237 }
@@ -232,14 +239,12 @@ xfs_parseargs(
232 if (!mp->m_logname) 239 if (!mp->m_logname)
233 return ENOMEM; 240 return ENOMEM;
234 } else if (!strcmp(this_char, MNTOPT_MTPT)) { 241 } else if (!strcmp(this_char, MNTOPT_MTPT)) {
235 cmn_err(CE_WARN, 242 xfs_warn(mp, "%s option not allowed on this system",
236 "XFS: %s option not allowed on this system",
237 this_char); 243 this_char);
238 return EINVAL; 244 return EINVAL;
239 } else if (!strcmp(this_char, MNTOPT_RTDEV)) { 245 } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
240 if (!value || !*value) { 246 if (!value || !*value) {
241 cmn_err(CE_WARN, 247 xfs_warn(mp, "%s option requires an argument",
242 "XFS: %s option requires an argument",
243 this_char); 248 this_char);
244 return EINVAL; 249 return EINVAL;
245 } 250 }
@@ -248,8 +253,7 @@ xfs_parseargs(
248 return ENOMEM; 253 return ENOMEM;
249 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { 254 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
250 if (!value || !*value) { 255 if (!value || !*value) {
251 cmn_err(CE_WARN, 256 xfs_warn(mp, "%s option requires an argument",
252 "XFS: %s option requires an argument",
253 this_char); 257 this_char);
254 return EINVAL; 258 return EINVAL;
255 } 259 }
@@ -257,8 +261,7 @@ xfs_parseargs(
257 iosizelog = ffs(iosize) - 1; 261 iosizelog = ffs(iosize) - 1;
258 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { 262 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
259 if (!value || !*value) { 263 if (!value || !*value) {
260 cmn_err(CE_WARN, 264 xfs_warn(mp, "%s option requires an argument",
261 "XFS: %s option requires an argument",
262 this_char); 265 this_char);
263 return EINVAL; 266 return EINVAL;
264 } 267 }
@@ -280,16 +283,14 @@ xfs_parseargs(
280 mp->m_flags |= XFS_MOUNT_SWALLOC; 283 mp->m_flags |= XFS_MOUNT_SWALLOC;
281 } else if (!strcmp(this_char, MNTOPT_SUNIT)) { 284 } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
282 if (!value || !*value) { 285 if (!value || !*value) {
283 cmn_err(CE_WARN, 286 xfs_warn(mp, "%s option requires an argument",
284 "XFS: %s option requires an argument",
285 this_char); 287 this_char);
286 return EINVAL; 288 return EINVAL;
287 } 289 }
288 dsunit = simple_strtoul(value, &eov, 10); 290 dsunit = simple_strtoul(value, &eov, 10);
289 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { 291 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
290 if (!value || !*value) { 292 if (!value || !*value) {
291 cmn_err(CE_WARN, 293 xfs_warn(mp, "%s option requires an argument",
292 "XFS: %s option requires an argument",
293 this_char); 294 this_char);
294 return EINVAL; 295 return EINVAL;
295 } 296 }
@@ -297,8 +298,7 @@ xfs_parseargs(
297 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { 298 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
298 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; 299 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
299#if !XFS_BIG_INUMS 300#if !XFS_BIG_INUMS
300 cmn_err(CE_WARN, 301 xfs_warn(mp, "%s option not allowed on this system",
301 "XFS: %s option not allowed on this system",
302 this_char); 302 this_char);
303 return EINVAL; 303 return EINVAL;
304#endif 304#endif
@@ -356,20 +356,19 @@ xfs_parseargs(
356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
358 } else if (!strcmp(this_char, "ihashsize")) { 358 } else if (!strcmp(this_char, "ihashsize")) {
359 cmn_err(CE_WARN, 359 xfs_warn(mp,
360 "XFS: ihashsize no longer used, option is deprecated."); 360 "ihashsize no longer used, option is deprecated.");
361 } else if (!strcmp(this_char, "osyncisdsync")) { 361 } else if (!strcmp(this_char, "osyncisdsync")) {
362 cmn_err(CE_WARN, 362 xfs_warn(mp,
363 "XFS: osyncisdsync has no effect, option is deprecated."); 363 "osyncisdsync has no effect, option is deprecated.");
364 } else if (!strcmp(this_char, "osyncisosync")) { 364 } else if (!strcmp(this_char, "osyncisosync")) {
365 cmn_err(CE_WARN, 365 xfs_warn(mp,
366 "XFS: osyncisosync has no effect, option is deprecated."); 366 "osyncisosync has no effect, option is deprecated.");
367 } else if (!strcmp(this_char, "irixsgid")) { 367 } else if (!strcmp(this_char, "irixsgid")) {
368 cmn_err(CE_WARN, 368 xfs_warn(mp,
369 "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); 369 "irixsgid is now a sysctl(2) variable, option is deprecated.");
370 } else { 370 } else {
371 cmn_err(CE_WARN, 371 xfs_warn(mp, "unknown mount option [%s].", this_char);
372 "XFS: unknown mount option [%s].", this_char);
373 return EINVAL; 372 return EINVAL;
374 } 373 }
375 } 374 }
@@ -379,40 +378,37 @@ xfs_parseargs(
379 */ 378 */
380 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && 379 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
381 !(mp->m_flags & XFS_MOUNT_RDONLY)) { 380 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
382 cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only."); 381 xfs_warn(mp, "no-recovery mounts must be read-only.");
383 return EINVAL; 382 return EINVAL;
384 } 383 }
385 384
386 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { 385 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
387 cmn_err(CE_WARN, 386 xfs_warn(mp,
388 "XFS: sunit and swidth options incompatible with the noalign option"); 387 "sunit and swidth options incompatible with the noalign option");
389 return EINVAL; 388 return EINVAL;
390 } 389 }
391 390
392#ifndef CONFIG_XFS_QUOTA 391#ifndef CONFIG_XFS_QUOTA
393 if (XFS_IS_QUOTA_RUNNING(mp)) { 392 if (XFS_IS_QUOTA_RUNNING(mp)) {
394 cmn_err(CE_WARN, 393 xfs_warn(mp, "quota support not available in this kernel.");
395 "XFS: quota support not available in this kernel.");
396 return EINVAL; 394 return EINVAL;
397 } 395 }
398#endif 396#endif
399 397
400 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && 398 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
401 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { 399 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
402 cmn_err(CE_WARN, 400 xfs_warn(mp, "cannot mount with both project and group quota");
403 "XFS: cannot mount with both project and group quota");
404 return EINVAL; 401 return EINVAL;
405 } 402 }
406 403
407 if ((dsunit && !dswidth) || (!dsunit && dswidth)) { 404 if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
408 cmn_err(CE_WARN, 405 xfs_warn(mp, "sunit and swidth must be specified together");
409 "XFS: sunit and swidth must be specified together");
410 return EINVAL; 406 return EINVAL;
411 } 407 }
412 408
413 if (dsunit && (dswidth % dsunit != 0)) { 409 if (dsunit && (dswidth % dsunit != 0)) {
414 cmn_err(CE_WARN, 410 xfs_warn(mp,
415 "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)", 411 "stripe width (%d) must be a multiple of the stripe unit (%d)",
416 dswidth, dsunit); 412 dswidth, dsunit);
417 return EINVAL; 413 return EINVAL;
418 } 414 }
@@ -438,8 +434,7 @@ done:
438 mp->m_logbufs != 0 && 434 mp->m_logbufs != 0 &&
439 (mp->m_logbufs < XLOG_MIN_ICLOGS || 435 (mp->m_logbufs < XLOG_MIN_ICLOGS ||
440 mp->m_logbufs > XLOG_MAX_ICLOGS)) { 436 mp->m_logbufs > XLOG_MAX_ICLOGS)) {
441 cmn_err(CE_WARN, 437 xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
442 "XFS: invalid logbufs value: %d [not %d-%d]",
443 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); 438 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
444 return XFS_ERROR(EINVAL); 439 return XFS_ERROR(EINVAL);
445 } 440 }
@@ -448,22 +443,16 @@ done:
448 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || 443 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
449 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || 444 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
450 !is_power_of_2(mp->m_logbsize))) { 445 !is_power_of_2(mp->m_logbsize))) {
451 cmn_err(CE_WARN, 446 xfs_warn(mp,
452 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", 447 "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
453 mp->m_logbsize); 448 mp->m_logbsize);
454 return XFS_ERROR(EINVAL); 449 return XFS_ERROR(EINVAL);
455 } 450 }
456 451
457 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
458 if (!mp->m_fsname)
459 return ENOMEM;
460 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
461
462 if (iosizelog) { 452 if (iosizelog) {
463 if (iosizelog > XFS_MAX_IO_LOG || 453 if (iosizelog > XFS_MAX_IO_LOG ||
464 iosizelog < XFS_MIN_IO_LOG) { 454 iosizelog < XFS_MIN_IO_LOG) {
465 cmn_err(CE_WARN, 455 xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
466 "XFS: invalid log iosize: %d [not %d-%d]",
467 iosizelog, XFS_MIN_IO_LOG, 456 iosizelog, XFS_MIN_IO_LOG,
468 XFS_MAX_IO_LOG); 457 XFS_MAX_IO_LOG);
469 return XFS_ERROR(EINVAL); 458 return XFS_ERROR(EINVAL);
@@ -610,7 +599,7 @@ xfs_blkdev_get(
610 mp); 599 mp);
611 if (IS_ERR(*bdevp)) { 600 if (IS_ERR(*bdevp)) {
612 error = PTR_ERR(*bdevp); 601 error = PTR_ERR(*bdevp);
613 printk("XFS: Invalid device [%s], error=%d\n", name, error); 602 xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
614 } 603 }
615 604
616 return -error; 605 return -error;
@@ -664,23 +653,23 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp)
664 int error; 653 int error;
665 654
666 if (mp->m_logdev_targp != mp->m_ddev_targp) { 655 if (mp->m_logdev_targp != mp->m_ddev_targp) {
667 xfs_fs_cmn_err(CE_NOTE, mp, 656 xfs_notice(mp,
668 "Disabling barriers, not supported with external log device"); 657 "Disabling barriers, not supported with external log device");
669 mp->m_flags &= ~XFS_MOUNT_BARRIER; 658 mp->m_flags &= ~XFS_MOUNT_BARRIER;
670 return; 659 return;
671 } 660 }
672 661
673 if (xfs_readonly_buftarg(mp->m_ddev_targp)) { 662 if (xfs_readonly_buftarg(mp->m_ddev_targp)) {
674 xfs_fs_cmn_err(CE_NOTE, mp, 663 xfs_notice(mp,
675 "Disabling barriers, underlying device is readonly"); 664 "Disabling barriers, underlying device is readonly");
676 mp->m_flags &= ~XFS_MOUNT_BARRIER; 665 mp->m_flags &= ~XFS_MOUNT_BARRIER;
677 return; 666 return;
678 } 667 }
679 668
680 error = xfs_barrier_test(mp); 669 error = xfs_barrier_test(mp);
681 if (error) { 670 if (error) {
682 xfs_fs_cmn_err(CE_NOTE, mp, 671 xfs_notice(mp,
683 "Disabling barriers, trial barrier write failed"); 672 "Disabling barriers, trial barrier write failed");
684 mp->m_flags &= ~XFS_MOUNT_BARRIER; 673 mp->m_flags &= ~XFS_MOUNT_BARRIER;
685 return; 674 return;
686 } 675 }
@@ -743,8 +732,8 @@ xfs_open_devices(
743 goto out_close_logdev; 732 goto out_close_logdev;
744 733
745 if (rtdev == ddev || rtdev == logdev) { 734 if (rtdev == ddev || rtdev == logdev) {
746 cmn_err(CE_WARN, 735 xfs_warn(mp,
747 "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); 736 "Cannot mount filesystem with identical rtdev and ddev/logdev.");
748 error = EINVAL; 737 error = EINVAL;
749 goto out_close_rtdev; 738 goto out_close_rtdev;
750 } 739 }
@@ -827,75 +816,6 @@ xfs_setup_devices(
827 return 0; 816 return 0;
828} 817}
829 818
830/*
831 * XFS AIL push thread support
832 */
833void
834xfsaild_wakeup(
835 struct xfs_ail *ailp,
836 xfs_lsn_t threshold_lsn)
837{
838 /* only ever move the target forwards */
839 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
840 ailp->xa_target = threshold_lsn;
841 wake_up_process(ailp->xa_task);
842 }
843}
844
845STATIC int
846xfsaild(
847 void *data)
848{
849 struct xfs_ail *ailp = data;
850 xfs_lsn_t last_pushed_lsn = 0;
851 long tout = 0; /* milliseconds */
852
853 while (!kthread_should_stop()) {
854 /*
855 * for short sleeps indicating congestion, don't allow us to
856 * get woken early. Otherwise all we do is bang on the AIL lock
857 * without making progress.
858 */
859 if (tout && tout <= 20)
860 __set_current_state(TASK_KILLABLE);
861 else
862 __set_current_state(TASK_INTERRUPTIBLE);
863 schedule_timeout(tout ?
864 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
865
866 /* swsusp */
867 try_to_freeze();
868
869 ASSERT(ailp->xa_mount->m_log);
870 if (XFS_FORCED_SHUTDOWN(ailp->xa_mount))
871 continue;
872
873 tout = xfsaild_push(ailp, &last_pushed_lsn);
874 }
875
876 return 0;
877} /* xfsaild */
878
879int
880xfsaild_start(
881 struct xfs_ail *ailp)
882{
883 ailp->xa_target = 0;
884 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
885 ailp->xa_mount->m_fsname);
886 if (IS_ERR(ailp->xa_task))
887 return -PTR_ERR(ailp->xa_task);
888 return 0;
889}
890
891void
892xfsaild_stop(
893 struct xfs_ail *ailp)
894{
895 kthread_stop(ailp->xa_task);
896}
897
898
899/* Catch misguided souls that try to use this interface on XFS */ 819/* Catch misguided souls that try to use this interface on XFS */
900STATIC struct inode * 820STATIC struct inode *
901xfs_fs_alloc_inode( 821xfs_fs_alloc_inode(
@@ -1089,7 +1009,7 @@ xfs_fs_write_inode(
1089 error = 0; 1009 error = 0;
1090 goto out_unlock; 1010 goto out_unlock;
1091 } 1011 }
1092 error = xfs_iflush(ip, 0); 1012 error = xfs_iflush(ip, SYNC_TRYLOCK);
1093 } 1013 }
1094 1014
1095 out_unlock: 1015 out_unlock:
@@ -1202,22 +1122,12 @@ xfs_fs_sync_fs(
1202 return -error; 1122 return -error;
1203 1123
1204 if (laptop_mode) { 1124 if (laptop_mode) {
1205 int prev_sync_seq = mp->m_sync_seq;
1206
1207 /* 1125 /*
1208 * The disk must be active because we're syncing. 1126 * The disk must be active because we're syncing.
1209 * We schedule xfssyncd now (now that the disk is 1127 * We schedule xfssyncd now (now that the disk is
1210 * active) instead of later (when it might not be). 1128 * active) instead of later (when it might not be).
1211 */ 1129 */
1212 wake_up_process(mp->m_sync_task); 1130 flush_delayed_work_sync(&mp->m_sync_work);
1213 /*
1214 * We have to wait for the sync iteration to complete.
1215 * If we don't, the disk activity caused by the sync
1216 * will come after the sync is completed, and that
1217 * triggers another sync from laptop mode.
1218 */
1219 wait_event(mp->m_wait_single_sync_task,
1220 mp->m_sync_seq != prev_sync_seq);
1221 } 1131 }
1222 1132
1223 return 0; 1133 return 0;
@@ -1345,8 +1255,8 @@ xfs_fs_remount(
1345 * options that we can't actually change. 1255 * options that we can't actually change.
1346 */ 1256 */
1347#if 0 1257#if 0
1348 printk(KERN_INFO 1258 xfs_info(mp,
1349 "XFS: mount option \"%s\" not supported for remount\n", p); 1259 "mount option \"%s\" not supported for remount\n", p);
1350 return -EINVAL; 1260 return -EINVAL;
1351#else 1261#else
1352 break; 1262 break;
@@ -1367,8 +1277,7 @@ xfs_fs_remount(
1367 if (mp->m_update_flags) { 1277 if (mp->m_update_flags) {
1368 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1278 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1369 if (error) { 1279 if (error) {
1370 cmn_err(CE_WARN, 1280 xfs_warn(mp, "failed to write sb changes");
1371 "XFS: failed to write sb changes");
1372 return error; 1281 return error;
1373 } 1282 }
1374 mp->m_update_flags = 0; 1283 mp->m_update_flags = 0;
@@ -1452,15 +1361,15 @@ xfs_finish_flags(
1452 mp->m_logbsize = mp->m_sb.sb_logsunit; 1361 mp->m_logbsize = mp->m_sb.sb_logsunit;
1453 } else if (mp->m_logbsize > 0 && 1362 } else if (mp->m_logbsize > 0 &&
1454 mp->m_logbsize < mp->m_sb.sb_logsunit) { 1363 mp->m_logbsize < mp->m_sb.sb_logsunit) {
1455 cmn_err(CE_WARN, 1364 xfs_warn(mp,
1456 "XFS: logbuf size must be greater than or equal to log stripe size"); 1365 "logbuf size must be greater than or equal to log stripe size");
1457 return XFS_ERROR(EINVAL); 1366 return XFS_ERROR(EINVAL);
1458 } 1367 }
1459 } else { 1368 } else {
1460 /* Fail a mount if the logbuf is larger than 32K */ 1369 /* Fail a mount if the logbuf is larger than 32K */
1461 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { 1370 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
1462 cmn_err(CE_WARN, 1371 xfs_warn(mp,
1463 "XFS: logbuf size for version 1 logs must be 16K or 32K"); 1372 "logbuf size for version 1 logs must be 16K or 32K");
1464 return XFS_ERROR(EINVAL); 1373 return XFS_ERROR(EINVAL);
1465 } 1374 }
1466 } 1375 }
@@ -1477,8 +1386,8 @@ xfs_finish_flags(
1477 * prohibit r/w mounts of read-only filesystems 1386 * prohibit r/w mounts of read-only filesystems
1478 */ 1387 */
1479 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { 1388 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
1480 cmn_err(CE_WARN, 1389 xfs_warn(mp,
1481 "XFS: cannot mount a read-only filesystem as read-write"); 1390 "cannot mount a read-only filesystem as read-write");
1482 return XFS_ERROR(EROFS); 1391 return XFS_ERROR(EROFS);
1483 } 1392 }
1484 1393
@@ -1502,9 +1411,6 @@ xfs_fs_fill_super(
1502 spin_lock_init(&mp->m_sb_lock); 1411 spin_lock_init(&mp->m_sb_lock);
1503 mutex_init(&mp->m_growlock); 1412 mutex_init(&mp->m_growlock);
1504 atomic_set(&mp->m_active_trans, 0); 1413 atomic_set(&mp->m_active_trans, 0);
1505 INIT_LIST_HEAD(&mp->m_sync_list);
1506 spin_lock_init(&mp->m_sync_lock);
1507 init_waitqueue_head(&mp->m_wait_single_sync_task);
1508 1414
1509 mp->m_super = sb; 1415 mp->m_super = sb;
1510 sb->s_fs_info = mp; 1416 sb->s_fs_info = mp;
@@ -1551,10 +1457,14 @@ xfs_fs_fill_super(
1551 if (error) 1457 if (error)
1552 goto out_free_sb; 1458 goto out_free_sb;
1553 1459
1554 error = xfs_mountfs(mp); 1460 /*
1555 if (error) 1461 * we must configure the block size in the superblock before we run the
1556 goto out_filestream_unmount; 1462 * full mount process as the mount process can lookup and cache inodes.
1557 1463 * For the same reason we must also initialise the syncd and register
1464 * the inode cache shrinker so that inodes can be reclaimed during
1465 * operations like a quotacheck that iterate all inodes in the
1466 * filesystem.
1467 */
1558 sb->s_magic = XFS_SB_MAGIC; 1468 sb->s_magic = XFS_SB_MAGIC;
1559 sb->s_blocksize = mp->m_sb.sb_blocksize; 1469 sb->s_blocksize = mp->m_sb.sb_blocksize;
1560 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; 1470 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1562,6 +1472,16 @@ xfs_fs_fill_super(
1562 sb->s_time_gran = 1; 1472 sb->s_time_gran = 1;
1563 set_posix_acl_flag(sb); 1473 set_posix_acl_flag(sb);
1564 1474
1475 error = xfs_syncd_init(mp);
1476 if (error)
1477 goto out_filestream_unmount;
1478
1479 xfs_inode_shrinker_register(mp);
1480
1481 error = xfs_mountfs(mp);
1482 if (error)
1483 goto out_syncd_stop;
1484
1565 root = igrab(VFS_I(mp->m_rootip)); 1485 root = igrab(VFS_I(mp->m_rootip));
1566 if (!root) { 1486 if (!root) {
1567 error = ENOENT; 1487 error = ENOENT;
@@ -1577,14 +1497,11 @@ xfs_fs_fill_super(
1577 goto fail_vnrele; 1497 goto fail_vnrele;
1578 } 1498 }
1579 1499
1580 error = xfs_syncd_init(mp);
1581 if (error)
1582 goto fail_vnrele;
1583
1584 xfs_inode_shrinker_register(mp);
1585
1586 return 0; 1500 return 0;
1587 1501
1502 out_syncd_stop:
1503 xfs_inode_shrinker_unregister(mp);
1504 xfs_syncd_stop(mp);
1588 out_filestream_unmount: 1505 out_filestream_unmount:
1589 xfs_filestream_unmount(mp); 1506 xfs_filestream_unmount(mp);
1590 out_free_sb: 1507 out_free_sb:
@@ -1608,6 +1525,9 @@ xfs_fs_fill_super(
1608 } 1525 }
1609 1526
1610 fail_unmount: 1527 fail_unmount:
1528 xfs_inode_shrinker_unregister(mp);
1529 xfs_syncd_stop(mp);
1530
1611 /* 1531 /*
1612 * Blow away any referenced inode in the filestreams cache. 1532 * Blow away any referenced inode in the filestreams cache.
1613 * This can and will cause log traffic as inodes go inactive 1533 * This can and will cause log traffic as inodes go inactive
@@ -1797,6 +1717,38 @@ xfs_destroy_zones(void)
1797} 1717}
1798 1718
1799STATIC int __init 1719STATIC int __init
1720xfs_init_workqueues(void)
1721{
1722 /*
1723 * max_active is set to 8 to give enough concurency to allow
1724 * multiple work operations on each CPU to run. This allows multiple
1725 * filesystems to be running sync work concurrently, and scales with
1726 * the number of CPUs in the system.
1727 */
1728 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
1729 if (!xfs_syncd_wq)
1730 goto out;
1731
1732 xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8);
1733 if (!xfs_ail_wq)
1734 goto out_destroy_syncd;
1735
1736 return 0;
1737
1738out_destroy_syncd:
1739 destroy_workqueue(xfs_syncd_wq);
1740out:
1741 return -ENOMEM;
1742}
1743
1744STATIC void
1745xfs_destroy_workqueues(void)
1746{
1747 destroy_workqueue(xfs_ail_wq);
1748 destroy_workqueue(xfs_syncd_wq);
1749}
1750
1751STATIC int __init
1800init_xfs_fs(void) 1752init_xfs_fs(void)
1801{ 1753{
1802 int error; 1754 int error;
@@ -1811,10 +1763,14 @@ init_xfs_fs(void)
1811 if (error) 1763 if (error)
1812 goto out; 1764 goto out;
1813 1765
1814 error = xfs_mru_cache_init(); 1766 error = xfs_init_workqueues();
1815 if (error) 1767 if (error)
1816 goto out_destroy_zones; 1768 goto out_destroy_zones;
1817 1769
1770 error = xfs_mru_cache_init();
1771 if (error)
1772 goto out_destroy_wq;
1773
1818 error = xfs_filestream_init(); 1774 error = xfs_filestream_init();
1819 if (error) 1775 if (error)
1820 goto out_mru_cache_uninit; 1776 goto out_mru_cache_uninit;
@@ -1831,6 +1787,10 @@ init_xfs_fs(void)
1831 if (error) 1787 if (error)
1832 goto out_cleanup_procfs; 1788 goto out_cleanup_procfs;
1833 1789
1790 error = xfs_init_workqueues();
1791 if (error)
1792 goto out_sysctl_unregister;
1793
1834 vfs_initquota(); 1794 vfs_initquota();
1835 1795
1836 error = register_filesystem(&xfs_fs_type); 1796 error = register_filesystem(&xfs_fs_type);
@@ -1848,6 +1808,8 @@ init_xfs_fs(void)
1848 xfs_filestream_uninit(); 1808 xfs_filestream_uninit();
1849 out_mru_cache_uninit: 1809 out_mru_cache_uninit:
1850 xfs_mru_cache_uninit(); 1810 xfs_mru_cache_uninit();
1811 out_destroy_wq:
1812 xfs_destroy_workqueues();
1851 out_destroy_zones: 1813 out_destroy_zones:
1852 xfs_destroy_zones(); 1814 xfs_destroy_zones();
1853 out: 1815 out:
@@ -1864,6 +1826,7 @@ exit_xfs_fs(void)
1864 xfs_buf_terminate(); 1826 xfs_buf_terminate();
1865 xfs_filestream_uninit(); 1827 xfs_filestream_uninit();
1866 xfs_mru_cache_uninit(); 1828 xfs_mru_cache_uninit();
1829 xfs_destroy_workqueues();
1867 xfs_destroy_zones(); 1830 xfs_destroy_zones();
1868} 1831}
1869 1832
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index e22f0057d21f..e4f9c1b0836c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -22,6 +22,7 @@
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
25#include "xfs_sb.h" 26#include "xfs_sb.h"
26#include "xfs_ag.h" 27#include "xfs_ag.h"
27#include "xfs_mount.h" 28#include "xfs_mount.h"
@@ -39,6 +40,8 @@
39#include <linux/kthread.h> 40#include <linux/kthread.h>
40#include <linux/freezer.h> 41#include <linux/freezer.h>
41 42
43struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
44
42/* 45/*
43 * The inode lookup is done in batches to keep the amount of lock traffic and 46 * The inode lookup is done in batches to keep the amount of lock traffic and
44 * radix tree lookups to a minimum. The batch size is a trade off between 47 * radix tree lookups to a minimum. The batch size is a trade off between
@@ -401,7 +404,7 @@ xfs_quiesce_fs(
401/* 404/*
402 * Second stage of a quiesce. The data is already synced, now we have to take 405 * Second stage of a quiesce. The data is already synced, now we have to take
403 * care of the metadata. New transactions are already blocked, so we need to 406 * care of the metadata. New transactions are already blocked, so we need to
404 * wait for any remaining transactions to drain out before proceding. 407 * wait for any remaining transactions to drain out before proceeding.
405 */ 408 */
406void 409void
407xfs_quiesce_attr( 410xfs_quiesce_attr(
@@ -425,69 +428,18 @@ xfs_quiesce_attr(
425 /* Push the superblock and write an unmount record */ 428 /* Push the superblock and write an unmount record */
426 error = xfs_log_sbcount(mp, 1); 429 error = xfs_log_sbcount(mp, 1);
427 if (error) 430 if (error)
428 xfs_fs_cmn_err(CE_WARN, mp, 431 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
429 "xfs_attr_quiesce: failed to log sb changes. "
430 "Frozen image may not be consistent."); 432 "Frozen image may not be consistent.");
431 xfs_log_unmount_write(mp); 433 xfs_log_unmount_write(mp);
432 xfs_unmountfs_writesb(mp); 434 xfs_unmountfs_writesb(mp);
433} 435}
434 436
435/* 437static void
436 * Enqueue a work item to be picked up by the vfs xfssyncd thread. 438xfs_syncd_queue_sync(
437 * Doing this has two advantages: 439 struct xfs_mount *mp)
438 * - It saves on stack space, which is tight in certain situations
439 * - It can be used (with care) as a mechanism to avoid deadlocks.
440 * Flushing while allocating in a full filesystem requires both.
441 */
442STATIC void
443xfs_syncd_queue_work(
444 struct xfs_mount *mp,
445 void *data,
446 void (*syncer)(struct xfs_mount *, void *),
447 struct completion *completion)
448{
449 struct xfs_sync_work *work;
450
451 work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
452 INIT_LIST_HEAD(&work->w_list);
453 work->w_syncer = syncer;
454 work->w_data = data;
455 work->w_mount = mp;
456 work->w_completion = completion;
457 spin_lock(&mp->m_sync_lock);
458 list_add_tail(&work->w_list, &mp->m_sync_list);
459 spin_unlock(&mp->m_sync_lock);
460 wake_up_process(mp->m_sync_task);
461}
462
463/*
464 * Flush delayed allocate data, attempting to free up reserved space
465 * from existing allocations. At this point a new allocation attempt
466 * has failed with ENOSPC and we are in the process of scratching our
467 * heads, looking about for more room...
468 */
469STATIC void
470xfs_flush_inodes_work(
471 struct xfs_mount *mp,
472 void *arg)
473{
474 struct inode *inode = arg;
475 xfs_sync_data(mp, SYNC_TRYLOCK);
476 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
477 iput(inode);
478}
479
480void
481xfs_flush_inodes(
482 xfs_inode_t *ip)
483{ 440{
484 struct inode *inode = VFS_I(ip); 441 queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
485 DECLARE_COMPLETION_ONSTACK(completion); 442 msecs_to_jiffies(xfs_syncd_centisecs * 10));
486
487 igrab(inode);
488 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
489 wait_for_completion(&completion);
490 xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
491} 443}
492 444
493/* 445/*
@@ -497,9 +449,10 @@ xfs_flush_inodes(
497 */ 449 */
498STATIC void 450STATIC void
499xfs_sync_worker( 451xfs_sync_worker(
500 struct xfs_mount *mp, 452 struct work_struct *work)
501 void *unused)
502{ 453{
454 struct xfs_mount *mp = container_of(to_delayed_work(work),
455 struct xfs_mount, m_sync_work);
503 int error; 456 int error;
504 457
505 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 458 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
@@ -509,73 +462,106 @@ xfs_sync_worker(
509 error = xfs_fs_log_dummy(mp); 462 error = xfs_fs_log_dummy(mp);
510 else 463 else
511 xfs_log_force(mp, 0); 464 xfs_log_force(mp, 0);
512 xfs_reclaim_inodes(mp, 0);
513 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 465 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
466
467 /* start pushing all the metadata that is currently dirty */
468 xfs_ail_push_all(mp->m_ail);
514 } 469 }
515 mp->m_sync_seq++; 470
516 wake_up(&mp->m_wait_single_sync_task); 471 /* queue us up again */
472 xfs_syncd_queue_sync(mp);
517} 473}
518 474
519STATIC int 475/*
520xfssyncd( 476 * Queue a new inode reclaim pass if there are reclaimable inodes and there
521 void *arg) 477 * isn't a reclaim pass already in progress. By default it runs every 5s based
478 * on the xfs syncd work default of 30s. Perhaps this should have it's own
479 * tunable, but that can be done if this method proves to be ineffective or too
480 * aggressive.
481 */
482static void
483xfs_syncd_queue_reclaim(
484 struct xfs_mount *mp)
522{ 485{
523 struct xfs_mount *mp = arg;
524 long timeleft;
525 xfs_sync_work_t *work, *n;
526 LIST_HEAD (tmp);
527
528 set_freezable();
529 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
530 for (;;) {
531 if (list_empty(&mp->m_sync_list))
532 timeleft = schedule_timeout_interruptible(timeleft);
533 /* swsusp */
534 try_to_freeze();
535 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
536 break;
537 486
538 spin_lock(&mp->m_sync_lock); 487 /*
539 /* 488 * We can have inodes enter reclaim after we've shut down the syncd
540 * We can get woken by laptop mode, to do a sync - 489 * workqueue during unmount, so don't allow reclaim work to be queued
541 * that's the (only!) case where the list would be 490 * during unmount.
542 * empty with time remaining. 491 */
543 */ 492 if (!(mp->m_super->s_flags & MS_ACTIVE))
544 if (!timeleft || list_empty(&mp->m_sync_list)) { 493 return;
545 if (!timeleft)
546 timeleft = xfs_syncd_centisecs *
547 msecs_to_jiffies(10);
548 INIT_LIST_HEAD(&mp->m_sync_work.w_list);
549 list_add_tail(&mp->m_sync_work.w_list,
550 &mp->m_sync_list);
551 }
552 list_splice_init(&mp->m_sync_list, &tmp);
553 spin_unlock(&mp->m_sync_lock);
554 494
555 list_for_each_entry_safe(work, n, &tmp, w_list) { 495 rcu_read_lock();
556 (*work->w_syncer)(mp, work->w_data); 496 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
557 list_del(&work->w_list); 497 queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
558 if (work == &mp->m_sync_work) 498 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
559 continue;
560 if (work->w_completion)
561 complete(work->w_completion);
562 kmem_free(work);
563 }
564 } 499 }
500 rcu_read_unlock();
501}
565 502
566 return 0; 503/*
504 * This is a fast pass over the inode cache to try to get reclaim moving on as
505 * many inodes as possible in a short period of time. It kicks itself every few
506 * seconds, as well as being kicked by the inode cache shrinker when memory
507 * goes low. It scans as quickly as possible avoiding locked inodes or those
508 * already being flushed, and once done schedules a future pass.
509 */
510STATIC void
511xfs_reclaim_worker(
512 struct work_struct *work)
513{
514 struct xfs_mount *mp = container_of(to_delayed_work(work),
515 struct xfs_mount, m_reclaim_work);
516
517 xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
518 xfs_syncd_queue_reclaim(mp);
519}
520
521/*
522 * Flush delayed allocate data, attempting to free up reserved space
523 * from existing allocations. At this point a new allocation attempt
524 * has failed with ENOSPC and we are in the process of scratching our
525 * heads, looking about for more room.
526 *
527 * Queue a new data flush if there isn't one already in progress and
528 * wait for completion of the flush. This means that we only ever have one
529 * inode flush in progress no matter how many ENOSPC events are occurring and
530 * so will prevent the system from bogging down due to every concurrent
531 * ENOSPC event scanning all the active inodes in the system for writeback.
532 */
533void
534xfs_flush_inodes(
535 struct xfs_inode *ip)
536{
537 struct xfs_mount *mp = ip->i_mount;
538
539 queue_work(xfs_syncd_wq, &mp->m_flush_work);
540 flush_work_sync(&mp->m_flush_work);
541}
542
543STATIC void
544xfs_flush_worker(
545 struct work_struct *work)
546{
547 struct xfs_mount *mp = container_of(work,
548 struct xfs_mount, m_flush_work);
549
550 xfs_sync_data(mp, SYNC_TRYLOCK);
551 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
567} 552}
568 553
569int 554int
570xfs_syncd_init( 555xfs_syncd_init(
571 struct xfs_mount *mp) 556 struct xfs_mount *mp)
572{ 557{
573 mp->m_sync_work.w_syncer = xfs_sync_worker; 558 INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
574 mp->m_sync_work.w_mount = mp; 559 INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
575 mp->m_sync_work.w_completion = NULL; 560 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
576 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname); 561
577 if (IS_ERR(mp->m_sync_task)) 562 xfs_syncd_queue_sync(mp);
578 return -PTR_ERR(mp->m_sync_task); 563 xfs_syncd_queue_reclaim(mp);
564
579 return 0; 565 return 0;
580} 566}
581 567
@@ -583,7 +569,9 @@ void
583xfs_syncd_stop( 569xfs_syncd_stop(
584 struct xfs_mount *mp) 570 struct xfs_mount *mp)
585{ 571{
586 kthread_stop(mp->m_sync_task); 572 cancel_delayed_work_sync(&mp->m_sync_work);
573 cancel_delayed_work_sync(&mp->m_reclaim_work);
574 cancel_work_sync(&mp->m_flush_work);
587} 575}
588 576
589void 577void
@@ -602,6 +590,10 @@ __xfs_inode_set_reclaim_tag(
602 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 590 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
603 XFS_ICI_RECLAIM_TAG); 591 XFS_ICI_RECLAIM_TAG);
604 spin_unlock(&ip->i_mount->m_perag_lock); 592 spin_unlock(&ip->i_mount->m_perag_lock);
593
594 /* schedule periodic background inode reclaim */
595 xfs_syncd_queue_reclaim(ip->i_mount);
596
605 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, 597 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
606 -1, _RET_IP_); 598 -1, _RET_IP_);
607 } 599 }
@@ -762,8 +754,10 @@ xfs_reclaim_inode(
762 struct xfs_perag *pag, 754 struct xfs_perag *pag,
763 int sync_mode) 755 int sync_mode)
764{ 756{
765 int error = 0; 757 int error;
766 758
759restart:
760 error = 0;
767 xfs_ilock(ip, XFS_ILOCK_EXCL); 761 xfs_ilock(ip, XFS_ILOCK_EXCL);
768 if (!xfs_iflock_nowait(ip)) { 762 if (!xfs_iflock_nowait(ip)) {
769 if (!(sync_mode & SYNC_WAIT)) 763 if (!(sync_mode & SYNC_WAIT))
@@ -789,9 +783,31 @@ xfs_reclaim_inode(
789 if (xfs_inode_clean(ip)) 783 if (xfs_inode_clean(ip))
790 goto reclaim; 784 goto reclaim;
791 785
792 /* Now we have an inode that needs flushing */ 786 /*
793 error = xfs_iflush(ip, sync_mode); 787 * Now we have an inode that needs flushing.
788 *
789 * We do a nonblocking flush here even if we are doing a SYNC_WAIT
790 * reclaim as we can deadlock with inode cluster removal.
791 * xfs_ifree_cluster() can lock the inode buffer before it locks the
792 * ip->i_lock, and we are doing the exact opposite here. As a result,
793 * doing a blocking xfs_itobp() to get the cluster buffer will result
794 * in an ABBA deadlock with xfs_ifree_cluster().
795 *
796 * As xfs_ifree_cluser() must gather all inodes that are active in the
797 * cache to mark them stale, if we hit this case we don't actually want
798 * to do IO here - we want the inode marked stale so we can simply
799 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
800 * just unlock the inode, back off and try again. Hopefully the next
801 * pass through will see the stale flag set on the inode.
802 */
803 error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
794 if (sync_mode & SYNC_WAIT) { 804 if (sync_mode & SYNC_WAIT) {
805 if (error == EAGAIN) {
806 xfs_iunlock(ip, XFS_ILOCK_EXCL);
807 /* backoff longer than in xfs_ifree_cluster */
808 delay(2);
809 goto restart;
810 }
795 xfs_iflock(ip); 811 xfs_iflock(ip);
796 goto reclaim; 812 goto reclaim;
797 } 813 }
@@ -806,7 +822,7 @@ xfs_reclaim_inode(
806 * pass on the error. 822 * pass on the error.
807 */ 823 */
808 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { 824 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
809 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 825 xfs_warn(ip->i_mount,
810 "inode 0x%llx background reclaim flush failed with %d", 826 "inode 0x%llx background reclaim flush failed with %d",
811 (long long)ip->i_ino, error); 827 (long long)ip->i_ino, error);
812 } 828 }
@@ -994,7 +1010,13 @@ xfs_reclaim_inodes(
994} 1010}
995 1011
996/* 1012/*
997 * Shrinker infrastructure. 1013 * Inode cache shrinker.
1014 *
1015 * When called we make sure that there is a background (fast) inode reclaim in
1016 * progress, while we will throttle the speed of reclaim via doiing synchronous
1017 * reclaim of inodes. That means if we come across dirty inodes, we wait for
1018 * them to be cleaned, which we hope will not be very long due to the
1019 * background walker having already kicked the IO off on those dirty inodes.
998 */ 1020 */
999static int 1021static int
1000xfs_reclaim_inode_shrink( 1022xfs_reclaim_inode_shrink(
@@ -1009,10 +1031,15 @@ xfs_reclaim_inode_shrink(
1009 1031
1010 mp = container_of(shrink, struct xfs_mount, m_inode_shrink); 1032 mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
1011 if (nr_to_scan) { 1033 if (nr_to_scan) {
1034 /* kick background reclaimer and push the AIL */
1035 xfs_syncd_queue_reclaim(mp);
1036 xfs_ail_push_all(mp->m_ail);
1037
1012 if (!(gfp_mask & __GFP_FS)) 1038 if (!(gfp_mask & __GFP_FS))
1013 return -1; 1039 return -1;
1014 1040
1015 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan); 1041 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT,
1042 &nr_to_scan);
1016 /* terminate if we don't exhaust the scan */ 1043 /* terminate if we don't exhaust the scan */
1017 if (nr_to_scan > 0) 1044 if (nr_to_scan > 0)
1018 return -1; 1045 return -1;
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 32ba6628290c..e3a6ad27415f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -32,6 +32,8 @@ typedef struct xfs_sync_work {
32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ 32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
33#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ 33#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
34 34
35extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
36
35int xfs_syncd_init(struct xfs_mount *mp); 37int xfs_syncd_init(struct xfs_mount *mp);
36void xfs_syncd_stop(struct xfs_mount *mp); 38void xfs_syncd_stop(struct xfs_mount *mp);
37 39
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index ee3cee097e7e..ee2d2adaa438 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -37,7 +37,7 @@ xfs_stats_clear_proc_handler(
37 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); 37 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
38 38
39 if (!ret && write && *valp) { 39 if (!ret && write && *valp) {
40 printk("XFS Clearing xfsstats\n"); 40 xfs_notice(NULL, "Clearing xfsstats");
41 for_each_possible_cpu(c) { 41 for_each_possible_cpu(c) {
42 preempt_disable(); 42 preempt_disable();
43 /* save vn_active, it's a universal truth! */ 43 /* save vn_active, it's a universal truth! */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index d22aa3103106..6fa214603819 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -544,9 +544,10 @@ xfs_qm_dqtobp(
544 /* 544 /*
545 * A simple sanity check in case we got a corrupted dquot... 545 * A simple sanity check in case we got a corrupted dquot...
546 */ 546 */
547 if (xfs_qm_dqcheck(ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, 547 error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
548 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), 548 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
549 "dqtobp")) { 549 "dqtobp");
550 if (error) {
550 if (!(flags & XFS_QMOPT_DQREPAIR)) { 551 if (!(flags & XFS_QMOPT_DQREPAIR)) {
551 xfs_trans_brelse(tp, bp); 552 xfs_trans_brelse(tp, bp);
552 return XFS_ERROR(EIO); 553 return XFS_ERROR(EIO);
@@ -599,7 +600,7 @@ xfs_qm_dqread(
599 600
600 /* 601 /*
601 * Reservation counters are defined as reservation plus current usage 602 * Reservation counters are defined as reservation plus current usage
602 * to avoid having to add everytime. 603 * to avoid having to add every time.
603 */ 604 */
604 dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount); 605 dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
605 dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); 606 dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
@@ -827,7 +828,7 @@ xfs_qm_dqget(
827 if (xfs_do_dqerror) { 828 if (xfs_do_dqerror) {
828 if ((xfs_dqerror_target == mp->m_ddev_targp) && 829 if ((xfs_dqerror_target == mp->m_ddev_targp) &&
829 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { 830 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
830 cmn_err(CE_DEBUG, "Returning error in dqget"); 831 xfs_debug(mp, "Returning error in dqget");
831 return (EIO); 832 return (EIO);
832 } 833 }
833 } 834 }
@@ -1207,8 +1208,9 @@ xfs_qm_dqflush(
1207 /* 1208 /*
1208 * A simple sanity check in case we got a corrupted dquot.. 1209 * A simple sanity check in case we got a corrupted dquot..
1209 */ 1210 */
1210 if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 0, 1211 error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
1211 XFS_QMOPT_DOWARN, "dqflush (incore copy)")) { 1212 XFS_QMOPT_DOWARN, "dqflush (incore copy)");
1213 if (error) {
1212 xfs_buf_relse(bp); 1214 xfs_buf_relse(bp);
1213 xfs_dqfunlock(dqp); 1215 xfs_dqfunlock(dqp);
1214 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1216 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -1391,8 +1393,8 @@ xfs_qm_dqpurge(
1391 */ 1393 */
1392 error = xfs_qm_dqflush(dqp, SYNC_WAIT); 1394 error = xfs_qm_dqflush(dqp, SYNC_WAIT);
1393 if (error) 1395 if (error)
1394 xfs_fs_cmn_err(CE_WARN, mp, 1396 xfs_warn(mp, "%s: dquot %p flush failed",
1395 "xfs_qm_dqpurge: dquot %p flush failed", dqp); 1397 __func__, dqp);
1396 xfs_dqflock(dqp); 1398 xfs_dqflock(dqp);
1397 } 1399 }
1398 ASSERT(atomic_read(&dqp->q_pincount) == 0); 1400 ASSERT(atomic_read(&dqp->q_pincount) == 0);
@@ -1425,36 +1427,38 @@ xfs_qm_dqpurge(
1425void 1427void
1426xfs_qm_dqprint(xfs_dquot_t *dqp) 1428xfs_qm_dqprint(xfs_dquot_t *dqp)
1427{ 1429{
1428 cmn_err(CE_DEBUG, "-----------KERNEL DQUOT----------------"); 1430 struct xfs_mount *mp = dqp->q_mount;
1429 cmn_err(CE_DEBUG, "---- dquotID = %d", 1431
1432 xfs_debug(mp, "-----------KERNEL DQUOT----------------");
1433 xfs_debug(mp, "---- dquotID = %d",
1430 (int)be32_to_cpu(dqp->q_core.d_id)); 1434 (int)be32_to_cpu(dqp->q_core.d_id));
1431 cmn_err(CE_DEBUG, "---- type = %s", DQFLAGTO_TYPESTR(dqp)); 1435 xfs_debug(mp, "---- type = %s", DQFLAGTO_TYPESTR(dqp));
1432 cmn_err(CE_DEBUG, "---- fs = 0x%p", dqp->q_mount); 1436 xfs_debug(mp, "---- fs = 0x%p", dqp->q_mount);
1433 cmn_err(CE_DEBUG, "---- blkno = 0x%x", (int) dqp->q_blkno); 1437 xfs_debug(mp, "---- blkno = 0x%x", (int) dqp->q_blkno);
1434 cmn_err(CE_DEBUG, "---- boffset = 0x%x", (int) dqp->q_bufoffset); 1438 xfs_debug(mp, "---- boffset = 0x%x", (int) dqp->q_bufoffset);
1435 cmn_err(CE_DEBUG, "---- blkhlimit = %Lu (0x%x)", 1439 xfs_debug(mp, "---- blkhlimit = %Lu (0x%x)",
1436 be64_to_cpu(dqp->q_core.d_blk_hardlimit), 1440 be64_to_cpu(dqp->q_core.d_blk_hardlimit),
1437 (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit)); 1441 (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit));
1438 cmn_err(CE_DEBUG, "---- blkslimit = %Lu (0x%x)", 1442 xfs_debug(mp, "---- blkslimit = %Lu (0x%x)",
1439 be64_to_cpu(dqp->q_core.d_blk_softlimit), 1443 be64_to_cpu(dqp->q_core.d_blk_softlimit),
1440 (int)be64_to_cpu(dqp->q_core.d_blk_softlimit)); 1444 (int)be64_to_cpu(dqp->q_core.d_blk_softlimit));
1441 cmn_err(CE_DEBUG, "---- inohlimit = %Lu (0x%x)", 1445 xfs_debug(mp, "---- inohlimit = %Lu (0x%x)",
1442 be64_to_cpu(dqp->q_core.d_ino_hardlimit), 1446 be64_to_cpu(dqp->q_core.d_ino_hardlimit),
1443 (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit)); 1447 (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit));
1444 cmn_err(CE_DEBUG, "---- inoslimit = %Lu (0x%x)", 1448 xfs_debug(mp, "---- inoslimit = %Lu (0x%x)",
1445 be64_to_cpu(dqp->q_core.d_ino_softlimit), 1449 be64_to_cpu(dqp->q_core.d_ino_softlimit),
1446 (int)be64_to_cpu(dqp->q_core.d_ino_softlimit)); 1450 (int)be64_to_cpu(dqp->q_core.d_ino_softlimit));
1447 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", 1451 xfs_debug(mp, "---- bcount = %Lu (0x%x)",
1448 be64_to_cpu(dqp->q_core.d_bcount), 1452 be64_to_cpu(dqp->q_core.d_bcount),
1449 (int)be64_to_cpu(dqp->q_core.d_bcount)); 1453 (int)be64_to_cpu(dqp->q_core.d_bcount));
1450 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", 1454 xfs_debug(mp, "---- icount = %Lu (0x%x)",
1451 be64_to_cpu(dqp->q_core.d_icount), 1455 be64_to_cpu(dqp->q_core.d_icount),
1452 (int)be64_to_cpu(dqp->q_core.d_icount)); 1456 (int)be64_to_cpu(dqp->q_core.d_icount));
1453 cmn_err(CE_DEBUG, "---- btimer = %d", 1457 xfs_debug(mp, "---- btimer = %d",
1454 (int)be32_to_cpu(dqp->q_core.d_btimer)); 1458 (int)be32_to_cpu(dqp->q_core.d_btimer));
1455 cmn_err(CE_DEBUG, "---- itimer = %d", 1459 xfs_debug(mp, "---- itimer = %d",
1456 (int)be32_to_cpu(dqp->q_core.d_itimer)); 1460 (int)be32_to_cpu(dqp->q_core.d_itimer));
1457 cmn_err(CE_DEBUG, "---------------------------"); 1461 xfs_debug(mp, "---------------------------");
1458} 1462}
1459#endif 1463#endif
1460 1464
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 2a1f3dc10a02..9e0e2fa3f2c8 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -136,9 +136,8 @@ xfs_qm_dquot_logitem_push(
136 */ 136 */
137 error = xfs_qm_dqflush(dqp, 0); 137 error = xfs_qm_dqflush(dqp, 0);
138 if (error) 138 if (error)
139 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 139 xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
140 "xfs_qm_dquot_logitem_push: push error %d on dqp %p", 140 __func__, error, dqp);
141 error, dqp);
142 xfs_dqunlock(dqp); 141 xfs_dqunlock(dqp);
143} 142}
144 143
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 206a2815ced6..69228aa8605a 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -80,7 +80,7 @@ xfs_qm_dquot_list_print(
80 int i = 0; 80 int i = 0;
81 81
82 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) { 82 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) {
83 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " 83 xfs_debug(mp, " %d. \"%d (%s)\" "
84 "bcnt = %lld, icnt = %lld, refs = %d", 84 "bcnt = %lld, icnt = %lld, refs = %d",
85 i++, be32_to_cpu(dqp->q_core.d_id), 85 i++, be32_to_cpu(dqp->q_core.d_id),
86 DQFLAGTO_TYPESTR(dqp), 86 DQFLAGTO_TYPESTR(dqp),
@@ -205,7 +205,7 @@ xfs_qm_destroy(
205 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { 205 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
206 xfs_dqlock(dqp); 206 xfs_dqlock(dqp);
207#ifdef QUOTADEBUG 207#ifdef QUOTADEBUG
208 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); 208 xfs_debug(dqp->q_mount, "FREELIST destroy 0x%p", dqp);
209#endif 209#endif
210 list_del_init(&dqp->q_freelist); 210 list_del_init(&dqp->q_freelist);
211 xfs_Gqm->qm_dqfrlist_cnt--; 211 xfs_Gqm->qm_dqfrlist_cnt--;
@@ -341,9 +341,7 @@ xfs_qm_mount_quotas(
341 * quotas immediately. 341 * quotas immediately.
342 */ 342 */
343 if (mp->m_sb.sb_rextents) { 343 if (mp->m_sb.sb_rextents) {
344 cmn_err(CE_NOTE, 344 xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
345 "Cannot turn on quotas for realtime filesystem %s",
346 mp->m_fsname);
347 mp->m_qflags = 0; 345 mp->m_qflags = 0;
348 goto write_changes; 346 goto write_changes;
349 } 347 }
@@ -402,14 +400,13 @@ xfs_qm_mount_quotas(
402 * off, but the on disk superblock doesn't know that ! 400 * off, but the on disk superblock doesn't know that !
403 */ 401 */
404 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); 402 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
405 xfs_fs_cmn_err(CE_ALERT, mp, 403 xfs_alert(mp, "%s: Superblock update failed!",
406 "XFS mount_quotas: Superblock update failed!"); 404 __func__);
407 } 405 }
408 } 406 }
409 407
410 if (error) { 408 if (error) {
411 xfs_fs_cmn_err(CE_WARN, mp, 409 xfs_warn(mp, "Failed to initialize disk quotas.");
412 "Failed to initialize disk quotas.");
413 return; 410 return;
414 } 411 }
415 412
@@ -464,12 +461,10 @@ xfs_qm_dqflush_all(
464 struct xfs_quotainfo *q = mp->m_quotainfo; 461 struct xfs_quotainfo *q = mp->m_quotainfo;
465 int recl; 462 int recl;
466 struct xfs_dquot *dqp; 463 struct xfs_dquot *dqp;
467 int niters;
468 int error; 464 int error;
469 465
470 if (!q) 466 if (!q)
471 return 0; 467 return 0;
472 niters = 0;
473again: 468again:
474 mutex_lock(&q->qi_dqlist_lock); 469 mutex_lock(&q->qi_dqlist_lock);
475 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { 470 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
@@ -1230,13 +1225,6 @@ xfs_qm_qino_alloc(
1230 } 1225 }
1231 1226
1232 /* 1227 /*
1233 * Keep an extra reference to this quota inode. This inode is
1234 * locked exclusively and joined to the transaction already.
1235 */
1236 ASSERT(xfs_isilocked(*ip, XFS_ILOCK_EXCL));
1237 IHOLD(*ip);
1238
1239 /*
1240 * Make the changes in the superblock, and log those too. 1228 * Make the changes in the superblock, and log those too.
1241 * sbfields arg may contain fields other than *QUOTINO; 1229 * sbfields arg may contain fields other than *QUOTINO;
1242 * VERSIONNUM for example. 1230 * VERSIONNUM for example.
@@ -1264,7 +1252,7 @@ xfs_qm_qino_alloc(
1264 xfs_mod_sb(tp, sbfields); 1252 xfs_mod_sb(tp, sbfields);
1265 1253
1266 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { 1254 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
1267 xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!"); 1255 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
1268 return error; 1256 return error;
1269 } 1257 }
1270 return 0; 1258 return 0;
@@ -1299,7 +1287,7 @@ xfs_qm_reset_dqcounts(
1299 * output any warnings because it's perfectly possible to 1287 * output any warnings because it's perfectly possible to
1300 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck. 1288 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck.
1301 */ 1289 */
1302 (void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR, 1290 (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
1303 "xfs_quotacheck"); 1291 "xfs_quotacheck");
1304 ddq->d_bcount = 0; 1292 ddq->d_bcount = 0;
1305 ddq->d_icount = 0; 1293 ddq->d_icount = 0;
@@ -1324,14 +1312,9 @@ xfs_qm_dqiter_bufs(
1324{ 1312{
1325 xfs_buf_t *bp; 1313 xfs_buf_t *bp;
1326 int error; 1314 int error;
1327 int notcommitted;
1328 int incr;
1329 int type; 1315 int type;
1330 1316
1331 ASSERT(blkcnt > 0); 1317 ASSERT(blkcnt > 0);
1332 notcommitted = 0;
1333 incr = (blkcnt > XFS_QM_MAX_DQCLUSTER_LOGSZ) ?
1334 XFS_QM_MAX_DQCLUSTER_LOGSZ : blkcnt;
1335 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : 1318 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
1336 (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); 1319 (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP);
1337 error = 0; 1320 error = 0;
@@ -1676,7 +1659,7 @@ xfs_qm_quotacheck(
1676 */ 1659 */
1677 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist)); 1660 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
1678 1661
1679 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); 1662 xfs_notice(mp, "Quotacheck needed: Please wait.");
1680 1663
1681 /* 1664 /*
1682 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset 1665 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset
@@ -1754,9 +1737,9 @@ xfs_qm_quotacheck(
1754 1737
1755 error_return: 1738 error_return:
1756 if (error) { 1739 if (error) {
1757 cmn_err(CE_WARN, "XFS quotacheck %s: Unsuccessful (Error %d): " 1740 xfs_warn(mp,
1758 "Disabling quotas.", 1741 "Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
1759 mp->m_fsname, error); 1742 error);
1760 /* 1743 /*
1761 * We must turn off quotas. 1744 * We must turn off quotas.
1762 */ 1745 */
@@ -1764,12 +1747,11 @@ xfs_qm_quotacheck(
1764 ASSERT(xfs_Gqm != NULL); 1747 ASSERT(xfs_Gqm != NULL);
1765 xfs_qm_destroy_quotainfo(mp); 1748 xfs_qm_destroy_quotainfo(mp);
1766 if (xfs_mount_reset_sbqflags(mp)) { 1749 if (xfs_mount_reset_sbqflags(mp)) {
1767 cmn_err(CE_WARN, "XFS quotacheck %s: " 1750 xfs_warn(mp,
1768 "Failed to reset quota flags.", mp->m_fsname); 1751 "Quotacheck: Failed to reset quota flags.");
1769 } 1752 }
1770 } else { 1753 } else
1771 cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname); 1754 xfs_notice(mp, "Quotacheck: Done.");
1772 }
1773 return (error); 1755 return (error);
1774} 1756}
1775 1757
@@ -1937,8 +1919,8 @@ again:
1937 */ 1919 */
1938 error = xfs_qm_dqflush(dqp, 0); 1920 error = xfs_qm_dqflush(dqp, 0);
1939 if (error) { 1921 if (error) {
1940 xfs_fs_cmn_err(CE_WARN, mp, 1922 xfs_warn(mp, "%s: dquot %p flush failed",
1941 "xfs_qm_dqreclaim: dquot %p flush failed", dqp); 1923 __func__, dqp);
1942 } 1924 }
1943 goto dqunlock; 1925 goto dqunlock;
1944 } 1926 }
@@ -2115,7 +2097,7 @@ xfs_qm_write_sb_changes(
2115 int error; 2097 int error;
2116 2098
2117#ifdef QUOTADEBUG 2099#ifdef QUOTADEBUG
2118 cmn_err(CE_NOTE, "Writing superblock quota changes :%s", mp->m_fsname); 2100 xfs_notice(mp, "Writing superblock quota changes");
2119#endif 2101#endif
2120 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 2102 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
2121 if ((error = xfs_trans_reserve(tp, 0, 2103 if ((error = xfs_trans_reserve(tp, 0,
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index c9446f1c726d..567b29b9f1b3 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -65,11 +65,6 @@ extern kmem_zone_t *qm_dqtrxzone;
65 * block in the dquot/xqm code. 65 * block in the dquot/xqm code.
66 */ 66 */
67#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 67#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
68/*
69 * When doing a quotacheck, we log dquot clusters of this many FSBs at most
70 * in a single transaction. We don't want to ask for too huge a log reservation.
71 */
72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3
73 68
74typedef xfs_dqhash_t xfs_dqlist_t; 69typedef xfs_dqhash_t xfs_dqlist_t;
75 70
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 45b5cb1788ab..a0a829addca9 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -119,8 +119,7 @@ xfs_qm_newmount(
119 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || 119 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
120 (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) && 120 (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) &&
121 xfs_dev_is_read_only(mp, "changing quota state")) { 121 xfs_dev_is_read_only(mp, "changing quota state")) {
122 cmn_err(CE_WARN, 122 xfs_warn(mp, "please mount with%s%s%s%s.",
123 "XFS: please mount with%s%s%s%s.",
124 (!quotaondisk ? "out quota" : ""), 123 (!quotaondisk ? "out quota" : ""),
125 (uquotaondisk ? " usrquota" : ""), 124 (uquotaondisk ? " usrquota" : ""),
126 (pquotaondisk ? " prjquota" : ""), 125 (pquotaondisk ? " prjquota" : ""),
@@ -135,7 +134,7 @@ xfs_qm_newmount(
135 */ 134 */
136 if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) { 135 if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) {
137 /* 136 /*
138 * If an error occured, qm_mount_quotas code 137 * If an error occurred, qm_mount_quotas code
139 * has already disabled quotas. So, just finish 138 * has already disabled quotas. So, just finish
140 * mounting, and get on with the boring life 139 * mounting, and get on with the boring life
141 * without disk quotas. 140 * without disk quotas.
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index bdebc183223e..2dadb15d5ca9 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -41,12 +41,6 @@
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43 43
44#ifdef DEBUG
45# define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args)
46#else
47# define qdprintk(s, args...) do { } while (0)
48#endif
49
50STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); 44STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
51STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 45STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
52 uint); 46 uint);
@@ -178,7 +172,7 @@ xfs_qm_scall_quotaoff(
178 /* 172 /*
179 * Next we make the changes in the quota flag in the mount struct. 173 * Next we make the changes in the quota flag in the mount struct.
180 * This isn't protected by a particular lock directly, because we 174 * This isn't protected by a particular lock directly, because we
181 * don't want to take a mrlock everytime we depend on quotas being on. 175 * don't want to take a mrlock every time we depend on quotas being on.
182 */ 176 */
183 mp->m_qflags &= ~(flags); 177 mp->m_qflags &= ~(flags);
184 178
@@ -294,7 +288,8 @@ xfs_qm_scall_trunc_qfiles(
294 int error = 0, error2 = 0; 288 int error = 0, error2 = 0;
295 289
296 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { 290 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
297 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); 291 xfs_debug(mp, "%s: flags=%x m_qflags=%x\n",
292 __func__, flags, mp->m_qflags);
298 return XFS_ERROR(EINVAL); 293 return XFS_ERROR(EINVAL);
299 } 294 }
300 295
@@ -318,20 +313,19 @@ xfs_qm_scall_quotaon(
318{ 313{
319 int error; 314 int error;
320 uint qf; 315 uint qf;
321 uint accflags;
322 __int64_t sbflags; 316 __int64_t sbflags;
323 317
324 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); 318 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
325 /* 319 /*
326 * Switching on quota accounting must be done at mount time. 320 * Switching on quota accounting must be done at mount time.
327 */ 321 */
328 accflags = flags & XFS_ALL_QUOTA_ACCT;
329 flags &= ~(XFS_ALL_QUOTA_ACCT); 322 flags &= ~(XFS_ALL_QUOTA_ACCT);
330 323
331 sbflags = 0; 324 sbflags = 0;
332 325
333 if (flags == 0) { 326 if (flags == 0) {
334 qdprintk("quotaon: zero flags, m_qflags=%x\n", mp->m_qflags); 327 xfs_debug(mp, "%s: zero flags, m_qflags=%x\n",
328 __func__, mp->m_qflags);
335 return XFS_ERROR(EINVAL); 329 return XFS_ERROR(EINVAL);
336 } 330 }
337 331
@@ -352,12 +346,13 @@ xfs_qm_scall_quotaon(
352 (flags & XFS_GQUOTA_ACCT) == 0 && 346 (flags & XFS_GQUOTA_ACCT) == 0 &&
353 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && 347 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
354 (flags & XFS_OQUOTA_ENFD))) { 348 (flags & XFS_OQUOTA_ENFD))) {
355 qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n", 349 xfs_debug(mp,
356 flags, mp->m_sb.sb_qflags); 350 "%s: Can't enforce without acct, flags=%x sbflags=%x\n",
351 __func__, flags, mp->m_sb.sb_qflags);
357 return XFS_ERROR(EINVAL); 352 return XFS_ERROR(EINVAL);
358 } 353 }
359 /* 354 /*
360 * If everything's upto-date incore, then don't waste time. 355 * If everything's up to-date incore, then don't waste time.
361 */ 356 */
362 if ((mp->m_qflags & flags) == flags) 357 if ((mp->m_qflags & flags) == flags)
363 return XFS_ERROR(EEXIST); 358 return XFS_ERROR(EEXIST);
@@ -541,7 +536,7 @@ xfs_qm_scall_setqlim(
541 q->qi_bsoftlimit = soft; 536 q->qi_bsoftlimit = soft;
542 } 537 }
543 } else { 538 } else {
544 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); 539 xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft);
545 } 540 }
546 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? 541 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
547 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : 542 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
@@ -557,7 +552,7 @@ xfs_qm_scall_setqlim(
557 q->qi_rtbsoftlimit = soft; 552 q->qi_rtbsoftlimit = soft;
558 } 553 }
559 } else { 554 } else {
560 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); 555 xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
561 } 556 }
562 557
563 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? 558 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
@@ -574,7 +569,7 @@ xfs_qm_scall_setqlim(
574 q->qi_isoftlimit = soft; 569 q->qi_isoftlimit = soft;
575 } 570 }
576 } else { 571 } else {
577 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); 572 xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft);
578 } 573 }
579 574
580 /* 575 /*
@@ -939,10 +934,11 @@ struct mutex qcheck_lock;
939#define DQTEST_LIST_PRINT(l, NXT, title) \ 934#define DQTEST_LIST_PRINT(l, NXT, title) \
940{ \ 935{ \
941 xfs_dqtest_t *dqp; int i = 0;\ 936 xfs_dqtest_t *dqp; int i = 0;\
942 cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ 937 xfs_debug(NULL, "%s (#%d)", title, (int) (l)->qh_nelems); \
943 for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \ 938 for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \
944 dqp = (xfs_dqtest_t *)dqp->NXT) { \ 939 dqp = (xfs_dqtest_t *)dqp->NXT) { \
945 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \ 940 xfs_debug(dqp->q_mount, \
941 " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \
946 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \ 942 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \
947 dqp->d_bcount, dqp->d_icount); } \ 943 dqp->d_bcount, dqp->d_icount); } \
948} 944}
@@ -966,16 +962,17 @@ xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
966} 962}
967STATIC void 963STATIC void
968xfs_qm_dqtest_print( 964xfs_qm_dqtest_print(
969 xfs_dqtest_t *d) 965 struct xfs_mount *mp,
966 struct dqtest *d)
970{ 967{
971 cmn_err(CE_DEBUG, "-----------DQTEST DQUOT----------------"); 968 xfs_debug(mp, "-----------DQTEST DQUOT----------------");
972 cmn_err(CE_DEBUG, "---- dquot ID = %d", d->d_id); 969 xfs_debug(mp, "---- dquot ID = %d", d->d_id);
973 cmn_err(CE_DEBUG, "---- fs = 0x%p", d->q_mount); 970 xfs_debug(mp, "---- fs = 0x%p", d->q_mount);
974 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", 971 xfs_debug(mp, "---- bcount = %Lu (0x%x)",
975 d->d_bcount, (int)d->d_bcount); 972 d->d_bcount, (int)d->d_bcount);
976 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", 973 xfs_debug(mp, "---- icount = %Lu (0x%x)",
977 d->d_icount, (int)d->d_icount); 974 d->d_icount, (int)d->d_icount);
978 cmn_err(CE_DEBUG, "---------------------------"); 975 xfs_debug(mp, "---------------------------");
979} 976}
980 977
981STATIC void 978STATIC void
@@ -989,12 +986,14 @@ xfs_qm_dqtest_failed(
989{ 986{
990 qmtest_nfails++; 987 qmtest_nfails++;
991 if (error) 988 if (error)
992 cmn_err(CE_DEBUG, "quotacheck failed id=%d, err=%d\nreason: %s", 989 xfs_debug(dqp->q_mount,
993 d->d_id, error, reason); 990 "quotacheck failed id=%d, err=%d\nreason: %s",
991 d->d_id, error, reason);
994 else 992 else
995 cmn_err(CE_DEBUG, "quotacheck failed id=%d (%s) [%d != %d]", 993 xfs_debug(dqp->q_mount,
996 d->d_id, reason, (int)a, (int)b); 994 "quotacheck failed id=%d (%s) [%d != %d]",
997 xfs_qm_dqtest_print(d); 995 d->d_id, reason, (int)a, (int)b);
996 xfs_qm_dqtest_print(dqp->q_mount, d);
998 if (dqp) 997 if (dqp)
999 xfs_qm_dqprint(dqp); 998 xfs_qm_dqprint(dqp);
1000} 999}
@@ -1021,9 +1020,9 @@ xfs_dqtest_cmp2(
1021 be64_to_cpu(dqp->q_core.d_bcount) >= 1020 be64_to_cpu(dqp->q_core.d_bcount) >=
1022 be64_to_cpu(dqp->q_core.d_blk_softlimit)) { 1021 be64_to_cpu(dqp->q_core.d_blk_softlimit)) {
1023 if (!dqp->q_core.d_btimer && dqp->q_core.d_id) { 1022 if (!dqp->q_core.d_btimer && dqp->q_core.d_id) {
1024 cmn_err(CE_DEBUG, 1023 xfs_debug(dqp->q_mount,
1025 "%d [%s] [0x%p] BLK TIMER NOT STARTED", 1024 "%d [%s] BLK TIMER NOT STARTED",
1026 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1025 d->d_id, DQFLAGTO_TYPESTR(d));
1027 err++; 1026 err++;
1028 } 1027 }
1029 } 1028 }
@@ -1031,16 +1030,16 @@ xfs_dqtest_cmp2(
1031 be64_to_cpu(dqp->q_core.d_icount) >= 1030 be64_to_cpu(dqp->q_core.d_icount) >=
1032 be64_to_cpu(dqp->q_core.d_ino_softlimit)) { 1031 be64_to_cpu(dqp->q_core.d_ino_softlimit)) {
1033 if (!dqp->q_core.d_itimer && dqp->q_core.d_id) { 1032 if (!dqp->q_core.d_itimer && dqp->q_core.d_id) {
1034 cmn_err(CE_DEBUG, 1033 xfs_debug(dqp->q_mount,
1035 "%d [%s] [0x%p] INO TIMER NOT STARTED", 1034 "%d [%s] INO TIMER NOT STARTED",
1036 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1035 d->d_id, DQFLAGTO_TYPESTR(d));
1037 err++; 1036 err++;
1038 } 1037 }
1039 } 1038 }
1040#ifdef QUOTADEBUG 1039#ifdef QUOTADEBUG
1041 if (!err) { 1040 if (!err) {
1042 cmn_err(CE_DEBUG, "%d [%s] [0x%p] qchecked", 1041 xfs_debug(dqp->q_mount, "%d [%s] qchecked",
1043 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1042 d->d_id, DQFLAGTO_TYPESTR(d));
1044 } 1043 }
1045#endif 1044#endif
1046 return (err); 1045 return (err);
@@ -1137,8 +1136,8 @@ xfs_qm_internalqcheck_adjust(
1137 1136
1138 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { 1137 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
1139 *res = BULKSTAT_RV_NOTHING; 1138 *res = BULKSTAT_RV_NOTHING;
1140 qdprintk("internalqcheck: ino=%llu, uqino=%llu, gqino=%llu\n", 1139 xfs_debug(mp, "%s: ino=%llu, uqino=%llu, gqino=%llu\n",
1141 (unsigned long long) ino, 1140 __func__, (unsigned long long) ino,
1142 (unsigned long long) mp->m_sb.sb_uquotino, 1141 (unsigned long long) mp->m_sb.sb_uquotino,
1143 (unsigned long long) mp->m_sb.sb_gquotino); 1142 (unsigned long long) mp->m_sb.sb_gquotino);
1144 return XFS_ERROR(EINVAL); 1143 return XFS_ERROR(EINVAL);
@@ -1223,12 +1222,12 @@ xfs_qm_internalqcheck(
1223 xfs_qm_internalqcheck_adjust, 1222 xfs_qm_internalqcheck_adjust,
1224 0, NULL, &done); 1223 0, NULL, &done);
1225 if (error) { 1224 if (error) {
1226 cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error); 1225 xfs_debug(mp, "Bulkstat returned error 0x%x", error);
1227 break; 1226 break;
1228 } 1227 }
1229 } while (!done); 1228 } while (!done);
1230 1229
1231 cmn_err(CE_DEBUG, "Checking results against system dquots"); 1230 xfs_debug(mp, "Checking results against system dquots");
1232 for (i = 0; i < qmtest_hashmask; i++) { 1231 for (i = 0; i < qmtest_hashmask; i++) {
1233 xfs_dqtest_t *d, *n; 1232 xfs_dqtest_t *d, *n;
1234 xfs_dqhash_t *h; 1233 xfs_dqhash_t *h;
@@ -1246,10 +1245,10 @@ xfs_qm_internalqcheck(
1246 } 1245 }
1247 1246
1248 if (qmtest_nfails) { 1247 if (qmtest_nfails) {
1249 cmn_err(CE_DEBUG, "******** quotacheck failed ********"); 1248 xfs_debug(mp, "******** quotacheck failed ********");
1250 cmn_err(CE_DEBUG, "failures = %d", qmtest_nfails); 1249 xfs_debug(mp, "failures = %d", qmtest_nfails);
1251 } else { 1250 } else {
1252 cmn_err(CE_DEBUG, "******** quotacheck successful! ********"); 1251 xfs_debug(mp, "******** quotacheck successful! ********");
1253 } 1252 }
1254 kmem_free(qmtest_udqtab); 1253 kmem_free(qmtest_udqtab);
1255 kmem_free(qmtest_gdqtab); 1254 kmem_free(qmtest_gdqtab);
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 7de91d1b75c0..2a3648731331 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -643,8 +643,9 @@ xfs_trans_dqresv(
643 (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && 643 (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
644 (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { 644 (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
645#ifdef QUOTADEBUG 645#ifdef QUOTADEBUG
646 cmn_err(CE_DEBUG, "BLK Res: nblks=%ld + resbcount=%Ld" 646 xfs_debug(mp,
647 " > hardlimit=%Ld?", nblks, *resbcountp, hardlimit); 647 "BLK Res: nblks=%ld + resbcount=%Ld > hardlimit=%Ld?",
648 nblks, *resbcountp, hardlimit);
648#endif 649#endif
649 if (nblks > 0) { 650 if (nblks > 0) {
650 /* 651 /*
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
deleted file mode 100644
index 0df88897ef84..000000000000
--- a/fs/xfs/support/debug.c
+++ /dev/null
@@ -1,107 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include <xfs.h>
19#include "debug.h"
20
21/* xfs_mount.h drags a lot of crap in, sorry.. */
22#include "xfs_sb.h"
23#include "xfs_inum.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h"
26#include "xfs_error.h"
27
28void
29cmn_err(
30 const char *lvl,
31 const char *fmt,
32 ...)
33{
34 struct va_format vaf;
35 va_list args;
36
37 va_start(args, fmt);
38 vaf.fmt = fmt;
39 vaf.va = &args;
40
41 printk("%s%pV", lvl, &vaf);
42 va_end(args);
43
44 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
45}
46
47void
48xfs_fs_cmn_err(
49 const char *lvl,
50 struct xfs_mount *mp,
51 const char *fmt,
52 ...)
53{
54 struct va_format vaf;
55 va_list args;
56
57 va_start(args, fmt);
58 vaf.fmt = fmt;
59 vaf.va = &args;
60
61 printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf);
62 va_end(args);
63
64 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
65}
66
67/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */
68void
69xfs_cmn_err(
70 int panic_tag,
71 const char *lvl,
72 struct xfs_mount *mp,
73 const char *fmt,
74 ...)
75{
76 struct va_format vaf;
77 va_list args;
78 int do_panic = 0;
79
80 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
81 printk(KERN_ALERT "XFS: Transforming an alert into a BUG.");
82 do_panic = 1;
83 }
84
85 va_start(args, fmt);
86 vaf.fmt = fmt;
87 vaf.va = &args;
88
89 printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf);
90 va_end(args);
91
92 BUG_ON(do_panic);
93}
94
95void
96assfail(char *expr, char *file, int line)
97{
98 printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr,
99 file, line);
100 BUG();
101}
102
103void
104xfs_hex_dump(void *p, int length)
105{
106 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
107}
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
deleted file mode 100644
index 05699f67d475..000000000000
--- a/fs/xfs/support/debug.h
+++ /dev/null
@@ -1,61 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_DEBUG_H__
19#define __XFS_SUPPORT_DEBUG_H__
20
21#include <stdarg.h>
22
23struct xfs_mount;
24
25#define CE_DEBUG KERN_DEBUG
26#define CE_CONT KERN_INFO
27#define CE_NOTE KERN_NOTICE
28#define CE_WARN KERN_WARNING
29#define CE_ALERT KERN_ALERT
30#define CE_PANIC KERN_EMERG
31
32void cmn_err(const char *lvl, const char *fmt, ...)
33 __attribute__ ((format (printf, 2, 3)));
34void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp,
35 const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
36void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp,
37 const char *fmt, ...) __attribute__ ((format (printf, 4, 5)));
38
39extern void assfail(char *expr, char *f, int l);
40
41#define ASSERT_ALWAYS(expr) \
42 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
43
44#ifndef DEBUG
45#define ASSERT(expr) ((void)0)
46
47#ifndef STATIC
48# define STATIC static noinline
49#endif
50
51#else /* DEBUG */
52
53#define ASSERT(expr) \
54 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
55
56#ifndef STATIC
57# define STATIC noinline
58#endif
59
60#endif /* DEBUG */
61#endif /* __XFS_SUPPORT_DEBUG_H__ */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index f3227984a9bf..27d64d752eab 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -147,10 +147,9 @@ xfs_alloc_get_rec(
147 */ 147 */
148STATIC void 148STATIC void
149xfs_alloc_compute_aligned( 149xfs_alloc_compute_aligned(
150 xfs_alloc_arg_t *args, /* allocation argument structure */
150 xfs_agblock_t foundbno, /* starting block in found extent */ 151 xfs_agblock_t foundbno, /* starting block in found extent */
151 xfs_extlen_t foundlen, /* length in found extent */ 152 xfs_extlen_t foundlen, /* length in found extent */
152 xfs_extlen_t alignment, /* alignment for allocation */
153 xfs_extlen_t minlen, /* minimum length for allocation */
154 xfs_agblock_t *resbno, /* result block number */ 153 xfs_agblock_t *resbno, /* result block number */
155 xfs_extlen_t *reslen) /* result length */ 154 xfs_extlen_t *reslen) /* result length */
156{ 155{
@@ -158,8 +157,8 @@ xfs_alloc_compute_aligned(
158 xfs_extlen_t diff; 157 xfs_extlen_t diff;
159 xfs_extlen_t len; 158 xfs_extlen_t len;
160 159
161 if (alignment > 1 && foundlen >= minlen) { 160 if (args->alignment > 1 && foundlen >= args->minlen) {
162 bno = roundup(foundbno, alignment); 161 bno = roundup(foundbno, args->alignment);
163 diff = bno - foundbno; 162 diff = bno - foundbno;
164 len = diff >= foundlen ? 0 : foundlen - diff; 163 len = diff >= foundlen ? 0 : foundlen - diff;
165 } else { 164 } else {
@@ -464,6 +463,27 @@ xfs_alloc_read_agfl(
464 return 0; 463 return 0;
465} 464}
466 465
466STATIC int
467xfs_alloc_update_counters(
468 struct xfs_trans *tp,
469 struct xfs_perag *pag,
470 struct xfs_buf *agbp,
471 long len)
472{
473 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
474
475 pag->pagf_freeblks += len;
476 be32_add_cpu(&agf->agf_freeblks, len);
477
478 xfs_trans_agblocks_delta(tp, len);
479 if (unlikely(be32_to_cpu(agf->agf_freeblks) >
480 be32_to_cpu(agf->agf_length)))
481 return EFSCORRUPTED;
482
483 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
484 return 0;
485}
486
467/* 487/*
468 * Allocation group level functions. 488 * Allocation group level functions.
469 */ 489 */
@@ -505,49 +525,44 @@ xfs_alloc_ag_vextent(
505 ASSERT(0); 525 ASSERT(0);
506 /* NOTREACHED */ 526 /* NOTREACHED */
507 } 527 }
508 if (error) 528
529 if (error || args->agbno == NULLAGBLOCK)
509 return error; 530 return error;
510 /*
511 * If the allocation worked, need to change the agf structure
512 * (and log it), and the superblock.
513 */
514 if (args->agbno != NULLAGBLOCK) {
515 xfs_agf_t *agf; /* allocation group freelist header */
516 long slen = (long)args->len;
517 531
518 ASSERT(args->len >= args->minlen && args->len <= args->maxlen); 532 ASSERT(args->len >= args->minlen);
519 ASSERT(!(args->wasfromfl) || !args->isfl); 533 ASSERT(args->len <= args->maxlen);
520 ASSERT(args->agbno % args->alignment == 0); 534 ASSERT(!args->wasfromfl || !args->isfl);
521 if (!(args->wasfromfl)) { 535 ASSERT(args->agbno % args->alignment == 0);
522 536
523 agf = XFS_BUF_TO_AGF(args->agbp); 537 if (!args->wasfromfl) {
524 be32_add_cpu(&agf->agf_freeblks, -(args->len)); 538 error = xfs_alloc_update_counters(args->tp, args->pag,
525 xfs_trans_agblocks_delta(args->tp, 539 args->agbp,
526 -((long)(args->len))); 540 -((long)(args->len)));
527 args->pag->pagf_freeblks -= args->len; 541 if (error)
528 ASSERT(be32_to_cpu(agf->agf_freeblks) <= 542 return error;
529 be32_to_cpu(agf->agf_length)); 543
530 xfs_alloc_log_agf(args->tp, args->agbp, 544 /*
531 XFS_AGF_FREEBLKS); 545 * Search the busylist for these blocks and mark the
532 /* 546 * transaction as synchronous if blocks are found. This
533 * Search the busylist for these blocks and mark the 547 * avoids the need to block due to a synchronous log
534 * transaction as synchronous if blocks are found. This 548 * force to ensure correct ordering as the synchronous
535 * avoids the need to block due to a synchronous log 549 * transaction will guarantee that for us.
536 * force to ensure correct ordering as the synchronous 550 */
537 * transaction will guarantee that for us. 551 if (xfs_alloc_busy_search(args->mp, args->agno,
538 */ 552 args->agbno, args->len))
539 if (xfs_alloc_busy_search(args->mp, args->agno, 553 xfs_trans_set_sync(args->tp);
540 args->agbno, args->len))
541 xfs_trans_set_sync(args->tp);
542 }
543 if (!args->isfl)
544 xfs_trans_mod_sb(args->tp,
545 args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
546 XFS_TRANS_SB_FDBLOCKS, -slen);
547 XFS_STATS_INC(xs_allocx);
548 XFS_STATS_ADD(xs_allocb, args->len);
549 } 554 }
550 return 0; 555
556 if (!args->isfl) {
557 xfs_trans_mod_sb(args->tp, args->wasdel ?
558 XFS_TRANS_SB_RES_FDBLOCKS :
559 XFS_TRANS_SB_FDBLOCKS,
560 -((long)(args->len)));
561 }
562
563 XFS_STATS_INC(xs_allocx);
564 XFS_STATS_ADD(xs_allocb, args->len);
565 return error;
551} 566}
552 567
553/* 568/*
@@ -693,8 +708,7 @@ xfs_alloc_find_best_extent(
693 if (error) 708 if (error)
694 goto error0; 709 goto error0;
695 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 710 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
696 xfs_alloc_compute_aligned(*sbno, *slen, args->alignment, 711 xfs_alloc_compute_aligned(args, *sbno, *slen, &bno, slena);
697 args->minlen, &bno, slena);
698 712
699 /* 713 /*
700 * The good extent is closer than this one. 714 * The good extent is closer than this one.
@@ -866,8 +880,8 @@ xfs_alloc_ag_vextent_near(
866 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) 880 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
867 goto error0; 881 goto error0;
868 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 882 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
869 xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, 883 xfs_alloc_compute_aligned(args, ltbno, ltlen,
870 args->minlen, &ltbnoa, &ltlena); 884 &ltbnoa, &ltlena);
871 if (ltlena < args->minlen) 885 if (ltlena < args->minlen)
872 continue; 886 continue;
873 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 887 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
@@ -987,8 +1001,8 @@ xfs_alloc_ag_vextent_near(
987 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i))) 1001 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
988 goto error0; 1002 goto error0;
989 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1003 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
990 xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, 1004 xfs_alloc_compute_aligned(args, ltbno, ltlen,
991 args->minlen, &ltbnoa, &ltlena); 1005 &ltbnoa, &ltlena);
992 if (ltlena >= args->minlen) 1006 if (ltlena >= args->minlen)
993 break; 1007 break;
994 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) 1008 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
@@ -1003,8 +1017,8 @@ xfs_alloc_ag_vextent_near(
1003 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i))) 1017 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
1004 goto error0; 1018 goto error0;
1005 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1019 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1006 xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment, 1020 xfs_alloc_compute_aligned(args, gtbno, gtlen,
1007 args->minlen, &gtbnoa, &gtlena); 1021 &gtbnoa, &gtlena);
1008 if (gtlena >= args->minlen) 1022 if (gtlena >= args->minlen)
1009 break; 1023 break;
1010 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) 1024 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
@@ -1183,8 +1197,7 @@ xfs_alloc_ag_vextent_size(
1183 * once aligned; if not, we search left for something better. 1197 * once aligned; if not, we search left for something better.
1184 * This can't happen in the second case above. 1198 * This can't happen in the second case above.
1185 */ 1199 */
1186 xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen, 1200 xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
1187 &rbno, &rlen);
1188 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1201 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1189 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1202 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1190 (rlen <= flen && rbno + rlen <= fbno + flen), error0); 1203 (rlen <= flen && rbno + rlen <= fbno + flen), error0);
@@ -1209,8 +1222,8 @@ xfs_alloc_ag_vextent_size(
1209 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1222 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1210 if (flen < bestrlen) 1223 if (flen < bestrlen)
1211 break; 1224 break;
1212 xfs_alloc_compute_aligned(fbno, flen, args->alignment, 1225 xfs_alloc_compute_aligned(args, fbno, flen,
1213 args->minlen, &rbno, &rlen); 1226 &rbno, &rlen);
1214 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1227 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1215 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1228 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1216 (rlen <= flen && rbno + rlen <= fbno + flen), 1229 (rlen <= flen && rbno + rlen <= fbno + flen),
@@ -1388,6 +1401,7 @@ xfs_free_ag_extent(
1388 xfs_mount_t *mp; /* mount point struct for filesystem */ 1401 xfs_mount_t *mp; /* mount point struct for filesystem */
1389 xfs_agblock_t nbno; /* new starting block of freespace */ 1402 xfs_agblock_t nbno; /* new starting block of freespace */
1390 xfs_extlen_t nlen; /* new length of freespace */ 1403 xfs_extlen_t nlen; /* new length of freespace */
1404 xfs_perag_t *pag; /* per allocation group data */
1391 1405
1392 mp = tp->t_mountp; 1406 mp = tp->t_mountp;
1393 /* 1407 /*
@@ -1586,30 +1600,20 @@ xfs_free_ag_extent(
1586 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1600 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1587 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1601 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1588 cnt_cur = NULL; 1602 cnt_cur = NULL;
1603
1589 /* 1604 /*
1590 * Update the freespace totals in the ag and superblock. 1605 * Update the freespace totals in the ag and superblock.
1591 */ 1606 */
1592 { 1607 pag = xfs_perag_get(mp, agno);
1593 xfs_agf_t *agf; 1608 error = xfs_alloc_update_counters(tp, pag, agbp, len);
1594 xfs_perag_t *pag; /* per allocation group data */ 1609 xfs_perag_put(pag);
1595 1610 if (error)
1596 pag = xfs_perag_get(mp, agno); 1611 goto error0;
1597 pag->pagf_freeblks += len; 1612
1598 xfs_perag_put(pag); 1613 if (!isfl)
1599 1614 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1600 agf = XFS_BUF_TO_AGF(agbp); 1615 XFS_STATS_INC(xs_freex);
1601 be32_add_cpu(&agf->agf_freeblks, len); 1616 XFS_STATS_ADD(xs_freeb, len);
1602 xfs_trans_agblocks_delta(tp, len);
1603 XFS_WANT_CORRUPTED_GOTO(
1604 be32_to_cpu(agf->agf_freeblks) <=
1605 be32_to_cpu(agf->agf_length),
1606 error0);
1607 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
1608 if (!isfl)
1609 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1610 XFS_STATS_INC(xs_freex);
1611 XFS_STATS_ADD(xs_freeb, len);
1612 }
1613 1617
1614 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); 1618 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
1615 1619
@@ -2391,17 +2395,33 @@ xfs_free_extent(
2391 memset(&args, 0, sizeof(xfs_alloc_arg_t)); 2395 memset(&args, 0, sizeof(xfs_alloc_arg_t));
2392 args.tp = tp; 2396 args.tp = tp;
2393 args.mp = tp->t_mountp; 2397 args.mp = tp->t_mountp;
2398
2399 /*
2400 * validate that the block number is legal - the enables us to detect
2401 * and handle a silent filesystem corruption rather than crashing.
2402 */
2394 args.agno = XFS_FSB_TO_AGNO(args.mp, bno); 2403 args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
2395 ASSERT(args.agno < args.mp->m_sb.sb_agcount); 2404 if (args.agno >= args.mp->m_sb.sb_agcount)
2405 return EFSCORRUPTED;
2406
2396 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); 2407 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
2408 if (args.agbno >= args.mp->m_sb.sb_agblocks)
2409 return EFSCORRUPTED;
2410
2397 args.pag = xfs_perag_get(args.mp, args.agno); 2411 args.pag = xfs_perag_get(args.mp, args.agno);
2398 if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) 2412 ASSERT(args.pag);
2413
2414 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
2415 if (error)
2399 goto error0; 2416 goto error0;
2400#ifdef DEBUG 2417
2401 ASSERT(args.agbp != NULL); 2418 /* validate the extent size is legal now we have the agf locked */
2402 ASSERT((args.agbno + len) <= 2419 if (args.agbno + len >
2403 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)); 2420 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
2404#endif 2421 error = EFSCORRUPTED;
2422 goto error0;
2423 }
2424
2405 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); 2425 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
2406error0: 2426error0:
2407 xfs_perag_put(args.pag); 2427 xfs_perag_put(args.pag);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index dc3afd7739ff..fa00788de2f5 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2365,6 +2365,13 @@ xfs_bmap_rtalloc(
2365 */ 2365 */
2366 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) 2366 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
2367 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; 2367 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
2368
2369 /*
2370 * Lock out other modifications to the RT bitmap inode.
2371 */
2372 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2373 xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2374
2368 /* 2375 /*
2369 * If it's an allocation to an empty file at offset 0, 2376 * If it's an allocation to an empty file at offset 0,
2370 * pick an extent that will space things out in the rt area. 2377 * pick an extent that will space things out in the rt area.
@@ -3519,7 +3526,7 @@ xfs_bmap_search_extents(
3519 3526
3520 if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && 3527 if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
3521 !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { 3528 !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
3522 xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, 3529 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
3523 "Access to block zero in inode %llu " 3530 "Access to block zero in inode %llu "
3524 "start_block: %llx start_off: %llx " 3531 "start_block: %llx start_off: %llx "
3525 "blkcnt: %llx extent-state: %x lastx: %x\n", 3532 "blkcnt: %llx extent-state: %x lastx: %x\n",
@@ -4193,12 +4200,11 @@ xfs_bmap_read_extents(
4193 num_recs = xfs_btree_get_numrecs(block); 4200 num_recs = xfs_btree_get_numrecs(block);
4194 if (unlikely(i + num_recs > room)) { 4201 if (unlikely(i + num_recs > room)) {
4195 ASSERT(i + num_recs <= room); 4202 ASSERT(i + num_recs <= room);
4196 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 4203 xfs_warn(ip->i_mount,
4197 "corrupt dinode %Lu, (btree extents).", 4204 "corrupt dinode %Lu, (btree extents).",
4198 (unsigned long long) ip->i_ino); 4205 (unsigned long long) ip->i_ino);
4199 XFS_ERROR_REPORT("xfs_bmap_read_extents(1)", 4206 XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
4200 XFS_ERRLEVEL_LOW, 4207 XFS_ERRLEVEL_LOW, ip->i_mount, block);
4201 ip->i_mount);
4202 goto error0; 4208 goto error0;
4203 } 4209 }
4204 XFS_WANT_CORRUPTED_GOTO( 4210 XFS_WANT_CORRUPTED_GOTO(
@@ -5772,7 +5778,7 @@ xfs_check_block(
5772 else 5778 else
5773 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); 5779 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
5774 if (*thispa == *pp) { 5780 if (*thispa == *pp) {
5775 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", 5781 xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
5776 __func__, j, i, 5782 __func__, j, i,
5777 (unsigned long long)be64_to_cpu(*thispa)); 5783 (unsigned long long)be64_to_cpu(*thispa));
5778 panic("%s: ptrs are equal in node\n", 5784 panic("%s: ptrs are equal in node\n",
@@ -5937,11 +5943,11 @@ xfs_bmap_check_leaf_extents(
5937 return; 5943 return;
5938 5944
5939error0: 5945error0:
5940 cmn_err(CE_WARN, "%s: at error0", __func__); 5946 xfs_warn(mp, "%s: at error0", __func__);
5941 if (bp_release) 5947 if (bp_release)
5942 xfs_trans_brelse(NULL, bp); 5948 xfs_trans_brelse(NULL, bp);
5943error_norelse: 5949error_norelse:
5944 cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents", 5950 xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
5945 __func__, i); 5951 __func__, i);
5946 panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); 5952 panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
5947 return; 5953 return;
@@ -6144,7 +6150,7 @@ xfs_bmap_punch_delalloc_range(
6144 if (error) { 6150 if (error) {
6145 /* something screwed, just bail */ 6151 /* something screwed, just bail */
6146 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 6152 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
6147 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 6153 xfs_alert(ip->i_mount,
6148 "Failed delalloc mapping lookup ino %lld fsb %lld.", 6154 "Failed delalloc mapping lookup ino %lld fsb %lld.",
6149 ip->i_ino, start_fsb); 6155 ip->i_ino, start_fsb);
6150 } 6156 }
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 6f8c21ce0d6d..7b7e005e3dcc 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -130,10 +130,12 @@ xfs_buf_item_log_check(
130 orig = bip->bli_orig; 130 orig = bip->bli_orig;
131 buffer = XFS_BUF_PTR(bp); 131 buffer = XFS_BUF_PTR(bp);
132 for (x = 0; x < XFS_BUF_COUNT(bp); x++) { 132 for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
133 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) 133 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
134 cmn_err(CE_PANIC, 134 xfs_emerg(bp->b_mount,
135 "xfs_buf_item_log_check bip %x buffer %x orig %x index %d", 135 "%s: bip %x buffer %x orig %x index %d",
136 bip, bp, orig, x); 136 __func__, bip, bp, orig, x);
137 ASSERT(0);
138 }
137 } 139 }
138} 140}
139#else 141#else
@@ -983,15 +985,14 @@ xfs_buf_iodone_callbacks(
983 if (XFS_BUF_TARGET(bp) != lasttarg || 985 if (XFS_BUF_TARGET(bp) != lasttarg ||
984 time_after(jiffies, (lasttime + 5*HZ))) { 986 time_after(jiffies, (lasttime + 5*HZ))) {
985 lasttime = jiffies; 987 lasttime = jiffies;
986 cmn_err(CE_ALERT, "Device %s, XFS metadata write error" 988 xfs_alert(mp, "Device %s: metadata write error block 0x%llx",
987 " block 0x%llx in %s",
988 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 989 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
989 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); 990 (__uint64_t)XFS_BUF_ADDR(bp));
990 } 991 }
991 lasttarg = XFS_BUF_TARGET(bp); 992 lasttarg = XFS_BUF_TARGET(bp);
992 993
993 /* 994 /*
994 * If the write was asynchronous then noone will be looking for the 995 * If the write was asynchronous then no one will be looking for the
995 * error. Clear the error state and write the buffer out again. 996 * error. Clear the error state and write the buffer out again.
996 * 997 *
997 * During sync or umount we'll write all pending buffers again 998 * During sync or umount we'll write all pending buffers again
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 1c00bedb3175..6102ac6d1dff 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1995,13 +1995,12 @@ xfs_da_do_buf(
1995 error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED); 1995 error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
1996 if (unlikely(error == EFSCORRUPTED)) { 1996 if (unlikely(error == EFSCORRUPTED)) {
1997 if (xfs_error_level >= XFS_ERRLEVEL_LOW) { 1997 if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
1998 cmn_err(CE_ALERT, "xfs_da_do_buf: bno %lld\n", 1998 xfs_alert(mp, "%s: bno %lld dir: inode %lld",
1999 (long long)bno); 1999 __func__, (long long)bno,
2000 cmn_err(CE_ALERT, "dir: inode %lld\n",
2001 (long long)dp->i_ino); 2000 (long long)dp->i_ino);
2002 for (i = 0; i < nmap; i++) { 2001 for (i = 0; i < nmap; i++) {
2003 cmn_err(CE_ALERT, 2002 xfs_alert(mp,
2004 "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d\n", 2003"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
2005 i, 2004 i,
2006 (long long)mapp[i].br_startoff, 2005 (long long)mapp[i].br_startoff,
2007 (long long)mapp[i].br_startblock, 2006 (long long)mapp[i].br_startblock,
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index e60490bc00a6..be628677c288 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -270,9 +270,9 @@ xfs_swap_extents(
270 /* check inode formats now that data is flushed */ 270 /* check inode formats now that data is flushed */
271 error = xfs_swap_extents_check_format(ip, tip); 271 error = xfs_swap_extents_check_format(ip, tip);
272 if (error) { 272 if (error) {
273 xfs_fs_cmn_err(CE_NOTE, mp, 273 xfs_notice(mp,
274 "%s: inode 0x%llx format is incompatible for exchanging.", 274 "%s: inode 0x%llx format is incompatible for exchanging.",
275 __FILE__, ip->i_ino); 275 __func__, ip->i_ino);
276 goto out_unlock; 276 goto out_unlock;
277 } 277 }
278 278
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index a1321bc7f192..dba7a71cedf3 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -159,7 +159,7 @@ xfs_dir_ino_validate(
159 XFS_AGINO_TO_INO(mp, agno, agino) == ino; 159 XFS_AGINO_TO_INO(mp, agno, agino) == ino;
160 if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE, 160 if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
161 XFS_RANDOM_DIR_INO_VALIDATE))) { 161 XFS_RANDOM_DIR_INO_VALIDATE))) {
162 xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx", 162 xfs_warn(mp, "Invalid inode number 0x%Lx",
163 (unsigned long long) ino); 163 (unsigned long long) ino);
164 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); 164 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
165 return XFS_ERROR(EFSCORRUPTED); 165 return XFS_ERROR(EFSCORRUPTED);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index f9a0864b696a..a0aab7d3294f 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -899,10 +899,9 @@ xfs_dir2_leafn_rebalance(
899 if(blk2->index < 0) { 899 if(blk2->index < 0) {
900 state->inleaf = 1; 900 state->inleaf = 1;
901 blk2->index = 0; 901 blk2->index = 0;
902 cmn_err(CE_ALERT, 902 xfs_alert(args->dp->i_mount,
903 "xfs_dir2_leafn_rebalance: picked the wrong leaf? reverting original leaf: " 903 "%s: picked the wrong leaf? reverting original leaf: blk1->index %d\n",
904 "blk1->index %d\n", 904 __func__, blk1->index);
905 blk1->index);
906 } 905 }
907} 906}
908 907
@@ -1641,26 +1640,22 @@ xfs_dir2_node_addname_int(
1641 } 1640 }
1642 1641
1643 if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { 1642 if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) {
1644 cmn_err(CE_ALERT, 1643 xfs_alert(mp,
1645 "xfs_dir2_node_addname_int: dir ino " 1644 "%s: dir ino " "%llu needed freesp block %lld for\n"
1646 "%llu needed freesp block %lld for\n" 1645 " data block %lld, got %lld ifbno %llu lastfbno %d",
1647 " data block %lld, got %lld\n" 1646 __func__, (unsigned long long)dp->i_ino,
1648 " ifbno %llu lastfbno %d\n",
1649 (unsigned long long)dp->i_ino,
1650 (long long)xfs_dir2_db_to_fdb(mp, dbno), 1647 (long long)xfs_dir2_db_to_fdb(mp, dbno),
1651 (long long)dbno, (long long)fbno, 1648 (long long)dbno, (long long)fbno,
1652 (unsigned long long)ifbno, lastfbno); 1649 (unsigned long long)ifbno, lastfbno);
1653 if (fblk) { 1650 if (fblk) {
1654 cmn_err(CE_ALERT, 1651 xfs_alert(mp,
1655 " fblk 0x%p blkno %llu " 1652 " fblk 0x%p blkno %llu index %d magic 0x%x",
1656 "index %d magic 0x%x\n",
1657 fblk, 1653 fblk,
1658 (unsigned long long)fblk->blkno, 1654 (unsigned long long)fblk->blkno,
1659 fblk->index, 1655 fblk->index,
1660 fblk->magic); 1656 fblk->magic);
1661 } else { 1657 } else {
1662 cmn_err(CE_ALERT, 1658 xfs_alert(mp, " ... fblk is NULL");
1663 " ... fblk is NULL\n");
1664 } 1659 }
1665 XFS_ERROR_REPORT("xfs_dir2_node_addname_int", 1660 XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
1666 XFS_ERRLEVEL_LOW, mp); 1661 XFS_ERRLEVEL_LOW, mp);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 4c7db74a05f7..39f06336b99d 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -48,7 +48,7 @@ xfs_error_trap(int e)
48 break; 48 break;
49 if (e != xfs_etrap[i]) 49 if (e != xfs_etrap[i])
50 continue; 50 continue;
51 cmn_err(CE_NOTE, "xfs_error_trap: error %d", e); 51 xfs_notice(NULL, "%s: error %d", __func__, e);
52 BUG(); 52 BUG();
53 break; 53 break;
54 } 54 }
@@ -74,7 +74,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
74 74
75 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 75 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
76 if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) { 76 if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) {
77 cmn_err(CE_WARN, 77 xfs_warn(NULL,
78 "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", 78 "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
79 expression, file, line, xfs_etest_fsname[i]); 79 expression, file, line, xfs_etest_fsname[i]);
80 return 1; 80 return 1;
@@ -95,14 +95,14 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
95 95
96 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 96 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
97 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { 97 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
98 cmn_err(CE_WARN, "XFS error tag #%d on", error_tag); 98 xfs_warn(mp, "error tag #%d on", error_tag);
99 return 0; 99 return 0;
100 } 100 }
101 } 101 }
102 102
103 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 103 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
104 if (xfs_etest[i] == 0) { 104 if (xfs_etest[i] == 0) {
105 cmn_err(CE_WARN, "Turned on XFS error tag #%d", 105 xfs_warn(mp, "Turned on XFS error tag #%d",
106 error_tag); 106 error_tag);
107 xfs_etest[i] = error_tag; 107 xfs_etest[i] = error_tag;
108 xfs_etest_fsid[i] = fsid; 108 xfs_etest_fsid[i] = fsid;
@@ -114,7 +114,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
114 } 114 }
115 } 115 }
116 116
117 cmn_err(CE_WARN, "error tag overflow, too many turned on"); 117 xfs_warn(mp, "error tag overflow, too many turned on");
118 118
119 return 1; 119 return 1;
120} 120}
@@ -133,7 +133,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
133 if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && 133 if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) &&
134 xfs_etest[i] != 0) { 134 xfs_etest[i] != 0) {
135 cleared = 1; 135 cleared = 1;
136 cmn_err(CE_WARN, "Clearing XFS error tag #%d", 136 xfs_warn(mp, "Clearing XFS error tag #%d",
137 xfs_etest[i]); 137 xfs_etest[i]);
138 xfs_etest[i] = 0; 138 xfs_etest[i] = 0;
139 xfs_etest_fsid[i] = 0LL; 139 xfs_etest_fsid[i] = 0LL;
@@ -144,9 +144,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
144 } 144 }
145 145
146 if (loud || cleared) 146 if (loud || cleared)
147 cmn_err(CE_WARN, 147 xfs_warn(mp, "Cleared all XFS error tags for filesystem");
148 "Cleared all XFS error tags for filesystem \"%s\"",
149 mp->m_fsname);
150 148
151 return 0; 149 return 0;
152} 150}
@@ -162,9 +160,8 @@ xfs_error_report(
162 inst_t *ra) 160 inst_t *ra)
163{ 161{
164 if (level <= xfs_error_level) { 162 if (level <= xfs_error_level) {
165 xfs_cmn_err(XFS_PTAG_ERROR_REPORT, 163 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
166 CE_ALERT, mp, 164 "Internal error %s at line %d of file %s. Caller 0x%p\n",
167 "XFS internal error %s at line %d of file %s. Caller 0x%p\n",
168 tag, linenum, filename, ra); 165 tag, linenum, filename, ra);
169 166
170 xfs_stack_trace(); 167 xfs_stack_trace();
@@ -184,4 +181,5 @@ xfs_corruption_error(
184 if (level <= xfs_error_level) 181 if (level <= xfs_error_level)
185 xfs_hex_dump(p, 16); 182 xfs_hex_dump(p, 16);
186 xfs_error_report(tag, level, mp, filename, linenum, ra); 183 xfs_error_report(tag, level, mp, filename, linenum, ra);
184 xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
187} 185}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 10dce5475f02..079a367f44ee 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -145,10 +145,8 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
145#endif /* DEBUG */ 145#endif /* DEBUG */
146 146
147/* 147/*
148 * XFS panic tags -- allow a call to xfs_cmn_err() be turned into 148 * XFS panic tags -- allow a call to xfs_alert_tag() be turned into
149 * a panic by setting xfs_panic_mask in a 149 * a panic by setting xfs_panic_mask in a sysctl.
150 * sysctl. update xfs_max[XFS_PARAM] if
151 * more are added.
152 */ 150 */
153#define XFS_NO_PTAG 0 151#define XFS_NO_PTAG 0
154#define XFS_PTAG_IFLUSH 0x00000001 152#define XFS_PTAG_IFLUSH 0x00000001
@@ -160,17 +158,4 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
160#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 158#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
161#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 159#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
162 160
163struct xfs_mount;
164
165extern void xfs_hex_dump(void *p, int length);
166
167#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
168 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args)
169
170#define xfs_fs_mount_cmn_err(f, fmt, args...) \
171 do { \
172 if (!(f & XFS_MFSI_QUIET)) \
173 cmn_err(CE_WARN, "XFS: " fmt, ## args); \
174 } while (0)
175
176#endif /* __XFS_ERROR_H__ */ 161#endif /* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cec89dd5d7d2..9153d2c77caf 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -53,6 +53,9 @@ xfs_fs_geometry(
53 xfs_fsop_geom_t *geo, 53 xfs_fsop_geom_t *geo,
54 int new_version) 54 int new_version)
55{ 55{
56
57 memset(geo, 0, sizeof(*geo));
58
56 geo->blocksize = mp->m_sb.sb_blocksize; 59 geo->blocksize = mp->m_sb.sb_blocksize;
57 geo->rtextsize = mp->m_sb.sb_rextsize; 60 geo->rtextsize = mp->m_sb.sb_rextsize;
58 geo->agblocks = mp->m_sb.sb_agblocks; 61 geo->agblocks = mp->m_sb.sb_agblocks;
@@ -382,8 +385,8 @@ xfs_growfs_data_private(
382 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), 385 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
383 XFS_FSS_TO_BB(mp, 1), 0, &bp); 386 XFS_FSS_TO_BB(mp, 1), 0, &bp);
384 if (error) { 387 if (error) {
385 xfs_fs_cmn_err(CE_WARN, mp, 388 xfs_warn(mp,
386 "error %d reading secondary superblock for ag %d", 389 "error %d reading secondary superblock for ag %d",
387 error, agno); 390 error, agno);
388 break; 391 break;
389 } 392 }
@@ -396,7 +399,7 @@ xfs_growfs_data_private(
396 if (!(error = xfs_bwrite(mp, bp))) { 399 if (!(error = xfs_bwrite(mp, bp))) {
397 continue; 400 continue;
398 } else { 401 } else {
399 xfs_fs_cmn_err(CE_WARN, mp, 402 xfs_warn(mp,
400 "write error %d updating secondary superblock for ag %d", 403 "write error %d updating secondary superblock for ag %d",
401 error, agno); 404 error, agno);
402 break; /* no point in continuing */ 405 break; /* no point in continuing */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 0626a32c3447..84ebeec16642 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1055,28 +1055,23 @@ xfs_difree(
1055 */ 1055 */
1056 agno = XFS_INO_TO_AGNO(mp, inode); 1056 agno = XFS_INO_TO_AGNO(mp, inode);
1057 if (agno >= mp->m_sb.sb_agcount) { 1057 if (agno >= mp->m_sb.sb_agcount) {
1058 cmn_err(CE_WARN, 1058 xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
1059 "xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s. Returning EINVAL.", 1059 __func__, agno, mp->m_sb.sb_agcount);
1060 agno, mp->m_sb.sb_agcount, mp->m_fsname);
1061 ASSERT(0); 1060 ASSERT(0);
1062 return XFS_ERROR(EINVAL); 1061 return XFS_ERROR(EINVAL);
1063 } 1062 }
1064 agino = XFS_INO_TO_AGINO(mp, inode); 1063 agino = XFS_INO_TO_AGINO(mp, inode);
1065 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { 1064 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
1066 cmn_err(CE_WARN, 1065 xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
1067 "xfs_difree: inode != XFS_AGINO_TO_INO() " 1066 __func__, (unsigned long long)inode,
1068 "(%llu != %llu) on %s. Returning EINVAL.", 1067 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
1069 (unsigned long long)inode,
1070 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino),
1071 mp->m_fsname);
1072 ASSERT(0); 1068 ASSERT(0);
1073 return XFS_ERROR(EINVAL); 1069 return XFS_ERROR(EINVAL);
1074 } 1070 }
1075 agbno = XFS_AGINO_TO_AGBNO(mp, agino); 1071 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
1076 if (agbno >= mp->m_sb.sb_agblocks) { 1072 if (agbno >= mp->m_sb.sb_agblocks) {
1077 cmn_err(CE_WARN, 1073 xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
1078 "xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s. Returning EINVAL.", 1074 __func__, agbno, mp->m_sb.sb_agblocks);
1079 agbno, mp->m_sb.sb_agblocks, mp->m_fsname);
1080 ASSERT(0); 1075 ASSERT(0);
1081 return XFS_ERROR(EINVAL); 1076 return XFS_ERROR(EINVAL);
1082 } 1077 }
@@ -1085,9 +1080,8 @@ xfs_difree(
1085 */ 1080 */
1086 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1081 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1087 if (error) { 1082 if (error) {
1088 cmn_err(CE_WARN, 1083 xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
1089 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", 1084 __func__, error);
1090 error, mp->m_fsname);
1091 return error; 1085 return error;
1092 } 1086 }
1093 agi = XFS_BUF_TO_AGI(agbp); 1087 agi = XFS_BUF_TO_AGI(agbp);
@@ -1106,17 +1100,15 @@ xfs_difree(
1106 * Look for the entry describing this inode. 1100 * Look for the entry describing this inode.
1107 */ 1101 */
1108 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { 1102 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
1109 cmn_err(CE_WARN, 1103 xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
1110 "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.", 1104 __func__, error);
1111 error, mp->m_fsname);
1112 goto error0; 1105 goto error0;
1113 } 1106 }
1114 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1107 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1115 error = xfs_inobt_get_rec(cur, &rec, &i); 1108 error = xfs_inobt_get_rec(cur, &rec, &i);
1116 if (error) { 1109 if (error) {
1117 cmn_err(CE_WARN, 1110 xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
1118 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", 1111 __func__, error);
1119 error, mp->m_fsname);
1120 goto error0; 1112 goto error0;
1121 } 1113 }
1122 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1114 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
@@ -1157,8 +1149,8 @@ xfs_difree(
1157 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 1149 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1158 1150
1159 if ((error = xfs_btree_delete(cur, &i))) { 1151 if ((error = xfs_btree_delete(cur, &i))) {
1160 cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n", 1152 xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
1161 error, mp->m_fsname); 1153 __func__, error);
1162 goto error0; 1154 goto error0;
1163 } 1155 }
1164 1156
@@ -1170,9 +1162,8 @@ xfs_difree(
1170 1162
1171 error = xfs_inobt_update(cur, &rec); 1163 error = xfs_inobt_update(cur, &rec);
1172 if (error) { 1164 if (error) {
1173 cmn_err(CE_WARN, 1165 xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
1174 "xfs_difree: xfs_inobt_update returned an error %d on %s.", 1166 __func__, error);
1175 error, mp->m_fsname);
1176 goto error0; 1167 goto error0;
1177 } 1168 }
1178 1169
@@ -1218,10 +1209,9 @@ xfs_imap_lookup(
1218 1209
1219 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1210 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1220 if (error) { 1211 if (error) {
1221 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1212 xfs_alert(mp,
1222 "xfs_ialloc_read_agi() returned " 1213 "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
1223 "error %d, agno %d", 1214 __func__, error, agno);
1224 error, agno);
1225 return error; 1215 return error;
1226 } 1216 }
1227 1217
@@ -1299,24 +1289,21 @@ xfs_imap(
1299 if (flags & XFS_IGET_UNTRUSTED) 1289 if (flags & XFS_IGET_UNTRUSTED)
1300 return XFS_ERROR(EINVAL); 1290 return XFS_ERROR(EINVAL);
1301 if (agno >= mp->m_sb.sb_agcount) { 1291 if (agno >= mp->m_sb.sb_agcount) {
1302 xfs_fs_cmn_err(CE_ALERT, mp, 1292 xfs_alert(mp,
1303 "xfs_imap: agno (%d) >= " 1293 "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
1304 "mp->m_sb.sb_agcount (%d)", 1294 __func__, agno, mp->m_sb.sb_agcount);
1305 agno, mp->m_sb.sb_agcount);
1306 } 1295 }
1307 if (agbno >= mp->m_sb.sb_agblocks) { 1296 if (agbno >= mp->m_sb.sb_agblocks) {
1308 xfs_fs_cmn_err(CE_ALERT, mp, 1297 xfs_alert(mp,
1309 "xfs_imap: agbno (0x%llx) >= " 1298 "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
1310 "mp->m_sb.sb_agblocks (0x%lx)", 1299 __func__, (unsigned long long)agbno,
1311 (unsigned long long) agbno, 1300 (unsigned long)mp->m_sb.sb_agblocks);
1312 (unsigned long) mp->m_sb.sb_agblocks);
1313 } 1301 }
1314 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 1302 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1315 xfs_fs_cmn_err(CE_ALERT, mp, 1303 xfs_alert(mp,
1316 "xfs_imap: ino (0x%llx) != " 1304 "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
1317 "XFS_AGINO_TO_INO(mp, agno, agino) " 1305 __func__, ino,
1318 "(0x%llx)", 1306 XFS_AGINO_TO_INO(mp, agno, agino));
1319 ino, XFS_AGINO_TO_INO(mp, agno, agino));
1320 } 1307 }
1321 xfs_stack_trace(); 1308 xfs_stack_trace();
1322#endif /* DEBUG */ 1309#endif /* DEBUG */
@@ -1388,10 +1375,9 @@ out_map:
1388 */ 1375 */
1389 if ((imap->im_blkno + imap->im_len) > 1376 if ((imap->im_blkno + imap->im_len) >
1390 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { 1377 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
1391 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1378 xfs_alert(mp,
1392 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > " 1379 "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
1393 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)", 1380 __func__, (unsigned long long) imap->im_blkno,
1394 (unsigned long long) imap->im_blkno,
1395 (unsigned long long) imap->im_len, 1381 (unsigned long long) imap->im_len,
1396 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); 1382 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1397 return XFS_ERROR(EINVAL); 1383 return XFS_ERROR(EINVAL);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index be7cf625421f..a37480a6e023 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -110,8 +110,8 @@ xfs_inobp_check(
110 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 110 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
111 i * mp->m_sb.sb_inodesize); 111 i * mp->m_sb.sb_inodesize);
112 if (!dip->di_next_unlinked) { 112 if (!dip->di_next_unlinked) {
113 xfs_fs_cmn_err(CE_ALERT, mp, 113 xfs_alert(mp,
114 "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.", 114 "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
115 bp); 115 bp);
116 ASSERT(dip->di_next_unlinked); 116 ASSERT(dip->di_next_unlinked);
117 } 117 }
@@ -142,10 +142,9 @@ xfs_imap_to_bp(
142 (int)imap->im_len, buf_flags, &bp); 142 (int)imap->im_len, buf_flags, &bp);
143 if (error) { 143 if (error) {
144 if (error != EAGAIN) { 144 if (error != EAGAIN) {
145 cmn_err(CE_WARN, 145 xfs_warn(mp,
146 "xfs_imap_to_bp: xfs_trans_read_buf()returned " 146 "%s: xfs_trans_read_buf() returned error %d.",
147 "an error %d on %s. Returning error.", 147 __func__, error);
148 error, mp->m_fsname);
149 } else { 148 } else {
150 ASSERT(buf_flags & XBF_TRYLOCK); 149 ASSERT(buf_flags & XBF_TRYLOCK);
151 } 150 }
@@ -180,12 +179,11 @@ xfs_imap_to_bp(
180 XFS_CORRUPTION_ERROR("xfs_imap_to_bp", 179 XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
181 XFS_ERRLEVEL_HIGH, mp, dip); 180 XFS_ERRLEVEL_HIGH, mp, dip);
182#ifdef DEBUG 181#ifdef DEBUG
183 cmn_err(CE_PANIC, 182 xfs_emerg(mp,
184 "Device %s - bad inode magic/vsn " 183 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
185 "daddr %lld #%d (magic=%x)",
186 XFS_BUFTARG_NAME(mp->m_ddev_targp),
187 (unsigned long long)imap->im_blkno, i, 184 (unsigned long long)imap->im_blkno, i,
188 be16_to_cpu(dip->di_magic)); 185 be16_to_cpu(dip->di_magic));
186 ASSERT(0);
189#endif 187#endif
190 xfs_trans_brelse(tp, bp); 188 xfs_trans_brelse(tp, bp);
191 return XFS_ERROR(EFSCORRUPTED); 189 return XFS_ERROR(EFSCORRUPTED);
@@ -317,7 +315,7 @@ xfs_iformat(
317 if (unlikely(be32_to_cpu(dip->di_nextents) + 315 if (unlikely(be32_to_cpu(dip->di_nextents) +
318 be16_to_cpu(dip->di_anextents) > 316 be16_to_cpu(dip->di_anextents) >
319 be64_to_cpu(dip->di_nblocks))) { 317 be64_to_cpu(dip->di_nblocks))) {
320 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 318 xfs_warn(ip->i_mount,
321 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", 319 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
322 (unsigned long long)ip->i_ino, 320 (unsigned long long)ip->i_ino,
323 (int)(be32_to_cpu(dip->di_nextents) + 321 (int)(be32_to_cpu(dip->di_nextents) +
@@ -330,8 +328,7 @@ xfs_iformat(
330 } 328 }
331 329
332 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { 330 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
333 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 331 xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
334 "corrupt dinode %Lu, forkoff = 0x%x.",
335 (unsigned long long)ip->i_ino, 332 (unsigned long long)ip->i_ino,
336 dip->di_forkoff); 333 dip->di_forkoff);
337 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, 334 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
@@ -341,7 +338,7 @@ xfs_iformat(
341 338
342 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && 339 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
343 !ip->i_mount->m_rtdev_targp)) { 340 !ip->i_mount->m_rtdev_targp)) {
344 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 341 xfs_warn(ip->i_mount,
345 "corrupt dinode %Lu, has realtime flag set.", 342 "corrupt dinode %Lu, has realtime flag set.",
346 ip->i_ino); 343 ip->i_ino);
347 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", 344 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
@@ -373,9 +370,8 @@ xfs_iformat(
373 * no local regular files yet 370 * no local regular files yet
374 */ 371 */
375 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { 372 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) {
376 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 373 xfs_warn(ip->i_mount,
377 "corrupt inode %Lu " 374 "corrupt inode %Lu (local format for regular file).",
378 "(local format for regular file).",
379 (unsigned long long) ip->i_ino); 375 (unsigned long long) ip->i_ino);
380 XFS_CORRUPTION_ERROR("xfs_iformat(4)", 376 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
381 XFS_ERRLEVEL_LOW, 377 XFS_ERRLEVEL_LOW,
@@ -385,9 +381,8 @@ xfs_iformat(
385 381
386 di_size = be64_to_cpu(dip->di_size); 382 di_size = be64_to_cpu(dip->di_size);
387 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { 383 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
388 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 384 xfs_warn(ip->i_mount,
389 "corrupt inode %Lu " 385 "corrupt inode %Lu (bad size %Ld for local inode).",
390 "(bad size %Ld for local inode).",
391 (unsigned long long) ip->i_ino, 386 (unsigned long long) ip->i_ino,
392 (long long) di_size); 387 (long long) di_size);
393 XFS_CORRUPTION_ERROR("xfs_iformat(5)", 388 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
@@ -431,9 +426,8 @@ xfs_iformat(
431 size = be16_to_cpu(atp->hdr.totsize); 426 size = be16_to_cpu(atp->hdr.totsize);
432 427
433 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { 428 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
434 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 429 xfs_warn(ip->i_mount,
435 "corrupt inode %Lu " 430 "corrupt inode %Lu (bad attr fork size %Ld).",
436 "(bad attr fork size %Ld).",
437 (unsigned long long) ip->i_ino, 431 (unsigned long long) ip->i_ino,
438 (long long) size); 432 (long long) size);
439 XFS_CORRUPTION_ERROR("xfs_iformat(8)", 433 XFS_CORRUPTION_ERROR("xfs_iformat(8)",
@@ -488,9 +482,8 @@ xfs_iformat_local(
488 * kmem_alloc() or memcpy() below. 482 * kmem_alloc() or memcpy() below.
489 */ 483 */
490 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 484 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
491 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 485 xfs_warn(ip->i_mount,
492 "corrupt inode %Lu " 486 "corrupt inode %Lu (bad size %d for local fork, size = %d).",
493 "(bad size %d for local fork, size = %d).",
494 (unsigned long long) ip->i_ino, size, 487 (unsigned long long) ip->i_ino, size,
495 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); 488 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
496 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, 489 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
@@ -547,8 +540,7 @@ xfs_iformat_extents(
547 * kmem_alloc() or memcpy() below. 540 * kmem_alloc() or memcpy() below.
548 */ 541 */
549 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 542 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
550 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 543 xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
551 "corrupt inode %Lu ((a)extents = %d).",
552 (unsigned long long) ip->i_ino, nex); 544 (unsigned long long) ip->i_ino, nex);
553 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, 545 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
554 ip->i_mount, dip); 546 ip->i_mount, dip);
@@ -623,11 +615,10 @@ xfs_iformat_btree(
623 || XFS_BMDR_SPACE_CALC(nrecs) > 615 || XFS_BMDR_SPACE_CALC(nrecs) >
624 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) 616 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
625 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { 617 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
626 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 618 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
627 "corrupt inode %Lu (btree).",
628 (unsigned long long) ip->i_ino); 619 (unsigned long long) ip->i_ino);
629 XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW, 620 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
630 ip->i_mount); 621 ip->i_mount, dip);
631 return XFS_ERROR(EFSCORRUPTED); 622 return XFS_ERROR(EFSCORRUPTED);
632 } 623 }
633 624
@@ -813,11 +804,9 @@ xfs_iread(
813 */ 804 */
814 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) { 805 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) {
815#ifdef DEBUG 806#ifdef DEBUG
816 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 807 xfs_alert(mp,
817 "dip->di_magic (0x%x) != " 808 "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
818 "XFS_DINODE_MAGIC (0x%x)", 809 __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
819 be16_to_cpu(dip->di_magic),
820 XFS_DINODE_MAGIC);
821#endif /* DEBUG */ 810#endif /* DEBUG */
822 error = XFS_ERROR(EINVAL); 811 error = XFS_ERROR(EINVAL);
823 goto out_brelse; 812 goto out_brelse;
@@ -835,9 +824,8 @@ xfs_iread(
835 error = xfs_iformat(ip, dip); 824 error = xfs_iformat(ip, dip);
836 if (error) { 825 if (error) {
837#ifdef DEBUG 826#ifdef DEBUG
838 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 827 xfs_alert(mp, "%s: xfs_iformat() returned error %d",
839 "xfs_iformat() returned error %d", 828 __func__, error);
840 error);
841#endif /* DEBUG */ 829#endif /* DEBUG */
842 goto out_brelse; 830 goto out_brelse;
843 } 831 }
@@ -1016,8 +1004,8 @@ xfs_ialloc(
1016 * This is because we're setting fields here we need 1004 * This is because we're setting fields here we need
1017 * to prevent others from looking at until we're done. 1005 * to prevent others from looking at until we're done.
1018 */ 1006 */
1019 error = xfs_trans_iget(tp->t_mountp, tp, ino, 1007 error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
1020 XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); 1008 XFS_ILOCK_EXCL, &ip);
1021 if (error) 1009 if (error)
1022 return error; 1010 return error;
1023 ASSERT(ip != NULL); 1011 ASSERT(ip != NULL);
@@ -1166,6 +1154,7 @@ xfs_ialloc(
1166 /* 1154 /*
1167 * Log the new values stuffed into the inode. 1155 * Log the new values stuffed into the inode.
1168 */ 1156 */
1157 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
1169 xfs_trans_log_inode(tp, ip, flags); 1158 xfs_trans_log_inode(tp, ip, flags);
1170 1159
1171 /* now that we have an i_mode we can setup inode ops and unlock */ 1160 /* now that we have an i_mode we can setup inode ops and unlock */
@@ -1820,9 +1809,8 @@ xfs_iunlink_remove(
1820 */ 1809 */
1821 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1810 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1822 if (error) { 1811 if (error) {
1823 cmn_err(CE_WARN, 1812 xfs_warn(mp, "%s: xfs_itobp() returned error %d.",
1824 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1813 __func__, error);
1825 error, mp->m_fsname);
1826 return error; 1814 return error;
1827 } 1815 }
1828 next_agino = be32_to_cpu(dip->di_next_unlinked); 1816 next_agino = be32_to_cpu(dip->di_next_unlinked);
@@ -1867,9 +1855,9 @@ xfs_iunlink_remove(
1867 error = xfs_inotobp(mp, tp, next_ino, &last_dip, 1855 error = xfs_inotobp(mp, tp, next_ino, &last_dip,
1868 &last_ibp, &last_offset, 0); 1856 &last_ibp, &last_offset, 0);
1869 if (error) { 1857 if (error) {
1870 cmn_err(CE_WARN, 1858 xfs_warn(mp,
1871 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", 1859 "%s: xfs_inotobp() returned error %d.",
1872 error, mp->m_fsname); 1860 __func__, error);
1873 return error; 1861 return error;
1874 } 1862 }
1875 next_agino = be32_to_cpu(last_dip->di_next_unlinked); 1863 next_agino = be32_to_cpu(last_dip->di_next_unlinked);
@@ -1882,9 +1870,8 @@ xfs_iunlink_remove(
1882 */ 1870 */
1883 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1871 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1884 if (error) { 1872 if (error) {
1885 cmn_err(CE_WARN, 1873 xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
1886 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1874 __func__, error);
1887 error, mp->m_fsname);
1888 return error; 1875 return error;
1889 } 1876 }
1890 next_agino = be32_to_cpu(dip->di_next_unlinked); 1877 next_agino = be32_to_cpu(dip->di_next_unlinked);
@@ -2802,7 +2789,7 @@ xfs_iflush(
2802 2789
2803 /* 2790 /*
2804 * We can't flush the inode until it is unpinned, so wait for it if we 2791 * We can't flush the inode until it is unpinned, so wait for it if we
2805 * are allowed to block. We know noone new can pin it, because we are 2792 * are allowed to block. We know no one new can pin it, because we are
2806 * holding the inode lock shared and you need to hold it exclusively to 2793 * holding the inode lock shared and you need to hold it exclusively to
2807 * pin the inode. 2794 * pin the inode.
2808 * 2795 *
@@ -2848,7 +2835,7 @@ xfs_iflush(
2848 * Get the buffer containing the on-disk inode. 2835 * Get the buffer containing the on-disk inode.
2849 */ 2836 */
2850 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 2837 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
2851 (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK); 2838 (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK);
2852 if (error || !bp) { 2839 if (error || !bp) {
2853 xfs_ifunlock(ip); 2840 xfs_ifunlock(ip);
2854 return error; 2841 return error;
@@ -2939,16 +2926,16 @@ xfs_iflush_int(
2939 2926
2940 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, 2927 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
2941 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 2928 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
2942 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2929 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2943 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", 2930 "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
2944 ip->i_ino, be16_to_cpu(dip->di_magic), dip); 2931 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
2945 goto corrupt_out; 2932 goto corrupt_out;
2946 } 2933 }
2947 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 2934 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
2948 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { 2935 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
2949 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2936 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2950 "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x", 2937 "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
2951 ip->i_ino, ip, ip->i_d.di_magic); 2938 __func__, ip->i_ino, ip, ip->i_d.di_magic);
2952 goto corrupt_out; 2939 goto corrupt_out;
2953 } 2940 }
2954 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { 2941 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
@@ -2956,9 +2943,9 @@ xfs_iflush_int(
2956 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 2943 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
2957 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), 2944 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
2958 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { 2945 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
2959 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2946 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2960 "xfs_iflush: Bad regular inode %Lu, ptr 0x%p", 2947 "%s: Bad regular inode %Lu, ptr 0x%p",
2961 ip->i_ino, ip); 2948 __func__, ip->i_ino, ip);
2962 goto corrupt_out; 2949 goto corrupt_out;
2963 } 2950 }
2964 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { 2951 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
@@ -2967,28 +2954,28 @@ xfs_iflush_int(
2967 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 2954 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
2968 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), 2955 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
2969 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { 2956 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
2970 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2957 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2971 "xfs_iflush: Bad directory inode %Lu, ptr 0x%p", 2958 "%s: Bad directory inode %Lu, ptr 0x%p",
2972 ip->i_ino, ip); 2959 __func__, ip->i_ino, ip);
2973 goto corrupt_out; 2960 goto corrupt_out;
2974 } 2961 }
2975 } 2962 }
2976 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > 2963 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
2977 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, 2964 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
2978 XFS_RANDOM_IFLUSH_5)) { 2965 XFS_RANDOM_IFLUSH_5)) {
2979 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2966 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2980 "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p", 2967 "%s: detected corrupt incore inode %Lu, "
2981 ip->i_ino, 2968 "total extents = %d, nblocks = %Ld, ptr 0x%p",
2969 __func__, ip->i_ino,
2982 ip->i_d.di_nextents + ip->i_d.di_anextents, 2970 ip->i_d.di_nextents + ip->i_d.di_anextents,
2983 ip->i_d.di_nblocks, 2971 ip->i_d.di_nblocks, ip);
2984 ip);
2985 goto corrupt_out; 2972 goto corrupt_out;
2986 } 2973 }
2987 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, 2974 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
2988 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { 2975 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
2989 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2976 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2990 "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p", 2977 "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
2991 ip->i_ino, ip->i_d.di_forkoff, ip); 2978 __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
2992 goto corrupt_out; 2979 goto corrupt_out;
2993 } 2980 }
2994 /* 2981 /*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 5c95fa8ec11d..ff4e2a30227d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -111,7 +111,7 @@ struct xfs_imap {
111 * Generally, we do not want to hold the i_rlock while holding the 111 * Generally, we do not want to hold the i_rlock while holding the
112 * i_ilock. Hierarchy is i_iolock followed by i_rlock. 112 * i_ilock. Hierarchy is i_iolock followed by i_rlock.
113 * 113 *
114 * xfs_iptr_t contains all the inode fields upto and including the 114 * xfs_iptr_t contains all the inode fields up to and including the
115 * i_mnext and i_mprev fields, it is used as a marker in the inode 115 * i_mnext and i_mprev fields, it is used as a marker in the inode
116 * chain off the mount structure by xfs_sync calls. 116 * chain off the mount structure by xfs_sync calls.
117 */ 117 */
@@ -336,7 +336,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
336 336
337/* 337/*
338 * Project quota id helpers (previously projid was 16bit only 338 * Project quota id helpers (previously projid was 16bit only
339 * and using two 16bit values to hold new 32bit projid was choosen 339 * and using two 16bit values to hold new 32bit projid was chosen
340 * to retain compatibility with "old" filesystems). 340 * to retain compatibility with "old" filesystems).
341 */ 341 */
342static inline prid_t 342static inline prid_t
@@ -409,28 +409,35 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
409/* 409/*
410 * Flags for lockdep annotations. 410 * Flags for lockdep annotations.
411 * 411 *
412 * XFS_I[O]LOCK_PARENT - for operations that require locking two inodes 412 * XFS_LOCK_PARENT - for directory operations that require locking a
413 * (ie directory operations that require locking a directory inode and 413 * parent directory inode and a child entry inode. The parent gets locked
414 * an entry inode). The first inode gets locked with this flag so it 414 * with this flag so it gets a lockdep subclass of 1 and the child entry
415 * gets a lockdep subclass of 1 and the second lock will have a lockdep 415 * lock will have a lockdep subclass of 0.
416 * subclass of 0. 416 *
417 * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary
418 * inodes do not participate in the normal lock order, and thus have their
419 * own subclasses.
417 * 420 *
418 * XFS_LOCK_INUMORDER - for locking several inodes at the some time 421 * XFS_LOCK_INUMORDER - for locking several inodes at the some time
419 * with xfs_lock_inodes(). This flag is used as the starting subclass 422 * with xfs_lock_inodes(). This flag is used as the starting subclass
420 * and each subsequent lock acquired will increment the subclass by one. 423 * and each subsequent lock acquired will increment the subclass by one.
421 * So the first lock acquired will have a lockdep subclass of 2, the 424 * So the first lock acquired will have a lockdep subclass of 4, the
422 * second lock will have a lockdep subclass of 3, and so on. It is 425 * second lock will have a lockdep subclass of 5, and so on. It is
423 * the responsibility of the class builder to shift this to the correct 426 * the responsibility of the class builder to shift this to the correct
424 * portion of the lock_mode lockdep mask. 427 * portion of the lock_mode lockdep mask.
425 */ 428 */
426#define XFS_LOCK_PARENT 1 429#define XFS_LOCK_PARENT 1
427#define XFS_LOCK_INUMORDER 2 430#define XFS_LOCK_RTBITMAP 2
431#define XFS_LOCK_RTSUM 3
432#define XFS_LOCK_INUMORDER 4
428 433
429#define XFS_IOLOCK_SHIFT 16 434#define XFS_IOLOCK_SHIFT 16
430#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) 435#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
431 436
432#define XFS_ILOCK_SHIFT 24 437#define XFS_ILOCK_SHIFT 24
433#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) 438#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
439#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
440#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
434 441
435#define XFS_IOLOCK_DEP_MASK 0x00ff0000 442#define XFS_IOLOCK_DEP_MASK 0x00ff0000
436#define XFS_ILOCK_DEP_MASK 0xff000000 443#define XFS_ILOCK_DEP_MASK 0xff000000
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index fd4f398bd6f1..576fdfe81d60 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -198,6 +198,41 @@ xfs_inode_item_size(
198} 198}
199 199
200/* 200/*
201 * xfs_inode_item_format_extents - convert in-core extents to on-disk form
202 *
203 * For either the data or attr fork in extent format, we need to endian convert
204 * the in-core extent as we place them into the on-disk inode. In this case, we
205 * need to do this conversion before we write the extents into the log. Because
206 * we don't have the disk inode to write into here, we allocate a buffer and
207 * format the extents into it via xfs_iextents_copy(). We free the buffer in
208 * the unlock routine after the copy for the log has been made.
209 *
210 * In the case of the data fork, the in-core and on-disk fork sizes can be
211 * different due to delayed allocation extents. We only log on-disk extents
212 * here, so always use the physical fork size to determine the size of the
213 * buffer we need to allocate.
214 */
215STATIC void
216xfs_inode_item_format_extents(
217 struct xfs_inode *ip,
218 struct xfs_log_iovec *vecp,
219 int whichfork,
220 int type)
221{
222 xfs_bmbt_rec_t *ext_buffer;
223
224 ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP);
225 if (whichfork == XFS_DATA_FORK)
226 ip->i_itemp->ili_extents_buf = ext_buffer;
227 else
228 ip->i_itemp->ili_aextents_buf = ext_buffer;
229
230 vecp->i_addr = ext_buffer;
231 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork);
232 vecp->i_type = type;
233}
234
235/*
201 * This is called to fill in the vector of log iovecs for the 236 * This is called to fill in the vector of log iovecs for the
202 * given inode log item. It fills the first item with an inode 237 * given inode log item. It fills the first item with an inode
203 * log format structure, the second with the on-disk inode structure, 238 * log format structure, the second with the on-disk inode structure,
@@ -213,7 +248,6 @@ xfs_inode_item_format(
213 struct xfs_inode *ip = iip->ili_inode; 248 struct xfs_inode *ip = iip->ili_inode;
214 uint nvecs; 249 uint nvecs;
215 size_t data_bytes; 250 size_t data_bytes;
216 xfs_bmbt_rec_t *ext_buffer;
217 xfs_mount_t *mp; 251 xfs_mount_t *mp;
218 252
219 vecp->i_addr = &iip->ili_format; 253 vecp->i_addr = &iip->ili_format;
@@ -320,22 +354,8 @@ xfs_inode_item_format(
320 } else 354 } else
321#endif 355#endif
322 { 356 {
323 /* 357 xfs_inode_item_format_extents(ip, vecp,
324 * There are delayed allocation extents 358 XFS_DATA_FORK, XLOG_REG_TYPE_IEXT);
325 * in the inode, or we need to convert
326 * the extents to on disk format.
327 * Use xfs_iextents_copy()
328 * to copy only the real extents into
329 * a separate buffer. We'll free the
330 * buffer in the unlock routine.
331 */
332 ext_buffer = kmem_alloc(ip->i_df.if_bytes,
333 KM_SLEEP);
334 iip->ili_extents_buf = ext_buffer;
335 vecp->i_addr = ext_buffer;
336 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
337 XFS_DATA_FORK);
338 vecp->i_type = XLOG_REG_TYPE_IEXT;
339 } 359 }
340 ASSERT(vecp->i_len <= ip->i_df.if_bytes); 360 ASSERT(vecp->i_len <= ip->i_df.if_bytes);
341 iip->ili_format.ilf_dsize = vecp->i_len; 361 iip->ili_format.ilf_dsize = vecp->i_len;
@@ -445,19 +465,12 @@ xfs_inode_item_format(
445 */ 465 */
446 vecp->i_addr = ip->i_afp->if_u1.if_extents; 466 vecp->i_addr = ip->i_afp->if_u1.if_extents;
447 vecp->i_len = ip->i_afp->if_bytes; 467 vecp->i_len = ip->i_afp->if_bytes;
468 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
448#else 469#else
449 ASSERT(iip->ili_aextents_buf == NULL); 470 ASSERT(iip->ili_aextents_buf == NULL);
450 /* 471 xfs_inode_item_format_extents(ip, vecp,
451 * Need to endian flip before logging 472 XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT);
452 */
453 ext_buffer = kmem_alloc(ip->i_afp->if_bytes,
454 KM_SLEEP);
455 iip->ili_aextents_buf = ext_buffer;
456 vecp->i_addr = ext_buffer;
457 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
458 XFS_ATTR_FORK);
459#endif 473#endif
460 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
461 iip->ili_format.ilf_asize = vecp->i_len; 474 iip->ili_format.ilf_asize = vecp->i_len;
462 vecp++; 475 vecp++;
463 nvecs++; 476 nvecs++;
@@ -760,11 +773,11 @@ xfs_inode_item_push(
760 * Push the inode to it's backing buffer. This will not remove the 773 * Push the inode to it's backing buffer. This will not remove the
761 * inode from the AIL - a further push will be required to trigger a 774 * inode from the AIL - a further push will be required to trigger a
762 * buffer push. However, this allows all the dirty inodes to be pushed 775 * buffer push. However, this allows all the dirty inodes to be pushed
763 * to the buffer before it is pushed to disk. THe buffer IO completion 776 * to the buffer before it is pushed to disk. The buffer IO completion
764 * will pull th einode from the AIL, mark it clean and unlock the flush 777 * will pull the inode from the AIL, mark it clean and unlock the flush
765 * lock. 778 * lock.
766 */ 779 */
767 (void) xfs_iflush(ip, 0); 780 (void) xfs_iflush(ip, SYNC_TRYLOCK);
768 xfs_iunlock(ip, XFS_ILOCK_SHARED); 781 xfs_iunlock(ip, XFS_ILOCK_SHARED);
769} 782}
770 783
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 8a0f044750c3..091d82b94c4d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -101,11 +101,11 @@ xfs_iomap_eof_align_last_fsb(
101} 101}
102 102
103STATIC int 103STATIC int
104xfs_cmn_err_fsblock_zero( 104xfs_alert_fsblock_zero(
105 xfs_inode_t *ip, 105 xfs_inode_t *ip,
106 xfs_bmbt_irec_t *imap) 106 xfs_bmbt_irec_t *imap)
107{ 107{
108 xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, 108 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
109 "Access to block zero in inode %llu " 109 "Access to block zero in inode %llu "
110 "start_block: %llx start_off: %llx " 110 "start_block: %llx start_off: %llx "
111 "blkcnt: %llx extent-state: %x\n", 111 "blkcnt: %llx extent-state: %x\n",
@@ -246,7 +246,7 @@ xfs_iomap_write_direct(
246 } 246 }
247 247
248 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { 248 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
249 error = xfs_cmn_err_fsblock_zero(ip, imap); 249 error = xfs_alert_fsblock_zero(ip, imap);
250 goto error_out; 250 goto error_out;
251 } 251 }
252 252
@@ -464,7 +464,7 @@ retry:
464 } 464 }
465 465
466 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) 466 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
467 return xfs_cmn_err_fsblock_zero(ip, &imap[0]); 467 return xfs_alert_fsblock_zero(ip, &imap[0]);
468 468
469 *ret_imap = imap[0]; 469 *ret_imap = imap[0];
470 return 0; 470 return 0;
@@ -614,7 +614,7 @@ xfs_iomap_write_allocate(
614 * covers at least part of the callers request 614 * covers at least part of the callers request
615 */ 615 */
616 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) 616 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
617 return xfs_cmn_err_fsblock_zero(ip, imap); 617 return xfs_alert_fsblock_zero(ip, imap);
618 618
619 if ((offset_fsb >= imap->br_startoff) && 619 if ((offset_fsb >= imap->br_startoff) &&
620 (offset_fsb < (imap->br_startoff + 620 (offset_fsb < (imap->br_startoff +
@@ -724,7 +724,7 @@ xfs_iomap_write_unwritten(
724 return XFS_ERROR(error); 724 return XFS_ERROR(error);
725 725
726 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) 726 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
727 return xfs_cmn_err_fsblock_zero(ip, &imap); 727 return xfs_alert_fsblock_zero(ip, &imap);
728 728
729 if ((numblks_fsb = imap.br_blockcount) == 0) { 729 if ((numblks_fsb = imap.br_blockcount) == 0) {
730 /* 730 /*
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index dc1882adaf54..751e94fe1f77 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -204,7 +204,6 @@ xfs_bulkstat(
204 xfs_agi_t *agi; /* agi header data */ 204 xfs_agi_t *agi; /* agi header data */
205 xfs_agino_t agino; /* inode # in allocation group */ 205 xfs_agino_t agino; /* inode # in allocation group */
206 xfs_agnumber_t agno; /* allocation group number */ 206 xfs_agnumber_t agno; /* allocation group number */
207 xfs_daddr_t bno; /* inode cluster start daddr */
208 int chunkidx; /* current index into inode chunk */ 207 int chunkidx; /* current index into inode chunk */
209 int clustidx; /* current index into inode cluster */ 208 int clustidx; /* current index into inode cluster */
210 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ 209 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
@@ -463,7 +462,6 @@ xfs_bulkstat(
463 mp->m_sb.sb_inopblog); 462 mp->m_sb.sb_inopblog);
464 } 463 }
465 ino = XFS_AGINO_TO_INO(mp, agno, agino); 464 ino = XFS_AGINO_TO_INO(mp, agno, agino);
466 bno = XFS_AGB_TO_DADDR(mp, agno, agbno);
467 /* 465 /*
468 * Skip if this inode is free. 466 * Skip if this inode is free.
469 */ 467 */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ae6fef1ff563..b612ce4520ae 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -374,11 +374,10 @@ xfs_log_mount(
374 int error; 374 int error;
375 375
376 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 376 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
377 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); 377 xfs_notice(mp, "Mounting Filesystem");
378 else { 378 else {
379 cmn_err(CE_NOTE, 379 xfs_notice(mp,
380 "Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", 380"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent.");
381 mp->m_fsname);
382 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 381 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
383 } 382 }
384 383
@@ -393,7 +392,7 @@ xfs_log_mount(
393 */ 392 */
394 error = xfs_trans_ail_init(mp); 393 error = xfs_trans_ail_init(mp);
395 if (error) { 394 if (error) {
396 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); 395 xfs_warn(mp, "AIL initialisation failed: error %d", error);
397 goto out_free_log; 396 goto out_free_log;
398 } 397 }
399 mp->m_log->l_ailp = mp->m_ail; 398 mp->m_log->l_ailp = mp->m_ail;
@@ -413,7 +412,8 @@ xfs_log_mount(
413 if (readonly) 412 if (readonly)
414 mp->m_flags |= XFS_MOUNT_RDONLY; 413 mp->m_flags |= XFS_MOUNT_RDONLY;
415 if (error) { 414 if (error) {
416 cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error); 415 xfs_warn(mp, "log mount/recovery failed: error %d",
416 error);
417 goto out_destroy_ail; 417 goto out_destroy_ail;
418 } 418 }
419 } 419 }
@@ -542,10 +542,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
542 */ 542 */
543 } 543 }
544 544
545 if (error) { 545 if (error)
546 xfs_fs_cmn_err(CE_ALERT, mp, 546 xfs_alert(mp, "%s: unmount record failed", __func__);
547 "xfs_log_unmount: unmount record failed");
548 }
549 547
550 548
551 spin_lock(&log->l_icloglock); 549 spin_lock(&log->l_icloglock);
@@ -763,7 +761,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
763 break; 761 break;
764 case XLOG_STATE_COVER_NEED: 762 case XLOG_STATE_COVER_NEED:
765 case XLOG_STATE_COVER_NEED2: 763 case XLOG_STATE_COVER_NEED2:
766 if (!xfs_trans_ail_tail(log->l_ailp) && 764 if (!xfs_ail_min_lsn(log->l_ailp) &&
767 xlog_iclogs_empty(log)) { 765 xlog_iclogs_empty(log)) {
768 if (log->l_covered_state == XLOG_STATE_COVER_NEED) 766 if (log->l_covered_state == XLOG_STATE_COVER_NEED)
769 log->l_covered_state = XLOG_STATE_COVER_DONE; 767 log->l_covered_state = XLOG_STATE_COVER_DONE;
@@ -803,7 +801,7 @@ xlog_assign_tail_lsn(
803 xfs_lsn_t tail_lsn; 801 xfs_lsn_t tail_lsn;
804 struct log *log = mp->m_log; 802 struct log *log = mp->m_log;
805 803
806 tail_lsn = xfs_trans_ail_tail(mp->m_ail); 804 tail_lsn = xfs_ail_min_lsn(mp->m_ail);
807 if (!tail_lsn) 805 if (!tail_lsn)
808 tail_lsn = atomic64_read(&log->l_last_sync_lsn); 806 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
809 807
@@ -852,7 +850,7 @@ xlog_space_left(
852 * In this case we just want to return the size of the 850 * In this case we just want to return the size of the
853 * log as the amount of space left. 851 * log as the amount of space left.
854 */ 852 */
855 xfs_fs_cmn_err(CE_ALERT, log->l_mp, 853 xfs_alert(log->l_mp,
856 "xlog_space_left: head behind tail\n" 854 "xlog_space_left: head behind tail\n"
857 " tail_cycle = %d, tail_bytes = %d\n" 855 " tail_cycle = %d, tail_bytes = %d\n"
858 " GH cycle = %d, GH bytes = %d", 856 " GH cycle = %d, GH bytes = %d",
@@ -1001,7 +999,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1001 999
1002 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); 1000 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
1003 if (!log) { 1001 if (!log) {
1004 xlog_warn("XFS: Log allocation failed: No memory!"); 1002 xfs_warn(mp, "Log allocation failed: No memory!");
1005 goto out; 1003 goto out;
1006 } 1004 }
1007 1005
@@ -1029,24 +1027,24 @@ xlog_alloc_log(xfs_mount_t *mp,
1029 if (xfs_sb_version_hassector(&mp->m_sb)) { 1027 if (xfs_sb_version_hassector(&mp->m_sb)) {
1030 log2_size = mp->m_sb.sb_logsectlog; 1028 log2_size = mp->m_sb.sb_logsectlog;
1031 if (log2_size < BBSHIFT) { 1029 if (log2_size < BBSHIFT) {
1032 xlog_warn("XFS: Log sector size too small " 1030 xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
1033 "(0x%x < 0x%x)", log2_size, BBSHIFT); 1031 log2_size, BBSHIFT);
1034 goto out_free_log; 1032 goto out_free_log;
1035 } 1033 }
1036 1034
1037 log2_size -= BBSHIFT; 1035 log2_size -= BBSHIFT;
1038 if (log2_size > mp->m_sectbb_log) { 1036 if (log2_size > mp->m_sectbb_log) {
1039 xlog_warn("XFS: Log sector size too large " 1037 xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)",
1040 "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log); 1038 log2_size, mp->m_sectbb_log);
1041 goto out_free_log; 1039 goto out_free_log;
1042 } 1040 }
1043 1041
1044 /* for larger sector sizes, must have v2 or external log */ 1042 /* for larger sector sizes, must have v2 or external log */
1045 if (log2_size && log->l_logBBstart > 0 && 1043 if (log2_size && log->l_logBBstart > 0 &&
1046 !xfs_sb_version_haslogv2(&mp->m_sb)) { 1044 !xfs_sb_version_haslogv2(&mp->m_sb)) {
1047 1045 xfs_warn(mp,
1048 xlog_warn("XFS: log sector size (0x%x) invalid " 1046 "log sector size (0x%x) invalid for configuration.",
1049 "for configuration.", log2_size); 1047 log2_size);
1050 goto out_free_log; 1048 goto out_free_log;
1051 } 1049 }
1052 } 1050 }
@@ -1241,7 +1239,7 @@ xlog_grant_push_ail(
1241 * the filesystem is shutting down. 1239 * the filesystem is shutting down.
1242 */ 1240 */
1243 if (!XLOG_FORCED_SHUTDOWN(log)) 1241 if (!XLOG_FORCED_SHUTDOWN(log))
1244 xfs_trans_ail_push(log->l_ailp, threshold_lsn); 1242 xfs_ail_push(log->l_ailp, threshold_lsn);
1245} 1243}
1246 1244
1247/* 1245/*
@@ -1563,38 +1561,36 @@ xlog_print_tic_res(
1563 "SWAPEXT" 1561 "SWAPEXT"
1564 }; 1562 };
1565 1563
1566 xfs_fs_cmn_err(CE_WARN, mp, 1564 xfs_warn(mp,
1567 "xfs_log_write: reservation summary:\n" 1565 "xfs_log_write: reservation summary:\n"
1568 " trans type = %s (%u)\n" 1566 " trans type = %s (%u)\n"
1569 " unit res = %d bytes\n" 1567 " unit res = %d bytes\n"
1570 " current res = %d bytes\n" 1568 " current res = %d bytes\n"
1571 " total reg = %u bytes (o/flow = %u bytes)\n" 1569 " total reg = %u bytes (o/flow = %u bytes)\n"
1572 " ophdrs = %u (ophdr space = %u bytes)\n" 1570 " ophdrs = %u (ophdr space = %u bytes)\n"
1573 " ophdr + reg = %u bytes\n" 1571 " ophdr + reg = %u bytes\n"
1574 " num regions = %u\n", 1572 " num regions = %u\n",
1575 ((ticket->t_trans_type <= 0 || 1573 ((ticket->t_trans_type <= 0 ||
1576 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? 1574 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
1577 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), 1575 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
1578 ticket->t_trans_type, 1576 ticket->t_trans_type,
1579 ticket->t_unit_res, 1577 ticket->t_unit_res,
1580 ticket->t_curr_res, 1578 ticket->t_curr_res,
1581 ticket->t_res_arr_sum, ticket->t_res_o_flow, 1579 ticket->t_res_arr_sum, ticket->t_res_o_flow,
1582 ticket->t_res_num_ophdrs, ophdr_spc, 1580 ticket->t_res_num_ophdrs, ophdr_spc,
1583 ticket->t_res_arr_sum + 1581 ticket->t_res_arr_sum +
1584 ticket->t_res_o_flow + ophdr_spc, 1582 ticket->t_res_o_flow + ophdr_spc,
1585 ticket->t_res_num); 1583 ticket->t_res_num);
1586 1584
1587 for (i = 0; i < ticket->t_res_num; i++) { 1585 for (i = 0; i < ticket->t_res_num; i++) {
1588 uint r_type = ticket->t_res_arr[i].r_type; 1586 uint r_type = ticket->t_res_arr[i].r_type;
1589 cmn_err(CE_WARN, 1587 xfs_warn(mp, "region[%u]: %s - %u bytes\n", i,
1590 "region[%u]: %s - %u bytes\n",
1591 i,
1592 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? 1588 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
1593 "bad-rtype" : res_type_str[r_type-1]), 1589 "bad-rtype" : res_type_str[r_type-1]),
1594 ticket->t_res_arr[i].r_len); 1590 ticket->t_res_arr[i].r_len);
1595 } 1591 }
1596 1592
1597 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, 1593 xfs_alert_tag(mp, XFS_PTAG_LOGRES,
1598 "xfs_log_write: reservation ran out. Need to up reservation"); 1594 "xfs_log_write: reservation ran out. Need to up reservation");
1599 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1595 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1600} 1596}
@@ -1682,7 +1678,7 @@ xlog_write_setup_ophdr(
1682 case XFS_LOG: 1678 case XFS_LOG:
1683 break; 1679 break;
1684 default: 1680 default:
1685 xfs_fs_cmn_err(CE_WARN, log->l_mp, 1681 xfs_warn(log->l_mp,
1686 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 1682 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1687 ophdr->oh_clientid, ticket); 1683 ophdr->oh_clientid, ticket);
1688 return NULL; 1684 return NULL;
@@ -2264,7 +2260,7 @@ xlog_state_do_callback(
2264 if (repeats > 5000) { 2260 if (repeats > 5000) {
2265 flushcnt += repeats; 2261 flushcnt += repeats;
2266 repeats = 0; 2262 repeats = 0;
2267 xfs_fs_cmn_err(CE_WARN, log->l_mp, 2263 xfs_warn(log->l_mp,
2268 "%s: possible infinite loop (%d iterations)", 2264 "%s: possible infinite loop (%d iterations)",
2269 __func__, flushcnt); 2265 __func__, flushcnt);
2270 } 2266 }
@@ -3052,10 +3048,8 @@ xfs_log_force(
3052 int error; 3048 int error;
3053 3049
3054 error = _xfs_log_force(mp, flags, NULL); 3050 error = _xfs_log_force(mp, flags, NULL);
3055 if (error) { 3051 if (error)
3056 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " 3052 xfs_warn(mp, "%s: error %d returned.", __func__, error);
3057 "error %d returned.", error);
3058 }
3059} 3053}
3060 3054
3061/* 3055/*
@@ -3204,10 +3198,8 @@ xfs_log_force_lsn(
3204 int error; 3198 int error;
3205 3199
3206 error = _xfs_log_force_lsn(mp, lsn, flags, NULL); 3200 error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
3207 if (error) { 3201 if (error)
3208 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " 3202 xfs_warn(mp, "%s: error %d returned.", __func__, error);
3209 "error %d returned.", error);
3210 }
3211} 3203}
3212 3204
3213/* 3205/*
@@ -3412,9 +3404,20 @@ xlog_verify_dest_ptr(
3412 } 3404 }
3413 3405
3414 if (!good_ptr) 3406 if (!good_ptr)
3415 xlog_panic("xlog_verify_dest_ptr: invalid ptr"); 3407 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3416} 3408}
3417 3409
3410/*
3411 * Check to make sure the grant write head didn't just over lap the tail. If
3412 * the cycles are the same, we can't be overlapping. Otherwise, make sure that
3413 * the cycles differ by exactly one and check the byte count.
3414 *
3415 * This check is run unlocked, so can give false positives. Rather than assert
3416 * on failures, use a warn-once flag and a panic tag to allow the admin to
3417 * determine if they want to panic the machine when such an error occurs. For
3418 * debug kernels this will have the same effect as using an assert but, unlinke
3419 * an assert, it can be turned off at runtime.
3420 */
3418STATIC void 3421STATIC void
3419xlog_verify_grant_tail( 3422xlog_verify_grant_tail(
3420 struct log *log) 3423 struct log *log)
@@ -3422,17 +3425,22 @@ xlog_verify_grant_tail(
3422 int tail_cycle, tail_blocks; 3425 int tail_cycle, tail_blocks;
3423 int cycle, space; 3426 int cycle, space;
3424 3427
3425 /*
3426 * Check to make sure the grant write head didn't just over lap the
3427 * tail. If the cycles are the same, we can't be overlapping.
3428 * Otherwise, make sure that the cycles differ by exactly one and
3429 * check the byte count.
3430 */
3431 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space); 3428 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
3432 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); 3429 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
3433 if (tail_cycle != cycle) { 3430 if (tail_cycle != cycle) {
3434 ASSERT(cycle - 1 == tail_cycle); 3431 if (cycle - 1 != tail_cycle &&
3435 ASSERT(space <= BBTOB(tail_blocks)); 3432 !(log->l_flags & XLOG_TAIL_WARN)) {
3433 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3434 "%s: cycle - 1 != tail_cycle", __func__);
3435 log->l_flags |= XLOG_TAIL_WARN;
3436 }
3437
3438 if (space > BBTOB(tail_blocks) &&
3439 !(log->l_flags & XLOG_TAIL_WARN)) {
3440 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3441 "%s: space > BBTOB(tail_blocks)", __func__);
3442 log->l_flags |= XLOG_TAIL_WARN;
3443 }
3436 } 3444 }
3437} 3445}
3438 3446
@@ -3448,16 +3456,16 @@ xlog_verify_tail_lsn(xlog_t *log,
3448 blocks = 3456 blocks =
3449 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); 3457 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
3450 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) 3458 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
3451 xlog_panic("xlog_verify_tail_lsn: ran out of log space"); 3459 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3452 } else { 3460 } else {
3453 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); 3461 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
3454 3462
3455 if (BLOCK_LSN(tail_lsn) == log->l_prev_block) 3463 if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
3456 xlog_panic("xlog_verify_tail_lsn: tail wrapped"); 3464 xfs_emerg(log->l_mp, "%s: tail wrapped", __func__);
3457 3465
3458 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; 3466 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
3459 if (blocks < BTOBB(iclog->ic_offset) + 1) 3467 if (blocks < BTOBB(iclog->ic_offset) + 1)
3460 xlog_panic("xlog_verify_tail_lsn: ran out of log space"); 3468 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3461 } 3469 }
3462} /* xlog_verify_tail_lsn */ 3470} /* xlog_verify_tail_lsn */
3463 3471
@@ -3497,22 +3505,23 @@ xlog_verify_iclog(xlog_t *log,
3497 icptr = log->l_iclog; 3505 icptr = log->l_iclog;
3498 for (i=0; i < log->l_iclog_bufs; i++) { 3506 for (i=0; i < log->l_iclog_bufs; i++) {
3499 if (icptr == NULL) 3507 if (icptr == NULL)
3500 xlog_panic("xlog_verify_iclog: invalid ptr"); 3508 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3501 icptr = icptr->ic_next; 3509 icptr = icptr->ic_next;
3502 } 3510 }
3503 if (icptr != log->l_iclog) 3511 if (icptr != log->l_iclog)
3504 xlog_panic("xlog_verify_iclog: corrupt iclog ring"); 3512 xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
3505 spin_unlock(&log->l_icloglock); 3513 spin_unlock(&log->l_icloglock);
3506 3514
3507 /* check log magic numbers */ 3515 /* check log magic numbers */
3508 if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM) 3516 if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM)
3509 xlog_panic("xlog_verify_iclog: invalid magic num"); 3517 xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
3510 3518
3511 ptr = (xfs_caddr_t) &iclog->ic_header; 3519 ptr = (xfs_caddr_t) &iclog->ic_header;
3512 for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; 3520 for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count;
3513 ptr += BBSIZE) { 3521 ptr += BBSIZE) {
3514 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) 3522 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
3515 xlog_panic("xlog_verify_iclog: unexpected magic num"); 3523 xfs_emerg(log->l_mp, "%s: unexpected magic num",
3524 __func__);
3516 } 3525 }
3517 3526
3518 /* check fields */ 3527 /* check fields */
@@ -3542,9 +3551,10 @@ xlog_verify_iclog(xlog_t *log,
3542 } 3551 }
3543 } 3552 }
3544 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) 3553 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
3545 cmn_err(CE_WARN, "xlog_verify_iclog: " 3554 xfs_warn(log->l_mp,
3546 "invalid clientid %d op 0x%p offset 0x%lx", 3555 "%s: invalid clientid %d op 0x%p offset 0x%lx",
3547 clientid, ophead, (unsigned long)field_offset); 3556 __func__, clientid, ophead,
3557 (unsigned long)field_offset);
3548 3558
3549 /* check length */ 3559 /* check length */
3550 field_offset = (__psint_t) 3560 field_offset = (__psint_t)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index d5f8be8f4bf6..5864850e9e34 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -87,10 +87,6 @@ static inline uint xlog_get_client_id(__be32 i)
87 return be32_to_cpu(i) >> 24; 87 return be32_to_cpu(i) >> 24;
88} 88}
89 89
90#define xlog_panic(args...) cmn_err(CE_PANIC, ## args)
91#define xlog_exit(args...) cmn_err(CE_PANIC, ## args)
92#define xlog_warn(args...) cmn_err(CE_WARN, ## args)
93
94/* 90/*
95 * In core log state 91 * In core log state
96 */ 92 */
@@ -148,6 +144,7 @@ static inline uint xlog_get_client_id(__be32 i)
148#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 144#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
149#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 145#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
150 shutdown */ 146 shutdown */
147#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
151 148
152#ifdef __KERNEL__ 149#ifdef __KERNEL__
153/* 150/*
@@ -574,7 +571,7 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
574 * When we crack an atomic LSN, we sample it first so that the value will not 571 * When we crack an atomic LSN, we sample it first so that the value will not
575 * change while we are cracking it into the component values. This means we 572 * change while we are cracking it into the component values. This means we
576 * will always get consistent component values to work from. This should always 573 * will always get consistent component values to work from. This should always
577 * be used to smaple and crack LSNs taht are stored and updated in atomic 574 * be used to sample and crack LSNs that are stored and updated in atomic
578 * variables. 575 * variables.
579 */ 576 */
580static inline void 577static inline void
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index aa0ebb776903..5cc464a17c93 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -92,7 +92,7 @@ xlog_get_bp(
92 int nbblks) 92 int nbblks)
93{ 93{
94 if (!xlog_buf_bbcount_valid(log, nbblks)) { 94 if (!xlog_buf_bbcount_valid(log, nbblks)) {
95 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 95 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
96 nbblks); 96 nbblks);
97 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 97 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
98 return NULL; 98 return NULL;
@@ -101,7 +101,7 @@ xlog_get_bp(
101 /* 101 /*
102 * We do log I/O in units of log sectors (a power-of-2 102 * We do log I/O in units of log sectors (a power-of-2
103 * multiple of the basic block size), so we round up the 103 * multiple of the basic block size), so we round up the
104 * requested size to acommodate the basic blocks required 104 * requested size to accommodate the basic blocks required
105 * for complete log sectors. 105 * for complete log sectors.
106 * 106 *
107 * In addition, the buffer may be used for a non-sector- 107 * In addition, the buffer may be used for a non-sector-
@@ -112,7 +112,7 @@ xlog_get_bp(
112 * an issue. Nor will this be a problem if the log I/O is 112 * an issue. Nor will this be a problem if the log I/O is
113 * done in basic blocks (sector size 1). But otherwise we 113 * done in basic blocks (sector size 1). But otherwise we
114 * extend the buffer by one extra log sector to ensure 114 * extend the buffer by one extra log sector to ensure
115 * there's space to accomodate this possiblility. 115 * there's space to accommodate this possibility.
116 */ 116 */
117 if (nbblks > 1 && log->l_sectBBsize > 1) 117 if (nbblks > 1 && log->l_sectBBsize > 1)
118 nbblks += log->l_sectBBsize; 118 nbblks += log->l_sectBBsize;
@@ -160,7 +160,7 @@ xlog_bread_noalign(
160 int error; 160 int error;
161 161
162 if (!xlog_buf_bbcount_valid(log, nbblks)) { 162 if (!xlog_buf_bbcount_valid(log, nbblks)) {
163 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 163 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
164 nbblks); 164 nbblks);
165 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 165 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
166 return EFSCORRUPTED; 166 return EFSCORRUPTED;
@@ -219,7 +219,7 @@ xlog_bwrite(
219 int error; 219 int error;
220 220
221 if (!xlog_buf_bbcount_valid(log, nbblks)) { 221 if (!xlog_buf_bbcount_valid(log, nbblks)) {
222 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 222 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
223 nbblks); 223 nbblks);
224 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 224 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
225 return EFSCORRUPTED; 225 return EFSCORRUPTED;
@@ -254,9 +254,9 @@ xlog_header_check_dump(
254 xfs_mount_t *mp, 254 xfs_mount_t *mp,
255 xlog_rec_header_t *head) 255 xlog_rec_header_t *head)
256{ 256{
257 cmn_err(CE_DEBUG, "%s: SB : uuid = %pU, fmt = %d\n", 257 xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n",
258 __func__, &mp->m_sb.sb_uuid, XLOG_FMT); 258 __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
259 cmn_err(CE_DEBUG, " log : uuid = %pU, fmt = %d\n", 259 xfs_debug(mp, " log : uuid = %pU, fmt = %d\n",
260 &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); 260 &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
261} 261}
262#else 262#else
@@ -279,15 +279,15 @@ xlog_header_check_recover(
279 * a dirty log created in IRIX. 279 * a dirty log created in IRIX.
280 */ 280 */
281 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { 281 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) {
282 xlog_warn( 282 xfs_warn(mp,
283 "XFS: dirty log written in incompatible format - can't recover"); 283 "dirty log written in incompatible format - can't recover");
284 xlog_header_check_dump(mp, head); 284 xlog_header_check_dump(mp, head);
285 XFS_ERROR_REPORT("xlog_header_check_recover(1)", 285 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
286 XFS_ERRLEVEL_HIGH, mp); 286 XFS_ERRLEVEL_HIGH, mp);
287 return XFS_ERROR(EFSCORRUPTED); 287 return XFS_ERROR(EFSCORRUPTED);
288 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 288 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
289 xlog_warn( 289 xfs_warn(mp,
290 "XFS: dirty log entry has mismatched uuid - can't recover"); 290 "dirty log entry has mismatched uuid - can't recover");
291 xlog_header_check_dump(mp, head); 291 xlog_header_check_dump(mp, head);
292 XFS_ERROR_REPORT("xlog_header_check_recover(2)", 292 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
293 XFS_ERRLEVEL_HIGH, mp); 293 XFS_ERRLEVEL_HIGH, mp);
@@ -312,9 +312,9 @@ xlog_header_check_mount(
312 * h_fs_uuid is nil, we assume this log was last mounted 312 * h_fs_uuid is nil, we assume this log was last mounted
313 * by IRIX and continue. 313 * by IRIX and continue.
314 */ 314 */
315 xlog_warn("XFS: nil uuid in log - IRIX style log"); 315 xfs_warn(mp, "nil uuid in log - IRIX style log");
316 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 316 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
317 xlog_warn("XFS: log has mismatched uuid - can't recover"); 317 xfs_warn(mp, "log has mismatched uuid - can't recover");
318 xlog_header_check_dump(mp, head); 318 xlog_header_check_dump(mp, head);
319 XFS_ERROR_REPORT("xlog_header_check_mount", 319 XFS_ERROR_REPORT("xlog_header_check_mount",
320 XFS_ERRLEVEL_HIGH, mp); 320 XFS_ERRLEVEL_HIGH, mp);
@@ -490,8 +490,8 @@ xlog_find_verify_log_record(
490 for (i = (*last_blk) - 1; i >= 0; i--) { 490 for (i = (*last_blk) - 1; i >= 0; i--) {
491 if (i < start_blk) { 491 if (i < start_blk) {
492 /* valid log record not found */ 492 /* valid log record not found */
493 xlog_warn( 493 xfs_warn(log->l_mp,
494 "XFS: Log inconsistent (didn't find previous header)"); 494 "Log inconsistent (didn't find previous header)");
495 ASSERT(0); 495 ASSERT(0);
496 error = XFS_ERROR(EIO); 496 error = XFS_ERROR(EIO);
497 goto out; 497 goto out;
@@ -591,12 +591,12 @@ xlog_find_head(
591 * mkfs etc write a dummy unmount record to a fresh 591 * mkfs etc write a dummy unmount record to a fresh
592 * log so we can store the uuid in there 592 * log so we can store the uuid in there
593 */ 593 */
594 xlog_warn("XFS: totally zeroed log"); 594 xfs_warn(log->l_mp, "totally zeroed log");
595 } 595 }
596 596
597 return 0; 597 return 0;
598 } else if (error) { 598 } else if (error) {
599 xlog_warn("XFS: empty log check failed"); 599 xfs_warn(log->l_mp, "empty log check failed");
600 return error; 600 return error;
601 } 601 }
602 602
@@ -819,7 +819,7 @@ validate_head:
819 xlog_put_bp(bp); 819 xlog_put_bp(bp);
820 820
821 if (error) 821 if (error)
822 xlog_warn("XFS: failed to find log head"); 822 xfs_warn(log->l_mp, "failed to find log head");
823 return error; 823 return error;
824} 824}
825 825
@@ -912,7 +912,7 @@ xlog_find_tail(
912 } 912 }
913 } 913 }
914 if (!found) { 914 if (!found) {
915 xlog_warn("XFS: xlog_find_tail: couldn't find sync record"); 915 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
916 ASSERT(0); 916 ASSERT(0);
917 return XFS_ERROR(EIO); 917 return XFS_ERROR(EIO);
918 } 918 }
@@ -1028,7 +1028,7 @@ done:
1028 xlog_put_bp(bp); 1028 xlog_put_bp(bp);
1029 1029
1030 if (error) 1030 if (error)
1031 xlog_warn("XFS: failed to locate log tail"); 1031 xfs_warn(log->l_mp, "failed to locate log tail");
1032 return error; 1032 return error;
1033} 1033}
1034 1034
@@ -1092,7 +1092,8 @@ xlog_find_zeroed(
1092 * the first block must be 1. If it's not, maybe we're 1092 * the first block must be 1. If it's not, maybe we're
1093 * not looking at a log... Bail out. 1093 * not looking at a log... Bail out.
1094 */ 1094 */
1095 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)"); 1095 xfs_warn(log->l_mp,
1096 "Log inconsistent or not a log (last==0, first!=1)");
1096 return XFS_ERROR(EINVAL); 1097 return XFS_ERROR(EINVAL);
1097 } 1098 }
1098 1099
@@ -1506,8 +1507,8 @@ xlog_recover_add_to_trans(
1506 if (list_empty(&trans->r_itemq)) { 1507 if (list_empty(&trans->r_itemq)) {
1507 /* we need to catch log corruptions here */ 1508 /* we need to catch log corruptions here */
1508 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 1509 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1509 xlog_warn("XFS: xlog_recover_add_to_trans: " 1510 xfs_warn(log->l_mp, "%s: bad header magic number",
1510 "bad header magic number"); 1511 __func__);
1511 ASSERT(0); 1512 ASSERT(0);
1512 return XFS_ERROR(EIO); 1513 return XFS_ERROR(EIO);
1513 } 1514 }
@@ -1534,8 +1535,8 @@ xlog_recover_add_to_trans(
1534 if (item->ri_total == 0) { /* first region to be added */ 1535 if (item->ri_total == 0) { /* first region to be added */
1535 if (in_f->ilf_size == 0 || 1536 if (in_f->ilf_size == 0 ||
1536 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { 1537 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1537 xlog_warn( 1538 xfs_warn(log->l_mp,
1538 "XFS: bad number of regions (%d) in inode log format", 1539 "bad number of regions (%d) in inode log format",
1539 in_f->ilf_size); 1540 in_f->ilf_size);
1540 ASSERT(0); 1541 ASSERT(0);
1541 return XFS_ERROR(EIO); 1542 return XFS_ERROR(EIO);
@@ -1592,8 +1593,9 @@ xlog_recover_reorder_trans(
1592 list_move_tail(&item->ri_list, &trans->r_itemq); 1593 list_move_tail(&item->ri_list, &trans->r_itemq);
1593 break; 1594 break;
1594 default: 1595 default:
1595 xlog_warn( 1596 xfs_warn(log->l_mp,
1596 "XFS: xlog_recover_reorder_trans: unrecognized type of log operation"); 1597 "%s: unrecognized type of log operation",
1598 __func__);
1597 ASSERT(0); 1599 ASSERT(0);
1598 return XFS_ERROR(EIO); 1600 return XFS_ERROR(EIO);
1599 } 1601 }
@@ -1803,8 +1805,9 @@ xlog_recover_do_inode_buffer(
1803 logged_nextp = item->ri_buf[item_index].i_addr + 1805 logged_nextp = item->ri_buf[item_index].i_addr +
1804 next_unlinked_offset - reg_buf_offset; 1806 next_unlinked_offset - reg_buf_offset;
1805 if (unlikely(*logged_nextp == 0)) { 1807 if (unlikely(*logged_nextp == 0)) {
1806 xfs_fs_cmn_err(CE_ALERT, mp, 1808 xfs_alert(mp,
1807 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", 1809 "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
1810 "Trying to replay bad (0) inode di_next_unlinked field.",
1808 item, bp); 1811 item, bp);
1809 XFS_ERROR_REPORT("xlog_recover_do_inode_buf", 1812 XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1810 XFS_ERRLEVEL_LOW, mp); 1813 XFS_ERRLEVEL_LOW, mp);
@@ -1863,17 +1866,17 @@ xlog_recover_do_reg_buffer(
1863 if (buf_f->blf_flags & 1866 if (buf_f->blf_flags &
1864 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 1867 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1865 if (item->ri_buf[i].i_addr == NULL) { 1868 if (item->ri_buf[i].i_addr == NULL) {
1866 cmn_err(CE_ALERT, 1869 xfs_alert(mp,
1867 "XFS: NULL dquot in %s.", __func__); 1870 "XFS: NULL dquot in %s.", __func__);
1868 goto next; 1871 goto next;
1869 } 1872 }
1870 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { 1873 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
1871 cmn_err(CE_ALERT, 1874 xfs_alert(mp,
1872 "XFS: dquot too small (%d) in %s.", 1875 "XFS: dquot too small (%d) in %s.",
1873 item->ri_buf[i].i_len, __func__); 1876 item->ri_buf[i].i_len, __func__);
1874 goto next; 1877 goto next;
1875 } 1878 }
1876 error = xfs_qm_dqcheck(item->ri_buf[i].i_addr, 1879 error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
1877 -1, 0, XFS_QMOPT_DOWARN, 1880 -1, 0, XFS_QMOPT_DOWARN,
1878 "dquot_buf_recover"); 1881 "dquot_buf_recover");
1879 if (error) 1882 if (error)
@@ -1898,6 +1901,7 @@ xlog_recover_do_reg_buffer(
1898 */ 1901 */
1899int 1902int
1900xfs_qm_dqcheck( 1903xfs_qm_dqcheck(
1904 struct xfs_mount *mp,
1901 xfs_disk_dquot_t *ddq, 1905 xfs_disk_dquot_t *ddq,
1902 xfs_dqid_t id, 1906 xfs_dqid_t id,
1903 uint type, /* used only when IO_dorepair is true */ 1907 uint type, /* used only when IO_dorepair is true */
@@ -1924,14 +1928,14 @@ xfs_qm_dqcheck(
1924 */ 1928 */
1925 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) { 1929 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
1926 if (flags & XFS_QMOPT_DOWARN) 1930 if (flags & XFS_QMOPT_DOWARN)
1927 cmn_err(CE_ALERT, 1931 xfs_alert(mp,
1928 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", 1932 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
1929 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); 1933 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
1930 errs++; 1934 errs++;
1931 } 1935 }
1932 if (ddq->d_version != XFS_DQUOT_VERSION) { 1936 if (ddq->d_version != XFS_DQUOT_VERSION) {
1933 if (flags & XFS_QMOPT_DOWARN) 1937 if (flags & XFS_QMOPT_DOWARN)
1934 cmn_err(CE_ALERT, 1938 xfs_alert(mp,
1935 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", 1939 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
1936 str, id, ddq->d_version, XFS_DQUOT_VERSION); 1940 str, id, ddq->d_version, XFS_DQUOT_VERSION);
1937 errs++; 1941 errs++;
@@ -1941,7 +1945,7 @@ xfs_qm_dqcheck(
1941 ddq->d_flags != XFS_DQ_PROJ && 1945 ddq->d_flags != XFS_DQ_PROJ &&
1942 ddq->d_flags != XFS_DQ_GROUP) { 1946 ddq->d_flags != XFS_DQ_GROUP) {
1943 if (flags & XFS_QMOPT_DOWARN) 1947 if (flags & XFS_QMOPT_DOWARN)
1944 cmn_err(CE_ALERT, 1948 xfs_alert(mp,
1945 "%s : XFS dquot ID 0x%x, unknown flags 0x%x", 1949 "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
1946 str, id, ddq->d_flags); 1950 str, id, ddq->d_flags);
1947 errs++; 1951 errs++;
@@ -1949,7 +1953,7 @@ xfs_qm_dqcheck(
1949 1953
1950 if (id != -1 && id != be32_to_cpu(ddq->d_id)) { 1954 if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
1951 if (flags & XFS_QMOPT_DOWARN) 1955 if (flags & XFS_QMOPT_DOWARN)
1952 cmn_err(CE_ALERT, 1956 xfs_alert(mp,
1953 "%s : ondisk-dquot 0x%p, ID mismatch: " 1957 "%s : ondisk-dquot 0x%p, ID mismatch: "
1954 "0x%x expected, found id 0x%x", 1958 "0x%x expected, found id 0x%x",
1955 str, ddq, id, be32_to_cpu(ddq->d_id)); 1959 str, ddq, id, be32_to_cpu(ddq->d_id));
@@ -1962,9 +1966,8 @@ xfs_qm_dqcheck(
1962 be64_to_cpu(ddq->d_blk_softlimit)) { 1966 be64_to_cpu(ddq->d_blk_softlimit)) {
1963 if (!ddq->d_btimer) { 1967 if (!ddq->d_btimer) {
1964 if (flags & XFS_QMOPT_DOWARN) 1968 if (flags & XFS_QMOPT_DOWARN)
1965 cmn_err(CE_ALERT, 1969 xfs_alert(mp,
1966 "%s : Dquot ID 0x%x (0x%p) " 1970 "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
1967 "BLK TIMER NOT STARTED",
1968 str, (int)be32_to_cpu(ddq->d_id), ddq); 1971 str, (int)be32_to_cpu(ddq->d_id), ddq);
1969 errs++; 1972 errs++;
1970 } 1973 }
@@ -1974,9 +1977,8 @@ xfs_qm_dqcheck(
1974 be64_to_cpu(ddq->d_ino_softlimit)) { 1977 be64_to_cpu(ddq->d_ino_softlimit)) {
1975 if (!ddq->d_itimer) { 1978 if (!ddq->d_itimer) {
1976 if (flags & XFS_QMOPT_DOWARN) 1979 if (flags & XFS_QMOPT_DOWARN)
1977 cmn_err(CE_ALERT, 1980 xfs_alert(mp,
1978 "%s : Dquot ID 0x%x (0x%p) " 1981 "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
1979 "INODE TIMER NOT STARTED",
1980 str, (int)be32_to_cpu(ddq->d_id), ddq); 1982 str, (int)be32_to_cpu(ddq->d_id), ddq);
1981 errs++; 1983 errs++;
1982 } 1984 }
@@ -1986,9 +1988,8 @@ xfs_qm_dqcheck(
1986 be64_to_cpu(ddq->d_rtb_softlimit)) { 1988 be64_to_cpu(ddq->d_rtb_softlimit)) {
1987 if (!ddq->d_rtbtimer) { 1989 if (!ddq->d_rtbtimer) {
1988 if (flags & XFS_QMOPT_DOWARN) 1990 if (flags & XFS_QMOPT_DOWARN)
1989 cmn_err(CE_ALERT, 1991 xfs_alert(mp,
1990 "%s : Dquot ID 0x%x (0x%p) " 1992 "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
1991 "RTBLK TIMER NOT STARTED",
1992 str, (int)be32_to_cpu(ddq->d_id), ddq); 1993 str, (int)be32_to_cpu(ddq->d_id), ddq);
1993 errs++; 1994 errs++;
1994 } 1995 }
@@ -1999,7 +2000,7 @@ xfs_qm_dqcheck(
1999 return errs; 2000 return errs;
2000 2001
2001 if (flags & XFS_QMOPT_DOWARN) 2002 if (flags & XFS_QMOPT_DOWARN)
2002 cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id); 2003 xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
2003 2004
2004 /* 2005 /*
2005 * Typically, a repair is only requested by quotacheck. 2006 * Typically, a repair is only requested by quotacheck.
@@ -2218,9 +2219,9 @@ xlog_recover_inode_pass2(
2218 */ 2219 */
2219 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) { 2220 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
2220 xfs_buf_relse(bp); 2221 xfs_buf_relse(bp);
2221 xfs_fs_cmn_err(CE_ALERT, mp, 2222 xfs_alert(mp,
2222 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2223 "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2223 dip, bp, in_f->ilf_ino); 2224 __func__, dip, bp, in_f->ilf_ino);
2224 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", 2225 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2225 XFS_ERRLEVEL_LOW, mp); 2226 XFS_ERRLEVEL_LOW, mp);
2226 error = EFSCORRUPTED; 2227 error = EFSCORRUPTED;
@@ -2229,9 +2230,9 @@ xlog_recover_inode_pass2(
2229 dicp = item->ri_buf[1].i_addr; 2230 dicp = item->ri_buf[1].i_addr;
2230 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { 2231 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2231 xfs_buf_relse(bp); 2232 xfs_buf_relse(bp);
2232 xfs_fs_cmn_err(CE_ALERT, mp, 2233 xfs_alert(mp,
2233 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", 2234 "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
2234 item, in_f->ilf_ino); 2235 __func__, item, in_f->ilf_ino);
2235 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", 2236 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2236 XFS_ERRLEVEL_LOW, mp); 2237 XFS_ERRLEVEL_LOW, mp);
2237 error = EFSCORRUPTED; 2238 error = EFSCORRUPTED;
@@ -2263,9 +2264,10 @@ xlog_recover_inode_pass2(
2263 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", 2264 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2264 XFS_ERRLEVEL_LOW, mp, dicp); 2265 XFS_ERRLEVEL_LOW, mp, dicp);
2265 xfs_buf_relse(bp); 2266 xfs_buf_relse(bp);
2266 xfs_fs_cmn_err(CE_ALERT, mp, 2267 xfs_alert(mp,
2267 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2268 "%s: Bad regular inode log record, rec ptr 0x%p, "
2268 item, dip, bp, in_f->ilf_ino); 2269 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2270 __func__, item, dip, bp, in_f->ilf_ino);
2269 error = EFSCORRUPTED; 2271 error = EFSCORRUPTED;
2270 goto error; 2272 goto error;
2271 } 2273 }
@@ -2276,9 +2278,10 @@ xlog_recover_inode_pass2(
2276 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", 2278 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2277 XFS_ERRLEVEL_LOW, mp, dicp); 2279 XFS_ERRLEVEL_LOW, mp, dicp);
2278 xfs_buf_relse(bp); 2280 xfs_buf_relse(bp);
2279 xfs_fs_cmn_err(CE_ALERT, mp, 2281 xfs_alert(mp,
2280 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2282 "%s: Bad dir inode log record, rec ptr 0x%p, "
2281 item, dip, bp, in_f->ilf_ino); 2283 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2284 __func__, item, dip, bp, in_f->ilf_ino);
2282 error = EFSCORRUPTED; 2285 error = EFSCORRUPTED;
2283 goto error; 2286 goto error;
2284 } 2287 }
@@ -2287,9 +2290,10 @@ xlog_recover_inode_pass2(
2287 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", 2290 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2288 XFS_ERRLEVEL_LOW, mp, dicp); 2291 XFS_ERRLEVEL_LOW, mp, dicp);
2289 xfs_buf_relse(bp); 2292 xfs_buf_relse(bp);
2290 xfs_fs_cmn_err(CE_ALERT, mp, 2293 xfs_alert(mp,
2291 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2294 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2292 item, dip, bp, in_f->ilf_ino, 2295 "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2296 __func__, item, dip, bp, in_f->ilf_ino,
2293 dicp->di_nextents + dicp->di_anextents, 2297 dicp->di_nextents + dicp->di_anextents,
2294 dicp->di_nblocks); 2298 dicp->di_nblocks);
2295 error = EFSCORRUPTED; 2299 error = EFSCORRUPTED;
@@ -2299,8 +2303,9 @@ xlog_recover_inode_pass2(
2299 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", 2303 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2300 XFS_ERRLEVEL_LOW, mp, dicp); 2304 XFS_ERRLEVEL_LOW, mp, dicp);
2301 xfs_buf_relse(bp); 2305 xfs_buf_relse(bp);
2302 xfs_fs_cmn_err(CE_ALERT, mp, 2306 xfs_alert(mp,
2303 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", 2307 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2308 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2304 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); 2309 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2305 error = EFSCORRUPTED; 2310 error = EFSCORRUPTED;
2306 goto error; 2311 goto error;
@@ -2309,9 +2314,9 @@ xlog_recover_inode_pass2(
2309 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", 2314 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2310 XFS_ERRLEVEL_LOW, mp, dicp); 2315 XFS_ERRLEVEL_LOW, mp, dicp);
2311 xfs_buf_relse(bp); 2316 xfs_buf_relse(bp);
2312 xfs_fs_cmn_err(CE_ALERT, mp, 2317 xfs_alert(mp,
2313 "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p", 2318 "%s: Bad inode log record length %d, rec ptr 0x%p",
2314 item->ri_buf[1].i_len, item); 2319 __func__, item->ri_buf[1].i_len, item);
2315 error = EFSCORRUPTED; 2320 error = EFSCORRUPTED;
2316 goto error; 2321 goto error;
2317 } 2322 }
@@ -2398,7 +2403,7 @@ xlog_recover_inode_pass2(
2398 break; 2403 break;
2399 2404
2400 default: 2405 default:
2401 xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag"); 2406 xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2402 ASSERT(0); 2407 ASSERT(0);
2403 xfs_buf_relse(bp); 2408 xfs_buf_relse(bp);
2404 error = EIO; 2409 error = EIO;
@@ -2467,13 +2472,11 @@ xlog_recover_dquot_pass2(
2467 2472
2468 recddq = item->ri_buf[1].i_addr; 2473 recddq = item->ri_buf[1].i_addr;
2469 if (recddq == NULL) { 2474 if (recddq == NULL) {
2470 cmn_err(CE_ALERT, 2475 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
2471 "XFS: NULL dquot in %s.", __func__);
2472 return XFS_ERROR(EIO); 2476 return XFS_ERROR(EIO);
2473 } 2477 }
2474 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { 2478 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2475 cmn_err(CE_ALERT, 2479 xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
2476 "XFS: dquot too small (%d) in %s.",
2477 item->ri_buf[1].i_len, __func__); 2480 item->ri_buf[1].i_len, __func__);
2478 return XFS_ERROR(EIO); 2481 return XFS_ERROR(EIO);
2479 } 2482 }
@@ -2498,12 +2501,10 @@ xlog_recover_dquot_pass2(
2498 */ 2501 */
2499 dq_f = item->ri_buf[0].i_addr; 2502 dq_f = item->ri_buf[0].i_addr;
2500 ASSERT(dq_f); 2503 ASSERT(dq_f);
2501 if ((error = xfs_qm_dqcheck(recddq, 2504 error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2502 dq_f->qlf_id, 2505 "xlog_recover_dquot_pass2 (log copy)");
2503 0, XFS_QMOPT_DOWARN, 2506 if (error)
2504 "xlog_recover_dquot_pass2 (log copy)"))) {
2505 return XFS_ERROR(EIO); 2507 return XFS_ERROR(EIO);
2506 }
2507 ASSERT(dq_f->qlf_len == 1); 2508 ASSERT(dq_f->qlf_len == 1);
2508 2509
2509 error = xfs_read_buf(mp, mp->m_ddev_targp, 2510 error = xfs_read_buf(mp, mp->m_ddev_targp,
@@ -2523,8 +2524,9 @@ xlog_recover_dquot_pass2(
2523 * was among a chunk of dquots created earlier, and we did some 2524 * was among a chunk of dquots created earlier, and we did some
2524 * minimal initialization then. 2525 * minimal initialization then.
2525 */ 2526 */
2526 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2527 error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2527 "xlog_recover_dquot_pass2")) { 2528 "xlog_recover_dquot_pass2");
2529 if (error) {
2528 xfs_buf_relse(bp); 2530 xfs_buf_relse(bp);
2529 return XFS_ERROR(EIO); 2531 return XFS_ERROR(EIO);
2530 } 2532 }
@@ -2676,9 +2678,8 @@ xlog_recover_commit_pass1(
2676 /* nothing to do in pass 1 */ 2678 /* nothing to do in pass 1 */
2677 return 0; 2679 return 0;
2678 default: 2680 default:
2679 xlog_warn( 2681 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2680 "XFS: invalid item type (%d) xlog_recover_commit_pass1", 2682 __func__, ITEM_TYPE(item));
2681 ITEM_TYPE(item));
2682 ASSERT(0); 2683 ASSERT(0);
2683 return XFS_ERROR(EIO); 2684 return XFS_ERROR(EIO);
2684 } 2685 }
@@ -2707,9 +2708,8 @@ xlog_recover_commit_pass2(
2707 /* nothing to do in pass2 */ 2708 /* nothing to do in pass2 */
2708 return 0; 2709 return 0;
2709 default: 2710 default:
2710 xlog_warn( 2711 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2711 "XFS: invalid item type (%d) xlog_recover_commit_pass2", 2712 __func__, ITEM_TYPE(item));
2712 ITEM_TYPE(item));
2713 ASSERT(0); 2713 ASSERT(0);
2714 return XFS_ERROR(EIO); 2714 return XFS_ERROR(EIO);
2715 } 2715 }
@@ -2751,10 +2751,11 @@ xlog_recover_commit_trans(
2751 2751
2752STATIC int 2752STATIC int
2753xlog_recover_unmount_trans( 2753xlog_recover_unmount_trans(
2754 struct log *log,
2754 xlog_recover_t *trans) 2755 xlog_recover_t *trans)
2755{ 2756{
2756 /* Do nothing now */ 2757 /* Do nothing now */
2757 xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR"); 2758 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2758 return 0; 2759 return 0;
2759} 2760}
2760 2761
@@ -2797,8 +2798,8 @@ xlog_recover_process_data(
2797 dp += sizeof(xlog_op_header_t); 2798 dp += sizeof(xlog_op_header_t);
2798 if (ohead->oh_clientid != XFS_TRANSACTION && 2799 if (ohead->oh_clientid != XFS_TRANSACTION &&
2799 ohead->oh_clientid != XFS_LOG) { 2800 ohead->oh_clientid != XFS_LOG) {
2800 xlog_warn( 2801 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2801 "XFS: xlog_recover_process_data: bad clientid"); 2802 __func__, ohead->oh_clientid);
2802 ASSERT(0); 2803 ASSERT(0);
2803 return (XFS_ERROR(EIO)); 2804 return (XFS_ERROR(EIO));
2804 } 2805 }
@@ -2811,8 +2812,8 @@ xlog_recover_process_data(
2811 be64_to_cpu(rhead->h_lsn)); 2812 be64_to_cpu(rhead->h_lsn));
2812 } else { 2813 } else {
2813 if (dp + be32_to_cpu(ohead->oh_len) > lp) { 2814 if (dp + be32_to_cpu(ohead->oh_len) > lp) {
2814 xlog_warn( 2815 xfs_warn(log->l_mp, "%s: bad length 0x%x",
2815 "XFS: xlog_recover_process_data: bad length"); 2816 __func__, be32_to_cpu(ohead->oh_len));
2816 WARN_ON(1); 2817 WARN_ON(1);
2817 return (XFS_ERROR(EIO)); 2818 return (XFS_ERROR(EIO));
2818 } 2819 }
@@ -2825,7 +2826,7 @@ xlog_recover_process_data(
2825 trans, pass); 2826 trans, pass);
2826 break; 2827 break;
2827 case XLOG_UNMOUNT_TRANS: 2828 case XLOG_UNMOUNT_TRANS:
2828 error = xlog_recover_unmount_trans(trans); 2829 error = xlog_recover_unmount_trans(log, trans);
2829 break; 2830 break;
2830 case XLOG_WAS_CONT_TRANS: 2831 case XLOG_WAS_CONT_TRANS:
2831 error = xlog_recover_add_to_cont_trans(log, 2832 error = xlog_recover_add_to_cont_trans(log,
@@ -2833,8 +2834,8 @@ xlog_recover_process_data(
2833 be32_to_cpu(ohead->oh_len)); 2834 be32_to_cpu(ohead->oh_len));
2834 break; 2835 break;
2835 case XLOG_START_TRANS: 2836 case XLOG_START_TRANS:
2836 xlog_warn( 2837 xfs_warn(log->l_mp, "%s: bad transaction",
2837 "XFS: xlog_recover_process_data: bad transaction"); 2838 __func__);
2838 ASSERT(0); 2839 ASSERT(0);
2839 error = XFS_ERROR(EIO); 2840 error = XFS_ERROR(EIO);
2840 break; 2841 break;
@@ -2844,8 +2845,8 @@ xlog_recover_process_data(
2844 dp, be32_to_cpu(ohead->oh_len)); 2845 dp, be32_to_cpu(ohead->oh_len));
2845 break; 2846 break;
2846 default: 2847 default:
2847 xlog_warn( 2848 xfs_warn(log->l_mp, "%s: bad flag 0x%x",
2848 "XFS: xlog_recover_process_data: bad flag"); 2849 __func__, flags);
2849 ASSERT(0); 2850 ASSERT(0);
2850 error = XFS_ERROR(EIO); 2851 error = XFS_ERROR(EIO);
2851 break; 2852 break;
@@ -3030,8 +3031,7 @@ xlog_recover_clear_agi_bucket(
3030out_abort: 3031out_abort:
3031 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3032 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3032out_error: 3033out_error:
3033 xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: " 3034 xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
3034 "failed to clear agi %d. Continuing.", agno);
3035 return; 3035 return;
3036} 3036}
3037 3037
@@ -3282,7 +3282,7 @@ xlog_valid_rec_header(
3282 if (unlikely( 3282 if (unlikely(
3283 (!rhead->h_version || 3283 (!rhead->h_version ||
3284 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { 3284 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3285 xlog_warn("XFS: %s: unrecognised log version (%d).", 3285 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
3286 __func__, be32_to_cpu(rhead->h_version)); 3286 __func__, be32_to_cpu(rhead->h_version));
3287 return XFS_ERROR(EIO); 3287 return XFS_ERROR(EIO);
3288 } 3288 }
@@ -3740,10 +3740,9 @@ xlog_recover(
3740 return error; 3740 return error;
3741 } 3741 }
3742 3742
3743 cmn_err(CE_NOTE, 3743 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3744 "Starting XFS recovery on filesystem: %s (logdev: %s)", 3744 log->l_mp->m_logname ? log->l_mp->m_logname
3745 log->l_mp->m_fsname, log->l_mp->m_logname ? 3745 : "internal");
3746 log->l_mp->m_logname : "internal");
3747 3746
3748 error = xlog_do_recover(log, head_blk, tail_blk); 3747 error = xlog_do_recover(log, head_blk, tail_blk);
3749 log->l_flags |= XLOG_RECOVERY_NEEDED; 3748 log->l_flags |= XLOG_RECOVERY_NEEDED;
@@ -3776,9 +3775,7 @@ xlog_recover_finish(
3776 int error; 3775 int error;
3777 error = xlog_recover_process_efis(log); 3776 error = xlog_recover_process_efis(log);
3778 if (error) { 3777 if (error) {
3779 cmn_err(CE_ALERT, 3778 xfs_alert(log->l_mp, "Failed to recover EFIs");
3780 "Failed to recover EFIs on filesystem: %s",
3781 log->l_mp->m_fsname);
3782 return error; 3779 return error;
3783 } 3780 }
3784 /* 3781 /*
@@ -3793,15 +3790,12 @@ xlog_recover_finish(
3793 3790
3794 xlog_recover_check_summary(log); 3791 xlog_recover_check_summary(log);
3795 3792
3796 cmn_err(CE_NOTE, 3793 xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
3797 "Ending XFS recovery on filesystem: %s (logdev: %s)", 3794 log->l_mp->m_logname ? log->l_mp->m_logname
3798 log->l_mp->m_fsname, log->l_mp->m_logname ? 3795 : "internal");
3799 log->l_mp->m_logname : "internal");
3800 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 3796 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3801 } else { 3797 } else {
3802 cmn_err(CE_DEBUG, 3798 xfs_info(log->l_mp, "Ending clean mount");
3803 "Ending clean XFS mount for filesystem: %s\n",
3804 log->l_mp->m_fsname);
3805 } 3799 }
3806 return 0; 3800 return 0;
3807} 3801}
@@ -3834,10 +3828,8 @@ xlog_recover_check_summary(
3834 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3828 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3835 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); 3829 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
3836 if (error) { 3830 if (error) {
3837 xfs_fs_cmn_err(CE_ALERT, mp, 3831 xfs_alert(mp, "%s agf read failed agno %d error %d",
3838 "xlog_recover_check_summary(agf)" 3832 __func__, agno, error);
3839 "agf read failed agno %d error %d",
3840 agno, error);
3841 } else { 3833 } else {
3842 agfp = XFS_BUF_TO_AGF(agfbp); 3834 agfp = XFS_BUF_TO_AGF(agfbp);
3843 freeblks += be32_to_cpu(agfp->agf_freeblks) + 3835 freeblks += be32_to_cpu(agfp->agf_freeblks) +
@@ -3846,7 +3838,10 @@ xlog_recover_check_summary(
3846 } 3838 }
3847 3839
3848 error = xfs_read_agi(mp, NULL, agno, &agibp); 3840 error = xfs_read_agi(mp, NULL, agno, &agibp);
3849 if (!error) { 3841 if (error) {
3842 xfs_alert(mp, "%s agi read failed agno %d error %d",
3843 __func__, agno, error);
3844 } else {
3850 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); 3845 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3851 3846
3852 itotal += be32_to_cpu(agi->agi_count); 3847 itotal += be32_to_cpu(agi->agi_count);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d447aef84bc3..bb3f9a7b24ed 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -133,9 +133,7 @@ xfs_uuid_mount(
133 return 0; 133 return 0;
134 134
135 if (uuid_is_nil(uuid)) { 135 if (uuid_is_nil(uuid)) {
136 cmn_err(CE_WARN, 136 xfs_warn(mp, "Filesystem has nil UUID - can't mount");
137 "XFS: Filesystem %s has nil UUID - can't mount",
138 mp->m_fsname);
139 return XFS_ERROR(EINVAL); 137 return XFS_ERROR(EINVAL);
140 } 138 }
141 139
@@ -163,8 +161,7 @@ xfs_uuid_mount(
163 161
164 out_duplicate: 162 out_duplicate:
165 mutex_unlock(&xfs_uuid_table_mutex); 163 mutex_unlock(&xfs_uuid_table_mutex);
166 cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount", 164 xfs_warn(mp, "Filesystem has duplicate UUID - can't mount");
167 mp->m_fsname);
168 return XFS_ERROR(EINVAL); 165 return XFS_ERROR(EINVAL);
169} 166}
170 167
@@ -311,6 +308,8 @@ xfs_mount_validate_sb(
311 xfs_sb_t *sbp, 308 xfs_sb_t *sbp,
312 int flags) 309 int flags)
313{ 310{
311 int loud = !(flags & XFS_MFSI_QUIET);
312
314 /* 313 /*
315 * If the log device and data device have the 314 * If the log device and data device have the
316 * same device number, the log is internal. 315 * same device number, the log is internal.
@@ -319,28 +318,32 @@ xfs_mount_validate_sb(
319 * a volume filesystem in a non-volume manner. 318 * a volume filesystem in a non-volume manner.
320 */ 319 */
321 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 320 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
322 xfs_fs_mount_cmn_err(flags, "bad magic number"); 321 if (loud)
322 xfs_warn(mp, "bad magic number");
323 return XFS_ERROR(EWRONGFS); 323 return XFS_ERROR(EWRONGFS);
324 } 324 }
325 325
326 if (!xfs_sb_good_version(sbp)) { 326 if (!xfs_sb_good_version(sbp)) {
327 xfs_fs_mount_cmn_err(flags, "bad version"); 327 if (loud)
328 xfs_warn(mp, "bad version");
328 return XFS_ERROR(EWRONGFS); 329 return XFS_ERROR(EWRONGFS);
329 } 330 }
330 331
331 if (unlikely( 332 if (unlikely(
332 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { 333 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
333 xfs_fs_mount_cmn_err(flags, 334 if (loud)
334 "filesystem is marked as having an external log; " 335 xfs_warn(mp,
335 "specify logdev on the\nmount command line."); 336 "filesystem is marked as having an external log; "
337 "specify logdev on the mount command line.");
336 return XFS_ERROR(EINVAL); 338 return XFS_ERROR(EINVAL);
337 } 339 }
338 340
339 if (unlikely( 341 if (unlikely(
340 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { 342 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
341 xfs_fs_mount_cmn_err(flags, 343 if (loud)
342 "filesystem is marked as having an internal log; " 344 xfs_warn(mp,
343 "do not specify logdev on\nthe mount command line."); 345 "filesystem is marked as having an internal log; "
346 "do not specify logdev on the mount command line.");
344 return XFS_ERROR(EINVAL); 347 return XFS_ERROR(EINVAL);
345 } 348 }
346 349
@@ -369,7 +372,8 @@ xfs_mount_validate_sb(
369 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || 372 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
370 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || 373 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
371 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { 374 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) {
372 xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed"); 375 if (loud)
376 xfs_warn(mp, "SB sanity check 1 failed");
373 return XFS_ERROR(EFSCORRUPTED); 377 return XFS_ERROR(EFSCORRUPTED);
374 } 378 }
375 379
@@ -382,7 +386,8 @@ xfs_mount_validate_sb(
382 (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks || 386 (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
383 sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) * 387 sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
384 sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) { 388 sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) {
385 xfs_fs_mount_cmn_err(flags, "SB sanity check 2 failed"); 389 if (loud)
390 xfs_warn(mp, "SB sanity check 2 failed");
386 return XFS_ERROR(EFSCORRUPTED); 391 return XFS_ERROR(EFSCORRUPTED);
387 } 392 }
388 393
@@ -390,12 +395,12 @@ xfs_mount_validate_sb(
390 * Until this is fixed only page-sized or smaller data blocks work. 395 * Until this is fixed only page-sized or smaller data blocks work.
391 */ 396 */
392 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { 397 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
393 xfs_fs_mount_cmn_err(flags, 398 if (loud) {
394 "file system with blocksize %d bytes", 399 xfs_warn(mp,
395 sbp->sb_blocksize); 400 "File system with blocksize %d bytes. "
396 xfs_fs_mount_cmn_err(flags, 401 "Only pagesize (%ld) or less will currently work.",
397 "only pagesize (%ld) or less will currently work.", 402 sbp->sb_blocksize, PAGE_SIZE);
398 PAGE_SIZE); 403 }
399 return XFS_ERROR(ENOSYS); 404 return XFS_ERROR(ENOSYS);
400 } 405 }
401 406
@@ -409,21 +414,23 @@ xfs_mount_validate_sb(
409 case 2048: 414 case 2048:
410 break; 415 break;
411 default: 416 default:
412 xfs_fs_mount_cmn_err(flags, 417 if (loud)
413 "inode size of %d bytes not supported", 418 xfs_warn(mp, "inode size of %d bytes not supported",
414 sbp->sb_inodesize); 419 sbp->sb_inodesize);
415 return XFS_ERROR(ENOSYS); 420 return XFS_ERROR(ENOSYS);
416 } 421 }
417 422
418 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || 423 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
419 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 424 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
420 xfs_fs_mount_cmn_err(flags, 425 if (loud)
421 "file system too large to be mounted on this system."); 426 xfs_warn(mp,
427 "file system too large to be mounted on this system.");
422 return XFS_ERROR(EFBIG); 428 return XFS_ERROR(EFBIG);
423 } 429 }
424 430
425 if (unlikely(sbp->sb_inprogress)) { 431 if (unlikely(sbp->sb_inprogress)) {
426 xfs_fs_mount_cmn_err(flags, "file system busy"); 432 if (loud)
433 xfs_warn(mp, "file system busy");
427 return XFS_ERROR(EFSCORRUPTED); 434 return XFS_ERROR(EFSCORRUPTED);
428 } 435 }
429 436
@@ -431,8 +438,9 @@ xfs_mount_validate_sb(
431 * Version 1 directory format has never worked on Linux. 438 * Version 1 directory format has never worked on Linux.
432 */ 439 */
433 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { 440 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
434 xfs_fs_mount_cmn_err(flags, 441 if (loud)
435 "file system using version 1 directory format"); 442 xfs_warn(mp,
443 "file system using version 1 directory format");
436 return XFS_ERROR(ENOSYS); 444 return XFS_ERROR(ENOSYS);
437 } 445 }
438 446
@@ -673,6 +681,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
673 unsigned int sector_size; 681 unsigned int sector_size;
674 xfs_buf_t *bp; 682 xfs_buf_t *bp;
675 int error; 683 int error;
684 int loud = !(flags & XFS_MFSI_QUIET);
676 685
677 ASSERT(mp->m_sb_bp == NULL); 686 ASSERT(mp->m_sb_bp == NULL);
678 ASSERT(mp->m_ddev_targp != NULL); 687 ASSERT(mp->m_ddev_targp != NULL);
@@ -688,7 +697,8 @@ reread:
688 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, 697 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
689 XFS_SB_DADDR, sector_size, 0); 698 XFS_SB_DADDR, sector_size, 0);
690 if (!bp) { 699 if (!bp) {
691 xfs_fs_mount_cmn_err(flags, "SB buffer read failed"); 700 if (loud)
701 xfs_warn(mp, "SB buffer read failed");
692 return EIO; 702 return EIO;
693 } 703 }
694 704
@@ -699,7 +709,8 @@ reread:
699 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 709 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
700 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); 710 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
701 if (error) { 711 if (error) {
702 xfs_fs_mount_cmn_err(flags, "SB validate failed"); 712 if (loud)
713 xfs_warn(mp, "SB validate failed");
703 goto release_buf; 714 goto release_buf;
704 } 715 }
705 716
@@ -707,9 +718,9 @@ reread:
707 * We must be able to do sector-sized and sector-aligned IO. 718 * We must be able to do sector-sized and sector-aligned IO.
708 */ 719 */
709 if (sector_size > mp->m_sb.sb_sectsize) { 720 if (sector_size > mp->m_sb.sb_sectsize) {
710 xfs_fs_mount_cmn_err(flags, 721 if (loud)
711 "device supports only %u byte sectors (not %u)", 722 xfs_warn(mp, "device supports %u byte sectors (not %u)",
712 sector_size, mp->m_sb.sb_sectsize); 723 sector_size, mp->m_sb.sb_sectsize);
713 error = ENOSYS; 724 error = ENOSYS;
714 goto release_buf; 725 goto release_buf;
715 } 726 }
@@ -853,8 +864,7 @@ xfs_update_alignment(xfs_mount_t *mp)
853 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || 864 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
854 (BBTOB(mp->m_swidth) & mp->m_blockmask)) { 865 (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
855 if (mp->m_flags & XFS_MOUNT_RETERR) { 866 if (mp->m_flags & XFS_MOUNT_RETERR) {
856 cmn_err(CE_WARN, 867 xfs_warn(mp, "alignment check 1 failed");
857 "XFS: alignment check 1 failed");
858 return XFS_ERROR(EINVAL); 868 return XFS_ERROR(EINVAL);
859 } 869 }
860 mp->m_dalign = mp->m_swidth = 0; 870 mp->m_dalign = mp->m_swidth = 0;
@@ -867,8 +877,9 @@ xfs_update_alignment(xfs_mount_t *mp)
867 if (mp->m_flags & XFS_MOUNT_RETERR) { 877 if (mp->m_flags & XFS_MOUNT_RETERR) {
868 return XFS_ERROR(EINVAL); 878 return XFS_ERROR(EINVAL);
869 } 879 }
870 xfs_fs_cmn_err(CE_WARN, mp, 880 xfs_warn(mp,
871"stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)", 881 "stripe alignment turned off: sunit(%d)/swidth(%d) "
882 "incompatible with agsize(%d)",
872 mp->m_dalign, mp->m_swidth, 883 mp->m_dalign, mp->m_swidth,
873 sbp->sb_agblocks); 884 sbp->sb_agblocks);
874 885
@@ -878,9 +889,9 @@ xfs_update_alignment(xfs_mount_t *mp)
878 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); 889 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
879 } else { 890 } else {
880 if (mp->m_flags & XFS_MOUNT_RETERR) { 891 if (mp->m_flags & XFS_MOUNT_RETERR) {
881 xfs_fs_cmn_err(CE_WARN, mp, 892 xfs_warn(mp,
882"stripe alignment turned off: sunit(%d) less than bsize(%d)", 893 "stripe alignment turned off: sunit(%d) less than bsize(%d)",
883 mp->m_dalign, 894 mp->m_dalign,
884 mp->m_blockmask +1); 895 mp->m_blockmask +1);
885 return XFS_ERROR(EINVAL); 896 return XFS_ERROR(EINVAL);
886 } 897 }
@@ -1026,14 +1037,14 @@ xfs_check_sizes(xfs_mount_t *mp)
1026 1037
1027 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 1038 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
1028 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 1039 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
1029 cmn_err(CE_WARN, "XFS: filesystem size mismatch detected"); 1040 xfs_warn(mp, "filesystem size mismatch detected");
1030 return XFS_ERROR(EFBIG); 1041 return XFS_ERROR(EFBIG);
1031 } 1042 }
1032 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, 1043 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
1033 d - XFS_FSS_TO_BB(mp, 1), 1044 d - XFS_FSS_TO_BB(mp, 1),
1034 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); 1045 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
1035 if (!bp) { 1046 if (!bp) {
1036 cmn_err(CE_WARN, "XFS: last sector read failed"); 1047 xfs_warn(mp, "last sector read failed");
1037 return EIO; 1048 return EIO;
1038 } 1049 }
1039 xfs_buf_relse(bp); 1050 xfs_buf_relse(bp);
@@ -1041,14 +1052,14 @@ xfs_check_sizes(xfs_mount_t *mp)
1041 if (mp->m_logdev_targp != mp->m_ddev_targp) { 1052 if (mp->m_logdev_targp != mp->m_ddev_targp) {
1042 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1053 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1043 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1054 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1044 cmn_err(CE_WARN, "XFS: log size mismatch detected"); 1055 xfs_warn(mp, "log size mismatch detected");
1045 return XFS_ERROR(EFBIG); 1056 return XFS_ERROR(EFBIG);
1046 } 1057 }
1047 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp, 1058 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
1048 d - XFS_FSB_TO_BB(mp, 1), 1059 d - XFS_FSB_TO_BB(mp, 1),
1049 XFS_FSB_TO_B(mp, 1), 0); 1060 XFS_FSB_TO_B(mp, 1), 0);
1050 if (!bp) { 1061 if (!bp) {
1051 cmn_err(CE_WARN, "XFS: log device read failed"); 1062 xfs_warn(mp, "log device read failed");
1052 return EIO; 1063 return EIO;
1053 } 1064 }
1054 xfs_buf_relse(bp); 1065 xfs_buf_relse(bp);
@@ -1086,7 +1097,7 @@ xfs_mount_reset_sbqflags(
1086 return 0; 1097 return 0;
1087 1098
1088#ifdef QUOTADEBUG 1099#ifdef QUOTADEBUG
1089 xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes"); 1100 xfs_notice(mp, "Writing superblock quota changes");
1090#endif 1101#endif
1091 1102
1092 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 1103 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
@@ -1094,8 +1105,7 @@ xfs_mount_reset_sbqflags(
1094 XFS_DEFAULT_LOG_COUNT); 1105 XFS_DEFAULT_LOG_COUNT);
1095 if (error) { 1106 if (error) {
1096 xfs_trans_cancel(tp, 0); 1107 xfs_trans_cancel(tp, 0);
1097 xfs_fs_cmn_err(CE_ALERT, mp, 1108 xfs_alert(mp, "%s: Superblock update failed!", __func__);
1098 "xfs_mount_reset_sbqflags: Superblock update failed!");
1099 return error; 1109 return error;
1100 } 1110 }
1101 1111
@@ -1161,8 +1171,7 @@ xfs_mountfs(
1161 * transaction subsystem is online. 1171 * transaction subsystem is online.
1162 */ 1172 */
1163 if (xfs_sb_has_mismatched_features2(sbp)) { 1173 if (xfs_sb_has_mismatched_features2(sbp)) {
1164 cmn_err(CE_WARN, 1174 xfs_warn(mp, "correcting sb_features alignment problem");
1165 "XFS: correcting sb_features alignment problem");
1166 sbp->sb_features2 |= sbp->sb_bad_features2; 1175 sbp->sb_features2 |= sbp->sb_bad_features2;
1167 sbp->sb_bad_features2 = sbp->sb_features2; 1176 sbp->sb_bad_features2 = sbp->sb_features2;
1168 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; 1177 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
@@ -1241,7 +1250,7 @@ xfs_mountfs(
1241 */ 1250 */
1242 error = xfs_rtmount_init(mp); 1251 error = xfs_rtmount_init(mp);
1243 if (error) { 1252 if (error) {
1244 cmn_err(CE_WARN, "XFS: RT mount failed"); 1253 xfs_warn(mp, "RT mount failed");
1245 goto out_remove_uuid; 1254 goto out_remove_uuid;
1246 } 1255 }
1247 1256
@@ -1272,12 +1281,12 @@ xfs_mountfs(
1272 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); 1281 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
1273 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); 1282 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1274 if (error) { 1283 if (error) {
1275 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); 1284 xfs_warn(mp, "Failed per-ag init: %d", error);
1276 goto out_remove_uuid; 1285 goto out_remove_uuid;
1277 } 1286 }
1278 1287
1279 if (!sbp->sb_logblocks) { 1288 if (!sbp->sb_logblocks) {
1280 cmn_err(CE_WARN, "XFS: no log defined"); 1289 xfs_warn(mp, "no log defined");
1281 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); 1290 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
1282 error = XFS_ERROR(EFSCORRUPTED); 1291 error = XFS_ERROR(EFSCORRUPTED);
1283 goto out_free_perag; 1292 goto out_free_perag;
@@ -1290,7 +1299,7 @@ xfs_mountfs(
1290 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), 1299 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
1291 XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); 1300 XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
1292 if (error) { 1301 if (error) {
1293 cmn_err(CE_WARN, "XFS: log mount failed"); 1302 xfs_warn(mp, "log mount failed");
1294 goto out_free_perag; 1303 goto out_free_perag;
1295 } 1304 }
1296 1305
@@ -1327,16 +1336,14 @@ xfs_mountfs(
1327 */ 1336 */
1328 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip); 1337 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip);
1329 if (error) { 1338 if (error) {
1330 cmn_err(CE_WARN, "XFS: failed to read root inode"); 1339 xfs_warn(mp, "failed to read root inode");
1331 goto out_log_dealloc; 1340 goto out_log_dealloc;
1332 } 1341 }
1333 1342
1334 ASSERT(rip != NULL); 1343 ASSERT(rip != NULL);
1335 1344
1336 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { 1345 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
1337 cmn_err(CE_WARN, "XFS: corrupted root inode"); 1346 xfs_warn(mp, "corrupted root inode %llu: not a directory",
1338 cmn_err(CE_WARN, "Device %s - root %llu is not a directory",
1339 XFS_BUFTARG_NAME(mp->m_ddev_targp),
1340 (unsigned long long)rip->i_ino); 1347 (unsigned long long)rip->i_ino);
1341 xfs_iunlock(rip, XFS_ILOCK_EXCL); 1348 xfs_iunlock(rip, XFS_ILOCK_EXCL);
1342 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, 1349 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
@@ -1356,7 +1363,7 @@ xfs_mountfs(
1356 /* 1363 /*
1357 * Free up the root inode. 1364 * Free up the root inode.
1358 */ 1365 */
1359 cmn_err(CE_WARN, "XFS: failed to read RT inodes"); 1366 xfs_warn(mp, "failed to read RT inodes");
1360 goto out_rele_rip; 1367 goto out_rele_rip;
1361 } 1368 }
1362 1369
@@ -1368,7 +1375,7 @@ xfs_mountfs(
1368 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { 1375 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
1369 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1376 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1370 if (error) { 1377 if (error) {
1371 cmn_err(CE_WARN, "XFS: failed to write sb changes"); 1378 xfs_warn(mp, "failed to write sb changes");
1372 goto out_rtunmount; 1379 goto out_rtunmount;
1373 } 1380 }
1374 } 1381 }
@@ -1389,10 +1396,7 @@ xfs_mountfs(
1389 * quotachecked license. 1396 * quotachecked license.
1390 */ 1397 */
1391 if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { 1398 if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
1392 cmn_err(CE_NOTE, 1399 xfs_notice(mp, "resetting quota flags");
1393 "XFS: resetting qflags for filesystem %s",
1394 mp->m_fsname);
1395
1396 error = xfs_mount_reset_sbqflags(mp); 1400 error = xfs_mount_reset_sbqflags(mp);
1397 if (error) 1401 if (error)
1398 return error; 1402 return error;
@@ -1406,7 +1410,7 @@ xfs_mountfs(
1406 */ 1410 */
1407 error = xfs_log_mount_finish(mp); 1411 error = xfs_log_mount_finish(mp);
1408 if (error) { 1412 if (error) {
1409 cmn_err(CE_WARN, "XFS: log mount finish failed"); 1413 xfs_warn(mp, "log mount finish failed");
1410 goto out_rtunmount; 1414 goto out_rtunmount;
1411 } 1415 }
1412 1416
@@ -1435,8 +1439,8 @@ xfs_mountfs(
1435 resblks = xfs_default_resblks(mp); 1439 resblks = xfs_default_resblks(mp);
1436 error = xfs_reserve_blocks(mp, &resblks, NULL); 1440 error = xfs_reserve_blocks(mp, &resblks, NULL);
1437 if (error) 1441 if (error)
1438 cmn_err(CE_WARN, "XFS: Unable to allocate reserve " 1442 xfs_warn(mp,
1439 "blocks. Continuing without a reserve pool."); 1443 "Unable to allocate reserve blocks. Continuing without reserve pool.");
1440 } 1444 }
1441 1445
1442 return 0; 1446 return 0;
@@ -1525,12 +1529,12 @@ xfs_unmountfs(
1525 resblks = 0; 1529 resblks = 0;
1526 error = xfs_reserve_blocks(mp, &resblks, NULL); 1530 error = xfs_reserve_blocks(mp, &resblks, NULL);
1527 if (error) 1531 if (error)
1528 cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. " 1532 xfs_warn(mp, "Unable to free reserved block pool. "
1529 "Freespace may not be correct on next mount."); 1533 "Freespace may not be correct on next mount.");
1530 1534
1531 error = xfs_log_sbcount(mp, 1); 1535 error = xfs_log_sbcount(mp, 1);
1532 if (error) 1536 if (error)
1533 cmn_err(CE_WARN, "XFS: Unable to update superblock counters. " 1537 xfs_warn(mp, "Unable to update superblock counters. "
1534 "Freespace may not be correct on next mount."); 1538 "Freespace may not be correct on next mount.");
1535 xfs_unmountfs_writesb(mp); 1539 xfs_unmountfs_writesb(mp);
1536 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1540 xfs_unmountfs_wait(mp); /* wait for async bufs */
@@ -2013,10 +2017,8 @@ xfs_dev_is_read_only(
2013 if (xfs_readonly_buftarg(mp->m_ddev_targp) || 2017 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
2014 xfs_readonly_buftarg(mp->m_logdev_targp) || 2018 xfs_readonly_buftarg(mp->m_logdev_targp) ||
2015 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { 2019 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
2016 cmn_err(CE_NOTE, 2020 xfs_notice(mp, "%s required on read-only device.", message);
2017 "XFS: %s required on read-only device.", message); 2021 xfs_notice(mp, "write access unavailable, cannot proceed.");
2018 cmn_err(CE_NOTE,
2019 "XFS: write access unavailable, cannot proceed.");
2020 return EROFS; 2022 return EROFS;
2021 } 2023 }
2022 return 0; 2024 return 0;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a62e8971539d..19af0ab0d0c6 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -203,12 +203,9 @@ typedef struct xfs_mount {
203 struct mutex m_icsb_mutex; /* balancer sync lock */ 203 struct mutex m_icsb_mutex; /* balancer sync lock */
204#endif 204#endif
205 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ 205 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
206 struct task_struct *m_sync_task; /* generalised sync thread */ 206 struct delayed_work m_sync_work; /* background sync work */
207 xfs_sync_work_t m_sync_work; /* work item for VFS_SYNC */ 207 struct delayed_work m_reclaim_work; /* background inode reclaim */
208 struct list_head m_sync_list; /* sync thread work item list */ 208 struct work_struct m_flush_work; /* background inode flush */
209 spinlock_t m_sync_lock; /* work item list lock */
210 int m_sync_seq; /* sync thread generation no. */
211 wait_queue_head_t m_wait_single_sync_task;
212 __int64_t m_update_flags; /* sb flags we need to update 209 __int64_t m_update_flags; /* sb flags we need to update
213 on the next remount,rw */ 210 on the next remount,rw */
214 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 211 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index edfa178bafb6..4aff56395732 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -309,7 +309,7 @@ xfs_mru_cache_init(void)
309 if (!xfs_mru_elem_zone) 309 if (!xfs_mru_elem_zone)
310 goto out; 310 goto out;
311 311
312 xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache"); 312 xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1);
313 if (!xfs_mru_reap_wq) 313 if (!xfs_mru_reap_wq)
314 goto out_destroy_mru_elem_zone; 314 goto out_destroy_mru_elem_zone;
315 315
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 9bb6eda4cd21..a595f29567fe 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -382,7 +382,8 @@ static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
382 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \ 382 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \
383 f | XFS_QMOPT_RES_REGBLKS) 383 f | XFS_QMOPT_RES_REGBLKS)
384 384
385extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *); 385extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
386 xfs_dqid_t, uint, uint, char *);
386extern int xfs_mount_reset_sbqflags(struct xfs_mount *); 387extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
387 388
388#endif /* __KERNEL__ */ 389#endif /* __KERNEL__ */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 12a191385310..8f76fdff4f46 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -76,7 +76,7 @@ xfs_growfs_rt_alloc(
76 xfs_mount_t *mp, /* file system mount point */ 76 xfs_mount_t *mp, /* file system mount point */
77 xfs_extlen_t oblocks, /* old count of blocks */ 77 xfs_extlen_t oblocks, /* old count of blocks */
78 xfs_extlen_t nblocks, /* new count of blocks */ 78 xfs_extlen_t nblocks, /* new count of blocks */
79 xfs_ino_t ino) /* inode number (bitmap/summary) */ 79 xfs_inode_t *ip) /* inode (bitmap/summary) */
80{ 80{
81 xfs_fileoff_t bno; /* block number in file */ 81 xfs_fileoff_t bno; /* block number in file */
82 xfs_buf_t *bp; /* temporary buffer for zeroing */ 82 xfs_buf_t *bp; /* temporary buffer for zeroing */
@@ -86,7 +86,6 @@ xfs_growfs_rt_alloc(
86 xfs_fsblock_t firstblock; /* first block allocated in xaction */ 86 xfs_fsblock_t firstblock; /* first block allocated in xaction */
87 xfs_bmap_free_t flist; /* list of freed blocks */ 87 xfs_bmap_free_t flist; /* list of freed blocks */
88 xfs_fsblock_t fsbno; /* filesystem block for bno */ 88 xfs_fsblock_t fsbno; /* filesystem block for bno */
89 xfs_inode_t *ip; /* pointer to incore inode */
90 xfs_bmbt_irec_t map; /* block map output */ 89 xfs_bmbt_irec_t map; /* block map output */
91 int nmap; /* number of block maps */ 90 int nmap; /* number of block maps */
92 int resblks; /* space reservation */ 91 int resblks; /* space reservation */
@@ -112,9 +111,9 @@ xfs_growfs_rt_alloc(
112 /* 111 /*
113 * Lock the inode. 112 * Lock the inode.
114 */ 113 */
115 if ((error = xfs_trans_iget(mp, tp, ino, 0, 114 xfs_ilock(ip, XFS_ILOCK_EXCL);
116 XFS_ILOCK_EXCL, &ip))) 115 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
117 goto error_cancel; 116
118 xfs_bmap_init(&flist, &firstblock); 117 xfs_bmap_init(&flist, &firstblock);
119 /* 118 /*
120 * Allocate blocks to the bitmap file. 119 * Allocate blocks to the bitmap file.
@@ -155,9 +154,8 @@ xfs_growfs_rt_alloc(
155 /* 154 /*
156 * Lock the bitmap inode. 155 * Lock the bitmap inode.
157 */ 156 */
158 if ((error = xfs_trans_iget(mp, tp, ino, 0, 157 xfs_ilock(ip, XFS_ILOCK_EXCL);
159 XFS_ILOCK_EXCL, &ip))) 158 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
160 goto error_cancel;
161 /* 159 /*
162 * Get a buffer for the block. 160 * Get a buffer for the block.
163 */ 161 */
@@ -1854,7 +1852,6 @@ xfs_growfs_rt(
1854 xfs_rtblock_t bmbno; /* bitmap block number */ 1852 xfs_rtblock_t bmbno; /* bitmap block number */
1855 xfs_buf_t *bp; /* temporary buffer */ 1853 xfs_buf_t *bp; /* temporary buffer */
1856 int error; /* error return value */ 1854 int error; /* error return value */
1857 xfs_inode_t *ip; /* bitmap inode, used as lock */
1858 xfs_mount_t *nmp; /* new (fake) mount structure */ 1855 xfs_mount_t *nmp; /* new (fake) mount structure */
1859 xfs_drfsbno_t nrblocks; /* new number of realtime blocks */ 1856 xfs_drfsbno_t nrblocks; /* new number of realtime blocks */
1860 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ 1857 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
@@ -1918,11 +1915,11 @@ xfs_growfs_rt(
1918 /* 1915 /*
1919 * Allocate space to the bitmap and summary files, as necessary. 1916 * Allocate space to the bitmap and summary files, as necessary.
1920 */ 1917 */
1921 if ((error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, 1918 error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip);
1922 mp->m_sb.sb_rbmino))) 1919 if (error)
1923 return error; 1920 return error;
1924 if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, 1921 error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip);
1925 mp->m_sb.sb_rsumino))) 1922 if (error)
1926 return error; 1923 return error;
1927 /* 1924 /*
1928 * Allocate a new (fake) mount/sb. 1925 * Allocate a new (fake) mount/sb.
@@ -1972,10 +1969,8 @@ xfs_growfs_rt(
1972 /* 1969 /*
1973 * Lock out other callers by grabbing the bitmap inode lock. 1970 * Lock out other callers by grabbing the bitmap inode lock.
1974 */ 1971 */
1975 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 1972 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
1976 XFS_ILOCK_EXCL, &ip))) 1973 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
1977 goto error_cancel;
1978 ASSERT(ip == mp->m_rbmip);
1979 /* 1974 /*
1980 * Update the bitmap inode's size. 1975 * Update the bitmap inode's size.
1981 */ 1976 */
@@ -1986,10 +1981,8 @@ xfs_growfs_rt(
1986 /* 1981 /*
1987 * Get the summary inode into the transaction. 1982 * Get the summary inode into the transaction.
1988 */ 1983 */
1989 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, 1984 xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
1990 XFS_ILOCK_EXCL, &ip))) 1985 xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
1991 goto error_cancel;
1992 ASSERT(ip == mp->m_rsumip);
1993 /* 1986 /*
1994 * Update the summary inode's size. 1987 * Update the summary inode's size.
1995 */ 1988 */
@@ -2075,15 +2068,15 @@ xfs_rtallocate_extent(
2075 xfs_extlen_t prod, /* extent product factor */ 2068 xfs_extlen_t prod, /* extent product factor */
2076 xfs_rtblock_t *rtblock) /* out: start block allocated */ 2069 xfs_rtblock_t *rtblock) /* out: start block allocated */
2077{ 2070{
2071 xfs_mount_t *mp = tp->t_mountp;
2078 int error; /* error value */ 2072 int error; /* error value */
2079 xfs_inode_t *ip; /* inode for bitmap file */
2080 xfs_mount_t *mp; /* file system mount structure */
2081 xfs_rtblock_t r; /* result allocated block */ 2073 xfs_rtblock_t r; /* result allocated block */
2082 xfs_fsblock_t sb; /* summary file block number */ 2074 xfs_fsblock_t sb; /* summary file block number */
2083 xfs_buf_t *sumbp; /* summary file block buffer */ 2075 xfs_buf_t *sumbp; /* summary file block buffer */
2084 2076
2077 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
2085 ASSERT(minlen > 0 && minlen <= maxlen); 2078 ASSERT(minlen > 0 && minlen <= maxlen);
2086 mp = tp->t_mountp; 2079
2087 /* 2080 /*
2088 * If prod is set then figure out what to do to minlen and maxlen. 2081 * If prod is set then figure out what to do to minlen and maxlen.
2089 */ 2082 */
@@ -2099,12 +2092,7 @@ xfs_rtallocate_extent(
2099 return 0; 2092 return 0;
2100 } 2093 }
2101 } 2094 }
2102 /* 2095
2103 * Lock out other callers by grabbing the bitmap inode lock.
2104 */
2105 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
2106 XFS_ILOCK_EXCL, &ip)))
2107 return error;
2108 sumbp = NULL; 2096 sumbp = NULL;
2109 /* 2097 /*
2110 * Allocate by size, or near another block, or exactly at some block. 2098 * Allocate by size, or near another block, or exactly at some block.
@@ -2123,11 +2111,12 @@ xfs_rtallocate_extent(
2123 len, &sumbp, &sb, prod, &r); 2111 len, &sumbp, &sb, prod, &r);
2124 break; 2112 break;
2125 default: 2113 default:
2114 error = EIO;
2126 ASSERT(0); 2115 ASSERT(0);
2127 } 2116 }
2128 if (error) { 2117 if (error)
2129 return error; 2118 return error;
2130 } 2119
2131 /* 2120 /*
2132 * If it worked, update the superblock. 2121 * If it worked, update the superblock.
2133 */ 2122 */
@@ -2155,7 +2144,6 @@ xfs_rtfree_extent(
2155 xfs_extlen_t len) /* length of extent freed */ 2144 xfs_extlen_t len) /* length of extent freed */
2156{ 2145{
2157 int error; /* error value */ 2146 int error; /* error value */
2158 xfs_inode_t *ip; /* bitmap file inode */
2159 xfs_mount_t *mp; /* file system mount structure */ 2147 xfs_mount_t *mp; /* file system mount structure */
2160 xfs_fsblock_t sb; /* summary file block number */ 2148 xfs_fsblock_t sb; /* summary file block number */
2161 xfs_buf_t *sumbp; /* summary file block buffer */ 2149 xfs_buf_t *sumbp; /* summary file block buffer */
@@ -2164,9 +2152,9 @@ xfs_rtfree_extent(
2164 /* 2152 /*
2165 * Synchronize by locking the bitmap inode. 2153 * Synchronize by locking the bitmap inode.
2166 */ 2154 */
2167 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 2155 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2168 XFS_ILOCK_EXCL, &ip))) 2156 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2169 return error; 2157
2170#if defined(__KERNEL__) && defined(DEBUG) 2158#if defined(__KERNEL__) && defined(DEBUG)
2171 /* 2159 /*
2172 * Check to see that this whole range is currently allocated. 2160 * Check to see that this whole range is currently allocated.
@@ -2199,10 +2187,10 @@ xfs_rtfree_extent(
2199 */ 2187 */
2200 if (tp->t_frextents_delta + mp->m_sb.sb_frextents == 2188 if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
2201 mp->m_sb.sb_rextents) { 2189 mp->m_sb.sb_rextents) {
2202 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) 2190 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
2203 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; 2191 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2204 *(__uint64_t *)&ip->i_d.di_atime = 0; 2192 *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
2205 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2193 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
2206 } 2194 }
2207 return 0; 2195 return 0;
2208} 2196}
@@ -2222,8 +2210,8 @@ xfs_rtmount_init(
2222 if (sbp->sb_rblocks == 0) 2210 if (sbp->sb_rblocks == 0)
2223 return 0; 2211 return 0;
2224 if (mp->m_rtdev_targp == NULL) { 2212 if (mp->m_rtdev_targp == NULL) {
2225 cmn_err(CE_WARN, 2213 xfs_warn(mp,
2226 "XFS: This filesystem has a realtime volume, use rtdev=device option"); 2214 "Filesystem has a realtime volume, use rtdev=device option");
2227 return XFS_ERROR(ENODEV); 2215 return XFS_ERROR(ENODEV);
2228 } 2216 }
2229 mp->m_rsumlevels = sbp->sb_rextslog + 1; 2217 mp->m_rsumlevels = sbp->sb_rextslog + 1;
@@ -2237,7 +2225,7 @@ xfs_rtmount_init(
2237 */ 2225 */
2238 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); 2226 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
2239 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { 2227 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
2240 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", 2228 xfs_warn(mp, "realtime mount -- %llu != %llu",
2241 (unsigned long long) XFS_BB_TO_FSB(mp, d), 2229 (unsigned long long) XFS_BB_TO_FSB(mp, d),
2242 (unsigned long long) mp->m_sb.sb_rblocks); 2230 (unsigned long long) mp->m_sb.sb_rblocks);
2243 return XFS_ERROR(EFBIG); 2231 return XFS_ERROR(EFBIG);
@@ -2246,7 +2234,7 @@ xfs_rtmount_init(
2246 d - XFS_FSB_TO_BB(mp, 1), 2234 d - XFS_FSB_TO_BB(mp, 1),
2247 XFS_FSB_TO_B(mp, 1), 0); 2235 XFS_FSB_TO_B(mp, 1), 0);
2248 if (!bp) { 2236 if (!bp) {
2249 cmn_err(CE_WARN, "XFS: realtime device size check failed"); 2237 xfs_warn(mp, "realtime device size check failed");
2250 return EIO; 2238 return EIO;
2251 } 2239 }
2252 xfs_buf_relse(bp); 2240 xfs_buf_relse(bp);
@@ -2306,20 +2294,16 @@ xfs_rtpick_extent(
2306 xfs_rtblock_t *pick) /* result rt extent */ 2294 xfs_rtblock_t *pick) /* result rt extent */
2307{ 2295{
2308 xfs_rtblock_t b; /* result block */ 2296 xfs_rtblock_t b; /* result block */
2309 int error; /* error return value */
2310 xfs_inode_t *ip; /* bitmap incore inode */
2311 int log2; /* log of sequence number */ 2297 int log2; /* log of sequence number */
2312 __uint64_t resid; /* residual after log removed */ 2298 __uint64_t resid; /* residual after log removed */
2313 __uint64_t seq; /* sequence number of file creation */ 2299 __uint64_t seq; /* sequence number of file creation */
2314 __uint64_t *seqp; /* pointer to seqno in inode */ 2300 __uint64_t *seqp; /* pointer to seqno in inode */
2315 2301
2316 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 2302 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
2317 XFS_ILOCK_EXCL, &ip))) 2303
2318 return error; 2304 seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime;
2319 ASSERT(ip == mp->m_rbmip); 2305 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
2320 seqp = (__uint64_t *)&ip->i_d.di_atime; 2306 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2321 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
2322 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2323 *seqp = 0; 2307 *seqp = 0;
2324 } 2308 }
2325 seq = *seqp; 2309 seq = *seqp;
@@ -2335,7 +2319,7 @@ xfs_rtpick_extent(
2335 b = mp->m_sb.sb_rextents - len; 2319 b = mp->m_sb.sb_rextents - len;
2336 } 2320 }
2337 *seqp = seq + 1; 2321 *seqp = seq + 1;
2338 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2322 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
2339 *pick = b; 2323 *pick = b;
2340 return 0; 2324 return 0;
2341} 2325}
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index ff614c29b441..09e1f4f35e97 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -154,7 +154,7 @@ xfs_rtmount_init(
154 if (mp->m_sb.sb_rblocks == 0) 154 if (mp->m_sb.sb_rblocks == 0)
155 return 0; 155 return 0;
156 156
157 cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT"); 157 xfs_warn(mp, "Not built with CONFIG_XFS_RT");
158 return ENOSYS; 158 return ENOSYS;
159} 159}
160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 56861d5daaef..d6d6fdfe9422 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -49,9 +49,9 @@ xfs_do_force_shutdown(
49 logerror = flags & SHUTDOWN_LOG_IO_ERROR; 49 logerror = flags & SHUTDOWN_LOG_IO_ERROR;
50 50
51 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 51 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
52 cmn_err(CE_NOTE, "xfs_force_shutdown(%s,0x%x) called from " 52 xfs_notice(mp,
53 "line %d of file %s. Return address = 0x%p", 53 "%s(0x%x) called from line %d of file %s. Return address = 0x%p",
54 mp->m_fsname, flags, lnnum, fname, __return_address); 54 __func__, flags, lnnum, fname, __return_address);
55 } 55 }
56 /* 56 /*
57 * No need to duplicate efforts. 57 * No need to duplicate efforts.
@@ -69,30 +69,25 @@ xfs_do_force_shutdown(
69 return; 69 return;
70 70
71 if (flags & SHUTDOWN_CORRUPT_INCORE) { 71 if (flags & SHUTDOWN_CORRUPT_INCORE) {
72 xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp, 72 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
73 "Corruption of in-memory data detected. Shutting down filesystem: %s", 73 "Corruption of in-memory data detected. Shutting down filesystem");
74 mp->m_fsname); 74 if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
75 if (XFS_ERRLEVEL_HIGH <= xfs_error_level) {
76 xfs_stack_trace(); 75 xfs_stack_trace();
77 }
78 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 76 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
79 if (logerror) { 77 if (logerror) {
80 xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp, 78 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
81 "Log I/O Error Detected. Shutting down filesystem: %s", 79 "Log I/O Error Detected. Shutting down filesystem");
82 mp->m_fsname);
83 } else if (flags & SHUTDOWN_DEVICE_REQ) { 80 } else if (flags & SHUTDOWN_DEVICE_REQ) {
84 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 81 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
85 "All device paths lost. Shutting down filesystem: %s", 82 "All device paths lost. Shutting down filesystem");
86 mp->m_fsname);
87 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { 83 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
88 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 84 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
89 "I/O Error Detected. Shutting down filesystem: %s", 85 "I/O Error Detected. Shutting down filesystem");
90 mp->m_fsname);
91 } 86 }
92 } 87 }
93 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 88 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
94 cmn_err(CE_ALERT, "Please umount the filesystem, " 89 xfs_alert(mp,
95 "and rectify the problem(s)"); 90 "Please umount the filesystem and rectify the problem(s)");
96 } 91 }
97} 92}
98 93
@@ -106,10 +101,9 @@ xfs_ioerror_alert(
106 xfs_buf_t *bp, 101 xfs_buf_t *bp,
107 xfs_daddr_t blkno) 102 xfs_daddr_t blkno)
108{ 103{
109 cmn_err(CE_ALERT, 104 xfs_alert(mp,
110 "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx" 105 "I/O error occurred: meta-data dev %s block 0x%llx"
111 " (\"%s\") error %d buf count %zd", 106 " (\"%s\") error %d buf count %zd",
112 (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname,
113 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 107 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
114 (__uint64_t)blkno, func, 108 (__uint64_t)blkno, func,
115 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); 109 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp));
@@ -173,17 +167,9 @@ xfs_extlen_t
173xfs_get_extsz_hint( 167xfs_get_extsz_hint(
174 struct xfs_inode *ip) 168 struct xfs_inode *ip)
175{ 169{
176 xfs_extlen_t extsz; 170 if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
177 171 return ip->i_d.di_extsize;
178 if (unlikely(XFS_IS_REALTIME_INODE(ip))) { 172 if (XFS_IS_REALTIME_INODE(ip))
179 extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) 173 return ip->i_mount->m_sb.sb_rextsize;
180 ? ip->i_d.di_extsize 174 return 0;
181 : ip->i_mount->m_sb.sb_rextsize;
182 ASSERT(extsz);
183 } else {
184 extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
185 ? ip->i_d.di_extsize : 0;
186 }
187
188 return extsz;
189} 175}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c2042b736b81..06a9759b6352 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -469,8 +469,6 @@ void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
469void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); 469void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
470void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); 470void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
472int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
473 xfs_ino_t , uint, uint, struct xfs_inode **);
474void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); 472void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
475void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); 473void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
476void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); 474void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index c5bbbc45db91..acdb92f14d51 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,74 +28,138 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30 30
31STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t); 31struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
32STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
35 32
36#ifdef DEBUG 33#ifdef DEBUG
37STATIC void xfs_ail_check(struct xfs_ail *, xfs_log_item_t *); 34/*
38#else 35 * Check that the list is sorted as it should be.
36 */
37STATIC void
38xfs_ail_check(
39 struct xfs_ail *ailp,
40 xfs_log_item_t *lip)
41{
42 xfs_log_item_t *prev_lip;
43
44 if (list_empty(&ailp->xa_ail))
45 return;
46
47 /*
48 * Check the next and previous entries are valid.
49 */
50 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
51 prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
52 if (&prev_lip->li_ail != &ailp->xa_ail)
53 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
54
55 prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
56 if (&prev_lip->li_ail != &ailp->xa_ail)
57 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
58
59
60#ifdef XFS_TRANS_DEBUG
61 /*
62 * Walk the list checking lsn ordering, and that every entry has the
63 * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
64 * when specifically debugging the transaction subsystem.
65 */
66 prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
67 list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
68 if (&prev_lip->li_ail != &ailp->xa_ail)
69 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
70 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
71 prev_lip = lip;
72 }
73#endif /* XFS_TRANS_DEBUG */
74}
75#else /* !DEBUG */
39#define xfs_ail_check(a,l) 76#define xfs_ail_check(a,l)
40#endif /* DEBUG */ 77#endif /* DEBUG */
41 78
79/*
80 * Return a pointer to the first item in the AIL. If the AIL is empty, then
81 * return NULL.
82 */
83static xfs_log_item_t *
84xfs_ail_min(
85 struct xfs_ail *ailp)
86{
87 if (list_empty(&ailp->xa_ail))
88 return NULL;
89
90 return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
91}
92
93 /*
94 * Return a pointer to the last item in the AIL. If the AIL is empty, then
95 * return NULL.
96 */
97static xfs_log_item_t *
98xfs_ail_max(
99 struct xfs_ail *ailp)
100{
101 if (list_empty(&ailp->xa_ail))
102 return NULL;
103
104 return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail);
105}
106
107/*
108 * Return a pointer to the item which follows the given item in the AIL. If
109 * the given item is the last item in the list, then return NULL.
110 */
111static xfs_log_item_t *
112xfs_ail_next(
113 struct xfs_ail *ailp,
114 xfs_log_item_t *lip)
115{
116 if (lip->li_ail.next == &ailp->xa_ail)
117 return NULL;
118
119 return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
120}
42 121
43/* 122/*
44 * This is called by the log manager code to determine the LSN 123 * This is called by the log manager code to determine the LSN of the tail of
45 * of the tail of the log. This is exactly the LSN of the first 124 * the log. This is exactly the LSN of the first item in the AIL. If the AIL
46 * item in the AIL. If the AIL is empty, then this function 125 * is empty, then this function returns 0.
47 * returns 0.
48 * 126 *
49 * We need the AIL lock in order to get a coherent read of the 127 * We need the AIL lock in order to get a coherent read of the lsn of the last
50 * lsn of the last item in the AIL. 128 * item in the AIL.
51 */ 129 */
52xfs_lsn_t 130xfs_lsn_t
53xfs_trans_ail_tail( 131xfs_ail_min_lsn(
54 struct xfs_ail *ailp) 132 struct xfs_ail *ailp)
55{ 133{
56 xfs_lsn_t lsn; 134 xfs_lsn_t lsn = 0;
57 xfs_log_item_t *lip; 135 xfs_log_item_t *lip;
58 136
59 spin_lock(&ailp->xa_lock); 137 spin_lock(&ailp->xa_lock);
60 lip = xfs_ail_min(ailp); 138 lip = xfs_ail_min(ailp);
61 if (lip == NULL) { 139 if (lip)
62 lsn = (xfs_lsn_t)0;
63 } else {
64 lsn = lip->li_lsn; 140 lsn = lip->li_lsn;
65 }
66 spin_unlock(&ailp->xa_lock); 141 spin_unlock(&ailp->xa_lock);
67 142
68 return lsn; 143 return lsn;
69} 144}
70 145
71/* 146/*
72 * xfs_trans_push_ail 147 * Return the maximum lsn held in the AIL, or zero if the AIL is empty.
73 *
74 * This routine is called to move the tail of the AIL forward. It does this by
75 * trying to flush items in the AIL whose lsns are below the given
76 * threshold_lsn.
77 *
78 * the push is run asynchronously in a separate thread, so we return the tail
79 * of the log right now instead of the tail after the push. This means we will
80 * either continue right away, or we will sleep waiting on the async thread to
81 * do its work.
82 *
83 * We do this unlocked - we only need to know whether there is anything in the
84 * AIL at the time we are called. We don't need to access the contents of
85 * any of the objects, so the lock is not needed.
86 */ 148 */
87void 149static xfs_lsn_t
88xfs_trans_ail_push( 150xfs_ail_max_lsn(
89 struct xfs_ail *ailp, 151 struct xfs_ail *ailp)
90 xfs_lsn_t threshold_lsn)
91{ 152{
92 xfs_log_item_t *lip; 153 xfs_lsn_t lsn = 0;
154 xfs_log_item_t *lip;
93 155
94 lip = xfs_ail_min(ailp); 156 spin_lock(&ailp->xa_lock);
95 if (lip && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { 157 lip = xfs_ail_max(ailp);
96 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) 158 if (lip)
97 xfsaild_wakeup(ailp, threshold_lsn); 159 lsn = lip->li_lsn;
98 } 160 spin_unlock(&ailp->xa_lock);
161
162 return lsn;
99} 163}
100 164
101/* 165/*
@@ -236,16 +300,57 @@ out:
236} 300}
237 301
238/* 302/*
239 * xfsaild_push does the work of pushing on the AIL. Returning a timeout of 303 * splice the log item list into the AIL at the given LSN.
240 * zero indicates that the caller should sleep until woken.
241 */ 304 */
242long 305static void
243xfsaild_push( 306xfs_ail_splice(
244 struct xfs_ail *ailp, 307 struct xfs_ail *ailp,
245 xfs_lsn_t *last_lsn) 308 struct list_head *list,
309 xfs_lsn_t lsn)
246{ 310{
247 long tout = 0; 311 xfs_log_item_t *next_lip;
248 xfs_lsn_t last_pushed_lsn = *last_lsn; 312
313 /* If the list is empty, just insert the item. */
314 if (list_empty(&ailp->xa_ail)) {
315 list_splice(list, &ailp->xa_ail);
316 return;
317 }
318
319 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
320 if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
321 break;
322 }
323
324 ASSERT(&next_lip->li_ail == &ailp->xa_ail ||
325 XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0);
326
327 list_splice_init(list, &next_lip->li_ail);
328}
329
330/*
331 * Delete the given item from the AIL. Return a pointer to the item.
332 */
333static void
334xfs_ail_delete(
335 struct xfs_ail *ailp,
336 xfs_log_item_t *lip)
337{
338 xfs_ail_check(ailp, lip);
339 list_del(&lip->li_ail);
340 xfs_trans_ail_cursor_clear(ailp, lip);
341}
342
343/*
344 * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself
345 * to run at a later time if there is more work to do to complete the push.
346 */
347STATIC void
348xfs_ail_worker(
349 struct work_struct *work)
350{
351 struct xfs_ail *ailp = container_of(to_delayed_work(work),
352 struct xfs_ail, xa_work);
353 long tout;
249 xfs_lsn_t target = ailp->xa_target; 354 xfs_lsn_t target = ailp->xa_target;
250 xfs_lsn_t lsn; 355 xfs_lsn_t lsn;
251 xfs_log_item_t *lip; 356 xfs_log_item_t *lip;
@@ -256,15 +361,15 @@ xfsaild_push(
256 361
257 spin_lock(&ailp->xa_lock); 362 spin_lock(&ailp->xa_lock);
258 xfs_trans_ail_cursor_init(ailp, cur); 363 xfs_trans_ail_cursor_init(ailp, cur);
259 lip = xfs_trans_ail_cursor_first(ailp, cur, *last_lsn); 364 lip = xfs_trans_ail_cursor_first(ailp, cur, ailp->xa_last_pushed_lsn);
260 if (!lip || XFS_FORCED_SHUTDOWN(mp)) { 365 if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
261 /* 366 /*
262 * AIL is empty or our push has reached the end. 367 * AIL is empty or our push has reached the end.
263 */ 368 */
264 xfs_trans_ail_cursor_done(ailp, cur); 369 xfs_trans_ail_cursor_done(ailp, cur);
265 spin_unlock(&ailp->xa_lock); 370 spin_unlock(&ailp->xa_lock);
266 *last_lsn = 0; 371 ailp->xa_last_pushed_lsn = 0;
267 return tout; 372 return;
268 } 373 }
269 374
270 XFS_STATS_INC(xs_push_ail); 375 XFS_STATS_INC(xs_push_ail);
@@ -301,13 +406,13 @@ xfsaild_push(
301 case XFS_ITEM_SUCCESS: 406 case XFS_ITEM_SUCCESS:
302 XFS_STATS_INC(xs_push_ail_success); 407 XFS_STATS_INC(xs_push_ail_success);
303 IOP_PUSH(lip); 408 IOP_PUSH(lip);
304 last_pushed_lsn = lsn; 409 ailp->xa_last_pushed_lsn = lsn;
305 break; 410 break;
306 411
307 case XFS_ITEM_PUSHBUF: 412 case XFS_ITEM_PUSHBUF:
308 XFS_STATS_INC(xs_push_ail_pushbuf); 413 XFS_STATS_INC(xs_push_ail_pushbuf);
309 IOP_PUSHBUF(lip); 414 IOP_PUSHBUF(lip);
310 last_pushed_lsn = lsn; 415 ailp->xa_last_pushed_lsn = lsn;
311 push_xfsbufd = 1; 416 push_xfsbufd = 1;
312 break; 417 break;
313 418
@@ -319,7 +424,7 @@ xfsaild_push(
319 424
320 case XFS_ITEM_LOCKED: 425 case XFS_ITEM_LOCKED:
321 XFS_STATS_INC(xs_push_ail_locked); 426 XFS_STATS_INC(xs_push_ail_locked);
322 last_pushed_lsn = lsn; 427 ailp->xa_last_pushed_lsn = lsn;
323 stuck++; 428 stuck++;
324 break; 429 break;
325 430
@@ -374,9 +479,23 @@ xfsaild_push(
374 wake_up_process(mp->m_ddev_targp->bt_task); 479 wake_up_process(mp->m_ddev_targp->bt_task);
375 } 480 }
376 481
482 /* assume we have more work to do in a short while */
483 tout = 10;
377 if (!count) { 484 if (!count) {
378 /* We're past our target or empty, so idle */ 485 /* We're past our target or empty, so idle */
379 last_pushed_lsn = 0; 486 ailp->xa_last_pushed_lsn = 0;
487
488 /*
489 * Check for an updated push target before clearing the
490 * XFS_AIL_PUSHING_BIT. If the target changed, we've got more
491 * work to do. Wait a bit longer before starting that work.
492 */
493 smp_rmb();
494 if (ailp->xa_target == target) {
495 clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags);
496 return;
497 }
498 tout = 50;
380 } else if (XFS_LSN_CMP(lsn, target) >= 0) { 499 } else if (XFS_LSN_CMP(lsn, target) >= 0) {
381 /* 500 /*
382 * We reached the target so wait a bit longer for I/O to 501 * We reached the target so wait a bit longer for I/O to
@@ -384,7 +503,7 @@ xfsaild_push(
384 * start the next scan from the start of the AIL. 503 * start the next scan from the start of the AIL.
385 */ 504 */
386 tout = 50; 505 tout = 50;
387 last_pushed_lsn = 0; 506 ailp->xa_last_pushed_lsn = 0;
388 } else if ((stuck * 100) / count > 90) { 507 } else if ((stuck * 100) / count > 90) {
389 /* 508 /*
390 * Either there is a lot of contention on the AIL or we 509 * Either there is a lot of contention on the AIL or we
@@ -396,14 +515,61 @@ xfsaild_push(
396 * continuing from where we were. 515 * continuing from where we were.
397 */ 516 */
398 tout = 20; 517 tout = 20;
399 } else {
400 /* more to do, but wait a short while before continuing */
401 tout = 10;
402 } 518 }
403 *last_lsn = last_pushed_lsn; 519
404 return tout; 520 /* There is more to do, requeue us. */
521 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work,
522 msecs_to_jiffies(tout));
523}
524
525/*
526 * This routine is called to move the tail of the AIL forward. It does this by
527 * trying to flush items in the AIL whose lsns are below the given
528 * threshold_lsn.
529 *
530 * The push is run asynchronously in a workqueue, which means the caller needs
531 * to handle waiting on the async flush for space to become available.
532 * We don't want to interrupt any push that is in progress, hence we only queue
533 * work if we set the pushing bit approriately.
534 *
535 * We do this unlocked - we only need to know whether there is anything in the
536 * AIL at the time we are called. We don't need to access the contents of
537 * any of the objects, so the lock is not needed.
538 */
539void
540xfs_ail_push(
541 struct xfs_ail *ailp,
542 xfs_lsn_t threshold_lsn)
543{
544 xfs_log_item_t *lip;
545
546 lip = xfs_ail_min(ailp);
547 if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) ||
548 XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0)
549 return;
550
551 /*
552 * Ensure that the new target is noticed in push code before it clears
553 * the XFS_AIL_PUSHING_BIT.
554 */
555 smp_wmb();
556 ailp->xa_target = threshold_lsn;
557 if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
558 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0);
405} 559}
406 560
561/*
562 * Push out all items in the AIL immediately
563 */
564void
565xfs_ail_push_all(
566 struct xfs_ail *ailp)
567{
568 xfs_lsn_t threshold_lsn = xfs_ail_max_lsn(ailp);
569
570 if (threshold_lsn)
571 xfs_ail_push(ailp, threshold_lsn);
572}
407 573
408/* 574/*
409 * This is to be called when an item is unlocked that may have 575 * This is to be called when an item is unlocked that may have
@@ -563,7 +729,7 @@ xfs_trans_ail_delete_bulk(
563 729
564 spin_unlock(&ailp->xa_lock); 730 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) { 731 if (!XFS_FORCED_SHUTDOWN(mp)) {
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, 732 xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
567 "%s: attempting to delete a log item that is not in the AIL", 733 "%s: attempting to delete a log item that is not in the AIL",
568 __func__); 734 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 735 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -615,7 +781,6 @@ xfs_trans_ail_init(
615 xfs_mount_t *mp) 781 xfs_mount_t *mp)
616{ 782{
617 struct xfs_ail *ailp; 783 struct xfs_ail *ailp;
618 int error;
619 784
620 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); 785 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
621 if (!ailp) 786 if (!ailp)
@@ -624,15 +789,9 @@ xfs_trans_ail_init(
624 ailp->xa_mount = mp; 789 ailp->xa_mount = mp;
625 INIT_LIST_HEAD(&ailp->xa_ail); 790 INIT_LIST_HEAD(&ailp->xa_ail);
626 spin_lock_init(&ailp->xa_lock); 791 spin_lock_init(&ailp->xa_lock);
627 error = xfsaild_start(ailp); 792 INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker);
628 if (error)
629 goto out_free_ailp;
630 mp->m_ail = ailp; 793 mp->m_ail = ailp;
631 return 0; 794 return 0;
632
633out_free_ailp:
634 kmem_free(ailp);
635 return error;
636} 795}
637 796
638void 797void
@@ -641,124 +800,6 @@ xfs_trans_ail_destroy(
641{ 800{
642 struct xfs_ail *ailp = mp->m_ail; 801 struct xfs_ail *ailp = mp->m_ail;
643 802
644 xfsaild_stop(ailp); 803 cancel_delayed_work_sync(&ailp->xa_work);
645 kmem_free(ailp); 804 kmem_free(ailp);
646} 805}
647
648/*
649 * splice the log item list into the AIL at the given LSN.
650 */
651STATIC void
652xfs_ail_splice(
653 struct xfs_ail *ailp,
654 struct list_head *list,
655 xfs_lsn_t lsn)
656{
657 xfs_log_item_t *next_lip;
658
659 /*
660 * If the list is empty, just insert the item.
661 */
662 if (list_empty(&ailp->xa_ail)) {
663 list_splice(list, &ailp->xa_ail);
664 return;
665 }
666
667 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
668 if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
669 break;
670 }
671
672 ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
673 (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0));
674
675 list_splice_init(list, &next_lip->li_ail);
676 return;
677}
678
679/*
680 * Delete the given item from the AIL. Return a pointer to the item.
681 */
682STATIC void
683xfs_ail_delete(
684 struct xfs_ail *ailp,
685 xfs_log_item_t *lip)
686{
687 xfs_ail_check(ailp, lip);
688 list_del(&lip->li_ail);
689 xfs_trans_ail_cursor_clear(ailp, lip);
690}
691
692/*
693 * Return a pointer to the first item in the AIL.
694 * If the AIL is empty, then return NULL.
695 */
696STATIC xfs_log_item_t *
697xfs_ail_min(
698 struct xfs_ail *ailp)
699{
700 if (list_empty(&ailp->xa_ail))
701 return NULL;
702
703 return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
704}
705
706/*
707 * Return a pointer to the item which follows
708 * the given item in the AIL. If the given item
709 * is the last item in the list, then return NULL.
710 */
711STATIC xfs_log_item_t *
712xfs_ail_next(
713 struct xfs_ail *ailp,
714 xfs_log_item_t *lip)
715{
716 if (lip->li_ail.next == &ailp->xa_ail)
717 return NULL;
718
719 return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
720}
721
722#ifdef DEBUG
723/*
724 * Check that the list is sorted as it should be.
725 */
726STATIC void
727xfs_ail_check(
728 struct xfs_ail *ailp,
729 xfs_log_item_t *lip)
730{
731 xfs_log_item_t *prev_lip;
732
733 if (list_empty(&ailp->xa_ail))
734 return;
735
736 /*
737 * Check the next and previous entries are valid.
738 */
739 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
740 prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
741 if (&prev_lip->li_ail != &ailp->xa_ail)
742 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
743
744 prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
745 if (&prev_lip->li_ail != &ailp->xa_ail)
746 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
747
748
749#ifdef XFS_TRANS_DEBUG
750 /*
751 * Walk the list checking lsn ordering, and that every entry has the
752 * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
753 * when specifically debugging the transaction subsystem.
754 */
755 prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
756 list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
757 if (&prev_lip->li_ail != &ailp->xa_ail)
758 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
759 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
760 prev_lip = lip;
761 }
762#endif /* XFS_TRANS_DEBUG */
763}
764#endif /* DEBUG */
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index c47918c302a5..03b3b7f85a3b 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -305,7 +305,7 @@ xfs_trans_read_buf(
305 if (xfs_error_target == target) { 305 if (xfs_error_target == target) {
306 if (((xfs_req_num++) % xfs_error_mod) == 0) { 306 if (((xfs_req_num++) % xfs_error_mod) == 0) {
307 xfs_buf_relse(bp); 307 xfs_buf_relse(bp);
308 cmn_err(CE_DEBUG, "Returning error!\n"); 308 xfs_debug(mp, "Returning error!");
309 return XFS_ERROR(EIO); 309 return XFS_ERROR(EIO);
310 } 310 }
311 } 311 }
@@ -383,7 +383,8 @@ xfs_trans_read_buf(
383 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); 383 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
384 if (bp == NULL) { 384 if (bp == NULL) {
385 *bpp = NULL; 385 *bpp = NULL;
386 return 0; 386 return (flags & XBF_TRYLOCK) ?
387 0 : XFS_ERROR(ENOMEM);
387 } 388 }
388 if (XFS_BUF_GETERROR(bp) != 0) { 389 if (XFS_BUF_GETERROR(bp) != 0) {
389 XFS_BUF_SUPER_STALE(bp); 390 XFS_BUF_SUPER_STALE(bp);
@@ -403,7 +404,7 @@ xfs_trans_read_buf(
403 xfs_force_shutdown(tp->t_mountp, 404 xfs_force_shutdown(tp->t_mountp,
404 SHUTDOWN_META_IO_ERROR); 405 SHUTDOWN_META_IO_ERROR);
405 xfs_buf_relse(bp); 406 xfs_buf_relse(bp);
406 cmn_err(CE_DEBUG, "Returning trans error!\n"); 407 xfs_debug(mp, "Returning trans error!");
407 return XFS_ERROR(EIO); 408 return XFS_ERROR(EIO);
408 } 409 }
409 } 410 }
@@ -427,7 +428,7 @@ shutdown_abort:
427 */ 428 */
428#if defined(DEBUG) 429#if defined(DEBUG)
429 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) 430 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
430 cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp); 431 xfs_notice(mp, "about to pop assert, bp == 0x%p", bp);
431#endif 432#endif
432 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != 433 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
433 (XBF_STALE|XBF_DELWRI)); 434 (XBF_STALE|XBF_DELWRI));
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index ccb34532768b..048b0c689d3e 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -44,28 +44,6 @@ xfs_trans_inode_broot_debug(
44#endif 44#endif
45 45
46/* 46/*
47 * Get an inode and join it to the transaction.
48 */
49int
50xfs_trans_iget(
51 xfs_mount_t *mp,
52 xfs_trans_t *tp,
53 xfs_ino_t ino,
54 uint flags,
55 uint lock_flags,
56 xfs_inode_t **ipp)
57{
58 int error;
59
60 error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp);
61 if (!error && tp) {
62 xfs_trans_ijoin(tp, *ipp);
63 (*ipp)->i_itemp->ili_lock_flags = lock_flags;
64 }
65 return error;
66}
67
68/*
69 * Add a locked inode to the transaction. 47 * Add a locked inode to the transaction.
70 * 48 *
71 * The inode must be locked, and it cannot be associated with any transaction. 49 * The inode must be locked, and it cannot be associated with any transaction.
@@ -103,7 +81,7 @@ xfs_trans_ijoin(
103 * 81 *
104 * 82 *
105 * Grabs a reference to the inode which will be dropped when the transaction 83 * Grabs a reference to the inode which will be dropped when the transaction
106 * is commited. The inode will also be unlocked at that point. The inode 84 * is committed. The inode will also be unlocked at that point. The inode
107 * must be locked, and it cannot be associated with any transaction. 85 * must be locked, and it cannot be associated with any transaction.
108 */ 86 */
109void 87void
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 35162c238fa3..6b164e9e9a1f 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -65,16 +65,22 @@ struct xfs_ail_cursor {
65struct xfs_ail { 65struct xfs_ail {
66 struct xfs_mount *xa_mount; 66 struct xfs_mount *xa_mount;
67 struct list_head xa_ail; 67 struct list_head xa_ail;
68 uint xa_gen;
69 struct task_struct *xa_task;
70 xfs_lsn_t xa_target; 68 xfs_lsn_t xa_target;
71 struct xfs_ail_cursor xa_cursors; 69 struct xfs_ail_cursor xa_cursors;
72 spinlock_t xa_lock; 70 spinlock_t xa_lock;
71 struct delayed_work xa_work;
72 xfs_lsn_t xa_last_pushed_lsn;
73 unsigned long xa_flags;
73}; 74};
74 75
76#define XFS_AIL_PUSHING_BIT 0
77
75/* 78/*
76 * From xfs_trans_ail.c 79 * From xfs_trans_ail.c
77 */ 80 */
81
82extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
83
78void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, 84void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
79 struct xfs_log_item **log_items, int nr_items, 85 struct xfs_log_item **log_items, int nr_items,
80 xfs_lsn_t lsn) __releases(ailp->xa_lock); 86 xfs_lsn_t lsn) __releases(ailp->xa_lock);
@@ -98,12 +104,13 @@ xfs_trans_ail_delete(
98 xfs_trans_ail_delete_bulk(ailp, &lip, 1); 104 xfs_trans_ail_delete_bulk(ailp, &lip, 1);
99} 105}
100 106
101void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); 107void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
108void xfs_ail_push_all(struct xfs_ail *);
109xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp);
110
102void xfs_trans_unlocked_item(struct xfs_ail *, 111void xfs_trans_unlocked_item(struct xfs_ail *,
103 xfs_log_item_t *); 112 xfs_log_item_t *);
104 113
105xfs_lsn_t xfs_trans_ail_tail(struct xfs_ail *ailp);
106
107struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp, 114struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
108 struct xfs_ail_cursor *cur, 115 struct xfs_ail_cursor *cur,
109 xfs_lsn_t lsn); 116 xfs_lsn_t lsn);
@@ -112,11 +119,6 @@ struct xfs_log_item *xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
112void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, 119void xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
113 struct xfs_ail_cursor *cur); 120 struct xfs_ail_cursor *cur);
114 121
115long xfsaild_push(struct xfs_ail *, xfs_lsn_t *);
116void xfsaild_wakeup(struct xfs_ail *, xfs_lsn_t);
117int xfsaild_start(struct xfs_ail *);
118void xfsaild_stop(struct xfs_ail *);
119
120#if BITS_PER_LONG != 64 122#if BITS_PER_LONG != 64
121static inline void 123static inline void
122xfs_trans_ail_copy_lsn( 124xfs_trans_ail_copy_lsn(
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index d8e6f8cd6f0c..b7a5fe7c52c8 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -953,7 +953,7 @@ xfs_release(
953 * If we previously truncated this file and removed old data 953 * If we previously truncated this file and removed old data
954 * in the process, we want to initiate "early" writeout on 954 * in the process, we want to initiate "early" writeout on
955 * the last close. This is an attempt to combat the notorious 955 * the last close. This is an attempt to combat the notorious
956 * NULL files problem which is particularly noticable from a 956 * NULL files problem which is particularly noticeable from a
957 * truncate down, buffered (re-)write (delalloc), followed by 957 * truncate down, buffered (re-)write (delalloc), followed by
958 * a crash. What we are effectively doing here is 958 * a crash. What we are effectively doing here is
959 * significantly reducing the time window where we'd otherwise 959 * significantly reducing the time window where we'd otherwise
@@ -982,7 +982,7 @@ xfs_release(
982 * 982 *
983 * Further, check if the inode is being opened, written and 983 * Further, check if the inode is being opened, written and
984 * closed frequently and we have delayed allocation blocks 984 * closed frequently and we have delayed allocation blocks
985 * oustanding (e.g. streaming writes from the NFS server), 985 * outstanding (e.g. streaming writes from the NFS server),
986 * truncating the blocks past EOF will cause fragmentation to 986 * truncating the blocks past EOF will cause fragmentation to
987 * occur. 987 * occur.
988 * 988 *
@@ -1189,9 +1189,8 @@ xfs_inactive(
1189 * inode might be lost for a long time or forever. 1189 * inode might be lost for a long time or forever.
1190 */ 1190 */
1191 if (!XFS_FORCED_SHUTDOWN(mp)) { 1191 if (!XFS_FORCED_SHUTDOWN(mp)) {
1192 cmn_err(CE_NOTE, 1192 xfs_notice(mp, "%s: xfs_ifree returned error %d",
1193 "xfs_inactive: xfs_ifree() returned an error = %d on %s", 1193 __func__, error);
1194 error, mp->m_fsname);
1195 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1194 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1196 } 1195 }
1197 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); 1196 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
@@ -1208,12 +1207,12 @@ xfs_inactive(
1208 */ 1207 */
1209 error = xfs_bmap_finish(&tp, &free_list, &committed); 1208 error = xfs_bmap_finish(&tp, &free_list, &committed);
1210 if (error) 1209 if (error)
1211 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " 1210 xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
1212 "xfs_bmap_finish() returned error %d", error); 1211 __func__, error);
1213 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1212 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1214 if (error) 1213 if (error)
1215 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " 1214 xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1216 "xfs_trans_commit() returned error %d", error); 1215 __func__, error);
1217 } 1216 }
1218 1217
1219 /* 1218 /*
@@ -1310,7 +1309,7 @@ xfs_create(
1310 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, 1309 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
1311 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); 1310 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
1312 if (error) 1311 if (error)
1313 goto std_return; 1312 return error;
1314 1313
1315 if (is_dir) { 1314 if (is_dir) {
1316 rdev = 0; 1315 rdev = 0;
@@ -1390,12 +1389,6 @@ xfs_create(
1390 } 1389 }
1391 1390
1392 /* 1391 /*
1393 * At this point, we've gotten a newly allocated inode.
1394 * It is locked (and joined to the transaction).
1395 */
1396 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1397
1398 /*
1399 * Now we join the directory inode to the transaction. We do not do it 1392 * Now we join the directory inode to the transaction. We do not do it
1400 * earlier because xfs_dir_ialloc might commit the previous transaction 1393 * earlier because xfs_dir_ialloc might commit the previous transaction
1401 * (and release all the locks). An error from here on will result in 1394 * (and release all the locks). An error from here on will result in
@@ -1440,22 +1433,13 @@ xfs_create(
1440 */ 1433 */
1441 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); 1434 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
1442 1435
1443 /*
1444 * xfs_trans_commit normally decrements the vnode ref count
1445 * when it unlocks the inode. Since we want to return the
1446 * vnode to the caller, we bump the vnode ref count now.
1447 */
1448 IHOLD(ip);
1449
1450 error = xfs_bmap_finish(&tp, &free_list, &committed); 1436 error = xfs_bmap_finish(&tp, &free_list, &committed);
1451 if (error) 1437 if (error)
1452 goto out_abort_rele; 1438 goto out_bmap_cancel;
1453 1439
1454 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1440 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1455 if (error) { 1441 if (error)
1456 IRELE(ip); 1442 goto out_release_inode;
1457 goto out_dqrele;
1458 }
1459 1443
1460 xfs_qm_dqrele(udqp); 1444 xfs_qm_dqrele(udqp);
1461 xfs_qm_dqrele(gdqp); 1445 xfs_qm_dqrele(gdqp);
@@ -1469,27 +1453,21 @@ xfs_create(
1469 cancel_flags |= XFS_TRANS_ABORT; 1453 cancel_flags |= XFS_TRANS_ABORT;
1470 out_trans_cancel: 1454 out_trans_cancel:
1471 xfs_trans_cancel(tp, cancel_flags); 1455 xfs_trans_cancel(tp, cancel_flags);
1472 out_dqrele: 1456 out_release_inode:
1457 /*
1458 * Wait until after the current transaction is aborted to
1459 * release the inode. This prevents recursive transactions
1460 * and deadlocks from xfs_inactive.
1461 */
1462 if (ip)
1463 IRELE(ip);
1464
1473 xfs_qm_dqrele(udqp); 1465 xfs_qm_dqrele(udqp);
1474 xfs_qm_dqrele(gdqp); 1466 xfs_qm_dqrele(gdqp);
1475 1467
1476 if (unlock_dp_on_error) 1468 if (unlock_dp_on_error)
1477 xfs_iunlock(dp, XFS_ILOCK_EXCL); 1469 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1478 std_return:
1479 return error; 1470 return error;
1480
1481 out_abort_rele:
1482 /*
1483 * Wait until after the current transaction is aborted to
1484 * release the inode. This prevents recursive transactions
1485 * and deadlocks from xfs_inactive.
1486 */
1487 xfs_bmap_cancel(&free_list);
1488 cancel_flags |= XFS_TRANS_ABORT;
1489 xfs_trans_cancel(tp, cancel_flags);
1490 IRELE(ip);
1491 unlock_dp_on_error = B_FALSE;
1492 goto out_dqrele;
1493} 1471}
1494 1472
1495#ifdef DEBUG 1473#ifdef DEBUG
@@ -2114,9 +2092,8 @@ xfs_symlink(
2114 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, 2092 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
2115 &first_block, resblks, mval, &nmaps, 2093 &first_block, resblks, mval, &nmaps,
2116 &free_list); 2094 &free_list);
2117 if (error) { 2095 if (error)
2118 goto error1; 2096 goto error2;
2119 }
2120 2097
2121 if (resblks) 2098 if (resblks)
2122 resblks -= fs_blocks; 2099 resblks -= fs_blocks;
@@ -2148,7 +2125,7 @@ xfs_symlink(
2148 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, 2125 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
2149 &first_block, &free_list, resblks); 2126 &first_block, &free_list, resblks);
2150 if (error) 2127 if (error)
2151 goto error1; 2128 goto error2;
2152 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2129 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2153 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2130 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2154 2131
@@ -2161,13 +2138,6 @@ xfs_symlink(
2161 xfs_trans_set_sync(tp); 2138 xfs_trans_set_sync(tp);
2162 } 2139 }
2163 2140
2164 /*
2165 * xfs_trans_commit normally decrements the vnode ref count
2166 * when it unlocks the inode. Since we want to return the
2167 * vnode to the caller, we bump the vnode ref count now.
2168 */
2169 IHOLD(ip);
2170
2171 error = xfs_bmap_finish(&tp, &free_list, &committed); 2141 error = xfs_bmap_finish(&tp, &free_list, &committed);
2172 if (error) { 2142 if (error) {
2173 goto error2; 2143 goto error2;
@@ -2861,7 +2831,8 @@ xfs_change_file_space(
2861 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; 2831 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
2862 2832
2863 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2833 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2864 xfs_trans_set_sync(tp); 2834 if (attr_flags & XFS_ATTR_SYNC)
2835 xfs_trans_set_sync(tp);
2865 2836
2866 error = xfs_trans_commit(tp, 0); 2837 error = xfs_trans_commit(tp, 0);
2867 2838
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index f6702927eee4..3bcd23353d6c 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -18,6 +18,7 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
18#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ 18#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */
19#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ 19#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
20#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ 20#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */
21#define XFS_ATTR_SYNC 0x10 /* synchronous operation required */
21 22
22int xfs_readlink(struct xfs_inode *ip, char *link); 23int xfs_readlink(struct xfs_inode *ip, char *link);
23int xfs_release(struct xfs_inode *ip); 24int xfs_release(struct xfs_inode *ip);