author	Jiri Kosina <jkosina@suse.cz>	2012-02-03 17:12:42 -0500
committer	Jiri Kosina <jkosina@suse.cz>	2012-02-03 17:13:05 -0500
commit	972c5ae961d6e5103e2b33d935cfa4145fd47140 (patch)
tree	350b2a76b979ba8766c09838617df67ff330eca0 /fs
parent	5196d20305d5e30d871111d3a876cf067dd94255 (diff)
parent	7c7ed8ec337bf5f62cc5287a6eb6b2f1b7504c2f (diff)
Merge branch 'master' into for-next
Sync with Linus' tree to be able to apply a patch to newer code (namely drivers/gpu/drm/gma500/psb_intel_lvds.c)
Diffstat (limited to 'fs')
-rw-r--r--	fs/9p/cache.c	64
-rw-r--r--	fs/9p/fid.c	8
-rw-r--r--	fs/9p/v9fs.c	59
-rw-r--r--	fs/9p/vfs_addr.c	13
-rw-r--r--	fs/9p/vfs_dentry.c	12
-rw-r--r--	fs/9p/vfs_dir.c	13
-rw-r--r--	fs/9p/vfs_file.c	34
-rw-r--r--	fs/9p/vfs_inode.c	165
-rw-r--r--	fs/9p/vfs_inode_dotl.c	127
-rw-r--r--	fs/9p/vfs_super.c	12
-rw-r--r--	fs/9p/xattr.c	16
-rw-r--r--	fs/Kconfig	2
-rw-r--r--	fs/Kconfig.binfmt	3
-rw-r--r--	fs/aio.c	11
-rw-r--r--	fs/autofs4/autofs_i.h	1
-rw-r--r--	fs/autofs4/inode.c	1
-rw-r--r--	fs/autofs4/waitq.c	40
-rw-r--r--	fs/binfmt_elf.c	2
-rw-r--r--	fs/block_dev.c	13
-rw-r--r--	fs/btrfs/Kconfig	19
-rw-r--r--	fs/btrfs/Makefile	3
-rw-r--r--	fs/btrfs/backref.c	1131
-rw-r--r--	fs/btrfs/backref.h	5
-rw-r--r--	fs/btrfs/btrfs_inode.h	3
-rw-r--r--	fs/btrfs/check-integrity.c	3069
-rw-r--r--	fs/btrfs/check-integrity.h	36
-rw-r--r--	fs/btrfs/ctree.c	42
-rw-r--r--	fs/btrfs/ctree.h	239
-rw-r--r--	fs/btrfs/delayed-inode.c	45
-rw-r--r--	fs/btrfs/delayed-ref.c	153
-rw-r--r--	fs/btrfs/delayed-ref.h	104
-rw-r--r--	fs/btrfs/disk-io.c	131
-rw-r--r--	fs/btrfs/disk-io.h	6
-rw-r--r--	fs/btrfs/export.c	2
-rw-r--r--	fs/btrfs/extent-tree.c	514
-rw-r--r--	fs/btrfs/extent_io.c	8
-rw-r--r--	fs/btrfs/extent_io.h	2
-rw-r--r--	fs/btrfs/file.c	16
-rw-r--r--	fs/btrfs/free-space-cache.c	422
-rw-r--r--	fs/btrfs/inode-map.c	4
-rw-r--r--	fs/btrfs/inode.c	76
-rw-r--r--	fs/btrfs/ioctl.c	270
-rw-r--r--	fs/btrfs/ioctl.h	54
-rw-r--r--	fs/btrfs/locking.c	53
-rw-r--r--	fs/btrfs/relocation.c	20
-rw-r--r--	fs/btrfs/scrub.c	12
-rw-r--r--	fs/btrfs/super.c	190
-rw-r--r--	fs/btrfs/transaction.c	20
-rw-r--r--	fs/btrfs/tree-log.c	8
-rw-r--r--	fs/btrfs/ulist.c	220
-rw-r--r--	fs/btrfs/ulist.h	68
-rw-r--r--	fs/btrfs/volumes.c	993
-rw-r--r--	fs/btrfs/volumes.h	54
-rw-r--r--	fs/btrfs/xattr.c	2
-rw-r--r--	fs/ceph/caps.c	4
-rw-r--r--	fs/ceph/dir.c	80
-rw-r--r--	fs/ceph/export.c	6
-rw-r--r--	fs/ceph/inode.c	3
-rw-r--r--	fs/ceph/mds_client.c	14
-rw-r--r--	fs/ceph/mds_client.h	7
-rw-r--r--	fs/ceph/super.c	31
-rw-r--r--	fs/ceph/super.h	1
-rw-r--r--	fs/ceph/xattr.c	26
-rw-r--r--	fs/char_dev.c	6
-rw-r--r--	fs/cifs/Kconfig	3
-rw-r--r--	fs/cifs/cifs_debug.c	11
-rw-r--r--	fs/cifs/cifs_spnego.c	10
-rw-r--r--	fs/cifs/cifs_unicode.c	41
-rw-r--r--	fs/cifs/cifs_unicode.h	20
-rw-r--r--	fs/cifs/cifsacl.c	2
-rw-r--r--	fs/cifs/cifsencrypt.c	21
-rw-r--r--	fs/cifs/cifsglob.h	2
-rw-r--r--	fs/cifs/cifssmb.c	162
-rw-r--r--	fs/cifs/connect.c	305
-rw-r--r--	fs/cifs/readdir.c	9
-rw-r--r--	fs/cifs/sess.c	34
-rw-r--r--	fs/cifs/smbencrypt.c	2
-rw-r--r--	fs/coda/cnode.c	38
-rw-r--r--	fs/coda/coda_fs_i.h	4
-rw-r--r--	fs/coda/dir.c	29
-rw-r--r--	fs/coda/inode.c	10
-rw-r--r--	fs/compat_ioctl.c	1
-rw-r--r--	fs/dcache.c	106
-rw-r--r--	fs/debugfs/file.c	2
-rw-r--r--	fs/devpts/inode.c	4
-rw-r--r--	fs/direct-io.c	57
-rw-r--r--	fs/dlm/config.c	130
-rw-r--r--	fs/dlm/config.h	17
-rw-r--r--	fs/dlm/debug_fs.c	28
-rw-r--r--	fs/dlm/dir.c	1
-rw-r--r--	fs/dlm/dlm_internal.h	60
-rw-r--r--	fs/dlm/lock.c	87
-rw-r--r--	fs/dlm/lockspace.c	71
-rw-r--r--	fs/dlm/member.c	486
-rw-r--r--	fs/dlm/member.h	10
-rw-r--r--	fs/dlm/rcom.c	99
-rw-r--r--	fs/dlm/rcom.h	2
-rw-r--r--	fs/dlm/recover.c	87
-rw-r--r--	fs/dlm/recoverd.c	53
-rw-r--r--	fs/dlm/user.c	5
-rw-r--r--	fs/ecryptfs/crypto.c	54
-rw-r--r--	fs/ecryptfs/ecryptfs_kernel.h	5
-rw-r--r--	fs/ecryptfs/inode.c	48
-rw-r--r--	fs/ecryptfs/keystore.c	5
-rw-r--r--	fs/ecryptfs/miscdev.c	140
-rw-r--r--	fs/ecryptfs/mmap.c	8
-rw-r--r--	fs/ecryptfs/read_write.c	96
-rw-r--r--	fs/eventpoll.c	234
-rw-r--r--	fs/exec.c	4
-rw-r--r--	fs/exofs/Kconfig	11
-rw-r--r--	fs/exofs/Kconfig.ore	12
-rw-r--r--	fs/exofs/ore.c	8
-rw-r--r--	fs/exofs/ore_raid.c	78
-rw-r--r--	fs/exofs/super.c	2
-rw-r--r--	fs/ext2/ialloc.c	7
-rw-r--r--	fs/ext2/inode.c	5
-rw-r--r--	fs/ext2/ioctl.c	22
-rw-r--r--	fs/ext2/super.c	3
-rw-r--r--	fs/ext2/xattr.c	1
-rw-r--r--	fs/ext2/xattr_security.c	1
-rw-r--r--	fs/ext2/xattr_trusted.c	1
-rw-r--r--	fs/ext2/xattr_user.c	1
-rw-r--r--	fs/ext3/ialloc.c	8
-rw-r--r--	fs/ext3/inode.c	43
-rw-r--r--	fs/ext3/ioctl.c	6
-rw-r--r--	fs/ext3/namei.c	11
-rw-r--r--	fs/ext3/super.c	15
-rw-r--r--	fs/ext3/xattr_security.c	1
-rw-r--r--	fs/ext3/xattr_trusted.c	1
-rw-r--r--	fs/ext3/xattr_user.c	1
-rw-r--r--	fs/ext4/balloc.c	4
-rw-r--r--	fs/ext4/block_validity.c	1
-rw-r--r--	fs/ext4/ext4.h	29
-rw-r--r--	fs/ext4/extents.c	11
-rw-r--r--	fs/ext4/ialloc.c	18
-rw-r--r--	fs/ext4/indirect.c	1
-rw-r--r--	fs/ext4/inode.c	144
-rw-r--r--	fs/ext4/ioctl.c	92
-rw-r--r--	fs/ext4/mballoc.c	2
-rw-r--r--	fs/ext4/migrate.c	1
-rw-r--r--	fs/ext4/namei.c	2
-rw-r--r--	fs/ext4/page-io.c	1
-rw-r--r--	fs/ext4/resize.c	1175
-rw-r--r--	fs/ext4/super.c	24
-rw-r--r--	fs/ext4/xattr_security.c	6
-rw-r--r--	fs/ext4/xattr_trusted.c	1
-rw-r--r--	fs/ext4/xattr_user.c	1
-rw-r--r--	fs/fat/namei_vfat.c	3
-rw-r--r--	fs/fs-writeback.c	16
-rw-r--r--	fs/fuse/dev.c	57
-rw-r--r--	fs/fuse/dir.c	58
-rw-r--r--	fs/fuse/file.c	58
-rw-r--r--	fs/fuse/fuse_i.h	10
-rw-r--r--	fs/gfs2/glock.c	2
-rw-r--r--	fs/gfs2/glock.h	7
-rw-r--r--	fs/gfs2/incore.h	60
-rw-r--r--	fs/gfs2/inode.c	4
-rw-r--r--	fs/gfs2/lock_dlm.c	993
-rw-r--r--	fs/gfs2/main.c	10
-rw-r--r--	fs/gfs2/ops_fstype.c	31
-rw-r--r--	fs/gfs2/recovery.c	11
-rw-r--r--	fs/gfs2/rgrp.c	2
-rw-r--r--	fs/gfs2/sys.c	33
-rw-r--r--	fs/gfs2/sys.h	2
-rw-r--r--	fs/hfsplus/super.c	11
-rw-r--r--	fs/hugetlbfs/inode.c	3
-rw-r--r--	fs/inode.c	5
-rw-r--r--	fs/ioprio.c	24
-rw-r--r--	fs/isofs/inode.c	7
-rw-r--r--	fs/jbd/checkpoint.c	27
-rw-r--r--	fs/jbd/commit.c	6
-rw-r--r--	fs/jbd/journal.c	1
-rw-r--r--	fs/jbd/recovery.c	4
-rw-r--r--	fs/jbd/revoke.c	34
-rw-r--r--	fs/jbd/transaction.c	38
-rw-r--r--	fs/jbd2/commit.c	6
-rw-r--r--	fs/jbd2/revoke.c	34
-rw-r--r--	fs/jbd2/transaction.c	5
-rw-r--r--	fs/jffs2/erase.c	17
-rw-r--r--	fs/jffs2/fs.c	1
-rw-r--r--	fs/jffs2/readinode.c	22
-rw-r--r--	fs/jffs2/scan.c	12
-rw-r--r--	fs/jffs2/super.c	4
-rw-r--r--	fs/jffs2/wbuf.c	38
-rw-r--r--	fs/jffs2/writev.c	32
-rw-r--r--	fs/lockd/mon.c	2
-rw-r--r--	fs/logfs/dev_mtd.c	80
-rw-r--r--	fs/logfs/dir.c	2
-rw-r--r--	fs/logfs/file.c	2
-rw-r--r--	fs/logfs/gc.c	2
-rw-r--r--	fs/logfs/inode.c	4
-rw-r--r--	fs/logfs/journal.c	1
-rw-r--r--	fs/logfs/logfs.h	5
-rw-r--r--	fs/logfs/readwrite.c	51
-rw-r--r--	fs/logfs/segment.c	51
-rw-r--r--	fs/logfs/super.c	3
-rw-r--r--	fs/mpage.c	4
-rw-r--r--	fs/namei.c	28
-rw-r--r--	fs/nfs/blocklayout/blocklayout.c	202
-rw-r--r--	fs/nfs/blocklayout/blocklayout.h	12
-rw-r--r--	fs/nfs/blocklayout/extents.c	176
-rw-r--r--	fs/nfs/callback.h	2
-rw-r--r--	fs/nfs/callback_proc.c	2
-rw-r--r--	fs/nfs/callback_xdr.c	4
-rw-r--r--	fs/nfs/client.c	12
-rw-r--r--	fs/nfs/file.c	4
-rw-r--r--	fs/nfs/idmap.c	83
-rw-r--r--	fs/nfs/inode.c	4
-rw-r--r--	fs/nfs/internal.h	4
-rw-r--r--	fs/nfs/nfs4_fs.h	3
-rw-r--r--	fs/nfs/nfs4filelayout.c	9
-rw-r--r--	fs/nfs/nfs4filelayoutdev.c	2
-rw-r--r--	fs/nfs/nfs4proc.c	177
-rw-r--r--	fs/nfs/nfs4state.c	104
-rw-r--r--	fs/nfs/nfs4xdr.c	137
-rw-r--r--	fs/nfs/objlayout/objio_osd.c	3
-rw-r--r--	fs/nfs/objlayout/objlayout.c	4
-rw-r--r--	fs/nfs/pnfs.c	42
-rw-r--r--	fs/nfs/pnfs.h	1
-rw-r--r--	fs/nfs/super.c	43
-rw-r--r--	fs/nfs/write.c	31
-rw-r--r--	fs/nfsd/Kconfig	10
-rw-r--r--	fs/nfsd/Makefile	1
-rw-r--r--	fs/nfsd/export.c	12
-rw-r--r--	fs/nfsd/fault_inject.c	91
-rw-r--r--	fs/nfsd/fault_inject.h	28
-rw-r--r--	fs/nfsd/nfs4callback.c	2
-rw-r--r--	fs/nfsd/nfs4idmap.c	11
-rw-r--r--	fs/nfsd/nfs4proc.c	7
-rw-r--r--	fs/nfsd/nfs4recover.c	22
-rw-r--r--	fs/nfsd/nfs4state.c	328
-rw-r--r--	fs/nfsd/nfs4xdr.c	3
-rw-r--r--	fs/nfsd/nfsctl.c	10
-rw-r--r--	fs/nfsd/nfsd.h	20
-rw-r--r--	fs/nfsd/state.h	3
-rw-r--r--	fs/nfsd/vfs.c	17
-rw-r--r--	fs/nls/nls_base.c	73
-rw-r--r--	fs/notify/mark.c	8
-rw-r--r--	fs/ntfs/super.c	2
-rw-r--r--	fs/ocfs2/stack_user.c	4
-rw-r--r--	fs/pipe.c	2
-rw-r--r--	fs/proc/array.c	9
-rw-r--r--	fs/proc/base.c	683
-rw-r--r--	fs/proc/inode.c	18
-rw-r--r--	fs/proc/internal.h	1
-rw-r--r--	fs/proc/root.c	70
-rw-r--r--	fs/proc/stat.c	2
-rw-r--r--	fs/proc/task_mmu.c	3
-rw-r--r--	fs/qnx4/inode.c	62
-rw-r--r--	fs/quota/dquot.c	8
-rw-r--r--	fs/reiserfs/bitmap.c	3
-rw-r--r--	fs/reiserfs/journal.c	64
-rw-r--r--	fs/reiserfs/super.c	81
-rw-r--r--	fs/romfs/mmap-nommu.c	28
-rw-r--r--	fs/squashfs/cache.c	30
-rw-r--r--	fs/squashfs/inode.c	4
-rw-r--r--	fs/squashfs/squashfs_fs_sb.h	1
-rw-r--r--	fs/squashfs/super.c	2
-rw-r--r--	fs/super.c	2
-rw-r--r--	fs/sysfs/file.c	6
-rw-r--r--	fs/sysfs/inode.c	5
-rw-r--r--	fs/ubifs/debug.c	90
-rw-r--r--	fs/ubifs/debug.h	75
-rw-r--r--	fs/ubifs/journal.c	7
-rw-r--r--	fs/ubifs/lpt.c	6
-rw-r--r--	fs/ubifs/replay.c	8
-rw-r--r--	fs/ubifs/tnc.c	58
-rw-r--r--	fs/ubifs/tnc_misc.c	10
-rw-r--r--	fs/ubifs/xattr.c	6
-rw-r--r--	fs/udf/file.c	6
-rw-r--r--	fs/udf/inode.c	57
-rw-r--r--	fs/udf/super.c	6
-rw-r--r--	fs/udf/symlink.c	14
-rw-r--r--	fs/xfs/xfs_aops.c	29
-rw-r--r--	fs/xfs/xfs_attr.c	4
-rw-r--r--	fs/xfs/xfs_attr_leaf.c	9
-rw-r--r--	fs/xfs/xfs_bmap.c	116
-rw-r--r--	fs/xfs/xfs_dfrag.c	43
-rw-r--r--	fs/xfs/xfs_discard.c	4
-rw-r--r--	fs/xfs/xfs_file.c	184
-rw-r--r--	fs/xfs/xfs_fs_subr.c	2
-rw-r--r--	fs/xfs/xfs_iget.c	24
-rw-r--r--	fs/xfs/xfs_inode.c	193
-rw-r--r--	fs/xfs/xfs_inode.h	114
-rw-r--r--	fs/xfs/xfs_inode_item.c	8
-rw-r--r--	fs/xfs/xfs_iomap.c	46
-rw-r--r--	fs/xfs/xfs_iops.c	46
-rw-r--r--	fs/xfs/xfs_qm_syscalls.c	8
-rw-r--r--	fs/xfs/xfs_super.c	8
-rw-r--r--	fs/xfs/xfs_sync.c	9
-rw-r--r--	fs/xfs/xfs_trace.h	29
-rw-r--r--	fs/xfs/xfs_vnodeops.c	47
292 files changed, 14423 insertions, 4685 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 945aa5f02f9b..a9ea73d6dcf3 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -62,8 +62,8 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
 	uint16_t klen = 0;
 
 	v9ses = (struct v9fs_session_info *)cookie_netfs_data;
-	P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses,
-		   buffer, bufmax);
+	p9_debug(P9_DEBUG_FSC, "session %p buf %p size %u\n",
+		 v9ses, buffer, bufmax);
 
 	if (v9ses->cachetag)
 		klen = strlen(v9ses->cachetag);
@@ -72,7 +72,7 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
 		return 0;
 
 	memcpy(buffer, v9ses->cachetag, klen);
-	P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag);
+	p9_debug(P9_DEBUG_FSC, "cache session tag %s\n", v9ses->cachetag);
 	return klen;
 }
 
@@ -91,14 +91,14 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
 	v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
 						&v9fs_cache_session_index_def,
 						v9ses);
-	P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses,
-		   v9ses->fscache);
+	p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n",
+		 v9ses, v9ses->fscache);
 }
 
 void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
 {
-	P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses,
-		   v9ses->fscache);
+	p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n",
+		 v9ses, v9ses->fscache);
 	fscache_relinquish_cookie(v9ses->fscache, 0);
 	v9ses->fscache = NULL;
 }
@@ -109,8 +109,8 @@ static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
 {
 	const struct v9fs_inode *v9inode = cookie_netfs_data;
 	memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path));
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode,
-		   v9inode->qid.path);
+	p9_debug(P9_DEBUG_FSC, "inode %p get key %llu\n",
+		 &v9inode->vfs_inode, v9inode->qid.path);
 	return sizeof(v9inode->qid.path);
 }
 
@@ -120,8 +120,8 @@ static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
 	const struct v9fs_inode *v9inode = cookie_netfs_data;
 	*size = i_size_read(&v9inode->vfs_inode);
 
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode,
-		   *size);
+	p9_debug(P9_DEBUG_FSC, "inode %p get attr %llu\n",
+		 &v9inode->vfs_inode, *size);
 }
 
 static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
@@ -129,8 +129,8 @@ static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
 {
 	const struct v9fs_inode *v9inode = cookie_netfs_data;
 	memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version));
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode,
-		   v9inode->qid.version);
+	p9_debug(P9_DEBUG_FSC, "inode %p get aux %u\n",
+		 &v9inode->vfs_inode, v9inode->qid.version);
 	return sizeof(v9inode->qid.version);
 }
 
@@ -206,8 +206,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
 						  &v9fs_cache_inode_index_def,
 						  v9inode);
 
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
-		   v9inode->fscache);
+	p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
+		 inode, v9inode->fscache);
 }
 
 void v9fs_cache_inode_put_cookie(struct inode *inode)
@@ -216,8 +216,8 @@ void v9fs_cache_inode_put_cookie(struct inode *inode)
 
 	if (!v9inode->fscache)
 		return;
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
-		   v9inode->fscache);
+	p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n",
+		 inode, v9inode->fscache);
 
 	fscache_relinquish_cookie(v9inode->fscache, 0);
 	v9inode->fscache = NULL;
@@ -229,8 +229,8 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode)
 
 	if (!v9inode->fscache)
 		return;
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
-		   v9inode->fscache);
+	p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n",
+		 inode, v9inode->fscache);
 
 	fscache_relinquish_cookie(v9inode->fscache, 1);
 	v9inode->fscache = NULL;
@@ -272,8 +272,8 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
 	v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
 						  &v9fs_cache_inode_index_def,
 						  v9inode);
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
-		   inode, old, v9inode->fscache);
+	p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
+		 inode, old, v9inode->fscache);
 
 	spin_unlock(&v9inode->fscache_lock);
 }
@@ -323,7 +323,7 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
 	int ret;
 	const struct v9fs_inode *v9inode = V9FS_I(inode);
 
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+	p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
 	if (!v9inode->fscache)
 		return -ENOBUFS;
 
@@ -335,13 +335,13 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
 	switch (ret) {
 	case -ENOBUFS:
 	case -ENODATA:
-		P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret);
+		p9_debug(P9_DEBUG_FSC, "page/inode not in cache %d\n", ret);
 		return 1;
 	case 0:
-		P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+		p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
 		return ret;
 	default:
-		P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+		p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
 		return ret;
 	}
 }
@@ -361,7 +361,7 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
 	int ret;
 	const struct v9fs_inode *v9inode = V9FS_I(inode);
 
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
+	p9_debug(P9_DEBUG_FSC, "inode %p pages %u\n", inode, *nr_pages);
 	if (!v9inode->fscache)
 		return -ENOBUFS;
 
@@ -373,15 +373,15 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
 	switch (ret) {
 	case -ENOBUFS:
 	case -ENODATA:
-		P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret);
+		p9_debug(P9_DEBUG_FSC, "pages/inodes not in cache %d\n", ret);
 		return 1;
 	case 0:
 		BUG_ON(!list_empty(pages));
 		BUG_ON(*nr_pages != 0);
-		P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+		p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
 		return ret;
 	default:
-		P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+		p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
 		return ret;
 	}
 }
@@ -396,9 +396,9 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
 	int ret;
 	const struct v9fs_inode *v9inode = V9FS_I(inode);
 
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+	p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
 	ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL);
-	P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret);
+	p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret);
 	if (ret != 0)
 		v9fs_uncache_page(inode, page);
 }
@@ -409,7 +409,7 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
 void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
 {
 	const struct v9fs_inode *v9inode = V9FS_I(inode);
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+	p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
 	if (PageFsCache(page))
 		fscache_wait_on_page_write(v9inode->fscache, page);
 }
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 85b67ffa2a43..da8eefbe830d 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -45,8 +45,8 @@ int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
 {
 	struct v9fs_dentry *dent;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "fid %d dentry %s\n",
-		   fid->fid, dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "fid %d dentry %s\n",
+		 fid->fid, dentry->d_name.name);
 
 	dent = dentry->d_fsdata;
 	if (!dent) {
@@ -79,8 +79,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any)
 	struct v9fs_dentry *dent;
 	struct p9_fid *fid, *ret;
 
-	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
-		   dentry->d_name.name, dentry, uid, any);
+	p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
+		 dentry->d_name.name, dentry, uid, any);
 	dent = (struct v9fs_dentry *) dentry->d_fsdata;
 	ret = NULL;
 	if (dent) {
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 2b78014a124a..1964f98e74be 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -23,6 +23,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
@@ -85,15 +87,15 @@ static int get_cache_mode(char *s)
 
 	if (!strcmp(s, "loose")) {
 		version = CACHE_LOOSE;
-		P9_DPRINTK(P9_DEBUG_9P, "Cache mode: loose\n");
+		p9_debug(P9_DEBUG_9P, "Cache mode: loose\n");
 	} else if (!strcmp(s, "fscache")) {
 		version = CACHE_FSCACHE;
-		P9_DPRINTK(P9_DEBUG_9P, "Cache mode: fscache\n");
+		p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n");
 	} else if (!strcmp(s, "none")) {
 		version = CACHE_NONE;
-		P9_DPRINTK(P9_DEBUG_9P, "Cache mode: none\n");
+		p9_debug(P9_DEBUG_9P, "Cache mode: none\n");
 	} else
-		printk(KERN_INFO "9p: Unknown Cache mode %s.\n", s);
+		pr_info("Unknown Cache mode %s\n", s);
 	return version;
 }
 
@@ -140,8 +142,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 		case Opt_debug:
 			r = match_int(&args[0], &option);
 			if (r < 0) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "integer field, but no integer?\n");
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
 				ret = r;
 				continue;
 			}
@@ -154,8 +156,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 		case Opt_dfltuid:
 			r = match_int(&args[0], &option);
 			if (r < 0) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "integer field, but no integer?\n");
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
 				ret = r;
 				continue;
 			}
@@ -164,8 +166,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 		case Opt_dfltgid:
 			r = match_int(&args[0], &option);
 			if (r < 0) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "integer field, but no integer?\n");
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
 				ret = r;
 				continue;
 			}
@@ -174,8 +176,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 		case Opt_afid:
 			r = match_int(&args[0], &option);
 			if (r < 0) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "integer field, but no integer?\n");
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
 				ret = r;
 				continue;
 			}
@@ -205,8 +207,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 			s = match_strdup(&args[0]);
 			if (!s) {
 				ret = -ENOMEM;
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "problem allocating copy of cache arg\n");
+				p9_debug(P9_DEBUG_ERROR,
+					 "problem allocating copy of cache arg\n");
 				goto free_and_return;
 			}
 			ret = get_cache_mode(s);
@@ -223,8 +225,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 			s = match_strdup(&args[0]);
 			if (!s) {
 				ret = -ENOMEM;
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "problem allocating copy of access arg\n");
+				p9_debug(P9_DEBUG_ERROR,
+					 "problem allocating copy of access arg\n");
 				goto free_and_return;
 			}
 
@@ -240,8 +242,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 				v9ses->uid = simple_strtoul(s, &e, 10);
 				if (*e != '\0') {
 					ret = -EINVAL;
-					printk(KERN_INFO "9p: Unknown access "
-					       "argument %s.\n", s);
+					pr_info("Unknown access argument %s\n",
+						s);
 					kfree(s);
 					goto free_and_return;
 				}
@@ -254,9 +256,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 #ifdef CONFIG_9P_FS_POSIX_ACL
 			v9ses->flags |= V9FS_POSIX_ACL;
 #else
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "Not defined CONFIG_9P_FS_POSIX_ACL. "
-				   "Ignoring posixacl option\n");
+			p9_debug(P9_DEBUG_ERROR,
+				 "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n");
 #endif
 			break;
 
@@ -318,7 +319,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	if (IS_ERR(v9ses->clnt)) {
 		retval = PTR_ERR(v9ses->clnt);
 		v9ses->clnt = NULL;
-		P9_DPRINTK(P9_DEBUG_ERROR, "problem initializing 9p client\n");
+		p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
 		goto error;
 	}
 
@@ -371,7 +372,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	if (IS_ERR(fid)) {
 		retval = PTR_ERR(fid);
 		fid = NULL;
-		P9_DPRINTK(P9_DEBUG_ERROR, "cannot attach\n");
+		p9_debug(P9_DEBUG_ERROR, "cannot attach\n");
 		goto error;
 	}
 
@@ -429,7 +430,7 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
  */
 
 void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
-	P9_DPRINTK(P9_DEBUG_ERROR, "cancel session %p\n", v9ses);
+	p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses);
 	p9_client_disconnect(v9ses->clnt);
 }
 
@@ -442,7 +443,7 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
 
 void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses)
 {
-	P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
+	p9_debug(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
 	p9_client_begin_disconnect(v9ses->clnt);
 }
 
@@ -591,23 +592,23 @@ static void v9fs_cache_unregister(void)
 static int __init init_v9fs(void)
 {
 	int err;
-	printk(KERN_INFO "Installing v9fs 9p2000 file system support\n");
+	pr_info("Installing v9fs 9p2000 file system support\n");
 	/* TODO: Setup list of registered trasnport modules */
 	err = register_filesystem(&v9fs_fs_type);
 	if (err < 0) {
-		printk(KERN_ERR "Failed to register filesystem\n");
+		pr_err("Failed to register filesystem\n");
 		return err;
 	}
 
 	err = v9fs_cache_register();
 	if (err < 0) {
-		printk(KERN_ERR "Failed to register v9fs for caching\n");
+		pr_err("Failed to register v9fs for caching\n");
 		goto out_fs_unreg;
 	}
 
 	err = v9fs_sysfs_init();
 	if (err < 0) {
-		printk(KERN_ERR "Failed to register with sysfs\n");
+		pr_err("Failed to register with sysfs\n");
 		goto out_sysfs_cleanup;
 	}
 
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 2524e4cbb8ea..0ad61c6a65a5 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -56,7 +56,7 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
 	struct inode *inode;
 
 	inode = page->mapping->host;
-	P9_DPRINTK(P9_DEBUG_VFS, "\n");
+	p9_debug(P9_DEBUG_VFS, "\n");
 
 	BUG_ON(!PageLocked(page));
 
@@ -116,14 +116,14 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
 	struct inode *inode;
 
 	inode = mapping->host;
-	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
+	p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
 
 	ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages);
 	if (ret == 0)
 		return ret;
 
 	ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp);
-	P9_DPRINTK(P9_DEBUG_VFS, " = %d\n", ret);
+	p9_debug(P9_DEBUG_VFS, " = %d\n", ret);
 	return ret;
 }
 
@@ -263,10 +263,9 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	 * Now that we do caching with cache mode enabled, We need
 	 * to support direct IO
 	 */
-	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
-		   "off/no(%lld/%lu) EINVAL\n",
-		   iocb->ki_filp->f_path.dentry->d_name.name,
-		   (long long) pos, nr_segs);
+	p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n",
+		 iocb->ki_filp->f_path.dentry->d_name.name,
+		 (long long)pos, nr_segs);
 
 	return -EINVAL;
 }
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index e022890c6f40..d529437ff442 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -53,8 +53,8 @@
 
 static int v9fs_dentry_delete(const struct dentry *dentry)
 {
-	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
-		   dentry);
+	p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
+		 dentry->d_name.name, dentry);
 
 	return 1;
 }
@@ -66,8 +66,8 @@ static int v9fs_dentry_delete(const struct dentry *dentry)
  */
 static int v9fs_cached_dentry_delete(const struct dentry *dentry)
 {
-	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n",
-		   dentry->d_name.name, dentry);
+	p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
+		 dentry->d_name.name, dentry);
 
 	/* Don't cache negative dentries */
 	if (!dentry->d_inode)
@@ -86,8 +86,8 @@ static void v9fs_dentry_release(struct dentry *dentry)
 	struct v9fs_dentry *dent;
 	struct p9_fid *temp, *current_fid;
 
-	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
-		   dentry);
+	p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
+		 dentry->d_name.name, dentry);
 	dent = dentry->d_fsdata;
 	if (dent) {
 		list_for_each_entry_safe(current_fid, temp, &dent->fidlist,
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 598fff1a54e5..ff911e779651 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -140,7 +140,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int reclen = 0;
 	struct p9_rdir *rdir;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
 	fid = filp->private_data;
 
 	buflen = fid->clnt->msize - P9_IOHDRSZ;
@@ -168,7 +168,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
 				  rdir->tail - rdir->head, &st);
 		if (err) {
-			P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
+			p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
 			err = -EIO;
 			p9stat_free(&st);
 			goto unlock_and_exit;
@@ -213,7 +213,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 	struct p9_dirent curdirent;
 	u64 oldoffset = 0;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
 	fid = filp->private_data;
 
 	buflen = fid->clnt->msize - P9_READDIRHDRSZ;
@@ -244,7 +244,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 					  rdir->tail - rdir->head,
 					  &curdirent);
 		if (err < 0) {
-			P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
+			p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
 			err = -EIO;
 			goto unlock_and_exit;
 		}
@@ -290,9 +290,8 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 	struct p9_fid *fid;
 
 	fid = filp->private_data;
-	P9_DPRINTK(P9_DEBUG_VFS,
-			"v9fs_dir_release: inode: %p filp: %p fid: %d\n",
-			inode, filp, fid ? fid->fid : -1);
+	p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n",
+		 inode, filp, fid ? fid->fid : -1);
 	if (fid)
 		p9_client_clunk(fid);
 	return 0;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 62857a810a79..fc06fd27065e 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 	struct p9_fid *fid;
 	int omode;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
+	p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
 	v9inode = V9FS_I(inode);
 	v9ses = v9fs_inode2v9ses(inode);
 	if (v9fs_proto_dotl(v9ses))
@@ -135,7 +135,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
 	int res = 0;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
+	p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
 
 	/* No mandatory locks */
 	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -204,7 +204,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 			break;
 		if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
 			break;
-		schedule_timeout_interruptible(P9_LOCK_TIMEOUT);
+		if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0)
+			break;
 	}
 
 	/* map 9p status to VFS status */
@@ -304,8 +305,8 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int ret = -ENOLCK;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
-		   cmd, fl, filp->f_path.dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
+		 filp, cmd, fl, filp->f_path.dentry->d_name.name);
 
 	/* No mandatory locks */
 	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -340,8 +341,8 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int ret = -ENOLCK;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
-		   cmd, fl, filp->f_path.dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
+		 filp, cmd, fl, filp->f_path.dentry->d_name.name);
 
 	/* No mandatory locks */
 	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -384,8 +385,8 @@ v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count,
 {
 	int n, total, size;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
-		   (long long unsigned) offset, count);
+	p9_debug(P9_DEBUG_VFS, "fid %d offset %llu count %d\n",
+		 fid->fid, (long long unsigned)offset, count);
 	n = 0;
 	total = 0;
 	size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
@@ -443,7 +444,7 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
 	struct p9_fid *fid;
 	size_t size;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
+	p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
 	fid = filp->private_data;
 
 	size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
@@ -470,8 +471,8 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
 	loff_t origin = *offset;
 	unsigned long pg_start, pg_end;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
-		   (int)count, (int)*offset);
+	p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n",
+		 data, (int)count, (int)*offset);
 
 	clnt = fid->clnt;
 	do {
@@ -552,7 +553,7 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
 		return retval;
 
 	mutex_lock(&inode->i_mutex);
-	P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
+	p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
 
 	fid = filp->private_data;
 	v9fs_blank_wstat(&wstat);
@@ -575,8 +576,7 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
 		return retval;
 
 	mutex_lock(&inode->i_mutex);
-	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n",
-		   filp, datasync);
+	p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
 
 	fid = filp->private_data;
 
@@ -607,8 +607,8 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = filp->f_path.dentry->d_inode;
 
 
-	P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n",
-		   page, (unsigned long)filp->private_data);
+	p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n",
+		 page, (unsigned long)filp->private_data);
 
 	v9inode = V9FS_I(inode);
 	/* make sure the cache has finished storing the page */
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index e0f20de6aa2b..014c8dd62962 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -23,6 +23,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
@@ -88,6 +90,32 @@ static u32 unixmode2p9mode(struct v9fs_session_info *v9ses, umode_t mode)
 }
 
 /**
+ * p9mode2perm- convert plan9 mode bits to unix permission bits
+ * @v9ses: v9fs session information
+ * @stat: p9_wstat from which mode need to be derived
+ *
+ */
+static int p9mode2perm(struct v9fs_session_info *v9ses,
+		       struct p9_wstat *stat)
+{
+	int res;
+	int mode = stat->mode;
+
+	res = mode & S_IALLUGO;
+	if (v9fs_proto_dotu(v9ses)) {
+		if ((mode & P9_DMSETUID) == P9_DMSETUID)
+			res |= S_ISUID;
+
+		if ((mode & P9_DMSETGID) == P9_DMSETGID)
+			res |= S_ISGID;
+
+		if ((mode & P9_DMSETVTX) == P9_DMSETVTX)
+			res |= S_ISVTX;
+	}
+	return res;
+}
+
+/**
  * p9mode2unixmode- convert plan9 mode bits to unix mode bits
  * @v9ses: v9fs session information
  * @stat: p9_wstat from which mode need to be derived
@@ -100,8 +128,8 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
 	int res;
 	u32 mode = stat->mode;
 
-	res = mode & S_IALLUGO;
 	*rdev = 0;
+	res = p9mode2perm(v9ses, stat);
 
 	if ((mode & P9_DMDIR) == P9_DMDIR)
 		res |= S_IFDIR;
@@ -128,24 +156,13 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
 			res |= S_IFBLK;
 			break;
 		default:
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "Unknown special type %c %s\n", type,
-				   stat->extension);
+			p9_debug(P9_DEBUG_ERROR, "Unknown special type %c %s\n",
+				 type, stat->extension);
 		};
 		*rdev = MKDEV(major, minor);
 	} else
 		res |= S_IFREG;
 
-	if (v9fs_proto_dotu(v9ses)) {
-		if ((mode & P9_DMSETUID) == P9_DMSETUID)
-			res |= S_ISUID;
-
-		if ((mode & P9_DMSETGID) == P9_DMSETGID)
-			res |= S_ISGID;
-
-		if ((mode & P9_DMSETVTX) == P9_DMSETVTX)
-			res |= S_ISVTX;
-	}
 	return res;
 }
 
@@ -275,8 +292,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
 		} else if (v9fs_proto_dotu(v9ses)) {
 			inode->i_op = &v9fs_file_inode_operations;
 		} else {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "special files without extended mode\n");
+			p9_debug(P9_DEBUG_ERROR,
+				 "special files without extended mode\n");
 			err = -EINVAL;
 			goto error;
 		}
@@ -301,8 +318,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
 		break;
 	case S_IFLNK:
 		if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
-			P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
-				   "legacy protocol.\n");
+			p9_debug(P9_DEBUG_ERROR,
+				 "extended modes used with legacy protocol\n");
 			err = -EINVAL;
 			goto error;
 		}
@@ -329,8 +346,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
 
 		break;
 	default:
-		P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%hx S_IFMT 0x%x\n",
-			   mode, mode & S_IFMT);
+		p9_debug(P9_DEBUG_ERROR, "BAD mode 0x%hx S_IFMT 0x%x\n",
+			 mode, mode & S_IFMT);
 		err = -EINVAL;
 		goto error;
 	}
@@ -352,11 +369,12 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
 	struct inode *inode;
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode);
+	p9_debug(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode);
 
 	inode = new_inode(sb);
 	if (!inode) {
-		P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
+		pr_warn("%s (%d): Problem allocating inode\n",
+			__func__, task_pid_nr(current));
 		return ERR_PTR(-ENOMEM);
 	}
 	err = v9fs_init_inode(v9ses, inode, mode, rdev);
@@ -573,15 +591,15 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
 	struct p9_fid *v9fid, *dfid;
 	struct v9fs_session_info *v9ses;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n",
-		   dir, dentry, flags);
+	p9_debug(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n",
+		 dir, dentry, flags);
 
 	v9ses = v9fs_inode2v9ses(dir);
 	inode = dentry->d_inode;
 	dfid = v9fs_fid_lookup(dentry->d_parent);
 	if (IS_ERR(dfid)) {
 		retval = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", retval);
+		p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", retval);
 		return retval;
 	}
 	if (v9fs_proto_dotl(v9ses))
@@ -630,7 +648,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	struct p9_fid *dfid, *ofid, *fid;
 	struct inode *inode;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
 
 	err = 0;
 	ofid = NULL;
@@ -639,7 +657,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	dfid = v9fs_fid_lookup(dentry->d_parent);
 	if (IS_ERR(dfid)) {
 		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+		p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
 		return ERR_PTR(err);
 	}
 
@@ -647,36 +665,41 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	ofid = p9_client_walk(dfid, 0, NULL, 1);
 	if (IS_ERR(ofid)) {
 		err = PTR_ERR(ofid);
-		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+		p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
		return ERR_PTR(err);
 	}
 
 	err = p9_client_fcreate(ofid, name, perm, mode, extension);
 	if (err < 0) {
-		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
-		goto error;
-	}
-
-	/* now walk from the parent so we can get unopened fid */
-	fid = p9_client_walk(dfid, 1, &name, 1);
-	if (IS_ERR(fid)) {
-		err = PTR_ERR(fid);
-		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-		fid = NULL;
+		p9_debug(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
 		goto error;
 	}
 
-	/* instantiate inode and assign the unopened fid to the dentry */
-	inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
-		goto error;
+	if (!(perm & P9_DMLINK)) {
+		/* now walk from the parent so we can get unopened fid */
+		fid = p9_client_walk(dfid, 1, &name, 1);
+		if (IS_ERR(fid)) {
+			err = PTR_ERR(fid);
+			p9_debug(P9_DEBUG_VFS,
+				 "p9_client_walk failed %d\n", err);
+			fid = NULL;
+			goto error;
+		}
+		/*
+		 * instantiate inode and assign the unopened fid to the dentry
+		 */
+		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			p9_debug(P9_DEBUG_VFS,
+				 "inode creation failed %d\n", err);
+			goto error;
+		}
+		err = v9fs_fid_add(dentry, fid);
+		if (err < 0)
+			goto error;
+		d_instantiate(dentry, inode);
 	}
-	err = v9fs_fid_add(dentry, fid);
-	if (err < 0)
-		goto error;
-	d_instantiate(dentry, inode);
 	return ofid;
 error:
 	if (ofid)
@@ -788,7 +811,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
 	struct p9_fid *fid;
 	struct v9fs_session_info *v9ses;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
 	err = 0;
 	v9ses = v9fs_inode2v9ses(dir);
 	perm = unixmode2p9mode(v9ses, mode | S_IFDIR);
@@ -826,8 +849,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	char *name;
 	int result = 0;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
-		   dir, dentry->d_name.name, dentry, nameidata);
+	p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
+		 dir, dentry->d_name.name, dentry, nameidata);
 
 	if (dentry->d_name.len > NAME_MAX)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -933,7 +956,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct p9_fid *newdirfid;
 	struct p9_wstat wstat;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "\n");
+	p9_debug(P9_DEBUG_VFS, "\n");
 	retval = 0;
 	old_inode = old_dentry->d_inode;
 	new_inode = new_dentry->d_inode;
@@ -969,8 +992,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		 * 9P .u can only handle file rename in the same directory
 		 */
 
-		P9_DPRINTK(P9_DEBUG_ERROR,
-				"old dir and new dir are different\n");
+		p9_debug(P9_DEBUG_ERROR, "old dir and new dir are different\n");
 		retval = -EXDEV;
 		goto clunk_newdir;
 	}
@@ -1026,7 +1048,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	struct p9_fid *fid;
 	struct p9_wstat *st;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
+	p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
 	err = -EPERM;
 	v9ses = v9fs_dentry2v9ses(dentry);
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
@@ -1063,7 +1085,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	struct p9_fid *fid;
 	struct p9_wstat wstat;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "\n");
+	p9_debug(P9_DEBUG_VFS, "\n");
 	retval = inode_change_ok(dentry->d_inode, iattr);
 	if (retval)
 		return retval;
@@ -1162,7 +1184,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 			set_nlink(inode, i_nlink);
 		}
 	}
-	mode = stat->mode & S_IALLUGO;
+	mode = p9mode2perm(v9ses, stat);
 	mode |= inode->i_mode & ~S_IALLUGO;
 	inode->i_mode = mode;
 	i_size_write(inode, stat->length);
@@ -1208,7 +1230,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 	struct p9_fid *fid;
 	struct p9_wstat *st;
 
-	P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
 	retval = -EPERM;
 	v9ses = v9fs_dentry2v9ses(dentry);
 	fid = v9fs_fid_lookup(dentry);
@@ -1230,8 +1252,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 	/* copy extension buffer into buffer */
 	strncpy(buffer, st->extension, buflen);
 
-	P9_DPRINTK(P9_DEBUG_VFS,
-		"%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer);
+	p9_debug(P9_DEBUG_VFS, "%s -> %s (%s)\n",
+		 dentry->d_name.name, st->extension, buffer);
 
 	retval = strnlen(buffer, buflen);
 done:
@@ -1252,7 +1274,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	int len = 0;
 	char *link = __getname();
 
-	P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
 
 	if (!link)
 		link = ERR_PTR(-ENOMEM);
@@ -1283,8 +1305,8 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
 {
 	char *s = nd_get_link(nd);
 
-	P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name,
-		   IS_ERR(s) ? "<error>" : s);
+	p9_debug(P9_DEBUG_VFS, " %s %s\n",
+		 dentry->d_name.name, IS_ERR(s) ? "<error>" : s);
 	if (!IS_ERR(s))
 		__putname(s);
 }
@@ -1306,7 +1328,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 
 	v9ses = v9fs_inode2v9ses(dir);
 	if (!v9fs_proto_dotu(v9ses)) {
-		P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n");
+		p9_debug(P9_DEBUG_ERROR, "not extended\n");
 		return -EPERM;
 	}
 
@@ -1333,8 +1355,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 static int
 v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 {
-	P9_DPRINTK(P9_DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino,
-		   dentry->d_name.name, symname);
+	p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n",
+		 dir->i_ino, dentry->d_name.name, symname);
 
 	return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname);
 }
@@ -1355,9 +1377,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
 	char *name;
 	struct p9_fid *oldfid;
 
-	P9_DPRINTK(P9_DEBUG_VFS,
-		" %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
-		old_dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n",
+		 dir->i_ino, dentry->d_name.name, old_dentry->d_name.name);
 
 	oldfid = v9fs_fid_clone(old_dentry);
 	if (IS_ERR(oldfid))
@@ -1398,9 +1419,9 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde
 	char *name;
 	u32 perm;
 
-	P9_DPRINTK(P9_DEBUG_VFS,
-		" %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", dir->i_ino,
-		dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
+	p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n",
+		 dir->i_ino, dentry->d_name.name, mode,
+		 MAJOR(rdev), MINOR(rdev));
 
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 8ef152ac6a16..a1e6c990cd41 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -283,13 +283,13 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
283 } 283 }
284 284
285 name = (char *) dentry->d_name.name; 285 name = (char *) dentry->d_name.name;
286 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " 286 p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n",
287 "mode:0x%hx\n", name, flags, omode); 287 name, flags, omode);
288 288
289 dfid = v9fs_fid_lookup(dentry->d_parent); 289 dfid = v9fs_fid_lookup(dentry->d_parent);
290 if (IS_ERR(dfid)) { 290 if (IS_ERR(dfid)) {
291 err = PTR_ERR(dfid); 291 err = PTR_ERR(dfid);
292 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); 292 p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
293 return err; 293 return err;
294 } 294 }
295 295
@@ -297,7 +297,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
297 ofid = p9_client_walk(dfid, 0, NULL, 1); 297 ofid = p9_client_walk(dfid, 0, NULL, 1);
298 if (IS_ERR(ofid)) { 298 if (IS_ERR(ofid)) {
299 err = PTR_ERR(ofid); 299 err = PTR_ERR(ofid);
300 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); 300 p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
301 return err; 301 return err;
302 } 302 }
303 303
@@ -307,16 +307,15 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
307 /* Update mode based on ACL value */ 307 /* Update mode based on ACL value */
308 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); 308 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
309 if (err) { 309 if (err) {
310 P9_DPRINTK(P9_DEBUG_VFS, 310 p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n",
311 "Failed to get acl values in creat %d\n", err); 311 err);
312 goto error; 312 goto error;
313 } 313 }
314 err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), 314 err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
315 mode, gid, &qid); 315 mode, gid, &qid);
316 if (err < 0) { 316 if (err < 0) {
317 P9_DPRINTK(P9_DEBUG_VFS, 317 p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n",
318 "p9_client_open_dotl failed in creat %d\n", 318 err);
319 err);
320 goto error; 319 goto error;
321 } 320 }
322 v9fs_invalidate_inode_attr(dir); 321 v9fs_invalidate_inode_attr(dir);
@@ -325,14 +324,14 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
325 fid = p9_client_walk(dfid, 1, &name, 1); 324 fid = p9_client_walk(dfid, 1, &name, 1);
326 if (IS_ERR(fid)) { 325 if (IS_ERR(fid)) {
327 err = PTR_ERR(fid); 326 err = PTR_ERR(fid);
328 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); 327 p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
329 fid = NULL; 328 fid = NULL;
330 goto error; 329 goto error;
331 } 330 }
332 inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); 331 inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
333 if (IS_ERR(inode)) { 332 if (IS_ERR(inode)) {
334 err = PTR_ERR(inode); 333 err = PTR_ERR(inode);
335 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); 334 p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
336 goto error; 335 goto error;
337 } 336 }
338 err = v9fs_fid_add(dentry, fid); 337 err = v9fs_fid_add(dentry, fid);
@@ -408,7 +407,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
408 struct dentry *dir_dentry; 407 struct dentry *dir_dentry;
409 struct posix_acl *dacl = NULL, *pacl = NULL; 408 struct posix_acl *dacl = NULL, *pacl = NULL;
410 409
411 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); 410 p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
412 err = 0; 411 err = 0;
413 v9ses = v9fs_inode2v9ses(dir); 412 v9ses = v9fs_inode2v9ses(dir);
414 413
@@ -420,7 +419,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
420 dfid = v9fs_fid_lookup(dir_dentry); 419 dfid = v9fs_fid_lookup(dir_dentry);
421 if (IS_ERR(dfid)) { 420 if (IS_ERR(dfid)) {
422 err = PTR_ERR(dfid); 421 err = PTR_ERR(dfid);
423 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); 422 p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
424 dfid = NULL; 423 dfid = NULL;
425 goto error; 424 goto error;
426 } 425 }
@@ -430,8 +429,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
430 /* Update mode based on ACL value */ 429 /* Update mode based on ACL value */
431 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); 430 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
432 if (err) { 431 if (err) {
433 P9_DPRINTK(P9_DEBUG_VFS, 432 p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mkdir %d\n",
434 "Failed to get acl values in mkdir %d\n", err); 433 err);
435 goto error; 434 goto error;
436 } 435 }
437 name = (char *) dentry->d_name.name; 436 name = (char *) dentry->d_name.name;
@@ -444,8 +443,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
444 fid = p9_client_walk(dfid, 1, &name, 1); 443 fid = p9_client_walk(dfid, 1, &name, 1);
445 if (IS_ERR(fid)) { 444 if (IS_ERR(fid)) {
446 err = PTR_ERR(fid); 445 err = PTR_ERR(fid);
447 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", 446 p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
448 err); 447 err);
449 fid = NULL; 448 fid = NULL;
450 goto error; 449 goto error;
451 } 450 }
@@ -453,8 +452,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
453 inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); 452 inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
454 if (IS_ERR(inode)) { 453 if (IS_ERR(inode)) {
455 err = PTR_ERR(inode); 454 err = PTR_ERR(inode);
456 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 455 p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
457 err); 456 err);
458 goto error; 457 goto error;
459 } 458 }
460 err = v9fs_fid_add(dentry, fid); 459 err = v9fs_fid_add(dentry, fid);
@@ -495,7 +494,7 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
495 struct p9_fid *fid; 494 struct p9_fid *fid;
496 struct p9_stat_dotl *st; 495 struct p9_stat_dotl *st;
497 496
498 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); 497 p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
499 err = -EPERM; 498 err = -EPERM;
500 v9ses = v9fs_dentry2v9ses(dentry); 499 v9ses = v9fs_dentry2v9ses(dentry);
501 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { 500 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
@@ -523,6 +522,46 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
523 return 0; 522 return 0;
524} 523}
525 524
525/*
526 * Attribute flags.
527 */
528#define P9_ATTR_MODE (1 << 0)
529#define P9_ATTR_UID (1 << 1)
530#define P9_ATTR_GID (1 << 2)
531#define P9_ATTR_SIZE (1 << 3)
532#define P9_ATTR_ATIME (1 << 4)
533#define P9_ATTR_MTIME (1 << 5)
534#define P9_ATTR_CTIME (1 << 6)
535#define P9_ATTR_ATIME_SET (1 << 7)
536#define P9_ATTR_MTIME_SET (1 << 8)
537
538struct dotl_iattr_map {
539 int iattr_valid;
540 int p9_iattr_valid;
541};
542
543static int v9fs_mapped_iattr_valid(int iattr_valid)
544{
545 int i;
546 int p9_iattr_valid = 0;
547 struct dotl_iattr_map dotl_iattr_map[] = {
548 { ATTR_MODE, P9_ATTR_MODE },
549 { ATTR_UID, P9_ATTR_UID },
550 { ATTR_GID, P9_ATTR_GID },
551 { ATTR_SIZE, P9_ATTR_SIZE },
552 { ATTR_ATIME, P9_ATTR_ATIME },
553 { ATTR_MTIME, P9_ATTR_MTIME },
554 { ATTR_CTIME, P9_ATTR_CTIME },
555 { ATTR_ATIME_SET, P9_ATTR_ATIME_SET },
556 { ATTR_MTIME_SET, P9_ATTR_MTIME_SET },
557 };
558 for (i = 0; i < ARRAY_SIZE(dotl_iattr_map); i++) {
559 if (iattr_valid & dotl_iattr_map[i].iattr_valid)
560 p9_iattr_valid |= dotl_iattr_map[i].p9_iattr_valid;
561 }
562 return p9_iattr_valid;
563}
564
526/** 565/**
527 * v9fs_vfs_setattr_dotl - set file metadata 566 * v9fs_vfs_setattr_dotl - set file metadata
528 * @dentry: file whose metadata to set 567 * @dentry: file whose metadata to set
@@ -537,13 +576,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
537 struct p9_fid *fid; 576 struct p9_fid *fid;
538 struct p9_iattr_dotl p9attr; 577 struct p9_iattr_dotl p9attr;
539 578
540 P9_DPRINTK(P9_DEBUG_VFS, "\n"); 579 p9_debug(P9_DEBUG_VFS, "\n");
541 580
542 retval = inode_change_ok(dentry->d_inode, iattr); 581 retval = inode_change_ok(dentry->d_inode, iattr);
543 if (retval) 582 if (retval)
544 return retval; 583 return retval;
545 584
546 p9attr.valid = iattr->ia_valid; 585 p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid);
547 p9attr.mode = iattr->ia_mode; 586 p9attr.mode = iattr->ia_mode;
548 p9attr.uid = iattr->ia_uid; 587 p9attr.uid = iattr->ia_uid;
549 p9attr.gid = iattr->ia_gid; 588 p9attr.gid = iattr->ia_gid;
@@ -670,14 +709,13 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
670 struct v9fs_session_info *v9ses; 709 struct v9fs_session_info *v9ses;
671 710
672 name = (char *) dentry->d_name.name; 711 name = (char *) dentry->d_name.name;
673 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", 712 p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname);
674 dir->i_ino, name, symname);
675 v9ses = v9fs_inode2v9ses(dir); 713 v9ses = v9fs_inode2v9ses(dir);
676 714
677 dfid = v9fs_fid_lookup(dentry->d_parent); 715 dfid = v9fs_fid_lookup(dentry->d_parent);
678 if (IS_ERR(dfid)) { 716 if (IS_ERR(dfid)) {
679 err = PTR_ERR(dfid); 717 err = PTR_ERR(dfid);
680 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); 718 p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
681 return err; 719 return err;
682 } 720 }
683 721
@@ -687,7 +725,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
687 err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); 725 err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
688 726
689 if (err < 0) { 727 if (err < 0) {
690 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); 728 p9_debug(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
691 goto error; 729 goto error;
692 } 730 }
693 731
@@ -697,8 +735,8 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
697 fid = p9_client_walk(dfid, 1, &name, 1); 735 fid = p9_client_walk(dfid, 1, &name, 1);
698 if (IS_ERR(fid)) { 736 if (IS_ERR(fid)) {
699 err = PTR_ERR(fid); 737 err = PTR_ERR(fid);
700 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", 738 p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
701 err); 739 err);
702 fid = NULL; 740 fid = NULL;
703 goto error; 741 goto error;
704 } 742 }
@@ -707,8 +745,8 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
707 inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); 745 inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
708 if (IS_ERR(inode)) { 746 if (IS_ERR(inode)) {
709 err = PTR_ERR(inode); 747 err = PTR_ERR(inode);
710 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 748 p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
711 err); 749 err);
712 goto error; 750 goto error;
713 } 751 }
714 err = v9fs_fid_add(dentry, fid); 752 err = v9fs_fid_add(dentry, fid);
@@ -751,9 +789,8 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
751 struct p9_fid *dfid, *oldfid; 789 struct p9_fid *dfid, *oldfid;
752 struct v9fs_session_info *v9ses; 790 struct v9fs_session_info *v9ses;
753 791
754 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", 792 p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
755 dir->i_ino, old_dentry->d_name.name, 793 dir->i_ino, old_dentry->d_name.name, dentry->d_name.name);
756 dentry->d_name.name);
757 794
758 v9ses = v9fs_inode2v9ses(dir); 795 v9ses = v9fs_inode2v9ses(dir);
759 dir_dentry = v9fs_dentry_from_dir_inode(dir); 796 dir_dentry = v9fs_dentry_from_dir_inode(dir);
@@ -770,7 +807,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
770 err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); 807 err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
771 808
772 if (err < 0) { 809 if (err < 0) {
773 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); 810 p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
774 return err; 811 return err;
775 } 812 }
776 813
@@ -813,9 +850,9 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
813 struct dentry *dir_dentry; 850 struct dentry *dir_dentry;
814 struct posix_acl *dacl = NULL, *pacl = NULL; 851 struct posix_acl *dacl = NULL, *pacl = NULL;
815 852
816 P9_DPRINTK(P9_DEBUG_VFS, 853 p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n",
817 " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", dir->i_ino, 854 dir->i_ino, dentry->d_name.name, omode,
818 dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev)); 855 MAJOR(rdev), MINOR(rdev));
819 856
820 if (!new_valid_dev(rdev)) 857 if (!new_valid_dev(rdev))
821 return -EINVAL; 858 return -EINVAL;
@@ -825,7 +862,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
825 dfid = v9fs_fid_lookup(dir_dentry); 862 dfid = v9fs_fid_lookup(dir_dentry);
826 if (IS_ERR(dfid)) { 863 if (IS_ERR(dfid)) {
827 err = PTR_ERR(dfid); 864 err = PTR_ERR(dfid);
828 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); 865 p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
829 dfid = NULL; 866 dfid = NULL;
830 goto error; 867 goto error;
831 } 868 }
@@ -835,8 +872,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
835 /* Update mode based on ACL value */ 872 /* Update mode based on ACL value */
836 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); 873 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
837 if (err) { 874 if (err) {
838 P9_DPRINTK(P9_DEBUG_VFS, 875 p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mknod %d\n",
839 "Failed to get acl values in mknod %d\n", err); 876 err);
840 goto error; 877 goto error;
841 } 878 }
842 name = (char *) dentry->d_name.name; 879 name = (char *) dentry->d_name.name;
@@ -851,8 +888,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
851 fid = p9_client_walk(dfid, 1, &name, 1); 888 fid = p9_client_walk(dfid, 1, &name, 1);
852 if (IS_ERR(fid)) { 889 if (IS_ERR(fid)) {
853 err = PTR_ERR(fid); 890 err = PTR_ERR(fid);
854 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", 891 p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
855 err); 892 err);
856 fid = NULL; 893 fid = NULL;
857 goto error; 894 goto error;
858 } 895 }
@@ -860,8 +897,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
860 inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); 897 inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
861 if (IS_ERR(inode)) { 898 if (IS_ERR(inode)) {
862 err = PTR_ERR(inode); 899 err = PTR_ERR(inode);
863 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 900 p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
864 err); 901 err);
865 goto error; 902 goto error;
866 } 903 }
867 err = v9fs_fid_add(dentry, fid); 904 err = v9fs_fid_add(dentry, fid);
@@ -905,7 +942,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
905 char *link = __getname(); 942 char *link = __getname();
906 char *target; 943 char *target;
907 944
908 P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); 945 p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
909 946
910 if (!link) { 947 if (!link) {
911 link = ERR_PTR(-ENOMEM); 948 link = ERR_PTR(-ENOMEM);
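
The setattr hunk above stops copying iattr->ia_valid straight into p9attr.valid and routes it through the new v9fs_mapped_iattr_valid() instead, so only bits the 9P2000.L protocol actually defines go out on the wire. A minimal userspace sketch of the same table-driven translation; the ATTR_* values below are made-up stand-ins for the kernel flags, chosen to differ from the P9_ATTR_* bits so the effect of the mapping is visible:

    #include <stdio.h>

    /* wire-format bits, as defined by the patch for 9P2000.L setattr */
    #define P9_ATTR_MODE (1 << 0)
    #define P9_ATTR_UID  (1 << 1)
    #define P9_ATTR_GID  (1 << 2)
    #define P9_ATTR_SIZE (1 << 3)

    /* stand-ins for the kernel's ATTR_* flags; values are illustrative
     * and deliberately different from the P9_ATTR_* ones */
    #define ATTR_MODE (1 << 2)
    #define ATTR_UID  (1 << 4)
    #define ATTR_GID  (1 << 5)
    #define ATTR_SIZE (1 << 7)

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    struct iattr_map {
            int iattr_valid;        /* VFS-side flag */
            int p9_iattr_valid;     /* bit to put on the wire */
    };

    static int map_iattr_valid(int iattr_valid)
    {
            static const struct iattr_map tbl[] = {
                    { ATTR_MODE, P9_ATTR_MODE },
                    { ATTR_UID,  P9_ATTR_UID  },
                    { ATTR_GID,  P9_ATTR_GID  },
                    { ATTR_SIZE, P9_ATTR_SIZE },
            };
            int out = 0;
            size_t i;

            for (i = 0; i < ARRAY_SIZE(tbl); i++)
                    if (iattr_valid & tbl[i].iattr_valid)
                            out |= tbl[i].p9_iattr_valid;
            return out;     /* unknown VFS bits are dropped, not leaked */
    }

    int main(void)
    {
            /* chmod + truncate: ATTR_MODE|ATTR_SIZE maps to 0x9 here */
            printf("0x%x\n", map_iattr_valid(ATTR_MODE | ATTR_SIZE));
            return 0;
    }
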
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index f68ff65a32a5..7b0cd87b07c2 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -121,7 +121,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
121 struct p9_fid *fid; 121 struct p9_fid *fid;
122 int retval = 0; 122 int retval = 0;
123 123
124 P9_DPRINTK(P9_DEBUG_VFS, " \n"); 124 p9_debug(P9_DEBUG_VFS, "\n");
125 125
126 v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); 126 v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
127 if (!v9ses) 127 if (!v9ses)
@@ -191,7 +191,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
191 goto release_sb; 191 goto release_sb;
192 v9fs_fid_add(root, fid); 192 v9fs_fid_add(root, fid);
193 193
194 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); 194 p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n");
195 return dget(sb->s_root); 195 return dget(sb->s_root);
196 196
197clunk_fid: 197clunk_fid:
@@ -223,7 +223,7 @@ static void v9fs_kill_super(struct super_block *s)
223{ 223{
224 struct v9fs_session_info *v9ses = s->s_fs_info; 224 struct v9fs_session_info *v9ses = s->s_fs_info;
225 225
226 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); 226 p9_debug(P9_DEBUG_VFS, " %p\n", s);
227 227
228 kill_anon_super(s); 228 kill_anon_super(s);
229 229
@@ -231,7 +231,7 @@ static void v9fs_kill_super(struct super_block *s)
231 v9fs_session_close(v9ses); 231 v9fs_session_close(v9ses);
232 kfree(v9ses); 232 kfree(v9ses);
233 s->s_fs_info = NULL; 233 s->s_fs_info = NULL;
234 P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n"); 234 p9_debug(P9_DEBUG_VFS, "exiting kill_super\n");
235} 235}
236 236
237static void 237static void
@@ -303,7 +303,7 @@ static int v9fs_write_inode(struct inode *inode,
303 * send an fsync request to server irrespective of 303 * send an fsync request to server irrespective of
304 * wbc->sync_mode. 304 * wbc->sync_mode.
305 */ 305 */
306 P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); 306 p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
307 v9inode = V9FS_I(inode); 307 v9inode = V9FS_I(inode);
308 if (!v9inode->writeback_fid) 308 if (!v9inode->writeback_fid)
309 return 0; 309 return 0;
@@ -326,7 +326,7 @@ static int v9fs_write_inode_dotl(struct inode *inode,
326 * send an fsync request to server irrespective of 326 * send an fsync request to server irrespective of
327 * wbc->sync_mode. 327 * wbc->sync_mode.
328 */ 328 */
329 P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); 329 p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
330 v9inode = V9FS_I(inode); 330 v9inode = V9FS_I(inode);
331 if (!v9inode->writeback_fid) 331 if (!v9inode->writeback_fid)
332 return 0; 332 return 0;
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index d288773871b3..29653b70a9c3 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -32,8 +32,8 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
32 attr_fid = p9_client_xattrwalk(fid, name, &attr_size); 32 attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
33 if (IS_ERR(attr_fid)) { 33 if (IS_ERR(attr_fid)) {
34 retval = PTR_ERR(attr_fid); 34 retval = PTR_ERR(attr_fid);
35 P9_DPRINTK(P9_DEBUG_VFS, 35 p9_debug(P9_DEBUG_VFS, "p9_client_attrwalk failed %zd\n",
36 "p9_client_attrwalk failed %zd\n", retval); 36 retval);
37 attr_fid = NULL; 37 attr_fid = NULL;
38 goto error; 38 goto error;
39 } 39 }
@@ -87,8 +87,8 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
87{ 87{
88 struct p9_fid *fid; 88 struct p9_fid *fid;
89 89
90 P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n", 90 p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n",
91 __func__, name, buffer_size); 91 name, buffer_size);
92 fid = v9fs_fid_lookup(dentry); 92 fid = v9fs_fid_lookup(dentry);
93 if (IS_ERR(fid)) 93 if (IS_ERR(fid))
94 return PTR_ERR(fid); 94 return PTR_ERR(fid);
@@ -115,8 +115,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
115 int retval, msize, write_count; 115 int retval, msize, write_count;
116 struct p9_fid *fid = NULL; 116 struct p9_fid *fid = NULL;
117 117
118 P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n", 118 p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n",
119 __func__, name, value_len, flags); 119 name, value_len, flags);
120 120
121 fid = v9fs_fid_clone(dentry); 121 fid = v9fs_fid_clone(dentry);
122 if (IS_ERR(fid)) { 122 if (IS_ERR(fid)) {
@@ -129,8 +129,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
129 */ 129 */
130 retval = p9_client_xattrcreate(fid, name, value_len, flags); 130 retval = p9_client_xattrcreate(fid, name, value_len, flags);
131 if (retval < 0) { 131 if (retval < 0) {
132 P9_DPRINTK(P9_DEBUG_VFS, 132 p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n",
133 "p9_client_xattrcreate failed %d\n", retval); 133 retval);
134 goto error; 134 goto error;
135 } 135 }
136 msize = fid->clnt->msize; 136 msize = fid->clnt->msize;
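
Note how the xattr.c hunks above drop the explicit "%s"/__func__ pair from the format string: the conversion only makes sense if p9_debug() identifies the calling function itself (the vfs_super.c hunks that still pass __func__ are merely redundant). A userspace sketch of such a macro, assuming it wraps a worker that receives __func__ and checks a runtime debug mask — the real helper lives in the 9p core, so treat this body as illustrative:

    #include <stdio.h>
    #include <stdarg.h>

    /* debug classes, mirroring the P9_DEBUG_* bits used in the diff */
    enum { P9_DEBUG_ERROR = 1 << 0, P9_DEBUG_VFS = 1 << 2 };

    /* runtime mask; the kernel uses a module parameter for this */
    static unsigned int p9_debug_level = P9_DEBUG_VFS;

    static void _p9_debug(unsigned int level, const char *func,
                          const char *fmt, ...)
    {
            va_list ap;

            if (!(p9_debug_level & level))
                    return;
            fprintf(stderr, "%s: ", func);  /* caller name prepended once, here */
            va_start(ap, fmt);
            vfprintf(stderr, fmt, ap);
            va_end(ap);
    }

    /* ##__VA_ARGS__ (a GNU extension used throughout the kernel) swallows
     * the comma when no arguments follow fmt */
    #define p9_debug(level, fmt, ...) \
            _p9_debug(level, __func__, fmt, ##__VA_ARGS__)

    int main(void)
    {
            p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n",
                     "user.test", (size_t)16);
            p9_debug(P9_DEBUG_ERROR, "suppressed by the mask\n");
            return 0;
    }
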
diff --git a/fs/Kconfig b/fs/Kconfig
index 30145d886bc2..d621f02a3f9e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -218,6 +218,8 @@ source "fs/exofs/Kconfig"
218 218
219endif # MISC_FILESYSTEMS 219endif # MISC_FILESYSTEMS
220 220
221source "fs/exofs/Kconfig.ore"
222
221menuconfig NETWORK_FILESYSTEMS 223menuconfig NETWORK_FILESYSTEMS
222 bool "Network File Systems" 224 bool "Network File Systems"
223 default y 225 default y
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 79e2ca7973b7..e95d1b64082c 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -27,6 +27,9 @@ config COMPAT_BINFMT_ELF
27 bool 27 bool
28 depends on COMPAT && BINFMT_ELF 28 depends on COMPAT && BINFMT_ELF
29 29
30config ARCH_BINFMT_ELF_RANDOMIZE_PIE
31 bool
32
30config BINFMT_ELF_FDPIC 33config BINFMT_ELF_FDPIC
31 bool "Kernel support for FDPIC ELF binaries" 34 bool "Kernel support for FDPIC ELF binaries"
32 default y 35 default y
diff --git a/fs/aio.c b/fs/aio.c
index 78c514cfd212..969beb0e2231 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -476,14 +476,21 @@ static void kiocb_batch_init(struct kiocb_batch *batch, long total)
476 batch->count = total; 476 batch->count = total;
477} 477}
478 478
479static void kiocb_batch_free(struct kiocb_batch *batch) 479static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch)
480{ 480{
481 struct kiocb *req, *n; 481 struct kiocb *req, *n;
482 482
483 if (list_empty(&batch->head))
484 return;
485
486 spin_lock_irq(&ctx->ctx_lock);
483 list_for_each_entry_safe(req, n, &batch->head, ki_batch) { 487 list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
484 list_del(&req->ki_batch); 488 list_del(&req->ki_batch);
489 list_del(&req->ki_list);
485 kmem_cache_free(kiocb_cachep, req); 490 kmem_cache_free(kiocb_cachep, req);
491 ctx->reqs_active--;
486 } 492 }
493 spin_unlock_irq(&ctx->ctx_lock);
487} 494}
488 495
489/* 496/*
@@ -1742,7 +1749,7 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1742 } 1749 }
1743 blk_finish_plug(&plug); 1750 blk_finish_plug(&plug);
1744 1751
1745 kiocb_batch_free(&batch); 1752 kiocb_batch_free(ctx, &batch);
1746 put_ioctx(ctx); 1753 put_ioctx(ctx);
1747 return i ? i : ret; 1754 return i ? i : ret;
1748} 1755}
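
The fix above hands the ioctx to kiocb_batch_free() so leftover batch entries — already linked into the context when they were allocated — can be unhooked from ki_list and subtracted from reqs_active under ctx_lock; before, they were freed but their accounting leaked. A reduced sketch of the pattern, with a pthread mutex standing in for spin_lock_irq() and plain pointers for the kernel list macros (all names local to the sketch):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* a request sits on the submitter's private batch list and, once
     * allocated from the context, is also accounted there */
    struct kiocb {
            struct kiocb *batch_next;       /* the batch list (ki_batch) */
            int on_ctx_list;                /* membership on ctx lists (ki_list) */
    };

    struct kioctx {
            pthread_mutex_t ctx_lock;       /* spin_lock_irq(&ctx->ctx_lock) */
            int reqs_active;
    };

    /* free leftover batch entries; like the fixed kernel version, this
     * takes the context so per-context accounting drops under the lock */
    static void kiocb_batch_free(struct kioctx *ctx, struct kiocb *batch)
    {
            struct kiocb *req, *n;

            if (!batch)                     /* mirrors the list_empty() fast path */
                    return;

            pthread_mutex_lock(&ctx->ctx_lock);
            for (req = batch; req; req = n) {
                    n = req->batch_next;
                    if (req->on_ctx_list)   /* also unhook from the context */
                            ctx->reqs_active--;
                    free(req);
            }
            pthread_mutex_unlock(&ctx->ctx_lock);
    }

    int main(void)
    {
            struct kioctx ctx = { PTHREAD_MUTEX_INITIALIZER, 0 };
            struct kiocb *a = calloc(1, sizeof(*a));
            struct kiocb *b = calloc(1, sizeof(*b));

            a->on_ctx_list = b->on_ctx_list = 1;
            ctx.reqs_active = 2;
            a->batch_next = b;

            kiocb_batch_free(&ctx, a);
            printf("reqs_active = %d\n", ctx.reqs_active); /* 0, not leaked */
            return 0;
    }
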
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 5869d4e974a9..d8d8e7ba6a1e 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -116,6 +116,7 @@ struct autofs_sb_info {
116 int needs_reghost; 116 int needs_reghost;
117 struct super_block *sb; 117 struct super_block *sb;
118 struct mutex wq_mutex; 118 struct mutex wq_mutex;
119 struct mutex pipe_mutex;
119 spinlock_t fs_lock; 120 spinlock_t fs_lock;
120 struct autofs_wait_queue *queues; /* Wait queue pointer */ 121 struct autofs_wait_queue *queues; /* Wait queue pointer */
121 spinlock_t lookup_lock; 122 spinlock_t lookup_lock;
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 2ba44c79d548..e16980b00b8d 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -225,6 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
225 sbi->min_proto = 0; 225 sbi->min_proto = 0;
226 sbi->max_proto = 0; 226 sbi->max_proto = 0;
227 mutex_init(&sbi->wq_mutex); 227 mutex_init(&sbi->wq_mutex);
228 mutex_init(&sbi->pipe_mutex);
228 spin_lock_init(&sbi->fs_lock); 229 spin_lock_init(&sbi->fs_lock);
229 sbi->queues = NULL; 230 sbi->queues = NULL;
230 spin_lock_init(&sbi->lookup_lock); 231 spin_lock_init(&sbi->lookup_lock);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index e1fbdeef85db..da8876d38a7b 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -56,26 +56,27 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
56 mutex_unlock(&sbi->wq_mutex); 56 mutex_unlock(&sbi->wq_mutex);
57} 57}
58 58
59static int autofs4_write(struct file *file, const void *addr, int bytes) 59static int autofs4_write(struct autofs_sb_info *sbi,
60 struct file *file, const void *addr, int bytes)
60{ 61{
61 unsigned long sigpipe, flags; 62 unsigned long sigpipe, flags;
62 mm_segment_t fs; 63 mm_segment_t fs;
63 const char *data = (const char *)addr; 64 const char *data = (const char *)addr;
64 ssize_t wr = 0; 65 ssize_t wr = 0;
65 66
66 /** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/
67
68 sigpipe = sigismember(&current->pending.signal, SIGPIPE); 67 sigpipe = sigismember(&current->pending.signal, SIGPIPE);
69 68
70 /* Save pointer to user space and point back to kernel space */ 69 /* Save pointer to user space and point back to kernel space */
71 fs = get_fs(); 70 fs = get_fs();
72 set_fs(KERNEL_DS); 71 set_fs(KERNEL_DS);
73 72
73 mutex_lock(&sbi->pipe_mutex);
74 while (bytes && 74 while (bytes &&
75 (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) { 75 (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) {
76 data += wr; 76 data += wr;
77 bytes -= wr; 77 bytes -= wr;
78 } 78 }
79 mutex_unlock(&sbi->pipe_mutex);
79 80
80 set_fs(fs); 81 set_fs(fs);
81 82
@@ -110,6 +111,13 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
110 111
111 pkt.hdr.proto_version = sbi->version; 112 pkt.hdr.proto_version = sbi->version;
112 pkt.hdr.type = type; 113 pkt.hdr.type = type;
114 mutex_lock(&sbi->wq_mutex);
115
116 /* Check if we have become catatonic */
117 if (sbi->catatonic) {
118 mutex_unlock(&sbi->wq_mutex);
119 return;
120 }
113 switch (type) { 121 switch (type) {
114 /* Kernel protocol v4 missing and expire packets */ 122 /* Kernel protocol v4 missing and expire packets */
115 case autofs_ptype_missing: 123 case autofs_ptype_missing:
@@ -163,22 +171,18 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
163 } 171 }
164 default: 172 default:
165 printk("autofs4_notify_daemon: bad type %d!\n", type); 173 printk("autofs4_notify_daemon: bad type %d!\n", type);
174 mutex_unlock(&sbi->wq_mutex);
166 return; 175 return;
167 } 176 }
168 177
169 /* Check if we have become catatonic */ 178 pipe = sbi->pipe;
170 mutex_lock(&sbi->wq_mutex); 179 get_file(pipe);
171 if (!sbi->catatonic) { 180
172 pipe = sbi->pipe;
173 get_file(pipe);
174 }
175 mutex_unlock(&sbi->wq_mutex); 181 mutex_unlock(&sbi->wq_mutex);
176 182
177 if (pipe) { 183 if (autofs4_write(sbi, pipe, &pkt, pktsz))
178 if (autofs4_write(pipe, &pkt, pktsz)) 184 autofs4_catatonic_mode(sbi);
179 autofs4_catatonic_mode(sbi); 185 fput(pipe);
180 fput(pipe);
181 }
182} 186}
183 187
184static int autofs4_getpath(struct autofs_sb_info *sbi, 188static int autofs4_getpath(struct autofs_sb_info *sbi,
@@ -257,6 +261,9 @@ static int validate_request(struct autofs_wait_queue **wait,
257 struct autofs_wait_queue *wq; 261 struct autofs_wait_queue *wq;
258 struct autofs_info *ino; 262 struct autofs_info *ino;
259 263
264 if (sbi->catatonic)
265 return -ENOENT;
266
260 /* Wait in progress, continue; */ 267 /* Wait in progress, continue; */
261 wq = autofs4_find_wait(sbi, qstr); 268 wq = autofs4_find_wait(sbi, qstr);
262 if (wq) { 269 if (wq) {
@@ -289,6 +296,9 @@ static int validate_request(struct autofs_wait_queue **wait,
289 if (mutex_lock_interruptible(&sbi->wq_mutex)) 296 if (mutex_lock_interruptible(&sbi->wq_mutex))
290 return -EINTR; 297 return -EINTR;
291 298
299 if (sbi->catatonic)
300 return -ENOENT;
301
292 wq = autofs4_find_wait(sbi, qstr); 302 wq = autofs4_find_wait(sbi, qstr);
293 if (wq) { 303 if (wq) {
294 *wait = wq; 304 *wait = wq;
@@ -389,7 +399,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
389 399
390 ret = validate_request(&wq, sbi, &qstr, dentry, notify); 400 ret = validate_request(&wq, sbi, &qstr, dentry, notify);
391 if (ret <= 0) { 401 if (ret <= 0) {
392 if (ret == 0) 402 if (ret != -EINTR)
393 mutex_unlock(&sbi->wq_mutex); 403 mutex_unlock(&sbi->wq_mutex);
394 kfree(qstr.name); 404 kfree(qstr.name);
395 return ret; 405 return ret;
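
The waitq.c changes retire the old "not safe for writing more than PIPE_BUF bytes" warning by serializing all packet writes with the new per-superblock pipe_mutex: pipe writes beyond PIPE_BUF are not atomic, so two concurrent daemon notifications could otherwise interleave bytes mid-packet. A userspace sketch of the same write loop, with a POSIX pipe and a single global mutex in place of the per-sb one:

    #include <pthread.h>
    #include <unistd.h>

    static pthread_mutex_t pipe_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* write a whole packet; serialized so packets never interleave,
     * even when they exceed PIPE_BUF */
    static int pipe_write_packet(int fd, const void *addr, size_t bytes)
    {
            const char *data = addr;
            ssize_t wr;

            pthread_mutex_lock(&pipe_mutex);
            while (bytes &&
                   (wr = write(fd, data, bytes)) > 0) {
                    data += wr;
                    bytes -= wr;
            }
            pthread_mutex_unlock(&pipe_mutex);

            return bytes == 0 ? 0 : -1;     /* caller goes catatonic on failure */
    }

    int main(void)
    {
            int fds[2];
            char buf[8] = { 0 };

            if (pipe(fds))
                    return 1;
            pipe_write_packet(fds[1], "packet", 7);
            if (read(fds[0], buf, sizeof(buf)) < 0)
                    return 1;
            return buf[0] != 'p';
    }
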
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 21ac5ee4b43f..bcb884e2d613 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -794,7 +794,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
794 * default mmap base, as well as whatever program they 794 * default mmap base, as well as whatever program they
795 * might try to exec. This is because the brk will 795 * might try to exec. This is because the brk will
796 * follow the loader, and is not movable. */ 796 * follow the loader, and is not movable. */
797#if defined(CONFIG_X86) || defined(CONFIG_ARM) 797#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE
798 /* Memory randomization might have been switched off 798 /* Memory randomization might have been switched off
799 * in runtime via sysctl. 799 * in runtime via sysctl.
800 * If that is the case, retain the original non-zero 800 * If that is the case, retain the original non-zero
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 69a5b6fbee2b..0e575d1304b4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -25,7 +25,6 @@
25#include <linux/uio.h> 25#include <linux/uio.h>
26#include <linux/namei.h> 26#include <linux/namei.h>
27#include <linux/log2.h> 27#include <linux/log2.h>
28#include <linux/kmemleak.h>
29#include <linux/cleancache.h> 28#include <linux/cleancache.h>
30#include <asm/uaccess.h> 29#include <asm/uaccess.h>
31#include "internal.h" 30#include "internal.h"
@@ -521,7 +520,7 @@ static struct super_block *blockdev_superblock __read_mostly;
521void __init bdev_cache_init(void) 520void __init bdev_cache_init(void)
522{ 521{
523 int err; 522 int err;
524 struct vfsmount *bd_mnt; 523 static struct vfsmount *bd_mnt;
525 524
526 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 525 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
527 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 526 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -533,12 +532,7 @@ void __init bdev_cache_init(void)
533 bd_mnt = kern_mount(&bd_type); 532 bd_mnt = kern_mount(&bd_type);
534 if (IS_ERR(bd_mnt)) 533 if (IS_ERR(bd_mnt))
535 panic("Cannot create bdev pseudo-fs"); 534 panic("Cannot create bdev pseudo-fs");
536 /* 535 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
537 * This vfsmount structure is only used to obtain the
538 * blockdev_superblock, so tell kmemleak not to report it.
539 */
540 kmemleak_not_leak(bd_mnt);
541 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
542} 536}
543 537
544/* 538/*
@@ -1145,6 +1139,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1145 mutex_lock_nested(&bdev->bd_mutex, for_part); 1139 mutex_lock_nested(&bdev->bd_mutex, for_part);
1146 if (!bdev->bd_openers) { 1140 if (!bdev->bd_openers) {
1147 bdev->bd_disk = disk; 1141 bdev->bd_disk = disk;
1142 bdev->bd_queue = disk->queue;
1148 bdev->bd_contains = bdev; 1143 bdev->bd_contains = bdev;
1149 if (!partno) { 1144 if (!partno) {
1150 struct backing_dev_info *bdi; 1145 struct backing_dev_info *bdi;
@@ -1165,6 +1160,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1165 disk_put_part(bdev->bd_part); 1160 disk_put_part(bdev->bd_part);
1166 bdev->bd_part = NULL; 1161 bdev->bd_part = NULL;
1167 bdev->bd_disk = NULL; 1162 bdev->bd_disk = NULL;
1163 bdev->bd_queue = NULL;
1168 mutex_unlock(&bdev->bd_mutex); 1164 mutex_unlock(&bdev->bd_mutex);
1169 disk_unblock_events(disk); 1165 disk_unblock_events(disk);
1170 put_disk(disk); 1166 put_disk(disk);
@@ -1238,6 +1234,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1238 disk_put_part(bdev->bd_part); 1234 disk_put_part(bdev->bd_part);
1239 bdev->bd_disk = NULL; 1235 bdev->bd_disk = NULL;
1240 bdev->bd_part = NULL; 1236 bdev->bd_part = NULL;
1237 bdev->bd_queue = NULL;
1241 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); 1238 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
1242 if (bdev != bdev->bd_contains) 1239 if (bdev != bdev->bd_contains)
1243 __blkdev_put(bdev->bd_contains, mode, 1); 1240 __blkdev_put(bdev->bd_contains, mode, 1);
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ecb9fd3be143..d33f01c08b60 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL
31 Linux website <http://acl.bestbits.at/>. 31 Linux website <http://acl.bestbits.at/>.
32 32
33 If you don't know what Access Control Lists are, say N 33 If you don't know what Access Control Lists are, say N
34
35config BTRFS_FS_CHECK_INTEGRITY
36 bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
37 depends on BTRFS_FS
38 help
39 Adds code that examines all block write requests (including
 40	  writes of the super block). The goal is to verify that the
 41	  on-disk state of the filesystem is consistent at every moment,
 42	  i.e. that it would still be consistent if a power loss or
 43	  kernel panic occurred at any point.
44
 45	  If the integrity check tool is compiled in and activated via
 46	  the mount options, it consumes a significant amount of kernel
 47	  memory and additional CPU cycles. Enabling this functionality
 48	  is not intended for normal use.
49
50 In most cases, unless you are a btrfs developer who needs
51 to verify the integrity of (super)-block write requests
 52	  while running a regression test, say N.
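
For orientation: the checker is compiled in but stays dormant until armed at mount time. In the check-integrity code the relevant mount options are check_int, check_int_data and check_int_print_mask=<bits>; see fs/btrfs/check-integrity.c in the merged tree for the authoritative list and semantics.
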
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index c0ddfd29c5e5..0c4fa2befae7 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,6 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11 reada.o backref.o 11 reada.o backref.o ulist.o
12 12
13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
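
The Makefile now also builds ulist.o; backref.c below leans on it as a small set that remembers insertion order, refuses duplicates, and tolerates ulist_add() being called while a ulist_next() walk is in progress. A toy userspace equivalent under those assumptions — the real ulist also carries an aux value per node and a small inline array, both omitted here:

    #include <stdio.h>
    #include <stdlib.h>

    typedef unsigned long long u64;

    struct ulist {
            u64 *vals;
            size_t nnodes, cap;
    };

    /* returns 1 if added, 0 if already present, -1 on allocation failure */
    static int ulist_add(struct ulist *ul, u64 val)
    {
            size_t i;

            for (i = 0; i < ul->nnodes; i++)
                    if (ul->vals[i] == val)
                            return 0;
            if (ul->nnodes == ul->cap) {
                    size_t ncap = ul->cap ? 2 * ul->cap : 16;
                    u64 *nv = realloc(ul->vals, ncap * sizeof(*nv));

                    if (!nv)
                            return -1;
                    ul->vals = nv;
                    ul->cap = ncap;
            }
            ul->vals[ul->nnodes++] = val;
            return 1;
    }

    /* iteration by index stays valid while ulist_add() appends */
    static int ulist_next(struct ulist *ul, size_t *iter, u64 *val)
    {
            if (*iter >= ul->nnodes)
                    return 0;
            *val = ul->vals[(*iter)++];
            return 1;
    }

    int main(void)
    {
            struct ulist ul = { 0 };
            size_t iter = 0;
            u64 v;

            ulist_add(&ul, 100);
            while (ulist_next(&ul, &iter, &v))
                    if (v < 300)
                            ulist_add(&ul, v + 100); /* append while iterating */
            printf("%zu entries\n", ul.nnodes);      /* 3: 100, 200, 300 */
            free(ul.vals);
            return 0;
    }
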
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 22c64fff1bd5..633c701a287d 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -19,18 +19,789 @@
19#include "ctree.h" 19#include "ctree.h"
20#include "disk-io.h" 20#include "disk-io.h"
21#include "backref.h" 21#include "backref.h"
22#include "ulist.h"
23#include "transaction.h"
24#include "delayed-ref.h"
22 25
23struct __data_ref { 26/*
27 * this structure records all encountered refs on the way up to the root
28 */
29struct __prelim_ref {
24 struct list_head list; 30 struct list_head list;
25 u64 inum; 31 u64 root_id;
26 u64 root; 32 struct btrfs_key key;
27 u64 extent_data_item_offset; 33 int level;
34 int count;
35 u64 parent;
36 u64 wanted_disk_byte;
28}; 37};
29 38
30struct __shared_ref { 39static int __add_prelim_ref(struct list_head *head, u64 root_id,
31 struct list_head list; 40 struct btrfs_key *key, int level, u64 parent,
41 u64 wanted_disk_byte, int count)
42{
43 struct __prelim_ref *ref;
44
45 /* in case we're adding delayed refs, we're holding the refs spinlock */
46 ref = kmalloc(sizeof(*ref), GFP_ATOMIC);
47 if (!ref)
48 return -ENOMEM;
49
50 ref->root_id = root_id;
51 if (key)
52 ref->key = *key;
53 else
54 memset(&ref->key, 0, sizeof(ref->key));
55
56 ref->level = level;
57 ref->count = count;
58 ref->parent = parent;
59 ref->wanted_disk_byte = wanted_disk_byte;
60 list_add_tail(&ref->list, head);
61
62 return 0;
63}
64
65static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
66 struct ulist *parents,
67 struct extent_buffer *eb, int level,
68 u64 wanted_objectid, u64 wanted_disk_byte)
69{
70 int ret;
71 int slot;
72 struct btrfs_file_extent_item *fi;
73 struct btrfs_key key;
32 u64 disk_byte; 74 u64 disk_byte;
33}; 75
76add_parent:
77 ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
78 if (ret < 0)
79 return ret;
80
81 if (level != 0)
82 return 0;
83
84 /*
 85	 * if the current leaf is full of EXTENT_DATA items, we must check
 86	 * whether the next one holds a reference as well.
 87	 * ref->count cannot be used to skip this check.
 88	 * repeat this until no additional EXTENT_DATA items are found.
89 */
90 while (1) {
91 ret = btrfs_next_leaf(root, path);
92 if (ret < 0)
93 return ret;
94 if (ret)
95 return 0;
96
97 eb = path->nodes[0];
98 for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
99 btrfs_item_key_to_cpu(eb, &key, slot);
100 if (key.objectid != wanted_objectid ||
101 key.type != BTRFS_EXTENT_DATA_KEY)
102 return 0;
103 fi = btrfs_item_ptr(eb, slot,
104 struct btrfs_file_extent_item);
105 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
106 if (disk_byte == wanted_disk_byte)
107 goto add_parent;
108 }
109 }
110
111 return 0;
112}
113
114/*
115 * resolve an indirect backref in the form (root_id, key, level)
116 * to a logical address
117 */
118static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
119 struct __prelim_ref *ref,
120 struct ulist *parents)
121{
122 struct btrfs_path *path;
123 struct btrfs_root *root;
124 struct btrfs_key root_key;
125 struct btrfs_key key = {0};
126 struct extent_buffer *eb;
127 int ret = 0;
128 int root_level;
129 int level = ref->level;
130
131 path = btrfs_alloc_path();
132 if (!path)
133 return -ENOMEM;
134
135 root_key.objectid = ref->root_id;
136 root_key.type = BTRFS_ROOT_ITEM_KEY;
137 root_key.offset = (u64)-1;
138 root = btrfs_read_fs_root_no_name(fs_info, &root_key);
139 if (IS_ERR(root)) {
140 ret = PTR_ERR(root);
141 goto out;
142 }
143
144 rcu_read_lock();
145 root_level = btrfs_header_level(root->node);
146 rcu_read_unlock();
147
148 if (root_level + 1 == level)
149 goto out;
150
151 path->lowest_level = level;
152 ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
153 pr_debug("search slot in root %llu (level %d, ref count %d) returned "
154 "%d for key (%llu %u %llu)\n",
155 (unsigned long long)ref->root_id, level, ref->count, ret,
156 (unsigned long long)ref->key.objectid, ref->key.type,
157 (unsigned long long)ref->key.offset);
158 if (ret < 0)
159 goto out;
160
161 eb = path->nodes[level];
162 if (!eb) {
163 WARN_ON(1);
164 ret = 1;
165 goto out;
166 }
167
168 if (level == 0) {
169 if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) {
170 ret = btrfs_next_leaf(root, path);
171 if (ret)
172 goto out;
173 eb = path->nodes[0];
174 }
175
176 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
177 }
178
179 /* the last two parameters will only be used for level == 0 */
180 ret = add_all_parents(root, path, parents, eb, level, key.objectid,
181 ref->wanted_disk_byte);
182out:
183 btrfs_free_path(path);
184 return ret;
185}
186
187/*
188 * resolve all indirect backrefs from the list
189 */
190static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
191 struct list_head *head)
192{
193 int err;
194 int ret = 0;
195 struct __prelim_ref *ref;
196 struct __prelim_ref *ref_safe;
197 struct __prelim_ref *new_ref;
198 struct ulist *parents;
199 struct ulist_node *node;
200
201 parents = ulist_alloc(GFP_NOFS);
202 if (!parents)
203 return -ENOMEM;
204
205 /*
206 * _safe allows us to insert directly after the current item without
207 * iterating over the newly inserted items.
208 * we're also allowed to re-assign ref during iteration.
209 */
210 list_for_each_entry_safe(ref, ref_safe, head, list) {
211 if (ref->parent) /* already direct */
212 continue;
213 if (ref->count == 0)
214 continue;
215 err = __resolve_indirect_ref(fs_info, ref, parents);
216 if (err) {
217 if (ret == 0)
218 ret = err;
219 continue;
220 }
221
222 /* we put the first parent into the ref at hand */
223 node = ulist_next(parents, NULL);
224 ref->parent = node ? node->val : 0;
225
226 /* additional parents require new refs being added here */
227 while ((node = ulist_next(parents, node))) {
228 new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
229 if (!new_ref) {
230 ret = -ENOMEM;
231 break;
232 }
233 memcpy(new_ref, ref, sizeof(*ref));
234 new_ref->parent = node->val;
235 list_add(&new_ref->list, &ref->list);
236 }
237 ulist_reinit(parents);
238 }
239
240 ulist_free(parents);
241 return ret;
242}
243
244/*
245 * merge two lists of backrefs and adjust counts accordingly
246 *
247 * mode = 1: merge identical keys, if key is set
248 * mode = 2: merge identical parents
249 */
250static int __merge_refs(struct list_head *head, int mode)
251{
252 struct list_head *pos1;
253
254 list_for_each(pos1, head) {
255 struct list_head *n2;
256 struct list_head *pos2;
257 struct __prelim_ref *ref1;
258
259 ref1 = list_entry(pos1, struct __prelim_ref, list);
260
261 if (mode == 1 && ref1->key.type == 0)
262 continue;
263 for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
264 pos2 = n2, n2 = pos2->next) {
265 struct __prelim_ref *ref2;
266
267 ref2 = list_entry(pos2, struct __prelim_ref, list);
268
269 if (mode == 1) {
270 if (memcmp(&ref1->key, &ref2->key,
271 sizeof(ref1->key)) ||
272 ref1->level != ref2->level ||
273 ref1->root_id != ref2->root_id)
274 continue;
275 ref1->count += ref2->count;
276 } else {
277 if (ref1->parent != ref2->parent)
278 continue;
279 ref1->count += ref2->count;
280 }
281 list_del(&ref2->list);
282 kfree(ref2);
283 }
284
285 }
286 return 0;
287}
288
289/*
 290 * add all currently queued delayed refs from this head whose seq nr is
 291 * smaller than or equal to seq to the list
292 */
293static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
294 struct btrfs_key *info_key,
295 struct list_head *prefs)
296{
297 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
298 struct rb_node *n = &head->node.rb_node;
299 int sgn;
300 int ret = 0;
301
302 if (extent_op && extent_op->update_key)
303 btrfs_disk_key_to_cpu(info_key, &extent_op->key);
304
305 while ((n = rb_prev(n))) {
306 struct btrfs_delayed_ref_node *node;
307 node = rb_entry(n, struct btrfs_delayed_ref_node,
308 rb_node);
309 if (node->bytenr != head->node.bytenr)
310 break;
311 WARN_ON(node->is_head);
312
313 if (node->seq > seq)
314 continue;
315
316 switch (node->action) {
317 case BTRFS_ADD_DELAYED_EXTENT:
318 case BTRFS_UPDATE_DELAYED_HEAD:
319 WARN_ON(1);
320 continue;
321 case BTRFS_ADD_DELAYED_REF:
322 sgn = 1;
323 break;
324 case BTRFS_DROP_DELAYED_REF:
325 sgn = -1;
326 break;
327 default:
328 BUG_ON(1);
329 }
330 switch (node->type) {
331 case BTRFS_TREE_BLOCK_REF_KEY: {
332 struct btrfs_delayed_tree_ref *ref;
333
334 ref = btrfs_delayed_node_to_tree_ref(node);
335 ret = __add_prelim_ref(prefs, ref->root, info_key,
336 ref->level + 1, 0, node->bytenr,
337 node->ref_mod * sgn);
338 break;
339 }
340 case BTRFS_SHARED_BLOCK_REF_KEY: {
341 struct btrfs_delayed_tree_ref *ref;
342
343 ref = btrfs_delayed_node_to_tree_ref(node);
344 ret = __add_prelim_ref(prefs, ref->root, info_key,
345 ref->level + 1, ref->parent,
346 node->bytenr,
347 node->ref_mod * sgn);
348 break;
349 }
350 case BTRFS_EXTENT_DATA_REF_KEY: {
351 struct btrfs_delayed_data_ref *ref;
352 struct btrfs_key key;
353
354 ref = btrfs_delayed_node_to_data_ref(node);
355
356 key.objectid = ref->objectid;
357 key.type = BTRFS_EXTENT_DATA_KEY;
358 key.offset = ref->offset;
359 ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
360 node->bytenr,
361 node->ref_mod * sgn);
362 break;
363 }
364 case BTRFS_SHARED_DATA_REF_KEY: {
365 struct btrfs_delayed_data_ref *ref;
366 struct btrfs_key key;
367
368 ref = btrfs_delayed_node_to_data_ref(node);
369
370 key.objectid = ref->objectid;
371 key.type = BTRFS_EXTENT_DATA_KEY;
372 key.offset = ref->offset;
373 ret = __add_prelim_ref(prefs, ref->root, &key, 0,
374 ref->parent, node->bytenr,
375 node->ref_mod * sgn);
376 break;
377 }
378 default:
379 WARN_ON(1);
380 }
381 BUG_ON(ret);
382 }
383
384 return 0;
385}
386
387/*
388 * add all inline backrefs for bytenr to the list
389 */
390static int __add_inline_refs(struct btrfs_fs_info *fs_info,
391 struct btrfs_path *path, u64 bytenr,
392 struct btrfs_key *info_key, int *info_level,
393 struct list_head *prefs)
394{
395 int ret = 0;
396 int slot;
397 struct extent_buffer *leaf;
398 struct btrfs_key key;
399 unsigned long ptr;
400 unsigned long end;
401 struct btrfs_extent_item *ei;
402 u64 flags;
403 u64 item_size;
404
405 /*
406 * enumerate all inline refs
407 */
408 leaf = path->nodes[0];
409 slot = path->slots[0] - 1;
410
411 item_size = btrfs_item_size_nr(leaf, slot);
412 BUG_ON(item_size < sizeof(*ei));
413
414 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
415 flags = btrfs_extent_flags(leaf, ei);
416
417 ptr = (unsigned long)(ei + 1);
418 end = (unsigned long)ei + item_size;
419
420 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
421 struct btrfs_tree_block_info *info;
422 struct btrfs_disk_key disk_key;
423
424 info = (struct btrfs_tree_block_info *)ptr;
425 *info_level = btrfs_tree_block_level(leaf, info);
426 btrfs_tree_block_key(leaf, info, &disk_key);
427 btrfs_disk_key_to_cpu(info_key, &disk_key);
428 ptr += sizeof(struct btrfs_tree_block_info);
429 BUG_ON(ptr > end);
430 } else {
431 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
432 }
433
434 while (ptr < end) {
435 struct btrfs_extent_inline_ref *iref;
436 u64 offset;
437 int type;
438
439 iref = (struct btrfs_extent_inline_ref *)ptr;
440 type = btrfs_extent_inline_ref_type(leaf, iref);
441 offset = btrfs_extent_inline_ref_offset(leaf, iref);
442
443 switch (type) {
444 case BTRFS_SHARED_BLOCK_REF_KEY:
445 ret = __add_prelim_ref(prefs, 0, info_key,
446 *info_level + 1, offset,
447 bytenr, 1);
448 break;
449 case BTRFS_SHARED_DATA_REF_KEY: {
450 struct btrfs_shared_data_ref *sdref;
451 int count;
452
453 sdref = (struct btrfs_shared_data_ref *)(iref + 1);
454 count = btrfs_shared_data_ref_count(leaf, sdref);
455 ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
456 bytenr, count);
457 break;
458 }
459 case BTRFS_TREE_BLOCK_REF_KEY:
460 ret = __add_prelim_ref(prefs, offset, info_key,
461 *info_level + 1, 0, bytenr, 1);
462 break;
463 case BTRFS_EXTENT_DATA_REF_KEY: {
464 struct btrfs_extent_data_ref *dref;
465 int count;
466 u64 root;
467
468 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
469 count = btrfs_extent_data_ref_count(leaf, dref);
470 key.objectid = btrfs_extent_data_ref_objectid(leaf,
471 dref);
472 key.type = BTRFS_EXTENT_DATA_KEY;
473 key.offset = btrfs_extent_data_ref_offset(leaf, dref);
474 root = btrfs_extent_data_ref_root(leaf, dref);
475 ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
476 count);
477 break;
478 }
479 default:
480 WARN_ON(1);
481 }
482 BUG_ON(ret);
483 ptr += btrfs_extent_inline_ref_size(type);
484 }
485
486 return 0;
487}
488
489/*
490 * add all non-inline backrefs for bytenr to the list
491 */
492static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
493 struct btrfs_path *path, u64 bytenr,
494 struct btrfs_key *info_key, int info_level,
495 struct list_head *prefs)
496{
497 struct btrfs_root *extent_root = fs_info->extent_root;
498 int ret;
499 int slot;
500 struct extent_buffer *leaf;
501 struct btrfs_key key;
502
503 while (1) {
504 ret = btrfs_next_item(extent_root, path);
505 if (ret < 0)
506 break;
507 if (ret) {
508 ret = 0;
509 break;
510 }
511
512 slot = path->slots[0];
513 leaf = path->nodes[0];
514 btrfs_item_key_to_cpu(leaf, &key, slot);
515
516 if (key.objectid != bytenr)
517 break;
518 if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
519 continue;
520 if (key.type > BTRFS_SHARED_DATA_REF_KEY)
521 break;
522
523 switch (key.type) {
524 case BTRFS_SHARED_BLOCK_REF_KEY:
525 ret = __add_prelim_ref(prefs, 0, info_key,
526 info_level + 1, key.offset,
527 bytenr, 1);
528 break;
529 case BTRFS_SHARED_DATA_REF_KEY: {
530 struct btrfs_shared_data_ref *sdref;
531 int count;
532
533 sdref = btrfs_item_ptr(leaf, slot,
534 struct btrfs_shared_data_ref);
535 count = btrfs_shared_data_ref_count(leaf, sdref);
536 ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
537 bytenr, count);
538 break;
539 }
540 case BTRFS_TREE_BLOCK_REF_KEY:
541 ret = __add_prelim_ref(prefs, key.offset, info_key,
542 info_level + 1, 0, bytenr, 1);
543 break;
544 case BTRFS_EXTENT_DATA_REF_KEY: {
545 struct btrfs_extent_data_ref *dref;
546 int count;
547 u64 root;
548
549 dref = btrfs_item_ptr(leaf, slot,
550 struct btrfs_extent_data_ref);
551 count = btrfs_extent_data_ref_count(leaf, dref);
552 key.objectid = btrfs_extent_data_ref_objectid(leaf,
553 dref);
554 key.type = BTRFS_EXTENT_DATA_KEY;
555 key.offset = btrfs_extent_data_ref_offset(leaf, dref);
556 root = btrfs_extent_data_ref_root(leaf, dref);
557 ret = __add_prelim_ref(prefs, root, &key, 0, 0,
558 bytenr, count);
559 break;
560 }
561 default:
562 WARN_ON(1);
563 }
564 BUG_ON(ret);
565 }
566
567 return ret;
568}
569
570/*
571 * this adds all existing backrefs (inline backrefs, backrefs and delayed
572 * refs) for the given bytenr to the refs list, merges duplicates and resolves
573 * indirect refs to their parent bytenr.
574 * When roots are found, they're added to the roots list
575 *
576 * FIXME some caching might speed things up
577 */
578static int find_parent_nodes(struct btrfs_trans_handle *trans,
579 struct btrfs_fs_info *fs_info, u64 bytenr,
580 u64 seq, struct ulist *refs, struct ulist *roots)
581{
582 struct btrfs_key key;
583 struct btrfs_path *path;
584 struct btrfs_key info_key = { 0 };
585 struct btrfs_delayed_ref_root *delayed_refs = NULL;
586 struct btrfs_delayed_ref_head *head = NULL;
587 int info_level = 0;
588 int ret;
589 struct list_head prefs_delayed;
590 struct list_head prefs;
591 struct __prelim_ref *ref;
592
593 INIT_LIST_HEAD(&prefs);
594 INIT_LIST_HEAD(&prefs_delayed);
595
596 key.objectid = bytenr;
597 key.type = BTRFS_EXTENT_ITEM_KEY;
598 key.offset = (u64)-1;
599
600 path = btrfs_alloc_path();
601 if (!path)
602 return -ENOMEM;
603
604 /*
605 * grab both a lock on the path and a lock on the delayed ref head.
606 * We need both to get a consistent picture of how the refs look
607 * at a specified point in time
608 */
609again:
610 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
611 if (ret < 0)
612 goto out;
613 BUG_ON(ret == 0);
614
615 /*
616 * look if there are updates for this ref queued and lock the head
617 */
618 delayed_refs = &trans->transaction->delayed_refs;
619 spin_lock(&delayed_refs->lock);
620 head = btrfs_find_delayed_ref_head(trans, bytenr);
621 if (head) {
622 if (!mutex_trylock(&head->mutex)) {
623 atomic_inc(&head->node.refs);
624 spin_unlock(&delayed_refs->lock);
625
626 btrfs_release_path(path);
627
628 /*
629 * Mutex was contended, block until it's
630 * released and try again
631 */
632 mutex_lock(&head->mutex);
633 mutex_unlock(&head->mutex);
634 btrfs_put_delayed_ref(&head->node);
635 goto again;
636 }
637 ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed);
638 if (ret)
639 goto out;
640 }
641 spin_unlock(&delayed_refs->lock);
642
643 if (path->slots[0]) {
644 struct extent_buffer *leaf;
645 int slot;
646
647 leaf = path->nodes[0];
648 slot = path->slots[0] - 1;
649 btrfs_item_key_to_cpu(leaf, &key, slot);
650 if (key.objectid == bytenr &&
651 key.type == BTRFS_EXTENT_ITEM_KEY) {
652 ret = __add_inline_refs(fs_info, path, bytenr,
653 &info_key, &info_level, &prefs);
654 if (ret)
655 goto out;
656 ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
657 info_level, &prefs);
658 if (ret)
659 goto out;
660 }
661 }
662 btrfs_release_path(path);
663
664 /*
665 * when adding the delayed refs above, the info_key might not have
666 * been known yet. Go over the list and replace the missing keys
667 */
668 list_for_each_entry(ref, &prefs_delayed, list) {
669 if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
670 memcpy(&ref->key, &info_key, sizeof(ref->key));
671 }
672 list_splice_init(&prefs_delayed, &prefs);
673
674 ret = __merge_refs(&prefs, 1);
675 if (ret)
676 goto out;
677
678 ret = __resolve_indirect_refs(fs_info, &prefs);
679 if (ret)
680 goto out;
681
682 ret = __merge_refs(&prefs, 2);
683 if (ret)
684 goto out;
685
686 while (!list_empty(&prefs)) {
687 ref = list_first_entry(&prefs, struct __prelim_ref, list);
688 list_del(&ref->list);
689 if (ref->count < 0)
690 WARN_ON(1);
691 if (ref->count && ref->root_id && ref->parent == 0) {
692 /* no parent == root of tree */
693 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
694 BUG_ON(ret < 0);
695 }
696 if (ref->count && ref->parent) {
697 ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
698 BUG_ON(ret < 0);
699 }
700 kfree(ref);
701 }
702
703out:
704 if (head)
705 mutex_unlock(&head->mutex);
706 btrfs_free_path(path);
707 while (!list_empty(&prefs)) {
708 ref = list_first_entry(&prefs, struct __prelim_ref, list);
709 list_del(&ref->list);
710 kfree(ref);
711 }
712 while (!list_empty(&prefs_delayed)) {
713 ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
714 list);
715 list_del(&ref->list);
716 kfree(ref);
717 }
718
719 return ret;
720}
721
722/*
 723 * Finds all leaves with a reference to the specified combination of bytenr
 724 * and offset. The leaf bytenrs are collected in the leafs ulist, which must
 725 * be freed with ulist_free once the caller is done with it; no separate
 726 * list of keys is returned.
727 *
728 * returns 0 on success, <0 on error
729 */
730static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
731 struct btrfs_fs_info *fs_info, u64 bytenr,
732 u64 num_bytes, u64 seq, struct ulist **leafs)
733{
734 struct ulist *tmp;
735 int ret;
736
737 tmp = ulist_alloc(GFP_NOFS);
738 if (!tmp)
739 return -ENOMEM;
740 *leafs = ulist_alloc(GFP_NOFS);
741 if (!*leafs) {
742 ulist_free(tmp);
743 return -ENOMEM;
744 }
745
746 ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
747 ulist_free(tmp);
748
749 if (ret < 0 && ret != -ENOENT) {
750 ulist_free(*leafs);
751 return ret;
752 }
753
754 return 0;
755}
756
757/*
758 * walk all backrefs for a given extent to find all roots that reference this
 759 * extent. Walking a backref means finding all extents that reference this
 760 * extent and, in turn, walking the backrefs of those, too. Naturally this
 761 * is a recursive process, but here it is implemented iteratively: we find
 762 * all referencing extents for the extent in question and put them on a
 763 * list. In turn, we find all referencing extents for those, appending
 764 * further entries to the list. New elements may be added after the current
 765 * one while we walk. The process stops when we reach the end of the
766 * list. Found roots are added to the roots list.
767 *
768 * returns 0 on success, < 0 on error.
769 */
770int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
771 struct btrfs_fs_info *fs_info, u64 bytenr,
772 u64 num_bytes, u64 seq, struct ulist **roots)
773{
774 struct ulist *tmp;
775 struct ulist_node *node = NULL;
776 int ret;
777
778 tmp = ulist_alloc(GFP_NOFS);
779 if (!tmp)
780 return -ENOMEM;
781 *roots = ulist_alloc(GFP_NOFS);
782 if (!*roots) {
783 ulist_free(tmp);
784 return -ENOMEM;
785 }
786
787 while (1) {
788 ret = find_parent_nodes(trans, fs_info, bytenr, seq,
789 tmp, *roots);
790 if (ret < 0 && ret != -ENOENT) {
791 ulist_free(tmp);
792 ulist_free(*roots);
793 return ret;
794 }
795 node = ulist_next(tmp, node);
796 if (!node)
797 break;
798 bytenr = node->val;
799 }
800
801 ulist_free(tmp);
802 return 0;
803}
804
34 805
35static int __inode_info(u64 inum, u64 ioff, u8 key_type, 806static int __inode_info(u64 inum, u64 ioff, u8 key_type,
36 struct btrfs_root *fs_root, struct btrfs_path *path, 807 struct btrfs_root *fs_root, struct btrfs_path *path,
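
btrfs_find_all_roots() above turns a naturally recursive walk into a worklist loop: resolve the refs of the current bytenr, stash its direct parents in a scratch ulist, pick the next unvisited entry and repeat, accumulating roots as a side effect. A reduced sketch of that control flow, reusing the toy ulist from the note before this file's diff and abstracting find_parent_nodes into a callback, since the real one needs a transaction and fs_info (paste the two sketches together to compile):

    /* reuses struct ulist, ulist_add() and ulist_next() from the
     * toy ulist sketch; the callback reports the direct parents of one
     * bytenr into *refs and any roots straight into *roots */
    typedef int (*parent_nodes_fn)(u64 bytenr, struct ulist *refs,
                                   struct ulist *roots);

    static int find_all_roots(u64 bytenr, parent_nodes_fn find_parent_nodes,
                              struct ulist *roots)
    {
            struct ulist tmp = { 0 };
            size_t iter = 0;
            u64 next;
            int ret = 0;

            for (;;) {
                    ret = find_parent_nodes(bytenr, &tmp, roots);
                    if (ret < 0)
                            break;
                    /* dedup in ulist_add() guarantees termination: tmp only
                     * grows by bytenrs that have not been queued before */
                    if (!ulist_next(&tmp, &iter, &next)) {
                            ret = 0;
                            break;
                    }
                    bytenr = next;
            }
            free(tmp.vals);
            return ret;
    }
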
@@ -181,8 +952,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
181 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); 952 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
182 if (found_key->type != BTRFS_EXTENT_ITEM_KEY || 953 if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
183 found_key->objectid > logical || 954 found_key->objectid > logical ||
184 found_key->objectid + found_key->offset <= logical) 955 found_key->objectid + found_key->offset <= logical) {
956 pr_debug("logical %llu is not within any extent\n",
957 (unsigned long long)logical);
185 return -ENOENT; 958 return -ENOENT;
959 }
186 960
187 eb = path->nodes[0]; 961 eb = path->nodes[0];
188 item_size = btrfs_item_size_nr(eb, path->slots[0]); 962 item_size = btrfs_item_size_nr(eb, path->slots[0]);
@@ -191,6 +965,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
191 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); 965 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
192 flags = btrfs_extent_flags(eb, ei); 966 flags = btrfs_extent_flags(eb, ei);
193 967
968 pr_debug("logical %llu is at position %llu within the extent (%llu "
969 "EXTENT_ITEM %llu) flags %#llx size %u\n",
970 (unsigned long long)logical,
971 (unsigned long long)(logical - found_key->objectid),
972 (unsigned long long)found_key->objectid,
973 (unsigned long long)found_key->offset,
974 (unsigned long long)flags, item_size);
194 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) 975 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
195 return BTRFS_EXTENT_FLAG_TREE_BLOCK; 976 return BTRFS_EXTENT_FLAG_TREE_BLOCK;
196 if (flags & BTRFS_EXTENT_FLAG_DATA) 977 if (flags & BTRFS_EXTENT_FLAG_DATA)
@@ -287,128 +1068,11 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
287 return 0; 1068 return 0;
288} 1069}
289 1070
290static int __data_list_add(struct list_head *head, u64 inum, 1071static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
291 u64 extent_data_item_offset, u64 root) 1072 struct btrfs_path *path, u64 logical,
292{ 1073 u64 orig_extent_item_objectid,
293 struct __data_ref *ref; 1074 u64 extent_item_pos, u64 root,
294 1075 iterate_extent_inodes_t *iterate, void *ctx)
295 ref = kmalloc(sizeof(*ref), GFP_NOFS);
296 if (!ref)
297 return -ENOMEM;
298
299 ref->inum = inum;
300 ref->extent_data_item_offset = extent_data_item_offset;
301 ref->root = root;
302 list_add_tail(&ref->list, head);
303
304 return 0;
305}
306
307static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
308 struct btrfs_extent_data_ref *dref)
309{
310 return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
311 btrfs_extent_data_ref_offset(eb, dref),
312 btrfs_extent_data_ref_root(eb, dref));
313}
314
315static int __shared_list_add(struct list_head *head, u64 disk_byte)
316{
317 struct __shared_ref *ref;
318
319 ref = kmalloc(sizeof(*ref), GFP_NOFS);
320 if (!ref)
321 return -ENOMEM;
322
323 ref->disk_byte = disk_byte;
324 list_add_tail(&ref->list, head);
325
326 return 0;
327}
328
329static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
330 u64 logical, u64 inum,
331 u64 extent_data_item_offset,
332 u64 extent_offset,
333 struct btrfs_path *path,
334 struct list_head *data_refs,
335 iterate_extent_inodes_t *iterate,
336 void *ctx)
337{
338 u64 ref_root;
339 u32 item_size;
340 struct btrfs_key key;
341 struct extent_buffer *eb;
342 struct btrfs_extent_item *ei;
343 struct btrfs_extent_inline_ref *eiref;
344 struct __data_ref *ref;
345 int ret;
346 int type;
347 int last;
348 unsigned long ptr = 0;
349
350 WARN_ON(!list_empty(data_refs));
351 ret = extent_from_logical(fs_info, logical, path, &key);
352 if (ret & BTRFS_EXTENT_FLAG_DATA)
353 ret = -EIO;
354 if (ret < 0)
355 goto out;
356
357 eb = path->nodes[0];
358 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
359 item_size = btrfs_item_size_nr(eb, path->slots[0]);
360
361 ret = 0;
362 ref_root = 0;
363 /*
364 * as done in iterate_extent_inodes, we first build a list of refs to
365 * iterate, then free the path and then iterate them to avoid deadlocks.
366 */
367 do {
368 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
369 &eiref, &type);
370 if (last < 0) {
371 ret = last;
372 goto out;
373 }
374 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
375 type == BTRFS_SHARED_BLOCK_REF_KEY) {
376 ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
377 ret = __data_list_add(data_refs, inum,
378 extent_data_item_offset,
379 ref_root);
380 }
381 } while (!ret && !last);
382
383 btrfs_release_path(path);
384
385 if (ref_root == 0) {
386 printk(KERN_ERR "btrfs: failed to find tree block ref "
387 "for shared data backref %llu\n", logical);
388 WARN_ON(1);
389 ret = -EIO;
390 }
391
392out:
393 while (!list_empty(data_refs)) {
394 ref = list_first_entry(data_refs, struct __data_ref, list);
395 list_del(&ref->list);
396 if (!ret)
397 ret = iterate(ref->inum, extent_offset +
398 ref->extent_data_item_offset,
399 ref->root, ctx);
400 kfree(ref);
401 }
402
403 return ret;
404}
405
406static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
407 u64 logical, u64 orig_extent_item_objectid,
408 u64 extent_offset, struct btrfs_path *path,
409 struct list_head *data_refs,
410 iterate_extent_inodes_t *iterate,
411 void *ctx)
412{ 1076{
413 u64 disk_byte; 1077 u64 disk_byte;
414 struct btrfs_key key; 1078 struct btrfs_key key;
@@ -416,8 +1080,10 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
416 struct extent_buffer *eb; 1080 struct extent_buffer *eb;
417 int slot; 1081 int slot;
418 int nritems; 1082 int nritems;
419 int ret; 1083 int ret = 0;
420 int found = 0; 1084 int extent_type;
1085 u64 data_offset;
1086 u64 data_len;
421 1087
422 eb = read_tree_block(fs_info->tree_root, logical, 1088 eb = read_tree_block(fs_info->tree_root, logical,
423 fs_info->tree_root->leafsize, 0); 1089 fs_info->tree_root->leafsize, 0);
@@ -435,149 +1101,99 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
435 if (key.type != BTRFS_EXTENT_DATA_KEY) 1101 if (key.type != BTRFS_EXTENT_DATA_KEY)
436 continue; 1102 continue;
437 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 1103 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
438 if (!fi) { 1104 extent_type = btrfs_file_extent_type(eb, fi);
439 free_extent_buffer(eb); 1105 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
440 return -EIO; 1106 continue;
441 } 1107 /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
442 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); 1108 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
443 if (disk_byte != orig_extent_item_objectid) { 1109 if (disk_byte != orig_extent_item_objectid)
444 if (found) 1110 continue;
445 break;
446 else
447 continue;
448 }
449 ++found;
450 ret = __iter_shared_inline_ref_inodes(fs_info, logical,
451 key.objectid,
452 key.offset,
453 extent_offset, path,
454 data_refs,
455 iterate, ctx);
456 if (ret)
457 break;
458 }
459 1111
460 if (!found) { 1112 data_offset = btrfs_file_extent_offset(eb, fi);
461 printk(KERN_ERR "btrfs: failed to follow shared data backref " 1113 data_len = btrfs_file_extent_num_bytes(eb, fi);
462 "to parent %llu\n", logical); 1114
463 WARN_ON(1); 1115 if (extent_item_pos < data_offset ||
464 ret = -EIO; 1116 extent_item_pos >= data_offset + data_len)
1117 continue;
1118
 1119 		pr_debug("ref for %llu resolved, key (%llu EXTENT_DATA %llu), "
1120 "root %llu\n", orig_extent_item_objectid,
1121 key.objectid, key.offset, root);
1122 ret = iterate(key.objectid,
1123 key.offset + (extent_item_pos - data_offset),
1124 root, ctx);
1125 if (ret) {
1126 pr_debug("stopping iteration because ret=%d\n", ret);
1127 break;
1128 }
465 } 1129 }
466 1130
467 free_extent_buffer(eb); 1131 free_extent_buffer(eb);
1132
468 return ret; 1133 return ret;
469} 1134}
470 1135
471/* 1136/*
472 * calls iterate() for every inode that references the extent identified by 1137 * calls iterate() for every inode that references the extent identified by
473 * the given parameters. will use the path given as a parameter and return it 1138 * the given parameters.
474 * released.
475 * when the iterator function returns a non-zero value, iteration stops. 1139 * when the iterator function returns a non-zero value, iteration stops.
1140 * path is guaranteed to be in released state when iterate() is called.
476 */ 1141 */
477int iterate_extent_inodes(struct btrfs_fs_info *fs_info, 1142int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
478 struct btrfs_path *path, 1143 struct btrfs_path *path,
479 u64 extent_item_objectid, 1144 u64 extent_item_objectid, u64 extent_item_pos,
480 u64 extent_offset,
481 iterate_extent_inodes_t *iterate, void *ctx) 1145 iterate_extent_inodes_t *iterate, void *ctx)
482{ 1146{
483 unsigned long ptr = 0;
484 int last;
485 int ret; 1147 int ret;
486 int type;
487 u64 logical;
488 u32 item_size;
489 struct btrfs_extent_inline_ref *eiref;
490 struct btrfs_extent_data_ref *dref;
491 struct extent_buffer *eb;
492 struct btrfs_extent_item *ei;
493 struct btrfs_key key;
494 struct list_head data_refs = LIST_HEAD_INIT(data_refs); 1148 struct list_head data_refs = LIST_HEAD_INIT(data_refs);
495 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); 1149 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
496 struct __data_ref *ref_d; 1150 struct btrfs_trans_handle *trans;
497 struct __shared_ref *ref_s; 1151 struct ulist *refs;
1152 struct ulist *roots;
1153 struct ulist_node *ref_node = NULL;
1154 struct ulist_node *root_node = NULL;
1155 struct seq_list seq_elem;
1156 struct btrfs_delayed_ref_root *delayed_refs;
1157
1158 trans = btrfs_join_transaction(fs_info->extent_root);
1159 if (IS_ERR(trans))
1160 return PTR_ERR(trans);
1161
1162 pr_debug("resolving all inodes for extent %llu\n",
1163 extent_item_objectid);
1164
1165 delayed_refs = &trans->transaction->delayed_refs;
1166 spin_lock(&delayed_refs->lock);
1167 btrfs_get_delayed_seq(delayed_refs, &seq_elem);
1168 spin_unlock(&delayed_refs->lock);
1169
1170 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
1171 extent_item_pos, seq_elem.seq,
1172 &refs);
498 1173
499 eb = path->nodes[0]; 1174 if (ret)
500 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); 1175 goto out;
501 item_size = btrfs_item_size_nr(eb, path->slots[0]);
502
503 /* first we iterate the inline refs, ... */
504 do {
505 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
506 &eiref, &type);
507 if (last == -ENOENT) {
508 ret = 0;
509 break;
510 }
511 if (last < 0) {
512 ret = last;
513 break;
514 }
515
516 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
517 dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
518 ret = __data_list_add_eb(&data_refs, eb, dref);
519 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
520 logical = btrfs_extent_inline_ref_offset(eb, eiref);
521 ret = __shared_list_add(&shared_refs, logical);
522 }
523 } while (!ret && !last);
524 1176
525 /* ... then we proceed to in-tree references and ... */ 1177 while (!ret && (ref_node = ulist_next(refs, ref_node))) {
526 while (!ret) { 1178 ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
527 ++path->slots[0]; 1179 seq_elem.seq, &roots);
528 if (path->slots[0] > btrfs_header_nritems(eb)) { 1180 if (ret)
529 ret = btrfs_next_leaf(fs_info->extent_root, path);
530 if (ret) {
531 if (ret == 1)
532 ret = 0; /* we're done */
533 break;
534 }
535 eb = path->nodes[0];
536 }
537 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
538 if (key.objectid != extent_item_objectid)
539 break; 1181 break;
540 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1182 while (!ret && (root_node = ulist_next(roots, root_node))) {
541 dref = btrfs_item_ptr(eb, path->slots[0], 1183 pr_debug("root %llu references leaf %llu\n",
542 struct btrfs_extent_data_ref); 1184 root_node->val, ref_node->val);
543 ret = __data_list_add_eb(&data_refs, eb, dref); 1185 ret = iterate_leaf_refs(fs_info, path, ref_node->val,
544 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1186 extent_item_objectid,
545 ret = __shared_list_add(&shared_refs, key.offset); 1187 extent_item_pos, root_node->val,
1188 iterate, ctx);
546 } 1189 }
547 } 1190 }
548 1191
549 btrfs_release_path(path); 1192 ulist_free(refs);
550 1193 ulist_free(roots);
551 /* 1194out:
552 * ... only at the very end we can process the refs we found. this is 1195 btrfs_put_delayed_seq(delayed_refs, &seq_elem);
553 * because the iterator function we call is allowed to make tree lookups 1196 btrfs_end_transaction(trans, fs_info->extent_root);
554 * and we have to avoid deadlocks. additionally, we need more tree
555 * lookups ourselves for shared data refs.
556 */
557 while (!list_empty(&data_refs)) {
558 ref_d = list_first_entry(&data_refs, struct __data_ref, list);
559 list_del(&ref_d->list);
560 if (!ret)
561 ret = iterate(ref_d->inum, extent_offset +
562 ref_d->extent_data_item_offset,
563 ref_d->root, ctx);
564 kfree(ref_d);
565 }
566
567 while (!list_empty(&shared_refs)) {
568 ref_s = list_first_entry(&shared_refs, struct __shared_ref,
569 list);
570 list_del(&ref_s->list);
571 if (!ret)
572 ret = __iter_shared_inline_ref(fs_info,
573 ref_s->disk_byte,
574 extent_item_objectid,
575 extent_offset, path,
576 &data_refs,
577 iterate, ctx);
578 kfree(ref_s);
579 }
580
581 return ret; 1197 return ret;
582} 1198}
583 1199
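[Annotation: for reference, the iterate_extent_inodes_t callback as invoked above receives the inode number, the byte offset within that inode's file data, and the owning root; returning non-zero stops the iteration. A hypothetical callback sketch, with an invented name:]

	/* Invented example callback; returning non-zero stops iteration. */
	static int example_print_ref(u64 inum, u64 offset, u64 root, void *ctx)
	{
		pr_debug("extent referenced by inode %llu at offset %llu "
			 "in root %llu\n",
			 (unsigned long long)inum,
			 (unsigned long long)offset,
			 (unsigned long long)root);
		return 0;
	}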
@@ -586,19 +1202,20 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
586 iterate_extent_inodes_t *iterate, void *ctx) 1202 iterate_extent_inodes_t *iterate, void *ctx)
587{ 1203{
588 int ret; 1204 int ret;
589 u64 offset; 1205 u64 extent_item_pos;
590 struct btrfs_key found_key; 1206 struct btrfs_key found_key;
591 1207
592 ret = extent_from_logical(fs_info, logical, path, 1208 ret = extent_from_logical(fs_info, logical, path,
593 &found_key); 1209 &found_key);
1210 btrfs_release_path(path);
594 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) 1211 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
595 ret = -EINVAL; 1212 ret = -EINVAL;
596 if (ret < 0) 1213 if (ret < 0)
597 return ret; 1214 return ret;
598 1215
599 offset = logical - found_key.objectid; 1216 extent_item_pos = logical - found_key.objectid;
600 ret = iterate_extent_inodes(fs_info, path, found_key.objectid, 1217 ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
601 offset, iterate, ctx); 1218 extent_item_pos, iterate, ctx);
602 1219
603 return ret; 1220 return ret;
604} 1221}
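[Annotation, numbers invented: if extent_from_logical() returns an extent item with objectid 4096 and offset (length) 8192, a query for logical address 6144 lies within it and extent_item_pos becomes 6144 - 4096 = 2048; iterate_leaf_refs() then reports only file extents whose [data_offset, data_offset + data_len) window contains 2048.]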
@@ -643,6 +1260,10 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
643 for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { 1260 for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
644 name_len = btrfs_inode_ref_name_len(eb, iref); 1261 name_len = btrfs_inode_ref_name_len(eb, iref);
645 /* path must be released before calling iterate()! */ 1262 /* path must be released before calling iterate()! */
1263 pr_debug("following ref at offset %u for inode %llu in "
1264 "tree %llu\n", cur,
1265 (unsigned long long)found_key.objectid,
1266 (unsigned long long)fs_root->objectid);
646 ret = iterate(parent, iref, eb, ctx); 1267 ret = iterate(parent, iref, eb, ctx);
647 if (ret) { 1268 if (ret) {
648 free_extent_buffer(eb); 1269 free_extent_buffer(eb);
@@ -683,10 +1304,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
683 return PTR_ERR(fspath); 1304 return PTR_ERR(fspath);
684 1305
685 if (fspath > fspath_min) { 1306 if (fspath > fspath_min) {
1307 pr_debug("path resolved: %s\n", fspath);
686 ipath->fspath->val[i] = (u64)(unsigned long)fspath; 1308 ipath->fspath->val[i] = (u64)(unsigned long)fspath;
687 ++ipath->fspath->elem_cnt; 1309 ++ipath->fspath->elem_cnt;
688 ipath->fspath->bytes_left = fspath - fspath_min; 1310 ipath->fspath->bytes_left = fspath - fspath_min;
689 } else { 1311 } else {
1312 pr_debug("missed path, not enough space. missing bytes: %lu, "
1313 "constructed so far: %s\n",
1314 (unsigned long)(fspath_min - fspath), fspath_min);
690 ++ipath->fspath->elem_missed; 1315 ++ipath->fspath->elem_missed;
691 ipath->fspath->bytes_missing += fspath_min - fspath; 1316 ipath->fspath->bytes_missing += fspath_min - fspath;
692 ipath->fspath->bytes_left = 0; 1317 ipath->fspath->bytes_left = 0;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 92618837cb8f..d00dfa9ca934 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -20,6 +20,7 @@
20#define __BTRFS_BACKREF__ 20#define __BTRFS_BACKREF__
21 21
22#include "ioctl.h" 22#include "ioctl.h"
23#include "ulist.h"
23 24
24struct inode_fs_paths { 25struct inode_fs_paths {
25 struct btrfs_path *btrfs_path; 26 struct btrfs_path *btrfs_path;
@@ -54,6 +55,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
54 55
55int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); 56int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
56 57
58int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
59 struct btrfs_fs_info *fs_info, u64 bytenr,
60 u64 num_bytes, u64 seq, struct ulist **roots);
61
57struct btrfs_data_container *init_data_container(u32 total_bytes); 62struct btrfs_data_container *init_data_container(u32 total_bytes);
58struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, 63struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
59 struct btrfs_path *path); 64 struct btrfs_path *path);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 634608d2a6d0..9b9b15fd5204 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -51,6 +51,9 @@ struct btrfs_inode {
51 /* held while logging the inode in tree-log.c */ 51 /* held while logging the inode in tree-log.c */
52 struct mutex log_mutex; 52 struct mutex log_mutex;
53 53
54 /* held while doing delalloc reservations */
55 struct mutex delalloc_mutex;
56
54 /* used to order data wrt metadata */ 57 /* used to order data wrt metadata */
55 struct btrfs_ordered_inode_tree ordered_tree; 58 struct btrfs_ordered_inode_tree ordered_tree;
56 59
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
new file mode 100644
index 000000000000..b669a7d8e499
--- /dev/null
+++ b/fs/btrfs/check-integrity.c
@@ -0,0 +1,3069 @@
1/*
2 * Copyright (C) STRATO AG 2011. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19/*
20 * This module can be used to catch cases when the btrfs kernel
21 * code executes write requests to the disk that bring the file
22 * system in an inconsistent state. In such a state, a power-loss
23 * or kernel panic event would cause that the data on disk is
24 * lost or at least damaged.
25 *
26 * Code is added that examines all block write requests during
27 * runtime (including writes of the super block). Three rules
28 * are verified and an error is printed on violation of the
29 * rules:
30 * 1. It is not allowed to write a disk block which is
31 * currently referenced by the super block (either directly
32 * or indirectly).
33 * 2. When a super block is written, it is verified that all
34 * referenced (directly or indirectly) blocks fulfill the
35 * following requirements:
36 * 2a. All referenced blocks have either been present when
37 * the file system was mounted, (i.e., they have been
38 * referenced by the super block) or they have been
39 * written since then and the write completion callback
40 * was called and a FLUSH request to the device where
41 * these blocks are located was received and completed.
42 * 2b. All referenced blocks need to have a generation
43 * number which is equal to the parent's number.
44 *
45 * One issue that was found using this module was that the log
46 * tree on disk became temporarily corrupted because disk blocks
47 * that had been in use for the log tree had been freed and
48 * reused too early, while being referenced by the written super
49 * block.
50 *
51 * The search term in the kernel log that can be used to filter
52 * on the existence of detected integrity issues is
53 * "btrfs: attempt".
54 *
55 * The integrity check is enabled via mount options. These
56 * mount options are only supported if the integrity check
57 * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
58 *
59 * Example #1, apply integrity checks to all metadata:
60 * mount /dev/sdb1 /mnt -o check_int
61 *
62 * Example #2, apply integrity checks to all metadata and
63 * to data extents:
64 * mount /dev/sdb1 /mnt -o check_int_data
65 *
66 * Example #3, apply integrity checks to all metadata and dump
67 * the tree that the super block references to kernel messages
68 * each time after a super block was written:
69 * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
70 *
71 * If the integrity check tool is included and activated in
72 * the mount options, a significant amount of kernel memory is
73 * used and many additional CPU cycles are spent. Enabling this
74 * functionality is not intended for normal use. In most
75 * cases, unless you are a btrfs developer who needs to verify
76 * the integrity of (super)-block write requests, do not
77 * enable the config option BTRFS_FS_CHECK_INTEGRITY to
78 * include and compile the integrity check tool.
79 */
80
81#include <linux/sched.h>
82#include <linux/slab.h>
83#include <linux/buffer_head.h>
84#include <linux/mutex.h>
85#include <linux/crc32c.h>
86#include <linux/genhd.h>
87#include <linux/blkdev.h>
88#include "ctree.h"
89#include "disk-io.h"
90#include "transaction.h"
91#include "extent_io.h"
92#include "disk-io.h"
93#include "volumes.h"
94#include "print-tree.h"
95#include "locking.h"
96#include "check-integrity.h"
97
98#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
99#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
100#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
101#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
102#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
103#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
104#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
105#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
106 * excluding " [...]" */
107#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
108
109#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
110
111/*
112 * The definition of the bitmask fields for the print_mask.
113 * They are specified with the mount option check_integrity_print_mask.
114 */
115#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001
116#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002
117#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004
118#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008
119#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010
120#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020
121#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040
122#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080
123#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100
124#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200
125#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
126#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
127#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
128
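[Annotation: decoding Example #3 from the header comment above, check_int_print_mask=263 is 0x107, the OR of four of the bits just defined. Sketch only; the macro name is invented.]

	/* 263 == 0x107: the mask used in Example #3 above. */
	#define EXAMPLE_CHECK_INT_PRINT_MASK				\
		(BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE |			\
		 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION |	\
		 BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE |		\
		 BTRFSIC_PRINT_MASK_INITIAL_TREE)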
129struct btrfsic_dev_state;
130struct btrfsic_state;
131
132struct btrfsic_block {
133 u32 magic_num; /* only used for debug purposes */
134 unsigned int is_metadata:1; /* if it is meta-data, not data-data */
135 unsigned int is_superblock:1; /* if it is one of the superblocks */
136 unsigned int is_iodone:1; /* if is done by lower subsystem */
137 unsigned int iodone_w_error:1; /* error was indicated to endio */
138 unsigned int never_written:1; /* block was added because it was
139 * referenced, not because it was
140 * written */
141 unsigned int mirror_num:2; /* large enough to hold
142 * BTRFS_SUPER_MIRROR_MAX */
143 struct btrfsic_dev_state *dev_state;
144 u64 dev_bytenr; /* key, physical byte num on disk */
145 u64 logical_bytenr; /* logical byte num on disk */
146 u64 generation;
147 struct btrfs_disk_key disk_key; /* extra info to print in case of
148 * issues, will not always be correct */
149 struct list_head collision_resolving_node; /* list node */
150 struct list_head all_blocks_node; /* list node */
151
152 /* the following two lists contain block_link items */
153 struct list_head ref_to_list; /* list */
154 struct list_head ref_from_list; /* list */
155 struct btrfsic_block *next_in_same_bio;
156 void *orig_bio_bh_private;
157 union {
158 bio_end_io_t *bio;
159 bh_end_io_t *bh;
160 } orig_bio_bh_end_io;
161 int submit_bio_bh_rw;
162 u64 flush_gen; /* only valid if !never_written */
163};
164
165/*
166 * Elements of this type are allocated dynamically and required because
167 * each block object can refer to and can be referenced from multiple blocks.
168 * The key to look them up in the hashtable is the dev_bytenr of
169 * the block referred to plus the one of the block referring to it.
170 * The fact that they are searchable via a hashtable and that a
171 * ref_cnt is maintained is not required for the btrfs integrity
172 * check algorithm itself; it is only used to make the output more
173 * readable when an error is detected (an error is defined
174 * as a write operation to a block while that block is still referenced).
175 */
176struct btrfsic_block_link {
177 u32 magic_num; /* only used for debug purposes */
178 u32 ref_cnt;
179 struct list_head node_ref_to; /* list node */
180 struct list_head node_ref_from; /* list node */
181 struct list_head collision_resolving_node; /* list node */
182 struct btrfsic_block *block_ref_to;
183 struct btrfsic_block *block_ref_from;
184 u64 parent_generation;
185};
186
187struct btrfsic_dev_state {
188 u32 magic_num; /* only used for debug purposes */
189 struct block_device *bdev;
190 struct btrfsic_state *state;
191 struct list_head collision_resolving_node; /* list node */
192 struct btrfsic_block dummy_block_for_bio_bh_flush;
193 u64 last_flush_gen;
194 char name[BDEVNAME_SIZE];
195};
196
197struct btrfsic_block_hashtable {
198 struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
199};
200
201struct btrfsic_block_link_hashtable {
202 struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
203};
204
205struct btrfsic_dev_state_hashtable {
206 struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
207};
208
209struct btrfsic_block_data_ctx {
210 u64 start; /* virtual bytenr */
211 u64 dev_bytenr; /* physical bytenr on device */
212 u32 len;
213 struct btrfsic_dev_state *dev;
214 char *data;
215 struct buffer_head *bh; /* do not use if set to NULL */
216};
217
218/* This structure is used to implement recursion without occupying
219 * any stack space; see btrfsic_process_metablock() */
220struct btrfsic_stack_frame {
221 u32 magic;
222 u32 nr;
223 int error;
224 int i;
225 int limit_nesting;
226 int num_copies;
227 int mirror_num;
228 struct btrfsic_block *block;
229 struct btrfsic_block_data_ctx *block_ctx;
230 struct btrfsic_block *next_block;
231 struct btrfsic_block_data_ctx next_block_ctx;
232 struct btrfs_header *hdr;
233 struct btrfsic_stack_frame *prev;
234};
235
236/* Some state per mounted filesystem */
237struct btrfsic_state {
238 u32 print_mask;
239 int include_extent_data;
240 int csum_size;
241 struct list_head all_blocks_list;
242 struct btrfsic_block_hashtable block_hashtable;
243 struct btrfsic_block_link_hashtable block_link_hashtable;
244 struct btrfs_root *root;
245 u64 max_superblock_generation;
246 struct btrfsic_block *latest_superblock;
247};
248
249static void btrfsic_block_init(struct btrfsic_block *b);
250static struct btrfsic_block *btrfsic_block_alloc(void);
251static void btrfsic_block_free(struct btrfsic_block *b);
252static void btrfsic_block_link_init(struct btrfsic_block_link *n);
253static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
254static void btrfsic_block_link_free(struct btrfsic_block_link *n);
255static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
256static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
257static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
258static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
259static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
260 struct btrfsic_block_hashtable *h);
261static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
262static struct btrfsic_block *btrfsic_block_hashtable_lookup(
263 struct block_device *bdev,
264 u64 dev_bytenr,
265 struct btrfsic_block_hashtable *h);
266static void btrfsic_block_link_hashtable_init(
267 struct btrfsic_block_link_hashtable *h);
268static void btrfsic_block_link_hashtable_add(
269 struct btrfsic_block_link *l,
270 struct btrfsic_block_link_hashtable *h);
271static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
272static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
273 struct block_device *bdev_ref_to,
274 u64 dev_bytenr_ref_to,
275 struct block_device *bdev_ref_from,
276 u64 dev_bytenr_ref_from,
277 struct btrfsic_block_link_hashtable *h);
278static void btrfsic_dev_state_hashtable_init(
279 struct btrfsic_dev_state_hashtable *h);
280static void btrfsic_dev_state_hashtable_add(
281 struct btrfsic_dev_state *ds,
282 struct btrfsic_dev_state_hashtable *h);
283static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
284static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
285 struct block_device *bdev,
286 struct btrfsic_dev_state_hashtable *h);
287static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
288static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
289static int btrfsic_process_superblock(struct btrfsic_state *state,
290 struct btrfs_fs_devices *fs_devices);
291static int btrfsic_process_metablock(struct btrfsic_state *state,
292 struct btrfsic_block *block,
293 struct btrfsic_block_data_ctx *block_ctx,
294 struct btrfs_header *hdr,
295 int limit_nesting, int force_iodone_flag);
296static int btrfsic_create_link_to_next_block(
297 struct btrfsic_state *state,
298 struct btrfsic_block *block,
299 struct btrfsic_block_data_ctx
300 *block_ctx, u64 next_bytenr,
301 int limit_nesting,
302 struct btrfsic_block_data_ctx *next_block_ctx,
303 struct btrfsic_block **next_blockp,
304 int force_iodone_flag,
305 int *num_copiesp, int *mirror_nump,
306 struct btrfs_disk_key *disk_key,
307 u64 parent_generation);
308static int btrfsic_handle_extent_data(struct btrfsic_state *state,
309 struct btrfsic_block *block,
310 struct btrfsic_block_data_ctx *block_ctx,
311 u32 item_offset, int force_iodone_flag);
312static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
313 struct btrfsic_block_data_ctx *block_ctx_out,
314 int mirror_num);
315static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
316 u32 len, struct block_device *bdev,
317 struct btrfsic_block_data_ctx *block_ctx_out);
318static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
319static int btrfsic_read_block(struct btrfsic_state *state,
320 struct btrfsic_block_data_ctx *block_ctx);
321static void btrfsic_dump_database(struct btrfsic_state *state);
322static int btrfsic_test_for_metadata(struct btrfsic_state *state,
323 const u8 *data, unsigned int size);
324static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
325 u64 dev_bytenr, u8 *mapped_data,
326 unsigned int len, struct bio *bio,
327 int *bio_is_patched,
328 struct buffer_head *bh,
329 int submit_bio_bh_rw);
330static int btrfsic_process_written_superblock(
331 struct btrfsic_state *state,
332 struct btrfsic_block *const block,
333 struct btrfs_super_block *const super_hdr);
334static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
335static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
336static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
337 const struct btrfsic_block *block,
338 int recursion_level);
339static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
340 struct btrfsic_block *const block,
341 int recursion_level);
342static void btrfsic_print_add_link(const struct btrfsic_state *state,
343 const struct btrfsic_block_link *l);
344static void btrfsic_print_rem_link(const struct btrfsic_state *state,
345 const struct btrfsic_block_link *l);
346static char btrfsic_get_block_type(const struct btrfsic_state *state,
347 const struct btrfsic_block *block);
348static void btrfsic_dump_tree(const struct btrfsic_state *state);
349static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
350 const struct btrfsic_block *block,
351 int indent_level);
352static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
353 struct btrfsic_state *state,
354 struct btrfsic_block_data_ctx *next_block_ctx,
355 struct btrfsic_block *next_block,
356 struct btrfsic_block *from_block,
357 u64 parent_generation);
358static struct btrfsic_block *btrfsic_block_lookup_or_add(
359 struct btrfsic_state *state,
360 struct btrfsic_block_data_ctx *block_ctx,
361 const char *additional_string,
362 int is_metadata,
363 int is_iodone,
364 int never_written,
365 int mirror_num,
366 int *was_created);
367static int btrfsic_process_superblock_dev_mirror(
368 struct btrfsic_state *state,
369 struct btrfsic_dev_state *dev_state,
370 struct btrfs_device *device,
371 int superblock_mirror_num,
372 struct btrfsic_dev_state **selected_dev_state,
373 struct btrfs_super_block *selected_super);
374static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
375 struct block_device *bdev);
376static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
377 u64 bytenr,
378 struct btrfsic_dev_state *dev_state,
379 u64 dev_bytenr, char *data);
380
381static struct mutex btrfsic_mutex;
382static int btrfsic_is_initialized;
383static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
384
385
386static void btrfsic_block_init(struct btrfsic_block *b)
387{
388 b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
389 b->dev_state = NULL;
390 b->dev_bytenr = 0;
391 b->logical_bytenr = 0;
392 b->generation = BTRFSIC_GENERATION_UNKNOWN;
393 b->disk_key.objectid = 0;
394 b->disk_key.type = 0;
395 b->disk_key.offset = 0;
396 b->is_metadata = 0;
397 b->is_superblock = 0;
398 b->is_iodone = 0;
399 b->iodone_w_error = 0;
400 b->never_written = 0;
401 b->mirror_num = 0;
402 b->next_in_same_bio = NULL;
403 b->orig_bio_bh_private = NULL;
404 b->orig_bio_bh_end_io.bio = NULL;
405 INIT_LIST_HEAD(&b->collision_resolving_node);
406 INIT_LIST_HEAD(&b->all_blocks_node);
407 INIT_LIST_HEAD(&b->ref_to_list);
408 INIT_LIST_HEAD(&b->ref_from_list);
409 b->submit_bio_bh_rw = 0;
410 b->flush_gen = 0;
411}
412
413static struct btrfsic_block *btrfsic_block_alloc(void)
414{
415 struct btrfsic_block *b;
416
417 b = kzalloc(sizeof(*b), GFP_NOFS);
418 if (NULL != b)
419 btrfsic_block_init(b);
420
421 return b;
422}
423
424static void btrfsic_block_free(struct btrfsic_block *b)
425{
426 BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
427 kfree(b);
428}
429
430static void btrfsic_block_link_init(struct btrfsic_block_link *l)
431{
432 l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
433 l->ref_cnt = 1;
434 INIT_LIST_HEAD(&l->node_ref_to);
435 INIT_LIST_HEAD(&l->node_ref_from);
436 INIT_LIST_HEAD(&l->collision_resolving_node);
437 l->block_ref_to = NULL;
438 l->block_ref_from = NULL;
439}
440
441static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
442{
443 struct btrfsic_block_link *l;
444
445 l = kzalloc(sizeof(*l), GFP_NOFS);
446 if (NULL != l)
447 btrfsic_block_link_init(l);
448
449 return l;
450}
451
452static void btrfsic_block_link_free(struct btrfsic_block_link *l)
453{
454 BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
455 kfree(l);
456}
457
458static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
459{
460 ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
461 ds->bdev = NULL;
462 ds->state = NULL;
463 ds->name[0] = '\0';
464 INIT_LIST_HEAD(&ds->collision_resolving_node);
465 ds->last_flush_gen = 0;
466 btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
467 ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
468 ds->dummy_block_for_bio_bh_flush.dev_state = ds;
469}
470
471static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
472{
473 struct btrfsic_dev_state *ds;
474
475 ds = kzalloc(sizeof(*ds), GFP_NOFS);
476 if (NULL != ds)
477 btrfsic_dev_state_init(ds);
478
479 return ds;
480}
481
482static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
483{
484 BUG_ON(!(NULL == ds ||
485 BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
486 kfree(ds);
487}
488
489static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
490{
491 int i;
492
493 for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
494 INIT_LIST_HEAD(h->table + i);
495}
496
497static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
498 struct btrfsic_block_hashtable *h)
499{
500 const unsigned int hashval =
501 (((unsigned int)(b->dev_bytenr >> 16)) ^
502 ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
503 (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
504
505 list_add(&b->collision_resolving_node, h->table + hashval);
506}
507
508static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
509{
510 list_del(&b->collision_resolving_node);
511}
512
513static struct btrfsic_block *btrfsic_block_hashtable_lookup(
514 struct block_device *bdev,
515 u64 dev_bytenr,
516 struct btrfsic_block_hashtable *h)
517{
518 const unsigned int hashval =
519 (((unsigned int)(dev_bytenr >> 16)) ^
520 ((unsigned int)((uintptr_t)bdev))) &
521 (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
522 struct list_head *elem;
523
524 list_for_each(elem, h->table + hashval) {
525 struct btrfsic_block *const b =
526 list_entry(elem, struct btrfsic_block,
527 collision_resolving_node);
528
529 if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
530 return b;
531 }
532
533 return NULL;
534}
535
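[Annotation: the add and lookup helpers above must derive the same bucket from the (dev_bytenr, bdev) pair. A minimal sketch of the intended pairing; the function name is invented and object lifetimes are simplified.]

	/* After adding a block, looking it up by the same key finds it. */
	static struct btrfsic_block *example_insert_and_find(
			struct btrfsic_block *b,
			struct btrfsic_block_hashtable *h)
	{
		btrfsic_block_hashtable_add(b, h);
		return btrfsic_block_hashtable_lookup(b->dev_state->bdev,
						      b->dev_bytenr, h);
	}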
536static void btrfsic_block_link_hashtable_init(
537 struct btrfsic_block_link_hashtable *h)
538{
539 int i;
540
541 for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
542 INIT_LIST_HEAD(h->table + i);
543}
544
545static void btrfsic_block_link_hashtable_add(
546 struct btrfsic_block_link *l,
547 struct btrfsic_block_link_hashtable *h)
548{
549 const unsigned int hashval =
550 (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
551 ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
552 ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
553 ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
554 & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
555
556 BUG_ON(NULL == l->block_ref_to);
557 BUG_ON(NULL == l->block_ref_from);
558 list_add(&l->collision_resolving_node, h->table + hashval);
559}
560
561static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
562{
563 list_del(&l->collision_resolving_node);
564}
565
566static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
567 struct block_device *bdev_ref_to,
568 u64 dev_bytenr_ref_to,
569 struct block_device *bdev_ref_from,
570 u64 dev_bytenr_ref_from,
571 struct btrfsic_block_link_hashtable *h)
572{
573 const unsigned int hashval =
574 (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
575 ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
576 ((unsigned int)((uintptr_t)bdev_ref_to)) ^
577 ((unsigned int)((uintptr_t)bdev_ref_from))) &
578 (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
579 struct list_head *elem;
580
581 list_for_each(elem, h->table + hashval) {
582 struct btrfsic_block_link *const l =
583 list_entry(elem, struct btrfsic_block_link,
584 collision_resolving_node);
585
586 BUG_ON(NULL == l->block_ref_to);
587 BUG_ON(NULL == l->block_ref_from);
588 if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
589 l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
590 l->block_ref_from->dev_state->bdev == bdev_ref_from &&
591 l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
592 return l;
593 }
594
595 return NULL;
596}
597
598static void btrfsic_dev_state_hashtable_init(
599 struct btrfsic_dev_state_hashtable *h)
600{
601 int i;
602
603 for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
604 INIT_LIST_HEAD(h->table + i);
605}
606
607static void btrfsic_dev_state_hashtable_add(
608 struct btrfsic_dev_state *ds,
609 struct btrfsic_dev_state_hashtable *h)
610{
611 const unsigned int hashval =
612 (((unsigned int)((uintptr_t)ds->bdev)) &
613 (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
614
615 list_add(&ds->collision_resolving_node, h->table + hashval);
616}
617
618static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
619{
620 list_del(&ds->collision_resolving_node);
621}
622
623static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
624 struct block_device *bdev,
625 struct btrfsic_dev_state_hashtable *h)
626{
627 const unsigned int hashval =
628 (((unsigned int)((uintptr_t)bdev)) &
629 (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
630 struct list_head *elem;
631
632 list_for_each(elem, h->table + hashval) {
633 struct btrfsic_dev_state *const ds =
634 list_entry(elem, struct btrfsic_dev_state,
635 collision_resolving_node);
636
637 if (ds->bdev == bdev)
638 return ds;
639 }
640
641 return NULL;
642}
643
644static int btrfsic_process_superblock(struct btrfsic_state *state,
645 struct btrfs_fs_devices *fs_devices)
646{
647 int ret;
648 struct btrfs_super_block *selected_super;
649 struct list_head *dev_head = &fs_devices->devices;
650 struct btrfs_device *device;
651 struct btrfsic_dev_state *selected_dev_state = NULL;
652 int pass;
653
654 BUG_ON(NULL == state);
655 selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
656 if (NULL == selected_super) {
657 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
658 return -1;
659 }
660
661 list_for_each_entry(device, dev_head, dev_list) {
662 int i;
663 struct btrfsic_dev_state *dev_state;
664
665 if (!device->bdev || !device->name)
666 continue;
667
668 dev_state = btrfsic_dev_state_lookup(device->bdev);
669 BUG_ON(NULL == dev_state);
670 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
671 ret = btrfsic_process_superblock_dev_mirror(
672 state, dev_state, device, i,
673 &selected_dev_state, selected_super);
674 if (0 != ret && 0 == i) {
675 kfree(selected_super);
676 return ret;
677 }
678 }
679 }
680
681 if (NULL == state->latest_superblock) {
682 printk(KERN_INFO "btrfsic: no superblock found!\n");
683 kfree(selected_super);
684 return -1;
685 }
686
687 state->csum_size = btrfs_super_csum_size(selected_super);
688
689 for (pass = 0; pass < 3; pass++) {
690 int num_copies;
691 int mirror_num;
692 u64 next_bytenr;
693
694 switch (pass) {
695 case 0:
696 next_bytenr = btrfs_super_root(selected_super);
697 if (state->print_mask &
698 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
699 printk(KERN_INFO "root@%llu\n",
700 (unsigned long long)next_bytenr);
701 break;
702 case 1:
703 next_bytenr = btrfs_super_chunk_root(selected_super);
704 if (state->print_mask &
705 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
706 printk(KERN_INFO "chunk@%llu\n",
707 (unsigned long long)next_bytenr);
708 break;
709 case 2:
710 next_bytenr = btrfs_super_log_root(selected_super);
711 if (0 == next_bytenr)
712 continue;
713 if (state->print_mask &
714 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
715 printk(KERN_INFO "log@%llu\n",
716 (unsigned long long)next_bytenr);
717 break;
718 }
719
720 num_copies =
721 btrfs_num_copies(&state->root->fs_info->mapping_tree,
722 next_bytenr, PAGE_SIZE);
723 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
724 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
725 (unsigned long long)next_bytenr, num_copies);
726
727 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
728 struct btrfsic_block *next_block;
729 struct btrfsic_block_data_ctx tmp_next_block_ctx;
730 struct btrfsic_block_link *l;
731 struct btrfs_header *hdr;
732
733 ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
734 &tmp_next_block_ctx,
735 mirror_num);
736 if (ret) {
737 printk(KERN_INFO "btrfsic:"
738 " btrfsic_map_block(root @%llu,"
739 " mirror %d) failed!\n",
740 (unsigned long long)next_bytenr,
741 mirror_num);
742 kfree(selected_super);
743 return -1;
744 }
745
746 next_block = btrfsic_block_hashtable_lookup(
747 tmp_next_block_ctx.dev->bdev,
748 tmp_next_block_ctx.dev_bytenr,
749 &state->block_hashtable);
750 BUG_ON(NULL == next_block);
751
752 l = btrfsic_block_link_hashtable_lookup(
753 tmp_next_block_ctx.dev->bdev,
754 tmp_next_block_ctx.dev_bytenr,
755 state->latest_superblock->dev_state->
756 bdev,
757 state->latest_superblock->dev_bytenr,
758 &state->block_link_hashtable);
759 BUG_ON(NULL == l);
760
761 ret = btrfsic_read_block(state, &tmp_next_block_ctx);
762 if (ret < (int)BTRFSIC_BLOCK_SIZE) {
763 printk(KERN_INFO
764 "btrfsic: read @logical %llu failed!\n",
765 (unsigned long long)
766 tmp_next_block_ctx.start);
767 btrfsic_release_block_ctx(&tmp_next_block_ctx);
768 kfree(selected_super);
769 return -1;
770 }
771
772 hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
773 ret = btrfsic_process_metablock(state,
774 next_block,
775 &tmp_next_block_ctx,
776 hdr,
777 BTRFS_MAX_LEVEL + 3, 1);
778 btrfsic_release_block_ctx(&tmp_next_block_ctx);
779 }
780 }
781
782 kfree(selected_super);
783 return ret;
784}
785
786static int btrfsic_process_superblock_dev_mirror(
787 struct btrfsic_state *state,
788 struct btrfsic_dev_state *dev_state,
789 struct btrfs_device *device,
790 int superblock_mirror_num,
791 struct btrfsic_dev_state **selected_dev_state,
792 struct btrfs_super_block *selected_super)
793{
794 struct btrfs_super_block *super_tmp;
795 u64 dev_bytenr;
796 struct buffer_head *bh;
797 struct btrfsic_block *superblock_tmp;
798 int pass;
799 struct block_device *const superblock_bdev = device->bdev;
800
801 /* super block bytenr is always the unmapped device bytenr */
802 dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
803 bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
804 if (NULL == bh)
805 return -1;
806 super_tmp = (struct btrfs_super_block *)
807 (bh->b_data + (dev_bytenr & 4095));
808
809 if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
810 strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
811 sizeof(super_tmp->magic)) ||
812 memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
813 brelse(bh);
814 return 0;
815 }
816
817 superblock_tmp =
818 btrfsic_block_hashtable_lookup(superblock_bdev,
819 dev_bytenr,
820 &state->block_hashtable);
821 if (NULL == superblock_tmp) {
822 superblock_tmp = btrfsic_block_alloc();
823 if (NULL == superblock_tmp) {
824 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
825 brelse(bh);
826 return -1;
827 }
828 /* for superblock, only the dev_bytenr makes sense */
829 superblock_tmp->dev_bytenr = dev_bytenr;
830 superblock_tmp->dev_state = dev_state;
831 superblock_tmp->logical_bytenr = dev_bytenr;
832 superblock_tmp->generation = btrfs_super_generation(super_tmp);
833 superblock_tmp->is_metadata = 1;
834 superblock_tmp->is_superblock = 1;
835 superblock_tmp->is_iodone = 1;
836 superblock_tmp->never_written = 0;
837 superblock_tmp->mirror_num = 1 + superblock_mirror_num;
838 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
839 printk(KERN_INFO "New initial S-block (bdev %p, %s)"
840 " @%llu (%s/%llu/%d)\n",
841 superblock_bdev, device->name,
842 (unsigned long long)dev_bytenr,
843 dev_state->name,
844 (unsigned long long)dev_bytenr,
845 superblock_mirror_num);
846 list_add(&superblock_tmp->all_blocks_node,
847 &state->all_blocks_list);
848 btrfsic_block_hashtable_add(superblock_tmp,
849 &state->block_hashtable);
850 }
851
852 /* select the one with the highest generation field */
853 if (btrfs_super_generation(super_tmp) >
854 state->max_superblock_generation ||
855 0 == state->max_superblock_generation) {
856 memcpy(selected_super, super_tmp, sizeof(*selected_super));
857 *selected_dev_state = dev_state;
858 state->max_superblock_generation =
859 btrfs_super_generation(super_tmp);
860 state->latest_superblock = superblock_tmp;
861 }
862
863 for (pass = 0; pass < 3; pass++) {
864 u64 next_bytenr;
865 int num_copies;
866 int mirror_num;
867 const char *additional_string = NULL;
868 struct btrfs_disk_key tmp_disk_key;
869
870 tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
871 tmp_disk_key.offset = 0;
872 switch (pass) {
873 case 0:
874 tmp_disk_key.objectid =
875 cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
876 additional_string = "initial root ";
877 next_bytenr = btrfs_super_root(super_tmp);
878 break;
879 case 1:
880 tmp_disk_key.objectid =
881 cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
882 additional_string = "initial chunk ";
883 next_bytenr = btrfs_super_chunk_root(super_tmp);
884 break;
885 case 2:
886 tmp_disk_key.objectid =
887 cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
888 additional_string = "initial log ";
889 next_bytenr = btrfs_super_log_root(super_tmp);
890 if (0 == next_bytenr)
891 continue;
892 break;
893 }
894
895 num_copies =
896 btrfs_num_copies(&state->root->fs_info->mapping_tree,
897 next_bytenr, PAGE_SIZE);
898 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
899 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
900 (unsigned long long)next_bytenr, num_copies);
901 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
902 struct btrfsic_block *next_block;
903 struct btrfsic_block_data_ctx tmp_next_block_ctx;
904 struct btrfsic_block_link *l;
905
906 if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
907 &tmp_next_block_ctx,
908 mirror_num)) {
909 printk(KERN_INFO "btrfsic: btrfsic_map_block("
910 "bytenr @%llu, mirror %d) failed!\n",
911 (unsigned long long)next_bytenr,
912 mirror_num);
913 brelse(bh);
914 return -1;
915 }
916
917 next_block = btrfsic_block_lookup_or_add(
918 state, &tmp_next_block_ctx,
919 additional_string, 1, 1, 0,
920 mirror_num, NULL);
921 if (NULL == next_block) {
922 btrfsic_release_block_ctx(&tmp_next_block_ctx);
923 brelse(bh);
924 return -1;
925 }
926
927 next_block->disk_key = tmp_disk_key;
928 next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
929 l = btrfsic_block_link_lookup_or_add(
930 state, &tmp_next_block_ctx,
931 next_block, superblock_tmp,
932 BTRFSIC_GENERATION_UNKNOWN);
933 btrfsic_release_block_ctx(&tmp_next_block_ctx);
934 if (NULL == l) {
935 brelse(bh);
936 return -1;
937 }
938 }
939 }
940 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
941 btrfsic_dump_tree_sub(state, superblock_tmp, 0);
942
943 brelse(bh);
944 return 0;
945}
946
947static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
948{
949 struct btrfsic_stack_frame *sf;
950
951 sf = kzalloc(sizeof(*sf), GFP_NOFS);
952 if (NULL == sf)
953 printk(KERN_INFO "btrfsic: alloc memory failed!\n");
954 else
955 sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
956 return sf;
957}
958
959static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
960{
961 BUG_ON(!(NULL == sf ||
962 BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
963 kfree(sf);
964}
965
966static int btrfsic_process_metablock(
967 struct btrfsic_state *state,
968 struct btrfsic_block *const first_block,
969 struct btrfsic_block_data_ctx *const first_block_ctx,
970 struct btrfs_header *const first_hdr,
971 int first_limit_nesting, int force_iodone_flag)
972{
973 struct btrfsic_stack_frame initial_stack_frame = { 0 };
974 struct btrfsic_stack_frame *sf;
975 struct btrfsic_stack_frame *next_stack;
976
977 sf = &initial_stack_frame;
978 sf->error = 0;
979 sf->i = -1;
980 sf->limit_nesting = first_limit_nesting;
981 sf->block = first_block;
982 sf->block_ctx = first_block_ctx;
983 sf->next_block = NULL;
984 sf->hdr = first_hdr;
985 sf->prev = NULL;
986
987continue_with_new_stack_frame:
988 sf->block->generation = le64_to_cpu(sf->hdr->generation);
989 if (0 == sf->hdr->level) {
990 struct btrfs_leaf *const leafhdr =
991 (struct btrfs_leaf *)sf->hdr;
992
993 if (-1 == sf->i) {
994 sf->nr = le32_to_cpu(leafhdr->header.nritems);
995
996 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
997 printk(KERN_INFO
998 "leaf %llu items %d generation %llu"
999 " owner %llu\n",
1000 (unsigned long long)
1001 sf->block_ctx->start,
1002 sf->nr,
1003 (unsigned long long)
1004 le64_to_cpu(leafhdr->header.generation),
1005 (unsigned long long)
1006 le64_to_cpu(leafhdr->header.owner));
1007 }
1008
1009continue_with_current_leaf_stack_frame:
1010 if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
1011 sf->i++;
1012 sf->num_copies = 0;
1013 }
1014
1015 if (sf->i < sf->nr) {
1016 struct btrfs_item *disk_item = leafhdr->items + sf->i;
1017 struct btrfs_disk_key *disk_key = &disk_item->key;
1018 u8 type;
1019 const u32 item_offset = le32_to_cpu(disk_item->offset);
1020
1021 type = disk_key->type;
1022
1023 if (BTRFS_ROOT_ITEM_KEY == type) {
1024 const struct btrfs_root_item *const root_item =
1025 (struct btrfs_root_item *)
1026 (sf->block_ctx->data +
1027 offsetof(struct btrfs_leaf, items) +
1028 item_offset);
1029 const u64 next_bytenr =
1030 le64_to_cpu(root_item->bytenr);
1031
1032 sf->error =
1033 btrfsic_create_link_to_next_block(
1034 state,
1035 sf->block,
1036 sf->block_ctx,
1037 next_bytenr,
1038 sf->limit_nesting,
1039 &sf->next_block_ctx,
1040 &sf->next_block,
1041 force_iodone_flag,
1042 &sf->num_copies,
1043 &sf->mirror_num,
1044 disk_key,
1045 le64_to_cpu(root_item->
1046 generation));
1047 if (sf->error)
1048 goto one_stack_frame_backwards;
1049
1050 if (NULL != sf->next_block) {
1051 struct btrfs_header *const next_hdr =
1052 (struct btrfs_header *)
1053 sf->next_block_ctx.data;
1054
1055 next_stack =
1056 btrfsic_stack_frame_alloc();
1057 if (NULL == next_stack) {
1058 btrfsic_release_block_ctx(
1059 &sf->
1060 next_block_ctx);
1061 goto one_stack_frame_backwards;
1062 }
1063
1064 next_stack->i = -1;
1065 next_stack->block = sf->next_block;
1066 next_stack->block_ctx =
1067 &sf->next_block_ctx;
1068 next_stack->next_block = NULL;
1069 next_stack->hdr = next_hdr;
1070 next_stack->limit_nesting =
1071 sf->limit_nesting - 1;
1072 next_stack->prev = sf;
1073 sf = next_stack;
1074 goto continue_with_new_stack_frame;
1075 }
1076 } else if (BTRFS_EXTENT_DATA_KEY == type &&
1077 state->include_extent_data) {
1078 sf->error = btrfsic_handle_extent_data(
1079 state,
1080 sf->block,
1081 sf->block_ctx,
1082 item_offset,
1083 force_iodone_flag);
1084 if (sf->error)
1085 goto one_stack_frame_backwards;
1086 }
1087
1088 goto continue_with_current_leaf_stack_frame;
1089 }
1090 } else {
1091 struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
1092
1093 if (-1 == sf->i) {
1094 sf->nr = le32_to_cpu(nodehdr->header.nritems);
1095
1096 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1097 printk(KERN_INFO "node %llu level %d items %d"
1098 " generation %llu owner %llu\n",
1099 (unsigned long long)
1100 sf->block_ctx->start,
1101 nodehdr->header.level, sf->nr,
1102 (unsigned long long)
1103 le64_to_cpu(nodehdr->header.generation),
1104 (unsigned long long)
1105 le64_to_cpu(nodehdr->header.owner));
1106 }
1107
1108continue_with_current_node_stack_frame:
1109 if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
1110 sf->i++;
1111 sf->num_copies = 0;
1112 }
1113
1114 if (sf->i < sf->nr) {
1115 struct btrfs_key_ptr *disk_key_ptr =
1116 nodehdr->ptrs + sf->i;
1117 const u64 next_bytenr =
1118 le64_to_cpu(disk_key_ptr->blockptr);
1119
1120 sf->error = btrfsic_create_link_to_next_block(
1121 state,
1122 sf->block,
1123 sf->block_ctx,
1124 next_bytenr,
1125 sf->limit_nesting,
1126 &sf->next_block_ctx,
1127 &sf->next_block,
1128 force_iodone_flag,
1129 &sf->num_copies,
1130 &sf->mirror_num,
1131 &disk_key_ptr->key,
1132 le64_to_cpu(disk_key_ptr->generation));
1133 if (sf->error)
1134 goto one_stack_frame_backwards;
1135
1136 if (NULL != sf->next_block) {
1137 struct btrfs_header *const next_hdr =
1138 (struct btrfs_header *)
1139 sf->next_block_ctx.data;
1140
1141 next_stack = btrfsic_stack_frame_alloc();
1142 if (NULL == next_stack)
1143 goto one_stack_frame_backwards;
1144
1145 next_stack->i = -1;
1146 next_stack->block = sf->next_block;
1147 next_stack->block_ctx = &sf->next_block_ctx;
1148 next_stack->next_block = NULL;
1149 next_stack->hdr = next_hdr;
1150 next_stack->limit_nesting =
1151 sf->limit_nesting - 1;
1152 next_stack->prev = sf;
1153 sf = next_stack;
1154 goto continue_with_new_stack_frame;
1155 }
1156
1157 goto continue_with_current_node_stack_frame;
1158 }
1159 }
1160
1161one_stack_frame_backwards:
1162 if (NULL != sf->prev) {
1163 struct btrfsic_stack_frame *const prev = sf->prev;
1164
1165 /* the one for the initial block is freed in the caller */
1166 btrfsic_release_block_ctx(sf->block_ctx);
1167
1168 if (sf->error) {
1169 prev->error = sf->error;
1170 btrfsic_stack_frame_free(sf);
1171 sf = prev;
1172 goto one_stack_frame_backwards;
1173 }
1174
1175 btrfsic_stack_frame_free(sf);
1176 sf = prev;
1177 goto continue_with_new_stack_frame;
1178 } else {
1179 BUG_ON(&initial_stack_frame != sf);
1180 }
1181
1182 return sf->error;
1183}
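
/*
 * Note on the control flow above: btrfsic_process_metablock() avoids
 * deep kernel-stack recursion by keeping an explicit chain of
 * btrfsic_stack_frame structures.  The labels
 * continue_with_new_stack_frame and one_stack_frame_backwards
 * implement descend and ascend on that chain, one frame per tree
 * level, so the walk over nodes and leaves is iterative.
 */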
1184
1185static int btrfsic_create_link_to_next_block(
1186 struct btrfsic_state *state,
1187 struct btrfsic_block *block,
1188 struct btrfsic_block_data_ctx *block_ctx,
1189 u64 next_bytenr,
1190 int limit_nesting,
1191 struct btrfsic_block_data_ctx *next_block_ctx,
1192 struct btrfsic_block **next_blockp,
1193 int force_iodone_flag,
1194 int *num_copiesp, int *mirror_nump,
1195 struct btrfs_disk_key *disk_key,
1196 u64 parent_generation)
1197{
1198 struct btrfsic_block *next_block = NULL;
1199 int ret;
1200 struct btrfsic_block_link *l;
1201 int did_alloc_block_link;
1202 int block_was_created;
1203
1204 *next_blockp = NULL;
1205 if (0 == *num_copiesp) {
1206 *num_copiesp =
1207 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1208 next_bytenr, PAGE_SIZE);
1209 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1210 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1211 (unsigned long long)next_bytenr, *num_copiesp);
1212 *mirror_nump = 1;
1213 }
1214
1215 if (*mirror_nump > *num_copiesp)
1216 return 0;
1217
1218 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1219 printk(KERN_INFO
1220 "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
1221 *mirror_nump);
1222 ret = btrfsic_map_block(state, next_bytenr,
1223 BTRFSIC_BLOCK_SIZE,
1224 next_block_ctx, *mirror_nump);
1225 if (ret) {
1226 printk(KERN_INFO
1227 "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
1228 (unsigned long long)next_bytenr, *mirror_nump);
1229 btrfsic_release_block_ctx(next_block_ctx);
1230 *next_blockp = NULL;
1231 return -1;
1232 }
1233
1234 next_block = btrfsic_block_lookup_or_add(state,
1235 next_block_ctx, "referenced ",
1236 1, force_iodone_flag,
1237 !force_iodone_flag,
1238 *mirror_nump,
1239 &block_was_created);
1240 if (NULL == next_block) {
1241 btrfsic_release_block_ctx(next_block_ctx);
1242 *next_blockp = NULL;
1243 return -1;
1244 }
1245 if (block_was_created) {
1246 l = NULL;
1247 next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
1248 } else {
1249 if (next_block->logical_bytenr != next_bytenr &&
1250 !(!next_block->is_metadata &&
1251 0 == next_block->logical_bytenr)) {
1252 printk(KERN_INFO
1253 "Referenced block @%llu (%s/%llu/%d)"
1254 " found in hash table, %c,"
1255 " bytenr mismatch (!= stored %llu).\n",
1256 (unsigned long long)next_bytenr,
1257 next_block_ctx->dev->name,
1258 (unsigned long long)next_block_ctx->dev_bytenr,
1259 *mirror_nump,
1260 btrfsic_get_block_type(state, next_block),
1261 (unsigned long long)next_block->logical_bytenr);
1262 } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1263 printk(KERN_INFO
1264 "Referenced block @%llu (%s/%llu/%d)"
1265 " found in hash table, %c.\n",
1266 (unsigned long long)next_bytenr,
1267 next_block_ctx->dev->name,
1268 (unsigned long long)next_block_ctx->dev_bytenr,
1269 *mirror_nump,
1270 btrfsic_get_block_type(state, next_block));
1271 next_block->logical_bytenr = next_bytenr;
1272
1273 next_block->mirror_num = *mirror_nump;
1274 l = btrfsic_block_link_hashtable_lookup(
1275 next_block_ctx->dev->bdev,
1276 next_block_ctx->dev_bytenr,
1277 block_ctx->dev->bdev,
1278 block_ctx->dev_bytenr,
1279 &state->block_link_hashtable);
1280 }
1281
1282 next_block->disk_key = *disk_key;
1283 if (NULL == l) {
1284 l = btrfsic_block_link_alloc();
1285 if (NULL == l) {
1286 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
1287 btrfsic_release_block_ctx(next_block_ctx);
1288 *next_blockp = NULL;
1289 return -1;
1290 }
1291
1292 did_alloc_block_link = 1;
1293 l->block_ref_to = next_block;
1294 l->block_ref_from = block;
1295 l->ref_cnt = 1;
1296 l->parent_generation = parent_generation;
1297
1298 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1299 btrfsic_print_add_link(state, l);
1300
1301 list_add(&l->node_ref_to, &block->ref_to_list);
1302 list_add(&l->node_ref_from, &next_block->ref_from_list);
1303
1304 btrfsic_block_link_hashtable_add(l,
1305 &state->block_link_hashtable);
1306 } else {
1307 did_alloc_block_link = 0;
1308 if (0 == limit_nesting) {
1309 l->ref_cnt++;
1310 l->parent_generation = parent_generation;
1311 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1312 btrfsic_print_add_link(state, l);
1313 }
1314 }
1315
1316 if (limit_nesting > 0 && did_alloc_block_link) {
1317 ret = btrfsic_read_block(state, next_block_ctx);
1318 if (ret < (int)BTRFSIC_BLOCK_SIZE) {
1319 printk(KERN_INFO
1320 "btrfsic: read block @logical %llu failed!\n",
1321 (unsigned long long)next_bytenr);
1322 btrfsic_release_block_ctx(next_block_ctx);
1323 *next_blockp = NULL;
1324 return -1;
1325 }
1326
1327 *next_blockp = next_block;
1328 } else {
1329 *next_blockp = NULL;
1330 }
1331 (*mirror_nump)++;
1332
1333 return 0;
1334}
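
/*
 * Calling convention sketch (derived from the callers above, not
 * authoritative): btrfsic_create_link_to_next_block() is invoked once
 * per mirror for the same key pointer.  With *num_copiesp == 0 it
 * initializes the copy count via btrfs_num_copies() and sets
 * *mirror_nump to 1; every successful call then bumps *mirror_nump,
 * and the caller keeps re-processing the same item until
 * mirror_num > num_copies:
 *
 *	sf->error = btrfsic_create_link_to_next_block(state, ...,
 *			&sf->num_copies, &sf->mirror_num, ...);
 *	if (NULL != sf->next_block)
 *		descend into the child block;
 *
 * *next_blockp is only set (and the child block actually read) when
 * limit_nesting > 0 and the block link was newly allocated; otherwise
 * only the existing link's refcount is updated.
 */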
1335
1336static int btrfsic_handle_extent_data(
1337 struct btrfsic_state *state,
1338 struct btrfsic_block *block,
1339 struct btrfsic_block_data_ctx *block_ctx,
1340 u32 item_offset, int force_iodone_flag)
1341{
1342 int ret;
1343 struct btrfs_file_extent_item *file_extent_item =
1344 (struct btrfs_file_extent_item *)(block_ctx->data +
1345 offsetof(struct btrfs_leaf,
1346 items) + item_offset);
1347 u64 next_bytenr =
1348 le64_to_cpu(file_extent_item->disk_bytenr) +
1349 le64_to_cpu(file_extent_item->offset);
1350 u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
1351 u64 generation = le64_to_cpu(file_extent_item->generation);
1352 struct btrfsic_block_link *l;
1353
1354 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1355 printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
1356 " offset = %llu, num_bytes = %llu\n",
1357 file_extent_item->type,
1358 (unsigned long long)
1359 le64_to_cpu(file_extent_item->disk_bytenr),
1360 (unsigned long long)
1361 le64_to_cpu(file_extent_item->offset),
1362 (unsigned long long)
1363 le64_to_cpu(file_extent_item->num_bytes));
1364 if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
1365 ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
1366 return 0;
1367 while (num_bytes > 0) {
1368 u32 chunk_len;
1369 int num_copies;
1370 int mirror_num;
1371
1372 if (num_bytes > BTRFSIC_BLOCK_SIZE)
1373 chunk_len = BTRFSIC_BLOCK_SIZE;
1374 else
1375 chunk_len = num_bytes;
1376
1377 num_copies =
1378 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1379 next_bytenr, PAGE_SIZE);
1380 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1381 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1382 (unsigned long long)next_bytenr, num_copies);
1383 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
1384 struct btrfsic_block_data_ctx next_block_ctx;
1385 struct btrfsic_block *next_block;
1386 int block_was_created;
1387
1388 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1389 printk(KERN_INFO "btrfsic_handle_extent_data("
1390 "mirror_num=%d)\n", mirror_num);
1391 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1392 printk(KERN_INFO
1393 "\tdisk_bytenr = %llu, num_bytes %u\n",
1394 (unsigned long long)next_bytenr,
1395 chunk_len);
1396 ret = btrfsic_map_block(state, next_bytenr,
1397 chunk_len, &next_block_ctx,
1398 mirror_num);
1399 if (ret) {
1400 printk(KERN_INFO
1401 "btrfsic: btrfsic_map_block(@%llu,"
1402 " mirror=%d) failed!\n",
1403 (unsigned long long)next_bytenr,
1404 mirror_num);
1405 return -1;
1406 }
1407
1408 next_block = btrfsic_block_lookup_or_add(
1409 state,
1410 &next_block_ctx,
1411 "referenced ",
1412 0,
1413 force_iodone_flag,
1414 !force_iodone_flag,
1415 mirror_num,
1416 &block_was_created);
1417 if (NULL == next_block) {
1418 printk(KERN_INFO
1419 "btrfsic: error, kmalloc failed!\n");
1420 btrfsic_release_block_ctx(&next_block_ctx);
1421 return -1;
1422 }
1423 if (!block_was_created) {
1424 if (next_block->logical_bytenr != next_bytenr &&
1425 !(!next_block->is_metadata &&
1426 0 == next_block->logical_bytenr)) {
1427 printk(KERN_INFO
1428 "Referenced block"
1429 " @%llu (%s/%llu/%d)"
1430 " found in hash table, D,"
1431 " bytenr mismatch"
1432 " (!= stored %llu).\n",
1433 (unsigned long long)next_bytenr,
1434 next_block_ctx.dev->name,
1435 (unsigned long long)
1436 next_block_ctx.dev_bytenr,
1437 mirror_num,
1438 (unsigned long long)
1439 next_block->logical_bytenr);
1440 }
1441 next_block->logical_bytenr = next_bytenr;
1442 next_block->mirror_num = mirror_num;
1443 }
1444
1445 l = btrfsic_block_link_lookup_or_add(state,
1446 &next_block_ctx,
1447 next_block, block,
1448 generation);
1449 btrfsic_release_block_ctx(&next_block_ctx);
1450 if (NULL == l)
1451 return -1;
1452 }
1453
1454 next_bytenr += chunk_len;
1455 num_bytes -= chunk_len;
1456 }
1457
1458 return 0;
1459}
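
/*
 * Worked example, assuming BTRFSIC_BLOCK_SIZE is 4096: a regular file
 * extent with disk_bytenr = X, offset = O and num_bytes = 10240 is
 * tracked as data starting at logical X + O, and the loop above splits
 * it into chunks of 4096, 4096 and 2048 bytes.  For every chunk, one
 * block link per mirror copy is looked up or created.
 */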
1460
1461static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
1462 struct btrfsic_block_data_ctx *block_ctx_out,
1463 int mirror_num)
1464{
1465 int ret;
1466 u64 length;
1467 struct btrfs_bio *multi = NULL;
1468 struct btrfs_device *device;
1469
1470 length = len;
1471	ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
1472			      bytenr, &length, &multi, mirror_num);
1473
	/* bail out before dereferencing multi; it is not valid when the
	 * mapping failed */
	if (ret) {
		block_ctx_out->dev = NULL;
		return ret;
	}

1474	device = multi->stripes[0].dev;
1475 block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
1476 block_ctx_out->dev_bytenr = multi->stripes[0].physical;
1477 block_ctx_out->start = bytenr;
1478 block_ctx_out->len = len;
1479 block_ctx_out->data = NULL;
1480 block_ctx_out->bh = NULL;
1481
1482 if (0 == ret)
1483 kfree(multi);
1484 if (NULL == block_ctx_out->dev) {
1485 ret = -ENXIO;
1486 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
1487 }
1488
1489 return ret;
1490}
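
/*
 * Usage sketch: every successful btrfsic_map_block() call is paired
 * with btrfsic_release_block_ctx() once the mapping is no longer
 * needed, the pattern used throughout this file:
 *
 *	if (0 == btrfsic_map_block(state, bytenr, len, &ctx, mirror_num)) {
 *		use ctx.dev and ctx.dev_bytenr ...;
 *		btrfsic_release_block_ctx(&ctx);
 *	}
 */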
1491
1492static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
1493 u32 len, struct block_device *bdev,
1494 struct btrfsic_block_data_ctx *block_ctx_out)
1495{
1496 block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
1497 block_ctx_out->dev_bytenr = bytenr;
1498 block_ctx_out->start = bytenr;
1499 block_ctx_out->len = len;
1500 block_ctx_out->data = NULL;
1501 block_ctx_out->bh = NULL;
1502 if (NULL != block_ctx_out->dev) {
1503 return 0;
1504 } else {
1505 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
1506 return -ENXIO;
1507 }
1508}
1509
1510static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
1511{
1512 if (NULL != block_ctx->bh) {
1513 brelse(block_ctx->bh);
1514 block_ctx->bh = NULL;
1515 }
1516}
1517
1518static int btrfsic_read_block(struct btrfsic_state *state,
1519 struct btrfsic_block_data_ctx *block_ctx)
1520{
1521 block_ctx->bh = NULL;
1522 if (block_ctx->dev_bytenr & 4095) {
1523 printk(KERN_INFO
1524 "btrfsic: read_block() with unaligned bytenr %llu\n",
1525 (unsigned long long)block_ctx->dev_bytenr);
1526 return -1;
1527 }
1528 if (block_ctx->len > 4096) {
1529 printk(KERN_INFO
1530		       "btrfsic: read_block() with too large size %d\n",
1531 block_ctx->len);
1532 return -1;
1533 }
1534
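	/* __bread() takes the block number in units of the block size;
	 * with 4096-byte blocks that is dev_bytenr >> 12 */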
1535 block_ctx->bh = __bread(block_ctx->dev->bdev,
1536 block_ctx->dev_bytenr >> 12, 4096);
1537 if (NULL == block_ctx->bh)
1538 return -1;
1539 block_ctx->data = block_ctx->bh->b_data;
1540
1541 return block_ctx->len;
1542}
1543
1544static void btrfsic_dump_database(struct btrfsic_state *state)
1545{
1546 struct list_head *elem_all;
1547
1548 BUG_ON(NULL == state);
1549
1550 printk(KERN_INFO "all_blocks_list:\n");
1551 list_for_each(elem_all, &state->all_blocks_list) {
1552 const struct btrfsic_block *const b_all =
1553 list_entry(elem_all, struct btrfsic_block,
1554 all_blocks_node);
1555 struct list_head *elem_ref_to;
1556 struct list_head *elem_ref_from;
1557
1558 printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
1559 btrfsic_get_block_type(state, b_all),
1560 (unsigned long long)b_all->logical_bytenr,
1561 b_all->dev_state->name,
1562 (unsigned long long)b_all->dev_bytenr,
1563 b_all->mirror_num);
1564
1565 list_for_each(elem_ref_to, &b_all->ref_to_list) {
1566 const struct btrfsic_block_link *const l =
1567 list_entry(elem_ref_to,
1568 struct btrfsic_block_link,
1569 node_ref_to);
1570
1571 printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
1572 " refers %u* to"
1573 " %c @%llu (%s/%llu/%d)\n",
1574 btrfsic_get_block_type(state, b_all),
1575 (unsigned long long)b_all->logical_bytenr,
1576 b_all->dev_state->name,
1577 (unsigned long long)b_all->dev_bytenr,
1578 b_all->mirror_num,
1579 l->ref_cnt,
1580 btrfsic_get_block_type(state, l->block_ref_to),
1581 (unsigned long long)
1582 l->block_ref_to->logical_bytenr,
1583 l->block_ref_to->dev_state->name,
1584 (unsigned long long)l->block_ref_to->dev_bytenr,
1585 l->block_ref_to->mirror_num);
1586 }
1587
1588 list_for_each(elem_ref_from, &b_all->ref_from_list) {
1589 const struct btrfsic_block_link *const l =
1590 list_entry(elem_ref_from,
1591 struct btrfsic_block_link,
1592 node_ref_from);
1593
1594 printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
1595 " is ref %u* from"
1596 " %c @%llu (%s/%llu/%d)\n",
1597 btrfsic_get_block_type(state, b_all),
1598 (unsigned long long)b_all->logical_bytenr,
1599 b_all->dev_state->name,
1600 (unsigned long long)b_all->dev_bytenr,
1601 b_all->mirror_num,
1602 l->ref_cnt,
1603 btrfsic_get_block_type(state, l->block_ref_from),
1604 (unsigned long long)
1605 l->block_ref_from->logical_bytenr,
1606 l->block_ref_from->dev_state->name,
1607 (unsigned long long)
1608 l->block_ref_from->dev_bytenr,
1609 l->block_ref_from->mirror_num);
1610 }
1611
1612 printk(KERN_INFO "\n");
1613 }
1614}
1615
1616/*
1617 * Test whether the disk block contains a tree block (leaf or node)
1618 * (note that this test fails for the super block)
1619 */
1620static int btrfsic_test_for_metadata(struct btrfsic_state *state,
1621 const u8 *data, unsigned int size)
1622{
1623 struct btrfs_header *h;
1624 u8 csum[BTRFS_CSUM_SIZE];
1625 u32 crc = ~(u32)0;
1626 int fail = 0;
1627 int crc_fail = 0;
1628
1629 h = (struct btrfs_header *)data;
1630
1631 if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
1632 fail++;
1633
1634 crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE);
1635 btrfs_csum_final(crc, csum);
1636 if (memcmp(csum, h->csum, state->csum_size))
1637 crc_fail++;
1638
1639 return fail || crc_fail;
1640}
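
/*
 * Note that the return convention is inverted relative to the name:
 * 0 means the block does look like a tree block (the fsid matches and
 * the crc32c over the page behind the checksum area matches), non-zero
 * means it does not.  Callers therefore test for
 * "0 == btrfsic_test_for_metadata(...)".  The 'size' parameter is
 * currently unused; the checksum is always computed over PAGE_SIZE
 * bytes.
 */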
1641
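/*
 * Summary of the function below: every block that is about to be
 * written passes through this hook.  If the block is already known
 * (hash table hit), its outgoing references are dropped, the
 * overwrite-safety checks are run and its state is refreshed from the
 * new contents; if it is unknown, a new btrfsic_block is allocated.
 * In both cases the bio/bh completion handler is patched (see
 * btrfsic_bio_end_io() and btrfsic_bh_end_io()) so that is_iodone and
 * flush_gen can be maintained when the IO completes.
 */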
1642static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1643 u64 dev_bytenr,
1644 u8 *mapped_data, unsigned int len,
1645 struct bio *bio,
1646 int *bio_is_patched,
1647 struct buffer_head *bh,
1648 int submit_bio_bh_rw)
1649{
1650 int is_metadata;
1651 struct btrfsic_block *block;
1652 struct btrfsic_block_data_ctx block_ctx;
1653 int ret;
1654 struct btrfsic_state *state = dev_state->state;
1655 struct block_device *bdev = dev_state->bdev;
1656
1657 WARN_ON(len > PAGE_SIZE);
1658 is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
1659 if (NULL != bio_is_patched)
1660 *bio_is_patched = 0;
1661
1662 block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
1663 &state->block_hashtable);
1664 if (NULL != block) {
1665 u64 bytenr = 0;
1666 struct list_head *elem_ref_to;
1667 struct list_head *tmp_ref_to;
1668
1669 if (block->is_superblock) {
1670 bytenr = le64_to_cpu(((struct btrfs_super_block *)
1671 mapped_data)->bytenr);
1672 is_metadata = 1;
1673 if (state->print_mask &
1674 BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
1675 printk(KERN_INFO
1676 "[before new superblock is written]:\n");
1677 btrfsic_dump_tree_sub(state, block, 0);
1678 }
1679 }
1680 if (is_metadata) {
1681 if (!block->is_superblock) {
1682 bytenr = le64_to_cpu(((struct btrfs_header *)
1683 mapped_data)->bytenr);
1684 btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
1685 dev_state,
1686 dev_bytenr,
1687 mapped_data);
1688 }
1689 if (block->logical_bytenr != bytenr) {
1690 printk(KERN_INFO
1691 "Written block @%llu (%s/%llu/%d)"
1692 " found in hash table, %c,"
1693 " bytenr mismatch"
1694 " (!= stored %llu).\n",
1695 (unsigned long long)bytenr,
1696 dev_state->name,
1697 (unsigned long long)dev_bytenr,
1698 block->mirror_num,
1699 btrfsic_get_block_type(state, block),
1700 (unsigned long long)
1701 block->logical_bytenr);
1702 block->logical_bytenr = bytenr;
1703 } else if (state->print_mask &
1704 BTRFSIC_PRINT_MASK_VERBOSE)
1705 printk(KERN_INFO
1706 "Written block @%llu (%s/%llu/%d)"
1707 " found in hash table, %c.\n",
1708 (unsigned long long)bytenr,
1709 dev_state->name,
1710 (unsigned long long)dev_bytenr,
1711 block->mirror_num,
1712 btrfsic_get_block_type(state, block));
1713 } else {
1714 bytenr = block->logical_bytenr;
1715 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1716 printk(KERN_INFO
1717 "Written block @%llu (%s/%llu/%d)"
1718 " found in hash table, %c.\n",
1719 (unsigned long long)bytenr,
1720 dev_state->name,
1721 (unsigned long long)dev_bytenr,
1722 block->mirror_num,
1723 btrfsic_get_block_type(state, block));
1724 }
1725
1726 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1727 printk(KERN_INFO
1728 "ref_to_list: %cE, ref_from_list: %cE\n",
1729 list_empty(&block->ref_to_list) ? ' ' : '!',
1730 list_empty(&block->ref_from_list) ? ' ' : '!');
1731 if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
1732 printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
1733 " @%llu (%s/%llu/%d), old(gen=%llu,"
1734 " objectid=%llu, type=%d, offset=%llu),"
1735 " new(gen=%llu),"
1736 " which is referenced by most recent superblock"
1737 " (superblockgen=%llu)!\n",
1738 btrfsic_get_block_type(state, block),
1739 (unsigned long long)bytenr,
1740 dev_state->name,
1741 (unsigned long long)dev_bytenr,
1742 block->mirror_num,
1743 (unsigned long long)block->generation,
1744 (unsigned long long)
1745 le64_to_cpu(block->disk_key.objectid),
1746 block->disk_key.type,
1747 (unsigned long long)
1748 le64_to_cpu(block->disk_key.offset),
1749 (unsigned long long)
1750 le64_to_cpu(((struct btrfs_header *)
1751 mapped_data)->generation),
1752 (unsigned long long)
1753 state->max_superblock_generation);
1754 btrfsic_dump_tree(state);
1755 }
1756
1757 if (!block->is_iodone && !block->never_written) {
1758 printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
1759 " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu,"
1760 " which is not yet iodone!\n",
1761 btrfsic_get_block_type(state, block),
1762 (unsigned long long)bytenr,
1763 dev_state->name,
1764 (unsigned long long)dev_bytenr,
1765 block->mirror_num,
1766 (unsigned long long)block->generation,
1767 (unsigned long long)
1768 le64_to_cpu(((struct btrfs_header *)
1769 mapped_data)->generation));
1770 /* it would not be safe to go on */
1771 btrfsic_dump_tree(state);
1772 return;
1773 }
1774
1775 /*
1776 * Clear all references of this block. Do not free
1777		 * the block itself even if it is not referenced anymore
1778		 * because it still carries valuable information
1779 * like whether it was ever written and IO completed.
1780 */
1781 list_for_each_safe(elem_ref_to, tmp_ref_to,
1782 &block->ref_to_list) {
1783 struct btrfsic_block_link *const l =
1784 list_entry(elem_ref_to,
1785 struct btrfsic_block_link,
1786 node_ref_to);
1787
1788 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1789 btrfsic_print_rem_link(state, l);
1790 l->ref_cnt--;
1791 if (0 == l->ref_cnt) {
1792 list_del(&l->node_ref_to);
1793 list_del(&l->node_ref_from);
1794 btrfsic_block_link_hashtable_remove(l);
1795 btrfsic_block_link_free(l);
1796 }
1797 }
1798
1799 if (block->is_superblock)
1800 ret = btrfsic_map_superblock(state, bytenr, len,
1801 bdev, &block_ctx);
1802 else
1803 ret = btrfsic_map_block(state, bytenr, len,
1804 &block_ctx, 0);
1805 if (ret) {
1806 printk(KERN_INFO
1807 "btrfsic: btrfsic_map_block(root @%llu)"
1808 " failed!\n", (unsigned long long)bytenr);
1809 return;
1810 }
1811 block_ctx.data = mapped_data;
1812		/* the following is required in case of writes to mirrors,
1813		 * use the same device state that was used for the lookup */
1814 block_ctx.dev = dev_state;
1815 block_ctx.dev_bytenr = dev_bytenr;
1816
1817 if (is_metadata || state->include_extent_data) {
1818 block->never_written = 0;
1819 block->iodone_w_error = 0;
1820 if (NULL != bio) {
1821 block->is_iodone = 0;
1822 BUG_ON(NULL == bio_is_patched);
1823 if (!*bio_is_patched) {
1824 block->orig_bio_bh_private =
1825 bio->bi_private;
1826 block->orig_bio_bh_end_io.bio =
1827 bio->bi_end_io;
1828 block->next_in_same_bio = NULL;
1829 bio->bi_private = block;
1830 bio->bi_end_io = btrfsic_bio_end_io;
1831 *bio_is_patched = 1;
1832 } else {
1833 struct btrfsic_block *chained_block =
1834 (struct btrfsic_block *)
1835 bio->bi_private;
1836
1837 BUG_ON(NULL == chained_block);
1838 block->orig_bio_bh_private =
1839 chained_block->orig_bio_bh_private;
1840					block->orig_bio_bh_end_io.bio =
1841						chained_block->orig_bio_bh_end_io.bio;
1843 block->next_in_same_bio = chained_block;
1844 bio->bi_private = block;
1845 }
1846 } else if (NULL != bh) {
1847 block->is_iodone = 0;
1848 block->orig_bio_bh_private = bh->b_private;
1849 block->orig_bio_bh_end_io.bh = bh->b_end_io;
1850 block->next_in_same_bio = NULL;
1851 bh->b_private = block;
1852 bh->b_end_io = btrfsic_bh_end_io;
1853 } else {
1854 block->is_iodone = 1;
1855 block->orig_bio_bh_private = NULL;
1856 block->orig_bio_bh_end_io.bio = NULL;
1857 block->next_in_same_bio = NULL;
1858 }
1859 }
1860
1861 block->flush_gen = dev_state->last_flush_gen + 1;
1862 block->submit_bio_bh_rw = submit_bio_bh_rw;
1863 if (is_metadata) {
1864 block->logical_bytenr = bytenr;
1865 block->is_metadata = 1;
1866 if (block->is_superblock) {
1867 ret = btrfsic_process_written_superblock(
1868 state,
1869 block,
1870 (struct btrfs_super_block *)
1871 mapped_data);
1872 if (state->print_mask &
1873 BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
1874 printk(KERN_INFO
1875 "[after new superblock is written]:\n");
1876 btrfsic_dump_tree_sub(state, block, 0);
1877 }
1878 } else {
1879 block->mirror_num = 0; /* unknown */
1880 ret = btrfsic_process_metablock(
1881 state,
1882 block,
1883 &block_ctx,
1884 (struct btrfs_header *)
1885 block_ctx.data,
1886 0, 0);
1887 }
1888 if (ret)
1889 printk(KERN_INFO
1890 "btrfsic: btrfsic_process_metablock"
1891 "(root @%llu) failed!\n",
1892 (unsigned long long)dev_bytenr);
1893 } else {
1894 block->is_metadata = 0;
1895 block->mirror_num = 0; /* unknown */
1896 block->generation = BTRFSIC_GENERATION_UNKNOWN;
1897 if (!state->include_extent_data
1898 && list_empty(&block->ref_from_list)) {
1899 /*
1900 * disk block is overwritten with extent
1901 * data (not meta data) and we are configured
1902 * to not include extent data: take the
1903 * chance and free the block's memory
1904 */
1905 btrfsic_block_hashtable_remove(block);
1906 list_del(&block->all_blocks_node);
1907 btrfsic_block_free(block);
1908 }
1909 }
1910 btrfsic_release_block_ctx(&block_ctx);
1911 } else {
1912 /* block has not been found in hash table */
1913 u64 bytenr;
1914
1915 if (!is_metadata) {
1916 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1917 printk(KERN_INFO "Written block (%s/%llu/?)"
1918 " !found in hash table, D.\n",
1919 dev_state->name,
1920 (unsigned long long)dev_bytenr);
1921 if (!state->include_extent_data)
1922 return; /* ignore that written D block */
1923
1924 /* this is getting ugly for the
1925 * include_extent_data case... */
1926 bytenr = 0; /* unknown */
1927 block_ctx.start = bytenr;
1928 block_ctx.len = len;
1929 block_ctx.bh = NULL;
1930 } else {
1931 bytenr = le64_to_cpu(((struct btrfs_header *)
1932 mapped_data)->bytenr);
1933 btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
1934 dev_bytenr,
1935 mapped_data);
1936 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1937 printk(KERN_INFO
1938 "Written block @%llu (%s/%llu/?)"
1939 " !found in hash table, M.\n",
1940 (unsigned long long)bytenr,
1941 dev_state->name,
1942 (unsigned long long)dev_bytenr);
1943
1944 ret = btrfsic_map_block(state, bytenr, len, &block_ctx,
1945 0);
1946 if (ret) {
1947 printk(KERN_INFO
1948 "btrfsic: btrfsic_map_block(root @%llu)"
1949 " failed!\n",
1950 (unsigned long long)dev_bytenr);
1951 return;
1952 }
1953 }
1954 block_ctx.data = mapped_data;
1955		/* the following is required in case of writes to mirrors,
1956		 * use the same device state that was used for the lookup */
1957 block_ctx.dev = dev_state;
1958 block_ctx.dev_bytenr = dev_bytenr;
1959
1960 block = btrfsic_block_alloc();
1961 if (NULL == block) {
1962 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
1963 btrfsic_release_block_ctx(&block_ctx);
1964 return;
1965 }
1966 block->dev_state = dev_state;
1967 block->dev_bytenr = dev_bytenr;
1968 block->logical_bytenr = bytenr;
1969 block->is_metadata = is_metadata;
1970 block->never_written = 0;
1971 block->iodone_w_error = 0;
1972 block->mirror_num = 0; /* unknown */
1973 block->flush_gen = dev_state->last_flush_gen + 1;
1974 block->submit_bio_bh_rw = submit_bio_bh_rw;
1975 if (NULL != bio) {
1976 block->is_iodone = 0;
1977 BUG_ON(NULL == bio_is_patched);
1978 if (!*bio_is_patched) {
1979 block->orig_bio_bh_private = bio->bi_private;
1980 block->orig_bio_bh_end_io.bio = bio->bi_end_io;
1981 block->next_in_same_bio = NULL;
1982 bio->bi_private = block;
1983 bio->bi_end_io = btrfsic_bio_end_io;
1984 *bio_is_patched = 1;
1985 } else {
1986 struct btrfsic_block *chained_block =
1987 (struct btrfsic_block *)
1988 bio->bi_private;
1989
1990 BUG_ON(NULL == chained_block);
1991 block->orig_bio_bh_private =
1992 chained_block->orig_bio_bh_private;
1993 block->orig_bio_bh_end_io.bio =
1994 chained_block->orig_bio_bh_end_io.bio;
1995 block->next_in_same_bio = chained_block;
1996 bio->bi_private = block;
1997 }
1998 } else if (NULL != bh) {
1999 block->is_iodone = 0;
2000 block->orig_bio_bh_private = bh->b_private;
2001 block->orig_bio_bh_end_io.bh = bh->b_end_io;
2002 block->next_in_same_bio = NULL;
2003 bh->b_private = block;
2004 bh->b_end_io = btrfsic_bh_end_io;
2005 } else {
2006 block->is_iodone = 1;
2007 block->orig_bio_bh_private = NULL;
2008 block->orig_bio_bh_end_io.bio = NULL;
2009 block->next_in_same_bio = NULL;
2010 }
2011 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2012 printk(KERN_INFO
2013 "New written %c-block @%llu (%s/%llu/%d)\n",
2014 is_metadata ? 'M' : 'D',
2015 (unsigned long long)block->logical_bytenr,
2016 block->dev_state->name,
2017 (unsigned long long)block->dev_bytenr,
2018 block->mirror_num);
2019 list_add(&block->all_blocks_node, &state->all_blocks_list);
2020 btrfsic_block_hashtable_add(block, &state->block_hashtable);
2021
2022 if (is_metadata) {
2023 ret = btrfsic_process_metablock(state, block,
2024 &block_ctx,
2025 (struct btrfs_header *)
2026 block_ctx.data, 0, 0);
2027 if (ret)
2028 printk(KERN_INFO
2029 "btrfsic: process_metablock(root @%llu)"
2030 " failed!\n",
2031 (unsigned long long)dev_bytenr);
2032 }
2033 btrfsic_release_block_ctx(&block_ctx);
2034 }
2035}
2036
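/*
 * Note on the bio patching scheme: when several blocks share one bio,
 * each btrfsic_block saves the previous bi_private/bi_end_io pair and
 * the blocks are chained via ->next_in_same_bio, with bio->bi_private
 * always pointing at the most recently added block, roughly:
 *
 *	bio->bi_private -> block_n -> ... -> block_1 -> original private
 *
 * The handler below restores the original fields first and then walks
 * the whole chain, marking every block in it as iodone.
 */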
2037static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
2038{
2039 struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
2040 int iodone_w_error;
2041
2042	/* mutex is not held! This is not safe if IO is not yet completed
2043	 * on umount */
2044 iodone_w_error = 0;
2045 if (bio_error_status)
2046 iodone_w_error = 1;
2047
2048 BUG_ON(NULL == block);
2049 bp->bi_private = block->orig_bio_bh_private;
2050 bp->bi_end_io = block->orig_bio_bh_end_io.bio;
2051
2052 do {
2053 struct btrfsic_block *next_block;
2054 struct btrfsic_dev_state *const dev_state = block->dev_state;
2055
2056 if ((dev_state->state->print_mask &
2057 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2058 printk(KERN_INFO
2059 "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
2060 bio_error_status,
2061 btrfsic_get_block_type(dev_state->state, block),
2062 (unsigned long long)block->logical_bytenr,
2063 dev_state->name,
2064 (unsigned long long)block->dev_bytenr,
2065 block->mirror_num);
2066 next_block = block->next_in_same_bio;
2067 block->iodone_w_error = iodone_w_error;
2068 if (block->submit_bio_bh_rw & REQ_FLUSH) {
2069 dev_state->last_flush_gen++;
2070 if ((dev_state->state->print_mask &
2071 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2072 printk(KERN_INFO
2073 "bio_end_io() new %s flush_gen=%llu\n",
2074 dev_state->name,
2075 (unsigned long long)
2076 dev_state->last_flush_gen);
2077 }
2078 if (block->submit_bio_bh_rw & REQ_FUA)
2079 block->flush_gen = 0; /* FUA completed means block is
2080 * on disk */
2081 block->is_iodone = 1; /* for FLUSH, this releases the block */
2082 block = next_block;
2083 } while (NULL != block);
2084
2085 bp->bi_end_io(bp, bio_error_status);
2086}
2087
2088static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
2089{
2090 struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
2091 int iodone_w_error = !uptodate;
2092 struct btrfsic_dev_state *dev_state;
2093
2094 BUG_ON(NULL == block);
2095 dev_state = block->dev_state;
2096 if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2097 printk(KERN_INFO
2098 "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
2099 iodone_w_error,
2100 btrfsic_get_block_type(dev_state->state, block),
2101 (unsigned long long)block->logical_bytenr,
2102 block->dev_state->name,
2103 (unsigned long long)block->dev_bytenr,
2104 block->mirror_num);
2105
2106 block->iodone_w_error = iodone_w_error;
2107 if (block->submit_bio_bh_rw & REQ_FLUSH) {
2108 dev_state->last_flush_gen++;
2109 if ((dev_state->state->print_mask &
2110 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2111 printk(KERN_INFO
2112 "bh_end_io() new %s flush_gen=%llu\n",
2113 dev_state->name,
2114 (unsigned long long)dev_state->last_flush_gen);
2115 }
2116 if (block->submit_bio_bh_rw & REQ_FUA)
2117 block->flush_gen = 0; /* FUA completed means block is on disk */
2118
2119 bh->b_private = block->orig_bio_bh_private;
2120 bh->b_end_io = block->orig_bio_bh_end_io.bh;
2121 block->is_iodone = 1; /* for FLUSH, this releases the block */
2122 bh->b_end_io(bh, uptodate);
2123}
2124
2125static int btrfsic_process_written_superblock(
2126 struct btrfsic_state *state,
2127 struct btrfsic_block *const superblock,
2128 struct btrfs_super_block *const super_hdr)
2129{
2130 int pass;
2131
2132 superblock->generation = btrfs_super_generation(super_hdr);
2133 if (!(superblock->generation > state->max_superblock_generation ||
2134 0 == state->max_superblock_generation)) {
2135 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
2136 printk(KERN_INFO
2137 "btrfsic: superblock @%llu (%s/%llu/%d)"
2138 " with old gen %llu <= %llu\n",
2139 (unsigned long long)superblock->logical_bytenr,
2140 superblock->dev_state->name,
2141 (unsigned long long)superblock->dev_bytenr,
2142 superblock->mirror_num,
2143 (unsigned long long)
2144 btrfs_super_generation(super_hdr),
2145 (unsigned long long)
2146 state->max_superblock_generation);
2147 } else {
2148 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
2149 printk(KERN_INFO
2150 "btrfsic: got new superblock @%llu (%s/%llu/%d)"
2151 " with new gen %llu > %llu\n",
2152 (unsigned long long)superblock->logical_bytenr,
2153 superblock->dev_state->name,
2154 (unsigned long long)superblock->dev_bytenr,
2155 superblock->mirror_num,
2156 (unsigned long long)
2157 btrfs_super_generation(super_hdr),
2158 (unsigned long long)
2159 state->max_superblock_generation);
2160
2161 state->max_superblock_generation =
2162 btrfs_super_generation(super_hdr);
2163 state->latest_superblock = superblock;
2164 }
2165
2166 for (pass = 0; pass < 3; pass++) {
2167 int ret;
2168 u64 next_bytenr;
2169 struct btrfsic_block *next_block;
2170 struct btrfsic_block_data_ctx tmp_next_block_ctx;
2171 struct btrfsic_block_link *l;
2172 int num_copies;
2173 int mirror_num;
2174 const char *additional_string = NULL;
2175 struct btrfs_disk_key tmp_disk_key;
2176
2177 tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
2178 tmp_disk_key.offset = 0;
2179
2180 switch (pass) {
2181 case 0:
2182 tmp_disk_key.objectid =
2183 cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
2184 additional_string = "root ";
2185 next_bytenr = btrfs_super_root(super_hdr);
2186 if (state->print_mask &
2187 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2188 printk(KERN_INFO "root@%llu\n",
2189 (unsigned long long)next_bytenr);
2190 break;
2191 case 1:
2192 tmp_disk_key.objectid =
2193 cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
2194 additional_string = "chunk ";
2195 next_bytenr = btrfs_super_chunk_root(super_hdr);
2196 if (state->print_mask &
2197 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2198 printk(KERN_INFO "chunk@%llu\n",
2199 (unsigned long long)next_bytenr);
2200 break;
2201 case 2:
2202 tmp_disk_key.objectid =
2203 cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
2204 additional_string = "log ";
2205 next_bytenr = btrfs_super_log_root(super_hdr);
2206 if (0 == next_bytenr)
2207 continue;
2208 if (state->print_mask &
2209 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2210 printk(KERN_INFO "log@%llu\n",
2211 (unsigned long long)next_bytenr);
2212 break;
2213 }
2214
2215 num_copies =
2216 btrfs_num_copies(&state->root->fs_info->mapping_tree,
2217 next_bytenr, PAGE_SIZE);
2218 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
2219 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
2220 (unsigned long long)next_bytenr, num_copies);
2221 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2222 int was_created;
2223
2224 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2225 printk(KERN_INFO
2226 "btrfsic_process_written_superblock("
2227 "mirror_num=%d)\n", mirror_num);
2228 ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
2229 &tmp_next_block_ctx,
2230 mirror_num);
2231 if (ret) {
2232 printk(KERN_INFO
2233 "btrfsic: btrfsic_map_block(@%llu,"
2234 " mirror=%d) failed!\n",
2235 (unsigned long long)next_bytenr,
2236 mirror_num);
2237 return -1;
2238 }
2239
2240 next_block = btrfsic_block_lookup_or_add(
2241 state,
2242 &tmp_next_block_ctx,
2243 additional_string,
2244 1, 0, 1,
2245 mirror_num,
2246 &was_created);
2247 if (NULL == next_block) {
2248 printk(KERN_INFO
2249 "btrfsic: error, kmalloc failed!\n");
2250 btrfsic_release_block_ctx(&tmp_next_block_ctx);
2251 return -1;
2252 }
2253
2254 next_block->disk_key = tmp_disk_key;
2255 if (was_created)
2256 next_block->generation =
2257 BTRFSIC_GENERATION_UNKNOWN;
2258 l = btrfsic_block_link_lookup_or_add(
2259 state,
2260 &tmp_next_block_ctx,
2261 next_block,
2262 superblock,
2263 BTRFSIC_GENERATION_UNKNOWN);
2264 btrfsic_release_block_ctx(&tmp_next_block_ctx);
2265 if (NULL == l)
2266 return -1;
2267 }
2268 }
2269
2270 if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) {
2271 WARN_ON(1);
2272 btrfsic_dump_tree(state);
2273 }
2274
2275 return 0;
2276}
2277
2278static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
2279 struct btrfsic_block *const block,
2280 int recursion_level)
2281{
2282 struct list_head *elem_ref_to;
2283 int ret = 0;
2284
2285 if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
2286 /*
2287 * Note that this situation can happen and does not
2288 * indicate an error in regular cases. It happens
2289 * when disk blocks are freed and later reused.
2290 * The check-integrity module is not aware of any
2291 * block free operations, it just recognizes block
2292 * write operations. Therefore it keeps the linkage
2293 * information for a block until a block is
2294 * rewritten. This can temporarily cause incorrect
2295		 * and even circular linkage information. This
2296 * causes no harm unless such blocks are referenced
2297 * by the most recent super block.
2298 */
2299 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2300 printk(KERN_INFO
2301 "btrfsic: abort cyclic linkage (case 1).\n");
2302
2303 return ret;
2304 }
2305
2306 /*
2307 * This algorithm is recursive because the amount of used stack
2308 * space is very small and the max recursion depth is limited.
2309 */
2310 list_for_each(elem_ref_to, &block->ref_to_list) {
2311 const struct btrfsic_block_link *const l =
2312 list_entry(elem_ref_to, struct btrfsic_block_link,
2313 node_ref_to);
2314
2315 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2316 printk(KERN_INFO
2317 "rl=%d, %c @%llu (%s/%llu/%d)"
2318 " %u* refers to %c @%llu (%s/%llu/%d)\n",
2319 recursion_level,
2320 btrfsic_get_block_type(state, block),
2321 (unsigned long long)block->logical_bytenr,
2322 block->dev_state->name,
2323 (unsigned long long)block->dev_bytenr,
2324 block->mirror_num,
2325 l->ref_cnt,
2326 btrfsic_get_block_type(state, l->block_ref_to),
2327 (unsigned long long)
2328 l->block_ref_to->logical_bytenr,
2329 l->block_ref_to->dev_state->name,
2330 (unsigned long long)l->block_ref_to->dev_bytenr,
2331 l->block_ref_to->mirror_num);
2332 if (l->block_ref_to->never_written) {
2333 printk(KERN_INFO "btrfs: attempt to write superblock"
2334 " which references block %c @%llu (%s/%llu/%d)"
2335 " which is never written!\n",
2336 btrfsic_get_block_type(state, l->block_ref_to),
2337 (unsigned long long)
2338 l->block_ref_to->logical_bytenr,
2339 l->block_ref_to->dev_state->name,
2340 (unsigned long long)l->block_ref_to->dev_bytenr,
2341 l->block_ref_to->mirror_num);
2342 ret = -1;
2343 } else if (!l->block_ref_to->is_iodone) {
2344 printk(KERN_INFO "btrfs: attempt to write superblock"
2345 " which references block %c @%llu (%s/%llu/%d)"
2346 " which is not yet iodone!\n",
2347 btrfsic_get_block_type(state, l->block_ref_to),
2348 (unsigned long long)
2349 l->block_ref_to->logical_bytenr,
2350 l->block_ref_to->dev_state->name,
2351 (unsigned long long)l->block_ref_to->dev_bytenr,
2352 l->block_ref_to->mirror_num);
2353 ret = -1;
2354 } else if (l->parent_generation !=
2355 l->block_ref_to->generation &&
2356 BTRFSIC_GENERATION_UNKNOWN !=
2357 l->parent_generation &&
2358 BTRFSIC_GENERATION_UNKNOWN !=
2359 l->block_ref_to->generation) {
2360 printk(KERN_INFO "btrfs: attempt to write superblock"
2361 " which references block %c @%llu (%s/%llu/%d)"
2362 " with generation %llu !="
2363 " parent generation %llu!\n",
2364 btrfsic_get_block_type(state, l->block_ref_to),
2365 (unsigned long long)
2366 l->block_ref_to->logical_bytenr,
2367 l->block_ref_to->dev_state->name,
2368 (unsigned long long)l->block_ref_to->dev_bytenr,
2369 l->block_ref_to->mirror_num,
2370 (unsigned long long)l->block_ref_to->generation,
2371 (unsigned long long)l->parent_generation);
2372 ret = -1;
2373 } else if (l->block_ref_to->flush_gen >
2374 l->block_ref_to->dev_state->last_flush_gen) {
2375 printk(KERN_INFO "btrfs: attempt to write superblock"
2376 " which references block %c @%llu (%s/%llu/%d)"
2377 " which is not flushed out of disk's write cache"
2378 " (block flush_gen=%llu,"
2379 " dev->flush_gen=%llu)!\n",
2380 btrfsic_get_block_type(state, l->block_ref_to),
2381 (unsigned long long)
2382 l->block_ref_to->logical_bytenr,
2383 l->block_ref_to->dev_state->name,
2384 (unsigned long long)l->block_ref_to->dev_bytenr,
2385 l->block_ref_to->mirror_num,
2386			       (unsigned long long)l->block_ref_to->flush_gen,
2387 (unsigned long long)
2388 l->block_ref_to->dev_state->last_flush_gen);
2389 ret = -1;
2390 } else if (-1 == btrfsic_check_all_ref_blocks(state,
2391 l->block_ref_to,
2392 recursion_level +
2393 1)) {
2394 ret = -1;
2395 }
2396 }
2397
2398 return ret;
2399}
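
/*
 * In short: a superblock write is flagged (-1) when any block
 * reachable from it within 3 + BTRFS_MAX_LEVEL levels was never
 * written, still has IO in flight, carries a generation that
 * contradicts the parent pointer, or has not yet been flushed out of
 * the disk's write cache (its flush_gen is newer than the device's
 * last_flush_gen).
 */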
2400
2401static int btrfsic_is_block_ref_by_superblock(
2402 const struct btrfsic_state *state,
2403 const struct btrfsic_block *block,
2404 int recursion_level)
2405{
2406 struct list_head *elem_ref_from;
2407
2408 if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
2409 /* refer to comment at "abort cyclic linkage (case 1)" */
2410 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2411 printk(KERN_INFO
2412 "btrfsic: abort cyclic linkage (case 2).\n");
2413
2414 return 0;
2415 }
2416
2417 /*
2418 * This algorithm is recursive because the amount of used stack space
2419 * is very small and the max recursion depth is limited.
2420 */
2421 list_for_each(elem_ref_from, &block->ref_from_list) {
2422 const struct btrfsic_block_link *const l =
2423 list_entry(elem_ref_from, struct btrfsic_block_link,
2424 node_ref_from);
2425
2426 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2427 printk(KERN_INFO
2428 "rl=%d, %c @%llu (%s/%llu/%d)"
2429 " is ref %u* from %c @%llu (%s/%llu/%d)\n",
2430 recursion_level,
2431 btrfsic_get_block_type(state, block),
2432 (unsigned long long)block->logical_bytenr,
2433 block->dev_state->name,
2434 (unsigned long long)block->dev_bytenr,
2435 block->mirror_num,
2436 l->ref_cnt,
2437 btrfsic_get_block_type(state, l->block_ref_from),
2438 (unsigned long long)
2439 l->block_ref_from->logical_bytenr,
2440 l->block_ref_from->dev_state->name,
2441 (unsigned long long)
2442 l->block_ref_from->dev_bytenr,
2443 l->block_ref_from->mirror_num);
2444 if (l->block_ref_from->is_superblock &&
2445 state->latest_superblock->dev_bytenr ==
2446 l->block_ref_from->dev_bytenr &&
2447 state->latest_superblock->dev_state->bdev ==
2448 l->block_ref_from->dev_state->bdev)
2449 return 1;
2450 else if (btrfsic_is_block_ref_by_superblock(state,
2451 l->block_ref_from,
2452 recursion_level +
2453 1))
2454 return 1;
2455 }
2456
2457 return 0;
2458}
2459
2460static void btrfsic_print_add_link(const struct btrfsic_state *state,
2461 const struct btrfsic_block_link *l)
2462{
2463 printk(KERN_INFO
2464 "Add %u* link from %c @%llu (%s/%llu/%d)"
2465 " to %c @%llu (%s/%llu/%d).\n",
2466 l->ref_cnt,
2467 btrfsic_get_block_type(state, l->block_ref_from),
2468 (unsigned long long)l->block_ref_from->logical_bytenr,
2469 l->block_ref_from->dev_state->name,
2470 (unsigned long long)l->block_ref_from->dev_bytenr,
2471 l->block_ref_from->mirror_num,
2472 btrfsic_get_block_type(state, l->block_ref_to),
2473 (unsigned long long)l->block_ref_to->logical_bytenr,
2474 l->block_ref_to->dev_state->name,
2475 (unsigned long long)l->block_ref_to->dev_bytenr,
2476 l->block_ref_to->mirror_num);
2477}
2478
2479static void btrfsic_print_rem_link(const struct btrfsic_state *state,
2480 const struct btrfsic_block_link *l)
2481{
2482 printk(KERN_INFO
2483 "Rem %u* link from %c @%llu (%s/%llu/%d)"
2484 " to %c @%llu (%s/%llu/%d).\n",
2485 l->ref_cnt,
2486 btrfsic_get_block_type(state, l->block_ref_from),
2487 (unsigned long long)l->block_ref_from->logical_bytenr,
2488 l->block_ref_from->dev_state->name,
2489 (unsigned long long)l->block_ref_from->dev_bytenr,
2490 l->block_ref_from->mirror_num,
2491 btrfsic_get_block_type(state, l->block_ref_to),
2492 (unsigned long long)l->block_ref_to->logical_bytenr,
2493 l->block_ref_to->dev_state->name,
2494 (unsigned long long)l->block_ref_to->dev_bytenr,
2495 l->block_ref_to->mirror_num);
2496}
2497
2498static char btrfsic_get_block_type(const struct btrfsic_state *state,
2499 const struct btrfsic_block *block)
2500{
2501 if (block->is_superblock &&
2502 state->latest_superblock->dev_bytenr == block->dev_bytenr &&
2503 state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
2504 return 'S';
2505 else if (block->is_superblock)
2506 return 's';
2507 else if (block->is_metadata)
2508 return 'M';
2509 else
2510 return 'D';
2511}
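
/*
 * Legend for the block type characters used in the messages all over
 * this file: 'S' most recent superblock, 's' other superblock copy,
 * 'M' metadata, 'D' data.
 */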
2512
2513static void btrfsic_dump_tree(const struct btrfsic_state *state)
2514{
2515 btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
2516}
2517
2518static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
2519 const struct btrfsic_block *block,
2520 int indent_level)
2521{
2522 struct list_head *elem_ref_to;
2523 int indent_add;
2524 static char buf[80];
2525 int cursor_position;
2526
2527 /*
2528	 * It would be better to fill an on-stack buffer with a complete line
2529	 * and dump it at once when it is time to print a newline character.
2530 */
2531
2532 /*
2533 * This algorithm is recursive because the amount of used stack space
2534 * is very small and the max recursion depth is limited.
2535 */
2536	indent_add = snprintf(buf, sizeof(buf), "%c-%llu(%s/%llu/%d)",
2537 btrfsic_get_block_type(state, block),
2538 (unsigned long long)block->logical_bytenr,
2539 block->dev_state->name,
2540 (unsigned long long)block->dev_bytenr,
2541 block->mirror_num);
2542 if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
2543 printk("[...]\n");
2544 return;
2545 }
2546	printk("%s", buf);
2547 indent_level += indent_add;
2548 if (list_empty(&block->ref_to_list)) {
2549 printk("\n");
2550 return;
2551 }
2552 if (block->mirror_num > 1 &&
2553 !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
2554 printk(" [...]\n");
2555 return;
2556 }
2557
2558 cursor_position = indent_level;
2559 list_for_each(elem_ref_to, &block->ref_to_list) {
2560 const struct btrfsic_block_link *const l =
2561 list_entry(elem_ref_to, struct btrfsic_block_link,
2562 node_ref_to);
2563
2564 while (cursor_position < indent_level) {
2565 printk(" ");
2566 cursor_position++;
2567 }
2568 if (l->ref_cnt > 1)
2569 indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
2570 else
2571 indent_add = sprintf(buf, " --> ");
2572 if (indent_level + indent_add >
2573 BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
2574 printk("[...]\n");
2575 cursor_position = 0;
2576 continue;
2577 }
2578
2579		printk("%s", buf);
2580
2581 btrfsic_dump_tree_sub(state, l->block_ref_to,
2582 indent_level + indent_add);
2583 cursor_position = 0;
2584 }
2585}
2586
2587static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
2588 struct btrfsic_state *state,
2589 struct btrfsic_block_data_ctx *next_block_ctx,
2590 struct btrfsic_block *next_block,
2591 struct btrfsic_block *from_block,
2592 u64 parent_generation)
2593{
2594 struct btrfsic_block_link *l;
2595
2596 l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
2597 next_block_ctx->dev_bytenr,
2598 from_block->dev_state->bdev,
2599 from_block->dev_bytenr,
2600 &state->block_link_hashtable);
2601 if (NULL == l) {
2602 l = btrfsic_block_link_alloc();
2603 if (NULL == l) {
2604			printk(KERN_INFO
2605			       "btrfsic: error, kmalloc failed!\n");
2606 return NULL;
2607 }
2608
2609 l->block_ref_to = next_block;
2610 l->block_ref_from = from_block;
2611 l->ref_cnt = 1;
2612 l->parent_generation = parent_generation;
2613
2614 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2615 btrfsic_print_add_link(state, l);
2616
2617 list_add(&l->node_ref_to, &from_block->ref_to_list);
2618 list_add(&l->node_ref_from, &next_block->ref_from_list);
2619
2620 btrfsic_block_link_hashtable_add(l,
2621 &state->block_link_hashtable);
2622 } else {
2623 l->ref_cnt++;
2624 l->parent_generation = parent_generation;
2625 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2626 btrfsic_print_add_link(state, l);
2627 }
2628
2629 return l;
2630}
2631
2632static struct btrfsic_block *btrfsic_block_lookup_or_add(
2633 struct btrfsic_state *state,
2634 struct btrfsic_block_data_ctx *block_ctx,
2635 const char *additional_string,
2636 int is_metadata,
2637 int is_iodone,
2638 int never_written,
2639 int mirror_num,
2640 int *was_created)
2641{
2642 struct btrfsic_block *block;
2643
2644 block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
2645 block_ctx->dev_bytenr,
2646 &state->block_hashtable);
2647 if (NULL == block) {
2648 struct btrfsic_dev_state *dev_state;
2649
2650 block = btrfsic_block_alloc();
2651 if (NULL == block) {
2652 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
2653 return NULL;
2654 }
2655 dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev);
2656 if (NULL == dev_state) {
2657 printk(KERN_INFO
2658 "btrfsic: error, lookup dev_state failed!\n");
2659 btrfsic_block_free(block);
2660 return NULL;
2661 }
2662 block->dev_state = dev_state;
2663 block->dev_bytenr = block_ctx->dev_bytenr;
2664 block->logical_bytenr = block_ctx->start;
2665 block->is_metadata = is_metadata;
2666 block->is_iodone = is_iodone;
2667 block->never_written = never_written;
2668 block->mirror_num = mirror_num;
2669 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2670 printk(KERN_INFO
2671 "New %s%c-block @%llu (%s/%llu/%d)\n",
2672 additional_string,
2673 btrfsic_get_block_type(state, block),
2674 (unsigned long long)block->logical_bytenr,
2675 dev_state->name,
2676 (unsigned long long)block->dev_bytenr,
2677 mirror_num);
2678 list_add(&block->all_blocks_node, &state->all_blocks_list);
2679 btrfsic_block_hashtable_add(block, &state->block_hashtable);
2680 if (NULL != was_created)
2681 *was_created = 1;
2682 } else {
2683 if (NULL != was_created)
2684 *was_created = 0;
2685 }
2686
2687 return block;
2688}
2689
2690static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2691 u64 bytenr,
2692 struct btrfsic_dev_state *dev_state,
2693 u64 dev_bytenr, char *data)
2694{
2695 int num_copies;
2696 int mirror_num;
2697 int ret;
2698 struct btrfsic_block_data_ctx block_ctx;
2699 int match = 0;
2700
2701 num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
2702 bytenr, PAGE_SIZE);
2703
2704 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2705 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
2706 &block_ctx, mirror_num);
2707 if (ret) {
2708 printk(KERN_INFO "btrfsic:"
2709 " btrfsic_map_block(logical @%llu,"
2710 " mirror %d) failed!\n",
2711 (unsigned long long)bytenr, mirror_num);
2712 continue;
2713 }
2714
2715 if (dev_state->bdev == block_ctx.dev->bdev &&
2716 dev_bytenr == block_ctx.dev_bytenr) {
2717 match++;
2718 btrfsic_release_block_ctx(&block_ctx);
2719 break;
2720 }
2721 btrfsic_release_block_ctx(&block_ctx);
2722 }
2723
2724 if (!match) {
2725		printk(KERN_INFO "btrfs: attempt to write M-block which contains"
		       " logical bytenr that doesn't map to dev+physical bytenr"
		       " of submit_bio,"
2726 " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
2727 " phys_bytenr=%llu)!\n",
2728 (unsigned long long)bytenr, dev_state->name,
2729 (unsigned long long)dev_bytenr);
2730 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2731 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
2732 &block_ctx, mirror_num);
2733 if (ret)
2734 continue;
2735
2736 printk(KERN_INFO "Read logical bytenr @%llu maps to"
2737 " (%s/%llu/%d)\n",
2738 (unsigned long long)bytenr,
2739 block_ctx.dev->name,
2740 (unsigned long long)block_ctx.dev_bytenr,
2741 mirror_num);
2742 }
2743 WARN_ON(1);
2744 }
2745}
2746
2747static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
2748 struct block_device *bdev)
2749{
2750 struct btrfsic_dev_state *ds;
2751
2752 ds = btrfsic_dev_state_hashtable_lookup(bdev,
2753 &btrfsic_dev_state_hashtable);
2754 return ds;
2755}
2756
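/*
 * btrfsic_submit_bh() and btrfsic_submit_bio() below are drop-in
 * wrappers around submit_bh()/submit_bio(): they fall straight through
 * when the integrity checker is not initialized, otherwise they
 * inspect WRITE and FLUSH requests before forwarding the request to
 * the real submit function.
 */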
2757int btrfsic_submit_bh(int rw, struct buffer_head *bh)
2758{
2759 struct btrfsic_dev_state *dev_state;
2760
2761 if (!btrfsic_is_initialized)
2762 return submit_bh(rw, bh);
2763
2764 mutex_lock(&btrfsic_mutex);
2765 /* since btrfsic_submit_bh() might also be called before
2766 * btrfsic_mount(), this might return NULL */
2767 dev_state = btrfsic_dev_state_lookup(bh->b_bdev);
2768
2769 /* Only called to write the superblock (incl. FLUSH/FUA) */
2770 if (NULL != dev_state &&
2771 (rw & WRITE) && bh->b_size > 0) {
2772 u64 dev_bytenr;
2773
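		/* the superblock is read and written through 4 KiB
		 * buffer heads, so b_blocknr is in units of 4096 bytes */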
2774 dev_bytenr = 4096 * bh->b_blocknr;
2775 if (dev_state->state->print_mask &
2776 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2777 printk(KERN_INFO
2778 "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu),"
2779 " size=%lu, data=%p, bdev=%p)\n",
2780 rw, (unsigned long)bh->b_blocknr,
2781 (unsigned long long)dev_bytenr,
2782 (unsigned long)bh->b_size, bh->b_data,
2783 bh->b_bdev);
2784 btrfsic_process_written_block(dev_state, dev_bytenr,
2785 bh->b_data, bh->b_size, NULL,
2786 NULL, bh, rw);
2787 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2788 if (dev_state->state->print_mask &
2789 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2790 printk(KERN_INFO
2791			       "submit_bh(rw=0x%x) FLUSH, bdev=%p\n",
2792 rw, bh->b_bdev);
2793 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2794 if ((dev_state->state->print_mask &
2795 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2796 BTRFSIC_PRINT_MASK_VERBOSE)))
2797 printk(KERN_INFO
2798 "btrfsic_submit_bh(%s) with FLUSH"
2799 " but dummy block already in use"
2800 " (ignored)!\n",
2801 dev_state->name);
2802 } else {
2803 struct btrfsic_block *const block =
2804 &dev_state->dummy_block_for_bio_bh_flush;
2805
2806 block->is_iodone = 0;
2807 block->never_written = 0;
2808 block->iodone_w_error = 0;
2809 block->flush_gen = dev_state->last_flush_gen + 1;
2810 block->submit_bio_bh_rw = rw;
2811 block->orig_bio_bh_private = bh->b_private;
2812 block->orig_bio_bh_end_io.bh = bh->b_end_io;
2813 block->next_in_same_bio = NULL;
2814 bh->b_private = block;
2815 bh->b_end_io = btrfsic_bh_end_io;
2816 }
2817 }
2818 mutex_unlock(&btrfsic_mutex);
2819 return submit_bh(rw, bh);
2820}
2821
2822void btrfsic_submit_bio(int rw, struct bio *bio)
2823{
2824 struct btrfsic_dev_state *dev_state;
2825
2826 if (!btrfsic_is_initialized) {
2827 submit_bio(rw, bio);
2828 return;
2829 }
2830
2831 mutex_lock(&btrfsic_mutex);
2832 /* since btrfsic_submit_bio() is also called before
2833 * btrfsic_mount(), this might return NULL */
2834 dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
2835 if (NULL != dev_state &&
2836 (rw & WRITE) && NULL != bio->bi_io_vec) {
2837 unsigned int i;
2838 u64 dev_bytenr;
2839 int bio_is_patched;
2840
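		/* bi_sector is in units of 512 byte sectors,
		 * independent of the device block size */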
2841 dev_bytenr = 512 * bio->bi_sector;
2842 bio_is_patched = 0;
2843 if (dev_state->state->print_mask &
2844 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2845 printk(KERN_INFO
2846 "submit_bio(rw=0x%x, bi_vcnt=%u,"
2847 " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n",
2848 rw, bio->bi_vcnt, (unsigned long)bio->bi_sector,
2849 (unsigned long long)dev_bytenr,
2850 bio->bi_bdev);
2851
2852 for (i = 0; i < bio->bi_vcnt; i++) {
2853 u8 *mapped_data;
2854
2855 mapped_data = kmap(bio->bi_io_vec[i].bv_page);
2856 if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2857 BTRFSIC_PRINT_MASK_VERBOSE) ==
2858 (dev_state->state->print_mask &
2859 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2860 BTRFSIC_PRINT_MASK_VERBOSE)))
2861 printk(KERN_INFO
2862 "#%u: page=%p, mapped=%p, len=%u,"
2863 " offset=%u\n",
2864 i, bio->bi_io_vec[i].bv_page,
2865 mapped_data,
2866 bio->bi_io_vec[i].bv_len,
2867 bio->bi_io_vec[i].bv_offset);
2868 btrfsic_process_written_block(dev_state, dev_bytenr,
2869 mapped_data,
2870 bio->bi_io_vec[i].bv_len,
2871 bio, &bio_is_patched,
2872 NULL, rw);
2873 kunmap(bio->bi_io_vec[i].bv_page);
2874 dev_bytenr += bio->bi_io_vec[i].bv_len;
2875 }
2876 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2877 if (dev_state->state->print_mask &
2878 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2879 printk(KERN_INFO
2880			       "submit_bio(rw=0x%x) FLUSH, bdev=%p\n",
2881 rw, bio->bi_bdev);
2882 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2883 if ((dev_state->state->print_mask &
2884 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2885 BTRFSIC_PRINT_MASK_VERBOSE)))
2886 printk(KERN_INFO
2887 "btrfsic_submit_bio(%s) with FLUSH"
2888 " but dummy block already in use"
2889 " (ignored)!\n",
2890 dev_state->name);
2891 } else {
2892 struct btrfsic_block *const block =
2893 &dev_state->dummy_block_for_bio_bh_flush;
2894
2895 block->is_iodone = 0;
2896 block->never_written = 0;
2897 block->iodone_w_error = 0;
2898 block->flush_gen = dev_state->last_flush_gen + 1;
2899 block->submit_bio_bh_rw = rw;
2900 block->orig_bio_bh_private = bio->bi_private;
2901 block->orig_bio_bh_end_io.bio = bio->bi_end_io;
2902 block->next_in_same_bio = NULL;
2903 bio->bi_private = block;
2904 bio->bi_end_io = btrfsic_bio_end_io;
2905 }
2906 }
2907 mutex_unlock(&btrfsic_mutex);
2908
2909 submit_bio(rw, bio);
2910}
2911
2912int btrfsic_mount(struct btrfs_root *root,
2913 struct btrfs_fs_devices *fs_devices,
2914 int including_extent_data, u32 print_mask)
2915{
2916 int ret;
2917 struct btrfsic_state *state;
2918 struct list_head *dev_head = &fs_devices->devices;
2919 struct btrfs_device *device;
2920
2921 state = kzalloc(sizeof(*state), GFP_NOFS);
2922 if (NULL == state) {
2923 printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
2924 return -1;
2925 }
2926
2927 if (!btrfsic_is_initialized) {
2928 mutex_init(&btrfsic_mutex);
2929 btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
2930 btrfsic_is_initialized = 1;
2931 }
2932 mutex_lock(&btrfsic_mutex);
2933 state->root = root;
2934 state->print_mask = print_mask;
2935 state->include_extent_data = including_extent_data;
2936 state->csum_size = 0;
2937 INIT_LIST_HEAD(&state->all_blocks_list);
2938 btrfsic_block_hashtable_init(&state->block_hashtable);
2939 btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
2940 state->max_superblock_generation = 0;
2941 state->latest_superblock = NULL;
2942
2943 list_for_each_entry(device, dev_head, dev_list) {
2944 struct btrfsic_dev_state *ds;
2945 char *p;
2946
2947 if (!device->bdev || !device->name)
2948 continue;
2949
2950 ds = btrfsic_dev_state_alloc();
2951 if (NULL == ds) {
2952 printk(KERN_INFO
2953 "btrfs check-integrity: kmalloc() failed!\n");
2954 mutex_unlock(&btrfsic_mutex);
2955 return -1;
2956 }
2957 ds->bdev = device->bdev;
2958 ds->state = state;
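		/* derive a short printable name: keep only the part of
		 * the bdev name after the last '/', if there is one */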
2959 bdevname(ds->bdev, ds->name);
2960 ds->name[BDEVNAME_SIZE - 1] = '\0';
2961 for (p = ds->name; *p != '\0'; p++);
2962 while (p > ds->name && *p != '/')
2963 p--;
2964 if (*p == '/')
2965 p++;
2966 strlcpy(ds->name, p, sizeof(ds->name));
2967 btrfsic_dev_state_hashtable_add(ds,
2968 &btrfsic_dev_state_hashtable);
2969 }
2970
2971 ret = btrfsic_process_superblock(state, fs_devices);
2972 if (0 != ret) {
2973 mutex_unlock(&btrfsic_mutex);
2974 btrfsic_unmount(root, fs_devices);
2975 return ret;
2976 }
2977
2978 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
2979 btrfsic_dump_database(state);
2980 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
2981 btrfsic_dump_tree(state);
2982
2983 mutex_unlock(&btrfsic_mutex);
2984 return 0;
2985}
2986
2987void btrfsic_unmount(struct btrfs_root *root,
2988 struct btrfs_fs_devices *fs_devices)
2989{
2990 struct list_head *elem_all;
2991 struct list_head *tmp_all;
2992 struct btrfsic_state *state;
2993 struct list_head *dev_head = &fs_devices->devices;
2994 struct btrfs_device *device;
2995
2996 if (!btrfsic_is_initialized)
2997 return;
2998
2999 mutex_lock(&btrfsic_mutex);
3000
3001 state = NULL;
3002 list_for_each_entry(device, dev_head, dev_list) {
3003 struct btrfsic_dev_state *ds;
3004
3005 if (!device->bdev || !device->name)
3006 continue;
3007
3008 ds = btrfsic_dev_state_hashtable_lookup(
3009 device->bdev,
3010 &btrfsic_dev_state_hashtable);
3011 if (NULL != ds) {
3012 state = ds->state;
3013 btrfsic_dev_state_hashtable_remove(ds);
3014 btrfsic_dev_state_free(ds);
3015 }
3016 }
3017
3018 if (NULL == state) {
3019 printk(KERN_INFO
3020 "btrfsic: error, cannot find state information"
3021 " on umount!\n");
3022 mutex_unlock(&btrfsic_mutex);
3023 return;
3024 }
3025
3026 /*
3027 * Don't care about keeping the lists' state up to date,
3028 * just free all memory that was allocated dynamically.
3029 * Free the blocks and the block_links.
3030 */
3031 list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
3032 struct btrfsic_block *const b_all =
3033 list_entry(elem_all, struct btrfsic_block,
3034 all_blocks_node);
3035 struct list_head *elem_ref_to;
3036 struct list_head *tmp_ref_to;
3037
3038 list_for_each_safe(elem_ref_to, tmp_ref_to,
3039 &b_all->ref_to_list) {
3040 struct btrfsic_block_link *const l =
3041 list_entry(elem_ref_to,
3042 struct btrfsic_block_link,
3043 node_ref_to);
3044
3045 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
3046 btrfsic_print_rem_link(state, l);
3047
3048 l->ref_cnt--;
3049 if (0 == l->ref_cnt)
3050 btrfsic_block_link_free(l);
3051 }
3052
3053 if (b_all->is_iodone)
3054 btrfsic_block_free(b_all);
3055 else
3056 printk(KERN_INFO "btrfs: attempt to free %c-block"
3057 " @%llu (%s/%llu/%d) on umount which is"
3058 " not yet iodone!\n",
3059 btrfsic_get_block_type(state, b_all),
3060 (unsigned long long)b_all->logical_bytenr,
3061 b_all->dev_state->name,
3062 (unsigned long long)b_all->dev_bytenr,
3063 b_all->mirror_num);
3064 }
3065
3066 mutex_unlock(&btrfsic_mutex);
3067
3068 kfree(state);
3069}
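For context, a hedged sketch of how the mount path is expected to pair these two entry points; the open_ctree()/close_ctree() plumbing and error handling are assumed rather than shown in this hunk, and build on the CHECK_INTEGRITY mount options plus the check_integrity_print_mask field added to btrfs_fs_info elsewhere in this series:

	/* mount side (sketch): enable the checker once devices are open */
	if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
		ret = btrfsic_mount(tree_root, fs_devices,
				    btrfs_test_opt(tree_root,
					CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
				    1 : 0,
				    fs_info->check_integrity_print_mask);
		if (ret)
			printk(KERN_WARNING
			       "btrfs: failed to initialize integrity check module: %d\n",
			       ret);
	}

	/* unmount side (sketch): tear the state down again */
	if (btrfs_test_opt(tree_root, CHECK_INTEGRITY))
		btrfsic_unmount(tree_root, fs_devices);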
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
new file mode 100644
index 000000000000..8b59175cc502
--- /dev/null
+++ b/fs/btrfs/check-integrity.h
@@ -0,0 +1,36 @@
1/*
2 * Copyright (C) STRATO AG 2011. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#if !defined(__BTRFS_CHECK_INTEGRITY__)
20#define __BTRFS_CHECK_INTEGRITY__
21
22#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
23int btrfsic_submit_bh(int rw, struct buffer_head *bh);
24void btrfsic_submit_bio(int rw, struct bio *bio);
25#else
26#define btrfsic_submit_bh submit_bh
27#define btrfsic_submit_bio submit_bio
28#endif
29
30int btrfsic_mount(struct btrfs_root *root,
31 struct btrfs_fs_devices *fs_devices,
32 int including_extent_data, u32 print_mask);
33void btrfsic_unmount(struct btrfs_root *root,
34 struct btrfs_fs_devices *fs_devices);
35
36#endif
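When CONFIG_BTRFS_FS_CHECK_INTEGRITY is off, the two #defines above turn the wrappers into the stock block-layer calls, so call sites need no #ifdefs of their own. A minimal sketch of such a call site (write_dev_bh() is a hypothetical name used only for illustration):

/* sketch: goes through the integrity checker when it is compiled in,
 * and degrades to a plain submit_bh() otherwise */
static int write_dev_bh(struct buffer_head *bh)
{
	lock_buffer(bh);
	bh->b_end_io = end_buffer_write_sync;
	get_bh(bh);
	return btrfsic_submit_bh(WRITE_FLUSH_FUA, bh);
}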
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dede441bdeee..0639a555e16e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
240 240
241 cow = btrfs_alloc_free_block(trans, root, buf->len, 0, 241 cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
242 new_root_objectid, &disk_key, level, 242 new_root_objectid, &disk_key, level,
243 buf->start, 0); 243 buf->start, 0, 1);
244 if (IS_ERR(cow)) 244 if (IS_ERR(cow))
245 return PTR_ERR(cow); 245 return PTR_ERR(cow);
246 246
@@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
261 261
262 WARN_ON(btrfs_header_generation(buf) > trans->transid); 262 WARN_ON(btrfs_header_generation(buf) > trans->transid);
263 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) 263 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
264 ret = btrfs_inc_ref(trans, root, cow, 1); 264 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
265 else 265 else
266 ret = btrfs_inc_ref(trans, root, cow, 0); 266 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
267 267
268 if (ret) 268 if (ret)
269 return ret; 269 return ret;
@@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
350 if ((owner == root->root_key.objectid || 350 if ((owner == root->root_key.objectid ||
351 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && 351 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
352 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { 352 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
353 ret = btrfs_inc_ref(trans, root, buf, 1); 353 ret = btrfs_inc_ref(trans, root, buf, 1, 1);
354 BUG_ON(ret); 354 BUG_ON(ret);
355 355
356 if (root->root_key.objectid == 356 if (root->root_key.objectid ==
357 BTRFS_TREE_RELOC_OBJECTID) { 357 BTRFS_TREE_RELOC_OBJECTID) {
358 ret = btrfs_dec_ref(trans, root, buf, 0); 358 ret = btrfs_dec_ref(trans, root, buf, 0, 1);
359 BUG_ON(ret); 359 BUG_ON(ret);
360 ret = btrfs_inc_ref(trans, root, cow, 1); 360 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
361 BUG_ON(ret); 361 BUG_ON(ret);
362 } 362 }
363 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 363 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
365 365
366 if (root->root_key.objectid == 366 if (root->root_key.objectid ==
367 BTRFS_TREE_RELOC_OBJECTID) 367 BTRFS_TREE_RELOC_OBJECTID)
368 ret = btrfs_inc_ref(trans, root, cow, 1); 368 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
369 else 369 else
370 ret = btrfs_inc_ref(trans, root, cow, 0); 370 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
371 BUG_ON(ret); 371 BUG_ON(ret);
372 } 372 }
373 if (new_flags != 0) { 373 if (new_flags != 0) {
@@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
381 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 381 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
382 if (root->root_key.objectid == 382 if (root->root_key.objectid ==
383 BTRFS_TREE_RELOC_OBJECTID) 383 BTRFS_TREE_RELOC_OBJECTID)
384 ret = btrfs_inc_ref(trans, root, cow, 1); 384 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
385 else 385 else
386 ret = btrfs_inc_ref(trans, root, cow, 0); 386 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
387 BUG_ON(ret); 387 BUG_ON(ret);
388 ret = btrfs_dec_ref(trans, root, buf, 1); 388 ret = btrfs_dec_ref(trans, root, buf, 1, 1);
389 BUG_ON(ret); 389 BUG_ON(ret);
390 } 390 }
391 clean_tree_block(trans, root, buf); 391 clean_tree_block(trans, root, buf);
@@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
446 446
447 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, 447 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
448 root->root_key.objectid, &disk_key, 448 root->root_key.objectid, &disk_key,
449 level, search_start, empty_size); 449 level, search_start, empty_size, 1);
450 if (IS_ERR(cow)) 450 if (IS_ERR(cow))
451 return PTR_ERR(cow); 451 return PTR_ERR(cow);
452 452
@@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
484 rcu_assign_pointer(root->node, cow); 484 rcu_assign_pointer(root->node, cow);
485 485
486 btrfs_free_tree_block(trans, root, buf, parent_start, 486 btrfs_free_tree_block(trans, root, buf, parent_start,
487 last_ref); 487 last_ref, 1);
488 free_extent_buffer(buf); 488 free_extent_buffer(buf);
489 add_root_to_dirty_list(root); 489 add_root_to_dirty_list(root);
490 } else { 490 } else {
@@ -500,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
500 trans->transid); 500 trans->transid);
501 btrfs_mark_buffer_dirty(parent); 501 btrfs_mark_buffer_dirty(parent);
502 btrfs_free_tree_block(trans, root, buf, parent_start, 502 btrfs_free_tree_block(trans, root, buf, parent_start,
503 last_ref); 503 last_ref, 1);
504 } 504 }
505 if (unlock_orig) 505 if (unlock_orig)
506 btrfs_tree_unlock(buf); 506 btrfs_tree_unlock(buf);
@@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
957 free_extent_buffer(mid); 957 free_extent_buffer(mid);
958 958
959 root_sub_used(root, mid->len); 959 root_sub_used(root, mid->len);
960 btrfs_free_tree_block(trans, root, mid, 0, 1); 960 btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
961 /* once for the root ptr */ 961 /* once for the root ptr */
962 free_extent_buffer(mid); 962 free_extent_buffer(mid);
963 return 0; 963 return 0;
@@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1015 if (wret) 1015 if (wret)
1016 ret = wret; 1016 ret = wret;
1017 root_sub_used(root, right->len); 1017 root_sub_used(root, right->len);
1018 btrfs_free_tree_block(trans, root, right, 0, 1); 1018 btrfs_free_tree_block(trans, root, right, 0, 1, 0);
1019 free_extent_buffer(right); 1019 free_extent_buffer(right);
1020 right = NULL; 1020 right = NULL;
1021 } else { 1021 } else {
@@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1055 if (wret) 1055 if (wret)
1056 ret = wret; 1056 ret = wret;
1057 root_sub_used(root, mid->len); 1057 root_sub_used(root, mid->len);
1058 btrfs_free_tree_block(trans, root, mid, 0, 1); 1058 btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
1059 free_extent_buffer(mid); 1059 free_extent_buffer(mid);
1060 mid = NULL; 1060 mid = NULL;
1061 } else { 1061 } else {
@@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2089 2089
2090 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2090 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2091 root->root_key.objectid, &lower_key, 2091 root->root_key.objectid, &lower_key,
2092 level, root->node->start, 0); 2092 level, root->node->start, 0, 0);
2093 if (IS_ERR(c)) 2093 if (IS_ERR(c))
2094 return PTR_ERR(c); 2094 return PTR_ERR(c);
2095 2095
@@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2216 2216
2217 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2217 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2218 root->root_key.objectid, 2218 root->root_key.objectid,
2219 &disk_key, level, c->start, 0); 2219 &disk_key, level, c->start, 0, 0);
2220 if (IS_ERR(split)) 2220 if (IS_ERR(split))
2221 return PTR_ERR(split); 2221 return PTR_ERR(split);
2222 2222
@@ -2970,7 +2970,7 @@ again:
2970 2970
2971 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 2971 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2972 root->root_key.objectid, 2972 root->root_key.objectid,
2973 &disk_key, 0, l->start, 0); 2973 &disk_key, 0, l->start, 0, 0);
2974 if (IS_ERR(right)) 2974 if (IS_ERR(right))
2975 return PTR_ERR(right); 2975 return PTR_ERR(right);
2976 2976
@@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3781 3781
3782 root_sub_used(root, leaf->len); 3782 root_sub_used(root, leaf->len);
3783 3783
3784 btrfs_free_tree_block(trans, root, leaf, 0, 1); 3784 btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
3785 return 0; 3785 return 0;
3786} 3786}
3787/* 3787/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 67385033323d..27ebe61d3ccc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -86,6 +86,9 @@ struct btrfs_ordered_sum;
86/* holds checksums of all the data extents */ 86/* holds checksums of all the data extents */
87#define BTRFS_CSUM_TREE_OBJECTID 7ULL 87#define BTRFS_CSUM_TREE_OBJECTID 7ULL
88 88
89/* for storing balance parameters in the root tree */
90#define BTRFS_BALANCE_OBJECTID -4ULL
91
 89/* orphan objectid for tracking unlinked/truncated files */ 92/* orphan objectid for tracking unlinked/truncated files */
90#define BTRFS_ORPHAN_OBJECTID -5ULL 93#define BTRFS_ORPHAN_OBJECTID -5ULL
91 94
@@ -692,6 +695,54 @@ struct btrfs_root_ref {
692 __le16 name_len; 695 __le16 name_len;
693} __attribute__ ((__packed__)); 696} __attribute__ ((__packed__));
694 697
698struct btrfs_disk_balance_args {
699 /*
700 * profiles to operate on, single is denoted by
701 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
702 */
703 __le64 profiles;
704
705 /* usage filter */
706 __le64 usage;
707
708 /* devid filter */
709 __le64 devid;
710
711 /* devid subset filter [pstart..pend) */
712 __le64 pstart;
713 __le64 pend;
714
715 /* btrfs virtual address space subset filter [vstart..vend) */
716 __le64 vstart;
717 __le64 vend;
718
719 /*
720 * profile to convert to, single is denoted by
721 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
722 */
723 __le64 target;
724
725 /* BTRFS_BALANCE_ARGS_* */
726 __le64 flags;
727
728 __le64 unused[8];
729} __attribute__ ((__packed__));
730
731/*
732 * store balance parameters to disk so that balance can be properly
733 * resumed after crash or unmount
734 */
735struct btrfs_balance_item {
736 /* BTRFS_BALANCE_* */
737 __le64 flags;
738
739 struct btrfs_disk_balance_args data;
740 struct btrfs_disk_balance_args meta;
741 struct btrfs_disk_balance_args sys;
742
743 __le64 unused[4];
744} __attribute__ ((__packed__));
745
695#define BTRFS_FILE_EXTENT_INLINE 0 746#define BTRFS_FILE_EXTENT_INLINE 0
696#define BTRFS_FILE_EXTENT_REG 1 747#define BTRFS_FILE_EXTENT_REG 1
697#define BTRFS_FILE_EXTENT_PREALLOC 2 748#define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -751,14 +802,32 @@ struct btrfs_csum_item {
751} __attribute__ ((__packed__)); 802} __attribute__ ((__packed__));
752 803
753/* different types of block groups (and chunks) */ 804/* different types of block groups (and chunks) */
754#define BTRFS_BLOCK_GROUP_DATA (1 << 0) 805#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
755#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) 806#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
756#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) 807#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
757#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) 808#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3)
758#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) 809#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
759#define BTRFS_BLOCK_GROUP_DUP (1 << 5) 810#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
760#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) 811#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
761#define BTRFS_NR_RAID_TYPES 5 812#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
813#define BTRFS_NR_RAID_TYPES 5
814
815#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
816 BTRFS_BLOCK_GROUP_SYSTEM | \
817 BTRFS_BLOCK_GROUP_METADATA)
818
819#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
820 BTRFS_BLOCK_GROUP_RAID1 | \
821 BTRFS_BLOCK_GROUP_DUP | \
822 BTRFS_BLOCK_GROUP_RAID10)
823/*
824 * We need a bit for restriper to be able to tell when chunks of type
825 * SINGLE are available. This "extended" profile format is used in
826 * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
827 * (on-disk). The corresponding on-disk bit in chunk.type is reserved
828 * to avoid remappings between two formats in future.
829 */
830#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
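To illustrate the split, a short sketch using only the defines above: the reduced profile bits live in the low word, while the extended SINGLE marker sits at bit 48, so masking with BTRFS_BLOCK_GROUP_RESERVED cleanly separates the in-memory format from what may reach disk:

	/* in-memory (extended): RAID1 chunks present, single chunks too */
	u64 avail = BTRFS_BLOCK_GROUP_RAID1 | BTRFS_AVAIL_ALLOC_BIT_SINGLE;

	/* on-disk chunk.type must not carry the reserved SINGLE bit */
	u64 on_disk = avail & ~BTRFS_BLOCK_GROUP_RESERVED;	/* == RAID1 */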
762 831
763struct btrfs_block_group_item { 832struct btrfs_block_group_item {
764 __le64 used; 833 __le64 used;
@@ -916,6 +985,7 @@ struct btrfs_block_group_cache {
916struct reloc_control; 985struct reloc_control;
917struct btrfs_device; 986struct btrfs_device;
918struct btrfs_fs_devices; 987struct btrfs_fs_devices;
988struct btrfs_balance_control;
919struct btrfs_delayed_root; 989struct btrfs_delayed_root;
920struct btrfs_fs_info { 990struct btrfs_fs_info {
921 u8 fsid[BTRFS_FSID_SIZE]; 991 u8 fsid[BTRFS_FSID_SIZE];
@@ -971,7 +1041,7 @@ struct btrfs_fs_info {
971 * is required instead of the faster short fsync log commits 1041 * is required instead of the faster short fsync log commits
972 */ 1042 */
973 u64 last_trans_log_full_commit; 1043 u64 last_trans_log_full_commit;
974 unsigned long mount_opt:20; 1044 unsigned long mount_opt:21;
975 unsigned long compress_type:4; 1045 unsigned long compress_type:4;
976 u64 max_inline; 1046 u64 max_inline;
977 u64 alloc_start; 1047 u64 alloc_start;
@@ -1132,12 +1202,23 @@ struct btrfs_fs_info {
1132 spinlock_t ref_cache_lock; 1202 spinlock_t ref_cache_lock;
1133 u64 total_ref_cache_size; 1203 u64 total_ref_cache_size;
1134 1204
1205 /*
1206 * these three are in extended format (availability of single
1207 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
1208 * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
1209 */
1135 u64 avail_data_alloc_bits; 1210 u64 avail_data_alloc_bits;
1136 u64 avail_metadata_alloc_bits; 1211 u64 avail_metadata_alloc_bits;
1137 u64 avail_system_alloc_bits; 1212 u64 avail_system_alloc_bits;
1138 u64 data_alloc_profile; 1213
1139 u64 metadata_alloc_profile; 1214 /* restriper state */
1140 u64 system_alloc_profile; 1215 spinlock_t balance_lock;
1216 struct mutex balance_mutex;
1217 atomic_t balance_running;
1218 atomic_t balance_pause_req;
1219 atomic_t balance_cancel_req;
1220 struct btrfs_balance_control *balance_ctl;
1221 wait_queue_head_t balance_wait_q;
1141 1222
1142 unsigned data_chunk_allocations; 1223 unsigned data_chunk_allocations;
1143 unsigned metadata_ratio; 1224 unsigned metadata_ratio;
@@ -1155,6 +1236,10 @@ struct btrfs_fs_info {
1155 int scrub_workers_refcnt; 1236 int scrub_workers_refcnt;
1156 struct btrfs_workers scrub_workers; 1237 struct btrfs_workers scrub_workers;
1157 1238
1239#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1240 u32 check_integrity_print_mask;
1241#endif
1242
1158 /* filesystem state */ 1243 /* filesystem state */
1159 u64 fs_state; 1244 u64 fs_state;
1160 1245
@@ -1383,6 +1468,8 @@ struct btrfs_ioctl_defrag_range_args {
1383#define BTRFS_DEV_ITEM_KEY 216 1468#define BTRFS_DEV_ITEM_KEY 216
1384#define BTRFS_CHUNK_ITEM_KEY 228 1469#define BTRFS_CHUNK_ITEM_KEY 228
1385 1470
1471#define BTRFS_BALANCE_ITEM_KEY 248
1472
1386/* 1473/*
1387 * string items are for debugging. They just store a short string of 1474 * string items are for debugging. They just store a short string of
1388 * data in the FS 1475 * data in the FS
@@ -1413,6 +1500,9 @@ struct btrfs_ioctl_defrag_range_args {
1413#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 1500#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
1414#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) 1501#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
1415#define BTRFS_MOUNT_RECOVERY (1 << 18) 1502#define BTRFS_MOUNT_RECOVERY (1 << 18)
1503#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
1504#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
1505#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
1416 1506
1417#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1507#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1418#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1508#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2077,8 +2167,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
2077BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, 2167BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
2078 num_devices, 64); 2168 num_devices, 64);
2079 2169
2080/* struct btrfs_super_block */ 2170/* struct btrfs_balance_item */
2171BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
2081 2172
2173static inline void btrfs_balance_data(struct extent_buffer *eb,
2174 struct btrfs_balance_item *bi,
2175 struct btrfs_disk_balance_args *ba)
2176{
2177 read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
2178}
2179
2180static inline void btrfs_set_balance_data(struct extent_buffer *eb,
2181 struct btrfs_balance_item *bi,
2182 struct btrfs_disk_balance_args *ba)
2183{
2184 write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
2185}
2186
2187static inline void btrfs_balance_meta(struct extent_buffer *eb,
2188 struct btrfs_balance_item *bi,
2189 struct btrfs_disk_balance_args *ba)
2190{
2191 read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
2192}
2193
2194static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
2195 struct btrfs_balance_item *bi,
2196 struct btrfs_disk_balance_args *ba)
2197{
2198 write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
2199}
2200
2201static inline void btrfs_balance_sys(struct extent_buffer *eb,
2202 struct btrfs_balance_item *bi,
2203 struct btrfs_disk_balance_args *ba)
2204{
2205 read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
2206}
2207
2208static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
2209 struct btrfs_balance_item *bi,
2210 struct btrfs_disk_balance_args *ba)
2211{
2212 write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
2213}
2214
2215static inline void
2216btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
2217 struct btrfs_disk_balance_args *disk)
2218{
2219 memset(cpu, 0, sizeof(*cpu));
2220
2221 cpu->profiles = le64_to_cpu(disk->profiles);
2222 cpu->usage = le64_to_cpu(disk->usage);
2223 cpu->devid = le64_to_cpu(disk->devid);
2224 cpu->pstart = le64_to_cpu(disk->pstart);
2225 cpu->pend = le64_to_cpu(disk->pend);
2226 cpu->vstart = le64_to_cpu(disk->vstart);
2227 cpu->vend = le64_to_cpu(disk->vend);
2228 cpu->target = le64_to_cpu(disk->target);
2229 cpu->flags = le64_to_cpu(disk->flags);
2230}
2231
2232static inline void
2233btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
2234 struct btrfs_balance_args *cpu)
2235{
2236 memset(disk, 0, sizeof(*disk));
2237
2238 disk->profiles = cpu_to_le64(cpu->profiles);
2239 disk->usage = cpu_to_le64(cpu->usage);
2240 disk->devid = cpu_to_le64(cpu->devid);
2241 disk->pstart = cpu_to_le64(cpu->pstart);
2242 disk->pend = cpu_to_le64(cpu->pend);
2243 disk->vstart = cpu_to_le64(cpu->vstart);
2244 disk->vend = cpu_to_le64(cpu->vend);
2245 disk->target = cpu_to_le64(cpu->target);
2246 disk->flags = cpu_to_le64(cpu->flags);
2247}
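btrfs_balance_args is the CPU-order counterpart declared for the ioctl interface (not shown in this hunk). A round trip through the two helpers is lossless for the named fields, and the memset() in each keeps the unused[] padding zeroed; a sketch:

	struct btrfs_balance_args cpu = { .devid = 1, .usage = 50 };
	struct btrfs_disk_balance_args disk;

	btrfs_cpu_balance_args_to_disk(&disk, &cpu);	/* cpu -> le64 */
	btrfs_disk_balance_args_to_cpu(&cpu, &disk);	/* le64 -> cpu */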
2248
2249/* struct btrfs_super_block */
2082BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); 2250BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
2083BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); 2251BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
2084BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, 2252BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@ -2196,7 +2364,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
2196 return btrfs_item_size(eb, e) - offset; 2364 return btrfs_item_size(eb, e) - offset;
2197} 2365}
2198 2366
2199static inline struct btrfs_root *btrfs_sb(struct super_block *sb) 2367static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
2200{ 2368{
2201 return sb->s_fs_info; 2369 return sb->s_fs_info;
2202} 2370}
@@ -2277,11 +2445,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
2277 struct btrfs_root *root, u32 blocksize, 2445 struct btrfs_root *root, u32 blocksize,
2278 u64 parent, u64 root_objectid, 2446 u64 parent, u64 root_objectid,
2279 struct btrfs_disk_key *key, int level, 2447 struct btrfs_disk_key *key, int level,
2280 u64 hint, u64 empty_size); 2448 u64 hint, u64 empty_size, int for_cow);
2281void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2449void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
2282 struct btrfs_root *root, 2450 struct btrfs_root *root,
2283 struct extent_buffer *buf, 2451 struct extent_buffer *buf,
2284 u64 parent, int last_ref); 2452 u64 parent, int last_ref, int for_cow);
2285struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2453struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
2286 struct btrfs_root *root, 2454 struct btrfs_root *root,
2287 u64 bytenr, u32 blocksize, 2455 u64 bytenr, u32 blocksize,
@@ -2301,17 +2469,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2301 u64 search_end, struct btrfs_key *ins, 2469 u64 search_end, struct btrfs_key *ins,
2302 u64 data); 2470 u64 data);
2303int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2471int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2304 struct extent_buffer *buf, int full_backref); 2472 struct extent_buffer *buf, int full_backref, int for_cow);
2305int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2473int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2306 struct extent_buffer *buf, int full_backref); 2474 struct extent_buffer *buf, int full_backref, int for_cow);
2307int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2475int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2308 struct btrfs_root *root, 2476 struct btrfs_root *root,
2309 u64 bytenr, u64 num_bytes, u64 flags, 2477 u64 bytenr, u64 num_bytes, u64 flags,
2310 int is_data); 2478 int is_data);
2311int btrfs_free_extent(struct btrfs_trans_handle *trans, 2479int btrfs_free_extent(struct btrfs_trans_handle *trans,
2312 struct btrfs_root *root, 2480 struct btrfs_root *root,
2313 u64 bytenr, u64 num_bytes, u64 parent, 2481 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
2314 u64 root_objectid, u64 owner, u64 offset); 2482 u64 owner, u64 offset, int for_cow);
2315 2483
2316int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2484int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2317int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 2485int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@ -2323,7 +2491,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2323int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2491int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2324 struct btrfs_root *root, 2492 struct btrfs_root *root,
2325 u64 bytenr, u64 num_bytes, u64 parent, 2493 u64 bytenr, u64 num_bytes, u64 parent,
2326 u64 root_objectid, u64 owner, u64 offset); 2494 u64 root_objectid, u64 owner, u64 offset, int for_cow);
2327 2495
2328int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2496int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2329 struct btrfs_root *root); 2497 struct btrfs_root *root);
@@ -2482,10 +2650,18 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2482} 2650}
2483 2651
2484int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2652int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2653static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
2654{
2655 ++p->slots[0];
2656 if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
2657 return btrfs_next_leaf(root, p);
2658 return 0;
2659}
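A sketch of the iteration idiom the new helper enables, advancing item by item and hopping to the next leaf transparently (path setup and error handling elided):

	while (1) {
		struct btrfs_key key;

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		/* ... process the item at path->slots[0] ... */

		ret = btrfs_next_item(root, path);
		if (ret)
			break;	/* > 0: past the last leaf, < 0: error */
	}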
2485int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2660int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2486int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2661int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2487void btrfs_drop_snapshot(struct btrfs_root *root, 2662void btrfs_drop_snapshot(struct btrfs_root *root,
2488 struct btrfs_block_rsv *block_rsv, int update_ref); 2663 struct btrfs_block_rsv *block_rsv, int update_ref,
2664 int for_reloc);
2489int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2665int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2490 struct btrfs_root *root, 2666 struct btrfs_root *root,
2491 struct extent_buffer *node, 2667 struct extent_buffer *node,
@@ -2500,6 +2676,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2500} 2676}
2501static inline void free_fs_info(struct btrfs_fs_info *fs_info) 2677static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2502{ 2678{
2679 kfree(fs_info->balance_ctl);
2503 kfree(fs_info->delayed_root); 2680 kfree(fs_info->delayed_root);
2504 kfree(fs_info->extent_root); 2681 kfree(fs_info->extent_root);
2505 kfree(fs_info->tree_root); 2682 kfree(fs_info->tree_root);
@@ -2510,6 +2687,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2510 kfree(fs_info->super_for_commit); 2687 kfree(fs_info->super_for_commit);
2511 kfree(fs_info); 2688 kfree(fs_info);
2512} 2689}
2690/**
2691 * profile_is_valid - tests whether a given profile is valid and reduced
2692 * @flags: profile to validate
2693 * @extended: if true @flags is treated as an extended profile
2694 */
2695static inline int profile_is_valid(u64 flags, int extended)
2696{
2697 u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
2698
2699 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
2700 if (extended)
2701 mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2702
2703 if (flags & mask)
2704 return 0;
2705 /* true if zero or exactly one bit set */
2706 return (flags & (~flags + 1)) == flags;
2707}
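The closing test uses the lowest-set-bit identity: flags & (~flags + 1) is flags & -flags, which isolates the lowest set bit, so the equality holds exactly when zero or one bit survives the masking. A standalone userspace sketch of the same check:

#include <assert.h>
#include <stdint.h>

/* same zero-or-exactly-one-bit test as profile_is_valid() above */
static int at_most_one_bit(uint64_t flags)
{
	return (flags & (~flags + 1)) == flags;
}

int main(void)
{
	assert(at_most_one_bit(0));			/* no bits set  */
	assert(at_most_one_bit(1ULL << 3));		/* one bit set  */
	assert(!at_most_one_bit((1ULL << 3) | (1ULL << 5)));	/* two bits */
	return 0;
}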
2513 2708
2514/* root-item.c */ 2709/* root-item.c */
2515int btrfs_find_root_ref(struct btrfs_root *tree_root, 2710int btrfs_find_root_ref(struct btrfs_root *tree_root,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 9c1eccc2c503..fe4cd0f1cef1 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -595,8 +595,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
595 595
596 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 596 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
598 if (!ret) 598 if (!ret) {
599 trace_btrfs_space_reservation(root->fs_info, "delayed_item",
600 item->key.objectid,
601 num_bytes, 1);
599 item->bytes_reserved = num_bytes; 602 item->bytes_reserved = num_bytes;
603 }
600 604
601 return ret; 605 return ret;
602} 606}
@@ -610,6 +614,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
610 return; 614 return;
611 615
612 rsv = &root->fs_info->delayed_block_rsv; 616 rsv = &root->fs_info->delayed_block_rsv;
617 trace_btrfs_space_reservation(root->fs_info, "delayed_item",
618 item->key.objectid, item->bytes_reserved,
619 0);
613 btrfs_block_rsv_release(root, rsv, 620 btrfs_block_rsv_release(root, rsv,
614 item->bytes_reserved); 621 item->bytes_reserved);
615} 622}
@@ -624,7 +631,7 @@ static int btrfs_delayed_inode_reserve_metadata(
624 struct btrfs_block_rsv *dst_rsv; 631 struct btrfs_block_rsv *dst_rsv;
625 u64 num_bytes; 632 u64 num_bytes;
626 int ret; 633 int ret;
627 int release = false; 634 bool release = false;
628 635
629 src_rsv = trans->block_rsv; 636 src_rsv = trans->block_rsv;
630 dst_rsv = &root->fs_info->delayed_block_rsv; 637 dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -651,8 +658,13 @@ static int btrfs_delayed_inode_reserve_metadata(
651 */ 658 */
652 if (ret == -EAGAIN) 659 if (ret == -EAGAIN)
653 ret = -ENOSPC; 660 ret = -ENOSPC;
654 if (!ret) 661 if (!ret) {
655 node->bytes_reserved = num_bytes; 662 node->bytes_reserved = num_bytes;
663 trace_btrfs_space_reservation(root->fs_info,
664 "delayed_inode",
665 btrfs_ino(inode),
666 num_bytes, 1);
667 }
656 return ret; 668 return ret;
657 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { 669 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
658 spin_lock(&BTRFS_I(inode)->lock); 670 spin_lock(&BTRFS_I(inode)->lock);
@@ -707,11 +719,17 @@ out:
707 * reservation here. I think it may be time for a documentation page on 719 * reservation here. I think it may be time for a documentation page on
 708 * how block rsvs work. 720 * how block rsvs work.
709 */ 721 */
710 if (!ret) 722 if (!ret) {
723 trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
724 btrfs_ino(inode), num_bytes, 1);
711 node->bytes_reserved = num_bytes; 725 node->bytes_reserved = num_bytes;
726 }
712 727
713 if (release) 728 if (release) {
729 trace_btrfs_space_reservation(root->fs_info, "delalloc",
730 btrfs_ino(inode), num_bytes, 0);
714 btrfs_block_rsv_release(root, src_rsv, num_bytes); 731 btrfs_block_rsv_release(root, src_rsv, num_bytes);
732 }
715 733
716 return ret; 734 return ret;
717} 735}
@@ -725,6 +743,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
725 return; 743 return;
726 744
727 rsv = &root->fs_info->delayed_block_rsv; 745 rsv = &root->fs_info->delayed_block_rsv;
746 trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
747 node->inode_id, node->bytes_reserved, 0);
728 btrfs_block_rsv_release(root, rsv, 748 btrfs_block_rsv_release(root, rsv,
729 node->bytes_reserved); 749 node->bytes_reserved);
730 node->bytes_reserved = 0; 750 node->bytes_reserved = 0;
@@ -1372,13 +1392,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1372 goto release_node; 1392 goto release_node;
1373 } 1393 }
1374 1394
1375 ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
1376 /*
1377 * we have reserved enough space when we start a new transaction,
1378 * so reserving metadata failure is impossible
1379 */
1380 BUG_ON(ret);
1381
1382 delayed_item->key.objectid = btrfs_ino(dir); 1395 delayed_item->key.objectid = btrfs_ino(dir);
1383 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); 1396 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
1384 delayed_item->key.offset = index; 1397 delayed_item->key.offset = index;
@@ -1391,6 +1404,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1391 dir_item->type = type; 1404 dir_item->type = type;
1392 memcpy((char *)(dir_item + 1), name, name_len); 1405 memcpy((char *)(dir_item + 1), name, name_len);
1393 1406
1407 ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
1408 /*
1409 * we have reserved enough space when we start a new transaction,
1410 * so reserving metadata failure is impossible
1411 */
1412 BUG_ON(ret);
1413
1414
1394 mutex_lock(&delayed_node->mutex); 1415 mutex_lock(&delayed_node->mutex);
1395 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); 1416 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
1396 if (unlikely(ret)) { 1417 if (unlikely(ret)) {
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 125cf76fcd08..66e4f29505a3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
101 return -1; 101 return -1;
102 if (ref1->type > ref2->type) 102 if (ref1->type > ref2->type)
103 return 1; 103 return 1;
104 /* merging of sequenced refs is not allowed */
105 if (ref1->seq < ref2->seq)
106 return -1;
107 if (ref1->seq > ref2->seq)
108 return 1;
104 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || 109 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
105 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { 110 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
106 return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), 111 return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -150,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
150 155
151/* 156/*
 152 * find a head entry based on bytenr. This returns the delayed ref 157 * find a head entry based on bytenr. This returns the delayed ref
153 * head if it was able to find one, or NULL if nothing was in that spot 158 * head if it was able to find one, or NULL if nothing was in that spot.
159 * If return_bigger is given, the next bigger entry is returned if no exact
160 * match is found.
154 */ 161 */
155static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, 162static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
156 u64 bytenr, 163 u64 bytenr,
157 struct btrfs_delayed_ref_node **last) 164 struct btrfs_delayed_ref_node **last,
165 int return_bigger)
158{ 166{
159 struct rb_node *n = root->rb_node; 167 struct rb_node *n;
160 struct btrfs_delayed_ref_node *entry; 168 struct btrfs_delayed_ref_node *entry;
161 int cmp; 169 int cmp = 0;
162 170
171again:
172 n = root->rb_node;
173 entry = NULL;
163 while (n) { 174 while (n) {
164 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 175 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
165 WARN_ON(!entry->in_tree); 176 WARN_ON(!entry->in_tree);
@@ -182,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
182 else 193 else
183 return entry; 194 return entry;
184 } 195 }
196 if (entry && return_bigger) {
197 if (cmp > 0) {
198 n = rb_next(&entry->rb_node);
199 if (!n)
200 n = rb_first(root);
201 entry = rb_entry(n, struct btrfs_delayed_ref_node,
202 rb_node);
203 bytenr = entry->bytenr;
204 return_bigger = 0;
205 goto again;
206 }
207 return entry;
208 }
185 return NULL; 209 return NULL;
186} 210}
187 211
@@ -209,6 +233,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
209 return 0; 233 return 0;
210} 234}
211 235
236int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
237 u64 seq)
238{
239 struct seq_list *elem;
240
241 assert_spin_locked(&delayed_refs->lock);
242 if (list_empty(&delayed_refs->seq_head))
243 return 0;
244
245 elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list);
246 if (seq >= elem->seq) {
247 pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n",
248 seq, elem->seq, delayed_refs);
249 return 1;
250 }
251 return 0;
252}
253
212int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 254int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
213 struct list_head *cluster, u64 start) 255 struct list_head *cluster, u64 start)
214{ 256{
@@ -223,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
223 node = rb_first(&delayed_refs->root); 265 node = rb_first(&delayed_refs->root);
224 } else { 266 } else {
225 ref = NULL; 267 ref = NULL;
226 find_ref_head(&delayed_refs->root, start, &ref); 268 find_ref_head(&delayed_refs->root, start + 1, &ref, 1);
227 if (ref) { 269 if (ref) {
228 struct btrfs_delayed_ref_node *tmp;
229
230 node = rb_prev(&ref->rb_node);
231 while (node) {
232 tmp = rb_entry(node,
233 struct btrfs_delayed_ref_node,
234 rb_node);
235 if (tmp->bytenr < start)
236 break;
237 ref = tmp;
238 node = rb_prev(&ref->rb_node);
239 }
240 node = &ref->rb_node; 270 node = &ref->rb_node;
241 } else 271 } else
242 node = rb_first(&delayed_refs->root); 272 node = rb_first(&delayed_refs->root);
@@ -390,7 +420,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
390 * this does all the dirty work in terms of maintaining the correct 420 * this does all the dirty work in terms of maintaining the correct
391 * overall modification count. 421 * overall modification count.
392 */ 422 */
393static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, 423static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info,
424 struct btrfs_trans_handle *trans,
394 struct btrfs_delayed_ref_node *ref, 425 struct btrfs_delayed_ref_node *ref,
395 u64 bytenr, u64 num_bytes, 426 u64 bytenr, u64 num_bytes,
396 int action, int is_data) 427 int action, int is_data)
@@ -437,6 +468,7 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
437 ref->action = 0; 468 ref->action = 0;
438 ref->is_head = 1; 469 ref->is_head = 1;
439 ref->in_tree = 1; 470 ref->in_tree = 1;
471 ref->seq = 0;
440 472
441 head_ref = btrfs_delayed_node_to_head(ref); 473 head_ref = btrfs_delayed_node_to_head(ref);
442 head_ref->must_insert_reserved = must_insert_reserved; 474 head_ref->must_insert_reserved = must_insert_reserved;
@@ -468,14 +500,17 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
468/* 500/*
469 * helper to insert a delayed tree ref into the rbtree. 501 * helper to insert a delayed tree ref into the rbtree.
470 */ 502 */
471static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, 503static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
504 struct btrfs_trans_handle *trans,
472 struct btrfs_delayed_ref_node *ref, 505 struct btrfs_delayed_ref_node *ref,
473 u64 bytenr, u64 num_bytes, u64 parent, 506 u64 bytenr, u64 num_bytes, u64 parent,
474 u64 ref_root, int level, int action) 507 u64 ref_root, int level, int action,
508 int for_cow)
475{ 509{
476 struct btrfs_delayed_ref_node *existing; 510 struct btrfs_delayed_ref_node *existing;
477 struct btrfs_delayed_tree_ref *full_ref; 511 struct btrfs_delayed_tree_ref *full_ref;
478 struct btrfs_delayed_ref_root *delayed_refs; 512 struct btrfs_delayed_ref_root *delayed_refs;
513 u64 seq = 0;
479 514
480 if (action == BTRFS_ADD_DELAYED_EXTENT) 515 if (action == BTRFS_ADD_DELAYED_EXTENT)
481 action = BTRFS_ADD_DELAYED_REF; 516 action = BTRFS_ADD_DELAYED_REF;
@@ -491,14 +526,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
491 ref->is_head = 0; 526 ref->is_head = 0;
492 ref->in_tree = 1; 527 ref->in_tree = 1;
493 528
529 if (need_ref_seq(for_cow, ref_root))
530 seq = inc_delayed_seq(delayed_refs);
531 ref->seq = seq;
532
494 full_ref = btrfs_delayed_node_to_tree_ref(ref); 533 full_ref = btrfs_delayed_node_to_tree_ref(ref);
495 if (parent) { 534 full_ref->parent = parent;
496 full_ref->parent = parent; 535 full_ref->root = ref_root;
536 if (parent)
497 ref->type = BTRFS_SHARED_BLOCK_REF_KEY; 537 ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
498 } else { 538 else
499 full_ref->root = ref_root;
500 ref->type = BTRFS_TREE_BLOCK_REF_KEY; 539 ref->type = BTRFS_TREE_BLOCK_REF_KEY;
501 }
502 full_ref->level = level; 540 full_ref->level = level;
503 541
504 trace_btrfs_delayed_tree_ref(ref, full_ref, action); 542 trace_btrfs_delayed_tree_ref(ref, full_ref, action);
@@ -522,15 +560,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
522/* 560/*
523 * helper to insert a delayed data ref into the rbtree. 561 * helper to insert a delayed data ref into the rbtree.
524 */ 562 */
525static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, 563static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info,
564 struct btrfs_trans_handle *trans,
526 struct btrfs_delayed_ref_node *ref, 565 struct btrfs_delayed_ref_node *ref,
527 u64 bytenr, u64 num_bytes, u64 parent, 566 u64 bytenr, u64 num_bytes, u64 parent,
528 u64 ref_root, u64 owner, u64 offset, 567 u64 ref_root, u64 owner, u64 offset,
529 int action) 568 int action, int for_cow)
530{ 569{
531 struct btrfs_delayed_ref_node *existing; 570 struct btrfs_delayed_ref_node *existing;
532 struct btrfs_delayed_data_ref *full_ref; 571 struct btrfs_delayed_data_ref *full_ref;
533 struct btrfs_delayed_ref_root *delayed_refs; 572 struct btrfs_delayed_ref_root *delayed_refs;
573 u64 seq = 0;
534 574
535 if (action == BTRFS_ADD_DELAYED_EXTENT) 575 if (action == BTRFS_ADD_DELAYED_EXTENT)
536 action = BTRFS_ADD_DELAYED_REF; 576 action = BTRFS_ADD_DELAYED_REF;
@@ -546,14 +586,18 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
546 ref->is_head = 0; 586 ref->is_head = 0;
547 ref->in_tree = 1; 587 ref->in_tree = 1;
548 588
589 if (need_ref_seq(for_cow, ref_root))
590 seq = inc_delayed_seq(delayed_refs);
591 ref->seq = seq;
592
549 full_ref = btrfs_delayed_node_to_data_ref(ref); 593 full_ref = btrfs_delayed_node_to_data_ref(ref);
550 if (parent) { 594 full_ref->parent = parent;
551 full_ref->parent = parent; 595 full_ref->root = ref_root;
596 if (parent)
552 ref->type = BTRFS_SHARED_DATA_REF_KEY; 597 ref->type = BTRFS_SHARED_DATA_REF_KEY;
553 } else { 598 else
554 full_ref->root = ref_root;
555 ref->type = BTRFS_EXTENT_DATA_REF_KEY; 599 ref->type = BTRFS_EXTENT_DATA_REF_KEY;
556 } 600
557 full_ref->objectid = owner; 601 full_ref->objectid = owner;
558 full_ref->offset = offset; 602 full_ref->offset = offset;
559 603
@@ -580,10 +624,12 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
580 * to make sure the delayed ref is eventually processed before this 624 * to make sure the delayed ref is eventually processed before this
581 * transaction commits. 625 * transaction commits.
582 */ 626 */
583int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, 627int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
628 struct btrfs_trans_handle *trans,
584 u64 bytenr, u64 num_bytes, u64 parent, 629 u64 bytenr, u64 num_bytes, u64 parent,
585 u64 ref_root, int level, int action, 630 u64 ref_root, int level, int action,
586 struct btrfs_delayed_extent_op *extent_op) 631 struct btrfs_delayed_extent_op *extent_op,
632 int for_cow)
587{ 633{
588 struct btrfs_delayed_tree_ref *ref; 634 struct btrfs_delayed_tree_ref *ref;
589 struct btrfs_delayed_ref_head *head_ref; 635 struct btrfs_delayed_ref_head *head_ref;
@@ -610,13 +656,17 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
610 * insert both the head node and the new ref without dropping 656 * insert both the head node and the new ref without dropping
611 * the spin lock 657 * the spin lock
612 */ 658 */
613 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, 659 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
614 action, 0); 660 num_bytes, action, 0);
615 BUG_ON(ret); 661 BUG_ON(ret);
616 662
617 ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, 663 ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
618 parent, ref_root, level, action); 664 num_bytes, parent, ref_root, level, action,
665 for_cow);
619 BUG_ON(ret); 666 BUG_ON(ret);
667 if (!need_ref_seq(for_cow, ref_root) &&
668 waitqueue_active(&delayed_refs->seq_wait))
669 wake_up(&delayed_refs->seq_wait);
620 spin_unlock(&delayed_refs->lock); 670 spin_unlock(&delayed_refs->lock);
621 return 0; 671 return 0;
622} 672}
@@ -624,11 +674,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
624/* 674/*
625 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. 675 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
626 */ 676 */
627int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, 677int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
678 struct btrfs_trans_handle *trans,
628 u64 bytenr, u64 num_bytes, 679 u64 bytenr, u64 num_bytes,
629 u64 parent, u64 ref_root, 680 u64 parent, u64 ref_root,
630 u64 owner, u64 offset, int action, 681 u64 owner, u64 offset, int action,
631 struct btrfs_delayed_extent_op *extent_op) 682 struct btrfs_delayed_extent_op *extent_op,
683 int for_cow)
632{ 684{
633 struct btrfs_delayed_data_ref *ref; 685 struct btrfs_delayed_data_ref *ref;
634 struct btrfs_delayed_ref_head *head_ref; 686 struct btrfs_delayed_ref_head *head_ref;
@@ -655,18 +707,23 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
655 * insert both the head node and the new ref without dropping 707 * insert both the head node and the new ref without dropping
656 * the spin lock 708 * the spin lock
657 */ 709 */
658 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, 710 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
659 action, 1); 711 num_bytes, action, 1);
660 BUG_ON(ret); 712 BUG_ON(ret);
661 713
662 ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, 714 ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
663 parent, ref_root, owner, offset, action); 715 num_bytes, parent, ref_root, owner, offset,
716 action, for_cow);
664 BUG_ON(ret); 717 BUG_ON(ret);
718 if (!need_ref_seq(for_cow, ref_root) &&
719 waitqueue_active(&delayed_refs->seq_wait))
720 wake_up(&delayed_refs->seq_wait);
665 spin_unlock(&delayed_refs->lock); 721 spin_unlock(&delayed_refs->lock);
666 return 0; 722 return 0;
667} 723}
668 724
669int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, 725int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
726 struct btrfs_trans_handle *trans,
670 u64 bytenr, u64 num_bytes, 727 u64 bytenr, u64 num_bytes,
671 struct btrfs_delayed_extent_op *extent_op) 728 struct btrfs_delayed_extent_op *extent_op)
672{ 729{
@@ -683,11 +740,13 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
683 delayed_refs = &trans->transaction->delayed_refs; 740 delayed_refs = &trans->transaction->delayed_refs;
684 spin_lock(&delayed_refs->lock); 741 spin_lock(&delayed_refs->lock);
685 742
686 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, 743 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
687 num_bytes, BTRFS_UPDATE_DELAYED_HEAD, 744 num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
688 extent_op->is_data); 745 extent_op->is_data);
689 BUG_ON(ret); 746 BUG_ON(ret);
690 747
748 if (waitqueue_active(&delayed_refs->seq_wait))
749 wake_up(&delayed_refs->seq_wait);
691 spin_unlock(&delayed_refs->lock); 750 spin_unlock(&delayed_refs->lock);
692 return 0; 751 return 0;
693} 752}
@@ -704,7 +763,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
704 struct btrfs_delayed_ref_root *delayed_refs; 763 struct btrfs_delayed_ref_root *delayed_refs;
705 764
706 delayed_refs = &trans->transaction->delayed_refs; 765 delayed_refs = &trans->transaction->delayed_refs;
707 ref = find_ref_head(&delayed_refs->root, bytenr, NULL); 766 ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0);
708 if (ref) 767 if (ref)
709 return btrfs_delayed_node_to_head(ref); 768 return btrfs_delayed_node_to_head(ref);
710 return NULL; 769 return NULL;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index e287e3b0eab0..d8f244d94925 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node {
33 /* the size of the extent */ 33 /* the size of the extent */
34 u64 num_bytes; 34 u64 num_bytes;
35 35
36 /* seq number to keep track of insertion order */
37 u64 seq;
38
36 /* ref count on this data structure */ 39 /* ref count on this data structure */
37 atomic_t refs; 40 atomic_t refs;
38 41
@@ -98,19 +101,15 @@ struct btrfs_delayed_ref_head {
98 101
99struct btrfs_delayed_tree_ref { 102struct btrfs_delayed_tree_ref {
100 struct btrfs_delayed_ref_node node; 103 struct btrfs_delayed_ref_node node;
101 union { 104 u64 root;
102 u64 root; 105 u64 parent;
103 u64 parent;
104 };
105 int level; 106 int level;
106}; 107};
107 108
108struct btrfs_delayed_data_ref { 109struct btrfs_delayed_data_ref {
109 struct btrfs_delayed_ref_node node; 110 struct btrfs_delayed_ref_node node;
110 union { 111 u64 root;
111 u64 root; 112 u64 parent;
112 u64 parent;
113 };
114 u64 objectid; 113 u64 objectid;
115 u64 offset; 114 u64 offset;
116}; 115};
@@ -140,6 +139,26 @@ struct btrfs_delayed_ref_root {
140 int flushing; 139 int flushing;
141 140
142 u64 run_delayed_start; 141 u64 run_delayed_start;
142
143 /*
144 * seq number of delayed refs. We need to know if a backref was being
145 * added before the currently processed ref or afterwards.
146 */
147 u64 seq;
148
149 /*
 150 * seq_head holds the seq numbers pinned by all backref walks currently
 151 * in flight. While walking backrefs (btrfs_find_all_roots, qgroups),
 152 * which might take some time, no newer ref must be processed, as it
 153 * might influence the outcome of the walk.
154 */
155 struct list_head seq_head;
156
157 /*
158 * when the only refs we have in the list must not be processed, we want
159 * to wait for more refs to show up or for the end of backref walking.
160 */
161 wait_queue_head_t seq_wait;
143}; 162};
144 163
145static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) 164static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -151,16 +170,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
151 } 170 }
152} 171}
153 172
154int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, 173int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
174 struct btrfs_trans_handle *trans,
155 u64 bytenr, u64 num_bytes, u64 parent, 175 u64 bytenr, u64 num_bytes, u64 parent,
156 u64 ref_root, int level, int action, 176 u64 ref_root, int level, int action,
157 struct btrfs_delayed_extent_op *extent_op); 177 struct btrfs_delayed_extent_op *extent_op,
158int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, 178 int for_cow);
179int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
180 struct btrfs_trans_handle *trans,
159 u64 bytenr, u64 num_bytes, 181 u64 bytenr, u64 num_bytes,
160 u64 parent, u64 ref_root, 182 u64 parent, u64 ref_root,
161 u64 owner, u64 offset, int action, 183 u64 owner, u64 offset, int action,
162 struct btrfs_delayed_extent_op *extent_op); 184 struct btrfs_delayed_extent_op *extent_op,
163int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, 185 int for_cow);
186int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
187 struct btrfs_trans_handle *trans,
164 u64 bytenr, u64 num_bytes, 188 u64 bytenr, u64 num_bytes,
165 struct btrfs_delayed_extent_op *extent_op); 189 struct btrfs_delayed_extent_op *extent_op);
166 190
@@ -170,6 +194,60 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
170 struct btrfs_delayed_ref_head *head); 194 struct btrfs_delayed_ref_head *head);
171int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 195int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
172 struct list_head *cluster, u64 search_start); 196 struct list_head *cluster, u64 search_start);
197
198struct seq_list {
199 struct list_head list;
200 u64 seq;
201};
202
203static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
204{
205 assert_spin_locked(&delayed_refs->lock);
206 ++delayed_refs->seq;
207 return delayed_refs->seq;
208}
209
210static inline void
211btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
212 struct seq_list *elem)
213{
214 assert_spin_locked(&delayed_refs->lock);
215 elem->seq = delayed_refs->seq;
216 list_add_tail(&elem->list, &delayed_refs->seq_head);
217}
218
219static inline void
220btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
221 struct seq_list *elem)
222{
223 spin_lock(&delayed_refs->lock);
224 list_del(&elem->list);
225 wake_up(&delayed_refs->seq_wait);
226 spin_unlock(&delayed_refs->lock);
227}
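A sketch of the expected bracket around a backref walk (the walk itself, e.g. btrfs_find_all_roots() as named in the comment above, is elided):

	struct seq_list elem;

	spin_lock(&delayed_refs->lock);
	btrfs_get_delayed_seq(delayed_refs, &elem);	/* pin current seq */
	spin_unlock(&delayed_refs->lock);

	/* ... walk backrefs; btrfs_check_delayed_seq() meanwhile holds
	 * back any delayed ref whose seq is at or above the oldest
	 * pinned seq ... */

	btrfs_put_delayed_seq(delayed_refs, &elem);	/* wakes seq_wait */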
228
229int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
230 u64 seq);
231
232/*
233 * delayed refs with a ref_seq > 0 must be held back during backref walking.
234 * this only applies to items in one of the fs-trees. for_cow items never need
235 * to be held back, so they won't get a ref_seq number.
236 */
237static inline int need_ref_seq(int for_cow, u64 rootid)
238{
239 if (for_cow)
240 return 0;
241
242 if (rootid == BTRFS_FS_TREE_OBJECTID)
243 return 1;
244
245 if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
246 return 1;
247
248 return 0;
249}
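Concretely, only refs generated against user-visible fs trees outside of COW get a sequence number; the (s64) casts keep the special negative objectids (e.g. BTRFS_ORPHAN_OBJECTID) from matching. A sketch with objectids from ctree.h:

	need_ref_seq(0, BTRFS_FS_TREE_OBJECTID);	/* 1: default fs tree */
	need_ref_seq(0, BTRFS_FIRST_FREE_OBJECTID);	/* 1: a subvolume     */
	need_ref_seq(0, BTRFS_EXTENT_TREE_OBJECTID);	/* 0: internal tree   */
	need_ref_seq(1, BTRFS_FS_TREE_OBJECTID);	/* 0: for_cow ref     */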
250
173/* 251/*
174 * a node might live in a head or a regular ref, this lets you 252 * a node might live in a head or a regular ref, this lets you
175 * test for the proper type to use. 253 * test for the proper type to use.
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f99a099a7747..811d9f918b1c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -43,6 +43,7 @@
43#include "tree-log.h" 43#include "tree-log.h"
44#include "free-space-cache.h" 44#include "free-space-cache.h"
45#include "inode-map.h" 45#include "inode-map.h"
46#include "check-integrity.h"
46 47
47static struct extent_io_ops btree_extent_io_ops; 48static struct extent_io_ops btree_extent_io_ops;
48static void end_workqueue_fn(struct btrfs_work *work); 49static void end_workqueue_fn(struct btrfs_work *work);
@@ -872,7 +873,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
872 873
873#ifdef CONFIG_MIGRATION 874#ifdef CONFIG_MIGRATION
874static int btree_migratepage(struct address_space *mapping, 875static int btree_migratepage(struct address_space *mapping,
875 struct page *newpage, struct page *page) 876 struct page *newpage, struct page *page,
877 enum migrate_mode mode)
876{ 878{
877 /* 879 /*
878 * we can't safely write a btree page from here, 880 * we can't safely write a btree page from here,
@@ -887,7 +889,7 @@ static int btree_migratepage(struct address_space *mapping,
887 if (page_has_private(page) && 889 if (page_has_private(page) &&
888 !try_to_release_page(page, GFP_KERNEL)) 890 !try_to_release_page(page, GFP_KERNEL))
889 return -EAGAIN; 891 return -EAGAIN;
890 return migrate_page(mapping, newpage, page); 892 return migrate_page(mapping, newpage, page, mode);
891} 893}
892#endif 894#endif
893 895
@@ -960,6 +962,13 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
960 tree = &BTRFS_I(page->mapping->host)->io_tree; 962 tree = &BTRFS_I(page->mapping->host)->io_tree;
961 map = &BTRFS_I(page->mapping->host)->extent_tree; 963 map = &BTRFS_I(page->mapping->host)->extent_tree;
962 964
965 /*
966 * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing
967 * slab allocation from alloc_extent_state down the callchain where
968 * it'd hit a BUG_ON as those flags are not allowed.
969 */
970 gfp_flags &= ~GFP_SLAB_BUG_MASK;
971
963 ret = try_release_extent_state(map, tree, page, gfp_flags); 972 ret = try_release_extent_state(map, tree, page, gfp_flags);
964 if (!ret) 973 if (!ret)
965 return 0; 974 return 0;
@@ -1142,7 +1151,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1142 root->orphan_item_inserted = 0; 1151 root->orphan_item_inserted = 0;
1143 root->orphan_cleanup_state = 0; 1152 root->orphan_cleanup_state = 0;
1144 1153
1145 root->fs_info = fs_info;
1146 root->objectid = objectid; 1154 root->objectid = objectid;
1147 root->last_trans = 0; 1155 root->last_trans = 0;
1148 root->highest_objectid = 0; 1156 root->highest_objectid = 0;
@@ -1216,6 +1224,14 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
1216 return 0; 1224 return 0;
1217} 1225}
1218 1226
1227static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
1228{
1229 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
1230 if (root)
1231 root->fs_info = fs_info;
1232 return root;
1233}
1234
1219static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, 1235static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1220 struct btrfs_fs_info *fs_info) 1236 struct btrfs_fs_info *fs_info)
1221{ 1237{
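btrfs_alloc_root() exists so that root->fs_info is valid from the instant a root is allocated, which is why the separate assignment in __setup_root() could be dropped above. A self-contained sketch of the pattern, with illustrative names rather than the kernel structures:

#include <stdio.h>
#include <stdlib.h>

/* Stamp the back-pointer at allocation time so no caller can ever
 * observe a root whose fs_info link is still NULL. */
struct fs_info { int id; };

struct root {
	struct fs_info *fs_info; /* valid from the moment of allocation */
	int objectid;            /* remaining fields start zeroed */
};

static struct root *alloc_root(struct fs_info *fs_info)
{
	struct root *root = calloc(1, sizeof(*root));
	if (root)
		root->fs_info = fs_info;
	return root;
}

int main(void)
{
	struct fs_info fi = { .id = 1 };
	/* as in open_ctree: many roots, one shared fs_info */
	struct root *tree_root = alloc_root(&fi);
	struct root *extent_root = alloc_root(&fi);

	if (!tree_root || !extent_root)
		return 1;
	printf("same fs_info: %d\n",
	       tree_root->fs_info == extent_root->fs_info);
	free(tree_root);
	free(extent_root);
	return 0;
}

Every later kzalloc of a root in this patch is converted to this helper, so no code path sees a half-initialized back-pointer.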
@@ -1223,7 +1239,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1223 struct btrfs_root *tree_root = fs_info->tree_root; 1239 struct btrfs_root *tree_root = fs_info->tree_root;
1224 struct extent_buffer *leaf; 1240 struct extent_buffer *leaf;
1225 1241
1226 root = kzalloc(sizeof(*root), GFP_NOFS); 1242 root = btrfs_alloc_root(fs_info);
1227 if (!root) 1243 if (!root)
1228 return ERR_PTR(-ENOMEM); 1244 return ERR_PTR(-ENOMEM);
1229 1245
@@ -1243,7 +1259,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1243 root->ref_cows = 0; 1259 root->ref_cows = 0;
1244 1260
1245 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 1261 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
1246 BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); 1262 BTRFS_TREE_LOG_OBJECTID, NULL,
1263 0, 0, 0, 0);
1247 if (IS_ERR(leaf)) { 1264 if (IS_ERR(leaf)) {
1248 kfree(root); 1265 kfree(root);
1249 return ERR_CAST(leaf); 1266 return ERR_CAST(leaf);
@@ -1317,7 +1334,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1317 u32 blocksize; 1334 u32 blocksize;
1318 int ret = 0; 1335 int ret = 0;
1319 1336
1320 root = kzalloc(sizeof(*root), GFP_NOFS); 1337 root = btrfs_alloc_root(fs_info);
1321 if (!root) 1338 if (!root)
1322 return ERR_PTR(-ENOMEM); 1339 return ERR_PTR(-ENOMEM);
1323 if (location->offset == (u64)-1) { 1340 if (location->offset == (u64)-1) {
@@ -1873,9 +1890,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1873} 1890}
1874 1891
1875 1892
1876struct btrfs_root *open_ctree(struct super_block *sb, 1893int open_ctree(struct super_block *sb,
1877 struct btrfs_fs_devices *fs_devices, 1894 struct btrfs_fs_devices *fs_devices,
1878 char *options) 1895 char *options)
1879{ 1896{
1880 u32 sectorsize; 1897 u32 sectorsize;
1881 u32 nodesize; 1898 u32 nodesize;
@@ -1887,8 +1904,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1887 struct btrfs_key location; 1904 struct btrfs_key location;
1888 struct buffer_head *bh; 1905 struct buffer_head *bh;
1889 struct btrfs_super_block *disk_super; 1906 struct btrfs_super_block *disk_super;
1890 struct btrfs_root *tree_root = btrfs_sb(sb); 1907 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1891 struct btrfs_fs_info *fs_info = tree_root->fs_info; 1908 struct btrfs_root *tree_root;
1892 struct btrfs_root *extent_root; 1909 struct btrfs_root *extent_root;
1893 struct btrfs_root *csum_root; 1910 struct btrfs_root *csum_root;
1894 struct btrfs_root *chunk_root; 1911 struct btrfs_root *chunk_root;
@@ -1899,16 +1916,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1899 int num_backups_tried = 0; 1916 int num_backups_tried = 0;
1900 int backup_index = 0; 1917 int backup_index = 0;
1901 1918
1902 extent_root = fs_info->extent_root = 1919 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
1903 kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 1920 extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
1904 csum_root = fs_info->csum_root = 1921 csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
1905 kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 1922 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
1906 chunk_root = fs_info->chunk_root = 1923 dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
1907 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1908 dev_root = fs_info->dev_root =
1909 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1910 1924
1911 if (!extent_root || !csum_root || !chunk_root || !dev_root) { 1925 if (!tree_root || !extent_root || !csum_root ||
1926 !chunk_root || !dev_root) {
1912 err = -ENOMEM; 1927 err = -ENOMEM;
1913 goto fail; 1928 goto fail;
1914 } 1929 }
@@ -1997,6 +2012,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1997 init_waitqueue_head(&fs_info->scrub_pause_wait); 2012 init_waitqueue_head(&fs_info->scrub_pause_wait);
1998 init_rwsem(&fs_info->scrub_super_lock); 2013 init_rwsem(&fs_info->scrub_super_lock);
1999 fs_info->scrub_workers_refcnt = 0; 2014 fs_info->scrub_workers_refcnt = 0;
2015#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2016 fs_info->check_integrity_print_mask = 0;
2017#endif
2018
2019 spin_lock_init(&fs_info->balance_lock);
2020 mutex_init(&fs_info->balance_mutex);
2021 atomic_set(&fs_info->balance_running, 0);
2022 atomic_set(&fs_info->balance_pause_req, 0);
2023 atomic_set(&fs_info->balance_cancel_req, 0);
2024 fs_info->balance_ctl = NULL;
2025 init_waitqueue_head(&fs_info->balance_wait_q);
2000 2026
2001 sb->s_blocksize = 4096; 2027 sb->s_blocksize = 4096;
2002 sb->s_blocksize_bits = blksize_bits(4096); 2028 sb->s_blocksize_bits = blksize_bits(4096);
@@ -2266,9 +2292,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
2266 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), 2292 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
2267 BTRFS_UUID_SIZE); 2293 BTRFS_UUID_SIZE);
2268 2294
2269 mutex_lock(&fs_info->chunk_mutex);
2270 ret = btrfs_read_chunk_tree(chunk_root); 2295 ret = btrfs_read_chunk_tree(chunk_root);
2271 mutex_unlock(&fs_info->chunk_mutex);
2272 if (ret) { 2296 if (ret) {
2273 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", 2297 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
2274 sb->s_id); 2298 sb->s_id);
@@ -2317,9 +2341,6 @@ retry_root_backup:
2317 2341
2318 fs_info->generation = generation; 2342 fs_info->generation = generation;
2319 fs_info->last_trans_committed = generation; 2343 fs_info->last_trans_committed = generation;
2320 fs_info->data_alloc_profile = (u64)-1;
2321 fs_info->metadata_alloc_profile = (u64)-1;
2322 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
2323 2344
2324 ret = btrfs_init_space_info(fs_info); 2345 ret = btrfs_init_space_info(fs_info);
2325 if (ret) { 2346 if (ret) {
@@ -2352,6 +2373,19 @@ retry_root_backup:
2352 btrfs_set_opt(fs_info->mount_opt, SSD); 2373 btrfs_set_opt(fs_info->mount_opt, SSD);
2353 } 2374 }
2354 2375
2376#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2377 if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
2378 ret = btrfsic_mount(tree_root, fs_devices,
2379 btrfs_test_opt(tree_root,
2380 CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
2381 1 : 0,
2382 fs_info->check_integrity_print_mask);
2383 if (ret)
2384 printk(KERN_WARNING "btrfs: failed to initialize"
2385 " integrity check module %s\n", sb->s_id);
2386 }
2387#endif
2388
2355 /* do not make disk changes in broken FS */ 2389 /* do not make disk changes in broken FS */
2356 if (btrfs_super_log_root(disk_super) != 0 && 2390 if (btrfs_super_log_root(disk_super) != 0 &&
2357 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { 2391 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
@@ -2367,7 +2401,7 @@ retry_root_backup:
2367 btrfs_level_size(tree_root, 2401 btrfs_level_size(tree_root,
2368 btrfs_super_log_root_level(disk_super)); 2402 btrfs_super_log_root_level(disk_super));
2369 2403
2370 log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 2404 log_tree_root = btrfs_alloc_root(fs_info);
2371 if (!log_tree_root) { 2405 if (!log_tree_root) {
2372 err = -ENOMEM; 2406 err = -ENOMEM;
2373 goto fail_trans_kthread; 2407 goto fail_trans_kthread;
@@ -2422,13 +2456,17 @@ retry_root_backup:
2422 if (!err) 2456 if (!err)
2423 err = btrfs_orphan_cleanup(fs_info->tree_root); 2457 err = btrfs_orphan_cleanup(fs_info->tree_root);
2424 up_read(&fs_info->cleanup_work_sem); 2458 up_read(&fs_info->cleanup_work_sem);
2459
2460 if (!err)
2461 err = btrfs_recover_balance(fs_info->tree_root);
2462
2425 if (err) { 2463 if (err) {
2426 close_ctree(tree_root); 2464 close_ctree(tree_root);
2427 return ERR_PTR(err); 2465 return err;
2428 } 2466 }
2429 } 2467 }
2430 2468
2431 return tree_root; 2469 return 0;
2432 2470
2433fail_trans_kthread: 2471fail_trans_kthread:
2434 kthread_stop(fs_info->transaction_kthread); 2472 kthread_stop(fs_info->transaction_kthread);
@@ -2474,8 +2512,7 @@ fail_srcu:
2474 cleanup_srcu_struct(&fs_info->subvol_srcu); 2512 cleanup_srcu_struct(&fs_info->subvol_srcu);
2475fail: 2513fail:
2476 btrfs_close_devices(fs_info->fs_devices); 2514 btrfs_close_devices(fs_info->fs_devices);
2477 free_fs_info(fs_info); 2515 return err;
2478 return ERR_PTR(err);
2479 2516
2480recovery_tree_root: 2517recovery_tree_root:
2481 if (!btrfs_test_opt(tree_root, RECOVERY)) 2518 if (!btrfs_test_opt(tree_root, RECOVERY))
@@ -2630,7 +2667,7 @@ static int write_dev_supers(struct btrfs_device *device,
2630 * we fua the first super. The others we allow 2667 * we fua the first super. The others we allow
2631 * to go down lazy. 2668 * to go down lazy.
2632 */ 2669 */
2633 ret = submit_bh(WRITE_FUA, bh); 2670 ret = btrfsic_submit_bh(WRITE_FUA, bh);
2634 if (ret) 2671 if (ret)
2635 errors++; 2672 errors++;
2636 } 2673 }
@@ -2707,7 +2744,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
2707 device->flush_bio = bio; 2744 device->flush_bio = bio;
2708 2745
2709 bio_get(bio); 2746 bio_get(bio);
2710 submit_bio(WRITE_FLUSH, bio); 2747 btrfsic_submit_bio(WRITE_FLUSH, bio);
2711 2748
2712 return 0; 2749 return 0;
2713} 2750}
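With the integrity checker in the tree, superblock and flush submissions are routed through btrfsic_submit_bh()/btrfsic_submit_bio() instead of calling submit_bh()/submit_bio() directly. A userspace sketch of the wrapper idea, assuming (as is common for such hooks) that the wrapper reduces to the plain call when the checker is compiled out; names are illustrative:

#include <stdio.h>

/* All write submissions funnel through one wrapper that can validate
 * the I/O before forwarding it to the real submission path. */
struct bio { int rw; };

static int submit_bio(int rw, struct bio *bio)
{
	printf("submitting rw=%d\n", rw);
	return 0;
}

#ifdef CONFIG_CHECK_INTEGRITY
static int checked_submit_bio(int rw, struct bio *bio)
{
	/* the integrity checker would inspect the buffer here */
	printf("integrity check before submit\n");
	return submit_bio(rw, bio);
}
#else
/* checker compiled out: the hook costs nothing */
#define checked_submit_bio(rw, bio) submit_bio(rw, bio)
#endif

int main(void)
{
	struct bio b = { .rw = 1 };
	return checked_submit_bio(b.rw, &b);
}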
@@ -2971,6 +3008,9 @@ int close_ctree(struct btrfs_root *root)
2971 fs_info->closing = 1; 3008 fs_info->closing = 1;
2972 smp_mb(); 3009 smp_mb();
2973 3010
3011 /* pause restriper - we want to resume on mount */
3012 btrfs_pause_balance(root->fs_info);
3013
2974 btrfs_scrub_cancel(root); 3014 btrfs_scrub_cancel(root);
2975 3015
2976 /* wait for any defraggers to finish */ 3016 /* wait for any defraggers to finish */
@@ -2978,7 +3018,7 @@ int close_ctree(struct btrfs_root *root)
2978 (atomic_read(&fs_info->defrag_running) == 0)); 3018 (atomic_read(&fs_info->defrag_running) == 0));
2979 3019
2980 /* clear out the rbtree of defraggable inodes */ 3020 /* clear out the rbtree of defraggable inodes */
2981 btrfs_run_defrag_inodes(root->fs_info); 3021 btrfs_run_defrag_inodes(fs_info);
2982 3022
2983 /* 3023 /*
2984 * Here come 2 situations when btrfs is broken to flip readonly: 3024 * Here come 2 situations when btrfs is broken to flip readonly:
@@ -3007,8 +3047,8 @@ int close_ctree(struct btrfs_root *root)
3007 3047
3008 btrfs_put_block_group_cache(fs_info); 3048 btrfs_put_block_group_cache(fs_info);
3009 3049
3010 kthread_stop(root->fs_info->transaction_kthread); 3050 kthread_stop(fs_info->transaction_kthread);
3011 kthread_stop(root->fs_info->cleaner_kthread); 3051 kthread_stop(fs_info->cleaner_kthread);
3012 3052
3013 fs_info->closing = 2; 3053 fs_info->closing = 2;
3014 smp_mb(); 3054 smp_mb();
@@ -3026,14 +3066,14 @@ int close_ctree(struct btrfs_root *root)
3026 free_extent_buffer(fs_info->extent_root->commit_root); 3066 free_extent_buffer(fs_info->extent_root->commit_root);
3027 free_extent_buffer(fs_info->tree_root->node); 3067 free_extent_buffer(fs_info->tree_root->node);
3028 free_extent_buffer(fs_info->tree_root->commit_root); 3068 free_extent_buffer(fs_info->tree_root->commit_root);
3029 free_extent_buffer(root->fs_info->chunk_root->node); 3069 free_extent_buffer(fs_info->chunk_root->node);
3030 free_extent_buffer(root->fs_info->chunk_root->commit_root); 3070 free_extent_buffer(fs_info->chunk_root->commit_root);
3031 free_extent_buffer(root->fs_info->dev_root->node); 3071 free_extent_buffer(fs_info->dev_root->node);
3032 free_extent_buffer(root->fs_info->dev_root->commit_root); 3072 free_extent_buffer(fs_info->dev_root->commit_root);
3033 free_extent_buffer(root->fs_info->csum_root->node); 3073 free_extent_buffer(fs_info->csum_root->node);
3034 free_extent_buffer(root->fs_info->csum_root->commit_root); 3074 free_extent_buffer(fs_info->csum_root->commit_root);
3035 3075
3036 btrfs_free_block_groups(root->fs_info); 3076 btrfs_free_block_groups(fs_info);
3037 3077
3038 del_fs_roots(fs_info); 3078 del_fs_roots(fs_info);
3039 3079
@@ -3053,14 +3093,17 @@ int close_ctree(struct btrfs_root *root)
3053 btrfs_stop_workers(&fs_info->caching_workers); 3093 btrfs_stop_workers(&fs_info->caching_workers);
3054 btrfs_stop_workers(&fs_info->readahead_workers); 3094 btrfs_stop_workers(&fs_info->readahead_workers);
3055 3095
3096#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3097 if (btrfs_test_opt(root, CHECK_INTEGRITY))
3098 btrfsic_unmount(root, fs_info->fs_devices);
3099#endif
3100
3056 btrfs_close_devices(fs_info->fs_devices); 3101 btrfs_close_devices(fs_info->fs_devices);
3057 btrfs_mapping_tree_free(&fs_info->mapping_tree); 3102 btrfs_mapping_tree_free(&fs_info->mapping_tree);
3058 3103
3059 bdi_destroy(&fs_info->bdi); 3104 bdi_destroy(&fs_info->bdi);
3060 cleanup_srcu_struct(&fs_info->subvol_srcu); 3105 cleanup_srcu_struct(&fs_info->subvol_srcu);
3061 3106
3062 free_fs_info(fs_info);
3063
3064 return 0; 3107 return 0;
3065} 3108}
3066 3109
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c99d0a8f13fa..e4bc4741319b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -46,9 +46,9 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
46 u64 bytenr, u32 blocksize); 46 u64 bytenr, u32 blocksize);
47int clean_tree_block(struct btrfs_trans_handle *trans, 47int clean_tree_block(struct btrfs_trans_handle *trans,
48 struct btrfs_root *root, struct extent_buffer *buf); 48 struct btrfs_root *root, struct extent_buffer *buf);
49struct btrfs_root *open_ctree(struct super_block *sb, 49int open_ctree(struct super_block *sb,
50 struct btrfs_fs_devices *fs_devices, 50 struct btrfs_fs_devices *fs_devices,
51 char *options); 51 char *options);
52int close_ctree(struct btrfs_root *root); 52int close_ctree(struct btrfs_root *root);
53int write_ctree_super(struct btrfs_trans_handle *trans, 53int write_ctree_super(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, int max_mirrors); 54 struct btrfs_root *root, int max_mirrors);
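As the new prototype shows, open_ctree() now reports success or failure as an int and leaves ownership of the fs_info to its caller in the superblock code, rather than returning a root pointer with ERR_PTR-encoded errors. A userspace sketch of this calling convention, with illustrative names:

#include <errno.h>
#include <stdio.h>

/* Instead of a pointer that encodes errors, return a negative errno
 * and let results land in caller-owned state. */
struct fs_info { int mounted; };

static int open_fs(struct fs_info *fs_info, const char *options)
{
	if (!options)
		return -EINVAL; /* negative errno on failure */
	fs_info->mounted = 1;   /* caller reaches state it already owns */
	return 0;
}

int main(void)
{
	struct fs_info fi = { 0 };
	int err = open_fs(&fi, "defaults");

	if (err)
		fprintf(stderr, "mount failed: %d\n", err);
	else
		printf("mounted: %d\n", fi.mounted);
	return err;
}

This is also why the failure paths above stop calling free_fs_info(): the structure is no longer open_ctree()'s to free.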
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1b8dc33778f9..5f77166fd01c 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
67 u64 root_objectid, u32 generation, 67 u64 root_objectid, u32 generation,
68 int check_generation) 68 int check_generation)
69{ 69{
70 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; 70 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
71 struct btrfs_root *root; 71 struct btrfs_root *root;
72 struct inode *inode; 72 struct inode *inode;
73 struct btrfs_key key; 73 struct btrfs_key key;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5fbe576d2ba..283af7a676a3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -34,23 +34,24 @@
34#include "locking.h" 34#include "locking.h"
35#include "free-space-cache.h" 35#include "free-space-cache.h"
36 36
37/* control flags for do_chunk_alloc's force field 37/*
38 * control flags for do_chunk_alloc's force field
38 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk 39 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
39 * if we really need one. 40 * if we really need one.
40 * 41 *
41 * CHUNK_ALLOC_FORCE means it must try to allocate one
42 *
43 * CHUNK_ALLOC_LIMITED means to only try and allocate one 42 * CHUNK_ALLOC_LIMITED means to only try and allocate one
44 * if we have very few chunks already allocated. This is 43 * if we have very few chunks already allocated. This is
45 * used as part of the clustering code to help make sure 44 * used as part of the clustering code to help make sure
46 * we have a good pool of storage to cluster in, without 45 * we have a good pool of storage to cluster in, without
47 * filling the FS with empty chunks 46 * filling the FS with empty chunks
48 * 47 *
48 * CHUNK_ALLOC_FORCE means it must try to allocate one
49 *
49 */ 50 */
50enum { 51enum {
51 CHUNK_ALLOC_NO_FORCE = 0, 52 CHUNK_ALLOC_NO_FORCE = 0,
52 CHUNK_ALLOC_FORCE = 1, 53 CHUNK_ALLOC_LIMITED = 1,
53 CHUNK_ALLOC_LIMITED = 2, 54 CHUNK_ALLOC_FORCE = 2,
54}; 55};
55 56
56/* 57/*
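The enum is reordered so the force levels increase with strength, which lets do_chunk_alloc() adopt a stronger pending request with a plain comparison (see the `force < space_info->force_alloc` change later in this patch). A compilable sketch of that design choice:

#include <stdio.h>

/* With force levels ordered by strength, promoting to a pending
 * stronger request needs no special cases. */
enum {
	CHUNK_ALLOC_NO_FORCE = 0, /* only allocate if really needed */
	CHUNK_ALLOC_LIMITED  = 1, /* only if very few chunks exist */
	CHUNK_ALLOC_FORCE    = 2, /* must try to allocate one */
};

int main(void)
{
	int force = CHUNK_ALLOC_NO_FORCE;
	int pending = CHUNK_ALLOC_LIMITED; /* space_info->force_alloc */

	/* as in do_chunk_alloc: take the stronger of the two requests */
	if (force < pending)
		force = pending;
	printf("effective force level: %d\n", force);
	return 0;
}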
@@ -618,8 +619,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
618 struct list_head *head = &info->space_info; 619 struct list_head *head = &info->space_info;
619 struct btrfs_space_info *found; 620 struct btrfs_space_info *found;
620 621
621 flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | 622 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
622 BTRFS_BLOCK_GROUP_METADATA;
623 623
624 rcu_read_lock(); 624 rcu_read_lock();
625 list_for_each_entry_rcu(found, head, list) { 625 list_for_each_entry_rcu(found, head, list) {
@@ -1872,20 +1872,24 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1872int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1872int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1873 struct btrfs_root *root, 1873 struct btrfs_root *root,
1874 u64 bytenr, u64 num_bytes, u64 parent, 1874 u64 bytenr, u64 num_bytes, u64 parent,
1875 u64 root_objectid, u64 owner, u64 offset) 1875 u64 root_objectid, u64 owner, u64 offset, int for_cow)
1876{ 1876{
1877 int ret; 1877 int ret;
1878 struct btrfs_fs_info *fs_info = root->fs_info;
1879
1878 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 1880 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1879 root_objectid == BTRFS_TREE_LOG_OBJECTID); 1881 root_objectid == BTRFS_TREE_LOG_OBJECTID);
1880 1882
1881 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1883 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1882 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 1884 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1885 num_bytes,
1883 parent, root_objectid, (int)owner, 1886 parent, root_objectid, (int)owner,
1884 BTRFS_ADD_DELAYED_REF, NULL); 1887 BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1885 } else { 1888 } else {
1886 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 1889 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1890 num_bytes,
1887 parent, root_objectid, owner, offset, 1891 parent, root_objectid, owner, offset,
1888 BTRFS_ADD_DELAYED_REF, NULL); 1892 BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1889 } 1893 }
1890 return ret; 1894 return ret;
1891} 1895}
@@ -2233,6 +2237,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2233 } 2237 }
2234 2238
2235 /* 2239 /*
2240 * locked_ref is the head node, so we have to go one
2241 * node back for any delayed ref updates
2242 */
2243 ref = select_delayed_ref(locked_ref);
2244
2245 if (ref && ref->seq &&
2246 btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
2247 /*
2248 * there are still refs with lower seq numbers in the
2249 * process of being added. Don't run this ref yet.
2250 */
2251 list_del_init(&locked_ref->cluster);
2252 mutex_unlock(&locked_ref->mutex);
2253 locked_ref = NULL;
2254 delayed_refs->num_heads_ready++;
2255 spin_unlock(&delayed_refs->lock);
2256 cond_resched();
2257 spin_lock(&delayed_refs->lock);
2258 continue;
2259 }
2260
2261 /*
2236 * record the must insert reserved flag before we 2262 * record the must insert reserved flag before we
2237 * drop the spin lock. 2263 * drop the spin lock.
2238 */ 2264 */
@@ -2242,11 +2268,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2242 extent_op = locked_ref->extent_op; 2268 extent_op = locked_ref->extent_op;
2243 locked_ref->extent_op = NULL; 2269 locked_ref->extent_op = NULL;
2244 2270
2245 /*
2246 * locked_ref is the head node, so we have to go one
2247 * node back for any delayed ref updates
2248 */
2249 ref = select_delayed_ref(locked_ref);
2250 if (!ref) { 2271 if (!ref) {
2251 /* All delayed refs have been processed, Go ahead 2272 /* All delayed refs have been processed, Go ahead
2252 * and send the head node to run_one_delayed_ref, 2273 * and send the head node to run_one_delayed_ref,
@@ -2267,9 +2288,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2267 BUG_ON(ret); 2288 BUG_ON(ret);
2268 kfree(extent_op); 2289 kfree(extent_op);
2269 2290
2270 cond_resched(); 2291 goto next;
2271 spin_lock(&delayed_refs->lock);
2272 continue;
2273 } 2292 }
2274 2293
2275 list_del_init(&locked_ref->cluster); 2294 list_del_init(&locked_ref->cluster);
@@ -2279,7 +2298,12 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2279 ref->in_tree = 0; 2298 ref->in_tree = 0;
2280 rb_erase(&ref->rb_node, &delayed_refs->root); 2299 rb_erase(&ref->rb_node, &delayed_refs->root);
2281 delayed_refs->num_entries--; 2300 delayed_refs->num_entries--;
2282 2301 /*
2302 * we modified num_entries, but as we're currently running
2303 * delayed refs, skip
2304 * wake_up(&delayed_refs->seq_wait);
2305 * here.
2306 */
2283 spin_unlock(&delayed_refs->lock); 2307 spin_unlock(&delayed_refs->lock);
2284 2308
2285 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2309 ret = run_one_delayed_ref(trans, root, ref, extent_op,
@@ -2289,13 +2313,34 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2289 btrfs_put_delayed_ref(ref); 2313 btrfs_put_delayed_ref(ref);
2290 kfree(extent_op); 2314 kfree(extent_op);
2291 count++; 2315 count++;
2292 2316next:
2317 do_chunk_alloc(trans, root->fs_info->extent_root,
2318 2 * 1024 * 1024,
2319 btrfs_get_alloc_profile(root, 0),
2320 CHUNK_ALLOC_NO_FORCE);
2293 cond_resched(); 2321 cond_resched();
2294 spin_lock(&delayed_refs->lock); 2322 spin_lock(&delayed_refs->lock);
2295 } 2323 }
2296 return count; 2324 return count;
2297} 2325}
2298 2326
2327
2328static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
2329 unsigned long num_refs)
2330{
2331 struct list_head *first_seq = delayed_refs->seq_head.next;
2332
2333 spin_unlock(&delayed_refs->lock);
2334 pr_debug("waiting for more refs (num %lu, first %p)\n",
2335 num_refs, first_seq);
2336 wait_event(delayed_refs->seq_wait,
2337 num_refs != delayed_refs->num_entries ||
2338 delayed_refs->seq_head.next != first_seq);
2339 pr_debug("done waiting for more refs (num %lu, first %p)\n",
2340 delayed_refs->num_entries, delayed_refs->seq_head.next);
2341 spin_lock(&delayed_refs->lock);
2342}
2343
2299/* 2344/*
2300 * this starts processing the delayed reference count updates and 2345 * this starts processing the delayed reference count updates and
2301 * extent insertions we have queued up so far. count can be 2346 * extent insertions we have queued up so far. count can be
@@ -2311,15 +2356,23 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2311 struct btrfs_delayed_ref_node *ref; 2356 struct btrfs_delayed_ref_node *ref;
2312 struct list_head cluster; 2357 struct list_head cluster;
2313 int ret; 2358 int ret;
2359 u64 delayed_start;
2314 int run_all = count == (unsigned long)-1; 2360 int run_all = count == (unsigned long)-1;
2315 int run_most = 0; 2361 int run_most = 0;
2362 unsigned long num_refs = 0;
2363 int consider_waiting;
2316 2364
2317 if (root == root->fs_info->extent_root) 2365 if (root == root->fs_info->extent_root)
2318 root = root->fs_info->tree_root; 2366 root = root->fs_info->tree_root;
2319 2367
2368 do_chunk_alloc(trans, root->fs_info->extent_root,
2369 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
2370 CHUNK_ALLOC_NO_FORCE);
2371
2320 delayed_refs = &trans->transaction->delayed_refs; 2372 delayed_refs = &trans->transaction->delayed_refs;
2321 INIT_LIST_HEAD(&cluster); 2373 INIT_LIST_HEAD(&cluster);
2322again: 2374again:
2375 consider_waiting = 0;
2323 spin_lock(&delayed_refs->lock); 2376 spin_lock(&delayed_refs->lock);
2324 if (count == 0) { 2377 if (count == 0) {
2325 count = delayed_refs->num_entries * 2; 2378 count = delayed_refs->num_entries * 2;
@@ -2336,11 +2389,35 @@ again:
2336 * of refs to process starting at the first one we are able to 2389 * of refs to process starting at the first one we are able to
2337 * lock 2390 * lock
2338 */ 2391 */
2392 delayed_start = delayed_refs->run_delayed_start;
2339 ret = btrfs_find_ref_cluster(trans, &cluster, 2393 ret = btrfs_find_ref_cluster(trans, &cluster,
2340 delayed_refs->run_delayed_start); 2394 delayed_refs->run_delayed_start);
2341 if (ret) 2395 if (ret)
2342 break; 2396 break;
2343 2397
2398 if (delayed_start >= delayed_refs->run_delayed_start) {
2399 if (consider_waiting == 0) {
2400 /*
2401 * btrfs_find_ref_cluster looped. let's do one
2402 * more cycle. if we don't run any delayed ref
2403 * during that cycle (because all of them are
2404 * blocked) and if the number of
2405 * refs doesn't change, we avoid busy waiting.
2406 */
2407 consider_waiting = 1;
2408 num_refs = delayed_refs->num_entries;
2409 } else {
2410 wait_for_more_refs(delayed_refs, num_refs);
2411 /*
2412 * after waiting, things have changed. we
2413 * dropped the lock and someone else might have
2414 * run some refs, built new clusters and so on.
2415 * therefore, we restart staleness detection.
2416 */
2417 consider_waiting = 0;
2418 }
2419 }
2420
2344 ret = run_clustered_refs(trans, root, &cluster); 2421 ret = run_clustered_refs(trans, root, &cluster);
2345 BUG_ON(ret < 0); 2422 BUG_ON(ret < 0);
2346 2423
@@ -2348,6 +2425,11 @@ again:
2348 2425
2349 if (count == 0) 2426 if (count == 0)
2350 break; 2427 break;
2428
2429 if (ret || delayed_refs->run_delayed_start == 0) {
2430 /* refs were run, let's reset staleness detection */
2431 consider_waiting = 0;
2432 }
2351 } 2433 }
2352 2434
2353 if (run_all) { 2435 if (run_all) {
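Taken together, these hunks make btrfs_run_delayed_refs() detect when btrfs_find_ref_cluster() keeps wrapping around without any ref actually running and, instead of spinning, sleep in wait_for_more_refs() until the entry count or the oldest seq registration changes. A simplified, single-threaded model of the detection logic (illustrative; the real code blocks in wait_event() rather than breaking out of the loop):

#include <stdio.h>

static unsigned long num_entries = 3; /* pretend: every ref stays blocked */

static int find_cluster_wrapped(void) { return 1; } /* search wrapped */
static int ran_some_refs(void)        { return 0; } /* nothing runnable */

int main(void)
{
	int consider_waiting = 0;
	unsigned long remembered = 0;

	for (int cycle = 0; cycle < 10; cycle++) {
		if (find_cluster_wrapped()) {
			if (!consider_waiting) {
				/* first wrap: remember the entry count */
				consider_waiting = 1;
				remembered = num_entries;
			} else if (remembered == num_entries) {
				/* wrapped again with no progress: stop
				 * spinning (the kernel sleeps in
				 * wait_for_more_refs() here) */
				printf("cycle %d: would wait\n", cycle);
				break;
			}
		}
		if (ran_some_refs())
			consider_waiting = 0; /* progress resets detection */
	}
	return 0;
}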
@@ -2405,7 +2487,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2405 extent_op->update_key = 0; 2487 extent_op->update_key = 0;
2406 extent_op->is_data = is_data ? 1 : 0; 2488 extent_op->is_data = is_data ? 1 : 0;
2407 2489
2408 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); 2490 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2491 num_bytes, extent_op);
2409 if (ret) 2492 if (ret)
2410 kfree(extent_op); 2493 kfree(extent_op);
2411 return ret; 2494 return ret;
@@ -2590,7 +2673,7 @@ out:
2590static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2673static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2591 struct btrfs_root *root, 2674 struct btrfs_root *root,
2592 struct extent_buffer *buf, 2675 struct extent_buffer *buf,
2593 int full_backref, int inc) 2676 int full_backref, int inc, int for_cow)
2594{ 2677{
2595 u64 bytenr; 2678 u64 bytenr;
2596 u64 num_bytes; 2679 u64 num_bytes;
@@ -2603,7 +2686,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2603 int level; 2686 int level;
2604 int ret = 0; 2687 int ret = 0;
2605 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 2688 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2606 u64, u64, u64, u64, u64, u64); 2689 u64, u64, u64, u64, u64, u64, int);
2607 2690
2608 ref_root = btrfs_header_owner(buf); 2691 ref_root = btrfs_header_owner(buf);
2609 nritems = btrfs_header_nritems(buf); 2692 nritems = btrfs_header_nritems(buf);
@@ -2640,14 +2723,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2640 key.offset -= btrfs_file_extent_offset(buf, fi); 2723 key.offset -= btrfs_file_extent_offset(buf, fi);
2641 ret = process_func(trans, root, bytenr, num_bytes, 2724 ret = process_func(trans, root, bytenr, num_bytes,
2642 parent, ref_root, key.objectid, 2725 parent, ref_root, key.objectid,
2643 key.offset); 2726 key.offset, for_cow);
2644 if (ret) 2727 if (ret)
2645 goto fail; 2728 goto fail;
2646 } else { 2729 } else {
2647 bytenr = btrfs_node_blockptr(buf, i); 2730 bytenr = btrfs_node_blockptr(buf, i);
2648 num_bytes = btrfs_level_size(root, level - 1); 2731 num_bytes = btrfs_level_size(root, level - 1);
2649 ret = process_func(trans, root, bytenr, num_bytes, 2732 ret = process_func(trans, root, bytenr, num_bytes,
2650 parent, ref_root, level - 1, 0); 2733 parent, ref_root, level - 1, 0,
2734 for_cow);
2651 if (ret) 2735 if (ret)
2652 goto fail; 2736 goto fail;
2653 } 2737 }
@@ -2659,15 +2743,15 @@ fail:
2659} 2743}
2660 2744
2661int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2745int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2662 struct extent_buffer *buf, int full_backref) 2746 struct extent_buffer *buf, int full_backref, int for_cow)
2663{ 2747{
2664 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 2748 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
2665} 2749}
2666 2750
2667int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2751int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2668 struct extent_buffer *buf, int full_backref) 2752 struct extent_buffer *buf, int full_backref, int for_cow)
2669{ 2753{
2670 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 2754 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
2671} 2755}
2672 2756
2673static int write_one_cache_group(struct btrfs_trans_handle *trans, 2757static int write_one_cache_group(struct btrfs_trans_handle *trans,
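Threading the new for_cow argument through __btrfs_mod_ref() means the process_func pointer type and both functions it can point to (btrfs_inc_extent_ref and btrfs_free_extent) must change in lockstep. A small sketch of that function-pointer pattern, with illustrative names:

#include <stdio.h>

/* The pointer type carries the extra flag, so the flag reaches
 * whichever target is selected at the single call site. */
typedef int (*ref_fn)(unsigned long bytenr, int for_cow);

static int inc_ref(unsigned long bytenr, int for_cow)
{
	printf("inc %lu (for_cow=%d)\n", bytenr, for_cow);
	return 0;
}

static int dec_ref(unsigned long bytenr, int for_cow)
{
	printf("dec %lu (for_cow=%d)\n", bytenr, for_cow);
	return 0;
}

static int mod_ref(int inc, int for_cow)
{
	ref_fn process_func = inc ? inc_ref : dec_ref;
	return process_func(4096, for_cow);
}

int main(void)
{
	mod_ref(1, 0);
	mod_ref(0, 1);
	return 0;
}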
@@ -2993,9 +3077,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2993 INIT_LIST_HEAD(&found->block_groups[i]); 3077 INIT_LIST_HEAD(&found->block_groups[i]);
2994 init_rwsem(&found->groups_sem); 3078 init_rwsem(&found->groups_sem);
2995 spin_lock_init(&found->lock); 3079 spin_lock_init(&found->lock);
2996 found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | 3080 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
2997 BTRFS_BLOCK_GROUP_SYSTEM |
2998 BTRFS_BLOCK_GROUP_METADATA);
2999 found->total_bytes = total_bytes; 3081 found->total_bytes = total_bytes;
3000 found->disk_total = total_bytes * factor; 3082 found->disk_total = total_bytes * factor;
3001 found->bytes_used = bytes_used; 3083 found->bytes_used = bytes_used;
@@ -3016,20 +3098,27 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3016 3098
3017static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3099static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3018{ 3100{
3019 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | 3101 u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3020 BTRFS_BLOCK_GROUP_RAID1 | 3102
3021 BTRFS_BLOCK_GROUP_RAID10 | 3103 /* chunk -> extended profile */
3022 BTRFS_BLOCK_GROUP_DUP); 3104 if (extra_flags == 0)
3023 if (extra_flags) { 3105 extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3024 if (flags & BTRFS_BLOCK_GROUP_DATA) 3106
3025 fs_info->avail_data_alloc_bits |= extra_flags; 3107 if (flags & BTRFS_BLOCK_GROUP_DATA)
3026 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3108 fs_info->avail_data_alloc_bits |= extra_flags;
3027 fs_info->avail_metadata_alloc_bits |= extra_flags; 3109 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3028 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3110 fs_info->avail_metadata_alloc_bits |= extra_flags;
3029 fs_info->avail_system_alloc_bits |= extra_flags; 3111 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3030 } 3112 fs_info->avail_system_alloc_bits |= extra_flags;
3031} 3113}
3032 3114
3115/*
3116 * @flags: available profiles in extended format (see ctree.h)
3117 *
3118 * Returns reduced profile in chunk format. If profile changing is in
3119 * progress (either running or paused), picks the target profile (if it's
3120 * already available), otherwise falls back to plain reducing.
3121 */
3033u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3122u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3034{ 3123{
3035 /* 3124 /*
@@ -3040,6 +3129,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3040 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3129 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3041 root->fs_info->fs_devices->missing_devices; 3130 root->fs_info->fs_devices->missing_devices;
3042 3131
3132 /* pick restriper's target profile if it's available */
3133 spin_lock(&root->fs_info->balance_lock);
3134 if (root->fs_info->balance_ctl) {
3135 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
3136 u64 tgt = 0;
3137
3138 if ((flags & BTRFS_BLOCK_GROUP_DATA) &&
3139 (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3140 (flags & bctl->data.target)) {
3141 tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3142 } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
3143 (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3144 (flags & bctl->sys.target)) {
3145 tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3146 } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) &&
3147 (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3148 (flags & bctl->meta.target)) {
3149 tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3150 }
3151
3152 if (tgt) {
3153 spin_unlock(&root->fs_info->balance_lock);
3154 flags = tgt;
3155 goto out;
3156 }
3157 }
3158 spin_unlock(&root->fs_info->balance_lock);
3159
3043 if (num_devices == 1) 3160 if (num_devices == 1)
3044 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3161 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3045 if (num_devices < 4) 3162 if (num_devices < 4)
@@ -3059,22 +3176,25 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3059 if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 3176 if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3060 ((flags & BTRFS_BLOCK_GROUP_RAID1) | 3177 ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3061 (flags & BTRFS_BLOCK_GROUP_RAID10) | 3178 (flags & BTRFS_BLOCK_GROUP_RAID10) |
3062 (flags & BTRFS_BLOCK_GROUP_DUP))) 3179 (flags & BTRFS_BLOCK_GROUP_DUP))) {
3063 flags &= ~BTRFS_BLOCK_GROUP_RAID0; 3180 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3181 }
3182
3183out:
3184 /* extended -> chunk profile */
3185 flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3064 return flags; 3186 return flags;
3065} 3187}
3066 3188
3067static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3189static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3068{ 3190{
3069 if (flags & BTRFS_BLOCK_GROUP_DATA) 3191 if (flags & BTRFS_BLOCK_GROUP_DATA)
3070 flags |= root->fs_info->avail_data_alloc_bits & 3192 flags |= root->fs_info->avail_data_alloc_bits;
3071 root->fs_info->data_alloc_profile;
3072 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3193 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3073 flags |= root->fs_info->avail_system_alloc_bits & 3194 flags |= root->fs_info->avail_system_alloc_bits;
3074 root->fs_info->system_alloc_profile;
3075 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3195 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3076 flags |= root->fs_info->avail_metadata_alloc_bits & 3196 flags |= root->fs_info->avail_metadata_alloc_bits;
3077 root->fs_info->metadata_alloc_profile; 3197
3078 return btrfs_reduce_alloc_profile(root, flags); 3198 return btrfs_reduce_alloc_profile(root, flags);
3079} 3199}
3080 3200
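In the extended profile format a dedicated bit (BTRFS_AVAIL_ALLOC_BIT_SINGLE) represents the single profile explicitly, so "no profile bits set" stops being ambiguous in avail_*_alloc_bits; converting back to the on-disk chunk format just masks that bit off, as done at the out: label above. A sketch with made-up bit values (not the real btrfs constants):

#include <stdint.h>
#include <stdio.h>

#define BG_RAID0           (1ULL << 3)
#define BG_RAID1           (1ULL << 4)
#define BG_DUP             (1ULL << 5)
#define BG_RAID10          (1ULL << 6)
#define PROFILE_MASK       (BG_RAID0 | BG_RAID1 | BG_DUP | BG_RAID10)
#define AVAIL_ALLOC_SINGLE (1ULL << 48) /* explicit "single" bit */

/* chunk -> extended: make the implicit single profile explicit */
static uint64_t to_extended(uint64_t flags)
{
	if ((flags & PROFILE_MASK) == 0)
		flags |= AVAIL_ALLOC_SINGLE;
	return flags;
}

/* extended -> chunk: the single bit never hits disk */
static uint64_t to_chunk(uint64_t flags)
{
	return flags & ~AVAIL_ALLOC_SINGLE;
}

int main(void)
{
	uint64_t f = to_extended(0); /* a plain single-profile chunk */
	printf("extended: %#llx\n", (unsigned long long)f);
	printf("chunk:    %#llx\n", (unsigned long long)to_chunk(f));
	return 0;
}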
@@ -3191,6 +3311,8 @@ commit_trans:
3191 return -ENOSPC; 3311 return -ENOSPC;
3192 } 3312 }
3193 data_sinfo->bytes_may_use += bytes; 3313 data_sinfo->bytes_may_use += bytes;
3314 trace_btrfs_space_reservation(root->fs_info, "space_info",
3315 (u64)data_sinfo, bytes, 1);
3194 spin_unlock(&data_sinfo->lock); 3316 spin_unlock(&data_sinfo->lock);
3195 3317
3196 return 0; 3318 return 0;
@@ -3210,6 +3332,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3210 data_sinfo = BTRFS_I(inode)->space_info; 3332 data_sinfo = BTRFS_I(inode)->space_info;
3211 spin_lock(&data_sinfo->lock); 3333 spin_lock(&data_sinfo->lock);
3212 data_sinfo->bytes_may_use -= bytes; 3334 data_sinfo->bytes_may_use -= bytes;
3335 trace_btrfs_space_reservation(root->fs_info, "space_info",
3336 (u64)data_sinfo, bytes, 0);
3213 spin_unlock(&data_sinfo->lock); 3337 spin_unlock(&data_sinfo->lock);
3214} 3338}
3215 3339
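The trace_btrfs_space_reservation() calls added throughout this patch pair every increment of a reservation counter (reserve = 1) with a matching decrement (reserve = 0), keyed by the space_info pointer, transaction, or inode number, so a leaked reservation shows up as an unmatched event. A userspace stand-in for the idea:

#include <stdint.h>
#include <stdio.h>

/* Logging shim modeling the tracepoint: owner id, byte count, and a
 * reserve/release flag, emitted at every counter modification. */
static void trace_reservation(const char *type, uint64_t id,
			      uint64_t bytes, int reserve)
{
	printf("%s id=%#llx bytes=%llu reserve=%d\n", type,
	       (unsigned long long)id, (unsigned long long)bytes, reserve);
}

struct space_info { uint64_t bytes_may_use; };

static void reserve(struct space_info *si, uint64_t bytes)
{
	si->bytes_may_use += bytes;
	trace_reservation("space_info", (uint64_t)(uintptr_t)si, bytes, 1);
}

static void release(struct space_info *si, uint64_t bytes)
{
	si->bytes_may_use -= bytes;
	trace_reservation("space_info", (uint64_t)(uintptr_t)si, bytes, 0);
}

int main(void)
{
	struct space_info si = { 0 };
	reserve(&si, 4096);
	release(&si, 4096); /* every 1 should pair with a 0 */
	return 0;
}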
@@ -3257,27 +3381,15 @@ static int should_alloc_chunk(struct btrfs_root *root,
3257 if (num_bytes - num_allocated < thresh) 3381 if (num_bytes - num_allocated < thresh)
3258 return 1; 3382 return 1;
3259 } 3383 }
3260
3261 /*
3262 * we have two similar checks here, one based on percentage
3263 * and once based on a hard number of 256MB. The idea
3264 * is that if we have a good amount of free
3265 * room, don't allocate a chunk. A good mount is
3266 * less than 80% utilized of the chunks we have allocated,
3267 * or more than 256MB free
3268 */
3269 if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3270 return 0;
3271
3272 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3273 return 0;
3274
3275 thresh = btrfs_super_total_bytes(root->fs_info->super_copy); 3384 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3276 3385
3277 /* 256MB or 5% of the FS */ 3386 /* 256MB or 2% of the FS */
3278 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3387 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
3388 /* system chunks need a much smaller threshold */
3389 if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
3390 thresh = 32 * 1024 * 1024;
3279 3391
3280 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) 3392 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
3281 return 0; 3393 return 0;
3282 return 1; 3394 return 1;
3283} 3395}
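The rewritten heuristic drops the old free-room checks entirely: a chunk is now allocated eagerly only while the space-info is small (below 256MB or 2% of the filesystem, 32MB for system chunks); past that, allocation waits until the space is roughly 80% used. A compilable model, assuming btrfs's usual div_factor helpers (div_factor(x, 8) is x*8/10 and div_factor_fine(x, 2) is x*2/100):

#include <stdint.h>
#include <stdio.h>

#define MB (1024ULL * 1024)

/* Simplified should_alloc_chunk(): ignores the allocation fast path
 * and keeps only the threshold decision above. */
static int should_alloc(uint64_t total, uint64_t used,
			uint64_t fs_bytes, int is_system)
{
	uint64_t thresh = 256 * MB;

	if (fs_bytes / 50 > thresh) /* 2% of the filesystem */
		thresh = fs_bytes / 50;
	if (is_system)              /* system chunks stay small */
		thresh = 32 * MB;

	if (total > thresh && used < total * 8 / 10)
		return 0; /* big and mostly empty: don't allocate */
	return 1;
}

int main(void)
{
	uint64_t fs = 1024ULL * 1024 * MB; /* 1 TiB filesystem */

	/* 100 GiB data space, 10% used: no new chunk */
	printf("%d\n", should_alloc(100 * 1024 * MB, 10 * 1024 * MB, fs, 0));
	/* same space at 90% used: allocate */
	printf("%d\n", should_alloc(100 * 1024 * MB, 90 * 1024 * MB, fs, 0));
	return 0;
}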
@@ -3291,7 +3403,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3291 int wait_for_alloc = 0; 3403 int wait_for_alloc = 0;
3292 int ret = 0; 3404 int ret = 0;
3293 3405
3294 flags = btrfs_reduce_alloc_profile(extent_root, flags); 3406 BUG_ON(!profile_is_valid(flags, 0));
3295 3407
3296 space_info = __find_space_info(extent_root->fs_info, flags); 3408 space_info = __find_space_info(extent_root->fs_info, flags);
3297 if (!space_info) { 3409 if (!space_info) {
@@ -3303,7 +3415,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3303 3415
3304again: 3416again:
3305 spin_lock(&space_info->lock); 3417 spin_lock(&space_info->lock);
3306 if (space_info->force_alloc) 3418 if (force < space_info->force_alloc)
3307 force = space_info->force_alloc; 3419 force = space_info->force_alloc;
3308 if (space_info->full) { 3420 if (space_info->full) {
3309 spin_unlock(&space_info->lock); 3421 spin_unlock(&space_info->lock);
@@ -3582,6 +3694,10 @@ again:
3582 if (used <= space_info->total_bytes) { 3694 if (used <= space_info->total_bytes) {
3583 if (used + orig_bytes <= space_info->total_bytes) { 3695 if (used + orig_bytes <= space_info->total_bytes) {
3584 space_info->bytes_may_use += orig_bytes; 3696 space_info->bytes_may_use += orig_bytes;
3697 trace_btrfs_space_reservation(root->fs_info,
3698 "space_info",
3699 (u64)space_info,
3700 orig_bytes, 1);
3585 ret = 0; 3701 ret = 0;
3586 } else { 3702 } else {
3587 /* 3703 /*
@@ -3649,6 +3765,10 @@ again:
3649 3765
3650 if (used + num_bytes < space_info->total_bytes + avail) { 3766 if (used + num_bytes < space_info->total_bytes + avail) {
3651 space_info->bytes_may_use += orig_bytes; 3767 space_info->bytes_may_use += orig_bytes;
3768 trace_btrfs_space_reservation(root->fs_info,
3769 "space_info",
3770 (u64)space_info,
3771 orig_bytes, 1);
3652 ret = 0; 3772 ret = 0;
3653 } else { 3773 } else {
3654 wait_ordered = true; 3774 wait_ordered = true;
@@ -3755,7 +3875,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3755 spin_unlock(&block_rsv->lock); 3875 spin_unlock(&block_rsv->lock);
3756} 3876}
3757 3877
3758static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, 3878static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
3879 struct btrfs_block_rsv *block_rsv,
3759 struct btrfs_block_rsv *dest, u64 num_bytes) 3880 struct btrfs_block_rsv *dest, u64 num_bytes)
3760{ 3881{
3761 struct btrfs_space_info *space_info = block_rsv->space_info; 3882 struct btrfs_space_info *space_info = block_rsv->space_info;
@@ -3791,6 +3912,9 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3791 if (num_bytes) { 3912 if (num_bytes) {
3792 spin_lock(&space_info->lock); 3913 spin_lock(&space_info->lock);
3793 space_info->bytes_may_use -= num_bytes; 3914 space_info->bytes_may_use -= num_bytes;
3915 trace_btrfs_space_reservation(fs_info, "space_info",
3916 (u64)space_info,
3917 num_bytes, 0);
3794 space_info->reservation_progress++; 3918 space_info->reservation_progress++;
3795 spin_unlock(&space_info->lock); 3919 spin_unlock(&space_info->lock);
3796 } 3920 }
@@ -3947,7 +4071,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
3947 if (global_rsv->full || global_rsv == block_rsv || 4071 if (global_rsv->full || global_rsv == block_rsv ||
3948 block_rsv->space_info != global_rsv->space_info) 4072 block_rsv->space_info != global_rsv->space_info)
3949 global_rsv = NULL; 4073 global_rsv = NULL;
3950 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); 4074 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4075 num_bytes);
3951} 4076}
3952 4077
3953/* 4078/*
@@ -4006,11 +4131,15 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4006 num_bytes = sinfo->total_bytes - num_bytes; 4131 num_bytes = sinfo->total_bytes - num_bytes;
4007 block_rsv->reserved += num_bytes; 4132 block_rsv->reserved += num_bytes;
4008 sinfo->bytes_may_use += num_bytes; 4133 sinfo->bytes_may_use += num_bytes;
4134 trace_btrfs_space_reservation(fs_info, "space_info",
4135 (u64)sinfo, num_bytes, 1);
4009 } 4136 }
4010 4137
4011 if (block_rsv->reserved >= block_rsv->size) { 4138 if (block_rsv->reserved >= block_rsv->size) {
4012 num_bytes = block_rsv->reserved - block_rsv->size; 4139 num_bytes = block_rsv->reserved - block_rsv->size;
4013 sinfo->bytes_may_use -= num_bytes; 4140 sinfo->bytes_may_use -= num_bytes;
4141 trace_btrfs_space_reservation(fs_info, "space_info",
4142 (u64)sinfo, num_bytes, 0);
4014 sinfo->reservation_progress++; 4143 sinfo->reservation_progress++;
4015 block_rsv->reserved = block_rsv->size; 4144 block_rsv->reserved = block_rsv->size;
4016 block_rsv->full = 1; 4145 block_rsv->full = 1;
@@ -4045,7 +4174,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4045 4174
4046static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 4175static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4047{ 4176{
4048 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); 4177 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4178 (u64)-1);
4049 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 4179 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4050 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 4180 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4051 WARN_ON(fs_info->trans_block_rsv.size > 0); 4181 WARN_ON(fs_info->trans_block_rsv.size > 0);
@@ -4062,6 +4192,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4062 if (!trans->bytes_reserved) 4192 if (!trans->bytes_reserved)
4063 return; 4193 return;
4064 4194
4195 trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans,
4196 trans->bytes_reserved, 0);
4065 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 4197 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4066 trans->bytes_reserved = 0; 4198 trans->bytes_reserved = 0;
4067} 4199}
@@ -4079,6 +4211,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4079 * when we are truly done with the orphan item. 4211 * when we are truly done with the orphan item.
4080 */ 4212 */
4081 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4213 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4214 trace_btrfs_space_reservation(root->fs_info, "orphan",
4215 btrfs_ino(inode), num_bytes, 1);
4082 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4216 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4083} 4217}
4084 4218
@@ -4086,6 +4220,8 @@ void btrfs_orphan_release_metadata(struct inode *inode)
4086{ 4220{
4087 struct btrfs_root *root = BTRFS_I(inode)->root; 4221 struct btrfs_root *root = BTRFS_I(inode)->root;
4088 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4222 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4223 trace_btrfs_space_reservation(root->fs_info, "orphan",
4224 btrfs_ino(inode), num_bytes, 0);
4089 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4225 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4090} 4226}
4091 4227
@@ -4213,12 +4349,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4213 /* Need to be holding the i_mutex here if we aren't free space cache */ 4349 /* Need to be holding the i_mutex here if we aren't free space cache */
4214 if (btrfs_is_free_space_inode(root, inode)) 4350 if (btrfs_is_free_space_inode(root, inode))
4215 flush = 0; 4351 flush = 0;
4216 else
4217 WARN_ON(!mutex_is_locked(&inode->i_mutex));
4218 4352
4219 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4353 if (flush && btrfs_transaction_in_commit(root->fs_info))
4220 schedule_timeout(1); 4354 schedule_timeout(1);
4221 4355
4356 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4222 num_bytes = ALIGN(num_bytes, root->sectorsize); 4357 num_bytes = ALIGN(num_bytes, root->sectorsize);
4223 4358
4224 spin_lock(&BTRFS_I(inode)->lock); 4359 spin_lock(&BTRFS_I(inode)->lock);
@@ -4266,8 +4401,14 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4266 if (dropped) 4401 if (dropped)
4267 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4402 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4268 4403
4269 if (to_free) 4404 if (to_free) {
4270 btrfs_block_rsv_release(root, block_rsv, to_free); 4405 btrfs_block_rsv_release(root, block_rsv, to_free);
4406 trace_btrfs_space_reservation(root->fs_info,
4407 "delalloc",
4408 btrfs_ino(inode),
4409 to_free, 0);
4410 }
4411 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4271 return ret; 4412 return ret;
4272 } 4413 }
4273 4414
@@ -4278,7 +4419,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4278 } 4419 }
4279 BTRFS_I(inode)->reserved_extents += nr_extents; 4420 BTRFS_I(inode)->reserved_extents += nr_extents;
4280 spin_unlock(&BTRFS_I(inode)->lock); 4421 spin_unlock(&BTRFS_I(inode)->lock);
4422 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4281 4423
4424 if (to_reserve)
4425 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4426 btrfs_ino(inode), to_reserve, 1);
4282 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4427 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4283 4428
4284 return 0; 4429 return 0;
@@ -4308,6 +4453,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4308 if (dropped > 0) 4453 if (dropped > 0)
4309 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4454 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4310 4455
4456 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4457 btrfs_ino(inode), to_free, 0);
4311 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4458 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4312 to_free); 4459 to_free);
4313} 4460}
@@ -4562,7 +4709,10 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4562 cache->reserved += num_bytes; 4709 cache->reserved += num_bytes;
4563 space_info->bytes_reserved += num_bytes; 4710 space_info->bytes_reserved += num_bytes;
4564 if (reserve == RESERVE_ALLOC) { 4711 if (reserve == RESERVE_ALLOC) {
4565 BUG_ON(space_info->bytes_may_use < num_bytes); 4712 trace_btrfs_space_reservation(cache->fs_info,
4713 "space_info",
4714 (u64)space_info,
4715 num_bytes, 0);
4566 space_info->bytes_may_use -= num_bytes; 4716 space_info->bytes_may_use -= num_bytes;
4567 } 4717 }
4568 } 4718 }
@@ -4928,6 +5078,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4928 rb_erase(&head->node.rb_node, &delayed_refs->root); 5078 rb_erase(&head->node.rb_node, &delayed_refs->root);
4929 5079
4930 delayed_refs->num_entries--; 5080 delayed_refs->num_entries--;
5081 if (waitqueue_active(&delayed_refs->seq_wait))
5082 wake_up(&delayed_refs->seq_wait);
4931 5083
4932 /* 5084 /*
4933 * we don't take a ref on the node because we're removing it from the 5085 * we don't take a ref on the node because we're removing it from the
@@ -4955,16 +5107,17 @@ out:
4955void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 5107void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4956 struct btrfs_root *root, 5108 struct btrfs_root *root,
4957 struct extent_buffer *buf, 5109 struct extent_buffer *buf,
4958 u64 parent, int last_ref) 5110 u64 parent, int last_ref, int for_cow)
4959{ 5111{
4960 struct btrfs_block_group_cache *cache = NULL; 5112 struct btrfs_block_group_cache *cache = NULL;
4961 int ret; 5113 int ret;
4962 5114
4963 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5115 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4964 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, 5116 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
4965 parent, root->root_key.objectid, 5117 buf->start, buf->len,
4966 btrfs_header_level(buf), 5118 parent, root->root_key.objectid,
4967 BTRFS_DROP_DELAYED_REF, NULL); 5119 btrfs_header_level(buf),
5120 BTRFS_DROP_DELAYED_REF, NULL, for_cow);
4968 BUG_ON(ret); 5121 BUG_ON(ret);
4969 } 5122 }
4970 5123
@@ -4999,12 +5152,12 @@ out:
4999 btrfs_put_block_group(cache); 5152 btrfs_put_block_group(cache);
5000} 5153}
5001 5154
5002int btrfs_free_extent(struct btrfs_trans_handle *trans, 5155int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5003 struct btrfs_root *root, 5156 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
5004 u64 bytenr, u64 num_bytes, u64 parent, 5157 u64 owner, u64 offset, int for_cow)
5005 u64 root_objectid, u64 owner, u64 offset)
5006{ 5158{
5007 int ret; 5159 int ret;
5160 struct btrfs_fs_info *fs_info = root->fs_info;
5008 5161
5009 /* 5162 /*
5010 * tree log blocks never actually go into the extent allocation 5163 * tree log blocks never actually go into the extent allocation
@@ -5016,14 +5169,17 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
5016 btrfs_pin_extent(root, bytenr, num_bytes, 1); 5169 btrfs_pin_extent(root, bytenr, num_bytes, 1);
5017 ret = 0; 5170 ret = 0;
5018 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 5171 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5019 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 5172 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
5173 num_bytes,
5020 parent, root_objectid, (int)owner, 5174 parent, root_objectid, (int)owner,
5021 BTRFS_DROP_DELAYED_REF, NULL); 5175 BTRFS_DROP_DELAYED_REF, NULL, for_cow);
5022 BUG_ON(ret); 5176 BUG_ON(ret);
5023 } else { 5177 } else {
5024 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 5178 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
5025 parent, root_objectid, owner, 5179 num_bytes,
5026 offset, BTRFS_DROP_DELAYED_REF, NULL); 5180 parent, root_objectid, owner,
5181 offset, BTRFS_DROP_DELAYED_REF,
5182 NULL, for_cow);
5027 BUG_ON(ret); 5183 BUG_ON(ret);
5028 } 5184 }
5029 return ret; 5185 return ret;
@@ -5146,6 +5302,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5146 ins->objectid = 0; 5302 ins->objectid = 0;
5147 ins->offset = 0; 5303 ins->offset = 0;
5148 5304
5305 trace_find_free_extent(orig_root, num_bytes, empty_size, data);
5306
5149 space_info = __find_space_info(root->fs_info, data); 5307 space_info = __find_space_info(root->fs_info, data);
5150 if (!space_info) { 5308 if (!space_info) {
5151 printk(KERN_ERR "No space info for %llu\n", data); 5309 printk(KERN_ERR "No space info for %llu\n", data);
@@ -5295,15 +5453,6 @@ alloc:
5295 if (unlikely(block_group->ro)) 5453 if (unlikely(block_group->ro))
5296 goto loop; 5454 goto loop;
5297 5455
5298 spin_lock(&block_group->free_space_ctl->tree_lock);
5299 if (cached &&
5300 block_group->free_space_ctl->free_space <
5301 num_bytes + empty_cluster + empty_size) {
5302 spin_unlock(&block_group->free_space_ctl->tree_lock);
5303 goto loop;
5304 }
5305 spin_unlock(&block_group->free_space_ctl->tree_lock);
5306
5307 /* 5456 /*
5308 * Ok we want to try and use the cluster allocator, so 5457 * Ok we want to try and use the cluster allocator, so
5309 * lets look there 5458 * lets look there
@@ -5331,6 +5480,8 @@ alloc:
5331 if (offset) { 5480 if (offset) {
5332 /* we have a block, we're done */ 5481 /* we have a block, we're done */
5333 spin_unlock(&last_ptr->refill_lock); 5482 spin_unlock(&last_ptr->refill_lock);
5483 trace_btrfs_reserve_extent_cluster(root,
5484 block_group, search_start, num_bytes);
5334 goto checks; 5485 goto checks;
5335 } 5486 }
5336 5487
@@ -5349,8 +5500,15 @@ refill_cluster:
5349 * plenty of times and not have found 5500 * plenty of times and not have found
5350 * anything, so we are likely way too 5501 * anything, so we are likely way too
5351 * fragmented for the clustering stuff to find 5502 * fragmented for the clustering stuff to find
5352 * anything. */ 5503 * anything.
5353 if (loop >= LOOP_NO_EMPTY_SIZE) { 5504 *
5505 * However, if the cluster is taken from the
5506 * current block group, release the cluster
5507 * first, so that we stand a better chance of
5508 * succeeding in the unclustered
5509 * allocation. */
5510 if (loop >= LOOP_NO_EMPTY_SIZE &&
5511 last_ptr->block_group != block_group) {
5354 spin_unlock(&last_ptr->refill_lock); 5512 spin_unlock(&last_ptr->refill_lock);
5355 goto unclustered_alloc; 5513 goto unclustered_alloc;
5356 } 5514 }
@@ -5361,6 +5519,11 @@ refill_cluster:
5361 */ 5519 */
5362 btrfs_return_cluster_to_free_space(NULL, last_ptr); 5520 btrfs_return_cluster_to_free_space(NULL, last_ptr);
5363 5521
5522 if (loop >= LOOP_NO_EMPTY_SIZE) {
5523 spin_unlock(&last_ptr->refill_lock);
5524 goto unclustered_alloc;
5525 }
5526
5364 /* allocate a cluster in this block group */ 5527 /* allocate a cluster in this block group */
5365 ret = btrfs_find_space_cluster(trans, root, 5528 ret = btrfs_find_space_cluster(trans, root,
5366 block_group, last_ptr, 5529 block_group, last_ptr,
@@ -5377,6 +5540,9 @@ refill_cluster:
5377 if (offset) { 5540 if (offset) {
5378 /* we found one, proceed */ 5541 /* we found one, proceed */
5379 spin_unlock(&last_ptr->refill_lock); 5542 spin_unlock(&last_ptr->refill_lock);
5543 trace_btrfs_reserve_extent_cluster(root,
5544 block_group, search_start,
5545 num_bytes);
5380 goto checks; 5546 goto checks;
5381 } 5547 }
5382 } else if (!cached && loop > LOOP_CACHING_NOWAIT 5548 } else if (!cached && loop > LOOP_CACHING_NOWAIT
@@ -5401,6 +5567,15 @@ refill_cluster:
5401 } 5567 }
5402 5568
5403unclustered_alloc: 5569unclustered_alloc:
5570 spin_lock(&block_group->free_space_ctl->tree_lock);
5571 if (cached &&
5572 block_group->free_space_ctl->free_space <
5573 num_bytes + empty_cluster + empty_size) {
5574 spin_unlock(&block_group->free_space_ctl->tree_lock);
5575 goto loop;
5576 }
5577 spin_unlock(&block_group->free_space_ctl->tree_lock);
5578
5404 offset = btrfs_find_space_for_alloc(block_group, search_start, 5579 offset = btrfs_find_space_for_alloc(block_group, search_start,
5405 num_bytes, empty_size); 5580 num_bytes, empty_size);
5406 /* 5581 /*
@@ -5438,9 +5613,6 @@ checks:
5438 goto loop; 5613 goto loop;
5439 } 5614 }
5440 5615
5441 ins->objectid = search_start;
5442 ins->offset = num_bytes;
5443
5444 if (offset < search_start) 5616 if (offset < search_start)
5445 btrfs_add_free_space(used_block_group, offset, 5617 btrfs_add_free_space(used_block_group, offset,
5446 search_start - offset); 5618 search_start - offset);
@@ -5457,6 +5629,8 @@ checks:
5457 ins->objectid = search_start; 5629 ins->objectid = search_start;
5458 ins->offset = num_bytes; 5630 ins->offset = num_bytes;
5459 5631
5632 trace_btrfs_reserve_extent(orig_root, block_group,
5633 search_start, num_bytes);
5460 if (offset < search_start) 5634 if (offset < search_start)
5461 btrfs_add_free_space(used_block_group, offset, 5635 btrfs_add_free_space(used_block_group, offset,
5462 search_start - offset); 5636 search_start - offset);
@@ -5621,6 +5795,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
5621 u64 search_end, struct btrfs_key *ins, 5795 u64 search_end, struct btrfs_key *ins,
5622 u64 data) 5796 u64 data)
5623{ 5797{
5798 bool final_tried = false;
5624 int ret; 5799 int ret;
5625 u64 search_start = 0; 5800 u64 search_start = 0;
5626 5801
@@ -5640,22 +5815,25 @@ again:
5640 search_start, search_end, hint_byte, 5815 search_start, search_end, hint_byte,
5641 ins, data); 5816 ins, data);
5642 5817
-	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
-		num_bytes = num_bytes >> 1;
-		num_bytes = num_bytes & ~(root->sectorsize - 1);
-		num_bytes = max(num_bytes, min_alloc_size);
-		do_chunk_alloc(trans, root->fs_info->extent_root,
-			       num_bytes, data, CHUNK_ALLOC_FORCE);
-		goto again;
-	}
-	if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
-		struct btrfs_space_info *sinfo;
-
-		sinfo = __find_space_info(root->fs_info, data);
-		printk(KERN_ERR "btrfs allocation failed flags %llu, "
-		       "wanted %llu\n", (unsigned long long)data,
-		       (unsigned long long)num_bytes);
-		dump_space_info(sinfo, num_bytes, 1);
+	if (ret == -ENOSPC) {
+		if (!final_tried) {
+			num_bytes = num_bytes >> 1;
+			num_bytes = num_bytes & ~(root->sectorsize - 1);
+			num_bytes = max(num_bytes, min_alloc_size);
+			do_chunk_alloc(trans, root->fs_info->extent_root,
+				       num_bytes, data, CHUNK_ALLOC_FORCE);
+			if (num_bytes == min_alloc_size)
+				final_tried = true;
+			goto again;
+		} else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+			struct btrfs_space_info *sinfo;
+
+			sinfo = __find_space_info(root->fs_info, data);
+			printk(KERN_ERR "btrfs allocation failed flags %llu, "
+			       "wanted %llu\n", (unsigned long long)data,
+			       (unsigned long long)num_bytes);
+			dump_space_info(sinfo, num_bytes, 1);
+		}
	}
5661 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); 5839 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
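The hunk above replaces an unbounded shrink-and-retry with an explicit terminating loop. A minimal user-space C sketch of that policy (a simplified model, not the kernel code; the constant -28 stands in for -ENOSPC and every attempt is faked to fail):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Model of the reworked -ENOSPC retry: halve the request (rounded
 * down to the sector size, floored at the minimum) and remember via
 * final_tried that the minimum-sized attempt is the last one, so the
 * loop is guaranteed to terminate. */
static int reserve_extent(uint64_t num_bytes, uint64_t min_alloc_size,
                          uint64_t sectorsize)
{
    bool final_tried = false;

    for (;;) {
        int ret = -28;    /* pretend every attempt returns -ENOSPC */

        printf("trying %llu bytes\n", (unsigned long long)num_bytes);
        if (ret != -28)
            return ret;
        if (final_tried)
            return ret;               /* minimum already attempted */

        num_bytes = (num_bytes >> 1) & ~(sectorsize - 1);
        if (num_bytes < min_alloc_size)
            num_bytes = min_alloc_size;
        if (num_bytes == min_alloc_size)
            final_tried = true;
    }
}

int main(void)
{
    reserve_extent(1 << 20, 4096, 4096);    /* 1 MiB down to 4 KiB */
    return 0;
}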
@@ -5842,9 +6020,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5842 6020
5843 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 6021 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
5844 6022
-	ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
-					 0, root_objectid, owner, offset,
-					 BTRFS_ADD_DELAYED_EXTENT, NULL);
+	ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
+					 ins->offset, 0,
+					 root_objectid, owner, offset,
+					 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
5848 return ret; 6027 return ret;
5849} 6028}
5850 6029
@@ -5997,10 +6176,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5997 return ERR_PTR(-ENOSPC); 6176 return ERR_PTR(-ENOSPC);
5998} 6177}
5999 6178
-static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
+			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
 {
 	block_rsv_add_bytes(block_rsv, blocksize, 0);
-	block_rsv_release_bytes(block_rsv, NULL, 0);
+	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
 }
6004} 6184}
6005 6185
6006/* 6186/*
@@ -6014,7 +6194,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6014 struct btrfs_root *root, u32 blocksize, 6194 struct btrfs_root *root, u32 blocksize,
6015 u64 parent, u64 root_objectid, 6195 u64 parent, u64 root_objectid,
6016 struct btrfs_disk_key *key, int level, 6196 struct btrfs_disk_key *key, int level,
-			u64 hint, u64 empty_size)
+			u64 hint, u64 empty_size, int for_cow)
6018{ 6198{
6019 struct btrfs_key ins; 6199 struct btrfs_key ins;
6020 struct btrfs_block_rsv *block_rsv; 6200 struct btrfs_block_rsv *block_rsv;
@@ -6030,7 +6210,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6030 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, 6210 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
6031 empty_size, hint, (u64)-1, &ins, 0); 6211 empty_size, hint, (u64)-1, &ins, 0);
6032 if (ret) { 6212 if (ret) {
-		unuse_block_rsv(block_rsv, blocksize);
+		unuse_block_rsv(root->fs_info, block_rsv, blocksize);
6034 return ERR_PTR(ret); 6214 return ERR_PTR(ret);
6035 } 6215 }
6036 6216
@@ -6058,10 +6238,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6058 extent_op->update_flags = 1; 6238 extent_op->update_flags = 1;
6059 extent_op->is_data = 0; 6239 extent_op->is_data = 0;
6060 6240
-		ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
-						 ins.offset, parent, root_objectid,
-						 level, BTRFS_ADD_DELAYED_EXTENT,
-						 extent_op);
+		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+						 ins.objectid,
+						 ins.offset, parent, root_objectid,
+						 level, BTRFS_ADD_DELAYED_EXTENT,
+						 extent_op, for_cow);
6065 BUG_ON(ret); 6246 BUG_ON(ret);
6066 } 6247 }
6067 return buf; 6248 return buf;
@@ -6078,6 +6259,7 @@ struct walk_control {
6078 int keep_locks; 6259 int keep_locks;
6079 int reada_slot; 6260 int reada_slot;
6080 int reada_count; 6261 int reada_count;
6262 int for_reloc;
6081}; 6263};
6082 6264
6083#define DROP_REFERENCE 1 6265#define DROP_REFERENCE 1
@@ -6216,9 +6398,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6216 /* wc->stage == UPDATE_BACKREF */ 6398 /* wc->stage == UPDATE_BACKREF */
6217 if (!(wc->flags[level] & flag)) { 6399 if (!(wc->flags[level] & flag)) {
6218 BUG_ON(!path->locks[level]); 6400 BUG_ON(!path->locks[level]);
-		ret = btrfs_inc_ref(trans, root, eb, 1);
+		ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
 		BUG_ON(ret);
-		ret = btrfs_dec_ref(trans, root, eb, 0);
+		ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
6222 BUG_ON(ret); 6404 BUG_ON(ret);
6223 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 6405 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6224 eb->len, flag, 0); 6406 eb->len, flag, 0);
@@ -6362,7 +6544,7 @@ skip:
6362 } 6544 }
6363 6545
 		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
-					root->root_key.objectid, level - 1, 0);
+					root->root_key.objectid, level - 1, 0, 0);
6366 BUG_ON(ret); 6548 BUG_ON(ret);
6367 } 6549 }
6368 btrfs_tree_unlock(next); 6550 btrfs_tree_unlock(next);
@@ -6436,9 +6618,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6436 if (wc->refs[level] == 1) { 6618 if (wc->refs[level] == 1) {
6437 if (level == 0) { 6619 if (level == 0) {
6438 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 6620 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
-				ret = btrfs_dec_ref(trans, root, eb, 1);
+				ret = btrfs_dec_ref(trans, root, eb, 1,
+						    wc->for_reloc);
 			else
-				ret = btrfs_dec_ref(trans, root, eb, 0);
+				ret = btrfs_dec_ref(trans, root, eb, 0,
+						    wc->for_reloc);
6442 BUG_ON(ret); 6626 BUG_ON(ret);
6443 } 6627 }
6444 /* make block locked assertion in clean_tree_block happy */ 6628 /* make block locked assertion in clean_tree_block happy */
@@ -6465,7 +6649,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6465 btrfs_header_owner(path->nodes[level + 1])); 6649 btrfs_header_owner(path->nodes[level + 1]));
6466 } 6650 }
6467 6651
-	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
+	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
6469out: 6653out:
6470 wc->refs[level] = 0; 6654 wc->refs[level] = 0;
6471 wc->flags[level] = 0; 6655 wc->flags[level] = 0;
@@ -6549,7 +6733,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6549 * blocks are properly updated. 6733 * blocks are properly updated.
6550 */ 6734 */
 void btrfs_drop_snapshot(struct btrfs_root *root,
-			 struct btrfs_block_rsv *block_rsv, int update_ref)
+			 struct btrfs_block_rsv *block_rsv, int update_ref,
+			 int for_reloc)
6553{ 6738{
6554 struct btrfs_path *path; 6739 struct btrfs_path *path;
6555 struct btrfs_trans_handle *trans; 6740 struct btrfs_trans_handle *trans;
@@ -6637,6 +6822,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
6637 wc->stage = DROP_REFERENCE; 6822 wc->stage = DROP_REFERENCE;
6638 wc->update_ref = update_ref; 6823 wc->update_ref = update_ref;
6639 wc->keep_locks = 0; 6824 wc->keep_locks = 0;
6825 wc->for_reloc = for_reloc;
6640 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 6826 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6641 6827
6642 while (1) { 6828 while (1) {
@@ -6721,6 +6907,7 @@ out:
6721 * drop subtree rooted at tree block 'node'. 6907 * drop subtree rooted at tree block 'node'.
6722 * 6908 *
6723 * NOTE: this function will unlock and release tree block 'node' 6909 * NOTE: this function will unlock and release tree block 'node'
6910 * only used by relocation code
6724 */ 6911 */
6725int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 6912int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6726 struct btrfs_root *root, 6913 struct btrfs_root *root,
@@ -6765,6 +6952,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6765 wc->stage = DROP_REFERENCE; 6952 wc->stage = DROP_REFERENCE;
6766 wc->update_ref = 0; 6953 wc->update_ref = 0;
6767 wc->keep_locks = 1; 6954 wc->keep_locks = 1;
6955 wc->for_reloc = 1;
6768 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 6956 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6769 6957
6770 while (1) { 6958 while (1) {
@@ -6792,6 +6980,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
6792 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 6980 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
6793 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 6981 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
6794 6982
6983 if (root->fs_info->balance_ctl) {
6984 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
6985 u64 tgt = 0;
6986
6987 /* pick restriper's target profile and return */
6988 if (flags & BTRFS_BLOCK_GROUP_DATA &&
6989 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6990 tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
6991 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
6992 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6993 tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
6994 } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
6995 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6996 tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
6997 }
6998
6999 if (tgt) {
7000 /* extended -> chunk profile */
7001 tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
7002 return tgt;
7003 }
7004 }
7005
6795 /* 7006 /*
6796 * we add in the count of missing devices because we want 7007 * we add in the count of missing devices because we want
6797 * to make sure that any RAID levels on a degraded FS 7008 * to make sure that any RAID levels on a degraded FS
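The preceding hunk short-circuits the degraded-RAID heuristics while a balance ("restriper") conversion is running. A user-space C model of that selection (the flag values below are invented for illustration; only the selection logic mirrors the hunk):

#include <stdint.h>
#include <stdio.h>

/* Illustrative flag values; the real BTRFS_* constants differ. */
#define BG_DATA      (1ULL << 0)
#define BG_SYSTEM    (1ULL << 1)
#define BG_METADATA  (1ULL << 2)
#define ARGS_CONVERT (1ULL << 8)
#define AVAIL_SINGLE (1ULL << 48)

struct balance_args { uint64_t flags, target; };
struct balance_ctl  { struct balance_args data, sys, meta; };

/* While a conversion is requested for this chunk type, new chunks
 * must use the conversion target.  The "extended" single bit is
 * masked off to map the value back to a chunk-format profile. */
static uint64_t pick_target(struct balance_ctl *bctl, uint64_t flags)
{
    uint64_t tgt = 0;

    if ((flags & BG_DATA) && (bctl->data.flags & ARGS_CONVERT))
        tgt = BG_DATA | bctl->data.target;
    else if ((flags & BG_SYSTEM) && (bctl->sys.flags & ARGS_CONVERT))
        tgt = BG_SYSTEM | bctl->sys.target;
    else if ((flags & BG_METADATA) && (bctl->meta.flags & ARGS_CONVERT))
        tgt = BG_METADATA | bctl->meta.target;

    return tgt & ~AVAIL_SINGLE;    /* extended -> chunk profile */
}

int main(void)
{
    struct balance_ctl bctl = { .data = { ARGS_CONVERT, 1ULL << 3 } };
    printf("0x%llx\n", (unsigned long long)pick_target(&bctl, BG_DATA));
    return 0;
}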
@@ -7085,7 +7296,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7085 * space to fit our block group in. 7296 * space to fit our block group in.
7086 */ 7297 */
7087 if (device->total_bytes > device->bytes_used + min_free) { 7298 if (device->total_bytes > device->bytes_used + min_free) {
-			ret = find_free_dev_extent(NULL, device, min_free,
+			ret = find_free_dev_extent(device, min_free,
 						   &dev_offset, NULL);
7090 if (!ret) 7301 if (!ret)
7091 dev_nr++; 7302 dev_nr++;
@@ -7447,6 +7658,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7447 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7658 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7448 &cache->space_info); 7659 &cache->space_info);
7449 BUG_ON(ret); 7660 BUG_ON(ret);
7661 update_global_block_rsv(root->fs_info);
7450 7662
7451 spin_lock(&cache->space_info->lock); 7663 spin_lock(&cache->space_info->lock);
7452 cache->space_info->bytes_readonly += cache->bytes_super; 7664 cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7466,6 +7678,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7466 return 0; 7678 return 0;
7467} 7679}
7468 7680
7681static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7682{
7683 u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
7684
7685 /* chunk -> extended profile */
7686 if (extra_flags == 0)
7687 extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
7688
7689 if (flags & BTRFS_BLOCK_GROUP_DATA)
7690 fs_info->avail_data_alloc_bits &= ~extra_flags;
7691 if (flags & BTRFS_BLOCK_GROUP_METADATA)
7692 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
7693 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
7694 fs_info->avail_system_alloc_bits &= ~extra_flags;
7695}
7696
7469int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 7697int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7470 struct btrfs_root *root, u64 group_start) 7698 struct btrfs_root *root, u64 group_start)
7471{ 7699{
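The new clear_avail_alloc_bits() above does the inverse mapping of the restriper hunk: a chunk-format profile of 0 means "single", which the extended format tracks as an explicit bit. A small C model (mask values are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

/* Illustrative values only; the kernel's BTRFS_* masks differ. */
#define PROFILE_MASK  0x00ffULL        /* RAID0/1/10/DUP bits  */
#define AVAIL_SINGLE  (1ULL << 48)     /* "single" extended bit */

/* When the last block group of a kind disappears, drop its profile
 * bit from the per-type availability mask; profile 0 is remapped to
 * the explicit "single" bit first (chunk -> extended profile). */
static void clear_avail(uint64_t *avail, uint64_t flags)
{
    uint64_t extra = flags & PROFILE_MASK;

    if (extra == 0)
        extra = AVAIL_SINGLE;

    *avail &= ~extra;
}

int main(void)
{
    uint64_t avail_data = AVAIL_SINGLE | 0x1;

    clear_avail(&avail_data, 0);    /* removing a "single" group */
    printf("avail=0x%llx\n", (unsigned long long)avail_data);
    return 0;
}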
@@ -7476,6 +7704,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7476 struct btrfs_key key; 7704 struct btrfs_key key;
7477 struct inode *inode; 7705 struct inode *inode;
7478 int ret; 7706 int ret;
7707 int index;
7479 int factor; 7708 int factor;
7480 7709
7481 root = root->fs_info->extent_root; 7710 root = root->fs_info->extent_root;
@@ -7491,6 +7720,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7491 free_excluded_extents(root, block_group); 7720 free_excluded_extents(root, block_group);
7492 7721
7493 memcpy(&key, &block_group->key, sizeof(key)); 7722 memcpy(&key, &block_group->key, sizeof(key));
7723 index = get_block_group_index(block_group);
7494 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 7724 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
7495 BTRFS_BLOCK_GROUP_RAID1 | 7725 BTRFS_BLOCK_GROUP_RAID1 |
7496 BTRFS_BLOCK_GROUP_RAID10)) 7726 BTRFS_BLOCK_GROUP_RAID10))
@@ -7565,6 +7795,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7565 * are still on the list after taking the semaphore 7795 * are still on the list after taking the semaphore
7566 */ 7796 */
7567 list_del_init(&block_group->list); 7797 list_del_init(&block_group->list);
7798 if (list_empty(&block_group->space_info->block_groups[index]))
7799 clear_avail_alloc_bits(root->fs_info, block_group->flags);
7568 up_write(&block_group->space_info->groups_sem); 7800 up_write(&block_group->space_info->groups_sem);
7569 7801
7570 if (block_group->cached == BTRFS_CACHE_STARTED) 7802 if (block_group->cached == BTRFS_CACHE_STARTED)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 49f3c9dc09f4..fcf77e1ded40 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -18,6 +18,7 @@
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h" 20#include "volumes.h"
21#include "check-integrity.h"
21 22
22static struct kmem_cache *extent_state_cache; 23static struct kmem_cache *extent_state_cache;
23static struct kmem_cache *extent_buffer_cache; 24static struct kmem_cache *extent_buffer_cache;
@@ -1895,7 +1896,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1895 } 1896 }
1896 bio->bi_bdev = dev->bdev; 1897 bio->bi_bdev = dev->bdev;
1897 bio_add_page(bio, page, length, start-page_offset(page)); 1898 bio_add_page(bio, page, length, start-page_offset(page));
-	submit_bio(WRITE_SYNC, bio);
+	btrfsic_submit_bio(WRITE_SYNC, bio);
1899 wait_for_completion(&compl); 1900 wait_for_completion(&compl);
1900 1901
1901 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 1902 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@ -2393,7 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
2393 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 2394 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2394 mirror_num, bio_flags, start); 2395 mirror_num, bio_flags, start);
2395 else 2396 else
-		submit_bio(rw, bio);
+		btrfsic_submit_bio(rw, bio);
2397 2398
2398 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2399 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2399 ret = -EOPNOTSUPP; 2400 ret = -EOPNOTSUPP;
@@ -3579,6 +3580,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3579 atomic_set(&eb->blocking_writers, 0); 3580 atomic_set(&eb->blocking_writers, 0);
3580 atomic_set(&eb->spinning_readers, 0); 3581 atomic_set(&eb->spinning_readers, 0);
3581 atomic_set(&eb->spinning_writers, 0); 3582 atomic_set(&eb->spinning_writers, 0);
3583 eb->lock_nested = 0;
3582 init_waitqueue_head(&eb->write_lock_wq); 3584 init_waitqueue_head(&eb->write_lock_wq);
3583 init_waitqueue_head(&eb->read_lock_wq); 3585 init_waitqueue_head(&eb->read_lock_wq);
3584 3586
@@ -3907,6 +3909,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,
3907 while (start <= end) { 3909 while (start <= end) {
3908 index = start >> PAGE_CACHE_SHIFT; 3910 index = start >> PAGE_CACHE_SHIFT;
3909 page = find_get_page(tree->mapping, index); 3911 page = find_get_page(tree->mapping, index);
3912 if (!page)
3913 return 1;
3910 uptodate = PageUptodate(page); 3914 uptodate = PageUptodate(page);
3911 page_cache_release(page); 3915 page_cache_release(page);
3912 if (!uptodate) { 3916 if (!uptodate) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7604c3001322..bc6a042cb6fc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -129,6 +129,7 @@ struct extent_buffer {
129 struct list_head leak_list; 129 struct list_head leak_list;
130 struct rcu_head rcu_head; 130 struct rcu_head rcu_head;
131 atomic_t refs; 131 atomic_t refs;
132 pid_t lock_owner;
132 133
133 /* count of read lock holders on the extent buffer */ 134 /* count of read lock holders on the extent buffer */
134 atomic_t write_locks; 135 atomic_t write_locks;
@@ -137,6 +138,7 @@ struct extent_buffer {
137 atomic_t blocking_readers; 138 atomic_t blocking_readers;
138 atomic_t spinning_readers; 139 atomic_t spinning_readers;
139 atomic_t spinning_writers; 140 atomic_t spinning_writers;
141 int lock_nested;
140 142
141 /* protects write locks */ 143 /* protects write locks */
142 rwlock_t lock; 144 rwlock_t lock;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 97fbe939c050..859ba2dd8890 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -678,7 +678,7 @@ next_slot:
678 disk_bytenr, num_bytes, 0, 678 disk_bytenr, num_bytes, 0,
679 root->root_key.objectid, 679 root->root_key.objectid,
680 new_key.objectid, 680 new_key.objectid,
-						start - extent_offset);
+						start - extent_offset, 0);
682 BUG_ON(ret); 682 BUG_ON(ret);
683 *hint_byte = disk_bytenr; 683 *hint_byte = disk_bytenr;
684 } 684 }
@@ -753,7 +753,7 @@ next_slot:
753 disk_bytenr, num_bytes, 0, 753 disk_bytenr, num_bytes, 0,
754 root->root_key.objectid, 754 root->root_key.objectid,
755 key.objectid, key.offset - 755 key.objectid, key.offset -
-						extent_offset);
+						extent_offset, 0);
757 BUG_ON(ret); 757 BUG_ON(ret);
758 inode_sub_bytes(inode, 758 inode_sub_bytes(inode,
759 extent_end - key.offset); 759 extent_end - key.offset);
@@ -962,7 +962,7 @@ again:
962 962
963 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, 963 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
964 root->root_key.objectid, 964 root->root_key.objectid,
-				   ino, orig_offset);
+				   ino, orig_offset, 0);
966 BUG_ON(ret); 966 BUG_ON(ret);
967 967
968 if (split == start) { 968 if (split == start) {
@@ -989,7 +989,7 @@ again:
989 del_nr++; 989 del_nr++;
990 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 990 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
991 0, root->root_key.objectid, 991 0, root->root_key.objectid,
-					0, root->root_key.objectid,
-					ino, orig_offset);
+					0, root->root_key.objectid,
+					ino, orig_offset, 0);
993 BUG_ON(ret); 993 BUG_ON(ret);
994 } 994 }
995 other_start = 0; 995 other_start = 0;
@@ -1006,7 +1006,7 @@ again:
1006 del_nr++; 1006 del_nr++;
1007 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1007 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1008 0, root->root_key.objectid, 1008 0, root->root_key.objectid,
-					0, root->root_key.objectid,
-					ino, orig_offset);
+					0, root->root_key.objectid,
+					ino, orig_offset, 0);
1010 BUG_ON(ret); 1010 BUG_ON(ret);
1011 } 1011 }
1012 if (del_nr == 0) { 1012 if (del_nr == 0) {
@@ -1081,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1081again: 1081again:
1082 for (i = 0; i < num_pages; i++) { 1082 for (i = 0; i < num_pages; i++) {
1083 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1083 pages[i] = find_or_create_page(inode->i_mapping, index + i,
-					       mask);
+					       mask | __GFP_WRITE);
1085 if (!pages[i]) { 1085 if (!pages[i]) {
1086 faili = i - 1; 1086 faili = i - 1;
1087 err = -ENOMEM; 1087 err = -ENOMEM;
@@ -1136,7 +1136,8 @@ again:
1136 GFP_NOFS); 1136 GFP_NOFS);
1137 } 1137 }
1138 for (i = 0; i < num_pages; i++) { 1138 for (i = 0; i < num_pages; i++) {
-		clear_page_dirty_for_io(pages[i]);
+		if (clear_page_dirty_for_io(pages[i]))
+			account_page_redirty(pages[i]);
1140 set_page_extent_mapped(pages[i]); 1141 set_page_extent_mapped(pages[i]);
1141 WARN_ON(!PageLocked(pages[i])); 1142 WARN_ON(!PageLocked(pages[i]));
1142 } 1143 }
@@ -1273,7 +1274,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1273 dirty_pages); 1274 dirty_pages);
1274 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1275 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1275 btrfs_btree_balance_dirty(root, 1); 1276 btrfs_btree_balance_dirty(root, 1);
1276 btrfs_throttle(root);
1277 1277
1278 pos += copied; 1278 pos += copied;
1279 num_written += copied; 1279 num_written += copied;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9a897bf79538..c2f20594c9f7 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -319,9 +319,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
319 io_ctl_unmap_page(io_ctl); 319 io_ctl_unmap_page(io_ctl);
320 320
321 for (i = 0; i < io_ctl->num_pages; i++) { 321 for (i = 0; i < io_ctl->num_pages; i++) {
-		ClearPageChecked(io_ctl->pages[i]);
-		unlock_page(io_ctl->pages[i]);
-		page_cache_release(io_ctl->pages[i]);
+		if (io_ctl->pages[i]) {
+			ClearPageChecked(io_ctl->pages[i]);
+			unlock_page(io_ctl->pages[i]);
+			page_cache_release(io_ctl->pages[i]);
+		}
325 } 327 }
326} 328}
327 329
@@ -635,7 +637,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
635 if (!num_entries) 637 if (!num_entries)
636 return 0; 638 return 0;
637 639
-	io_ctl_init(&io_ctl, inode, root);
+	ret = io_ctl_init(&io_ctl, inode, root);
+	if (ret)
+		return ret;
+
639 ret = readahead_cache(inode); 644 ret = readahead_cache(inode);
640 if (ret) 645 if (ret)
641 goto out; 646 goto out;
@@ -838,7 +843,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
838 struct io_ctl io_ctl; 843 struct io_ctl io_ctl;
839 struct list_head bitmap_list; 844 struct list_head bitmap_list;
840 struct btrfs_key key; 845 struct btrfs_key key;
-	u64 start, end, len;
+	u64 start, extent_start, extent_end, len;
842 int entries = 0; 847 int entries = 0;
843 int bitmaps = 0; 848 int bitmaps = 0;
844 int ret; 849 int ret;
@@ -849,7 +854,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
849 if (!i_size_read(inode)) 854 if (!i_size_read(inode))
850 return -1; 855 return -1;
851 856
-	io_ctl_init(&io_ctl, inode, root);
+	ret = io_ctl_init(&io_ctl, inode, root);
+	if (ret)
+		return -1;
853 860
854 /* Get the cluster for this block_group if it exists */ 861 /* Get the cluster for this block_group if it exists */
855 if (block_group && !list_empty(&block_group->cluster_list)) 862 if (block_group && !list_empty(&block_group->cluster_list))
@@ -857,25 +864,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
857 struct btrfs_free_cluster, 864 struct btrfs_free_cluster,
858 block_group_list); 865 block_group_list);
859 866
860 /*
861 * We shouldn't have switched the pinned extents yet so this is the
862 * right one
863 */
864 unpin = root->fs_info->pinned_extents;
865
866 /* Lock all pages first so we can lock the extent safely. */ 867 /* Lock all pages first so we can lock the extent safely. */
867 io_ctl_prepare_pages(&io_ctl, inode, 0); 868 io_ctl_prepare_pages(&io_ctl, inode, 0);
868 869
869 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 870 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
870 0, &cached_state, GFP_NOFS); 871 0, &cached_state, GFP_NOFS);
871 872
872 /*
873 * When searching for pinned extents, we need to start at our start
874 * offset.
875 */
876 if (block_group)
877 start = block_group->key.objectid;
878
879 node = rb_first(&ctl->free_space_offset); 873 node = rb_first(&ctl->free_space_offset);
880 if (!node && cluster) { 874 if (!node && cluster) {
881 node = rb_first(&cluster->root); 875 node = rb_first(&cluster->root);
@@ -918,9 +912,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
918 * We want to add any pinned extents to our free space cache 912 * We want to add any pinned extents to our free space cache
919 * so we don't leak the space 913 * so we don't leak the space
920 */ 914 */
915
916 /*
917 * We shouldn't have switched the pinned extents yet so this is the
918 * right one
919 */
920 unpin = root->fs_info->pinned_extents;
921
922 if (block_group)
923 start = block_group->key.objectid;
924
921 while (block_group && (start < block_group->key.objectid + 925 while (block_group && (start < block_group->key.objectid +
922 block_group->key.offset)) { 926 block_group->key.offset)) {
-		ret = find_first_extent_bit(unpin, start, &start, &end,
+		ret = find_first_extent_bit(unpin, start,
+					    &extent_start, &extent_end,
 					    EXTENT_DIRTY);
925 if (ret) { 930 if (ret) {
926 ret = 0; 931 ret = 0;
@@ -928,20 +933,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
928 } 933 }
929 934
930 /* This pinned extent is out of our range */ 935 /* This pinned extent is out of our range */
-		if (start >= block_group->key.objectid +
+		if (extent_start >= block_group->key.objectid +
 		    block_group->key.offset)
 			break;
 
-		len = block_group->key.objectid +
-		      block_group->key.offset - start;
-		len = min(len, end + 1 - start);
+		extent_start = max(extent_start, start);
+		extent_end = min(block_group->key.objectid +
+				 block_group->key.offset, extent_end + 1);
+		len = extent_end - extent_start;
 
 		entries++;
-		ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
+		ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
 		if (ret)
 			goto out_nospc;
 
-		start = end + 1;
+		start = extent_end;
945 } 951 }
946 952
947 /* Write out the bitmaps */ 953 /* Write out the bitmaps */
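The hunk above fixes the clamping of pinned extents against the block group. A self-contained C sketch of just that arithmetic (a model of the fix, not the kernel code; the helper name is invented):

#include <stdint.h>
#include <stdio.h>

static uint64_t max64(uint64_t a, uint64_t b) { return a > b ? a : b; }
static uint64_t min64(uint64_t a, uint64_t b) { return a < b ? a : b; }

/* A pinned extent returned by the search may begin before the block
 * group (or before the current cursor) and may end past it, so both
 * ends are clamped before the entry is written out.  Previously the
 * unclamped start leaked foreign ranges into the cache. */
static uint64_t clamp_pinned(uint64_t bg_start, uint64_t bg_len,
                             uint64_t cursor,
                             uint64_t extent_start, uint64_t extent_end)
{
    uint64_t start = max64(extent_start, cursor);
    uint64_t end = min64(bg_start + bg_len, extent_end + 1);

    return end > start ? end - start : 0;    /* bytes to record */
}

int main(void)
{
    /* extent [90,140] against block group [100,164): 41 bytes */
    printf("%llu\n", (unsigned long long)
           clamp_pinned(100, 64, 100, 90, 140));
    return 0;
}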
@@ -2236,7 +2242,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2236 if (entry->bitmap) { 2242 if (entry->bitmap) {
2237 ret = btrfs_alloc_from_bitmap(block_group, 2243 ret = btrfs_alloc_from_bitmap(block_group,
2238 cluster, entry, bytes, 2244 cluster, entry, bytes,
-						      min_start);
+						      cluster->window_start);
2240 if (ret == 0) { 2246 if (ret == 0) {
2241 node = rb_next(&entry->offset_index); 2247 node = rb_next(&entry->offset_index);
2242 if (!node) 2248 if (!node)
@@ -2245,6 +2251,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2245 offset_index); 2251 offset_index);
2246 continue; 2252 continue;
2247 } 2253 }
2254 cluster->window_start += bytes;
2248 } else { 2255 } else {
2249 ret = entry->offset; 2256 ret = entry->offset;
2250 2257
@@ -2283,23 +2290,23 @@ out:
2283static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, 2290static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2284 struct btrfs_free_space *entry, 2291 struct btrfs_free_space *entry,
2285 struct btrfs_free_cluster *cluster, 2292 struct btrfs_free_cluster *cluster,
-			       u64 offset, u64 bytes, u64 min_bytes)
+			       u64 offset, u64 bytes,
+			       u64 cont1_bytes, u64 min_bytes)
2287{ 2295{
2288 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2296 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2289 unsigned long next_zero; 2297 unsigned long next_zero;
2290 unsigned long i; 2298 unsigned long i;
-	unsigned long search_bits;
-	unsigned long total_bits;
+	unsigned long want_bits;
+	unsigned long min_bits;
2293 unsigned long found_bits; 2301 unsigned long found_bits;
2294 unsigned long start = 0; 2302 unsigned long start = 0;
2295 unsigned long total_found = 0; 2303 unsigned long total_found = 0;
2296 int ret; 2304 int ret;
2297 bool found = false;
2298 2305
2299 i = offset_to_bit(entry->offset, block_group->sectorsize, 2306 i = offset_to_bit(entry->offset, block_group->sectorsize,
2300 max_t(u64, offset, entry->offset)); 2307 max_t(u64, offset, entry->offset));
-	search_bits = bytes_to_bits(bytes, block_group->sectorsize);
-	total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+	want_bits = bytes_to_bits(bytes, block_group->sectorsize);
+	min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
2303 2310
2304again: 2311again:
2305 found_bits = 0; 2312 found_bits = 0;
@@ -2308,7 +2315,7 @@ again:
2308 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { 2315 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
2309 next_zero = find_next_zero_bit(entry->bitmap, 2316 next_zero = find_next_zero_bit(entry->bitmap,
2310 BITS_PER_BITMAP, i); 2317 BITS_PER_BITMAP, i);
-		if (next_zero - i >= search_bits) {
+		if (next_zero - i >= min_bits) {
2312 found_bits = next_zero - i; 2319 found_bits = next_zero - i;
2313 break; 2320 break;
2314 } 2321 }
@@ -2318,10 +2325,9 @@ again:
2318 if (!found_bits) 2325 if (!found_bits)
2319 return -ENOSPC; 2326 return -ENOSPC;
2320 2327
-	if (!found) {
+	if (!total_found) {
2322 start = i; 2329 start = i;
2323 cluster->max_size = 0; 2330 cluster->max_size = 0;
2324 found = true;
2325 } 2331 }
2326 2332
2327 total_found += found_bits; 2333 total_found += found_bits;
@@ -2329,13 +2335,8 @@ again:
2329 if (cluster->max_size < found_bits * block_group->sectorsize) 2335 if (cluster->max_size < found_bits * block_group->sectorsize)
2330 cluster->max_size = found_bits * block_group->sectorsize; 2336 cluster->max_size = found_bits * block_group->sectorsize;
2331 2337
-	if (total_found < total_bits) {
-		i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
-		if (i - start > total_bits * 2) {
-			total_found = 0;
-			cluster->max_size = 0;
-			found = false;
-		}
+	if (total_found < want_bits || cluster->max_size < cont1_bytes) {
+		i = next_zero + 1;
 		goto again;
2340 } 2341 }
2341 2342
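The reworked bitmap scan above accumulates runs of set bits until it has both enough total space and one sufficiently long run. A toy C model of that loop (a plain uint64_t stands in for the kernel bitmap and find_next_bit; all names are illustrative):

#include <stdint.h>
#include <stdio.h>

#define NBITS 64

/* Count only runs of at least min_bits, resume from next_zero + 1,
 * and succeed once total bits reach want_bits AND the largest run
 * covers cont1_bits. */
static int scan(uint64_t map, unsigned want_bits, unsigned min_bits,
                unsigned cont1_bits)
{
    unsigned i = 0, total = 0, max_run = 0;

    while (i < NBITS) {
        unsigned run;

        while (i < NBITS && !((map >> i) & 1))
            i++;                        /* find_next_bit */
        run = 0;
        while (i + run < NBITS && ((map >> (i + run)) & 1))
            run++;                      /* run ends at next_zero */
        if (run >= min_bits) {
            total += run;
            if (run > max_run)
                max_run = run;
        }
        i += run + 1;                   /* resume at next_zero + 1 */
        if (total >= want_bits && max_run >= cont1_bits)
            return 0;                   /* cluster found */
    }
    return -1;                          /* -ENOSPC in the real code */
}

int main(void)
{
    /* two runs of 4 set bits and one run of 8 */
    uint64_t map = 0xFULL | (0xFULL << 10) | (0xFFULL << 30);
    printf("%d\n", scan(map, 12, 4, 8));
    return 0;
}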
@@ -2346,28 +2347,31 @@ again:
2346 &entry->offset_index, 1); 2347 &entry->offset_index, 1);
2347 BUG_ON(ret); 2348 BUG_ON(ret);
2348 2349
2350 trace_btrfs_setup_cluster(block_group, cluster,
2351 total_found * block_group->sectorsize, 1);
2349 return 0; 2352 return 0;
2350} 2353}
2351 2354
2352/* 2355/*
2353 * This searches the block group for just extents to fill the cluster with. 2356 * This searches the block group for just extents to fill the cluster with.
2357 * Try to find a cluster with at least bytes total bytes, at least one
2358 * extent of cont1_bytes, and other clusters of at least min_bytes.
2354 */ 2359 */
2355static noinline int 2360static noinline int
2356setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, 2361setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2357 struct btrfs_free_cluster *cluster, 2362 struct btrfs_free_cluster *cluster,
2358 struct list_head *bitmaps, u64 offset, u64 bytes, 2363 struct list_head *bitmaps, u64 offset, u64 bytes,
-			u64 min_bytes)
+			u64 cont1_bytes, u64 min_bytes)
2360{ 2365{
2361 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2366 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2362 struct btrfs_free_space *first = NULL; 2367 struct btrfs_free_space *first = NULL;
2363 struct btrfs_free_space *entry = NULL; 2368 struct btrfs_free_space *entry = NULL;
2364 struct btrfs_free_space *prev = NULL;
2365 struct btrfs_free_space *last; 2369 struct btrfs_free_space *last;
2366 struct rb_node *node; 2370 struct rb_node *node;
2367 u64 window_start; 2371 u64 window_start;
2368 u64 window_free; 2372 u64 window_free;
2369 u64 max_extent; 2373 u64 max_extent;
-	u64 max_gap = 128 * 1024;
+	u64 total_size = 0;
2371 2375
2372 entry = tree_search_offset(ctl, offset, 0, 1); 2376 entry = tree_search_offset(ctl, offset, 0, 1);
2373 if (!entry) 2377 if (!entry)
@@ -2377,8 +2381,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2377 * We don't want bitmaps, so just move along until we find a normal 2381 * We don't want bitmaps, so just move along until we find a normal
2378 * extent entry. 2382 * extent entry.
2379 */ 2383 */
-	while (entry->bitmap) {
-		if (list_empty(&entry->list))
+	while (entry->bitmap || entry->bytes < min_bytes) {
+		if (entry->bitmap && list_empty(&entry->list))
2382 list_add_tail(&entry->list, bitmaps); 2386 list_add_tail(&entry->list, bitmaps);
2383 node = rb_next(&entry->offset_index); 2387 node = rb_next(&entry->offset_index);
2384 if (!node) 2388 if (!node)
@@ -2391,12 +2395,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2391 max_extent = entry->bytes; 2395 max_extent = entry->bytes;
2392 first = entry; 2396 first = entry;
2393 last = entry; 2397 last = entry;
2394 prev = entry;
2395 2398
-	while (window_free <= min_bytes) {
-		node = rb_next(&entry->offset_index);
-		if (!node)
-			return -ENOSPC;
+	for (node = rb_next(&entry->offset_index); node;
+	     node = rb_next(&entry->offset_index)) {
2400 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2401 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2401 2402
2402 if (entry->bitmap) { 2403 if (entry->bitmap) {
@@ -2405,26 +2406,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2405 continue; 2406 continue;
2406 } 2407 }
2407 2408
-		/*
-		 * we haven't filled the empty size and the window is
-		 * very large. reset and try again
-		 */
-		if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
-		    entry->offset - window_start > (min_bytes * 2)) {
-			first = entry;
-			window_start = entry->offset;
-			window_free = entry->bytes;
-			last = entry;
-			max_extent = entry->bytes;
-		} else {
-			last = entry;
-			window_free += entry->bytes;
-			if (entry->bytes > max_extent)
-				max_extent = entry->bytes;
-		}
-		prev = entry;
+		if (entry->bytes < min_bytes)
+			continue;
+
+		last = entry;
+		window_free += entry->bytes;
+		if (entry->bytes > max_extent)
+			max_extent = entry->bytes;
 	}
 
+	if (window_free < bytes || max_extent < cont1_bytes)
+		return -ENOSPC;
+
2428 cluster->window_start = first->offset; 2421 cluster->window_start = first->offset;
2429 2422
2430 node = &first->offset_index; 2423 node = &first->offset_index;
@@ -2438,17 +2431,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2438 2431
2439 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2432 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2440 node = rb_next(&entry->offset_index); 2433 node = rb_next(&entry->offset_index);
-		if (entry->bitmap)
+		if (entry->bitmap || entry->bytes < min_bytes)
2442 continue; 2435 continue;
2443 2436
2444 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2437 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2445 ret = tree_insert_offset(&cluster->root, entry->offset, 2438 ret = tree_insert_offset(&cluster->root, entry->offset,
2446 &entry->offset_index, 0); 2439 &entry->offset_index, 0);
2440 total_size += entry->bytes;
2447 BUG_ON(ret); 2441 BUG_ON(ret);
2448 } while (node && entry != last); 2442 } while (node && entry != last);
2449 2443
 	cluster->max_size = max_extent;
-
+	trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
2452 return 0; 2446 return 0;
2453} 2447}
2454 2448
@@ -2460,7 +2454,7 @@ static noinline int
2460setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, 2454setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2461 struct btrfs_free_cluster *cluster, 2455 struct btrfs_free_cluster *cluster,
2462 struct list_head *bitmaps, u64 offset, u64 bytes, 2456 struct list_head *bitmaps, u64 offset, u64 bytes,
-		     u64 min_bytes)
+		     u64 cont1_bytes, u64 min_bytes)
2464{ 2458{
2465 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2459 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2466 struct btrfs_free_space *entry; 2460 struct btrfs_free_space *entry;
@@ -2482,10 +2476,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2482 } 2476 }
2483 2477
2484 list_for_each_entry(entry, bitmaps, list) { 2478 list_for_each_entry(entry, bitmaps, list) {
-		if (entry->bytes < min_bytes)
+		if (entry->bytes < bytes)
2486 continue; 2480 continue;
2487 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 2481 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
-					   bytes, min_bytes);
+					   bytes, cont1_bytes, min_bytes);
2489 if (!ret) 2483 if (!ret)
2490 return 0; 2484 return 0;
2491 } 2485 }
@@ -2499,7 +2493,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2499 2493
2500/* 2494/*
2501 * here we try to find a cluster of blocks in a block group. The goal 2495 * here we try to find a cluster of blocks in a block group. The goal
- * is to find at least bytes free and up to empty_size + bytes free.
+ * is to find at least bytes+empty_size.
2503 * We might not find them all in one contiguous area. 2497 * We might not find them all in one contiguous area.
2504 * 2498 *
2505 * returns zero and sets up cluster if things worked out, otherwise 2499 * returns zero and sets up cluster if things worked out, otherwise
@@ -2515,23 +2509,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2515 struct btrfs_free_space *entry, *tmp; 2509 struct btrfs_free_space *entry, *tmp;
2516 LIST_HEAD(bitmaps); 2510 LIST_HEAD(bitmaps);
2517 u64 min_bytes; 2511 u64 min_bytes;
2512 u64 cont1_bytes;
2518 int ret; 2513 int ret;
2519 2514
-	/* for metadata, allow allocates with more holes */
+	/*
+	 * Choose the minimum extent size we'll require for this
+	 * cluster.  For SSD_SPREAD, don't allow any fragmentation.
+	 * For metadata, allow allocates with smaller extents.  For
+	 * data, keep it dense.
+	 */
 	if (btrfs_test_opt(root, SSD_SPREAD)) {
-		min_bytes = bytes + empty_size;
+		cont1_bytes = min_bytes = bytes + empty_size;
 	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
-		/*
-		 * we want to do larger allocations when we are
-		 * flushing out the delayed refs, it helps prevent
-		 * making more work as we go along.
-		 */
-		if (trans->transaction->delayed_refs.flushing)
-			min_bytes = max(bytes, (bytes + empty_size) >> 1);
-		else
-			min_bytes = max(bytes, (bytes + empty_size) >> 4);
-	} else
-		min_bytes = max(bytes, (bytes + empty_size) >> 2);
+		cont1_bytes = bytes;
+		min_bytes = block_group->sectorsize;
+	} else {
+		cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
+		min_bytes = block_group->sectorsize;
+	}
2535 2530
2536 spin_lock(&ctl->tree_lock); 2531 spin_lock(&ctl->tree_lock);
2537 2532
@@ -2539,7 +2534,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2539 * If we know we don't have enough space to make a cluster don't even 2534 * If we know we don't have enough space to make a cluster don't even
2540 * bother doing all the work to try and find one. 2535 * bother doing all the work to try and find one.
2541 */ 2536 */
-	if (ctl->free_space < min_bytes) {
+	if (ctl->free_space < bytes) {
2543 spin_unlock(&ctl->tree_lock); 2538 spin_unlock(&ctl->tree_lock);
2544 return -ENOSPC; 2539 return -ENOSPC;
2545 } 2540 }
@@ -2552,11 +2547,17 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2552 goto out; 2547 goto out;
2553 } 2548 }
2554 2549
2550 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
2551 min_bytes);
2552
2553 INIT_LIST_HEAD(&bitmaps);
2555 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2554 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
-				      bytes, min_bytes);
+				      bytes + empty_size,
+				      cont1_bytes, min_bytes);
2557 if (ret) 2557 if (ret)
2558 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 2558 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
-					   offset, bytes, min_bytes);
+					   offset, bytes + empty_size,
+					   cont1_bytes, min_bytes);
2560 2561
2561 /* Clear our temporary list */ 2562 /* Clear our temporary list */
2562 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 2563 list_for_each_entry_safe(entry, tmp, &bitmaps, list)
@@ -2567,6 +2568,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2567 list_add_tail(&cluster->block_group_list, 2568 list_add_tail(&cluster->block_group_list,
2568 &block_group->cluster_list); 2569 &block_group->cluster_list);
2569 cluster->block_group = block_group; 2570 cluster->block_group = block_group;
2571 } else {
2572 trace_btrfs_failed_cluster_setup(block_group);
2570 } 2573 }
2571out: 2574out:
2572 spin_unlock(&cluster->lock); 2575 spin_unlock(&cluster->lock);
@@ -2588,17 +2591,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2588 cluster->block_group = NULL; 2591 cluster->block_group = NULL;
2589} 2592}
2590 2593
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
-			   u64 *trimmed, u64 start, u64 end, u64 minlen)
+static int do_trimming(struct btrfs_block_group_cache *block_group,
+		       u64 *total_trimmed, u64 start, u64 bytes,
+		       u64 reserved_start, u64 reserved_bytes)
 {
-	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
-	struct btrfs_free_space *entry = NULL;
+	struct btrfs_space_info *space_info = block_group->space_info;
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
-	u64 bytes = 0;
-	u64 actually_trimmed;
-	int ret = 0;
+	int ret;
+	int update = 0;
+	u64 trimmed = 0;
 
-	*trimmed = 0;
+	spin_lock(&space_info->lock);
+	spin_lock(&block_group->lock);
+	if (!block_group->ro) {
+		block_group->reserved += reserved_bytes;
+		space_info->bytes_reserved += reserved_bytes;
+		update = 1;
+	}
+	spin_unlock(&block_group->lock);
+	spin_unlock(&space_info->lock);
+
+	ret = btrfs_error_discard_extent(fs_info->extent_root,
+					 start, bytes, &trimmed);
+	if (!ret)
+		*total_trimmed += trimmed;
+
+	btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+
+	if (update) {
+		spin_lock(&space_info->lock);
+		spin_lock(&block_group->lock);
+		if (block_group->ro)
+			space_info->bytes_readonly += reserved_bytes;
+		block_group->reserved -= reserved_bytes;
+		space_info->bytes_reserved -= reserved_bytes;
+		spin_unlock(&space_info->lock);
+		spin_unlock(&block_group->lock);
+	}
+
+	return ret;
+}
+
+static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
+			  u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *entry;
+	struct rb_node *node;
+	int ret = 0;
+	u64 extent_start;
+	u64 extent_bytes;
+	u64 bytes;
 
2603 while (start < end) { 2646 while (start < end) {
2604 spin_lock(&ctl->tree_lock); 2647 spin_lock(&ctl->tree_lock);
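The new do_trimming() above follows a reserve / discard / return / unreserve protocol. A single-threaded C model of that sequence (locks elided; the struct and helper are invented for the sketch):

#include <stdio.h>
#include <stdint.h>

struct group { uint64_t reserved, free; int ro; };

/* Step 1: account the range as reserved so concurrent allocators
 * keep off it.  Step 2: issue the discard outside any lock.
 * Step 3: hand the space back to the free-space tree.  Step 4:
 * drop the temporary reservation. */
static int do_trim(struct group *g, uint64_t *total,
                   uint64_t start, uint64_t bytes)
{
    int update = 0;

    if (!g->ro) {                       /* step 1 */
        g->reserved += bytes;
        update = 1;
    }

    printf("discard [%llu, +%llu)\n",   /* step 2 */
           (unsigned long long)start, (unsigned long long)bytes);
    *total += bytes;

    g->free += bytes;                   /* step 3 */

    if (update)                         /* step 4 */
        g->reserved -= bytes;
    return 0;
}

int main(void)
{
    struct group g = { 0, 0, 0 };
    uint64_t trimmed = 0;

    do_trim(&g, &trimmed, 4096, 8192);
    printf("trimmed=%llu\n", (unsigned long long)trimmed);
    return 0;
}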
@@ -2609,81 +2652,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 		}
 
 		entry = tree_search_offset(ctl, start, 0, 1);
-		if (!entry)
-			entry = tree_search_offset(ctl,
-						   offset_to_bitmap(ctl, start),
-						   1, 1);
-
-		if (!entry || entry->offset >= end) {
+		if (!entry) {
 			spin_unlock(&ctl->tree_lock);
 			break;
 		}
 
-		if (entry->bitmap) {
-			ret = search_bitmap(ctl, entry, &start, &bytes);
-			if (!ret) {
-				if (start >= end) {
-					spin_unlock(&ctl->tree_lock);
-					break;
-				}
-				bytes = min(bytes, end - start);
-				bitmap_clear_bits(ctl, entry, start, bytes);
-				if (entry->bytes == 0)
-					free_bitmap(ctl, entry);
-			} else {
-				start = entry->offset + BITS_PER_BITMAP *
-					block_group->sectorsize;
-				spin_unlock(&ctl->tree_lock);
-				ret = 0;
-				continue;
+		/* skip bitmaps */
+		while (entry->bitmap) {
+			node = rb_next(&entry->offset_index);
+			if (!node) {
+				spin_unlock(&ctl->tree_lock);
+				goto out;
 			}
-		} else {
-			start = entry->offset;
-			bytes = min(entry->bytes, end - start);
-			unlink_free_space(ctl, entry);
-			kmem_cache_free(btrfs_free_space_cachep, entry);
+			entry = rb_entry(node, struct btrfs_free_space,
+					 offset_index);
 		}
 
+		if (entry->offset >= end) {
+			spin_unlock(&ctl->tree_lock);
+			break;
+		}
+
+		extent_start = entry->offset;
+		extent_bytes = entry->bytes;
+		start = max(start, extent_start);
+		bytes = min(extent_start + extent_bytes, end) - start;
+		if (bytes < minlen) {
+			spin_unlock(&ctl->tree_lock);
+			goto next;
+		}
+
+		unlink_free_space(ctl, entry);
+		kmem_cache_free(btrfs_free_space_cachep, entry);
+
 		spin_unlock(&ctl->tree_lock);
 
-		if (bytes >= minlen) {
-			struct btrfs_space_info *space_info;
-			int update = 0;
-
-			space_info = block_group->space_info;
-			spin_lock(&space_info->lock);
-			spin_lock(&block_group->lock);
-			if (!block_group->ro) {
-				block_group->reserved += bytes;
-				space_info->bytes_reserved += bytes;
-				update = 1;
-			}
-			spin_unlock(&block_group->lock);
-			spin_unlock(&space_info->lock);
-
-			ret = btrfs_error_discard_extent(fs_info->extent_root,
-							 start,
-							 bytes,
-							 &actually_trimmed);
-
-			btrfs_add_free_space(block_group, start, bytes);
-			if (update) {
-				spin_lock(&space_info->lock);
-				spin_lock(&block_group->lock);
-				if (block_group->ro)
-					space_info->bytes_readonly += bytes;
-				block_group->reserved -= bytes;
-				space_info->bytes_reserved -= bytes;
-				spin_unlock(&space_info->lock);
-				spin_unlock(&block_group->lock);
-			}
-
-			if (ret)
-				break;
-			*trimmed += actually_trimmed;
+		ret = do_trimming(block_group, total_trimmed, start, bytes,
+				  extent_start, extent_bytes);
+		if (ret)
+			break;
+next:
+		start += bytes;
+
+		if (fatal_signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		cond_resched();
+	}
+out:
+	return ret;
+}
+
+static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
+			u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *entry;
+	int ret = 0;
+	int ret2;
+	u64 bytes;
+	u64 offset = offset_to_bitmap(ctl, start);
+
+	while (offset < end) {
+		bool next_bitmap = false;
+
+		spin_lock(&ctl->tree_lock);
+
+		if (ctl->free_space < minlen) {
+			spin_unlock(&ctl->tree_lock);
+			break;
+		}
+
+		entry = tree_search_offset(ctl, offset, 1, 0);
+		if (!entry) {
+			spin_unlock(&ctl->tree_lock);
+			next_bitmap = true;
+			goto next;
+		}
+
+		bytes = minlen;
+		ret2 = search_bitmap(ctl, entry, &start, &bytes);
+		if (ret2 || start >= end) {
+			spin_unlock(&ctl->tree_lock);
+			next_bitmap = true;
+			goto next;
+		}
+
+		bytes = min(bytes, end - start);
+		if (bytes < minlen) {
+			spin_unlock(&ctl->tree_lock);
+			goto next;
+		}
+
+		bitmap_clear_bits(ctl, entry, start, bytes);
+		if (entry->bytes == 0)
+			free_bitmap(ctl, entry);
+
+		spin_unlock(&ctl->tree_lock);
+
+		ret = do_trimming(block_group, total_trimmed, start, bytes,
+				  start, bytes);
+		if (ret)
+			break;
+next:
+		if (next_bitmap) {
+			offset += BITS_PER_BITMAP * ctl->unit;
+		} else {
+			start += bytes;
+			if (start >= offset + BITS_PER_BITMAP * ctl->unit)
+				offset += BITS_PER_BITMAP * ctl->unit;
 		}
-		start += bytes;
-		bytes = 0;
 
 		if (fatal_signal_pending(current)) {
 			ret = -ERESTARTSYS;
@@ -2696,6 +2776,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2696 return ret; 2776 return ret;
2697} 2777}
2698 2778
2779int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2780 u64 *trimmed, u64 start, u64 end, u64 minlen)
2781{
2782 int ret;
2783
2784 *trimmed = 0;
2785
2786 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
2787 if (ret)
2788 return ret;
2789
2790 ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
2791
2792 return ret;
2793}
2794
2699/* 2795/*
2700 * Find the left-most item in the cache tree, and then return the 2796 * Find the left-most item in the cache tree, and then return the
2701 * smallest inode number in the item. 2797 * smallest inode number in the item.
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index f8962a957d65..213ffa86ce1b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -438,6 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
438 trans->bytes_reserved); 438 trans->bytes_reserved);
439 if (ret) 439 if (ret)
440 goto out; 440 goto out;
441 trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
442 trans->bytes_reserved, 1);
441again: 443again:
442 inode = lookup_free_ino_inode(root, path); 444 inode = lookup_free_ino_inode(root, path);
443 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 445 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -498,6 +500,8 @@ again:
498out_put: 500out_put:
499 iput(inode); 501 iput(inode);
500out_release: 502out_release:
503 trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
504 trans->bytes_reserved, 0);
501 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 505 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
502out: 506out:
503 trans->block_rsv = rsv; 507 trans->block_rsv = rsv;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 81b235a61f8c..32214fe0f7e3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1951,12 +1951,28 @@ enum btrfs_orphan_cleanup_state {
1951void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 1951void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
1952 struct btrfs_root *root) 1952 struct btrfs_root *root)
1953{ 1953{
1954 struct btrfs_block_rsv *block_rsv;
1954 int ret; 1955 int ret;
1955 1956
1956 if (!list_empty(&root->orphan_list) || 1957 if (!list_empty(&root->orphan_list) ||
1957 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 1958 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
1958 return; 1959 return;
1959 1960
1961 spin_lock(&root->orphan_lock);
1962 if (!list_empty(&root->orphan_list)) {
1963 spin_unlock(&root->orphan_lock);
1964 return;
1965 }
1966
1967 if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
1968 spin_unlock(&root->orphan_lock);
1969 return;
1970 }
1971
1972 block_rsv = root->orphan_block_rsv;
1973 root->orphan_block_rsv = NULL;
1974 spin_unlock(&root->orphan_lock);
1975
1960 if (root->orphan_item_inserted && 1976 if (root->orphan_item_inserted &&
1961 btrfs_root_refs(&root->root_item) > 0) { 1977 btrfs_root_refs(&root->root_item) > 0) {
1962 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, 1978 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
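The orphan-commit hunk above applies a classic detach-under-lock pattern: snapshot the shared pointer under the spinlock, NULL the field, then free outside the lock. A minimal C model (pthread mutex standing in for the kernel spinlock; names invented):

#include <pthread.h>
#include <stdlib.h>

struct root { pthread_mutex_t lock; void *orphan_rsv; };

/* Two committers can no longer free the same reservation twice:
 * whoever takes the pointer also publishes that it is gone. */
static void orphan_commit(struct root *r)
{
    void *rsv;

    pthread_mutex_lock(&r->lock);
    rsv = r->orphan_rsv;        /* take ownership ... */
    r->orphan_rsv = NULL;       /* ... and publish that fact */
    pthread_mutex_unlock(&r->lock);

    free(rsv);                  /* free(NULL) is a harmless no-op */
}

int main(void)
{
    struct root r = { PTHREAD_MUTEX_INITIALIZER, malloc(16) };

    orphan_commit(&r);
    orphan_commit(&r);          /* second call safely does nothing */
    return 0;
}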
@@ -1965,10 +1981,9 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
1965 root->orphan_item_inserted = 0; 1981 root->orphan_item_inserted = 0;
1966 } 1982 }
1967 1983
-	if (root->orphan_block_rsv) {
-		WARN_ON(root->orphan_block_rsv->size > 0);
-		btrfs_free_block_rsv(root, root->orphan_block_rsv);
-		root->orphan_block_rsv = NULL;
+	if (block_rsv) {
+		WARN_ON(block_rsv->size > 0);
+		btrfs_free_block_rsv(root, block_rsv);
 	}
1973} 1988}
1974 1989
@@ -2224,14 +2239,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2224 continue; 2239 continue;
2225 } 2240 }
2226 nr_truncate++; 2241 nr_truncate++;
2227 /*
2228 * Need to hold the imutex for reservation purposes, not
2229 * a huge deal here but I have a WARN_ON in
2230 * btrfs_delalloc_reserve_space to catch offenders.
2231 */
2232 mutex_lock(&inode->i_mutex);
2233 ret = btrfs_truncate(inode); 2242 ret = btrfs_truncate(inode);
2234 mutex_unlock(&inode->i_mutex);
2235 } else { 2243 } else {
2236 nr_unlink++; 2244 nr_unlink++;
2237 } 2245 }
@@ -2845,7 +2853,7 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2845 BUG_ON(!root->fs_info->enospc_unlink); 2853 BUG_ON(!root->fs_info->enospc_unlink);
2846 root->fs_info->enospc_unlink = 0; 2854 root->fs_info->enospc_unlink = 0;
2847 } 2855 }
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
2849} 2857}
2850 2858
2851static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2859static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3009,7 +3017,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3009 int pending_del_nr = 0; 3017 int pending_del_nr = 0;
3010 int pending_del_slot = 0; 3018 int pending_del_slot = 0;
3011 int extent_type = -1; 3019 int extent_type = -1;
3012 int encoding;
3013 int ret; 3020 int ret;
3014 int err = 0; 3021 int err = 0;
3015 u64 ino = btrfs_ino(inode); 3022 u64 ino = btrfs_ino(inode);
@@ -3059,7 +3066,6 @@ search_again:
3059 leaf = path->nodes[0]; 3066 leaf = path->nodes[0];
3060 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3067 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3061 found_type = btrfs_key_type(&found_key); 3068 found_type = btrfs_key_type(&found_key);
3062 encoding = 0;
3063 3069
3064 if (found_key.objectid != ino) 3070 if (found_key.objectid != ino)
3065 break; 3071 break;
@@ -3072,10 +3078,6 @@ search_again:
3072 fi = btrfs_item_ptr(leaf, path->slots[0], 3078 fi = btrfs_item_ptr(leaf, path->slots[0],
3073 struct btrfs_file_extent_item); 3079 struct btrfs_file_extent_item);
3074 extent_type = btrfs_file_extent_type(leaf, fi); 3080 extent_type = btrfs_file_extent_type(leaf, fi);
3075 encoding = btrfs_file_extent_compression(leaf, fi);
3076 encoding |= btrfs_file_extent_encryption(leaf, fi);
3077 encoding |= btrfs_file_extent_other_encoding(leaf, fi);
3078
3079 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3081 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3080 item_end += 3082 item_end +=
3081 btrfs_file_extent_num_bytes(leaf, fi); 3083 btrfs_file_extent_num_bytes(leaf, fi);
@@ -3103,7 +3105,7 @@ search_again:
3103 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3105 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3104 u64 num_dec; 3106 u64 num_dec;
3105 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 3107 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
3106 if (!del_item && !encoding) { 3108 if (!del_item) {
3107 u64 orig_num_bytes = 3109 u64 orig_num_bytes =
3108 btrfs_file_extent_num_bytes(leaf, fi); 3110 btrfs_file_extent_num_bytes(leaf, fi);
3109 extent_num_bytes = new_size - 3111 extent_num_bytes = new_size -
@@ -3179,7 +3181,7 @@ delete:
3179 ret = btrfs_free_extent(trans, root, extent_start, 3181 ret = btrfs_free_extent(trans, root, extent_start,
3180 extent_num_bytes, 0, 3182 extent_num_bytes, 0,
3181 btrfs_header_owner(leaf), 3183 btrfs_header_owner(leaf),
3182 ino, extent_offset); 3184 ino, extent_offset, 0);
3183 BUG_ON(ret); 3185 BUG_ON(ret);
3184 } 3186 }
3185 3187
@@ -3434,7 +3436,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3434 i_size_write(inode, newsize); 3436 i_size_write(inode, newsize);
3435 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 3437 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3436 ret = btrfs_update_inode(trans, root, inode); 3438 ret = btrfs_update_inode(trans, root, inode);
3437 btrfs_end_transaction_throttle(trans, root); 3439 btrfs_end_transaction(trans, root);
3438 } else { 3440 } else {
3439 3441
3440 /* 3442 /*
@@ -4655,7 +4657,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4655 } 4657 }
4656out_unlock: 4658out_unlock:
4657 nr = trans->blocks_used; 4659 nr = trans->blocks_used;
4658 btrfs_end_transaction_throttle(trans, root); 4660 btrfs_end_transaction(trans, root);
4659 btrfs_btree_balance_dirty(root, nr); 4661 btrfs_btree_balance_dirty(root, nr);
4660 if (drop_inode) { 4662 if (drop_inode) {
4661 inode_dec_link_count(inode); 4663 inode_dec_link_count(inode);
@@ -4723,7 +4725,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4723 } 4725 }
4724out_unlock: 4726out_unlock:
4725 nr = trans->blocks_used; 4727 nr = trans->blocks_used;
4726 btrfs_end_transaction_throttle(trans, root); 4728 btrfs_end_transaction(trans, root);
4727 if (drop_inode) { 4729 if (drop_inode) {
4728 inode_dec_link_count(inode); 4730 inode_dec_link_count(inode);
4729 iput(inode); 4731 iput(inode);
@@ -4782,7 +4784,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4782 } 4784 }
4783 4785
4784 nr = trans->blocks_used; 4786 nr = trans->blocks_used;
4785 btrfs_end_transaction_throttle(trans, root); 4787 btrfs_end_transaction(trans, root);
4786fail: 4788fail:
4787 if (drop_inode) { 4789 if (drop_inode) {
4788 inode_dec_link_count(inode); 4790 inode_dec_link_count(inode);
@@ -4848,7 +4850,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4848 4850
4849out_fail: 4851out_fail:
4850 nr = trans->blocks_used; 4852 nr = trans->blocks_used;
4851 btrfs_end_transaction_throttle(trans, root); 4853 btrfs_end_transaction(trans, root);
4852 if (drop_on_err) 4854 if (drop_on_err)
4853 iput(inode); 4855 iput(inode);
4854 btrfs_btree_balance_dirty(root, nr); 4856 btrfs_btree_balance_dirty(root, nr);
@@ -5121,7 +5123,7 @@ again:
5121 } 5123 }
5122 flush_dcache_page(page); 5124 flush_dcache_page(page);
5123 } else if (create && PageUptodate(page)) { 5125 } else if (create && PageUptodate(page)) {
5124 WARN_ON(1); 5126 BUG();
5125 if (!trans) { 5127 if (!trans) {
5126 kunmap(page); 5128 kunmap(page);
5127 free_extent_map(em); 5129 free_extent_map(em);
@@ -6399,21 +6401,23 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6399 unsigned long zero_start; 6401 unsigned long zero_start;
6400 loff_t size; 6402 loff_t size;
6401 int ret; 6403 int ret;
6404 int reserved = 0;
6402 u64 page_start; 6405 u64 page_start;
6403 u64 page_end; 6406 u64 page_end;
6404 6407
6405 /* Need this to keep space reservations serialized */
6406 mutex_lock(&inode->i_mutex);
6407 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6408 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6408 mutex_unlock(&inode->i_mutex); 6409 if (!ret) {
6409 if (!ret)
6410 ret = btrfs_update_time(vma->vm_file); 6410 ret = btrfs_update_time(vma->vm_file);
6411 reserved = 1;
6412 }
6411 if (ret) { 6413 if (ret) {
6412 if (ret == -ENOMEM) 6414 if (ret == -ENOMEM)
6413 ret = VM_FAULT_OOM; 6415 ret = VM_FAULT_OOM;
6414 else /* -ENOSPC, -EIO, etc */ 6416 else /* -ENOSPC, -EIO, etc */
6415 ret = VM_FAULT_SIGBUS; 6417 ret = VM_FAULT_SIGBUS;
6416 goto out; 6418 if (reserved)
6419 goto out;
6420 goto out_noreserve;
6417 } 6421 }
6418 6422
6419 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 6423 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
@@ -6494,8 +6498,9 @@ out_unlock:
6494 if (!ret) 6498 if (!ret)
6495 return VM_FAULT_LOCKED; 6499 return VM_FAULT_LOCKED;
6496 unlock_page(page); 6500 unlock_page(page);
6497 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
6498out: 6501out:
6502 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
6503out_noreserve:
6499 return ret; 6504 return ret;
6500} 6505}
6501 6506
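The rework above releases the delalloc reservation only when it was actually taken: a failure before btrfs_delalloc_reserve_space() succeeds jumps past the release via out_noreserve, while later failures fall through out. A compilable sketch of the two-label error path; reserve(), release() and do_fault() are hypothetical stand-ins:

#include <errno.h>
#include <stdio.h>

static int reserve(void)  { return 0; }  /* stand-in for btrfs_delalloc_reserve_space */
static void release(void) { }            /* stand-in for btrfs_delalloc_release_space */
static int do_fault(void) { return -EIO; }

static int fault_handler(void)
{
        int ret;

        ret = reserve();
        if (ret)
                goto out_noreserve;     /* nothing reserved, nothing to undo */

        ret = do_fault();
        if (ret)
                goto out;               /* must drop the reservation */
        return 0;                       /* success consumes the reservation */
out:
        release();
out_noreserve:
        return ret;
}

int main(void)
{
        printf("fault_handler() = %d\n", fault_handler());
        return 0;
}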
@@ -6668,7 +6673,7 @@ end_trans:
6668 err = ret; 6673 err = ret;
6669 6674
6670 nr = trans->blocks_used; 6675 nr = trans->blocks_used;
6671 ret = btrfs_end_transaction_throttle(trans, root); 6676 ret = btrfs_end_transaction(trans, root);
6672 btrfs_btree_balance_dirty(root, nr); 6677 btrfs_btree_balance_dirty(root, nr);
6673 } 6678 }
6674 6679
@@ -6749,6 +6754,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6749 extent_io_tree_init(&ei->io_tree, &inode->i_data); 6754 extent_io_tree_init(&ei->io_tree, &inode->i_data);
6750 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 6755 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
6751 mutex_init(&ei->log_mutex); 6756 mutex_init(&ei->log_mutex);
6757 mutex_init(&ei->delalloc_mutex);
6752 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6758 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
6753 INIT_LIST_HEAD(&ei->i_orphan); 6759 INIT_LIST_HEAD(&ei->i_orphan);
6754 INIT_LIST_HEAD(&ei->delalloc_inodes); 6760 INIT_LIST_HEAD(&ei->delalloc_inodes);
@@ -7074,7 +7080,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7074 btrfs_end_log_trans(root); 7080 btrfs_end_log_trans(root);
7075 } 7081 }
7076out_fail: 7082out_fail:
7077 btrfs_end_transaction_throttle(trans, root); 7083 btrfs_end_transaction(trans, root);
7078out_notrans: 7084out_notrans:
7079 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 7085 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
7080 up_read(&root->fs_info->subvol_sem); 7086 up_read(&root->fs_info->subvol_sem);
@@ -7246,7 +7252,7 @@ out_unlock:
7246 if (!err) 7252 if (!err)
7247 d_instantiate(dentry, inode); 7253 d_instantiate(dentry, inode);
7248 nr = trans->blocks_used; 7254 nr = trans->blocks_used;
7249 btrfs_end_transaction_throttle(trans, root); 7255 btrfs_end_transaction(trans, root);
7250 if (drop_inode) { 7256 if (drop_inode) {
7251 inode_dec_link_count(inode); 7257 inode_dec_link_count(inode);
7252 iput(inode); 7258 iput(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5441ff1480fd..03bb62a9ee24 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -176,6 +176,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
176 struct btrfs_trans_handle *trans; 176 struct btrfs_trans_handle *trans;
177 unsigned int flags, oldflags; 177 unsigned int flags, oldflags;
178 int ret; 178 int ret;
179 u64 ip_oldflags;
180 unsigned int i_oldflags;
179 181
180 if (btrfs_root_readonly(root)) 182 if (btrfs_root_readonly(root))
181 return -EROFS; 183 return -EROFS;
@@ -192,6 +194,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
192 194
193 mutex_lock(&inode->i_mutex); 195 mutex_lock(&inode->i_mutex);
194 196
197 ip_oldflags = ip->flags;
198 i_oldflags = inode->i_flags;
199
195 flags = btrfs_mask_flags(inode->i_mode, flags); 200 flags = btrfs_mask_flags(inode->i_mode, flags);
196 oldflags = btrfs_flags_to_ioctl(ip->flags); 201 oldflags = btrfs_flags_to_ioctl(ip->flags);
197 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { 202 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
@@ -249,19 +254,24 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
249 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); 254 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
250 } 255 }
251 256
252 trans = btrfs_join_transaction(root); 257 trans = btrfs_start_transaction(root, 1);
253 BUG_ON(IS_ERR(trans)); 258 if (IS_ERR(trans)) {
259 ret = PTR_ERR(trans);
260 goto out_drop;
261 }
254 262
255 btrfs_update_iflags(inode); 263 btrfs_update_iflags(inode);
256 inode->i_ctime = CURRENT_TIME; 264 inode->i_ctime = CURRENT_TIME;
257 ret = btrfs_update_inode(trans, root, inode); 265 ret = btrfs_update_inode(trans, root, inode);
258 BUG_ON(ret);
259 266
260 btrfs_end_transaction(trans, root); 267 btrfs_end_transaction(trans, root);
268 out_drop:
269 if (ret) {
270 ip->flags = ip_oldflags;
271 inode->i_flags = i_oldflags;
272 }
261 273
262 mnt_drop_write_file(file); 274 mnt_drop_write_file(file);
263
264 ret = 0;
265 out_unlock: 275 out_unlock:
266 mutex_unlock(&inode->i_mutex); 276 mutex_unlock(&inode->i_mutex);
267 return ret; 277 return ret;
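Rather than BUG() on a failed btrfs_update_inode(), the new code snapshots ip->flags and inode->i_flags up front and restores both when the transaction path fails. The snapshot-and-rollback idiom in isolation, with invented names:

#include <stdio.h>

struct obj { unsigned long flags; };

static int failing_commit(struct obj *o) { (void)o; return -1; }

static int apply_flags(struct obj *o, unsigned long new_flags,
                       int (*commit)(struct obj *))
{
        unsigned long old_flags = o->flags;     /* snapshot before mutating */
        int ret;

        o->flags = new_flags;                   /* speculative update */
        ret = commit(o);
        if (ret)
                o->flags = old_flags;           /* roll back, as the ioctl now does */
        return ret;
}

int main(void)
{
        struct obj o = { 0x1 };

        apply_flags(&o, 0xff, failing_commit);
        printf("flags after failed commit: 0x%lx\n", o.flags);  /* 0x1 */
        return 0;
}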
@@ -276,14 +286,13 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
276 286
277static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) 287static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
278{ 288{
279 struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info; 289 struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb);
280 struct btrfs_fs_info *fs_info = root->fs_info;
281 struct btrfs_device *device; 290 struct btrfs_device *device;
282 struct request_queue *q; 291 struct request_queue *q;
283 struct fstrim_range range; 292 struct fstrim_range range;
284 u64 minlen = ULLONG_MAX; 293 u64 minlen = ULLONG_MAX;
285 u64 num_devices = 0; 294 u64 num_devices = 0;
286 u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); 295 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
287 int ret; 296 int ret;
288 297
289 if (!capable(CAP_SYS_ADMIN)) 298 if (!capable(CAP_SYS_ADMIN))
@@ -312,7 +321,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
312 321
313 range.len = min(range.len, total_bytes - range.start); 322 range.len = min(range.len, total_bytes - range.start);
314 range.minlen = max(range.minlen, minlen); 323 range.minlen = max(range.minlen, minlen);
315 ret = btrfs_trim_fs(root, &range); 324 ret = btrfs_trim_fs(fs_info->tree_root, &range);
316 if (ret < 0) 325 if (ret < 0)
317 return ret; 326 return ret;
318 327
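The fstrim_range being clamped here is the same structure userspace hands to the generic FITRIM ioctl; the kernel writes the number of bytes it trimmed back into range.len. A minimal caller, assuming a btrfs filesystem mounted at /mnt:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>   /* FITRIM, struct fstrim_range */

int main(void)
{
        struct fstrim_range range;
        int fd = open("/mnt", O_RDONLY);        /* assumed btrfs mount point */

        if (fd < 0)
                return 1;
        memset(&range, 0, sizeof(range));
        range.len = (__u64)-1;  /* whole filesystem; the kernel clamps it as above */
        if (ioctl(fd, FITRIM, &range) < 0)
                perror("FITRIM");
        else
                printf("trimmed %llu bytes\n", (unsigned long long)range.len);
        close(fd);
        return 0;
}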
@@ -358,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root,
358 return PTR_ERR(trans); 367 return PTR_ERR(trans);
359 368
360 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 369 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
361 0, objectid, NULL, 0, 0, 0); 370 0, objectid, NULL, 0, 0, 0, 0);
362 if (IS_ERR(leaf)) { 371 if (IS_ERR(leaf)) {
363 ret = PTR_ERR(leaf); 372 ret = PTR_ERR(leaf);
364 goto fail; 373 goto fail;
@@ -858,10 +867,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
858 return 0; 867 return 0;
859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT; 868 file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
860 869
861 mutex_lock(&inode->i_mutex);
862 ret = btrfs_delalloc_reserve_space(inode, 870 ret = btrfs_delalloc_reserve_space(inode,
863 num_pages << PAGE_CACHE_SHIFT); 871 num_pages << PAGE_CACHE_SHIFT);
864 mutex_unlock(&inode->i_mutex);
865 if (ret) 872 if (ret)
866 return ret; 873 return ret;
867again: 874again:
@@ -1058,7 +1065,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1058 i = range->start >> PAGE_CACHE_SHIFT; 1065 i = range->start >> PAGE_CACHE_SHIFT;
1059 } 1066 }
1060 if (!max_to_defrag) 1067 if (!max_to_defrag)
1061 max_to_defrag = last_index; 1068 max_to_defrag = last_index + 1;
1062 1069
1063 /* 1070 /*
1064 * make writeback start from i, so the defrag range can be 1071 * make writeback start from i, so the defrag range can be
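The last_index + 1 fix follows from page indices being zero-based: last_index is the index of the file's final page, so the number of pages eligible for defrag is one greater. In miniature, assuming 4 KiB pages:

#include <assert.h>

int main(void)
{
        /* a 10 KiB file spans pages 0..2, i.e. 3 pages */
        unsigned long isize = 10 * 1024;
        unsigned long last_index = (isize - 1) >> 12;   /* index of last page: 2 */
        unsigned long max_to_defrag = last_index + 1;   /* page count: 3 */

        assert(max_to_defrag == 3);
        return 0;
}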
@@ -1203,13 +1210,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1203 if (!capable(CAP_SYS_ADMIN)) 1210 if (!capable(CAP_SYS_ADMIN))
1204 return -EPERM; 1211 return -EPERM;
1205 1212
1213 mutex_lock(&root->fs_info->volume_mutex);
1214 if (root->fs_info->balance_ctl) {
1215 printk(KERN_INFO "btrfs: balance in progress\n");
1216 ret = -EINVAL;
1217 goto out;
1218 }
1219
1206 vol_args = memdup_user(arg, sizeof(*vol_args)); 1220 vol_args = memdup_user(arg, sizeof(*vol_args));
1207 if (IS_ERR(vol_args)) 1221 if (IS_ERR(vol_args)) {
1208 return PTR_ERR(vol_args); 1222 ret = PTR_ERR(vol_args);
1223 goto out;
1224 }
1209 1225
1210 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 1226 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
1211 1227
1212 mutex_lock(&root->fs_info->volume_mutex);
1213 sizestr = vol_args->name; 1228 sizestr = vol_args->name;
1214 devstr = strchr(sizestr, ':'); 1229 devstr = strchr(sizestr, ':');
1215 if (devstr) { 1230 if (devstr) {
@@ -1226,7 +1241,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1226 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1241 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1227 (unsigned long long)devid); 1242 (unsigned long long)devid);
1228 ret = -EINVAL; 1243 ret = -EINVAL;
1229 goto out_unlock; 1244 goto out_free;
1230 } 1245 }
1231 if (!strcmp(sizestr, "max")) 1246 if (!strcmp(sizestr, "max"))
1232 new_size = device->bdev->bd_inode->i_size; 1247 new_size = device->bdev->bd_inode->i_size;
@@ -1241,7 +1256,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1241 new_size = memparse(sizestr, NULL); 1256 new_size = memparse(sizestr, NULL);
1242 if (new_size == 0) { 1257 if (new_size == 0) {
1243 ret = -EINVAL; 1258 ret = -EINVAL;
1244 goto out_unlock; 1259 goto out_free;
1245 } 1260 }
1246 } 1261 }
1247 1262
@@ -1250,7 +1265,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1250 if (mod < 0) { 1265 if (mod < 0) {
1251 if (new_size > old_size) { 1266 if (new_size > old_size) {
1252 ret = -EINVAL; 1267 ret = -EINVAL;
1253 goto out_unlock; 1268 goto out_free;
1254 } 1269 }
1255 new_size = old_size - new_size; 1270 new_size = old_size - new_size;
1256 } else if (mod > 0) { 1271 } else if (mod > 0) {
@@ -1259,11 +1274,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1259 1274
1260 if (new_size < 256 * 1024 * 1024) { 1275 if (new_size < 256 * 1024 * 1024) {
1261 ret = -EINVAL; 1276 ret = -EINVAL;
1262 goto out_unlock; 1277 goto out_free;
1263 } 1278 }
1264 if (new_size > device->bdev->bd_inode->i_size) { 1279 if (new_size > device->bdev->bd_inode->i_size) {
1265 ret = -EFBIG; 1280 ret = -EFBIG;
1266 goto out_unlock; 1281 goto out_free;
1267 } 1282 }
1268 1283
1269 do_div(new_size, root->sectorsize); 1284 do_div(new_size, root->sectorsize);
@@ -1276,7 +1291,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1276 trans = btrfs_start_transaction(root, 0); 1291 trans = btrfs_start_transaction(root, 0);
1277 if (IS_ERR(trans)) { 1292 if (IS_ERR(trans)) {
1278 ret = PTR_ERR(trans); 1293 ret = PTR_ERR(trans);
1279 goto out_unlock; 1294 goto out_free;
1280 } 1295 }
1281 ret = btrfs_grow_device(trans, device, new_size); 1296 ret = btrfs_grow_device(trans, device, new_size);
1282 btrfs_commit_transaction(trans, root); 1297 btrfs_commit_transaction(trans, root);
@@ -1284,9 +1299,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1284 ret = btrfs_shrink_device(device, new_size); 1299 ret = btrfs_shrink_device(device, new_size);
1285 } 1300 }
1286 1301
1287out_unlock: 1302out_free:
1288 mutex_unlock(&root->fs_info->volume_mutex);
1289 kfree(vol_args); 1303 kfree(vol_args);
1304out:
1305 mutex_unlock(&root->fs_info->volume_mutex);
1290 return ret; 1306 return ret;
1291} 1307}
1292 1308
@@ -2052,14 +2068,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2052 if (!capable(CAP_SYS_ADMIN)) 2068 if (!capable(CAP_SYS_ADMIN))
2053 return -EPERM; 2069 return -EPERM;
2054 2070
2071 mutex_lock(&root->fs_info->volume_mutex);
2072 if (root->fs_info->balance_ctl) {
2073 printk(KERN_INFO "btrfs: balance in progress\n");
2074 ret = -EINVAL;
2075 goto out;
2076 }
2077
2055 vol_args = memdup_user(arg, sizeof(*vol_args)); 2078 vol_args = memdup_user(arg, sizeof(*vol_args));
2056 if (IS_ERR(vol_args)) 2079 if (IS_ERR(vol_args)) {
2057 return PTR_ERR(vol_args); 2080 ret = PTR_ERR(vol_args);
2081 goto out;
2082 }
2058 2083
2059 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2084 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2060 ret = btrfs_init_new_device(root, vol_args->name); 2085 ret = btrfs_init_new_device(root, vol_args->name);
2061 2086
2062 kfree(vol_args); 2087 kfree(vol_args);
2088out:
2089 mutex_unlock(&root->fs_info->volume_mutex);
2063 return ret; 2090 return ret;
2064} 2091}
2065 2092
@@ -2074,14 +2101,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
2074 if (root->fs_info->sb->s_flags & MS_RDONLY) 2101 if (root->fs_info->sb->s_flags & MS_RDONLY)
2075 return -EROFS; 2102 return -EROFS;
2076 2103
2104 mutex_lock(&root->fs_info->volume_mutex);
2105 if (root->fs_info->balance_ctl) {
2106 printk(KERN_INFO "btrfs: balance in progress\n");
2107 ret = -EINVAL;
2108 goto out;
2109 }
2110
2077 vol_args = memdup_user(arg, sizeof(*vol_args)); 2111 vol_args = memdup_user(arg, sizeof(*vol_args));
2078 if (IS_ERR(vol_args)) 2112 if (IS_ERR(vol_args)) {
2079 return PTR_ERR(vol_args); 2113 ret = PTR_ERR(vol_args);
2114 goto out;
2115 }
2080 2116
2081 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2117 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2082 ret = btrfs_rm_device(root, vol_args->name); 2118 ret = btrfs_rm_device(root, vol_args->name);
2083 2119
2084 kfree(vol_args); 2120 kfree(vol_args);
2121out:
2122 mutex_unlock(&root->fs_info->volume_mutex);
2085 return ret; 2123 return ret;
2086} 2124}
2087 2125
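btrfs_ioctl_resize(), _add_dev() and _rm_dev() now all take volume_mutex before copying their arguments, refuse to run while fs_info->balance_ctl is set, and funnel every exit through a single unlock label. The shape of that guard, with hypothetical names:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct fs {
        pthread_mutex_t volume_mutex;
        void *balance_ctl;      /* non-NULL while a balance is running */
};

static int volume_op(struct fs *fs, int (*op)(struct fs *))
{
        int ret;

        pthread_mutex_lock(&fs->volume_mutex);
        if (fs->balance_ctl) {          /* refuse to race a balance */
                ret = -EINVAL;
                goto out;
        }
        ret = op(fs);
out:    /* one unlock for every exit path */
        pthread_mutex_unlock(&fs->volume_mutex);
        return ret;
}

static int resize(struct fs *fs) { (void)fs; return 0; }

int main(void)
{
        struct fs fs = { PTHREAD_MUTEX_INITIALIZER, NULL };

        printf("resize: %d\n", volume_op(&fs, resize)); /* 0 */
        fs.balance_ctl = &fs;                           /* pretend a balance runs */
        printf("resize: %d\n", volume_op(&fs, resize)); /* -22 */
        return 0;
}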
@@ -2427,7 +2465,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2427 disko, diskl, 0, 2465 disko, diskl, 0,
2428 root->root_key.objectid, 2466 root->root_key.objectid,
2429 btrfs_ino(inode), 2467 btrfs_ino(inode),
2430 new_key.offset - datao); 2468 new_key.offset - datao,
2469 0);
2431 BUG_ON(ret); 2470 BUG_ON(ret);
2432 } 2471 }
2433 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 2472 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
@@ -2977,7 +3016,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
2977{ 3016{
2978 int ret = 0; 3017 int ret = 0;
2979 int size; 3018 int size;
2980 u64 extent_offset; 3019 u64 extent_item_pos;
2981 struct btrfs_ioctl_logical_ino_args *loi; 3020 struct btrfs_ioctl_logical_ino_args *loi;
2982 struct btrfs_data_container *inodes = NULL; 3021 struct btrfs_data_container *inodes = NULL;
2983 struct btrfs_path *path = NULL; 3022 struct btrfs_path *path = NULL;
@@ -3008,15 +3047,17 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3008 } 3047 }
3009 3048
3010 ret = extent_from_logical(root->fs_info, loi->logical, path, &key); 3049 ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
3050 btrfs_release_path(path);
3011 3051
3012 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) 3052 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3013 ret = -ENOENT; 3053 ret = -ENOENT;
3014 if (ret < 0) 3054 if (ret < 0)
3015 goto out; 3055 goto out;
3016 3056
3017 extent_offset = loi->logical - key.objectid; 3057 extent_item_pos = loi->logical - key.objectid;
3018 ret = iterate_extent_inodes(root->fs_info, path, key.objectid, 3058 ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
3019 extent_offset, build_ino_list, inodes); 3059 extent_item_pos, build_ino_list,
3060 inodes);
3020 3061
3021 if (ret < 0) 3062 if (ret < 0)
3022 goto out; 3063 goto out;
@@ -3034,6 +3075,163 @@ out:
3034 return ret; 3075 return ret;
3035} 3076}
3036 3077
3078void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
3079 struct btrfs_ioctl_balance_args *bargs)
3080{
3081 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3082
3083 bargs->flags = bctl->flags;
3084
3085 if (atomic_read(&fs_info->balance_running))
3086 bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
3087 if (atomic_read(&fs_info->balance_pause_req))
3088 bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
3089 if (atomic_read(&fs_info->balance_cancel_req))
3090 bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
3091
3092 memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
3093 memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
3094 memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
3095
3096 if (lock) {
3097 spin_lock(&fs_info->balance_lock);
3098 memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
3099 spin_unlock(&fs_info->balance_lock);
3100 } else {
3101 memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
3102 }
3103}
3104
3105static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
3106{
3107 struct btrfs_fs_info *fs_info = root->fs_info;
3108 struct btrfs_ioctl_balance_args *bargs;
3109 struct btrfs_balance_control *bctl;
3110 int ret;
3111
3112 if (!capable(CAP_SYS_ADMIN))
3113 return -EPERM;
3114
3115 if (fs_info->sb->s_flags & MS_RDONLY)
3116 return -EROFS;
3117
3118 mutex_lock(&fs_info->volume_mutex);
3119 mutex_lock(&fs_info->balance_mutex);
3120
3121 if (arg) {
3122 bargs = memdup_user(arg, sizeof(*bargs));
3123 if (IS_ERR(bargs)) {
3124 ret = PTR_ERR(bargs);
3125 goto out;
3126 }
3127
3128 if (bargs->flags & BTRFS_BALANCE_RESUME) {
3129 if (!fs_info->balance_ctl) {
3130 ret = -ENOTCONN;
3131 goto out_bargs;
3132 }
3133
3134 bctl = fs_info->balance_ctl;
3135 spin_lock(&fs_info->balance_lock);
3136 bctl->flags |= BTRFS_BALANCE_RESUME;
3137 spin_unlock(&fs_info->balance_lock);
3138
3139 goto do_balance;
3140 }
3141 } else {
3142 bargs = NULL;
3143 }
3144
3145 if (fs_info->balance_ctl) {
3146 ret = -EINPROGRESS;
3147 goto out_bargs;
3148 }
3149
3150 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3151 if (!bctl) {
3152 ret = -ENOMEM;
3153 goto out_bargs;
3154 }
3155
3156 bctl->fs_info = fs_info;
3157 if (arg) {
3158 memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
3159 memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
3160 memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
3161
3162 bctl->flags = bargs->flags;
3163 } else {
3164 /* balance everything - no filters */
3165 bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
3166 }
3167
3168do_balance:
3169 ret = btrfs_balance(bctl, bargs);
3170 /*
3171 * bctl is freed in __cancel_balance or in free_fs_info if
3172 * restriper was paused all the way until unmount
3173 */
3174 if (arg) {
3175 if (copy_to_user(arg, bargs, sizeof(*bargs)))
3176 ret = -EFAULT;
3177 }
3178
3179out_bargs:
3180 kfree(bargs);
3181out:
3182 mutex_unlock(&fs_info->balance_mutex);
3183 mutex_unlock(&fs_info->volume_mutex);
3184 return ret;
3185}
3186
3187static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
3188{
3189 if (!capable(CAP_SYS_ADMIN))
3190 return -EPERM;
3191
3192 switch (cmd) {
3193 case BTRFS_BALANCE_CTL_PAUSE:
3194 return btrfs_pause_balance(root->fs_info);
3195 case BTRFS_BALANCE_CTL_CANCEL:
3196 return btrfs_cancel_balance(root->fs_info);
3197 }
3198
3199 return -EINVAL;
3200}
3201
3202static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
3203 void __user *arg)
3204{
3205 struct btrfs_fs_info *fs_info = root->fs_info;
3206 struct btrfs_ioctl_balance_args *bargs;
3207 int ret = 0;
3208
3209 if (!capable(CAP_SYS_ADMIN))
3210 return -EPERM;
3211
3212 mutex_lock(&fs_info->balance_mutex);
3213 if (!fs_info->balance_ctl) {
3214 ret = -ENOTCONN;
3215 goto out;
3216 }
3217
3218 bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
3219 if (!bargs) {
3220 ret = -ENOMEM;
3221 goto out;
3222 }
3223
3224 update_ioctl_balance_args(fs_info, 1, bargs);
3225
3226 if (copy_to_user(arg, bargs, sizeof(*bargs)))
3227 ret = -EFAULT;
3228
3229 kfree(bargs);
3230out:
3231 mutex_unlock(&fs_info->balance_mutex);
3232 return ret;
3233}
3234
3037long btrfs_ioctl(struct file *file, unsigned int 3235long btrfs_ioctl(struct file *file, unsigned int
3038 cmd, unsigned long arg) 3236 cmd, unsigned long arg)
3039{ 3237{
@@ -3078,7 +3276,7 @@ long btrfs_ioctl(struct file *file, unsigned int
3078 case BTRFS_IOC_DEV_INFO: 3276 case BTRFS_IOC_DEV_INFO:
3079 return btrfs_ioctl_dev_info(root, argp); 3277 return btrfs_ioctl_dev_info(root, argp);
3080 case BTRFS_IOC_BALANCE: 3278 case BTRFS_IOC_BALANCE:
3081 return btrfs_balance(root->fs_info->dev_root); 3279 return btrfs_ioctl_balance(root, NULL);
3082 case BTRFS_IOC_CLONE: 3280 case BTRFS_IOC_CLONE:
3083 return btrfs_ioctl_clone(file, arg, 0, 0, 0); 3281 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
3084 case BTRFS_IOC_CLONE_RANGE: 3282 case BTRFS_IOC_CLONE_RANGE:
@@ -3110,6 +3308,12 @@ long btrfs_ioctl(struct file *file, unsigned int
3110 return btrfs_ioctl_scrub_cancel(root, argp); 3308 return btrfs_ioctl_scrub_cancel(root, argp);
3111 case BTRFS_IOC_SCRUB_PROGRESS: 3309 case BTRFS_IOC_SCRUB_PROGRESS:
3112 return btrfs_ioctl_scrub_progress(root, argp); 3310 return btrfs_ioctl_scrub_progress(root, argp);
3311 case BTRFS_IOC_BALANCE_V2:
3312 return btrfs_ioctl_balance(root, argp);
3313 case BTRFS_IOC_BALANCE_CTL:
3314 return btrfs_ioctl_balance_ctl(root, arg);
3315 case BTRFS_IOC_BALANCE_PROGRESS:
3316 return btrfs_ioctl_balance_progress(root, argp);
3113 } 3317 }
3114 3318
3115 return -ENOTTY; 3319 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 252ae9915de8..4f69028a68c4 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -109,6 +109,55 @@ struct btrfs_ioctl_fs_info_args {
109 __u64 reserved[124]; /* pad to 1k */ 109 __u64 reserved[124]; /* pad to 1k */
110}; 110};
111 111
112/* balance control ioctl modes */
113#define BTRFS_BALANCE_CTL_PAUSE 1
114#define BTRFS_BALANCE_CTL_CANCEL 2
115
116/*
117 * this is packed, because it should be exactly the same as its disk
118 * byte order counterpart (struct btrfs_disk_balance_args)
119 */
120struct btrfs_balance_args {
121 __u64 profiles;
122 __u64 usage;
123 __u64 devid;
124 __u64 pstart;
125 __u64 pend;
126 __u64 vstart;
127 __u64 vend;
128
129 __u64 target;
130
131 __u64 flags;
132
133 __u64 unused[8];
134} __attribute__ ((__packed__));
135
136/* report balance progress to userspace */
137struct btrfs_balance_progress {
138 __u64 expected; /* estimated # of chunks that will be
139 * relocated to fulfill the request */
140 __u64 considered; /* # of chunks we have considered so far */
141 __u64 completed; /* # of chunks relocated so far */
142};
143
144#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0)
145#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1)
146#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
147
148struct btrfs_ioctl_balance_args {
149 __u64 flags; /* in/out */
150 __u64 state; /* out */
151
152 struct btrfs_balance_args data; /* in/out */
153 struct btrfs_balance_args meta; /* in/out */
154 struct btrfs_balance_args sys; /* in/out */
155
156 struct btrfs_balance_progress stat; /* out */
157
158 __u64 unused[72]; /* pad to 1k */
159};
160
112#define BTRFS_INO_LOOKUP_PATH_MAX 4080 161#define BTRFS_INO_LOOKUP_PATH_MAX 4080
113struct btrfs_ioctl_ino_lookup_args { 162struct btrfs_ioctl_ino_lookup_args {
114 __u64 treeid; 163 __u64 treeid;
@@ -272,6 +321,11 @@ struct btrfs_ioctl_logical_ino_args {
272 struct btrfs_ioctl_dev_info_args) 321 struct btrfs_ioctl_dev_info_args)
273#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ 322#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
274 struct btrfs_ioctl_fs_info_args) 323 struct btrfs_ioctl_fs_info_args)
324#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
325 struct btrfs_ioctl_balance_args)
326#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
327#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
328 struct btrfs_ioctl_balance_args)
275#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ 329#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
276 struct btrfs_ioctl_ino_path_args) 330 struct btrfs_ioctl_ino_path_args)
277#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ 331#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
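Given the definitions above, userspace can drive the new interface directly. A sketch that pauses a running balance and reads back the counters update_ioctl_balance_args() fills in; it assumes this patched ioctl.h is on the include path and that /mnt is a btrfs mount:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include "ioctl.h"              /* the header patched above */

int main(void)
{
        struct btrfs_ioctl_balance_args args = { 0 };
        int fd = open("/mnt", O_RDONLY);

        if (fd < 0)
                return 1;

        /* ask the kernel to pause the balance ... */
        if (ioctl(fd, BTRFS_IOC_BALANCE_CTL, BTRFS_BALANCE_CTL_PAUSE) < 0)
                perror("pause");

        /* ... then query the live counters */
        if (ioctl(fd, BTRFS_IOC_BALANCE_PROGRESS, &args) == 0)
                printf("considered %llu, completed %llu of ~%llu chunks\n",
                       (unsigned long long)args.stat.considered,
                       (unsigned long long)args.stat.completed,
                       (unsigned long long)args.stat.expected);
        return 0;
}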
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d77b67c4b275..5e178d8f7167 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -33,6 +33,14 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
33 */ 33 */
34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) 34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
35{ 35{
36 if (eb->lock_nested) {
37 read_lock(&eb->lock);
38 if (eb->lock_nested && current->pid == eb->lock_owner) {
39 read_unlock(&eb->lock);
40 return;
41 }
42 read_unlock(&eb->lock);
43 }
36 if (rw == BTRFS_WRITE_LOCK) { 44 if (rw == BTRFS_WRITE_LOCK) {
37 if (atomic_read(&eb->blocking_writers) == 0) { 45 if (atomic_read(&eb->blocking_writers) == 0) {
38 WARN_ON(atomic_read(&eb->spinning_writers) != 1); 46 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -57,6 +65,14 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
57 */ 65 */
58void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) 66void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
59{ 67{
68 if (eb->lock_nested) {
69 read_lock(&eb->lock);
70	if (eb->lock_nested && current->pid == eb->lock_owner) {
71 read_unlock(&eb->lock);
72 return;
73 }
74 read_unlock(&eb->lock);
75 }
60 if (rw == BTRFS_WRITE_LOCK_BLOCKING) { 76 if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
61 BUG_ON(atomic_read(&eb->blocking_writers) != 1); 77 BUG_ON(atomic_read(&eb->blocking_writers) != 1);
62 write_lock(&eb->lock); 78 write_lock(&eb->lock);
@@ -81,12 +97,25 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
81void btrfs_tree_read_lock(struct extent_buffer *eb) 97void btrfs_tree_read_lock(struct extent_buffer *eb)
82{ 98{
83again: 99again:
100 read_lock(&eb->lock);
101 if (atomic_read(&eb->blocking_writers) &&
102 current->pid == eb->lock_owner) {
103 /*
104 * This extent is already write-locked by our thread. We allow
105 * an additional read lock to be added because it's for the same
106 * thread. btrfs_find_all_roots() depends on this as it may be
107 * called on a partly (write-)locked tree.
108 */
109 BUG_ON(eb->lock_nested);
110 eb->lock_nested = 1;
111 read_unlock(&eb->lock);
112 return;
113 }
114 read_unlock(&eb->lock);
84 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); 115 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
85 read_lock(&eb->lock); 116 read_lock(&eb->lock);
86 if (atomic_read(&eb->blocking_writers)) { 117 if (atomic_read(&eb->blocking_writers)) {
87 read_unlock(&eb->lock); 118 read_unlock(&eb->lock);
88 wait_event(eb->write_lock_wq,
89 atomic_read(&eb->blocking_writers) == 0);
90 goto again; 119 goto again;
91 } 120 }
92 atomic_inc(&eb->read_locks); 121 atomic_inc(&eb->read_locks);
@@ -129,6 +158,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
129 } 158 }
130 atomic_inc(&eb->write_locks); 159 atomic_inc(&eb->write_locks);
131 atomic_inc(&eb->spinning_writers); 160 atomic_inc(&eb->spinning_writers);
161 eb->lock_owner = current->pid;
132 return 1; 162 return 1;
133} 163}
134 164
@@ -137,6 +167,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
137 */ 167 */
138void btrfs_tree_read_unlock(struct extent_buffer *eb) 168void btrfs_tree_read_unlock(struct extent_buffer *eb)
139{ 169{
170 if (eb->lock_nested) {
171 read_lock(&eb->lock);
172 if (eb->lock_nested && current->pid == eb->lock_owner) {
173 eb->lock_nested = 0;
174 read_unlock(&eb->lock);
175 return;
176 }
177 read_unlock(&eb->lock);
178 }
140 btrfs_assert_tree_read_locked(eb); 179 btrfs_assert_tree_read_locked(eb);
141 WARN_ON(atomic_read(&eb->spinning_readers) == 0); 180 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
142 atomic_dec(&eb->spinning_readers); 181 atomic_dec(&eb->spinning_readers);
@@ -149,6 +188,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
149 */ 188 */
150void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) 189void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
151{ 190{
191 if (eb->lock_nested) {
192 read_lock(&eb->lock);
193 if (eb->lock_nested && current->pid == eb->lock_owner) {
194 eb->lock_nested = 0;
195 read_unlock(&eb->lock);
196 return;
197 }
198 read_unlock(&eb->lock);
199 }
152 btrfs_assert_tree_read_locked(eb); 200 btrfs_assert_tree_read_locked(eb);
153 WARN_ON(atomic_read(&eb->blocking_readers) == 0); 201 WARN_ON(atomic_read(&eb->blocking_readers) == 0);
154 if (atomic_dec_and_test(&eb->blocking_readers)) 202 if (atomic_dec_and_test(&eb->blocking_readers))
@@ -181,6 +229,7 @@ again:
181 WARN_ON(atomic_read(&eb->spinning_writers)); 229 WARN_ON(atomic_read(&eb->spinning_writers));
182 atomic_inc(&eb->spinning_writers); 230 atomic_inc(&eb->spinning_writers);
183 atomic_inc(&eb->write_locks); 231 atomic_inc(&eb->write_locks);
232 eb->lock_owner = current->pid;
184 return 0; 233 return 0;
185} 234}
186 235
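Every hunk in this file adds the same guard: when lock_nested may be set, peek under the spinning read lock to see whether the current task already owns the write lock, and if so treat the read lock or unlock as a recursion no-op. A much-simplified userspace model of the owner check, with pthreads standing in for the kernel rwlock, gettid for current->pid, and none of the memory-ordering care the kernel version needs:

#define _GNU_SOURCE
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/syscall.h>

struct ebuf {
        pthread_rwlock_t lock;
        pid_t lock_owner;       /* writer's tid, like eb->lock_owner */
        int lock_nested;        /* one recursive reader allowed */
};

static pid_t gettid_(void) { return (pid_t)syscall(SYS_gettid); }

static void tree_read_lock(struct ebuf *eb)
{
        if (eb->lock_owner == gettid_()) {
                /* we already hold the write lock: record the recursion
                 * instead of deadlocking against ourselves */
                eb->lock_nested = 1;
                return;
        }
        pthread_rwlock_rdlock(&eb->lock);
}

static void tree_read_unlock(struct ebuf *eb)
{
        if (eb->lock_nested && eb->lock_owner == gettid_()) {
                eb->lock_nested = 0;    /* drop the recursion marker */
                return;
        }
        pthread_rwlock_unlock(&eb->lock);
}

int main(void)
{
        struct ebuf eb = { PTHREAD_RWLOCK_INITIALIZER, 0, 0 };

        pthread_rwlock_wrlock(&eb.lock);
        eb.lock_owner = gettid_();

        tree_read_lock(&eb);    /* nested: returns without blocking */
        tree_read_unlock(&eb);

        eb.lock_owner = 0;
        pthread_rwlock_unlock(&eb.lock);
        puts("ok");
        return 0;
}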
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index cfb55434a469..8c1aae2c845d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1604,12 +1604,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
1604 ret = btrfs_inc_extent_ref(trans, root, new_bytenr, 1604 ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1605 num_bytes, parent, 1605 num_bytes, parent,
1606 btrfs_header_owner(leaf), 1606 btrfs_header_owner(leaf),
1607 key.objectid, key.offset); 1607 key.objectid, key.offset, 1);
1608 BUG_ON(ret); 1608 BUG_ON(ret);
1609 1609
1610 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1610 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1611 parent, btrfs_header_owner(leaf), 1611 parent, btrfs_header_owner(leaf),
1612 key.objectid, key.offset); 1612 key.objectid, key.offset, 1);
1613 BUG_ON(ret); 1613 BUG_ON(ret);
1614 } 1614 }
1615 if (dirty) 1615 if (dirty)
@@ -1778,21 +1778,23 @@ again:
1778 1778
1779 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, 1779 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1780 path->nodes[level]->start, 1780 path->nodes[level]->start,
1781 src->root_key.objectid, level - 1, 0); 1781 src->root_key.objectid, level - 1, 0,
1782 1);
1782 BUG_ON(ret); 1783 BUG_ON(ret);
1783 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 1784 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1784 0, dest->root_key.objectid, level - 1, 1785 0, dest->root_key.objectid, level - 1,
1785 0); 1786 0, 1);
1786 BUG_ON(ret); 1787 BUG_ON(ret);
1787 1788
1788 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, 1789 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1789 path->nodes[level]->start, 1790 path->nodes[level]->start,
1790 src->root_key.objectid, level - 1, 0); 1791 src->root_key.objectid, level - 1, 0,
1792 1);
1791 BUG_ON(ret); 1793 BUG_ON(ret);
1792 1794
1793 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 1795 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1794 0, dest->root_key.objectid, level - 1, 1796 0, dest->root_key.objectid, level - 1,
1795 0); 1797 0, 1);
1796 BUG_ON(ret); 1798 BUG_ON(ret);
1797 1799
1798 btrfs_unlock_up_safe(path, 0); 1800 btrfs_unlock_up_safe(path, 0);
@@ -2244,7 +2246,7 @@ again:
2244 } else { 2246 } else {
2245 list_del_init(&reloc_root->root_list); 2247 list_del_init(&reloc_root->root_list);
2246 } 2248 }
2247 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); 2249 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2248 } 2250 }
2249 2251
2250 if (found) { 2252 if (found) {
@@ -2558,7 +2560,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2558 node->eb->start, blocksize, 2560 node->eb->start, blocksize,
2559 upper->eb->start, 2561 upper->eb->start,
2560 btrfs_header_owner(upper->eb), 2562 btrfs_header_owner(upper->eb),
2561 node->level, 0); 2563 node->level, 0, 1);
2562 BUG_ON(ret); 2564 BUG_ON(ret);
2563 2565
2564 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2566 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -2947,9 +2949,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2947 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2949 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2948 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2950 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2949 while (index <= last_index) { 2951 while (index <= last_index) {
2950 mutex_lock(&inode->i_mutex);
2951 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); 2952 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2952 mutex_unlock(&inode->i_mutex);
2953 if (ret) 2953 if (ret)
2954 goto out; 2954 goto out;
2955 2955
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ddf2c90d3fc0..9770cc5bfb76 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -25,6 +25,7 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "backref.h" 26#include "backref.h"
27#include "extent_io.h" 27#include "extent_io.h"
28#include "check-integrity.h"
28 29
29/* 30/*
30 * This is only the first step towards a full-features scrub. It reads all 31 * This is only the first step towards a full-features scrub. It reads all
@@ -309,7 +310,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
309 u8 ref_level; 310 u8 ref_level;
310 unsigned long ptr = 0; 311 unsigned long ptr = 0;
311 const int bufsize = 4096; 312 const int bufsize = 4096;
312 u64 extent_offset; 313 u64 extent_item_pos;
313 314
314 path = btrfs_alloc_path(); 315 path = btrfs_alloc_path();
315 316
@@ -329,12 +330,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
329 if (ret < 0) 330 if (ret < 0)
330 goto out; 331 goto out;
331 332
332 extent_offset = swarn.logical - found_key.objectid; 333 extent_item_pos = swarn.logical - found_key.objectid;
333 swarn.extent_item_size = found_key.offset; 334 swarn.extent_item_size = found_key.offset;
334 335
335 eb = path->nodes[0]; 336 eb = path->nodes[0];
336 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); 337 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
337 item_size = btrfs_item_size_nr(eb, path->slots[0]); 338 item_size = btrfs_item_size_nr(eb, path->slots[0]);
339 btrfs_release_path(path);
338 340
339 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 341 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
340 do { 342 do {
@@ -351,7 +353,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
351 } else { 353 } else {
352 swarn.path = path; 354 swarn.path = path;
353 iterate_extent_inodes(fs_info, path, found_key.objectid, 355 iterate_extent_inodes(fs_info, path, found_key.objectid,
354 extent_offset, 356 extent_item_pos,
355 scrub_print_warning_inode, &swarn); 357 scrub_print_warning_inode, &swarn);
356 } 358 }
357 359
@@ -732,7 +734,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
732 bio_add_page(bio, page, PAGE_SIZE, 0); 734 bio_add_page(bio, page, PAGE_SIZE, 0);
733 bio->bi_end_io = scrub_fixup_end_io; 735 bio->bi_end_io = scrub_fixup_end_io;
734 bio->bi_private = &complete; 736 bio->bi_private = &complete;
735 submit_bio(rw, bio); 737 btrfsic_submit_bio(rw, bio);
736 738
737 /* this will also unplug the queue */ 739 /* this will also unplug the queue */
738 wait_for_completion(&complete); 740 wait_for_completion(&complete);
@@ -958,7 +960,7 @@ static int scrub_submit(struct scrub_dev *sdev)
958 sdev->curr = -1; 960 sdev->curr = -1;
959 atomic_inc(&sdev->in_flight); 961 atomic_inc(&sdev->in_flight);
960 962
961 submit_bio(READ, sbio->bio); 963 btrfsic_submit_bio(READ, sbio->bio);
962 964
963 return 0; 965 return 0;
964} 966}
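btrfsic_submit_bio() substitutes for submit_bio() at each submission site: with the integrity checker compiled out it should collapse to a plain call, otherwise it can inspect the bio before queueing it. A toy model of that conditional pass-through; the verify() hook and the CHECK_INTEGRITY macro are illustrative, not the real check-integrity.c internals:

#include <stdio.h>

struct bio { int dummy; };

static void submit_bio(int rw, struct bio *bio)
{
        (void)bio;
        printf("submit rw=%d\n", rw);
}

#ifdef CHECK_INTEGRITY  /* stands in for CONFIG_BTRFS_FS_CHECK_INTEGRITY */
static void verify(int rw, struct bio *bio)
{
        (void)rw; (void)bio;    /* inspect the bio before it is queued */
}

static void btrfsic_submit_bio(int rw, struct bio *bio)
{
        verify(rw, bio);
        submit_bio(rw, bio);
}
#else
#define btrfsic_submit_bio submit_bio   /* zero cost when disabled */
#endif

int main(void)
{
        struct bio b = { 0 };

        btrfsic_submit_bio(0, &b);
        return 0;
}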
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ae488aa1966a..3ce97b217cbe 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -147,13 +147,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
147 147
148static void btrfs_put_super(struct super_block *sb) 148static void btrfs_put_super(struct super_block *sb)
149{ 149{
150 struct btrfs_root *root = btrfs_sb(sb); 150 (void)close_ctree(btrfs_sb(sb)->tree_root);
151 int ret; 151 /* FIXME: need to fix VFS to return error? */
152 152 /* AV: return it _where_? ->put_super() can be triggered by any number
153 ret = close_ctree(root); 153 * of async events, up to and including delivery of SIGKILL to the
154 sb->s_fs_info = NULL; 154 * last process that kept it busy. Or segfault in the aforementioned
155 155 * process... Whom would you report that to?
156 (void)ret; /* FIXME: need to fix VFS to return error? */ 156 */
157} 157}
158 158
159enum { 159enum {
@@ -163,8 +163,11 @@ enum {
163 Opt_compress_type, Opt_compress_force, Opt_compress_force_type, 163 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
167 Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, 167 Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
168 Opt_check_integrity, Opt_check_integrity_including_extent_data,
169 Opt_check_integrity_print_mask,
170 Opt_err,
168}; 171};
169 172
170static match_table_t tokens = { 173static match_table_t tokens = {
@@ -199,6 +202,10 @@ static match_table_t tokens = {
199 {Opt_inode_cache, "inode_cache"}, 202 {Opt_inode_cache, "inode_cache"},
200 {Opt_no_space_cache, "nospace_cache"}, 203 {Opt_no_space_cache, "nospace_cache"},
201 {Opt_recovery, "recovery"}, 204 {Opt_recovery, "recovery"},
205 {Opt_skip_balance, "skip_balance"},
206 {Opt_check_integrity, "check_int"},
207 {Opt_check_integrity_including_extent_data, "check_int_data"},
208 {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
202 {Opt_err, NULL}, 209 {Opt_err, NULL},
203}; 210};
204 211
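The new tokens are ordinary mount options, so from userspace they travel in the data string of mount(2). For example (device, mount point and mask value are illustrative):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* skip resuming an interrupted balance, enable the integrity
         * checker and give it a verbose print mask */
        if (mount("/dev/sdb1", "/mnt", "btrfs", 0,
                  "skip_balance,check_int,check_int_print_mask=7") < 0)
                perror("mount");
        return 0;
}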
@@ -397,6 +404,40 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
397 printk(KERN_INFO "btrfs: enabling auto recovery"); 404 printk(KERN_INFO "btrfs: enabling auto recovery");
398 btrfs_set_opt(info->mount_opt, RECOVERY); 405 btrfs_set_opt(info->mount_opt, RECOVERY);
399 break; 406 break;
407 case Opt_skip_balance:
408 btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
409 break;
410#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
411 case Opt_check_integrity_including_extent_data:
412 printk(KERN_INFO "btrfs: enabling check integrity"
413 " including extent data\n");
414 btrfs_set_opt(info->mount_opt,
415 CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
416 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
417 break;
418 case Opt_check_integrity:
419 printk(KERN_INFO "btrfs: enabling check integrity\n");
420 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
421 break;
422 case Opt_check_integrity_print_mask:
423 intarg = 0;
424 match_int(&args[0], &intarg);
425 if (intarg) {
426 info->check_integrity_print_mask = intarg;
427 printk(KERN_INFO "btrfs:"
428 " check_integrity_print_mask 0x%x\n",
429 info->check_integrity_print_mask);
430 }
431 break;
432#else
433 case Opt_check_integrity_including_extent_data:
434 case Opt_check_integrity:
435 case Opt_check_integrity_print_mask:
436 printk(KERN_ERR "btrfs: support for check_integrity*"
437 " not compiled in!\n");
438 ret = -EINVAL;
439 goto out;
440#endif
400 case Opt_err: 441 case Opt_err:
401 printk(KERN_INFO "btrfs: unrecognized mount option " 442 printk(KERN_INFO "btrfs: unrecognized mount option "
402 "'%s'\n", p); 443 "'%s'\n", p);
@@ -500,7 +541,8 @@ out:
500static struct dentry *get_default_root(struct super_block *sb, 541static struct dentry *get_default_root(struct super_block *sb,
501 u64 subvol_objectid) 542 u64 subvol_objectid)
502{ 543{
503 struct btrfs_root *root = sb->s_fs_info; 544 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
545 struct btrfs_root *root = fs_info->tree_root;
504 struct btrfs_root *new_root; 546 struct btrfs_root *new_root;
505 struct btrfs_dir_item *di; 547 struct btrfs_dir_item *di;
506 struct btrfs_path *path; 548 struct btrfs_path *path;
@@ -530,7 +572,7 @@ static struct dentry *get_default_root(struct super_block *sb,
530 * will mount by default if we haven't been given a specific subvolume 572 * will mount by default if we haven't been given a specific subvolume
531 * to mount. 573 * to mount.
532 */ 574 */
533 dir_id = btrfs_super_root_dir(root->fs_info->super_copy); 575 dir_id = btrfs_super_root_dir(fs_info->super_copy);
534 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 576 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
535 if (IS_ERR(di)) { 577 if (IS_ERR(di)) {
536 btrfs_free_path(path); 578 btrfs_free_path(path);
@@ -544,7 +586,7 @@ static struct dentry *get_default_root(struct super_block *sb,
544 */ 586 */
545 btrfs_free_path(path); 587 btrfs_free_path(path);
546 dir_id = BTRFS_FIRST_FREE_OBJECTID; 588 dir_id = BTRFS_FIRST_FREE_OBJECTID;
547 new_root = root->fs_info->fs_root; 589 new_root = fs_info->fs_root;
548 goto setup_root; 590 goto setup_root;
549 } 591 }
550 592
@@ -552,7 +594,7 @@ static struct dentry *get_default_root(struct super_block *sb,
552 btrfs_free_path(path); 594 btrfs_free_path(path);
553 595
554find_root: 596find_root:
555 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 597 new_root = btrfs_read_fs_root_no_name(fs_info, &location);
556 if (IS_ERR(new_root)) 598 if (IS_ERR(new_root))
557 return ERR_CAST(new_root); 599 return ERR_CAST(new_root);
558 600
@@ -588,7 +630,7 @@ static int btrfs_fill_super(struct super_block *sb,
588{ 630{
589 struct inode *inode; 631 struct inode *inode;
590 struct dentry *root_dentry; 632 struct dentry *root_dentry;
591 struct btrfs_root *tree_root; 633 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
592 struct btrfs_key key; 634 struct btrfs_key key;
593 int err; 635 int err;
594 636
@@ -603,18 +645,16 @@ static int btrfs_fill_super(struct super_block *sb,
603 sb->s_flags |= MS_POSIXACL; 645 sb->s_flags |= MS_POSIXACL;
604#endif 646#endif
605 647
606 tree_root = open_ctree(sb, fs_devices, (char *)data); 648 err = open_ctree(sb, fs_devices, (char *)data);
607 649 if (err) {
608 if (IS_ERR(tree_root)) {
609 printk("btrfs: open_ctree failed\n"); 650 printk("btrfs: open_ctree failed\n");
610 return PTR_ERR(tree_root); 651 return err;
611 } 652 }
612 sb->s_fs_info = tree_root;
613 653
614 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 654 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
615 key.type = BTRFS_INODE_ITEM_KEY; 655 key.type = BTRFS_INODE_ITEM_KEY;
616 key.offset = 0; 656 key.offset = 0;
617 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL); 657 inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
618 if (IS_ERR(inode)) { 658 if (IS_ERR(inode)) {
619 err = PTR_ERR(inode); 659 err = PTR_ERR(inode);
620 goto fail_close; 660 goto fail_close;
@@ -631,23 +671,25 @@ static int btrfs_fill_super(struct super_block *sb,
631 671
632 save_mount_options(sb, data); 672 save_mount_options(sb, data);
633 cleancache_init_fs(sb); 673 cleancache_init_fs(sb);
674 sb->s_flags |= MS_ACTIVE;
634 return 0; 675 return 0;
635 676
636fail_close: 677fail_close:
637 close_ctree(tree_root); 678 close_ctree(fs_info->tree_root);
638 return err; 679 return err;
639} 680}
640 681
641int btrfs_sync_fs(struct super_block *sb, int wait) 682int btrfs_sync_fs(struct super_block *sb, int wait)
642{ 683{
643 struct btrfs_trans_handle *trans; 684 struct btrfs_trans_handle *trans;
644 struct btrfs_root *root = btrfs_sb(sb); 685 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
686 struct btrfs_root *root = fs_info->tree_root;
645 int ret; 687 int ret;
646 688
647 trace_btrfs_sync_fs(wait); 689 trace_btrfs_sync_fs(wait);
648 690
649 if (!wait) { 691 if (!wait) {
650 filemap_flush(root->fs_info->btree_inode->i_mapping); 692 filemap_flush(fs_info->btree_inode->i_mapping);
651 return 0; 693 return 0;
652 } 694 }
653 695
@@ -663,8 +705,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
663 705
664static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) 706static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
665{ 707{
666 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 708 struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
667 struct btrfs_fs_info *info = root->fs_info; 709 struct btrfs_root *root = info->tree_root;
668 char *compress_type; 710 char *compress_type;
669 711
670 if (btrfs_test_opt(root, DEGRADED)) 712 if (btrfs_test_opt(root, DEGRADED))
@@ -722,28 +764,25 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
722 seq_puts(seq, ",autodefrag"); 764 seq_puts(seq, ",autodefrag");
723 if (btrfs_test_opt(root, INODE_MAP_CACHE)) 765 if (btrfs_test_opt(root, INODE_MAP_CACHE))
724 seq_puts(seq, ",inode_cache"); 766 seq_puts(seq, ",inode_cache");
767 if (btrfs_test_opt(root, SKIP_BALANCE))
768 seq_puts(seq, ",skip_balance");
725 return 0; 769 return 0;
726} 770}
727 771
728static int btrfs_test_super(struct super_block *s, void *data) 772static int btrfs_test_super(struct super_block *s, void *data)
729{ 773{
730 struct btrfs_root *test_root = data; 774 struct btrfs_fs_info *p = data;
731 struct btrfs_root *root = btrfs_sb(s); 775 struct btrfs_fs_info *fs_info = btrfs_sb(s);
732 776
733 /* 777 return fs_info->fs_devices == p->fs_devices;
734 * If this super block is going away, return false as it
735 * can't match as an existing super block.
736 */
737 if (!atomic_read(&s->s_active))
738 return 0;
739 return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
740} 778}
741 779
742static int btrfs_set_super(struct super_block *s, void *data) 780static int btrfs_set_super(struct super_block *s, void *data)
743{ 781{
744 s->s_fs_info = data; 782 int err = set_anon_super(s, data);
745 783 if (!err)
746 return set_anon_super(s, data); 784 s->s_fs_info = data;
785 return err;
747} 786}
748 787
749/* 788/*
@@ -903,12 +942,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
903 if (!fs_info) 942 if (!fs_info)
904 return ERR_PTR(-ENOMEM); 943 return ERR_PTR(-ENOMEM);
905 944
906 fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
907 if (!fs_info->tree_root) {
908 error = -ENOMEM;
909 goto error_fs_info;
910 }
911 fs_info->tree_root->fs_info = fs_info;
912 fs_info->fs_devices = fs_devices; 945 fs_info->fs_devices = fs_devices;
913 946
914 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); 947 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
@@ -928,43 +961,30 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
928 } 961 }
929 962
930 bdev = fs_devices->latest_bdev; 963 bdev = fs_devices->latest_bdev;
931 s = sget(fs_type, btrfs_test_super, btrfs_set_super, 964 s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info);
932 fs_info->tree_root);
933 if (IS_ERR(s)) { 965 if (IS_ERR(s)) {
934 error = PTR_ERR(s); 966 error = PTR_ERR(s);
935 goto error_close_devices; 967 goto error_close_devices;
936 } 968 }
937 969
938 if (s->s_root) { 970 if (s->s_root) {
939 if ((flags ^ s->s_flags) & MS_RDONLY) {
940 deactivate_locked_super(s);
941 error = -EBUSY;
942 goto error_close_devices;
943 }
944
945 btrfs_close_devices(fs_devices); 971 btrfs_close_devices(fs_devices);
946 free_fs_info(fs_info); 972 free_fs_info(fs_info);
973 if ((flags ^ s->s_flags) & MS_RDONLY)
974 error = -EBUSY;
947 } else { 975 } else {
948 char b[BDEVNAME_SIZE]; 976 char b[BDEVNAME_SIZE];
949 977
950 s->s_flags = flags | MS_NOSEC; 978 s->s_flags = flags | MS_NOSEC;
951 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 979 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
952 btrfs_sb(s)->fs_info->bdev_holder = fs_type; 980 btrfs_sb(s)->bdev_holder = fs_type;
953 error = btrfs_fill_super(s, fs_devices, data, 981 error = btrfs_fill_super(s, fs_devices, data,
954 flags & MS_SILENT ? 1 : 0); 982 flags & MS_SILENT ? 1 : 0);
955 if (error) {
956 deactivate_locked_super(s);
957 return ERR_PTR(error);
958 }
959
960 s->s_flags |= MS_ACTIVE;
961 } 983 }
962 984
963 root = get_default_root(s, subvol_objectid); 985 root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
964 if (IS_ERR(root)) { 986 if (IS_ERR(root))
965 deactivate_locked_super(s); 987 deactivate_locked_super(s);
966 return root;
967 }
968 988
969 return root; 989 return root;
970 990
@@ -977,7 +997,8 @@ error_fs_info:
977 997
978static int btrfs_remount(struct super_block *sb, int *flags, char *data) 998static int btrfs_remount(struct super_block *sb, int *flags, char *data)
979{ 999{
980 struct btrfs_root *root = btrfs_sb(sb); 1000 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1001 struct btrfs_root *root = fs_info->tree_root;
981 int ret; 1002 int ret;
982 1003
983 ret = btrfs_parse_options(root, data); 1004 ret = btrfs_parse_options(root, data);
@@ -993,13 +1014,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
993 ret = btrfs_commit_super(root); 1014 ret = btrfs_commit_super(root);
994 WARN_ON(ret); 1015 WARN_ON(ret);
995 } else { 1016 } else {
996 if (root->fs_info->fs_devices->rw_devices == 0) 1017 if (fs_info->fs_devices->rw_devices == 0)
997 return -EACCES; 1018 return -EACCES;
998 1019
999 if (btrfs_super_log_root(root->fs_info->super_copy) != 0) 1020 if (btrfs_super_log_root(fs_info->super_copy) != 0)
1000 return -EINVAL; 1021 return -EINVAL;
1001 1022
1002 ret = btrfs_cleanup_fs_roots(root->fs_info); 1023 ret = btrfs_cleanup_fs_roots(fs_info);
1003 WARN_ON(ret); 1024 WARN_ON(ret);
1004 1025
1005 /* recover relocation */ 1026 /* recover relocation */
@@ -1168,18 +1189,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1168 1189
1169static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1190static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1170{ 1191{
1171 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1192 struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
1172 struct btrfs_super_block *disk_super = root->fs_info->super_copy; 1193 struct btrfs_super_block *disk_super = fs_info->super_copy;
1173 struct list_head *head = &root->fs_info->space_info; 1194 struct list_head *head = &fs_info->space_info;
1174 struct btrfs_space_info *found; 1195 struct btrfs_space_info *found;
1175 u64 total_used = 0; 1196 u64 total_used = 0;
1176 u64 total_free_data = 0; 1197 u64 total_free_data = 0;
1177 int bits = dentry->d_sb->s_blocksize_bits; 1198 int bits = dentry->d_sb->s_blocksize_bits;
1178 __be32 *fsid = (__be32 *)root->fs_info->fsid; 1199 __be32 *fsid = (__be32 *)fs_info->fsid;
1179 int ret; 1200 int ret;
1180 1201
1181 /* holding chunk_mutex to avoid allocating new chunks */ 1202 /* holding chunk_mutex to avoid allocating new chunks */
1182 mutex_lock(&root->fs_info->chunk_mutex); 1203 mutex_lock(&fs_info->chunk_mutex);
1183 rcu_read_lock(); 1204 rcu_read_lock();
1184 list_for_each_entry_rcu(found, head, list) { 1205 list_for_each_entry_rcu(found, head, list) {
1185 if (found->flags & BTRFS_BLOCK_GROUP_DATA) { 1206 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1198,14 +1219,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1198 buf->f_bsize = dentry->d_sb->s_blocksize; 1219 buf->f_bsize = dentry->d_sb->s_blocksize;
1199 buf->f_type = BTRFS_SUPER_MAGIC; 1220 buf->f_type = BTRFS_SUPER_MAGIC;
1200 buf->f_bavail = total_free_data; 1221 buf->f_bavail = total_free_data;
1201 ret = btrfs_calc_avail_data_space(root, &total_free_data); 1222 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
1202 if (ret) { 1223 if (ret) {
1203 mutex_unlock(&root->fs_info->chunk_mutex); 1224 mutex_unlock(&fs_info->chunk_mutex);
1204 return ret; 1225 return ret;
1205 } 1226 }
1206 buf->f_bavail += total_free_data; 1227 buf->f_bavail += total_free_data;
1207 buf->f_bavail = buf->f_bavail >> bits; 1228 buf->f_bavail = buf->f_bavail >> bits;
1208 mutex_unlock(&root->fs_info->chunk_mutex); 1229 mutex_unlock(&fs_info->chunk_mutex);
1209 1230
1210 /* We treat it as constant endianness (it doesn't matter _which_) 1231 /* We treat it as constant endianness (it doesn't matter _which_)
1211 because we want the fsid to come out the same whether mounted 1232 because we want the fsid to come out the same whether mounted
@@ -1219,11 +1240,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1219 return 0; 1240 return 0;
1220} 1241}
1221 1242
1243static void btrfs_kill_super(struct super_block *sb)
1244{
1245 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1246 kill_anon_super(sb);
1247 free_fs_info(fs_info);
1248}
1249
1222static struct file_system_type btrfs_fs_type = { 1250static struct file_system_type btrfs_fs_type = {
1223 .owner = THIS_MODULE, 1251 .owner = THIS_MODULE,
1224 .name = "btrfs", 1252 .name = "btrfs",
1225 .mount = btrfs_mount, 1253 .mount = btrfs_mount,
1226 .kill_sb = kill_anon_super, 1254 .kill_sb = btrfs_kill_super,
1227 .fs_flags = FS_REQUIRES_DEV, 1255 .fs_flags = FS_REQUIRES_DEV,
1228}; 1256};
1229 1257
@@ -1257,17 +1285,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
1257 1285
1258static int btrfs_freeze(struct super_block *sb) 1286static int btrfs_freeze(struct super_block *sb)
1259{ 1287{
1260 struct btrfs_root *root = btrfs_sb(sb); 1288 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1261 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1289 mutex_lock(&fs_info->transaction_kthread_mutex);
1262 mutex_lock(&root->fs_info->cleaner_mutex); 1290 mutex_lock(&fs_info->cleaner_mutex);
1263 return 0; 1291 return 0;
1264} 1292}
1265 1293
1266static int btrfs_unfreeze(struct super_block *sb) 1294static int btrfs_unfreeze(struct super_block *sb)
1267{ 1295{
1268 struct btrfs_root *root = btrfs_sb(sb); 1296 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1269 mutex_unlock(&root->fs_info->cleaner_mutex); 1297 mutex_unlock(&fs_info->cleaner_mutex);
1270 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1298 mutex_unlock(&fs_info->transaction_kthread_mutex);
1271 return 0; 1299 return 0;
1272} 1300}
1273 1301
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 81376d94cd3c..287a6728b1ad 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -36,6 +36,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
36 WARN_ON(atomic_read(&transaction->use_count) == 0); 36 WARN_ON(atomic_read(&transaction->use_count) == 0);
37 if (atomic_dec_and_test(&transaction->use_count)) { 37 if (atomic_dec_and_test(&transaction->use_count)) {
38 BUG_ON(!list_empty(&transaction->list)); 38 BUG_ON(!list_empty(&transaction->list));
39 WARN_ON(transaction->delayed_refs.root.rb_node);
40 WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
39 memset(transaction, 0, sizeof(*transaction)); 41 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction); 42 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 } 43 }
@@ -108,8 +110,11 @@ loop:
108 cur_trans->delayed_refs.num_heads = 0; 110 cur_trans->delayed_refs.num_heads = 0;
109 cur_trans->delayed_refs.flushing = 0; 111 cur_trans->delayed_refs.flushing = 0;
110 cur_trans->delayed_refs.run_delayed_start = 0; 112 cur_trans->delayed_refs.run_delayed_start = 0;
113 cur_trans->delayed_refs.seq = 1;
114 init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
111 spin_lock_init(&cur_trans->commit_lock); 115 spin_lock_init(&cur_trans->commit_lock);
112 spin_lock_init(&cur_trans->delayed_refs.lock); 116 spin_lock_init(&cur_trans->delayed_refs.lock);
117 INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
113 118
114 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 119 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
115 list_add_tail(&cur_trans->list, &root->fs_info->trans_list); 120 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
@@ -321,6 +326,8 @@ again:
321 } 326 }
322 327
323 if (num_bytes) { 328 if (num_bytes) {
329 trace_btrfs_space_reservation(root->fs_info, "transaction",
330 (u64)h, num_bytes, 1);
324 h->block_rsv = &root->fs_info->trans_block_rsv; 331 h->block_rsv = &root->fs_info->trans_block_rsv;
325 h->bytes_reserved = num_bytes; 332 h->bytes_reserved = num_bytes;
326 } 333 }
@@ -467,19 +474,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
467 474
468 btrfs_trans_release_metadata(trans, root); 475 btrfs_trans_release_metadata(trans, root);
469 trans->block_rsv = NULL; 476 trans->block_rsv = NULL;
470 while (count < 4) { 477 while (count < 2) {
471 unsigned long cur = trans->delayed_ref_updates; 478 unsigned long cur = trans->delayed_ref_updates;
472 trans->delayed_ref_updates = 0; 479 trans->delayed_ref_updates = 0;
473 if (cur && 480 if (cur &&
474 trans->transaction->delayed_refs.num_heads_ready > 64) { 481 trans->transaction->delayed_refs.num_heads_ready > 64) {
475 trans->delayed_ref_updates = 0; 482 trans->delayed_ref_updates = 0;
476
477 /*
478 * do a full flush if the transaction is trying
479 * to close
480 */
481 if (trans->transaction->delayed_refs.flushing)
482 cur = 0;
483 btrfs_run_delayed_refs(trans, root, cur); 483 btrfs_run_delayed_refs(trans, root, cur);
484 } else { 484 } else {
485 break; 485 break;
@@ -1393,9 +1393,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1393 1393
1394 if (btrfs_header_backref_rev(root->node) < 1394 if (btrfs_header_backref_rev(root->node) <
1395 BTRFS_MIXED_BACKREF_REV) 1395 BTRFS_MIXED_BACKREF_REV)
1396 btrfs_drop_snapshot(root, NULL, 0); 1396 btrfs_drop_snapshot(root, NULL, 0, 0);
1397 else 1397 else
1398 btrfs_drop_snapshot(root, NULL, 1); 1398 btrfs_drop_snapshot(root, NULL, 1, 0);
1399 } 1399 }
1400 return 0; 1400 return 0;
1401} 1401}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3568374d419d..966cc74f5d6c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -589,7 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
589 ret = btrfs_inc_extent_ref(trans, root, 589 ret = btrfs_inc_extent_ref(trans, root,
590 ins.objectid, ins.offset, 590 ins.objectid, ins.offset,
591 0, root->root_key.objectid, 591 0, root->root_key.objectid,
592 key->objectid, offset); 592 key->objectid, offset, 0);
593 BUG_ON(ret); 593 BUG_ON(ret);
594 } else { 594 } else {
595 /* 595 /*
@@ -1957,7 +1957,8 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
1957 1957
1958 finish_wait(&root->log_commit_wait[index], &wait); 1958 finish_wait(&root->log_commit_wait[index], &wait);
1959 mutex_lock(&root->log_mutex); 1959 mutex_lock(&root->log_mutex);
1960 } while (root->log_transid < transid + 2 && 1960 } while (root->fs_info->last_trans_log_full_commit !=
1961 trans->transid && root->log_transid < transid + 2 &&
1961 atomic_read(&root->log_commit[index])); 1962 atomic_read(&root->log_commit[index]));
1962 return 0; 1963 return 0;
1963} 1964}
@@ -1966,7 +1967,8 @@ static int wait_for_writer(struct btrfs_trans_handle *trans,
1966 struct btrfs_root *root) 1967 struct btrfs_root *root)
1967{ 1968{
1968 DEFINE_WAIT(wait); 1969 DEFINE_WAIT(wait);
1969 while (atomic_read(&root->log_writers)) { 1970 while (root->fs_info->last_trans_log_full_commit !=
1971 trans->transid && atomic_read(&root->log_writers)) {
1970 prepare_to_wait(&root->log_writer_wait, 1972 prepare_to_wait(&root->log_writer_wait,
1971 &wait, TASK_UNINTERRUPTIBLE); 1973 &wait, TASK_UNINTERRUPTIBLE);
1972 mutex_unlock(&root->log_mutex); 1974 mutex_unlock(&root->log_mutex);
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
new file mode 100644
index 000000000000..12f5147bd2b1
--- /dev/null
+++ b/fs/btrfs/ulist.c
@@ -0,0 +1,220 @@
1/*
2 * Copyright (C) 2011 STRATO AG
3 * written by Arne Jansen <sensille@gmx.net>
4 * Distributed under the GNU GPL license version 2.
5 */
6
7#include <linux/slab.h>
8#include <linux/module.h>
9#include "ulist.h"
10
11/*
12 * ulist is a generic data structure to hold a collection of unique u64
 13 * values. The only operations it supports are adding to the list and
14 * enumerating it.
15 * It is possible to store an auxiliary value along with the key.
16 *
17 * The implementation is preliminary and can probably be sped up
18 * significantly. A first step would be to store the values in an rbtree
19 * as soon as ULIST_SIZE is exceeded.
20 *
21 * A sample usage for ulists is the enumeration of directed graphs without
22 * visiting a node twice. The pseudo-code could look like this:
23 *
24 * ulist = ulist_alloc();
25 * ulist_add(ulist, root);
26 * elem = NULL;
27 *
 28 * while ((elem = ulist_next(ulist, elem))) {
29 * for (all child nodes n in elem)
30 * ulist_add(ulist, n);
31 * do something useful with the node;
32 * }
33 * ulist_free(ulist);
34 *
 35 * This assumes the graph nodes are addressable by u64. This stems from the
36 * usage for tree enumeration in btrfs, where the logical addresses are
37 * 64 bit.
38 *
 39 * It is also useful for tree enumeration, which could be done elegantly
 40 * with recursion were it not for kernel stack limitations. The
41 * loop would be similar to the above.
42 */
43
44/**
45 * ulist_init - freshly initialize a ulist
46 * @ulist: the ulist to initialize
47 *
48 * Note: don't use this function to init an already used ulist, use
49 * ulist_reinit instead.
50 */
51void ulist_init(struct ulist *ulist)
52{
53 ulist->nnodes = 0;
54 ulist->nodes = ulist->int_nodes;
55 ulist->nodes_alloced = ULIST_SIZE;
56}
57EXPORT_SYMBOL(ulist_init);
58
59/**
60 * ulist_fini - free up additionally allocated memory for the ulist
61 * @ulist: the ulist from which to free the additional memory
62 *
63 * This is useful in cases where the base 'struct ulist' has been statically
64 * allocated.
65 */
66void ulist_fini(struct ulist *ulist)
67{
68 /*
69 * The first ULIST_SIZE elements are stored inline in struct ulist.
 70 * Only if more elements were allocated do they need to be freed.
71 */
72 if (ulist->nodes_alloced > ULIST_SIZE)
73 kfree(ulist->nodes);
74 ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */
75}
76EXPORT_SYMBOL(ulist_fini);
77
78/**
79 * ulist_reinit - prepare a ulist for reuse
80 * @ulist: ulist to be reused
81 *
82 * Free up all additional memory allocated for the list elements and reinit
83 * the ulist.
84 */
85void ulist_reinit(struct ulist *ulist)
86{
87 ulist_fini(ulist);
88 ulist_init(ulist);
89}
90EXPORT_SYMBOL(ulist_reinit);
91
92/**
93 * ulist_alloc - dynamically allocate a ulist
 94 * @gfp_mask: allocation flags to use for the base allocation
95 *
96 * The allocated ulist will be returned in an initialized state.
97 */
98struct ulist *ulist_alloc(unsigned long gfp_mask)
99{
100 struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
101
102 if (!ulist)
103 return NULL;
104
105 ulist_init(ulist);
106
107 return ulist;
108}
109EXPORT_SYMBOL(ulist_alloc);
110
111/**
112 * ulist_free - free dynamically allocated ulist
113 * @ulist: ulist to free
114 *
115 * It is not necessary to call ulist_fini before.
116 */
117void ulist_free(struct ulist *ulist)
118{
119 if (!ulist)
120 return;
121 ulist_fini(ulist);
122 kfree(ulist);
123}
124EXPORT_SYMBOL(ulist_free);
125
126/**
127 * ulist_add - add an element to the ulist
128 * @ulist: ulist to add the element to
129 * @val: value to add to ulist
130 * @aux: auxiliary value to store along with val
131 * @gfp_mask: flags to use for allocation
132 *
133 * Note: locking must be provided by the caller. In case of rwlocks write
 134 * locking is needed.
135 *
136 * Add an element to a ulist. The @val will only be added if it doesn't
137 * already exist. If it is added, the auxiliary value @aux is stored along with
138 * it. In case @val already exists in the ulist, @aux is ignored, even if
139 * it differs from the already stored value.
140 *
141 * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been
142 * inserted.
143 * In case of allocation failure -ENOMEM is returned and the ulist stays
144 * unaltered.
145 */
146int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
147 unsigned long gfp_mask)
148{
149 int i;
150
151 for (i = 0; i < ulist->nnodes; ++i) {
152 if (ulist->nodes[i].val == val)
153 return 0;
154 }
155
156 if (ulist->nnodes >= ulist->nodes_alloced) {
157 u64 new_alloced = ulist->nodes_alloced + 128;
158 struct ulist_node *new_nodes;
159 void *old = NULL;
160
161 /*
162 * if nodes_alloced == ULIST_SIZE no memory has been allocated
163 * yet, so pass NULL to krealloc
164 */
165 if (ulist->nodes_alloced > ULIST_SIZE)
166 old = ulist->nodes;
167
168 new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced,
169 gfp_mask);
170 if (!new_nodes)
171 return -ENOMEM;
172
173 if (!old)
174 memcpy(new_nodes, ulist->int_nodes,
175 sizeof(ulist->int_nodes));
176
177 ulist->nodes = new_nodes;
178 ulist->nodes_alloced = new_alloced;
179 }
180 ulist->nodes[ulist->nnodes].val = val;
181 ulist->nodes[ulist->nnodes].aux = aux;
182 ++ulist->nnodes;
183
184 return 1;
185}
186EXPORT_SYMBOL(ulist_add);
187
188/**
189 * ulist_next - iterate ulist
190 * @ulist: ulist to iterate
191 * @prev: previously returned element or %NULL to start iteration
192 *
193 * Note: locking must be provided by the caller. In case of rwlocks only read
 194 * locking is needed.
195 *
 196 * This function is used to iterate a ulist. The iteration is started with
197 * @prev = %NULL. It returns the next element from the ulist or %NULL when the
198 * end is reached. No guarantee is made with respect to the order in which
199 * the elements are returned. They might neither be returned in order of
200 * addition nor in ascending order.
201 * It is allowed to call ulist_add during an enumeration. Newly added items
202 * are guaranteed to show up in the running enumeration.
203 */
204struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev)
205{
206 int next;
207
208 if (ulist->nnodes == 0)
209 return NULL;
210
211 if (!prev)
212 return &ulist->nodes[0];
213
214 next = (prev - ulist->nodes) + 1;
215 if (next < 0 || next >= ulist->nnodes)
216 return NULL;
217
218 return &ulist->nodes[next];
219}
220EXPORT_SYMBOL(ulist_next);
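
A minimal, compilable sketch of the enumeration pattern that the file-header
comment above gives in pseudo-code. Only the ulist_* calls come from the new
API; struct demo_node and the demo_lookup() helper are illustrative
assumptions, not part of btrfs.

/*
 * Sketch only: walk a graph of u64-addressable nodes without visiting
 * any node twice. struct demo_node and demo_lookup() are hypothetical;
 * the ulist_* calls are the API added in this file.
 */
#include <linux/slab.h>
#include "ulist.h"

struct demo_node {
	u64 addr;		/* unique u64 address of this node */
	int nchildren;
	u64 child[8];		/* addresses of the child nodes */
};

struct demo_node *demo_lookup(u64 addr);	/* hypothetical helper */

static int demo_walk(u64 root_addr)
{
	struct ulist *seen;
	struct ulist_node *elem = NULL;
	int i, ret;

	seen = ulist_alloc(GFP_NOFS);
	if (!seen)
		return -ENOMEM;

	ret = ulist_add(seen, root_addr, 0, GFP_NOFS);
	if (ret < 0)
		goto out;

	/* ulist_next() also returns nodes added during the enumeration */
	while ((elem = ulist_next(seen, elem))) {
		struct demo_node *node = demo_lookup(elem->val);

		if (!node)
			continue;
		for (i = 0; i < node->nchildren; i++) {
			/* 0: already present, 1: inserted, <0: -ENOMEM */
			ret = ulist_add(seen, node->child[i], 0, GFP_NOFS);
			if (ret < 0)
				goto out;
		}
		/* do something useful with the node here */
	}
	ret = 0;
out:
	ulist_free(seen);
	return ret;
}
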
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
new file mode 100644
index 000000000000..2e25dec58ec0
--- /dev/null
+++ b/fs/btrfs/ulist.h
@@ -0,0 +1,68 @@
1/*
2 * Copyright (C) 2011 STRATO AG
3 * written by Arne Jansen <sensille@gmx.net>
4 * Distributed under the GNU GPL license version 2.
5 *
6 */
7
8#ifndef __ULIST__
9#define __ULIST__
10
11/*
12 * ulist is a generic data structure to hold a collection of unique u64
 13 * values. The only operations it supports are adding to the list and
14 * enumerating it.
15 * It is possible to store an auxiliary value along with the key.
16 *
17 * The implementation is preliminary and can probably be sped up
18 * significantly. A first step would be to store the values in an rbtree
19 * as soon as ULIST_SIZE is exceeded.
20 */
21
22/*
23 * number of elements statically allocated inside struct ulist
24 */
25#define ULIST_SIZE 16
26
27/*
28 * element of the list
29 */
30struct ulist_node {
31 u64 val; /* value to store */
32 unsigned long aux; /* auxiliary value saved along with the val */
33};
34
35struct ulist {
36 /*
37 * number of elements stored in list
38 */
39 unsigned long nnodes;
40
41 /*
42 * number of nodes we already have room for
43 */
44 unsigned long nodes_alloced;
45
46 /*
47 * pointer to the array storing the elements. The first ULIST_SIZE
 48 * elements are stored inline. In this case it points to int_nodes.
49 * After exceeding ULIST_SIZE, dynamic memory is allocated.
50 */
51 struct ulist_node *nodes;
52
53 /*
54 * inline storage space for the first ULIST_SIZE entries
55 */
56 struct ulist_node int_nodes[ULIST_SIZE];
57};
58
59void ulist_init(struct ulist *ulist);
60void ulist_fini(struct ulist *ulist);
61void ulist_reinit(struct ulist *ulist);
62struct ulist *ulist_alloc(unsigned long gfp_mask);
63void ulist_free(struct ulist *ulist);
64int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
65 unsigned long gfp_mask);
66struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev);
67
68#endif
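
The struct layout above implies two lifetime rules worth making explicit: the
first ULIST_SIZE (16) unique values are stored inline in int_nodes[] and never
allocate, and past that point ulist_add() grows the node array via krealloc in
128-entry steps, returning -ENOMEM (with the ulist unaltered) on failure. A
short sketch under those assumptions for a statically embedded ulist; all
demo_* names are illustrative.

/*
 * Sketch only: lifetime rules for an embedded ulist. ulist_fini() only
 * frees memory if ulist_add() had to spill past the inline storage.
 */
#include <linux/slab.h>
#include "ulist.h"

struct demo_ctx {
	struct ulist seen;	/* embedded: no ulist_alloc()/ulist_free() */
};

static int demo_collect(struct demo_ctx *ctx, const u64 *vals, int n)
{
	int i, ret;

	ulist_init(&ctx->seen);	/* use ulist_reinit() when reusing it */

	for (i = 0; i < n; i++) {
		/*
		 * The first ULIST_SIZE unique values never allocate;
		 * afterwards the node array grows 128 entries at a time
		 * and -ENOMEM leaves the ulist unaltered.
		 */
		ret = ulist_add(&ctx->seen, vals[i], 0, GFP_NOFS);
		if (ret < 0) {
			ulist_fini(&ctx->seen);
			return ret;
		}
	}
	return 0;	/* caller runs ulist_fini(&ctx->seen) when done */
}
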
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4b839fd3c9d..0b4e2af7954d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/kthread.h>
26#include <asm/div64.h> 27#include <asm/div64.h>
27#include "compat.h" 28#include "compat.h"
28#include "ctree.h" 29#include "ctree.h"
@@ -32,6 +33,7 @@
32#include "print-tree.h" 33#include "print-tree.h"
33#include "volumes.h" 34#include "volumes.h"
34#include "async-thread.h" 35#include "async-thread.h"
36#include "check-integrity.h"
35 37
36static int init_first_rw_device(struct btrfs_trans_handle *trans, 38static int init_first_rw_device(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root, 39 struct btrfs_root *root,
@@ -246,7 +248,7 @@ loop_lock:
246 sync_pending = 0; 248 sync_pending = 0;
247 } 249 }
248 250
249 submit_bio(cur->bi_rw, cur); 251 btrfsic_submit_bio(cur->bi_rw, cur);
250 num_run++; 252 num_run++;
251 batch_run++; 253 batch_run++;
252 if (need_resched()) 254 if (need_resched())
@@ -706,8 +708,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
706 u64 devid; 708 u64 devid;
707 u64 transid; 709 u64 transid;
708 710
709 mutex_lock(&uuid_mutex);
710
711 flags |= FMODE_EXCL; 711 flags |= FMODE_EXCL;
712 bdev = blkdev_get_by_path(path, flags, holder); 712 bdev = blkdev_get_by_path(path, flags, holder);
713 713
@@ -716,6 +716,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
716 goto error; 716 goto error;
717 } 717 }
718 718
719 mutex_lock(&uuid_mutex);
719 ret = set_blocksize(bdev, 4096); 720 ret = set_blocksize(bdev, 4096);
720 if (ret) 721 if (ret)
721 goto error_close; 722 goto error_close;
@@ -737,9 +738,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
737 738
738 brelse(bh); 739 brelse(bh);
739error_close: 740error_close:
741 mutex_unlock(&uuid_mutex);
740 blkdev_put(bdev, flags); 742 blkdev_put(bdev, flags);
741error: 743error:
742 mutex_unlock(&uuid_mutex);
743 return ret; 744 return ret;
744} 745}
745 746
@@ -829,7 +830,6 @@ out:
829 830
830/* 831/*
831 * find_free_dev_extent - find free space in the specified device 832 * find_free_dev_extent - find free space in the specified device
832 * @trans: transaction handler
833 * @device: the device which we search the free space in 833 * @device: the device which we search the free space in
834 * @num_bytes: the size of the free space that we need 834 * @num_bytes: the size of the free space that we need
835 * @start: store the start of the free space. 835 * @start: store the start of the free space.
@@ -848,8 +848,7 @@ out:
848 * But if we don't find suitable free space, it is used to store the size of 848 * But if we don't find suitable free space, it is used to store the size of
849 * the max free space. 849 * the max free space.
850 */ 850 */
851int find_free_dev_extent(struct btrfs_trans_handle *trans, 851int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
852 struct btrfs_device *device, u64 num_bytes,
853 u64 *start, u64 *len) 852 u64 *start, u64 *len)
854{ 853{
855 struct btrfs_key key; 854 struct btrfs_key key;
@@ -893,7 +892,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
893 key.offset = search_start; 892 key.offset = search_start;
894 key.type = BTRFS_DEV_EXTENT_KEY; 893 key.type = BTRFS_DEV_EXTENT_KEY;
895 894
896 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 895 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
897 if (ret < 0) 896 if (ret < 0)
898 goto out; 897 goto out;
899 if (ret > 0) { 898 if (ret > 0) {
@@ -1282,7 +1281,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1282 bool clear_super = false; 1281 bool clear_super = false;
1283 1282
1284 mutex_lock(&uuid_mutex); 1283 mutex_lock(&uuid_mutex);
1285 mutex_lock(&root->fs_info->volume_mutex);
1286 1284
1287 all_avail = root->fs_info->avail_data_alloc_bits | 1285 all_avail = root->fs_info->avail_data_alloc_bits |
1288 root->fs_info->avail_system_alloc_bits | 1286 root->fs_info->avail_system_alloc_bits |
@@ -1452,7 +1450,6 @@ error_close:
1452 if (bdev) 1450 if (bdev)
1453 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1451 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1454out: 1452out:
1455 mutex_unlock(&root->fs_info->volume_mutex);
1456 mutex_unlock(&uuid_mutex); 1453 mutex_unlock(&uuid_mutex);
1457 return ret; 1454 return ret;
1458error_undo: 1455error_undo:
@@ -1469,8 +1466,7 @@ error_undo:
1469/* 1466/*
1470 * does all the dirty work required for changing file system's UUID. 1467 * does all the dirty work required for changing file system's UUID.
1471 */ 1468 */
1472static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, 1469static int btrfs_prepare_sprout(struct btrfs_root *root)
1473 struct btrfs_root *root)
1474{ 1470{
1475 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1471 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1476 struct btrfs_fs_devices *old_devices; 1472 struct btrfs_fs_devices *old_devices;
@@ -1629,7 +1625,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1629 } 1625 }
1630 1626
1631 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1627 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1632 mutex_lock(&root->fs_info->volume_mutex);
1633 1628
1634 devices = &root->fs_info->fs_devices->devices; 1629 devices = &root->fs_info->fs_devices->devices;
1635 /* 1630 /*
@@ -1695,7 +1690,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1695 1690
1696 if (seeding_dev) { 1691 if (seeding_dev) {
1697 sb->s_flags &= ~MS_RDONLY; 1692 sb->s_flags &= ~MS_RDONLY;
1698 ret = btrfs_prepare_sprout(trans, root); 1693 ret = btrfs_prepare_sprout(root);
1699 BUG_ON(ret); 1694 BUG_ON(ret);
1700 } 1695 }
1701 1696
@@ -1757,8 +1752,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1757 ret = btrfs_relocate_sys_chunks(root); 1752 ret = btrfs_relocate_sys_chunks(root);
1758 BUG_ON(ret); 1753 BUG_ON(ret);
1759 } 1754 }
1760out: 1755
1761 mutex_unlock(&root->fs_info->volume_mutex);
1762 return ret; 1756 return ret;
1763error: 1757error:
1764 blkdev_put(bdev, FMODE_EXCL); 1758 blkdev_put(bdev, FMODE_EXCL);
@@ -1766,7 +1760,7 @@ error:
1766 mutex_unlock(&uuid_mutex); 1760 mutex_unlock(&uuid_mutex);
1767 up_write(&sb->s_umount); 1761 up_write(&sb->s_umount);
1768 } 1762 }
1769 goto out; 1763 return ret;
1770} 1764}
1771 1765
1772static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 1766static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -2077,6 +2071,362 @@ error:
2077 return ret; 2071 return ret;
2078} 2072}
2079 2073
2074static int insert_balance_item(struct btrfs_root *root,
2075 struct btrfs_balance_control *bctl)
2076{
2077 struct btrfs_trans_handle *trans;
2078 struct btrfs_balance_item *item;
2079 struct btrfs_disk_balance_args disk_bargs;
2080 struct btrfs_path *path;
2081 struct extent_buffer *leaf;
2082 struct btrfs_key key;
2083 int ret, err;
2084
2085 path = btrfs_alloc_path();
2086 if (!path)
2087 return -ENOMEM;
2088
2089 trans = btrfs_start_transaction(root, 0);
2090 if (IS_ERR(trans)) {
2091 btrfs_free_path(path);
2092 return PTR_ERR(trans);
2093 }
2094
2095 key.objectid = BTRFS_BALANCE_OBJECTID;
2096 key.type = BTRFS_BALANCE_ITEM_KEY;
2097 key.offset = 0;
2098
2099 ret = btrfs_insert_empty_item(trans, root, path, &key,
2100 sizeof(*item));
2101 if (ret)
2102 goto out;
2103
2104 leaf = path->nodes[0];
2105 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2106
2107 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2108
2109 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2110 btrfs_set_balance_data(leaf, item, &disk_bargs);
2111 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2112 btrfs_set_balance_meta(leaf, item, &disk_bargs);
2113 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2114 btrfs_set_balance_sys(leaf, item, &disk_bargs);
2115
2116 btrfs_set_balance_flags(leaf, item, bctl->flags);
2117
2118 btrfs_mark_buffer_dirty(leaf);
2119out:
2120 btrfs_free_path(path);
2121 err = btrfs_commit_transaction(trans, root);
2122 if (err && !ret)
2123 ret = err;
2124 return ret;
2125}
2126
2127static int del_balance_item(struct btrfs_root *root)
2128{
2129 struct btrfs_trans_handle *trans;
2130 struct btrfs_path *path;
2131 struct btrfs_key key;
2132 int ret, err;
2133
2134 path = btrfs_alloc_path();
2135 if (!path)
2136 return -ENOMEM;
2137
2138 trans = btrfs_start_transaction(root, 0);
2139 if (IS_ERR(trans)) {
2140 btrfs_free_path(path);
2141 return PTR_ERR(trans);
2142 }
2143
2144 key.objectid = BTRFS_BALANCE_OBJECTID;
2145 key.type = BTRFS_BALANCE_ITEM_KEY;
2146 key.offset = 0;
2147
2148 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2149 if (ret < 0)
2150 goto out;
2151 if (ret > 0) {
2152 ret = -ENOENT;
2153 goto out;
2154 }
2155
2156 ret = btrfs_del_item(trans, root, path);
2157out:
2158 btrfs_free_path(path);
2159 err = btrfs_commit_transaction(trans, root);
2160 if (err && !ret)
2161 ret = err;
2162 return ret;
2163}
2164
2165/*
2166 * This is a heuristic used to reduce the number of chunks balanced on
2167 * resume after balance was interrupted.
2168 */
2169static void update_balance_args(struct btrfs_balance_control *bctl)
2170{
2171 /*
2172 * Turn on soft mode for chunk types that were being converted.
2173 */
2174 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2175 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2176 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2177 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2178 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2179 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2180
2181 /*
2182 * Turn on usage filter if is not already used. The idea is
2183 * that chunks that we have already balanced should be
2184 * reasonably full. Don't do it for chunks that are being
2185 * converted - that will keep us from relocating unconverted
2186 * (albeit full) chunks.
2187 */
2188 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2189 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2190 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2191 bctl->data.usage = 90;
2192 }
2193 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2194 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2195 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2196 bctl->sys.usage = 90;
2197 }
2198 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2199 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2200 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2201 bctl->meta.usage = 90;
2202 }
2203}
2204
2205/*
2206 * Should be called with both balance and volume mutexes held to
2207 * serialize other volume operations (add_dev/rm_dev/resize) with
2208 * restriper. Same goes for unset_balance_control.
2209 */
2210static void set_balance_control(struct btrfs_balance_control *bctl)
2211{
2212 struct btrfs_fs_info *fs_info = bctl->fs_info;
2213
2214 BUG_ON(fs_info->balance_ctl);
2215
2216 spin_lock(&fs_info->balance_lock);
2217 fs_info->balance_ctl = bctl;
2218 spin_unlock(&fs_info->balance_lock);
2219}
2220
2221static void unset_balance_control(struct btrfs_fs_info *fs_info)
2222{
2223 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2224
2225 BUG_ON(!fs_info->balance_ctl);
2226
2227 spin_lock(&fs_info->balance_lock);
2228 fs_info->balance_ctl = NULL;
2229 spin_unlock(&fs_info->balance_lock);
2230
2231 kfree(bctl);
2232}
2233
2234/*
2235 * Balance filters. Return 1 if chunk should be filtered out
2236 * (should not be balanced).
2237 */
2238static int chunk_profiles_filter(u64 chunk_profile,
2239 struct btrfs_balance_args *bargs)
2240{
2241 chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2242
2243 if (chunk_profile == 0)
2244 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2245
2246 if (bargs->profiles & chunk_profile)
2247 return 0;
2248
2249 return 1;
2250}
2251
2252static u64 div_factor_fine(u64 num, int factor)
2253{
2254 if (factor <= 0)
2255 return 0;
2256 if (factor >= 100)
2257 return num;
2258
2259 num *= factor;
2260 do_div(num, 100);
2261 return num;
2262}
2263
2264static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2265 struct btrfs_balance_args *bargs)
2266{
2267 struct btrfs_block_group_cache *cache;
2268 u64 chunk_used, user_thresh;
2269 int ret = 1;
2270
2271 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2272 chunk_used = btrfs_block_group_used(&cache->item);
2273
2274 user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
2275 if (chunk_used < user_thresh)
2276 ret = 0;
2277
2278 btrfs_put_block_group(cache);
2279 return ret;
2280}
2281
2282static int chunk_devid_filter(struct extent_buffer *leaf,
2283 struct btrfs_chunk *chunk,
2284 struct btrfs_balance_args *bargs)
2285{
2286 struct btrfs_stripe *stripe;
2287 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2288 int i;
2289
2290 for (i = 0; i < num_stripes; i++) {
2291 stripe = btrfs_stripe_nr(chunk, i);
2292 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2293 return 0;
2294 }
2295
2296 return 1;
2297}
2298
2299/* [pstart, pend) */
2300static int chunk_drange_filter(struct extent_buffer *leaf,
2301 struct btrfs_chunk *chunk,
2302 u64 chunk_offset,
2303 struct btrfs_balance_args *bargs)
2304{
2305 struct btrfs_stripe *stripe;
2306 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2307 u64 stripe_offset;
2308 u64 stripe_length;
2309 int factor;
2310 int i;
2311
2312 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2313 return 0;
2314
2315 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2316 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
2317 factor = 2;
2318 else
2319 factor = 1;
2320 factor = num_stripes / factor;
2321
2322 for (i = 0; i < num_stripes; i++) {
2323 stripe = btrfs_stripe_nr(chunk, i);
2324 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2325 continue;
2326
2327 stripe_offset = btrfs_stripe_offset(leaf, stripe);
2328 stripe_length = btrfs_chunk_length(leaf, chunk);
2329 do_div(stripe_length, factor);
2330
2331 if (stripe_offset < bargs->pend &&
2332 stripe_offset + stripe_length > bargs->pstart)
2333 return 0;
2334 }
2335
2336 return 1;
2337}
2338
2339/* [vstart, vend) */
2340static int chunk_vrange_filter(struct extent_buffer *leaf,
2341 struct btrfs_chunk *chunk,
2342 u64 chunk_offset,
2343 struct btrfs_balance_args *bargs)
2344{
2345 if (chunk_offset < bargs->vend &&
2346 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2347 /* at least part of the chunk is inside this vrange */
2348 return 0;
2349
2350 return 1;
2351}
2352
2353static int chunk_soft_convert_filter(u64 chunk_profile,
2354 struct btrfs_balance_args *bargs)
2355{
2356 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2357 return 0;
2358
2359 chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2360
2361 if (chunk_profile == 0)
2362 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2363
2364 if (bargs->target & chunk_profile)
2365 return 1;
2366
2367 return 0;
2368}
2369
2370static int should_balance_chunk(struct btrfs_root *root,
2371 struct extent_buffer *leaf,
2372 struct btrfs_chunk *chunk, u64 chunk_offset)
2373{
2374 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2375 struct btrfs_balance_args *bargs = NULL;
2376 u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2377
2378 /* type filter */
2379 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2380 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2381 return 0;
2382 }
2383
2384 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
2385 bargs = &bctl->data;
2386 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
2387 bargs = &bctl->sys;
2388 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
2389 bargs = &bctl->meta;
2390
2391 /* profiles filter */
2392 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
2393 chunk_profiles_filter(chunk_type, bargs)) {
2394 return 0;
2395 }
2396
2397 /* usage filter */
2398 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
2399 chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
2400 return 0;
2401 }
2402
2403 /* devid filter */
2404 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
2405 chunk_devid_filter(leaf, chunk, bargs)) {
2406 return 0;
2407 }
2408
2409 /* drange filter, makes sense only with devid filter */
2410 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
2411 chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
2412 return 0;
2413 }
2414
2415 /* vrange filter */
2416 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
2417 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
2418 return 0;
2419 }
2420
2421 /* soft profile changing mode */
2422 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
2423 chunk_soft_convert_filter(chunk_type, bargs)) {
2424 return 0;
2425 }
2426
2427 return 1;
2428}
2429
2080static u64 div_factor(u64 num, int factor) 2430static u64 div_factor(u64 num, int factor)
2081{ 2431{
2082 if (factor == 10) 2432 if (factor == 10)
@@ -2086,29 +2436,28 @@ static u64 div_factor(u64 num, int factor)
2086 return num; 2436 return num;
2087} 2437}
2088 2438
2089int btrfs_balance(struct btrfs_root *dev_root) 2439static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2090{ 2440{
2091 int ret; 2441 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2092 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 2442 struct btrfs_root *chunk_root = fs_info->chunk_root;
2443 struct btrfs_root *dev_root = fs_info->dev_root;
2444 struct list_head *devices;
2093 struct btrfs_device *device; 2445 struct btrfs_device *device;
2094 u64 old_size; 2446 u64 old_size;
2095 u64 size_to_free; 2447 u64 size_to_free;
2448 struct btrfs_chunk *chunk;
2096 struct btrfs_path *path; 2449 struct btrfs_path *path;
2097 struct btrfs_key key; 2450 struct btrfs_key key;
2098 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
2099 struct btrfs_trans_handle *trans;
2100 struct btrfs_key found_key; 2451 struct btrfs_key found_key;
2101 2452 struct btrfs_trans_handle *trans;
2102 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2453 struct extent_buffer *leaf;
2103 return -EROFS; 2454 int slot;
2104 2455 int ret;
2105 if (!capable(CAP_SYS_ADMIN)) 2456 int enospc_errors = 0;
2106 return -EPERM; 2457 bool counting = true;
2107
2108 mutex_lock(&dev_root->fs_info->volume_mutex);
2109 dev_root = dev_root->fs_info->dev_root;
2110 2458
2111 /* step one make some room on all the devices */ 2459 /* step one make some room on all the devices */
2460 devices = &fs_info->fs_devices->devices;
2112 list_for_each_entry(device, devices, dev_list) { 2461 list_for_each_entry(device, devices, dev_list) {
2113 old_size = device->total_bytes; 2462 old_size = device->total_bytes;
2114 size_to_free = div_factor(old_size, 1); 2463 size_to_free = div_factor(old_size, 1);
@@ -2137,11 +2486,23 @@ int btrfs_balance(struct btrfs_root *dev_root)
2137 ret = -ENOMEM; 2486 ret = -ENOMEM;
2138 goto error; 2487 goto error;
2139 } 2488 }
2489
2490 /* zero out stat counters */
2491 spin_lock(&fs_info->balance_lock);
2492 memset(&bctl->stat, 0, sizeof(bctl->stat));
2493 spin_unlock(&fs_info->balance_lock);
2494again:
2140 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2495 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2141 key.offset = (u64)-1; 2496 key.offset = (u64)-1;
2142 key.type = BTRFS_CHUNK_ITEM_KEY; 2497 key.type = BTRFS_CHUNK_ITEM_KEY;
2143 2498
2144 while (1) { 2499 while (1) {
2500 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
2501 atomic_read(&fs_info->balance_cancel_req)) {
2502 ret = -ECANCELED;
2503 goto error;
2504 }
2505
2145 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2506 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2146 if (ret < 0) 2507 if (ret < 0)
2147 goto error; 2508 goto error;
@@ -2151,15 +2512,19 @@ int btrfs_balance(struct btrfs_root *dev_root)
2151 * failed 2512 * failed
2152 */ 2513 */
2153 if (ret == 0) 2514 if (ret == 0)
2154 break; 2515 BUG(); /* FIXME break ? */
2155 2516
2156 ret = btrfs_previous_item(chunk_root, path, 0, 2517 ret = btrfs_previous_item(chunk_root, path, 0,
2157 BTRFS_CHUNK_ITEM_KEY); 2518 BTRFS_CHUNK_ITEM_KEY);
2158 if (ret) 2519 if (ret) {
2520 ret = 0;
2159 break; 2521 break;
2522 }
2523
2524 leaf = path->nodes[0];
2525 slot = path->slots[0];
2526 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2160 2527
2161 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2162 path->slots[0]);
2163 if (found_key.objectid != key.objectid) 2528 if (found_key.objectid != key.objectid)
2164 break; 2529 break;
2165 2530
@@ -2167,22 +2532,375 @@ int btrfs_balance(struct btrfs_root *dev_root)
2167 if (found_key.offset == 0) 2532 if (found_key.offset == 0)
2168 break; 2533 break;
2169 2534
2535 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2536
2537 if (!counting) {
2538 spin_lock(&fs_info->balance_lock);
2539 bctl->stat.considered++;
2540 spin_unlock(&fs_info->balance_lock);
2541 }
2542
2543 ret = should_balance_chunk(chunk_root, leaf, chunk,
2544 found_key.offset);
2170 btrfs_release_path(path); 2545 btrfs_release_path(path);
2546 if (!ret)
2547 goto loop;
2548
2549 if (counting) {
2550 spin_lock(&fs_info->balance_lock);
2551 bctl->stat.expected++;
2552 spin_unlock(&fs_info->balance_lock);
2553 goto loop;
2554 }
2555
2171 ret = btrfs_relocate_chunk(chunk_root, 2556 ret = btrfs_relocate_chunk(chunk_root,
2172 chunk_root->root_key.objectid, 2557 chunk_root->root_key.objectid,
2173 found_key.objectid, 2558 found_key.objectid,
2174 found_key.offset); 2559 found_key.offset);
2175 if (ret && ret != -ENOSPC) 2560 if (ret && ret != -ENOSPC)
2176 goto error; 2561 goto error;
2562 if (ret == -ENOSPC) {
2563 enospc_errors++;
2564 } else {
2565 spin_lock(&fs_info->balance_lock);
2566 bctl->stat.completed++;
2567 spin_unlock(&fs_info->balance_lock);
2568 }
2569loop:
2177 key.offset = found_key.offset - 1; 2570 key.offset = found_key.offset - 1;
2178 } 2571 }
2179 ret = 0; 2572
2573 if (counting) {
2574 btrfs_release_path(path);
2575 counting = false;
2576 goto again;
2577 }
2180error: 2578error:
2181 btrfs_free_path(path); 2579 btrfs_free_path(path);
2182 mutex_unlock(&dev_root->fs_info->volume_mutex); 2580 if (enospc_errors) {
2581 printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
2582 enospc_errors);
2583 if (!ret)
2584 ret = -ENOSPC;
2585 }
2586
2183 return ret; 2587 return ret;
2184} 2588}
2185 2589
2590static inline int balance_need_close(struct btrfs_fs_info *fs_info)
2591{
2592 /* cancel requested || normal exit path */
2593 return atomic_read(&fs_info->balance_cancel_req) ||
2594 (atomic_read(&fs_info->balance_pause_req) == 0 &&
2595 atomic_read(&fs_info->balance_cancel_req) == 0);
2596}
2597
2598static void __cancel_balance(struct btrfs_fs_info *fs_info)
2599{
2600 int ret;
2601
2602 unset_balance_control(fs_info);
2603 ret = del_balance_item(fs_info->tree_root);
2604 BUG_ON(ret);
2605}
2606
2607void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
2608 struct btrfs_ioctl_balance_args *bargs);
2609
2610/*
2611 * Should be called with both balance and volume mutexes held
2612 */
2613int btrfs_balance(struct btrfs_balance_control *bctl,
2614 struct btrfs_ioctl_balance_args *bargs)
2615{
2616 struct btrfs_fs_info *fs_info = bctl->fs_info;
2617 u64 allowed;
2618 int ret;
2619
2620 if (btrfs_fs_closing(fs_info) ||
2621 atomic_read(&fs_info->balance_pause_req) ||
2622 atomic_read(&fs_info->balance_cancel_req)) {
2623 ret = -EINVAL;
2624 goto out;
2625 }
2626
2627 /*
2628 * In case of mixed groups both data and meta should be picked,
2629 * and identical options should be given for both of them.
2630 */
2631 allowed = btrfs_super_incompat_flags(fs_info->super_copy);
2632 if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
2633 (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
2634 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
2635 !(bctl->flags & BTRFS_BALANCE_METADATA) ||
2636 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
2637 printk(KERN_ERR "btrfs: with mixed groups data and "
2638 "metadata balance options must be the same\n");
2639 ret = -EINVAL;
2640 goto out;
2641 }
2642 }
2643
2644 /*
2645 * Profile changing sanity checks. Skip them if a simple
2646 * balance is requested.
2647 */
2648 if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
2649 BTRFS_BALANCE_ARGS_CONVERT))
2650 goto do_balance;
2651
2652 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2653 if (fs_info->fs_devices->num_devices == 1)
2654 allowed |= BTRFS_BLOCK_GROUP_DUP;
2655 else if (fs_info->fs_devices->num_devices < 4)
2656 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2657 else
2658 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2659 BTRFS_BLOCK_GROUP_RAID10);
2660
2661 if (!profile_is_valid(bctl->data.target, 1) ||
2662 bctl->data.target & ~allowed) {
2663 printk(KERN_ERR "btrfs: unable to start balance with target "
2664 "data profile %llu\n",
2665 (unsigned long long)bctl->data.target);
2666 ret = -EINVAL;
2667 goto out;
2668 }
2669 if (!profile_is_valid(bctl->meta.target, 1) ||
2670 bctl->meta.target & ~allowed) {
2671 printk(KERN_ERR "btrfs: unable to start balance with target "
2672 "metadata profile %llu\n",
2673 (unsigned long long)bctl->meta.target);
2674 ret = -EINVAL;
2675 goto out;
2676 }
2677 if (!profile_is_valid(bctl->sys.target, 1) ||
2678 bctl->sys.target & ~allowed) {
2679 printk(KERN_ERR "btrfs: unable to start balance with target "
2680 "system profile %llu\n",
2681 (unsigned long long)bctl->sys.target);
2682 ret = -EINVAL;
2683 goto out;
2684 }
2685
2686 if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
2687 printk(KERN_ERR "btrfs: dup for data is not allowed\n");
2688 ret = -EINVAL;
2689 goto out;
2690 }
2691
2692 /* allow to reduce meta or sys integrity only if force set */
2693 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2694 BTRFS_BLOCK_GROUP_RAID10;
2695 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2696 (fs_info->avail_system_alloc_bits & allowed) &&
2697 !(bctl->sys.target & allowed)) ||
2698 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2699 (fs_info->avail_metadata_alloc_bits & allowed) &&
2700 !(bctl->meta.target & allowed))) {
2701 if (bctl->flags & BTRFS_BALANCE_FORCE) {
2702 printk(KERN_INFO "btrfs: force reducing metadata "
2703 "integrity\n");
2704 } else {
2705 printk(KERN_ERR "btrfs: balance will reduce metadata "
2706 "integrity, use force if you want this\n");
2707 ret = -EINVAL;
2708 goto out;
2709 }
2710 }
2711
2712do_balance:
2713 ret = insert_balance_item(fs_info->tree_root, bctl);
2714 if (ret && ret != -EEXIST)
2715 goto out;
2716
2717 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
2718 BUG_ON(ret == -EEXIST);
2719 set_balance_control(bctl);
2720 } else {
2721 BUG_ON(ret != -EEXIST);
2722 spin_lock(&fs_info->balance_lock);
2723 update_balance_args(bctl);
2724 spin_unlock(&fs_info->balance_lock);
2725 }
2726
2727 atomic_inc(&fs_info->balance_running);
2728 mutex_unlock(&fs_info->balance_mutex);
2729
2730 ret = __btrfs_balance(fs_info);
2731
2732 mutex_lock(&fs_info->balance_mutex);
2733 atomic_dec(&fs_info->balance_running);
2734
2735 if (bargs) {
2736 memset(bargs, 0, sizeof(*bargs));
2737 update_ioctl_balance_args(fs_info, 0, bargs);
2738 }
2739
2740 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
2741 balance_need_close(fs_info)) {
2742 __cancel_balance(fs_info);
2743 }
2744
2745 wake_up(&fs_info->balance_wait_q);
2746
2747 return ret;
2748out:
2749 if (bctl->flags & BTRFS_BALANCE_RESUME)
2750 __cancel_balance(fs_info);
2751 else
2752 kfree(bctl);
2753 return ret;
2754}
2755
2756static int balance_kthread(void *data)
2757{
2758 struct btrfs_balance_control *bctl =
2759 (struct btrfs_balance_control *)data;
2760 struct btrfs_fs_info *fs_info = bctl->fs_info;
2761 int ret = 0;
2762
2763 mutex_lock(&fs_info->volume_mutex);
2764 mutex_lock(&fs_info->balance_mutex);
2765
2766 set_balance_control(bctl);
2767
2768 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
2769 printk(KERN_INFO "btrfs: force skipping balance\n");
2770 } else {
2771 printk(KERN_INFO "btrfs: continuing balance\n");
2772 ret = btrfs_balance(bctl, NULL);
2773 }
2774
2775 mutex_unlock(&fs_info->balance_mutex);
2776 mutex_unlock(&fs_info->volume_mutex);
2777 return ret;
2778}
2779
2780int btrfs_recover_balance(struct btrfs_root *tree_root)
2781{
2782 struct task_struct *tsk;
2783 struct btrfs_balance_control *bctl;
2784 struct btrfs_balance_item *item;
2785 struct btrfs_disk_balance_args disk_bargs;
2786 struct btrfs_path *path;
2787 struct extent_buffer *leaf;
2788 struct btrfs_key key;
2789 int ret;
2790
2791 path = btrfs_alloc_path();
2792 if (!path)
2793 return -ENOMEM;
2794
2795 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
2796 if (!bctl) {
2797 ret = -ENOMEM;
2798 goto out;
2799 }
2800
2801 key.objectid = BTRFS_BALANCE_OBJECTID;
2802 key.type = BTRFS_BALANCE_ITEM_KEY;
2803 key.offset = 0;
2804
2805 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2806 if (ret < 0)
2807 goto out_bctl;
2808 if (ret > 0) { /* ret = -ENOENT; */
2809 ret = 0;
2810 goto out_bctl;
2811 }
2812
2813 leaf = path->nodes[0];
2814 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2815
2816 bctl->fs_info = tree_root->fs_info;
2817 bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
2818
2819 btrfs_balance_data(leaf, item, &disk_bargs);
2820 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
2821 btrfs_balance_meta(leaf, item, &disk_bargs);
2822 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
2823 btrfs_balance_sys(leaf, item, &disk_bargs);
2824 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2825
2826 tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
2827 if (IS_ERR(tsk))
2828 ret = PTR_ERR(tsk);
2829 else
2830 goto out;
2831
2832out_bctl:
2833 kfree(bctl);
2834out:
2835 btrfs_free_path(path);
2836 return ret;
2837}
2838
2839int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
2840{
2841 int ret = 0;
2842
2843 mutex_lock(&fs_info->balance_mutex);
2844 if (!fs_info->balance_ctl) {
2845 mutex_unlock(&fs_info->balance_mutex);
2846 return -ENOTCONN;
2847 }
2848
2849 if (atomic_read(&fs_info->balance_running)) {
2850 atomic_inc(&fs_info->balance_pause_req);
2851 mutex_unlock(&fs_info->balance_mutex);
2852
2853 wait_event(fs_info->balance_wait_q,
2854 atomic_read(&fs_info->balance_running) == 0);
2855
2856 mutex_lock(&fs_info->balance_mutex);
2857 /* we are good with balance_ctl ripped off from under us */
2858 BUG_ON(atomic_read(&fs_info->balance_running));
2859 atomic_dec(&fs_info->balance_pause_req);
2860 } else {
2861 ret = -ENOTCONN;
2862 }
2863
2864 mutex_unlock(&fs_info->balance_mutex);
2865 return ret;
2866}
2867
2868int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
2869{
2870 mutex_lock(&fs_info->balance_mutex);
2871 if (!fs_info->balance_ctl) {
2872 mutex_unlock(&fs_info->balance_mutex);
2873 return -ENOTCONN;
2874 }
2875
2876 atomic_inc(&fs_info->balance_cancel_req);
2877 /*
2878 * if we are running just wait and return, balance item is
2879 * deleted in btrfs_balance in this case
2880 */
2881 if (atomic_read(&fs_info->balance_running)) {
2882 mutex_unlock(&fs_info->balance_mutex);
2883 wait_event(fs_info->balance_wait_q,
2884 atomic_read(&fs_info->balance_running) == 0);
2885 mutex_lock(&fs_info->balance_mutex);
2886 } else {
2887 /* __cancel_balance needs volume_mutex */
2888 mutex_unlock(&fs_info->balance_mutex);
2889 mutex_lock(&fs_info->volume_mutex);
2890 mutex_lock(&fs_info->balance_mutex);
2891
2892 if (fs_info->balance_ctl)
2893 __cancel_balance(fs_info);
2894
2895 mutex_unlock(&fs_info->volume_mutex);
2896 }
2897
2898 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
2899 atomic_dec(&fs_info->balance_cancel_req);
2900 mutex_unlock(&fs_info->balance_mutex);
2901 return 0;
2902}
2903
2186/* 2904/*
2187 * shrinking a device means finding all of the device extents past 2905 * shrinking a device means finding all of the device extents past
2188 * the new size, and then following the back refs to the chunks. 2906 * the new size, and then following the back refs to the chunks.
@@ -2323,8 +3041,7 @@ done:
2323 return ret; 3041 return ret;
2324} 3042}
2325 3043
2326static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, 3044static int btrfs_add_system_chunk(struct btrfs_root *root,
2327 struct btrfs_root *root,
2328 struct btrfs_key *key, 3045 struct btrfs_key *key,
2329 struct btrfs_chunk *chunk, int item_size) 3046 struct btrfs_chunk *chunk, int item_size)
2330{ 3047{
@@ -2441,10 +3158,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2441 max_stripe_size = 1024 * 1024 * 1024; 3158 max_stripe_size = 1024 * 1024 * 1024;
2442 max_chunk_size = 10 * max_stripe_size; 3159 max_chunk_size = 10 * max_stripe_size;
2443 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 3160 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
2444 max_stripe_size = 256 * 1024 * 1024; 3161 /* for larger filesystems, use larger metadata chunks */
3162 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
3163 max_stripe_size = 1024 * 1024 * 1024;
3164 else
3165 max_stripe_size = 256 * 1024 * 1024;
2445 max_chunk_size = max_stripe_size; 3166 max_chunk_size = max_stripe_size;
2446 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 3167 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2447 max_stripe_size = 8 * 1024 * 1024; 3168 max_stripe_size = 32 * 1024 * 1024;
2448 max_chunk_size = 2 * max_stripe_size; 3169 max_chunk_size = 2 * max_stripe_size;
2449 } else { 3170 } else {
2450 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", 3171 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
@@ -2496,7 +3217,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2496 if (total_avail == 0) 3217 if (total_avail == 0)
2497 continue; 3218 continue;
2498 3219
2499 ret = find_free_dev_extent(trans, device, 3220 ret = find_free_dev_extent(device,
2500 max_stripe_size * dev_stripes, 3221 max_stripe_size * dev_stripes,
2501 &dev_offset, &max_avail); 3222 &dev_offset, &max_avail);
2502 if (ret && ret != -ENOSPC) 3223 if (ret && ret != -ENOSPC)
@@ -2687,7 +3408,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2687 BUG_ON(ret); 3408 BUG_ON(ret);
2688 3409
2689 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3410 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2690 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, 3411 ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
2691 item_size); 3412 item_size);
2692 BUG_ON(ret); 3413 BUG_ON(ret);
2693 } 3414 }
@@ -2752,8 +3473,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2752 return ret; 3473 return ret;
2753 3474
2754 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 3475 alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2755 (fs_info->metadata_alloc_profile & 3476 fs_info->avail_metadata_alloc_bits;
2756 fs_info->avail_metadata_alloc_bits);
2757 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3477 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2758 3478
2759 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 3479 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
@@ -2763,8 +3483,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2763 sys_chunk_offset = chunk_offset + chunk_size; 3483 sys_chunk_offset = chunk_offset + chunk_size;
2764 3484
2765 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 3485 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2766 (fs_info->system_alloc_profile & 3486 fs_info->avail_system_alloc_bits;
2767 fs_info->avail_system_alloc_bits);
2768 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3487 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2769 3488
2770 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 3489 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
@@ -2901,26 +3620,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2901 u64 stripe_nr; 3620 u64 stripe_nr;
2902 u64 stripe_nr_orig; 3621 u64 stripe_nr_orig;
2903 u64 stripe_nr_end; 3622 u64 stripe_nr_end;
2904 int stripes_allocated = 8;
2905 int stripes_required = 1;
2906 int stripe_index; 3623 int stripe_index;
2907 int i; 3624 int i;
3625 int ret = 0;
2908 int num_stripes; 3626 int num_stripes;
2909 int max_errors = 0; 3627 int max_errors = 0;
2910 struct btrfs_bio *bbio = NULL; 3628 struct btrfs_bio *bbio = NULL;
2911 3629
2912 if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2913 stripes_allocated = 1;
2914again:
2915 if (bbio_ret) {
2916 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
2917 GFP_NOFS);
2918 if (!bbio)
2919 return -ENOMEM;
2920
2921 atomic_set(&bbio->error, 0);
2922 }
2923
2924 read_lock(&em_tree->lock); 3630 read_lock(&em_tree->lock);
2925 em = lookup_extent_mapping(em_tree, logical, *length); 3631 em = lookup_extent_mapping(em_tree, logical, *length);
2926 read_unlock(&em_tree->lock); 3632 read_unlock(&em_tree->lock);
@@ -2939,32 +3645,6 @@ again:
2939 if (mirror_num > map->num_stripes) 3645 if (mirror_num > map->num_stripes)
2940 mirror_num = 0; 3646 mirror_num = 0;
2941 3647
2942 /* if our btrfs_bio struct is too small, back off and try again */
2943 if (rw & REQ_WRITE) {
2944 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2945 BTRFS_BLOCK_GROUP_DUP)) {
2946 stripes_required = map->num_stripes;
2947 max_errors = 1;
2948 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2949 stripes_required = map->sub_stripes;
2950 max_errors = 1;
2951 }
2952 }
2953 if (rw & REQ_DISCARD) {
2954 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2955 BTRFS_BLOCK_GROUP_RAID1 |
2956 BTRFS_BLOCK_GROUP_DUP |
2957 BTRFS_BLOCK_GROUP_RAID10)) {
2958 stripes_required = map->num_stripes;
2959 }
2960 }
2961 if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2962 stripes_allocated < stripes_required) {
2963 stripes_allocated = map->num_stripes;
2964 free_extent_map(em);
2965 kfree(bbio);
2966 goto again;
2967 }
2968 stripe_nr = offset; 3648 stripe_nr = offset;
2969 /* 3649 /*
2970 * stripe_nr counts the total number of stripes we have to stride 3650 * stripe_nr counts the total number of stripes we have to stride
@@ -2980,10 +3660,7 @@ again:
2980 3660
2981 if (rw & REQ_DISCARD) 3661 if (rw & REQ_DISCARD)
2982 *length = min_t(u64, em->len - offset, *length); 3662 *length = min_t(u64, em->len - offset, *length);
2983 else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 3663 else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
2984 BTRFS_BLOCK_GROUP_RAID1 |
2985 BTRFS_BLOCK_GROUP_RAID10 |
2986 BTRFS_BLOCK_GROUP_DUP)) {
2987 /* we limit the length of each bio to what fits in a stripe */ 3664 /* we limit the length of each bio to what fits in a stripe */
2988 *length = min_t(u64, em->len - offset, 3665 *length = min_t(u64, em->len - offset,
2989 map->stripe_len - stripe_offset); 3666 map->stripe_len - stripe_offset);
@@ -3059,81 +3736,55 @@ again:
     }
     BUG_ON(stripe_index >= map->num_stripes);
 
+    bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
+    if (!bbio) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    atomic_set(&bbio->error, 0);
+
     if (rw & REQ_DISCARD) {
+        int factor = 0;
+        int sub_stripes = 0;
+        u64 stripes_per_dev = 0;
+        u32 remaining_stripes = 0;
+
+        if (map->type &
+            (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+            if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+                sub_stripes = 1;
+            else
+                sub_stripes = map->sub_stripes;
+
+            factor = map->num_stripes / sub_stripes;
+            stripes_per_dev = div_u64_rem(stripe_nr_end -
+                                          stripe_nr_orig,
+                                          factor,
+                                          &remaining_stripes);
+        }
+
         for (i = 0; i < num_stripes; i++) {
             bbio->stripes[i].physical =
                 map->stripes[stripe_index].physical +
                 stripe_offset + stripe_nr * map->stripe_len;
             bbio->stripes[i].dev = map->stripes[stripe_index].dev;
 
-            if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
-                u64 stripes;
-                u32 last_stripe = 0;
-                int j;
-
-                div_u64_rem(stripe_nr_end - 1,
-                            map->num_stripes,
-                            &last_stripe);
-
-                for (j = 0; j < map->num_stripes; j++) {
-                    u32 test;
-
-                    div_u64_rem(stripe_nr_end - 1 - j,
-                                map->num_stripes, &test);
-                    if (test == stripe_index)
-                        break;
-                }
-                stripes = stripe_nr_end - 1 - j;
-                do_div(stripes, map->num_stripes);
-                bbio->stripes[i].length = map->stripe_len *
-                    (stripes - stripe_nr + 1);
-
-                if (i == 0) {
-                    bbio->stripes[i].length -=
-                        stripe_offset;
-                    stripe_offset = 0;
-                }
-                if (stripe_index == last_stripe)
-                    bbio->stripes[i].length -=
-                        stripe_end_offset;
-            } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
-                u64 stripes;
-                int j;
-                int factor = map->num_stripes /
-                             map->sub_stripes;
-                u32 last_stripe = 0;
-
-                div_u64_rem(stripe_nr_end - 1,
-                            factor, &last_stripe);
-                last_stripe *= map->sub_stripes;
-
-                for (j = 0; j < factor; j++) {
-                    u32 test;
-
-                    div_u64_rem(stripe_nr_end - 1 - j,
-                                factor, &test);
-
-                    if (test ==
-                        stripe_index / map->sub_stripes)
-                        break;
-                }
-                stripes = stripe_nr_end - 1 - j;
-                do_div(stripes, factor);
-                bbio->stripes[i].length = map->stripe_len *
-                    (stripes - stripe_nr + 1);
-
-                if (i < map->sub_stripes) {
-                    bbio->stripes[i].length -=
-                        stripe_offset;
-                    if (i == map->sub_stripes - 1)
-                        stripe_offset = 0;
-                }
-                if (stripe_index >= last_stripe &&
-                    stripe_index <= (last_stripe +
-                                     map->sub_stripes - 1)) {
-                    bbio->stripes[i].length -=
-                        stripe_end_offset;
-                }
+            if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+                             BTRFS_BLOCK_GROUP_RAID10)) {
+                bbio->stripes[i].length = stripes_per_dev *
+                                          map->stripe_len;
+                if (i / sub_stripes < remaining_stripes)
+                    bbio->stripes[i].length +=
+                        map->stripe_len;
+                if (i < sub_stripes)
+                    bbio->stripes[i].length -=
+                        stripe_offset;
+                if ((i / sub_stripes + 1) %
+                    sub_stripes == remaining_stripes)
+                    bbio->stripes[i].length -=
+                        stripe_end_offset;
+                if (i == sub_stripes - 1)
+                    stripe_offset = 0;
             } else
                 bbio->stripes[i].length = *length;
 
@@ -3155,15 +3806,22 @@ again:
             stripe_index++;
         }
     }
-    if (bbio_ret) {
-        *bbio_ret = bbio;
-        bbio->num_stripes = num_stripes;
-        bbio->max_errors = max_errors;
-        bbio->mirror_num = mirror_num;
+
+    if (rw & REQ_WRITE) {
+        if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+                         BTRFS_BLOCK_GROUP_RAID10 |
+                         BTRFS_BLOCK_GROUP_DUP)) {
+            max_errors = 1;
+        }
     }
+
+    *bbio_ret = bbio;
+    bbio->num_stripes = num_stripes;
+    bbio->max_errors = max_errors;
+    bbio->mirror_num = mirror_num;
 out:
     free_extent_map(em);
-    return 0;
+    return ret;
 }
 
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
@@ -3304,7 +3962,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
     /* don't bother with additional async steps for reads, right now */
     if (!(rw & REQ_WRITE)) {
         bio_get(bio);
-        submit_bio(rw, bio);
+        btrfsic_submit_bio(rw, bio);
         bio_put(bio);
         return 0;
     }
@@ -3399,7 +4057,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
         if (async_submit)
             schedule_bio(root, dev, rw, bio);
         else
-            submit_bio(rw, bio);
+            btrfsic_submit_bio(rw, bio);
     } else {
         bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
         bio->bi_sector = logical >> 9;
@@ -3568,7 +4226,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
     struct btrfs_fs_devices *fs_devices;
     int ret;
 
-    mutex_lock(&uuid_mutex);
+    BUG_ON(!mutex_is_locked(&uuid_mutex));
 
     fs_devices = root->fs_info->fs_devices->seed;
     while (fs_devices) {
@@ -3606,7 +4264,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
     fs_devices->seed = root->fs_info->fs_devices->seed;
     root->fs_info->fs_devices->seed = fs_devices;
 out:
-    mutex_unlock(&uuid_mutex);
     return ret;
 }
 
@@ -3749,6 +4406,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
     if (!path)
         return -ENOMEM;
 
+    mutex_lock(&uuid_mutex);
+    lock_chunks(root);
+
     /* first we search for all of the device items, and then we
      * read in all of the chunk items. This way we can create chunk
      * mappings that reference all of the devices that are found
@@ -3799,6 +4459,9 @@ again:
     }
     ret = 0;
 error:
+    unlock_chunks(root);
+    mutex_unlock(&uuid_mutex);
+
     btrfs_free_path(path);
     return ret;
 }
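
A note on the rewritten REQ_DISCARD mapping above: instead of locating the last stripe with two per-RAID-level search loops, the new code computes each device's share of the discarded stripe range in closed form via div_u64_rem. A minimal userspace sketch of that arithmetic follows; the function and variable names are illustrative, plain / and % stand in for div_u64_rem, and the stripe_offset/stripe_end_offset trimming at the range edges is omitted.

    #include <stdint.h>
    #include <stdio.h>

    /* Share of a discard covering stripes [orig, end) that lands on device i.
     * num_stripes devices, sub_stripes mirrored columns (1 for RAID0,
     * map->sub_stripes for RAID10), stripe_len bytes per stripe. */
    static uint64_t discard_len_on_dev(int i, uint64_t orig, uint64_t end,
                                       uint64_t stripe_len,
                                       int num_stripes, int sub_stripes)
    {
        int factor = num_stripes / sub_stripes;     /* distinct columns */
        uint64_t per_dev = (end - orig) / factor;   /* full rounds */
        uint64_t remaining = (end - orig) % factor; /* leftover stripes */
        uint64_t len = per_dev * stripe_len;

        /* the first 'remaining' columns receive one extra stripe */
        if ((uint64_t)(i / sub_stripes) < remaining)
            len += stripe_len;
        return len;
    }

    int main(void)
    {
        /* 5 stripes over 4 devices (RAID0): device 0 gets 2, the rest 1 */
        for (int i = 0; i < 4; i++)
            printf("dev %d: %llu\n", i, (unsigned long long)
                   discard_len_on_dev(i, 0, 5, 65536, 4, 1));
        return 0;
    }
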
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 78f2d4d4f37f..19ac95048b88 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -186,6 +186,51 @@ struct map_lookup {
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
                             (sizeof(struct btrfs_bio_stripe) * (n)))
 
+/*
+ * Restriper's general type filter
+ */
+#define BTRFS_BALANCE_DATA      (1ULL << 0)
+#define BTRFS_BALANCE_SYSTEM    (1ULL << 1)
+#define BTRFS_BALANCE_METADATA  (1ULL << 2)
+
+#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA |    \
+                                 BTRFS_BALANCE_SYSTEM |  \
+                                 BTRFS_BALANCE_METADATA)
+
+#define BTRFS_BALANCE_FORCE     (1ULL << 3)
+#define BTRFS_BALANCE_RESUME    (1ULL << 4)
+
+/*
+ * Balance filters
+ */
+#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0)
+#define BTRFS_BALANCE_ARGS_USAGE    (1ULL << 1)
+#define BTRFS_BALANCE_ARGS_DEVID    (1ULL << 2)
+#define BTRFS_BALANCE_ARGS_DRANGE   (1ULL << 3)
+#define BTRFS_BALANCE_ARGS_VRANGE   (1ULL << 4)
+
+/*
+ * Profile changing flags. When SOFT is set we won't relocate chunk if
+ * it already has the target profile (even though it may be
+ * half-filled).
+ */
+#define BTRFS_BALANCE_ARGS_CONVERT  (1ULL << 8)
+#define BTRFS_BALANCE_ARGS_SOFT     (1ULL << 9)
+
+struct btrfs_balance_args;
+struct btrfs_balance_progress;
+struct btrfs_balance_control {
+    struct btrfs_fs_info *fs_info;
+
+    struct btrfs_balance_args data;
+    struct btrfs_balance_args meta;
+    struct btrfs_balance_args sys;
+
+    u64 flags;
+
+    struct btrfs_balance_progress stat;
+};
+
 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                    u64 end, u64 *length);
 
@@ -228,9 +273,12 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
                                        u8 *uuid, u8 *fsid);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
-int btrfs_balance(struct btrfs_root *dev_root);
+int btrfs_balance(struct btrfs_balance_control *bctl,
+                  struct btrfs_ioctl_balance_args *bargs);
+int btrfs_recover_balance(struct btrfs_root *tree_root);
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
-                         struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                          u64 *start, u64 *max_avail);
 #endif
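
The new balance flags above are independent bits, so a caller can validate an ioctl's selection with pure mask arithmetic. A hedged userspace illustration; only the bit layout comes from the header, the check itself is hypothetical:

    #include <stdint.h>
    #include <stdio.h>

    #define BALANCE_DATA      (1ULL << 0)
    #define BALANCE_SYSTEM    (1ULL << 1)
    #define BALANCE_METADATA  (1ULL << 2)
    #define BALANCE_TYPE_MASK (BALANCE_DATA | BALANCE_SYSTEM | BALANCE_METADATA)
    #define BALANCE_FORCE     (1ULL << 3)
    #define BALANCE_RESUME    (1ULL << 4)

    /* hypothetical sanity check: at least one chunk type selected,
     * and no bits outside the known set */
    static int balance_flags_ok(uint64_t flags)
    {
        uint64_t known = BALANCE_TYPE_MASK | BALANCE_FORCE | BALANCE_RESUME;

        if (flags & ~known)
            return 0;
        return (flags & BALANCE_TYPE_MASK) != 0;
    }

    int main(void)
    {
        printf("%d\n", balance_flags_ok(BALANCE_DATA | BALANCE_FORCE)); /* 1 */
        printf("%d\n", balance_flags_ok(BALANCE_FORCE));                /* 0 */
        return 0;
    }
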
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3848b04e310e..e7a5659087e6 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -200,7 +200,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
     ret = btrfs_update_inode(trans, root, inode);
     BUG_ON(ret);
 out:
-    btrfs_end_transaction_throttle(trans, root);
+    btrfs_end_transaction(trans, root);
     return ret;
 }
 
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b60fc8bfb3e9..620daad201db 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -641,10 +641,10 @@ static int __cap_is_valid(struct ceph_cap *cap)
     unsigned long ttl;
     u32 gen;
 
-    spin_lock(&cap->session->s_cap_lock);
+    spin_lock(&cap->session->s_gen_ttl_lock);
     gen = cap->session->s_cap_gen;
     ttl = cap->session->s_cap_ttl;
-    spin_unlock(&cap->session->s_cap_lock);
+    spin_unlock(&cap->session->s_gen_ttl_lock);
 
     if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
         dout("__cap_is_valid %p cap %p issued %s "
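
The pattern in __cap_is_valid is worth naming: both the generation and the TTL are copied out under the new, dedicated spinlock and the comparison runs on that snapshot afterwards, so the two fields are always read consistently without the lock being held across the time check. A stand-alone sketch with POSIX threads, all names hypothetical:

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    struct session {
        pthread_mutex_t gen_ttl_lock;  /* guards cap_gen and cap_ttl only */
        uint32_t cap_gen;              /* bumped when the server says "stale" */
        time_t cap_ttl;                /* expiry of the current caps */
    };

    static int cap_is_valid(struct session *s, uint32_t issued_gen)
    {
        uint32_t gen;
        time_t ttl;

        pthread_mutex_lock(&s->gen_ttl_lock);
        gen = s->cap_gen;              /* snapshot both fields together */
        ttl = s->cap_ttl;
        pthread_mutex_unlock(&s->gen_ttl_lock);

        return issued_gen >= gen && time(NULL) < ttl;
    }

    int main(void)
    {
        struct session s = { PTHREAD_MUTEX_INITIALIZER, 1, time(NULL) + 60 };
        printf("%d\n", cap_is_valid(&s, 1));   /* 1: current gen, not expired */
        printf("%d\n", cap_is_valid(&s, 0));   /* 0: issued in an old gen */
        return 0;
    }
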
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 74fd74719dc2..3e8094be4604 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -973,12 +973,12 @@ static int dentry_lease_is_valid(struct dentry *dentry)
 
     spin_lock(&dentry->d_lock);
     di = ceph_dentry(dentry);
-    if (di && di->lease_session) {
+    if (di->lease_session) {
         s = di->lease_session;
-        spin_lock(&s->s_cap_lock);
+        spin_lock(&s->s_gen_ttl_lock);
         gen = s->s_cap_gen;
         ttl = s->s_cap_ttl;
-        spin_unlock(&s->s_cap_lock);
+        spin_unlock(&s->s_gen_ttl_lock);
 
         if (di->lease_gen == gen &&
             time_before(jiffies, dentry->d_time) &&
@@ -1072,13 +1072,11 @@ static void ceph_d_release(struct dentry *dentry)
     struct ceph_dentry_info *di = ceph_dentry(dentry);
 
     dout("d_release %p\n", dentry);
-    if (di) {
-        ceph_dentry_lru_del(dentry);
-        if (di->lease_session)
-            ceph_put_mds_session(di->lease_session);
-        kmem_cache_free(ceph_dentry_cachep, di);
-        dentry->d_fsdata = NULL;
-    }
+    ceph_dentry_lru_del(dentry);
+    if (di->lease_session)
+        ceph_put_mds_session(di->lease_session);
+    kmem_cache_free(ceph_dentry_cachep, di);
+    dentry->d_fsdata = NULL;
 }
 
 static int ceph_snapdir_d_revalidate(struct dentry *dentry,
@@ -1096,17 +1094,36 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
  */
 void ceph_dir_set_complete(struct inode *inode)
 {
-    /* not yet implemented */
+    struct dentry *dentry = d_find_any_alias(inode);
+
+    if (dentry && ceph_dentry(dentry) &&
+        ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) {
+        dout(" marking %p (%p) complete\n", inode, dentry);
+        set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
+    }
+    dput(dentry);
 }
 
 void ceph_dir_clear_complete(struct inode *inode)
 {
-    /* not yet implemented */
+    struct dentry *dentry = d_find_any_alias(inode);
+
+    if (dentry && ceph_dentry(dentry)) {
+        dout(" marking %p (%p) NOT complete\n", inode, dentry);
+        clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
+    }
+    dput(dentry);
 }
 
 bool ceph_dir_test_complete(struct inode *inode)
 {
-    /* not yet implemented */
+    struct dentry *dentry = d_find_any_alias(inode);
+
+    if (dentry && ceph_dentry(dentry)) {
+        bool ret = test_bit(CEPH_D_COMPLETE,
+                            &ceph_dentry(dentry)->flags);
+        dput(dentry);
+        return ret;
+    }
+    dput(dentry);
     return false;
 }
 
@@ -1220,6 +1237,7 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
     do {
         ceph_mdsc_get_request(req);
         spin_unlock(&ci->i_unsafe_lock);
+
         dout("dir_fsync %p wait on tid %llu (until %llu)\n",
              inode, req->r_tid, last_tid);
         if (req->r_timeout) {
@@ -1232,9 +1250,9 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
         } else {
             wait_for_completion(&req->r_safe_completion);
         }
-        spin_lock(&ci->i_unsafe_lock);
         ceph_mdsc_put_request(req);
 
+        spin_lock(&ci->i_unsafe_lock);
         if (ret || list_empty(head))
             break;
         req = list_entry(head->next,
@@ -1259,13 +1277,11 @@ void ceph_dentry_lru_add(struct dentry *dn)
 
     dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
          dn->d_name.len, dn->d_name.name);
-    if (di) {
-        mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
-        spin_lock(&mdsc->dentry_lru_lock);
-        list_add_tail(&di->lru, &mdsc->dentry_lru);
-        mdsc->num_dentry++;
-        spin_unlock(&mdsc->dentry_lru_lock);
-    }
+    mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+    spin_lock(&mdsc->dentry_lru_lock);
+    list_add_tail(&di->lru, &mdsc->dentry_lru);
+    mdsc->num_dentry++;
+    spin_unlock(&mdsc->dentry_lru_lock);
 }
 
 void ceph_dentry_lru_touch(struct dentry *dn)
@@ -1275,12 +1291,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
 
     dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
          dn->d_name.len, dn->d_name.name, di->offset);
-    if (di) {
-        mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
-        spin_lock(&mdsc->dentry_lru_lock);
-        list_move_tail(&di->lru, &mdsc->dentry_lru);
-        spin_unlock(&mdsc->dentry_lru_lock);
-    }
+    mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+    spin_lock(&mdsc->dentry_lru_lock);
+    list_move_tail(&di->lru, &mdsc->dentry_lru);
+    spin_unlock(&mdsc->dentry_lru_lock);
 }
 
 void ceph_dentry_lru_del(struct dentry *dn)
@@ -1290,13 +1304,11 @@ void ceph_dentry_lru_del(struct dentry *dn)
 
     dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
          dn->d_name.len, dn->d_name.name);
-    if (di) {
-        mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
-        spin_lock(&mdsc->dentry_lru_lock);
-        list_del_init(&di->lru);
-        mdsc->num_dentry--;
-        spin_unlock(&mdsc->dentry_lru_lock);
-    }
+    mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+    spin_lock(&mdsc->dentry_lru_lock);
+    list_del_init(&di->lru);
+    mdsc->num_dentry--;
+    spin_unlock(&mdsc->dentry_lru_lock);
 }
 
 /*
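
The three LRU helpers above now assume di is always valid, but their shape is unchanged: add appends to the tail, touch moves an entry back to the tail, del unlinks and drops the count, all under one lock. The list mechanics in isolation, as a minimal intrusive list (not the kernel's list.h, names invented):

    #include <stdio.h>

    struct node { struct node *prev, *next; };

    static void list_init(struct node *h) { h->prev = h->next = h; }
    static void list_unlink(struct node *e)
    {
        e->prev->next = e->next;
        e->next->prev = e->prev;
    }
    static void list_add_tail(struct node *e, struct node *h)
    {
        e->prev = h->prev;
        e->next = h;
        h->prev->next = e;
        h->prev = e;
    }
    static void list_move_tail(struct node *e, struct node *h)
    {
        list_unlink(e);          /* "touch": most recently used goes last */
        list_add_tail(e, h);
    }

    int main(void)
    {
        struct node lru, a, b;
        list_init(&lru);
        list_add_tail(&a, &lru);     /* lru: a */
        list_add_tail(&b, &lru);     /* lru: a b */
        list_move_tail(&a, &lru);    /* lru: b a */
        printf("tail is a: %d\n", lru.prev == &a);
        list_unlink(&b);             /* "del" */
        return 0;
    }
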
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9fbcdecaaccd..fbb2a643ef10 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -56,9 +56,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
         return -EINVAL;
 
     spin_lock(&dentry->d_lock);
-    parent = dget(dentry->d_parent);
-    spin_unlock(&dentry->d_lock);
-
+    parent = dentry->d_parent;
     if (*max_len >= connected_handle_length) {
         dout("encode_fh %p connectable\n", dentry);
         cfh->ino = ceph_ino(dentry->d_inode);
@@ -81,7 +79,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
         *max_len = handle_length;
         type = 255;
     }
-    dput(parent);
+    spin_unlock(&dentry->d_lock);
     return type;
 }
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 25283e7a37f8..2c489378b4cd 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -850,11 +850,12 @@ static void ceph_set_dentry_offset(struct dentry *dn)
 {
     struct dentry *dir = dn->d_parent;
     struct inode *inode = dir->d_inode;
-    struct ceph_inode_info *ci = ceph_inode(inode);
+    struct ceph_inode_info *ci;
     struct ceph_dentry_info *di;
 
     BUG_ON(!inode);
 
+    ci = ceph_inode(inode);
     di = ceph_dentry(dn);
 
     spin_lock(&ci->i_ceph_lock);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6203d805eb45..866e8d7ca37d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -262,6 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg,
     /* trace */
     ceph_decode_32_safe(&p, end, len, bad);
     if (len > 0) {
+        ceph_decode_need(&p, end, len, bad);
         err = parse_reply_info_trace(&p, p+len, info, features);
         if (err < 0)
             goto out_bad;
@@ -270,6 +271,7 @@ static int parse_reply_info(struct ceph_msg *msg,
     /* extra */
     ceph_decode_32_safe(&p, end, len, bad);
     if (len > 0) {
+        ceph_decode_need(&p, end, len, bad);
         err = parse_reply_info_extra(&p, p+len, info, features);
         if (err < 0)
             goto out_bad;
@@ -398,9 +400,11 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
     s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
     s->s_con.peer_name.num = cpu_to_le64(mds);
 
-    spin_lock_init(&s->s_cap_lock);
+    spin_lock_init(&s->s_gen_ttl_lock);
     s->s_cap_gen = 0;
     s->s_cap_ttl = 0;
+
+    spin_lock_init(&s->s_cap_lock);
     s->s_renew_requested = 0;
     s->s_renew_seq = 0;
     INIT_LIST_HEAD(&s->s_caps);
@@ -2326,10 +2330,10 @@ static void handle_session(struct ceph_mds_session *session,
     case CEPH_SESSION_STALE:
         pr_info("mds%d caps went stale, renewing\n",
             session->s_mds);
-        spin_lock(&session->s_cap_lock);
+        spin_lock(&session->s_gen_ttl_lock);
         session->s_cap_gen++;
         session->s_cap_ttl = 0;
-        spin_unlock(&session->s_cap_lock);
+        spin_unlock(&session->s_gen_ttl_lock);
         send_renew_caps(mdsc, session);
         break;
 
@@ -2772,7 +2776,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
     di = ceph_dentry(dentry);
     switch (h->action) {
     case CEPH_MDS_LEASE_REVOKE:
-        if (di && di->lease_session == session) {
+        if (di->lease_session == session) {
             if (ceph_seq_cmp(di->lease_seq, seq) > 0)
                 h->seq = cpu_to_le32(di->lease_seq);
             __ceph_mdsc_drop_dentry_lease(dentry);
@@ -2781,7 +2785,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
         break;
 
     case CEPH_MDS_LEASE_RENEW:
-        if (di && di->lease_session == session &&
+        if (di->lease_session == session &&
             di->lease_gen == session->s_cap_gen &&
             di->lease_renew_from &&
             di->lease_renew_after == 0) {
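
Both parse_reply_info hunks add the same guard: after reading a length word off the wire, verify that many bytes actually remain before handing p+len to the section parser. A sketch of the idiom; the helper names are made up, and the real ceph_decode_need jumps to a bad label instead of returning an error code:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* fail if fewer than need bytes remain between *p and end */
    static int decode_need(const uint8_t **p, const uint8_t *end, size_t need)
    {
        return (size_t)(end - *p) >= need ? 0 : -1;
    }

    static int decode_u32(const uint8_t **p, const uint8_t *end, uint32_t *v)
    {
        if (decode_need(p, end, sizeof(*v)))
            return -1;
        memcpy(v, *p, sizeof(*v));   /* assumes a little-endian host */
        *p += sizeof(*v);
        return 0;
    }

    int main(void)
    {
        uint8_t buf[4] = { 5, 0, 0, 0 };   /* length word claims 5 bytes */
        const uint8_t *p = buf;
        uint32_t len;

        if (decode_u32(&p, buf + sizeof(buf), &len) == 0 &&
            decode_need(&p, buf + sizeof(buf), len) != 0)
            printf("short buffer: section claims %u bytes\n", len);
        return 0;
    }
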
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index a50ca0e39475..8c7c04ebb595 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -117,10 +117,13 @@ struct ceph_mds_session {
     void             *s_authorizer_buf, *s_authorizer_reply_buf;
     size_t            s_authorizer_buf_len, s_authorizer_reply_buf_len;
 
-    /* protected by s_cap_lock */
-    spinlock_t        s_cap_lock;
+    /* protected by s_gen_ttl_lock */
+    spinlock_t        s_gen_ttl_lock;
     u32               s_cap_gen;  /* inc each time we get mds stale msg */
     unsigned long     s_cap_ttl;  /* when session caps expire */
+
+    /* protected by s_cap_lock */
+    spinlock_t        s_cap_lock;
     struct list_head  s_caps;     /* all caps issued by this session */
     int               s_nr_caps, s_trim_caps;
     int               s_num_cap_releases;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 11bd0fc4853f..00de2c9568cd 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -131,6 +131,8 @@ enum {
     Opt_rbytes,
     Opt_norbytes,
     Opt_noasyncreaddir,
+    Opt_dcache,
+    Opt_nodcache,
     Opt_ino32,
 };
 
@@ -152,6 +154,8 @@ static match_table_t fsopt_tokens = {
     {Opt_rbytes, "rbytes"},
     {Opt_norbytes, "norbytes"},
     {Opt_noasyncreaddir, "noasyncreaddir"},
+    {Opt_dcache, "dcache"},
+    {Opt_nodcache, "nodcache"},
     {Opt_ino32, "ino32"},
     {-1, NULL}
 };
@@ -231,6 +235,12 @@ static int parse_fsopt_token(char *c, void *private)
     case Opt_noasyncreaddir:
         fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
         break;
+    case Opt_dcache:
+        fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
+        break;
+    case Opt_nodcache:
+        fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
+        break;
     case Opt_ino32:
         fsopt->flags |= CEPH_MOUNT_OPT_INO32;
         break;
@@ -377,6 +387,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
         seq_puts(m, ",norbytes");
     if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
         seq_puts(m, ",noasyncreaddir");
+    if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE)
+        seq_puts(m, ",dcache");
+    else
+        seq_puts(m, ",nodcache");
 
     if (fsopt->wsize)
         seq_printf(m, ",wsize=%d", fsopt->wsize);
@@ -636,19 +650,26 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
     req->r_num_caps = 2;
     err = ceph_mdsc_do_request(mdsc, NULL, req);
     if (err == 0) {
+        struct inode *inode = req->r_target_inode;
+        req->r_target_inode = NULL;
         dout("open_root_inode success\n");
-        if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
+        if (ceph_ino(inode) == CEPH_INO_ROOT &&
             fsc->sb->s_root == NULL) {
-            root = d_alloc_root(req->r_target_inode);
-            ceph_init_dentry(root);
+            root = d_alloc_root(inode);
+            if (!root) {
+                iput(inode);
+                root = ERR_PTR(-ENOMEM);
+                goto out;
+            }
         } else {
-            root = d_obtain_alias(req->r_target_inode);
+            root = d_obtain_alias(inode);
         }
-        req->r_target_inode = NULL;
+        ceph_init_dentry(root);
         dout("open_root_inode success, root dentry is %p\n", root);
     } else {
         root = ERR_PTR(err);
     }
+out:
     ceph_mdsc_put_request(req);
     return root;
 }
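
dcache/nodcache follow the standard three-touch recipe for a new mount option: an enum token, a match_table_t entry, and a case in the parser that sets or clears one flag bit, plus a line in show_options so the choice round-trips through /proc/mounts. The set/clear idiom reduced to plain C; the flag value is borrowed from the patch, the strcmp parser is a stand-in for the kernel's token matcher:

    #include <stdio.h>
    #include <string.h>

    #define MOUNT_OPT_DCACHE (1 << 9)

    static int parse_opt(const char *tok, unsigned *flags)
    {
        if (strcmp(tok, "dcache") == 0)
            *flags |= MOUNT_OPT_DCACHE;      /* opt in */
        else if (strcmp(tok, "nodcache") == 0)
            *flags &= ~MOUNT_OPT_DCACHE;     /* explicit opt out */
        else
            return -1;
        return 0;
    }

    int main(void)
    {
        unsigned flags = 0;
        parse_opt("dcache", &flags);
        parse_opt("nodcache", &flags);       /* later token wins */
        printf("dcache=%d\n", !!(flags & MOUNT_OPT_DCACHE));
        return 0;
    }
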
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cb3652b37271..1421f3d875a2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -28,6 +28,7 @@
 #define CEPH_MOUNT_OPT_RBYTES          (1<<5) /* dir st_bytes = rbytes */
 #define CEPH_MOUNT_OPT_NOASYNCREADDIR  (1<<7) /* no dcache readdir */
 #define CEPH_MOUNT_OPT_INO32           (1<<8) /* 32 bit inos */
+#define CEPH_MOUNT_OPT_DCACHE          (1<<9) /* use dcache for readdir etc */
 
 #define CEPH_MOUNT_OPT_DEFAULT    (CEPH_MOUNT_OPT_RBYTES)
 
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index a5e36e4488a7..a76f697303d9 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -111,8 +111,10 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 }
 
 static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
+    { true, "ceph.file.layout", ceph_vxattrcb_layout},
+    /* The following extended attribute name is deprecated */
     { true, "ceph.layout", ceph_vxattrcb_layout},
-    { NULL, NULL }
+    { true, NULL, NULL }
 };
 
 static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
@@ -818,6 +820,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
     struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
     int issued;
     int err;
+    int required_blob_size;
     int dirty;
 
     if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -833,14 +836,34 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
         return -EOPNOTSUPP;
     }
 
+    err = -ENOMEM;
     spin_lock(&ci->i_ceph_lock);
     __build_xattrs(inode);
+retry:
     issued = __ceph_caps_issued(ci, NULL);
     dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
 
     if (!(issued & CEPH_CAP_XATTR_EXCL))
         goto do_sync;
 
+    required_blob_size = __get_required_blob_size(ci, 0, 0);
+
+    if (!ci->i_xattrs.prealloc_blob ||
+        required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
+        struct ceph_buffer *blob;
+
+        spin_unlock(&ci->i_ceph_lock);
+        dout(" preallocating new blob size=%d\n", required_blob_size);
+        blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
+        if (!blob)
+            goto out;
+        spin_lock(&ci->i_ceph_lock);
+        if (ci->i_xattrs.prealloc_blob)
+            ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+        ci->i_xattrs.prealloc_blob = blob;
+        goto retry;
+    }
+
     err = __remove_xattr_by_name(ceph_inode(inode), name);
     dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
     ci->i_xattrs.dirty = true;
@@ -853,6 +876,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
 do_sync:
     spin_unlock(&ci->i_ceph_lock);
     err = ceph_send_removexattr(dentry, name);
+out:
     return err;
 }
 
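
The removexattr change copies setxattr's preallocation dance: the blob may need to grow, the allocation can sleep, so the lock is dropped around it and the size check is redone after relocking. The control flow in miniature, with a mutex standing in for the spinlock and every name invented:

    #include <pthread.h>
    #include <stdlib.h>

    struct xattrs {
        pthread_mutex_t lock;
        char *blob;
        size_t alloc_len;
        size_t needed;       /* recomputed under the lock */
    };

    static int reserve_blob(struct xattrs *x)
    {
        pthread_mutex_lock(&x->lock);
    retry:
        if (!x->blob || x->needed > x->alloc_len) {
            size_t want = x->needed;
            char *blob;

            pthread_mutex_unlock(&x->lock);   /* allocation may sleep */
            blob = malloc(want);
            if (!blob)
                return -1;
            pthread_mutex_lock(&x->lock);
            free(x->blob);                    /* replace the old blob */
            x->blob = blob;
            x->alloc_len = want;
            goto retry;                       /* size may have grown */
        }
        /* ... modify the xattrs while still holding the lock ... */
        pthread_mutex_unlock(&x->lock);
        return 0;
    }

    int main(void)
    {
        struct xattrs x = { PTHREAD_MUTEX_INITIALIZER, NULL, 0, 128 };
        return reserve_blob(&x);
    }
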
diff --git a/fs/char_dev.c b/fs/char_dev.c
index dca9e5e0f73b..3f152b92a94a 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -272,7 +272,7 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
     cd = __register_chrdev_region(major, baseminor, count, name);
     if (IS_ERR(cd))
         return PTR_ERR(cd);
-    
+
     cdev = cdev_alloc();
     if (!cdev)
         goto out2;
@@ -280,7 +280,7 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
     cdev->owner = fops->owner;
     cdev->ops = fops;
     kobject_set_name(&cdev->kobj, "%s", name);
-    
+
     err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
     if (err)
         goto out;
@@ -405,7 +405,7 @@ static int chrdev_open(struct inode *inode, struct file *filp)
         goto out_cdev_put;
 
     if (filp->f_op->open) {
-        ret = filp->f_op->open(inode,filp);
+        ret = filp->f_op->open(inode, filp);
         if (ret)
             goto out_cdev_put;
     }
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index f66cc1625150..0554b00a7b33 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -140,7 +140,6 @@ config CIFS_DFS_UPCALL
 
 config CIFS_FSCACHE
       bool "Provide CIFS client caching support (EXPERIMENTAL)"
-      depends on EXPERIMENTAL
       depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
       help
         Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
@@ -149,7 +148,7 @@ config CIFS_FSCACHE
 
 config CIFS_ACL
       bool "Provide CIFS ACL support (EXPERIMENTAL)"
-      depends on EXPERIMENTAL && CIFS_XATTR && KEYS
+      depends on CIFS_XATTR && KEYS
       help
         Allows to fetch CIFS/NTFS ACL from the server. The DACL blob
         is handed over to the application/caller.
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 84e8c0724704..24b3dfc05282 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -676,14 +676,23 @@ static ssize_t cifs_multiuser_mount_proc_write(struct file *file,
 {
     char c;
     int rc;
+    static bool warned;
 
     rc = get_user(c, buffer);
     if (rc)
         return rc;
     if (c == '0' || c == 'n' || c == 'N')
         multiuser_mount = 0;
-    else if (c == '1' || c == 'y' || c == 'Y')
+    else if (c == '1' || c == 'y' || c == 'Y') {
         multiuser_mount = 1;
+        if (!warned) {
+            warned = true;
+            printk(KERN_WARNING "CIFS VFS: The legacy multiuser "
+                "mount code is scheduled to be deprecated in "
+                "3.5. Please switch to using the multiuser "
+                "mount option.");
+        }
+    }
 
     return count;
 }
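
The one-time deprecation warning relies on nothing more than a function-local static flag, the open-coded equivalent of printk_once. Reduced:

    #include <stdio.h>

    static void enable_legacy_mode(void)
    {
        static int warned;   /* persists across calls, zero-initialized */

        if (!warned) {
            warned = 1;
            fprintf(stderr, "legacy multiuser mount is deprecated\n");
        }
        /* ... enable the feature ... */
    }

    int main(void)
    {
        enable_legacy_mode();
        enable_legacy_mode();   /* second call stays quiet */
        return 0;
    }
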
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 2272fd5fe5b7..e622863b292f 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -113,9 +113,11 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
            MAX_MECH_STR_LEN +
            UID_KEY_LEN + (sizeof(uid_t) * 2) +
            CREDUID_KEY_LEN + (sizeof(uid_t) * 2) +
-           USER_KEY_LEN + strlen(sesInfo->user_name) +
            PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
 
+    if (sesInfo->user_name)
+        desc_len += USER_KEY_LEN + strlen(sesInfo->user_name);
+
     spnego_key = ERR_PTR(-ENOMEM);
     description = kzalloc(desc_len, GFP_KERNEL);
     if (description == NULL)
@@ -152,8 +154,10 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
     dp = description + strlen(description);
     sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
 
-    dp = description + strlen(description);
-    sprintf(dp, ";user=%s", sesInfo->user_name);
+    if (sesInfo->user_name) {
+        dp = description + strlen(description);
+        sprintf(dp, ";user=%s", sesInfo->user_name);
+    }
 
     dp = description + strlen(description);
     sprintf(dp, ";pid=0x%x", current->pid);
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 1b2e180b018d..fbb9da951843 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -27,17 +27,17 @@
 #include "cifs_debug.h"
 
 /*
- * cifs_ucs2_bytes - how long will a string be after conversion?
- * @ucs - pointer to input string
+ * cifs_utf16_bytes - how long will a string be after conversion?
+ * @utf16 - pointer to input string
  * @maxbytes - don't go past this many bytes of input string
  * @codepage - destination codepage
  *
- * Walk a ucs2le string and return the number of bytes that the string will
+ * Walk a utf16le string and return the number of bytes that the string will
  * be after being converted to the given charset, not including any null
  * termination required. Don't walk past maxbytes in the source buffer.
  */
 int
-cifs_ucs2_bytes(const __le16 *from, int maxbytes,
+cifs_utf16_bytes(const __le16 *from, int maxbytes,
         const struct nls_table *codepage)
 {
     int i;
@@ -122,7 +122,7 @@ cp_convert:
 }
 
 /*
- * cifs_from_ucs2 - convert utf16le string to local charset
+ * cifs_from_utf16 - convert utf16le string to local charset
  * @to - destination buffer
  * @from - source buffer
  * @tolen - destination buffer size (in bytes)
@@ -130,7 +130,7 @@ cp_convert:
  * @codepage - codepage to which characters should be converted
  * @mapchar - should characters be remapped according to the mapchars option?
  *
- * Convert a little-endian ucs2le string (as sent by the server) to a string
+ * Convert a little-endian utf16le string (as sent by the server) to a string
  * in the provided codepage. The tolen and fromlen parameters are to ensure
  * that the code doesn't walk off of the end of the buffer (which is always
  * a danger if the alignment of the source buffer is off). The destination
@@ -139,12 +139,12 @@ cp_convert:
  * null terminator).
  *
  * Note that some windows versions actually send multiword UTF-16 characters
- * instead of straight UCS-2. The linux nls routines however aren't able to
+ * instead of straight UTF-16. The linux nls routines however aren't able to
  * deal with those characters properly. In the event that we get some of
  * those characters, they won't be translated properly.
  */
 int
-cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
+cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
            const struct nls_table *codepage, bool mapchar)
 {
     int i, charlen, safelen;
@@ -190,13 +190,13 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
 }
 
 /*
- * NAME:    cifs_strtoUCS()
+ * NAME:    cifs_strtoUTF16()
  *
  * FUNCTION:    Convert character string to unicode string
  *
  */
 int
-cifs_strtoUCS(__le16 *to, const char *from, int len,
+cifs_strtoUTF16(__le16 *to, const char *from, int len,
           const struct nls_table *codepage)
 {
     int charlen;
@@ -206,7 +206,7 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
     for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
         charlen = codepage->char2uni(from, len, &wchar_to);
         if (charlen < 1) {
-            cERROR(1, "strtoUCS: char2uni of 0x%x returned %d",
+            cERROR(1, "strtoUTF16: char2uni of 0x%x returned %d",
                 *from, charlen);
             /* A question mark */
             wchar_to = 0x003f;
@@ -220,7 +220,8 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
 }
 
 /*
- * cifs_strndup_from_ucs - copy a string from wire format to the local codepage
+ * cifs_strndup_from_utf16 - copy a string from wire format to the local
+ * codepage
  * @src - source string
  * @maxlen - don't walk past this many bytes in the source string
 * @is_unicode - is this a unicode string?
@@ -231,19 +232,19 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
  * error.
  */
 char *
-cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
-              const struct nls_table *codepage)
+cifs_strndup_from_utf16(const char *src, const int maxlen,
+            const bool is_unicode, const struct nls_table *codepage)
 {
     int len;
     char *dst;
 
     if (is_unicode) {
-        len = cifs_ucs2_bytes((__le16 *) src, maxlen, codepage);
+        len = cifs_utf16_bytes((__le16 *) src, maxlen, codepage);
         len += nls_nullsize(codepage);
         dst = kmalloc(len, GFP_KERNEL);
         if (!dst)
             return NULL;
-        cifs_from_ucs2(dst, (__le16 *) src, len, maxlen, codepage,
+        cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
                false);
     } else {
         len = strnlen(src, maxlen);
@@ -264,7 +265,7 @@ cifs_strndup_from_utf16(const char *src, const int maxlen,
  * names are little endian 16 bit Unicode on the wire
  */
 int
-cifsConvertToUCS(__le16 *target, const char *source, int srclen,
+cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
          const struct nls_table *cp, int mapChars)
 {
     int i, j, charlen;
@@ -273,7 +274,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
     wchar_t tmp;
 
     if (!mapChars)
-        return cifs_strtoUCS(target, source, PATH_MAX, cp);
+        return cifs_strtoUTF16(target, source, PATH_MAX, cp);
 
     for (i = 0, j = 0; i < srclen; j++) {
         src_char = source[i];
@@ -281,7 +282,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
         switch (src_char) {
         case 0:
             put_unaligned(0, &target[j]);
-            goto ctoUCS_out;
+            goto ctoUTF16_out;
         case ':':
             dst_char = cpu_to_le16(UNI_COLON);
             break;
@@ -326,7 +327,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
         put_unaligned(dst_char, &target[j]);
     }
 
-ctoUCS_out:
+ctoUTF16_out:
     return i;
 }
 
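
Behind all the renames, cifs_utf16_bytes stays a pure measuring pass: walk the UTF-16LE input up to maxbytes, ask the target codepage what each character costs, and sum, so callers can size the output buffer before converting. An ASCII-only model of that loop, with the nls lookup replaced by a constant cost of 1:

    #include <stdint.h>
    #include <stdio.h>

    /* Measuring pass: output bytes needed to convert a NUL-terminated
     * UTF-16LE string, assuming every code unit costs one byte. The
     * real code asks the nls codepage for the per-character length. */
    static int utf16_bytes(const uint16_t *from, int maxbytes)
    {
        int i, n = 0;

        for (i = 0; i < maxbytes / 2 && from[i]; i++)
            n += 1;              /* += charlen for multibyte codepages */
        return n;
    }

    int main(void)
    {
        uint16_t s[3] = { 'h', 'i', 0 };   /* UTF-16LE on a LE host */
        printf("%d\n", utf16_bytes(s, (int)sizeof(s)));   /* prints 2 */
        return 0;
    }
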
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 6d02fd560566..a513a546700b 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -74,16 +74,16 @@ extern const struct UniCaseRange CifsUniLowerRange[];
 #endif              /* UNIUPR_NOLOWER */
 
 #ifdef __KERNEL__
-int cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
-           const struct nls_table *codepage, bool mapchar);
-int cifs_ucs2_bytes(const __le16 *from, int maxbytes,
-            const struct nls_table *codepage);
-int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
-char *cifs_strndup_from_ucs(const char *src, const int maxlen,
-                const bool is_unicode,
-                const struct nls_table *codepage);
-extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
-                const struct nls_table *cp, int mapChars);
+int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
+            const struct nls_table *codepage, bool mapchar);
+int cifs_utf16_bytes(const __le16 *from, int maxbytes,
+             const struct nls_table *codepage);
+int cifs_strtoUTF16(__le16 *, const char *, int, const struct nls_table *);
+char *cifs_strndup_from_utf16(const char *src, const int maxlen,
+                  const bool is_unicode,
+                  const struct nls_table *codepage);
+extern int cifsConvertToUTF16(__le16 *target, const char *source, int maxlen,
+                  const struct nls_table *cp, int mapChars);
 
 #endif
 
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 72ddf23ef6f7..c1b254487388 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -909,6 +909,8 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
         umode_t group_mask = S_IRWXG;
         umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
 
+        if (num_aces > ULONG_MAX / sizeof(struct cifs_ace *))
+            return;
         ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
                 GFP_KERNEL);
         if (!ppace) {
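
The parse_dacl fix is the canonical multiplication-overflow guard: num_aces is attacker-controlled, and a large count times sizeof(pointer) can wrap into a small allocation, so the count must be bounded by ULONG_MAX divided by the element size before it reaches the allocator. The guard by itself:

    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* num_aces arrives off the wire; reject counts whose byte size
     * would wrap around before they reach the allocator */
    static void **alloc_ace_ptrs(unsigned long num_aces)
    {
        if (num_aces > ULONG_MAX / sizeof(void *))
            return NULL;
        return malloc(num_aces * sizeof(void *));
    }

    int main(void)
    {
        void **ok = alloc_ace_ptrs(4);
        void **bad = alloc_ace_ptrs(ULONG_MAX);   /* rejected: would wrap */

        printf("ok=%s bad=%s\n", ok ? "alloc" : "null",
               bad ? "alloc" : "null");
        free(ok);
        return 0;
    }
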
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 5d9b9acc5fce..63c460e503b6 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -327,7 +327,7 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
     attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
     attrptr->length = cpu_to_le16(2 * dlen);
     blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
-    cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
+    cifs_strtoUTF16((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
 
     return 0;
 }
@@ -376,7 +376,7 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
                 kmalloc(attrsize + 1, GFP_KERNEL);
             if (!ses->domainName)
                 return -ENOMEM;
-            cifs_from_ucs2(ses->domainName,
+            cifs_from_utf16(ses->domainName,
                 (__le16 *)blobptr, attrsize, attrsize,
                 nls_cp, false);
             break;
@@ -420,15 +420,20 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
     }
 
     /* convert ses->user_name to unicode and uppercase */
-    len = strlen(ses->user_name);
+    len = ses->user_name ? strlen(ses->user_name) : 0;
     user = kmalloc(2 + (len * 2), GFP_KERNEL);
     if (user == NULL) {
         cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
         rc = -ENOMEM;
         return rc;
     }
-    len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp);
-    UniStrupr(user);
+
+    if (len) {
+        len = cifs_strtoUTF16((__le16 *)user, ses->user_name, len, nls_cp);
+        UniStrupr(user);
+    } else {
+        memset(user, '\0', 2);
+    }
 
     rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
                 (char *)user, 2 * len);
@@ -448,8 +453,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
             rc = -ENOMEM;
             return rc;
         }
-        len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
-                    nls_cp);
+        len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len,
+                      nls_cp);
         rc =
         crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
                     (char *)domain, 2 * len);
@@ -468,7 +473,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
             rc = -ENOMEM;
             return rc;
         }
-        len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
+        len = cifs_strtoUTF16((__le16 *)server, ses->serverName, len,
                     nls_cp);
         rc =
         crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index ba53c1c6c6cc..76e7d8b6da17 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -879,6 +879,8 @@ require use of the stronger protocol */
 #define   CIFSSEC_MASK          0xB70B7 /* current flags supported if weak */
 #endif /* UPCALL */
 #else /* do not allow weak pw hash */
+#define   CIFSSEC_MUST_LANMAN   0
+#define   CIFSSEC_MUST_PLNTXT   0
 #ifdef CONFIG_CIFS_UPCALL
 #define   CIFSSEC_MASK          0x8F08F /* flags supported if no weak allowed */
 #else
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 6600aa2d2ef3..8b7794c31591 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -821,8 +821,8 @@ PsxDelete:
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         name_len =
-            cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
-                             PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+                               PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
     } else { /* BB add path length overrun check */
@@ -893,8 +893,8 @@ DelFileRetry:
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         name_len =
-            cifsConvertToUCS((__le16 *) pSMB->fileName, fileName,
-                             PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *) pSMB->fileName, fileName,
+                               PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
     } else {        /* BB improve check for buffer overruns BB */
@@ -938,8 +938,8 @@ RmDirRetry:
         return rc;
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-        name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, dirName,
-                                    PATH_MAX, nls_codepage, remap);
+        name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, dirName,
+                                      PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
     } else {        /* BB improve check for buffer overruns BB */
@@ -981,8 +981,8 @@ MkDirRetry:
         return rc;
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-        name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, name,
-                                    PATH_MAX, nls_codepage, remap);
+        name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name,
+                                      PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
     } else {        /* BB improve check for buffer overruns BB */
@@ -1030,8 +1030,8 @@ PsxCreat:
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         name_len =
-            cifsConvertToUCS((__le16 *) pSMB->FileName, name,
-                             PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *) pSMB->FileName, name,
+                               PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
     } else {    /* BB improve the check for buffer overruns BB */
@@ -1197,8 +1197,8 @@ OldOpenRetry:
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         count = 1;    /* account for one byte pad to word boundary */
         name_len =
-            cifsConvertToUCS((__le16 *) (pSMB->fileName + 1),
-                             fileName, PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1),
+                               fileName, PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
     } else {        /* BB improve check for buffer overruns BB */
@@ -1304,8 +1304,8 @@ openRetry:
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         count = 1;    /* account for one byte pad to word boundary */
         name_len =
-            cifsConvertToUCS((__le16 *) (pSMB->fileName + 1),
-                             fileName, PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1),
+                               fileName, PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
         pSMB->NameLength = cpu_to_le16(name_len);
@@ -2649,16 +2649,16 @@ renameRetry:
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         name_len =
-            cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName,
-                             PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName,
+                               PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
         pSMB->OldFileName[name_len] = 0x04;    /* pad */
     /* protocol requires ASCII signature byte on Unicode string */
         pSMB->OldFileName[name_len + 1] = 0x00;
         name_len2 =
-            cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
-                             toName, PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+                               toName, PATH_MAX, nls_codepage, remap);
         name_len2 += 1 /* trailing null */  + 1 /* Signature word */ ;
         name_len2 *= 2;    /* convert to bytes */
     } else {    /* BB improve the check for buffer overruns BB */
@@ -2738,10 +2738,12 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifs_tcon *pTcon,
     /* unicode only call */
     if (target_name == NULL) {
         sprintf(dummy_string, "cifs%x", pSMB->hdr.Mid);
-        len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name,
+        len_of_str =
+            cifsConvertToUTF16((__le16 *)rename_info->target_name,
                     dummy_string, 24, nls_codepage, remap);
     } else {
-        len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name,
+        len_of_str =
+            cifsConvertToUTF16((__le16 *)rename_info->target_name,
                     target_name, PATH_MAX, nls_codepage,
                     remap);
     }
@@ -2795,17 +2797,17 @@ copyRetry:
     pSMB->Flags = cpu_to_le16(flags & COPY_TREE);
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-        name_len = cifsConvertToUCS((__le16 *) pSMB->OldFileName,
-                                    fromName, PATH_MAX, nls_codepage,
-                                    remap);
+        name_len = cifsConvertToUTF16((__le16 *) pSMB->OldFileName,
+                                      fromName, PATH_MAX, nls_codepage,
+                                      remap);
         name_len++;     /* trailing null */
         name_len *= 2;
         pSMB->OldFileName[name_len] = 0x04;     /* pad */
         /* protocol requires ASCII signature byte on Unicode string */
         pSMB->OldFileName[name_len + 1] = 0x00;
         name_len2 =
-            cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
-                             toName, PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+                               toName, PATH_MAX, nls_codepage, remap);
         name_len2 += 1 /* trailing null */  + 1 /* Signature word */ ;
         name_len2 *= 2; /* convert to bytes */
     } else {    /* BB improve the check for buffer overruns BB */
@@ -2861,9 +2863,9 @@ createSymLinkRetry:
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         name_len =
-            cifs_strtoUCS((__le16 *) pSMB->FileName, fromName, PATH_MAX
-                          /* find define for this maxpathcomponent */
-                          , nls_codepage);
+            cifs_strtoUTF16((__le16 *) pSMB->FileName, fromName,
+                            /* find define for this maxpathcomponent */
+                            PATH_MAX, nls_codepage);
         name_len++;    /* trailing null */
         name_len *= 2;
 
@@ -2885,9 +2887,9 @@ createSymLinkRetry:
     data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         name_len_target =
-            cifs_strtoUCS((__le16 *) data_offset, toName, PATH_MAX
-                          /* find define for this maxpathcomponent */
-                          , nls_codepage);
+            cifs_strtoUTF16((__le16 *) data_offset, toName, PATH_MAX
+                            /* find define for this maxpathcomponent */
+                            , nls_codepage);
         name_len_target++;    /* trailing null */
         name_len_target *= 2;
     } else {    /* BB improve the check for buffer overruns BB */
@@ -2949,8 +2951,8 @@ createHardLinkRetry:
         return rc;
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-        name_len = cifsConvertToUCS((__le16 *) pSMB->FileName, toName,
-                                    PATH_MAX, nls_codepage, remap);
+        name_len = cifsConvertToUTF16((__le16 *) pSMB->FileName, toName,
+                                      PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
 
@@ -2972,8 +2974,8 @@ createHardLinkRetry:
     data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         name_len_target =
-            cifsConvertToUCS((__le16 *) data_offset, fromName, PATH_MAX,
-                             nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *) data_offset, fromName,
+                               PATH_MAX, nls_codepage, remap);
         name_len_target++;    /* trailing null */
         name_len_target *= 2;
     } else {    /* BB improve the check for buffer overruns BB */
@@ -3042,8 +3044,8 @@ winCreateHardLinkRetry:
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         name_len =
-            cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName,
-                             PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName,
+                               PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
 
@@ -3051,8 +3053,8 @@ winCreateHardLinkRetry:
         pSMB->OldFileName[name_len] = 0x04;
         pSMB->OldFileName[name_len + 1] = 0x00;    /* pad */
3053 name_len2 = 3055 name_len2 =
3054 cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2], 3056 cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
3055 toName, PATH_MAX, nls_codepage, remap); 3057 toName, PATH_MAX, nls_codepage, remap);
3056 name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; 3058 name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ;
3057 name_len2 *= 2; /* convert to bytes */ 3059 name_len2 *= 2; /* convert to bytes */
3058 } else { /* BB improve the check for buffer overruns BB */ 3060 } else { /* BB improve the check for buffer overruns BB */
@@ -3108,8 +3110,8 @@ querySymLinkRetry:
3108 3110
3109 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 3111 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
3110 name_len = 3112 name_len =
3111 cifs_strtoUCS((__le16 *) pSMB->FileName, searchName, 3113 cifs_strtoUTF16((__le16 *) pSMB->FileName, searchName,
3112 PATH_MAX, nls_codepage); 3114 PATH_MAX, nls_codepage);
3113 name_len++; /* trailing null */ 3115 name_len++; /* trailing null */
3114 name_len *= 2; 3116 name_len *= 2;
3115 } else { /* BB improve the check for buffer overruns BB */ 3117 } else { /* BB improve the check for buffer overruns BB */
@@ -3166,8 +3168,8 @@ querySymLinkRetry:
3166 is_unicode = false; 3168 is_unicode = false;
3167 3169
3168 /* BB FIXME investigate remapping reserved chars here */ 3170 /* BB FIXME investigate remapping reserved chars here */
3169 *symlinkinfo = cifs_strndup_from_ucs(data_start, count, 3171 *symlinkinfo = cifs_strndup_from_utf16(data_start,
3170 is_unicode, nls_codepage); 3172 count, is_unicode, nls_codepage);
3171 if (!*symlinkinfo) 3173 if (!*symlinkinfo)
3172 rc = -ENOMEM; 3174 rc = -ENOMEM;
3173 } 3175 }
@@ -3450,8 +3452,9 @@ queryAclRetry:
3450 3452
3451 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 3453 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
3452 name_len = 3454 name_len =
3453 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 3455 cifsConvertToUTF16((__le16 *) pSMB->FileName,
3454 PATH_MAX, nls_codepage, remap); 3456 searchName, PATH_MAX, nls_codepage,
3457 remap);
3455 name_len++; /* trailing null */ 3458 name_len++; /* trailing null */
3456 name_len *= 2; 3459 name_len *= 2;
3457 pSMB->FileName[name_len] = 0; 3460 pSMB->FileName[name_len] = 0;
@@ -3537,8 +3540,8 @@ setAclRetry:
3537 return rc; 3540 return rc;
3538 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 3541 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
3539 name_len = 3542 name_len =
3540 cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, 3543 cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
3541 PATH_MAX, nls_codepage, remap); 3544 PATH_MAX, nls_codepage, remap);
3542 name_len++; /* trailing null */ 3545 name_len++; /* trailing null */
3543 name_len *= 2; 3546 name_len *= 2;
3544 } else { /* BB improve the check for buffer overruns BB */ 3547 } else { /* BB improve the check for buffer overruns BB */
@@ -3948,8 +3951,9 @@ QInfRetry:
3948 3951
3949 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 3952 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
3950 name_len = 3953 name_len =
3951 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 3954 cifsConvertToUTF16((__le16 *) pSMB->FileName,
3952 PATH_MAX, nls_codepage, remap); 3955 searchName, PATH_MAX, nls_codepage,
3956 remap);
3953 name_len++; /* trailing null */ 3957 name_len++; /* trailing null */
3954 name_len *= 2; 3958 name_len *= 2;
3955 } else { 3959 } else {
@@ -4086,8 +4090,8 @@ QPathInfoRetry:
4086 4090
4087 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 4091 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
4088 name_len = 4092 name_len =
4089 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 4093 cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
4090 PATH_MAX, nls_codepage, remap); 4094 PATH_MAX, nls_codepage, remap);
4091 name_len++; /* trailing null */ 4095 name_len++; /* trailing null */
4092 name_len *= 2; 4096 name_len *= 2;
4093 } else { /* BB improve the check for buffer overruns BB */ 4097 } else { /* BB improve the check for buffer overruns BB */
@@ -4255,8 +4259,8 @@ UnixQPathInfoRetry:
4255 4259
4256 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 4260 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
4257 name_len = 4261 name_len =
4258 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 4262 cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
4259 PATH_MAX, nls_codepage, remap); 4263 PATH_MAX, nls_codepage, remap);
4260 name_len++; /* trailing null */ 4264 name_len++; /* trailing null */
4261 name_len *= 2; 4265 name_len *= 2;
4262 } else { /* BB improve the check for buffer overruns BB */ 4266 } else { /* BB improve the check for buffer overruns BB */
@@ -4344,8 +4348,8 @@ findFirstRetry:
4344 4348
4345 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 4349 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
4346 name_len = 4350 name_len =
4347 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 4351 cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
4348 PATH_MAX, nls_codepage, remap); 4352 PATH_MAX, nls_codepage, remap);
4349 /* We can not add the asterik earlier in case 4353 /* We can not add the asterik earlier in case
4350 it got remapped to 0xF03A as if it were part of the 4354 it got remapped to 0xF03A as if it were part of the
4351 directory name instead of a wildcard */ 4355 directory name instead of a wildcard */
@@ -4656,8 +4660,9 @@ GetInodeNumberRetry:
4656 4660
4657 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 4661 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
4658 name_len = 4662 name_len =
4659 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 4663 cifsConvertToUTF16((__le16 *) pSMB->FileName,
4660 PATH_MAX, nls_codepage, remap); 4664 searchName, PATH_MAX, nls_codepage,
4665 remap);
4661 name_len++; /* trailing null */ 4666 name_len++; /* trailing null */
4662 name_len *= 2; 4667 name_len *= 2;
4663 } else { /* BB improve the check for buffer overruns BB */ 4668 } else { /* BB improve the check for buffer overruns BB */
@@ -4794,9 +4799,9 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4794 rc = -ENOMEM; 4799 rc = -ENOMEM;
4795 goto parse_DFS_referrals_exit; 4800 goto parse_DFS_referrals_exit;
4796 } 4801 }
4797 cifsConvertToUCS((__le16 *) tmp, searchName, 4802 cifsConvertToUTF16((__le16 *) tmp, searchName,
4798 PATH_MAX, nls_codepage, remap); 4803 PATH_MAX, nls_codepage, remap);
4799 node->path_consumed = cifs_ucs2_bytes(tmp, 4804 node->path_consumed = cifs_utf16_bytes(tmp,
4800 le16_to_cpu(pSMBr->PathConsumed), 4805 le16_to_cpu(pSMBr->PathConsumed),
4801 nls_codepage); 4806 nls_codepage);
4802 kfree(tmp); 4807 kfree(tmp);
@@ -4809,8 +4814,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4809 /* copy DfsPath */ 4814 /* copy DfsPath */
4810 temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset); 4815 temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset);
4811 max_len = data_end - temp; 4816 max_len = data_end - temp;
4812 node->path_name = cifs_strndup_from_ucs(temp, max_len, 4817 node->path_name = cifs_strndup_from_utf16(temp, max_len,
4813 is_unicode, nls_codepage); 4818 is_unicode, nls_codepage);
4814 if (!node->path_name) { 4819 if (!node->path_name) {
4815 rc = -ENOMEM; 4820 rc = -ENOMEM;
4816 goto parse_DFS_referrals_exit; 4821 goto parse_DFS_referrals_exit;
@@ -4819,8 +4824,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4819 /* copy link target UNC */ 4824 /* copy link target UNC */
4820 temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset); 4825 temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset);
4821 max_len = data_end - temp; 4826 max_len = data_end - temp;
4822 node->node_name = cifs_strndup_from_ucs(temp, max_len, 4827 node->node_name = cifs_strndup_from_utf16(temp, max_len,
4823 is_unicode, nls_codepage); 4828 is_unicode, nls_codepage);
4824 if (!node->node_name) 4829 if (!node->node_name)
4825 rc = -ENOMEM; 4830 rc = -ENOMEM;
4826 } 4831 }
@@ -4873,8 +4878,9 @@ getDFSRetry:
4873 if (ses->capabilities & CAP_UNICODE) { 4878 if (ses->capabilities & CAP_UNICODE) {
4874 pSMB->hdr.Flags2 |= SMBFLG2_UNICODE; 4879 pSMB->hdr.Flags2 |= SMBFLG2_UNICODE;
4875 name_len = 4880 name_len =
4876 cifsConvertToUCS((__le16 *) pSMB->RequestFileName, 4881 cifsConvertToUTF16((__le16 *) pSMB->RequestFileName,
4877 searchName, PATH_MAX, nls_codepage, remap); 4882 searchName, PATH_MAX, nls_codepage,
4883 remap);
4878 name_len++; /* trailing null */ 4884 name_len++; /* trailing null */
4879 name_len *= 2; 4885 name_len *= 2;
4880 } else { /* BB improve the check for buffer overruns BB */ 4886 } else { /* BB improve the check for buffer overruns BB */
@@ -5506,8 +5512,8 @@ SetEOFRetry:
5506 5512
5507 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 5513 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
5508 name_len = 5514 name_len =
5509 cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, 5515 cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
5510 PATH_MAX, nls_codepage, remap); 5516 PATH_MAX, nls_codepage, remap);
5511 name_len++; /* trailing null */ 5517 name_len++; /* trailing null */
5512 name_len *= 2; 5518 name_len *= 2;
5513 } else { /* BB improve the check for buffer overruns BB */ 5519 } else { /* BB improve the check for buffer overruns BB */
@@ -5796,8 +5802,8 @@ SetTimesRetry:
5796 5802
5797 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 5803 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
5798 name_len = 5804 name_len =
5799 cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, 5805 cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
5800 PATH_MAX, nls_codepage, remap); 5806 PATH_MAX, nls_codepage, remap);
5801 name_len++; /* trailing null */ 5807 name_len++; /* trailing null */
5802 name_len *= 2; 5808 name_len *= 2;
5803 } else { /* BB improve the check for buffer overruns BB */ 5809 } else { /* BB improve the check for buffer overruns BB */
@@ -5877,8 +5883,8 @@ SetAttrLgcyRetry:
5877 5883
5878 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 5884 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
5879 name_len = 5885 name_len =
5880 ConvertToUCS((__le16 *) pSMB->fileName, fileName, 5886 ConvertToUTF16((__le16 *) pSMB->fileName, fileName,
5881 PATH_MAX, nls_codepage); 5887 PATH_MAX, nls_codepage);
5882 name_len++; /* trailing null */ 5888 name_len++; /* trailing null */
5883 name_len *= 2; 5889 name_len *= 2;
5884 } else { /* BB improve the check for buffer overruns BB */ 5890 } else { /* BB improve the check for buffer overruns BB */
@@ -6030,8 +6036,8 @@ setPermsRetry:
6030 6036
6031 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 6037 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
6032 name_len = 6038 name_len =
6033 cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, 6039 cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
6034 PATH_MAX, nls_codepage, remap); 6040 PATH_MAX, nls_codepage, remap);
6035 name_len++; /* trailing null */ 6041 name_len++; /* trailing null */
6036 name_len *= 2; 6042 name_len *= 2;
6037 } else { /* BB improve the check for buffer overruns BB */ 6043 } else { /* BB improve the check for buffer overruns BB */
@@ -6123,8 +6129,8 @@ QAllEAsRetry:
6123 6129
6124 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 6130 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
6125 list_len = 6131 list_len =
6126 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 6132 cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
6127 PATH_MAX, nls_codepage, remap); 6133 PATH_MAX, nls_codepage, remap);
6128 list_len++; /* trailing null */ 6134 list_len++; /* trailing null */
6129 list_len *= 2; 6135 list_len *= 2;
6130 } else { /* BB improve the check for buffer overruns BB */ 6136 } else { /* BB improve the check for buffer overruns BB */
@@ -6301,8 +6307,8 @@ SetEARetry:
6301 6307
6302 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 6308 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
6303 name_len = 6309 name_len =
6304 cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, 6310 cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
6305 PATH_MAX, nls_codepage, remap); 6311 PATH_MAX, nls_codepage, remap);
6306 name_len++; /* trailing null */ 6312 name_len++; /* trailing null */
6307 name_len *= 2; 6313 name_len *= 2;
6308 } else { /* BB improve the check for buffer overruns BB */ 6314 } else { /* BB improve the check for buffer overruns BB */
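Every call site above follows the same length bookkeeping: the conversion helper returns a count of 16-bit code units, the caller adds one for the trailing null, then doubles the count to get the byte length placed in the SMB. A minimal sketch of that pattern (encode_name() is a hypothetical stand-in for cifsConvertToUTF16(), not a function from this patch):

	/* assumes: dst has room for PATH_MAX __le16 units plus a null */
	static int smb_put_unicode_path(__le16 *dst, const char *src,
					const struct nls_table *cp, int remap)
	{
		int name_len;

		name_len = encode_name(dst, src, PATH_MAX, cp, remap);
		name_len++;		/* count the trailing null too */
		name_len *= 2;		/* 16-bit units -> bytes on the wire */
		return name_len;	/* byte length stored in the SMB */
	}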
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4666780f315d..986709a8d903 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -38,6 +38,7 @@
 #include <asm/processor.h>
 #include <linux/inet.h>
 #include <linux/module.h>
+#include <keys/user-type.h>
 #include <net/ipv6.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -225,74 +226,90 @@ static int check2ndT2(struct smb_hdr *pSMB)
 
 static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 {
-	struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond;
+	struct smb_t2_rsp *pSMBs = (struct smb_t2_rsp *)psecond;
 	struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB;
-	char *data_area_of_target;
-	char *data_area_of_buf2;
+	char *data_area_of_tgt;
+	char *data_area_of_src;
 	int remaining;
-	unsigned int byte_count, total_in_buf;
-	__u16 total_data_size, total_in_buf2;
+	unsigned int byte_count, total_in_tgt;
+	__u16 tgt_total_cnt, src_total_cnt, total_in_src;
 
-	total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
+	src_total_cnt = get_unaligned_le16(&pSMBs->t2_rsp.TotalDataCount);
+	tgt_total_cnt = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
 
-	if (total_data_size !=
-	    get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount))
-		cFYI(1, "total data size of primary and secondary t2 differ");
+	if (tgt_total_cnt != src_total_cnt)
+		cFYI(1, "total data count of primary and secondary t2 differ "
+			"source=%hu target=%hu", src_total_cnt, tgt_total_cnt);
 
-	total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
+	total_in_tgt = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
 
-	remaining = total_data_size - total_in_buf;
+	remaining = tgt_total_cnt - total_in_tgt;
 
-	if (remaining < 0)
+	if (remaining < 0) {
+		cFYI(1, "Server sent too much data. tgt_total_cnt=%hu "
+			"total_in_tgt=%hu", tgt_total_cnt, total_in_tgt);
 		return -EPROTO;
+	}
 
-	if (remaining == 0) /* nothing to do, ignore */
+	if (remaining == 0) {
+		/* nothing to do, ignore */
+		cFYI(1, "no more data remains");
 		return 0;
+	}
 
-	total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount);
-	if (remaining < total_in_buf2) {
+	total_in_src = get_unaligned_le16(&pSMBs->t2_rsp.DataCount);
+	if (remaining < total_in_src)
 		cFYI(1, "transact2 2nd response contains too much data");
-	}
 
 	/* find end of first SMB data area */
-	data_area_of_target = (char *)&pSMBt->hdr.Protocol +
+	data_area_of_tgt = (char *)&pSMBt->hdr.Protocol +
 		get_unaligned_le16(&pSMBt->t2_rsp.DataOffset);
-	/* validate target area */
 
-	data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol +
-		get_unaligned_le16(&pSMB2->t2_rsp.DataOffset);
+	/* validate target area */
+	data_area_of_src = (char *)&pSMBs->hdr.Protocol +
+		get_unaligned_le16(&pSMBs->t2_rsp.DataOffset);
 
-	data_area_of_target += total_in_buf;
+	data_area_of_tgt += total_in_tgt;
 
-	/* copy second buffer into end of first buffer */
-	total_in_buf += total_in_buf2;
+	total_in_tgt += total_in_src;
 	/* is the result too big for the field? */
-	if (total_in_buf > USHRT_MAX)
+	if (total_in_tgt > USHRT_MAX) {
+		cFYI(1, "coalesced DataCount too large (%u)", total_in_tgt);
 		return -EPROTO;
-	put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
+	}
+	put_unaligned_le16(total_in_tgt, &pSMBt->t2_rsp.DataCount);
 
 	/* fix up the BCC */
 	byte_count = get_bcc(pTargetSMB);
-	byte_count += total_in_buf2;
+	byte_count += total_in_src;
 	/* is the result too big for the field? */
-	if (byte_count > USHRT_MAX)
+	if (byte_count > USHRT_MAX) {
+		cFYI(1, "coalesced BCC too large (%u)", byte_count);
 		return -EPROTO;
+	}
 	put_bcc(byte_count, pTargetSMB);
 
 	byte_count = be32_to_cpu(pTargetSMB->smb_buf_length);
-	byte_count += total_in_buf2;
+	byte_count += total_in_src;
 	/* don't allow buffer to overflow */
-	if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)
+	if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+		cFYI(1, "coalesced BCC exceeds buffer size (%u)", byte_count);
 		return -ENOBUFS;
+	}
 	pTargetSMB->smb_buf_length = cpu_to_be32(byte_count);
 
-	memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
+	/* copy second buffer into end of first buffer */
+	memcpy(data_area_of_tgt, data_area_of_src, total_in_src);
 
-	if (remaining == total_in_buf2) {
-		cFYI(1, "found the last secondary response");
-		return 0; /* we are done */
-	} else /* more responses to go */
+	if (remaining != total_in_src) {
+		/* more responses to go */
+		cFYI(1, "waiting for more secondary responses");
 		return 1;
+	}
+
+	/* we are done */
+	cFYI(1, "found the last secondary response");
+	return 0;
 }
 
 static void
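The rewritten coalesce_t2() applies the same guard before every field update: compute the would-be value in a wider type, range-check it, and only then store it back into the 16-bit wire field. A reduced sketch of that pattern (the helper and its names are illustrative, not from the patch):

	#include <limits.h>

	/* assumes: cur and add are counts read from __le16 wire fields */
	static int grow_u16_field(unsigned int cur, unsigned int add,
				  unsigned short *out)
	{
		unsigned int total = cur + add;	/* widened, cannot wrap */

		if (total > USHRT_MAX)	/* would not fit back into __le16 */
			return -EPROTO;
		*out = (unsigned short)total;
		return 0;
	}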
@@ -1578,11 +1595,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 		}
 	}
 
-	if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
-		cERROR(1, "Multiuser mounts currently require krb5 "
-			  "authentication!");
+#ifndef CONFIG_KEYS
+	/* Multiuser mounts require CONFIG_KEYS support */
+	if (vol->multiuser) {
+		cERROR(1, "Multiuser mounts require kernels with "
+			  "CONFIG_KEYS enabled.");
 		goto cifs_parse_mount_err;
 	}
+#endif
 
 	if (vol->UNCip == NULL)
 		vol->UNCip = &vol->UNC[2];
@@ -1981,10 +2001,16 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
 			return 0;
 		break;
 	default:
+		/* NULL username means anonymous session */
+		if (ses->user_name == NULL) {
+			if (!vol->nullauth)
+				return 0;
+			break;
+		}
+
 		/* anything else takes username/password */
-		if (ses->user_name == NULL)
-			return 0;
-		if (strncmp(ses->user_name, vol->username,
+		if (strncmp(ses->user_name,
+			    vol->username ? vol->username : "",
 			    MAX_USERNAME_SIZE))
 			return 0;
 		if (strlen(vol->username) != 0 &&
@@ -2039,6 +2065,132 @@ cifs_put_smb_ses(struct cifs_ses *ses)
 	cifs_put_tcp_session(server);
 }
 
+#ifdef CONFIG_KEYS
+
+/* strlen("cifs:a:") + INET6_ADDRSTRLEN + 1 */
+#define CIFSCREDS_DESC_SIZE (7 + INET6_ADDRSTRLEN + 1)
+
+/* Populate username and pw fields from keyring if possible */
+static int
+cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
+{
+	int rc = 0;
+	char *desc, *delim, *payload;
+	ssize_t len;
+	struct key *key;
+	struct TCP_Server_Info *server = ses->server;
+	struct sockaddr_in *sa;
+	struct sockaddr_in6 *sa6;
+	struct user_key_payload *upayload;
+
+	desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL);
+	if (!desc)
+		return -ENOMEM;
+
+	/* try to find an address key first */
+	switch (server->dstaddr.ss_family) {
+	case AF_INET:
+		sa = (struct sockaddr_in *)&server->dstaddr;
+		sprintf(desc, "cifs:a:%pI4", &sa->sin_addr.s_addr);
+		break;
+	case AF_INET6:
+		sa6 = (struct sockaddr_in6 *)&server->dstaddr;
+		sprintf(desc, "cifs:a:%pI6c", &sa6->sin6_addr.s6_addr);
+		break;
+	default:
+		cFYI(1, "Bad ss_family (%hu)", server->dstaddr.ss_family);
+		rc = -EINVAL;
+		goto out_err;
+	}
+
+	cFYI(1, "%s: desc=%s", __func__, desc);
+	key = request_key(&key_type_logon, desc, "");
+	if (IS_ERR(key)) {
+		if (!ses->domainName) {
+			cFYI(1, "domainName is NULL");
+			rc = PTR_ERR(key);
+			goto out_err;
+		}
+
+		/* didn't work, try to find a domain key */
+		sprintf(desc, "cifs:d:%s", ses->domainName);
+		cFYI(1, "%s: desc=%s", __func__, desc);
+		key = request_key(&key_type_logon, desc, "");
+		if (IS_ERR(key)) {
+			rc = PTR_ERR(key);
+			goto out_err;
+		}
+	}
+
+	down_read(&key->sem);
+	upayload = key->payload.data;
+	if (IS_ERR_OR_NULL(upayload)) {
+		rc = PTR_ERR(key);
+		goto out_key_put;
+	}
+
+	/* find first : in payload */
+	payload = (char *)upayload->data;
+	delim = strnchr(payload, upayload->datalen, ':');
+	cFYI(1, "payload=%s", payload);
+	if (!delim) {
+		cFYI(1, "Unable to find ':' in payload (datalen=%d)",
+				upayload->datalen);
+		rc = -EINVAL;
+		goto out_key_put;
+	}
+
+	len = delim - payload;
+	if (len > MAX_USERNAME_SIZE || len <= 0) {
+		cFYI(1, "Bad value from username search (len=%ld)", len);
+		rc = -EINVAL;
+		goto out_key_put;
+	}
+
+	vol->username = kstrndup(payload, len, GFP_KERNEL);
+	if (!vol->username) {
+		cFYI(1, "Unable to allocate %ld bytes for username", len);
+		rc = -ENOMEM;
+		goto out_key_put;
+	}
+	cFYI(1, "%s: username=%s", __func__, vol->username);
+
+	len = key->datalen - (len + 1);
+	if (len > MAX_PASSWORD_SIZE || len <= 0) {
+		cFYI(1, "Bad len for password search (len=%ld)", len);
+		rc = -EINVAL;
+		kfree(vol->username);
+		vol->username = NULL;
+		goto out_key_put;
+	}
+
+	++delim;
+	vol->password = kstrndup(delim, len, GFP_KERNEL);
+	if (!vol->password) {
+		cFYI(1, "Unable to allocate %ld bytes for password", len);
+		rc = -ENOMEM;
+		kfree(vol->username);
+		vol->username = NULL;
+		goto out_key_put;
+	}
+
+out_key_put:
+	up_read(&key->sem);
+	key_put(key);
+out_err:
+	kfree(desc);
+	cFYI(1, "%s: returning %d", __func__, rc);
+	return rc;
+}
+#else /* ! CONFIG_KEYS */
+static inline int
+cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)),
+		   struct cifs_ses *ses __attribute__((unused)))
+{
+	return -ENOSYS;
+}
+#endif /* CONFIG_KEYS */
+
 static bool warned_on_ntlm; /* globals init to false automatically */
 
 static struct cifs_ses *
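cifs_set_cifscreds() expects the logon key payload to be a single "username:password" string and splits it at the first ':'. A user-space sketch of the same split, for illustration only (plain libc, not kernel code):

	#include <stdlib.h>
	#include <string.h>

	/* assumes: payload holds "user:pass" and datalen is its length */
	static int split_creds(const char *payload, size_t datalen,
			       char **user, char **pass)
	{
		const char *delim = memchr(payload, ':', datalen);

		if (!delim || delim == payload)
			return -1;	/* no ':' found, or empty username */
		*user = strndup(payload, delim - payload);
		*pass = strndup(delim + 1, datalen - (delim - payload) - 1);
		if (!*user || !*pass) {
			free(*user);
			free(*pass);
			return -1;
		}
		return 0;
	}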
@@ -2914,18 +3066,33 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
 #define CIFS_DEFAULT_IOSIZE (1024 * 1024)
 
 /*
- * Windows only supports a max of 60k reads. Default to that when posix
- * extensions aren't in force.
+ * Windows only supports a max of 60kb reads and 65535 byte writes. Default to
+ * those values when posix extensions aren't in force. In actuality here, we
+ * use 65536 to allow for a write that is a multiple of 4k. Most servers seem
+ * to be ok with the extra byte even though Windows doesn't send writes that
+ * are that large.
+ *
+ * Citation:
+ *
+ * http://blogs.msdn.com/b/openspecification/archive/2009/04/10/smb-maximum-transmit-buffer-size-and-performance-tuning.aspx
 */
 #define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024)
+#define CIFS_DEFAULT_NON_POSIX_WSIZE (65536)
 
 static unsigned int
 cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
 {
 	__u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
 	struct TCP_Server_Info *server = tcon->ses->server;
-	unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize :
-				CIFS_DEFAULT_IOSIZE;
+	unsigned int wsize;
+
+	/* start with specified wsize, or default */
+	if (pvolume_info->wsize)
+		wsize = pvolume_info->wsize;
+	else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
+		wsize = CIFS_DEFAULT_IOSIZE;
+	else
+		wsize = CIFS_DEFAULT_NON_POSIX_WSIZE;
 
 	/* can server support 24-bit write sizes? (via UNIX extensions) */
 	if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
@@ -3136,10 +3303,9 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
 		return -EINVAL;
 
 	if (volume_info->nullauth) {
-		cFYI(1, "null user");
-		volume_info->username = kzalloc(1, GFP_KERNEL);
-		if (volume_info->username == NULL)
-			return -ENOMEM;
+		cFYI(1, "Anonymous login");
+		kfree(volume_info->username);
+		volume_info->username = NULL;
 	} else if (volume_info->username) {
 		/* BB fixme parse for domain name here */
 		cFYI(1, "Username: %s", volume_info->username);
@@ -3478,7 +3644,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
 	if (ses->capabilities & CAP_UNICODE) {
 		smb_buffer->Flags2 |= SMBFLG2_UNICODE;
 		length =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, tree,
+		    cifs_strtoUTF16((__le16 *) bcc_ptr, tree,
 			6 /* max utf8 char length in bytes */ *
 			(/* server len*/ + 256 /* share len */), nls_codepage);
 		bcc_ptr += 2 * length;	/* convert num 16 bit words to bytes */
@@ -3533,7 +3699,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
 
 		/* mostly informational -- no need to fail on error here */
 		kfree(tcon->nativeFileSystem);
-		tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr,
+		tcon->nativeFileSystem = cifs_strndup_from_utf16(bcc_ptr,
 						      bytes_left, is_unicode,
 						      nls_codepage);
 
@@ -3657,16 +3823,38 @@ int cifs_setup_session(unsigned int xid, struct cifs_ses *ses,
 	return rc;
 }
 
+static int
+cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses)
+{
+	switch (ses->server->secType) {
+	case Kerberos:
+		vol->secFlg = CIFSSEC_MUST_KRB5;
+		return 0;
+	case NTLMv2:
+		vol->secFlg = CIFSSEC_MUST_NTLMV2;
+		break;
+	case NTLM:
+		vol->secFlg = CIFSSEC_MUST_NTLM;
+		break;
+	case RawNTLMSSP:
+		vol->secFlg = CIFSSEC_MUST_NTLMSSP;
+		break;
+	case LANMAN:
+		vol->secFlg = CIFSSEC_MUST_LANMAN;
+		break;
+	}
+
+	return cifs_set_cifscreds(vol, ses);
+}
+
 static struct cifs_tcon *
 cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 {
+	int rc;
 	struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb);
 	struct cifs_ses *ses;
 	struct cifs_tcon *tcon = NULL;
 	struct smb_vol *vol_info;
-	char username[28]; /* big enough for "krb50x" + hex of ULONG_MAX 6+16 */
-			   /* We used to have this as MAX_USERNAME which is */
-			   /* way too big now (256 instead of 32) */
 
 	vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
 	if (vol_info == NULL) {
@@ -3674,8 +3862,6 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 		goto out;
 	}
 
-	snprintf(username, sizeof(username), "krb50x%x", fsuid);
-	vol_info->username = username;
 	vol_info->local_nls = cifs_sb->local_nls;
 	vol_info->linux_uid = fsuid;
 	vol_info->cred_uid = fsuid;
@@ -3685,8 +3871,11 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 	vol_info->local_lease = master_tcon->local_lease;
 	vol_info->no_linux_ext = !master_tcon->unix_ext;
 
-	/* FIXME: allow for other secFlg settings */
-	vol_info->secFlg = CIFSSEC_MUST_KRB5;
+	rc = cifs_set_vol_auth(vol_info, master_tcon->ses);
+	if (rc) {
+		tcon = ERR_PTR(rc);
+		goto out;
+	}
 
 	/* get a reference for the same TCP session */
 	spin_lock(&cifs_tcp_ses_lock);
@@ -3709,6 +3898,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 	if (ses->capabilities & CAP_UNIX)
 		reset_cifs_unix_caps(0, tcon, NULL, vol_info);
 out:
+	kfree(vol_info->username);
+	kfree(vol_info->password);
 	kfree(vol_info);
 
 	return tcon;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index a090bbe6ee29..e2bbc683e018 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -647,10 +647,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
 
 		name.name = scratch_buf;
 		name.len =
-			cifs_from_ucs2((char *)name.name, (__le16 *)de.name,
+			cifs_from_utf16((char *)name.name, (__le16 *)de.name,
 				       UNICODE_NAME_MAX,
-				       min(de.namelen, (size_t)max_len), nlt,
-				       cifs_sb->mnt_cifs_flags &
+				       min_t(size_t, de.namelen,
					     (size_t)max_len), nlt,
+				       cifs_sb->mnt_cifs_flags &
 				       CIFS_MOUNT_MAP_SPECIAL_CHR);
 		name.len -= nls_nullsize(nlt);
 	} else {
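The switch from min() to min_t() matters because the kernel's min() macro rejects mismatched argument types at compile time, and de.namelen and max_len do not share one; min_t() casts both sides to the named type first. A standalone illustration of the idea (simplified macro, not the kernel's exact definition, which also adds the type-mismatch warning):

	#include <stddef.h>

	/* simplified: the real kernel macro also enforces type matching */
	#define min_t(type, x, y) \
		((type)(x) < (type)(y) ? (type)(x) : (type)(y))

	size_t clamp_name_len(size_t namelen, int max_len)
	{
		/* compare both values as size_t, as the patched call does */
		return min_t(size_t, namelen, (size_t)max_len);
	}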
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 4ec3ee9d72cc..d85efad5765f 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -167,16 +167,16 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)
 	int bytes_ret = 0;
 
 	/* Copy OS version */
-	bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32,
+	bytes_ret = cifs_strtoUTF16((__le16 *)bcc_ptr, "Linux version ", 32,
 				    nls_cp);
 	bcc_ptr += 2 * bytes_ret;
-	bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, init_utsname()->release,
+	bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, init_utsname()->release,
 				    32, nls_cp);
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2; /* trailing null */
 
-	bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
+	bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
 				    32, nls_cp);
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2; /* trailing null */
 
@@ -197,8 +197,8 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
 		*(bcc_ptr+1) = 0;
 		bytes_ret = 0;
 	} else
-		bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->domainName,
+		bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->domainName,
 					    256, nls_cp);
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2;  /* account for null terminator */
 
@@ -226,8 +226,8 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
 		*bcc_ptr = 0;
 		*(bcc_ptr+1) = 0;
 	} else {
-		bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->user_name,
+		bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->user_name,
 					    MAX_USERNAME_SIZE, nls_cp);
 	}
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2; /* account for null termination */
@@ -287,7 +287,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
 	cFYI(1, "bleft %d", bleft);
 
 	kfree(ses->serverOS);
-	ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+	ses->serverOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
 	cFYI(1, "serverOS=%s", ses->serverOS);
 	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
 	data += len;
@@ -296,7 +296,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
 		return;
 
 	kfree(ses->serverNOS);
-	ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+	ses->serverNOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
 	cFYI(1, "serverNOS=%s", ses->serverNOS);
 	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
 	data += len;
@@ -305,7 +305,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
 		return;
 
 	kfree(ses->serverDomain);
-	ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+	ses->serverDomain = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
 	cFYI(1, "serverDomain=%s", ses->serverDomain);
 
 	return;
@@ -502,8 +502,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		tmp += 2;
 	} else {
 		int len;
-		len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
+		len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName,
 				      MAX_USERNAME_SIZE, nls_cp);
 		len *= 2; /* unicode is 2 bytes each */
 		sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
 		sec_blob->DomainName.Length = cpu_to_le16(len);
@@ -518,8 +518,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		tmp += 2;
 	} else {
 		int len;
-		len = cifs_strtoUCS((__le16 *)tmp, ses->user_name,
+		len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name,
 				      MAX_USERNAME_SIZE, nls_cp);
 		len *= 2; /* unicode is 2 bytes each */
 		sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
 		sec_blob->UserName.Length = cpu_to_le16(len);
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 80d850881938..d5cd9aa7eacc 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -213,7 +213,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16,
 
 	/* Password cannot be longer than 128 characters */
 	if (passwd) /* Password must be converted to NT unicode */
-		len = cifs_strtoUCS(wpwd, passwd, 128, codepage);
+		len = cifs_strtoUTF16(wpwd, passwd, 128, codepage);
 	else {
 		len = 0;
 		*wpwd = 0; /* Ensure string is null terminated */
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 6475877b0763..911cf30d057d 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -88,24 +88,21 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
    - link the two up if this is needed
    - fill in the attributes
 */
-int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_block *sb)
+struct inode *coda_cnode_make(struct CodaFid *fid, struct super_block *sb)
 {
 	struct coda_vattr attr;
+	struct inode *inode;
 	int error;
 
 	/* We get inode numbers from Venus -- see venus source */
 	error = venus_getattr(sb, fid, &attr);
-	if ( error ) {
-		*inode = NULL;
-		return error;
-	}
+	if (error)
+		return ERR_PTR(error);
 
-	*inode = coda_iget(sb, fid, &attr);
-	if ( IS_ERR(*inode) ) {
+	inode = coda_iget(sb, fid, &attr);
+	if (IS_ERR(inode))
 		printk("coda_cnode_make: coda_iget failed\n");
-		return PTR_ERR(*inode);
-	}
-	return 0;
+	return inode;
 }
 
 
@@ -156,19 +153,16 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb)
 }
 
 /* the CONTROL inode is made without asking attributes from Venus */
-int coda_cnode_makectl(struct inode **inode, struct super_block *sb)
+struct inode *coda_cnode_makectl(struct super_block *sb)
 {
-	int error = -ENOMEM;
-
-	*inode = new_inode(sb);
-	if (*inode) {
-		(*inode)->i_ino = CTL_INO;
-		(*inode)->i_op = &coda_ioctl_inode_operations;
-		(*inode)->i_fop = &coda_ioctl_operations;
-		(*inode)->i_mode = 0444;
-		error = 0;
+	struct inode *inode = new_inode(sb);
+	if (inode) {
+		inode->i_ino = CTL_INO;
+		inode->i_op = &coda_ioctl_inode_operations;
+		inode->i_fop = &coda_ioctl_operations;
+		inode->i_mode = 0444;
+		return inode;
 	}
-
-	return error;
+	return ERR_PTR(-ENOMEM);
 }
 
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
index e35071b1de0e..b24fdfd8a3f0 100644
--- a/fs/coda/coda_fs_i.h
+++ b/fs/coda/coda_fs_i.h
@@ -49,9 +49,9 @@ struct coda_file_info {
 #define C_DYING	    0x4 /* from venus (which died) */
 #define C_PURGE	    0x8
 
-int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *);
+struct inode *coda_cnode_make(struct CodaFid *, struct super_block *);
 struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr);
-int coda_cnode_makectl(struct inode **inode, struct super_block *sb);
+struct inode *coda_cnode_makectl(struct super_block *sb);
 struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb);
 void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *);
 
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 83d2fd8ec24b..177515829062 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -96,12 +96,11 @@ const struct file_operations coda_dir_operations = {
 /* access routines: lookup, readlink, permission */
 static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struct nameidata *nd)
 {
-	struct inode *inode = NULL;
-	struct CodaFid resfid = { { 0, } };
-	int type = 0;
-	int error = 0;
+	struct super_block *sb = dir->i_sb;
 	const char *name = entry->d_name.name;
 	size_t length = entry->d_name.len;
+	struct inode *inode;
+	int type = 0;
 
 	if (length > CODA_MAXNAMLEN) {
 		printk(KERN_ERR "name too long: lookup, %s (%*s)\n",
@@ -111,23 +110,21 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
 
 	/* control object, create inode on the fly */
 	if (coda_isroot(dir) && coda_iscontrol(name, length)) {
-		error = coda_cnode_makectl(&inode, dir->i_sb);
+		inode = coda_cnode_makectl(sb);
 		type = CODA_NOCACHE;
-		goto exit;
+	} else {
+		struct CodaFid fid = { { 0, } };
+		int error = venus_lookup(sb, coda_i2f(dir), name, length,
+				     &type, &fid);
+		inode = !error ? coda_cnode_make(&fid, sb) : ERR_PTR(error);
 	}
 
-	error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length,
-			     &type, &resfid);
-	if (!error)
-		error = coda_cnode_make(&inode, &resfid, dir->i_sb);
-
-	if (error && error != -ENOENT)
-		return ERR_PTR(error);
-
-exit:
-	if (inode && (type & CODA_NOCACHE))
+	if (!IS_ERR(inode) && (type & CODA_NOCACHE))
 		coda_flag_inode(inode, C_VATTR | C_PURGE);
 
+	if (inode == ERR_PTR(-ENOENT))
+		inode = NULL;
+
 	return d_splice_alias(inode, entry);
 }
 
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 1c08a8cd673a..5e2e1b3f068d 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -204,10 +204,12 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	printk("coda_read_super: rootfid is %s\n", coda_f2s(&fid));
 
 	/* make root inode */
-	error = coda_cnode_make(&root, &fid, sb);
-	if ( error || !root ) {
-		printk("Failure of coda_cnode_make for root: error %d\n", error);
-		goto error;
+	root = coda_cnode_make(&fid, sb);
+	if (IS_ERR(root)) {
+		error = PTR_ERR(root);
+		printk("Failure of coda_cnode_make for root: error %d\n", error);
+		root = NULL;
+		goto error;
 	}
 
 	printk("coda_read_super: rootinode is %ld dev %s\n",
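The coda changes above all follow one conversion: instead of returning an int and filling a struct inode ** out-parameter, the allocation functions now return the inode directly and encode failures in the pointer with ERR_PTR(). A caller then looks like this sketch (error handling trimmed relative to the real coda_fill_super()):

	struct inode *root;
	int error;

	root = coda_cnode_make(&fid, sb);	/* inode or ERR_PTR(-errno) */
	if (IS_ERR(root)) {
		error = PTR_ERR(root);		/* recover the errno */
		root = NULL;			/* never iput() an error pointer */
		goto error;
	}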
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a10e428b32b4..a26bea10e81b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -105,6 +105,7 @@
 
 #include <linux/hiddev.h>
 
+#define __DVB_CORE__
 #include <linux/dvb/audio.h>
 #include <linux/dvb/dmx.h>
 #include <linux/dvb/frontend.h>
diff --git a/fs/dcache.c b/fs/dcache.c
index 9791b1e7eee4..16a53cc2cc02 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -243,6 +243,7 @@ static void dentry_lru_add(struct dentry *dentry)
 static void __dentry_lru_del(struct dentry *dentry)
 {
 	list_del_init(&dentry->d_lru);
+	dentry->d_flags &= ~DCACHE_SHRINK_LIST;
 	dentry->d_sb->s_nr_dentry_unused--;
 	dentry_stat.nr_unused--;
 }
@@ -276,15 +277,15 @@ static void dentry_lru_prune(struct dentry *dentry)
 	}
 }
 
-static void dentry_lru_move_tail(struct dentry *dentry)
+static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
 {
 	spin_lock(&dcache_lru_lock);
 	if (list_empty(&dentry->d_lru)) {
-		list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+		list_add_tail(&dentry->d_lru, list);
 		dentry->d_sb->s_nr_dentry_unused++;
 		dentry_stat.nr_unused++;
 	} else {
-		list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+		list_move_tail(&dentry->d_lru, list);
 	}
 	spin_unlock(&dcache_lru_lock);
 }
@@ -770,14 +771,18 @@ static void shrink_dentry_list(struct list_head *list)
 }
 
 /**
- * __shrink_dcache_sb - shrink the dentry LRU on a given superblock
- * @sb: superblock to shrink dentry LRU.
- * @count: number of entries to prune
- * @flags: flags to control the dentry processing
+ * prune_dcache_sb - shrink the dcache
+ * @sb: superblock
+ * @count: number of entries to try to free
+ *
+ * Attempt to shrink the superblock dcache LRU by @count entries. This is
+ * done when we need more memory and called from the superblock shrinker
+ * function.
 *
- * If flags contains DCACHE_REFERENCED reference dentries will not be pruned.
+ * This function may fail to free any resources if all the dentries are in
+ * use.
 */
-static void __shrink_dcache_sb(struct super_block *sb, int count, int flags)
+void prune_dcache_sb(struct super_block *sb, int count)
 {
 	struct dentry *dentry;
 	LIST_HEAD(referenced);
@@ -796,18 +801,13 @@ relock:
 			goto relock;
 		}
 
-		/*
-		 * If we are honouring the DCACHE_REFERENCED flag and the
-		 * dentry has this flag set, don't free it. Clear the flag
-		 * and put it back on the LRU.
-		 */
-		if (flags & DCACHE_REFERENCED &&
-		    dentry->d_flags & DCACHE_REFERENCED) {
+		if (dentry->d_flags & DCACHE_REFERENCED) {
 			dentry->d_flags &= ~DCACHE_REFERENCED;
 			list_move(&dentry->d_lru, &referenced);
 			spin_unlock(&dentry->d_lock);
 		} else {
 			list_move_tail(&dentry->d_lru, &tmp);
+			dentry->d_flags |= DCACHE_SHRINK_LIST;
 			spin_unlock(&dentry->d_lock);
 			if (!--count)
 				break;
@@ -822,23 +822,6 @@ relock:
 }
 
 /**
- * prune_dcache_sb - shrink the dcache
- * @sb: superblock
- * @nr_to_scan: number of entries to try to free
- *
- * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
- * done when we need more memory an called from the superblock shrinker
- * function.
- *
- * This function may fail to free any resources if all the dentries are in
- * use.
- */
-void prune_dcache_sb(struct super_block *sb, int nr_to_scan)
-{
-	__shrink_dcache_sb(sb, nr_to_scan, DCACHE_REFERENCED);
-}
-
-/**
 * shrink_dcache_sb - shrink dcache for a superblock
 * @sb: superblock
 *
@@ -1092,7 +1075,7 @@ EXPORT_SYMBOL(have_submounts);
 * drop the lock and return early due to latency
 * constraints.
 */
-static int select_parent(struct dentry * parent)
+static int select_parent(struct dentry *parent, struct list_head *dispose)
 {
 	struct dentry *this_parent;
 	struct list_head *next;
@@ -1114,17 +1097,21 @@ resume:
 
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 
 		/*
-		 * move only zero ref count dentries to the end
-		 * of the unused list for prune_dcache
+		 * move only zero ref count dentries to the dispose list.
+		 *
+		 * Those which are presently on the shrink list, being processed
+		 * by shrink_dentry_list(), shouldn't be moved. Otherwise the
+		 * loop in shrink_dcache_parent() might not make any progress
+		 * and loop forever.
 		 */
-		if (!dentry->d_count) {
-			dentry_lru_move_tail(dentry);
-			found++;
-		} else {
+		if (dentry->d_count) {
 			dentry_lru_del(dentry);
+		} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
+			dentry_lru_move_list(dentry, dispose);
+			dentry->d_flags |= DCACHE_SHRINK_LIST;
+			found++;
 		}
-
 		/*
 		 * We can return to the caller if we have found some (this
 		 * ensures forward progress). We'll be coming back to find
@@ -1181,14 +1168,13 @@ rename_retry:
 *
 * Prune the dcache to remove unused children of the parent dentry.
 */
-
 void shrink_dcache_parent(struct dentry * parent)
 {
-	struct super_block *sb = parent->d_sb;
+	LIST_HEAD(dispose);
 	int found;
 
-	while ((found = select_parent(parent)) != 0)
-		__shrink_dcache_sb(sb, found, 0);
+	while ((found = select_parent(parent, &dispose)) != 0)
+		shrink_dentry_list(&dispose);
 }
 EXPORT_SYMBOL(shrink_dcache_parent);
 
@@ -1461,6 +1447,23 @@ struct dentry * d_alloc_root(struct inode * root_inode)
 }
 EXPORT_SYMBOL(d_alloc_root);
 
+struct dentry *d_make_root(struct inode *root_inode)
+{
+	struct dentry *res = NULL;
+
+	if (root_inode) {
+		static const struct qstr name = { .name = "/", .len = 1 };
+
+		res = __d_alloc(root_inode->i_sb, &name);
+		if (res)
+			d_instantiate(res, root_inode);
+		else
+			iput(root_inode);
+	}
+	return res;
+}
+EXPORT_SYMBOL(d_make_root);
+
 static struct dentry * __d_find_any_alias(struct inode *inode)
 {
 	struct dentry *alias;
@@ -1472,7 +1475,14 @@ static struct dentry * __d_find_any_alias(struct inode *inode)
 		return alias;
 }
 
-static struct dentry * d_find_any_alias(struct inode *inode)
+/**
+ * d_find_any_alias - find any alias for a given inode
+ * @inode: inode to find an alias for
+ *
+ * If any aliases exist for the given inode, take and return a
+ * reference for one of them. If no aliases exist, return %NULL.
+ */
+struct dentry *d_find_any_alias(struct inode *inode)
 {
 	struct dentry *de;
 
@@ -1481,7 +1491,7 @@ static struct dentry * d_find_any_alias(struct inode *inode)
 	spin_unlock(&inode->i_lock);
 	return de;
 }
-
+EXPORT_SYMBOL(d_find_any_alias);
 
 /**
 * d_obtain_alias - find or allocate a dentry for a given inode
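Note what the new d_make_root() does for its callers: it consumes the inode reference even when dentry allocation fails (the iput() in the else branch), so a filesystem's fill_super no longer needs a separate error path for the root inode. A sketch of the intended calling pattern (foo_fill_super and foo_make_root_inode are hypothetical, not from this patch):

	static int foo_fill_super(struct super_block *sb, void *data, int silent)
	{
		struct inode *root = foo_make_root_inode(sb);	/* hypothetical */

		sb->s_root = d_make_root(root);	/* NULL-safe; consumes root */
		if (!sb->s_root)
			return -ENOMEM;		/* no iput() needed here */
		return 0;
	}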
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index f65d4455c5e5..ef023eef0464 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -540,7 +540,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_blob);
540 * debugfs_print_regs32 - use seq_print to describe a set of registers 540 * debugfs_print_regs32 - use seq_print to describe a set of registers
541 * @s: the seq_file structure being used to generate output 541 * @s: the seq_file structure being used to generate output
542 * @regs: an array of struct debugfs_reg32 structures 542 * @regs: an array of struct debugfs_reg32 structures
543 * @mregs: the length of the above array 543 * @nregs: the length of the above array
544 * @base: the base address to be used in reading the registers 544 * @base: the base address to be used in reading the registers
545 * @prefix: a string to be prefixed to every output line 545 * @prefix: a string to be prefixed to every output line
546 * 546 *
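
A hedged usage sketch for debugfs_print_regs32(), built only from the parameter list documented above; the device, register names, and offsets are hypothetical:

        #include <linux/debugfs.h>
        #include <linux/kernel.h>
        #include <linux/seq_file.h>

        static struct debugfs_reg32 mydev_regs[] = {
                { .name = "ctrl",   .offset = 0x00 },
                { .name = "status", .offset = 0x04 },
        };

        static int mydev_regs_show(struct seq_file *s, void *unused)
        {
                void __iomem *base = s->private;        /* stashed at file creation */

                debugfs_print_regs32(s, mydev_regs, ARRAY_SIZE(mydev_regs),
                                     base, "mydev: ");
                return 0;
        }
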
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 79673eb71151..c4e2a58a2e82 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -301,7 +301,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
301 301
302 inode = new_inode(s); 302 inode = new_inode(s);
303 if (!inode) 303 if (!inode)
304 goto free_fsi; 304 goto fail;
305 inode->i_ino = 1; 305 inode->i_ino = 1;
306 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 306 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
307 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; 307 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
@@ -316,8 +316,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
316 printk(KERN_ERR "devpts: get root dentry failed\n"); 316 printk(KERN_ERR "devpts: get root dentry failed\n");
317 iput(inode); 317 iput(inode);
318 318
319free_fsi:
320 kfree(s->s_fs_info);
321fail: 319fail:
322 return -ENOMEM; 320 return -ENOMEM;
323} 321}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d740ab67ff6e..4a588dbd11bf 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -36,6 +36,7 @@
36#include <linux/rwsem.h> 36#include <linux/rwsem.h>
37#include <linux/uio.h> 37#include <linux/uio.h>
38#include <linux/atomic.h> 38#include <linux/atomic.h>
39#include <linux/prefetch.h>
39 40
40/* 41/*
41 * How many user pages to map in one call to get_user_pages(). This determines 42 * How many user pages to map in one call to get_user_pages(). This determines
@@ -580,9 +581,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
580{ 581{
581 int ret; 582 int ret;
582 sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ 583 sector_t fs_startblk; /* Into file, in filesystem-sized blocks */
584 sector_t fs_endblk; /* Into file, in filesystem-sized blocks */
583 unsigned long fs_count; /* Number of filesystem-sized blocks */ 585 unsigned long fs_count; /* Number of filesystem-sized blocks */
584 unsigned long dio_count;/* Number of dio_block-sized blocks */
585 unsigned long blkmask;
586 int create; 586 int create;
587 587
588 /* 588 /*
@@ -593,11 +593,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
593 if (ret == 0) { 593 if (ret == 0) {
594 BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); 594 BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
595 fs_startblk = sdio->block_in_file >> sdio->blkfactor; 595 fs_startblk = sdio->block_in_file >> sdio->blkfactor;
596 dio_count = sdio->final_block_in_request - sdio->block_in_file; 596 fs_endblk = (sdio->final_block_in_request - 1) >>
597 fs_count = dio_count >> sdio->blkfactor; 597 sdio->blkfactor;
598 blkmask = (1 << sdio->blkfactor) - 1; 598 fs_count = fs_endblk - fs_startblk + 1;
599 if (dio_count & blkmask)
600 fs_count++;
601 599
602 map_bh->b_state = 0; 600 map_bh->b_state = 0;
603 map_bh->b_size = fs_count << dio->inode->i_blkbits; 601 map_bh->b_size = fs_count << dio->inode->i_blkbits;
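
The rewritten fs_count computation above derives the count from the first and last filesystem block indices instead of rounding up a length; when the request starts partway into an fs block, the old length-based round-up can come up one block short. A standalone arithmetic check with illustrative values:

        #include <stdio.h>

        int main(void)
        {
                unsigned blkfactor = 3;                 /* 8 dio blocks per fs block */
                unsigned long block_in_file = 5;        /* starts inside fs block 0 */
                unsigned long final_block_in_request = 17;      /* dio blocks 5..16 */

                /* new form: fs blocks actually spanned (0, 1 and 2) */
                unsigned long fs_startblk = block_in_file >> blkfactor;
                unsigned long fs_endblk = (final_block_in_request - 1) >> blkfactor;
                unsigned long fs_count = fs_endblk - fs_startblk + 1;

                /* old form: length-based round-up, only 2 here */
                unsigned long dio_count = final_block_in_request - block_in_file;
                unsigned long old_count = (dio_count >> blkfactor) +
                                          !!(dio_count & ((1 << blkfactor) - 1));

                printf("new %lu old %lu\n", fs_count, old_count); /* new 3 old 2 */
                return 0;
        }
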
@@ -1090,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio)
1090 * individual fields and will generate much worse code. This is important 1088 * individual fields and will generate much worse code. This is important
1091 * for the whole file. 1089 * for the whole file.
1092 */ 1090 */
1093ssize_t 1091static inline ssize_t
1094__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1092do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1095 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1093 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1096 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1094 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1097 dio_submit_t submit_io, int flags) 1095 dio_submit_t submit_io, int flags)
@@ -1100,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1100 size_t size; 1098 size_t size;
1101 unsigned long addr; 1099 unsigned long addr;
1102 unsigned blkbits = inode->i_blkbits; 1100 unsigned blkbits = inode->i_blkbits;
1103 unsigned bdev_blkbits = 0;
1104 unsigned blocksize_mask = (1 << blkbits) - 1; 1101 unsigned blocksize_mask = (1 << blkbits) - 1;
1105 ssize_t retval = -EINVAL; 1102 ssize_t retval = -EINVAL;
1106 loff_t end = offset; 1103 loff_t end = offset;
@@ -1113,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1113 if (rw & WRITE) 1110 if (rw & WRITE)
1114 rw = WRITE_ODIRECT; 1111 rw = WRITE_ODIRECT;
1115 1112
1116 if (bdev) 1113 /*
1117 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); 1114 * Avoid references to bdev if not absolutely needed to give
1115 * the early prefetch in the caller enough time.
1116 */
1118 1117
1119 if (offset & blocksize_mask) { 1118 if (offset & blocksize_mask) {
1120 if (bdev) 1119 if (bdev)
1121 blkbits = bdev_blkbits; 1120 blkbits = blksize_bits(bdev_logical_block_size(bdev));
1122 blocksize_mask = (1 << blkbits) - 1; 1121 blocksize_mask = (1 << blkbits) - 1;
1123 if (offset & blocksize_mask) 1122 if (offset & blocksize_mask)
1124 goto out; 1123 goto out;
@@ -1129,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1129 addr = (unsigned long)iov[seg].iov_base; 1128 addr = (unsigned long)iov[seg].iov_base;
1130 size = iov[seg].iov_len; 1129 size = iov[seg].iov_len;
1131 end += size; 1130 end += size;
1132 if ((addr & blocksize_mask) || (size & blocksize_mask)) { 1131 if (unlikely((addr & blocksize_mask) ||
1132 (size & blocksize_mask))) {
1133 if (bdev) 1133 if (bdev)
1134 blkbits = bdev_blkbits; 1134 blkbits = blksize_bits(
1135 bdev_logical_block_size(bdev));
1135 blocksize_mask = (1 << blkbits) - 1; 1136 blocksize_mask = (1 << blkbits) - 1;
1136 if ((addr & blocksize_mask) || (size & blocksize_mask)) 1137 if ((addr & blocksize_mask) || (size & blocksize_mask))
1137 goto out; 1138 goto out;
1138 } 1139 }
1139 } 1140 }
@@ -1316,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1316out: 1317out:
1317 return retval; 1318 return retval;
1318} 1319}
1320
1321ssize_t
1322__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1323 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1324 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1325 dio_submit_t submit_io, int flags)
1326{
1327 /*
1328 * The block device state is needed in the end to finally
1329 * submit everything. Since it's likely to be cache cold
1330 * prefetch it here as first thing to hide some of the
1331 * latency.
1332 *
1333 * Attempt to prefetch the pieces we likely need later.
1334 */
1335 prefetch(&bdev->bd_disk->part_tbl);
1336 prefetch(bdev->bd_queue);
1337 prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
1338
1339 return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
1340 nr_segs, get_block, end_io,
1341 submit_io, flags);
1342}
1343
1319EXPORT_SYMBOL(__blockdev_direct_IO); 1344EXPORT_SYMBOL(__blockdev_direct_IO);
1320 1345
1321static __init int dio_init(void) 1346static __init int dio_init(void)
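
The split above keeps the large worker static inline (do_blockdev_direct_IO) and exports a thin wrapper whose only added job is to prefetch cache-cold block-device state before the long setup path needs it. A generic sketch of the pattern, with hypothetical names:

        #include <linux/cache.h>
        #include <linux/prefetch.h>

        struct device_state {
                char payload[4 * SMP_CACHE_BYTES];
        };

        static inline int do_heavy_io(struct device_state *st)
        {
                /* ... long body that the compiler inlines into the wrapper ... */
                return 0;
        }

        int heavy_io(struct device_state *st)
        {
                /* issue the loads early; they resolve while setup code runs */
                prefetch(st);
                prefetch((char *)st + SMP_CACHE_BYTES);
                return do_heavy_io(st);
        }
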
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 6cf72fcc0d0c..e7e327d43fa5 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -17,6 +17,7 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/in.h> 18#include <linux/in.h>
19#include <linux/in6.h> 19#include <linux/in6.h>
20#include <linux/dlmconstants.h>
20#include <net/ipv6.h> 21#include <net/ipv6.h>
21#include <net/sock.h> 22#include <net/sock.h>
22 23
@@ -36,6 +37,7 @@
36static struct config_group *space_list; 37static struct config_group *space_list;
37static struct config_group *comm_list; 38static struct config_group *comm_list;
38static struct dlm_comm *local_comm; 39static struct dlm_comm *local_comm;
40static uint32_t dlm_comm_count;
39 41
40struct dlm_clusters; 42struct dlm_clusters;
41struct dlm_cluster; 43struct dlm_cluster;
@@ -103,6 +105,8 @@ struct dlm_cluster {
103 unsigned int cl_timewarn_cs; 105 unsigned int cl_timewarn_cs;
104 unsigned int cl_waitwarn_us; 106 unsigned int cl_waitwarn_us;
105 unsigned int cl_new_rsb_count; 107 unsigned int cl_new_rsb_count;
108 unsigned int cl_recover_callbacks;
109 char cl_cluster_name[DLM_LOCKSPACE_LEN];
106}; 110};
107 111
108enum { 112enum {
@@ -118,6 +122,8 @@ enum {
118 CLUSTER_ATTR_TIMEWARN_CS, 122 CLUSTER_ATTR_TIMEWARN_CS,
119 CLUSTER_ATTR_WAITWARN_US, 123 CLUSTER_ATTR_WAITWARN_US,
120 CLUSTER_ATTR_NEW_RSB_COUNT, 124 CLUSTER_ATTR_NEW_RSB_COUNT,
125 CLUSTER_ATTR_RECOVER_CALLBACKS,
126 CLUSTER_ATTR_CLUSTER_NAME,
121}; 127};
122 128
123struct cluster_attribute { 129struct cluster_attribute {
@@ -126,6 +132,27 @@ struct cluster_attribute {
126 ssize_t (*store)(struct dlm_cluster *, const char *, size_t); 132 ssize_t (*store)(struct dlm_cluster *, const char *, size_t);
127}; 133};
128 134
135static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
136{
137 return sprintf(buf, "%s\n", cl->cl_cluster_name);
138}
139
140static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
141 const char *buf, size_t len)
142{
143 strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN);
144 strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN);
145 return len;
146}
147
148static struct cluster_attribute cluster_attr_cluster_name = {
149 .attr = { .ca_owner = THIS_MODULE,
150 .ca_name = "cluster_name",
151 .ca_mode = S_IRUGO | S_IWUSR },
152 .show = cluster_cluster_name_read,
153 .store = cluster_cluster_name_write,
154};
155
129static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, 156static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
130 int *info_field, int check_zero, 157 int *info_field, int check_zero,
131 const char *buf, size_t len) 158 const char *buf, size_t len)
@@ -171,6 +198,7 @@ CLUSTER_ATTR(protocol, 0);
171CLUSTER_ATTR(timewarn_cs, 1); 198CLUSTER_ATTR(timewarn_cs, 1);
172CLUSTER_ATTR(waitwarn_us, 0); 199CLUSTER_ATTR(waitwarn_us, 0);
173CLUSTER_ATTR(new_rsb_count, 0); 200CLUSTER_ATTR(new_rsb_count, 0);
201CLUSTER_ATTR(recover_callbacks, 0);
174 202
175static struct configfs_attribute *cluster_attrs[] = { 203static struct configfs_attribute *cluster_attrs[] = {
176 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, 204 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
@@ -185,6 +213,8 @@ static struct configfs_attribute *cluster_attrs[] = {
185 [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, 213 [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
186 [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr, 214 [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr,
187 [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr, 215 [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr,
216 [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks.attr,
217 [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name.attr,
188 NULL, 218 NULL,
189}; 219};
190 220
@@ -293,6 +323,7 @@ struct dlm_comms {
293 323
294struct dlm_comm { 324struct dlm_comm {
295 struct config_item item; 325 struct config_item item;
326 int seq;
296 int nodeid; 327 int nodeid;
297 int local; 328 int local;
298 int addr_count; 329 int addr_count;
@@ -309,6 +340,7 @@ struct dlm_node {
309 int nodeid; 340 int nodeid;
310 int weight; 341 int weight;
311 int new; 342 int new;
343 int comm_seq; /* copy of cm->seq when nd->nodeid is set */
312}; 344};
313 345
314static struct configfs_group_operations clusters_ops = { 346static struct configfs_group_operations clusters_ops = {
@@ -455,6 +487,9 @@ static struct config_group *make_cluster(struct config_group *g,
455 cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; 487 cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
456 cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; 488 cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
457 cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; 489 cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count;
490 cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks;
491 memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name,
492 DLM_LOCKSPACE_LEN);
458 493
459 space_list = &sps->ss_group; 494 space_list = &sps->ss_group;
460 comm_list = &cms->cs_group; 495 comm_list = &cms->cs_group;
@@ -558,6 +593,11 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
558 return ERR_PTR(-ENOMEM); 593 return ERR_PTR(-ENOMEM);
559 594
560 config_item_init_type_name(&cm->item, name, &comm_type); 595 config_item_init_type_name(&cm->item, name, &comm_type);
596
597 cm->seq = dlm_comm_count++;
598 if (!cm->seq)
599 cm->seq = dlm_comm_count++;
600
561 cm->nodeid = -1; 601 cm->nodeid = -1;
562 cm->local = 0; 602 cm->local = 0;
563 cm->addr_count = 0; 603 cm->addr_count = 0;
@@ -801,7 +841,10 @@ static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf)
801static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, 841static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
802 size_t len) 842 size_t len)
803{ 843{
844 uint32_t seq = 0;
804 nd->nodeid = simple_strtol(buf, NULL, 0); 845 nd->nodeid = simple_strtol(buf, NULL, 0);
846 dlm_comm_seq(nd->nodeid, &seq);
847 nd->comm_seq = seq;
805 return len; 848 return len;
806} 849}
807 850
@@ -908,13 +951,13 @@ static void put_comm(struct dlm_comm *cm)
908} 951}
909 952
910/* caller must free mem */ 953/* caller must free mem */
911int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, 954int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
912 int **new_out, int *new_count_out) 955 int *count_out)
913{ 956{
914 struct dlm_space *sp; 957 struct dlm_space *sp;
915 struct dlm_node *nd; 958 struct dlm_node *nd;
916 int i = 0, rv = 0, ids_count = 0, new_count = 0; 959 struct dlm_config_node *nodes, *node;
917 int *ids, *new; 960 int rv, count;
918 961
919 sp = get_space(lsname); 962 sp = get_space(lsname);
920 if (!sp) 963 if (!sp)
@@ -927,73 +970,42 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
927 goto out; 970 goto out;
928 } 971 }
929 972
930 ids_count = sp->members_count; 973 count = sp->members_count;
931 974
932 ids = kcalloc(ids_count, sizeof(int), GFP_NOFS); 975 nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS);
933 if (!ids) { 976 if (!nodes) {
934 rv = -ENOMEM; 977 rv = -ENOMEM;
935 goto out; 978 goto out;
936 } 979 }
937 980
981 node = nodes;
938 list_for_each_entry(nd, &sp->members, list) { 982 list_for_each_entry(nd, &sp->members, list) {
939 ids[i++] = nd->nodeid; 983 node->nodeid = nd->nodeid;
940 if (nd->new) 984 node->weight = nd->weight;
941 new_count++; 985 node->new = nd->new;
942 } 986 node->comm_seq = nd->comm_seq;
943 987 node++;
944 if (ids_count != i)
945 printk(KERN_ERR "dlm: bad nodeid count %d %d\n", ids_count, i);
946
947 if (!new_count)
948 goto out_ids;
949 988
950 new = kcalloc(new_count, sizeof(int), GFP_NOFS); 989 nd->new = 0;
951 if (!new) {
952 kfree(ids);
953 rv = -ENOMEM;
954 goto out;
955 } 990 }
956 991
957 i = 0; 992 *count_out = count;
958 list_for_each_entry(nd, &sp->members, list) { 993 *nodes_out = nodes;
959 if (nd->new) { 994 rv = 0;
960 new[i++] = nd->nodeid;
961 nd->new = 0;
962 }
963 }
964 *new_count_out = new_count;
965 *new_out = new;
966
967 out_ids:
968 *ids_count_out = ids_count;
969 *ids_out = ids;
970 out: 995 out:
971 mutex_unlock(&sp->members_lock); 996 mutex_unlock(&sp->members_lock);
972 put_space(sp); 997 put_space(sp);
973 return rv; 998 return rv;
974} 999}
975 1000
976int dlm_node_weight(char *lsname, int nodeid) 1001int dlm_comm_seq(int nodeid, uint32_t *seq)
977{ 1002{
978 struct dlm_space *sp; 1003 struct dlm_comm *cm = get_comm(nodeid, NULL);
979 struct dlm_node *nd; 1004 if (!cm)
980 int w = -EEXIST; 1005 return -EEXIST;
981 1006 *seq = cm->seq;
982 sp = get_space(lsname); 1007 put_comm(cm);
983 if (!sp) 1008 return 0;
984 goto out;
985
986 mutex_lock(&sp->members_lock);
987 list_for_each_entry(nd, &sp->members, list) {
988 if (nd->nodeid != nodeid)
989 continue;
990 w = nd->weight;
991 break;
992 }
993 mutex_unlock(&sp->members_lock);
994 put_space(sp);
995 out:
996 return w;
997} 1009}
998 1010
999int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr) 1011int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
@@ -1047,6 +1059,8 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
1047#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ 1059#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
1048#define DEFAULT_WAITWARN_US 0 1060#define DEFAULT_WAITWARN_US 0
1049#define DEFAULT_NEW_RSB_COUNT 128 1061#define DEFAULT_NEW_RSB_COUNT 128
1062#define DEFAULT_RECOVER_CALLBACKS 0
1063#define DEFAULT_CLUSTER_NAME ""
1050 1064
1051struct dlm_config_info dlm_config = { 1065struct dlm_config_info dlm_config = {
1052 .ci_tcp_port = DEFAULT_TCP_PORT, 1066 .ci_tcp_port = DEFAULT_TCP_PORT,
@@ -1060,6 +1074,8 @@ struct dlm_config_info dlm_config = {
1060 .ci_protocol = DEFAULT_PROTOCOL, 1074 .ci_protocol = DEFAULT_PROTOCOL,
1061 .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, 1075 .ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
1062 .ci_waitwarn_us = DEFAULT_WAITWARN_US, 1076 .ci_waitwarn_us = DEFAULT_WAITWARN_US,
1063 .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT 1077 .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT,
1078 .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS,
1079 .ci_cluster_name = DEFAULT_CLUSTER_NAME
1064}; 1080};
1065 1081
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 3099d0dd26c0..9f5e3663bb0c 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -14,6 +14,13 @@
14#ifndef __CONFIG_DOT_H__ 14#ifndef __CONFIG_DOT_H__
15#define __CONFIG_DOT_H__ 15#define __CONFIG_DOT_H__
16 16
17struct dlm_config_node {
18 int nodeid;
19 int weight;
20 int new;
21 uint32_t comm_seq;
22};
23
17#define DLM_MAX_ADDR_COUNT 3 24#define DLM_MAX_ADDR_COUNT 3
18 25
19struct dlm_config_info { 26struct dlm_config_info {
@@ -29,15 +36,17 @@ struct dlm_config_info {
29 int ci_timewarn_cs; 36 int ci_timewarn_cs;
30 int ci_waitwarn_us; 37 int ci_waitwarn_us;
31 int ci_new_rsb_count; 38 int ci_new_rsb_count;
39 int ci_recover_callbacks;
40 char ci_cluster_name[DLM_LOCKSPACE_LEN];
32}; 41};
33 42
34extern struct dlm_config_info dlm_config; 43extern struct dlm_config_info dlm_config;
35 44
36int dlm_config_init(void); 45int dlm_config_init(void);
37void dlm_config_exit(void); 46void dlm_config_exit(void);
38int dlm_node_weight(char *lsname, int nodeid); 47int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
39int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, 48 int *count_out);
40 int **new_out, int *new_count_out); 49int dlm_comm_seq(int nodeid, uint32_t *seq);
41int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr); 50int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
42int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid); 51int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
43int dlm_our_nodeid(void); 52int dlm_our_nodeid(void);
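
A hedged caller sketch for the dlm_config_nodes() API declared above, which replaces dlm_nodeid_list() and dlm_node_weight(); per the "caller must free mem" comment in config.c, the returned array belongs to the caller. Assumes the usual fs/dlm context (dlm_internal.h, config.h):

        static int log_current_members(struct dlm_ls *ls)
        {
                struct dlm_config_node *nodes;
                int count, i, error;

                error = dlm_config_nodes(ls->ls_name, &nodes, &count);
                if (error)
                        return error;

                for (i = 0; i < count; i++)
                        log_debug(ls, "member %d weight %d new %d seq %u",
                                  nodes[i].nodeid, nodes[i].weight,
                                  nodes[i].new, nodes[i].comm_seq);

                kfree(nodes);
                return 0;
        }
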
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 59779237e2b4..3dca2b39e83f 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -393,6 +393,7 @@ static const struct seq_operations format3_seq_ops;
393 393
394static void *table_seq_start(struct seq_file *seq, loff_t *pos) 394static void *table_seq_start(struct seq_file *seq, loff_t *pos)
395{ 395{
396 struct rb_node *node;
396 struct dlm_ls *ls = seq->private; 397 struct dlm_ls *ls = seq->private;
397 struct rsbtbl_iter *ri; 398 struct rsbtbl_iter *ri;
398 struct dlm_rsb *r; 399 struct dlm_rsb *r;
@@ -418,9 +419,10 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
418 ri->format = 3; 419 ri->format = 3;
419 420
420 spin_lock(&ls->ls_rsbtbl[bucket].lock); 421 spin_lock(&ls->ls_rsbtbl[bucket].lock);
421 if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { 422 if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) {
422 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, 423 for (node = rb_first(&ls->ls_rsbtbl[bucket].keep); node;
423 res_hashchain) { 424 node = rb_next(node)) {
425 r = rb_entry(node, struct dlm_rsb, res_hashnode);
424 if (!entry--) { 426 if (!entry--) {
425 dlm_hold_rsb(r); 427 dlm_hold_rsb(r);
426 ri->rsb = r; 428 ri->rsb = r;
@@ -449,9 +451,9 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
449 } 451 }
450 452
451 spin_lock(&ls->ls_rsbtbl[bucket].lock); 453 spin_lock(&ls->ls_rsbtbl[bucket].lock);
452 if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { 454 if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) {
453 r = list_first_entry(&ls->ls_rsbtbl[bucket].list, 455 node = rb_first(&ls->ls_rsbtbl[bucket].keep);
454 struct dlm_rsb, res_hashchain); 456 r = rb_entry(node, struct dlm_rsb, res_hashnode);
455 dlm_hold_rsb(r); 457 dlm_hold_rsb(r);
456 ri->rsb = r; 458 ri->rsb = r;
457 ri->bucket = bucket; 459 ri->bucket = bucket;
@@ -467,7 +469,7 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
467{ 469{
468 struct dlm_ls *ls = seq->private; 470 struct dlm_ls *ls = seq->private;
469 struct rsbtbl_iter *ri = iter_ptr; 471 struct rsbtbl_iter *ri = iter_ptr;
470 struct list_head *next; 472 struct rb_node *next;
471 struct dlm_rsb *r, *rp; 473 struct dlm_rsb *r, *rp;
472 loff_t n = *pos; 474 loff_t n = *pos;
473 unsigned bucket; 475 unsigned bucket;
@@ -480,10 +482,10 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
480 482
481 spin_lock(&ls->ls_rsbtbl[bucket].lock); 483 spin_lock(&ls->ls_rsbtbl[bucket].lock);
482 rp = ri->rsb; 484 rp = ri->rsb;
483 next = rp->res_hashchain.next; 485 next = rb_next(&rp->res_hashnode);
484 486
485 if (next != &ls->ls_rsbtbl[bucket].list) { 487 if (next) {
486 r = list_entry(next, struct dlm_rsb, res_hashchain); 488 r = rb_entry(next, struct dlm_rsb, res_hashnode);
487 dlm_hold_rsb(r); 489 dlm_hold_rsb(r);
488 ri->rsb = r; 490 ri->rsb = r;
489 spin_unlock(&ls->ls_rsbtbl[bucket].lock); 491 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
@@ -511,9 +513,9 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
511 } 513 }
512 514
513 spin_lock(&ls->ls_rsbtbl[bucket].lock); 515 spin_lock(&ls->ls_rsbtbl[bucket].lock);
514 if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { 516 if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) {
515 r = list_first_entry(&ls->ls_rsbtbl[bucket].list, 517 next = rb_first(&ls->ls_rsbtbl[bucket].keep);
516 struct dlm_rsb, res_hashchain); 518 r = rb_entry(next, struct dlm_rsb, res_hashnode);
517 dlm_hold_rsb(r); 519 dlm_hold_rsb(r);
518 ri->rsb = r; 520 ri->rsb = r;
519 ri->bucket = bucket; 521 ri->bucket = bucket;
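
The seq_file iterators above swap list traversal for an rbtree walk: rb_first()/rb_next() visit resources in sorted order, rb_entry() recovers the container, and a NULL from rb_next() replaces the old list-head sentinel test. A minimal illustration with a hypothetical item type:

        #include <linux/rbtree.h>

        struct item {
                struct rb_node node;
                int key;
        };

        static int count_items(struct rb_root *root)
        {
                struct rb_node *n;
                int count = 0;

                for (n = rb_first(root); n; n = rb_next(n)) {
                        struct item *it = rb_entry(n, struct item, node);

                        (void)it;       /* visited in ascending key order */
                        count++;
                }
                return count;
        }
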
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 7b84c1dbc82e..83641574b016 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -290,7 +290,6 @@ int dlm_recover_directory(struct dlm_ls *ls)
290 290
291 out_status: 291 out_status:
292 error = 0; 292 error = 0;
293 dlm_set_recover_status(ls, DLM_RS_DIR);
294 log_debug(ls, "dlm_recover_directory %d entries", count); 293 log_debug(ls, "dlm_recover_directory %d entries", count);
295 out_free: 294 out_free:
296 kfree(last_name); 295 kfree(last_name);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index fe2860c02449..3a564d197e99 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -103,8 +103,8 @@ struct dlm_dirtable {
103}; 103};
104 104
105struct dlm_rsbtable { 105struct dlm_rsbtable {
106 struct list_head list; 106 struct rb_root keep;
107 struct list_head toss; 107 struct rb_root toss;
108 spinlock_t lock; 108 spinlock_t lock;
109}; 109};
110 110
@@ -117,6 +117,10 @@ struct dlm_member {
117 struct list_head list; 117 struct list_head list;
118 int nodeid; 118 int nodeid;
119 int weight; 119 int weight;
120 int slot;
121 int slot_prev;
122 int comm_seq;
123 uint32_t generation;
120}; 124};
121 125
122/* 126/*
@@ -125,10 +129,8 @@ struct dlm_member {
125 129
126struct dlm_recover { 130struct dlm_recover {
127 struct list_head list; 131 struct list_head list;
128 int *nodeids; /* nodeids of all members */ 132 struct dlm_config_node *nodes;
129 int node_count; 133 int nodes_count;
130 int *new; /* nodeids of new members */
131 int new_count;
132 uint64_t seq; 134 uint64_t seq;
133}; 135};
134 136
@@ -285,7 +287,10 @@ struct dlm_rsb {
285 unsigned long res_toss_time; 287 unsigned long res_toss_time;
286 uint32_t res_first_lkid; 288 uint32_t res_first_lkid;
287 struct list_head res_lookup; /* lkbs waiting on first */ 289 struct list_head res_lookup; /* lkbs waiting on first */
288 struct list_head res_hashchain; /* rsbtbl */ 290 union {
291 struct list_head res_hashchain;
292 struct rb_node res_hashnode; /* rsbtbl */
293 };
289 struct list_head res_grantqueue; 294 struct list_head res_grantqueue;
290 struct list_head res_convertqueue; 295 struct list_head res_convertqueue;
291 struct list_head res_waitqueue; 296 struct list_head res_waitqueue;
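
The union above lets an rsb carry either linkage in the same storage: res_hashchain while a pre-allocated rsb sits on ls_new_rsb, res_hashnode once it is inserted into the keep/toss trees; get_rsb_struct() makes the switch by zeroing the node, as the fs/dlm/lock.c hunks further down show. A reduced sketch of the idiom:

        #include <linux/list.h>
        #include <linux/rbtree.h>
        #include <linux/string.h>

        struct obj {
                union {
                        struct list_head free_list;     /* while on the free pool */
                        struct rb_node tree_node;       /* once linked into a tree */
                };
        };

        /* caller holds the pool lock and guarantees the pool is non-empty */
        static struct obj *pool_take(struct list_head *pool)
        {
                struct obj *o = list_first_entry(pool, struct obj, free_list);

                list_del(&o->free_list);
                /* same conversion as get_rsb_struct(): reuse the bytes as an
                   empty rb_node before rb_link_node()/rb_insert_color() */
                memset(&o->tree_node, 0, sizeof(o->tree_node));
                return o;
        }
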
@@ -334,7 +339,9 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
334/* dlm_header is first element of all structs sent between nodes */ 339/* dlm_header is first element of all structs sent between nodes */
335 340
336#define DLM_HEADER_MAJOR 0x00030000 341#define DLM_HEADER_MAJOR 0x00030000
337#define DLM_HEADER_MINOR 0x00000000 342#define DLM_HEADER_MINOR 0x00000001
343
344#define DLM_HEADER_SLOTS 0x00000001
338 345
339#define DLM_MSG 1 346#define DLM_MSG 1
340#define DLM_RCOM 2 347#define DLM_RCOM 2
@@ -422,10 +429,34 @@ union dlm_packet {
422 struct dlm_rcom rcom; 429 struct dlm_rcom rcom;
423}; 430};
424 431
432#define DLM_RSF_NEED_SLOTS 0x00000001
433
434/* RCOM_STATUS data */
435struct rcom_status {
436 __le32 rs_flags;
437 __le32 rs_unused1;
438 __le64 rs_unused2;
439};
440
441/* RCOM_STATUS_REPLY data */
425struct rcom_config { 442struct rcom_config {
426 __le32 rf_lvblen; 443 __le32 rf_lvblen;
427 __le32 rf_lsflags; 444 __le32 rf_lsflags;
428 __le64 rf_unused; 445
446 /* DLM_HEADER_SLOTS adds: */
447 __le32 rf_flags;
448 __le16 rf_our_slot;
449 __le16 rf_num_slots;
450 __le32 rf_generation;
451 __le32 rf_unused1;
452 __le64 rf_unused2;
453};
454
455struct rcom_slot {
456 __le32 ro_nodeid;
457 __le16 ro_slot;
458 __le16 ro_unused1;
459 __le64 ro_unused2;
429}; 460};
430 461
431struct rcom_lock { 462struct rcom_lock {
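
A sketch of filling the extended RCOM_STATUS_REPLY payload defined above; all fields are little-endian on the wire, hence the cpu_to_le*() conversions. This is an assumption-laden reduction: buffer setup and the trailing rcom_slot array are omitted, and the rf_flags handling is this sketch's choice, not confirmed by the diff:

        static void fill_rcom_config(struct dlm_ls *ls, struct rcom_config *rf)
        {
                rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen);
                rf->rf_lsflags = cpu_to_le32(ls->ls_exflags);
                rf->rf_flags = cpu_to_le32(0);  /* no reply flags set in this sketch */
                rf->rf_our_slot = cpu_to_le16(ls->ls_slot);
                rf->rf_num_slots = cpu_to_le16(ls->ls_num_slots);
                rf->rf_generation = cpu_to_le32(ls->ls_generation);
        }
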
@@ -452,6 +483,7 @@ struct dlm_ls {
452 struct list_head ls_list; /* list of lockspaces */ 483 struct list_head ls_list; /* list of lockspaces */
453 dlm_lockspace_t *ls_local_handle; 484 dlm_lockspace_t *ls_local_handle;
454 uint32_t ls_global_id; /* global unique lockspace ID */ 485 uint32_t ls_global_id; /* global unique lockspace ID */
486 uint32_t ls_generation;
455 uint32_t ls_exflags; 487 uint32_t ls_exflags;
456 int ls_lvblen; 488 int ls_lvblen;
457 int ls_count; /* refcount of processes in 489 int ls_count; /* refcount of processes in
@@ -490,6 +522,11 @@ struct dlm_ls {
490 int ls_total_weight; 522 int ls_total_weight;
491 int *ls_node_array; 523 int *ls_node_array;
492 524
525 int ls_slot;
526 int ls_num_slots;
527 int ls_slots_size;
528 struct dlm_slot *ls_slots;
529
493 struct dlm_rsb ls_stub_rsb; /* for returning errors */ 530 struct dlm_rsb ls_stub_rsb; /* for returning errors */
494 struct dlm_lkb ls_stub_lkb; /* for returning errors */ 531 struct dlm_lkb ls_stub_lkb; /* for returning errors */
495 struct dlm_message ls_stub_ms; /* for faking a reply */ 532 struct dlm_message ls_stub_ms; /* for faking a reply */
@@ -537,6 +574,9 @@ struct dlm_ls {
537 struct list_head ls_root_list; /* root resources */ 574 struct list_head ls_root_list; /* root resources */
538 struct rw_semaphore ls_root_sem; /* protect root_list */ 575 struct rw_semaphore ls_root_sem; /* protect root_list */
539 576
577 const struct dlm_lockspace_ops *ls_ops;
578 void *ls_ops_arg;
579
540 int ls_namelen; 580 int ls_namelen;
541 char ls_name[1]; 581 char ls_name[1];
542}; 582};
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 83b5e32514e1..d47183043c59 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -56,6 +56,7 @@
56 L: receive_xxxx_reply() <- R: send_xxxx_reply() 56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/ 57*/
58#include <linux/types.h> 58#include <linux/types.h>
59#include <linux/rbtree.h>
59#include <linux/slab.h> 60#include <linux/slab.h>
60#include "dlm_internal.h" 61#include "dlm_internal.h"
61#include <linux/dlm_device.h> 62#include <linux/dlm_device.h>
@@ -380,6 +381,8 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
380 381
381 r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain); 382 r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain);
382 list_del(&r->res_hashchain); 383 list_del(&r->res_hashchain);
384 /* Convert the empty list_head to a NULL rb_node for tree usage: */
385 memset(&r->res_hashnode, 0, sizeof(struct rb_node));
383 ls->ls_new_rsb_count--; 386 ls->ls_new_rsb_count--;
384 spin_unlock(&ls->ls_new_rsb_spin); 387 spin_unlock(&ls->ls_new_rsb_spin);
385 388
@@ -388,7 +391,6 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
388 memcpy(r->res_name, name, len); 391 memcpy(r->res_name, name, len);
389 mutex_init(&r->res_mutex); 392 mutex_init(&r->res_mutex);
390 393
391 INIT_LIST_HEAD(&r->res_hashchain);
392 INIT_LIST_HEAD(&r->res_lookup); 394 INIT_LIST_HEAD(&r->res_lookup);
393 INIT_LIST_HEAD(&r->res_grantqueue); 395 INIT_LIST_HEAD(&r->res_grantqueue);
394 INIT_LIST_HEAD(&r->res_convertqueue); 396 INIT_LIST_HEAD(&r->res_convertqueue);
@@ -400,14 +402,31 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
400 return 0; 402 return 0;
401} 403}
402 404
403static int search_rsb_list(struct list_head *head, char *name, int len, 405static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen)
406{
407 char maxname[DLM_RESNAME_MAXLEN];
408
409 memset(maxname, 0, DLM_RESNAME_MAXLEN);
410 memcpy(maxname, name, nlen);
411 return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN);
412}
413
414static int search_rsb_tree(struct rb_root *tree, char *name, int len,
404 unsigned int flags, struct dlm_rsb **r_ret) 415 unsigned int flags, struct dlm_rsb **r_ret)
405{ 416{
417 struct rb_node *node = tree->rb_node;
406 struct dlm_rsb *r; 418 struct dlm_rsb *r;
407 int error = 0; 419 int error = 0;
408 420 int rc;
409 list_for_each_entry(r, head, res_hashchain) { 421
410 if (len == r->res_length && !memcmp(name, r->res_name, len)) 422 while (node) {
423 r = rb_entry(node, struct dlm_rsb, res_hashnode);
424 rc = rsb_cmp(r, name, len);
425 if (rc < 0)
426 node = node->rb_left;
427 else if (rc > 0)
428 node = node->rb_right;
429 else
411 goto found; 430 goto found;
412 } 431 }
413 *r_ret = NULL; 432 *r_ret = NULL;
@@ -420,22 +439,54 @@ static int search_rsb_list(struct list_head *head, char *name, int len,
420 return error; 439 return error;
421} 440}
422 441
442static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree)
443{
444 struct rb_node **newn = &tree->rb_node;
445 struct rb_node *parent = NULL;
446 int rc;
447
448 while (*newn) {
449 struct dlm_rsb *cur = rb_entry(*newn, struct dlm_rsb,
450 res_hashnode);
451
452 parent = *newn;
453 rc = rsb_cmp(cur, rsb->res_name, rsb->res_length);
454 if (rc < 0)
455 newn = &parent->rb_left;
456 else if (rc > 0)
457 newn = &parent->rb_right;
458 else {
459 log_print("rsb_insert match");
460 dlm_dump_rsb(rsb);
461 dlm_dump_rsb(cur);
462 return -EEXIST;
463 }
464 }
465
466 rb_link_node(&rsb->res_hashnode, parent, newn);
467 rb_insert_color(&rsb->res_hashnode, tree);
468 return 0;
469}
470
423static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b, 471static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
424 unsigned int flags, struct dlm_rsb **r_ret) 472 unsigned int flags, struct dlm_rsb **r_ret)
425{ 473{
426 struct dlm_rsb *r; 474 struct dlm_rsb *r;
427 int error; 475 int error;
428 476
429 error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r); 477 error = search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, flags, &r);
430 if (!error) { 478 if (!error) {
431 kref_get(&r->res_ref); 479 kref_get(&r->res_ref);
432 goto out; 480 goto out;
433 } 481 }
434 error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r); 482 error = search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
435 if (error) 483 if (error)
436 goto out; 484 goto out;
437 485
438 list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list); 486 rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
487 error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
488 if (error)
489 return error;
439 490
440 if (dlm_no_directory(ls)) 491 if (dlm_no_directory(ls))
441 goto out; 492 goto out;
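
rsb_cmp() above compares fixed-width keys: the lookup name is zero-padded to DLM_RESNAME_MAXLEN, so memcmp() over the full width yields a total order in which a shorter name sorts like its zero-extended form. search_rsb_tree() and rsb_insert() apply the same convention, which is all the tree needs to stay consistent. A standalone check of the padding behaviour:

        #include <stdio.h>
        #include <string.h>

        #define MAXLEN 64       /* stand-in for DLM_RESNAME_MAXLEN */

        static int cmp_padded(const char *stored, const char *name, int nlen)
        {
                char maxname[MAXLEN];

                memset(maxname, 0, MAXLEN);
                memcpy(maxname, name, nlen);
                return memcmp(stored, maxname, MAXLEN);
        }

        int main(void)
        {
                char stored[MAXLEN] = "ab";     /* zero-padded, like res_name */

                printf("%d\n", cmp_padded(stored, "ab", 2) == 0);  /* 1: exact match */
                printf("%d\n", cmp_padded(stored, "abc", 3) < 0);  /* 1: "ab" sorts first */
                return 0;
        }
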
@@ -527,8 +578,7 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
527 nodeid = 0; 578 nodeid = 0;
528 r->res_nodeid = nodeid; 579 r->res_nodeid = nodeid;
529 } 580 }
530 list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list); 581 error = rsb_insert(r, &ls->ls_rsbtbl[bucket].keep);
531 error = 0;
532 out_unlock: 582 out_unlock:
533 spin_unlock(&ls->ls_rsbtbl[bucket].lock); 583 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
534 out: 584 out:
@@ -556,7 +606,8 @@ static void toss_rsb(struct kref *kref)
556 606
557 DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r);); 607 DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
558 kref_init(&r->res_ref); 608 kref_init(&r->res_ref);
559 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss); 609 rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep);
610 rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss);
560 r->res_toss_time = jiffies; 611 r->res_toss_time = jiffies;
561 if (r->res_lvbptr) { 612 if (r->res_lvbptr) {
562 dlm_free_lvb(r->res_lvbptr); 613 dlm_free_lvb(r->res_lvbptr);
@@ -1082,19 +1133,19 @@ static void dir_remove(struct dlm_rsb *r)
1082 r->res_name, r->res_length); 1133 r->res_name, r->res_length);
1083} 1134}
1084 1135
1085/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is 1136/* FIXME: make this more efficient */
1086 found since they are in order of newest to oldest? */
1087 1137
1088static int shrink_bucket(struct dlm_ls *ls, int b) 1138static int shrink_bucket(struct dlm_ls *ls, int b)
1089{ 1139{
1140 struct rb_node *n;
1090 struct dlm_rsb *r; 1141 struct dlm_rsb *r;
1091 int count = 0, found; 1142 int count = 0, found;
1092 1143
1093 for (;;) { 1144 for (;;) {
1094 found = 0; 1145 found = 0;
1095 spin_lock(&ls->ls_rsbtbl[b].lock); 1146 spin_lock(&ls->ls_rsbtbl[b].lock);
1096 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss, 1147 for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = rb_next(n)) {
1097 res_hashchain) { 1148 r = rb_entry(n, struct dlm_rsb, res_hashnode);
1098 if (!time_after_eq(jiffies, r->res_toss_time + 1149 if (!time_after_eq(jiffies, r->res_toss_time +
1099 dlm_config.ci_toss_secs * HZ)) 1150 dlm_config.ci_toss_secs * HZ))
1100 continue; 1151 continue;
@@ -1108,7 +1159,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
1108 } 1159 }
1109 1160
1110 if (kref_put(&r->res_ref, kill_rsb)) { 1161 if (kref_put(&r->res_ref, kill_rsb)) {
1111 list_del(&r->res_hashchain); 1162 rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
1112 spin_unlock(&ls->ls_rsbtbl[b].lock); 1163 spin_unlock(&ls->ls_rsbtbl[b].lock);
1113 1164
1114 if (is_master(r)) 1165 if (is_master(r))
@@ -4441,10 +4492,12 @@ int dlm_purge_locks(struct dlm_ls *ls)
4441 4492
4442static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket) 4493static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
4443{ 4494{
4495 struct rb_node *n;
4444 struct dlm_rsb *r, *r_ret = NULL; 4496 struct dlm_rsb *r, *r_ret = NULL;
4445 4497
4446 spin_lock(&ls->ls_rsbtbl[bucket].lock); 4498 spin_lock(&ls->ls_rsbtbl[bucket].lock);
4447 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) { 4499 for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) {
4500 r = rb_entry(n, struct dlm_rsb, res_hashnode);
4448 if (!rsb_flag(r, RSB_LOCKS_PURGED)) 4501 if (!rsb_flag(r, RSB_LOCKS_PURGED))
4449 continue; 4502 continue;
4450 hold_rsb(r); 4503 hold_rsb(r);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index a1d8f1af144b..a1ea25face82 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -386,12 +386,15 @@ static void threads_stop(void)
386 dlm_lowcomms_stop(); 386 dlm_lowcomms_stop();
387} 387}
388 388
389static int new_lockspace(const char *name, int namelen, void **lockspace, 389static int new_lockspace(const char *name, const char *cluster,
390 uint32_t flags, int lvblen) 390 uint32_t flags, int lvblen,
391 const struct dlm_lockspace_ops *ops, void *ops_arg,
392 int *ops_result, dlm_lockspace_t **lockspace)
391{ 393{
392 struct dlm_ls *ls; 394 struct dlm_ls *ls;
393 int i, size, error; 395 int i, size, error;
394 int do_unreg = 0; 396 int do_unreg = 0;
397 int namelen = strlen(name);
395 398
396 if (namelen > DLM_LOCKSPACE_LEN) 399 if (namelen > DLM_LOCKSPACE_LEN)
397 return -EINVAL; 400 return -EINVAL;
@@ -403,8 +406,24 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
403 return -EINVAL; 406 return -EINVAL;
404 407
405 if (!dlm_user_daemon_available()) { 408 if (!dlm_user_daemon_available()) {
406 module_put(THIS_MODULE); 409 log_print("dlm user daemon not available");
407 return -EUNATCH; 410 error = -EUNATCH;
411 goto out;
412 }
413
414 if (ops && ops_result) {
415 if (!dlm_config.ci_recover_callbacks)
416 *ops_result = -EOPNOTSUPP;
417 else
418 *ops_result = 0;
419 }
420
421 if (dlm_config.ci_recover_callbacks && cluster &&
422 strncmp(cluster, dlm_config.ci_cluster_name, DLM_LOCKSPACE_LEN)) {
423 log_print("dlm cluster name %s mismatch %s",
424 dlm_config.ci_cluster_name, cluster);
425 error = -EBADR;
426 goto out;
408 } 427 }
409 428
410 error = 0; 429 error = 0;
@@ -442,6 +461,11 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
442 ls->ls_flags = 0; 461 ls->ls_flags = 0;
443 ls->ls_scan_time = jiffies; 462 ls->ls_scan_time = jiffies;
444 463
464 if (ops && dlm_config.ci_recover_callbacks) {
465 ls->ls_ops = ops;
466 ls->ls_ops_arg = ops_arg;
467 }
468
445 if (flags & DLM_LSFL_TIMEWARN) 469 if (flags & DLM_LSFL_TIMEWARN)
446 set_bit(LSFL_TIMEWARN, &ls->ls_flags); 470 set_bit(LSFL_TIMEWARN, &ls->ls_flags);
447 471
@@ -457,8 +481,8 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
457 if (!ls->ls_rsbtbl) 481 if (!ls->ls_rsbtbl)
458 goto out_lsfree; 482 goto out_lsfree;
459 for (i = 0; i < size; i++) { 483 for (i = 0; i < size; i++) {
460 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list); 484 ls->ls_rsbtbl[i].keep.rb_node = NULL;
461 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss); 485 ls->ls_rsbtbl[i].toss.rb_node = NULL;
462 spin_lock_init(&ls->ls_rsbtbl[i].lock); 486 spin_lock_init(&ls->ls_rsbtbl[i].lock);
463 } 487 }
464 488
@@ -525,6 +549,11 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
525 if (!ls->ls_recover_buf) 549 if (!ls->ls_recover_buf)
526 goto out_dirfree; 550 goto out_dirfree;
527 551
552 ls->ls_slot = 0;
553 ls->ls_num_slots = 0;
554 ls->ls_slots_size = 0;
555 ls->ls_slots = NULL;
556
528 INIT_LIST_HEAD(&ls->ls_recover_list); 557 INIT_LIST_HEAD(&ls->ls_recover_list);
529 spin_lock_init(&ls->ls_recover_list_lock); 558 spin_lock_init(&ls->ls_recover_list_lock);
530 ls->ls_recover_list_count = 0; 559 ls->ls_recover_list_count = 0;
@@ -614,8 +643,10 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
614 return error; 643 return error;
615} 644}
616 645
617int dlm_new_lockspace(const char *name, int namelen, void **lockspace, 646int dlm_new_lockspace(const char *name, const char *cluster,
618 uint32_t flags, int lvblen) 647 uint32_t flags, int lvblen,
648 const struct dlm_lockspace_ops *ops, void *ops_arg,
649 int *ops_result, dlm_lockspace_t **lockspace)
619{ 650{
620 int error = 0; 651 int error = 0;
621 652
@@ -625,7 +656,8 @@ int dlm_new_lockspace(const char *name, int namelen, void **lockspace,
625 if (error) 656 if (error)
626 goto out; 657 goto out;
627 658
628 error = new_lockspace(name, namelen, lockspace, flags, lvblen); 659 error = new_lockspace(name, cluster, flags, lvblen, ops, ops_arg,
660 ops_result, lockspace);
629 if (!error) 661 if (!error)
630 ls_count++; 662 ls_count++;
631 if (error > 0) 663 if (error > 0)
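
A hedged caller sketch for the extended dlm_new_lockspace() signature above; the myfs names and fields are hypothetical. ops_result reports whether the recovery callbacks will actually be delivered (-EOPNOTSUPP when recover_callbacks is off), and the cluster argument must match ci_cluster_name or the join fails with -EBADR:

        #include <linux/dlm.h>

        struct myfs_sb {                        /* hypothetical per-sb state */
                char cluster_name[32];
                dlm_lockspace_t *lockspace;
        };

        static void myfs_recover_prep(void *arg)
        {
                /* quiesce lock activity before recovery begins */
        }

        static void myfs_recover_slot(void *arg, struct dlm_slot *slot)
        {
                /* slot->nodeid / slot->slot identify the failed member */
        }

        static void myfs_recover_done(void *arg, struct dlm_slot *slots,
                                      int num_slots, int our_slot,
                                      uint32_t generation)
        {
                /* membership is final: slots[], our_slot, new generation */
        }

        static const struct dlm_lockspace_ops myfs_dlm_ops = {
                .recover_prep = myfs_recover_prep,
                .recover_slot = myfs_recover_slot,
                .recover_done = myfs_recover_done,
        };

        static int myfs_join_lockspace(struct myfs_sb *sbi)
        {
                int ops_result;

                return dlm_new_lockspace("myfs_locks", sbi->cluster_name,
                                         DLM_LSFL_FS, 32, &myfs_dlm_ops, sbi,
                                         &ops_result, &sbi->lockspace);
        }
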
@@ -685,7 +717,7 @@ static int lockspace_busy(struct dlm_ls *ls, int force)
685static int release_lockspace(struct dlm_ls *ls, int force) 717static int release_lockspace(struct dlm_ls *ls, int force)
686{ 718{
687 struct dlm_rsb *rsb; 719 struct dlm_rsb *rsb;
688 struct list_head *head; 720 struct rb_node *n;
689 int i, busy, rv; 721 int i, busy, rv;
690 722
691 busy = lockspace_busy(ls, force); 723 busy = lockspace_busy(ls, force);
@@ -746,20 +778,15 @@ static int release_lockspace(struct dlm_ls *ls, int force)
746 */ 778 */
747 779
748 for (i = 0; i < ls->ls_rsbtbl_size; i++) { 780 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
749 head = &ls->ls_rsbtbl[i].list; 781 while ((n = rb_first(&ls->ls_rsbtbl[i].keep))) {
750 while (!list_empty(head)) { 782 rsb = rb_entry(n, struct dlm_rsb, res_hashnode);
751 rsb = list_entry(head->next, struct dlm_rsb, 783 rb_erase(n, &ls->ls_rsbtbl[i].keep);
752 res_hashchain);
753
754 list_del(&rsb->res_hashchain);
755 dlm_free_rsb(rsb); 784 dlm_free_rsb(rsb);
756 } 785 }
757 786
758 head = &ls->ls_rsbtbl[i].toss; 787 while ((n = rb_first(&ls->ls_rsbtbl[i].toss))) {
759 while (!list_empty(head)) { 788 rsb = rb_entry(n, struct dlm_rsb, res_hashnode);
760 rsb = list_entry(head->next, struct dlm_rsb, 789 rb_erase(n, &ls->ls_rsbtbl[i].toss);
761 res_hashchain);
762 list_del(&rsb->res_hashchain);
763 dlm_free_rsb(rsb); 790 dlm_free_rsb(rsb);
764 } 791 }
765 } 792 }
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index b12532e553f8..862640a36d5c 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -19,6 +19,280 @@
19#include "config.h" 19#include "config.h"
20#include "lowcomms.h" 20#include "lowcomms.h"
21 21
22int dlm_slots_version(struct dlm_header *h)
23{
24 if ((h->h_version & 0x0000FFFF) < DLM_HEADER_SLOTS)
25 return 0;
26 return 1;
27}
28
29void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc,
30 struct dlm_member *memb)
31{
32 struct rcom_config *rf = (struct rcom_config *)rc->rc_buf;
33
34 if (!dlm_slots_version(&rc->rc_header))
35 return;
36
37 memb->slot = le16_to_cpu(rf->rf_our_slot);
38 memb->generation = le32_to_cpu(rf->rf_generation);
39}
40
41void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc)
42{
43 struct dlm_slot *slot;
44 struct rcom_slot *ro;
45 int i;
46
47 ro = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config));
48
49 /* ls_slots array is sparse, but not rcom_slots */
50
51 for (i = 0; i < ls->ls_slots_size; i++) {
52 slot = &ls->ls_slots[i];
53 if (!slot->nodeid)
54 continue;
55 ro->ro_nodeid = cpu_to_le32(slot->nodeid);
56 ro->ro_slot = cpu_to_le16(slot->slot);
57 ro++;
58 }
59}
60
61#define SLOT_DEBUG_LINE 128
62
63static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
64 struct rcom_slot *ro0, struct dlm_slot *array,
65 int array_size)
66{
67 char line[SLOT_DEBUG_LINE];
68 int len = SLOT_DEBUG_LINE - 1;
69 int pos = 0;
70 int ret, i;
71
72 if (!dlm_config.ci_log_debug)
73 return;
74
75 memset(line, 0, sizeof(line));
76
77 if (array) {
78 for (i = 0; i < array_size; i++) {
79 if (!array[i].nodeid)
80 continue;
81
82 ret = snprintf(line + pos, len - pos, " %d:%d",
83 array[i].slot, array[i].nodeid);
84 if (ret >= len - pos)
85 break;
86 pos += ret;
87 }
88 } else if (ro0) {
89 for (i = 0; i < num_slots; i++) {
90 ret = snprintf(line + pos, len - pos, " %d:%d",
91 ro0[i].ro_slot, ro0[i].ro_nodeid);
92 if (ret >= len - pos)
93 break;
94 pos += ret;
95 }
96 }
97
98 log_debug(ls, "generation %u slots %d%s", gen, num_slots, line);
99}
100
101int dlm_slots_copy_in(struct dlm_ls *ls)
102{
103 struct dlm_member *memb;
104 struct dlm_rcom *rc = ls->ls_recover_buf;
105 struct rcom_config *rf = (struct rcom_config *)rc->rc_buf;
106 struct rcom_slot *ro0, *ro;
107 int our_nodeid = dlm_our_nodeid();
108 int i, num_slots;
109 uint32_t gen;
110
111 if (!dlm_slots_version(&rc->rc_header))
112 return -1;
113
114 gen = le32_to_cpu(rf->rf_generation);
115 if (gen <= ls->ls_generation) {
116 log_error(ls, "dlm_slots_copy_in gen %u old %u",
117 gen, ls->ls_generation);
118 }
119 ls->ls_generation = gen;
120
121 num_slots = le16_to_cpu(rf->rf_num_slots);
122 if (!num_slots)
123 return -1;
124
125 ro0 = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config));
126
127 for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
128 ro->ro_nodeid = le32_to_cpu(ro->ro_nodeid);
129 ro->ro_slot = le16_to_cpu(ro->ro_slot);
130 }
131
132 log_debug_slots(ls, gen, num_slots, ro0, NULL, 0);
133
134 list_for_each_entry(memb, &ls->ls_nodes, list) {
135 for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
136 if (ro->ro_nodeid != memb->nodeid)
137 continue;
138 memb->slot = ro->ro_slot;
139 memb->slot_prev = memb->slot;
140 break;
141 }
142
143 if (memb->nodeid == our_nodeid) {
144 if (ls->ls_slot && ls->ls_slot != memb->slot) {
145 log_error(ls, "dlm_slots_copy_in our slot "
146 "changed %d %d", ls->ls_slot,
147 memb->slot);
148 return -1;
149 }
150
151 if (!ls->ls_slot)
152 ls->ls_slot = memb->slot;
153 }
154
155 if (!memb->slot) {
156 log_error(ls, "dlm_slots_copy_in nodeid %d no slot",
157 memb->nodeid);
158 return -1;
159 }
160 }
161
162 return 0;
163}
164
165/* for any nodes that do not support slots, we will not have set memb->slot
166 in wait_status_all(), so memb->slot will remain -1, and we will not
167 assign slots or set ls_num_slots here */
168
169int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
170 struct dlm_slot **slots_out, uint32_t *gen_out)
171{
172 struct dlm_member *memb;
173 struct dlm_slot *array;
174 int our_nodeid = dlm_our_nodeid();
175 int array_size, max_slots, i;
176 int need = 0;
177 int max = 0;
178 int num = 0;
179 uint32_t gen = 0;
180
181 /* our own memb struct will have slot -1 gen 0 */
182
183 list_for_each_entry(memb, &ls->ls_nodes, list) {
184 if (memb->nodeid == our_nodeid) {
185 memb->slot = ls->ls_slot;
186 memb->generation = ls->ls_generation;
187 break;
188 }
189 }
190
191 list_for_each_entry(memb, &ls->ls_nodes, list) {
192 if (memb->generation > gen)
193 gen = memb->generation;
194
195 /* node doesn't support slots */
196
197 if (memb->slot == -1)
198 return -1;
199
200 /* node needs a slot assigned */
201
202 if (!memb->slot)
203 need++;
204
205 /* node has a slot assigned */
206
207 num++;
208
209 if (!max || max < memb->slot)
210 max = memb->slot;
211
212 /* sanity check, once slot is assigned it shouldn't change */
213
214 if (memb->slot_prev && memb->slot && memb->slot_prev != memb->slot) {
215 log_error(ls, "nodeid %d slot changed %d %d",
216 memb->nodeid, memb->slot_prev, memb->slot);
217 return -1;
218 }
219 memb->slot_prev = memb->slot;
220 }
221
222 array_size = max + need;
223
224 array = kzalloc(array_size * sizeof(struct dlm_slot), GFP_NOFS);
225 if (!array)
226 return -ENOMEM;
227
228 num = 0;
229
230 /* fill in slots (offsets) that are used */
231
232 list_for_each_entry(memb, &ls->ls_nodes, list) {
233 if (!memb->slot)
234 continue;
235
236 if (memb->slot > array_size) {
237 log_error(ls, "invalid slot number %d", memb->slot);
238 kfree(array);
239 return -1;
240 }
241
242 array[memb->slot - 1].nodeid = memb->nodeid;
243 array[memb->slot - 1].slot = memb->slot;
244 num++;
245 }
246
247 /* assign new slots from unused offsets */
248
249 list_for_each_entry(memb, &ls->ls_nodes, list) {
250 if (memb->slot)
251 continue;
252
253 for (i = 0; i < array_size; i++) {
254 if (array[i].nodeid)
255 continue;
256
257 memb->slot = i + 1;
258 memb->slot_prev = memb->slot;
259 array[i].nodeid = memb->nodeid;
260 array[i].slot = memb->slot;
261 num++;
262
263 if (!ls->ls_slot && memb->nodeid == our_nodeid)
264 ls->ls_slot = memb->slot;
265 break;
266 }
267
268 if (!memb->slot) {
269 log_error(ls, "no free slot found");
270 kfree(array);
271 return -1;
272 }
273 }
274
275 gen++;
276
277 log_debug_slots(ls, gen, num, NULL, array, array_size);
278
279 max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) -
280 sizeof(struct rcom_config)) / sizeof(struct rcom_slot);
281
282 if (num > max_slots) {
283 log_error(ls, "num_slots %d exceeds max_slots %d",
284 num, max_slots);
285 kfree(array);
286 return -1;
287 }
288
289 *gen_out = gen;
290 *slots_out = array;
291 *slots_size = array_size;
292 *num_slots = num;
293 return 0;
294}
295
22static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) 296static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
23{ 297{
24 struct dlm_member *memb = NULL; 298 struct dlm_member *memb = NULL;
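
dlm_slots_assign() above maintains a 1-based slot space over a zero-based array, with array[slot - 1] holding {nodeid, slot} and zeroed entries marking free offsets that new or re-joining members can claim. A reduced sketch of the claim step:

        static int claim_first_free_slot(struct dlm_slot *array, int array_size,
                                         int nodeid)
        {
                int i;

                for (i = 0; i < array_size; i++) {
                        if (array[i].nodeid)
                                continue;       /* offset already owned */
                        array[i].nodeid = nodeid;
                        array[i].slot = i + 1;  /* slots are 1-based */
                        return i + 1;
                }
                return 0;                       /* no free slot */
        }
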
@@ -43,59 +317,51 @@ static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
43 } 317 }
44} 318}
45 319
46static int dlm_add_member(struct dlm_ls *ls, int nodeid) 320static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node)
47{ 321{
48 struct dlm_member *memb; 322 struct dlm_member *memb;
49 int w, error; 323 int error;
50 324
51 memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); 325 memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
52 if (!memb) 326 if (!memb)
53 return -ENOMEM; 327 return -ENOMEM;
54 328
55 w = dlm_node_weight(ls->ls_name, nodeid); 329 error = dlm_lowcomms_connect_node(node->nodeid);
56 if (w < 0) {
57 kfree(memb);
58 return w;
59 }
60
61 error = dlm_lowcomms_connect_node(nodeid);
62 if (error < 0) { 330 if (error < 0) {
63 kfree(memb); 331 kfree(memb);
64 return error; 332 return error;
65 } 333 }
66 334
67 memb->nodeid = nodeid; 335 memb->nodeid = node->nodeid;
68 memb->weight = w; 336 memb->weight = node->weight;
337 memb->comm_seq = node->comm_seq;
69 add_ordered_member(ls, memb); 338 add_ordered_member(ls, memb);
70 ls->ls_num_nodes++; 339 ls->ls_num_nodes++;
71 return 0; 340 return 0;
72} 341}
73 342
74static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb) 343static struct dlm_member *find_memb(struct list_head *head, int nodeid)
75{
76 list_move(&memb->list, &ls->ls_nodes_gone);
77 ls->ls_num_nodes--;
78}
79
80int dlm_is_member(struct dlm_ls *ls, int nodeid)
81{ 344{
82 struct dlm_member *memb; 345 struct dlm_member *memb;
83 346
84 list_for_each_entry(memb, &ls->ls_nodes, list) { 347 list_for_each_entry(memb, head, list) {
85 if (memb->nodeid == nodeid) 348 if (memb->nodeid == nodeid)
86 return 1; 349 return memb;
87 } 350 }
351 return NULL;
352}
353
354int dlm_is_member(struct dlm_ls *ls, int nodeid)
355{
356 if (find_memb(&ls->ls_nodes, nodeid))
357 return 1;
88 return 0; 358 return 0;
89} 359}
90 360
91int dlm_is_removed(struct dlm_ls *ls, int nodeid) 361int dlm_is_removed(struct dlm_ls *ls, int nodeid)
92{ 362{
93 struct dlm_member *memb; 363 if (find_memb(&ls->ls_nodes_gone, nodeid))
94 364 return 1;
95 list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
96 if (memb->nodeid == nodeid)
97 return 1;
98 }
99 return 0; 365 return 0;
100} 366}
101 367
@@ -176,7 +442,7 @@ static int ping_members(struct dlm_ls *ls)
176 error = dlm_recovery_stopped(ls); 442 error = dlm_recovery_stopped(ls);
177 if (error) 443 if (error)
178 break; 444 break;
179 error = dlm_rcom_status(ls, memb->nodeid); 445 error = dlm_rcom_status(ls, memb->nodeid, 0);
180 if (error) 446 if (error)
181 break; 447 break;
182 } 448 }
@@ -186,10 +452,88 @@ static int ping_members(struct dlm_ls *ls)
186 return error; 452 return error;
187} 453}
188 454
455static void dlm_lsop_recover_prep(struct dlm_ls *ls)
456{
457 if (!ls->ls_ops || !ls->ls_ops->recover_prep)
458 return;
459 ls->ls_ops->recover_prep(ls->ls_ops_arg);
460}
461
462static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb)
463{
464 struct dlm_slot slot;
465 uint32_t seq;
466 int error;
467
468 if (!ls->ls_ops || !ls->ls_ops->recover_slot)
469 return;
470
471 /* if there is no comms connection with this node
472 or the present comms connection is newer
473 than the one when this member was added, then
474 we consider the node to have failed (versus
475 being removed due to dlm_release_lockspace) */
476
477 error = dlm_comm_seq(memb->nodeid, &seq);
478
479 if (!error && seq == memb->comm_seq)
480 return;
481
482 slot.nodeid = memb->nodeid;
483 slot.slot = memb->slot;
484
485 ls->ls_ops->recover_slot(ls->ls_ops_arg, &slot);
486}
487
488void dlm_lsop_recover_done(struct dlm_ls *ls)
489{
490 struct dlm_member *memb;
491 struct dlm_slot *slots;
492 int i, num;
493
494 if (!ls->ls_ops || !ls->ls_ops->recover_done)
495 return;
496
497 num = ls->ls_num_nodes;
498
499 slots = kzalloc(num * sizeof(struct dlm_slot), GFP_KERNEL);
500 if (!slots)
501 return;
502
503 i = 0;
504 list_for_each_entry(memb, &ls->ls_nodes, list) {
505 if (i == num) {
506 log_error(ls, "dlm_lsop_recover_done bad num %d", num);
507 goto out;
508 }
509 slots[i].nodeid = memb->nodeid;
510 slots[i].slot = memb->slot;
511 i++;
512 }
513
514 ls->ls_ops->recover_done(ls->ls_ops_arg, slots, num,
515 ls->ls_slot, ls->ls_generation);
516 out:
517 kfree(slots);
518}
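
Taken together, the three dlm_lsop_* wrappers define the callback surface a lockspace user can register. The signatures below are read directly off the call sites above (recover_prep(arg), recover_slot(arg, slot), recover_done(arg, slots, num_slots, our_slot, generation)); the skeletal user is a sketch, and everything outside the callback arguments is a placeholder name:

	struct dlm_slot {
		int nodeid;
		int slot;
	};

	struct dlm_lockspace_ops {
		void (*recover_prep)(void *ops_arg);
		void (*recover_slot)(void *ops_arg, struct dlm_slot *slot);
		void (*recover_done)(void *ops_arg, struct dlm_slot *slots,
				     int num_slots, int our_slot,
				     uint32_t generation);
	};

	static void my_recover_prep(void *arg)
	{
		/* recovery has started; membership is in flux */
	}

	static void my_recover_slot(void *arg, struct dlm_slot *slot)
	{
		/* slot->nodeid failed (vs. leaving cleanly); e.g. arrange
		   to replay its journal before recover_done arrives */
	}

	static void my_recover_done(void *arg, struct dlm_slot *slots,
				    int num_slots, int our_slot,
				    uint32_t generation)
	{
		/* membership is stable again; slots[] maps the survivors */
	}

	static const struct dlm_lockspace_ops my_ops = {
		.recover_prep = my_recover_prep,
		.recover_slot = my_recover_slot,
		.recover_done = my_recover_done,
	};
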
519
520static struct dlm_config_node *find_config_node(struct dlm_recover *rv,
521 int nodeid)
522{
523 int i;
524
525 for (i = 0; i < rv->nodes_count; i++) {
526 if (rv->nodes[i].nodeid == nodeid)
527 return &rv->nodes[i];
528 }
529 return NULL;
530}
531
189int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) 532int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
190{ 533{
191 struct dlm_member *memb, *safe; 534 struct dlm_member *memb, *safe;
192 int i, error, found, pos = 0, neg = 0, low = -1; 535 struct dlm_config_node *node;
536 int i, error, neg = 0, low = -1;
193 537
194 /* previously removed members that we've not finished removing need to 538 /* previously removed members that we've not finished removing need to
195 count as a negative change so the "neg" recovery steps will happen */ 539 count as a negative change so the "neg" recovery steps will happen */
@@ -202,46 +546,32 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
202 /* move departed members from ls_nodes to ls_nodes_gone */ 546 /* move departed members from ls_nodes to ls_nodes_gone */
203 547
204 list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { 548 list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
205 found = 0; 549 node = find_config_node(rv, memb->nodeid);
206 for (i = 0; i < rv->node_count; i++) { 550 if (node && !node->new)
207 if (memb->nodeid == rv->nodeids[i]) { 551 continue;
208 found = 1;
209 break;
210 }
211 }
212 552
213 if (!found) { 553 if (!node) {
214 neg++;
215 dlm_remove_member(ls, memb);
216 log_debug(ls, "remove member %d", memb->nodeid); 554 log_debug(ls, "remove member %d", memb->nodeid);
555 } else {
556 /* removed and re-added */
557 log_debug(ls, "remove member %d comm_seq %u %u",
558 memb->nodeid, memb->comm_seq, node->comm_seq);
217 } 559 }
218 }
219
220 /* Add an entry to ls_nodes_gone for members that were removed and
221 then added again, so that previous state for these nodes will be
222 cleared during recovery. */
223
224 for (i = 0; i < rv->new_count; i++) {
225 if (!dlm_is_member(ls, rv->new[i]))
226 continue;
227 log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
228 560
229 memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
230 if (!memb)
231 return -ENOMEM;
232 memb->nodeid = rv->new[i];
233 list_add_tail(&memb->list, &ls->ls_nodes_gone);
234 neg++; 561 neg++;
562 list_move(&memb->list, &ls->ls_nodes_gone);
563 ls->ls_num_nodes--;
564 dlm_lsop_recover_slot(ls, memb);
235 } 565 }
236 566
237 /* add new members to ls_nodes */ 567 /* add new members to ls_nodes */
238 568
239 for (i = 0; i < rv->node_count; i++) { 569 for (i = 0; i < rv->nodes_count; i++) {
240 if (dlm_is_member(ls, rv->nodeids[i])) 570 node = &rv->nodes[i];
571 if (dlm_is_member(ls, node->nodeid))
241 continue; 572 continue;
242 dlm_add_member(ls, rv->nodeids[i]); 573 dlm_add_member(ls, node);
243 pos++; 574 log_debug(ls, "add member %d", node->nodeid);
244 log_debug(ls, "add member %d", rv->nodeids[i]);
245 } 575 }
246 576
247 list_for_each_entry(memb, &ls->ls_nodes, list) { 577 list_for_each_entry(memb, &ls->ls_nodes, list) {
@@ -251,7 +581,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
251 ls->ls_low_nodeid = low; 581 ls->ls_low_nodeid = low;
252 582
253 make_member_array(ls); 583 make_member_array(ls);
254 dlm_set_recover_status(ls, DLM_RS_NODES);
255 *neg_out = neg; 584 *neg_out = neg;
256 585
257 error = ping_members(ls); 586 error = ping_members(ls);
@@ -261,12 +590,8 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
261 ls->ls_members_result = error; 590 ls->ls_members_result = error;
262 complete(&ls->ls_members_done); 591 complete(&ls->ls_members_done);
263 } 592 }
264 if (error)
265 goto out;
266 593
267 error = dlm_recover_members_wait(ls); 594 log_debug(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
268 out:
269 log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error);
270 return error; 595 return error;
271} 596}
272 597
@@ -327,26 +652,35 @@ int dlm_ls_stop(struct dlm_ls *ls)
327 */ 652 */
328 653
329 dlm_recoverd_suspend(ls); 654 dlm_recoverd_suspend(ls);
655
656 spin_lock(&ls->ls_recover_lock);
657 kfree(ls->ls_slots);
658 ls->ls_slots = NULL;
659 ls->ls_num_slots = 0;
660 ls->ls_slots_size = 0;
330 ls->ls_recover_status = 0; 661 ls->ls_recover_status = 0;
662 spin_unlock(&ls->ls_recover_lock);
663
331 dlm_recoverd_resume(ls); 664 dlm_recoverd_resume(ls);
332 665
333 if (!ls->ls_recover_begin) 666 if (!ls->ls_recover_begin)
334 ls->ls_recover_begin = jiffies; 667 ls->ls_recover_begin = jiffies;
668
669 dlm_lsop_recover_prep(ls);
335 return 0; 670 return 0;
336} 671}
337 672
338int dlm_ls_start(struct dlm_ls *ls) 673int dlm_ls_start(struct dlm_ls *ls)
339{ 674{
340 struct dlm_recover *rv = NULL, *rv_old; 675 struct dlm_recover *rv = NULL, *rv_old;
341 int *ids = NULL, *new = NULL; 676 struct dlm_config_node *nodes;
342 int error, ids_count = 0, new_count = 0; 677 int error, count;
343 678
344 rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS); 679 rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS);
345 if (!rv) 680 if (!rv)
346 return -ENOMEM; 681 return -ENOMEM;
347 682
348 error = dlm_nodeid_list(ls->ls_name, &ids, &ids_count, 683 error = dlm_config_nodes(ls->ls_name, &nodes, &count);
349 &new, &new_count);
350 if (error < 0) 684 if (error < 0)
351 goto fail; 685 goto fail;
352 686
@@ -361,10 +695,8 @@ int dlm_ls_start(struct dlm_ls *ls)
361 goto fail; 695 goto fail;
362 } 696 }
363 697
364 rv->nodeids = ids; 698 rv->nodes = nodes;
365 rv->node_count = ids_count; 699 rv->nodes_count = count;
366 rv->new = new;
367 rv->new_count = new_count;
368 rv->seq = ++ls->ls_recover_seq; 700 rv->seq = ++ls->ls_recover_seq;
369 rv_old = ls->ls_recover_args; 701 rv_old = ls->ls_recover_args;
370 ls->ls_recover_args = rv; 702 ls->ls_recover_args = rv;
@@ -372,9 +704,8 @@ int dlm_ls_start(struct dlm_ls *ls)
372 704
373 if (rv_old) { 705 if (rv_old) {
374 log_error(ls, "unused recovery %llx %d", 706 log_error(ls, "unused recovery %llx %d",
375 (unsigned long long)rv_old->seq, rv_old->node_count); 707 (unsigned long long)rv_old->seq, rv_old->nodes_count);
376 kfree(rv_old->nodeids); 708 kfree(rv_old->nodes);
377 kfree(rv_old->new);
378 kfree(rv_old); 709 kfree(rv_old);
379 } 710 }
380 711
@@ -383,8 +714,7 @@ int dlm_ls_start(struct dlm_ls *ls)
383 714
384 fail: 715 fail:
385 kfree(rv); 716 kfree(rv);
386 kfree(ids); 717 kfree(nodes);
387 kfree(new);
388 return error; 718 return error;
389} 719}
390 720
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
index 7a26fca1e0b5..3deb70661c69 100644
--- a/fs/dlm/member.h
+++ b/fs/dlm/member.h
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -20,6 +20,14 @@ void dlm_clear_members_gone(struct dlm_ls *ls);
20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); 20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
21int dlm_is_removed(struct dlm_ls *ls, int nodeid); 21int dlm_is_removed(struct dlm_ls *ls, int nodeid);
22int dlm_is_member(struct dlm_ls *ls, int nodeid); 22int dlm_is_member(struct dlm_ls *ls, int nodeid);
23int dlm_slots_version(struct dlm_header *h);
24void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc,
25 struct dlm_member *memb);
26void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc);
27int dlm_slots_copy_in(struct dlm_ls *ls);
28int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
29 struct dlm_slot **slots_out, uint32_t *gen_out);
30void dlm_lsop_recover_done(struct dlm_ls *ls);
23 31
24#endif /* __MEMBER_DOT_H__ */ 32#endif /* __MEMBER_DOT_H__ */
25 33
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index f10a50f24e8f..ac5c616c9696 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -23,6 +23,7 @@
23#include "memory.h" 23#include "memory.h"
24#include "lock.h" 24#include "lock.h"
25#include "util.h" 25#include "util.h"
26#include "member.h"
26 27
27 28
28static int rcom_response(struct dlm_ls *ls) 29static int rcom_response(struct dlm_ls *ls)
@@ -72,20 +73,30 @@ static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
72 dlm_lowcomms_commit_buffer(mh); 73 dlm_lowcomms_commit_buffer(mh);
73} 74}
74 75
76static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs,
77 uint32_t flags)
78{
79 rs->rs_flags = cpu_to_le32(flags);
80}
81
75/* When replying to a status request, a node also sends back its 82/* When replying to a status request, a node also sends back its
76 configuration values. The requesting node then checks that the remote 83 configuration values. The requesting node then checks that the remote
77 node is configured the same way as itself. */ 84 node is configured the same way as itself. */
78 85
79static void make_config(struct dlm_ls *ls, struct rcom_config *rf) 86static void set_rcom_config(struct dlm_ls *ls, struct rcom_config *rf,
87 uint32_t num_slots)
80{ 88{
81 rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen); 89 rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen);
82 rf->rf_lsflags = cpu_to_le32(ls->ls_exflags); 90 rf->rf_lsflags = cpu_to_le32(ls->ls_exflags);
91
92 rf->rf_our_slot = cpu_to_le16(ls->ls_slot);
93 rf->rf_num_slots = cpu_to_le16(num_slots);
94 rf->rf_generation = cpu_to_le32(ls->ls_generation);
83} 95}
84 96
85static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) 97static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
86{ 98{
87 struct rcom_config *rf = (struct rcom_config *) rc->rc_buf; 99 struct rcom_config *rf = (struct rcom_config *) rc->rc_buf;
88 size_t conf_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_config);
89 100
90 if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) { 101 if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) {
91 log_error(ls, "version mismatch: %x nodeid %d: %x", 102 log_error(ls, "version mismatch: %x nodeid %d: %x",
@@ -94,12 +105,6 @@ static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
94 return -EPROTO; 105 return -EPROTO;
95 } 106 }
96 107
97 if (rc->rc_header.h_length < conf_size) {
98 log_error(ls, "config too short: %d nodeid %d",
99 rc->rc_header.h_length, nodeid);
100 return -EPROTO;
101 }
102
103 if (le32_to_cpu(rf->rf_lvblen) != ls->ls_lvblen || 108 if (le32_to_cpu(rf->rf_lvblen) != ls->ls_lvblen ||
104 le32_to_cpu(rf->rf_lsflags) != ls->ls_exflags) { 109 le32_to_cpu(rf->rf_lsflags) != ls->ls_exflags) {
105 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x", 110 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
@@ -127,7 +132,18 @@ static void disallow_sync_reply(struct dlm_ls *ls)
127 spin_unlock(&ls->ls_rcom_spin); 132 spin_unlock(&ls->ls_rcom_spin);
128} 133}
129 134
130int dlm_rcom_status(struct dlm_ls *ls, int nodeid) 135/*
136 * low nodeid gathers one slot value at a time from each node.
137 * it sets need_slots=0, and saves rf_our_slot returned from each
138 * rcom_config.
139 *
140 * other nodes gather all slot values at once from the low nodeid.
141 * they set need_slots=1, and ignore the rf_our_slot returned from each
142 * rcom_config. they use the rf_num_slots returned from the low
143 * node's rcom_config.
144 */
145
146int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
131{ 147{
132 struct dlm_rcom *rc; 148 struct dlm_rcom *rc;
133 struct dlm_mhandle *mh; 149 struct dlm_mhandle *mh;
@@ -141,10 +157,13 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
141 goto out; 157 goto out;
142 } 158 }
143 159
144 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh); 160 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS,
161 sizeof(struct rcom_status), &rc, &mh);
145 if (error) 162 if (error)
146 goto out; 163 goto out;
147 164
165 set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags);
166
148 allow_sync_reply(ls, &rc->rc_id); 167 allow_sync_reply(ls, &rc->rc_id);
149 memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); 168 memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size);
150 169
@@ -161,8 +180,11 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
161 /* we pretend the remote lockspace exists with 0 status */ 180 /* we pretend the remote lockspace exists with 0 status */
162 log_debug(ls, "remote node %d not ready", nodeid); 181 log_debug(ls, "remote node %d not ready", nodeid);
163 rc->rc_result = 0; 182 rc->rc_result = 0;
164 } else 183 error = 0;
165 error = check_config(ls, rc, nodeid); 184 } else {
185 error = check_rcom_config(ls, rc, nodeid);
186 }
187
166 /* the caller looks at rc_result for the remote recovery status */ 188 /* the caller looks at rc_result for the remote recovery status */
167 out: 189 out:
168 return error; 190 return error;
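
DLM_RCOM_STATUS now carries a small payload instead of an empty buffer. A sketch of that payload as implied by set_rcom_status() above; only rs_flags is written there, and any trailing padding is an assumption (the real layout is in fs/dlm/dlm_internal.h):

	struct rcom_status {
		__le32 rs_flags;	/* 0, or DLM_RSF_NEED_SLOTS */
	};
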
@@ -172,17 +194,60 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
172{ 194{
173 struct dlm_rcom *rc; 195 struct dlm_rcom *rc;
174 struct dlm_mhandle *mh; 196 struct dlm_mhandle *mh;
175 int error, nodeid = rc_in->rc_header.h_nodeid; 197 struct rcom_status *rs;
198 uint32_t status;
199 int nodeid = rc_in->rc_header.h_nodeid;
200 int len = sizeof(struct rcom_config);
201 int num_slots = 0;
202 int error;
203
204 if (!dlm_slots_version(&rc_in->rc_header)) {
205 status = dlm_recover_status(ls);
206 goto do_create;
207 }
208
209 rs = (struct rcom_status *)rc_in->rc_buf;
176 210
211 if (!(rs->rs_flags & DLM_RSF_NEED_SLOTS)) {
212 status = dlm_recover_status(ls);
213 goto do_create;
214 }
215
216 spin_lock(&ls->ls_recover_lock);
217 status = ls->ls_recover_status;
218 num_slots = ls->ls_num_slots;
219 spin_unlock(&ls->ls_recover_lock);
220 len += num_slots * sizeof(struct rcom_slot);
221
222 do_create:
177 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY, 223 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
178 sizeof(struct rcom_config), &rc, &mh); 224 len, &rc, &mh);
179 if (error) 225 if (error)
180 return; 226 return;
227
181 rc->rc_id = rc_in->rc_id; 228 rc->rc_id = rc_in->rc_id;
182 rc->rc_seq_reply = rc_in->rc_seq; 229 rc->rc_seq_reply = rc_in->rc_seq;
183 rc->rc_result = dlm_recover_status(ls); 230 rc->rc_result = status;
184 make_config(ls, (struct rcom_config *) rc->rc_buf); 231
232 set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, num_slots);
233
234 if (!num_slots)
235 goto do_send;
236
237 spin_lock(&ls->ls_recover_lock);
238 if (ls->ls_num_slots != num_slots) {
239 spin_unlock(&ls->ls_recover_lock);
240 log_debug(ls, "receive_rcom_status num_slots %d to %d",
241 num_slots, ls->ls_num_slots);
242 rc->rc_result = 0;
243 set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, 0);
244 goto do_send;
245 }
246
247 dlm_slots_copy_out(ls, rc);
248 spin_unlock(&ls->ls_recover_lock);
185 249
250 do_send:
186 send_rcom(ls, mh, rc); 251 send_rcom(ls, mh, rc);
187} 252}
188 253
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
index b09abd29ba38..206723ab744d 100644
--- a/fs/dlm/rcom.h
+++ b/fs/dlm/rcom.h
@@ -14,7 +14,7 @@
14#ifndef __RCOM_DOT_H__ 14#ifndef __RCOM_DOT_H__
15#define __RCOM_DOT_H__ 15#define __RCOM_DOT_H__
16 16
17int dlm_rcom_status(struct dlm_ls *ls, int nodeid); 17int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags);
18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); 18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); 19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); 20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 14638235f7b2..34d5adf1fce7 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -85,14 +85,20 @@ uint32_t dlm_recover_status(struct dlm_ls *ls)
85 return status; 85 return status;
86} 86}
87 87
88static void _set_recover_status(struct dlm_ls *ls, uint32_t status)
89{
90 ls->ls_recover_status |= status;
91}
92
88void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status) 93void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
89{ 94{
90 spin_lock(&ls->ls_recover_lock); 95 spin_lock(&ls->ls_recover_lock);
91 ls->ls_recover_status |= status; 96 _set_recover_status(ls, status);
92 spin_unlock(&ls->ls_recover_lock); 97 spin_unlock(&ls->ls_recover_lock);
93} 98}
94 99
95static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status) 100static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status,
101 int save_slots)
96{ 102{
97 struct dlm_rcom *rc = ls->ls_recover_buf; 103 struct dlm_rcom *rc = ls->ls_recover_buf;
98 struct dlm_member *memb; 104 struct dlm_member *memb;
@@ -106,10 +112,13 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
106 goto out; 112 goto out;
107 } 113 }
108 114
109 error = dlm_rcom_status(ls, memb->nodeid); 115 error = dlm_rcom_status(ls, memb->nodeid, 0);
110 if (error) 116 if (error)
111 goto out; 117 goto out;
112 118
119 if (save_slots)
120 dlm_slot_save(ls, rc, memb);
121
113 if (rc->rc_result & wait_status) 122 if (rc->rc_result & wait_status)
114 break; 123 break;
115 if (delay < 1000) 124 if (delay < 1000)
@@ -121,7 +130,8 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
121 return error; 130 return error;
122} 131}
123 132
124static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status) 133static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status,
134 uint32_t status_flags)
125{ 135{
126 struct dlm_rcom *rc = ls->ls_recover_buf; 136 struct dlm_rcom *rc = ls->ls_recover_buf;
127 int error = 0, delay = 0, nodeid = ls->ls_low_nodeid; 137 int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
@@ -132,7 +142,7 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
132 goto out; 142 goto out;
133 } 143 }
134 144
135 error = dlm_rcom_status(ls, nodeid); 145 error = dlm_rcom_status(ls, nodeid, status_flags);
136 if (error) 146 if (error)
137 break; 147 break;
138 148
@@ -152,18 +162,56 @@ static int wait_status(struct dlm_ls *ls, uint32_t status)
152 int error; 162 int error;
153 163
154 if (ls->ls_low_nodeid == dlm_our_nodeid()) { 164 if (ls->ls_low_nodeid == dlm_our_nodeid()) {
155 error = wait_status_all(ls, status); 165 error = wait_status_all(ls, status, 0);
156 if (!error) 166 if (!error)
157 dlm_set_recover_status(ls, status_all); 167 dlm_set_recover_status(ls, status_all);
158 } else 168 } else
159 error = wait_status_low(ls, status_all); 169 error = wait_status_low(ls, status_all, 0);
160 170
161 return error; 171 return error;
162} 172}
163 173
164int dlm_recover_members_wait(struct dlm_ls *ls) 174int dlm_recover_members_wait(struct dlm_ls *ls)
165{ 175{
166 return wait_status(ls, DLM_RS_NODES); 176 struct dlm_member *memb;
177 struct dlm_slot *slots;
178 int num_slots, slots_size;
179 int error, rv;
180 uint32_t gen;
181
182 list_for_each_entry(memb, &ls->ls_nodes, list) {
183 memb->slot = -1;
184 memb->generation = 0;
185 }
186
187 if (ls->ls_low_nodeid == dlm_our_nodeid()) {
188 error = wait_status_all(ls, DLM_RS_NODES, 1);
189 if (error)
190 goto out;
191
192 /* slots array is sparse, slots_size may be > num_slots */
193
194 rv = dlm_slots_assign(ls, &num_slots, &slots_size, &slots, &gen);
195 if (!rv) {
196 spin_lock(&ls->ls_recover_lock);
197 _set_recover_status(ls, DLM_RS_NODES_ALL);
198 ls->ls_num_slots = num_slots;
199 ls->ls_slots_size = slots_size;
200 ls->ls_slots = slots;
201 ls->ls_generation = gen;
202 spin_unlock(&ls->ls_recover_lock);
203 } else {
204 dlm_set_recover_status(ls, DLM_RS_NODES_ALL);
205 }
206 } else {
207 error = wait_status_low(ls, DLM_RS_NODES_ALL, DLM_RSF_NEED_SLOTS);
208 if (error)
209 goto out;
210
211 dlm_slots_copy_in(ls);
212 }
213 out:
214 return error;
167} 215}
168 216
169int dlm_recover_directory_wait(struct dlm_ls *ls) 217int dlm_recover_directory_wait(struct dlm_ls *ls)
@@ -542,8 +590,6 @@ int dlm_recover_locks(struct dlm_ls *ls)
542 out: 590 out:
543 if (error) 591 if (error)
544 recover_list_clear(ls); 592 recover_list_clear(ls);
545 else
546 dlm_set_recover_status(ls, DLM_RS_LOCKS);
547 return error; 593 return error;
548} 594}
549 595
@@ -715,6 +761,7 @@ void dlm_recover_rsbs(struct dlm_ls *ls)
715 761
716int dlm_create_root_list(struct dlm_ls *ls) 762int dlm_create_root_list(struct dlm_ls *ls)
717{ 763{
764 struct rb_node *n;
718 struct dlm_rsb *r; 765 struct dlm_rsb *r;
719 int i, error = 0; 766 int i, error = 0;
720 767
@@ -727,7 +774,8 @@ int dlm_create_root_list(struct dlm_ls *ls)
727 774
728 for (i = 0; i < ls->ls_rsbtbl_size; i++) { 775 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
729 spin_lock(&ls->ls_rsbtbl[i].lock); 776 spin_lock(&ls->ls_rsbtbl[i].lock);
730 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) { 777 for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) {
778 r = rb_entry(n, struct dlm_rsb, res_hashnode);
731 list_add(&r->res_root_list, &ls->ls_root_list); 779 list_add(&r->res_root_list, &ls->ls_root_list);
732 dlm_hold_rsb(r); 780 dlm_hold_rsb(r);
733 } 781 }
@@ -741,7 +789,8 @@ int dlm_create_root_list(struct dlm_ls *ls)
741 continue; 789 continue;
742 } 790 }
743 791
744 list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) { 792 for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = rb_next(n)) {
793 r = rb_entry(n, struct dlm_rsb, res_hashnode);
745 list_add(&r->res_root_list, &ls->ls_root_list); 794 list_add(&r->res_root_list, &ls->ls_root_list);
746 dlm_hold_rsb(r); 795 dlm_hold_rsb(r);
747 } 796 }
@@ -771,16 +820,18 @@ void dlm_release_root_list(struct dlm_ls *ls)
771 820
772void dlm_clear_toss_list(struct dlm_ls *ls) 821void dlm_clear_toss_list(struct dlm_ls *ls)
773{ 822{
774 struct dlm_rsb *r, *safe; 823 struct rb_node *n, *next;
824 struct dlm_rsb *rsb;
775 int i; 825 int i;
776 826
777 for (i = 0; i < ls->ls_rsbtbl_size; i++) { 827 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
778 spin_lock(&ls->ls_rsbtbl[i].lock); 828 spin_lock(&ls->ls_rsbtbl[i].lock);
779 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss, 829 for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = next) {
780 res_hashchain) { 830 next = rb_next(n);
781 if (dlm_no_directory(ls) || !is_master(r)) { 831 rsb = rb_entry(n, struct dlm_rsb, res_hashnode);
782 list_del(&r->res_hashchain); 832 if (dlm_no_directory(ls) || !is_master(rsb)) {
783 dlm_free_rsb(r); 833 rb_erase(n, &ls->ls_rsbtbl[i].toss);
834 dlm_free_rsb(rsb);
784 } 835 }
785 } 836 }
786 spin_unlock(&ls->ls_rsbtbl[i].lock); 837 spin_unlock(&ls->ls_rsbtbl[i].lock);
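
The toss-list walk is the standard safe-erase pattern for rbtrees: rb_next() must be fetched before rb_erase() unlinks the node. The same pattern in isolation, as a generic kernel-side sketch using the stock <linux/rbtree.h> API:

	static void prune(struct rb_root *root,
			  bool (*dead)(struct rb_node *n))
	{
		struct rb_node *n, *next;

		for (n = rb_first(root); n; n = next) {
			next = rb_next(n);	/* fetch before erasing */
			if (dead(n))
				rb_erase(n, root);
		}
	}
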
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 774da3cf92c6..3780caf7ae0c 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -54,7 +54,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
54 unsigned long start; 54 unsigned long start;
55 int error, neg = 0; 55 int error, neg = 0;
56 56
57 log_debug(ls, "recover %llx", (unsigned long long)rv->seq); 57 log_debug(ls, "dlm_recover %llx", (unsigned long long)rv->seq);
58 58
59 mutex_lock(&ls->ls_recoverd_active); 59 mutex_lock(&ls->ls_recoverd_active);
60 60
@@ -76,14 +76,22 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
76 76
77 /* 77 /*
78 * Add or remove nodes from the lockspace's ls_nodes list. 78 * Add or remove nodes from the lockspace's ls_nodes list.
79 * Also waits for all nodes to complete dlm_recover_members.
80 */ 79 */
81 80
82 error = dlm_recover_members(ls, rv, &neg); 81 error = dlm_recover_members(ls, rv, &neg);
83 if (error) { 82 if (error) {
84 log_debug(ls, "recover_members failed %d", error); 83 log_debug(ls, "dlm_recover_members error %d", error);
85 goto fail; 84 goto fail;
86 } 85 }
86
87 dlm_set_recover_status(ls, DLM_RS_NODES);
88
89 error = dlm_recover_members_wait(ls);
90 if (error) {
91 log_debug(ls, "dlm_recover_members_wait error %d", error);
92 goto fail;
93 }
94
87 start = jiffies; 95 start = jiffies;
88 96
89 /* 97 /*
@@ -93,17 +101,15 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
93 101
94 error = dlm_recover_directory(ls); 102 error = dlm_recover_directory(ls);
95 if (error) { 103 if (error) {
96 log_debug(ls, "recover_directory failed %d", error); 104 log_debug(ls, "dlm_recover_directory error %d", error);
97 goto fail; 105 goto fail;
98 } 106 }
99 107
100 /* 108 dlm_set_recover_status(ls, DLM_RS_DIR);
101 * Wait for all nodes to complete directory rebuild.
102 */
103 109
104 error = dlm_recover_directory_wait(ls); 110 error = dlm_recover_directory_wait(ls);
105 if (error) { 111 if (error) {
106 log_debug(ls, "recover_directory_wait failed %d", error); 112 log_debug(ls, "dlm_recover_directory_wait error %d", error);
107 goto fail; 113 goto fail;
108 } 114 }
109 115
@@ -133,7 +139,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
133 139
134 error = dlm_recover_masters(ls); 140 error = dlm_recover_masters(ls);
135 if (error) { 141 if (error) {
136 log_debug(ls, "recover_masters failed %d", error); 142 log_debug(ls, "dlm_recover_masters error %d", error);
137 goto fail; 143 goto fail;
138 } 144 }
139 145
@@ -143,13 +149,15 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
143 149
144 error = dlm_recover_locks(ls); 150 error = dlm_recover_locks(ls);
145 if (error) { 151 if (error) {
146 log_debug(ls, "recover_locks failed %d", error); 152 log_debug(ls, "dlm_recover_locks error %d", error);
147 goto fail; 153 goto fail;
148 } 154 }
149 155
156 dlm_set_recover_status(ls, DLM_RS_LOCKS);
157
150 error = dlm_recover_locks_wait(ls); 158 error = dlm_recover_locks_wait(ls);
151 if (error) { 159 if (error) {
152 log_debug(ls, "recover_locks_wait failed %d", error); 160 log_debug(ls, "dlm_recover_locks_wait error %d", error);
153 goto fail; 161 goto fail;
154 } 162 }
155 163
@@ -170,7 +178,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
170 178
171 error = dlm_recover_locks_wait(ls); 179 error = dlm_recover_locks_wait(ls);
172 if (error) { 180 if (error) {
173 log_debug(ls, "recover_locks_wait failed %d", error); 181 log_debug(ls, "dlm_recover_locks_wait error %d", error);
174 goto fail; 182 goto fail;
175 } 183 }
176 } 184 }
@@ -186,9 +194,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
186 dlm_purge_requestqueue(ls); 194 dlm_purge_requestqueue(ls);
187 195
188 dlm_set_recover_status(ls, DLM_RS_DONE); 196 dlm_set_recover_status(ls, DLM_RS_DONE);
197
189 error = dlm_recover_done_wait(ls); 198 error = dlm_recover_done_wait(ls);
190 if (error) { 199 if (error) {
191 log_debug(ls, "recover_done_wait failed %d", error); 200 log_debug(ls, "dlm_recover_done_wait error %d", error);
192 goto fail; 201 goto fail;
193 } 202 }
194 203
@@ -200,34 +209,35 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
200 209
201 error = enable_locking(ls, rv->seq); 210 error = enable_locking(ls, rv->seq);
202 if (error) { 211 if (error) {
203 log_debug(ls, "enable_locking failed %d", error); 212 log_debug(ls, "enable_locking error %d", error);
204 goto fail; 213 goto fail;
205 } 214 }
206 215
207 error = dlm_process_requestqueue(ls); 216 error = dlm_process_requestqueue(ls);
208 if (error) { 217 if (error) {
209 log_debug(ls, "process_requestqueue failed %d", error); 218 log_debug(ls, "dlm_process_requestqueue error %d", error);
210 goto fail; 219 goto fail;
211 } 220 }
212 221
213 error = dlm_recover_waiters_post(ls); 222 error = dlm_recover_waiters_post(ls);
214 if (error) { 223 if (error) {
215 log_debug(ls, "recover_waiters_post failed %d", error); 224 log_debug(ls, "dlm_recover_waiters_post error %d", error);
216 goto fail; 225 goto fail;
217 } 226 }
218 227
219 dlm_grant_after_purge(ls); 228 dlm_grant_after_purge(ls);
220 229
221 log_debug(ls, "recover %llx done: %u ms", 230 log_debug(ls, "dlm_recover %llx generation %u done: %u ms",
222 (unsigned long long)rv->seq, 231 (unsigned long long)rv->seq, ls->ls_generation,
223 jiffies_to_msecs(jiffies - start)); 232 jiffies_to_msecs(jiffies - start));
224 mutex_unlock(&ls->ls_recoverd_active); 233 mutex_unlock(&ls->ls_recoverd_active);
225 234
235 dlm_lsop_recover_done(ls);
226 return 0; 236 return 0;
227 237
228 fail: 238 fail:
229 dlm_release_root_list(ls); 239 dlm_release_root_list(ls);
230 log_debug(ls, "recover %llx error %d", 240 log_debug(ls, "dlm_recover %llx error %d",
231 (unsigned long long)rv->seq, error); 241 (unsigned long long)rv->seq, error);
232 mutex_unlock(&ls->ls_recoverd_active); 242 mutex_unlock(&ls->ls_recoverd_active);
233 return error; 243 return error;
@@ -250,8 +260,7 @@ static void do_ls_recovery(struct dlm_ls *ls)
250 260
251 if (rv) { 261 if (rv) {
252 ls_recover(ls, rv); 262 ls_recover(ls, rv);
253 kfree(rv->nodeids); 263 kfree(rv->nodes);
254 kfree(rv->new);
255 kfree(rv); 264 kfree(rv);
256 } 265 }
257} 266}
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index d8ea60756403..eb4ed9ba3098 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -392,8 +392,9 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
392 if (!capable(CAP_SYS_ADMIN)) 392 if (!capable(CAP_SYS_ADMIN))
393 return -EPERM; 393 return -EPERM;
394 394
395 error = dlm_new_lockspace(params->name, strlen(params->name), 395 error = dlm_new_lockspace(params->name, NULL, params->flags,
396 &lockspace, params->flags, DLM_USER_LVB_LEN); 396 DLM_USER_LVB_LEN, NULL, NULL, NULL,
397 &lockspace);
397 if (error) 398 if (error)
398 return error; 399 return error;
399 400
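
The userspace device path passes NULL for all of the new arguments, which pins down the extended signature: name, cluster name, flags, lvblen, ops, ops_arg, ops_result, and the lockspace handle out-parameter. A kernel caller that does want the slot callbacks would look roughly like this (my_ops/my_arg as sketched earlier, lvblen of 32 arbitrary, and the exact ops_result semantics an assumption):

	dlm_lockspace_t *ls;
	int ops_result, error;

	error = dlm_new_lockspace("myfs", "mycluster", 0, 32,
				  &my_ops, my_arg, &ops_result, &ls);
	if (!error && ops_result < 0)
		; /* lockspace exists, but recovery callbacks unavailable */
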
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 2a834255c75d..63ab24510649 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -417,17 +417,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
417 (unsigned long long)(extent_base + extent_offset), rc); 417 (unsigned long long)(extent_base + extent_offset), rc);
418 goto out; 418 goto out;
419 } 419 }
420 if (unlikely(ecryptfs_verbosity > 0)) {
421 ecryptfs_printk(KERN_DEBUG, "Encrypting extent "
422 "with iv:\n");
423 ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
424 ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
425 "encryption:\n");
426 ecryptfs_dump_hex((char *)
427 (page_address(page)
428 + (extent_offset * crypt_stat->extent_size)),
429 8);
430 }
431 rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0, 420 rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0,
432 page, (extent_offset 421 page, (extent_offset
433 * crypt_stat->extent_size), 422 * crypt_stat->extent_size),
@@ -440,14 +429,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
440 goto out; 429 goto out;
441 } 430 }
442 rc = 0; 431 rc = 0;
443 if (unlikely(ecryptfs_verbosity > 0)) {
444 ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; "
445 "rc = [%d]\n",
446 (unsigned long long)(extent_base + extent_offset), rc);
447 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
448 "encryption:\n");
449 ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8);
450 }
451out: 432out:
452 return rc; 433 return rc;
453} 434}
@@ -543,17 +524,6 @@ static int ecryptfs_decrypt_extent(struct page *page,
543 (unsigned long long)(extent_base + extent_offset), rc); 524 (unsigned long long)(extent_base + extent_offset), rc);
544 goto out; 525 goto out;
545 } 526 }
546 if (unlikely(ecryptfs_verbosity > 0)) {
547 ecryptfs_printk(KERN_DEBUG, "Decrypting extent "
548 "with iv:\n");
549 ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
550 ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
551 "decryption:\n");
552 ecryptfs_dump_hex((char *)
553 (page_address(enc_extent_page)
554 + (extent_offset * crypt_stat->extent_size)),
555 8);
556 }
557 rc = ecryptfs_decrypt_page_offset(crypt_stat, page, 527 rc = ecryptfs_decrypt_page_offset(crypt_stat, page,
558 (extent_offset 528 (extent_offset
559 * crypt_stat->extent_size), 529 * crypt_stat->extent_size),
@@ -567,16 +537,6 @@ static int ecryptfs_decrypt_extent(struct page *page,
567 goto out; 537 goto out;
568 } 538 }
569 rc = 0; 539 rc = 0;
570 if (unlikely(ecryptfs_verbosity > 0)) {
571 ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; "
572 "rc = [%d]\n",
573 (unsigned long long)(extent_base + extent_offset), rc);
574 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
575 "decryption:\n");
576 ecryptfs_dump_hex((char *)(page_address(page)
577 + (extent_offset
578 * crypt_stat->extent_size)), 8);
579 }
580out: 540out:
581 return rc; 541 return rc;
582} 542}
@@ -1590,8 +1550,8 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
1590 */ 1550 */
1591int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) 1551int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1592{ 1552{
1593 int rc = 0; 1553 int rc;
1594 char *page_virt = NULL; 1554 char *page_virt;
1595 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; 1555 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
1596 struct ecryptfs_crypt_stat *crypt_stat = 1556 struct ecryptfs_crypt_stat *crypt_stat =
1597 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; 1557 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
@@ -1616,11 +1576,13 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1616 ecryptfs_dentry, 1576 ecryptfs_dentry,
1617 ECRYPTFS_VALIDATE_HEADER_SIZE); 1577 ECRYPTFS_VALIDATE_HEADER_SIZE);
1618 if (rc) { 1578 if (rc) {
1579 /* metadata is not in the file header, so try xattrs */
1619 memset(page_virt, 0, PAGE_CACHE_SIZE); 1580 memset(page_virt, 0, PAGE_CACHE_SIZE);
1620 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); 1581 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
1621 if (rc) { 1582 if (rc) {
1622 printk(KERN_DEBUG "Valid eCryptfs headers not found in " 1583 printk(KERN_DEBUG "Valid eCryptfs headers not found in "
1623 "file header region or xattr region\n"); 1584 "file header region or xattr region, inode %lu\n",
1585 ecryptfs_inode->i_ino);
1624 rc = -EINVAL; 1586 rc = -EINVAL;
1625 goto out; 1587 goto out;
1626 } 1588 }
@@ -1629,7 +1591,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1629 ECRYPTFS_DONT_VALIDATE_HEADER_SIZE); 1591 ECRYPTFS_DONT_VALIDATE_HEADER_SIZE);
1630 if (rc) { 1592 if (rc) {
1631 printk(KERN_DEBUG "Valid eCryptfs headers not found in " 1593 printk(KERN_DEBUG "Valid eCryptfs headers not found in "
1632 "file xattr region either\n"); 1594 "file xattr region either, inode %lu\n",
1595 ecryptfs_inode->i_ino);
1633 rc = -EINVAL; 1596 rc = -EINVAL;
1634 } 1597 }
1635 if (crypt_stat->mount_crypt_stat->flags 1598 if (crypt_stat->mount_crypt_stat->flags
@@ -1640,7 +1603,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1640 "crypto metadata only in the extended attribute " 1603 "crypto metadata only in the extended attribute "
1641 "region, but eCryptfs was mounted without " 1604 "region, but eCryptfs was mounted without "
1642 "xattr support enabled. eCryptfs will not treat " 1605 "xattr support enabled. eCryptfs will not treat "
1643 "this like an encrypted file.\n"); 1606 "this like an encrypted file, inode %lu\n",
1607 ecryptfs_inode->i_ino);
1644 rc = -EINVAL; 1608 rc = -EINVAL;
1645 } 1609 }
1646 } 1610 }
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index a9f29b12fbf2..a2362df58ae8 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -151,6 +151,11 @@ ecryptfs_get_key_payload_data(struct key *key)
151 * dentry name */ 151 * dentry name */
152#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as 152#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as
153 * metadata */ 153 * metadata */
154#define ECRYPTFS_MIN_PKT_LEN_SIZE 1 /* Min size to specify packet length */
155#define ECRYPTFS_MAX_PKT_LEN_SIZE 2 /* Pass at least this many bytes to
156 * ecryptfs_parse_packet_length() and
157 * ecryptfs_write_packet_length()
158 */
154/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >= 159/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >=
155 * ECRYPTFS_MAX_IV_BYTES */ 160 * ECRYPTFS_MAX_IV_BYTES */
156#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16 161#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 19a8ca4ab1dd..19892d7d2ed1 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -822,18 +822,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
822 size_t num_zeros = (PAGE_CACHE_SIZE 822 size_t num_zeros = (PAGE_CACHE_SIZE
823 - (ia->ia_size & ~PAGE_CACHE_MASK)); 823 - (ia->ia_size & ~PAGE_CACHE_MASK));
824 824
825
826 /*
827 * XXX(truncate) this should really happen at the begginning
828 * of ->setattr. But the code is too messy to that as part
829 * of a larger patch. ecryptfs is also totally missing out
830 * on the inode_change_ok check at the beginning of
831 * ->setattr while would include this.
832 */
833 rc = inode_newsize_ok(inode, ia->ia_size);
834 if (rc)
835 goto out;
836
837 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 825 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
838 truncate_setsize(inode, ia->ia_size); 826 truncate_setsize(inode, ia->ia_size);
839 lower_ia->ia_size = ia->ia_size; 827 lower_ia->ia_size = ia->ia_size;
@@ -883,6 +871,28 @@ out:
883 return rc; 871 return rc;
884} 872}
885 873
874static int ecryptfs_inode_newsize_ok(struct inode *inode, loff_t offset)
875{
876 struct ecryptfs_crypt_stat *crypt_stat;
877 loff_t lower_oldsize, lower_newsize;
878
879 crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
880 lower_oldsize = upper_size_to_lower_size(crypt_stat,
881 i_size_read(inode));
882 lower_newsize = upper_size_to_lower_size(crypt_stat, offset);
883 if (lower_newsize > lower_oldsize) {
884 /*
885 * The eCryptfs inode and the new *lower* size are mixed here
886 * because we may not have the lower i_mutex held and/or it may
887 * not be appropriate to call inode_newsize_ok() with inodes
888 * from other filesystems.
889 */
890 return inode_newsize_ok(inode, lower_newsize);
891 }
892
893 return 0;
894}
895
886/** 896/**
887 * ecryptfs_truncate 897 * ecryptfs_truncate
888 * @dentry: The ecryptfs layer dentry 898 * @dentry: The ecryptfs layer dentry
@@ -899,6 +909,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
899 struct iattr lower_ia = { .ia_valid = 0 }; 909 struct iattr lower_ia = { .ia_valid = 0 };
900 int rc; 910 int rc;
901 911
912 rc = ecryptfs_inode_newsize_ok(dentry->d_inode, new_length);
913 if (rc)
914 return rc;
915
902 rc = truncate_upper(dentry, &ia, &lower_ia); 916 rc = truncate_upper(dentry, &ia, &lower_ia);
903 if (!rc && lower_ia.ia_valid & ATTR_SIZE) { 917 if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
904 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); 918 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
@@ -978,6 +992,16 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
978 } 992 }
979 } 993 }
980 mutex_unlock(&crypt_stat->cs_mutex); 994 mutex_unlock(&crypt_stat->cs_mutex);
995
996 rc = inode_change_ok(inode, ia);
997 if (rc)
998 goto out;
999 if (ia->ia_valid & ATTR_SIZE) {
1000 rc = ecryptfs_inode_newsize_ok(inode, ia->ia_size);
1001 if (rc)
1002 goto out;
1003 }
1004
981 if (S_ISREG(inode->i_mode)) { 1005 if (S_ISREG(inode->i_mode)) {
982 rc = filemap_write_and_wait(inode->i_mapping); 1006 rc = filemap_write_and_wait(inode->i_mapping);
983 if (rc) 1007 if (rc)
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index ac1ad48c2376..8e3b943e330f 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -109,7 +109,7 @@ int ecryptfs_parse_packet_length(unsigned char *data, size_t *size,
109 (*size) += ((unsigned char)(data[1]) + 192); 109 (*size) += ((unsigned char)(data[1]) + 192);
110 (*length_size) = 2; 110 (*length_size) = 2;
111 } else if (data[0] == 255) { 111 } else if (data[0] == 255) {
112 /* Five-byte length; we're not supposed to see this */ 112 /* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */
113 ecryptfs_printk(KERN_ERR, "Five-byte packet length not " 113 ecryptfs_printk(KERN_ERR, "Five-byte packet length not "
114 "supported\n"); 114 "supported\n");
115 rc = -EINVAL; 115 rc = -EINVAL;
@@ -126,7 +126,7 @@ out:
126/** 126/**
127 * ecryptfs_write_packet_length 127 * ecryptfs_write_packet_length
128 * @dest: The byte array target into which to write the length. Must 128 * @dest: The byte array target into which to write the length. Must
129 * have at least 5 bytes allocated. 129 * have at least ECRYPTFS_MAX_PKT_LEN_SIZE bytes allocated.
130 * @size: The length to write. 130 * @size: The length to write.
131 * @packet_size_length: The number of bytes used to encode the packet 131 * @packet_size_length: The number of bytes used to encode the packet
132 * length is written to this address. 132 * length is written to this address.
@@ -146,6 +146,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size,
146 dest[1] = ((size - 192) % 256); 146 dest[1] = ((size - 192) % 256);
147 (*packet_size_length) = 2; 147 (*packet_size_length) = 2;
148 } else { 148 } else {
149 /* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */
149 rc = -EINVAL; 150 rc = -EINVAL;
150 ecryptfs_printk(KERN_WARNING, 151 ecryptfs_printk(KERN_WARNING,
151 "Unsupported packet size: [%zd]\n", size); 152 "Unsupported packet size: [%zd]\n", size);
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 940a82e63dc3..349209dc6a91 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -218,6 +218,29 @@ out_unlock:
218 return rc; 218 return rc;
219} 219}
220 220
221/*
222 * miscdevfs packet format:
223 * Octet 0: Type
224 * Octets 1-4: network byte order msg_ctx->counter
225 * Octets 5-N0: Size of struct ecryptfs_message to follow
226 * Octets N0-N1: struct ecryptfs_message (including data)
227 *
228 * Octets 5-N1 not written if the packet type does not include a message
229 */
230#define PKT_TYPE_SIZE 1
231#define PKT_CTR_SIZE 4
232#define MIN_NON_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE)
233#define MIN_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE \
234 + ECRYPTFS_MIN_PKT_LEN_SIZE)
235/* 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES comes from tag 65 packet format */
236#define MAX_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE \
237 + ECRYPTFS_MAX_PKT_LEN_SIZE \
238 + sizeof(struct ecryptfs_message) \
239 + 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES)
240#define PKT_TYPE_OFFSET 0
241#define PKT_CTR_OFFSET PKT_TYPE_SIZE
242#define PKT_LEN_OFFSET (PKT_TYPE_SIZE + PKT_CTR_SIZE)
243
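
With the format spelled out once, the offsets become named constants instead of bare 1s and 4s scattered through read and write. From userspace, the daemon side of the protocol would assemble the same fixed header; a hypothetical helper (the daemon itself is not part of this diff):

	#include <arpa/inet.h>
	#include <stdint.h>
	#include <string.h>

	static size_t build_pkt_header(unsigned char *buf, uint8_t type,
				       uint32_t counter)
	{
		uint32_t ctr_nbo = htonl(counter); /* octets 1-4, big-endian */

		buf[0] = type;                     /* octet 0 */
		memcpy(&buf[1], &ctr_nbo, sizeof(ctr_nbo));
		return 5;                          /* == PKT_LEN_OFFSET */
	}
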
221/** 244/**
222 * ecryptfs_miscdev_read - format and send message from queue 245 * ecryptfs_miscdev_read - format and send message from queue
223 * @file: fs/ecryptfs/euid miscdevfs handle (ignored) 246 * @file: fs/ecryptfs/euid miscdevfs handle (ignored)
@@ -237,7 +260,7 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count,
237 struct ecryptfs_daemon *daemon; 260 struct ecryptfs_daemon *daemon;
238 struct ecryptfs_msg_ctx *msg_ctx; 261 struct ecryptfs_msg_ctx *msg_ctx;
239 size_t packet_length_size; 262 size_t packet_length_size;
240 char packet_length[3]; 263 char packet_length[ECRYPTFS_MAX_PKT_LEN_SIZE];
241 size_t i; 264 size_t i;
242 size_t total_length; 265 size_t total_length;
243 uid_t euid = current_euid(); 266 uid_t euid = current_euid();
@@ -305,15 +328,8 @@ check_list:
305 packet_length_size = 0; 328 packet_length_size = 0;
306 msg_ctx->msg_size = 0; 329 msg_ctx->msg_size = 0;
307 } 330 }
308 /* miscdevfs packet format: 331 total_length = (PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_length_size
309 * Octet 0: Type 332 + msg_ctx->msg_size);
310 * Octets 1-4: network byte order msg_ctx->counter
311 * Octets 5-N0: Size of struct ecryptfs_message to follow
312 * Octets N0-N1: struct ecryptfs_message (including data)
313 *
314 * Octets 5-N1 not written if the packet type does not
315 * include a message */
316 total_length = (1 + 4 + packet_length_size + msg_ctx->msg_size);
317 if (count < total_length) { 333 if (count < total_length) {
318 rc = 0; 334 rc = 0;
319 printk(KERN_WARNING "%s: Only given user buffer of " 335 printk(KERN_WARNING "%s: Only given user buffer of "
@@ -324,9 +340,10 @@ check_list:
324 rc = -EFAULT; 340 rc = -EFAULT;
325 if (put_user(msg_ctx->type, buf)) 341 if (put_user(msg_ctx->type, buf))
326 goto out_unlock_msg_ctx; 342 goto out_unlock_msg_ctx;
327 if (put_user(cpu_to_be32(msg_ctx->counter), (__be32 __user *)(buf + 1))) 343 if (put_user(cpu_to_be32(msg_ctx->counter),
344 (__be32 __user *)(&buf[PKT_CTR_OFFSET])))
328 goto out_unlock_msg_ctx; 345 goto out_unlock_msg_ctx;
329 i = 5; 346 i = PKT_TYPE_SIZE + PKT_CTR_SIZE;
330 if (msg_ctx->msg) { 347 if (msg_ctx->msg) {
331 if (copy_to_user(&buf[i], packet_length, packet_length_size)) 348 if (copy_to_user(&buf[i], packet_length, packet_length_size))
332 goto out_unlock_msg_ctx; 349 goto out_unlock_msg_ctx;
@@ -391,12 +408,6 @@ out:
391 * @count: Amount of data in @buf 408 * @count: Amount of data in @buf
392 * @ppos: Pointer to offset in file (ignored) 409 * @ppos: Pointer to offset in file (ignored)
393 * 410 *
394 * miscdevfs packet format:
395 * Octet 0: Type
396 * Octets 1-4: network byte order msg_ctx->counter (0's for non-response)
397 * Octets 5-N0: Size of struct ecryptfs_message to follow
398 * Octets N0-N1: struct ecryptfs_message (including data)
399 *
400 * Returns the number of bytes read from @buf 411 * Returns the number of bytes read from @buf
401 */ 412 */
402static ssize_t 413static ssize_t
@@ -405,60 +416,78 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
405{ 416{
406 __be32 counter_nbo; 417 __be32 counter_nbo;
407 u32 seq; 418 u32 seq;
408 size_t packet_size, packet_size_length, i; 419 size_t packet_size, packet_size_length;
409 ssize_t sz = 0;
410 char *data; 420 char *data;
411 uid_t euid = current_euid(); 421 uid_t euid = current_euid();
412 int rc; 422 unsigned char packet_size_peek[ECRYPTFS_MAX_PKT_LEN_SIZE];
423 ssize_t rc;
413 424
414 if (count == 0) 425 if (count == 0) {
415 goto out; 426 return 0;
427 } else if (count == MIN_NON_MSG_PKT_SIZE) {
428 /* Likely a harmless MSG_HELO or MSG_QUIT - no packet length */
429 goto memdup;
430 } else if (count < MIN_MSG_PKT_SIZE || count > MAX_MSG_PKT_SIZE) {
431 printk(KERN_WARNING "%s: Acceptable packet size range is "
432 "[%d-%lu], but amount of data written is [%zu].",
433 __func__, MIN_MSG_PKT_SIZE, MAX_MSG_PKT_SIZE, count);
434 return -EINVAL;
435 }
436
437 if (copy_from_user(packet_size_peek, &buf[PKT_LEN_OFFSET],
438 sizeof(packet_size_peek))) {
439 printk(KERN_WARNING "%s: Error while inspecting packet size\n",
440 __func__);
441 return -EFAULT;
442 }
416 443
444 rc = ecryptfs_parse_packet_length(packet_size_peek, &packet_size,
445 &packet_size_length);
446 if (rc) {
447 printk(KERN_WARNING "%s: Error parsing packet length; "
448 "rc = [%zd]\n", __func__, rc);
449 return rc;
450 }
451
452 if ((PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_size_length + packet_size)
453 != count) {
454 printk(KERN_WARNING "%s: Invalid packet size [%zu]\n", __func__,
455 packet_size);
456 return -EINVAL;
457 }
458
459memdup:
417 data = memdup_user(buf, count); 460 data = memdup_user(buf, count);
418 if (IS_ERR(data)) { 461 if (IS_ERR(data)) {
419 printk(KERN_ERR "%s: memdup_user returned error [%ld]\n", 462 printk(KERN_ERR "%s: memdup_user returned error [%ld]\n",
420 __func__, PTR_ERR(data)); 463 __func__, PTR_ERR(data));
421 goto out; 464 return PTR_ERR(data);
422 } 465 }
423 sz = count; 466 switch (data[PKT_TYPE_OFFSET]) {
424 i = 0;
425 switch (data[i++]) {
426 case ECRYPTFS_MSG_RESPONSE: 467 case ECRYPTFS_MSG_RESPONSE:
427 if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { 468 if (count < (MIN_MSG_PKT_SIZE
469 + sizeof(struct ecryptfs_message))) {
428 printk(KERN_WARNING "%s: Minimum acceptable packet " 470 printk(KERN_WARNING "%s: Minimum acceptable packet "
429 "size is [%zd], but amount of data written is " 471 "size is [%zd], but amount of data written is "
430 "only [%zd]. Discarding response packet.\n", 472 "only [%zd]. Discarding response packet.\n",
431 __func__, 473 __func__,
432 (1 + 4 + 1 + sizeof(struct ecryptfs_message)), 474 (MIN_MSG_PKT_SIZE
433 count); 475 + sizeof(struct ecryptfs_message)), count);
476 rc = -EINVAL;
434 goto out_free; 477 goto out_free;
435 } 478 }
436 memcpy(&counter_nbo, &data[i], 4); 479 memcpy(&counter_nbo, &data[PKT_CTR_OFFSET], PKT_CTR_SIZE);
437 seq = be32_to_cpu(counter_nbo); 480 seq = be32_to_cpu(counter_nbo);
438 i += 4; 481 rc = ecryptfs_miscdev_response(
439 rc = ecryptfs_parse_packet_length(&data[i], &packet_size, 482 &data[PKT_LEN_OFFSET + packet_size_length],
440 &packet_size_length); 483 packet_size, euid, current_user_ns(),
484 task_pid(current), seq);
441 if (rc) { 485 if (rc) {
442 printk(KERN_WARNING "%s: Error parsing packet length; "
443 "rc = [%d]\n", __func__, rc);
444 goto out_free;
445 }
446 i += packet_size_length;
447 if ((1 + 4 + packet_size_length + packet_size) != count) {
448 printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])"
449 " + packet_size([%zd]))([%zd]) != "
450 "count([%zd]). Invalid packet format.\n",
451 __func__, packet_size_length, packet_size,
452 (1 + packet_size_length + packet_size), count);
453 goto out_free;
454 }
455 rc = ecryptfs_miscdev_response(&data[i], packet_size,
456 euid, current_user_ns(),
457 task_pid(current), seq);
458 if (rc)
459 printk(KERN_WARNING "%s: Failed to deliver miscdev " 486 printk(KERN_WARNING "%s: Failed to deliver miscdev "
460 "response to requesting operation; rc = [%d]\n", 487 "response to requesting operation; rc = [%zd]\n",
461 __func__, rc); 488 __func__, rc);
489 goto out_free;
490 }
462 break; 491 break;
463 case ECRYPTFS_MSG_HELO: 492 case ECRYPTFS_MSG_HELO:
464 case ECRYPTFS_MSG_QUIT: 493 case ECRYPTFS_MSG_QUIT:
@@ -467,12 +496,13 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
467 ecryptfs_printk(KERN_WARNING, "Dropping miscdev " 496 ecryptfs_printk(KERN_WARNING, "Dropping miscdev "
468 "message of unrecognized type [%d]\n", 497 "message of unrecognized type [%d]\n",
469 data[0]); 498 data[0]);
470 break; 499 rc = -EINVAL;
500 goto out_free;
471 } 501 }
502 rc = count;
472out_free: 503out_free:
473 kfree(data); 504 kfree(data);
474out: 505 return rc;
475 return sz;
476} 506}
477 507
478 508
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 6a44148c5fb9..10ec695ccd68 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -57,6 +57,10 @@ struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
57 * @page: Page that is locked before this call is made 57 * @page: Page that is locked before this call is made
58 * 58 *
59 * Returns zero on success; non-zero otherwise 59 * Returns zero on success; non-zero otherwise
60 *
61 * This is where we encrypt the data and pass the encrypted data to
62 * the lower filesystem. In OpenPGP-compatible mode, we operate on
63 * entire underlying packets.
60 */ 64 */
61static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc) 65static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
62{ 66{
@@ -481,10 +485,6 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
481 * @copied: The amount of data copied 485 * @copied: The amount of data copied
482 * @page: The eCryptfs page 486 * @page: The eCryptfs page
483 * @fsdata: The fsdata (unused) 487 * @fsdata: The fsdata (unused)
484 *
485 * This is where we encrypt the data and pass the encrypted data to
486 * the lower filesystem. In OpenPGP-compatible mode, we operate on
487 * entire underlying packets.
488 */ 488 */
489static int ecryptfs_write_end(struct file *file, 489static int ecryptfs_write_end(struct file *file,
490 struct address_space *mapping, 490 struct address_space *mapping,
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 3745f7c2b9c2..5c0106f75775 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -130,13 +130,18 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
130 pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT); 130 pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT);
131 size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK); 131 size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK);
132 size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page); 132 size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page);
133 size_t total_remaining_bytes = ((offset + size) - pos); 133 loff_t total_remaining_bytes = ((offset + size) - pos);
134
135 if (fatal_signal_pending(current)) {
136 rc = -EINTR;
137 break;
138 }
134 139
135 if (num_bytes > total_remaining_bytes) 140 if (num_bytes > total_remaining_bytes)
136 num_bytes = total_remaining_bytes; 141 num_bytes = total_remaining_bytes;
137 if (pos < offset) { 142 if (pos < offset) {
138 /* remaining zeros to write, up to destination offset */ 143 /* remaining zeros to write, up to destination offset */
139 size_t total_remaining_zeros = (offset - pos); 144 loff_t total_remaining_zeros = (offset - pos);
140 145
141 if (num_bytes > total_remaining_zeros) 146 if (num_bytes > total_remaining_zeros)
142 num_bytes = total_remaining_zeros; 147 num_bytes = total_remaining_zeros;
@@ -193,15 +198,19 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
193 } 198 }
194 pos += num_bytes; 199 pos += num_bytes;
195 } 200 }
196 if ((offset + size) > ecryptfs_file_size) { 201 if (pos > ecryptfs_file_size) {
197 i_size_write(ecryptfs_inode, (offset + size)); 202 i_size_write(ecryptfs_inode, pos);
198 if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) { 203 if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) {
199 rc = ecryptfs_write_inode_size_to_metadata( 204 int rc2;
205
206 rc2 = ecryptfs_write_inode_size_to_metadata(
200 ecryptfs_inode); 207 ecryptfs_inode);
201 if (rc) { 208 if (rc2) {
202 printk(KERN_ERR "Problem with " 209 printk(KERN_ERR "Problem with "
203 "ecryptfs_write_inode_size_to_metadata; " 210 "ecryptfs_write_inode_size_to_metadata; "
204 "rc = [%d]\n", rc); 211 "rc = [%d]\n", rc2);
212 if (!rc)
213 rc = rc2;
205 goto out; 214 goto out;
206 } 215 }
207 } 216 }
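
The rc2 dance preserves the first error: a failed metadata update is logged and reported, but only promoted into rc when the data write itself succeeded. The idiom in isolation (write_data/write_metadata are hypothetical stand-ins):

	static int do_both(void)
	{
		int rc = write_data();		/* primary result */
		int rc2 = write_metadata();	/* secondary step */

		if (rc2 && !rc)
			rc = rc2;	/* report it without clobbering rc */
		return rc;
	}
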
@@ -273,76 +282,3 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
273 flush_dcache_page(page_for_ecryptfs); 282 flush_dcache_page(page_for_ecryptfs);
274 return rc; 283 return rc;
275} 284}
276
277#if 0
278/**
279 * ecryptfs_read
280 * @data: The virtual address into which to write the data read (and
281 * possibly decrypted) from the lower file
282 * @offset: The offset in the decrypted view of the file from which to
283 * read into @data
284 * @size: The number of bytes to read into @data
285 * @ecryptfs_file: The eCryptfs file from which to read
286 *
287 * Read an arbitrary amount of data from an arbitrary location in the
288 * eCryptfs page cache. This is done on an extent-by-extent basis;
289 * individual extents are decrypted and read from the lower page
290 * cache (via VFS reads). This function takes care of all the
291 * address translation to locations in the lower filesystem.
292 *
293 * Returns zero on success; non-zero otherwise
294 */
295int ecryptfs_read(char *data, loff_t offset, size_t size,
296 struct file *ecryptfs_file)
297{
298 struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
299 struct page *ecryptfs_page;
300 char *ecryptfs_page_virt;
301 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
302 loff_t data_offset = 0;
303 loff_t pos;
304 int rc = 0;
305
306 if ((offset + size) > ecryptfs_file_size) {
307 rc = -EINVAL;
308 printk(KERN_ERR "%s: Attempt to read data past the end of the "
309 "file; offset = [%lld]; size = [%td]; "
310 "ecryptfs_file_size = [%lld]\n",
311 __func__, offset, size, ecryptfs_file_size);
312 goto out;
313 }
314 pos = offset;
315 while (pos < (offset + size)) {
316 pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT);
317 size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK);
318 size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page);
319 size_t total_remaining_bytes = ((offset + size) - pos);
320
321 if (num_bytes > total_remaining_bytes)
322 num_bytes = total_remaining_bytes;
323 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
324 ecryptfs_page_idx);
325 if (IS_ERR(ecryptfs_page)) {
326 rc = PTR_ERR(ecryptfs_page);
327 printk(KERN_ERR "%s: Error getting page at "
328 "index [%ld] from eCryptfs inode "
329 "mapping; rc = [%d]\n", __func__,
330 ecryptfs_page_idx, rc);
331 goto out;
332 }
333 ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0);
334 memcpy((data + data_offset),
335 ((char *)ecryptfs_page_virt + start_offset_in_page),
336 num_bytes);
337 kunmap_atomic(ecryptfs_page_virt, KM_USER0);
338 flush_dcache_page(ecryptfs_page);
339 SetPageUptodate(ecryptfs_page);
340 unlock_page(ecryptfs_page);
341 page_cache_release(ecryptfs_page);
342 pos += num_bytes;
343 data_offset += num_bytes;
344 }
345out:
346 return rc;
347}
348#endif /* 0 */
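
The hunk above makes three related fixes to ecryptfs_write(): the remaining-byte counts become loff_t so the arithmetic cannot be truncated through a 32-bit size_t, the copy loop bails out with -EINTR when a fatal signal is pending, and the inode size is updated from pos (what was actually written) rather than offset + size. A minimal userspace sketch of the same loop shape follows; write_with_hole and its arguments are hypothetical, and the in-kernel fatal_signal_pending() bailout has no analogue here.

#include <stdint.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Copy "size" bytes to "offset", zero-filling the hole from "pos" up to
 * "offset" first. All remaining-byte math uses int64_t (mirroring loff_t)
 * so "(offset + size) - pos" cannot be truncated on a 32-bit build.
 */
static void write_with_hole(unsigned char *dst, const unsigned char *data,
			    int64_t offset, int64_t size, int64_t pos)
{
	while (pos < offset + size) {
		int64_t in_page = pos % PAGE_SIZE;
		int64_t num_bytes = PAGE_SIZE - in_page;
		int64_t remaining = (offset + size) - pos;

		if (num_bytes > remaining)
			num_bytes = remaining;
		if (pos < offset) {
			/* still inside the hole: emit zeros up to offset */
			int64_t zeros = offset - pos;

			if (num_bytes > zeros)
				num_bytes = zeros;
			memset(dst + pos, 0, num_bytes);
		} else {
			memcpy(dst + pos, data + (pos - offset), num_bytes);
		}
		pos += num_bytes;
	}
}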
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 828e750af23a..aabdfc38cf24 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -197,6 +197,12 @@ struct eventpoll {
197 197
198 /* The user that created the eventpoll descriptor */ 198 /* The user that created the eventpoll descriptor */
199 struct user_struct *user; 199 struct user_struct *user;
200
201 struct file *file;
202
203 /* used to optimize loop detection check */
204 int visited;
205 struct list_head visited_list_link;
200}; 206};
201 207
202/* Wait structure used by the poll hooks */ 208/* Wait structure used by the poll hooks */
@@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly;
255/* Slab cache used to allocate "struct eppoll_entry" */ 261/* Slab cache used to allocate "struct eppoll_entry" */
256static struct kmem_cache *pwq_cache __read_mostly; 262static struct kmem_cache *pwq_cache __read_mostly;
257 263
264/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
265static LIST_HEAD(visited_list);
266
267/*
268 * List of files with newly added links, where we may need to limit the number
269 * of emanating paths. Protected by the epmutex.
270 */
271static LIST_HEAD(tfile_check_list);
272
258#ifdef CONFIG_SYSCTL 273#ifdef CONFIG_SYSCTL
259 274
260#include <linux/sysctl.h> 275#include <linux/sysctl.h>
@@ -276,6 +291,12 @@ ctl_table epoll_table[] = {
276}; 291};
277#endif /* CONFIG_SYSCTL */ 292#endif /* CONFIG_SYSCTL */
278 293
294static const struct file_operations eventpoll_fops;
295
296static inline int is_file_epoll(struct file *f)
297{
298 return f->f_op == &eventpoll_fops;
299}
279 300
280/* Setup the structure that is used as key for the RB tree */ 301/* Setup the structure that is used as key for the RB tree */
281static inline void ep_set_ffd(struct epoll_filefd *ffd, 302static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -711,12 +732,6 @@ static const struct file_operations eventpoll_fops = {
711 .llseek = noop_llseek, 732 .llseek = noop_llseek,
712}; 733};
713 734
714/* Fast test to see if the file is an eventpoll file */
715static inline int is_file_epoll(struct file *f)
716{
717 return f->f_op == &eventpoll_fops;
718}
719
720/* 735/*
721 * This is called from eventpoll_release() to unlink files from the eventpoll 736 * This is called from eventpoll_release() to unlink files from the eventpoll
722 * interface. We need to have this facility to cleanup correctly files that are 737 * interface. We need to have this facility to cleanup correctly files that are
@@ -926,6 +941,99 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
926 rb_insert_color(&epi->rbn, &ep->rbr); 941 rb_insert_color(&epi->rbn, &ep->rbr);
927} 942}
928 943
944
945
946#define PATH_ARR_SIZE 5
947/*
948 * These are the number paths of length 1 to 5, that we are allowing to emanate
949 * from a single file of interest. For example, we allow 1000 paths of length
950 * 1, to emanate from each file of interest. This essentially represents the
951 * potential wakeup paths, which need to be limited in order to avoid massive
952 * uncontrolled wakeup storms. The common use case should be a single ep which
953 * is connected to n file sources. In this case each file source has 1 path
954 * of length 1. Thus, the numbers below should be more than sufficient. These
955 * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
956 * and delete can't add additional paths. Protected by the epmutex.
957 */
958static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
959static int path_count[PATH_ARR_SIZE];
960
961static int path_count_inc(int nests)
962{
963 if (++path_count[nests] > path_limits[nests])
964 return -1;
965 return 0;
966}
967
968static void path_count_init(void)
969{
970 int i;
971
972 for (i = 0; i < PATH_ARR_SIZE; i++)
973 path_count[i] = 0;
974}
975
976static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
977{
978 int error = 0;
979 struct file *file = priv;
980 struct file *child_file;
981 struct epitem *epi;
982
983 list_for_each_entry(epi, &file->f_ep_links, fllink) {
984 child_file = epi->ep->file;
985 if (is_file_epoll(child_file)) {
986 if (list_empty(&child_file->f_ep_links)) {
987 if (path_count_inc(call_nests)) {
988 error = -1;
989 break;
990 }
991 } else {
992 error = ep_call_nested(&poll_loop_ncalls,
993 EP_MAX_NESTS,
994 reverse_path_check_proc,
995 child_file, child_file,
996 current);
997 }
998 if (error != 0)
999 break;
1000 } else {
1001 printk(KERN_ERR "reverse_path_check_proc: "
1002 "file is not an ep!\n");
1003 }
1004 }
1005 return error;
1006}
1007
1008/**
1009 * reverse_path_check - The tfile_check_list is a list of file *, which have
1010 * links that are proposed to be newly added. We need to
1011 * make sure that those added links don't add too many
1012 * paths such that we will spend all our time waking up
1013 * eventpoll objects.
1014 *
1015 * Returns: Returns zero if the proposed links don't create too many paths,
1016 * -1 otherwise.
1017 */
1018static int reverse_path_check(void)
1019{
1020 int length = 0;
1021 int error = 0;
1022 struct file *current_file;
1023
1024 /* let's call this for all tfiles */
1025 list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
1026 length++;
1027 path_count_init();
1028 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1029 reverse_path_check_proc, current_file,
1030 current_file, current);
1031 if (error)
1032 break;
1033 }
1034 return error;
1035}
1036
929/* 1037/*
930 * Must be called with "mtx" held. 1038 * Must be called with "mtx" held.
931 */ 1039 */
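
A small, runnable demonstration of the counting scheme above: path_count_inc() is called once per discovered wakeup path at nesting depth call_nests, and the add fails once the per-depth cap in path_limits[] is exceeded. The main() driver here is a hypothetical illustration, not kernel code.

#include <stdio.h>

#define PATH_ARR_SIZE 5
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
static int path_count[PATH_ARR_SIZE];

static int path_count_inc(int nests)
{
	if (++path_count[nests] > path_limits[nests])
		return -1;
	return 0;
}

int main(void)
{
	int n;

	/*
	 * Simulate 60 distinct wakeup paths of length 4 (index 3) reaching
	 * one target file: the 51st exceeds path_limits[3] == 50, and the
	 * EPOLL_CTL_ADD creating it would be rejected.
	 */
	for (n = 1; n <= 60; n++) {
		if (path_count_inc(3)) {
			printf("path %d rejected at depth 4\n", n);
			break;
		}
	}
	return 0;
}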
@@ -987,6 +1095,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
987 */ 1095 */
988 ep_rbtree_insert(ep, epi); 1096 ep_rbtree_insert(ep, epi);
989 1097
1098 /* now check if we've created too many backpaths */
1099 error = -EINVAL;
1100 if (reverse_path_check())
1101 goto error_remove_epi;
1102
990 /* We have to drop the new item inside our item list to keep track of it */ 1103 /* We have to drop the new item inside our item list to keep track of it */
991 spin_lock_irqsave(&ep->lock, flags); 1104 spin_lock_irqsave(&ep->lock, flags);
992 1105
@@ -1011,6 +1124,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
1011 1124
1012 return 0; 1125 return 0;
1013 1126
1127error_remove_epi:
1128 spin_lock(&tfile->f_lock);
1129 if (ep_is_linked(&epi->fllink))
1130 list_del_init(&epi->fllink);
1131 spin_unlock(&tfile->f_lock);
1132
1133 rb_erase(&epi->rbn, &ep->rbr);
1134
1014error_unregister: 1135error_unregister:
1015 ep_unregister_pollwait(ep, epi); 1136 ep_unregister_pollwait(ep, epi);
1016 1137
@@ -1275,18 +1396,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1275 int error = 0; 1396 int error = 0;
1276 struct file *file = priv; 1397 struct file *file = priv;
1277 struct eventpoll *ep = file->private_data; 1398 struct eventpoll *ep = file->private_data;
1399 struct eventpoll *ep_tovisit;
1278 struct rb_node *rbp; 1400 struct rb_node *rbp;
1279 struct epitem *epi; 1401 struct epitem *epi;
1280 1402
1281 mutex_lock_nested(&ep->mtx, call_nests + 1); 1403 mutex_lock_nested(&ep->mtx, call_nests + 1);
1404 ep->visited = 1;
1405 list_add(&ep->visited_list_link, &visited_list);
1282 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { 1406 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1283 epi = rb_entry(rbp, struct epitem, rbn); 1407 epi = rb_entry(rbp, struct epitem, rbn);
1284 if (unlikely(is_file_epoll(epi->ffd.file))) { 1408 if (unlikely(is_file_epoll(epi->ffd.file))) {
1409 ep_tovisit = epi->ffd.file->private_data;
1410 if (ep_tovisit->visited)
1411 continue;
1285 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, 1412 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1286 ep_loop_check_proc, epi->ffd.file, 1413 ep_loop_check_proc, epi->ffd.file,
1287 epi->ffd.file->private_data, current); 1414 ep_tovisit, current);
1288 if (error != 0) 1415 if (error != 0)
1289 break; 1416 break;
1417 } else {
1418 /*
1419 * If we've reached a file that is not associated with
1420 * an ep, then we need to check if the newly added
1421 * links are going to add too many wakeup paths. We do
1422 * this by adding it to the tfile_check_list, if it's
1423 * not already there, and calling reverse_path_check()
1424 * during ep_insert().
1425 */
1426 if (list_empty(&epi->ffd.file->f_tfile_llink))
1427 list_add(&epi->ffd.file->f_tfile_llink,
1428 &tfile_check_list);
1290 } 1429 }
1291 } 1430 }
1292 mutex_unlock(&ep->mtx); 1431 mutex_unlock(&ep->mtx);
@@ -1307,8 +1446,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1307 */ 1446 */
1308static int ep_loop_check(struct eventpoll *ep, struct file *file) 1447static int ep_loop_check(struct eventpoll *ep, struct file *file)
1309{ 1448{
1310 return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, 1449 int ret;
1450 struct eventpoll *ep_cur, *ep_next;
1451
1452 ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1311 ep_loop_check_proc, file, ep, current); 1453 ep_loop_check_proc, file, ep, current);
1454 /* clear visited list */
1455 list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
1456 visited_list_link) {
1457 ep_cur->visited = 0;
1458 list_del(&ep_cur->visited_list_link);
1459 }
1460 return ret;
1461}
1462
1463static void clear_tfile_check_list(void)
1464{
1465 struct file *file;
1466
1467 /* first clear the tfile_check_list */
1468 while (!list_empty(&tfile_check_list)) {
1469 file = list_first_entry(&tfile_check_list, struct file,
1470 f_tfile_llink);
1471 list_del_init(&file->f_tfile_llink);
1472 }
1473 INIT_LIST_HEAD(&tfile_check_list);
1312} 1474}
1313 1475
1314/* 1476/*
@@ -1316,8 +1478,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
1316 */ 1478 */
1317SYSCALL_DEFINE1(epoll_create1, int, flags) 1479SYSCALL_DEFINE1(epoll_create1, int, flags)
1318{ 1480{
1319 int error; 1481 int error, fd;
1320 struct eventpoll *ep = NULL; 1482 struct eventpoll *ep = NULL;
1483 struct file *file;
1321 1484
1322 /* Check the EPOLL_* constant for consistency. */ 1485 /* Check the EPOLL_* constant for consistency. */
1323 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); 1486 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
@@ -1334,11 +1497,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
1334 * Creates all the items needed to setup an eventpoll file. That is, 1497 * Creates all the items needed to setup an eventpoll file. That is,
1335 * a file structure and a free file descriptor. 1498 * a file structure and a free file descriptor.
1336 */ 1499 */
1337 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, 1500 fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
1501 if (fd < 0) {
1502 error = fd;
1503 goto out_free_ep;
1504 }
1505 file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
1338 O_RDWR | (flags & O_CLOEXEC)); 1506 O_RDWR | (flags & O_CLOEXEC));
1339 if (error < 0) 1507 if (IS_ERR(file)) {
1340 ep_free(ep); 1508 error = PTR_ERR(file);
1341 1509 goto out_free_fd;
1510 }
1511 fd_install(fd, file);
1512 ep->file = file;
1513 return fd;
1514
1515out_free_fd:
1516 put_unused_fd(fd);
1517out_free_ep:
1518 ep_free(ep);
1342 return error; 1519 return error;
1343} 1520}
1344 1521
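
The rewritten epoll_create1() above splits anon_inode_getfd() into its component steps so that ep->file can be recorded before the descriptor is published. A hedged sketch of that reserve/create/publish idiom, kernel-style and illustrative only (example_create_fd and example_fops are placeholder names, not eventpoll ones):

static const struct file_operations example_fops;

static int example_create_fd(void *priv, int flags)
{
	struct file *file;
	int fd;

	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
	if (fd < 0)
		return fd;
	file = anon_inode_getfile("[example]", &example_fops, priv,
				  O_RDWR | (flags & O_CLOEXEC));
	if (IS_ERR(file)) {
		put_unused_fd(fd);	/* fd was never visible to userspace */
		return PTR_ERR(file);
	}
	/* Any setup that must see the file happens here, before publishing. */
	fd_install(fd, file);		/* point of no return: the fd is live */
	return fd;
}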
@@ -1404,21 +1581,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1404 /* 1581 /*
1405 * When we insert an epoll file descriptor, inside another epoll file 1582 * When we insert an epoll file descriptor, inside another epoll file
1406 * descriptor, there is the chance of creating closed loops, which are 1583 * descriptor, there is the chance of creating closed loops, which are
1407 * better be handled here, than in more critical paths. 1584 * better be handled here, than in more critical paths. While we are
1585 * checking for loops we also determine the list of files reachable
1586 * and hang them on the tfile_check_list, so we can check that we
1587 * haven't created too many possible wakeup paths.
1408 * 1588 *
1409 * We hold epmutex across the loop check and the insert in this case, in 1589 * We need to hold the epmutex across both ep_insert and ep_remove
1410 * order to prevent two separate inserts from racing and each doing the 1590 * because we want to make sure we are looking at a coherent view of
1411 * insert "at the same time" such that ep_loop_check passes on both 1591 * the epoll network.
1412 * before either one does the insert, thereby creating a cycle.
1413 */ 1592 */
1414 if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { 1593 if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
1415 mutex_lock(&epmutex); 1594 mutex_lock(&epmutex);
1416 did_lock_epmutex = 1; 1595 did_lock_epmutex = 1;
1417 error = -ELOOP;
1418 if (ep_loop_check(ep, tfile) != 0)
1419 goto error_tgt_fput;
1420 } 1596 }
1421 1597 if (op == EPOLL_CTL_ADD) {
1598 if (is_file_epoll(tfile)) {
1599 error = -ELOOP;
1600 if (ep_loop_check(ep, tfile) != 0)
1601 goto error_tgt_fput;
1602 } else
1603 list_add(&tfile->f_tfile_llink, &tfile_check_list);
1604 }
1422 1605
1423 mutex_lock_nested(&ep->mtx, 0); 1606 mutex_lock_nested(&ep->mtx, 0);
1424 1607
@@ -1437,6 +1620,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1437 error = ep_insert(ep, &epds, tfile, fd); 1620 error = ep_insert(ep, &epds, tfile, fd);
1438 } else 1621 } else
1439 error = -EEXIST; 1622 error = -EEXIST;
1623 clear_tfile_check_list();
1440 break; 1624 break;
1441 case EPOLL_CTL_DEL: 1625 case EPOLL_CTL_DEL:
1442 if (epi) 1626 if (epi)
@@ -1455,7 +1639,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1455 mutex_unlock(&ep->mtx); 1639 mutex_unlock(&ep->mtx);
1456 1640
1457error_tgt_fput: 1641error_tgt_fput:
1458 if (unlikely(did_lock_epmutex)) 1642 if (did_lock_epmutex)
1459 mutex_unlock(&epmutex); 1643 mutex_unlock(&epmutex);
1460 1644
1461 fput(tfile); 1645 fput(tfile);
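
Seen from userspace, the ep_loop_check() path above is what turns a circular EPOLL_CTL_ADD into -ELOOP. A short, runnable demonstration, assuming a kernel with these checks applied:

#include <errno.h>
#include <stdio.h>
#include <sys/epoll.h>

int main(void)
{
	int a = epoll_create1(0);
	int b = epoll_create1(0);
	struct epoll_event ev = { .events = EPOLLIN };

	/* a watches b: a plain nested epoll, allowed */
	epoll_ctl(a, EPOLL_CTL_ADD, b, &ev);
	/* b watching a would close the cycle a -> b -> a */
	if (epoll_ctl(b, EPOLL_CTL_ADD, a, &ev) < 0 && errno == ELOOP)
		printf("circular add rejected with ELOOP\n");
	return 0;
}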
diff --git a/fs/exec.c b/fs/exec.c
index 3f64b9f26e7d..aeb135c7ff5c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -59,6 +59,8 @@
59#include <asm/uaccess.h> 59#include <asm/uaccess.h>
60#include <asm/mmu_context.h> 60#include <asm/mmu_context.h>
61#include <asm/tlb.h> 61#include <asm/tlb.h>
62
63#include <trace/events/task.h>
62#include "internal.h" 64#include "internal.h"
63 65
64int core_uses_pid; 66int core_uses_pid;
@@ -1054,6 +1056,8 @@ void set_task_comm(struct task_struct *tsk, char *buf)
1054{ 1056{
1055 task_lock(tsk); 1057 task_lock(tsk);
1056 1058
1059 trace_task_rename(tsk, buf);
1060
1057 /* 1061 /*
1058 * Threads may access current->comm without holding 1062 * Threads may access current->comm without holding
1059 * the task lock, so write the string carefully. 1063 * the task lock, so write the string carefully.
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index da42f32c49be..86194b2f799d 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -1,14 +1,3 @@
1# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
2# for every ORE user we do it like this. Any user should add itself here
3# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
4# selected here, and we default to "ON". So in effect it is like been
5# selected by any of the users.
6config ORE
7 tristate
8 depends on EXOFS_FS || PNFS_OBJLAYOUT
9 select ASYNC_XOR
10 default SCSI_OSD_ULD
11
12config EXOFS_FS 1config EXOFS_FS
13 tristate "exofs: OSD based file system support" 2 tristate "exofs: OSD based file system support"
14 depends on SCSI_OSD_ULD 3 depends on SCSI_OSD_ULD
diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore
new file mode 100644
index 000000000000..1ca7fb7b6ba8
--- /dev/null
+++ b/fs/exofs/Kconfig.ore
@@ -0,0 +1,12 @@
1# ORE - Objects Raid Engine (libore.ko)
2#
3# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
4# for every ORE user we do it like this. Any user should add itself here
5# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
6# selected here, and we default to "ON". So in effect it is like been
7# selected by any of the users.
8config ORE
9 tristate
10 depends on EXOFS_FS || PNFS_OBJLAYOUT
11 select ASYNC_XOR
12 default SCSI_OSD_ULD
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index d271ad837202..49cf230554a2 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -266,7 +266,7 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
266 266
267 /* first/last seg is split */ 267 /* first/last seg is split */
268 num_raid_units += layout->group_width; 268 num_raid_units += layout->group_width;
269 sgs_per_dev = div_u64(num_raid_units, data_devs); 269 sgs_per_dev = div_u64(num_raid_units, data_devs) + 2;
270 } else { 270 } else {
271 /* For Writes add parity pages array. */ 271 /* For Writes add parity pages array. */
272 max_par_pages = num_raid_units * pages_in_unit * 272 max_par_pages = num_raid_units * pages_in_unit *
@@ -445,10 +445,10 @@ int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
445 u64 residual = ios->reading ? 445 u64 residual = ios->reading ?
446 or->in.residual : or->out.residual; 446 or->in.residual : or->out.residual;
447 u64 offset = (ios->offset + ios->length) - residual; 447 u64 offset = (ios->offset + ios->length) - residual;
448 struct ore_dev *od = ios->oc->ods[ 448 unsigned dev = per_dev->dev - ios->oc->first_dev;
449 per_dev->dev - ios->oc->first_dev]; 449 struct ore_dev *od = ios->oc->ods[dev];
450 450
451 on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri, 451 on_dev_error(ios, od, dev, osi.osd_err_pri,
452 offset, residual); 452 offset, residual);
453 } 453 }
454 if (osi.osd_err_pri >= acumulated_osd_err) { 454 if (osi.osd_err_pri >= acumulated_osd_err) {
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 29c47e5c4a86..d222c77cfa1b 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -328,8 +328,8 @@ static int _alloc_read_4_write(struct ore_io_state *ios)
328/* @si contains info of the to-be-inserted page. Update of @si should be 328/* @si contains info of the to-be-inserted page. Update of @si should be
329 * maintained by caller. Specifically si->dev, si->obj_offset, ... 329 * maintained by caller. Specifically si->dev, si->obj_offset, ...
330 */ 330 */
331static int _add_to_read_4_write(struct ore_io_state *ios, 331static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si,
332 struct ore_striping_info *si, struct page *page) 332 struct page *page, unsigned pg_len)
333{ 333{
334 struct request_queue *q; 334 struct request_queue *q;
335 struct ore_per_dev_state *per_dev; 335 struct ore_per_dev_state *per_dev;
@@ -366,17 +366,60 @@ static int _add_to_read_4_write(struct ore_io_state *ios,
366 _ore_add_sg_seg(per_dev, gap, true); 366 _ore_add_sg_seg(per_dev, gap, true);
367 } 367 }
368 q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); 368 q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev));
369 added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); 369 added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len,
370 if (unlikely(added_len != PAGE_SIZE)) { 370 si->obj_offset % PAGE_SIZE);
371 if (unlikely(added_len != pg_len)) {
371 ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", 372 ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
372 per_dev->bio->bi_vcnt); 373 per_dev->bio->bi_vcnt);
373 return -ENOMEM; 374 return -ENOMEM;
374 } 375 }
375 376
376 per_dev->length += PAGE_SIZE; 377 per_dev->length += pg_len;
377 return 0; 378 return 0;
378} 379}
379 380
381/* read the beginning of an unaligned first page */
382static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page)
383{
384 struct ore_striping_info si;
385 unsigned pg_len;
386
387 ore_calc_stripe_info(ios->layout, ios->offset, 0, &si);
388
389 pg_len = si.obj_offset % PAGE_SIZE;
390 si.obj_offset -= pg_len;
391
392 ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n",
393 _LLU(si.obj_offset), pg_len, page->index, si.dev);
394
395 return _add_to_r4w(ios, &si, page, pg_len);
396}
397
398/* read the end of an incomplete last page */
399static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset)
400{
401 struct ore_striping_info si;
402 struct page *page;
403 unsigned pg_len, p, c;
404
405 ore_calc_stripe_info(ios->layout, *offset, 0, &si);
406
407 p = si.unit_off / PAGE_SIZE;
408 c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
409 ios->layout->mirrors_p1, si.par_dev, si.dev);
410 page = ios->sp2d->_1p_stripes[p].pages[c];
411
412 pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE);
413 *offset += pg_len;
414
415 ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n",
416 p, c, _LLU(*offset), pg_len, si.dev, si.par_dev);
417
418 BUG_ON(!page);
419
420 return _add_to_r4w(ios, &si, page, pg_len);
421}
422
380static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) 423static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
381{ 424{
382 struct bio_vec *bv; 425 struct bio_vec *bv;
@@ -444,9 +487,13 @@ static int _read_4_write(struct ore_io_state *ios)
444 struct page **pp = &_1ps->pages[c]; 487 struct page **pp = &_1ps->pages[c];
445 bool uptodate; 488 bool uptodate;
446 489
447 if (*pp) 490 if (*pp) {
491 if (ios->offset % PAGE_SIZE)
492 /* Read the remainder of the page */
493 _add_to_r4w_first_page(ios, *pp);
448 /* to-be-written pages start here */ 494 /* to-be-written pages start here */
449 goto read_last_stripe; 495 goto read_last_stripe;
496 }
450 497
451 *pp = ios->r4w->get_page(ios->private, offset, 498 *pp = ios->r4w->get_page(ios->private, offset,
452 &uptodate); 499 &uptodate);
@@ -454,7 +501,7 @@ static int _read_4_write(struct ore_io_state *ios)
454 return -ENOMEM; 501 return -ENOMEM;
455 502
456 if (!uptodate) 503 if (!uptodate)
457 _add_to_read_4_write(ios, &read_si, *pp); 504 _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE);
458 505
459 /* Mark read-pages to be cache_released */ 506 /* Mark read-pages to be cache_released */
460 _1ps->page_is_read[c] = true; 507 _1ps->page_is_read[c] = true;
@@ -465,8 +512,11 @@ static int _read_4_write(struct ore_io_state *ios)
465 } 512 }
466 513
467read_last_stripe: 514read_last_stripe:
468 offset = ios->offset + (ios->length + PAGE_SIZE - 1) / 515 offset = ios->offset + ios->length;
469 PAGE_SIZE * PAGE_SIZE; 516 if (offset % PAGE_SIZE)
517 _add_to_r4w_last_page(ios, &offset);
518 /* offset will be aligned to next page */
519
470 last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) 520 last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe)
471 * bytes_in_stripe; 521 * bytes_in_stripe;
472 if (offset == last_stripe_end) /* Optimize for the aligned case */ 522 if (offset == last_stripe_end) /* Optimize for the aligned case */
@@ -503,7 +553,7 @@ read_last_stripe:
503 /* Mark read-pages to be cache_released */ 553 /* Mark read-pages to be cache_released */
504 _1ps->page_is_read[c] = true; 554 _1ps->page_is_read[c] = true;
505 if (!uptodate) 555 if (!uptodate)
506 _add_to_read_4_write(ios, &read_si, page); 556 _add_to_r4w(ios, &read_si, page, PAGE_SIZE);
507 } 557 }
508 558
509 offset += PAGE_SIZE; 559 offset += PAGE_SIZE;
@@ -551,7 +601,11 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
551 unsigned cur_len) 601 unsigned cur_len)
552{ 602{
553 if (ios->reading) { 603 if (ios->reading) {
554 BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); 604 if (per_dev->cur_sg >= ios->sgs_per_dev) {
605 ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" ,
606 per_dev->cur_sg, ios->sgs_per_dev);
607 return -ENOMEM;
608 }
555 _ore_add_sg_seg(per_dev, cur_len, true); 609 _ore_add_sg_seg(per_dev, cur_len, true);
556 } else { 610 } else {
557 struct __stripe_pages_2d *sp2d = ios->sp2d; 611 struct __stripe_pages_2d *sp2d = ios->sp2d;
@@ -612,8 +666,6 @@ int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
612 return -ENOMEM; 666 return -ENOMEM;
613 } 667 }
614 668
615 BUG_ON(ios->offset % PAGE_SIZE);
616
617 /* Round io down to last full strip */ 669 /* Round io down to last full strip */
618 first_stripe = div_u64(ios->offset, stripe_size); 670 first_stripe = div_u64(ios->offset, stripe_size);
619 last_stripe = div_u64(ios->offset + ios->length, stripe_size); 671 last_stripe = div_u64(ios->offset + ios->length, stripe_size);
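
The _add_to_r4w_first_page()/_add_to_r4w_last_page() helpers above read back only the unaligned fragments of the boundary pages before a parity read-modify-write: the head fragment is offset % PAGE_SIZE bytes, the tail fragment PAGE_SIZE - (offset % PAGE_SIZE). A tiny self-contained check of that arithmetic, with hypothetical I/O bounds:

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 4096

int main(void)
{
	/* hypothetical unaligned I/O covering bytes [5000, 13000) */
	uint64_t start = 5000, end = 13000;

	uint64_t head = start % PAGE_SIZE;             /* 904 bytes before start */
	uint64_t tail = PAGE_SIZE - (end % PAGE_SIZE); /* 3384 bytes after end */

	assert(head == 904);
	assert(tail == 3384);
	/* reading those fragments makes both boundaries page aligned */
	assert((start - head) % PAGE_SIZE == 0);
	assert((end + tail) % PAGE_SIZE == 0);
	return 0;
}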
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 8addfe314dc7..d22cd168c6ee 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -838,6 +838,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
838 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); 838 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
839 if (ret) { 839 if (ret) {
840 EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); 840 EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
841 dput(sb->s_root);
842 sb->s_root = NULL;
841 goto free_sbi; 843 goto free_sbi;
842 } 844 }
843 845
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index cd7f5f424a75..8b15cf8cef37 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -573,8 +573,11 @@ got:
573 inode->i_generation = sbi->s_next_generation++; 573 inode->i_generation = sbi->s_next_generation++;
574 spin_unlock(&sbi->s_next_gen_lock); 574 spin_unlock(&sbi->s_next_gen_lock);
575 if (insert_inode_locked(inode) < 0) { 575 if (insert_inode_locked(inode) < 0) {
576 err = -EINVAL; 576 ext2_error(sb, "ext2_new_inode",
577 goto fail_drop; 577 "inode number already in use - inode=%lu",
578 (unsigned long) ino);
579 err = -EIO;
580 goto fail;
578 } 581 }
579 582
580 dquot_initialize(inode); 583 dquot_initialize(inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 91a6945af6d8..740cad8dcd8d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -26,7 +26,6 @@
26#include <linux/highuid.h> 26#include <linux/highuid.h>
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/quotaops.h> 28#include <linux/quotaops.h>
29#include <linux/module.h>
30#include <linux/writeback.h> 29#include <linux/writeback.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/mpage.h> 31#include <linux/mpage.h>
@@ -36,10 +35,6 @@
36#include "acl.h" 35#include "acl.h"
37#include "xip.h" 36#include "xip.h"
38 37
39MODULE_AUTHOR("Remy Card and others");
40MODULE_DESCRIPTION("Second Extended Filesystem");
41MODULE_LICENSE("GPL");
42
43static int __ext2_write_inode(struct inode *inode, int do_sync); 38static int __ext2_write_inode(struct inode *inode, int do_sync);
44 39
45/* 40/*
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 1089f760c847..2de655f5d625 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -77,10 +77,11 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
77 flags = flags & EXT2_FL_USER_MODIFIABLE; 77 flags = flags & EXT2_FL_USER_MODIFIABLE;
78 flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE; 78 flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE;
79 ei->i_flags = flags; 79 ei->i_flags = flags;
80 mutex_unlock(&inode->i_mutex);
81 80
82 ext2_set_inode_flags(inode); 81 ext2_set_inode_flags(inode);
83 inode->i_ctime = CURRENT_TIME_SEC; 82 inode->i_ctime = CURRENT_TIME_SEC;
83 mutex_unlock(&inode->i_mutex);
84
84 mark_inode_dirty(inode); 85 mark_inode_dirty(inode);
85setflags_out: 86setflags_out:
86 mnt_drop_write_file(filp); 87 mnt_drop_write_file(filp);
@@ -88,20 +89,29 @@ setflags_out:
88 } 89 }
89 case EXT2_IOC_GETVERSION: 90 case EXT2_IOC_GETVERSION:
90 return put_user(inode->i_generation, (int __user *) arg); 91 return put_user(inode->i_generation, (int __user *) arg);
91 case EXT2_IOC_SETVERSION: 92 case EXT2_IOC_SETVERSION: {
93 __u32 generation;
94
92 if (!inode_owner_or_capable(inode)) 95 if (!inode_owner_or_capable(inode))
93 return -EPERM; 96 return -EPERM;
94 ret = mnt_want_write_file(filp); 97 ret = mnt_want_write_file(filp);
95 if (ret) 98 if (ret)
96 return ret; 99 return ret;
97 if (get_user(inode->i_generation, (int __user *) arg)) { 100 if (get_user(generation, (int __user *) arg)) {
98 ret = -EFAULT; 101 ret = -EFAULT;
99 } else { 102 goto setversion_out;
100 inode->i_ctime = CURRENT_TIME_SEC;
101 mark_inode_dirty(inode);
102 } 103 }
104
105 mutex_lock(&inode->i_mutex);
106 inode->i_ctime = CURRENT_TIME_SEC;
107 inode->i_generation = generation;
108 mutex_unlock(&inode->i_mutex);
109
110 mark_inode_dirty(inode);
111setversion_out:
103 mnt_drop_write_file(filp); 112 mnt_drop_write_file(filp);
104 return ret; 113 return ret;
114 }
105 case EXT2_IOC_GETRSVSZ: 115 case EXT2_IOC_GETRSVSZ:
106 if (test_opt(inode->i_sb, RESERVATION) 116 if (test_opt(inode->i_sb, RESERVATION)
107 && S_ISREG(inode->i_mode) 117 && S_ISREG(inode->i_mode)
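
The EXT2_IOC_SETVERSION fix above copies the user-supplied value into a local with get_user() before touching the inode, so a faulting copy can no longer leave i_generation half-updated, and i_ctime and i_generation are then set together under i_mutex. A hedged, kernel-style sketch of the shape, with a hypothetical handler name:

/*
 * Copy in, then lock and commit -- never get_user() straight into a
 * live inode field. Illustrative only.
 */
static long example_setversion(struct inode *inode, __u32 __user *arg)
{
	__u32 generation;

	if (get_user(generation, arg))
		return -EFAULT;		/* inode untouched on a faulting copy */

	mutex_lock(&inode->i_mutex);
	inode->i_ctime = CURRENT_TIME_SEC;
	inode->i_generation = generation;
	mutex_unlock(&inode->i_mutex);

	mark_inode_dirty(inode);
	return 0;
}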
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 9b403f064ce0..0090595beb28 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1520,5 +1520,8 @@ static void __exit exit_ext2_fs(void)
1520 exit_ext2_xattr(); 1520 exit_ext2_xattr();
1521} 1521}
1522 1522
1523MODULE_AUTHOR("Remy Card and others");
1524MODULE_DESCRIPTION("Second Extended Filesystem");
1525MODULE_LICENSE("GPL");
1523module_init(init_ext2_fs) 1526module_init(init_ext2_fs)
1524module_exit(exit_ext2_fs) 1527module_exit(exit_ext2_fs)
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index d27b71f1d183..6dcafc7efdfd 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -54,7 +54,6 @@
54 */ 54 */
55 55
56#include <linux/buffer_head.h> 56#include <linux/buffer_head.h>
57#include <linux/module.h>
58#include <linux/init.h> 57#include <linux/init.h>
59#include <linux/slab.h> 58#include <linux/slab.h>
60#include <linux/mbcache.h> 59#include <linux/mbcache.h>
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index c922adc8ef41..be7a8d02c9a7 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -3,7 +3,6 @@
3 * Handler for storing security labels as extended attributes. 3 * Handler for storing security labels as extended attributes.
4 */ 4 */
5 5
6#include <linux/module.h>
7#include <linux/slab.h> 6#include <linux/slab.h>
8#include <linux/string.h> 7#include <linux/string.h>
9#include <linux/fs.h> 8#include <linux/fs.h>
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 667e46a8d62d..2989467d3595 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -5,7 +5,6 @@
5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> 5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */ 6 */
7 7
8#include <linux/module.h>
9#include <linux/string.h> 8#include <linux/string.h>
10#include <linux/capability.h> 9#include <linux/capability.h>
11#include <linux/fs.h> 10#include <linux/fs.h>
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 099d20f47163..f470e44c4b8d 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -6,7 +6,6 @@
6 */ 6 */
7 7
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/module.h>
10#include <linux/string.h> 9#include <linux/string.h>
11#include "ext2.h" 10#include "ext2.h"
12#include "xattr.h" 11#include "xattr.h"
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 92cc86dfa23d..1cde28438014 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -525,8 +525,12 @@ got:
525 if (IS_DIRSYNC(inode)) 525 if (IS_DIRSYNC(inode))
526 handle->h_sync = 1; 526 handle->h_sync = 1;
527 if (insert_inode_locked(inode) < 0) { 527 if (insert_inode_locked(inode) < 0) {
528 err = -EINVAL; 528 /*
529 goto fail_drop; 529 * Likely a bitmap corruption causing inode to be allocated
530 * twice.
531 */
532 err = -EIO;
533 goto fail;
530 } 534 }
531 spin_lock(&sbi->s_next_gen_lock); 535 spin_lock(&sbi->s_next_gen_lock);
532 inode->i_generation = sbi->s_next_generation++; 536 inode->i_generation = sbi->s_next_generation++;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 15cb47088aac..2d0afeca0b47 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -22,7 +22,6 @@
22 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 22 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23 */ 23 */
24 24
25#include <linux/module.h>
26#include <linux/fs.h> 25#include <linux/fs.h>
27#include <linux/time.h> 26#include <linux/time.h>
28#include <linux/ext3_jbd.h> 27#include <linux/ext3_jbd.h>
@@ -223,8 +222,12 @@ void ext3_evict_inode (struct inode *inode)
223 * 222 *
224 * Note that directories do not have this problem because they don't 223 * Note that directories do not have this problem because they don't
225 * use page cache. 224 * use page cache.
225 *
226 * The s_journal check handles the case when ext3_get_journal() fails
227 * and puts the journal inode.
226 */ 228 */
227 if (inode->i_nlink && ext3_should_journal_data(inode) && 229 if (inode->i_nlink && ext3_should_journal_data(inode) &&
230 EXT3_SB(inode->i_sb)->s_journal &&
228 (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { 231 (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
229 tid_t commit_tid = atomic_read(&ei->i_datasync_tid); 232 tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
230 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; 233 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
@@ -1132,9 +1135,11 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
1132 bh = ext3_getblk(handle, inode, block, create, err); 1135 bh = ext3_getblk(handle, inode, block, create, err);
1133 if (!bh) 1136 if (!bh)
1134 return bh; 1137 return bh;
1135 if (buffer_uptodate(bh)) 1138 if (bh_uptodate_or_lock(bh))
1136 return bh; 1139 return bh;
1137 ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); 1140 get_bh(bh);
1141 bh->b_end_io = end_buffer_read_sync;
1142 submit_bh(READ | REQ_META | REQ_PRIO, bh);
1138 wait_on_buffer(bh); 1143 wait_on_buffer(bh);
1139 if (buffer_uptodate(bh)) 1144 if (buffer_uptodate(bh))
1140 return bh; 1145 return bh;
@@ -1617,7 +1622,13 @@ static int ext3_ordered_writepage(struct page *page,
1617 int err; 1622 int err;
1618 1623
1619 J_ASSERT(PageLocked(page)); 1624 J_ASSERT(PageLocked(page));
1620 WARN_ON_ONCE(IS_RDONLY(inode)); 1625 /*
1626 * We don't want to warn for emergency remount. The condition is
1627 * ordered to avoid dereferencing inode->i_sb in non-error case to
1628 * avoid slow-downs.
1629 */
1630 WARN_ON_ONCE(IS_RDONLY(inode) &&
1631 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
1621 1632
1622 /* 1633 /*
1623 * We give up here if we're reentered, because it might be for a 1634 * We give up here if we're reentered, because it might be for a
@@ -1692,7 +1703,13 @@ static int ext3_writeback_writepage(struct page *page,
1692 int err; 1703 int err;
1693 1704
1694 J_ASSERT(PageLocked(page)); 1705 J_ASSERT(PageLocked(page));
1695 WARN_ON_ONCE(IS_RDONLY(inode)); 1706 /*
1707 * We don't want to warn for emergency remount. The condition is
1708 * ordered to avoid dereferencing inode->i_sb in non-error case to
1709 * avoid slow-downs.
1710 */
1711 WARN_ON_ONCE(IS_RDONLY(inode) &&
1712 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
1696 1713
1697 if (ext3_journal_current_handle()) 1714 if (ext3_journal_current_handle())
1698 goto out_fail; 1715 goto out_fail;
@@ -1735,7 +1752,13 @@ static int ext3_journalled_writepage(struct page *page,
1735 int err; 1752 int err;
1736 1753
1737 J_ASSERT(PageLocked(page)); 1754 J_ASSERT(PageLocked(page));
1738 WARN_ON_ONCE(IS_RDONLY(inode)); 1755 /*
1756 * We don't want to warn for emergency remount. The condition is
1757 * ordered to avoid dereferencing inode->i_sb in non-error case to
1758 * avoid slow-downs.
1759 */
1760 WARN_ON_ONCE(IS_RDONLY(inode) &&
1761 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
1739 1762
1740 if (ext3_journal_current_handle()) 1763 if (ext3_journal_current_handle())
1741 goto no_write; 1764 goto no_write;
@@ -2064,12 +2087,10 @@ static int ext3_block_truncate_page(struct inode *inode, loff_t from)
2064 if (PageUptodate(page)) 2087 if (PageUptodate(page))
2065 set_buffer_uptodate(bh); 2088 set_buffer_uptodate(bh);
2066 2089
2067 if (!buffer_uptodate(bh)) { 2090 if (!bh_uptodate_or_lock(bh)) {
2068 err = -EIO; 2091 err = bh_submit_read(bh);
2069 ll_rw_block(READ, 1, &bh);
2070 wait_on_buffer(bh);
2071 /* Uhhuh. Read error. Complain and punt. */ 2092 /* Uhhuh. Read error. Complain and punt. */
2072 if (!buffer_uptodate(bh)) 2093 if (err)
2073 goto unlock; 2094 goto unlock;
2074 } 2095 }
2075 2096
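
The ext3_bread() and ext3_block_truncate_page() hunks above, like the namei.c and super.c hunks below, replace the ll_rw_block()/wait_on_buffer() pair with bh_uptodate_or_lock() followed by an explicit submit. ll_rw_block() silently skips a buffer it cannot lock, so the old code could wait on a buffer that was never submitted; taking the lock first closes that window. A hedged, kernel-style sketch of the resulting pattern (example_read_bh is a placeholder name):

/*
 * bh_uptodate_or_lock() returns 1 if the buffer is already uptodate;
 * otherwise it returns 0 with the buffer locked, so the submit below
 * can neither be skipped nor raced against another reader.
 */
static int example_read_bh(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 0;
	get_bh(bh);
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(READ, bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return 0;
	return -EIO;	/* read failed; bh_submit_read() behaves the same way */
}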
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 8e37c41a071b..4af574ce4a46 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -134,10 +134,11 @@ flags_out:
134 goto setversion_out; 134 goto setversion_out;
135 } 135 }
136 136
137 mutex_lock(&inode->i_mutex);
137 handle = ext3_journal_start(inode, 1); 138 handle = ext3_journal_start(inode, 1);
138 if (IS_ERR(handle)) { 139 if (IS_ERR(handle)) {
139 err = PTR_ERR(handle); 140 err = PTR_ERR(handle);
140 goto setversion_out; 141 goto unlock_out;
141 } 142 }
142 err = ext3_reserve_inode_write(handle, inode, &iloc); 143 err = ext3_reserve_inode_write(handle, inode, &iloc);
143 if (err == 0) { 144 if (err == 0) {
@@ -146,6 +147,9 @@ flags_out:
146 err = ext3_mark_iloc_dirty(handle, inode, &iloc); 147 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
147 } 148 }
148 ext3_journal_stop(handle); 149 ext3_journal_stop(handle);
150
151unlock_out:
152 mutex_unlock(&inode->i_mutex);
149setversion_out: 153setversion_out:
150 mnt_drop_write_file(filp); 154 mnt_drop_write_file(filp);
151 return err; 155 return err;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 4f35b2f315d4..e8e211795e9f 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -921,9 +921,12 @@ restart:
921 num++; 921 num++;
922 bh = ext3_getblk(NULL, dir, b++, 0, &err); 922 bh = ext3_getblk(NULL, dir, b++, 0, &err);
923 bh_use[ra_max] = bh; 923 bh_use[ra_max] = bh;
924 if (bh) 924 if (bh && !bh_uptodate_or_lock(bh)) {
925 ll_rw_block(READ | REQ_META | REQ_PRIO, 925 get_bh(bh);
926 1, &bh); 926 bh->b_end_io = end_buffer_read_sync;
927 submit_bh(READ | REQ_META | REQ_PRIO,
928 bh);
929 }
927 } 930 }
928 } 931 }
929 if ((bh = bh_use[ra_ptr++]) == NULL) 932 if ((bh = bh_use[ra_ptr++]) == NULL)
@@ -2272,7 +2275,7 @@ retry:
2272 err = PTR_ERR(handle); 2275 err = PTR_ERR(handle);
2273 goto err_drop_inode; 2276 goto err_drop_inode;
2274 } 2277 }
2275 inc_nlink(inode); 2278 set_nlink(inode, 1);
2276 err = ext3_orphan_del(handle, inode); 2279 err = ext3_orphan_del(handle, inode);
2277 if (err) { 2280 if (err) {
2278 ext3_journal_stop(handle); 2281 ext3_journal_stop(handle);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3a10b884e1be..726c7ef6cdf1 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2059,9 +2059,10 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2059 EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; 2059 EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
2060 ext3_orphan_cleanup(sb, es); 2060 ext3_orphan_cleanup(sb, es);
2061 EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; 2061 EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
2062 if (needs_recovery) 2062 if (needs_recovery) {
2063 ext3_mark_recovery_complete(sb, es);
2063 ext3_msg(sb, KERN_INFO, "recovery complete"); 2064 ext3_msg(sb, KERN_INFO, "recovery complete");
2064 ext3_mark_recovery_complete(sb, es); 2065 }
2065 ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode", 2066 ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode",
2066 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": 2067 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
2067 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": 2068 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
@@ -2229,11 +2230,11 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2229 goto out_bdev; 2230 goto out_bdev;
2230 } 2231 }
2231 journal->j_private = sb; 2232 journal->j_private = sb;
2232 ll_rw_block(READ, 1, &journal->j_sb_buffer); 2233 if (!bh_uptodate_or_lock(journal->j_sb_buffer)) {
2233 wait_on_buffer(journal->j_sb_buffer); 2234 if (bh_submit_read(journal->j_sb_buffer)) {
2234 if (!buffer_uptodate(journal->j_sb_buffer)) { 2235 ext3_msg(sb, KERN_ERR, "I/O error on journal device");
2235 ext3_msg(sb, KERN_ERR, "I/O error on journal device"); 2236 goto out_journal;
2236 goto out_journal; 2237 }
2237 } 2238 }
2238 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { 2239 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
2239 ext3_msg(sb, KERN_ERR, 2240 ext3_msg(sb, KERN_ERR,
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 3c218b8a51d4..ea26f2acab94 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -3,7 +3,6 @@
3 * Handler for storing security labels as extended attributes. 3 * Handler for storing security labels as extended attributes.
4 */ 4 */
5 5
6#include <linux/module.h>
7#include <linux/slab.h> 6#include <linux/slab.h>
8#include <linux/string.h> 7#include <linux/string.h>
9#include <linux/fs.h> 8#include <linux/fs.h>
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index dc8edda9ffe0..2526a8829de8 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -5,7 +5,6 @@
5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> 5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */ 6 */
7 7
8#include <linux/module.h>
9#include <linux/string.h> 8#include <linux/string.h>
10#include <linux/capability.h> 9#include <linux/capability.h>
11#include <linux/fs.h> 10#include <linux/fs.h>
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 7a321974d584..b32e473a1e33 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -5,7 +5,6 @@
5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> 5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */ 6 */
7 7
8#include <linux/module.h>
9#include <linux/string.h> 8#include <linux/string.h>
10#include <linux/fs.h> 9#include <linux/fs.h>
11#include <linux/ext3_jbd.h> 10#include <linux/ext3_jbd.h>
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 12ccacda44e0..f9e2cd8cf711 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -23,6 +23,8 @@
23 23
24#include <trace/events/ext4.h> 24#include <trace/events/ext4.h>
25 25
26static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
27 ext4_group_t block_group);
26/* 28/*
27 * balloc.c contains the blocks allocation and deallocation routines 29 * balloc.c contains the blocks allocation and deallocation routines
28 */ 30 */
@@ -668,7 +670,7 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
668 * This function returns the number of file system metadata clusters at 670 * This function returns the number of file system metadata clusters at
669 * the beginning of a block group, including the reserved gdt blocks. 671 * the beginning of a block group, including the reserved gdt blocks.
670 */ 672 */
671unsigned ext4_num_base_meta_clusters(struct super_block *sb, 673static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
672 ext4_group_t block_group) 674 ext4_group_t block_group)
673{ 675{
674 struct ext4_sb_info *sbi = EXT4_SB(sb); 676 struct ext4_sb_info *sbi = EXT4_SB(sb);
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 8efb2f0a3447..3f11656bd72e 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -13,7 +13,6 @@
13#include <linux/namei.h> 13#include <linux/namei.h>
14#include <linux/quotaops.h> 14#include <linux/quotaops.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/module.h>
17#include <linux/swap.h> 16#include <linux/swap.h>
18#include <linux/pagemap.h> 17#include <linux/pagemap.h>
19#include <linux/blkdev.h> 18#include <linux/blkdev.h>
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1554b15f91bc..513004fc3d84 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -511,6 +511,14 @@ struct ext4_new_group_data {
511 __u32 free_blocks_count; 511 __u32 free_blocks_count;
512}; 512};
513 513
514/* Indexes used to index group tables in ext4_new_group_data */
515enum {
516 BLOCK_BITMAP = 0, /* block bitmap */
517 INODE_BITMAP, /* inode bitmap */
518 INODE_TABLE, /* inode tables */
519 GROUP_TABLE_COUNT,
520};
521
514/* 522/*
515 * Flags used by ext4_map_blocks() 523 * Flags used by ext4_map_blocks()
516 */ 524 */
@@ -575,6 +583,7 @@ struct ext4_new_group_data {
575 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ 583 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
576#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) 584#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
577#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) 585#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
586#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
578 587
579#if defined(__KERNEL__) && defined(CONFIG_COMPAT) 588#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
580/* 589/*
@@ -957,12 +966,13 @@ struct ext4_inode_info {
957#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ 966#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \
958 EXT4_MOUNT2_##opt) 967 EXT4_MOUNT2_##opt)
959 968
960#define ext4_set_bit __test_and_set_bit_le 969#define ext4_test_and_set_bit __test_and_set_bit_le
970#define ext4_set_bit __set_bit_le
961#define ext4_set_bit_atomic ext2_set_bit_atomic 971#define ext4_set_bit_atomic ext2_set_bit_atomic
962#define ext4_clear_bit __test_and_clear_bit_le 972#define ext4_test_and_clear_bit __test_and_clear_bit_le
973#define ext4_clear_bit __clear_bit_le
963#define ext4_clear_bit_atomic ext2_clear_bit_atomic 974#define ext4_clear_bit_atomic ext2_clear_bit_atomic
964#define ext4_test_bit test_bit_le 975#define ext4_test_bit test_bit_le
965#define ext4_find_first_zero_bit find_first_zero_bit_le
966#define ext4_find_next_zero_bit find_next_zero_bit_le 976#define ext4_find_next_zero_bit find_next_zero_bit_le
967#define ext4_find_next_bit find_next_bit_le 977#define ext4_find_next_bit find_next_bit_le
968 978
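
The renames above matter because __set_bit_le() returns void while __test_and_set_bit_le() returns the bit's previous value; the ialloc.c hunks further down (ext4_free_inode(), ext4_claim_inode()) rely on that return value to detect whether the bit was already claimed. A minimal userspace stand-in showing why the distinction matters:

#include <assert.h>

/*
 * Userspace stand-in for the kernel's non-atomic bitops: plain "set"
 * tells you nothing, "test_and_set" reports whether the bit was taken.
 */
static int test_and_set_bit(unsigned nr, unsigned long *map)
{
	unsigned long mask = 1UL << (nr % (8 * sizeof(*map)));
	unsigned long *p = map + nr / (8 * sizeof(*map));
	int old = (*p & mask) != 0;

	*p |= mask;
	return old;
}

int main(void)
{
	unsigned long bitmap[2] = { 0, 0 };

	assert(test_and_set_bit(17, bitmap) == 0);	/* we claimed bit 17 */
	assert(test_and_set_bit(17, bitmap) == 1);	/* already in use */
	return 0;
}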
@@ -1397,6 +1407,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1397#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 1407#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
1398#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 1408#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100
1399#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 1409#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200
1410#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
1400 1411
1401#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 1412#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
1402#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 1413#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1409,6 +1420,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1409#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 1420#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
1410#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ 1421#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
1411#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ 1422#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
1423#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x2000 /* data in inode */
1424#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
1412 1425
1413#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR 1426#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
1414#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ 1427#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1790,8 +1803,6 @@ extern void ext4_init_block_bitmap(struct super_block *sb,
1790extern unsigned ext4_free_clusters_after_init(struct super_block *sb, 1803extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
1791 ext4_group_t block_group, 1804 ext4_group_t block_group,
1792 struct ext4_group_desc *gdp); 1805 struct ext4_group_desc *gdp);
1793extern unsigned ext4_num_base_meta_clusters(struct super_block *sb,
1794 ext4_group_t block_group);
1795extern unsigned ext4_num_overhead_clusters(struct super_block *sb, 1806extern unsigned ext4_num_overhead_clusters(struct super_block *sb,
1796 ext4_group_t block_group, 1807 ext4_group_t block_group,
1797 struct ext4_group_desc *gdp); 1808 struct ext4_group_desc *gdp);
@@ -1880,16 +1891,9 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
1880extern void ext4_set_aops(struct inode *inode); 1891extern void ext4_set_aops(struct inode *inode);
1881extern int ext4_writepage_trans_blocks(struct inode *); 1892extern int ext4_writepage_trans_blocks(struct inode *);
1882extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); 1893extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
1883extern int ext4_block_truncate_page(handle_t *handle,
1884 struct address_space *mapping, loff_t from);
1885extern int ext4_block_zero_page_range(handle_t *handle,
1886 struct address_space *mapping, loff_t from, loff_t length);
1887extern int ext4_discard_partial_page_buffers(handle_t *handle, 1894extern int ext4_discard_partial_page_buffers(handle_t *handle,
1888 struct address_space *mapping, loff_t from, 1895 struct address_space *mapping, loff_t from,
1889 loff_t length, int flags); 1896 loff_t length, int flags);
1890extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
1891 struct inode *inode, struct page *page, loff_t from,
1892 loff_t length, int flags);
1893extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1897extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1894extern qsize_t *ext4_get_reserved_space(struct inode *inode); 1898extern qsize_t *ext4_get_reserved_space(struct inode *inode);
1895extern void ext4_da_update_reserve_space(struct inode *inode, 1899extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1924,6 +1928,7 @@ extern int ext4_group_add(struct super_block *sb,
1924extern int ext4_group_extend(struct super_block *sb, 1928extern int ext4_group_extend(struct super_block *sb,
1925 struct ext4_super_block *es, 1929 struct ext4_super_block *es,
1926 ext4_fsblk_t n_blocks_count); 1930 ext4_fsblk_t n_blocks_count);
1931extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
1927 1932
1928/* super.c */ 1933/* super.c */
1929extern void *ext4_kvmalloc(size_t size, gfp_t flags); 1934extern void *ext4_kvmalloc(size_t size, gfp_t flags);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 607b1557d292..74f23c292e1b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -29,7 +29,6 @@
29 * - smart tree reduction 29 * - smart tree reduction
30 */ 30 */
31 31
32#include <linux/module.h>
33#include <linux/fs.h> 32#include <linux/fs.h>
34#include <linux/time.h> 33#include <linux/time.h>
35#include <linux/jbd2.h> 34#include <linux/jbd2.h>
@@ -3281,6 +3280,9 @@ static int ext4_find_delalloc_range(struct inode *inode,
3281 ext4_lblk_t i, pg_lblk; 3280 ext4_lblk_t i, pg_lblk;
3282 pgoff_t index; 3281 pgoff_t index;
3283 3282
3283 if (!test_opt(inode->i_sb, DELALLOC))
3284 return 0;
3285
3284 /* reverse search won't work if fs block size is less than page size */ 3286 /* reverse search won't work if fs block size is less than page size */
3285 if (inode->i_blkbits < PAGE_CACHE_SHIFT) 3287 if (inode->i_blkbits < PAGE_CACHE_SHIFT)
3286 search_hint_reverse = 0; 3288 search_hint_reverse = 0;
@@ -3453,8 +3455,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3453 int err = 0; 3455 int err = 0;
3454 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3456 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3455 3457
3456 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" 3458 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
3457 "block %llu, max_blocks %u, flags %d, allocated %u", 3459 "block %llu, max_blocks %u, flags %x, allocated %u\n",
3458 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, 3460 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3459 flags, allocated); 3461 flags, allocated);
3460 ext4_ext_show_leaf(inode, path); 3462 ext4_ext_show_leaf(inode, path);
@@ -3625,7 +3627,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
3625 struct ext4_sb_info *sbi = EXT4_SB(sb); 3627 struct ext4_sb_info *sbi = EXT4_SB(sb);
3626 ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); 3628 ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3627 ext4_lblk_t ex_cluster_start, ex_cluster_end; 3629 ext4_lblk_t ex_cluster_start, ex_cluster_end;
3628 ext4_lblk_t rr_cluster_start, rr_cluster_end; 3630 ext4_lblk_t rr_cluster_start;
3629 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 3631 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3630 ext4_fsblk_t ee_start = ext4_ext_pblock(ex); 3632 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3631 unsigned short ee_len = ext4_ext_get_actual_len(ex); 3633 unsigned short ee_len = ext4_ext_get_actual_len(ex);
@@ -3636,7 +3638,6 @@ static int get_implied_cluster_alloc(struct super_block *sb,
3636 3638
3637 /* The requested region passed into ext4_map_blocks() */ 3639 /* The requested region passed into ext4_map_blocks() */
3638 rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); 3640 rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
3639 rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1);
3640 3641
3641 if ((rr_cluster_start == ex_cluster_end) || 3642 if ((rr_cluster_start == ex_cluster_end) ||
3642 (rr_cluster_start == ex_cluster_start)) { 3643 (rr_cluster_start == ex_cluster_start)) {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4637af036d9c..25d8c9781ad9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -252,7 +252,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
252 fatal = ext4_journal_get_write_access(handle, bh2); 252 fatal = ext4_journal_get_write_access(handle, bh2);
253 } 253 }
254 ext4_lock_group(sb, block_group); 254 ext4_lock_group(sb, block_group);
255 cleared = ext4_clear_bit(bit, bitmap_bh->b_data); 255 cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
256 if (fatal || !cleared) { 256 if (fatal || !cleared) {
257 ext4_unlock_group(sb, block_group); 257 ext4_unlock_group(sb, block_group);
258 goto out; 258 goto out;
@@ -358,7 +358,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
358 struct ext4_sb_info *sbi = EXT4_SB(sb); 358 struct ext4_sb_info *sbi = EXT4_SB(sb);
359 ext4_group_t real_ngroups = ext4_get_groups_count(sb); 359 ext4_group_t real_ngroups = ext4_get_groups_count(sb);
360 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 360 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
361 unsigned int freei, avefreei; 361 unsigned int freei, avefreei, grp_free;
362 ext4_fsblk_t freeb, avefreec; 362 ext4_fsblk_t freeb, avefreec;
363 unsigned int ndirs; 363 unsigned int ndirs;
364 int max_dirs, min_inodes; 364 int max_dirs, min_inodes;
@@ -477,8 +477,8 @@ fallback_retry:
477 for (i = 0; i < ngroups; i++) { 477 for (i = 0; i < ngroups; i++) {
478 grp = (parent_group + i) % ngroups; 478 grp = (parent_group + i) % ngroups;
479 desc = ext4_get_group_desc(sb, grp, NULL); 479 desc = ext4_get_group_desc(sb, grp, NULL);
480 if (desc && ext4_free_inodes_count(sb, desc) && 480 grp_free = ext4_free_inodes_count(sb, desc);
481 ext4_free_inodes_count(sb, desc) >= avefreei) { 481 if (desc && grp_free && grp_free >= avefreei) {
482 *group = grp; 482 *group = grp;
483 return 0; 483 return 0;
484 } 484 }
@@ -618,7 +618,7 @@ static int ext4_claim_inode(struct super_block *sb,
618 */ 618 */
619 down_read(&grp->alloc_sem); 619 down_read(&grp->alloc_sem);
620 ext4_lock_group(sb, group); 620 ext4_lock_group(sb, group);
621 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { 621 if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) {
622 /* not a free inode */ 622 /* not a free inode */
623 retval = 1; 623 retval = 1;
624 goto err_ret; 624 goto err_ret;
@@ -885,8 +885,12 @@ got:
885 if (IS_DIRSYNC(inode)) 885 if (IS_DIRSYNC(inode))
886 ext4_handle_sync(handle); 886 ext4_handle_sync(handle);
887 if (insert_inode_locked(inode) < 0) { 887 if (insert_inode_locked(inode) < 0) {
888 err = -EINVAL; 888 /*
889 goto fail_drop; 889 * Likely a bitmap corruption causing inode to be allocated
890 * twice.
891 */
892 err = -EIO;
893 goto fail;
890 } 894 }
891 spin_lock(&sbi->s_next_gen_lock); 895 spin_lock(&sbi->s_next_gen_lock);
892 inode->i_generation = sbi->s_next_generation++; 896 inode->i_generation = sbi->s_next_generation++;
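
Both ialloc.c bitmap hunks above replace a plain bit operation with its value-returning test_and_* variant so the caller can detect corruption: a clear that finds the bit already zero is a double free, and a set that finds the bit already one means the inode was never free. A non-atomic userspace model of the idea (the kernel's ext4_test_and_clear_bit() additionally carries the real bitops' atomicity and endianness guarantees):

    #include <stdio.h>

    #define BITS_PER_LONG (8 * sizeof(unsigned long))

    static int test_and_clear_bit_model(unsigned int nr, unsigned long *addr)
    {
            unsigned long mask = 1UL << (nr % BITS_PER_LONG);
            unsigned long *word = addr + nr / BITS_PER_LONG;
            int old = (*word & mask) != 0;

            *word &= ~mask;
            return old;             /* 0: the bit was already clear */
    }

    int main(void)
    {
            unsigned long bitmap[1] = { 1UL << 5 };

            /* The first free succeeds; the second reports the double free. */
            printf("cleared: %d\n", test_and_clear_bit_model(5, bitmap));  /* 1 */
            printf("cleared: %d\n", test_and_clear_bit_model(5, bitmap));  /* 0 */
            return 0;
    }
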
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 3cfc73fbca8e..830e1b2bf145 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -20,7 +20,6 @@
20 * (sct@redhat.com), 1993, 1998 20 * (sct@redhat.com), 1993, 1998
21 */ 21 */
22 22
23#include <linux/module.h>
24#include "ext4_jbd2.h" 23#include "ext4_jbd2.h"
25#include "truncate.h" 24#include "truncate.h"
26 25
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7dbcc3e84570..feaa82fe629d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -18,7 +18,6 @@
18 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 18 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
19 */ 19 */
20 20
21#include <linux/module.h>
22#include <linux/fs.h> 21#include <linux/fs.h>
23#include <linux/time.h> 22#include <linux/time.h>
24#include <linux/jbd2.h> 23#include <linux/jbd2.h>
@@ -72,6 +71,9 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
72static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); 71static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
73static int __ext4_journalled_writepage(struct page *page, unsigned int len); 72static int __ext4_journalled_writepage(struct page *page, unsigned int len);
74static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); 73static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
74static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
75 struct inode *inode, struct page *page, loff_t from,
76 loff_t length, int flags);
75 77
76/* 78/*
77 * Test whether an inode is a fast symlink. 79 * Test whether an inode is a fast symlink.
@@ -2760,7 +2762,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
2760 if (!io_end || !size) 2762 if (!io_end || !size)
2761 goto out; 2763 goto out;
2762 2764
2763 ext_debug("ext4_end_io_dio(): io_end 0x%p" 2765 ext_debug("ext4_end_io_dio(): io_end 0x%p "
2764 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", 2766 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
2765 iocb->private, io_end->inode->i_ino, iocb, offset, 2767 iocb->private, io_end->inode->i_ino, iocb, offset,
2766 size); 2768 size);
@@ -3161,7 +3163,7 @@ int ext4_discard_partial_page_buffers(handle_t *handle,
3161 * 3163 *
3162 * Returns zero on success or negative on failure. 3164 * Returns zero on success or negative on failure.
3163 */ 3165 */
3164int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, 3166static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
3165 struct inode *inode, struct page *page, loff_t from, 3167 struct inode *inode, struct page *page, loff_t from,
3166 loff_t length, int flags) 3168 loff_t length, int flags)
3167{ 3169{
@@ -3301,126 +3303,6 @@ next:
3301 return err; 3303 return err;
3302} 3304}
3303 3305
3304/*
3305 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3306 * up to the end of the block which corresponds to `from'.
3307 * This required during truncate. We need to physically zero the tail end
3308 * of that block so it doesn't yield old data if the file is later grown.
3309 */
3310int ext4_block_truncate_page(handle_t *handle,
3311 struct address_space *mapping, loff_t from)
3312{
3313 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3314 unsigned length;
3315 unsigned blocksize;
3316 struct inode *inode = mapping->host;
3317
3318 blocksize = inode->i_sb->s_blocksize;
3319 length = blocksize - (offset & (blocksize - 1));
3320
3321 return ext4_block_zero_page_range(handle, mapping, from, length);
3322}
3323
3324/*
3325 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3326 * starting from file offset 'from'. The range to be zero'd must
3327 * be contained with in one block. If the specified range exceeds
3328 * the end of the block it will be shortened to end of the block
3329 * that cooresponds to 'from'
3330 */
3331int ext4_block_zero_page_range(handle_t *handle,
3332 struct address_space *mapping, loff_t from, loff_t length)
3333{
3334 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3335 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3336 unsigned blocksize, max, pos;
3337 ext4_lblk_t iblock;
3338 struct inode *inode = mapping->host;
3339 struct buffer_head *bh;
3340 struct page *page;
3341 int err = 0;
3342
3343 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3344 mapping_gfp_mask(mapping) & ~__GFP_FS);
3345 if (!page)
3346 return -ENOMEM;
3347
3348 blocksize = inode->i_sb->s_blocksize;
3349 max = blocksize - (offset & (blocksize - 1));
3350
3351 /*
3352 * correct length if it does not fall between
3353 * 'from' and the end of the block
3354 */
3355 if (length > max || length < 0)
3356 length = max;
3357
3358 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3359
3360 if (!page_has_buffers(page))
3361 create_empty_buffers(page, blocksize, 0);
3362
3363 /* Find the buffer that contains "offset" */
3364 bh = page_buffers(page);
3365 pos = blocksize;
3366 while (offset >= pos) {
3367 bh = bh->b_this_page;
3368 iblock++;
3369 pos += blocksize;
3370 }
3371
3372 err = 0;
3373 if (buffer_freed(bh)) {
3374 BUFFER_TRACE(bh, "freed: skip");
3375 goto unlock;
3376 }
3377
3378 if (!buffer_mapped(bh)) {
3379 BUFFER_TRACE(bh, "unmapped");
3380 ext4_get_block(inode, iblock, bh, 0);
3381 /* unmapped? It's a hole - nothing to do */
3382 if (!buffer_mapped(bh)) {
3383 BUFFER_TRACE(bh, "still unmapped");
3384 goto unlock;
3385 }
3386 }
3387
3388 /* Ok, it's mapped. Make sure it's up-to-date */
3389 if (PageUptodate(page))
3390 set_buffer_uptodate(bh);
3391
3392 if (!buffer_uptodate(bh)) {
3393 err = -EIO;
3394 ll_rw_block(READ, 1, &bh);
3395 wait_on_buffer(bh);
3396 /* Uhhuh. Read error. Complain and punt. */
3397 if (!buffer_uptodate(bh))
3398 goto unlock;
3399 }
3400
3401 if (ext4_should_journal_data(inode)) {
3402 BUFFER_TRACE(bh, "get write access");
3403 err = ext4_journal_get_write_access(handle, bh);
3404 if (err)
3405 goto unlock;
3406 }
3407
3408 zero_user(page, offset, length);
3409
3410 BUFFER_TRACE(bh, "zeroed end of block");
3411
3412 err = 0;
3413 if (ext4_should_journal_data(inode)) {
3414 err = ext4_handle_dirty_metadata(handle, inode, bh);
3415 } else
3416 mark_buffer_dirty(bh);
3417
3418unlock:
3419 unlock_page(page);
3420 page_cache_release(page);
3421 return err;
3422}
3423
3424int ext4_can_truncate(struct inode *inode) 3306int ext4_can_truncate(struct inode *inode)
3425{ 3307{
3426 if (S_ISREG(inode->i_mode)) 3308 if (S_ISREG(inode->i_mode))
@@ -4647,9 +4529,19 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4647 return 0; 4529 return 0;
4648 if (is_journal_aborted(journal)) 4530 if (is_journal_aborted(journal))
4649 return -EROFS; 4531 return -EROFS;
4532 /* We have to allocate physical blocks for delalloc blocks
                                   4533 * before flushing the journal; otherwise delalloc blocks cannot
                                   4534 * be allocated any more. Worse, a truncate on delalloc blocks
                                   4535 * could trigger a BUG by flushing delalloc blocks in the journal.
4536 * There is no delalloc block in non-journal data mode.
4537 */
4538 if (val && test_opt(inode->i_sb, DELALLOC)) {
4539 err = ext4_alloc_da_blocks(inode);
4540 if (err < 0)
4541 return err;
4542 }
4650 4543
4651 jbd2_journal_lock_updates(journal); 4544 jbd2_journal_lock_updates(journal);
4652 jbd2_journal_flush(journal);
4653 4545
4654 /* 4546 /*
4655 * OK, there are no updates running now, and all cached data is 4547 * OK, there are no updates running now, and all cached data is
@@ -4661,8 +4553,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4661 4553
4662 if (val) 4554 if (val)
4663 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 4555 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
4664 else 4556 else {
4557 jbd2_journal_flush(journal);
4665 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 4558 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
4559 }
4666 ext4_set_aops(inode); 4560 ext4_set_aops(inode);
4667 4561
4668 jbd2_journal_unlock_updates(journal); 4562 jbd2_journal_unlock_updates(journal);
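
The last two inode.c hunks reorder ext4_change_inode_journal_flag(): delalloc blocks must be allocated before jbd2_journal_lock_updates() (allocation is impossible once updates are locked), and the journal is now flushed only when leaving data=journal mode. A condensed userspace sketch of the resulting control flow, with stub functions standing in for the jbd2/ext4 calls (the ordering is the point, not the stubs):

    #include <stdbool.h>
    #include <stdio.h>

    static int  alloc_da_blocks(void)        { puts("allocate delalloc blocks"); return 0; }
    static void journal_lock_updates(void)   { puts("lock journal updates"); }
    static void journal_flush(void)          { puts("flush journal"); }
    static void journal_unlock_updates(void) { puts("unlock journal updates"); }

    static int change_journal_flag(bool to_data_journal, bool delalloc)
    {
            /* Step 1: allocate delalloc blocks *before* locking the journal. */
            if (to_data_journal && delalloc) {
                    int err = alloc_da_blocks();
                    if (err < 0)
                            return err;
            }

            journal_lock_updates();
            if (!to_data_journal)
                    journal_flush();   /* flush only when leaving data=journal */
            /* ... set or clear the JOURNAL_DATA inode flag here ... */
            journal_unlock_updates();
            return 0;
    }

    int main(void) { return change_journal_flag(true, true); }
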
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index d37b3bb2a3b8..6eee25591b81 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -18,6 +18,8 @@
18#include "ext4_jbd2.h" 18#include "ext4_jbd2.h"
19#include "ext4.h" 19#include "ext4.h"
20 20
21#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
22
21long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 23long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
22{ 24{
23 struct inode *inode = filp->f_dentry->d_inode; 25 struct inode *inode = filp->f_dentry->d_inode;
@@ -158,10 +160,11 @@ flags_out:
158 goto setversion_out; 160 goto setversion_out;
159 } 161 }
160 162
163 mutex_lock(&inode->i_mutex);
161 handle = ext4_journal_start(inode, 1); 164 handle = ext4_journal_start(inode, 1);
162 if (IS_ERR(handle)) { 165 if (IS_ERR(handle)) {
163 err = PTR_ERR(handle); 166 err = PTR_ERR(handle);
164 goto setversion_out; 167 goto unlock_out;
165 } 168 }
166 err = ext4_reserve_inode_write(handle, inode, &iloc); 169 err = ext4_reserve_inode_write(handle, inode, &iloc);
167 if (err == 0) { 170 if (err == 0) {
@@ -170,6 +173,9 @@ flags_out:
170 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 173 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
171 } 174 }
172 ext4_journal_stop(handle); 175 ext4_journal_stop(handle);
176
177unlock_out:
178 mutex_unlock(&inode->i_mutex);
173setversion_out: 179setversion_out:
174 mnt_drop_write_file(filp); 180 mnt_drop_write_file(filp);
175 return err; 181 return err;
@@ -182,19 +188,22 @@ setversion_out:
182 if (err) 188 if (err)
183 return err; 189 return err;
184 190
185 if (get_user(n_blocks_count, (__u32 __user *)arg)) 191 if (get_user(n_blocks_count, (__u32 __user *)arg)) {
186 return -EFAULT; 192 err = -EFAULT;
193 goto group_extend_out;
194 }
187 195
188 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 196 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
189 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { 197 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
190 ext4_msg(sb, KERN_ERR, 198 ext4_msg(sb, KERN_ERR,
191 "Online resizing not supported with bigalloc"); 199 "Online resizing not supported with bigalloc");
192 return -EOPNOTSUPP; 200 err = -EOPNOTSUPP;
201 goto group_extend_out;
193 } 202 }
194 203
195 err = mnt_want_write_file(filp); 204 err = mnt_want_write_file(filp);
196 if (err) 205 if (err)
197 return err; 206 goto group_extend_out;
198 207
199 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); 208 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
200 if (EXT4_SB(sb)->s_journal) { 209 if (EXT4_SB(sb)->s_journal) {
@@ -205,8 +214,8 @@ setversion_out:
205 if (err == 0) 214 if (err == 0)
206 err = err2; 215 err = err2;
207 mnt_drop_write_file(filp); 216 mnt_drop_write_file(filp);
217group_extend_out:
208 ext4_resize_end(sb); 218 ext4_resize_end(sb);
209
210 return err; 219 return err;
211 } 220 }
212 221
@@ -247,8 +256,7 @@ setversion_out:
247 err = ext4_move_extents(filp, donor_filp, me.orig_start, 256 err = ext4_move_extents(filp, donor_filp, me.orig_start,
248 me.donor_start, me.len, &me.moved_len); 257 me.donor_start, me.len, &me.moved_len);
249 mnt_drop_write_file(filp); 258 mnt_drop_write_file(filp);
250 if (me.moved_len > 0) 259 mnt_drop_write(filp->f_path.mnt);
251 file_remove_suid(donor_filp);
252 260
253 if (copy_to_user((struct move_extent __user *)arg, 261 if (copy_to_user((struct move_extent __user *)arg,
254 &me, sizeof(me))) 262 &me, sizeof(me)))
@@ -267,19 +275,22 @@ mext_out:
267 return err; 275 return err;
268 276
269 if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, 277 if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
270 sizeof(input))) 278 sizeof(input))) {
271 return -EFAULT; 279 err = -EFAULT;
280 goto group_add_out;
281 }
272 282
273 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 283 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
274 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { 284 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
275 ext4_msg(sb, KERN_ERR, 285 ext4_msg(sb, KERN_ERR,
276 "Online resizing not supported with bigalloc"); 286 "Online resizing not supported with bigalloc");
277 return -EOPNOTSUPP; 287 err = -EOPNOTSUPP;
288 goto group_add_out;
278 } 289 }
279 290
280 err = mnt_want_write_file(filp); 291 err = mnt_want_write_file(filp);
281 if (err) 292 if (err)
282 return err; 293 goto group_add_out;
283 294
284 err = ext4_group_add(sb, &input); 295 err = ext4_group_add(sb, &input);
285 if (EXT4_SB(sb)->s_journal) { 296 if (EXT4_SB(sb)->s_journal) {
@@ -290,8 +301,8 @@ mext_out:
290 if (err == 0) 301 if (err == 0)
291 err = err2; 302 err = err2;
292 mnt_drop_write_file(filp); 303 mnt_drop_write_file(filp);
304group_add_out:
293 ext4_resize_end(sb); 305 ext4_resize_end(sb);
294
295 return err; 306 return err;
296 } 307 }
297 308
@@ -331,6 +342,60 @@ mext_out:
331 return err; 342 return err;
332 } 343 }
333 344
345 case EXT4_IOC_RESIZE_FS: {
346 ext4_fsblk_t n_blocks_count;
347 struct super_block *sb = inode->i_sb;
348 int err = 0, err2 = 0;
349
350 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
351 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
352 ext4_msg(sb, KERN_ERR,
353 "Online resizing not (yet) supported with bigalloc");
354 return -EOPNOTSUPP;
355 }
356
357 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
358 EXT4_FEATURE_INCOMPAT_META_BG)) {
359 ext4_msg(sb, KERN_ERR,
360 "Online resizing not (yet) supported with meta_bg");
361 return -EOPNOTSUPP;
362 }
363
364 if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
365 sizeof(__u64))) {
366 return -EFAULT;
367 }
368
369 if (n_blocks_count > MAX_32_NUM &&
370 !EXT4_HAS_INCOMPAT_FEATURE(sb,
371 EXT4_FEATURE_INCOMPAT_64BIT)) {
372 ext4_msg(sb, KERN_ERR,
373 "File system only supports 32-bit block numbers");
374 return -EOPNOTSUPP;
375 }
376
377 err = ext4_resize_begin(sb);
378 if (err)
379 return err;
380
381 err = mnt_want_write(filp->f_path.mnt);
382 if (err)
383 goto resizefs_out;
384
385 err = ext4_resize_fs(sb, n_blocks_count);
386 if (EXT4_SB(sb)->s_journal) {
387 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
388 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
389 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
390 }
391 if (err == 0)
392 err = err2;
393 mnt_drop_write(filp->f_path.mnt);
394resizefs_out:
395 ext4_resize_end(sb);
396 return err;
397 }
398
334 case FITRIM: 399 case FITRIM:
335 { 400 {
336 struct request_queue *q = bdev_get_queue(sb->s_bdev); 401 struct request_queue *q = bdev_get_queue(sb->s_bdev);
@@ -429,6 +494,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
429 } 494 }
430 case EXT4_IOC_MOVE_EXT: 495 case EXT4_IOC_MOVE_EXT:
431 case FITRIM: 496 case FITRIM:
497 case EXT4_IOC_RESIZE_FS:
432 break; 498 break;
433 default: 499 default:
434 return -ENOIOCTLCMD; 500 return -ENOIOCTLCMD;
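
The new EXT4_IOC_RESIZE_FS command takes a single __u64, the desired filesystem size in blocks, and subsumes the older GROUP_EXTEND/GROUP_ADD sequence for online growth. From userspace the whole interface is one ioctl on any open descriptor on the filesystem, typically the mount point. A sketch (the EXT4_IOC_RESIZE_FS definition lives in ext4.h outside these hunks; the fallback value below is an assumption):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    #ifndef EXT4_IOC_RESIZE_FS
    #define EXT4_IOC_RESIZE_FS _IOW('f', 16, uint64_t)  /* assumed; see ext4.h */
    #endif

    int main(int argc, char **argv)
    {
            uint64_t n_blocks_count;
            int fd;

            if (argc != 3) {
                    fprintf(stderr, "usage: %s <file on fs> <new size in blocks>\n",
                            argv[0]);
                    return 1;
            }

            n_blocks_count = strtoull(argv[2], NULL, 0);
            fd = open(argv[1], O_RDONLY);
            if (fd < 0 || ioctl(fd, EXT4_IOC_RESIZE_FS, &n_blocks_count) < 0) {
                    perror("EXT4_IOC_RESIZE_FS");
                    return 1;
            }
            close(fd);
            return 0;
    }

Note the guards in the hunk above: bigalloc and meta_bg filesystems are rejected, and a size above 2^32 blocks requires the 64bit incompat feature.
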
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e2d8be8f28bf..cb990b21c698 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3671,7 +3671,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3671 ext4_group_t group; 3671 ext4_group_t group;
3672 ext4_grpblk_t bit; 3672 ext4_grpblk_t bit;
3673 3673
3674 trace_ext4_mb_release_group_pa(pa); 3674 trace_ext4_mb_release_group_pa(sb, pa);
3675 BUG_ON(pa->pa_deleted == 0); 3675 BUG_ON(pa->pa_deleted == 0);
3676 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3676 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3677 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3677 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 16ac228dbec6..e7d6bb0acfa6 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -12,7 +12,6 @@
12 * 12 *
13 */ 13 */
14 14
15#include <linux/module.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
17#include "ext4_jbd2.h" 16#include "ext4_jbd2.h"
18 17
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 86edc45b52a4..2043f482375d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2315,7 +2315,7 @@ retry:
2315 err = PTR_ERR(handle); 2315 err = PTR_ERR(handle);
2316 goto err_drop_inode; 2316 goto err_drop_inode;
2317 } 2317 }
2318 inc_nlink(inode); 2318 set_nlink(inode, 1);
2319 err = ext4_orphan_del(handle, inode); 2319 err = ext4_orphan_del(handle, inode);
2320 if (err) { 2320 if (err) {
2321 ext4_journal_stop(handle); 2321 ext4_journal_stop(handle);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7e106c810c62..475851896518 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -6,7 +6,6 @@
6 * Written by Theodore Ts'o, 2010. 6 * Written by Theodore Ts'o, 2010.
7 */ 7 */
8 8
9#include <linux/module.h>
10#include <linux/fs.h> 9#include <linux/fs.h>
11#include <linux/time.h> 10#include <linux/time.h>
12#include <linux/jbd2.h> 11#include <linux/jbd2.h>
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 996780ab4f4e..f9d948f0eb86 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -134,6 +134,172 @@ static int verify_group_input(struct super_block *sb,
134 return err; 134 return err;
135} 135}
136 136
137/*
138 * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex
139 * group each time.
140 */
141struct ext4_new_flex_group_data {
142 struct ext4_new_group_data *groups; /* new_group_data for groups
143 in the flex group */
144 __u16 *bg_flags; /* block group flags of groups
145 in @groups */
146 ext4_group_t count; /* number of groups in @groups
147 */
148};
149
150/*
 151 * alloc_flex_gd() allocates an ext4_new_flex_group_data with size of
152 * @flexbg_size.
153 *
 154 * Returns NULL on failure, otherwise the address of the allocated structure.
155 */
156static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
157{
158 struct ext4_new_flex_group_data *flex_gd;
159
160 flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS);
161 if (flex_gd == NULL)
162 goto out3;
163
164 flex_gd->count = flexbg_size;
165
166 flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) *
167 flexbg_size, GFP_NOFS);
168 if (flex_gd->groups == NULL)
169 goto out2;
170
171 flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS);
172 if (flex_gd->bg_flags == NULL)
173 goto out1;
174
175 return flex_gd;
176
177out1:
178 kfree(flex_gd->groups);
179out2:
180 kfree(flex_gd);
181out3:
182 return NULL;
183}
184
185static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
186{
187 kfree(flex_gd->bg_flags);
188 kfree(flex_gd->groups);
189 kfree(flex_gd);
190}
191
192/*
193 * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps
194 * and inode tables for a flex group.
195 *
196 * This function is used by 64bit-resize. Note that this function allocates
197 * group tables from the 1st group of groups contained by @flexgd, which may
198 * be a partial of a flex group.
199 *
200 * @sb: super block of fs to which the groups belongs
201 */
202static void ext4_alloc_group_tables(struct super_block *sb,
203 struct ext4_new_flex_group_data *flex_gd,
204 int flexbg_size)
205{
206 struct ext4_new_group_data *group_data = flex_gd->groups;
207 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
208 ext4_fsblk_t start_blk;
209 ext4_fsblk_t last_blk;
210 ext4_group_t src_group;
211 ext4_group_t bb_index = 0;
212 ext4_group_t ib_index = 0;
213 ext4_group_t it_index = 0;
214 ext4_group_t group;
215 ext4_group_t last_group;
216 unsigned overhead;
217
218 BUG_ON(flex_gd->count == 0 || group_data == NULL);
219
220 src_group = group_data[0].group;
221 last_group = src_group + flex_gd->count - 1;
222
223 BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) !=
224 (last_group & ~(flexbg_size - 1))));
225next_group:
226 group = group_data[0].group;
227 start_blk = ext4_group_first_block_no(sb, src_group);
228 last_blk = start_blk + group_data[src_group - group].blocks_count;
229
230 overhead = ext4_bg_has_super(sb, src_group) ?
231 (1 + ext4_bg_num_gdb(sb, src_group) +
232 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
233
234 start_blk += overhead;
235
236 BUG_ON(src_group >= group_data[0].group + flex_gd->count);
 237 /* We collect as many contiguous blocks as possible. */
238 src_group++;
239 for (; src_group <= last_group; src_group++)
240 if (!ext4_bg_has_super(sb, src_group))
241 last_blk += group_data[src_group - group].blocks_count;
242 else
243 break;
244
245 /* Allocate block bitmaps */
246 for (; bb_index < flex_gd->count; bb_index++) {
247 if (start_blk >= last_blk)
248 goto next_group;
249 group_data[bb_index].block_bitmap = start_blk++;
250 ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
251 group -= group_data[0].group;
252 group_data[group].free_blocks_count--;
253 if (flexbg_size > 1)
254 flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
255 }
256
257 /* Allocate inode bitmaps */
258 for (; ib_index < flex_gd->count; ib_index++) {
259 if (start_blk >= last_blk)
260 goto next_group;
261 group_data[ib_index].inode_bitmap = start_blk++;
262 ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
263 group -= group_data[0].group;
264 group_data[group].free_blocks_count--;
265 if (flexbg_size > 1)
266 flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
267 }
268
269 /* Allocate inode tables */
270 for (; it_index < flex_gd->count; it_index++) {
271 if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
272 goto next_group;
273 group_data[it_index].inode_table = start_blk;
274 ext4_get_group_no_and_offset(sb, start_blk, &group, NULL);
275 group -= group_data[0].group;
276 group_data[group].free_blocks_count -=
277 EXT4_SB(sb)->s_itb_per_group;
278 if (flexbg_size > 1)
279 flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
280
281 start_blk += EXT4_SB(sb)->s_itb_per_group;
282 }
283
284 if (test_opt(sb, DEBUG)) {
285 int i;
286 group = group_data[0].group;
287
288 printk(KERN_DEBUG "EXT4-fs: adding a flex group with "
289 "%d groups, flexbg size is %d:\n", flex_gd->count,
290 flexbg_size);
291
292 for (i = 0; i < flex_gd->count; i++) {
293 printk(KERN_DEBUG "adding %s group %u: %u "
294 "blocks (%d free)\n",
295 ext4_bg_has_super(sb, group + i) ? "normal" :
296 "no-super", group + i,
297 group_data[i].blocks_count,
298 group_data[i].free_blocks_count);
299 }
300 }
301}
302
137static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, 303static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
138 ext4_fsblk_t blk) 304 ext4_fsblk_t blk)
139{ 305{
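
alloc_flex_gd()/free_flex_gd() above use the kernel's staged-goto unwind idiom: each failure label frees exactly what the earlier stages allocated, in reverse order, so there is a single success path and no leak on any partial failure. A minimal userspace rendering of the same pattern (the element size is hypothetical, and GFP_NOFS has no userspace analogue):

    #include <stdlib.h>

    struct flex_gd_model {
            void           *groups;    /* stands in for ext4_new_group_data[] */
            unsigned short *bg_flags;
            unsigned long   count;
    };

    static struct flex_gd_model *alloc_model(unsigned long flexbg_size)
    {
            struct flex_gd_model *gd = malloc(sizeof(*gd));

            if (gd == NULL)
                    goto out3;
            gd->count = flexbg_size;

            gd->groups = calloc(flexbg_size, 64);      /* hypothetical size */
            if (gd->groups == NULL)
                    goto out2;

            gd->bg_flags = calloc(flexbg_size, sizeof(*gd->bg_flags));
            if (gd->bg_flags == NULL)
                    goto out1;
            return gd;

    out1:                      /* unwind in reverse allocation order */
            free(gd->groups);
    out2:
            free(gd);
    out3:
            return NULL;
    }

    int main(void)
    {
            struct flex_gd_model *gd = alloc_model(16);

            if (gd != NULL) {
                    free(gd->bg_flags);
                    free(gd->groups);
                    free(gd);
            }
            return 0;
    }
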
@@ -179,131 +345,250 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh)
179} 345}
180 346
181/* 347/*
 182 * Set up the block and inode bitmaps, and the inode table for the new group. 348 * set_flexbg_block_bitmap() marks @count blocks starting at @block as used.
349 *
 350 * Helper function for setup_new_flex_group_blocks().
351 *
352 * @sb: super block
353 * @handle: journal handle
354 * @flex_gd: flex group data
355 */
356static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
357 struct ext4_new_flex_group_data *flex_gd,
358 ext4_fsblk_t block, ext4_group_t count)
359{
360 ext4_group_t count2;
361
362 ext4_debug("mark blocks [%llu/%u] used\n", block, count);
363 for (count2 = count; count > 0; count -= count2, block += count2) {
364 ext4_fsblk_t start;
365 struct buffer_head *bh;
366 ext4_group_t group;
367 int err;
368
369 ext4_get_group_no_and_offset(sb, block, &group, NULL);
370 start = ext4_group_first_block_no(sb, group);
371 group -= flex_gd->groups[0].group;
372
373 count2 = sb->s_blocksize * 8 - (block - start);
374 if (count2 > count)
375 count2 = count;
376
377 if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) {
378 BUG_ON(flex_gd->count > 1);
379 continue;
380 }
381
382 err = extend_or_restart_transaction(handle, 1);
383 if (err)
384 return err;
385
386 bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
387 if (!bh)
388 return -EIO;
389
390 err = ext4_journal_get_write_access(handle, bh);
391 if (err)
392 return err;
393 ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block,
394 block - start, count2);
395 ext4_set_bits(bh->b_data, block - start, count2);
396
397 err = ext4_handle_dirty_metadata(handle, NULL, bh);
398 if (unlikely(err))
399 return err;
400 brelse(bh);
401 }
402
403 return 0;
404}
405
406/*
407 * Set up the block and inode bitmaps, and the inode table for the new groups.
183 * This doesn't need to be part of the main transaction, since we are only 408 * This doesn't need to be part of the main transaction, since we are only
184 * changing blocks outside the actual filesystem. We still do journaling to 409 * changing blocks outside the actual filesystem. We still do journaling to
185 * ensure the recovery is correct in case of a failure just after resize. 410 * ensure the recovery is correct in case of a failure just after resize.
186 * If any part of this fails, we simply abort the resize. 411 * If any part of this fails, we simply abort the resize.
412 *
 413 * setup_new_flex_group_blocks handles a flex group as follows:
 414 * 1. copy super block and GDT, and initialize group tables if necessary.
 415 * In this step, we only set bits in block bitmaps for blocks taken by
 416 * the super block and GDT.
 417 * 2. allocate group tables in block bitmaps, that is, set bits in the block
 418 * bitmaps for the blocks taken by the group tables.
187 */ 419 */
188static int setup_new_group_blocks(struct super_block *sb, 420static int setup_new_flex_group_blocks(struct super_block *sb,
189 struct ext4_new_group_data *input) 421 struct ext4_new_flex_group_data *flex_gd)
190{ 422{
423 int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group};
424 ext4_fsblk_t start;
425 ext4_fsblk_t block;
191 struct ext4_sb_info *sbi = EXT4_SB(sb); 426 struct ext4_sb_info *sbi = EXT4_SB(sb);
192 ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group); 427 struct ext4_super_block *es = sbi->s_es;
193 int reserved_gdb = ext4_bg_has_super(sb, input->group) ? 428 struct ext4_new_group_data *group_data = flex_gd->groups;
194 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; 429 __u16 *bg_flags = flex_gd->bg_flags;
195 unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group);
196 struct buffer_head *bh;
197 handle_t *handle; 430 handle_t *handle;
198 ext4_fsblk_t block; 431 ext4_group_t group, count;
199 ext4_grpblk_t bit; 432 struct buffer_head *bh = NULL;
200 int i; 433 int reserved_gdb, i, j, err = 0, err2;
201 int err = 0, err2; 434
435 BUG_ON(!flex_gd->count || !group_data ||
436 group_data[0].group != sbi->s_groups_count);
437
438 reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
202 439
203 /* This transaction may be extended/restarted along the way */ 440 /* This transaction may be extended/restarted along the way */
204 handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); 441 handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
205
206 if (IS_ERR(handle)) 442 if (IS_ERR(handle))
207 return PTR_ERR(handle); 443 return PTR_ERR(handle);
208 444
209 BUG_ON(input->group != sbi->s_groups_count); 445 group = group_data[0].group;
446 for (i = 0; i < flex_gd->count; i++, group++) {
447 unsigned long gdblocks;
210 448
211 /* Copy all of the GDT blocks into the backup in this group */ 449 gdblocks = ext4_bg_num_gdb(sb, group);
212 for (i = 0, bit = 1, block = start + 1; 450 start = ext4_group_first_block_no(sb, group);
213 i < gdblocks; i++, block++, bit++) {
214 struct buffer_head *gdb;
215 451
216 ext4_debug("update backup group %#04llx (+%d)\n", block, bit); 452 /* Copy all of the GDT blocks into the backup in this group */
217 err = extend_or_restart_transaction(handle, 1); 453 for (j = 0, block = start + 1; j < gdblocks; j++, block++) {
218 if (err) 454 struct buffer_head *gdb;
219 goto exit_journal;
220 455
221 gdb = sb_getblk(sb, block); 456 ext4_debug("update backup group %#04llx\n", block);
222 if (!gdb) { 457 err = extend_or_restart_transaction(handle, 1);
223 err = -EIO; 458 if (err)
224 goto exit_journal; 459 goto out;
225 } 460
226 if ((err = ext4_journal_get_write_access(handle, gdb))) { 461 gdb = sb_getblk(sb, block);
462 if (!gdb) {
463 err = -EIO;
464 goto out;
465 }
466
467 err = ext4_journal_get_write_access(handle, gdb);
468 if (err) {
469 brelse(gdb);
470 goto out;
471 }
472 memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data,
473 gdb->b_size);
474 set_buffer_uptodate(gdb);
475
476 err = ext4_handle_dirty_metadata(handle, NULL, gdb);
477 if (unlikely(err)) {
478 brelse(gdb);
479 goto out;
480 }
227 brelse(gdb); 481 brelse(gdb);
228 goto exit_journal;
229 } 482 }
230 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 483
231 set_buffer_uptodate(gdb); 484 /* Zero out all of the reserved backup group descriptor
232 err = ext4_handle_dirty_metadata(handle, NULL, gdb); 485 * table blocks
233 if (unlikely(err)) { 486 */
234 brelse(gdb); 487 if (ext4_bg_has_super(sb, group)) {
235 goto exit_journal; 488 err = sb_issue_zeroout(sb, gdblocks + start + 1,
489 reserved_gdb, GFP_NOFS);
490 if (err)
491 goto out;
236 } 492 }
237 brelse(gdb);
238 }
239 493
 240 /* Zero out all of the reserved backup group descriptor table blocks */ 494 /* Initialize group tables of the group @group */
241 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", 495 if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
242 block, sbi->s_itb_per_group); 496 goto handle_bb;
243 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
244 GFP_NOFS);
245 if (err)
246 goto exit_journal;
247 497
248 err = extend_or_restart_transaction(handle, 2); 498 /* Zero out all of the inode table blocks */
249 if (err) 499 block = group_data[i].inode_table;
250 goto exit_journal; 500 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
501 block, sbi->s_itb_per_group);
502 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group,
503 GFP_NOFS);
504 if (err)
505 goto out;
251 506
252 bh = bclean(handle, sb, input->block_bitmap); 507handle_bb:
253 if (IS_ERR(bh)) { 508 if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT)
254 err = PTR_ERR(bh); 509 goto handle_ib;
255 goto exit_journal;
256 }
257 510
258 if (ext4_bg_has_super(sb, input->group)) { 511 /* Initialize block bitmap of the @group */
259 ext4_debug("mark backup group tables %#04llx (+0)\n", start); 512 block = group_data[i].block_bitmap;
260 ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1); 513 err = extend_or_restart_transaction(handle, 1);
261 } 514 if (err)
515 goto out;
262 516
263 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, 517 bh = bclean(handle, sb, block);
264 input->block_bitmap - start); 518 if (IS_ERR(bh)) {
265 ext4_set_bit(input->block_bitmap - start, bh->b_data); 519 err = PTR_ERR(bh);
266 ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap, 520 goto out;
267 input->inode_bitmap - start); 521 }
268 ext4_set_bit(input->inode_bitmap - start, bh->b_data); 522 if (ext4_bg_has_super(sb, group)) {
269 523 ext4_debug("mark backup superblock %#04llx (+0)\n",
270 /* Zero out all of the inode table blocks */ 524 start);
271 block = input->inode_table; 525 ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb +
272 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", 526 1);
273 block, sbi->s_itb_per_group); 527 }
274 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); 528 ext4_mark_bitmap_end(group_data[i].blocks_count,
275 if (err) 529 sb->s_blocksize * 8, bh->b_data);
276 goto exit_bh; 530 err = ext4_handle_dirty_metadata(handle, NULL, bh);
277 ext4_set_bits(bh->b_data, input->inode_table - start, 531 if (err)
278 sbi->s_itb_per_group); 532 goto out;
533 brelse(bh);
279 534
535handle_ib:
536 if (bg_flags[i] & EXT4_BG_INODE_UNINIT)
537 continue;
280 538
281 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, 539 /* Initialize inode bitmap of the @group */
282 bh->b_data); 540 block = group_data[i].inode_bitmap;
283 err = ext4_handle_dirty_metadata(handle, NULL, bh); 541 err = extend_or_restart_transaction(handle, 1);
284 if (unlikely(err)) { 542 if (err)
285 ext4_std_error(sb, err); 543 goto out;
286 goto exit_bh; 544 /* Mark unused entries in inode bitmap used */
545 bh = bclean(handle, sb, block);
546 if (IS_ERR(bh)) {
547 err = PTR_ERR(bh);
548 goto out;
549 }
550
551 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
552 sb->s_blocksize * 8, bh->b_data);
553 err = ext4_handle_dirty_metadata(handle, NULL, bh);
554 if (err)
555 goto out;
556 brelse(bh);
287 } 557 }
288 brelse(bh); 558 bh = NULL;
289 /* Mark unused entries in inode bitmap used */ 559
290 ext4_debug("clear inode bitmap %#04llx (+%llu)\n", 560 /* Mark group tables in block bitmap */
291 input->inode_bitmap, input->inode_bitmap - start); 561 for (j = 0; j < GROUP_TABLE_COUNT; j++) {
292 if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { 562 count = group_table_count[j];
293 err = PTR_ERR(bh); 563 start = (&group_data[0].block_bitmap)[j];
294 goto exit_journal; 564 block = start;
565 for (i = 1; i < flex_gd->count; i++) {
566 block += group_table_count[j];
567 if (block == (&group_data[i].block_bitmap)[j]) {
568 count += group_table_count[j];
569 continue;
570 }
571 err = set_flexbg_block_bitmap(sb, handle,
572 flex_gd, start, count);
573 if (err)
574 goto out;
575 count = group_table_count[j];
576 start = group_data[i].block_bitmap;
577 block = start;
578 }
579
580 if (count) {
581 err = set_flexbg_block_bitmap(sb, handle,
582 flex_gd, start, count);
583 if (err)
584 goto out;
585 }
295 } 586 }
296 587
297 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 588out:
298 bh->b_data);
299 err = ext4_handle_dirty_metadata(handle, NULL, bh);
300 if (unlikely(err))
301 ext4_std_error(sb, err);
302exit_bh:
303 brelse(bh); 589 brelse(bh);
304 590 err2 = ext4_journal_stop(handle);
305exit_journal: 591 if (err2 && !err)
306 if ((err2 = ext4_journal_stop(handle)) && !err)
307 err = err2; 592 err = err2;
308 593
309 return err; 594 return err;
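
set_flexbg_block_bitmap() above splits a run [block, block + count) across per-group block bitmaps, clamping each chunk to the end of the current group's bitmap (one bitmap covers blocksize * 8 blocks). A standalone model of the chunking with hypothetical numbers: 4 KiB blocks, and group boundaries assumed to fall at exact multiples of 32768 (i.e. ignoring s_first_data_block):

    #include <stdio.h>

    int main(void)
    {
            const unsigned int blocks_per_group = 4096 * 8; /* 4 KiB blocksize */
            unsigned long long block = 32760;               /* straddles a boundary */
            unsigned int count = 20, count2;

            /* Mirrors the loop structure in set_flexbg_block_bitmap(). */
            for (count2 = count; count > 0; count -= count2, block += count2) {
                    unsigned long long group = block / blocks_per_group;
                    unsigned long long start = group * blocks_per_group;

                    count2 = blocks_per_group - (unsigned int)(block - start);
                    if (count2 > count)
                            count2 = count;
                    printf("group %llu: set bits [%llu, +%u)\n",
                           group, block - start, count2);
            }
            return 0;
    }

With these inputs the run is split into [32760, +8) in group 0 and [0, +12) in group 1.
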
@@ -351,10 +636,10 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
351 * groups in current filesystem that have BACKUPS, or -ve error code. 636 * groups in current filesystem that have BACKUPS, or -ve error code.
352 */ 637 */
353static int verify_reserved_gdb(struct super_block *sb, 638static int verify_reserved_gdb(struct super_block *sb,
639 ext4_group_t end,
354 struct buffer_head *primary) 640 struct buffer_head *primary)
355{ 641{
356 const ext4_fsblk_t blk = primary->b_blocknr; 642 const ext4_fsblk_t blk = primary->b_blocknr;
357 const ext4_group_t end = EXT4_SB(sb)->s_groups_count;
358 unsigned three = 1; 643 unsigned three = 1;
359 unsigned five = 5; 644 unsigned five = 5;
360 unsigned seven = 7; 645 unsigned seven = 7;
@@ -429,7 +714,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
429 if (!gdb_bh) 714 if (!gdb_bh)
430 return -EIO; 715 return -EIO;
431 716
432 gdbackups = verify_reserved_gdb(sb, gdb_bh); 717 gdbackups = verify_reserved_gdb(sb, group, gdb_bh);
433 if (gdbackups < 0) { 718 if (gdbackups < 0) {
434 err = gdbackups; 719 err = gdbackups;
435 goto exit_bh; 720 goto exit_bh;
@@ -592,7 +877,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
592 err = -EIO; 877 err = -EIO;
593 goto exit_bh; 878 goto exit_bh;
594 } 879 }
595 if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { 880 gdbackups = verify_reserved_gdb(sb, group, primary[res]);
881 if (gdbackups < 0) {
596 brelse(primary[res]); 882 brelse(primary[res]);
597 err = gdbackups; 883 err = gdbackups;
598 goto exit_bh; 884 goto exit_bh;
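
verify_reserved_gdb() now takes the group count as an explicit 'end' parameter: during a resize the backups must be checked against the group count the caller is working with, not whatever s_groups_count happens to hold at that instant. The backups it walks (via ext4_list_backups(), named in the hunk context above) live in group 1 and in every group whose number is a power of 3, 5 or 7. A small model of that enumeration:

    #include <stdio.h>

    int main(void)
    {
            unsigned end = 1024;    /* hypothetical group count ("end") */
            unsigned three = 1, five = 5, seven = 7;

            /* Emits the sparse_super backup groups below 'end' in order:
             * 1, 3, 5, 7, 9, 25, 27, 49, 81, ... */
            for (;;) {
                    unsigned *min = &three;

                    if (five < *min)
                            min = &five;
                    if (seven < *min)
                            min = &seven;
                    if (*min >= end)
                            break;
                    printf("backup superblock/GDT in group %u\n", *min);
                    *min *= (min == &three ? 3 : min == &five ? 5 : 7);
            }
            return 0;
    }
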
@@ -735,6 +1021,348 @@ exit_err:
735 } 1021 }
736} 1022}
737 1023
1024/*
 1025 * ext4_add_new_descs() adds @count group descriptors of groups
1026 * starting at @group
1027 *
1028 * @handle: journal handle
1029 * @sb: super block
1030 * @group: the group no. of the first group desc to be added
1031 * @resize_inode: the resize inode
1032 * @count: number of group descriptors to be added
1033 */
1034static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
1035 ext4_group_t group, struct inode *resize_inode,
1036 ext4_group_t count)
1037{
1038 struct ext4_sb_info *sbi = EXT4_SB(sb);
1039 struct ext4_super_block *es = sbi->s_es;
1040 struct buffer_head *gdb_bh;
1041 int i, gdb_off, gdb_num, err = 0;
1042
1043 for (i = 0; i < count; i++, group++) {
1044 int reserved_gdb = ext4_bg_has_super(sb, group) ?
1045 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
1046
1047 gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
1048 gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
1049
1050 /*
1051 * We will only either add reserved group blocks to a backup group
1052 * or remove reserved blocks for the first group in a new group block.
 1053 * Doing both would mean more complex code, and sane people don't
1054 * use non-sparse filesystems anymore. This is already checked above.
1055 */
1056 if (gdb_off) {
1057 gdb_bh = sbi->s_group_desc[gdb_num];
1058 err = ext4_journal_get_write_access(handle, gdb_bh);
1059
1060 if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
1061 err = reserve_backup_gdb(handle, resize_inode, group);
1062 } else
1063 err = add_new_gdb(handle, resize_inode, group);
1064 if (err)
1065 break;
1066 }
1067 return err;
1068}
1069
1070/*
 1071 * ext4_setup_new_descs() will set up the group descriptors of a flex bg
1072 */
1073static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
1074 struct ext4_new_flex_group_data *flex_gd)
1075{
1076 struct ext4_new_group_data *group_data = flex_gd->groups;
1077 struct ext4_group_desc *gdp;
1078 struct ext4_sb_info *sbi = EXT4_SB(sb);
1079 struct buffer_head *gdb_bh;
1080 ext4_group_t group;
1081 __u16 *bg_flags = flex_gd->bg_flags;
1082 int i, gdb_off, gdb_num, err = 0;
1083
1084
1085 for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) {
1086 group = group_data->group;
1087
1088 gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
1089 gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
1090
1091 /*
1092 * get_write_access() has been called on gdb_bh by ext4_add_new_desc().
1093 */
1094 gdb_bh = sbi->s_group_desc[gdb_num];
1095 /* Update group descriptor block for new group */
1096 gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data +
1097 gdb_off * EXT4_DESC_SIZE(sb));
1098
1099 memset(gdp, 0, EXT4_DESC_SIZE(sb));
1100 ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap);
1101 ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap);
1102 ext4_inode_table_set(sb, gdp, group_data->inode_table);
1103 ext4_free_group_clusters_set(sb, gdp,
1104 EXT4_B2C(sbi, group_data->free_blocks_count));
1105 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
1106 gdp->bg_flags = cpu_to_le16(*bg_flags);
1107 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
1108
1109 err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
1110 if (unlikely(err)) {
1111 ext4_std_error(sb, err);
1112 break;
1113 }
1114
1115 /*
1116 * We can allocate memory for mb_alloc based on the new group
1117 * descriptor
1118 */
1119 err = ext4_mb_add_groupinfo(sb, group, gdp);
1120 if (err)
1121 break;
1122 }
1123 return err;
1124}
1125
1126/*
1127 * ext4_update_super() updates the super block so that the newly added
1128 * groups can be seen by the filesystem.
1129 *
1130 * @sb: super block
1131 * @flex_gd: new added groups
1132 */
1133static void ext4_update_super(struct super_block *sb,
1134 struct ext4_new_flex_group_data *flex_gd)
1135{
1136 ext4_fsblk_t blocks_count = 0;
1137 ext4_fsblk_t free_blocks = 0;
1138 ext4_fsblk_t reserved_blocks = 0;
1139 struct ext4_new_group_data *group_data = flex_gd->groups;
1140 struct ext4_sb_info *sbi = EXT4_SB(sb);
1141 struct ext4_super_block *es = sbi->s_es;
1142 int i;
1143
1144 BUG_ON(flex_gd->count == 0 || group_data == NULL);
1145 /*
1146 * Make the new blocks and inodes valid next. We do this before
1147 * increasing the group count so that once the group is enabled,
1148 * all of its blocks and inodes are already valid.
1149 *
1150 * We always allocate group-by-group, then block-by-block or
1151 * inode-by-inode within a group, so enabling these
1152 * blocks/inodes before the group is live won't actually let us
1153 * allocate the new space yet.
1154 */
1155 for (i = 0; i < flex_gd->count; i++) {
1156 blocks_count += group_data[i].blocks_count;
1157 free_blocks += group_data[i].free_blocks_count;
1158 }
1159
1160 reserved_blocks = ext4_r_blocks_count(es) * 100;
1161 do_div(reserved_blocks, ext4_blocks_count(es));
1162 reserved_blocks *= blocks_count;
1163 do_div(reserved_blocks, 100);
1164
1165 ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
1166 le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
1167 flex_gd->count);
1168
1169 /*
1170 * We need to protect s_groups_count against other CPUs seeing
1171 * inconsistent state in the superblock.
1172 *
1173 * The precise rules we use are:
1174 *
1175 * * Writers must perform a smp_wmb() after updating all
1176 * dependent data and before modifying the groups count
1177 *
1178 * * Readers must perform an smp_rmb() after reading the groups
1179 * count and before reading any dependent data.
1180 *
1181 * NB. These rules can be relaxed when checking the group count
1182 * while freeing data, as we can only allocate from a block
1183 * group after serialising against the group count, and we can
1184 * only then free after serialising in turn against that
1185 * allocation.
1186 */
1187 smp_wmb();
1188
1189 /* Update the global fs size fields */
1190 sbi->s_groups_count += flex_gd->count;
1191
1192 /* Update the reserved block counts only once the new group is
1193 * active. */
1194 ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
1195 reserved_blocks);
1196
1197 /* Update the free space counts */
1198 percpu_counter_add(&sbi->s_freeclusters_counter,
1199 EXT4_B2C(sbi, free_blocks));
1200 percpu_counter_add(&sbi->s_freeinodes_counter,
1201 EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
1202
1203 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
1204 EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
1205 sbi->s_log_groups_per_flex) {
1206 ext4_group_t flex_group;
1207 flex_group = ext4_flex_group(sbi, group_data[0].group);
1208 atomic_add(EXT4_B2C(sbi, free_blocks),
1209 &sbi->s_flex_groups[flex_group].free_clusters);
1210 atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
1211 &sbi->s_flex_groups[flex_group].free_inodes);
1212 }
1213
1214 if (test_opt(sb, DEBUG))
1215 printk(KERN_DEBUG "EXT4-fs: added group %u:"
1216 "%llu blocks(%llu free %llu reserved)\n", flex_gd->count,
1217 blocks_count, free_blocks, reserved_blocks);
1218}
1219
1220/* Add a flex group to an fs. Ensure we handle all possible error conditions
1221 * _before_ we start modifying the filesystem, because we cannot abort the
1222 * transaction and not have it write the data to disk.
1223 */
1224static int ext4_flex_group_add(struct super_block *sb,
1225 struct inode *resize_inode,
1226 struct ext4_new_flex_group_data *flex_gd)
1227{
1228 struct ext4_sb_info *sbi = EXT4_SB(sb);
1229 struct ext4_super_block *es = sbi->s_es;
1230 ext4_fsblk_t o_blocks_count;
1231 ext4_grpblk_t last;
1232 ext4_group_t group;
1233 handle_t *handle;
1234 unsigned reserved_gdb;
1235 int err = 0, err2 = 0, credit;
1236
1237 BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags);
1238
1239 reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
1240 o_blocks_count = ext4_blocks_count(es);
1241 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
1242 BUG_ON(last);
1243
1244 err = setup_new_flex_group_blocks(sb, flex_gd);
1245 if (err)
1246 goto exit;
1247 /*
1248 * We will always be modifying at least the superblock and GDT
1249 * block. If we are adding a group past the last current GDT block,
1250 * we will also modify the inode and the dindirect block. If we
1251 * are adding a group with superblock/GDT backups we will also
1252 * modify each of the reserved GDT dindirect blocks.
1253 */
1254 credit = flex_gd->count * 4 + reserved_gdb;
1255 handle = ext4_journal_start_sb(sb, credit);
1256 if (IS_ERR(handle)) {
1257 err = PTR_ERR(handle);
1258 goto exit;
1259 }
1260
1261 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
1262 if (err)
1263 goto exit_journal;
1264
1265 group = flex_gd->groups[0].group;
1266 BUG_ON(group != EXT4_SB(sb)->s_groups_count);
1267 err = ext4_add_new_descs(handle, sb, group,
1268 resize_inode, flex_gd->count);
1269 if (err)
1270 goto exit_journal;
1271
1272 err = ext4_setup_new_descs(handle, sb, flex_gd);
1273 if (err)
1274 goto exit_journal;
1275
1276 ext4_update_super(sb, flex_gd);
1277
1278 err = ext4_handle_dirty_super(handle, sb);
1279
1280exit_journal:
1281 err2 = ext4_journal_stop(handle);
1282 if (!err)
1283 err = err2;
1284
1285 if (!err) {
1286 int i;
1287 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
1288 sizeof(struct ext4_super_block));
1289 for (i = 0; i < flex_gd->count; i++, group++) {
1290 struct buffer_head *gdb_bh;
1291 int gdb_num;
1292 gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb);
1293 gdb_bh = sbi->s_group_desc[gdb_num];
1294 update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
1295 gdb_bh->b_size);
1296 }
1297 }
1298exit:
1299 return err;
1300}
1301
1302static int ext4_setup_next_flex_gd(struct super_block *sb,
1303 struct ext4_new_flex_group_data *flex_gd,
1304 ext4_fsblk_t n_blocks_count,
1305 unsigned long flexbg_size)
1306{
1307 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
1308 struct ext4_new_group_data *group_data = flex_gd->groups;
1309 ext4_fsblk_t o_blocks_count;
1310 ext4_group_t n_group;
1311 ext4_group_t group;
1312 ext4_group_t last_group;
1313 ext4_grpblk_t last;
1314 ext4_grpblk_t blocks_per_group;
1315 unsigned long i;
1316
1317 blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb);
1318
1319 o_blocks_count = ext4_blocks_count(es);
1320
1321 if (o_blocks_count == n_blocks_count)
1322 return 0;
1323
1324 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
1325 BUG_ON(last);
1326 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last);
1327
1328 last_group = group | (flexbg_size - 1);
1329 if (last_group > n_group)
1330 last_group = n_group;
1331
1332 flex_gd->count = last_group - group + 1;
1333
1334 for (i = 0; i < flex_gd->count; i++) {
1335 int overhead;
1336
1337 group_data[i].group = group + i;
1338 group_data[i].blocks_count = blocks_per_group;
1339 overhead = ext4_bg_has_super(sb, group + i) ?
1340 (1 + ext4_bg_num_gdb(sb, group + i) +
1341 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
1342 group_data[i].free_blocks_count = blocks_per_group - overhead;
1343 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
1344 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
1345 flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
1346 EXT4_BG_INODE_UNINIT;
1347 else
1348 flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
1349 }
1350
1351 if (last_group == n_group &&
1352 EXT4_HAS_RO_COMPAT_FEATURE(sb,
1353 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
1354 /* We need to initialize block bitmap of last group. */
1355 flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT;
1356
1357 if ((last_group == n_group) && (last != blocks_per_group - 1)) {
1358 group_data[i - 1].blocks_count = last + 1;
1359 group_data[i - 1].free_blocks_count -= blocks_per_group-
1360 last - 1;
1361 }
1362
1363 return 1;
1364}
1365
738/* Add group descriptor data to an existing or new group descriptor block. 1366/* Add group descriptor data to an existing or new group descriptor block.
739 * Ensure we handle all possible error conditions _before_ we start modifying 1367 * Ensure we handle all possible error conditions _before_ we start modifying
740 * the filesystem, because we cannot abort the transaction and not have it 1368 * the filesystem, because we cannot abort the transaction and not have it
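
The helpers added in this hunk compose into the main loop of ext4_resize_fs(), whose body begins at the end of this diff (and is truncated there): the range [o_blocks_count, n_blocks_count) is carved into flex-group-sized chunks which are added one chunk at a time. In outline (a sketch, not a verbatim excerpt):

    flex_gd = alloc_flex_gd(flexbg_size);
    if (flex_gd == NULL) {
            err = -ENOMEM;
            goto out;
    }

    /* Add one flex group per iteration until the fs reaches
     * n_blocks_count; ext4_setup_next_flex_gd() returns 0 once
     * there is nothing left to add. */
    while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
                                   flexbg_size)) {
            ext4_alloc_group_tables(sb, flex_gd, flexbg_size);
            err = ext4_flex_group_add(sb, resize_inode, flex_gd);
            if (err)
                    break;
    }

    if (flex_gd)
            free_flex_gd(flex_gd);
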
@@ -750,16 +1378,15 @@ exit_err:
750 */ 1378 */
751int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) 1379int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
752{ 1380{
1381 struct ext4_new_flex_group_data flex_gd;
753 struct ext4_sb_info *sbi = EXT4_SB(sb); 1382 struct ext4_sb_info *sbi = EXT4_SB(sb);
754 struct ext4_super_block *es = sbi->s_es; 1383 struct ext4_super_block *es = sbi->s_es;
755 int reserved_gdb = ext4_bg_has_super(sb, input->group) ? 1384 int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
756 le16_to_cpu(es->s_reserved_gdt_blocks) : 0; 1385 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
757 struct buffer_head *primary = NULL;
758 struct ext4_group_desc *gdp;
759 struct inode *inode = NULL; 1386 struct inode *inode = NULL;
760 handle_t *handle;
761 int gdb_off, gdb_num; 1387 int gdb_off, gdb_num;
762 int err, err2; 1388 int err;
1389 __u16 bg_flags = 0;
763 1390
764 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); 1391 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
765 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); 1392 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
@@ -798,175 +1425,69 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
798 } 1425 }
799 1426
800 1427
801 if ((err = verify_group_input(sb, input))) 1428 err = verify_group_input(sb, input);
802 goto exit_put; 1429 if (err)
1430 goto out;
803 1431
804 if ((err = setup_new_group_blocks(sb, input))) 1432 flex_gd.count = 1;
805 goto exit_put; 1433 flex_gd.groups = input;
1434 flex_gd.bg_flags = &bg_flags;
1435 err = ext4_flex_group_add(sb, inode, &flex_gd);
1436out:
1437 iput(inode);
1438 return err;
1439} /* ext4_group_add */
806 1440
807 /* 1441/*
808 * We will always be modifying at least the superblock and a GDT 1442 * extend a group without checking assuming that checking has been done.
809 * block. If we are adding a group past the last current GDT block, 1443 */
810 * we will also modify the inode and the dindirect block. If we 1444static int ext4_group_extend_no_check(struct super_block *sb,
811 * are adding a group with superblock/GDT backups we will also 1445 ext4_fsblk_t o_blocks_count, ext4_grpblk_t add)
812 * modify each of the reserved GDT dindirect blocks. 1446{
1447 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
1448 handle_t *handle;
1449 int err = 0, err2;
1450
1451 /* We will update the superblock, one block bitmap, and
1452 * one group descriptor via ext4_group_add_blocks().
813 */ 1453 */
814 handle = ext4_journal_start_sb(sb, 1454 handle = ext4_journal_start_sb(sb, 3);
815 ext4_bg_has_super(sb, input->group) ?
816 3 + reserved_gdb : 4);
817 if (IS_ERR(handle)) { 1455 if (IS_ERR(handle)) {
818 err = PTR_ERR(handle); 1456 err = PTR_ERR(handle);
819 goto exit_put; 1457 ext4_warning(sb, "error %d on journal start", err);
1458 return err;
820 } 1459 }
821 1460
822 if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) 1461 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
823 goto exit_journal; 1462 if (err) {
824 1463 ext4_warning(sb, "error %d on journal write access", err);
825 /* 1464 goto errout;
826 * We will only either add reserved group blocks to a backup group
827 * or remove reserved blocks for the first group in a new group block.
 828 * Doing both would mean more complex code, and sane people don't
829 * use non-sparse filesystems anymore. This is already checked above.
830 */
831 if (gdb_off) {
832 primary = sbi->s_group_desc[gdb_num];
833 if ((err = ext4_journal_get_write_access(handle, primary)))
834 goto exit_journal;
835
836 if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) {
837 err = reserve_backup_gdb(handle, inode, input->group);
838 if (err)
839 goto exit_journal;
840 }
841 } else {
842 /*
843 * Note that we can access new group descriptor block safely
844 * only if add_new_gdb() succeeds.
845 */
846 err = add_new_gdb(handle, inode, input->group);
847 if (err)
848 goto exit_journal;
849 primary = sbi->s_group_desc[gdb_num];
850 } 1465 }
851 1466
852 /* 1467 ext4_blocks_count_set(es, o_blocks_count + add);
853 * OK, now we've set up the new group. Time to make it active. 1468 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
854 * 1469 o_blocks_count + add);
855 * so we have to be safe wrt. concurrent accesses the group 1470 /* We add the blocks to the bitmap and set the group need init bit */
856 * data. So we need to be careful to set all of the relevant 1471 err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
857 * group descriptor data etc. *before* we enable the group.
858 *
859 * The key field here is sbi->s_groups_count: as long as
860 * that retains its old value, nobody is going to access the new
861 * group.
862 *
863 * So first we update all the descriptor metadata for the new
864 * group; then we update the total disk blocks count; then we
865 * update the groups count to enable the group; then finally we
866 * update the free space counts so that the system can start
867 * using the new disk blocks.
868 */
869
870 /* Update group descriptor block for new group */
871 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
872 gdb_off * EXT4_DESC_SIZE(sb));
873
874 memset(gdp, 0, EXT4_DESC_SIZE(sb));
875 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
876 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
877 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
878 ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count);
879 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
880 gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
881 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
882
883 /*
884 * We can allocate memory for mb_alloc based on the new group
885 * descriptor
886 */
887 err = ext4_mb_add_groupinfo(sb, input->group, gdp);
888 if (err) 1472 if (err)
889 goto exit_journal; 1473 goto errout;
890
891 /*
892 * Make the new blocks and inodes valid next. We do this before
893 * increasing the group count so that once the group is enabled,
894 * all of its blocks and inodes are already valid.
895 *
896 * We always allocate group-by-group, then block-by-block or
897 * inode-by-inode within a group, so enabling these
898 * blocks/inodes before the group is live won't actually let us
899 * allocate the new space yet.
900 */
901 ext4_blocks_count_set(es, ext4_blocks_count(es) +
902 input->blocks_count);
903 le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb));
904
905 /*
906 * We need to protect s_groups_count against other CPUs seeing
907 * inconsistent state in the superblock.
908 *
909 * The precise rules we use are:
910 *
911 * * Writers must perform a smp_wmb() after updating all dependent
912 * data and before modifying the groups count
913 *
914 * * Readers must perform an smp_rmb() after reading the groups count
915 * and before reading any dependent data.
916 *
917 * NB. These rules can be relaxed when checking the group count
918 * while freeing data, as we can only allocate from a block
919 * group after serialising against the group count, and we can
920 * only then free after serialising in turn against that
921 * allocation.
922 */
923 smp_wmb();
924
925 /* Update the global fs size fields */
926 sbi->s_groups_count++;
927
928 err = ext4_handle_dirty_metadata(handle, NULL, primary);
929 if (unlikely(err)) {
930 ext4_std_error(sb, err);
931 goto exit_journal;
932 }
933
934 /* Update the reserved block counts only once the new group is
935 * active. */
936 ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
937 input->reserved_blocks);
938
939 /* Update the free space counts */
940 percpu_counter_add(&sbi->s_freeclusters_counter,
941 EXT4_B2C(sbi, input->free_blocks_count));
942 percpu_counter_add(&sbi->s_freeinodes_counter,
943 EXT4_INODES_PER_GROUP(sb));
944
945 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
946 sbi->s_log_groups_per_flex) {
947 ext4_group_t flex_group;
948 flex_group = ext4_flex_group(sbi, input->group);
949 atomic_add(EXT4_B2C(sbi, input->free_blocks_count),
950 &sbi->s_flex_groups[flex_group].free_clusters);
951 atomic_add(EXT4_INODES_PER_GROUP(sb),
952 &sbi->s_flex_groups[flex_group].free_inodes);
953 }
954
955 ext4_handle_dirty_super(handle, sb); 1474 ext4_handle_dirty_super(handle, sb);
956 1475 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1476 o_blocks_count + add);
957exit_journal: 1477errout:
958 if ((err2 = ext4_journal_stop(handle)) && !err) 1478 err2 = ext4_journal_stop(handle);
1479 if (err2 && !err)
959 err = err2; 1480 err = err2;
1481
960 if (!err && primary) { 1482 if (!err) {
1483 if (test_opt(sb, DEBUG))
1484 printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
1485 "blocks\n", ext4_blocks_count(es));
961 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, 1486 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
962 sizeof(struct ext4_super_block)); 1487 sizeof(struct ext4_super_block));
963 update_backups(sb, primary->b_blocknr, primary->b_data,
964 primary->b_size);
965 } 1488 }
966exit_put:
967 iput(inode);
968 return err; 1489 return err;
969} /* ext4_group_add */ 1490}
970 1491
971/* 1492/*
972 * Extend the filesystem to the new number of blocks specified. This entry 1493 * Extend the filesystem to the new number of blocks specified. This entry
@@ -985,8 +1506,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
985 ext4_grpblk_t last; 1506 ext4_grpblk_t last;
986 ext4_grpblk_t add; 1507 ext4_grpblk_t add;
987 struct buffer_head *bh; 1508 struct buffer_head *bh;
988 handle_t *handle; 1509 int err;
989 int err, err2;
990 ext4_group_t group; 1510 ext4_group_t group;
991 1511
992 o_blocks_count = ext4_blocks_count(es); 1512 o_blocks_count = ext4_blocks_count(es);
@@ -1042,42 +1562,119 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1042 } 1562 }
1043 brelse(bh); 1563 brelse(bh);
1044 1564
1045 /* We will update the superblock, one block bitmap, and 1565 err = ext4_group_extend_no_check(sb, o_blocks_count, add);
1046 * one group descriptor via ext4_free_blocks(). 1566 return err;
1047 */ 1567} /* ext4_group_extend */
1048 handle = ext4_journal_start_sb(sb, 3); 1568
1049 if (IS_ERR(handle)) { 1569/*
1050 err = PTR_ERR(handle); 1570 * ext4_resize_fs() resizes a fs to a new size specified by @n_blocks_count
1051 ext4_warning(sb, "error %d on journal start", err); 1571 *
1052 goto exit_put; 1572 * @sb: super block of the fs to be resized
1573 * @n_blocks_count: the number of blocks in the resized fs
1574 */
1575int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1576{
1577 struct ext4_new_flex_group_data *flex_gd = NULL;
1578 struct ext4_sb_info *sbi = EXT4_SB(sb);
1579 struct ext4_super_block *es = sbi->s_es;
1580 struct buffer_head *bh;
1581 struct inode *resize_inode;
1582 ext4_fsblk_t o_blocks_count;
1583 ext4_group_t o_group;
1584 ext4_group_t n_group;
1585 ext4_grpblk_t offset;
1586 unsigned long n_desc_blocks;
1587 unsigned long o_desc_blocks;
1588 unsigned long desc_blocks;
1589 int err = 0, flexbg_size = 1;
1590
1591 o_blocks_count = ext4_blocks_count(es);
1592
1593 if (test_opt(sb, DEBUG))
1594 printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu "
1595 "upto %llu blocks\n", o_blocks_count, n_blocks_count);
1596
1597 if (n_blocks_count < o_blocks_count) {
1598 /* On-line shrinking not supported */
1599 ext4_warning(sb, "can't shrink FS - resize aborted");
1600 return -EINVAL;
1053 } 1601 }
1054 1602
1055 if ((err = ext4_journal_get_write_access(handle, 1603 if (n_blocks_count == o_blocks_count)
1056 EXT4_SB(sb)->s_sbh))) { 1604 /* Nothing to do */
1057 ext4_warning(sb, "error %d on journal write access", err); 1605 return 0;
1058 ext4_journal_stop(handle); 1606
1059 goto exit_put; 1607 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
1608 ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset);
1609
1610 n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /
1611 EXT4_DESC_PER_BLOCK(sb);
1612 o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
1613 EXT4_DESC_PER_BLOCK(sb);
1614 desc_blocks = n_desc_blocks - o_desc_blocks;
1615
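Both expressions are the usual round-up division idiom (equivalent to DIV_ROUND_UP). A worked example with illustrative numbers:

	/* With 4 KiB blocks and 32-byte descriptors, EXT4_DESC_PER_BLOCK(sb)
	 * is 128.  Growing from 100 groups to 2000 groups (n_group == 1999):
	 *   n_desc_blocks = (1999 + 128) / 128      == 16
	 *   o_desc_blocks = (100 + 128 - 1) / 128   ==  1
	 *   desc_blocks   = 16 - 1                  == 15
	 * and those 15 extra GDT blocks must fit in the reserved
	 * s_reserved_gdt_blocks checked just below. */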
1616 if (desc_blocks &&
1617 (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) ||
1618 le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) {
1619 ext4_warning(sb, "No reserved GDT blocks, can't resize");
1620 return -EPERM;
1060 } 1621 }
1061 ext4_blocks_count_set(es, o_blocks_count + add);
1062 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1063 o_blocks_count + add);
1064 /* We add the blocks to the bitmap and set the group need init bit */
1065 err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
1066 ext4_handle_dirty_super(handle, sb);
1067 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1068 o_blocks_count + add);
1069 err2 = ext4_journal_stop(handle);
1070 if (!err && err2)
1071 err = err2;
1072 1622
1073 if (err) 1623 resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
1074 goto exit_put; 1624 if (IS_ERR(resize_inode)) {
1625 ext4_warning(sb, "Error opening resize inode");
1626 return PTR_ERR(resize_inode);
1627 }
1075 1628
1629 /* See if the device is actually as big as what was requested */
1630 bh = sb_bread(sb, n_blocks_count - 1);
1631 if (!bh) {
1632 ext4_warning(sb, "can't read last block, resize aborted");
1633 return -ENOSPC;
1634 }
1635 brelse(bh);
1636
1637 if (offset != 0) {
1638 /* extend the last group */
1639 ext4_grpblk_t add;
1640 add = EXT4_BLOCKS_PER_GROUP(sb) - offset;
1641 err = ext4_group_extend_no_check(sb, o_blocks_count, add);
1642 if (err)
1643 goto out;
1644 }
1645
1646 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
1647 es->s_log_groups_per_flex)
1648 flexbg_size = 1 << es->s_log_groups_per_flex;
1649
1650 o_blocks_count = ext4_blocks_count(es);
1651 if (o_blocks_count == n_blocks_count)
1652 goto out;
1653
1654 flex_gd = alloc_flex_gd(flexbg_size);
1655 if (flex_gd == NULL) {
1656 err = -ENOMEM;
1657 goto out;
1658 }
1659
1660 /* Add flex groups. Note that a regular group is a
1661 * flex group with 1 group.
1662 */
1663 while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
1664 flexbg_size)) {
1665 ext4_alloc_group_tables(sb, flex_gd, flexbg_size);
1666 err = ext4_flex_group_add(sb, resize_inode, flex_gd);
1667 if (unlikely(err))
1668 break;
1669 }
1670
1671out:
1672 if (flex_gd)
1673 free_flex_gd(flex_gd);
1674
1675 iput(resize_inode);
1076 if (test_opt(sb, DEBUG)) 1676 if (test_opt(sb, DEBUG))
1077 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", 1677 printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu "
1078 ext4_blocks_count(es)); 1678 "up to %llu blocks\n", o_blocks_count, n_blocks_count);
1079 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
1080 sizeof(struct ext4_super_block));
1081exit_put:
1082 return err; 1679 return err;
1083} /* ext4_group_extend */ 1680}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 64e2529ae9bb..502c61fd7392 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1095,7 +1095,7 @@ static int ext4_show_options(struct seq_file *seq, struct dentry *root)
1095 } 1095 }
1096 if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { 1096 if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
1097 seq_printf(seq, ",max_batch_time=%u", 1097 seq_printf(seq, ",max_batch_time=%u",
1098 (unsigned) sbi->s_min_batch_time); 1098 (unsigned) sbi->s_max_batch_time);
1099 } 1099 }
1100 1100
1101 /* 1101 /*
@@ -2005,17 +2005,16 @@ static int ext4_fill_flex_info(struct super_block *sb)
2005 struct ext4_group_desc *gdp = NULL; 2005 struct ext4_group_desc *gdp = NULL;
2006 ext4_group_t flex_group_count; 2006 ext4_group_t flex_group_count;
2007 ext4_group_t flex_group; 2007 ext4_group_t flex_group;
2008 int groups_per_flex = 0; 2008 unsigned int groups_per_flex = 0;
2009 size_t size; 2009 size_t size;
2010 int i; 2010 int i;
2011 2011
2012 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 2012 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
2013 groups_per_flex = 1 << sbi->s_log_groups_per_flex; 2013 if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
2014
2015 if (groups_per_flex < 2) {
2016 sbi->s_log_groups_per_flex = 0; 2014 sbi->s_log_groups_per_flex = 0;
2017 return 1; 2015 return 1;
2018 } 2016 }
2017 groups_per_flex = 1 << sbi->s_log_groups_per_flex;
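The new [1, 31] bound matters because s_log_groups_per_flex is read straight from the on-disk superblock, and in C a 32-bit shift by 32 or more is undefined behaviour, so the old code could turn a corrupted value into a bogus groups_per_flex. A sketch of the failure mode:

	unsigned int log = 36;		/* corrupted on-disk value */
	unsigned int gpf = 1 << log;	/* undefined behaviour; x86 masks the
					   shift count, yielding 1 << 4 == 16 */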
2019 2018
2020 /* We allocate both existing and potentially added groups */ 2019 /* We allocate both existing and potentially added groups */
2021 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + 2020 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
@@ -3506,7 +3505,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3506 * of the filesystem. 3505 * of the filesystem.
3507 */ 3506 */
3508 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { 3507 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
3509 ext4_msg(sb, KERN_WARNING, "bad geometry: first data" 3508 ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
3510 "block %u is beyond end of filesystem (%llu)", 3509 "block %u is beyond end of filesystem (%llu)",
3511 le32_to_cpu(es->s_first_data_block), 3510 le32_to_cpu(es->s_first_data_block),
3512 ext4_blocks_count(es)); 3511 ext4_blocks_count(es));
@@ -3733,10 +3732,12 @@ no_journal:
3733 } 3732 }
3734 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 3733 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
3735 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); 3734 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
3735 iput(root);
3736 goto failed_mount4; 3736 goto failed_mount4;
3737 } 3737 }
3738 sb->s_root = d_alloc_root(root); 3738 sb->s_root = d_alloc_root(root);
3739 if (!sb->s_root) { 3739 if (!sb->s_root) {
3740 iput(root);
3740 ext4_msg(sb, KERN_ERR, "get root dentry failed"); 3741 ext4_msg(sb, KERN_ERR, "get root dentry failed");
3741 ret = -ENOMEM; 3742 ret = -ENOMEM;
3742 goto failed_mount4; 3743 goto failed_mount4;
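Both added iput(root) calls plug the same leak: ext4_iget() returns a referenced inode, and d_alloc_root() consumes that reference only on success, so once the reworked labels below stop doing iput(root) at failed_mount4, each failure before s_root is set has to drop the reference itself (later failures go through failed_mount4a, which releases it via dput(sb->s_root)).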
@@ -3773,7 +3774,7 @@ no_journal:
3773 if (err) { 3774 if (err) {
3774 ext4_msg(sb, KERN_ERR, "failed to initialize system " 3775 ext4_msg(sb, KERN_ERR, "failed to initialize system "
3775 "zone (%d)", err); 3776 "zone (%d)", err);
3776 goto failed_mount4; 3777 goto failed_mount4a;
3777 } 3778 }
3778 3779
3779 ext4_ext_init(sb); 3780 ext4_ext_init(sb);
@@ -3830,13 +3831,14 @@ cantfind_ext4:
3830failed_mount7: 3831failed_mount7:
3831 ext4_unregister_li_request(sb); 3832 ext4_unregister_li_request(sb);
3832failed_mount6: 3833failed_mount6:
3833 ext4_ext_release(sb);
3834failed_mount5:
3835 ext4_mb_release(sb); 3834 ext4_mb_release(sb);
3835failed_mount5:
3836 ext4_ext_release(sb);
3836 ext4_release_system_zone(sb); 3837 ext4_release_system_zone(sb);
3837failed_mount4: 3838failed_mount4a:
3838 iput(root); 3839 dput(sb->s_root);
3839 sb->s_root = NULL; 3840 sb->s_root = NULL;
3841failed_mount4:
3840 ext4_msg(sb, KERN_ERR, "mount failed"); 3842 ext4_msg(sb, KERN_ERR, "mount failed");
3841 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); 3843 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
3842failed_mount_wq: 3844failed_mount_wq:
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 34e4350dd4d9..d2a200624af5 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -3,7 +3,6 @@
3 * Handler for storing security labels as extended attributes. 3 * Handler for storing security labels as extended attributes.
4 */ 4 */
5 5
6#include <linux/module.h>
7#include <linux/string.h> 6#include <linux/string.h>
8#include <linux/fs.h> 7#include <linux/fs.h>
9#include <linux/security.h> 8#include <linux/security.h>
@@ -48,8 +47,9 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
48 name, value, size, flags); 47 name, value, size, flags);
49} 48}
50 49
51int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, 50static int
52 void *fs_info) 51ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
52 void *fs_info)
53{ 53{
54 const struct xattr *xattr; 54 const struct xattr *xattr;
55 handle_t *handle = fs_info; 55 handle_t *handle = fs_info;
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 37e6ebca2cc3..95f1f4ab59a4 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -5,7 +5,6 @@
5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> 5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */ 6 */
7 7
8#include <linux/module.h>
9#include <linux/string.h> 8#include <linux/string.h>
10#include <linux/capability.h> 9#include <linux/capability.h>
11#include <linux/fs.h> 10#include <linux/fs.h>
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 98c375352d0e..0edb7611ffbe 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -5,7 +5,6 @@
5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> 5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */ 6 */
7 7
8#include <linux/module.h>
9#include <linux/string.h> 8#include <linux/string.h>
10#include <linux/fs.h> 9#include <linux/fs.h>
11#include "ext4_jbd2.h" 10#include "ext4_jbd2.h"
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 3a444b4e2368..a81eb2367d39 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -512,7 +512,8 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
512 int charlen; 512 int charlen;
513 513
514 if (utf8) { 514 if (utf8) {
515 *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname); 515 *outlen = utf8s_to_utf16s(name, len, UTF16_HOST_ENDIAN,
516 (wchar_t *) outname, FAT_LFN_LEN + 2);
516 if (*outlen < 0) 517 if (*outlen < 0)
517 return *outlen; 518 return *outlen;
518 else if (*outlen > FAT_LFN_LEN) 519 else if (*outlen > FAT_LFN_LEN)
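The two new arguments make the conversion explicit about byte order and bound the output, so an over-long UTF-8 name can no longer overrun outname; FAT_LFN_LEN + 2 leaves enough slack to detect the over-length case checked just below. Judging from this call site, the updated prototype is along these lines (a reconstruction, not quoted from the NLS header):

	int utf8s_to_utf16s(const u8 *s, int len, enum utf16_endian endian,
			    wchar_t *pwcs, int maxout);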
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e2951506434d..f855916657ba 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/pagemap.h>
23#include <linux/kthread.h> 24#include <linux/kthread.h>
24#include <linux/freezer.h> 25#include <linux/freezer.h>
25#include <linux/writeback.h> 26#include <linux/writeback.h>
@@ -29,6 +30,11 @@
29#include "internal.h" 30#include "internal.h"
30 31
31/* 32/*
33 * 4MB minimal write chunk size
34 */
35#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
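The constant is 4 MiB expressed in pages: 4096UL is the chunk size in KiB, and the right shift by (PAGE_CACHE_SHIFT - 10) converts KiB to pages. For example:

	/* With 4 KiB pages, PAGE_CACHE_SHIFT == 12, so
	 * MIN_WRITEBACK_PAGES = 4096 >> (12 - 10) = 1024 pages = 4 MiB. */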
36
37/*
32 * Passed into wb_writeback(), essentially a subset of writeback_control 38 * Passed into wb_writeback(), essentially a subset of writeback_control
33 */ 39 */
34struct wb_writeback_work { 40struct wb_writeback_work {
@@ -742,11 +748,17 @@ static long wb_writeback(struct bdi_writeback *wb,
742 if (work->for_background && !over_bground_thresh(wb->bdi)) 748 if (work->for_background && !over_bground_thresh(wb->bdi))
743 break; 749 break;
744 750
751 /*
752 * Kupdate and background works are special and we want to
753 * include all inodes that need writing. Livelock avoidance is
754 * handled by these works yielding to any other work so we are
755 * safe.
756 */
745 if (work->for_kupdate) { 757 if (work->for_kupdate) {
746 oldest_jif = jiffies - 758 oldest_jif = jiffies -
747 msecs_to_jiffies(dirty_expire_interval * 10); 759 msecs_to_jiffies(dirty_expire_interval * 10);
748 work->older_than_this = &oldest_jif; 760 } else if (work->for_background)
749 } 761 oldest_jif = jiffies;
750 762
751 trace_writeback_start(wb->bdi, work); 763 trace_writeback_start(wb->bdi, work);
752 if (list_empty(&wb->b_io)) 764 if (list_empty(&wb->b_io))
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 2aaf3eaaf13d..5f3368ab0fa9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1378,7 +1378,59 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
1378 down_read(&fc->killsb); 1378 down_read(&fc->killsb);
1379 err = -ENOENT; 1379 err = -ENOENT;
1380 if (fc->sb) 1380 if (fc->sb)
1381 err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name); 1381 err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name);
1382 up_read(&fc->killsb);
1383 kfree(buf);
1384 return err;
1385
1386err:
1387 kfree(buf);
1388 fuse_copy_finish(cs);
1389 return err;
1390}
1391
1392static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
1393 struct fuse_copy_state *cs)
1394{
1395 struct fuse_notify_delete_out outarg;
1396 int err = -ENOMEM;
1397 char *buf;
1398 struct qstr name;
1399
1400 buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
1401 if (!buf)
1402 goto err;
1403
1404 err = -EINVAL;
1405 if (size < sizeof(outarg))
1406 goto err;
1407
1408 err = fuse_copy_one(cs, &outarg, sizeof(outarg));
1409 if (err)
1410 goto err;
1411
1412 err = -ENAMETOOLONG;
1413 if (outarg.namelen > FUSE_NAME_MAX)
1414 goto err;
1415
1416 err = -EINVAL;
1417 if (size != sizeof(outarg) + outarg.namelen + 1)
1418 goto err;
1419
1420 name.name = buf;
1421 name.len = outarg.namelen;
1422 err = fuse_copy_one(cs, buf, outarg.namelen + 1);
1423 if (err)
1424 goto err;
1425 fuse_copy_finish(cs);
1426 buf[outarg.namelen] = 0;
1427 name.hash = full_name_hash(name.name, name.len);
1428
1429 down_read(&fc->killsb);
1430 err = -ENOENT;
1431 if (fc->sb)
1432 err = fuse_reverse_inval_entry(fc->sb, outarg.parent,
1433 outarg.child, &name);
1382 up_read(&fc->killsb); 1434 up_read(&fc->killsb);
1383 kfree(buf); 1435 kfree(buf);
1384 return err; 1436 return err;
@@ -1597,6 +1649,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
1597 case FUSE_NOTIFY_RETRIEVE: 1649 case FUSE_NOTIFY_RETRIEVE:
1598 return fuse_notify_retrieve(fc, size, cs); 1650 return fuse_notify_retrieve(fc, size, cs);
1599 1651
1652 case FUSE_NOTIFY_DELETE:
1653 return fuse_notify_delete(fc, size, cs);
1654
1600 default: 1655 default:
1601 fuse_copy_finish(cs); 1656 fuse_copy_finish(cs);
1602 return -EINVAL; 1657 return -EINVAL;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 5ddd6ea8f839..206632887bb4 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -868,7 +868,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
868} 868}
869 869
870int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, 870int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
871 struct qstr *name) 871 u64 child_nodeid, struct qstr *name)
872{ 872{
873 int err = -ENOTDIR; 873 int err = -ENOTDIR;
874 struct inode *parent; 874 struct inode *parent;
@@ -895,8 +895,36 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
895 895
896 fuse_invalidate_attr(parent); 896 fuse_invalidate_attr(parent);
897 fuse_invalidate_entry(entry); 897 fuse_invalidate_entry(entry);
898
899 if (child_nodeid != 0 && entry->d_inode) {
900 mutex_lock(&entry->d_inode->i_mutex);
901 if (get_node_id(entry->d_inode) != child_nodeid) {
902 err = -ENOENT;
903 goto badentry;
904 }
905 if (d_mountpoint(entry)) {
906 err = -EBUSY;
907 goto badentry;
908 }
909 if (S_ISDIR(entry->d_inode->i_mode)) {
910 shrink_dcache_parent(entry);
911 if (!simple_empty(entry)) {
912 err = -ENOTEMPTY;
913 goto badentry;
914 }
915 entry->d_inode->i_flags |= S_DEAD;
916 }
917 dont_mount(entry);
918 clear_nlink(entry->d_inode);
919 err = 0;
920 badentry:
921 mutex_unlock(&entry->d_inode->i_mutex);
922 if (!err)
923 d_delete(entry);
924 } else {
925 err = 0;
926 }
898 dput(entry); 927 dput(entry);
899 err = 0;
900 928
901 unlock: 929 unlock:
902 mutex_unlock(&parent->i_mutex); 930 mutex_unlock(&parent->i_mutex);
@@ -1182,6 +1210,30 @@ static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end,
1182 return fuse_fsync_common(file, start, end, datasync, 1); 1210 return fuse_fsync_common(file, start, end, datasync, 1);
1183} 1211}
1184 1212
1213static long fuse_dir_ioctl(struct file *file, unsigned int cmd,
1214 unsigned long arg)
1215{
1216 struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
1217
1218 /* FUSE_IOCTL_DIR only supported for API version >= 7.18 */
1219 if (fc->minor < 18)
1220 return -ENOTTY;
1221
1222 return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR);
1223}
1224
1225static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd,
1226 unsigned long arg)
1227{
1228 struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
1229
1230 if (fc->minor < 18)
1231 return -ENOTTY;
1232
1233 return fuse_ioctl_common(file, cmd, arg,
1234 FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR);
1235}
1236
1185static bool update_mtime(unsigned ivalid) 1237static bool update_mtime(unsigned ivalid)
1186{ 1238{
1187 /* Always update if mtime is explicitly set */ 1239 /* Always update if mtime is explicitly set */
@@ -1596,6 +1648,8 @@ static const struct file_operations fuse_dir_operations = {
1596 .open = fuse_dir_open, 1648 .open = fuse_dir_open,
1597 .release = fuse_dir_release, 1649 .release = fuse_dir_release,
1598 .fsync = fuse_dir_fsync, 1650 .fsync = fuse_dir_fsync,
1651 .unlocked_ioctl = fuse_dir_ioctl,
1652 .compat_ioctl = fuse_dir_compat_ioctl,
1599}; 1653};
1600 1654
1601static const struct inode_operations fuse_common_inode_operations = { 1655static const struct inode_operations fuse_common_inode_operations = {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0c84100acd44..4a199fd93fbd 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1555,48 +1555,16 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
1555 loff_t retval; 1555 loff_t retval;
1556 struct inode *inode = file->f_path.dentry->d_inode; 1556 struct inode *inode = file->f_path.dentry->d_inode;
1557 1557
1558 mutex_lock(&inode->i_mutex); 1558 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
1559 if (origin != SEEK_CUR && origin != SEEK_SET) { 1559 if (origin == SEEK_CUR || origin == SEEK_SET)
1560 retval = fuse_update_attributes(inode, NULL, file, NULL); 1560 return generic_file_llseek(file, offset, origin);
1561 if (retval)
1562 goto exit;
1563 }
1564 1561
1565 switch (origin) { 1562 mutex_lock(&inode->i_mutex);
1566 case SEEK_END: 1563 retval = fuse_update_attributes(inode, NULL, file, NULL);
1567 offset += i_size_read(inode); 1564 if (!retval)
1568 break; 1565 retval = generic_file_llseek(file, offset, origin);
1569 case SEEK_CUR:
1570 if (offset == 0) {
1571 retval = file->f_pos;
1572 goto exit;
1573 }
1574 offset += file->f_pos;
1575 break;
1576 case SEEK_DATA:
1577 if (offset >= i_size_read(inode)) {
1578 retval = -ENXIO;
1579 goto exit;
1580 }
1581 break;
1582 case SEEK_HOLE:
1583 if (offset >= i_size_read(inode)) {
1584 retval = -ENXIO;
1585 goto exit;
1586 }
1587 offset = i_size_read(inode);
1588 break;
1589 }
1590 retval = -EINVAL;
1591 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
1592 if (offset != file->f_pos) {
1593 file->f_pos = offset;
1594 file->f_version = 0;
1595 }
1596 retval = offset;
1597 }
1598exit:
1599 mutex_unlock(&inode->i_mutex); 1566 mutex_unlock(&inode->i_mutex);
1567
1600 return retval; 1568 return retval;
1601} 1569}
1602 1570
@@ -1808,7 +1776,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1808 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); 1776 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1809 1777
1810 err = -ENOMEM; 1778 err = -ENOMEM;
1811 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); 1779 pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL);
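The switch to kcalloc() is not cosmetic: kcalloc(n, size, flags) returns NULL if n * size would overflow, whereas the open-coded kzalloc(n * size, flags) silently wraps the multiplication.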
1812 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); 1780 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
1813 if (!pages || !iov_page) 1781 if (!pages || !iov_page)
1814 goto out; 1782 goto out;
@@ -1958,8 +1926,8 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1958} 1926}
1959EXPORT_SYMBOL_GPL(fuse_do_ioctl); 1927EXPORT_SYMBOL_GPL(fuse_do_ioctl);
1960 1928
1961static long fuse_file_ioctl_common(struct file *file, unsigned int cmd, 1929long fuse_ioctl_common(struct file *file, unsigned int cmd,
1962 unsigned long arg, unsigned int flags) 1930 unsigned long arg, unsigned int flags)
1963{ 1931{
1964 struct inode *inode = file->f_dentry->d_inode; 1932 struct inode *inode = file->f_dentry->d_inode;
1965 struct fuse_conn *fc = get_fuse_conn(inode); 1933 struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1976,13 +1944,13 @@ static long fuse_file_ioctl_common(struct file *file, unsigned int cmd,
1976static long fuse_file_ioctl(struct file *file, unsigned int cmd, 1944static long fuse_file_ioctl(struct file *file, unsigned int cmd,
1977 unsigned long arg) 1945 unsigned long arg)
1978{ 1946{
1979 return fuse_file_ioctl_common(file, cmd, arg, 0); 1947 return fuse_ioctl_common(file, cmd, arg, 0);
1980} 1948}
1981 1949
1982static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, 1950static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
1983 unsigned long arg) 1951 unsigned long arg)
1984{ 1952{
1985 return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); 1953 return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
1986} 1954}
1987 1955
1988/* 1956/*
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 1964da0257d9..572cefc78012 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -755,9 +755,15 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
755/** 755/**
756 * File-system tells the kernel to invalidate parent attributes and 756 * File-system tells the kernel to invalidate parent attributes and
757 * the dentry matching parent/name. 757 * the dentry matching parent/name.
758 *
759 * If the child_nodeid is non-zero and:
760 * - matches the inode number for the dentry matching parent/name,
761 * - is not a mount point
762 * - is a file or an empty directory
763 * then the dentry is unhashed (d_delete()).
758 */ 764 */
759int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, 765int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
760 struct qstr *name); 766 u64 child_nodeid, struct qstr *name);
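For context, this kernel path is driven by the FUSE_NOTIFY_DELETE message added in this series; on the daemon side, libfuse (2.9-era, if memory serves) exposes it roughly as below, where ch, parent_ino, child_ino and name are the daemon's own variables (an illustrative sketch, not a quote from the library):

	fuse_lowlevel_notify_delete(ch, parent_ino, child_ino,
				    name, strlen(name));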
761 767
762int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, 768int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
763 bool isdir); 769 bool isdir);
@@ -765,6 +771,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
765 size_t count, loff_t *ppos, int write); 771 size_t count, loff_t *ppos, int write);
766long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, 772long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
767 unsigned int flags); 773 unsigned int flags);
774long fuse_ioctl_common(struct file *file, unsigned int cmd,
775 unsigned long arg, unsigned int flags);
768unsigned fuse_file_poll(struct file *file, poll_table *wait); 776unsigned fuse_file_poll(struct file *file, poll_table *wait);
769int fuse_dev_release(struct inode *inode, struct file *file); 777int fuse_dev_release(struct inode *inode, struct file *file);
770 778
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 88e8a23d0026..376816fcd040 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1353,7 +1353,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1353 spin_lock(&gl->gl_spin); 1353 spin_lock(&gl->gl_spin);
1354 gl->gl_reply = ret; 1354 gl->gl_reply = ret;
1355 1355
1356 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { 1356 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) {
1357 if (gfs2_should_freeze(gl)) { 1357 if (gfs2_should_freeze(gl)) {
1358 set_bit(GLF_FROZEN, &gl->gl_flags); 1358 set_bit(GLF_FROZEN, &gl->gl_flags);
1359 spin_unlock(&gl->gl_spin); 1359 spin_unlock(&gl->gl_spin);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2553b858a72e..307ac31df781 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -121,8 +121,11 @@ enum {
121 121
122struct lm_lockops { 122struct lm_lockops {
123 const char *lm_proto_name; 123 const char *lm_proto_name;
124 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); 124 int (*lm_mount) (struct gfs2_sbd *sdp, const char *table);
125 void (*lm_unmount) (struct gfs2_sbd *sdp); 125 void (*lm_first_done) (struct gfs2_sbd *sdp);
126 void (*lm_recovery_result) (struct gfs2_sbd *sdp, unsigned int jid,
127 unsigned int result);
128 void (*lm_unmount) (struct gfs2_sbd *sdp);
126 void (*lm_withdraw) (struct gfs2_sbd *sdp); 129 void (*lm_withdraw) (struct gfs2_sbd *sdp);
127 void (*lm_put_lock) (struct gfs2_glock *gl); 130 void (*lm_put_lock) (struct gfs2_glock *gl);
128 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, 131 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index e1d3bb59945c..97742a7ea9cc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -139,8 +139,45 @@ struct gfs2_bufdata {
139#define GDLM_STRNAME_BYTES 25 139#define GDLM_STRNAME_BYTES 25
140#define GDLM_LVB_SIZE 32 140#define GDLM_LVB_SIZE 32
141 141
142/*
143 * ls_recover_flags:
144 *
145 * DFL_BLOCK_LOCKS: dlm is in recovery and will grant locks that had been
146 * held by failed nodes whose journals need recovery. Those locks should
147 * only be used for journal recovery until the journal recovery is done.
148 * This is set by the dlm recover_prep callback and cleared by the
149 * gfs2_control thread when journal recovery is complete. To avoid
150 * races between recover_prep setting and gfs2_control clearing, recover_spin
151 * is held while changing this bit and reading/writing recover_block
152 * and recover_start.
153 *
154 * DFL_NO_DLM_OPS: dlm lockspace ops/callbacks are not being used.
155 *
156 * DFL_FIRST_MOUNT: this node is the first to mount this fs and is doing
157 * recovery of all journals before allowing other nodes to mount the fs.
158 * This is cleared when FIRST_MOUNT_DONE is set.
159 *
160 * DFL_FIRST_MOUNT_DONE: this node was the first mounter, and has finished
161 * recovery of all journals, and now allows other nodes to mount the fs.
162 *
163 * DFL_MOUNT_DONE: gdlm_mount has completed successfully and cleared
164 * BLOCK_LOCKS for the first time. The gfs2_control thread should now
165 * control clearing BLOCK_LOCKS for further recoveries.
166 *
167 * DFL_UNMOUNT: gdlm_unmount sets this to keep sdp off gfs2_control_wq.
168 *
169 * DFL_DLM_RECOVERY: set while dlm is in recovery, between recover_prep()
170 * and recover_done(), i.e. set while recover_block == recover_start.
171 */
172
142enum { 173enum {
143 DFL_BLOCK_LOCKS = 0, 174 DFL_BLOCK_LOCKS = 0,
175 DFL_NO_DLM_OPS = 1,
176 DFL_FIRST_MOUNT = 2,
177 DFL_FIRST_MOUNT_DONE = 3,
178 DFL_MOUNT_DONE = 4,
179 DFL_UNMOUNT = 5,
180 DFL_DLM_RECOVERY = 6,
144}; 181};
145 182
146struct lm_lockname { 183struct lm_lockname {
@@ -392,6 +429,7 @@ struct gfs2_jdesc {
392#define JDF_RECOVERY 1 429#define JDF_RECOVERY 1
393 unsigned int jd_jid; 430 unsigned int jd_jid;
394 unsigned int jd_blocks; 431 unsigned int jd_blocks;
432 int jd_recover_error;
395}; 433};
396 434
397struct gfs2_statfs_change_host { 435struct gfs2_statfs_change_host {
@@ -461,6 +499,7 @@ enum {
461 SDF_NORECOVERY = 4, 499 SDF_NORECOVERY = 4,
462 SDF_DEMOTE = 5, 500 SDF_DEMOTE = 5,
463 SDF_NOJOURNALID = 6, 501 SDF_NOJOURNALID = 6,
502 SDF_RORECOVERY = 7, /* read only recovery */
464}; 503};
465 504
466#define GFS2_FSNAME_LEN 256 505#define GFS2_FSNAME_LEN 256
@@ -499,14 +538,26 @@ struct gfs2_sb_host {
499struct lm_lockstruct { 538struct lm_lockstruct {
500 int ls_jid; 539 int ls_jid;
501 unsigned int ls_first; 540 unsigned int ls_first;
502 unsigned int ls_first_done;
503 unsigned int ls_nodir; 541 unsigned int ls_nodir;
504 const struct lm_lockops *ls_ops; 542 const struct lm_lockops *ls_ops;
505 unsigned long ls_flags;
506 dlm_lockspace_t *ls_dlm; 543 dlm_lockspace_t *ls_dlm;
507 544
508 int ls_recover_jid_done; 545 int ls_recover_jid_done; /* These two are deprecated, */
509 int ls_recover_jid_status; 546 int ls_recover_jid_status; /* used previously by gfs_controld */
547
548 struct dlm_lksb ls_mounted_lksb; /* mounted_lock */
549 struct dlm_lksb ls_control_lksb; /* control_lock */
550 char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */
551 struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */
552
553 spinlock_t ls_recover_spin; /* protects following fields */
554 unsigned long ls_recover_flags; /* DFL_ */
555 uint32_t ls_recover_mount; /* gen in first recover_done cb */
556 uint32_t ls_recover_start; /* gen in last recover_done cb */
557 uint32_t ls_recover_block; /* copy recover_start in last recover_prep */
558 uint32_t ls_recover_size; /* size of recover_submit, recover_result */
559 uint32_t *ls_recover_submit; /* gen in last recover_slot cb per jid */
560 uint32_t *ls_recover_result; /* result of last jid recovery */
510}; 561};
511 562
512struct gfs2_sbd { 563struct gfs2_sbd {
@@ -544,6 +595,7 @@ struct gfs2_sbd {
544 wait_queue_head_t sd_glock_wait; 595 wait_queue_head_t sd_glock_wait;
545 atomic_t sd_glock_disposal; 596 atomic_t sd_glock_disposal;
546 struct completion sd_locking_init; 597 struct completion sd_locking_init;
598 struct delayed_work sd_control_work;
547 599
548 /* Inode Stuff */ 600 /* Inode Stuff */
549 601
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 017960cf1d7a..a7d611b93f0f 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -599,9 +599,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
599 error = gfs2_meta_inode_buffer(ip, &dibh); 599 error = gfs2_meta_inode_buffer(ip, &dibh);
600 if (error) 600 if (error)
601 goto fail_end_trans; 601 goto fail_end_trans;
602 inc_nlink(&ip->i_inode); 602 set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1);
603 if (S_ISDIR(ip->i_inode.i_mode))
604 inc_nlink(&ip->i_inode);
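The replacement states the invariant directly: a freshly linked directory has exactly two links ("." plus the parent's entry for it) and a regular file has one, so set_nlink() to the known value is more robust than incrementing whatever count the inode happened to carry.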
605 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 603 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
606 gfs2_dinode_out(ip, dibh->b_data); 604 gfs2_dinode_out(ip, dibh->b_data);
607 brelse(dibh); 605 brelse(dibh);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 98c80d8c2a62..8944d1e32ab5 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. 3 * Copyright 2004-2011 Red Hat, Inc.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -11,12 +11,15 @@
11#include <linux/dlm.h> 11#include <linux/dlm.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/types.h> 13#include <linux/types.h>
14#include <linux/delay.h>
14#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
15 16
16#include "incore.h" 17#include "incore.h"
17#include "glock.h" 18#include "glock.h"
18#include "util.h" 19#include "util.h"
20#include "sys.h"
19 21
22extern struct workqueue_struct *gfs2_control_wq;
20 23
21static void gdlm_ast(void *arg) 24static void gdlm_ast(void *arg)
22{ 25{
@@ -185,34 +188,1002 @@ static void gdlm_cancel(struct gfs2_glock *gl)
185 dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); 188 dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
186} 189}
187 190
188static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname) 191/*
192 * dlm/gfs2 recovery coordination using dlm_recover callbacks
193 *
194 * 1. dlm_controld sees lockspace members change
195 * 2. dlm_controld blocks dlm-kernel locking activity
196 * 3. dlm_controld within dlm-kernel notifies gfs2 (recover_prep)
197 * 4. dlm_controld starts and finishes its own user level recovery
198 * 5. dlm_controld starts dlm-kernel dlm_recoverd to do kernel recovery
199 * 6. dlm_recoverd notifies gfs2 of failed nodes (recover_slot)
200 * 7. dlm_recoverd does its own lock recovery
201 * 8. dlm_recoverd unblocks dlm-kernel locking activity
202 * 9. dlm_recoverd notifies gfs2 when done (recover_done with new generation)
203 * 10. gfs2_control updates control_lock lvb with new generation and jid bits
204 * 11. gfs2_control enqueues journals for gfs2_recover to recover (maybe none)
205 * 12. gfs2_recover dequeues and recovers journals of failed nodes
206 * 13. gfs2_recover provides recovery results to gfs2_control (recovery_result)
207 * 14. gfs2_control updates control_lock lvb jid bits for recovered journals
208 * 15. gfs2_control unblocks normal locking when all journals are recovered
209 *
210 * - failures during recovery
211 *
212 * recover_prep() may set BLOCK_LOCKS (step 3) again before gfs2_control
213 * clears BLOCK_LOCKS (step 15), e.g. another node fails while still
214 * recovering for a prior failure. gfs2_control needs a way to detect
215 * this so it can leave BLOCK_LOCKS set in step 15. This is managed using
216 * the recover_block and recover_start values.
217 *
218 * recover_done() provides a new lockspace generation number each time it
219 * is called (step 9). This generation number is saved as recover_start.
220 * When recover_prep() is called, it sets BLOCK_LOCKS and sets
221 * recover_block = recover_start. So, while recover_block is equal to
222 * recover_start, BLOCK_LOCKS should remain set. (recover_spin must
223 * be held around the BLOCK_LOCKS/recover_block/recover_start logic.)
224 *
225 * - more specific gfs2 steps in sequence above
226 *
227 * 3. recover_prep sets BLOCK_LOCKS and sets recover_block = recover_start
228 * 6. recover_slot records any failed jids (maybe none)
229 * 9. recover_done sets recover_start = new generation number
230 * 10. gfs2_control sets control_lock lvb = new gen + bits for failed jids
231 * 12. gfs2_recover does journal recoveries for failed jids identified above
232 * 14. gfs2_control clears control_lock lvb bits for recovered jids
233 * 15. gfs2_control checks if recover_block == recover_start (step 3 occurred
234 * again) then do nothing, otherwise if recover_start > recover_block
235 * then clear BLOCK_LOCKS.
236 *
237 * - parallel recovery steps across all nodes
238 *
239 * All nodes attempt to update the control_lock lvb with the new generation
240 * number and jid bits, but only the first to get the control_lock EX will
241 * do so; others will see that it's already done (lvb already contains new
242 * generation number.)
243 *
244 * . All nodes get the same recover_prep/recover_slot/recover_done callbacks
245 * . All nodes attempt to set control_lock lvb gen + bits for the new gen
246 * . One node gets control_lock first and writes the lvb, others see it's done
247 * . All nodes attempt to recover jids for which they see control_lock bits set
248 * . One node succeeds for a jid, and that one clears the jid bit in the lvb
249 * . All nodes will eventually see all lvb bits clear and unblock locks
250 *
251 * - is there a problem with clearing an lvb bit that should be set
252 * and missing a journal recovery?
253 *
254 * 1. jid fails
255 * 2. lvb bit set for step 1
256 * 3. jid recovered for step 1
257 * 4. jid taken again (new mount)
258 * 5. jid fails (for step 4)
259 * 6. lvb bit set for step 5 (will already be set)
260 * 7. lvb bit cleared for step 3
261 *
262 * This is not a problem because the failure in step 5 does not
263 * require recovery, because the mount in step 4 could not have
264 * progressed far enough to unblock locks and access the fs. The
265 * control_mount() function waits for all recoveries to be complete
266 * for the latest lockspace generation before ever unblocking locks
267 * and returning. The mount in step 4 waits until the recovery in
268 * step 1 is done.
269 *
270 * - special case of first mounter: first node to mount the fs
271 *
272 * The first node to mount a gfs2 fs needs to check all the journals
273 * and recover any that need recovery before other nodes are allowed
274 * to mount the fs. (Others may begin mounting, but they must wait
275 * for the first mounter to be done before taking locks on the fs
276 * or accessing the fs.) This has two parts:
277 *
278 * 1. The mounted_lock tells a node it's the first to mount the fs.
279 * Each node holds the mounted_lock in PR while it's mounted.
280 * Each node tries to acquire the mounted_lock in EX when it mounts.
281 * If a node is granted the mounted_lock EX it means there are no
282 * other mounted nodes (no PR locks exist), and it is the first mounter.
283 * The mounted_lock is demoted to PR when first recovery is done, so
284 * others will fail to get an EX lock, but will get a PR lock.
285 *
286 * 2. The control_lock blocks others in control_mount() while the first
287 * mounter is doing first mount recovery of all journals.
288 * A mounting node needs to acquire control_lock in EX mode before
289 * it can proceed. The first mounter holds control_lock in EX while doing
290 * the first mount recovery, blocking mounts from other nodes, then demotes
291 * control_lock to NL when it's done (others_may_mount/first_done),
292 * allowing other nodes to continue mounting.
293 *
294 * first mounter:
295 * control_lock EX/NOQUEUE success
296 * mounted_lock EX/NOQUEUE success (no other PR, so no other mounters)
297 * set first=1
298 * do first mounter recovery
299 * mounted_lock EX->PR
300 * control_lock EX->NL, write lvb generation
301 *
302 * other mounter:
303 * control_lock EX/NOQUEUE success (if fail -EAGAIN, retry)
304 * mounted_lock EX/NOQUEUE fail -EAGAIN (expected due to other mounters PR)
305 * mounted_lock PR/NOQUEUE success
306 * read lvb generation
307 * control_lock EX->NL
308 * set first=0
309 *
310 * - mount during recovery
311 *
312 * If a node mounts while others are doing recovery (not first mounter),
313 * the mounting node will get its initial recover_done() callback without
314 * having seen any previous failures/callbacks.
315 *
316 * It must wait for all recoveries preceding its mount to be finished
317 * before it unblocks locks. It does this by repeating the "other mounter"
318 * steps above until the lvb generation number is >= its mount generation
319 * number (from initial recover_done) and all lvb bits are clear.
320 *
321 * - control_lock lvb format
322 *
323 * 4 bytes generation number: the latest dlm lockspace generation number
324 * from recover_done callback. Indicates the jid bitmap has been updated
325 * to reflect all slot failures through that generation.
326 * 4 bytes unused.
327 * GDLM_LVB_SIZE-8 bytes of jid bit map. If bit N is set, it indicates
328 * that jid N needs recovery.
329 */
330
331#define JID_BITMAP_OFFSET 8 /* 4 byte generation number + 4 byte unused */
332
333static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen,
334 char *lvb_bits)
335{
336 uint32_t gen;
337 memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE);
338 memcpy(&gen, lvb_bits, sizeof(uint32_t));
339 *lvb_gen = le32_to_cpu(gen);
340}
341
342static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen,
343 char *lvb_bits)
344{
345 uint32_t gen;
346 memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE);
347 gen = cpu_to_le32(lvb_gen);
348 memcpy(ls->ls_control_lvb, &gen, sizeof(uint32_t));
349}
350
351static int all_jid_bits_clear(char *lvb)
352{
353 int i;
354 for (i = JID_BITMAP_OFFSET; i < GDLM_LVB_SIZE; i++) {
355 if (lvb[i])
356 return 0;
357 }
358 return 1;
359}
360
361static void sync_wait_cb(void *arg)
362{
363 struct lm_lockstruct *ls = arg;
364 complete(&ls->ls_sync_wait);
365}
366
367static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name)
189{ 368{
190 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 369 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
191 int error; 370 int error;
192 371
193 if (fsname == NULL) { 372 error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls);
194 fs_info(sdp, "no fsname found\n"); 373 if (error) {
195 return -EINVAL; 374 fs_err(sdp, "%s lkid %x error %d\n",
375 name, lksb->sb_lkid, error);
376 return error;
377 }
378
379 wait_for_completion(&ls->ls_sync_wait);
380
381 if (lksb->sb_status != -DLM_EUNLOCK) {
382 fs_err(sdp, "%s lkid %x status %d\n",
383 name, lksb->sb_lkid, lksb->sb_status);
384 return -1;
385 }
386 return 0;
387}
388
389static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags,
390 unsigned int num, struct dlm_lksb *lksb, char *name)
391{
392 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
393 char strname[GDLM_STRNAME_BYTES];
394 int error, status;
395
396 memset(strname, 0, GDLM_STRNAME_BYTES);
397 snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num);
398
399 error = dlm_lock(ls->ls_dlm, mode, lksb, flags,
400 strname, GDLM_STRNAME_BYTES - 1,
401 0, sync_wait_cb, ls, NULL);
402 if (error) {
403 fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n",
404 name, lksb->sb_lkid, flags, mode, error);
405 return error;
406 }
407
408 wait_for_completion(&ls->ls_sync_wait);
409
410 status = lksb->sb_status;
411
412 if (status && status != -EAGAIN) {
413 fs_err(sdp, "%s lkid %x flags %x mode %d status %d\n",
414 name, lksb->sb_lkid, flags, mode, status);
415 }
416
417 return status;
418}
419
420static int mounted_unlock(struct gfs2_sbd *sdp)
421{
422 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
423 return sync_unlock(sdp, &ls->ls_mounted_lksb, "mounted_lock");
424}
425
426static int mounted_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
427{
428 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
429 return sync_lock(sdp, mode, flags, GFS2_MOUNTED_LOCK,
430 &ls->ls_mounted_lksb, "mounted_lock");
431}
432
433static int control_unlock(struct gfs2_sbd *sdp)
434{
435 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
436 return sync_unlock(sdp, &ls->ls_control_lksb, "control_lock");
437}
438
439static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
440{
441 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
442 return sync_lock(sdp, mode, flags, GFS2_CONTROL_LOCK,
443 &ls->ls_control_lksb, "control_lock");
444}
445
446static void gfs2_control_func(struct work_struct *work)
447{
448 struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work);
449 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
450 char lvb_bits[GDLM_LVB_SIZE];
451 uint32_t block_gen, start_gen, lvb_gen, flags;
452 int recover_set = 0;
453 int write_lvb = 0;
454 int recover_size;
455 int i, error;
456
457 spin_lock(&ls->ls_recover_spin);
458 /*
459 * No MOUNT_DONE means we're still mounting; control_mount()
460 * will set this flag, after which this thread will take over
461 * all further clearing of BLOCK_LOCKS.
462 *
463 * FIRST_MOUNT means this node is doing first mounter recovery,
464 * for which recovery control is handled by
465 * control_mount()/control_first_done(), not this thread.
466 */
467 if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
468 test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
469 spin_unlock(&ls->ls_recover_spin);
470 return;
471 }
472 block_gen = ls->ls_recover_block;
473 start_gen = ls->ls_recover_start;
474 spin_unlock(&ls->ls_recover_spin);
475
476 /*
477 * Equal block_gen and start_gen implies we are between
478 * recover_prep and recover_done callbacks, which means
479 * dlm recovery is in progress and dlm locking is blocked.
480 * There's no point trying to do any work until recover_done.
481 */
482
483 if (block_gen == start_gen)
484 return;
485
486 /*
487 * Propagate recover_submit[] and recover_result[] to lvb:
488 * dlm_recoverd adds to recover_submit[] jids needing recovery
489 * gfs2_recover adds to recover_result[] journal recovery results
490 *
491 * set lvb bit for jids in recover_submit[] if the lvb has not
492 * yet been updated for the generation of the failure
493 *
494 * clear lvb bit for jids in recover_result[] if the result of
495 * the journal recovery is SUCCESS
496 */
497
498 error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
499 if (error) {
500 fs_err(sdp, "control lock EX error %d\n", error);
501 return;
502 }
503
504 control_lvb_read(ls, &lvb_gen, lvb_bits);
505
506 spin_lock(&ls->ls_recover_spin);
507 if (block_gen != ls->ls_recover_block ||
508 start_gen != ls->ls_recover_start) {
509 fs_info(sdp, "recover generation %u block1 %u %u\n",
510 start_gen, block_gen, ls->ls_recover_block);
511 spin_unlock(&ls->ls_recover_spin);
512 control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
513 return;
514 }
515
516 recover_size = ls->ls_recover_size;
517
518 if (lvb_gen <= start_gen) {
519 /*
520 * Clear lvb bits for jids we've successfully recovered.
521 * Because all nodes attempt to recover failed journals,
522 * a journal can be recovered multiple times successfully
523 * in succession. Only the first will really do recovery,
524 * the others find it clean, but still report a successful
525 * recovery. So, another node may have already recovered
526 * the jid and cleared the lvb bit for it.
527 */
528 for (i = 0; i < recover_size; i++) {
529 if (ls->ls_recover_result[i] != LM_RD_SUCCESS)
530 continue;
531
532 ls->ls_recover_result[i] = 0;
533
534 if (!test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET))
535 continue;
536
537 __clear_bit_le(i, lvb_bits + JID_BITMAP_OFFSET);
538 write_lvb = 1;
539 }
540 }
541
542 if (lvb_gen == start_gen) {
543 /*
544 * Failed slots before start_gen are already set in lvb.
545 */
546 for (i = 0; i < recover_size; i++) {
547 if (!ls->ls_recover_submit[i])
548 continue;
549 if (ls->ls_recover_submit[i] < lvb_gen)
550 ls->ls_recover_submit[i] = 0;
551 }
552 } else if (lvb_gen < start_gen) {
553 /*
554 * Failed slots before start_gen are not yet set in lvb.
555 */
556 for (i = 0; i < recover_size; i++) {
557 if (!ls->ls_recover_submit[i])
558 continue;
559 if (ls->ls_recover_submit[i] < start_gen) {
560 ls->ls_recover_submit[i] = 0;
561 __set_bit_le(i, lvb_bits + JID_BITMAP_OFFSET);
562 }
563 }
564 /* even if there are no bits to set, we need to write the
565 latest generation to the lvb */
566 write_lvb = 1;
567 } else {
568 /*
569 * we should be getting a recover_done() for lvb_gen soon
570 */
571 }
572 spin_unlock(&ls->ls_recover_spin);
573
574 if (write_lvb) {
575 control_lvb_write(ls, start_gen, lvb_bits);
576 flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK;
577 } else {
578 flags = DLM_LKF_CONVERT;
579 }
580
581 error = control_lock(sdp, DLM_LOCK_NL, flags);
582 if (error) {
583 fs_err(sdp, "control lock NL error %d\n", error);
584 return;
585 }
586
587 /*
588 * Everyone will see jid bits set in the lvb, run gfs2_recover_set(),
589 * and clear a jid bit in the lvb if the recovery is a success.
590 * Eventually all journals will be recovered, all jid bits will
591 * be cleared in the lvb, and everyone will clear BLOCK_LOCKS.
592 */
593
594 for (i = 0; i < recover_size; i++) {
595 if (test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) {
596 fs_info(sdp, "recover generation %u jid %d\n",
597 start_gen, i);
598 gfs2_recover_set(sdp, i);
599 recover_set++;
600 }
601 }
602 if (recover_set)
603 return;
604
605 /*
606 * No more jid bits set in lvb, all recovery is done, unblock locks
607 * (unless a new recover_prep callback has occurred blocking locks
608 * again while working above)
609 */
610
611 spin_lock(&ls->ls_recover_spin);
612 if (ls->ls_recover_block == block_gen &&
613 ls->ls_recover_start == start_gen) {
614 clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
615 spin_unlock(&ls->ls_recover_spin);
616 fs_info(sdp, "recover generation %u done\n", start_gen);
617 gfs2_glock_thaw(sdp);
618 } else {
619 fs_info(sdp, "recover generation %u block2 %u %u\n",
620 start_gen, block_gen, ls->ls_recover_block);
621 spin_unlock(&ls->ls_recover_spin);
622 }
623}
624
625static int control_mount(struct gfs2_sbd *sdp)
626{
627 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
628 char lvb_bits[GDLM_LVB_SIZE];
629 uint32_t start_gen, block_gen, mount_gen, lvb_gen;
630 int mounted_mode;
631 int retries = 0;
632 int error;
633
634 memset(&ls->ls_mounted_lksb, 0, sizeof(struct dlm_lksb));
635 memset(&ls->ls_control_lksb, 0, sizeof(struct dlm_lksb));
636 memset(&ls->ls_control_lvb, 0, GDLM_LVB_SIZE);
637 ls->ls_control_lksb.sb_lvbptr = ls->ls_control_lvb;
638 init_completion(&ls->ls_sync_wait);
639
640 set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
641
642 error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_VALBLK);
643 if (error) {
644 fs_err(sdp, "control_mount control_lock NL error %d\n", error);
645 return error;
646 }
647
648 error = mounted_lock(sdp, DLM_LOCK_NL, 0);
649 if (error) {
650 fs_err(sdp, "control_mount mounted_lock NL error %d\n", error);
651 control_unlock(sdp);
652 return error;
653 }
654 mounted_mode = DLM_LOCK_NL;
655
656restart:
657 if (retries++ && signal_pending(current)) {
658 error = -EINTR;
659 goto fail;
660 }
661
662 /*
663 * We always start with both locks in NL. control_lock is
664 * demoted to NL below so we don't need to do it here.
665 */
666
667 if (mounted_mode != DLM_LOCK_NL) {
668 error = mounted_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
669 if (error)
670 goto fail;
671 mounted_mode = DLM_LOCK_NL;
672 }
673
674 /*
675 * Other nodes need to do some work in dlm recovery and gfs2_control
676 * before the recover_done and control_lock will be ready for us below.
677 * A delay here is not required but often avoids having to retry.
678 */
679
680 msleep_interruptible(500);
681
682 /*
683 * Acquire control_lock in EX and mounted_lock in either EX or PR.
684 * control_lock lvb keeps track of any pending journal recoveries.
685 * mounted_lock indicates if any other nodes have the fs mounted.
686 */
687
688 error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE|DLM_LKF_VALBLK);
689 if (error == -EAGAIN) {
690 goto restart;
691 } else if (error) {
692 fs_err(sdp, "control_mount control_lock EX error %d\n", error);
693 goto fail;
694 }
695
696 error = mounted_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE);
697 if (!error) {
698 mounted_mode = DLM_LOCK_EX;
699 goto locks_done;
700 } else if (error != -EAGAIN) {
701 fs_err(sdp, "control_mount mounted_lock EX error %d\n", error);
702 goto fail;
703 }
704
705 error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE);
706 if (!error) {
707 mounted_mode = DLM_LOCK_PR;
708 goto locks_done;
709 } else {
710 /* not even -EAGAIN should happen here */
711 fs_err(sdp, "control_mount mounted_lock PR error %d\n", error);
712 goto fail;
713 }
714
715locks_done:
716 /*
717 * If we got both locks above in EX, then we're the first mounter.
718 * If not, then we need to wait for the control_lock lvb to be
719 * updated by other mounted nodes to reflect our mount generation.
720 *
721 * In simple first mounter cases, first mounter will see zero lvb_gen,
722 * but in cases where all existing nodes leave/fail before mounting
723 * nodes finish control_mount, then all nodes will be mounting and
724 * lvb_gen will be non-zero.
725 */
726
727 control_lvb_read(ls, &lvb_gen, lvb_bits);
728
729 if (lvb_gen == 0xFFFFFFFF) {
730 /* special value to force mount attempts to fail */
731 fs_err(sdp, "control_mount control_lock disabled\n");
732 error = -EINVAL;
733 goto fail;
734 }
735
736 if (mounted_mode == DLM_LOCK_EX) {
737 /* first mounter, keep both EX while doing first recovery */
738 spin_lock(&ls->ls_recover_spin);
739 clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
740 set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags);
741 set_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
742 spin_unlock(&ls->ls_recover_spin);
743 fs_info(sdp, "first mounter control generation %u\n", lvb_gen);
744 return 0;
745 }
746
747 error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
748 if (error)
749 goto fail;
750
751 /*
752 * We are not first mounter, now we need to wait for the control_lock
753 * lvb generation to be >= the generation from our first recover_done
754 * and all lvb bits to be clear (no pending journal recoveries.)
755 */
756
757 if (!all_jid_bits_clear(lvb_bits)) {
758 /* journals need recovery, wait until all are clear */
759 fs_info(sdp, "control_mount wait for journal recovery\n");
760 goto restart;
761 }
762
763 spin_lock(&ls->ls_recover_spin);
764 block_gen = ls->ls_recover_block;
765 start_gen = ls->ls_recover_start;
766 mount_gen = ls->ls_recover_mount;
767
768 if (lvb_gen < mount_gen) {
769 /* wait for mounted nodes to update control_lock lvb to our
770 generation, which might include new recovery bits set */
771 fs_info(sdp, "control_mount wait1 block %u start %u mount %u "
772 "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
773 lvb_gen, ls->ls_recover_flags);
774 spin_unlock(&ls->ls_recover_spin);
775 goto restart;
776 }
777
778 if (lvb_gen != start_gen) {
779 /* wait for mounted nodes to update control_lock lvb to the
780 latest recovery generation */
781 fs_info(sdp, "control_mount wait2 block %u start %u mount %u "
782 "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
783 lvb_gen, ls->ls_recover_flags);
784 spin_unlock(&ls->ls_recover_spin);
785 goto restart;
786 }
787
788 if (block_gen == start_gen) {
789 /* dlm recovery in progress, wait for it to finish */
790 fs_info(sdp, "control_mount wait3 block %u start %u mount %u "
791 "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
792 lvb_gen, ls->ls_recover_flags);
793 spin_unlock(&ls->ls_recover_spin);
794 goto restart;
795	}
796
-	error = dlm_new_lockspace(fsname, strlen(fsname), &ls->ls_dlm,
-				  DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
-				  (ls->ls_nodir ? DLM_LSFL_NODIR : 0),
-				  GDLM_LVB_SIZE);
797	clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
798	set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags);
799	memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t));
800	memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));
801 spin_unlock(&ls->ls_recover_spin);
802 return 0;
803
804fail:
805 mounted_unlock(sdp);
806 control_unlock(sdp);
807 return error;
808}
809
810static int dlm_recovery_wait(void *word)
811{
812 schedule();
813 return 0;
814}
815
816static int control_first_done(struct gfs2_sbd *sdp)
817{
818 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
819 char lvb_bits[GDLM_LVB_SIZE];
820 uint32_t start_gen, block_gen;
821 int error;
822
823restart:
824 spin_lock(&ls->ls_recover_spin);
825 start_gen = ls->ls_recover_start;
826 block_gen = ls->ls_recover_block;
827
828 if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags) ||
829 !test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
830 !test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
831 /* sanity check, should not happen */
832 fs_err(sdp, "control_first_done start %u block %u flags %lx\n",
833 start_gen, block_gen, ls->ls_recover_flags);
834 spin_unlock(&ls->ls_recover_spin);
835 control_unlock(sdp);
836 return -1;
837 }
838
839 if (start_gen == block_gen) {
840 /*
841 * Wait for the end of a dlm recovery cycle to switch from
842 * first mounter recovery. We can ignore any recover_slot
843 * callbacks between the recover_prep and next recover_done
844 * because we are still the first mounter and any failed nodes
845 * have not fully mounted, so they don't need recovery.
846 */
847 spin_unlock(&ls->ls_recover_spin);
848 fs_info(sdp, "control_first_done wait gen %u\n", start_gen);
849
850 wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY,
851 dlm_recovery_wait, TASK_UNINTERRUPTIBLE);
852 goto restart;
853 }
854
855 clear_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
856 set_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags);
857 memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t));
858 memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));
859 spin_unlock(&ls->ls_recover_spin);
860
861 memset(lvb_bits, 0, sizeof(lvb_bits));
862 control_lvb_write(ls, start_gen, lvb_bits);
863
864 error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT);
865 if (error)
866 fs_err(sdp, "control_first_done mounted PR error %d\n", error);
867
868 error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
869	if (error)
-		printk(KERN_ERR "dlm_new_lockspace error %d", error);
870		fs_err(sdp, "control_first_done control NL error %d\n", error);
871
872	return error;
873}
874
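For readers unfamiliar with the pre-3.17 bit-wait API used by control_first_done() above: wait_on_bit() takes an action callback that decides how to sleep, and the waker must order its clear_bit() before wake_up_bit(). A condensed, illustrative sketch of both sides follows; the example_* names are invented here, DFL_DLM_RECOVERY is the flag from this patch.

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/bitops.h>

/* action callback: how to sleep while the bit is still set */
static int example_recovery_wait(void *word)
{
        schedule();
        return 0;       /* returning nonzero would abort the wait */
}

/* waiter side: block until DFL_DLM_RECOVERY is cleared */
static void example_wait_recovery_done(unsigned long *flags)
{
        wait_on_bit(flags, DFL_DLM_RECOVERY, example_recovery_wait,
                    TASK_UNINTERRUPTIBLE);
}

/* waker side: clear the bit, order the store, then wake bit-waiters */
static void example_recovery_done(unsigned long *flags)
{
        clear_bit(DFL_DLM_RECOVERY, flags);
        smp_mb__after_clear_bit();
        wake_up_bit(flags, DFL_DLM_RECOVERY);
}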
875/*
876 * Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC)
877 * to accommodate the largest slot number. (NB dlm slot numbers start at 1,
878 * gfs2 jids start at 0, so jid = slot - 1)
879 */
880
881#define RECOVER_SIZE_INC 16
882
883static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots,
884 int num_slots)
885{
886 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
887 uint32_t *submit = NULL;
888 uint32_t *result = NULL;
889 uint32_t old_size, new_size;
890 int i, max_jid;
891
892 max_jid = 0;
893 for (i = 0; i < num_slots; i++) {
894 if (max_jid < slots[i].slot - 1)
895 max_jid = slots[i].slot - 1;
896 }
897
898 old_size = ls->ls_recover_size;
899
900 if (old_size >= max_jid + 1)
901 return 0;
902
903 new_size = old_size + RECOVER_SIZE_INC;
904
905 submit = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS);
906 result = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS);
907 if (!submit || !result) {
908 kfree(submit);
909 kfree(result);
910 return -ENOMEM;
911 }
912
913 spin_lock(&ls->ls_recover_spin);
914 memcpy(submit, ls->ls_recover_submit, old_size * sizeof(uint32_t));
915 memcpy(result, ls->ls_recover_result, old_size * sizeof(uint32_t));
916 kfree(ls->ls_recover_submit);
917 kfree(ls->ls_recover_result);
918 ls->ls_recover_submit = submit;
919 ls->ls_recover_result = result;
920 ls->ls_recover_size = new_size;
921 spin_unlock(&ls->ls_recover_spin);
922 return 0;
923}
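set_recover_size() above follows a common grow-then-swap pattern: allocate the larger arrays outside the spinlock (a GFP_NOFS allocation may sleep or fail), then copy the old contents and switch the pointers under the lock, so concurrent readers always see either the old arrays or the new ones, never a half-built table. A minimal sketch of the same shape — the struct and names here are invented for illustration, not the gfs2 code itself:

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>

struct jid_table {
        spinlock_t lock;
        uint32_t *vals;
        uint32_t size;
};

static int jid_table_grow(struct jid_table *t, uint32_t new_size)
{
        uint32_t *vals;

        if (new_size <= t->size)
                return 0;

        /* allocate outside the lock; this may fail but never blocks readers */
        vals = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS);
        if (!vals)
                return -ENOMEM;

        spin_lock(&t->lock);
        memcpy(vals, t->vals, t->size * sizeof(uint32_t));
        kfree(t->vals);         /* kfree is safe under a spinlock */
        t->vals = vals;
        t->size = new_size;
        spin_unlock(&t->lock);
        return 0;
}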
924
925static void free_recover_size(struct lm_lockstruct *ls)
926{
927 kfree(ls->ls_recover_submit);
928 kfree(ls->ls_recover_result);
929 ls->ls_recover_submit = NULL;
930 ls->ls_recover_result = NULL;
931 ls->ls_recover_size = 0;
932}
933
934/* dlm calls before it does lock recovery */
935
936static void gdlm_recover_prep(void *arg)
937{
938 struct gfs2_sbd *sdp = arg;
939 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
940
941 spin_lock(&ls->ls_recover_spin);
942 ls->ls_recover_block = ls->ls_recover_start;
943 set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
944
945 if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
946 test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
947 spin_unlock(&ls->ls_recover_spin);
948 return;
949 }
950 set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
951 spin_unlock(&ls->ls_recover_spin);
952}
953
954/* dlm calls after recover_prep has been completed on all lockspace members;
955 identifies slot/jid of failed member */
956
957static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
958{
959 struct gfs2_sbd *sdp = arg;
960 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
961 int jid = slot->slot - 1;
962
963 spin_lock(&ls->ls_recover_spin);
964 if (ls->ls_recover_size < jid + 1) {
965 fs_err(sdp, "recover_slot jid %d gen %u short size %d",
966 jid, ls->ls_recover_block, ls->ls_recover_size);
967 spin_unlock(&ls->ls_recover_spin);
968 return;
969 }
970
971 if (ls->ls_recover_submit[jid]) {
972 fs_info(sdp, "recover_slot jid %d gen %u prev %u",
973 jid, ls->ls_recover_block, ls->ls_recover_submit[jid]);
974 }
975 ls->ls_recover_submit[jid] = ls->ls_recover_block;
976 spin_unlock(&ls->ls_recover_spin);
977}
978
979/* dlm calls after recover_slot and after it completes lock recovery */
980
981static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots,
982 int our_slot, uint32_t generation)
983{
984 struct gfs2_sbd *sdp = arg;
985 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
986
987 /* ensure the ls jid arrays are large enough */
988 set_recover_size(sdp, slots, num_slots);
989
990 spin_lock(&ls->ls_recover_spin);
991 ls->ls_recover_start = generation;
992
993 if (!ls->ls_recover_mount) {
994 ls->ls_recover_mount = generation;
995 ls->ls_jid = our_slot - 1;
996 }
997
998 if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags))
999 queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0);
1000
1001 clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
1002 smp_mb__after_clear_bit();
1003 wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY);
1004 spin_unlock(&ls->ls_recover_spin);
1005}
1006
1007/* gfs2_recover thread has a journal recovery result */
1008
1009static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid,
1010 unsigned int result)
1011{
1012 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
1013
1014 if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
1015 return;
1016
1017	/* don't care about the recovery of our own journal during mount */
1018 if (jid == ls->ls_jid)
1019 return;
1020
1021 spin_lock(&ls->ls_recover_spin);
1022 if (test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
1023 spin_unlock(&ls->ls_recover_spin);
1024 return;
1025 }
1026 if (ls->ls_recover_size < jid + 1) {
1027 fs_err(sdp, "recovery_result jid %d short size %d",
1028 jid, ls->ls_recover_size);
1029 spin_unlock(&ls->ls_recover_spin);
1030 return;
1031 }
1032
1033 fs_info(sdp, "recover jid %d result %s\n", jid,
1034 result == LM_RD_GAVEUP ? "busy" : "success");
1035
1036 ls->ls_recover_result[jid] = result;
1037
1038 /* GAVEUP means another node is recovering the journal; delay our
1039 next attempt to recover it, to give the other node a chance to
1040 finish before trying again */
1041
1042 if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags))
1043 queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work,
1044 result == LM_RD_GAVEUP ? HZ : 0);
1045 spin_unlock(&ls->ls_recover_spin);
1046}
1047
1048const struct dlm_lockspace_ops gdlm_lockspace_ops = {
1049 .recover_prep = gdlm_recover_prep,
1050 .recover_slot = gdlm_recover_slot,
1051 .recover_done = gdlm_recover_done,
1052};
1053
1054static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
1055{
1056 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
1057 char cluster[GFS2_LOCKNAME_LEN];
1058 const char *fsname;
1059 uint32_t flags;
1060 int error, ops_result;
1061
1062 /*
1063 * initialize everything
1064 */
1065
1066 INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func);
1067 spin_lock_init(&ls->ls_recover_spin);
1068 ls->ls_recover_flags = 0;
1069 ls->ls_recover_mount = 0;
1070 ls->ls_recover_start = 0;
1071 ls->ls_recover_block = 0;
1072 ls->ls_recover_size = 0;
1073 ls->ls_recover_submit = NULL;
1074 ls->ls_recover_result = NULL;
1075
1076 error = set_recover_size(sdp, NULL, 0);
1077 if (error)
1078 goto fail;
1079
1080 /*
1081 * prepare dlm_new_lockspace args
1082 */
1083
1084 fsname = strchr(table, ':');
1085 if (!fsname) {
1086 fs_info(sdp, "no fsname found\n");
1087 error = -EINVAL;
1088 goto fail_free;
1089 }
1090 memset(cluster, 0, sizeof(cluster));
1091 memcpy(cluster, table, strlen(table) - strlen(fsname));
1092 fsname++;
1093
1094 flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL;
1095 if (ls->ls_nodir)
1096 flags |= DLM_LSFL_NODIR;
1097
1098 /*
1099 * create/join lockspace
1100 */
1101
1102 error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE,
1103 &gdlm_lockspace_ops, sdp, &ops_result,
1104 &ls->ls_dlm);
1105 if (error) {
1106 fs_err(sdp, "dlm_new_lockspace error %d\n", error);
1107 goto fail_free;
1108 }
1109
1110 if (ops_result < 0) {
1111 /*
1112 * dlm does not support ops callbacks,
1113 * old dlm_controld/gfs_controld are used, try without ops.
1114 */
1115 fs_info(sdp, "dlm lockspace ops not used\n");
1116 free_recover_size(ls);
1117 set_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags);
1118 return 0;
1119 }
1120
1121 if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) {
1122 fs_err(sdp, "dlm lockspace ops disallow jid preset\n");
1123 error = -EINVAL;
1124 goto fail_release;
1125 }
1126
1127 /*
1128 * control_mount() uses control_lock to determine first mounter,
1129 * and for later mounts, waits for any recoveries to be cleared.
1130 */
1131
1132 error = control_mount(sdp);
1133 if (error) {
1134 fs_err(sdp, "mount control error %d\n", error);
1135 goto fail_release;
1136 }
1137
1138 ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
1139 clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
1140 smp_mb__after_clear_bit();
1141 wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
1142 return 0;
1143
1144fail_release:
1145 dlm_release_lockspace(ls->ls_dlm, 2);
1146fail_free:
1147 free_recover_size(ls);
1148fail:
1149 return error;
1150}
1151
1152static void gdlm_first_done(struct gfs2_sbd *sdp)
1153{
1154 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
1155 int error;
1156
1157 if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
1158 return;
1159
1160 error = control_first_done(sdp);
1161 if (error)
1162 fs_err(sdp, "mount first_done error %d\n", error);
1163}
1164
 static void gdlm_unmount(struct gfs2_sbd *sdp)
 {
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
 
+	if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
+		goto release;
+
+	/* wait for gfs2_control_wq to be done with this mount */
+
+	spin_lock(&ls->ls_recover_spin);
+	set_bit(DFL_UNMOUNT, &ls->ls_recover_flags);
+	spin_unlock(&ls->ls_recover_spin);
+	flush_delayed_work_sync(&sdp->sd_control_work);
+
+	/* mounted_lock and control_lock will be purged in dlm recovery */
+release:
 	if (ls->ls_dlm) {
 		dlm_release_lockspace(ls->ls_dlm, 2);
 		ls->ls_dlm = NULL;
 	}
+
+	free_recover_size(ls);
 }
 
 static const match_table_t dlm_tokens = {
@@ -226,6 +1197,8 @@ static const match_table_t dlm_tokens = {
 const struct lm_lockops gfs2_dlm_ops = {
 	.lm_proto_name = "lock_dlm",
 	.lm_mount = gdlm_mount,
+	.lm_first_done = gdlm_first_done,
+	.lm_recovery_result = gdlm_recovery_result,
 	.lm_unmount = gdlm_unmount,
 	.lm_put_lock = gdlm_put_lock,
 	.lm_lock = gdlm_lock,
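For reference, the eight-argument dlm_new_lockspace() calling convention that gdlm_mount() adopts above, reduced to its skeleton. This is only a sketch of the API shape visible in this patch; the example_* names and the fallback policy are illustrative:

#include <linux/dlm.h>

static void example_recover_prep(void *arg) { /* quiesce locking */ }
static void example_recover_slot(void *arg, struct dlm_slot *slot) { /* record failed slot */ }
static void example_recover_done(void *arg, struct dlm_slot *slots,
                                 int num_slots, int our_slot,
                                 uint32_t generation) { /* kick recovery work */ }

static const struct dlm_lockspace_ops example_ops = {
        .recover_prep = example_recover_prep,
        .recover_slot = example_recover_slot,
        .recover_done = example_recover_done,
};

static int example_join(void *arg, dlm_lockspace_t **lockspace)
{
        int ops_result;
        int error;

        error = dlm_new_lockspace("fsname", "cluster",
                                  DLM_LSFL_FS | DLM_LSFL_NEWEXCL, 32,
                                  &example_ops, arg, &ops_result, lockspace);
        if (error)
                return error;
        if (ops_result < 0) {
                /* running dlm does not support ops callbacks; the caller
                   may fall back to running without them, as gdlm_mount()
                   does for old dlm_controld/gfs_controld */
        }
        return 0;
}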
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index c150298e2d8e..a8d9bcd0e19c 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -28,6 +28,8 @@
28#include "recovery.h" 28#include "recovery.h"
29#include "dir.h" 29#include "dir.h"
30 30
31struct workqueue_struct *gfs2_control_wq;
32
31static struct shrinker qd_shrinker = { 33static struct shrinker qd_shrinker = {
32 .shrink = gfs2_shrink_qd_memory, 34 .shrink = gfs2_shrink_qd_memory,
33 .seeks = DEFAULT_SEEKS, 35 .seeks = DEFAULT_SEEKS,
@@ -146,12 +148,19 @@ static int __init init_gfs2_fs(void)
146 if (!gfs_recovery_wq) 148 if (!gfs_recovery_wq)
147 goto fail_wq; 149 goto fail_wq;
148 150
151 gfs2_control_wq = alloc_workqueue("gfs2_control",
152 WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0);
153 if (!gfs2_control_wq)
154 goto fail_control;
155
149 gfs2_register_debugfs(); 156 gfs2_register_debugfs();
150 157
151 printk("GFS2 installed\n"); 158 printk("GFS2 installed\n");
152 159
153 return 0; 160 return 0;
154 161
162fail_control:
163 destroy_workqueue(gfs_recovery_wq);
155fail_wq: 164fail_wq:
156 unregister_filesystem(&gfs2meta_fs_type); 165 unregister_filesystem(&gfs2meta_fs_type);
157fail_unregister: 166fail_unregister:
@@ -195,6 +204,7 @@ static void __exit exit_gfs2_fs(void)
195 unregister_filesystem(&gfs2_fs_type); 204 unregister_filesystem(&gfs2_fs_type);
196 unregister_filesystem(&gfs2meta_fs_type); 205 unregister_filesystem(&gfs2meta_fs_type);
197 destroy_workqueue(gfs_recovery_wq); 206 destroy_workqueue(gfs_recovery_wq);
207 destroy_workqueue(gfs2_control_wq);
198 208
199 rcu_barrier(); 209 rcu_barrier();
200 210
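The main.c hunks above pair every alloc_workqueue() with a destroy_workqueue(), both in the init-failure unwind path and in module exit. A minimal sketch of that pairing (module and queue names invented here):

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static int __init example_init(void)
{
        example_wq = alloc_workqueue("example_wq",
                                     WQ_UNBOUND | WQ_FREEZABLE, 0);
        if (!example_wq)
                return -ENOMEM; /* unwind anything set up before this point */
        return 0;
}

static void __exit example_exit(void)
{
        destroy_workqueue(example_wq); /* drains queued work before freeing */
}

module_init(example_init);
module_exit(example_exit);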
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index fe72e79e6ff9..6aacf3f230a2 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -562,8 +562,12 @@ static void gfs2_others_may_mount(struct gfs2_sbd *sdp)
 {
 	char *message = "FIRSTMOUNT=Done";
 	char *envp[] = { message, NULL };
-	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	ls->ls_first_done = 1;
+
+	fs_info(sdp, "first mount done, others may mount\n");
+
+	if (sdp->sd_lockstruct.ls_ops->lm_first_done)
+		sdp->sd_lockstruct.ls_ops->lm_first_done(sdp);
+
 	kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
 }
 
@@ -944,7 +948,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 	struct gfs2_args *args = &sdp->sd_args;
 	const char *proto = sdp->sd_proto_name;
 	const char *table = sdp->sd_table_name;
-	const char *fsname;
 	char *o, *options;
 	int ret;
 
@@ -1004,21 +1007,12 @@ hostdata_error:
 		}
 	}
 
-	if (sdp->sd_args.ar_spectator)
-		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
-	else
-		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
-			 sdp->sd_lockstruct.ls_jid);
-
-	fsname = strchr(table, ':');
-	if (fsname)
-		fsname++;
 	if (lm->lm_mount == NULL) {
 		fs_info(sdp, "Now mounting FS...\n");
 		complete_all(&sdp->sd_locking_init);
 		return 0;
 	}
-	ret = lm->lm_mount(sdp, fsname);
+	ret = lm->lm_mount(sdp, table);
 	if (ret == 0)
 		fs_info(sdp, "Joined cluster. Now mounting FS...\n");
 	complete_all(&sdp->sd_locking_init);
@@ -1084,7 +1078,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 
 	if (sdp->sd_args.ar_spectator) {
 		sb->s_flags |= MS_RDONLY;
-		set_bit(SDF_NORECOVERY, &sdp->sd_flags);
+		set_bit(SDF_RORECOVERY, &sdp->sd_flags);
 	}
 	if (sdp->sd_args.ar_posix_acl)
 		sb->s_flags |= MS_POSIXACL;
@@ -1124,6 +1118,8 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 	if (error)
 		goto fail;
 
+	snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name);
+
 	gfs2_create_debugfs_file(sdp);
 
 	error = gfs2_sys_fs_add(sdp);
@@ -1160,6 +1156,13 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 		goto fail_sb;
 	}
 
+	if (sdp->sd_args.ar_spectator)
+		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s",
+			 sdp->sd_table_name);
+	else
+		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u",
+			 sdp->sd_table_name, sdp->sd_lockstruct.ls_jid);
+
 	error = init_inodes(sdp, DO);
 	if (error)
 		goto fail_sb;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index f2a02edcac8f..963b2d75200c 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -436,12 +436,16 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
 	char env_status[20];
 	char *envp[] = { env_jid, env_status, NULL };
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
 	ls->ls_recover_jid_done = jid;
 	ls->ls_recover_jid_status = message;
 	sprintf(env_jid, "JID=%d", jid);
 	sprintf(env_status, "RECOVERY=%s",
 		message == LM_RD_SUCCESS ? "Done" : "Failed");
 	kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
+
+	if (sdp->sd_lockstruct.ls_ops->lm_recovery_result)
+		sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message);
 }
 
 void gfs2_recover_func(struct work_struct *work)
@@ -512,7 +516,9 @@ void gfs2_recover_func(struct work_struct *work)
 	if (error)
 		goto fail_gunlock_ji;
 
-	if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
+	if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) {
+		ro = 1;
+	} else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
 		if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
 			ro = 1;
 	} else {
@@ -577,6 +583,7 @@ fail_gunlock_j:
 
 	fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
 fail:
+	jd->jd_recover_error = error;
 	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
 done:
 	clear_bit(JDF_RECOVERY, &jd->jd_flags);
@@ -605,6 +612,6 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
 	wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait,
 		    TASK_UNINTERRUPTIBLE);
 
-	return 0;
+	return wait ? jd->jd_recover_error : 0;
 }
 
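gdlm_recovery_result() in the lock_dlm.c hunks above turns these per-journal results into a simple retry policy: a busy (LM_RD_GAVEUP) journal is retried after HZ jiffies to let the other node finish, while success re-queues the control work immediately. The core of that policy shown in isolation (helper name invented):

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void example_requeue_control(struct workqueue_struct *wq,
                                    struct delayed_work *dwork, bool busy)
{
        /* HZ jiffies == one second of back-off before the next attempt */
        queue_delayed_work(wq, dwork, busy ? HZ : 0);
}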
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 22234627f684..981bfa32121a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1108,9 +1108,9 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 {
 	struct gfs2_blkreserv *rs = ip->i_res;
 
-	gfs2_blkrsv_put(ip);
 	if (rs->rs_rgd_gh.gh_gl)
 		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
+	gfs2_blkrsv_put(ip);
 }
 
 /**
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 443cabcfcd23..d33172c291ba 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -298,7 +298,7 @@ static ssize_t block_show(struct gfs2_sbd *sdp, char *buf)
 	ssize_t ret;
 	int val = 0;
 
-	if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))
+	if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))
 		val = 1;
 	ret = sprintf(buf, "%d\n", val);
 	return ret;
@@ -313,9 +313,9 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 	val = simple_strtol(buf, NULL, 0);
 
 	if (val == 1)
-		set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
+		set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
 	else if (val == 0) {
-		clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
+		clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
 		smp_mb__after_clear_bit();
 		gfs2_glock_thaw(sdp);
 	} else {
@@ -350,8 +350,8 @@ static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 		goto out;
 	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
 		goto out;
 	sdp->sd_lockstruct.ls_first = first;
 	rv = 0;
 out:
 	spin_unlock(&sdp->sd_jindex_spin);
 	return rv ? rv : len;
@@ -360,19 +360,14 @@ out:
 static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
 {
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	return sprintf(buf, "%d\n", ls->ls_first_done);
+	return sprintf(buf, "%d\n", !!test_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags));
 }
 
-static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid)
 {
-	unsigned jid;
 	struct gfs2_jdesc *jd;
 	int rv;
 
-	rv = sscanf(buf, "%u", &jid);
-	if (rv != 1)
-		return -EINVAL;
-
 	rv = -ESHUTDOWN;
 	spin_lock(&sdp->sd_jindex_spin);
 	if (test_bit(SDF_NORECOVERY, &sdp->sd_flags))
@@ -389,6 +384,20 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 	}
 out:
 	spin_unlock(&sdp->sd_jindex_spin);
+	return rv;
+}
+
+static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	unsigned jid;
+	int rv;
+
+	rv = sscanf(buf, "%u", &jid);
+	if (rv != 1)
+		return -EINVAL;
+
+	rv = gfs2_recover_set(sdp, jid);
+
 	return rv ? rv : len;
 }
 
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
index e94560e836d7..79182d6ad6ac 100644
--- a/fs/gfs2/sys.h
+++ b/fs/gfs2/sys.h
@@ -19,5 +19,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
 int gfs2_sys_init(void);
 void gfs2_sys_uninit(void);
 
+int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid);
+
 #endif /* __SYS_DOT_H__ */
 
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index edf0a801446b..427682ca9e48 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -499,9 +499,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	if (!sbi->hidden_dir) {
 		mutex_lock(&sbi->vh_mutex);
 		sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
-		hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str,
-				   sbi->hidden_dir);
+		if (!sbi->hidden_dir) {
+			mutex_unlock(&sbi->vh_mutex);
+			err = -ENOMEM;
+			goto out_put_root;
+		}
+		err = hfsplus_create_cat(sbi->hidden_dir->i_ino, root,
+					 &str, sbi->hidden_dir);
 		mutex_unlock(&sbi->vh_mutex);
+		if (err)
+			goto out_put_hidden_dir;
 
 		hfsplus_mark_inode_dirty(sbi->hidden_dir,
 					 HFSPLUS_I_CAT_DIRTY);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e425ad9d0490..1e85a7ac0217 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -583,7 +583,8 @@ static int hugetlbfs_set_page_dirty(struct page *page)
 }
 
 static int hugetlbfs_migrate_page(struct address_space *mapping,
-				struct page *newpage, struct page *page)
+				struct page *newpage, struct page *page,
+				enum migrate_mode mode)
 {
 	int rc;
 
diff --git a/fs/inode.c b/fs/inode.c
index 87535753ab04..fb10d86ffad7 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -322,9 +322,6 @@ EXPORT_SYMBOL(clear_nlink);
 void set_nlink(struct inode *inode, unsigned int nlink)
 {
 	if (!nlink) {
-		printk_ratelimited(KERN_INFO
-			"set_nlink() clearing i_nlink on %s inode %li\n",
-			inode->i_sb->s_type->name, inode->i_ino);
 		clear_nlink(inode);
 	} else {
 		/* Yes, some filesystems do change nlink from zero to one */
@@ -776,6 +773,8 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan)
 	else
 		__count_vm_events(PGINODESTEAL, reap);
 	spin_unlock(&sb->s_inode_lru_lock);
+	if (current->reclaim_state)
+		current->reclaim_state->reclaimed_slab += reap;
 
 	dispose_list(&freeable);
 }
diff --git a/fs/ioprio.c b/fs/ioprio.c
index f79dab83e17b..f84b380d65e5 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -48,28 +48,12 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
 	if (err)
 		return err;
 
-	task_lock(task);
-	do {
-		ioc = task->io_context;
-		/* see wmb() in current_io_context() */
-		smp_read_barrier_depends();
-		if (ioc)
-			break;
-
-		ioc = alloc_io_context(GFP_ATOMIC, -1);
-		if (!ioc) {
-			err = -ENOMEM;
-			break;
-		}
-		task->io_context = ioc;
-	} while (1);
-
-	if (!err) {
-		ioc->ioprio = ioprio;
-		ioc->ioprio_changed = 1;
+	ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
+	if (ioc) {
+		ioc_ioprio_changed(ioc, ioprio);
+		put_io_context(ioc, NULL);
 	}
 
-	task_unlock(task);
 	return err;
 }
 EXPORT_SYMBOL_GPL(set_task_ioprio);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 7b99f5f460be..bd62c76fb5df 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -948,8 +948,11 @@ root_found:
 
 	/* get the root dentry */
 	s->s_root = d_alloc_root(inode);
-	if (!(s->s_root))
-		goto out_no_root;
+	if (!(s->s_root)) {
+		iput(inode);
+		error = -ENOMEM;
+		goto out_no_inode;
+	}
 
 	kfree(opt.iocharset);
 
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 5d1a00a5041b..05f0754f2b46 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -453,8 +453,6 @@ out:
 *
 * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
 *
- * Called with the journal lock held.
- *
 * This is the only part of the journaling code which really needs to be
 * aware of transaction aborts.  Checkpointing involves writing to the
 * main filesystem area rather than to the journal, so it can proceed
@@ -472,13 +470,14 @@ int cleanup_journal_tail(journal_t *journal)
 	if (is_journal_aborted(journal))
 		return 1;
 
-	/* OK, work out the oldest transaction remaining in the log, and
+	/*
+	 * OK, work out the oldest transaction remaining in the log, and
 	 * the log block it starts at.
 	 *
 	 * If the log is now empty, we need to work out which is the
 	 * next transaction ID we will write, and where it will
-	 * start. */
-
+	 * start.
+	 */
 	spin_lock(&journal->j_state_lock);
 	spin_lock(&journal->j_list_lock);
 	transaction = journal->j_checkpoint_transactions;
@@ -504,7 +503,25 @@ int cleanup_journal_tail(journal_t *journal)
 		spin_unlock(&journal->j_state_lock);
 		return 1;
 	}
+	spin_unlock(&journal->j_state_lock);
+
+	/*
+	 * We need to make sure that any blocks that were recently written out
+	 * --- perhaps by log_do_checkpoint() --- are flushed out before we
+	 * drop the transactions from the journal. It's unlikely this will be
+	 * necessary, especially with an appropriately sized journal, but we
+	 * need this to guarantee correctness. Fortunately
+	 * cleanup_journal_tail() doesn't get called all that often.
+	 */
+	if (journal->j_flags & JFS_BARRIER)
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 
+	spin_lock(&journal->j_state_lock);
+	if (!tid_gt(first_tid, journal->j_tail_sequence)) {
+		spin_unlock(&journal->j_state_lock);
+		/* Someone else cleaned up journal so return 0 */
+		return 0;
+	}
 	/* OK, update the superblock to recover the freed space.
 	 * Physical blocks come first: have we wrapped beyond the end of
 	 * the log? */
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 8799207df058..f2b9a571f4cf 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -392,6 +392,12 @@ void journal_commit_transaction(journal_t *journal)
 	jbd_debug (3, "JBD: commit phase 1\n");
 
 	/*
+	 * Clear the revoked flag to reflect that there are no revoked
+	 * buffers in the next transaction which is going to be started.
+	 */
+	journal_clear_buffer_revoked_flags(journal);
+
+	/*
 	 * Switch to a new revoke table.
 	 */
 	journal_switch_revoke_table(journal);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index a96cff0c5f1d..59c09f9541b5 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -721,7 +721,6 @@ static journal_t * journal_init_common (void)
 	init_waitqueue_head(&journal->j_wait_checkpoint);
 	init_waitqueue_head(&journal->j_wait_commit);
 	init_waitqueue_head(&journal->j_wait_updates);
-	mutex_init(&journal->j_barrier);
 	mutex_init(&journal->j_checkpoint_mutex);
 	spin_lock_init(&journal->j_revoke_lock);
 	spin_lock_init(&journal->j_list_lock);
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 5b43e96788e6..008bf062fd26 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,6 +20,7 @@
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/errno.h>
+#include <linux/blkdev.h>
 #endif
 
 /*
@@ -263,6 +264,9 @@ int journal_recover(journal_t *journal)
 	err2 = sync_blockdev(journal->j_fs_dev);
 	if (!err)
 		err = err2;
+	/* Flush disk caches to get replayed data on the permanent storage */
+	if (journal->j_flags & JFS_BARRIER)
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 
 	return err;
 }
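Both jbd hunks above use the 3.x-era blkdev_issue_flush() signature (block device, gfp mask, optional error-sector pointer), which issues a cache-flush request to the device and waits for it to complete. A minimal sketch of the guarded call pattern (helper name invented):

#include <linux/blkdev.h>
#include <linux/jbd.h>

static int example_flush_journal_dev(journal_t *journal)
{
        /* only flush when the journal was mounted with barriers enabled */
        if (journal->j_flags & JFS_BARRIER)
                return blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
        return 0;
}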
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 305a90763154..25c713e7071c 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -47,6 +47,10 @@
 * overwriting the new data.  We don't even need to clear the revoke
 * bit here.
 *
+ * We cache the revoke status of a buffer in the current transaction in its
+ * b_state bits.  As the name says, the revokevalid flag indicates that the
+ * cached revoke status of a buffer is valid and we can rely on it.
+ *
 * Revoke information on buffers is a tri-state value:
 *
 * RevokeValid clear:	no cached revoke status, need to look it up
@@ -479,6 +483,36 @@ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 	return did_revoke;
 }
 
+/*
+ * journal_clear_buffer_revoked_flags clears the revoked flag of buffers in
+ * the revoke table to reflect that there are no revoked buffers in the
+ * next transaction which is going to be started.
+ */
+void journal_clear_buffer_revoked_flags(journal_t *journal)
+{
+	struct jbd_revoke_table_s *revoke = journal->j_revoke;
+	int i = 0;
+
+	for (i = 0; i < revoke->hash_size; i++) {
+		struct list_head *hash_list;
+		struct list_head *list_entry;
+		hash_list = &revoke->hash_table[i];
+
+		list_for_each(list_entry, hash_list) {
+			struct jbd_revoke_record_s *record;
+			struct buffer_head *bh;
+			record = (struct jbd_revoke_record_s *)list_entry;
+			bh = __find_get_block(journal->j_fs_dev,
+					      record->blocknr,
+					      journal->j_blocksize);
+			if (bh) {
+				clear_buffer_revoked(bh);
+				__brelse(bh);
+			}
+		}
+	}
+}
+
 /* journal_switch_revoke table select j_revoke for next transaction
  * we do not want to suspend any processing until all revokes are
  * written -bzzz
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 7e59c6e66f9b..7fce94b04bc3 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -426,17 +426,34 @@ int journal_restart(handle_t *handle, int nblocks)
 * void journal_lock_updates () - establish a transaction barrier.
 * @journal:  Journal to establish a barrier on.
 *
- * This locks out any further updates from being started, and blocks
- * until all existing updates have completed, returning only once the
- * journal is in a quiescent state with no updates running.
- *
- * The journal lock should not be held on entry.
+ * This locks out any further updates from being started, and blocks until all
+ * existing updates have completed, returning only once the journal is in a
+ * quiescent state with no updates running.
+ *
+ * We do not use a simple mutex for synchronization as there are syscalls
+ * which want to return with the filesystem locked and that trips up lockdep.
+ * Also hibernate needs to lock the filesystem but a locked mutex then blocks
+ * hibernation.  Since locking the filesystem is a rare operation, we use a
+ * simple counter and waitqueue for locking.
 */
 void journal_lock_updates(journal_t *journal)
 {
 	DEFINE_WAIT(wait);
 
+wait:
+	/* Wait for previous locked operation to finish */
+	wait_event(journal->j_wait_transaction_locked,
+		   journal->j_barrier_count == 0);
+
 	spin_lock(&journal->j_state_lock);
+	/*
+	 * Check reliably under the lock whether we are the ones winning the
+	 * race and locking the journal
+	 */
+	if (journal->j_barrier_count > 0) {
+		spin_unlock(&journal->j_state_lock);
+		goto wait;
+	}
 	++journal->j_barrier_count;
 
 	/* Wait until there are no running updates */
@@ -460,14 +477,6 @@ void journal_lock_updates(journal_t *journal)
 		spin_lock(&journal->j_state_lock);
 	}
 	spin_unlock(&journal->j_state_lock);
-
-	/*
-	 * We have now established a barrier against other normal updates, but
-	 * we also need to barrier against other journal_lock_updates() calls
-	 * to make sure that we serialise special journal-locked operations
-	 * too.
-	 */
-	mutex_lock(&journal->j_barrier);
 }
 
 /**
@@ -475,14 +484,11 @@ void journal_lock_updates(journal_t *journal)
 * @journal:  Journal to release the barrier on.
 *
 * Release a transaction barrier obtained with journal_lock_updates().
- *
- * Should be called without the journal lock held.
 */
 void journal_unlock_updates (journal_t *journal)
 {
 	J_ASSERT(journal->j_barrier_count != 0);
 
-	mutex_unlock(&journal->j_barrier);
 	spin_lock(&journal->j_state_lock);
 	--journal->j_barrier_count;
 	spin_unlock(&journal->j_state_lock);
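The journal_lock_updates() rework above replaces a mutex held across the whole locked section with a counter plus waitqueue, so nothing stays "held" when a syscall returns with the filesystem still locked. The same scheme in isolation, as an illustrative sketch (not the jbd code itself):

#include <linux/spinlock.h>
#include <linux/wait.h>
#include <linux/sched.h>

struct example_barrier {
        spinlock_t lock;
        int count;
        wait_queue_head_t wq;
};

static void example_barrier_init(struct example_barrier *b)
{
        spin_lock_init(&b->lock);
        b->count = 0;
        init_waitqueue_head(&b->wq);
}

static void example_barrier_lock(struct example_barrier *b)
{
repeat:
        /* unlocked wait; the recheck under the lock below catches races */
        wait_event(b->wq, b->count == 0);

        spin_lock(&b->lock);
        if (b->count > 0) {
                /* someone else won the race; go back to waiting */
                spin_unlock(&b->lock);
                goto repeat;
        }
        b->count++;
        spin_unlock(&b->lock);
}

static void example_barrier_unlock(struct example_barrier *b)
{
        spin_lock(&b->lock);
        b->count--;
        spin_unlock(&b->lock);
        wake_up(&b->wq);
}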
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 68d704db787f..5069b8475150 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -430,6 +430,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	jbd_debug(3, "JBD2: commit phase 1\n");
 
 	/*
+	 * Clear the revoked flag to reflect that there are no revoked
+	 * buffers in the next transaction which is going to be started.
+	 */
+	jbd2_clear_buffer_revoked_flags(journal);
+
+	/*
 	 * Switch to a new revoke table.
 	 */
 	jbd2_journal_switch_revoke_table(journal);
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 69fd93588118..30b2867d6cc9 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -47,6 +47,10 @@
 * overwriting the new data.  We don't even need to clear the revoke
 * bit here.
 *
+ * We cache the revoke status of a buffer in the current transaction in its
+ * b_state bits.  As the name says, the revokevalid flag indicates that the
+ * cached revoke status of a buffer is valid and we can rely on it.
+ *
 * Revoke information on buffers is a tri-state value:
 *
 * RevokeValid clear:	no cached revoke status, need to look it up
@@ -478,6 +482,36 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 	return did_revoke;
 }
 
+/*
+ * jbd2_clear_buffer_revoked_flags clears the revoked flag of buffers in
+ * the revoke table to reflect that there are no revoked buffers in the
+ * next transaction which is going to be started.
+ */
+void jbd2_clear_buffer_revoked_flags(journal_t *journal)
+{
+	struct jbd2_revoke_table_s *revoke = journal->j_revoke;
+	int i = 0;
+
+	for (i = 0; i < revoke->hash_size; i++) {
+		struct list_head *hash_list;
+		struct list_head *list_entry;
+		hash_list = &revoke->hash_table[i];
+
+		list_for_each(list_entry, hash_list) {
+			struct jbd2_revoke_record_s *record;
+			struct buffer_head *bh;
+			record = (struct jbd2_revoke_record_s *)list_entry;
+			bh = __find_get_block(journal->j_fs_dev,
+					      record->blocknr,
+					      journal->j_blocksize);
+			if (bh) {
+				clear_buffer_revoked(bh);
+				__brelse(bh);
+			}
+		}
+	}
+}
+
 /* journal_switch_revoke table select j_revoke for next transaction
  * we do not want to suspend any processing until all revokes are
  * written -bzzz
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a0e41a4c080e..35ae096bed5d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -517,12 +517,13 @@ void jbd2_journal_lock_updates(journal_t *journal)
 			break;
 
 		spin_lock(&transaction->t_handle_lock);
+		prepare_to_wait(&journal->j_wait_updates, &wait,
+				TASK_UNINTERRUPTIBLE);
 		if (!atomic_read(&transaction->t_updates)) {
 			spin_unlock(&transaction->t_handle_lock);
+			finish_wait(&journal->j_wait_updates, &wait);
 			break;
 		}
-		prepare_to_wait(&journal->j_wait_updates, &wait,
-				TASK_UNINTERRUPTIBLE);
 		spin_unlock(&transaction->t_handle_lock);
 		write_unlock(&journal->j_state_lock);
 		schedule();
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index e513f1913c15..a01cdad6aad1 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -74,7 +74,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
 	((struct erase_priv_struct *)instr->priv)->jeb = jeb;
 	((struct erase_priv_struct *)instr->priv)->c = c;
 
-	ret = c->mtd->erase(c->mtd, instr);
+	ret = mtd_erase(c->mtd, instr);
 	if (!ret)
 		return;
 
@@ -336,12 +336,11 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
 	uint32_t ofs;
 	size_t retlen;
 	int ret = -EIO;
+	unsigned long *wordebuf;
 
-	if (c->mtd->point) {
-		unsigned long *wordebuf;
-
-		ret = c->mtd->point(c->mtd, jeb->offset, c->sector_size,
-				    &retlen, &ebuf, NULL);
+	ret = mtd_point(c->mtd, jeb->offset, c->sector_size, &retlen,
+			&ebuf, NULL);
+	if (ret != -EOPNOTSUPP) {
 		if (ret) {
 			D1(printk(KERN_DEBUG "MTD point failed %d\n", ret));
 			goto do_flash_read;
@@ -349,7 +348,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
 		if (retlen < c->sector_size) {
 			/* Don't muck about if it won't let us point to the whole erase sector */
 			D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", retlen));
-			c->mtd->unpoint(c->mtd, jeb->offset, retlen);
+			mtd_unpoint(c->mtd, jeb->offset, retlen);
 			goto do_flash_read;
 		}
 		wordebuf = ebuf-sizeof(*wordebuf);
@@ -358,7 +357,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
 			if (*++wordebuf != ~0)
 				break;
 		} while(--retlen);
-		c->mtd->unpoint(c->mtd, jeb->offset, c->sector_size);
+		mtd_unpoint(c->mtd, jeb->offset, c->sector_size);
 		if (retlen) {
 			printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08tx\n",
 			       *wordebuf, jeb->offset + c->sector_size-retlen*sizeof(*wordebuf));
@@ -381,7 +380,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
 
 	*bad_offset = ofs;
 
-	ret = c->mtd->read(c->mtd, ofs, readlen, &retlen, ebuf);
+	ret = mtd_read(c->mtd, ofs, readlen, &retlen, ebuf);
 	if (ret) {
 		printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret);
 		ret = -EIO;
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 4b8afe39a87f..2e0123867cb1 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -466,7 +466,6 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
 
 	if (insert_inode_locked(inode) < 0) {
 		make_bad_inode(inode);
-		unlock_new_inode(inode);
 		iput(inode);
 		return ERR_PTR(-EINVAL);
 	}
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index ee57bac1ba6d..3093ac4fb24c 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -62,17 +62,15 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info
 #ifndef __ECOS
 	/* TODO: instead, encapsulate point() stuff to jffs2_flash_read(),
 	 * adding a jffs2_flash_read_end() interface. */
-	if (c->mtd->point) {
-		err = c->mtd->point(c->mtd, ofs, len, &retlen,
-				    (void **)&buffer, NULL);
-		if (!err && retlen < len) {
-			JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize);
-			c->mtd->unpoint(c->mtd, ofs, retlen);
-		} else if (err)
+	err = mtd_point(c->mtd, ofs, len, &retlen, (void **)&buffer, NULL);
+	if (!err && retlen < len) {
+		JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize);
+		mtd_unpoint(c->mtd, ofs, retlen);
+	} else if (err) {
+		if (err != -EOPNOTSUPP)
 			JFFS2_WARNING("MTD point failed: error code %d.\n", err);
-	else
+	} else
 		pointed = 1; /* successfully pointed to device */
-	}
 #endif
 
 	if (!pointed) {
@@ -101,7 +99,7 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info
 		kfree(buffer);
 #ifndef __ECOS
 	else
-		c->mtd->unpoint(c->mtd, ofs, len);
+		mtd_unpoint(c->mtd, ofs, len);
 #endif
 
 	if (crc != tn->data_crc) {
@@ -137,7 +135,7 @@ free_out:
 	kfree(buffer);
 #ifndef __ECOS
 	else
-		c->mtd->unpoint(c->mtd, ofs, len);
+		mtd_unpoint(c->mtd, ofs, len);
 #endif
 	return err;
 }
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 28107ca136e4..f99464833bb2 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -97,15 +97,15 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 	size_t pointlen, try_size;
 
 	if (c->mtd->point) {
-		ret = c->mtd->point(c->mtd, 0, c->mtd->size, &pointlen,
+		ret = mtd_point(c->mtd, 0, c->mtd->size, &pointlen,
 				(void **)&flashbuf, NULL);
 		if (!ret && pointlen < c->mtd->size) {
 			/* Don't muck about if it won't let us point to the whole flash */
 			D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", pointlen));
-			c->mtd->unpoint(c->mtd, 0, pointlen);
+			mtd_unpoint(c->mtd, 0, pointlen);
 			flashbuf = NULL;
 		}
-		if (ret)
+		if (ret && ret != -EOPNOTSUPP)
 			D1(printk(KERN_DEBUG "MTD point failed %d\n", ret));
 	}
 #endif
@@ -273,7 +273,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 		kfree(flashbuf);
 #ifndef __ECOS
 	else
-		c->mtd->unpoint(c->mtd, 0, c->mtd->size);
+		mtd_unpoint(c->mtd, 0, c->mtd->size);
 #endif
 	kfree(s);
 	return ret;
@@ -455,7 +455,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 	if (jffs2_cleanmarker_oob(c)) {
 		int ret;
 
-		if (c->mtd->block_isbad(c->mtd, jeb->offset))
+		if (mtd_block_isbad(c->mtd, jeb->offset))
 			return BLK_STATE_BADBLOCK;
 
 		ret = jffs2_check_nand_cleanmarker(c, jeb);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 8be4925296cf..f2d96b5e64f6 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -335,9 +335,7 @@ static void jffs2_put_super (struct super_block *sb)
 	jffs2_flash_cleanup(c);
 	kfree(c->inocache_list);
 	jffs2_clear_xattr_subsystem(c);
-	if (c->mtd->sync)
-		c->mtd->sync(c->mtd);
-
+	mtd_sync(c->mtd);
 	D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
 }
 
343 341
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index b09e51d2f81f..30e8f47e8a23 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -228,7 +228,7 @@ static int jffs2_verify_write(struct jffs2_sb_info *c, unsigned char *buf,
228 size_t retlen; 228 size_t retlen;
229 char *eccstr; 229 char *eccstr;
230 230
231 ret = c->mtd->read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify); 231 ret = mtd_read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify);
232 if (ret && ret != -EUCLEAN && ret != -EBADMSG) { 232 if (ret && ret != -EUCLEAN && ret != -EBADMSG) {
233 printk(KERN_WARNING "jffs2_verify_write(): Read back of page at %08x failed: %d\n", c->wbuf_ofs, ret); 233 printk(KERN_WARNING "jffs2_verify_write(): Read back of page at %08x failed: %d\n", c->wbuf_ofs, ret);
234 return ret; 234 return ret;
@@ -337,7 +337,8 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
337 } 337 }
338 338
339 /* Do the read... */ 339 /* Do the read... */
340 ret = c->mtd->read(c->mtd, start, c->wbuf_ofs - start, &retlen, buf); 340 ret = mtd_read(c->mtd, start, c->wbuf_ofs - start, &retlen,
341 buf);
341 342
342 /* ECC recovered ? */ 343 /* ECC recovered ? */
343 if ((ret == -EUCLEAN || ret == -EBADMSG) && 344 if ((ret == -EUCLEAN || ret == -EBADMSG) &&
@@ -413,13 +414,12 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
413 if (breakme++ == 20) { 414 if (breakme++ == 20) {
414 printk(KERN_NOTICE "Faking write error at 0x%08x\n", ofs); 415 printk(KERN_NOTICE "Faking write error at 0x%08x\n", ofs);
415 breakme = 0; 416 breakme = 0;
416 c->mtd->write(c->mtd, ofs, towrite, &retlen, 417 mtd_write(c->mtd, ofs, towrite, &retlen, brokenbuf);
417 brokenbuf);
418 ret = -EIO; 418 ret = -EIO;
419 } else 419 } else
420#endif 420#endif
421 ret = c->mtd->write(c->mtd, ofs, towrite, &retlen, 421 ret = mtd_write(c->mtd, ofs, towrite, &retlen,
422 rewrite_buf); 422 rewrite_buf);
423 423
424 if (ret || retlen != towrite || jffs2_verify_write(c, rewrite_buf, ofs)) { 424 if (ret || retlen != towrite || jffs2_verify_write(c, rewrite_buf, ofs)) {
425 /* Argh. We tried. Really we did. */ 425 /* Argh. We tried. Really we did. */
@@ -619,13 +619,14 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
619 if (breakme++ == 20) { 619 if (breakme++ == 20) {
620 printk(KERN_NOTICE "Faking write error at 0x%08x\n", c->wbuf_ofs); 620 printk(KERN_NOTICE "Faking write error at 0x%08x\n", c->wbuf_ofs);
621 breakme = 0; 621 breakme = 0;
622 c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, 622 mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen,
623 brokenbuf); 623 brokenbuf);
624 ret = -EIO; 624 ret = -EIO;
625 } else 625 } else
626#endif 626#endif
627 627
628 ret = c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, c->wbuf); 628 ret = mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize,
629 &retlen, c->wbuf);
629 630
630 if (ret) { 631 if (ret) {
631 printk(KERN_WARNING "jffs2_flush_wbuf(): Write failed with %d\n", ret); 632 printk(KERN_WARNING "jffs2_flush_wbuf(): Write failed with %d\n", ret);
@@ -861,8 +862,8 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs,
861 v += wbuf_retlen; 862 v += wbuf_retlen;
862 863
863 if (vlen >= c->wbuf_pagesize) { 864 if (vlen >= c->wbuf_pagesize) {
864 ret = c->mtd->write(c->mtd, outvec_to, PAGE_DIV(vlen), 865 ret = mtd_write(c->mtd, outvec_to, PAGE_DIV(vlen),
865 &wbuf_retlen, v); 866 &wbuf_retlen, v);
866 if (ret < 0 || wbuf_retlen != PAGE_DIV(vlen)) 867 if (ret < 0 || wbuf_retlen != PAGE_DIV(vlen))
867 goto outfile; 868 goto outfile;
868 869
@@ -948,11 +949,11 @@ int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *re
948 int ret; 949 int ret;
949 950
950 if (!jffs2_is_writebuffered(c)) 951 if (!jffs2_is_writebuffered(c))
951 return c->mtd->read(c->mtd, ofs, len, retlen, buf); 952 return mtd_read(c->mtd, ofs, len, retlen, buf);
952 953
953 /* Read flash */ 954 /* Read flash */
954 down_read(&c->wbuf_sem); 955 down_read(&c->wbuf_sem);
955 ret = c->mtd->read(c->mtd, ofs, len, retlen, buf); 956 ret = mtd_read(c->mtd, ofs, len, retlen, buf);
956 957
957 if ( (ret == -EBADMSG || ret == -EUCLEAN) && (*retlen == len) ) { 958 if ( (ret == -EBADMSG || ret == -EUCLEAN) && (*retlen == len) ) {
958 if (ret == -EBADMSG) 959 if (ret == -EBADMSG)
@@ -1031,7 +1032,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
1031 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1032 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
1032 ops.datbuf = NULL; 1033 ops.datbuf = NULL;
1033 1034
1034 ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); 1035 ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
1035 if (ret || ops.oobretlen != ops.ooblen) { 1036 if (ret || ops.oobretlen != ops.ooblen) {
1036 printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" 1037 printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd"
1037 " bytes, read %zd bytes, error %d\n", 1038 " bytes, read %zd bytes, error %d\n",
@@ -1074,7 +1075,7 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
1074 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1075 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
1075 ops.datbuf = NULL; 1076 ops.datbuf = NULL;
1076 1077
1077 ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); 1078 ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
1078 if (ret || ops.oobretlen != ops.ooblen) { 1079 if (ret || ops.oobretlen != ops.ooblen) {
1079 printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" 1080 printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd"
1080 " bytes, read %zd bytes, error %d\n", 1081 " bytes, read %zd bytes, error %d\n",
@@ -1100,7 +1101,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
1100 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1101 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
1101 ops.datbuf = NULL; 1102 ops.datbuf = NULL;
1102 1103
1103 ret = c->mtd->write_oob(c->mtd, jeb->offset, &ops); 1104 ret = mtd_write_oob(c->mtd, jeb->offset, &ops);
1104 if (ret || ops.oobretlen != ops.ooblen) { 1105 if (ret || ops.oobretlen != ops.ooblen) {
1105 printk(KERN_ERR "cannot write OOB for EB at %08x, requested %zd" 1106 printk(KERN_ERR "cannot write OOB for EB at %08x, requested %zd"
1106 " bytes, read %zd bytes, error %d\n", 1107 " bytes, read %zd bytes, error %d\n",
@@ -1129,11 +1130,8 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *
1129 if( ++jeb->bad_count < MAX_ERASE_FAILURES) 1130 if( ++jeb->bad_count < MAX_ERASE_FAILURES)
1130 return 0; 1131 return 0;
1131 1132
1132 if (!c->mtd->block_markbad)
1133 return 1; // What else can we do?
1134
1135 printk(KERN_WARNING "JFFS2: marking eraseblock at %08x as bad\n", bad_offset); 1133 printk(KERN_WARNING "JFFS2: marking eraseblock at %08x as bad\n", bad_offset);
1136 ret = c->mtd->block_markbad(c->mtd, bad_offset); 1134 ret = mtd_block_markbad(c->mtd, bad_offset);
1137 1135
1138 if (ret) { 1136 if (ret) {
1139 D1(printk(KERN_WARNING "jffs2_write_nand_badblock(): Write failed for block at %08x: error %d\n", jeb->offset, ret)); 1137 D1(printk(KERN_WARNING "jffs2_write_nand_badblock(): Write failed for block at %08x: error %d\n", jeb->offset, ret));
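The mtd_read()/mtd_write()/mtd_block_markbad() calls introduced above are the new static-inline wrappers around the struct mtd_info method pointers. A minimal sketch of the wrapper pattern, assuming the 3.3-era include/linux/mtd/mtd.h (bodies here are illustrative, not part of this diff):

	/* Each mtd_*() helper forwards to the matching method pointer, so
	 * callers no longer dereference c->mtd->read directly. */
	static inline int mtd_read(struct mtd_info *mtd, loff_t from, size_t len,
				   size_t *retlen, u_char *buf)
	{
		return mtd->read(mtd, from, len, retlen, buf);
	}

	/* The wrapper also absorbs NULL-method checks, which is why the
	 * open-coded "if (!c->mtd->block_markbad) return 1;" test above
	 * could be deleted from jffs2_write_nand_badblock(). */
	static inline int mtd_block_markbad(struct mtd_info *mtd, loff_t ofs)
	{
		if (!mtd->block_markbad)
			return -EOPNOTSUPP;
		return mtd->block_markbad(mtd, ofs);
	}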
diff --git a/fs/jffs2/writev.c b/fs/jffs2/writev.c
index b9276b11bac6..a1bda9dab3f8 100644
--- a/fs/jffs2/writev.c
+++ b/fs/jffs2/writev.c
@@ -13,30 +13,6 @@
13#include <linux/mtd/mtd.h> 13#include <linux/mtd/mtd.h>
14#include "nodelist.h" 14#include "nodelist.h"
15 15
16/* This ought to be in core MTD code. All registered MTD devices
17 without writev should have this put in place. Bug the MTD
18 maintainer */
19static inline int mtd_fake_writev(struct mtd_info *mtd, const struct kvec *vecs,
20 unsigned long count, loff_t to, size_t *retlen)
21{
22 unsigned long i;
23 size_t totlen = 0, thislen;
24 int ret = 0;
25
26 for (i=0; i<count; i++) {
27 if (!vecs[i].iov_len)
28 continue;
29 ret = mtd->write(mtd, to, vecs[i].iov_len, &thislen, vecs[i].iov_base);
30 totlen += thislen;
31 if (ret || thislen != vecs[i].iov_len)
32 break;
33 to += vecs[i].iov_len;
34 }
35 if (retlen)
36 *retlen = totlen;
37 return ret;
38}
39
40int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs, 16int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs,
41 unsigned long count, loff_t to, size_t *retlen) 17 unsigned long count, loff_t to, size_t *retlen)
42{ 18{
@@ -50,18 +26,14 @@ int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs,
50 } 26 }
51 } 27 }
52 28
53 if (c->mtd->writev) 29 return mtd_writev(c->mtd, vecs, count, to, retlen);
54 return c->mtd->writev(c->mtd, vecs, count, to, retlen);
55 else {
56 return mtd_fake_writev(c->mtd, vecs, count, to, retlen);
57 }
58} 30}
59 31
60int jffs2_flash_direct_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, 32int jffs2_flash_direct_write(struct jffs2_sb_info *c, loff_t ofs, size_t len,
61 size_t *retlen, const u_char *buf) 33 size_t *retlen, const u_char *buf)
62{ 34{
63 int ret; 35 int ret;
64 ret = c->mtd->write(c->mtd, ofs, len, retlen, buf); 36 ret = mtd_write(c->mtd, ofs, len, retlen, buf);
65 37
66 if (jffs2_sum_active()) { 38 if (jffs2_sum_active()) {
67 struct kvec vecs[1]; 39 struct kvec vecs[1];
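With mtd_fake_writev() deleted, jffs2 relies on the core mtd_writev() helper to provide the same fallback. A sketch of that helper, assuming it mirrors the loop removed above (the exact core-MTD body is an assumption):

	int mtd_writev(struct mtd_info *mtd, const struct kvec *vecs,
		       unsigned long count, loff_t to, size_t *retlen)
	{
		unsigned long i;
		size_t totlen = 0, thislen;
		int ret = 0;

		*retlen = 0;
		if (mtd->writev)	/* driver provides native vectored write */
			return mtd->writev(mtd, vecs, count, to, retlen);

		/* Otherwise emulate it with a loop of plain writes, exactly
		 * as the deleted jffs2-local fallback did. */
		for (i = 0; i < count; i++) {
			if (!vecs[i].iov_len)
				continue;
			ret = mtd_write(mtd, to, vecs[i].iov_len, &thislen,
					vecs[i].iov_base);
			totlen += thislen;
			if (ret || thislen != vecs[i].iov_len)
				break;
			to += vecs[i].iov_len;
		}
		*retlen = totlen;
		return ret;
	}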
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 23d7451b2938..65ba36b80a9e 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -55,7 +55,7 @@ static DEFINE_SPINLOCK(nsm_lock);
55 * Local NSM state 55 * Local NSM state
56 */ 56 */
57u32 __read_mostly nsm_local_state; 57u32 __read_mostly nsm_local_state;
58int __read_mostly nsm_use_hostnames; 58bool __read_mostly nsm_use_hostnames;
59 59
60static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) 60static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
61{ 61{
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index 339e17e9133d..9c501449450d 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -13,13 +13,14 @@
13 13
14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) 14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
15 15
16static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf) 16static int logfs_mtd_read(struct super_block *sb, loff_t ofs, size_t len,
17 void *buf)
17{ 18{
18 struct mtd_info *mtd = logfs_super(sb)->s_mtd; 19 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
19 size_t retlen; 20 size_t retlen;
20 int ret; 21 int ret;
21 22
22 ret = mtd->read(mtd, ofs, len, &retlen, buf); 23 ret = mtd_read(mtd, ofs, len, &retlen, buf);
23 BUG_ON(ret == -EINVAL); 24 BUG_ON(ret == -EINVAL);
24 if (ret) 25 if (ret)
25 return ret; 26 return ret;
@@ -31,7 +32,8 @@ static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf)
31 return 0; 32 return 0;
32} 33}
33 34
34static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) 35static int logfs_mtd_write(struct super_block *sb, loff_t ofs, size_t len,
36 void *buf)
35{ 37{
36 struct logfs_super *super = logfs_super(sb); 38 struct logfs_super *super = logfs_super(sb);
37 struct mtd_info *mtd = super->s_mtd; 39 struct mtd_info *mtd = super->s_mtd;
@@ -47,7 +49,7 @@ static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
47 BUG_ON(len > PAGE_CACHE_SIZE); 49 BUG_ON(len > PAGE_CACHE_SIZE);
48 page_start = ofs & PAGE_CACHE_MASK; 50 page_start = ofs & PAGE_CACHE_MASK;
49 page_end = PAGE_CACHE_ALIGN(ofs + len) - 1; 51 page_end = PAGE_CACHE_ALIGN(ofs + len) - 1;
50 ret = mtd->write(mtd, ofs, len, &retlen, buf); 52 ret = mtd_write(mtd, ofs, len, &retlen, buf);
51 if (ret || (retlen != len)) 53 if (ret || (retlen != len))
52 return -EIO; 54 return -EIO;
53 55
@@ -60,14 +62,15 @@ static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
60 * asynchronous properties. So just to prevent the first implementor of such 62 * asynchronous properties. So just to prevent the first implementor of such
61 * a thing from breaking logfs in 2350, we do the usual pointless dance to 63 * a thing from breaking logfs in 2350, we do the usual pointless dance to
62 * declare a completion variable and wait for completion before returning 64 * declare a completion variable and wait for completion before returning
63 * from mtd_erase(). What an exercise in futility! 65 * from logfs_mtd_erase(). What an exercise in futility!
64 */ 66 */
65static void logfs_erase_callback(struct erase_info *ei) 67static void logfs_erase_callback(struct erase_info *ei)
66{ 68{
67 complete((struct completion *)ei->priv); 69 complete((struct completion *)ei->priv);
68} 70}
69 71
70static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) 72static int logfs_mtd_erase_mapping(struct super_block *sb, loff_t ofs,
73 size_t len)
71{ 74{
72 struct logfs_super *super = logfs_super(sb); 75 struct logfs_super *super = logfs_super(sb);
73 struct address_space *mapping = super->s_mapping_inode->i_mapping; 76 struct address_space *mapping = super->s_mapping_inode->i_mapping;
@@ -84,7 +87,7 @@ static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len)
84 return 0; 87 return 0;
85} 88}
86 89
87static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len, 90static int logfs_mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
88 int ensure_write) 91 int ensure_write)
89{ 92{
90 struct mtd_info *mtd = logfs_super(sb)->s_mtd; 93 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
@@ -102,30 +105,29 @@ static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
102 ei.len = len; 105 ei.len = len;
103 ei.callback = logfs_erase_callback; 106 ei.callback = logfs_erase_callback;
104 ei.priv = (long)&complete; 107 ei.priv = (long)&complete;
105 ret = mtd->erase(mtd, &ei); 108 ret = mtd_erase(mtd, &ei);
106 if (ret) 109 if (ret)
107 return -EIO; 110 return -EIO;
108 111
109 wait_for_completion(&complete); 112 wait_for_completion(&complete);
110 if (ei.state != MTD_ERASE_DONE) 113 if (ei.state != MTD_ERASE_DONE)
111 return -EIO; 114 return -EIO;
112 return mtd_erase_mapping(sb, ofs, len); 115 return logfs_mtd_erase_mapping(sb, ofs, len);
113} 116}
114 117
115static void mtd_sync(struct super_block *sb) 118static void logfs_mtd_sync(struct super_block *sb)
116{ 119{
117 struct mtd_info *mtd = logfs_super(sb)->s_mtd; 120 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
118 121
119 if (mtd->sync) 122 mtd_sync(mtd);
120 mtd->sync(mtd);
121} 123}
122 124
123static int mtd_readpage(void *_sb, struct page *page) 125static int logfs_mtd_readpage(void *_sb, struct page *page)
124{ 126{
125 struct super_block *sb = _sb; 127 struct super_block *sb = _sb;
126 int err; 128 int err;
127 129
128 err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, 130 err = logfs_mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
129 page_address(page)); 131 page_address(page));
130 if (err == -EUCLEAN || err == -EBADMSG) { 132 if (err == -EUCLEAN || err == -EBADMSG) {
131 /* -EBADMSG happens regularly on power failures */ 133 /* -EBADMSG happens regularly on power failures */
@@ -143,18 +145,15 @@ static int mtd_readpage(void *_sb, struct page *page)
143 return err; 145 return err;
144} 146}
145 147
146static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs) 148static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs)
147{ 149{
148 struct logfs_super *super = logfs_super(sb); 150 struct logfs_super *super = logfs_super(sb);
149 struct address_space *mapping = super->s_mapping_inode->i_mapping; 151 struct address_space *mapping = super->s_mapping_inode->i_mapping;
150 filler_t *filler = mtd_readpage; 152 filler_t *filler = logfs_mtd_readpage;
151 struct mtd_info *mtd = super->s_mtd; 153 struct mtd_info *mtd = super->s_mtd;
152 154
153 if (!mtd->block_isbad)
154 return NULL;
155
156 *ofs = 0; 155 *ofs = 0;
157 while (mtd->block_isbad(mtd, *ofs)) { 156 while (mtd_block_isbad(mtd, *ofs)) {
158 *ofs += mtd->erasesize; 157 *ofs += mtd->erasesize;
159 if (*ofs >= mtd->size) 158 if (*ofs >= mtd->size)
160 return NULL; 159 return NULL;
@@ -163,18 +162,15 @@ static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs)
163 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); 162 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
164} 163}
165 164
166static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs) 165static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs)
167{ 166{
168 struct logfs_super *super = logfs_super(sb); 167 struct logfs_super *super = logfs_super(sb);
169 struct address_space *mapping = super->s_mapping_inode->i_mapping; 168 struct address_space *mapping = super->s_mapping_inode->i_mapping;
170 filler_t *filler = mtd_readpage; 169 filler_t *filler = logfs_mtd_readpage;
171 struct mtd_info *mtd = super->s_mtd; 170 struct mtd_info *mtd = super->s_mtd;
172 171
173 if (!mtd->block_isbad)
174 return NULL;
175
176 *ofs = mtd->size - mtd->erasesize; 172 *ofs = mtd->size - mtd->erasesize;
177 while (mtd->block_isbad(mtd, *ofs)) { 173 while (mtd_block_isbad(mtd, *ofs)) {
178 *ofs -= mtd->erasesize; 174 *ofs -= mtd->erasesize;
179 if (*ofs <= 0) 175 if (*ofs <= 0)
180 return NULL; 176 return NULL;
@@ -184,7 +180,7 @@ static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs)
184 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); 180 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
185} 181}
186 182
187static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, 183static int __logfs_mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
188 size_t nr_pages) 184 size_t nr_pages)
189{ 185{
190 struct logfs_super *super = logfs_super(sb); 186 struct logfs_super *super = logfs_super(sb);
@@ -196,8 +192,8 @@ static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
196 page = find_lock_page(mapping, index + i); 192 page = find_lock_page(mapping, index + i);
197 BUG_ON(!page); 193 BUG_ON(!page);
198 194
199 err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, 195 err = logfs_mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
200 page_address(page)); 196 page_address(page));
201 unlock_page(page); 197 unlock_page(page);
202 page_cache_release(page); 198 page_cache_release(page);
203 if (err) 199 if (err)
@@ -206,7 +202,7 @@ static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
206 return 0; 202 return 0;
207} 203}
208 204
209static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) 205static void logfs_mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
210{ 206{
211 struct logfs_super *super = logfs_super(sb); 207 struct logfs_super *super = logfs_super(sb);
212 int head; 208 int head;
@@ -227,15 +223,15 @@ static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
227 len += head; 223 len += head;
228 } 224 }
229 len = PAGE_ALIGN(len); 225 len = PAGE_ALIGN(len);
230 __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); 226 __logfs_mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
231} 227}
232 228
233static void mtd_put_device(struct logfs_super *s) 229static void logfs_mtd_put_device(struct logfs_super *s)
234{ 230{
235 put_mtd_device(s->s_mtd); 231 put_mtd_device(s->s_mtd);
236} 232}
237 233
238static int mtd_can_write_buf(struct super_block *sb, u64 ofs) 234static int logfs_mtd_can_write_buf(struct super_block *sb, u64 ofs)
239{ 235{
240 struct logfs_super *super = logfs_super(sb); 236 struct logfs_super *super = logfs_super(sb);
241 void *buf; 237 void *buf;
@@ -244,7 +240,7 @@ static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
244 buf = kmalloc(super->s_writesize, GFP_KERNEL); 240 buf = kmalloc(super->s_writesize, GFP_KERNEL);
245 if (!buf) 241 if (!buf)
246 return -ENOMEM; 242 return -ENOMEM;
247 err = mtd_read(sb, ofs, super->s_writesize, buf); 243 err = logfs_mtd_read(sb, ofs, super->s_writesize, buf);
248 if (err) 244 if (err)
249 goto out; 245 goto out;
250 if (memchr_inv(buf, 0xff, super->s_writesize)) 246 if (memchr_inv(buf, 0xff, super->s_writesize))
@@ -255,14 +251,14 @@ out:
255} 251}
256 252
257static const struct logfs_device_ops mtd_devops = { 253static const struct logfs_device_ops mtd_devops = {
258 .find_first_sb = mtd_find_first_sb, 254 .find_first_sb = logfs_mtd_find_first_sb,
259 .find_last_sb = mtd_find_last_sb, 255 .find_last_sb = logfs_mtd_find_last_sb,
260 .readpage = mtd_readpage, 256 .readpage = logfs_mtd_readpage,
261 .writeseg = mtd_writeseg, 257 .writeseg = logfs_mtd_writeseg,
262 .erase = mtd_erase, 258 .erase = logfs_mtd_erase,
263 .can_write_buf = mtd_can_write_buf, 259 .can_write_buf = logfs_mtd_can_write_buf,
264 .sync = mtd_sync, 260 .sync = logfs_mtd_sync,
265 .put_device = mtd_put_device, 261 .put_device = logfs_mtd_put_device,
266}; 262};
267 263
268int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) 264int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 501043e8966c..3de7a32cadbe 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -71,7 +71,7 @@ static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
71 71
72static int write_inode(struct inode *inode) 72static int write_inode(struct inode *inode)
73{ 73{
74 return __logfs_write_inode(inode, WF_LOCK); 74 return __logfs_write_inode(inode, NULL, WF_LOCK);
75} 75}
76 76
77static s64 dir_seek_data(struct inode *inode, s64 pos) 77static s64 dir_seek_data(struct inode *inode, s64 pos)
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index b548c87a86f1..3886cded283c 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -230,7 +230,9 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
230 return ret; 230 return ret;
231 231
232 mutex_lock(&inode->i_mutex); 232 mutex_lock(&inode->i_mutex);
233 logfs_get_wblocks(sb, NULL, WF_LOCK);
233 logfs_write_anchor(sb); 234 logfs_write_anchor(sb);
235 logfs_put_wblocks(sb, NULL, WF_LOCK);
234 mutex_unlock(&inode->i_mutex); 236 mutex_unlock(&inode->i_mutex);
235 237
236 return 0; 238 return 0;
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index caa4419285dc..d4efb061bdc5 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -367,7 +367,7 @@ static struct gc_candidate *get_candidate(struct super_block *sb)
367 int i, max_dist; 367 int i, max_dist;
368 struct gc_candidate *cand = NULL, *this; 368 struct gc_candidate *cand = NULL, *this;
369 369
370 max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS); 370 max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1);
371 371
372 for (i = max_dist; i >= 0; i--) { 372 for (i = max_dist; i >= 0; i--) {
373 this = first_in_list(&super->s_low_list[i]); 373 this = first_in_list(&super->s_low_list[i]);
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 388df1aa35e5..a422f42238b2 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -286,7 +286,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
286 if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN) 286 if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
287 return 0; 287 return 0;
288 288
289 ret = __logfs_write_inode(inode, flags); 289 ret = __logfs_write_inode(inode, NULL, flags);
290 LOGFS_BUG_ON(ret, inode->i_sb); 290 LOGFS_BUG_ON(ret, inode->i_sb);
291 return ret; 291 return ret;
292} 292}
@@ -363,7 +363,9 @@ static void logfs_init_once(void *_li)
363 363
364static int logfs_sync_fs(struct super_block *sb, int wait) 364static int logfs_sync_fs(struct super_block *sb, int wait)
365{ 365{
366 logfs_get_wblocks(sb, NULL, WF_LOCK);
366 logfs_write_anchor(sb); 367 logfs_write_anchor(sb);
368 logfs_put_wblocks(sb, NULL, WF_LOCK);
367 return 0; 369 return 0;
368} 370}
369 371
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 9da29706f91c..1e1c369df22b 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -612,7 +612,6 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
612 if (len == 0) 612 if (len == 0)
613 return logfs_write_header(super, header, 0, type); 613 return logfs_write_header(super, header, 0, type);
614 614
615 BUG_ON(len > sb->s_blocksize);
616 compr_len = logfs_compress(buf, data, len, sb->s_blocksize); 615 compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
617 if (compr_len < 0 || type == JE_ANCHOR) { 616 if (compr_len < 0 || type == JE_ANCHOR) {
618 memcpy(data, buf, len); 617 memcpy(data, buf, len);
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 926373866a55..5f0937609465 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -528,7 +528,7 @@ void logfs_destroy_inode_cache(void);
528void logfs_set_blocks(struct inode *inode, u64 no); 528void logfs_set_blocks(struct inode *inode, u64 no);
529/* these logically belong into inode.c but actually reside in readwrite.c */ 529/* these logically belong into inode.c but actually reside in readwrite.c */
530int logfs_read_inode(struct inode *inode); 530int logfs_read_inode(struct inode *inode);
531int __logfs_write_inode(struct inode *inode, long flags); 531int __logfs_write_inode(struct inode *inode, struct page *, long flags);
532void logfs_evict_inode(struct inode *inode); 532void logfs_evict_inode(struct inode *inode);
533 533
534/* journal.c */ 534/* journal.c */
@@ -577,6 +577,8 @@ void initialize_block_counters(struct page *page, struct logfs_block *block,
577 __be64 *array, int page_is_empty); 577 __be64 *array, int page_is_empty);
578int logfs_exist_block(struct inode *inode, u64 bix); 578int logfs_exist_block(struct inode *inode, u64 bix);
579int get_page_reserve(struct inode *inode, struct page *page); 579int get_page_reserve(struct inode *inode, struct page *page);
580void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock);
581void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock);
580extern struct logfs_block_ops indirect_block_ops; 582extern struct logfs_block_ops indirect_block_ops;
581 583
582/* segment.c */ 584/* segment.c */
@@ -594,6 +596,7 @@ int logfs_init_mapping(struct super_block *sb);
594void logfs_sync_area(struct logfs_area *area); 596void logfs_sync_area(struct logfs_area *area);
595void logfs_sync_segments(struct super_block *sb); 597void logfs_sync_segments(struct super_block *sb);
596void freeseg(struct super_block *sb, u32 segno); 598void freeseg(struct super_block *sb, u32 segno);
599void free_areas(struct super_block *sb);
597 600
598/* area handling */ 601/* area handling */
599int logfs_init_areas(struct super_block *sb); 602int logfs_init_areas(struct super_block *sb);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 2ac4217b7901..4153e65b0148 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -244,8 +244,7 @@ static void preunlock_page(struct super_block *sb, struct page *page, int lock)
244 * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked 244 * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked
245 * in addition to PG_locked. 245 * in addition to PG_locked.
246 */ 246 */
247static void logfs_get_wblocks(struct super_block *sb, struct page *page, 247void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock)
248 int lock)
249{ 248{
250 struct logfs_super *super = logfs_super(sb); 249 struct logfs_super *super = logfs_super(sb);
251 250
@@ -260,8 +259,7 @@ static void logfs_get_wblocks(struct super_block *sb, struct page *page,
260 } 259 }
261} 260}
262 261
263static void logfs_put_wblocks(struct super_block *sb, struct page *page, 262void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock)
264 int lock)
265{ 263{
266 struct logfs_super *super = logfs_super(sb); 264 struct logfs_super *super = logfs_super(sb);
267 265
@@ -424,7 +422,7 @@ static void inode_write_block(struct logfs_block *block)
424 if (inode->i_ino == LOGFS_INO_MASTER) 422 if (inode->i_ino == LOGFS_INO_MASTER)
425 logfs_write_anchor(inode->i_sb); 423 logfs_write_anchor(inode->i_sb);
426 else { 424 else {
427 ret = __logfs_write_inode(inode, 0); 425 ret = __logfs_write_inode(inode, NULL, 0);
428 /* see indirect_write_block comment */ 426 /* see indirect_write_block comment */
429 BUG_ON(ret); 427 BUG_ON(ret);
430 } 428 }
@@ -560,8 +558,13 @@ static void inode_free_block(struct super_block *sb, struct logfs_block *block)
560static void indirect_free_block(struct super_block *sb, 558static void indirect_free_block(struct super_block *sb,
561 struct logfs_block *block) 559 struct logfs_block *block)
562{ 560{
563 ClearPagePrivate(block->page); 561 struct page *page = block->page;
564 block->page->private = 0; 562
563 if (PagePrivate(page)) {
564 ClearPagePrivate(page);
565 page_cache_release(page);
566 set_page_private(page, 0);
567 }
565 __free_block(sb, block); 568 __free_block(sb, block);
566} 569}
567 570
@@ -650,8 +653,11 @@ static void alloc_data_block(struct inode *inode, struct page *page)
650 logfs_unpack_index(page->index, &bix, &level); 653 logfs_unpack_index(page->index, &bix, &level);
651 block = __alloc_block(inode->i_sb, inode->i_ino, bix, level); 654 block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
652 block->page = page; 655 block->page = page;
656
653 SetPagePrivate(page); 657 SetPagePrivate(page);
654 page->private = (unsigned long)block; 658 page_cache_get(page);
659 set_page_private(page, (unsigned long) block);
660
655 block->ops = &indirect_block_ops; 661 block->ops = &indirect_block_ops;
656} 662}
657 663
@@ -1570,11 +1576,15 @@ int logfs_write_buf(struct inode *inode, struct page *page, long flags)
1570static int __logfs_delete(struct inode *inode, struct page *page) 1576static int __logfs_delete(struct inode *inode, struct page *page)
1571{ 1577{
1572 long flags = WF_DELETE; 1578 long flags = WF_DELETE;
1579 int err;
1573 1580
1574 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 1581 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1575 1582
1576 if (page->index < I0_BLOCKS) 1583 if (page->index < I0_BLOCKS)
1577 return logfs_write_direct(inode, page, flags); 1584 return logfs_write_direct(inode, page, flags);
1585 err = grow_inode(inode, page->index, 0);
1586 if (err)
1587 return err;
1578 return logfs_write_rec(inode, page, page->index, 0, flags); 1588 return logfs_write_rec(inode, page, page->index, 0, flags);
1579} 1589}
1580 1590
@@ -1623,7 +1633,7 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
1623 if (inode->i_ino == LOGFS_INO_MASTER) 1633 if (inode->i_ino == LOGFS_INO_MASTER)
1624 logfs_write_anchor(inode->i_sb); 1634 logfs_write_anchor(inode->i_sb);
1625 else { 1635 else {
1626 err = __logfs_write_inode(inode, flags); 1636 err = __logfs_write_inode(inode, page, flags);
1627 } 1637 }
1628 } 1638 }
1629 } 1639 }
@@ -1873,7 +1883,7 @@ int logfs_truncate(struct inode *inode, u64 target)
1873 logfs_get_wblocks(sb, NULL, 1); 1883 logfs_get_wblocks(sb, NULL, 1);
1874 err = __logfs_truncate(inode, size); 1884 err = __logfs_truncate(inode, size);
1875 if (!err) 1885 if (!err)
1876 err = __logfs_write_inode(inode, 0); 1886 err = __logfs_write_inode(inode, NULL, 0);
1877 logfs_put_wblocks(sb, NULL, 1); 1887 logfs_put_wblocks(sb, NULL, 1);
1878 } 1888 }
1879 1889
@@ -1901,8 +1911,11 @@ static void move_page_to_inode(struct inode *inode, struct page *page)
1901 li->li_block = block; 1911 li->li_block = block;
1902 1912
1903 block->page = NULL; 1913 block->page = NULL;
1904 page->private = 0; 1914 if (PagePrivate(page)) {
1905 ClearPagePrivate(page); 1915 ClearPagePrivate(page);
1916 page_cache_release(page);
1917 set_page_private(page, 0);
1918 }
1906} 1919}
1907 1920
1908static void move_inode_to_page(struct page *page, struct inode *inode) 1921static void move_inode_to_page(struct page *page, struct inode *inode)
@@ -1918,8 +1931,12 @@ static void move_inode_to_page(struct page *page, struct inode *inode)
1918 BUG_ON(PagePrivate(page)); 1931 BUG_ON(PagePrivate(page));
1919 block->ops = &indirect_block_ops; 1932 block->ops = &indirect_block_ops;
1920 block->page = page; 1933 block->page = page;
1921 page->private = (unsigned long)block; 1934
1922 SetPagePrivate(page); 1935 if (!PagePrivate(page)) {
1936 SetPagePrivate(page);
1937 page_cache_get(page);
1938 set_page_private(page, (unsigned long) block);
1939 }
1923 1940
1924 block->inode = NULL; 1941 block->inode = NULL;
1925 li->li_block = NULL; 1942 li->li_block = NULL;
@@ -2106,14 +2123,14 @@ void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
2106 ec_level); 2123 ec_level);
2107} 2124}
2108 2125
2109int __logfs_write_inode(struct inode *inode, long flags) 2126int __logfs_write_inode(struct inode *inode, struct page *page, long flags)
2110{ 2127{
2111 struct super_block *sb = inode->i_sb; 2128 struct super_block *sb = inode->i_sb;
2112 int ret; 2129 int ret;
2113 2130
2114 logfs_get_wblocks(sb, NULL, flags & WF_LOCK); 2131 logfs_get_wblocks(sb, page, flags & WF_LOCK);
2115 ret = do_write_inode(inode); 2132 ret = do_write_inode(inode);
2116 logfs_put_wblocks(sb, NULL, flags & WF_LOCK); 2133 logfs_put_wblocks(sb, page, flags & WF_LOCK);
2117 return ret; 2134 return ret;
2118} 2135}
2119 2136
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 9d5187353255..ab798ed1cc88 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -86,7 +86,11 @@ int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
86 BUG_ON(!page); /* FIXME: reserve a pool */ 86 BUG_ON(!page); /* FIXME: reserve a pool */
87 SetPageUptodate(page); 87 SetPageUptodate(page);
88 memcpy(page_address(page) + offset, buf, copylen); 88 memcpy(page_address(page) + offset, buf, copylen);
89 SetPagePrivate(page); 89
90 if (!PagePrivate(page)) {
91 SetPagePrivate(page);
92 page_cache_get(page);
93 }
90 page_cache_release(page); 94 page_cache_release(page);
91 95
92 buf += copylen; 96 buf += copylen;
@@ -110,7 +114,10 @@ static void pad_partial_page(struct logfs_area *area)
110 page = get_mapping_page(sb, index, 0); 114 page = get_mapping_page(sb, index, 0);
111 BUG_ON(!page); /* FIXME: reserve a pool */ 115 BUG_ON(!page); /* FIXME: reserve a pool */
112 memset(page_address(page) + offset, 0xff, len); 116 memset(page_address(page) + offset, 0xff, len);
113 SetPagePrivate(page); 117 if (!PagePrivate(page)) {
118 SetPagePrivate(page);
119 page_cache_get(page);
120 }
114 page_cache_release(page); 121 page_cache_release(page);
115 } 122 }
116} 123}
@@ -130,7 +137,10 @@ static void pad_full_pages(struct logfs_area *area)
130 BUG_ON(!page); /* FIXME: reserve a pool */ 137 BUG_ON(!page); /* FIXME: reserve a pool */
131 SetPageUptodate(page); 138 SetPageUptodate(page);
132 memset(page_address(page), 0xff, PAGE_CACHE_SIZE); 139 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
133 SetPagePrivate(page); 140 if (!PagePrivate(page)) {
141 SetPagePrivate(page);
142 page_cache_get(page);
143 }
134 page_cache_release(page); 144 page_cache_release(page);
135 index++; 145 index++;
136 no_indizes--; 146 no_indizes--;
@@ -485,8 +495,12 @@ static void move_btree_to_page(struct inode *inode, struct page *page,
485 mempool_free(item, super->s_alias_pool); 495 mempool_free(item, super->s_alias_pool);
486 } 496 }
487 block->page = page; 497 block->page = page;
488 SetPagePrivate(page); 498
489 page->private = (unsigned long)block; 499 if (!PagePrivate(page)) {
500 SetPagePrivate(page);
501 page_cache_get(page);
502 set_page_private(page, (unsigned long) block);
503 }
490 block->ops = &indirect_block_ops; 504 block->ops = &indirect_block_ops;
491 initialize_block_counters(page, block, data, 0); 505 initialize_block_counters(page, block, data, 0);
492} 506}
@@ -536,8 +550,12 @@ void move_page_to_btree(struct page *page)
536 list_add(&item->list, &block->item_list); 550 list_add(&item->list, &block->item_list);
537 } 551 }
538 block->page = NULL; 552 block->page = NULL;
539 ClearPagePrivate(page); 553
540 page->private = 0; 554 if (PagePrivate(page)) {
555 ClearPagePrivate(page);
556 page_cache_release(page);
557 set_page_private(page, 0);
558 }
541 block->ops = &btree_block_ops; 559 block->ops = &btree_block_ops;
542 err = alias_tree_insert(block->sb, block->ino, block->bix, block->level, 560 err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
543 block); 561 block);
@@ -702,7 +720,10 @@ void freeseg(struct super_block *sb, u32 segno)
702 page = find_get_page(mapping, ofs >> PAGE_SHIFT); 720 page = find_get_page(mapping, ofs >> PAGE_SHIFT);
703 if (!page) 721 if (!page)
704 continue; 722 continue;
705 ClearPagePrivate(page); 723 if (PagePrivate(page)) {
724 ClearPagePrivate(page);
725 page_cache_release(page);
726 }
706 page_cache_release(page); 727 page_cache_release(page);
707 } 728 }
708} 729}
@@ -841,6 +862,16 @@ static void free_area(struct logfs_area *area)
841 kfree(area); 862 kfree(area);
842} 863}
843 864
865void free_areas(struct super_block *sb)
866{
867 struct logfs_super *super = logfs_super(sb);
868 int i;
869
870 for_each_area(i)
871 free_area(super->s_area[i]);
872 free_area(super->s_journal_area);
873}
874
844static struct logfs_area *alloc_area(struct super_block *sb) 875static struct logfs_area *alloc_area(struct super_block *sb)
845{ 876{
846 struct logfs_area *area; 877 struct logfs_area *area;
@@ -923,10 +954,6 @@ err:
923void logfs_cleanup_areas(struct super_block *sb) 954void logfs_cleanup_areas(struct super_block *sb)
924{ 955{
925 struct logfs_super *super = logfs_super(sb); 956 struct logfs_super *super = logfs_super(sb);
926 int i;
927 957
928 btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias); 958 btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
929 for_each_area(i)
930 free_area(super->s_area[i]);
931 free_area(super->s_journal_area);
932} 959}
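The logfs hunks above repeat one idiom in several places: taking PG_private must be paired with an extra page reference, and dropping it must release that reference. A hypothetical helper pair capturing the idiom, in the spirit of attach_page_buffers() (names and helpers are not part of this commit; sketch only):

	static void logfs_attach_block(struct page *page, struct logfs_block *block)
	{
		if (!PagePrivate(page)) {
			SetPagePrivate(page);
			page_cache_get(page);	/* reference owned by PG_private */
			set_page_private(page, (unsigned long)block);
		}
	}

	static void logfs_detach_block(struct page *page)
	{
		if (PagePrivate(page)) {
			ClearPagePrivate(page);
			page_cache_release(page);	/* drop the PG_private reference */
			set_page_private(page, 0);
		}
	}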
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index e795c234ea33..c9ee7f5d1caf 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -486,14 +486,15 @@ static void logfs_kill_sb(struct super_block *sb)
486 /* Alias entries slow down mount, so evict as many as possible */ 486 /* Alias entries slow down mount, so evict as many as possible */
487 sync_filesystem(sb); 487 sync_filesystem(sb);
488 logfs_write_anchor(sb); 488 logfs_write_anchor(sb);
489 free_areas(sb);
489 490
490 /* 491 /*
491 * From this point on alias entries are simply dropped - and any 492 * From this point on alias entries are simply dropped - and any
492 * writes to the object store are considered bugs. 493 * writes to the object store are considered bugs.
493 */ 494 */
494 super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
495 log_super("LogFS: Now in shutdown\n"); 495 log_super("LogFS: Now in shutdown\n");
496 generic_shutdown_super(sb); 496 generic_shutdown_super(sb);
497 super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
497 498
498 BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes); 499 BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);
499 500
diff --git a/fs/mpage.c b/fs/mpage.c
index fdfae9fa98cd..643e9f55ef29 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -371,9 +371,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
371 sector_t last_block_in_bio = 0; 371 sector_t last_block_in_bio = 0;
372 struct buffer_head map_bh; 372 struct buffer_head map_bh;
373 unsigned long first_logical_block = 0; 373 unsigned long first_logical_block = 0;
374 struct blk_plug plug;
375
376 blk_start_plug(&plug);
377 374
378 map_bh.b_state = 0; 375 map_bh.b_state = 0;
379 map_bh.b_size = 0; 376 map_bh.b_size = 0;
@@ -395,7 +392,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
395 BUG_ON(!list_empty(pages)); 392 BUG_ON(!list_empty(pages));
396 if (bio) 393 if (bio)
397 mpage_bio_submit(READ, bio); 394 mpage_bio_submit(READ, bio);
398 blk_finish_plug(&plug);
399 return 0; 395 return 0;
400} 396}
401EXPORT_SYMBOL(mpage_readpages); 397EXPORT_SYMBOL(mpage_readpages);
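Dropping the plug here assumes the readahead caller already brackets the whole batch. A sketch of the expected caller shape, modelled on mm/readahead.c of this era (simplified; the real caller also handles the per-page ->readpage fallback):

	static int read_pages(struct address_space *mapping, struct file *filp,
			      struct list_head *pages, unsigned nr_pages)
	{
		struct blk_plug plug;
		int ret = 0;

		blk_start_plug(&plug);
		if (mapping->a_ops->readpages)
			ret = mapping->a_ops->readpages(filp, mapping,
							pages, nr_pages);
		blk_finish_plug(&plug);	/* one flush for the whole batch */
		return ret;
	}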
diff --git a/fs/namei.c b/fs/namei.c
index c283a1ec008e..208c6aa4a989 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -140,21 +140,19 @@ static int do_getname(const char __user *filename, char *page)
140 140
141static char *getname_flags(const char __user *filename, int flags, int *empty) 141static char *getname_flags(const char __user *filename, int flags, int *empty)
142{ 142{
143 char *tmp, *result; 143 char *result = __getname();
144 144 int retval;
145 result = ERR_PTR(-ENOMEM); 145
146 tmp = __getname(); 146 if (!result)
147 if (tmp) { 147 return ERR_PTR(-ENOMEM);
148 int retval = do_getname(filename, tmp); 148
149 149 retval = do_getname(filename, result);
150 result = tmp; 150 if (retval < 0) {
151 if (retval < 0) { 151 if (retval == -ENOENT && empty)
152 if (retval == -ENOENT && empty) 152 *empty = 1;
153 *empty = 1; 153 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
154 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { 154 __putname(result);
155 __putname(tmp); 155 return ERR_PTR(retval);
156 result = ERR_PTR(retval);
157 }
158 } 156 }
159 } 157 }
160 audit_getname(result); 158 audit_getname(result);
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 281ae95932c9..48cfac31f64c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -90,9 +90,9 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
90 */ 90 */
91struct parallel_io { 91struct parallel_io {
92 struct kref refcnt; 92 struct kref refcnt;
93 struct rpc_call_ops call_ops; 93 void (*pnfs_callback) (void *data, int num_se);
94 void (*pnfs_callback) (void *data);
95 void *data; 94 void *data;
95 int bse_count;
96}; 96};
97 97
98static inline struct parallel_io *alloc_parallel(void *data) 98static inline struct parallel_io *alloc_parallel(void *data)
@@ -103,6 +103,7 @@ static inline struct parallel_io *alloc_parallel(void *data)
103 if (rv) { 103 if (rv) {
104 rv->data = data; 104 rv->data = data;
105 kref_init(&rv->refcnt); 105 kref_init(&rv->refcnt);
106 rv->bse_count = 0;
106 } 107 }
107 return rv; 108 return rv;
108} 109}
@@ -117,7 +118,7 @@ static void destroy_parallel(struct kref *kref)
117 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); 118 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
118 119
119 dprintk("%s enter\n", __func__); 120 dprintk("%s enter\n", __func__);
120 p->pnfs_callback(p->data); 121 p->pnfs_callback(p->data, p->bse_count);
121 kfree(p); 122 kfree(p);
122} 123}
123 124
@@ -146,14 +147,19 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
146{ 147{
147 struct bio *bio; 148 struct bio *bio;
148 149
150 npg = min(npg, BIO_MAX_PAGES);
149 bio = bio_alloc(GFP_NOIO, npg); 151 bio = bio_alloc(GFP_NOIO, npg);
150 if (!bio) 152 if (!bio && (current->flags & PF_MEMALLOC)) {
151 return NULL; 153 while (!bio && (npg /= 2))
154 bio = bio_alloc(GFP_NOIO, npg);
155 }
152 156
153 bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; 157 if (bio) {
154 bio->bi_bdev = be->be_mdev; 158 bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
155 bio->bi_end_io = end_io; 159 bio->bi_bdev = be->be_mdev;
156 bio->bi_private = par; 160 bio->bi_end_io = end_io;
161 bio->bi_private = par;
162 }
157 return bio; 163 return bio;
158} 164}
159 165
@@ -212,22 +218,15 @@ static void bl_read_cleanup(struct work_struct *work)
212} 218}
213 219
214static void 220static void
215bl_end_par_io_read(void *data) 221bl_end_par_io_read(void *data, int unused)
216{ 222{
217 struct nfs_read_data *rdata = data; 223 struct nfs_read_data *rdata = data;
218 224
225 rdata->task.tk_status = rdata->pnfs_error;
219 INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); 226 INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
220 schedule_work(&rdata->task.u.tk_work); 227 schedule_work(&rdata->task.u.tk_work);
221} 228}
222 229
223/* We don't want normal .rpc_call_done callback used, so we replace it
224 * with this stub.
225 */
226static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
227{
228 return;
229}
230
231static enum pnfs_try_status 230static enum pnfs_try_status
232bl_read_pagelist(struct nfs_read_data *rdata) 231bl_read_pagelist(struct nfs_read_data *rdata)
233{ 232{
@@ -247,8 +246,6 @@ bl_read_pagelist(struct nfs_read_data *rdata)
247 par = alloc_parallel(rdata); 246 par = alloc_parallel(rdata);
248 if (!par) 247 if (!par)
249 goto use_mds; 248 goto use_mds;
250 par->call_ops = *rdata->mds_ops;
251 par->call_ops.rpc_call_done = bl_rpc_do_nothing;
252 par->pnfs_callback = bl_end_par_io_read; 249 par->pnfs_callback = bl_end_par_io_read;
253 /* At this point, we can no longer jump to use_mds */ 250 /* At this point, we can no longer jump to use_mds */
254 251
@@ -322,6 +319,7 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
322{ 319{
323 sector_t isect, end; 320 sector_t isect, end;
324 struct pnfs_block_extent *be; 321 struct pnfs_block_extent *be;
322 struct pnfs_block_short_extent *se;
325 323
326 dprintk("%s(%llu, %u)\n", __func__, offset, count); 324 dprintk("%s(%llu, %u)\n", __func__, offset, count);
327 if (count == 0) 325 if (count == 0)
@@ -334,8 +332,11 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
334 be = bl_find_get_extent(bl, isect, NULL); 332 be = bl_find_get_extent(bl, isect, NULL);
335 BUG_ON(!be); /* FIXME */ 333 BUG_ON(!be); /* FIXME */
336 len = min(end, be->be_f_offset + be->be_length) - isect; 334 len = min(end, be->be_f_offset + be->be_length) - isect;
337 if (be->be_state == PNFS_BLOCK_INVALID_DATA) 335 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
338 bl_mark_for_commit(be, isect, len); /* What if fails? */ 336 se = bl_pop_one_short_extent(be->be_inval);
337 BUG_ON(!se);
338 bl_mark_for_commit(be, isect, len, se);
339 }
339 isect += len; 340 isect += len;
340 bl_put_extent(be); 341 bl_put_extent(be);
341 } 342 }
@@ -357,7 +358,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
357 end_page_writeback(page); 358 end_page_writeback(page);
358 page_cache_release(page); 359 page_cache_release(page);
359 } while (bvec >= bio->bi_io_vec); 360 } while (bvec >= bio->bi_io_vec);
360 if (!uptodate) { 361
362 if (unlikely(!uptodate)) {
361 if (!wdata->pnfs_error) 363 if (!wdata->pnfs_error)
362 wdata->pnfs_error = -EIO; 364 wdata->pnfs_error = -EIO;
363 pnfs_set_lo_fail(wdata->lseg); 365 pnfs_set_lo_fail(wdata->lseg);
@@ -366,7 +368,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
366 put_parallel(par); 368 put_parallel(par);
367} 369}
368 370
369/* This is basically copied from mpage_end_io_read */
370static void bl_end_io_write(struct bio *bio, int err) 371static void bl_end_io_write(struct bio *bio, int err)
371{ 372{
372 struct parallel_io *par = bio->bi_private; 373 struct parallel_io *par = bio->bi_private;
@@ -392,7 +393,7 @@ static void bl_write_cleanup(struct work_struct *work)
392 dprintk("%s enter\n", __func__); 393 dprintk("%s enter\n", __func__);
393 task = container_of(work, struct rpc_task, u.tk_work); 394 task = container_of(work, struct rpc_task, u.tk_work);
394 wdata = container_of(task, struct nfs_write_data, task); 395 wdata = container_of(task, struct nfs_write_data, task);
395 if (!wdata->pnfs_error) { 396 if (likely(!wdata->pnfs_error)) {
396 /* Marks for LAYOUTCOMMIT */ 397 /* Marks for LAYOUTCOMMIT */
397 mark_extents_written(BLK_LSEG2EXT(wdata->lseg), 398 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
398 wdata->args.offset, wdata->args.count); 399 wdata->args.offset, wdata->args.count);
@@ -401,11 +402,16 @@ static void bl_write_cleanup(struct work_struct *work)
401} 402}
402 403
403/* Called when last of bios associated with a bl_write_pagelist call finishes */ 404/* Called when last of bios associated with a bl_write_pagelist call finishes */
404static void bl_end_par_io_write(void *data) 405static void bl_end_par_io_write(void *data, int num_se)
405{ 406{
406 struct nfs_write_data *wdata = data; 407 struct nfs_write_data *wdata = data;
407 408
408 wdata->task.tk_status = 0; 409 if (unlikely(wdata->pnfs_error)) {
410 bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
411 num_se);
412 }
413
414 wdata->task.tk_status = wdata->pnfs_error;
409 wdata->verf.committed = NFS_FILE_SYNC; 415 wdata->verf.committed = NFS_FILE_SYNC;
410 INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); 416 INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
411 schedule_work(&wdata->task.u.tk_work); 417 schedule_work(&wdata->task.u.tk_work);
@@ -484,6 +490,55 @@ cleanup:
484 return ret; 490 return ret;
485} 491}
486 492
493/* Find or create a zeroing page and mark it as under writeback.
494 * Returns ERR_PTR on error, NULL to skip this page, or the page
495 * itself to indicate it should be written out.
496 */
497static struct page *
498bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
499 struct pnfs_block_extent *cow_read)
500{
501 struct page *page;
502 int locked = 0;
503 page = find_get_page(inode->i_mapping, index);
504 if (page)
505 goto check_page;
506
507 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
508 if (unlikely(!page)) {
509 dprintk("%s oom\n", __func__);
510 return ERR_PTR(-ENOMEM);
511 }
512 locked = 1;
513
514check_page:
515 /* PageDirty: Other will write this out
516 * PageWriteback: Other is writing this out
517 * PageUptodate: It was read before
518 */
519 if (PageDirty(page) || PageWriteback(page)) {
520 print_page(page);
521 if (locked)
522 unlock_page(page);
523 page_cache_release(page);
524 return NULL;
525 }
526
527 if (!locked) {
528 lock_page(page);
529 locked = 1;
530 goto check_page;
531 }
532 if (!PageUptodate(page)) {
533 /* New page, readin or zero it */
534 init_page_for_write(page, cow_read);
535 }
536 set_page_writeback(page);
537 unlock_page(page);
538
539 return page;
540}
541
487static enum pnfs_try_status 542static enum pnfs_try_status
488bl_write_pagelist(struct nfs_write_data *wdata, int sync) 543bl_write_pagelist(struct nfs_write_data *wdata, int sync)
489{ 544{
@@ -508,9 +563,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
508 */ 563 */
509 par = alloc_parallel(wdata); 564 par = alloc_parallel(wdata);
510 if (!par) 565 if (!par)
511 return PNFS_NOT_ATTEMPTED; 566 goto out_mds;
512 par->call_ops = *wdata->mds_ops;
513 par->call_ops.rpc_call_done = bl_rpc_do_nothing;
514 par->pnfs_callback = bl_end_par_io_write; 567 par->pnfs_callback = bl_end_par_io_write;
515 /* At this point, have to be more careful with error handling */ 568 /* At this point, have to be more careful with error handling */
516 569
@@ -518,12 +571,15 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
518 be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read); 571 be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
519 if (!be || !is_writable(be, isect)) { 572 if (!be || !is_writable(be, isect)) {
520 dprintk("%s no matching extents!\n", __func__); 573 dprintk("%s no matching extents!\n", __func__);
521 wdata->pnfs_error = -EINVAL; 574 goto out_mds;
522 goto out;
523 } 575 }
524 576
525 /* First page inside INVALID extent */ 577 /* First page inside INVALID extent */
526 if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 578 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
579 if (likely(!bl_push_one_short_extent(be->be_inval)))
580 par->bse_count++;
581 else
582 goto out_mds;
527 temp = offset >> PAGE_CACHE_SHIFT; 583 temp = offset >> PAGE_CACHE_SHIFT;
528 npg_zero = do_div(temp, npg_per_block); 584 npg_zero = do_div(temp, npg_per_block);
529 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & 585 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
@@ -543,36 +599,16 @@ fill_invalid_ext:
543 dprintk("%s zero %dth page: index %lu isect %llu\n", 599 dprintk("%s zero %dth page: index %lu isect %llu\n",
544 __func__, npg_zero, index, 600 __func__, npg_zero, index,
545 (unsigned long long)isect); 601 (unsigned long long)isect);
546 page = 602 page = bl_find_get_zeroing_page(wdata->inode, index,
547 find_or_create_page(wdata->inode->i_mapping, index, 603 cow_read);
548 GFP_NOFS); 604 if (unlikely(IS_ERR(page))) {
549 if (!page) { 605 wdata->pnfs_error = PTR_ERR(page);
550 dprintk("%s oom\n", __func__);
551 wdata->pnfs_error = -ENOMEM;
552 goto out; 606 goto out;
553 } 607 } else if (page == NULL)
554
555 /* PageDirty: Other will write this out
556 * PageWriteback: Other is writing this out
557 * PageUptodate: It was read before
558 * sector_initialized: already written out
559 */
560 if (PageDirty(page) || PageWriteback(page)) {
561 print_page(page);
562 unlock_page(page);
563 page_cache_release(page);
564 goto next_page; 608 goto next_page;
565 }
566 if (!PageUptodate(page)) {
567 /* New page, readin or zero it */
568 init_page_for_write(page, cow_read);
569 }
570 set_page_writeback(page);
571 unlock_page(page);
572 609
573 ret = bl_mark_sectors_init(be->be_inval, isect, 610 ret = bl_mark_sectors_init(be->be_inval, isect,
574 PAGE_CACHE_SECTORS, 611 PAGE_CACHE_SECTORS);
575 NULL);
576 if (unlikely(ret)) { 612 if (unlikely(ret)) {
577 dprintk("%s bl_mark_sectors_init fail %d\n", 613 dprintk("%s bl_mark_sectors_init fail %d\n",
578 __func__, ret); 614 __func__, ret);
@@ -581,6 +617,19 @@ fill_invalid_ext:
581 wdata->pnfs_error = ret; 617 wdata->pnfs_error = ret;
582 goto out; 618 goto out;
583 } 619 }
620 if (likely(!bl_push_one_short_extent(be->be_inval)))
621 par->bse_count++;
622 else {
623 end_page_writeback(page);
624 page_cache_release(page);
625 wdata->pnfs_error = -ENOMEM;
626 goto out;
627 }
628 /* FIXME: This should be done in bi_end_io */
629 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
630 page->index << PAGE_CACHE_SHIFT,
631 PAGE_CACHE_SIZE);
632
584 bio = bl_add_page_to_bio(bio, npg_zero, WRITE, 633 bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
585 isect, page, be, 634 isect, page, be,
586 bl_end_io_write_zero, par); 635 bl_end_io_write_zero, par);
@@ -589,10 +638,6 @@ fill_invalid_ext:
589 bio = NULL; 638 bio = NULL;
590 goto out; 639 goto out;
591 } 640 }
592 /* FIXME: This should be done in bi_end_io */
593 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
594 page->index << PAGE_CACHE_SHIFT,
595 PAGE_CACHE_SIZE);
596next_page: 641next_page:
597 isect += PAGE_CACHE_SECTORS; 642 isect += PAGE_CACHE_SECTORS;
598 extent_length -= PAGE_CACHE_SECTORS; 643 extent_length -= PAGE_CACHE_SECTORS;
@@ -616,13 +661,21 @@ next_page:
616 wdata->pnfs_error = -EINVAL; 661 wdata->pnfs_error = -EINVAL;
617 goto out; 662 goto out;
618 } 663 }
664 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
665 if (likely(!bl_push_one_short_extent(
666 be->be_inval)))
667 par->bse_count++;
668 else {
669 wdata->pnfs_error = -ENOMEM;
670 goto out;
671 }
672 }
619 extent_length = be->be_length - 673 extent_length = be->be_length -
620 (isect - be->be_f_offset); 674 (isect - be->be_f_offset);
621 } 675 }
622 if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 676 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
623 ret = bl_mark_sectors_init(be->be_inval, isect, 677 ret = bl_mark_sectors_init(be->be_inval, isect,
624 PAGE_CACHE_SECTORS, 678 PAGE_CACHE_SECTORS);
625 NULL);
626 if (unlikely(ret)) { 679 if (unlikely(ret)) {
627 dprintk("%s bl_mark_sectors_init fail %d\n", 680 dprintk("%s bl_mark_sectors_init fail %d\n",
628 __func__, ret); 681 __func__, ret);
@@ -664,6 +717,10 @@ out:
664 bl_submit_bio(WRITE, bio); 717 bl_submit_bio(WRITE, bio);
665 put_parallel(par); 718 put_parallel(par);
666 return PNFS_ATTEMPTED; 719 return PNFS_ATTEMPTED;
720out_mds:
721 bl_put_extent(be);
722 kfree(par);
723 return PNFS_NOT_ATTEMPTED;
667} 724}
668 725
669/* FIXME - range ignored */ 726/* FIXME - range ignored */
@@ -690,11 +747,17 @@ static void
690release_inval_marks(struct pnfs_inval_markings *marks) 747release_inval_marks(struct pnfs_inval_markings *marks)
691{ 748{
692 struct pnfs_inval_tracking *pos, *temp; 749 struct pnfs_inval_tracking *pos, *temp;
750 struct pnfs_block_short_extent *se, *stemp;
693 751
694 list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { 752 list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
695 list_del(&pos->it_link); 753 list_del(&pos->it_link);
696 kfree(pos); 754 kfree(pos);
697 } 755 }
756
757 list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
758 list_del(&se->bse_node);
759 kfree(se);
760 }
698 return; 761 return;
699} 762}
700 763
@@ -779,16 +842,13 @@ bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
779static void free_blk_mountid(struct block_mount_id *mid) 842static void free_blk_mountid(struct block_mount_id *mid)
780{ 843{
781 if (mid) { 844 if (mid) {
782 struct pnfs_block_dev *dev; 845 struct pnfs_block_dev *dev, *tmp;
783 spin_lock(&mid->bm_lock); 846
784 while (!list_empty(&mid->bm_devlist)) { 847 /* No need to take bm_lock as we are last user freeing bm_devlist */
785 dev = list_first_entry(&mid->bm_devlist, 848 list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) {
786 struct pnfs_block_dev,
787 bm_node);
788 list_del(&dev->bm_node); 849 list_del(&dev->bm_node);
789 bl_free_block_dev(dev); 850 bl_free_block_dev(dev);
790 } 851 }
791 spin_unlock(&mid->bm_lock);
792 kfree(mid); 852 kfree(mid);
793 } 853 }
794} 854}
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 42acf7ef5992..e31a2df28e70 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -70,6 +70,7 @@ struct pnfs_inval_markings {
70 spinlock_t im_lock; 70 spinlock_t im_lock;
71 struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ 71 struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */
72 sector_t im_block_size; /* Server blocksize in sectors */ 72 sector_t im_block_size; /* Server blocksize in sectors */
73 struct list_head im_extents; /* Short extents for INVAL->RW conversion */
73}; 74};
74 75
75struct pnfs_inval_tracking { 76struct pnfs_inval_tracking {
@@ -105,6 +106,7 @@ BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
105{ 106{
106 spin_lock_init(&marks->im_lock); 107 spin_lock_init(&marks->im_lock);
107 INIT_LIST_HEAD(&marks->im_tree.mtt_stub); 108 INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
109 INIT_LIST_HEAD(&marks->im_extents);
108 marks->im_block_size = blocksize; 110 marks->im_block_size = blocksize;
109 marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, 111 marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
110 blocksize); 112 blocksize);
@@ -186,8 +188,7 @@ struct pnfs_block_extent *
186bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, 188bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
187 struct pnfs_block_extent **cow_read); 189 struct pnfs_block_extent **cow_read);
188int bl_mark_sectors_init(struct pnfs_inval_markings *marks, 190int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
189 sector_t offset, sector_t length, 191 sector_t offset, sector_t length);
190 sector_t **pages);
191void bl_put_extent(struct pnfs_block_extent *be); 192void bl_put_extent(struct pnfs_block_extent *be);
192struct pnfs_block_extent *bl_alloc_extent(void); 193struct pnfs_block_extent *bl_alloc_extent(void);
193int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); 194int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
@@ -200,6 +201,11 @@ void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
200int bl_add_merge_extent(struct pnfs_block_layout *bl, 201int bl_add_merge_extent(struct pnfs_block_layout *bl,
201 struct pnfs_block_extent *new); 202 struct pnfs_block_extent *new);
202int bl_mark_for_commit(struct pnfs_block_extent *be, 203int bl_mark_for_commit(struct pnfs_block_extent *be,
203 sector_t offset, sector_t length); 204 sector_t offset, sector_t length,
205 struct pnfs_block_short_extent *new);
206int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
207struct pnfs_block_short_extent *
208bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
209void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
204 210
205#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ 211#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
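The new bl_push_one_short_extent()/bl_pop_one_short_extent() prototypes preallocate pnfs_block_short_extent entries on marks->im_extents while allocation can still fail safely, so mark_extents_written() can later pop one without allocating under the bh-safe spinlock. A sketch of plausible implementations (bodies are assumptions; only the prototypes come from this diff):

	int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
	{
		struct pnfs_block_short_extent *new;

		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (unlikely(!new))
			return -ENOMEM;

		spin_lock_bh(&marks->im_lock);
		list_add(&new->bse_node, &marks->im_extents);
		spin_unlock_bh(&marks->im_lock);
		return 0;
	}

	struct pnfs_block_short_extent *
	bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
	{
		struct pnfs_block_short_extent *rv = NULL;

		spin_lock_bh(&marks->im_lock);
		if (!list_empty(&marks->im_extents)) {
			rv = list_first_entry(&marks->im_extents,
					      struct pnfs_block_short_extent,
					      bse_node);
			list_del_init(&rv->bse_node);
		}
		spin_unlock_bh(&marks->im_lock);
		return rv;
	}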
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 19fa7b0b8c00..1abac09f7cd5 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -110,13 +110,7 @@ static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
110 return 0; 110 return 0;
111 } else { 111 } else {
112 struct pnfs_inval_tracking *new; 112 struct pnfs_inval_tracking *new;
113 if (storage) 113 new = storage;
114 new = storage;
115 else {
116 new = kmalloc(sizeof(*new), GFP_NOFS);
117 if (!new)
118 return -ENOMEM;
119 }
120 new->it_sector = s; 114 new->it_sector = s;
121 new->it_tags = (1 << tag); 115 new->it_tags = (1 << tag);
122 list_add(&new->it_link, &pos->it_link); 116 list_add(&new->it_link, &pos->it_link);
@@ -139,11 +133,13 @@ static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
139} 133}
140 134
141/* Ensure that future operations on given range of tree will not malloc */ 135/* Ensure that future operations on given range of tree will not malloc */
142static int _preload_range(struct my_tree *tree, u64 offset, u64 length) 136static int _preload_range(struct pnfs_inval_markings *marks,
137 u64 offset, u64 length)
143{ 138{
144 u64 start, end, s; 139 u64 start, end, s;
145 int count, i, used = 0, status = -ENOMEM; 140 int count, i, used = 0, status = -ENOMEM;
146 struct pnfs_inval_tracking **storage; 141 struct pnfs_inval_tracking **storage;
142 struct my_tree *tree = &marks->im_tree;
147 143
148 dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); 144 dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
149 start = normalize(offset, tree->mtt_step_size); 145 start = normalize(offset, tree->mtt_step_size);
@@ -161,12 +157,11 @@ static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
161 goto out_cleanup; 157 goto out_cleanup;
162 } 158 }
163 159
164 /* Now need lock - HOW??? */ 160 spin_lock_bh(&marks->im_lock);
165
166 for (s = start; s < end; s += tree->mtt_step_size) 161 for (s = start; s < end; s += tree->mtt_step_size)
167 used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); 162 used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
163 spin_unlock_bh(&marks->im_lock);
168 164
169 /* Unlock - HOW??? */
170 status = 0; 165 status = 0;
171 166
172 out_cleanup: 167 out_cleanup:
@@ -179,41 +174,14 @@ static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
179 return status; 174 return status;
180} 175}
181 176
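The rework above turns _preload_range() into the classic allocate-then-lock pattern: every tracking node the range could possibly need is kmalloc'd with GFP_NOFS up front, so the insertion pass under spin_lock_bh() can neither sleep nor fail. A minimal user-space sketch of the same idea, with a pthread mutex standing in for im_lock (all names below are illustrative, not part of the patch):

#include <pthread.h>
#include <stdlib.h>

struct node { long sector; struct node *next; };

static struct node *tree_head;          /* stand-in for the interval tree */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns 1 if the preallocated entry was consumed, 0 if one already existed. */
static int add_entry_locked(struct node *new, long sector)
{
	struct node *p;

	for (p = tree_head; p; p = p->next)
		if (p->sector == sector)
			return 0;       /* already present, storage stays unused */
	new->sector = sector;
	new->next = tree_head;
	tree_head = new;
	return 1;
}

static int preload_range(long start, long end, long step)
{
	long s;
	int count = (end - start) / step, i, used = 0, status = -1;
	struct node **storage;

	/* Allocate outside the lock; insertion below can then never fail. */
	storage = calloc(count, sizeof(*storage));
	if (!storage)
		return -1;
	for (i = 0; i < count; i++) {
		storage[i] = malloc(sizeof(*storage[i]));
		if (!storage[i])
			goto out_cleanup;
	}

	pthread_mutex_lock(&lock);
	for (s = start; s < end; s += step)
		used += add_entry_locked(storage[used], s);
	pthread_mutex_unlock(&lock);
	status = 0;

out_cleanup:
	for (i = used; i < count && storage[i]; i++)
		free(storage[i]);       /* release whatever went unused */
	free(storage);
	return status;
}

int main(void)
{
	return preload_range(0, 64, 8);
}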
182static void set_needs_init(sector_t *array, sector_t offset)
183{
184 sector_t *p = array;
185
186 dprintk("%s enter\n", __func__);
187 if (!p)
188 return;
189 while (*p < offset)
190 p++;
191 if (*p == offset)
192 return;
193 else if (*p == ~0) {
194 *p++ = offset;
195 *p = ~0;
196 return;
197 } else {
198 sector_t *save = p;
199 dprintk("%s Adding %llu\n", __func__, (u64)offset);
200 while (*p != ~0)
201 p++;
202 p++;
203 memmove(save + 1, save, (char *)p - (char *)save);
204 *save = offset;
205 return;
206 }
207}
208
209/* We are relying on the page lock to serialize this */ 177
210int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) 178int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
211{ 179{
212 int rv; 180 int rv;
213 181
214 spin_lock(&marks->im_lock); 182 spin_lock_bh(&marks->im_lock);
215 rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); 183 rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
216 spin_unlock(&marks->im_lock); 184 spin_unlock_bh(&marks->im_lock);
217 return rv; 185 return rv;
218} 186}
219 187
@@ -253,78 +221,39 @@ static int is_range_written(struct pnfs_inval_markings *marks,
253{ 221{
254 int rv; 222 int rv;
255 223
256 spin_lock(&marks->im_lock); 224 spin_lock_bh(&marks->im_lock);
257 rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); 225 rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
258 spin_unlock(&marks->im_lock); 226 spin_unlock_bh(&marks->im_lock);
259 return rv; 227 return rv;
260} 228}
261 229
262/* Marks sectors in [offset, offset + length) as having been initialized. 230
263 * All lengths are step-aligned, where step is min(pagesize, blocksize). 231 * All lengths are step-aligned, where step is min(pagesize, blocksize).
264 * Notes where partial block is initialized, and helps prepare it for 232 * Currently assumes offset is page-aligned
265 * complete initialization later.
266 */ 233 */
267/* Currently assumes offset is page-aligned */
268int bl_mark_sectors_init(struct pnfs_inval_markings *marks, 234int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
269 sector_t offset, sector_t length, 235 sector_t offset, sector_t length)
270 sector_t **pages)
271{ 236{
272 sector_t s, start, end; 237 sector_t start, end;
273 sector_t *array = NULL; /* Pages to mark */
274 238
275 dprintk("%s(offset=%llu,len=%llu) enter\n", 239 dprintk("%s(offset=%llu,len=%llu) enter\n",
276 __func__, (u64)offset, (u64)length); 240 __func__, (u64)offset, (u64)length);
277 s = max((sector_t) 3,
278 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
279 dprintk("%s set max=%llu\n", __func__, (u64)s);
280 if (pages) {
281 array = kmalloc(s * sizeof(sector_t), GFP_NOFS);
282 if (!array)
283 goto outerr;
284 array[0] = ~0;
285 }
286 241
287 start = normalize(offset, marks->im_block_size); 242 start = normalize(offset, marks->im_block_size);
288 end = normalize_up(offset + length, marks->im_block_size); 243 end = normalize_up(offset + length, marks->im_block_size);
289 if (_preload_range(&marks->im_tree, start, end - start)) 244 if (_preload_range(marks, start, end - start))
290 goto outerr; 245 goto outerr;
291 246
292 spin_lock(&marks->im_lock); 247 spin_lock_bh(&marks->im_lock);
293
294 for (s = normalize_up(start, PAGE_CACHE_SECTORS);
295 s < offset; s += PAGE_CACHE_SECTORS) {
296 dprintk("%s pre-area pages\n", __func__);
297 /* Portion of used block is not initialized */
298 if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
299 set_needs_init(array, s);
300 }
301 if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) 248 if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
302 goto out_unlock; 249 goto out_unlock;
303 for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); 250 spin_unlock_bh(&marks->im_lock);
304 s < end; s += PAGE_CACHE_SECTORS) {
305 dprintk("%s post-area pages\n", __func__);
306 if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
307 set_needs_init(array, s);
308 }
309
310 spin_unlock(&marks->im_lock);
311 251
312 if (pages) {
313 if (array[0] == ~0) {
314 kfree(array);
315 *pages = NULL;
316 } else
317 *pages = array;
318 }
319 return 0; 252 return 0;
320 253
321 out_unlock: 254out_unlock:
322 spin_unlock(&marks->im_lock); 255 spin_unlock_bh(&marks->im_lock);
323 outerr: 256outerr:
324 if (pages) {
325 kfree(array);
326 *pages = NULL;
327 }
328 return -ENOMEM; 257 return -ENOMEM;
329} 258}
330 259
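bl_mark_sectors_init() is reduced above to pure block alignment plus one tagging pass. Assuming normalize() rounds down to a multiple of its base and normalize_up() rounds up, which matches how they are used here (their definitions are outside this hunk), the alignment arithmetic amounts to:

#include <stdio.h>
#include <stdint.h>

/* Assumed behaviour of the normalize()/normalize_up() helpers. */
static uint64_t normalize(uint64_t s, uint64_t base)
{
	return s - (s % base);
}

static uint64_t normalize_up(uint64_t s, uint64_t base)
{
	return normalize(s + base - 1, base);
}

int main(void)
{
	uint64_t block = 8, offset = 13, length = 3;
	uint64_t start = normalize(offset, block);              /* 8  */
	uint64_t end = normalize_up(offset + length, block);    /* 16 */

	printf("preload [%llu, %llu) to cover [%llu, %llu)\n",
	       (unsigned long long)start, (unsigned long long)end,
	       (unsigned long long)offset,
	       (unsigned long long)(offset + length));
	return 0;
}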
@@ -338,9 +267,9 @@ static int mark_written_sectors(struct pnfs_inval_markings *marks,
338 267
339 dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, 268 dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
340 (u64)offset, (u64)length); 269 (u64)offset, (u64)length);
341 spin_lock(&marks->im_lock); 270 spin_lock_bh(&marks->im_lock);
342 status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); 271 status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
343 spin_unlock(&marks->im_lock); 272 spin_unlock_bh(&marks->im_lock);
344 return status; 273 return status;
345} 274}
346 275
@@ -440,20 +369,18 @@ static void add_to_commitlist(struct pnfs_block_layout *bl,
440 369
441/* Note the range described by offset, length is guaranteed to be contained 370/* Note the range described by offset, length is guaranteed to be contained
442 * within be. 371 * within be.
 372 * new will be freed, either by this function or by add_to_commitlist if
 373 * it decides not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
443 */ 374 */
444int bl_mark_for_commit(struct pnfs_block_extent *be, 375int bl_mark_for_commit(struct pnfs_block_extent *be,
445 sector_t offset, sector_t length) 376 sector_t offset, sector_t length,
377 struct pnfs_block_short_extent *new)
446{ 378{
447 sector_t new_end, end = offset + length; 379 sector_t new_end, end = offset + length;
448 struct pnfs_block_short_extent *new;
449 struct pnfs_block_layout *bl = container_of(be->be_inval, 380 struct pnfs_block_layout *bl = container_of(be->be_inval,
450 struct pnfs_block_layout, 381 struct pnfs_block_layout,
451 bl_inval); 382 bl_inval);
452 383
453 new = kmalloc(sizeof(*new), GFP_NOFS);
454 if (!new)
455 return -ENOMEM;
456
457 mark_written_sectors(be->be_inval, offset, length); 384 mark_written_sectors(be->be_inval, offset, length);
458 /* We want to add the range to commit list, but it must be 385 /* We want to add the range to commit list, but it must be
459 * block-normalized, and verified that the normalized range has 386 * block-normalized, and verified that the normalized range has
@@ -483,9 +410,6 @@ int bl_mark_for_commit(struct pnfs_block_extent *be,
483 new->bse_mdev = be->be_mdev; 410 new->bse_mdev = be->be_mdev;
484 411
485 spin_lock(&bl->bl_ext_lock); 412 spin_lock(&bl->bl_ext_lock);
486 /* new will be freed, either by add_to_commitlist if it decides not
487 * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
488 */
489 add_to_commitlist(bl, new); 413 add_to_commitlist(bl, new);
490 spin_unlock(&bl->bl_ext_lock); 414 spin_unlock(&bl->bl_ext_lock);
491 return 0; 415 return 0;
@@ -933,3 +857,53 @@ clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
933 } 857 }
934 } 858 }
935} 859}
860
861int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
862{
863 struct pnfs_block_short_extent *new;
864
865 new = kmalloc(sizeof(*new), GFP_NOFS);
866 if (unlikely(!new))
867 return -ENOMEM;
868
869 spin_lock_bh(&marks->im_lock);
870 list_add(&new->bse_node, &marks->im_extents);
871 spin_unlock_bh(&marks->im_lock);
872
873 return 0;
874}
875
876struct pnfs_block_short_extent *
877bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
878{
879 struct pnfs_block_short_extent *rv = NULL;
880
881 spin_lock_bh(&marks->im_lock);
882 if (!list_empty(&marks->im_extents)) {
883 rv = list_entry((&marks->im_extents)->next,
884 struct pnfs_block_short_extent, bse_node);
885 list_del_init(&rv->bse_node);
886 }
887 spin_unlock_bh(&marks->im_lock);
888
889 return rv;
890}
891
892void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
893{
894 struct pnfs_block_short_extent *se = NULL, *tmp;
895
896 if (num_to_free <= 0)
897 return;
898
 899 spin_lock_bh(&marks->im_lock);
900 list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
901 list_del(&se->bse_node);
902 kfree(se);
903 if (--num_to_free == 0)
904 break;
905 }
 906 spin_unlock_bh(&marks->im_lock);
907
908 BUG_ON(num_to_free > 0);
909}
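Together the three helpers above form a reservation pool, which is what the "Short extents for INVAL->RW conversion" field added to pnfs_inval_markings is for: callers push one pre-allocated short extent per anticipated conversion while allocation may still sleep, pop entries later from contexts that must not fail, and free any leftovers. A user-space analogue of that life cycle (the pthread lock and names are illustrative):

#include <pthread.h>
#include <stdlib.h>

struct short_extent { struct short_extent *next; };

static struct short_extent *pool;
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

static int push_one(void)                 /* mirrors bl_push_one_short_extent */
{
	struct short_extent *new = malloc(sizeof(*new));

	if (!new)
		return -1;
	pthread_mutex_lock(&pool_lock);
	new->next = pool;
	pool = new;
	pthread_mutex_unlock(&pool_lock);
	return 0;
}

static struct short_extent *pop_one(void) /* mirrors bl_pop_one_short_extent */
{
	struct short_extent *rv;

	pthread_mutex_lock(&pool_lock);
	rv = pool;
	if (rv)
		pool = rv->next;
	pthread_mutex_unlock(&pool_lock);
	return rv;                        /* NULL only if nothing was pushed */
}

static void free_extents(int num_to_free) /* mirrors bl_free_short_extents */
{
	struct short_extent *se;

	while (num_to_free-- > 0 && (se = pop_one()))
		free(se);
}

int main(void)
{
	if (push_one() || push_one())
		return 1;
	free(pop_one());                  /* consume one reservation */
	free_extents(1);                  /* release the leftover */
	return 0;
}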
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 07df5f1d85e5..c89d3b9e483c 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -162,7 +162,7 @@ struct cb_layoutrecallargs {
162 }; 162 };
163}; 163};
164 164
165extern unsigned nfs4_callback_layoutrecall( 165extern __be32 nfs4_callback_layoutrecall(
166 struct cb_layoutrecallargs *args, 166 struct cb_layoutrecallargs *args,
167 void *dummy, struct cb_process_state *cps); 167 void *dummy, struct cb_process_state *cps);
168 168
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 43926add945b..54cea8ad5a76 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -339,7 +339,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
339 dprintk("%s enter. slotid %d seqid %d\n", 339 dprintk("%s enter. slotid %d seqid %d\n",
340 __func__, args->csa_slotid, args->csa_sequenceid); 340 __func__, args->csa_slotid, args->csa_sequenceid);
341 341
342 if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS) 342 if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS)
343 return htonl(NFS4ERR_BADSLOT); 343 return htonl(NFS4ERR_BADSLOT);
344 344
345 slot = tbl->slots + args->csa_slotid; 345 slot = tbl->slots + args->csa_slotid;
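The one-character change above closes an off-by-one: valid indices into a table of NFS41_BC_MAX_CALLBACKS slots run from 0 to MAX - 1, so an index equal to MAX must be rejected too, which the old '>' test failed to do before slots + csa_slotid was dereferenced. A minimal demonstration of the rule:

#include <assert.h>

#define MAX_SLOTS 16

static int slot_valid(unsigned int slotid)
{
	/* '>' would wrongly accept slotid == MAX_SLOTS and index one
	 * element past the end of the slot array; '>=', i.e. '<' here,
	 * rejects it. */
	return slotid < MAX_SLOTS;
}

int main(void)
{
	assert(slot_valid(0));
	assert(slot_valid(MAX_SLOTS - 1));
	assert(!slot_valid(MAX_SLOTS)); /* the case the old check let through */
	return 0;
}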
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 726e59a9e50f..d50b2742f23b 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -305,6 +305,10 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
305 n = ntohl(*p++); 305 n = ntohl(*p++);
306 if (n <= 0) 306 if (n <= 0)
307 goto out; 307 goto out;
308 if (n > ULONG_MAX / sizeof(*args->devs)) {
309 status = htonl(NFS4ERR_BADXDR);
310 goto out;
311 }
308 312
309 args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL); 313 args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL);
310 if (!args->devs) { 314 if (!args->devs) {
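The new check guards the n * sizeof(*args->devs) multiplication inside kmalloc(): a large count read from the wire could otherwise wrap the computed allocation size and yield an undersized buffer. The same idiom in portable C, with SIZE_MAX in the role of ULONG_MAX:

#include <stdint.h>
#include <stdlib.h>

struct dev_notify { uint64_t id[2]; };

static struct dev_notify *alloc_devs(size_t n)
{
	/* Reject counts that would overflow the size computation before
	 * they reach malloc(), as the XDR decoder now does with
	 * NFS4ERR_BADXDR. */
	if (n == 0 || n > SIZE_MAX / sizeof(struct dev_notify))
		return NULL;
	return malloc(n * sizeof(struct dev_notify));
}

int main(void)
{
	free(alloc_devs(4));                     /* sane count succeeds */
	return alloc_devs(SIZE_MAX / 2) != NULL; /* overflow case rejected */
}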
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 873bf00d51a2..31778f74357d 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -84,7 +84,7 @@ retry:
84/* 84/*
85 * Turn off NFSv4 uid/gid mapping when using AUTH_SYS 85 * Turn off NFSv4 uid/gid mapping when using AUTH_SYS
86 */ 86 */
87static int nfs4_disable_idmapping = 0; 87static bool nfs4_disable_idmapping = true;
88 88
89/* 89/*
90 * RPC cruft for NFS 90 * RPC cruft for NFS
@@ -185,7 +185,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
185 clp->cl_minorversion = cl_init->minorversion; 185 clp->cl_minorversion = cl_init->minorversion;
186 clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; 186 clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
187#endif 187#endif
188 cred = rpc_lookup_machine_cred(); 188 cred = rpc_lookup_machine_cred("*");
189 if (!IS_ERR(cred)) 189 if (!IS_ERR(cred))
190 clp->cl_machine_cred = cred; 190 clp->cl_machine_cred = cred;
191 nfs_fscache_get_client_cookie(clp); 191 nfs_fscache_get_client_cookie(clp);
@@ -250,6 +250,11 @@ static void pnfs_init_server(struct nfs_server *server)
250 rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); 250 rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
251} 251}
252 252
253static void nfs4_destroy_server(struct nfs_server *server)
254{
255 nfs4_purge_state_owners(server);
256}
257
253#else 258#else
254static void nfs4_shutdown_client(struct nfs_client *clp) 259static void nfs4_shutdown_client(struct nfs_client *clp)
255{ 260{
@@ -1065,6 +1070,7 @@ static struct nfs_server *nfs_alloc_server(void)
1065 INIT_LIST_HEAD(&server->master_link); 1070 INIT_LIST_HEAD(&server->master_link);
1066 INIT_LIST_HEAD(&server->delegations); 1071 INIT_LIST_HEAD(&server->delegations);
1067 INIT_LIST_HEAD(&server->layouts); 1072 INIT_LIST_HEAD(&server->layouts);
1073 INIT_LIST_HEAD(&server->state_owners_lru);
1068 1074
1069 atomic_set(&server->active, 0); 1075 atomic_set(&server->active, 0);
1070 1076
@@ -1538,6 +1544,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
1538 1544
1539 nfs_server_insert_lists(server); 1545 nfs_server_insert_lists(server);
1540 server->mount_time = jiffies; 1546 server->mount_time = jiffies;
1547 server->destroy = nfs4_destroy_server;
1541out: 1548out:
1542 nfs_free_fattr(fattr); 1549 nfs_free_fattr(fattr);
1543 return error; 1550 return error;
@@ -1719,6 +1726,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1719 1726
1720 /* Copy data from the source */ 1727 /* Copy data from the source */
1721 server->nfs_client = source->nfs_client; 1728 server->nfs_client = source->nfs_client;
1729 server->destroy = source->destroy;
1722 atomic_inc(&server->nfs_client->cl_count); 1730 atomic_inc(&server->nfs_client->cl_count);
1723 nfs_server_copy_userdata(server, source); 1731 nfs_server_copy_userdata(server, source);
1724 1732
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 606ef0f20aed..c43a452f7da2 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -272,13 +272,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
272 datasync); 272 datasync);
273 273
274 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 274 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
275 if (ret)
276 return ret;
277 mutex_lock(&inode->i_mutex); 275 mutex_lock(&inode->i_mutex);
278 276
279 nfs_inc_stats(inode, NFSIOS_VFSFSYNC); 277 nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
280 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 278 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
281 status = nfs_commit_inode(inode, FLUSH_SYNC); 279 status = nfs_commit_inode(inode, FLUSH_SYNC);
280 if (status >= 0 && ret < 0)
281 status = ret;
282 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 282 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
283 if (have_error) 283 if (have_error)
284 ret = xchg(&ctx->error, 0); 284 ret = xchg(&ctx->error, 0);
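The nfs_file_fsync() change above stops returning early when the pre-flush fails: the commit is still attempted, and the earlier writeback error only overrides a successful commit status. The control flow, reduced to its skeleton with placeholder function names:

#include <stdio.h>

static int flush_dirty_pages(void) { return -5; } /* pretend -EIO from writeback */
static int commit_to_server(void)  { return 0; }  /* commit itself succeeds */

static int fsync_like(void)
{
	int ret = flush_dirty_pages();
	int status = commit_to_server(); /* always attempted, even after error */

	if (status >= 0 && ret < 0)
		status = ret;            /* surface the first failure */
	return status;
}

int main(void)
{
	printf("fsync result: %d\n", fsync_like()); /* -5, not silently 0 */
	return 0;
}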
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 47d1c6ff2d8e..2c05f1991e1e 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -38,6 +38,89 @@
38#include <linux/kernel.h> 38#include <linux/kernel.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/nfs_idmap.h> 40#include <linux/nfs_idmap.h>
41#include <linux/nfs_fs.h>
42
43/**
44 * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
45 * @fattr: fully initialised struct nfs_fattr
46 * @owner_name: owner name string cache
47 * @group_name: group name string cache
48 */
49void nfs_fattr_init_names(struct nfs_fattr *fattr,
50 struct nfs4_string *owner_name,
51 struct nfs4_string *group_name)
52{
53 fattr->owner_name = owner_name;
54 fattr->group_name = group_name;
55}
56
57static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr)
58{
59 fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME;
60 kfree(fattr->owner_name->data);
61}
62
63static void nfs_fattr_free_group_name(struct nfs_fattr *fattr)
64{
65 fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME;
66 kfree(fattr->group_name->data);
67}
68
69static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr)
70{
71 struct nfs4_string *owner = fattr->owner_name;
72 __u32 uid;
73
74 if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME))
75 return false;
76 if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) {
77 fattr->uid = uid;
78 fattr->valid |= NFS_ATTR_FATTR_OWNER;
79 }
80 return true;
81}
82
83static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr)
84{
85 struct nfs4_string *group = fattr->group_name;
86 __u32 gid;
87
88 if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME))
89 return false;
90 if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) {
91 fattr->gid = gid;
92 fattr->valid |= NFS_ATTR_FATTR_GROUP;
93 }
94 return true;
95}
96
97/**
98 * nfs_fattr_free_names - free up the NFSv4 owner and group strings
99 * @fattr: a fully initialised nfs_fattr structure
100 */
101void nfs_fattr_free_names(struct nfs_fattr *fattr)
102{
103 if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)
104 nfs_fattr_free_owner_name(fattr);
105 if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)
106 nfs_fattr_free_group_name(fattr);
107}
108
109/**
110 * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free
111 * @server: pointer to the filesystem nfs_server structure
112 * @fattr: a fully initialised nfs_fattr structure
113 *
114 * This helper maps the cached NFSv4 owner/group strings in fattr into
115 * their numeric uid/gid equivalents, and then frees the cached strings.
116 */
117void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr)
118{
119 if (nfs_fattr_map_owner_name(server, fattr))
120 nfs_fattr_free_owner_name(fattr);
121 if (nfs_fattr_map_group_name(server, fattr))
122 nfs_fattr_free_group_name(fattr);
123}
41 124
42static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) 125static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
43{ 126{
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 81db25e92e10..f649fba8c384 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -57,7 +57,7 @@
57#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1 57#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1
58 58
59/* Default is to see 64-bit inode numbers */ 59/* Default is to see 64-bit inode numbers */
60static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; 60static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
61 61
62static void nfs_invalidate_inode(struct inode *); 62static void nfs_invalidate_inode(struct inode *);
63static int nfs_update_inode(struct inode *, struct nfs_fattr *); 63static int nfs_update_inode(struct inode *, struct nfs_fattr *);
@@ -1020,6 +1020,8 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
1020 fattr->valid = 0; 1020 fattr->valid = 0;
1021 fattr->time_start = jiffies; 1021 fattr->time_start = jiffies;
1022 fattr->gencount = nfs_inc_attr_generation_counter(); 1022 fattr->gencount = nfs_inc_attr_generation_counter();
1023 fattr->owner_name = NULL;
1024 fattr->group_name = NULL;
1023} 1025}
1024 1026
1025struct nfs_fattr *nfs_alloc_fattr(void) 1027struct nfs_fattr *nfs_alloc_fattr(void)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 3f4d95751d52..8102db9b926c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -307,6 +307,8 @@ extern void nfs_readdata_release(struct nfs_read_data *rdata);
307/* write.c */ 307/* write.c */
308extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, 308extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
309 struct list_head *head); 309 struct list_head *head);
310extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
311 struct inode *inode, int ioflags);
310extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); 312extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
311extern void nfs_writedata_release(struct nfs_write_data *wdata); 313extern void nfs_writedata_release(struct nfs_write_data *wdata);
312extern void nfs_commit_free(struct nfs_write_data *p); 314extern void nfs_commit_free(struct nfs_write_data *p);
@@ -330,7 +332,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data);
330 332
331#ifdef CONFIG_MIGRATION 333#ifdef CONFIG_MIGRATION
332extern int nfs_migrate_page(struct address_space *, 334extern int nfs_migrate_page(struct address_space *,
333 struct page *, struct page *); 335 struct page *, struct page *, enum migrate_mode);
334#else 336#else
335#define nfs_migrate_page NULL 337#define nfs_migrate_page NULL
336#endif 338#endif
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 693ae22f8731..4d7d0aedc101 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -94,6 +94,8 @@ struct nfs_unique_id {
94struct nfs4_state_owner { 94struct nfs4_state_owner {
95 struct nfs_unique_id so_owner_id; 95 struct nfs_unique_id so_owner_id;
96 struct nfs_server *so_server; 96 struct nfs_server *so_server;
97 struct list_head so_lru;
98 unsigned long so_expires;
97 struct rb_node so_server_node; 99 struct rb_node so_server_node;
98 100
99 struct rpc_cred *so_cred; /* Associated cred */ 101 struct rpc_cred *so_cred; /* Associated cred */
@@ -319,6 +321,7 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
319 321
320extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); 322extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
321extern void nfs4_put_state_owner(struct nfs4_state_owner *); 323extern void nfs4_put_state_owner(struct nfs4_state_owner *);
324extern void nfs4_purge_state_owners(struct nfs_server *);
322extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); 325extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
323extern void nfs4_put_open_state(struct nfs4_state *); 326extern void nfs4_put_open_state(struct nfs4_state *);
324extern void nfs4_close_state(struct nfs4_state *, fmode_t); 327extern void nfs4_close_state(struct nfs4_state *, fmode_t);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index a62d36b9a99e..71ec08617e23 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -49,13 +49,14 @@ filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
49 loff_t offset) 49 loff_t offset)
50{ 50{
51 u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count; 51 u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
52 u64 tmp; 52 u64 stripe_no;
53 u32 rem;
53 54
54 offset -= flseg->pattern_offset; 55 offset -= flseg->pattern_offset;
55 tmp = offset; 56 stripe_no = div_u64(offset, stripe_width);
56 do_div(tmp, stripe_width); 57 div_u64_rem(offset, flseg->stripe_unit, &rem);
57 58
58 return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit); 59 return stripe_no * flseg->stripe_unit + rem;
59} 60}
60 61
61/* This function is used by the layout driver to calculate the 62/* This function is used by the layout driver to calculate the
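do_div() divides its 64-bit argument in place and returns the remainder, an interface that is easy to misuse in a compound expression; the rewrite with div_u64()/div_u64_rem() makes quotient and remainder explicit. In plain C the dense-offset computation reduces to ordinary '/' and '%' (the parameter values below are examples only):

#include <stdio.h>
#include <stdint.h>

/* Dense (packed) offset within one data server's file, given a file
 * offset into the striped layout. */
static uint64_t dense_offset(uint64_t offset, uint32_t stripe_unit,
			     uint32_t stripe_count, uint64_t pattern_offset)
{
	uint64_t stripe_width = (uint64_t)stripe_unit * stripe_count;
	uint64_t stripe_no, rem;

	offset -= pattern_offset;
	stripe_no = offset / stripe_width;  /* div_u64() in the kernel */
	rem = offset % stripe_unit;         /* div_u64_rem() in the kernel */

	return stripe_no * stripe_unit + rem;
}

int main(void)
{
	/* 4 data servers, 64k stripe unit: byte 300000 of the file lands
	 * 300000 % 65536 = 37856 bytes into stripe 1 (width 262144),
	 * so the dense offset is 65536 + 37856 = 103392. */
	printf("%llu\n", (unsigned long long)dense_offset(300000, 65536, 4, 0));
	return 0;
}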
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index ed388aae9689..8ae91908f5aa 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -382,7 +382,7 @@ decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags)
382{ 382{
383 struct nfs4_pnfs_ds_addr *da = NULL; 383 struct nfs4_pnfs_ds_addr *da = NULL;
384 char *buf, *portstr; 384 char *buf, *portstr;
385 u32 port; 385 __be16 port;
386 int nlen, rlen; 386 int nlen, rlen;
387 int tmp[2]; 387 int tmp[2];
388 __be32 *p; 388 __be32 *p;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index dcda0ba7af60..f0c849c98fe4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -52,6 +52,7 @@
52#include <linux/namei.h> 52#include <linux/namei.h>
53#include <linux/mount.h> 53#include <linux/mount.h>
54#include <linux/module.h> 54#include <linux/module.h>
55#include <linux/nfs_idmap.h>
55#include <linux/sunrpc/bc_xprt.h> 56#include <linux/sunrpc/bc_xprt.h>
56#include <linux/xattr.h> 57#include <linux/xattr.h>
57#include <linux/utsname.h> 58#include <linux/utsname.h>
@@ -364,9 +365,8 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
364 * Must be called while holding tbl->slot_tbl_lock 365 * Must be called while holding tbl->slot_tbl_lock
365 */ 366 */
366static void 367static void
367nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot) 368nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
368{ 369{
369 int free_slotid = free_slot - tbl->slots;
370 int slotid = free_slotid; 370 int slotid = free_slotid;
371 371
372 BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE); 372 BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE);
@@ -431,7 +431,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
431 } 431 }
432 432
433 spin_lock(&tbl->slot_tbl_lock); 433 spin_lock(&tbl->slot_tbl_lock);
434 nfs4_free_slot(tbl, res->sr_slot); 434 nfs4_free_slot(tbl, res->sr_slot - tbl->slots);
435 nfs4_check_drain_fc_complete(res->sr_session); 435 nfs4_check_drain_fc_complete(res->sr_session);
436 spin_unlock(&tbl->slot_tbl_lock); 436 spin_unlock(&tbl->slot_tbl_lock);
437 res->sr_slot = NULL; 437 res->sr_slot = NULL;
@@ -554,13 +554,10 @@ int nfs41_setup_sequence(struct nfs4_session *session,
554 spin_lock(&tbl->slot_tbl_lock); 554 spin_lock(&tbl->slot_tbl_lock);
555 if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && 555 if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) &&
556 !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { 556 !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
557 /* 557 /* The state manager will wait until the slot table is empty */
558 * The state manager will wait until the slot table is empty.
559 * Schedule the reset thread
560 */
561 rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); 558 rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
562 spin_unlock(&tbl->slot_tbl_lock); 559 spin_unlock(&tbl->slot_tbl_lock);
563 dprintk("%s Schedule Session Reset\n", __func__); 560 dprintk("%s session is draining\n", __func__);
564 return -EAGAIN; 561 return -EAGAIN;
565 } 562 }
566 563
@@ -765,6 +762,8 @@ struct nfs4_opendata {
765 struct nfs_openres o_res; 762 struct nfs_openres o_res;
766 struct nfs_open_confirmargs c_arg; 763 struct nfs_open_confirmargs c_arg;
767 struct nfs_open_confirmres c_res; 764 struct nfs_open_confirmres c_res;
765 struct nfs4_string owner_name;
766 struct nfs4_string group_name;
768 struct nfs_fattr f_attr; 767 struct nfs_fattr f_attr;
769 struct nfs_fattr dir_attr; 768 struct nfs_fattr dir_attr;
770 struct dentry *dir; 769 struct dentry *dir;
@@ -788,6 +787,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
788 p->o_res.server = p->o_arg.server; 787 p->o_res.server = p->o_arg.server;
789 nfs_fattr_init(&p->f_attr); 788 nfs_fattr_init(&p->f_attr);
790 nfs_fattr_init(&p->dir_attr); 789 nfs_fattr_init(&p->dir_attr);
790 nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name);
791} 791}
792 792
793static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, 793static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
@@ -819,6 +819,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
819 p->o_arg.name = &dentry->d_name; 819 p->o_arg.name = &dentry->d_name;
820 p->o_arg.server = server; 820 p->o_arg.server = server;
821 p->o_arg.bitmask = server->attr_bitmask; 821 p->o_arg.bitmask = server->attr_bitmask;
822 p->o_arg.dir_bitmask = server->cache_consistency_bitmask;
822 p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; 823 p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
823 if (flags & O_CREAT) { 824 if (flags & O_CREAT) {
824 u32 *s; 825 u32 *s;
@@ -855,6 +856,7 @@ static void nfs4_opendata_free(struct kref *kref)
855 dput(p->dir); 856 dput(p->dir);
856 dput(p->dentry); 857 dput(p->dentry);
857 nfs_sb_deactive(sb); 858 nfs_sb_deactive(sb);
859 nfs_fattr_free_names(&p->f_attr);
858 kfree(p); 860 kfree(p);
859} 861}
860 862
@@ -1579,6 +1581,8 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
1579 if (status != 0 || !data->rpc_done) 1581 if (status != 0 || !data->rpc_done)
1580 return status; 1582 return status;
1581 1583
1584 nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr);
1585
1582 nfs_refresh_inode(dir, o_res->dir_attr); 1586 nfs_refresh_inode(dir, o_res->dir_attr);
1583 1587
1584 if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { 1588 if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
@@ -1611,6 +1615,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1611 return status; 1615 return status;
1612 } 1616 }
1613 1617
1618 nfs_fattr_map_and_free_names(server, &data->f_attr);
1619
1614 if (o_arg->open_flags & O_CREAT) { 1620 if (o_arg->open_flags & O_CREAT) {
1615 update_changeattr(dir, &o_res->cinfo); 1621 update_changeattr(dir, &o_res->cinfo);
1616 nfs_post_op_update_inode(dir, o_res->dir_attr); 1622 nfs_post_op_update_inode(dir, o_res->dir_attr);
@@ -3431,19 +3437,6 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server)
3431 */ 3437 */
3432#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT) 3438#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT)
3433 3439
3434static void buf_to_pages(const void *buf, size_t buflen,
3435 struct page **pages, unsigned int *pgbase)
3436{
3437 const void *p = buf;
3438
3439 *pgbase = offset_in_page(buf);
3440 p -= *pgbase;
3441 while (p < buf + buflen) {
3442 *(pages++) = virt_to_page(p);
3443 p += PAGE_CACHE_SIZE;
3444 }
3445}
3446
3447static int buf_to_pages_noslab(const void *buf, size_t buflen, 3440static int buf_to_pages_noslab(const void *buf, size_t buflen,
3448 struct page **pages, unsigned int *pgbase) 3441 struct page **pages, unsigned int *pgbase)
3449{ 3442{
@@ -3540,9 +3533,19 @@ out:
3540 nfs4_set_cached_acl(inode, acl); 3533 nfs4_set_cached_acl(inode, acl);
3541} 3534}
3542 3535
3536/*
3537 * The getxattr API returns the required buffer length when called with a
3538 * NULL buf. The NFSv4 acl tool then calls getxattr again after allocating
3538 * the required buf. On a NULL buf, we supply a page-sized receive buffer,
3540 * guessing that the ACL request can be serviced by a page. If so, we cache
3541 * up to the page of ACL data, and the 2nd call to getxattr is serviced by
3542 * the cache. If not so, we throw away the page, and cache the required
3543 * length. The next getxattr call will then produce another round trip to
3544 * the server, this time with the input buf of the required size.
3545 */
3543static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) 3546static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
3544{ 3547{
3545 struct page *pages[NFS4ACL_MAXPAGES]; 3548 struct page *pages[NFS4ACL_MAXPAGES] = {NULL, };
3546 struct nfs_getaclargs args = { 3549 struct nfs_getaclargs args = {
3547 .fh = NFS_FH(inode), 3550 .fh = NFS_FH(inode),
3548 .acl_pages = pages, 3551 .acl_pages = pages,
@@ -3557,41 +3560,60 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
3557 .rpc_argp = &args, 3560 .rpc_argp = &args,
3558 .rpc_resp = &res, 3561 .rpc_resp = &res,
3559 }; 3562 };
3560 struct page *localpage = NULL; 3563 int ret = -ENOMEM, npages, i, acl_len = 0;
3561 int ret;
3562 3564
3563 if (buflen < PAGE_SIZE) { 3565 npages = (buflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3564 /* As long as we're doing a round trip to the server anyway, 3566 /* As long as we're doing a round trip to the server anyway,
3565 * let's be prepared for a page of acl data. */ 3567 * let's be prepared for a page of acl data. */
3566 localpage = alloc_page(GFP_KERNEL); 3568 if (npages == 0)
3567 resp_buf = page_address(localpage); 3569 npages = 1;
3568 if (localpage == NULL) 3570
3569 return -ENOMEM; 3571 for (i = 0; i < npages; i++) {
3570 args.acl_pages[0] = localpage; 3572 pages[i] = alloc_page(GFP_KERNEL);
3571 args.acl_pgbase = 0; 3573 if (!pages[i])
3572 args.acl_len = PAGE_SIZE; 3574 goto out_free;
3573 } else {
3574 resp_buf = buf;
3575 buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
3576 } 3575 }
3577 ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); 3576 if (npages > 1) {
3577 /* for decoding across pages */
3578 args.acl_scratch = alloc_page(GFP_KERNEL);
3579 if (!args.acl_scratch)
3580 goto out_free;
3581 }
3582 args.acl_len = npages * PAGE_SIZE;
3583 args.acl_pgbase = 0;
3584 /* Let decode_getfacl know not to fail if the ACL data is larger than
3585 * the page we send as a guess */
3586 if (buf == NULL)
3587 res.acl_flags |= NFS4_ACL_LEN_REQUEST;
3588 resp_buf = page_address(pages[0]);
3589
3590 dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n",
3591 __func__, buf, buflen, npages, args.acl_len);
3592 ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode),
3593 &msg, &args.seq_args, &res.seq_res, 0);
3578 if (ret) 3594 if (ret)
3579 goto out_free; 3595 goto out_free;
3580 if (res.acl_len > args.acl_len) 3596
3581 nfs4_write_cached_acl(inode, NULL, res.acl_len); 3597 acl_len = res.acl_len - res.acl_data_offset;
3598 if (acl_len > args.acl_len)
3599 nfs4_write_cached_acl(inode, NULL, acl_len);
3582 else 3600 else
3583 nfs4_write_cached_acl(inode, resp_buf, res.acl_len); 3601 nfs4_write_cached_acl(inode, resp_buf + res.acl_data_offset,
3602 acl_len);
3584 if (buf) { 3603 if (buf) {
3585 ret = -ERANGE; 3604 ret = -ERANGE;
3586 if (res.acl_len > buflen) 3605 if (acl_len > buflen)
3587 goto out_free; 3606 goto out_free;
3588 if (localpage) 3607 _copy_from_pages(buf, pages, res.acl_data_offset,
3589 memcpy(buf, resp_buf, res.acl_len); 3608 res.acl_len);
3590 } 3609 }
3591 ret = res.acl_len; 3610 ret = acl_len;
3592out_free: 3611out_free:
3593 if (localpage) 3612 for (i = 0; i < npages; i++)
3594 __free_page(localpage); 3613 if (pages[i])
3614 __free_page(pages[i]);
3615 if (args.acl_scratch)
3616 __free_page(args.acl_scratch);
3595 return ret; 3617 return ret;
3596} 3618}
3597 3619
@@ -3622,6 +3644,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
3622 nfs_zap_acl_cache(inode); 3644 nfs_zap_acl_cache(inode);
3623 ret = nfs4_read_cached_acl(inode, buf, buflen); 3645 ret = nfs4_read_cached_acl(inode, buf, buflen);
3624 if (ret != -ENOENT) 3646 if (ret != -ENOENT)
3647 /* -ENOENT is returned if there is no ACL or if there is an ACL
3648 * but no cached acl data, just the acl length */
3625 return ret; 3649 return ret;
3626 return nfs4_get_acl_uncached(inode, buf, buflen); 3650 return nfs4_get_acl_uncached(inode, buf, buflen);
3627} 3651}
@@ -5022,23 +5046,6 @@ out:
5022 return ret; 5046 return ret;
5023} 5047}
5024 5048
5025/*
5026 * Reset the forechannel and backchannel slot tables
5027 */
5028static int nfs4_reset_slot_tables(struct nfs4_session *session)
5029{
5030 int status;
5031
5032 status = nfs4_reset_slot_table(&session->fc_slot_table,
5033 session->fc_attrs.max_reqs, 1);
5034 if (status)
5035 return status;
5036
5037 status = nfs4_reset_slot_table(&session->bc_slot_table,
5038 session->bc_attrs.max_reqs, 0);
5039 return status;
5040}
5041
5042/* Destroy the slot table */ 5049/* Destroy the slot table */
5043static void nfs4_destroy_slot_tables(struct nfs4_session *session) 5050static void nfs4_destroy_slot_tables(struct nfs4_session *session)
5044{ 5051{
@@ -5084,29 +5091,35 @@ out:
5084} 5091}
5085 5092
5086/* 5093/*
5087 * Initialize the forechannel and backchannel tables 5094 * Initialize or reset the forechannel and backchannel tables
5088 */ 5095 */
5089static int nfs4_init_slot_tables(struct nfs4_session *session) 5096static int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
5090{ 5097{
5091 struct nfs4_slot_table *tbl; 5098 struct nfs4_slot_table *tbl;
5092 int status = 0; 5099 int status;
5093 5100
5094 tbl = &session->fc_slot_table; 5101 dprintk("--> %s\n", __func__);
5102 /* Fore channel */
5103 tbl = &ses->fc_slot_table;
5095 if (tbl->slots == NULL) { 5104 if (tbl->slots == NULL) {
5096 status = nfs4_init_slot_table(tbl, 5105 status = nfs4_init_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
5097 session->fc_attrs.max_reqs, 1); 5106 if (status) /* -ENOMEM */
5107 return status;
5108 } else {
5109 status = nfs4_reset_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
5098 if (status) 5110 if (status)
5099 return status; 5111 return status;
5100 } 5112 }
5101 5113 /* Back channel */
5102 tbl = &session->bc_slot_table; 5114 tbl = &ses->bc_slot_table;
5103 if (tbl->slots == NULL) { 5115 if (tbl->slots == NULL) {
5104 status = nfs4_init_slot_table(tbl, 5116 status = nfs4_init_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
5105 session->bc_attrs.max_reqs, 0);
5106 if (status) 5117 if (status)
5107 nfs4_destroy_slot_tables(session); 5118 /* Fore and back channel share a connection so get
5108 } 5119 * both slot tables or neither */
5109 5120 nfs4_destroy_slot_tables(ses);
5121 } else
5122 status = nfs4_reset_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
5110 return status; 5123 return status;
5111} 5124}
5112 5125
@@ -5294,13 +5307,9 @@ int nfs4_proc_create_session(struct nfs_client *clp)
5294 if (status) 5307 if (status)
5295 goto out; 5308 goto out;
5296 5309
5297 /* Init and reset the fore channel */ 5310 /* Init or reset the session slot tables */
5298 status = nfs4_init_slot_tables(session); 5311 status = nfs4_setup_session_slot_tables(session);
5299 dprintk("slot table initialization returned %d\n", status); 5312 dprintk("slot table setup returned %d\n", status);
5300 if (status)
5301 goto out;
5302 status = nfs4_reset_slot_tables(session);
5303 dprintk("slot table reset returned %d\n", status);
5304 if (status) 5313 if (status)
5305 goto out; 5314 goto out;
5306 5315
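The comment added before __nfs4_get_acl_uncached() describes the standard two-call getxattr dance that this server round-trip caching is tuned for. From user space it is the real getxattr(2) API; the path below is an example mount point:

#include <stdio.h>
#include <stdlib.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/nfs/file";   /* example path */
	ssize_t len;
	char *buf;

	/* Call 1: NULL buffer, size 0 -- just ask for the required length. */
	len = getxattr(path, "system.nfs4_acl", NULL, 0);
	if (len < 0) {
		perror("getxattr");
		return 1;
	}

	buf = malloc(len);
	if (!buf)
		return 1;

	/* Call 2: fetch into a buffer of the advertised size.  With the
	 * caching described above, this is usually served from the cached
	 * page without another round trip. */
	len = getxattr(path, "system.nfs4_acl", buf, len);
	printf("acl is %zd bytes\n", len);
	free(buf);
	return len < 0;
}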
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 6a7107ae6b72..a53f33b4ac3a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -49,6 +49,7 @@
49#include <linux/ratelimit.h> 49#include <linux/ratelimit.h>
50#include <linux/workqueue.h> 50#include <linux/workqueue.h>
51#include <linux/bitops.h> 51#include <linux/bitops.h>
52#include <linux/jiffies.h>
52 53
53#include "nfs4_fs.h" 54#include "nfs4_fs.h"
54#include "callback.h" 55#include "callback.h"
@@ -377,31 +378,24 @@ nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
377{ 378{
378 struct rb_node **p = &server->state_owners.rb_node, 379 struct rb_node **p = &server->state_owners.rb_node,
379 *parent = NULL; 380 *parent = NULL;
380 struct nfs4_state_owner *sp, *res = NULL; 381 struct nfs4_state_owner *sp;
381 382
382 while (*p != NULL) { 383 while (*p != NULL) {
383 parent = *p; 384 parent = *p;
384 sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); 385 sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
385 386
386 if (server < sp->so_server) {
387 p = &parent->rb_left;
388 continue;
389 }
390 if (server > sp->so_server) {
391 p = &parent->rb_right;
392 continue;
393 }
394 if (cred < sp->so_cred) 387 if (cred < sp->so_cred)
395 p = &parent->rb_left; 388 p = &parent->rb_left;
396 else if (cred > sp->so_cred) 389 else if (cred > sp->so_cred)
397 p = &parent->rb_right; 390 p = &parent->rb_right;
398 else { 391 else {
392 if (!list_empty(&sp->so_lru))
393 list_del_init(&sp->so_lru);
399 atomic_inc(&sp->so_count); 394 atomic_inc(&sp->so_count);
400 res = sp; 395 return sp;
401 break;
402 } 396 }
403 } 397 }
404 return res; 398 return NULL;
405} 399}
406 400
407static struct nfs4_state_owner * 401static struct nfs4_state_owner *
@@ -421,6 +415,8 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
421 else if (new->so_cred > sp->so_cred) 415 else if (new->so_cred > sp->so_cred)
422 p = &parent->rb_right; 416 p = &parent->rb_right;
423 else { 417 else {
418 if (!list_empty(&sp->so_lru))
419 list_del_init(&sp->so_lru);
424 atomic_inc(&sp->so_count); 420 atomic_inc(&sp->so_count);
425 return sp; 421 return sp;
426 } 422 }
@@ -462,6 +458,7 @@ nfs4_alloc_state_owner(void)
462 spin_lock_init(&sp->so_sequence.lock); 458 spin_lock_init(&sp->so_sequence.lock);
463 INIT_LIST_HEAD(&sp->so_sequence.list); 459 INIT_LIST_HEAD(&sp->so_sequence.list);
464 atomic_set(&sp->so_count, 1); 460 atomic_set(&sp->so_count, 1);
461 INIT_LIST_HEAD(&sp->so_lru);
465 return sp; 462 return sp;
466} 463}
467 464
@@ -479,6 +476,38 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
479 } 476 }
480} 477}
481 478
479static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
480{
481 rpc_destroy_wait_queue(&sp->so_sequence.wait);
482 put_rpccred(sp->so_cred);
483 kfree(sp);
484}
485
486static void nfs4_gc_state_owners(struct nfs_server *server)
487{
488 struct nfs_client *clp = server->nfs_client;
489 struct nfs4_state_owner *sp, *tmp;
490 unsigned long time_min, time_max;
491 LIST_HEAD(doomed);
492
493 spin_lock(&clp->cl_lock);
494 time_max = jiffies;
495 time_min = (long)time_max - (long)clp->cl_lease_time;
496 list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
497 /* NB: LRU is sorted so that oldest is at the head */
498 if (time_in_range(sp->so_expires, time_min, time_max))
499 break;
500 list_move(&sp->so_lru, &doomed);
501 nfs4_remove_state_owner_locked(sp);
502 }
503 spin_unlock(&clp->cl_lock);
504
505 list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
506 list_del(&sp->so_lru);
507 nfs4_free_state_owner(sp);
508 }
509}
510
482/** 511/**
483 * nfs4_get_state_owner - Look up a state owner given a credential 512 * nfs4_get_state_owner - Look up a state owner given a credential
484 * @server: nfs_server to search 513 * @server: nfs_server to search
@@ -496,10 +525,10 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
496 sp = nfs4_find_state_owner_locked(server, cred); 525 sp = nfs4_find_state_owner_locked(server, cred);
497 spin_unlock(&clp->cl_lock); 526 spin_unlock(&clp->cl_lock);
498 if (sp != NULL) 527 if (sp != NULL)
499 return sp; 528 goto out;
500 new = nfs4_alloc_state_owner(); 529 new = nfs4_alloc_state_owner();
501 if (new == NULL) 530 if (new == NULL)
502 return NULL; 531 goto out;
503 new->so_server = server; 532 new->so_server = server;
504 new->so_cred = cred; 533 new->so_cred = cred;
505 spin_lock(&clp->cl_lock); 534 spin_lock(&clp->cl_lock);
@@ -511,26 +540,58 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
511 rpc_destroy_wait_queue(&new->so_sequence.wait); 540 rpc_destroy_wait_queue(&new->so_sequence.wait);
512 kfree(new); 541 kfree(new);
513 } 542 }
543out:
544 nfs4_gc_state_owners(server);
514 return sp; 545 return sp;
515} 546}
516 547
517/** 548/**
518 * nfs4_put_state_owner - Release a nfs4_state_owner 549 * nfs4_put_state_owner - Release a nfs4_state_owner
519 * @sp: state owner data to release 550 * @sp: state owner data to release
520 *
521 */ 551 */
522void nfs4_put_state_owner(struct nfs4_state_owner *sp) 552void nfs4_put_state_owner(struct nfs4_state_owner *sp)
523{ 553{
524 struct nfs_client *clp = sp->so_server->nfs_client; 554 struct nfs_server *server = sp->so_server;
525 struct rpc_cred *cred = sp->so_cred; 555 struct nfs_client *clp = server->nfs_client;
526 556
527 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) 557 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
528 return; 558 return;
529 nfs4_remove_state_owner_locked(sp); 559
560 if (!RB_EMPTY_NODE(&sp->so_server_node)) {
561 sp->so_expires = jiffies;
562 list_add_tail(&sp->so_lru, &server->state_owners_lru);
563 spin_unlock(&clp->cl_lock);
564 } else {
565 nfs4_remove_state_owner_locked(sp);
566 spin_unlock(&clp->cl_lock);
567 nfs4_free_state_owner(sp);
568 }
569}
570
571/**
572 * nfs4_purge_state_owners - Release all cached state owners
573 * @server: nfs_server with cached state owners to release
574 *
575 * Called at umount time. Remaining state owners will be on
576 * the LRU with ref count of zero.
577 */
578void nfs4_purge_state_owners(struct nfs_server *server)
579{
580 struct nfs_client *clp = server->nfs_client;
581 struct nfs4_state_owner *sp, *tmp;
582 LIST_HEAD(doomed);
583
584 spin_lock(&clp->cl_lock);
585 list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
586 list_move(&sp->so_lru, &doomed);
587 nfs4_remove_state_owner_locked(sp);
588 }
530 spin_unlock(&clp->cl_lock); 589 spin_unlock(&clp->cl_lock);
531 rpc_destroy_wait_queue(&sp->so_sequence.wait); 590
532 put_rpccred(cred); 591 list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
533 kfree(sp); 592 list_del(&sp->so_lru);
593 nfs4_free_state_owner(sp);
594 }
534} 595}
535 596
536static struct nfs4_state * 597static struct nfs4_state *
@@ -1402,6 +1463,7 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov
1402restart: 1463restart:
1403 rcu_read_lock(); 1464 rcu_read_lock();
1404 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { 1465 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
1466 nfs4_purge_state_owners(server);
1405 spin_lock(&clp->cl_lock); 1467 spin_lock(&clp->cl_lock);
1406 for (pos = rb_first(&server->state_owners); 1468 for (pos = rb_first(&server->state_owners);
1407 pos != NULL; 1469 pos != NULL;
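nfs4_gc_state_owners() above uses a pattern worth noting: expired entries are unlinked onto a private doomed list while cl_lock is held, and the frees happen only after the lock is dropped, since nfs4_free_state_owner() must not run under cl_lock. A condensed user-space sketch of the same reaping scheme, with jiffies and the lease time replaced by plain integers (the simplified expiry test stands in for time_in_range()):

#include <pthread.h>
#include <stdlib.h>

struct owner {
	long expires;           /* stand-in for sp->so_expires (jiffies) */
	struct owner *next;
};

static struct owner *lru;       /* oldest first, like state_owners_lru */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void gc_owners(long now, long lease_time)
{
	struct owner *doomed = NULL, *sp;

	pthread_mutex_lock(&lock);
	while ((sp = lru) && sp->expires < now - lease_time) {
		/* LRU is sorted oldest-first, so stop at the first entry
		 * still inside its lease window. */
		lru = sp->next;
		sp->next = doomed;
		doomed = sp;
	}
	pthread_mutex_unlock(&lock);

	/* Free outside the lock. */
	while ((sp = doomed)) {
		doomed = sp->next;
		free(sp);
	}
}

int main(void)
{
	struct owner *a = calloc(1, sizeof(*a)), *b = calloc(1, sizeof(*b));

	if (!a || !b)
		return 1;
	a->expires = 10; a->next = b;
	b->expires = 90; b->next = NULL;
	lru = a;

	gc_owners(100, 50);     /* reaps a (expired at 10), keeps b */
	return lru != b;
}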
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e6161b213ed1..95e92e438407 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2298,7 +2298,7 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
2298 encode_getfh(xdr, &hdr); 2298 encode_getfh(xdr, &hdr);
2299 encode_getfattr(xdr, args->bitmask, &hdr); 2299 encode_getfattr(xdr, args->bitmask, &hdr);
2300 encode_restorefh(xdr, &hdr); 2300 encode_restorefh(xdr, &hdr);
2301 encode_getfattr(xdr, args->bitmask, &hdr); 2301 encode_getfattr(xdr, args->dir_bitmask, &hdr);
2302 encode_nops(&hdr); 2302 encode_nops(&hdr);
2303} 2303}
2304 2304
@@ -2517,11 +2517,13 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
2517 encode_compound_hdr(xdr, req, &hdr); 2517 encode_compound_hdr(xdr, req, &hdr);
2518 encode_sequence(xdr, &args->seq_args, &hdr); 2518 encode_sequence(xdr, &args->seq_args, &hdr);
2519 encode_putfh(xdr, args->fh, &hdr); 2519 encode_putfh(xdr, args->fh, &hdr);
2520 replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1; 2520 replen = hdr.replen + op_decode_hdr_maxsz + 1;
2521 encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); 2521 encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
2522 2522
2523 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, 2523 xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
2524 args->acl_pages, args->acl_pgbase, args->acl_len); 2524 args->acl_pages, args->acl_pgbase, args->acl_len);
2525 xdr_set_scratch_buffer(xdr, page_address(args->acl_scratch), PAGE_SIZE);
2526
2525 encode_nops(&hdr); 2527 encode_nops(&hdr);
2526} 2528}
2527 2529
@@ -3790,7 +3792,8 @@ out_overflow:
3790} 3792}
3791 3793
3792static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, 3794static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3793 const struct nfs_server *server, uint32_t *uid, int may_sleep) 3795 const struct nfs_server *server, uint32_t *uid,
3796 struct nfs4_string *owner_name)
3794{ 3797{
3795 uint32_t len; 3798 uint32_t len;
3796 __be32 *p; 3799 __be32 *p;
@@ -3807,8 +3810,12 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3807 p = xdr_inline_decode(xdr, len); 3810 p = xdr_inline_decode(xdr, len);
3808 if (unlikely(!p)) 3811 if (unlikely(!p))
3809 goto out_overflow; 3812 goto out_overflow;
3810 if (!may_sleep) { 3813 if (owner_name != NULL) {
3811 /* do nothing */ 3814 owner_name->data = kmemdup(p, len, GFP_NOWAIT);
3815 if (owner_name->data != NULL) {
3816 owner_name->len = len;
3817 ret = NFS_ATTR_FATTR_OWNER_NAME;
3818 }
3812 } else if (len < XDR_MAX_NETOBJ) { 3819 } else if (len < XDR_MAX_NETOBJ) {
3813 if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0) 3820 if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0)
3814 ret = NFS_ATTR_FATTR_OWNER; 3821 ret = NFS_ATTR_FATTR_OWNER;
@@ -3828,7 +3835,8 @@ out_overflow:
3828} 3835}
3829 3836
3830static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, 3837static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3831 const struct nfs_server *server, uint32_t *gid, int may_sleep) 3838 const struct nfs_server *server, uint32_t *gid,
3839 struct nfs4_string *group_name)
3832{ 3840{
3833 uint32_t len; 3841 uint32_t len;
3834 __be32 *p; 3842 __be32 *p;
@@ -3845,8 +3853,12 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3845 p = xdr_inline_decode(xdr, len); 3853 p = xdr_inline_decode(xdr, len);
3846 if (unlikely(!p)) 3854 if (unlikely(!p))
3847 goto out_overflow; 3855 goto out_overflow;
3848 if (!may_sleep) { 3856 if (group_name != NULL) {
3849 /* do nothing */ 3857 group_name->data = kmemdup(p, len, GFP_NOWAIT);
3858 if (group_name->data != NULL) {
3859 group_name->len = len;
3860 ret = NFS_ATTR_FATTR_GROUP_NAME;
3861 }
3850 } else if (len < XDR_MAX_NETOBJ) { 3862 } else if (len < XDR_MAX_NETOBJ) {
3851 if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0) 3863 if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0)
3852 ret = NFS_ATTR_FATTR_GROUP; 3864 ret = NFS_ATTR_FATTR_GROUP;
@@ -4283,7 +4295,7 @@ xdr_error:
4283 4295
4284static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, 4296static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
4285 struct nfs_fattr *fattr, struct nfs_fh *fh, 4297 struct nfs_fattr *fattr, struct nfs_fh *fh,
4286 const struct nfs_server *server, int may_sleep) 4298 const struct nfs_server *server)
4287{ 4299{
4288 int status; 4300 int status;
4289 umode_t fmode = 0; 4301 umode_t fmode = 0;
@@ -4350,12 +4362,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
4350 goto xdr_error; 4362 goto xdr_error;
4351 fattr->valid |= status; 4363 fattr->valid |= status;
4352 4364
4353 status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep); 4365 status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, fattr->owner_name);
4354 if (status < 0) 4366 if (status < 0)
4355 goto xdr_error; 4367 goto xdr_error;
4356 fattr->valid |= status; 4368 fattr->valid |= status;
4357 4369
4358 status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep); 4370 status = decode_attr_group(xdr, bitmap, server, &fattr->gid, fattr->group_name);
4359 if (status < 0) 4371 if (status < 0)
4360 goto xdr_error; 4372 goto xdr_error;
4361 fattr->valid |= status; 4373 fattr->valid |= status;
@@ -4396,7 +4408,7 @@ xdr_error:
4396} 4408}
4397 4409
4398static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, 4410static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4399 struct nfs_fh *fh, const struct nfs_server *server, int may_sleep) 4411 struct nfs_fh *fh, const struct nfs_server *server)
4400{ 4412{
4401 __be32 *savep; 4413 __be32 *savep;
4402 uint32_t attrlen, 4414 uint32_t attrlen,
@@ -4415,7 +4427,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
4415 if (status < 0) 4427 if (status < 0)
4416 goto xdr_error; 4428 goto xdr_error;
4417 4429
4418 status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep); 4430 status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server);
4419 if (status < 0) 4431 if (status < 0)
4420 goto xdr_error; 4432 goto xdr_error;
4421 4433
@@ -4426,9 +4438,9 @@ xdr_error:
4426} 4438}
4427 4439
4428static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, 4440static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4429 const struct nfs_server *server, int may_sleep) 4441 const struct nfs_server *server)
4430{ 4442{
4431 return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep); 4443 return decode_getfattr_generic(xdr, fattr, NULL, server);
4432} 4444}
4433 4445
4434/* 4446/*
@@ -4957,17 +4969,18 @@ decode_restorefh(struct xdr_stream *xdr)
4957} 4969}
4958 4970
4959static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, 4971static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
4960 size_t *acl_len) 4972 struct nfs_getaclres *res)
4961{ 4973{
4962 __be32 *savep; 4974 __be32 *savep, *bm_p;
4963 uint32_t attrlen, 4975 uint32_t attrlen,
4964 bitmap[3] = {0}; 4976 bitmap[3] = {0};
4965 struct kvec *iov = req->rq_rcv_buf.head; 4977 struct kvec *iov = req->rq_rcv_buf.head;
4966 int status; 4978 int status;
4967 4979
4968 *acl_len = 0; 4980 res->acl_len = 0;
4969 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4981 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
4970 goto out; 4982 goto out;
4983 bm_p = xdr->p;
4971 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) 4984 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
4972 goto out; 4985 goto out;
4973 if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) 4986 if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
@@ -4979,18 +4992,30 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
4979 size_t hdrlen; 4992 size_t hdrlen;
4980 u32 recvd; 4993 u32 recvd;
4981 4994
4995 /* The bitmap (xdr len + bitmaps) and the attr xdr len words
4996 * are stored with the acl data to handle the problem of
4997 * variable length bitmaps. */
4998 xdr->p = bm_p;
4999 res->acl_data_offset = be32_to_cpup(bm_p) + 2;
5000 res->acl_data_offset <<= 2;
5001
4982 /* We ignore &savep and don't do consistency checks on 5002 /* We ignore &savep and don't do consistency checks on
4983 * the attr length. Let userspace figure it out.... */ 5003 * the attr length. Let userspace figure it out.... */
4984 hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base; 5004 hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base;
5005 attrlen += res->acl_data_offset;
4985 recvd = req->rq_rcv_buf.len - hdrlen; 5006 recvd = req->rq_rcv_buf.len - hdrlen;
4986 if (attrlen > recvd) { 5007 if (attrlen > recvd) {
4987 dprintk("NFS: server cheating in getattr" 5008 if (res->acl_flags & NFS4_ACL_LEN_REQUEST) {
4988 " acl reply: attrlen %u > recvd %u\n", 5009 /* getxattr interface called with a NULL buf */
5010 res->acl_len = attrlen;
5011 goto out;
5012 }
5013 dprintk("NFS: acl reply: attrlen %u > recvd %u\n",
4989 attrlen, recvd); 5014 attrlen, recvd);
4990 return -EINVAL; 5015 return -EINVAL;
4991 } 5016 }
4992 xdr_read_pages(xdr, attrlen); 5017 xdr_read_pages(xdr, attrlen);
4993 *acl_len = attrlen; 5018 res->acl_len = attrlen;
4994 } else 5019 } else
4995 status = -EOPNOTSUPP; 5020 status = -EOPNOTSUPP;
4996 5021
@@ -5696,8 +5721,7 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
5696 status = decode_open_downgrade(xdr, res); 5721 status = decode_open_downgrade(xdr, res);
5697 if (status != 0) 5722 if (status != 0)
5698 goto out; 5723 goto out;
5699 decode_getfattr(xdr, res->fattr, res->server, 5724 decode_getfattr(xdr, res->fattr, res->server);
5700 !RPC_IS_ASYNC(rqstp->rq_task));
5701out: 5725out:
5702 return status; 5726 return status;
5703} 5727}
@@ -5723,8 +5747,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5723 status = decode_access(xdr, res); 5747 status = decode_access(xdr, res);
5724 if (status != 0) 5748 if (status != 0)
5725 goto out; 5749 goto out;
5726 decode_getfattr(xdr, res->fattr, res->server, 5750 decode_getfattr(xdr, res->fattr, res->server);
5727 !RPC_IS_ASYNC(rqstp->rq_task));
5728out: 5751out:
5729 return status; 5752 return status;
5730} 5753}
@@ -5753,8 +5776,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5753 status = decode_getfh(xdr, res->fh); 5776 status = decode_getfh(xdr, res->fh);
5754 if (status) 5777 if (status)
5755 goto out; 5778 goto out;
5756 status = decode_getfattr(xdr, res->fattr, res->server 5779 status = decode_getfattr(xdr, res->fattr, res->server);
5757 ,!RPC_IS_ASYNC(rqstp->rq_task));
5758out: 5780out:
5759 return status; 5781 return status;
5760} 5782}
@@ -5780,8 +5802,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
5780 goto out; 5802 goto out;
5781 status = decode_getfh(xdr, res->fh); 5803 status = decode_getfh(xdr, res->fh);
5782 if (status == 0) 5804 if (status == 0)
5783 status = decode_getfattr(xdr, res->fattr, res->server, 5805 status = decode_getfattr(xdr, res->fattr, res->server);
5784 !RPC_IS_ASYNC(rqstp->rq_task));
5785out: 5806out:
5786 return status; 5807 return status;
5787} 5808}
@@ -5807,8 +5828,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5807 status = decode_remove(xdr, &res->cinfo); 5828 status = decode_remove(xdr, &res->cinfo);
5808 if (status) 5829 if (status)
5809 goto out; 5830 goto out;
5810 decode_getfattr(xdr, res->dir_attr, res->server, 5831 decode_getfattr(xdr, res->dir_attr, res->server);
5811 !RPC_IS_ASYNC(rqstp->rq_task));
5812out: 5832out:
5813 return status; 5833 return status;
5814} 5834}
@@ -5841,14 +5861,12 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5841 if (status) 5861 if (status)
5842 goto out; 5862 goto out;
5843 /* Current FH is target directory */ 5863 /* Current FH is target directory */
5844 if (decode_getfattr(xdr, res->new_fattr, res->server, 5864 if (decode_getfattr(xdr, res->new_fattr, res->server))
5845 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
5846 goto out; 5865 goto out;
5847 status = decode_restorefh(xdr); 5866 status = decode_restorefh(xdr);
5848 if (status) 5867 if (status)
5849 goto out; 5868 goto out;
5850 decode_getfattr(xdr, res->old_fattr, res->server, 5869 decode_getfattr(xdr, res->old_fattr, res->server);
5851 !RPC_IS_ASYNC(rqstp->rq_task));
5852out: 5870out:
5853 return status; 5871 return status;
5854} 5872}
@@ -5884,14 +5902,12 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5884 * Note order: OP_LINK leaves the directory as the current 5902 * Note order: OP_LINK leaves the directory as the current
5885 * filehandle. 5903 * filehandle.
5886 */ 5904 */
5887 if (decode_getfattr(xdr, res->dir_attr, res->server, 5905 if (decode_getfattr(xdr, res->dir_attr, res->server))
5888 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
5889 goto out; 5906 goto out;
5890 status = decode_restorefh(xdr); 5907 status = decode_restorefh(xdr);
5891 if (status) 5908 if (status)
5892 goto out; 5909 goto out;
5893 decode_getfattr(xdr, res->fattr, res->server, 5910 decode_getfattr(xdr, res->fattr, res->server);
5894 !RPC_IS_ASYNC(rqstp->rq_task));
5895out: 5911out:
5896 return status; 5912 return status;
5897} 5913}
@@ -5923,14 +5939,12 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5923 status = decode_getfh(xdr, res->fh); 5939 status = decode_getfh(xdr, res->fh);
5924 if (status) 5940 if (status)
5925 goto out; 5941 goto out;
5926 if (decode_getfattr(xdr, res->fattr, res->server, 5942 if (decode_getfattr(xdr, res->fattr, res->server))
5927 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
5928 goto out; 5943 goto out;
5929 status = decode_restorefh(xdr); 5944 status = decode_restorefh(xdr);
5930 if (status) 5945 if (status)
5931 goto out; 5946 goto out;
5932 decode_getfattr(xdr, res->dir_fattr, res->server, 5947 decode_getfattr(xdr, res->dir_fattr, res->server);
5933 !RPC_IS_ASYNC(rqstp->rq_task));
5934out: 5948out:
5935 return status; 5949 return status;
5936} 5950}
@@ -5962,8 +5976,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5962 status = decode_putfh(xdr); 5976 status = decode_putfh(xdr);
5963 if (status) 5977 if (status)
5964 goto out; 5978 goto out;
5965 status = decode_getfattr(xdr, res->fattr, res->server, 5979 status = decode_getfattr(xdr, res->fattr, res->server);
5966 !RPC_IS_ASYNC(rqstp->rq_task));
5967out: 5980out:
5968 return status; 5981 return status;
5969} 5982}
@@ -6028,7 +6041,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6028 status = decode_putfh(xdr); 6041 status = decode_putfh(xdr);
6029 if (status) 6042 if (status)
6030 goto out; 6043 goto out;
6031 status = decode_getacl(xdr, rqstp, &res->acl_len); 6044 status = decode_getacl(xdr, rqstp, res);
6032 6045
6033out: 6046out:
6034 return status; 6047 return status;
@@ -6061,8 +6074,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6061 * an ESTALE error. Shouldn't be a problem, 6074 * an ESTALE error. Shouldn't be a problem,
6062 * though, since fattr->valid will remain unset. 6075 * though, since fattr->valid will remain unset.
6063 */ 6076 */
6064 decode_getfattr(xdr, res->fattr, res->server, 6077 decode_getfattr(xdr, res->fattr, res->server);
6065 !RPC_IS_ASYNC(rqstp->rq_task));
6066out: 6078out:
6067 return status; 6079 return status;
6068} 6080}
@@ -6093,13 +6105,11 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6093 goto out; 6105 goto out;
6094 if (decode_getfh(xdr, &res->fh) != 0) 6106 if (decode_getfh(xdr, &res->fh) != 0)
6095 goto out; 6107 goto out;
6096 if (decode_getfattr(xdr, res->f_attr, res->server, 6108 if (decode_getfattr(xdr, res->f_attr, res->server) != 0)
6097 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
6098 goto out; 6109 goto out;
6099 if (decode_restorefh(xdr) != 0) 6110 if (decode_restorefh(xdr) != 0)
6100 goto out; 6111 goto out;
6101 decode_getfattr(xdr, res->dir_attr, res->server, 6112 decode_getfattr(xdr, res->dir_attr, res->server);
6102 !RPC_IS_ASYNC(rqstp->rq_task));
6103out: 6113out:
6104 return status; 6114 return status;
6105} 6115}
@@ -6147,8 +6157,7 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
6147 status = decode_open(xdr, res); 6157 status = decode_open(xdr, res);
6148 if (status) 6158 if (status)
6149 goto out; 6159 goto out;
6150 decode_getfattr(xdr, res->f_attr, res->server, 6160 decode_getfattr(xdr, res->f_attr, res->server);
6151 !RPC_IS_ASYNC(rqstp->rq_task));
6152out: 6161out:
6153 return status; 6162 return status;
6154} 6163}
@@ -6175,8 +6184,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
6175 status = decode_setattr(xdr); 6184 status = decode_setattr(xdr);
6176 if (status) 6185 if (status)
6177 goto out; 6186 goto out;
6178 decode_getfattr(xdr, res->fattr, res->server, 6187 decode_getfattr(xdr, res->fattr, res->server);
6179 !RPC_IS_ASYNC(rqstp->rq_task));
6180out: 6188out:
6181 return status; 6189 return status;
6182} 6190}
@@ -6356,8 +6364,7 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6356 if (status) 6364 if (status)
6357 goto out; 6365 goto out;
6358 if (res->fattr) 6366 if (res->fattr)
6359 decode_getfattr(xdr, res->fattr, res->server, 6367 decode_getfattr(xdr, res->fattr, res->server);
6360 !RPC_IS_ASYNC(rqstp->rq_task));
6361 if (!status) 6368 if (!status)
6362 status = res->count; 6369 status = res->count;
6363out: 6370out:
@@ -6386,8 +6393,7 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6386 if (status) 6393 if (status)
6387 goto out; 6394 goto out;
6388 if (res->fattr) 6395 if (res->fattr)
6389 decode_getfattr(xdr, res->fattr, res->server, 6396 decode_getfattr(xdr, res->fattr, res->server);
6390 !RPC_IS_ASYNC(rqstp->rq_task));
6391out: 6397out:
6392 return status; 6398 return status;
6393} 6399}
@@ -6546,8 +6552,7 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
6546 status = decode_delegreturn(xdr); 6552 status = decode_delegreturn(xdr);
6547 if (status != 0) 6553 if (status != 0)
6548 goto out; 6554 goto out;
6549 decode_getfattr(xdr, res->fattr, res->server, 6555 decode_getfattr(xdr, res->fattr, res->server);
6550 !RPC_IS_ASYNC(rqstp->rq_task));
6551out: 6556out:
6552 return status; 6557 return status;
6553} 6558}
@@ -6576,8 +6581,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
6576 goto out; 6581 goto out;
6577 xdr_enter_page(xdr, PAGE_SIZE); 6582 xdr_enter_page(xdr, PAGE_SIZE);
6578 status = decode_getfattr(xdr, &res->fs_locations->fattr, 6583 status = decode_getfattr(xdr, &res->fs_locations->fattr,
6579 res->fs_locations->server, 6584 res->fs_locations->server);
6580 !RPC_IS_ASYNC(req->rq_task));
6581out: 6585out:
6582 return status; 6586 return status;
6583} 6587}
@@ -6826,8 +6830,7 @@ static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
6826 status = decode_layoutcommit(xdr, rqstp, res); 6830 status = decode_layoutcommit(xdr, rqstp, res);
6827 if (status) 6831 if (status)
6828 goto out; 6832 goto out;
6829 decode_getfattr(xdr, res->fattr, res->server, 6833 decode_getfattr(xdr, res->fattr, res->server);
6830 !RPC_IS_ASYNC(rqstp->rq_task));
6831out: 6834out:
6832 return status; 6835 return status;
6833} 6836}
@@ -6958,7 +6961,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6958 goto out_overflow; 6961 goto out_overflow;
6959 6962
6960 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, 6963 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
6961 entry->server, 1) < 0) 6964 entry->server) < 0)
6962 goto out_overflow; 6965 goto out_overflow;
6963 if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) 6966 if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
6964 entry->ino = entry->fattr->mounted_on_fileid; 6967 entry->ino = entry->fattr->mounted_on_fileid;
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index c807ab93140e..55d01280a609 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -551,7 +551,8 @@ static const struct nfs_pageio_ops objio_pg_write_ops = {
551static struct pnfs_layoutdriver_type objlayout_type = { 551static struct pnfs_layoutdriver_type objlayout_type = {
552 .id = LAYOUT_OSD2_OBJECTS, 552 .id = LAYOUT_OSD2_OBJECTS,
553 .name = "LAYOUT_OSD2_OBJECTS", 553 .name = "LAYOUT_OSD2_OBJECTS",
554 .flags = PNFS_LAYOUTRET_ON_SETATTR, 554 .flags = PNFS_LAYOUTRET_ON_SETATTR |
555 PNFS_LAYOUTRET_ON_ERROR,
555 556
556 .alloc_layout_hdr = objlayout_alloc_layout_hdr, 557 .alloc_layout_hdr = objlayout_alloc_layout_hdr,
557 .free_layout_hdr = objlayout_free_layout_hdr, 558 .free_layout_hdr = objlayout_free_layout_hdr,
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 72074e3a04f9..b3c29039f5b8 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -254,6 +254,8 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
254 oir->status = rdata->task.tk_status = status; 254 oir->status = rdata->task.tk_status = status;
255 if (status >= 0) 255 if (status >= 0)
256 rdata->res.count = status; 256 rdata->res.count = status;
257 else
258 rdata->pnfs_error = status;
257 objlayout_iodone(oir); 259 objlayout_iodone(oir);
258 /* must not use oir after this point */ 260 /* must not use oir after this point */
259 261
@@ -334,6 +336,8 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
334 if (status >= 0) { 336 if (status >= 0) {
335 wdata->res.count = status; 337 wdata->res.count = status;
336 wdata->verf.committed = oir->committed; 338 wdata->verf.committed = oir->committed;
339 } else {
340 wdata->pnfs_error = status;
337 } 341 }
338 objlayout_iodone(oir); 342 objlayout_iodone(oir);
339 /* must not use oir after this point */ 343 /* must not use oir after this point */
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8e672a2b2d69..17149a490065 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1166,6 +1166,33 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1166} 1166}
1167EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); 1167EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
1168 1168
1169static int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head)
1170{
1171 struct nfs_pageio_descriptor pgio;
1172 LIST_HEAD(failed);
1173
1174 /* Resend all requests through the MDS */
1175 nfs_pageio_init_write_mds(&pgio, inode, FLUSH_STABLE);
1176 while (!list_empty(head)) {
1177 struct nfs_page *req = nfs_list_entry(head->next);
1178
1179 nfs_list_remove_request(req);
1180 if (!nfs_pageio_add_request(&pgio, req))
1181 nfs_list_add_request(req, &failed);
1182 }
1183 nfs_pageio_complete(&pgio);
1184
1185 if (!list_empty(&failed)) {
 1186 /* For some reason our attempt to resend pages through the MDS
 1187 * failed. Mark the overall send request as having failed, and
 1188 * let nfs_writeback_release_full deal with the error.
 1189 */
1190 list_move(&failed, head);
1191 return -EIO;
1192 }
1193 return 0;
1194}
1195
1169/* 1196/*
1170 * Called by non rpc-based layout drivers 1197 * Called by non rpc-based layout drivers
1171 */ 1198 */
@@ -1175,9 +1202,17 @@ void pnfs_ld_write_done(struct nfs_write_data *data)
1175 pnfs_set_layoutcommit(data); 1202 pnfs_set_layoutcommit(data);
1176 data->mds_ops->rpc_call_done(&data->task, data); 1203 data->mds_ops->rpc_call_done(&data->task, data);
1177 } else { 1204 } else {
1178 put_lseg(data->lseg);
1179 data->lseg = NULL;
1180 dprintk("pnfs write error = %d\n", data->pnfs_error); 1205 dprintk("pnfs write error = %d\n", data->pnfs_error);
1206 if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
1207 PNFS_LAYOUTRET_ON_ERROR) {
 1208 /* Don't layoutcommit on error; the server will need to
 1209 * perform file recovery.
 1210 */
1211 clear_bit(NFS_INO_LAYOUTCOMMIT,
1212 &NFS_I(data->inode)->flags);
1213 pnfs_return_layout(data->inode);
1214 }
1215 data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages);
1181 } 1216 }
1182 data->mds_ops->rpc_release(data); 1217 data->mds_ops->rpc_release(data);
1183} 1218}
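
pnfs_write_done_resend_to_mds() above walks the request list, moves anything that cannot be re-queued onto a local failed list, and splices the failures back for the caller. The same shape in a standalone sketch (struct req, submit(), and the literal error value are stand-ins, not kernel APIs):

#include <stdbool.h>
#include <stddef.h>

struct req { struct req *next; int id; };

static bool submit(struct req *r) { return r->id >= 0; }	/* stand-in */

static int resend_all(struct req **head)
{
	struct req *failed = NULL, *r;

	while ((r = *head) != NULL) {
		*head = r->next;	/* nfs_list_remove_request() */
		if (!submit(r)) {	/* nfs_pageio_add_request() failed */
			r->next = failed;
			failed = r;	/* nfs_list_add_request(req, &failed) */
		}
	}
	if (failed) {
		*head = failed;		/* list_move(&failed, head) */
		return -5;		/* -EIO */
	}
	return 0;
}
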
@@ -1267,6 +1302,9 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
1267 put_lseg(data->lseg); 1302 put_lseg(data->lseg);
1268 data->lseg = NULL; 1303 data->lseg = NULL;
1269 dprintk("pnfs write error = %d\n", data->pnfs_error); 1304 dprintk("pnfs write error = %d\n", data->pnfs_error);
1305 if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
1306 PNFS_LAYOUTRET_ON_ERROR)
1307 pnfs_return_layout(data->inode);
1270 1308
1271 nfs_pageio_init_read_mds(&pgio, data->inode); 1309 nfs_pageio_init_read_mds(&pgio, data->inode);
1272 1310
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 1509530cb111..53d593a0a4f2 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -68,6 +68,7 @@ enum {
68enum layoutdriver_policy_flags { 68enum layoutdriver_policy_flags {
69 /* Should the pNFS client commit and return the layout upon a setattr */ 69 /* Should the pNFS client commit and return the layout upon a setattr */
70 PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, 70 PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
71 PNFS_LAYOUTRET_ON_ERROR = 1 << 1,
71}; 72};
72 73
73struct nfs4_deviceid_node; 74struct nfs4_deviceid_node;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e463967aafb8..3dfa4f112c0a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -908,10 +908,24 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve
908 data->auth_flavor_len = 1; 908 data->auth_flavor_len = 1;
909 data->version = version; 909 data->version = version;
910 data->minorversion = 0; 910 data->minorversion = 0;
911 security_init_mnt_opts(&data->lsm_opts);
911 } 912 }
912 return data; 913 return data;
913} 914}
914 915
916static void nfs_free_parsed_mount_data(struct nfs_parsed_mount_data *data)
917{
918 if (data) {
919 kfree(data->client_address);
920 kfree(data->mount_server.hostname);
921 kfree(data->nfs_server.export_path);
922 kfree(data->nfs_server.hostname);
923 kfree(data->fscache_uniq);
924 security_free_mnt_opts(&data->lsm_opts);
925 kfree(data);
926 }
927}
928
915/* 929/*
916 * Sanity-check a server address provided by the mount command. 930 * Sanity-check a server address provided by the mount command.
917 * 931 *
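
nfs_free_parsed_mount_data() above folds four scattered cleanup paths into one helper that, like kfree(), accepts NULL so every exit label can call it unconditionally. The same idiom in a standalone sketch (the struct is a stand-in):

#include <stdlib.h>

struct parsed_opts {
	char *host;
	char *path;
};

static void free_parsed_opts(struct parsed_opts *o)
{
	if (o) {
		free(o->host);	/* free(NULL) is a no-op, like kfree(NULL) */
		free(o->path);
		free(o);
	}
}
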
@@ -2219,9 +2233,7 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
2219 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); 2233 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
2220 mntfh = nfs_alloc_fhandle(); 2234 mntfh = nfs_alloc_fhandle();
2221 if (data == NULL || mntfh == NULL) 2235 if (data == NULL || mntfh == NULL)
2222 goto out_free_fh; 2236 goto out;
2223
2224 security_init_mnt_opts(&data->lsm_opts);
2225 2237
2226 /* Validate the mount data */ 2238 /* Validate the mount data */
2227 error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); 2239 error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
@@ -2233,8 +2245,6 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
2233#ifdef CONFIG_NFS_V4 2245#ifdef CONFIG_NFS_V4
2234 if (data->version == 4) { 2246 if (data->version == 4) {
2235 mntroot = nfs4_try_mount(flags, dev_name, data); 2247 mntroot = nfs4_try_mount(flags, dev_name, data);
2236 kfree(data->client_address);
2237 kfree(data->nfs_server.export_path);
2238 goto out; 2248 goto out;
2239 } 2249 }
2240#endif /* CONFIG_NFS_V4 */ 2250#endif /* CONFIG_NFS_V4 */
@@ -2289,13 +2299,8 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
2289 s->s_flags |= MS_ACTIVE; 2299 s->s_flags |= MS_ACTIVE;
2290 2300
2291out: 2301out:
2292 kfree(data->nfs_server.hostname); 2302 nfs_free_parsed_mount_data(data);
2293 kfree(data->mount_server.hostname);
2294 kfree(data->fscache_uniq);
2295 security_free_mnt_opts(&data->lsm_opts);
2296out_free_fh:
2297 nfs_free_fhandle(mntfh); 2303 nfs_free_fhandle(mntfh);
2298 kfree(data);
2299 return mntroot; 2304 return mntroot;
2300 2305
2301out_err_nosb: 2306out_err_nosb:
@@ -2622,9 +2627,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
2622 2627
2623 mntfh = nfs_alloc_fhandle(); 2628 mntfh = nfs_alloc_fhandle();
2624 if (data == NULL || mntfh == NULL) 2629 if (data == NULL || mntfh == NULL)
2625 goto out_free_fh; 2630 goto out;
2626
2627 security_init_mnt_opts(&data->lsm_opts);
2628 2631
2629 /* Get a volume representation */ 2632 /* Get a volume representation */
2630 server = nfs4_create_server(data, mntfh); 2633 server = nfs4_create_server(data, mntfh);
@@ -2676,13 +2679,10 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
2676 2679
2677 s->s_flags |= MS_ACTIVE; 2680 s->s_flags |= MS_ACTIVE;
2678 2681
2679 security_free_mnt_opts(&data->lsm_opts);
2680 nfs_free_fhandle(mntfh); 2682 nfs_free_fhandle(mntfh);
2681 return mntroot; 2683 return mntroot;
2682 2684
2683out: 2685out:
2684 security_free_mnt_opts(&data->lsm_opts);
2685out_free_fh:
2686 nfs_free_fhandle(mntfh); 2686 nfs_free_fhandle(mntfh);
2687 return ERR_PTR(error); 2687 return ERR_PTR(error);
2688 2688
@@ -2839,7 +2839,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type,
2839 2839
2840 data = nfs_alloc_parsed_mount_data(4); 2840 data = nfs_alloc_parsed_mount_data(4);
2841 if (data == NULL) 2841 if (data == NULL)
2842 goto out_free_data; 2842 goto out;
2843 2843
2844 /* Validate the mount data */ 2844 /* Validate the mount data */
2845 error = nfs4_validate_mount_data(raw_data, data, dev_name); 2845 error = nfs4_validate_mount_data(raw_data, data, dev_name);
@@ -2853,12 +2853,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type,
2853 error = PTR_ERR(res); 2853 error = PTR_ERR(res);
2854 2854
2855out: 2855out:
2856 kfree(data->client_address); 2856 nfs_free_parsed_mount_data(data);
2857 kfree(data->nfs_server.export_path);
2858 kfree(data->nfs_server.hostname);
2859 kfree(data->fscache_uniq);
2860out_free_data:
2861 kfree(data);
2862 dprintk("<-- nfs4_mount() = %d%s\n", error, 2857 dprintk("<-- nfs4_mount() = %d%s\n", error,
2863 error != 0 ? " [error]" : ""); 2858 error != 0 ? " [error]" : "");
2864 return res; 2859 return res;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1dda78db6a73..834f0fe96f89 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1052,7 +1052,7 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = {
1052 .pg_doio = nfs_generic_pg_writepages, 1052 .pg_doio = nfs_generic_pg_writepages,
1053}; 1053};
1054 1054
1055static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, 1055void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
1056 struct inode *inode, int ioflags) 1056 struct inode *inode, int ioflags)
1057{ 1057{
1058 nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, 1058 nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops,
@@ -1166,13 +1166,7 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
1166static void nfs_writeback_release_full(void *calldata) 1166static void nfs_writeback_release_full(void *calldata)
1167{ 1167{
1168 struct nfs_write_data *data = calldata; 1168 struct nfs_write_data *data = calldata;
1169 int ret, status = data->task.tk_status; 1169 int status = data->task.tk_status;
1170 struct nfs_pageio_descriptor pgio;
1171
1172 if (data->pnfs_error) {
1173 nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE);
1174 pgio.pg_recoalesce = 1;
1175 }
1176 1170
1177 /* Update attributes as result of writeback. */ 1171 /* Update attributes as result of writeback. */
1178 while (!list_empty(&data->pages)) { 1172 while (!list_empty(&data->pages)) {
@@ -1188,11 +1182,6 @@ static void nfs_writeback_release_full(void *calldata)
1188 req->wb_bytes, 1182 req->wb_bytes,
1189 (long long)req_offset(req)); 1183 (long long)req_offset(req));
1190 1184
1191 if (data->pnfs_error) {
1192 dprintk(", pnfs error = %d\n", data->pnfs_error);
1193 goto next;
1194 }
1195
1196 if (status < 0) { 1185 if (status < 0) {
1197 nfs_set_pageerror(page); 1186 nfs_set_pageerror(page);
1198 nfs_context_set_write_error(req->wb_context, status); 1187 nfs_context_set_write_error(req->wb_context, status);
@@ -1212,19 +1201,7 @@ remove_request:
1212 next: 1201 next:
1213 nfs_clear_page_tag_locked(req); 1202 nfs_clear_page_tag_locked(req);
1214 nfs_end_page_writeback(page); 1203 nfs_end_page_writeback(page);
1215 if (data->pnfs_error) {
1216 lock_page(page);
1217 nfs_pageio_cond_complete(&pgio, page->index);
1218 ret = nfs_page_async_flush(&pgio, page, 0);
1219 if (ret) {
1220 nfs_set_pageerror(page);
1221 dprintk("rewrite to MDS error = %d\n", ret);
1222 }
1223 unlock_page(page);
1224 }
1225 } 1204 }
1226 if (data->pnfs_error)
1227 nfs_pageio_complete(&pgio);
1228 nfs_writedata_release(calldata); 1205 nfs_writedata_release(calldata);
1229} 1206}
1230 1207
@@ -1711,7 +1688,7 @@ out_error:
1711 1688
1712#ifdef CONFIG_MIGRATION 1689#ifdef CONFIG_MIGRATION
1713int nfs_migrate_page(struct address_space *mapping, struct page *newpage, 1690int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1714 struct page *page) 1691 struct page *page, enum migrate_mode mode)
1715{ 1692{
1716 /* 1693 /*
1717 * If PagePrivate is set, then the page is currently associated with 1694 * If PagePrivate is set, then the page is currently associated with
@@ -1726,7 +1703,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1726 1703
1727 nfs_fscache_release_page(page, GFP_KERNEL); 1704 nfs_fscache_release_page(page, GFP_KERNEL);
1728 1705
1729 return migrate_page(mapping, newpage, page); 1706 return migrate_page(mapping, newpage, page, mode);
1730} 1707}
1731#endif 1708#endif
1732 1709
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 10e6366608f2..8df1ea4a6ff9 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -80,3 +80,13 @@ config NFSD_V4
80 available from http://linux-nfs.org/. 80 available from http://linux-nfs.org/.
81 81
82 If unsure, say N. 82 If unsure, say N.
83
84config NFSD_FAULT_INJECTION
85 bool "NFS server manual fault injection"
86 depends on NFSD_V4 && DEBUG_KERNEL
87 help
88 This option enables support for manually injecting faults
89 into the NFS server. This is intended to be used for
90 testing error recovery on the NFS client.
91
92 If unsure, say N.
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 9b118ee20193..af32ef06b4fe 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_NFSD) += nfsd.o
6 6
7nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ 7nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
8 export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o 8 export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
9nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
9nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o 10nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
10nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o 11nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
11nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o 12nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 62f3b9074e84..cf8a6bd062fa 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -87,7 +87,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
87 struct svc_expkey key; 87 struct svc_expkey key;
88 struct svc_expkey *ek = NULL; 88 struct svc_expkey *ek = NULL;
89 89
90 if (mesg[mlen-1] != '\n') 90 if (mlen < 1 || mesg[mlen-1] != '\n')
91 return -EINVAL; 91 return -EINVAL;
92 mesg[mlen-1] = 0; 92 mesg[mlen-1] = 0;
93 93
@@ -1226,12 +1226,12 @@ nfsd_export_init(void)
1226 int rv; 1226 int rv;
1227 dprintk("nfsd: initializing export module.\n"); 1227 dprintk("nfsd: initializing export module.\n");
1228 1228
1229 rv = cache_register(&svc_export_cache); 1229 rv = cache_register_net(&svc_export_cache, &init_net);
1230 if (rv) 1230 if (rv)
1231 return rv; 1231 return rv;
1232 rv = cache_register(&svc_expkey_cache); 1232 rv = cache_register_net(&svc_expkey_cache, &init_net);
1233 if (rv) 1233 if (rv)
1234 cache_unregister(&svc_export_cache); 1234 cache_unregister_net(&svc_export_cache, &init_net);
1235 return rv; 1235 return rv;
1236 1236
1237} 1237}
@@ -1255,8 +1255,8 @@ nfsd_export_shutdown(void)
1255 1255
1256 dprintk("nfsd: shutting down export module.\n"); 1256 dprintk("nfsd: shutting down export module.\n");
1257 1257
1258 cache_unregister(&svc_expkey_cache); 1258 cache_unregister_net(&svc_expkey_cache, &init_net);
1259 cache_unregister(&svc_export_cache); 1259 cache_unregister_net(&svc_export_cache, &init_net);
1260 svcauth_unix_purge(); 1260 svcauth_unix_purge();
1261 1261
1262 dprintk("nfsd: export shutdown complete.\n"); 1262 dprintk("nfsd: export shutdown complete.\n");
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
new file mode 100644
index 000000000000..ce7f0758d84c
--- /dev/null
+++ b/fs/nfsd/fault_inject.c
@@ -0,0 +1,91 @@
1/*
2 * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
3 *
4 * Uses debugfs to create fault injection points for client testing
5 */
6
7#include <linux/types.h>
8#include <linux/fs.h>
9#include <linux/debugfs.h>
10#include <linux/module.h>
11
12#include "state.h"
13#include "fault_inject.h"
14
15struct nfsd_fault_inject_op {
16 char *file;
17 void (*func)(u64);
18};
19
20static struct nfsd_fault_inject_op inject_ops[] = {
21 {
22 .file = "forget_clients",
23 .func = nfsd_forget_clients,
24 },
25 {
26 .file = "forget_locks",
27 .func = nfsd_forget_locks,
28 },
29 {
30 .file = "forget_openowners",
31 .func = nfsd_forget_openowners,
32 },
33 {
34 .file = "forget_delegations",
35 .func = nfsd_forget_delegations,
36 },
37 {
38 .file = "recall_delegations",
39 .func = nfsd_recall_delegations,
40 },
41};
42
43static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op);
44static struct dentry *debug_dir;
45
46static int nfsd_inject_set(void *op_ptr, u64 val)
47{
48 struct nfsd_fault_inject_op *op = op_ptr;
49
50 if (val == 0)
51 printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file);
52 else
53 printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val);
54
55 op->func(val);
56 return 0;
57}
58
59static int nfsd_inject_get(void *data, u64 *val)
60{
61 return 0;
62}
63
64DEFINE_SIMPLE_ATTRIBUTE(fops_nfsd, nfsd_inject_get, nfsd_inject_set, "%llu\n");
65
66void nfsd_fault_inject_cleanup(void)
67{
68 debugfs_remove_recursive(debug_dir);
69}
70
71int nfsd_fault_inject_init(void)
72{
73 unsigned int i;
74 struct nfsd_fault_inject_op *op;
75 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
76
77 debug_dir = debugfs_create_dir("nfsd", NULL);
78 if (!debug_dir)
79 goto fail;
80
81 for (i = 0; i < NUM_INJECT_OPS; i++) {
82 op = &inject_ops[i];
83 if (!debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd))
84 goto fail;
85 }
86 return 0;
87
88fail:
89 nfsd_fault_inject_cleanup();
90 return -ENOMEM;
91}
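
Once CONFIG_NFSD_FAULT_INJECTION is enabled, each op above appears as a writable file under debugfs; writing 0 acts on all matching state, writing n on up to n entries. A sketch of triggering one injection point from userspace, assuming debugfs is mounted at the conventional /sys/kernel/debug:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/nfsd/forget_delegations", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "%llu\n", 2ULL);	/* act on up to two delegations */
	fclose(f);
	return 0;
}
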
diff --git a/fs/nfsd/fault_inject.h b/fs/nfsd/fault_inject.h
new file mode 100644
index 000000000000..90bd0570956c
--- /dev/null
+++ b/fs/nfsd/fault_inject.h
@@ -0,0 +1,28 @@
1/*
2 * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
3 *
4 * Function definitions for fault injection
5 */
6
7#ifndef LINUX_NFSD_FAULT_INJECT_H
8#define LINUX_NFSD_FAULT_INJECT_H
9
10#ifdef CONFIG_NFSD_FAULT_INJECTION
11int nfsd_fault_inject_init(void);
12void nfsd_fault_inject_cleanup(void);
13void nfsd_forget_clients(u64);
14void nfsd_forget_locks(u64);
15void nfsd_forget_openowners(u64);
16void nfsd_forget_delegations(u64);
17void nfsd_recall_delegations(u64);
18#else /* CONFIG_NFSD_FAULT_INJECTION */
19static inline int nfsd_fault_inject_init(void) { return 0; }
20static inline void nfsd_fault_inject_cleanup(void) {}
21static inline void nfsd_forget_clients(u64 num) {}
22static inline void nfsd_forget_locks(u64 num) {}
23static inline void nfsd_forget_openowners(u64 num) {}
24static inline void nfsd_forget_delegations(u64 num) {}
25static inline void nfsd_recall_delegations(u64 num) {}
26#endif /* CONFIG_NFSD_FAULT_INJECTION */
27
28#endif /* LINUX_NFSD_FAULT_INJECT_H */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7748d6a18d97..6f3ebb48b12f 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -718,7 +718,7 @@ int set_callback_cred(void)
718{ 718{
719 if (callback_cred) 719 if (callback_cred)
720 return 0; 720 return 0;
721 callback_cred = rpc_lookup_machine_cred(); 721 callback_cred = rpc_lookup_machine_cred("nfs");
722 if (!callback_cred) 722 if (!callback_cred)
723 return -ENOMEM; 723 return -ENOMEM;
724 return 0; 724 return 0;
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 55780a22fdbd..94096273cd6c 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -36,6 +36,7 @@
36#include <linux/seq_file.h> 36#include <linux/seq_file.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <net/net_namespace.h>
39#include "idmap.h" 40#include "idmap.h"
40#include "nfsd.h" 41#include "nfsd.h"
41 42
@@ -466,20 +467,20 @@ nfsd_idmap_init(void)
466{ 467{
467 int rv; 468 int rv;
468 469
469 rv = cache_register(&idtoname_cache); 470 rv = cache_register_net(&idtoname_cache, &init_net);
470 if (rv) 471 if (rv)
471 return rv; 472 return rv;
472 rv = cache_register(&nametoid_cache); 473 rv = cache_register_net(&nametoid_cache, &init_net);
473 if (rv) 474 if (rv)
474 cache_unregister(&idtoname_cache); 475 cache_unregister_net(&idtoname_cache, &init_net);
475 return rv; 476 return rv;
476} 477}
477 478
478void 479void
479nfsd_idmap_shutdown(void) 480nfsd_idmap_shutdown(void)
480{ 481{
481 cache_unregister(&idtoname_cache); 482 cache_unregister_net(&idtoname_cache, &init_net);
482 cache_unregister(&nametoid_cache); 483 cache_unregister_net(&nametoid_cache, &init_net);
483} 484}
484 485
485static int 486static int
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index c5e28ed8bca0..896da74ec563 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -266,10 +266,6 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
266{ 266{
267 __be32 status; 267 __be32 status;
268 268
269 /* Only reclaims from previously confirmed clients are valid */
270 if ((status = nfs4_check_open_reclaim(&open->op_clientid)))
271 return status;
272
273 /* We don't know the target directory, and therefore can not 269 /* We don't know the target directory, and therefore can not
274 * set the change info 270 * set the change info
275 */ 271 */
@@ -373,6 +369,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
373 break; 369 break;
374 case NFS4_OPEN_CLAIM_PREVIOUS: 370 case NFS4_OPEN_CLAIM_PREVIOUS:
375 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; 371 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
372 status = nfs4_check_open_reclaim(&open->op_clientid);
373 if (status)
374 goto out;
376 case NFS4_OPEN_CLAIM_FH: 375 case NFS4_OPEN_CLAIM_FH:
377 case NFS4_OPEN_CLAIM_DELEG_CUR_FH: 376 case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
378 status = do_open_fhandle(rqstp, &cstate->current_fh, 377 status = do_open_fhandle(rqstp, &cstate->current_fh,
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 80a0be9ed008..0b3e875d1abd 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -117,8 +117,7 @@ out_no_tfm:
117 return status; 117 return status;
118} 118}
119 119
120int 120void nfsd4_create_clid_dir(struct nfs4_client *clp)
121nfsd4_create_clid_dir(struct nfs4_client *clp)
122{ 121{
123 const struct cred *original_cred; 122 const struct cred *original_cred;
124 char *dname = clp->cl_recdir; 123 char *dname = clp->cl_recdir;
@@ -127,13 +126,14 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
127 126
128 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); 127 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
129 128
130 if (!rec_file || clp->cl_firststate) 129 if (clp->cl_firststate)
131 return 0; 130 return;
132
133 clp->cl_firststate = 1; 131 clp->cl_firststate = 1;
132 if (!rec_file)
133 return;
134 status = nfs4_save_creds(&original_cred); 134 status = nfs4_save_creds(&original_cred);
135 if (status < 0) 135 if (status < 0)
136 return status; 136 return;
137 137
138 dir = rec_file->f_path.dentry; 138 dir = rec_file->f_path.dentry;
139 /* lock the parent */ 139 /* lock the parent */
@@ -144,8 +144,15 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
144 status = PTR_ERR(dentry); 144 status = PTR_ERR(dentry);
145 goto out_unlock; 145 goto out_unlock;
146 } 146 }
147 status = -EEXIST;
148 if (dentry->d_inode) 147 if (dentry->d_inode)
148 /*
149 * In the 4.1 case, where we're called from
150 * reclaim_complete(), records from the previous reboot
151 * may still be left, so this is OK.
152 *
153 * In the 4.0 case, we should never get here; but we may
154 * as well be forgiving and just succeed silently.
155 */
149 goto out_put; 156 goto out_put;
150 status = mnt_want_write_file(rec_file); 157 status = mnt_want_write_file(rec_file);
151 if (status) 158 if (status)
@@ -164,7 +171,6 @@ out_unlock:
164 " and is writeable", status, 171 " and is writeable", status,
165 user_recovery_dirname); 172 user_recovery_dirname);
166 nfs4_reset_creds(original_cred); 173 nfs4_reset_creds(original_cred);
167 return status;
168} 174}
169 175
170typedef int (recdir_func)(struct dentry *, struct dentry *); 176typedef int (recdir_func)(struct dentry *, struct dentry *);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9ca16dc09e04..e8c98f009670 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -49,12 +49,20 @@
49time_t nfsd4_lease = 90; /* default lease time */ 49time_t nfsd4_lease = 90; /* default lease time */
50time_t nfsd4_grace = 90; 50time_t nfsd4_grace = 90;
51static time_t boot_time; 51static time_t boot_time;
52static stateid_t zerostateid; /* bits all 0 */ 52
53static stateid_t onestateid; /* bits all 1 */ 53#define all_ones {{~0,~0},~0}
54static const stateid_t one_stateid = {
55 .si_generation = ~0,
56 .si_opaque = all_ones,
57};
58static const stateid_t zero_stateid = {
59 /* all fields zero */
60};
61
54static u64 current_sessionid = 1; 62static u64 current_sessionid = 1;
55 63
56#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) 64#define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t)))
57#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) 65#define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t)))
58 66
59/* forward declarations */ 67/* forward declarations */
60static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); 68static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner);
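
The memset() in nfs4_state_init() goes away because the special stateids above become compile-time constants built with designated initializers. A standalone sketch of the same trick (field names shortened; not the kernel's struct layout):

#include <assert.h>
#include <string.h>

typedef struct {
	unsigned gen;
	unsigned opaque[3];
} stateid_t;

#define all_ones { ~0u, ~0u, ~0u }

static const stateid_t one_stateid = {
	.gen	= ~0u,
	.opaque	= all_ones,
};
static const stateid_t zero_stateid = { 0 };	/* all fields zero */

int main(void)
{
	stateid_t ones;

	memset(&ones, 0xff, sizeof(ones));	/* the old runtime init */
	assert(memcmp(&one_stateid, &ones, sizeof(ones)) == 0);
	assert(zero_stateid.gen == 0);
	return 0;
}
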
@@ -133,21 +141,21 @@ unsigned int max_delegations;
133 * Open owner state (share locks) 141 * Open owner state (share locks)
134 */ 142 */
135 143
136/* hash tables for open owners */ 144/* hash tables for lock and open owners */
137#define OPEN_OWNER_HASH_BITS 8 145#define OWNER_HASH_BITS 8
138#define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS) 146#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS)
139#define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1) 147#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1)
140 148
141static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) 149static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
142{ 150{
143 unsigned int ret; 151 unsigned int ret;
144 152
145 ret = opaque_hashval(ownername->data, ownername->len); 153 ret = opaque_hashval(ownername->data, ownername->len);
146 ret += clientid; 154 ret += clientid;
147 return ret & OPEN_OWNER_HASH_MASK; 155 return ret & OWNER_HASH_MASK;
148} 156}
149 157
150static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; 158static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
151 159
152/* hash table for nfs4_file */ 160/* hash table for nfs4_file */
153#define FILE_HASH_BITS 8 161#define FILE_HASH_BITS 8
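
With open owners and lock owners now sharing one table, the bucket index above is (hash of the opaque owner string + client id) masked to the table size. A standalone sketch with a stand-in byte hash (the kernel's opaque_hashval differs):

#include <stddef.h>

#define OWNER_HASH_BITS		8
#define OWNER_HASH_SIZE		(1 << OWNER_HASH_BITS)
#define OWNER_HASH_MASK		(OWNER_HASH_SIZE - 1)

/* Stand-in byte hash, for illustration only. */
static unsigned int byte_hash(const unsigned char *p, size_t len)
{
	unsigned int h = 0;

	while (len--)
		h = h * 31 + *p++;
	return h;
}

static unsigned int owner_bucket(unsigned int clientid,
				 const unsigned char *owner, size_t len)
{
	return (byte_hash(owner, len) + clientid) & OWNER_HASH_MASK;
}
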
@@ -514,6 +522,7 @@ static void unhash_lockowner(struct nfs4_lockowner *lo)
514 522
515 list_del(&lo->lo_owner.so_strhash); 523 list_del(&lo->lo_owner.so_strhash);
516 list_del(&lo->lo_perstateid); 524 list_del(&lo->lo_perstateid);
525 list_del(&lo->lo_owner_ino_hash);
517 while (!list_empty(&lo->lo_owner.so_stateids)) { 526 while (!list_empty(&lo->lo_owner.so_stateids)) {
518 stp = list_first_entry(&lo->lo_owner.so_stateids, 527 stp = list_first_entry(&lo->lo_owner.so_stateids,
519 struct nfs4_ol_stateid, st_perstateowner); 528 struct nfs4_ol_stateid, st_perstateowner);
@@ -985,12 +994,11 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
985 clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL); 994 clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL);
986 if (clp == NULL) 995 if (clp == NULL)
987 return NULL; 996 return NULL;
988 clp->cl_name.data = kmalloc(name.len, GFP_KERNEL); 997 clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL);
989 if (clp->cl_name.data == NULL) { 998 if (clp->cl_name.data == NULL) {
990 kfree(clp); 999 kfree(clp);
991 return NULL; 1000 return NULL;
992 } 1001 }
993 memcpy(clp->cl_name.data, name.data, name.len);
994 clp->cl_name.len = name.len; 1002 clp->cl_name.len = name.len;
995 return clp; 1003 return clp;
996} 1004}
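
kmemdup() collapses the kmalloc()+memcpy() pair above into one call. A userspace equivalent for illustration (the kernel version additionally takes gfp flags):

#include <stdlib.h>
#include <string.h>

static void *memdup(const void *src, size_t len)
{
	void *p = malloc(len);

	if (p)
		memcpy(p, src, len);
	return p;
}
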
@@ -1058,7 +1066,6 @@ expire_client(struct nfs4_client *clp)
1058 spin_unlock(&recall_lock); 1066 spin_unlock(&recall_lock);
1059 while (!list_empty(&reaplist)) { 1067 while (!list_empty(&reaplist)) {
1060 dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); 1068 dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
1061 list_del_init(&dp->dl_recall_lru);
1062 unhash_delegation(dp); 1069 unhash_delegation(dp);
1063 } 1070 }
1064 while (!list_empty(&clp->cl_openowners)) { 1071 while (!list_empty(&clp->cl_openowners)) {
@@ -2301,7 +2308,7 @@ nfsd4_free_slabs(void)
2301 nfsd4_free_slab(&deleg_slab); 2308 nfsd4_free_slab(&deleg_slab);
2302} 2309}
2303 2310
2304static int 2311int
2305nfsd4_init_slabs(void) 2312nfsd4_init_slabs(void)
2306{ 2313{
2307 openowner_slab = kmem_cache_create("nfsd4_openowners", 2314 openowner_slab = kmem_cache_create("nfsd4_openowners",
@@ -2373,7 +2380,7 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj
2373 2380
2374static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) 2381static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
2375{ 2382{
2376 list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]); 2383 list_add(&oo->oo_owner.so_strhash, &ownerstr_hashtbl[strhashval]);
2377 list_add(&oo->oo_perclient, &clp->cl_openowners); 2384 list_add(&oo->oo_perclient, &clp->cl_openowners);
2378} 2385}
2379 2386
@@ -2436,7 +2443,9 @@ find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open)
2436 struct nfs4_stateowner *so; 2443 struct nfs4_stateowner *so;
2437 struct nfs4_openowner *oo; 2444 struct nfs4_openowner *oo;
2438 2445
2439 list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) { 2446 list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
2447 if (!so->so_is_open_owner)
2448 continue;
2440 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { 2449 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
2441 oo = openowner(so); 2450 oo = openowner(so);
2442 renew_client(oo->oo_owner.so_client); 2451 renew_client(oo->oo_owner.so_client);
@@ -2580,7 +2589,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2580 if (open->op_file == NULL) 2589 if (open->op_file == NULL)
2581 return nfserr_jukebox; 2590 return nfserr_jukebox;
2582 2591
2583 strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner); 2592 strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner);
2584 oo = find_openstateowner_str(strhashval, open); 2593 oo = find_openstateowner_str(strhashval, open);
2585 open->op_openowner = oo; 2594 open->op_openowner = oo;
2586 if (!oo) { 2595 if (!oo) {
@@ -3123,7 +3132,6 @@ nfs4_laundromat(void)
3123 spin_unlock(&recall_lock); 3132 spin_unlock(&recall_lock);
3124 list_for_each_safe(pos, next, &reaplist) { 3133 list_for_each_safe(pos, next, &reaplist) {
3125 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 3134 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
3126 list_del_init(&dp->dl_recall_lru);
3127 unhash_delegation(dp); 3135 unhash_delegation(dp);
3128 } 3136 }
3129 test_val = nfsd4_lease; 3137 test_val = nfsd4_lease;
@@ -3718,13 +3726,11 @@ out:
3718} 3726}
3719 3727
3720 3728
3721/*
3722 * Lock owner state (byte-range locks)
3723 */
3724#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) 3729#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start))
3725#define LOCK_HASH_BITS 8 3730
3726#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) 3731#define LOCKOWNER_INO_HASH_BITS 8
3727#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) 3732#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)
3733#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1)
3728 3734
3729static inline u64 3735static inline u64
3730end_offset(u64 start, u64 len) 3736end_offset(u64 start, u64 len)
@@ -3746,16 +3752,14 @@ last_byte_offset(u64 start, u64 len)
3746 return end > start ? end - 1: NFS4_MAX_UINT64; 3752 return end > start ? end - 1: NFS4_MAX_UINT64;
3747} 3753}
3748 3754
3749static inline unsigned int 3755static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername)
3750lock_ownerstr_hashval(struct inode *inode, u32 cl_id,
3751 struct xdr_netobj *ownername)
3752{ 3756{
3753 return (file_hashval(inode) + cl_id 3757 return (file_hashval(inode) + cl_id
3754 + opaque_hashval(ownername->data, ownername->len)) 3758 + opaque_hashval(ownername->data, ownername->len))
3755 & LOCK_HASH_MASK; 3759 & LOCKOWNER_INO_HASH_MASK;
3756} 3760}
3757 3761
3758static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; 3762static struct list_head lockowner_ino_hashtbl[LOCKOWNER_INO_HASH_SIZE];
3759 3763
3760/* 3764/*
3761 * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that 3765 * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
@@ -3809,23 +3813,39 @@ nevermind:
3809 deny->ld_type = NFS4_WRITE_LT; 3813 deny->ld_type = NFS4_WRITE_LT;
3810} 3814}
3811 3815
3816static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, clientid_t *clid, struct xdr_netobj *owner)
3817{
3818 struct nfs4_ol_stateid *lst;
3819
3820 if (!same_owner_str(&lo->lo_owner, owner, clid))
3821 return false;
3822 lst = list_first_entry(&lo->lo_owner.so_stateids,
3823 struct nfs4_ol_stateid, st_perstateowner);
3824 return lst->st_file->fi_inode == inode;
3825}
3826
3812static struct nfs4_lockowner * 3827static struct nfs4_lockowner *
3813find_lockowner_str(struct inode *inode, clientid_t *clid, 3828find_lockowner_str(struct inode *inode, clientid_t *clid,
3814 struct xdr_netobj *owner) 3829 struct xdr_netobj *owner)
3815{ 3830{
3816 unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); 3831 unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner);
3817 struct nfs4_stateowner *op; 3832 struct nfs4_lockowner *lo;
3818 3833
3819 list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { 3834 list_for_each_entry(lo, &lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) {
3820 if (same_owner_str(op, owner, clid)) 3835 if (same_lockowner_ino(lo, inode, clid, owner))
3821 return lockowner(op); 3836 return lo;
3822 } 3837 }
3823 return NULL; 3838 return NULL;
3824} 3839}
3825 3840
3826static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) 3841static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp)
3827{ 3842{
3828 list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]); 3843 struct inode *inode = open_stp->st_file->fi_inode;
3844 unsigned int inohash = lockowner_ino_hashval(inode,
3845 clp->cl_clientid.cl_id, &lo->lo_owner.so_owner);
3846
3847 list_add(&lo->lo_owner.so_strhash, &ownerstr_hashtbl[strhashval]);
3848 list_add(&lo->lo_owner_ino_hash, &lockowner_ino_hashtbl[inohash]);
3829 list_add(&lo->lo_perstateid, &open_stp->st_lockowners); 3849 list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
3830} 3850}
3831 3851
@@ -3834,7 +3854,7 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s
3834 * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has 3854 * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has
3835 * occurred. 3855 * occurred.
3836 * 3856 *
3837 * strhashval = lock_ownerstr_hashval 3857 * strhashval = ownerstr_hashval
3838 */ 3858 */
3839 3859
3840static struct nfs4_lockowner * 3860static struct nfs4_lockowner *
@@ -3892,6 +3912,37 @@ static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
3892 __set_bit(access, &lock_stp->st_access_bmap); 3912 __set_bit(access, &lock_stp->st_access_bmap);
3893} 3913}
3894 3914
3915__be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new)
3916{
3917 struct nfs4_file *fi = ost->st_file;
3918 struct nfs4_openowner *oo = openowner(ost->st_stateowner);
3919 struct nfs4_client *cl = oo->oo_owner.so_client;
3920 struct nfs4_lockowner *lo;
3921 unsigned int strhashval;
3922
3923 lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, &lock->v.new.owner);
3924 if (lo) {
3925 if (!cstate->minorversion)
3926 return nfserr_bad_seqid;
3927 /* XXX: a lockowner always has exactly one stateid: */
3928 *lst = list_first_entry(&lo->lo_owner.so_stateids,
3929 struct nfs4_ol_stateid, st_perstateowner);
3930 return nfs_ok;
3931 }
3932 strhashval = ownerstr_hashval(cl->cl_clientid.cl_id,
3933 &lock->v.new.owner);
3934 lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock);
3935 if (lo == NULL)
3936 return nfserr_jukebox;
3937 *lst = alloc_init_lock_stateid(lo, fi, ost);
3938 if (*lst == NULL) {
3939 release_lockowner(lo);
3940 return nfserr_jukebox;
3941 }
3942 *new = true;
3943 return nfs_ok;
3944}
3945
3895/* 3946/*
3896 * LOCK operation 3947 * LOCK operation
3897 */ 3948 */
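
lookup_or_create_lock_state() above is a find-or-create helper. The same shape in a standalone sketch (the table, key, and single bucket are stand-ins for the kernel's hashed lists):

#include <stdlib.h>

struct entry {
	struct entry *next;
	int key;
};

static struct entry *table;	/* single bucket for brevity */

static struct entry *find_entry(int key)
{
	struct entry *e;

	for (e = table; e; e = e->next)
		if (e->key == key)
			return e;
	return NULL;
}

/*
 * Return the existing entry or create and link a new one; *is_new
 * tells the caller whether cleanup is needed on a later failure,
 * like the new_state flag threaded through nfsd4_lock().
 */
static struct entry *lookup_or_create(int key, int *is_new)
{
	struct entry *e = find_entry(key);

	*is_new = 0;
	if (e)
		return e;
	e = calloc(1, sizeof(*e));
	if (!e)
		return NULL;
	e->key = key;
	e->next = table;
	table = e;
	*is_new = 1;
	return e;
}
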
@@ -3907,7 +3958,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3907 struct file_lock file_lock; 3958 struct file_lock file_lock;
3908 struct file_lock conflock; 3959 struct file_lock conflock;
3909 __be32 status = 0; 3960 __be32 status = 0;
3910 unsigned int strhashval; 3961 bool new_state = false;
3911 int lkflg; 3962 int lkflg;
3912 int err; 3963 int err;
3913 3964
@@ -3933,10 +3984,15 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3933 * lock stateid. 3984 * lock stateid.
3934 */ 3985 */
3935 struct nfs4_ol_stateid *open_stp = NULL; 3986 struct nfs4_ol_stateid *open_stp = NULL;
3936 3987
3988 if (nfsd4_has_session(cstate))
3989 /* See rfc 5661 18.10.3: given clientid is ignored: */
3990 memcpy(&lock->v.new.clientid,
3991 &cstate->session->se_client->cl_clientid,
3992 sizeof(clientid_t));
3993
3937 status = nfserr_stale_clientid; 3994 status = nfserr_stale_clientid;
3938 if (!nfsd4_has_session(cstate) && 3995 if (STALE_CLIENTID(&lock->lk_new_clientid))
3939 STALE_CLIENTID(&lock->lk_new_clientid))
3940 goto out; 3996 goto out;
3941 3997
3942 /* validate and update open stateid and open seqid */ 3998 /* validate and update open stateid and open seqid */
@@ -3948,25 +4004,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3948 goto out; 4004 goto out;
3949 open_sop = openowner(open_stp->st_stateowner); 4005 open_sop = openowner(open_stp->st_stateowner);
3950 status = nfserr_bad_stateid; 4006 status = nfserr_bad_stateid;
3951 if (!nfsd4_has_session(cstate) && 4007 if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
3952 !same_clid(&open_sop->oo_owner.so_client->cl_clientid,
3953 &lock->v.new.clientid)) 4008 &lock->v.new.clientid))
3954 goto out; 4009 goto out;
3955 /* create lockowner and lock stateid */ 4010 status = lookup_or_create_lock_state(cstate, open_stp, lock,
3956 fp = open_stp->st_file; 4011 &lock_stp, &new_state);
3957 strhashval = lock_ownerstr_hashval(fp->fi_inode, 4012 if (status)
3958 open_sop->oo_owner.so_client->cl_clientid.cl_id,
3959 &lock->v.new.owner);
3960 /* XXX: Do we need to check for duplicate stateowners on
3961 * the same file, or should they just be allowed (and
3962 * create new stateids)? */
3963 status = nfserr_jukebox;
3964 lock_sop = alloc_init_lock_stateowner(strhashval,
3965 open_sop->oo_owner.so_client, open_stp, lock);
3966 if (lock_sop == NULL)
3967 goto out;
3968 lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp);
3969 if (lock_stp == NULL)
3970 goto out; 4013 goto out;
3971 } else { 4014 } else {
3972 /* lock (lock owner + lock stateid) already exists */ 4015 /* lock (lock owner + lock stateid) already exists */
@@ -3976,10 +4019,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3976 NFS4_LOCK_STID, &lock_stp); 4019 NFS4_LOCK_STID, &lock_stp);
3977 if (status) 4020 if (status)
3978 goto out; 4021 goto out;
3979 lock_sop = lockowner(lock_stp->st_stateowner);
3980 fp = lock_stp->st_file;
3981 } 4022 }
3982 /* lock_sop and lock_stp have been created or found */ 4023 lock_sop = lockowner(lock_stp->st_stateowner);
4024 fp = lock_stp->st_file;
3983 4025
3984 lkflg = setlkflg(lock->lk_type); 4026 lkflg = setlkflg(lock->lk_type);
3985 status = nfs4_check_openmode(lock_stp, lkflg); 4027 status = nfs4_check_openmode(lock_stp, lkflg);
@@ -4054,7 +4096,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4054 break; 4096 break;
4055 } 4097 }
4056out: 4098out:
4057 if (status && lock->lk_is_new && lock_sop) 4099 if (status && new_state)
4058 release_lockowner(lock_sop); 4100 release_lockowner(lock_sop);
4059 if (!cstate->replay_owner) 4101 if (!cstate->replay_owner)
4060 nfs4_unlock_state(); 4102 nfs4_unlock_state();
@@ -4251,7 +4293,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
4251 struct nfs4_ol_stateid *stp; 4293 struct nfs4_ol_stateid *stp;
4252 struct xdr_netobj *owner = &rlockowner->rl_owner; 4294 struct xdr_netobj *owner = &rlockowner->rl_owner;
4253 struct list_head matches; 4295 struct list_head matches;
4254 int i; 4296 unsigned int hashval = ownerstr_hashval(clid->cl_id, owner);
4255 __be32 status; 4297 __be32 status;
4256 4298
4257 dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", 4299 dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
@@ -4266,22 +4308,19 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
4266 nfs4_lock_state(); 4308 nfs4_lock_state();
4267 4309
4268 status = nfserr_locks_held; 4310 status = nfserr_locks_held;
4269 /* XXX: we're doing a linear search through all the lockowners.
4270 * Yipes! For now we'll just hope clients aren't really using
4271 * release_lockowner much, but eventually we have to fix these
4272 * data structures. */
4273 INIT_LIST_HEAD(&matches); 4311 INIT_LIST_HEAD(&matches);
4274 for (i = 0; i < LOCK_HASH_SIZE; i++) { 4312
4275 list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) { 4313 list_for_each_entry(sop, &ownerstr_hashtbl[hashval], so_strhash) {
4276 if (!same_owner_str(sop, owner, clid)) 4314 if (sop->so_is_open_owner)
4277 continue; 4315 continue;
4278 list_for_each_entry(stp, &sop->so_stateids, 4316 if (!same_owner_str(sop, owner, clid))
4279 st_perstateowner) { 4317 continue;
4280 lo = lockowner(sop); 4318 list_for_each_entry(stp, &sop->so_stateids,
4281 if (check_for_locks(stp->st_file, lo)) 4319 st_perstateowner) {
4282 goto out; 4320 lo = lockowner(sop);
4283 list_add(&lo->lo_list, &matches); 4321 if (check_for_locks(stp->st_file, lo))
4284 } 4322 goto out;
4323 list_add(&lo->lo_list, &matches);
4285 } 4324 }
4286 } 4325 }
4287 /* Clients probably won't expect us to return with some (but not all) 4326 /* Clients probably won't expect us to return with some (but not all)
@@ -4394,16 +4433,127 @@ nfs4_check_open_reclaim(clientid_t *clid)
 	return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad;
 }
 
+#ifdef CONFIG_NFSD_FAULT_INJECTION
+
+void nfsd_forget_clients(u64 num)
+{
+	struct nfs4_client *clp, *next;
+	int count = 0;
+
+	nfs4_lock_state();
+	list_for_each_entry_safe(clp, next, &client_lru, cl_lru) {
+		nfsd4_remove_clid_dir(clp);
+		expire_client(clp);
+		if (++count == num)
+			break;
+	}
+	nfs4_unlock_state();
+
+	printk(KERN_INFO "NFSD: Forgot %d clients", count);
+}
+
+static void release_lockowner_sop(struct nfs4_stateowner *sop)
+{
+	release_lockowner(lockowner(sop));
+}
+
+static void release_openowner_sop(struct nfs4_stateowner *sop)
+{
+	release_openowner(openowner(sop));
+}
+
+static int nfsd_release_n_owners(u64 num, bool is_open_owner,
+				void (*release_sop)(struct nfs4_stateowner *))
+{
+	int i, count = 0;
+	struct nfs4_stateowner *sop, *next;
+
+	for (i = 0; i < OWNER_HASH_SIZE; i++) {
+		list_for_each_entry_safe(sop, next, &ownerstr_hashtbl[i], so_strhash) {
+			if (sop->so_is_open_owner != is_open_owner)
+				continue;
+			release_sop(sop);
+			if (++count == num)
+				return count;
+		}
+	}
+	return count;
+}
+
+void nfsd_forget_locks(u64 num)
+{
+	int count;
+
+	nfs4_lock_state();
+	count = nfsd_release_n_owners(num, false, release_lockowner_sop);
+	nfs4_unlock_state();
+
+	printk(KERN_INFO "NFSD: Forgot %d locks", count);
+}
+
+void nfsd_forget_openowners(u64 num)
+{
+	int count;
+
+	nfs4_lock_state();
+	count = nfsd_release_n_owners(num, true, release_openowner_sop);
+	nfs4_unlock_state();
+
+	printk(KERN_INFO "NFSD: Forgot %d open owners", count);
+}
+
+int nfsd_process_n_delegations(u64 num, void (*deleg_func)(struct nfs4_delegation *))
+{
+	int i, count = 0;
+	struct nfs4_file *fp, *fnext;
+	struct nfs4_delegation *dp, *dnext;
+
+	for (i = 0; i < FILE_HASH_SIZE; i++) {
+		list_for_each_entry_safe(fp, fnext, &file_hashtbl[i], fi_hash) {
+			list_for_each_entry_safe(dp, dnext, &fp->fi_delegations, dl_perfile) {
+				deleg_func(dp);
+				if (++count == num)
+					return count;
+			}
+		}
+	}
+
+	return count;
+}
+
+void nfsd_forget_delegations(u64 num)
+{
+	unsigned int count;
+
+	nfs4_lock_state();
+	count = nfsd_process_n_delegations(num, unhash_delegation);
+	nfs4_unlock_state();
+
+	printk(KERN_INFO "NFSD: Forgot %d delegations", count);
+}
+
+void nfsd_recall_delegations(u64 num)
+{
+	unsigned int count;
+
+	nfs4_lock_state();
+	spin_lock(&recall_lock);
+	count = nfsd_process_n_delegations(num, nfsd_break_one_deleg);
+	spin_unlock(&recall_lock);
+	nfs4_unlock_state();
+
+	printk(KERN_INFO "NFSD: Recalled %d delegations", count);
+}
+
+#endif /* CONFIG_NFSD_FAULT_INJECTION */
+
 /* initialization to perform at module load time: */
 
-int
+void
 nfs4_state_init(void)
 {
-	int i, status;
+	int i;
 
-	status = nfsd4_init_slabs();
-	if (status)
-		return status;
 	for (i = 0; i < CLIENT_HASH_SIZE; i++) {
 		INIT_LIST_HEAD(&conf_id_hashtbl[i]);
 		INIT_LIST_HEAD(&conf_str_hashtbl[i]);
@@ -4416,18 +4566,15 @@ nfs4_state_init(void)
 	for (i = 0; i < FILE_HASH_SIZE; i++) {
 		INIT_LIST_HEAD(&file_hashtbl[i]);
 	}
-	for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) {
-		INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]);
-	}
-	for (i = 0; i < LOCK_HASH_SIZE; i++) {
-		INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]);
-	}
-	memset(&onestateid, ~0, sizeof(stateid_t));
+	for (i = 0; i < OWNER_HASH_SIZE; i++) {
+		INIT_LIST_HEAD(&ownerstr_hashtbl[i]);
+	}
+	for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&lockowner_ino_hashtbl[i]);
 	INIT_LIST_HEAD(&close_lru);
 	INIT_LIST_HEAD(&client_lru);
 	INIT_LIST_HEAD(&del_recall_lru);
 	reclaim_str_hashtbl_size = 0;
-	return 0;
 }
 
 static void
@@ -4526,7 +4673,6 @@ __nfs4_state_shutdown(void)
 	spin_unlock(&recall_lock);
 	list_for_each_safe(pos, next, &reaplist) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
-		list_del_init(&dp->dl_recall_lru);
 		unhash_delegation(dp);
 	}
 
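Note: the forget/recall helpers above are compiled only under CONFIG_NFSD_FAULT_INJECTION and are meant to be driven from userspace via control files created in fs/nfsd/fault_inject.c, which this diff does not show. A minimal sketch of the kind of debugfs wiring involved; every name below is an illustrative assumption, not the commit's actual code:

/* Hypothetical debugfs hook for nfsd_forget_clients(); the real
 * control files live in fs/nfsd/fault_inject.c (not shown here). */
#include <linux/debugfs.h>
#include <linux/fs.h>

static int forget_clients_set(void *data, u64 val)
{
	nfsd_forget_clients(val);	/* expire up to 'val' clients */
	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(forget_clients_fops, NULL, forget_clients_set, "%llu\n");

static int __init fault_inject_sketch_init(void)
{
	struct dentry *dir = debugfs_create_dir("nfsd", NULL);

	if (!dir)
		return -ENOMEM;
	/* trigger with: echo 1 > /sys/kernel/debug/nfsd/forget_clients */
	debugfs_create_file("forget_clients", 0200, dir, NULL,
			    &forget_clients_fops);
	return 0;
}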
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index b6fa792d6b85..0ec5a1b9700e 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -215,10 +215,9 @@ defer_free(struct nfsd4_compoundargs *argp,
 static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
 {
 	if (p == argp->tmp) {
-		p = kmalloc(nbytes, GFP_KERNEL);
+		p = kmemdup(argp->tmp, nbytes, GFP_KERNEL);
 		if (!p)
 			return NULL;
-		memcpy(p, argp->tmp, nbytes);
 	} else {
 		BUG_ON(p != argp->tmpp);
 		argp->tmpp = NULL;
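The savemem() change above folds an open-coded allocate-and-copy pair into kmemdup(). Sketch of the equivalence (not part of the patch):

/* kmemdup(src, len, GFP_KERNEL) behaves like this open-coded helper: */
void *dup_open_coded(const void *src, size_t len)
{
	void *p = kmalloc(len, GFP_KERNEL);

	if (p)
		memcpy(p, src, len);
	return p;
}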
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index bb4a11d58a5a..748eda93ce59 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -18,6 +18,7 @@
 #include "idmap.h"
 #include "nfsd.h"
 #include "cache.h"
+#include "fault_inject.h"
 
 /*
  * We have a single directory with several nodes in it.
@@ -1128,9 +1129,13 @@ static int __init init_nfsd(void)
 	int retval;
 	printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n");
 
-	retval = nfs4_state_init(); /* nfs4 locking state */
+	retval = nfsd4_init_slabs();
 	if (retval)
 		return retval;
+	nfs4_state_init();
+	retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
+	if (retval)
+		goto out_free_slabs;
 	nfsd_stat_init();	/* Statistics */
 	retval = nfsd_reply_cache_init();
 	if (retval)
@@ -1161,6 +1166,8 @@ out_free_cache:
 	nfsd_reply_cache_shutdown();
 out_free_stat:
 	nfsd_stat_shutdown();
+	nfsd_fault_inject_cleanup();
+out_free_slabs:
 	nfsd4_free_slabs();
 	return retval;
 }
@@ -1175,6 +1182,7 @@ static void __exit exit_nfsd(void)
 	nfsd_lockd_shutdown();
 	nfsd_idmap_shutdown();
 	nfsd4_free_slabs();
+	nfsd_fault_inject_cleanup();
 	unregister_filesystem(&nfsd_fs_type);
 }
 
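init_nfsd() now allocates the nfs4 slabs before anything else and unwinds in strict reverse order on failure; note how the new out_free_slabs label sits below nfsd_fault_inject_cleanup() so a later failure tears down both. A generic sketch of the goto-unwind idiom used here (all names are placeholders):

static int __init my_init(void)
{
	int err;

	err = init_a();
	if (err)
		return err;
	err = init_b();
	if (err)
		goto out_a;
	err = init_c();
	if (err)
		goto out_b;
	return 0;

out_b:	/* undo in reverse order of initialization */
	cleanup_b();
out_a:
	cleanup_a();
	return err;
}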
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 58134a23fdfb..1d1e8589b4ce 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -104,14 +104,16 @@ static inline int nfsd_v4client(struct svc_rqst *rq)
  */
 #ifdef CONFIG_NFSD_V4
 extern unsigned int max_delegations;
-int nfs4_state_init(void);
+void nfs4_state_init(void);
+int nfsd4_init_slabs(void);
 void nfsd4_free_slabs(void);
 int nfs4_state_start(void);
 void nfs4_state_shutdown(void);
 void nfs4_reset_lease(time_t leasetime);
 int nfs4_reset_recoverydir(char *recdir);
 #else
-static inline int nfs4_state_init(void) { return 0; }
+static inline void nfs4_state_init(void) { }
+static inline int nfsd4_init_slabs(void) { return 0; }
 static inline void nfsd4_free_slabs(void) { }
 static inline int nfs4_state_start(void) { return 0; }
 static inline void nfs4_state_shutdown(void) { }
@@ -338,15 +340,15 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
 }
 
 /* These will return ERR_INVAL if specified in GETATTR or READDIR. */
 #define NFSD_WRITEONLY_ATTRS_WORD1 \
-(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+	(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
 
 /* These are the only attrs allowed in CREATE/OPEN/SETATTR. */
 #define NFSD_WRITEABLE_ATTRS_WORD0 \
-(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL )
+	(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL)
 #define NFSD_WRITEABLE_ATTRS_WORD1 \
-(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
-	| FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+	(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
+	| FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
 #define NFSD_WRITEABLE_ATTRS_WORD2 0
 
 #define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index a3cf38476a1b..ffb5df1db94f 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -366,6 +366,7 @@ struct nfs4_openowner {
 
 struct nfs4_lockowner {
 	struct nfs4_stateowner	lo_owner; /* must be first element */
+	struct list_head	lo_owner_ino_hash; /* hash by owner,file */
 	struct list_head	lo_perstateid; /* for lockowners only */
 	struct list_head	lo_list; /* for temporary uses */
 };
@@ -482,7 +483,7 @@ extern void nfsd4_shutdown_recdir(void);
 extern int nfs4_client_to_reclaim(const char *name);
 extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
 extern void nfsd4_recdir_purge_old(void);
-extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
+extern void nfsd4_create_clid_dir(struct nfs4_client *clp);
 extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
 extern void release_session_client(struct nfsd4_session *);
 extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d25a723b68ad..edf6d3ed8777 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -594,8 +594,19 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
 	return error;
 }
 
-#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction."
-#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type"
+/*
+ * NFS junction information is stored in an extended attribute.
+ */
+#define NFSD_JUNCTION_XATTR_NAME XATTR_TRUSTED_PREFIX "junction.nfs"
+
+/**
+ * nfsd4_is_junction - Test if an object could be an NFS junction
+ *
+ * @dentry: object to test
+ *
+ * Returns 1 if "dentry" appears to contain NFS junction information.
+ * Otherwise 0 is returned.
+ */
 int nfsd4_is_junction(struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
@@ -606,7 +617,7 @@ int nfsd4_is_junction(struct dentry *dentry)
 		return 0;
 	if (!(inode->i_mode & S_ISVTX))
 		return 0;
-	if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0)
+	if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0)
 		return 0;
 	return 1;
 }
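Calling vfs_getxattr() with a NULL buffer and zero size asks only for the attribute's length, so the test above is a cheap existence probe for the junction xattr. The userspace getxattr(2) call follows the same convention; an illustrative probe (the xattr name matches the macro above; the helper itself is hypothetical):

#include <sys/xattr.h>

/* Returns nonzero if 'path' carries the NFS junction xattr; a size
 * query with a NULL buffer returns the value length, or -1 with
 * errno == ENODATA when the attribute is absent. */
int has_junction_xattr(const char *path)
{
	return getxattr(path, "trusted.junction.nfs", NULL, 0) > 0;
}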
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 44a88a9fa2c8..fea6bd5831dc 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -52,7 +52,7 @@ static const struct utf8_table utf8_table[] =
 #define SURROGATE_LOW	0x00000400
 #define SURROGATE_BITS	0x000003ff
 
-int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
+int utf8_to_utf32(const u8 *s, int inlen, unicode_t *pu)
 {
 	unsigned long l;
 	int c0, c, nc;
@@ -71,7 +71,7 @@ int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
 			*pu = (unicode_t) l;
 			return nc;
 		}
-		if (len <= nc)
+		if (inlen <= nc)
 			return -1;
 		s++;
 		c = (*s ^ 0x80) & 0xFF;
@@ -83,7 +83,7 @@ int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
 }
 EXPORT_SYMBOL(utf8_to_utf32);
 
-int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
+int utf32_to_utf8(unicode_t u, u8 *s, int maxout)
 {
 	unsigned long l;
 	int c, nc;
@@ -97,7 +97,7 @@ int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
 		return -1;
 
 	nc = 0;
-	for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) {
+	for (t = utf8_table; t->cmask && maxout; t++, maxout--) {
 		nc++;
 		if (l <= t->lmask) {
 			c = t->shift;
@@ -114,34 +114,57 @@ int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
 }
 EXPORT_SYMBOL(utf32_to_utf8);
 
-int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
+static inline void put_utf16(wchar_t *s, unsigned c, enum utf16_endian endian)
+{
+	switch (endian) {
+	default:
+		*s = (wchar_t) c;
+		break;
+	case UTF16_LITTLE_ENDIAN:
+		*s = __cpu_to_le16(c);
+		break;
+	case UTF16_BIG_ENDIAN:
+		*s = __cpu_to_be16(c);
+		break;
+	}
+}
+
+int utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian,
+		wchar_t *pwcs, int maxout)
 {
 	u16 *op;
 	int size;
 	unicode_t u;
 
 	op = pwcs;
-	while (*s && len > 0) {
+	while (inlen > 0 && maxout > 0 && *s) {
 		if (*s & 0x80) {
-			size = utf8_to_utf32(s, len, &u);
+			size = utf8_to_utf32(s, inlen, &u);
 			if (size < 0)
 				return -EINVAL;
+			s += size;
+			inlen -= size;
 
 			if (u >= PLANE_SIZE) {
+				if (maxout < 2)
+					break;
 				u -= PLANE_SIZE;
-				*op++ = (wchar_t) (SURROGATE_PAIR |
-						((u >> 10) & SURROGATE_BITS));
-				*op++ = (wchar_t) (SURROGATE_PAIR |
+				put_utf16(op++, SURROGATE_PAIR |
+						((u >> 10) & SURROGATE_BITS),
+						endian);
+				put_utf16(op++, SURROGATE_PAIR |
 						SURROGATE_LOW |
-						(u & SURROGATE_BITS));
+						(u & SURROGATE_BITS),
+						endian);
+				maxout -= 2;
 			} else {
-				*op++ = (wchar_t) u;
+				put_utf16(op++, u, endian);
+				maxout--;
 			}
-			s += size;
-			len -= size;
 		} else {
-			*op++ = *s++;
-			len--;
+			put_utf16(op++, *s++, endian);
+			inlen--;
+			maxout--;
 		}
 	}
 	return op - pwcs;
@@ -160,27 +183,27 @@ static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian)
 	}
 }
 
-int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian,
-		u8 *s, int maxlen)
+int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian,
+		u8 *s, int maxout)
 {
 	u8 *op;
 	int size;
 	unsigned long u, v;
 
 	op = s;
-	while (len > 0 && maxlen > 0) {
+	while (inlen > 0 && maxout > 0) {
 		u = get_utf16(*pwcs, endian);
 		if (!u)
 			break;
 		pwcs++;
-		len--;
+		inlen--;
 		if (u > 0x7f) {
 			if ((u & SURROGATE_MASK) == SURROGATE_PAIR) {
 				if (u & SURROGATE_LOW) {
 					/* Ignore character and move on */
 					continue;
 				}
-				if (len <= 0)
+				if (inlen <= 0)
 					break;
 				v = get_utf16(*pwcs, endian);
 				if ((v & SURROGATE_MASK) != SURROGATE_PAIR ||
@@ -191,18 +214,18 @@ int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian,
 				u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10)
 					+ (v & SURROGATE_BITS);
 				pwcs++;
-				len--;
+				inlen--;
 			}
-			size = utf32_to_utf8(u, op, maxlen);
+			size = utf32_to_utf8(u, op, maxout);
 			if (size == -1) {
 				/* Ignore character and move on */
 			} else {
 				op += size;
-				maxlen -= size;
+				maxout -= size;
 			}
 		} else {
 			*op++ = (u8) u;
-			maxlen--;
+			maxout--;
 		}
 	}
 	return op - s;
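utf8s_to_utf16s() now takes an explicit endianness plus an output limit, mirroring utf16s_to_utf8s(); callers that want CPU byte order pass UTF16_HOST_ENDIAN. A caller sketch against the new signature (the wrapper name is illustrative):

/* Convert a bounded UTF-8 name to little-endian UTF-16, as a VFAT-style
 * caller would. Returns code units written, or -EINVAL on bad UTF-8. */
static int name_to_utf16le(const char *name, int namelen,
			   wchar_t *out, int outlen)
{
	return utf8s_to_utf16s((const u8 *)name, namelen,
			       UTF16_LITTLE_ENDIAN, out, outlen);
}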
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index e14587d55689..f104d565b682 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -135,9 +135,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
 
 	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
 
-	/* 1 from caller and 1 for being on i_list/g_list */
-	BUG_ON(atomic_read(&mark->refcnt) < 2);
-
 	spin_lock(&group->mark_lock);
 
 	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
@@ -182,6 +179,11 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
 	iput(inode);
 
 	/*
+	 * We don't necessarily have a ref on mark from caller so the above iput
+	 * may have already destroyed it. Don't touch from now on.
+	 */
+
+	/*
 	 * it's possible that this group tried to destroy itself, but this
 	 * this mark was simultaneously being freed by inode. If that's the
 	 * case, we finish freeing the group here.
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 608be4516091..5a4a8af5c406 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3198,7 +3198,7 @@ MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparm
 MODULE_VERSION(NTFS_VERSION);
 MODULE_LICENSE("GPL");
 #ifdef DEBUG
-module_param(debug_msgs, bool, 0);
+module_param(debug_msgs, bint, 0);
 MODULE_PARM_DESC(debug_msgs, "Enable debug messages.");
 #endif
 
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index a5ebe421195f..286edf1e231f 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -827,8 +827,8 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
 		goto out;
 	}
 
-	rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name),
-			       &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN);
+	rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN,
+			       NULL, NULL, NULL, &fsdlm);
 	if (rc) {
 		ocfs2_live_connection_drop(control);
 		goto out;
diff --git a/fs/pipe.c b/fs/pipe.c
index f0e485d54e64..a932ced92a16 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1137,7 +1137,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
 	if (nr_pages < pipe->nrbufs)
 		return -EBUSY;
 
-	bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
+	bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
 	if (unlikely(!bufs))
 		return -ENOMEM;
 
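__GFP_NOWARN matters here because nr_pages derives from a user-supplied fcntl(F_SETPIPE_SZ) argument, so an unprivileged caller can legitimately request an allocation big enough to fail; the failure now surfaces as ENOMEM instead of a kernel allocation warning. Userspace view of the same path (illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];

	if (pipe(fds))
		return 1;
	/* an over-large size now fails quietly with ENOMEM/EPERM */
	if (fcntl(fds[0], F_SETPIPE_SZ, 1 << 20) < 0)
		perror("F_SETPIPE_SZ");
	printf("pipe size: %d\n", fcntl(fds[0], F_GETPIPE_SZ));
	return 0;
}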
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 8c344f037bd0..c602b8d20f06 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -380,7 +380,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 	state = *get_task_state(task);
 	vsize = eip = esp = 0;
-	permitted = ptrace_may_access(task, PTRACE_MODE_READ);
+	permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT);
 	mm = get_task_mm(task);
 	if (mm) {
 		vsize = task_vsize(mm);
@@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 	seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n",
 		pid_nr_ns(pid, ns),
 		tcomm,
 		state,
@@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		task->policy,
 		(unsigned long long)delayacct_blkio_ticks(task),
 		cputime_to_clock_t(gtime),
-		cputime_to_clock_t(cgtime));
+		cputime_to_clock_t(cgtime),
+		(mm && permitted) ? mm->start_data : 0,
+		(mm && permitted) ? mm->end_data : 0,
+		(mm && permitted) ? mm->start_brk : 0);
 	if (mm)
 		mmput(mm);
 	return 0;
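The format-string change appends three fields to /proc/<pid>/stat: start_data, end_data and start_brk, reported as 0 unless the reader passes the ptrace check. Counting from pid as field 1, they land at positions 45-47. An illustrative reader; the field positions assume exactly the layout shown above:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char buf[4096], *p, *tok;
	unsigned long v[3] = { 0, 0, 0 };
	int field = 2;	/* the token after "(comm)" is field 3 */
	FILE *f = fopen("/proc/self/stat", "r");

	if (!f || !fgets(buf, sizeof(buf), f))
		return 1;
	fclose(f);
	p = strrchr(buf, ')') + 2;	/* skip "pid (comm) " */
	for (tok = strtok(p, " "); tok; tok = strtok(NULL, " ")) {
		field++;
		if (field >= 45 && field <= 47)
			v[field - 45] = strtoul(tok, NULL, 10);
	}
	printf("start_data=%lu end_data=%lu start_brk=%lu\n",
	       v[0], v[1], v[2]);
	return 0;
}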
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a1dddda999f2..d4548dd49b02 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -83,9 +83,11 @@
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#include <linux/flex_array.h>
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
+#include <trace/events/oom.h>
 #include "internal.h"
 
 /* NOTE:
@@ -133,6 +135,8 @@ struct pid_entry {
 		NULL, &proc_single_file_operations,	\
 		{ .proc_show = show } )
 
+static int proc_fd_permission(struct inode *inode, int mask);
+
 /*
  * Count the number of hardlinks for the pid_entry table, excluding the .
  * and .. links.
@@ -165,9 +169,9 @@ static int get_task_root(struct task_struct *task, struct path *root)
 	return result;
 }
 
-static int proc_cwd_link(struct inode *inode, struct path *path)
+static int proc_cwd_link(struct dentry *dentry, struct path *path)
 {
-	struct task_struct *task = get_proc_task(inode);
+	struct task_struct *task = get_proc_task(dentry->d_inode);
 	int result = -ENOENT;
 
 	if (task) {
@@ -182,9 +186,9 @@ static int proc_cwd_link(struct inode *inode, struct path *path)
 	return result;
 }
 
-static int proc_root_link(struct inode *inode, struct path *path)
+static int proc_root_link(struct dentry *dentry, struct path *path)
 {
-	struct task_struct *task = get_proc_task(inode);
+	struct task_struct *task = get_proc_task(dentry->d_inode);
 	int result = -ENOENT;
 
 	if (task) {
@@ -194,82 +198,9 @@ static int proc_root_link(struct inode *inode, struct path *path)
 	return result;
 }
 
-static struct mm_struct *__check_mem_permission(struct task_struct *task)
-{
-	struct mm_struct *mm;
-
-	mm = get_task_mm(task);
-	if (!mm)
-		return ERR_PTR(-EINVAL);
-
-	/*
-	 * A task can always look at itself, in case it chooses
-	 * to use system calls instead of load instructions.
-	 */
-	if (task == current)
-		return mm;
-
-	/*
-	 * If current is actively ptrace'ing, and would also be
-	 * permitted to freshly attach with ptrace now, permit it.
-	 */
-	if (task_is_stopped_or_traced(task)) {
-		int match;
-		rcu_read_lock();
-		match = (ptrace_parent(task) == current);
-		rcu_read_unlock();
-		if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
-			return mm;
-	}
-
-	/*
-	 * No one else is allowed.
-	 */
-	mmput(mm);
-	return ERR_PTR(-EPERM);
-}
-
-/*
- * If current may access user memory in @task return a reference to the
- * corresponding mm, otherwise ERR_PTR.
- */
-static struct mm_struct *check_mem_permission(struct task_struct *task)
-{
-	struct mm_struct *mm;
-	int err;
-
-	/*
-	 * Avoid racing if task exec's as we might get a new mm but validate
-	 * against old credentials.
-	 */
-	err = mutex_lock_killable(&task->signal->cred_guard_mutex);
-	if (err)
-		return ERR_PTR(err);
-
-	mm = __check_mem_permission(task);
-	mutex_unlock(&task->signal->cred_guard_mutex);
-
-	return mm;
-}
-
 struct mm_struct *mm_for_maps(struct task_struct *task)
 {
-	struct mm_struct *mm;
-	int err;
-
-	err = mutex_lock_killable(&task->signal->cred_guard_mutex);
-	if (err)
-		return ERR_PTR(err);
-
-	mm = get_task_mm(task);
-	if (mm && mm != current->mm &&
-			!ptrace_may_access(task, PTRACE_MODE_READ)) {
-		mmput(mm);
-		mm = ERR_PTR(-EACCES);
-	}
-	mutex_unlock(&task->signal->cred_guard_mutex);
-
-	return mm;
+	return mm_access(task, PTRACE_MODE_READ);
 }
 
 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
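mm_for_maps() collapses into a call to mm_access(), a helper consolidated on the kernel/fork.c side of this merge; it centralizes the cred_guard_mutex plus ptrace_may_access() dance the deleted code open-coded. Reconstructed from the removed body, its shape is approximately as follows (the real definition is not part of this diff):

struct mm_struct *mm_access_sketch(struct task_struct *task, unsigned int mode)
{
	struct mm_struct *mm;
	int err;

	err = mutex_lock_killable(&task->signal->cred_guard_mutex);
	if (err)
		return ERR_PTR(err);

	mm = get_task_mm(task);
	if (mm && mm != current->mm && !ptrace_may_access(task, mode)) {
		mmput(mm);
		mm = ERR_PTR(-EACCES);
	}
	mutex_unlock(&task->signal->cred_guard_mutex);
	return mm;
}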
@@ -627,6 +558,52 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
 	return 0;
 }
 
+/*
+ * May current process learn task's sched/cmdline info (for hide_pid_min=1)
+ * or euid/egid (for hide_pid_min=2)?
+ */
+static bool has_pid_permissions(struct pid_namespace *pid,
+				 struct task_struct *task,
+				 int hide_pid_min)
+{
+	if (pid->hide_pid < hide_pid_min)
+		return true;
+	if (in_group_p(pid->pid_gid))
+		return true;
+	return ptrace_may_access(task, PTRACE_MODE_READ);
+}
+
+
+static int proc_pid_permission(struct inode *inode, int mask)
+{
+	struct pid_namespace *pid = inode->i_sb->s_fs_info;
+	struct task_struct *task;
+	bool has_perms;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+	has_perms = has_pid_permissions(pid, task, 1);
+	put_task_struct(task);
+
+	if (!has_perms) {
+		if (pid->hide_pid == 2) {
+			/*
+			 * Let's make getdents(), stat(), and open()
+			 * consistent with each other. If a process
+			 * may not stat() a file, it shouldn't be seen
+			 * in procfs at all.
+			 */
+			return -ENOENT;
+		}
+
+		return -EPERM;
+	}
+	return generic_permission(inode, mask);
+}
+
+
+
 static const struct inode_operations proc_def_inode_operations = {
 	.setattr	= proc_setattr,
 };
@@ -702,133 +679,96 @@ static const struct file_operations proc_single_file_operations = {
 
 static int mem_open(struct inode* inode, struct file* file)
 {
-	file->private_data = (void*)((long)current->self_exec_id);
-	/* OK to pass negative loff_t, we can catch out-of-range */
-	file->f_mode |= FMODE_UNSIGNED_OFFSET;
-	return 0;
-}
-
-static ssize_t mem_read(struct file * file, char __user * buf,
-			size_t count, loff_t *ppos)
-{
 	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
-	char *page;
-	unsigned long src = *ppos;
-	int ret = -ESRCH;
 	struct mm_struct *mm;
 
 	if (!task)
-		goto out_no_task;
+		return -ESRCH;
 
-	ret = -ENOMEM;
-	page = (char *)__get_free_page(GFP_TEMPORARY);
-	if (!page)
-		goto out;
+	mm = mm_access(task, PTRACE_MODE_ATTACH);
+	put_task_struct(task);
 
-	mm = check_mem_permission(task);
-	ret = PTR_ERR(mm);
 	if (IS_ERR(mm))
-		goto out_free;
-
-	ret = -EIO;
-
-	if (file->private_data != (void*)((long)current->self_exec_id))
-		goto out_put;
+		return PTR_ERR(mm);
 
-	ret = 0;
-
-	while (count > 0) {
-		int this_len, retval;
-
-		this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
-		retval = access_remote_vm(mm, src, page, this_len, 0);
-		if (!retval) {
-			if (!ret)
-				ret = -EIO;
-			break;
-		}
-
-		if (copy_to_user(buf, page, retval)) {
-			ret = -EFAULT;
-			break;
-		}
-
-		ret += retval;
-		src += retval;
-		buf += retval;
-		count -= retval;
+	if (mm) {
+		/* ensure this mm_struct can't be freed */
+		atomic_inc(&mm->mm_count);
+		/* but do not pin its memory */
+		mmput(mm);
 	}
-	*ppos = src;
 
-out_put:
-	mmput(mm);
-out_free:
-	free_page((unsigned long) page);
-out:
-	put_task_struct(task);
-out_no_task:
-	return ret;
+	/* OK to pass negative loff_t, we can catch out-of-range */
+	file->f_mode |= FMODE_UNSIGNED_OFFSET;
+	file->private_data = mm;
+
+	return 0;
 }
 
-static ssize_t mem_write(struct file * file, const char __user *buf,
-			 size_t count, loff_t *ppos)
+static ssize_t mem_rw(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos, int write)
 {
-	int copied;
+	struct mm_struct *mm = file->private_data;
+	unsigned long addr = *ppos;
+	ssize_t copied;
 	char *page;
-	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
-	unsigned long dst = *ppos;
-	struct mm_struct *mm;
 
-	copied = -ESRCH;
-	if (!task)
-		goto out_no_task;
+	if (!mm)
+		return 0;
 
-	copied = -ENOMEM;
 	page = (char *)__get_free_page(GFP_TEMPORARY);
 	if (!page)
-		goto out_task;
-
-	mm = check_mem_permission(task);
-	copied = PTR_ERR(mm);
-	if (IS_ERR(mm))
-		goto out_free;
-
-	copied = -EIO;
-	if (file->private_data != (void *)((long)current->self_exec_id))
-		goto out_mm;
+		return -ENOMEM;
 
 	copied = 0;
+	if (!atomic_inc_not_zero(&mm->mm_users))
+		goto free;
+
 	while (count > 0) {
-		int this_len, retval;
+		int this_len = min_t(int, count, PAGE_SIZE);
 
-		this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
-		if (copy_from_user(page, buf, this_len)) {
+		if (write && copy_from_user(page, buf, this_len)) {
 			copied = -EFAULT;
 			break;
 		}
-		retval = access_remote_vm(mm, dst, page, this_len, 1);
-		if (!retval) {
+
+		this_len = access_remote_vm(mm, addr, page, this_len, write);
+		if (!this_len) {
 			if (!copied)
 				copied = -EIO;
 			break;
 		}
-		copied += retval;
-		buf += retval;
-		dst += retval;
-		count -= retval;
+
+		if (!write && copy_to_user(buf, page, this_len)) {
+			copied = -EFAULT;
+			break;
+		}
+
+		buf += this_len;
+		addr += this_len;
+		copied += this_len;
+		count -= this_len;
 	}
-	*ppos = dst;
+	*ppos = addr;
 
-out_mm:
 	mmput(mm);
-out_free:
+free:
 	free_page((unsigned long) page);
-out_task:
-	put_task_struct(task);
-out_no_task:
 	return copied;
 }
 
+static ssize_t mem_read(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	return mem_rw(file, buf, count, ppos, 0);
+}
+
+static ssize_t mem_write(struct file *file, const char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	return mem_rw(file, (char __user*)buf, count, ppos, 1);
+}
+
 loff_t mem_lseek(struct file *file, loff_t offset, int orig)
 {
 	switch (orig) {
@@ -845,11 +785,20 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig)
 	return file->f_pos;
 }
 
+static int mem_release(struct inode *inode, struct file *file)
+{
+	struct mm_struct *mm = file->private_data;
+	if (mm)
+		mmdrop(mm);
+	return 0;
+}
+
 static const struct file_operations proc_mem_operations = {
 	.llseek		= mem_lseek,
 	.read		= mem_read,
 	.write		= mem_write,
 	.open		= mem_open,
+	.release	= mem_release,
 };
 
 static ssize_t environ_read(struct file *file, char __user *buf,
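With the permission check moved to open time (mm_access() at ATTACH level) and the copying centralized in mem_rw(), /proc/<pid>/mem behaves like an ordinary pread/pwrite target for as long as the file stays open. Illustrative userspace reader:

#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/* Read 'len' bytes at 'addr' from another process's address space.
 * open() performs the ptrace-attach permission check. */
ssize_t peek(pid_t pid, uint64_t addr, void *out, size_t len)
{
	char path[64];
	ssize_t n;
	int fd;

	snprintf(path, sizeof(path), "/proc/%d/mem", (int)pid);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	n = pread(fd, out, len, (off_t)addr);
	close(fd);
	return n;
}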
@@ -1010,6 +959,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	else
 		task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
 								-OOM_DISABLE;
+	trace_oom_score_adj_update(task);
 err_sighand:
 	unlock_task_sighand(task, &flags);
 err_task_lock:
@@ -1097,6 +1047,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 	task->signal->oom_score_adj = oom_score_adj;
 	if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
 		task->signal->oom_score_adj_min = oom_score_adj;
+	trace_oom_score_adj_update(task);
 	/*
 	 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
 	 * always attainable.
@@ -1147,9 +1098,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 	ssize_t length;
 	uid_t loginuid;
 
-	if (!capable(CAP_AUDIT_CONTROL))
-		return -EPERM;
-
 	rcu_read_lock();
 	if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
 		rcu_read_unlock();
@@ -1178,7 +1126,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 		goto out_free_page;
 
 	}
-	length = audit_set_loginuid(current, loginuid);
+	length = audit_set_loginuid(loginuid);
 	if (likely(length == 0))
 		length = count;
 
@@ -1453,13 +1401,13 @@ static const struct file_operations proc_pid_set_comm_operations = {
 	.release	= single_release,
 };
 
-static int proc_exe_link(struct inode *inode, struct path *exe_path)
+static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
 {
 	struct task_struct *task;
 	struct mm_struct *mm;
 	struct file *exe_file;
 
-	task = get_proc_task(inode);
+	task = get_proc_task(dentry->d_inode);
 	if (!task)
 		return -ENOENT;
 	mm = get_task_mm(task);
@@ -1489,7 +1437,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 	if (!proc_fd_access_allowed(inode))
 		goto out;
 
-	error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
+	error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path);
 out:
 	return ERR_PTR(error);
 }
@@ -1528,7 +1476,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
 	if (!proc_fd_access_allowed(inode))
 		goto out;
 
-	error = PROC_I(inode)->op.proc_get_link(inode, &path);
+	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
 	if (error)
 		goto out;
 
@@ -1609,6 +1557,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	struct inode *inode = dentry->d_inode;
 	struct task_struct *task;
 	const struct cred *cred;
+	struct pid_namespace *pid = dentry->d_sb->s_fs_info;
 
 	generic_fillattr(inode, stat);
 
@@ -1617,6 +1566,14 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	stat->gid = 0;
 	task = pid_task(proc_pid(inode), PIDTYPE_PID);
 	if (task) {
+		if (!has_pid_permissions(pid, task, 2)) {
+			rcu_read_unlock();
+			/*
+			 * This doesn't prevent learning whether PID exists,
+			 * it only makes getattr() consistent with readdir().
+			 */
+			return -ENOENT;
+		}
 		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
 		    task_dumpable(task)) {
 			cred = __task_cred(task);
@@ -1820,9 +1777,9 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
 	return -ENOENT;
 }
 
-static int proc_fd_link(struct inode *inode, struct path *path)
+static int proc_fd_link(struct dentry *dentry, struct path *path)
 {
-	return proc_fd_info(inode, path, NULL);
+	return proc_fd_info(dentry->d_inode, path, NULL);
 }
 
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
@@ -2043,6 +2000,355 @@ static const struct file_operations proc_fd_operations = {
 	.llseek		= default_llseek,
 };
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
+
+/*
+ * dname_to_vma_addr - maps a dentry name into two unsigned longs
+ * which represent vma start and end addresses.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+			     unsigned long *start, unsigned long *end)
+{
+	if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	bool exact_vma_exists = false;
+	struct mm_struct *mm = NULL;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct inode *inode;
+	int status = 0;
+
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		status = -EACCES;
+		goto out_notask;
+	}
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out_notask;
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out;
+
+	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+		down_read(&mm->mmap_sem);
+		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+		up_read(&mm->mmap_sem);
+	}
+
+	mmput(mm);
+
+	if (exact_vma_exists) {
+		if (task_dumpable(task)) {
+			rcu_read_lock();
+			cred = __task_cred(task);
+			inode->i_uid = cred->euid;
+			inode->i_gid = cred->egid;
+			rcu_read_unlock();
+		} else {
+			inode->i_uid = 0;
+			inode->i_gid = 0;
+		}
+		security_task_to_inode(task, inode);
+		status = 1;
+	}
+
+out:
+	put_task_struct(task);
+
+out_notask:
+	if (status <= 0)
+		d_drop(dentry);
+
+	return status;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+	.d_revalidate	= map_files_d_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int rc;
+
+	rc = -ENOENT;
+	task = get_proc_task(dentry->d_inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+	if (rc)
+		goto out_mmput;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (vma && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+out_mmput:
+	mmput(mm);
+out:
+	return rc;
+}
+
+struct map_files_info {
+	struct file	*file;
+	unsigned long	len;
+	unsigned char	name[4*sizeof(long)+2];	/* max: %lx-%lx\0 */
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	const struct file *file = ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	if (!file)
+		return ERR_PTR(-ENOENT);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	ei = PROC_I(inode);
+	ei->op.proc_get_link = proc_map_files_get_link;
+
+	inode->i_op = &proc_pid_link_inode_operations;
+	inode->i_size = 64;
+	inode->i_mode = S_IFLNK;
+
+	if (file->f_mode & FMODE_READ)
+		inode->i_mode |= S_IRUSR;
+	if (file->f_mode & FMODE_WRITE)
+		inode->i_mode |= S_IWUSR;
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+		struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct dentry *result;
+	struct mm_struct *mm;
+
+	result = ERR_PTR(-EACCES);
+	if (!capable(CAP_SYS_ADMIN))
+		goto out;
+
+	result = ERR_PTR(-ENOENT);
+	task = get_proc_task(dir);
+	if (!task)
+		goto out;
+
+	result = ERR_PTR(-EACCES);
+	if (lock_trace(task))
+		goto out_put_task;
+
+	result = ERR_PTR(-ENOENT);
+	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+		goto out_unlock;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_unlock;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (!vma)
+		goto out_no_vma;
+
+	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out_unlock:
+	unlock_trace(task);
+out_put_task:
+	put_task_struct(task);
+out:
+	return result;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+	.lookup		= proc_map_files_lookup,
+	.permission	= proc_fd_permission,
+	.setattr	= proc_setattr,
+};
+
+static int
+proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	ino_t ino;
+	int ret;
+
+	ret = -EACCES;
+	if (!capable(CAP_SYS_ADMIN))
+		goto out;
+
+	ret = -ENOENT;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	ret = -EACCES;
+	if (lock_trace(task))
+		goto out_put_task;
+
+	ret = 0;
+	switch (filp->f_pos) {
+	case 0:
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+			goto out_unlock;
+		filp->f_pos++;
+	case 1:
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+			goto out_unlock;
+		filp->f_pos++;
+	default:
+	{
+		unsigned long nr_files, pos, i;
+		struct flex_array *fa = NULL;
+		struct map_files_info info;
+		struct map_files_info *p;
+
+		mm = get_task_mm(task);
+		if (!mm)
+			goto out_unlock;
+		down_read(&mm->mmap_sem);
+
+		nr_files = 0;
+
+		/*
+		 * We need two passes here:
+		 *
+		 *  1) Collect vmas of mapped files with mmap_sem taken
+		 *  2) Release mmap_sem and instantiate entries
+		 *
+		 * otherwise we get lockdep complained, since filldir()
+		 * routine might require mmap_sem taken in might_fault().
+		 */
+
+		for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+			if (vma->vm_file && ++pos > filp->f_pos)
+				nr_files++;
+		}
+
+		if (nr_files) {
+			fa = flex_array_alloc(sizeof(info), nr_files,
+						GFP_KERNEL);
+			if (!fa || flex_array_prealloc(fa, 0, nr_files,
+							GFP_KERNEL)) {
+				ret = -ENOMEM;
+				if (fa)
+					flex_array_free(fa);
+				up_read(&mm->mmap_sem);
+				mmput(mm);
+				goto out_unlock;
+			}
+			for (i = 0, vma = mm->mmap, pos = 2; vma;
+					vma = vma->vm_next) {
+				if (!vma->vm_file)
+					continue;
+				if (++pos <= filp->f_pos)
+					continue;
+
+				get_file(vma->vm_file);
+				info.file = vma->vm_file;
+				info.len = snprintf(info.name,
+						sizeof(info.name), "%lx-%lx",
+						vma->vm_start, vma->vm_end);
+				if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+					BUG();
+			}
+		}
+		up_read(&mm->mmap_sem);
+
+		for (i = 0; i < nr_files; i++) {
+			p = flex_array_get(fa, i);
+			ret = proc_fill_cache(filp, dirent, filldir,
+					      p->name, p->len,
+					      proc_map_files_instantiate,
+					      task, p->file);
+			if (ret)
+				break;
+			filp->f_pos++;
+			fput(p->file);
+		}
+		for (; i < nr_files; i++) {
+			/*
+			 * In case of error don't forget
+			 * to put rest of file refs.
+			 */
+			p = flex_array_get(fa, i);
+			fput(p->file);
+		}
+		if (fa)
+			flex_array_free(fa);
+		mmput(mm);
+	}
+	}
+
+out_unlock:
+	unlock_trace(task);
+out_put_task:
+	put_task_struct(task);
+out:
+	return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_map_files_readdir,
+	.llseek		= default_llseek,
+};
+
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
 /*
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
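Each entry under /proc/<pid>/map_files is a symlink named start-end for one file-backed VMA, resolving to the mapped file; the lookup/readdir pair above materializes the entries on demand and requires CAP_SYS_ADMIN. Illustrative enumeration from userspace:

#include <dirent.h>
#include <limits.h>
#include <stdio.h>
#include <unistd.h>

void list_map_files(int pid)
{
	char dir[64], link[PATH_MAX], target[PATH_MAX];
	struct dirent *de;
	ssize_t n;
	DIR *d;

	snprintf(dir, sizeof(dir), "/proc/%d/map_files", pid);
	d = opendir(dir);
	if (!d)
		return;
	while ((de = readdir(d)) != NULL) {
		if (de->d_name[0] == '.')
			continue;
		snprintf(link, sizeof(link), "%s/%s", dir, de->d_name);
		n = readlink(link, target, sizeof(target) - 1);
		if (n < 0)
			continue;
		target[n] = '\0';
		printf("%s -> %s\n", de->d_name, target);
	}
	closedir(d);
}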
@@ -2658,6 +2964,9 @@ static const struct inode_operations proc_task_inode_operations;
 static const struct pid_entry tgid_base_stuff[] = {
 	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
 	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
+#endif
 	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
@@ -2761,6 +3070,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
 	.lookup		= proc_tgid_base_lookup,
 	.getattr	= pid_getattr,
 	.setattr	= proc_setattr,
+	.permission	= proc_pid_permission,
 };
 
 static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
@@ -2964,6 +3274,12 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi
 		proc_pid_instantiate, iter.task, NULL);
 }
 
+static int fake_filldir(void *buf, const char *name, int namelen,
+			loff_t offset, u64 ino, unsigned d_type)
+{
+	return 0;
+}
+
 /* for the /proc/ directory itself, after non-process stuff has been done */
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
@@ -2971,6 +3287,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	struct task_struct *reaper;
 	struct tgid_iter iter;
 	struct pid_namespace *ns;
+	filldir_t __filldir;
 
 	if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
 		goto out_no_task;
@@ -2992,8 +3309,13 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	for (iter = next_tgid(ns, iter);
 	     iter.task;
 	     iter.tgid += 1, iter = next_tgid(ns, iter)) {
+		if (has_pid_permissions(ns, iter.task, 2))
+			__filldir = filldir;
+		else
+			__filldir = fake_filldir;
+
 		filp->f_pos = iter.tgid + TGID_OFFSET;
-		if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) {
+		if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) {
 			put_task_struct(iter.task);
 			goto out;
 		}
@@ -3328,6 +3650,7 @@ static const struct inode_operations proc_task_inode_operations = {
 	.lookup		= proc_task_lookup,
 	.getattr	= proc_task_getattr,
 	.setattr	= proc_setattr,
+	.permission	= proc_pid_permission,
 };
 
 static const struct file_operations proc_task_operations = {
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 51a176622b8f..84fd3235a590 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -7,6 +7,7 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/kernel.h>
+#include <linux/pid_namespace.h>
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/stat.h>
@@ -17,7 +18,9 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
+#include <linux/mount.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -101,12 +104,27 @@ void __init proc_init_inodecache(void)
 					init_once);
 }
 
+static int proc_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct super_block *sb = root->d_sb;
+	struct pid_namespace *pid = sb->s_fs_info;
+
+	if (pid->pid_gid)
+		seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid);
+	if (pid->hide_pid != 0)
+		seq_printf(seq, ",hidepid=%u", pid->hide_pid);
+
+	return 0;
+}
+
 static const struct super_operations proc_sops = {
 	.alloc_inode	= proc_alloc_inode,
 	.destroy_inode	= proc_destroy_inode,
 	.drop_inode	= generic_delete_inode,
 	.evict_inode	= proc_evict_inode,
 	.statfs		= simple_statfs,
+	.remount_fs	= proc_remount,
+	.show_options	= proc_show_options,
 };
 
 static void __pde_users_dec(struct proc_dir_entry *pde)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7838e5cfec14..292577531ad1 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -117,6 +117,7 @@ void pde_put(struct proc_dir_entry *pde);
 
 int proc_fill_super(struct super_block *);
 struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
+int proc_remount(struct super_block *sb, int *flags, char *data);
 
 /*
  * These are generic /proc routines that use the internal
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 03102d978180..46a15d8a29ca 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -18,6 +18,7 @@
 #include <linux/bitops.h>
 #include <linux/mount.h>
 #include <linux/pid_namespace.h>
+#include <linux/parser.h>
 
 #include "internal.h"
 
@@ -36,6 +37,63 @@ static int proc_set_super(struct super_block *sb, void *data)
 	return err;
 }
 
+enum {
+	Opt_gid, Opt_hidepid, Opt_err,
+};
+
+static const match_table_t tokens = {
+	{Opt_hidepid, "hidepid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_err, NULL},
+};
+
+static int proc_parse_options(char *options, struct pid_namespace *pid)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		args[0].to = args[0].from = 0;
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				return 0;
+			pid->pid_gid = option;
+			break;
+		case Opt_hidepid:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0 || option > 2) {
+				pr_err("proc: hidepid value must be between 0 and 2.\n");
+				return 0;
+			}
+			pid->hide_pid = option;
+			break;
+		default:
+			pr_err("proc: unrecognized mount option \"%s\" "
+			       "or missing value\n", p);
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+int proc_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct pid_namespace *pid = sb->s_fs_info;
+	return !proc_parse_options(data, pid);
+}
+
 static struct dentry *proc_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
@@ -43,11 +101,15 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 	struct super_block *sb;
 	struct pid_namespace *ns;
 	struct proc_inode *ei;
+	char *options;
 
-	if (flags & MS_KERNMOUNT)
+	if (flags & MS_KERNMOUNT) {
 		ns = (struct pid_namespace *)data;
-	else
+		options = NULL;
+	} else {
 		ns = current->nsproxy->pid_ns;
+		options = data;
+	}
 
 	sb = sget(fs_type, proc_test_super, proc_set_super, ns);
 	if (IS_ERR(sb))
@@ -55,6 +117,10 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 
 	if (!sb->s_root) {
 		sb->s_flags = flags;
+		if (!proc_parse_options(options, ns)) {
+			deactivate_locked_super(sb);
+			return ERR_PTR(-EINVAL);
+		}
 		err = proc_fill_super(sb);
 		if (err) {
 			deactivate_locked_super(sb);
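Together with proc_pid_permission() and the pid_getattr() change earlier in this diff, gid= and hidepid= become first-class proc mount options: hidepid=1 hides other users' /proc/<pid> contents, hidepid=2 additionally hides the directories themselves, and gid= names a group exempt from both. Illustrative remount (the shell equivalent is "mount -o remount,hidepid=2,gid=adm /proc"; the numeric gid below is an arbitrary example):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* requires CAP_SYS_ADMIN */
	if (mount(NULL, "/proc", NULL, MS_REMOUNT, "hidepid=2,gid=4")) {
		perror("remount /proc");
		return 1;
	}
	return 0;
}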
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index d76ca6ae2b1b..121f77cfef76 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -77,6 +77,8 @@ static int show_stat(struct seq_file *p, void *v)
 		steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
 		guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
 		guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+		sum += kstat_cpu_irqs_sum(i);
+		sum += arch_irq_stat_cpu(i);
 
 		for (j = 0; j < NR_SOFTIRQS; j++) {
 			unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e418c5abdb0e..7dcd2a250495 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -518,6 +518,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 		if (!page)
 			continue;
 
+		if (PageReserved(page))
+			continue;
+
 		/* Clear accessed and referenced bits. */
 		ptep_test_and_clear_young(vma, addr, pte);
 		ClearPageReferenced(page);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2bfd987f4853..6b009548d2e0 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -179,47 +179,33 @@ static const char *qnx4_checkroot(struct super_block *sb)
179 struct qnx4_inode_entry *rootdir; 179 struct qnx4_inode_entry *rootdir;
180 int rd, rl; 180 int rd, rl;
181 int i, j; 181 int i, j;
182 int found = 0;
183 182
184 if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') { 183 if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/')
185 return "no qnx4 filesystem (no root dir)."; 184 return "no qnx4 filesystem (no root dir).";
186 } else { 185 QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
187 QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id)); 186 rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
188 rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1; 187 rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
189 rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size); 188 for (j = 0; j < rl; j++) {
190 for (j = 0; j < rl; j++) { 189 bh = sb_bread(sb, rd + j); /* root dir, first block */
191 bh = sb_bread(sb, rd + j); /* root dir, first block */ 190 if (bh == NULL)
192 if (bh == NULL) { 191 return "unable to read root entry.";
193 return "unable to read root entry."; 192 rootdir = (struct qnx4_inode_entry *) bh->b_data;
194 } 193 for (i = 0; i < QNX4_INODES_PER_BLOCK; i++, rootdir++) {
195 for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) { 194 QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
196 rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE); 195 if (strcmp(rootdir->di_fname, QNX4_BMNAME) != 0)
197 if (rootdir->di_fname != NULL) { 196 continue;
198 QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname)); 197 qnx4_sb(sb)->BitMap = kmemdup(rootdir,
199 if (!strcmp(rootdir->di_fname, 198 sizeof(struct qnx4_inode_entry),
200 QNX4_BMNAME)) { 199 GFP_KERNEL);
201 found = 1;
202 qnx4_sb(sb)->BitMap = kmemdup(rootdir,
203 sizeof(struct qnx4_inode_entry),
204 GFP_KERNEL);
205 if (!qnx4_sb(sb)->BitMap) {
206 brelse (bh);
207 return "not enough memory for bitmap inode";
208 }/* keep bitmap inode known */
209 break;
210 }
211 }
212 }
213 brelse(bh); 200 brelse(bh);
214 if (found != 0) { 201 if (!qnx4_sb(sb)->BitMap)
215 break; 202 return "not enough memory for bitmap inode";
216 } 203 /* keep bitmap inode known */
217 } 204 return NULL;
218 if (found == 0) {
219 return "bitmap file not found.";
220 } 205 }
206 brelse(bh);
221 } 207 }
222 return NULL; 208 return "bitmap file not found.";
223} 209}
224 210
225static int qnx4_fill_super(struct super_block *s, void *data, int silent) 211static int qnx4_fill_super(struct super_block *s, void *data, int silent)
@@ -270,7 +256,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
270 if (IS_ERR(root)) { 256 if (IS_ERR(root)) {
271 printk(KERN_ERR "qnx4: get inode failed\n"); 257 printk(KERN_ERR "qnx4: get inode failed\n");
272 ret = PTR_ERR(root); 258 ret = PTR_ERR(root);
273 goto out; 259 goto outb;
274 } 260 }
275 261
276 ret = -ENOMEM; 262 ret = -ENOMEM;
@@ -283,6 +269,8 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
283 269
284 outi: 270 outi:
285 iput(root); 271 iput(root);
272 outb:
273 kfree(qs->BitMap);
286 out: 274 out:
287 brelse(bh); 275 brelse(bh);
288 outnobh: 276 outnobh:
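
The qnx4 hunks above flatten the nested root-directory scan and add an outb: label so the duplicated bitmap inode is freed when the root inode cannot be read. A generic sketch of that goto-unwind ordering, where labels release resources in reverse order of acquisition (all names and error values here are illustrative):

#include <stdlib.h>

static int fill_super_sketch(int fail_at_root)
{
	char *bh, *bitmap;
	int ret = -12;				/* -ENOMEM analogue */

	bh = malloc(512);			/* sb_bread() stand-in */
	if (!bh)
		goto outnobh;

	bitmap = malloc(64);			/* kmemdup() of the bitmap inode */
	if (!bitmap)
		goto out;

	if (fail_at_root) {			/* iget() failed */
		ret = -5;			/* -EIO analogue */
		goto outb;
	}

	free(bh);				/* success keeps only the bitmap */
	return 0;

outb:
	free(bitmap);				/* the new label added above */
out:
	free(bh);
outnobh:
	return ret;
}

int main(void)
{
	return fill_super_sketch(1) == -5 ? 0 : 1;
}
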
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 5ec59b20cf76..46741970371b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2125,6 +2125,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
2125 mutex_unlock(&dqopt->dqio_mutex); 2125 mutex_unlock(&dqopt->dqio_mutex);
2126 goto out_file_init; 2126 goto out_file_init;
2127 } 2127 }
2128 if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
2129 dqopt->info[type].dqi_flags |= DQF_SYS_FILE;
2128 mutex_unlock(&dqopt->dqio_mutex); 2130 mutex_unlock(&dqopt->dqio_mutex);
2129 spin_lock(&dq_state_lock); 2131 spin_lock(&dq_state_lock);
2130 dqopt->flags |= dquot_state_flag(flags, type); 2132 dqopt->flags |= dquot_state_flag(flags, type);
@@ -2464,7 +2466,7 @@ int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2464 spin_lock(&dq_data_lock); 2466 spin_lock(&dq_data_lock);
2465 ii->dqi_bgrace = mi->dqi_bgrace; 2467 ii->dqi_bgrace = mi->dqi_bgrace;
2466 ii->dqi_igrace = mi->dqi_igrace; 2468 ii->dqi_igrace = mi->dqi_igrace;
2467 ii->dqi_flags = mi->dqi_flags & DQF_MASK; 2469 ii->dqi_flags = mi->dqi_flags & DQF_GETINFO_MASK;
2468 ii->dqi_valid = IIF_ALL; 2470 ii->dqi_valid = IIF_ALL;
2469 spin_unlock(&dq_data_lock); 2471 spin_unlock(&dq_data_lock);
2470 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2472 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
@@ -2490,8 +2492,8 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2490 if (ii->dqi_valid & IIF_IGRACE) 2492 if (ii->dqi_valid & IIF_IGRACE)
2491 mi->dqi_igrace = ii->dqi_igrace; 2493 mi->dqi_igrace = ii->dqi_igrace;
2492 if (ii->dqi_valid & IIF_FLAGS) 2494 if (ii->dqi_valid & IIF_FLAGS)
2493 mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) | 2495 mi->dqi_flags = (mi->dqi_flags & ~DQF_SETINFO_MASK) |
2494 (ii->dqi_flags & DQF_MASK); 2496 (ii->dqi_flags & DQF_SETINFO_MASK);
2495 spin_unlock(&dq_data_lock); 2497 spin_unlock(&dq_data_lock);
2496 mark_info_dirty(sb, type); 2498 mark_info_dirty(sb, type);
2497 /* Force write to disk */ 2499 /* Force write to disk */
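
The dquot hunks split the single DQF_MASK into separate get and set masks so that kernel-internal bits such as the new DQF_SYS_FILE are reported to userspace but can never be toggled by it. A sketch of the idea with invented bit values:

#include <stdio.h>

#define DQF_ROOT_SQUASH 0x01u	/* user-settable */
#define DQF_SYS_FILE    0x02u	/* kernel-internal: visible, not settable */

#define GETINFO_MASK (DQF_ROOT_SQUASH | DQF_SYS_FILE)
#define SETINFO_MASK (DQF_ROOT_SQUASH)

int main(void)
{
	unsigned int mi_flags = DQF_SYS_FILE;	/* in-core state */
	unsigned int user_flags = DQF_ROOT_SQUASH | DQF_SYS_FILE;

	/* get: both bits are exposed to userspace */
	printf("get -> %#x\n", mi_flags & GETINFO_MASK);

	/* set: only user-settable bits change; DQF_SYS_FILE survives */
	mi_flags = (mi_flags & ~SETINFO_MASK) | (user_flags & SETINFO_MASK);
	printf("set -> %#x\n", mi_flags);
	return 0;
}
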
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index a945cd265228..70de42f09f1d 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -1364,10 +1364,7 @@ int reiserfs_init_bitmap_cache(struct super_block *sb)
1364 struct reiserfs_bitmap_info *bitmap; 1364 struct reiserfs_bitmap_info *bitmap;
1365 unsigned int bmap_nr = reiserfs_bmap_count(sb); 1365 unsigned int bmap_nr = reiserfs_bmap_count(sb);
1366 1366
1367 /* Avoid lock recursion in fault case */
1368 reiserfs_write_unlock(sb);
1369 bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); 1367 bitmap = vmalloc(sizeof(*bitmap) * bmap_nr);
1370 reiserfs_write_lock(sb);
1371 if (bitmap == NULL) 1368 if (bitmap == NULL)
1372 return -ENOMEM; 1369 return -ENOMEM;
1373 1370
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index eb711060a6f2..c3cf54fd4de3 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2678,16 +2678,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2678 char b[BDEVNAME_SIZE]; 2678 char b[BDEVNAME_SIZE];
2679 int ret; 2679 int ret;
2680 2680
2681 /*
2682 * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS
2683 * dependency inversion warnings.
2684 */
2685 reiserfs_write_unlock(sb);
2686 journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal)); 2681 journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
2687 if (!journal) { 2682 if (!journal) {
2688 reiserfs_warning(sb, "journal-1256", 2683 reiserfs_warning(sb, "journal-1256",
2689 "unable to get memory for journal structure"); 2684 "unable to get memory for journal structure");
2690 reiserfs_write_lock(sb);
2691 return 1; 2685 return 1;
2692 } 2686 }
2693 INIT_LIST_HEAD(&journal->j_bitmap_nodes); 2687 INIT_LIST_HEAD(&journal->j_bitmap_nodes);
@@ -2695,10 +2689,8 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2695 INIT_LIST_HEAD(&journal->j_working_list); 2689 INIT_LIST_HEAD(&journal->j_working_list);
2696 INIT_LIST_HEAD(&journal->j_journal_list); 2690 INIT_LIST_HEAD(&journal->j_journal_list);
2697 journal->j_persistent_trans = 0; 2691 journal->j_persistent_trans = 0;
2698 ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, 2692 if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
2699 reiserfs_bmap_count(sb)); 2693 reiserfs_bmap_count(sb)))
2700 reiserfs_write_lock(sb);
2701 if (ret)
2702 goto free_and_return; 2694 goto free_and_return;
2703 2695
2704 allocate_bitmap_nodes(sb); 2696 allocate_bitmap_nodes(sb);
@@ -2727,27 +2719,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2727 goto free_and_return; 2719 goto free_and_return;
2728 } 2720 }
2729 2721
2730 /*
2731 * We need to unlock here to avoid creating the following
2732 * dependency:
2733 * reiserfs_lock -> sysfs_mutex
2734 * Because the reiserfs mmap path creates the following dependency:
2735 * mm->mmap -> reiserfs_lock, hence we have
2736 * mm->mmap -> reiserfs_lock ->sysfs_mutex
2737 * This would ends up in a circular dependency with sysfs readdir path
2738 * which does sysfs_mutex -> mm->mmap_sem
2739 * This is fine because the reiserfs lock is useless in mount path,
2740 * at least until we call journal_begin. We keep it for paranoid
2741 * reasons.
2742 */
2743 reiserfs_write_unlock(sb);
2744 if (journal_init_dev(sb, journal, j_dev_name) != 0) { 2722 if (journal_init_dev(sb, journal, j_dev_name) != 0) {
2745 reiserfs_write_lock(sb);
2746 reiserfs_warning(sb, "sh-462", 2723 reiserfs_warning(sb, "sh-462",
2747 "unable to initialize jornal device"); 2724 "unable to initialize jornal device");
2748 goto free_and_return; 2725 goto free_and_return;
2749 } 2726 }
2750 reiserfs_write_lock(sb);
2751 2727
2752 rs = SB_DISK_SUPER_BLOCK(sb); 2728 rs = SB_DISK_SUPER_BLOCK(sb);
2753 2729
@@ -2829,9 +2805,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2829 journal->j_mount_id = 10; 2805 journal->j_mount_id = 10;
2830 journal->j_state = 0; 2806 journal->j_state = 0;
2831 atomic_set(&(journal->j_jlock), 0); 2807 atomic_set(&(journal->j_jlock), 0);
2832 reiserfs_write_unlock(sb);
2833 journal->j_cnode_free_list = allocate_cnodes(num_cnodes); 2808 journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
2834 reiserfs_write_lock(sb);
2835 journal->j_cnode_free_orig = journal->j_cnode_free_list; 2809 journal->j_cnode_free_orig = journal->j_cnode_free_list;
2836 journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; 2810 journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
2837 journal->j_cnode_used = 0; 2811 journal->j_cnode_used = 0;
@@ -2848,24 +2822,37 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2848 2822
2849 init_journal_hash(sb); 2823 init_journal_hash(sb);
2850 jl = journal->j_current_jl; 2824 jl = journal->j_current_jl;
2825
2826 /*
2827 * get_list_bitmap() may call flush_commit_list() which
2828 * requires the lock. Calling flush_commit_list() shouldn't happen
2829 * this early but I like to be paranoid.
2830 */
2831 reiserfs_write_lock(sb);
2851 jl->j_list_bitmap = get_list_bitmap(sb, jl); 2832 jl->j_list_bitmap = get_list_bitmap(sb, jl);
2833 reiserfs_write_unlock(sb);
2852 if (!jl->j_list_bitmap) { 2834 if (!jl->j_list_bitmap) {
2853 reiserfs_warning(sb, "journal-2005", 2835 reiserfs_warning(sb, "journal-2005",
2854 "get_list_bitmap failed for journal list 0"); 2836 "get_list_bitmap failed for journal list 0");
2855 goto free_and_return; 2837 goto free_and_return;
2856 } 2838 }
2857 if (journal_read(sb) < 0) { 2839
2840 /*
2841 * journal_read() needs to be inspected in order to push down
2842 * the lock further inside (or even remove it).
2843 */
2844 reiserfs_write_lock(sb);
2845 ret = journal_read(sb);
2846 reiserfs_write_unlock(sb);
2847 if (ret < 0) {
2858 reiserfs_warning(sb, "reiserfs-2006", 2848 reiserfs_warning(sb, "reiserfs-2006",
2859 "Replay Failure, unable to mount"); 2849 "Replay Failure, unable to mount");
2860 goto free_and_return; 2850 goto free_and_return;
2861 } 2851 }
2862 2852
2863 reiserfs_mounted_fs_count++; 2853 reiserfs_mounted_fs_count++;
2864 if (reiserfs_mounted_fs_count <= 1) { 2854 if (reiserfs_mounted_fs_count <= 1)
2865 reiserfs_write_unlock(sb);
2866 commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0); 2855 commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0);
2867 reiserfs_write_lock(sb);
2868 }
2869 2856
2870 INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); 2857 INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
2871 journal->j_work_sb = sb; 2858 journal->j_work_sb = sb;
@@ -2896,14 +2883,13 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
2896 journal->j_cnode_free < (journal->j_trans_max * 3)) { 2883 journal->j_cnode_free < (journal->j_trans_max * 3)) {
2897 return 1; 2884 return 1;
2898 } 2885 }
2899 /* protected by the BKL here */ 2886
2900 journal->j_len_alloc += new_alloc; 2887 journal->j_len_alloc += new_alloc;
2901 th->t_blocks_allocated += new_alloc ; 2888 th->t_blocks_allocated += new_alloc ;
2902 return 0; 2889 return 0;
2903} 2890}
2904 2891
2905/* this must be called inside a transaction, and requires the 2892/* this must be called inside a transaction
2906** kernel_lock to be held
2907*/ 2893*/
2908void reiserfs_block_writes(struct reiserfs_transaction_handle *th) 2894void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
2909{ 2895{
@@ -2914,8 +2900,7 @@ void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
2914 return; 2900 return;
2915} 2901}
2916 2902
2917/* this must be called without a transaction started, and does not 2903/* this must be called without a transaction started
2918** require BKL
2919*/ 2904*/
2920void reiserfs_allow_writes(struct super_block *s) 2905void reiserfs_allow_writes(struct super_block *s)
2921{ 2906{
@@ -2924,8 +2909,7 @@ void reiserfs_allow_writes(struct super_block *s)
2924 wake_up(&journal->j_join_wait); 2909 wake_up(&journal->j_join_wait);
2925} 2910}
2926 2911
2927/* this must be called without a transaction started, and does not 2912/* this must be called without a transaction started
2928** require BKL
2929*/ 2913*/
2930void reiserfs_wait_on_write_block(struct super_block *s) 2914void reiserfs_wait_on_write_block(struct super_block *s)
2931{ 2915{
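
The journal_init() hunks above push the big reiserfs write lock down so that allocations and device setup run unlocked, and only the calls that genuinely still need it (get_list_bitmap(), journal_read()) take it briefly. A userspace sketch of the same pushdown using a pthread mutex (compile with -lpthread; all names are stand-ins):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t fs_lock = PTHREAD_MUTEX_INITIALIZER;

static int journal_read_sketch(void)
{
	/* stand-in for journal_read(sb); still expects the lock held */
	return 0;
}

static int journal_init_sketch(void)
{
	void *journal;
	int ret;

	/* allocation can sleep and reclaim memory: do it unlocked */
	journal = calloc(1, 4096);
	if (!journal)
		return 1;

	/* only the replay step needs the big lock, so take it just there */
	pthread_mutex_lock(&fs_lock);
	ret = journal_read_sketch();
	pthread_mutex_unlock(&fs_lock);

	free(journal);
	return ret < 0 ? 1 : 0;
}

int main(void)
{
	return journal_init_sketch();
}
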
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 19c454e61b79..e12d8b97cd4d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -455,16 +455,20 @@ int remove_save_link(struct inode *inode, int truncate)
455static void reiserfs_kill_sb(struct super_block *s) 455static void reiserfs_kill_sb(struct super_block *s)
456{ 456{
457 if (REISERFS_SB(s)) { 457 if (REISERFS_SB(s)) {
458 if (REISERFS_SB(s)->xattr_root) { 458 /*
459 d_invalidate(REISERFS_SB(s)->xattr_root); 459 * Force any pending inode evictions to occur now. Any
460 dput(REISERFS_SB(s)->xattr_root); 460 * inodes to be removed that have extended attributes
461 REISERFS_SB(s)->xattr_root = NULL; 461 * associated with them need to clean them up before
462 } 462 * we can release the extended attribute root dentries.
463 if (REISERFS_SB(s)->priv_root) { 463 * shrink_dcache_for_umount will BUG if we don't release
464 d_invalidate(REISERFS_SB(s)->priv_root); 464 * those before it's called so ->put_super is too late.
465 dput(REISERFS_SB(s)->priv_root); 465 */
466 REISERFS_SB(s)->priv_root = NULL; 466 shrink_dcache_sb(s);
467 } 467
468 dput(REISERFS_SB(s)->xattr_root);
469 REISERFS_SB(s)->xattr_root = NULL;
470 dput(REISERFS_SB(s)->priv_root);
471 REISERFS_SB(s)->priv_root = NULL;
468 } 472 }
469 473
470 kill_block_super(s); 474 kill_block_super(s);
@@ -1249,7 +1253,8 @@ static void handle_quota_files(struct super_block *s, char **qf_names,
1249 kfree(REISERFS_SB(s)->s_qf_names[i]); 1253 kfree(REISERFS_SB(s)->s_qf_names[i]);
1250 REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; 1254 REISERFS_SB(s)->s_qf_names[i] = qf_names[i];
1251 } 1255 }
1252 REISERFS_SB(s)->s_jquota_fmt = *qfmt; 1256 if (*qfmt)
1257 REISERFS_SB(s)->s_jquota_fmt = *qfmt;
1253} 1258}
1254#endif 1259#endif
1255 1260
@@ -1514,9 +1519,7 @@ static int read_super_block(struct super_block *s, int offset)
1514static int reread_meta_blocks(struct super_block *s) 1519static int reread_meta_blocks(struct super_block *s)
1515{ 1520{
1516 ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); 1521 ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
1517 reiserfs_write_unlock(s);
1518 wait_on_buffer(SB_BUFFER_WITH_SB(s)); 1522 wait_on_buffer(SB_BUFFER_WITH_SB(s));
1519 reiserfs_write_lock(s);
1520 if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { 1523 if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
1521 reiserfs_warning(s, "reiserfs-2504", "error reading the super"); 1524 reiserfs_warning(s, "reiserfs-2504", "error reading the super");
1522 return 1; 1525 return 1;
@@ -1741,22 +1744,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1741 mutex_init(&REISERFS_SB(s)->lock); 1744 mutex_init(&REISERFS_SB(s)->lock);
1742 REISERFS_SB(s)->lock_depth = -1; 1745 REISERFS_SB(s)->lock_depth = -1;
1743 1746
1744 /*
1745 * This function is called with the bkl, which also was the old
1746 * locking used here.
1747 * do_journal_begin() will soon check if we hold the lock (ie: was the
1748 * bkl). This is likely because do_journal_begin() has several another
1749 * callers because at this time, it doesn't seem to be necessary to
1750 * protect against anything.
1751 * Anyway, let's be conservative and lock for now.
1752 */
1753 reiserfs_write_lock(s);
1754
1755 jdev_name = NULL; 1747 jdev_name = NULL;
1756 if (reiserfs_parse_options 1748 if (reiserfs_parse_options
1757 (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, 1749 (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
1758 &commit_max_age, qf_names, &qfmt) == 0) { 1750 &commit_max_age, qf_names, &qfmt) == 0) {
1759 goto error; 1751 goto error_unlocked;
1760 } 1752 }
1761 if (jdev_name && jdev_name[0]) { 1753 if (jdev_name && jdev_name[0]) {
1762 REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL); 1754 REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
@@ -1772,7 +1764,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1772 1764
1773 if (blocks) { 1765 if (blocks) {
1774 SWARN(silent, s, "jmacd-7", "resize option for remount only"); 1766 SWARN(silent, s, "jmacd-7", "resize option for remount only");
1775 goto error; 1767 goto error_unlocked;
1776 } 1768 }
1777 1769
1778 /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */ 1770 /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */
@@ -1782,7 +1774,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1782 else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { 1774 else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
1783 SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", 1775 SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
1784 reiserfs_bdevname(s)); 1776 reiserfs_bdevname(s));
1785 goto error; 1777 goto error_unlocked;
1786 } 1778 }
1787 1779
1788 rs = SB_DISK_SUPER_BLOCK(s); 1780 rs = SB_DISK_SUPER_BLOCK(s);
@@ -1798,7 +1790,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1798 "or increase size of your LVM partition"); 1790 "or increase size of your LVM partition");
1799 SWARN(silent, s, "", "Or may be you forgot to " 1791 SWARN(silent, s, "", "Or may be you forgot to "
1800 "reboot after fdisk when it told you to"); 1792 "reboot after fdisk when it told you to");
1801 goto error; 1793 goto error_unlocked;
1802 } 1794 }
1803 1795
1804 sbi->s_mount_state = SB_REISERFS_STATE(s); 1796 sbi->s_mount_state = SB_REISERFS_STATE(s);
@@ -1806,8 +1798,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1806 1798
1807 if ((errval = reiserfs_init_bitmap_cache(s))) { 1799 if ((errval = reiserfs_init_bitmap_cache(s))) {
1808 SWARN(silent, s, "jmacd-8", "unable to read bitmap"); 1800 SWARN(silent, s, "jmacd-8", "unable to read bitmap");
1809 goto error; 1801 goto error_unlocked;
1810 } 1802 }
1803
1811 errval = -EINVAL; 1804 errval = -EINVAL;
1812#ifdef CONFIG_REISERFS_CHECK 1805#ifdef CONFIG_REISERFS_CHECK
1813 SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON"); 1806 SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON");
@@ -1830,24 +1823,26 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1830 if (reiserfs_barrier_flush(s)) { 1823 if (reiserfs_barrier_flush(s)) {
1831 printk("reiserfs: using flush barriers\n"); 1824 printk("reiserfs: using flush barriers\n");
1832 } 1825 }
1826
1833 // set_device_ro(s->s_dev, 1) ; 1827 // set_device_ro(s->s_dev, 1) ;
1834 if (journal_init(s, jdev_name, old_format, commit_max_age)) { 1828 if (journal_init(s, jdev_name, old_format, commit_max_age)) {
1835 SWARN(silent, s, "sh-2022", 1829 SWARN(silent, s, "sh-2022",
1836 "unable to initialize journal space"); 1830 "unable to initialize journal space");
1837 goto error; 1831 goto error_unlocked;
1838 } else { 1832 } else {
1839 jinit_done = 1; /* once this is set, journal_release must be called 1833 jinit_done = 1; /* once this is set, journal_release must be called
1840 ** if we error out of the mount 1834 ** if we error out of the mount
1841 */ 1835 */
1842 } 1836 }
1837
1843 if (reread_meta_blocks(s)) { 1838 if (reread_meta_blocks(s)) {
1844 SWARN(silent, s, "jmacd-9", 1839 SWARN(silent, s, "jmacd-9",
1845 "unable to reread meta blocks after journal init"); 1840 "unable to reread meta blocks after journal init");
1846 goto error; 1841 goto error_unlocked;
1847 } 1842 }
1848 1843
1849 if (replay_only(s)) 1844 if (replay_only(s))
1850 goto error; 1845 goto error_unlocked;
1851 1846
1852 if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { 1847 if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) {
1853 SWARN(silent, s, "clm-7000", 1848 SWARN(silent, s, "clm-7000",
@@ -1861,9 +1856,19 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1861 reiserfs_init_locked_inode, (void *)(&args)); 1856 reiserfs_init_locked_inode, (void *)(&args));
1862 if (!root_inode) { 1857 if (!root_inode) {
1863 SWARN(silent, s, "jmacd-10", "get root inode failed"); 1858 SWARN(silent, s, "jmacd-10", "get root inode failed");
1864 goto error; 1859 goto error_unlocked;
1865 } 1860 }
1866 1861
1862 /*
1863 * This path was assumed to be called with the BKL held in the old days.
1864 * Now we have inherited the big reiserfs lock from it and many
1865 * reiserfs helpers called in the mount path and elsewhere require
1866 * this lock to be held even if it's not always necessary. Let's be
1867 * conservative and hold it early. The window can be reduced after
1868 * careful review of the code.
1869 */
1870 reiserfs_write_lock(s);
1871
1867 if (root_inode->i_state & I_NEW) { 1872 if (root_inode->i_state & I_NEW) {
1868 reiserfs_read_locked_inode(root_inode, &args); 1873 reiserfs_read_locked_inode(root_inode, &args);
1869 unlock_new_inode(root_inode); 1874 unlock_new_inode(root_inode);
@@ -1990,12 +1995,16 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1990 return (0); 1995 return (0);
1991 1996
1992error: 1997error:
1993 if (jinit_done) { /* kill the commit thread, free journal ram */ 1998 reiserfs_write_unlock(s);
1999
2000error_unlocked:
2001 /* kill the commit thread, free journal ram */
2002 if (jinit_done) {
2003 reiserfs_write_lock(s);
1994 journal_release_error(NULL, s); 2004 journal_release_error(NULL, s);
2005 reiserfs_write_unlock(s);
1995 } 2006 }
1996 2007
1997 reiserfs_write_unlock(s);
1998
1999 reiserfs_free_bitmap_cache(s); 2008 reiserfs_free_bitmap_cache(s);
2000 if (SB_BUFFER_WITH_SB(s)) 2009 if (SB_BUFFER_WITH_SB(s))
2001 brelse(SB_BUFFER_WITH_SB(s)); 2010 brelse(SB_BUFFER_WITH_SB(s));
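
reiserfs_fill_super() above gains a second error label: failures before the write lock is taken jump to error_unlocked, while later failures fall through error and drop the lock exactly once. A minimal sketch of the two-label pattern (compile with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t s_lock = PTHREAD_MUTEX_INITIALIZER;

static int fill_super_sketch(int fail_early, int fail_late)
{
	if (fail_early)
		goto error_unlocked;	/* lock not taken yet */

	pthread_mutex_lock(&s_lock);

	if (fail_late)
		goto error;		/* lock held: must unlock first */

	pthread_mutex_unlock(&s_lock);
	return 0;

error:
	pthread_mutex_unlock(&s_lock);
error_unlocked:
	fprintf(stderr, "mount failed\n");
	return -1;
}

int main(void)
{
	return fill_super_sketch(0, 1) == -1 ? 0 : 1;
}
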
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index eed99428f104..e1a7779dd3cb 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -28,9 +28,10 @@ static unsigned long romfs_get_unmapped_area(struct file *file,
28 struct inode *inode = file->f_mapping->host; 28 struct inode *inode = file->f_mapping->host;
29 struct mtd_info *mtd = inode->i_sb->s_mtd; 29 struct mtd_info *mtd = inode->i_sb->s_mtd;
30 unsigned long isize, offset, maxpages, lpages; 30 unsigned long isize, offset, maxpages, lpages;
31 int ret;
31 32
32 if (!mtd) 33 if (!mtd)
33 goto cant_map_directly; 34 return (unsigned long) -ENOSYS;
34 35
35 /* the mapping mustn't extend beyond the EOF */ 36 /* the mapping mustn't extend beyond the EOF */
36 lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; 37 lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -41,23 +42,20 @@ static unsigned long romfs_get_unmapped_area(struct file *file,
41 if ((pgoff >= maxpages) || (maxpages - pgoff < lpages)) 42 if ((pgoff >= maxpages) || (maxpages - pgoff < lpages))
42 return (unsigned long) -EINVAL; 43 return (unsigned long) -EINVAL;
43 44
44 /* we need to call down to the MTD layer to do the actual mapping */ 45 if (addr != 0)
45 if (mtd->get_unmapped_area) { 46 return (unsigned long) -EINVAL;
46 if (addr != 0)
47 return (unsigned long) -EINVAL;
48
49 if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
50 return (unsigned long) -EINVAL;
51 47
52 offset += ROMFS_I(inode)->i_dataoffset; 48 if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
53 if (offset > mtd->size - len) 49 return (unsigned long) -EINVAL;
54 return (unsigned long) -EINVAL;
55 50
56 return mtd->get_unmapped_area(mtd, len, offset, flags); 51 offset += ROMFS_I(inode)->i_dataoffset;
57 } 52 if (offset > mtd->size - len)
53 return (unsigned long) -EINVAL;
58 54
59cant_map_directly: 55 ret = mtd_get_unmapped_area(mtd, len, offset, flags);
60 return (unsigned long) -ENOSYS; 56 if (ret == -EOPNOTSUPP)
57 ret = -ENOSYS;
58 return (unsigned long) ret;
61} 59}
62 60
63/* 61/*
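
The romfs hunk replaces the open-coded ->get_unmapped_area check with the mtd_get_unmapped_area() wrapper and translates its -EOPNOTSUPP into the -ENOSYS this hook is expected to return. A sketch of that errno translation, with a stub standing in for the MTD call:

#include <errno.h>
#include <stdio.h>

static long mtd_get_unmapped_area_stub(void)
{
	return -EOPNOTSUPP;		/* hypothetical: no direct mapping */
}

static unsigned long get_unmapped_area_sketch(void)
{
	long ret = mtd_get_unmapped_area_stub();

	if (ret == -EOPNOTSUPP)		/* callers expect -ENOSYS here */
		ret = -ENOSYS;
	return (unsigned long)ret;
}

int main(void)
{
	printf("%ld\n", (long)get_unmapped_area_sketch());
	return 0;
}
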
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index f744be98cd5a..af0b73802592 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -70,11 +70,15 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
70 spin_lock(&cache->lock); 70 spin_lock(&cache->lock);
71 71
72 while (1) { 72 while (1) {
73 for (i = 0; i < cache->entries; i++) 73 for (i = cache->curr_blk, n = 0; n < cache->entries; n++) {
74 if (cache->entry[i].block == block) 74 if (cache->entry[i].block == block) {
75 cache->curr_blk = i;
75 break; 76 break;
77 }
78 i = (i + 1) % cache->entries;
79 }
76 80
77 if (i == cache->entries) { 81 if (n == cache->entries) {
78 /* 82 /*
79 * Block not in cache, if all cache entries are used 83 * Block not in cache, if all cache entries are used
80 * go to sleep waiting for one to become available. 84 * go to sleep waiting for one to become available.
@@ -245,6 +249,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
245 goto cleanup; 249 goto cleanup;
246 } 250 }
247 251
252 cache->curr_blk = 0;
248 cache->next_blk = 0; 253 cache->next_blk = 0;
249 cache->unused = entries; 254 cache->unused = entries;
250 cache->entries = entries; 255 cache->entries = entries;
@@ -332,17 +337,20 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer,
332 u64 *block, int *offset, int length) 337 u64 *block, int *offset, int length)
333{ 338{
334 struct squashfs_sb_info *msblk = sb->s_fs_info; 339 struct squashfs_sb_info *msblk = sb->s_fs_info;
335 int bytes, copied = length; 340 int bytes, res = length;
336 struct squashfs_cache_entry *entry; 341 struct squashfs_cache_entry *entry;
337 342
338 TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset); 343 TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset);
339 344
340 while (length) { 345 while (length) {
341 entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0); 346 entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0);
342 if (entry->error) 347 if (entry->error) {
343 return entry->error; 348 res = entry->error;
344 else if (*offset >= entry->length) 349 goto error;
345 return -EIO; 350 } else if (*offset >= entry->length) {
351 res = -EIO;
352 goto error;
353 }
346 354
347 bytes = squashfs_copy_data(buffer, entry, *offset, length); 355 bytes = squashfs_copy_data(buffer, entry, *offset, length);
348 if (buffer) 356 if (buffer)
@@ -358,7 +366,11 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer,
358 squashfs_cache_put(entry); 366 squashfs_cache_put(entry);
359 } 367 }
360 368
361 return copied; 369 return res;
370
371error:
372 squashfs_cache_put(entry);
373 return res;
362} 374}
363 375
364 376
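
The squashfs cache hunks turn the linear lookup into a circular scan that starts at the most recent hit (curr_blk), which pays off when consecutive reads land on neighbouring cache slots. A sketch of the wrap-around scan, visiting each of the slots exactly once:

#include <stdio.h>

#define ENTRIES 8

int main(void)
{
	long cache[ENTRIES] = { 3, 9, 27, 81, 243, 729, 2187, 6561 };
	int curr_blk = 5;		/* remembered from the last lookup */
	long block = 2187;
	int i = curr_blk, n;

	for (n = 0; n < ENTRIES; n++) {
		if (cache[i] == block) {
			curr_blk = i;	/* remember the hit for next time */
			break;
		}
		i = (i + 1) % ENTRIES;	/* wrap past the end of the array */
	}
	if (n == ENTRIES)
		printf("miss\n");
	else
		printf("hit at slot %d\n", i);
	return 0;
}
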
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index fd7b3b3bda13..81afbccfa843 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -208,8 +208,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
208 inode->i_op = &squashfs_inode_ops; 208 inode->i_op = &squashfs_inode_ops;
209 inode->i_fop = &generic_ro_fops; 209 inode->i_fop = &generic_ro_fops;
210 inode->i_mode |= S_IFREG; 210 inode->i_mode |= S_IFREG;
211 inode->i_blocks = ((inode->i_size - 211 inode->i_blocks = (inode->i_size -
212 le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1; 212 le64_to_cpu(sqsh_ino->sparse) + 511) >> 9;
213 213
214 squashfs_i(inode)->fragment_block = frag_blk; 214 squashfs_i(inode)->fragment_block = frag_blk;
215 squashfs_i(inode)->fragment_size = frag_size; 215 squashfs_i(inode)->fragment_size = frag_size;
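
The inode.c hunk swaps ((size - sparse - 1) >> 9) + 1 for (size - sparse + 511) >> 9. Both round up to 512-byte blocks for non-zero sizes, but the old form underflows when size - sparse is 0 (a fully sparse file), yielding an enormous block count instead of 0. A quick check of the two formulas:

#include <stdio.h>

int main(void)
{
	unsigned long long sizes[] = { 0, 1, 511, 512, 513 };

	for (int i = 0; i < 5; i++) {
		unsigned long long s = sizes[i];
		unsigned long long newf = (s + 511) >> 9;
		/* old form: ((s - 1) >> 9) + 1 -- wraps around when s == 0 */
		unsigned long long oldf = ((s - 1) >> 9) + 1;

		printf("size %llu: new %llu old %llu\n", s, newf, oldf);
	}
	return 0;
}
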
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 651f0b31d296..52934a22f296 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -28,6 +28,7 @@
28struct squashfs_cache { 28struct squashfs_cache {
29 char *name; 29 char *name;
30 int entries; 30 int entries;
31 int curr_blk;
31 int next_blk; 32 int next_blk;
32 int num_waiters; 33 int num_waiters;
33 int unused; 34 int unused;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index d0858c2d9a47..ecaa2f7bdb8f 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -290,7 +290,7 @@ handle_fragments:
290 290
291check_directory_table: 291check_directory_table:
292 /* Sanity check directory_table */ 292 /* Sanity check directory_table */
293 if (msblk->directory_table >= next_table) { 293 if (msblk->directory_table > next_table) {
294 err = -EINVAL; 294 err = -EINVAL;
295 goto failed_mount; 295 goto failed_mount;
296 } 296 }
diff --git a/fs/super.c b/fs/super.c
index de41e1e46f09..6015c02296b7 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1186,6 +1186,8 @@ int freeze_super(struct super_block *sb)
1186 printk(KERN_ERR 1186 printk(KERN_ERR
1187 "VFS:Filesystem freeze failed\n"); 1187 "VFS:Filesystem freeze failed\n");
1188 sb->s_frozen = SB_UNFROZEN; 1188 sb->s_frozen = SB_UNFROZEN;
1189 smp_wmb();
1190 wake_up(&sb->s_wait_unfrozen);
1189 deactivate_locked_super(sb); 1191 deactivate_locked_super(sb);
1190 return ret; 1192 return ret;
1191 } 1193 }
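
The freeze_super() hunk stores SB_UNFROZEN and issues smp_wmb() before waking anyone sleeping on s_wait_unfrozen, so a waiter cannot observe the wakeup without also seeing the new state. In userspace the mutex/condvar pair provides that publish-then-wake ordering; a sketch (compile with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t unfrozen = PTHREAD_COND_INITIALIZER;
static int s_frozen = 1;

static void *waiter(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (s_frozen)			/* sleep until thawed */
		pthread_cond_wait(&unfrozen, &lock);
	pthread_mutex_unlock(&lock);
	puts("thawed");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	pthread_mutex_lock(&lock);
	s_frozen = 0;				/* publish the new state... */
	pthread_cond_broadcast(&unfrozen);	/* ...then wake the sleepers */
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}
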
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 62f4fb37789e..00012e31829d 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -493,6 +493,12 @@ int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
493 const void *ns = NULL; 493 const void *ns = NULL;
494 int err; 494 int err;
495 495
496 if (!dir_sd) {
497 WARN(1, KERN_ERR "sysfs: kobject %s without dirent\n",
498 kobject_name(kobj));
499 return -ENOENT;
500 }
501
496 err = 0; 502 err = 0;
497 if (!sysfs_ns_type(dir_sd)) 503 if (!sysfs_ns_type(dir_sd))
498 goto out; 504 goto out;
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 4a802b4a9056..85eb81683a29 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -318,8 +318,11 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
318 struct sysfs_addrm_cxt acxt; 318 struct sysfs_addrm_cxt acxt;
319 struct sysfs_dirent *sd; 319 struct sysfs_dirent *sd;
320 320
321 if (!dir_sd) 321 if (!dir_sd) {
322 WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
323 name);
322 return -ENOENT; 324 return -ENOENT;
325 }
323 326
324 sysfs_addrm_start(&acxt, dir_sd); 327 sysfs_addrm_start(&acxt, dir_sd);
325 328
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index b09ba2dd8b62..f922cbacdb96 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -38,9 +38,6 @@
38 38
39DEFINE_SPINLOCK(dbg_lock); 39DEFINE_SPINLOCK(dbg_lock);
40 40
41static char dbg_key_buf0[128];
42static char dbg_key_buf1[128];
43
44static const char *get_key_fmt(int fmt) 41static const char *get_key_fmt(int fmt)
45{ 42{
46 switch (fmt) { 43 switch (fmt) {
@@ -103,8 +100,8 @@ static const char *get_dent_type(int type)
103 } 100 }
104} 101}
105 102
106static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key, 103const char *dbg_snprintf_key(const struct ubifs_info *c,
107 char *buffer) 104 const union ubifs_key *key, char *buffer, int len)
108{ 105{
109 char *p = buffer; 106 char *p = buffer;
110 int type = key_type(c, key); 107 int type = key_type(c, key);
@@ -112,45 +109,34 @@ static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
112 if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) { 109 if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) {
113 switch (type) { 110 switch (type) {
114 case UBIFS_INO_KEY: 111 case UBIFS_INO_KEY:
115 sprintf(p, "(%lu, %s)", (unsigned long)key_inum(c, key), 112 len -= snprintf(p, len, "(%lu, %s)",
116 get_key_type(type)); 113 (unsigned long)key_inum(c, key),
114 get_key_type(type));
117 break; 115 break;
118 case UBIFS_DENT_KEY: 116 case UBIFS_DENT_KEY:
119 case UBIFS_XENT_KEY: 117 case UBIFS_XENT_KEY:
120 sprintf(p, "(%lu, %s, %#08x)", 118 len -= snprintf(p, len, "(%lu, %s, %#08x)",
121 (unsigned long)key_inum(c, key), 119 (unsigned long)key_inum(c, key),
122 get_key_type(type), key_hash(c, key)); 120 get_key_type(type), key_hash(c, key));
123 break; 121 break;
124 case UBIFS_DATA_KEY: 122 case UBIFS_DATA_KEY:
125 sprintf(p, "(%lu, %s, %u)", 123 len -= snprintf(p, len, "(%lu, %s, %u)",
126 (unsigned long)key_inum(c, key), 124 (unsigned long)key_inum(c, key),
127 get_key_type(type), key_block(c, key)); 125 get_key_type(type), key_block(c, key));
128 break; 126 break;
129 case UBIFS_TRUN_KEY: 127 case UBIFS_TRUN_KEY:
130 sprintf(p, "(%lu, %s)", 128 len -= snprintf(p, len, "(%lu, %s)",
131 (unsigned long)key_inum(c, key), 129 (unsigned long)key_inum(c, key),
132 get_key_type(type)); 130 get_key_type(type));
133 break; 131 break;
134 default: 132 default:
135 sprintf(p, "(bad key type: %#08x, %#08x)", 133 len -= snprintf(p, len, "(bad key type: %#08x, %#08x)",
136 key->u32[0], key->u32[1]); 134 key->u32[0], key->u32[1]);
137 } 135 }
138 } else 136 } else
139 sprintf(p, "bad key format %d", c->key_fmt); 137 len -= snprintf(p, len, "bad key format %d", c->key_fmt);
140} 138 ubifs_assert(len > 0);
141 139 return p;
142const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key)
143{
144 /* dbg_lock must be held */
145 sprintf_key(c, key, dbg_key_buf0);
146 return dbg_key_buf0;
147}
148
149const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key)
150{
151 /* dbg_lock must be held */
152 sprintf_key(c, key, dbg_key_buf1);
153 return dbg_key_buf1;
154} 140}
155 141
156const char *dbg_ntype(int type) 142const char *dbg_ntype(int type)
@@ -319,6 +305,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
319 int i, n; 305 int i, n;
320 union ubifs_key key; 306 union ubifs_key key;
321 const struct ubifs_ch *ch = node; 307 const struct ubifs_ch *ch = node;
308 char key_buf[DBG_KEY_BUF_LEN];
322 309
323 if (dbg_is_tst_rcvry(c)) 310 if (dbg_is_tst_rcvry(c))
324 return; 311 return;
@@ -474,7 +461,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
474 const struct ubifs_ino_node *ino = node; 461 const struct ubifs_ino_node *ino = node;
475 462
476 key_read(c, &ino->key, &key); 463 key_read(c, &ino->key, &key);
477 printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); 464 printk(KERN_DEBUG "\tkey %s\n",
465 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
478 printk(KERN_DEBUG "\tcreat_sqnum %llu\n", 466 printk(KERN_DEBUG "\tcreat_sqnum %llu\n",
479 (unsigned long long)le64_to_cpu(ino->creat_sqnum)); 467 (unsigned long long)le64_to_cpu(ino->creat_sqnum));
480 printk(KERN_DEBUG "\tsize %llu\n", 468 printk(KERN_DEBUG "\tsize %llu\n",
@@ -517,7 +505,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
517 int nlen = le16_to_cpu(dent->nlen); 505 int nlen = le16_to_cpu(dent->nlen);
518 506
519 key_read(c, &dent->key, &key); 507 key_read(c, &dent->key, &key);
520 printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); 508 printk(KERN_DEBUG "\tkey %s\n",
509 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
521 printk(KERN_DEBUG "\tinum %llu\n", 510 printk(KERN_DEBUG "\tinum %llu\n",
522 (unsigned long long)le64_to_cpu(dent->inum)); 511 (unsigned long long)le64_to_cpu(dent->inum));
523 printk(KERN_DEBUG "\ttype %d\n", (int)dent->type); 512 printk(KERN_DEBUG "\ttype %d\n", (int)dent->type);
@@ -541,7 +530,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
541 int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; 530 int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ;
542 531
543 key_read(c, &dn->key, &key); 532 key_read(c, &dn->key, &key);
544 printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); 533 printk(KERN_DEBUG "\tkey %s\n",
534 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
545 printk(KERN_DEBUG "\tsize %u\n", 535 printk(KERN_DEBUG "\tsize %u\n",
546 le32_to_cpu(dn->size)); 536 le32_to_cpu(dn->size));
547 printk(KERN_DEBUG "\tcompr_typ %d\n", 537 printk(KERN_DEBUG "\tcompr_typ %d\n",
@@ -582,7 +572,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
582 key_read(c, &br->key, &key); 572 key_read(c, &br->key, &key);
583 printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n", 573 printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n",
584 i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), 574 i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs),
585 le32_to_cpu(br->len), DBGKEY(&key)); 575 le32_to_cpu(br->len),
576 dbg_snprintf_key(c, &key, key_buf,
577 DBG_KEY_BUF_LEN));
586 } 578 }
587 break; 579 break;
588 } 580 }
@@ -934,6 +926,7 @@ void dbg_dump_znode(const struct ubifs_info *c,
934{ 926{
935 int n; 927 int n;
936 const struct ubifs_zbranch *zbr; 928 const struct ubifs_zbranch *zbr;
929 char key_buf[DBG_KEY_BUF_LEN];
937 930
938 spin_lock(&dbg_lock); 931 spin_lock(&dbg_lock);
939 if (znode->parent) 932 if (znode->parent)
@@ -958,12 +951,16 @@ void dbg_dump_znode(const struct ubifs_info *c,
958 printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key " 951 printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key "
959 "%s\n", n, zbr->znode, zbr->lnum, 952 "%s\n", n, zbr->znode, zbr->lnum,
960 zbr->offs, zbr->len, 953 zbr->offs, zbr->len,
961 DBGKEY(&zbr->key)); 954 dbg_snprintf_key(c, &zbr->key,
955 key_buf,
956 DBG_KEY_BUF_LEN));
962 else 957 else
963 printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key " 958 printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key "
964 "%s\n", n, zbr->znode, zbr->lnum, 959 "%s\n", n, zbr->znode, zbr->lnum,
965 zbr->offs, zbr->len, 960 zbr->offs, zbr->len,
966 DBGKEY(&zbr->key)); 961 dbg_snprintf_key(c, &zbr->key,
962 key_buf,
963 DBG_KEY_BUF_LEN));
967 } 964 }
968 spin_unlock(&dbg_lock); 965 spin_unlock(&dbg_lock);
969} 966}
@@ -1260,6 +1257,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1260 int err, nlen1, nlen2, cmp; 1257 int err, nlen1, nlen2, cmp;
1261 struct ubifs_dent_node *dent1, *dent2; 1258 struct ubifs_dent_node *dent1, *dent2;
1262 union ubifs_key key; 1259 union ubifs_key key;
1260 char key_buf[DBG_KEY_BUF_LEN];
1263 1261
1264 ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key)); 1262 ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key));
1265 dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); 1263 dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
@@ -1290,9 +1288,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1290 key_read(c, &dent1->key, &key); 1288 key_read(c, &dent1->key, &key);
1291 if (keys_cmp(c, &zbr1->key, &key)) { 1289 if (keys_cmp(c, &zbr1->key, &key)) {
1292 dbg_err("1st entry at %d:%d has key %s", zbr1->lnum, 1290 dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
1293 zbr1->offs, DBGKEY(&key)); 1291 zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
1292 DBG_KEY_BUF_LEN));
1294 dbg_err("but it should have key %s according to tnc", 1293 dbg_err("but it should have key %s according to tnc",
1295 DBGKEY(&zbr1->key)); 1294 dbg_snprintf_key(c, &zbr1->key, key_buf,
1295 DBG_KEY_BUF_LEN));
1296 dbg_dump_node(c, dent1); 1296 dbg_dump_node(c, dent1);
1297 goto out_free; 1297 goto out_free;
1298 } 1298 }
@@ -1300,9 +1300,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1300 key_read(c, &dent2->key, &key); 1300 key_read(c, &dent2->key, &key);
1301 if (keys_cmp(c, &zbr2->key, &key)) { 1301 if (keys_cmp(c, &zbr2->key, &key)) {
1302 dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum, 1302 dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum,
1303 zbr1->offs, DBGKEY(&key)); 1303 zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
1304 DBG_KEY_BUF_LEN));
1304 dbg_err("but it should have key %s according to tnc", 1305 dbg_err("but it should have key %s according to tnc",
1305 DBGKEY(&zbr2->key)); 1306 dbg_snprintf_key(c, &zbr2->key, key_buf,
1307 DBG_KEY_BUF_LEN));
1306 dbg_dump_node(c, dent2); 1308 dbg_dump_node(c, dent2);
1307 goto out_free; 1309 goto out_free;
1308 } 1310 }
@@ -1319,7 +1321,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1319 dbg_err("2 xent/dent nodes with the same name"); 1321 dbg_err("2 xent/dent nodes with the same name");
1320 else 1322 else
1321 dbg_err("bad order of colliding key %s", 1323 dbg_err("bad order of colliding key %s",
1322 DBGKEY(&key)); 1324 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
1323 1325
1324 ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); 1326 ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
1325 dbg_dump_node(c, dent1); 1327 dbg_dump_node(c, dent1);
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 8d9c46810189..ad1a6fee6010 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -169,40 +169,39 @@ struct ubifs_global_debug_info {
169 spin_unlock(&dbg_lock); \ 169 spin_unlock(&dbg_lock); \
170} while (0) 170} while (0)
171 171
172const char *dbg_key_str0(const struct ubifs_info *c, 172#define ubifs_dbg_msg(type, fmt, ...) \
173 const union ubifs_key *key); 173 pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__)
174const char *dbg_key_str1(const struct ubifs_info *c, 174
175 const union ubifs_key *key); 175#define DBG_KEY_BUF_LEN 32
176 176#define ubifs_dbg_msg_key(type, key, fmt, ...) do { \
177/* 177 char __tmp_key_buf[DBG_KEY_BUF_LEN]; \
178 * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message 178 pr_debug("UBIFS DBG " type ": " fmt "%s\n", ##__VA_ARGS__, \
179 * macros. 179 dbg_snprintf_key(c, key, __tmp_key_buf, DBG_KEY_BUF_LEN)); \
180 */
181#define DBGKEY(key) dbg_key_str0(c, (key))
182#define DBGKEY1(key) dbg_key_str1(c, (key))
183
184extern spinlock_t dbg_lock;
185
186#define ubifs_dbg_msg(type, fmt, ...) do { \
187 spin_lock(&dbg_lock); \
188 pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \
189 spin_unlock(&dbg_lock); \
190} while (0) 180} while (0)
191 181
192/* Just debugging messages not related to any specific UBIFS subsystem */ 182/* Just debugging messages not related to any specific UBIFS subsystem */
193#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__) 183#define dbg_msg(fmt, ...) \
184 printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \
185 __func__, ##__VA_ARGS__)
186
194/* General messages */ 187/* General messages */
195#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__) 188#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__)
196/* Additional journal messages */ 189/* Additional journal messages */
197#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__) 190#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__)
191#define dbg_jnlk(key, fmt, ...) \
192 ubifs_dbg_msg_key("jnl", key, fmt, ##__VA_ARGS__)
198/* Additional TNC messages */ 193/* Additional TNC messages */
199#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__) 194#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__)
195#define dbg_tnck(key, fmt, ...) \
196 ubifs_dbg_msg_key("tnc", key, fmt, ##__VA_ARGS__)
200/* Additional lprops messages */ 197/* Additional lprops messages */
201#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__) 198#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__)
202/* Additional LEB find messages */ 199/* Additional LEB find messages */
203#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__) 200#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__)
204/* Additional mount messages */ 201/* Additional mount messages */
205#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__) 202#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__)
203#define dbg_mntk(key, fmt, ...) \
204 ubifs_dbg_msg_key("mnt", key, fmt, ##__VA_ARGS__)
206/* Additional I/O messages */ 205/* Additional I/O messages */
207#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__) 206#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__)
208/* Additional commit messages */ 207/* Additional commit messages */
@@ -218,6 +217,7 @@ extern spinlock_t dbg_lock;
218/* Additional recovery messages */ 217/* Additional recovery messages */
219#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) 218#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__)
220 219
220extern spinlock_t dbg_lock;
221extern struct ubifs_global_debug_info ubifs_dbg; 221extern struct ubifs_global_debug_info ubifs_dbg;
222 222
223static inline int dbg_is_chk_gen(const struct ubifs_info *c) 223static inline int dbg_is_chk_gen(const struct ubifs_info *c)
@@ -258,6 +258,8 @@ const char *dbg_cstate(int cmt_state);
258const char *dbg_jhead(int jhead); 258const char *dbg_jhead(int jhead);
259const char *dbg_get_key_dump(const struct ubifs_info *c, 259const char *dbg_get_key_dump(const struct ubifs_info *c,
260 const union ubifs_key *key); 260 const union ubifs_key *key);
261const char *dbg_snprintf_key(const struct ubifs_info *c,
262 const union ubifs_key *key, char *buffer, int len);
261void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode); 263void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode);
262void dbg_dump_node(const struct ubifs_info *c, const void *node); 264void dbg_dump_node(const struct ubifs_info *c, const void *node);
263void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum, 265void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
@@ -345,20 +347,23 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
345#define dbg_dump_stack() 347#define dbg_dump_stack()
346#define ubifs_assert_cmt_locked(c) 348#define ubifs_assert_cmt_locked(c)
347 349
348#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 350#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
349#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 351#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
350#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 352#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
351#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 353#define dbg_jnlk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
352#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 354#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
353#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 355#define dbg_tnck(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
354#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 356#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
355#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 357#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
356#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 358#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
357#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 359#define dbg_mntk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
358#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 360#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
359#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 361#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
360#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 362#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
361#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 363#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
364#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
365#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
366#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
362 367
363static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; } 368static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; }
364static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; } 369static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; }
@@ -368,6 +373,10 @@ static inline const char *dbg_jhead(int jhead) { return ""; }
368static inline const char * 373static inline const char *
369dbg_get_key_dump(const struct ubifs_info *c, 374dbg_get_key_dump(const struct ubifs_info *c,
370 const union ubifs_key *key) { return ""; } 375 const union ubifs_key *key) { return ""; }
376static inline const char *
377dbg_snprintf_key(const struct ubifs_info *c,
378 const union ubifs_key *key, char *buffer,
379 int len) { return ""; }
371static inline void dbg_dump_inode(struct ubifs_info *c, 380static inline void dbg_dump_inode(struct ubifs_info *c,
372 const struct inode *inode) { return; } 381 const struct inode *inode) { return; }
373static inline void dbg_dump_node(const struct ubifs_info *c, 382static inline void dbg_dump_node(const struct ubifs_info *c,
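
The UBIFS hunks retire the two static dbg_key_buf* scratch buffers (and the dbg_lock serialization they forced) in favour of dbg_snprintf_key(), which formats into a caller-supplied buffer and is therefore reentrant. A sketch of the pattern with a toy key type; note that two distinct buffers let two keys appear in one message, which the old pair of static buffers only managed under the lock:

#include <stdio.h>

#define KEY_BUF_LEN 32

struct key { unsigned long inum; unsigned int block; };

static const char *snprintf_key(const struct key *k, char *buf, int len)
{
	snprintf(buf, len, "(%lu, %u)", k->inum, k->block);
	return buf;			/* convenient for use inside printf() */
}

int main(void)
{
	struct key a = { 7, 1 }, b = { 7, 2 };
	char buf1[KEY_BUF_LEN], buf2[KEY_BUF_LEN];

	printf("looked for %s, found %s\n",
	       snprintf_key(&a, buf1, sizeof(buf1)),
	       snprintf_key(&b, buf2, sizeof(buf2)));
	return 0;
}
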
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index cef0460f4c54..2f438ab2e7a2 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -697,9 +697,8 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
697 int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1; 697 int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
698 struct ubifs_inode *ui = ubifs_inode(inode); 698 struct ubifs_inode *ui = ubifs_inode(inode);
699 699
700 dbg_jnl("ino %lu, blk %u, len %d, key %s", 700 dbg_jnlk(key, "ino %lu, blk %u, len %d, key ",
701 (unsigned long)key_inum(c, key), key_block(c, key), len, 701 (unsigned long)key_inum(c, key), key_block(c, key), len);
702 DBGKEY(key));
703 ubifs_assert(len <= UBIFS_BLOCK_SIZE); 702 ubifs_assert(len <= UBIFS_BLOCK_SIZE);
704 703
705 data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN); 704 data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
@@ -1177,7 +1176,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
1177 dn = (void *)trun + UBIFS_TRUN_NODE_SZ; 1176 dn = (void *)trun + UBIFS_TRUN_NODE_SZ;
1178 blk = new_size >> UBIFS_BLOCK_SHIFT; 1177 blk = new_size >> UBIFS_BLOCK_SHIFT;
1179 data_key_init(c, &key, inum, blk); 1178 data_key_init(c, &key, inum, blk);
1180 dbg_jnl("last block key %s", DBGKEY(&key)); 1179 dbg_jnlk(&key, "last block key ");
1181 err = ubifs_tnc_lookup(c, &key, dn); 1180 err = ubifs_tnc_lookup(c, &key, dn);
1182 if (err == -ENOENT) 1181 if (err == -ENOENT)
1183 dlen = 0; /* Not found (so it is a hole) */ 1182 dlen = 0; /* Not found (so it is a hole) */
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 6189c74d97f0..66d59d0a1402 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1986,12 +1986,11 @@ again:
1986 1986
1987 if (path[h].in_tree) 1987 if (path[h].in_tree)
1988 continue; 1988 continue;
1989 nnode = kmalloc(sz, GFP_NOFS); 1989 nnode = kmemdup(&path[h].nnode, sz, GFP_NOFS);
1990 if (!nnode) { 1990 if (!nnode) {
1991 err = -ENOMEM; 1991 err = -ENOMEM;
1992 goto out; 1992 goto out;
1993 } 1993 }
1994 memcpy(nnode, &path[h].nnode, sz);
1995 parent = nnode->parent; 1994 parent = nnode->parent;
1996 parent->nbranch[nnode->iip].nnode = nnode; 1995 parent->nbranch[nnode->iip].nnode = nnode;
1997 path[h].ptr.nnode = nnode; 1996 path[h].ptr.nnode = nnode;
@@ -2004,12 +2003,11 @@ again:
2004 const size_t sz = sizeof(struct ubifs_pnode); 2003 const size_t sz = sizeof(struct ubifs_pnode);
2005 struct ubifs_nnode *parent; 2004 struct ubifs_nnode *parent;
2006 2005
2007 pnode = kmalloc(sz, GFP_NOFS); 2006 pnode = kmemdup(&path[h].pnode, sz, GFP_NOFS);
2008 if (!pnode) { 2007 if (!pnode) {
2009 err = -ENOMEM; 2008 err = -ENOMEM;
2010 goto out; 2009 goto out;
2011 } 2010 }
2012 memcpy(pnode, &path[h].pnode, sz);
2013 parent = pnode->parent; 2011 parent = pnode->parent;
2014 parent->nbranch[pnode->iip].pnode = pnode; 2012 parent->nbranch[pnode->iip].pnode = pnode;
2015 path[h].ptr.pnode = pnode; 2013 path[h].ptr.pnode = pnode;
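
The lpt.c hunks (and the tnc.c one below) collapse kmalloc() followed by memcpy() into a single kmemdup() call, which also removes the window where the buffer exists uninitialized. A userspace sketch of the same helper:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* userspace analogue of kmemdup(): allocate-and-copy in one call */
static void *memdup(const void *src, size_t len)
{
	void *p = malloc(len);

	if (p)
		memcpy(p, src, len);
	return p;
}

int main(void)
{
	const char node[] = "pnode payload";
	char *copy = memdup(node, sizeof(node));

	if (!copy)
		return 1;		/* -ENOMEM analogue */
	puts(copy);
	free(copy);
	return 0;
}
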
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ccabaf1164b3..b007637f0406 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -221,8 +221,8 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
221{ 221{
222 int err; 222 int err;
223 223
224 dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum, 224 dbg_mntk(&r->key, "LEB %d:%d len %d deletion %d sqnum %llu key ",
225 r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key)); 225 r->lnum, r->offs, r->len, r->deletion, r->sqnum);
226 226
227 /* Set c->replay_sqnum to help deal with dangling branches. */ 227 /* Set c->replay_sqnum to help deal with dangling branches. */
228 c->replay_sqnum = r->sqnum; 228 c->replay_sqnum = r->sqnum;
@@ -361,7 +361,7 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
361{ 361{
362 struct replay_entry *r; 362 struct replay_entry *r;
363 363
364 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); 364 dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
365 365
366 if (key_inum(c, key) >= c->highest_inum) 366 if (key_inum(c, key) >= c->highest_inum)
367 c->highest_inum = key_inum(c, key); 367 c->highest_inum = key_inum(c, key);
@@ -409,7 +409,7 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
409 struct replay_entry *r; 409 struct replay_entry *r;
410 char *nbuf; 410 char *nbuf;
411 411
412 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); 412 dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
413 if (key_inum(c, key) >= c->highest_inum) 413 if (key_inum(c, key) >= c->highest_inum)
414 c->highest_inum = key_inum(c, key); 414 c->highest_inum = key_inum(c, key);
415 415
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 066738647685..16ad84d8402f 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -344,12 +344,11 @@ static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr,
344 return err; 344 return err;
345 } 345 }
346 346
347 lnc_node = kmalloc(zbr->len, GFP_NOFS); 347 lnc_node = kmemdup(node, zbr->len, GFP_NOFS);
348 if (!lnc_node) 348 if (!lnc_node)
349 /* We don't have to have the cache, so no error */ 349 /* We don't have to have the cache, so no error */
350 return 0; 350 return 0;
351 351
352 memcpy(lnc_node, node, zbr->len);
353 zbr->leaf = lnc_node; 352 zbr->leaf = lnc_node;
354 return 0; 353 return 0;
355} 354}
@@ -506,7 +505,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
506{ 505{
507 int ret; 506 int ret;
508 507
509 dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key)); 508 dbg_tnck(key, "LEB %d:%d, key ", zbr->lnum, zbr->offs);
510 509
511 ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum, 510 ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum,
512 zbr->offs); 511 zbr->offs);
@@ -520,8 +519,8 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
520 ret = 0; 519 ret = 0;
521 } 520 }
522 if (ret == 0 && c->replaying) 521 if (ret == 0 && c->replaying)
523 dbg_mnt("dangling branch LEB %d:%d len %d, key %s", 522 dbg_mntk(key, "dangling branch LEB %d:%d len %d, key ",
524 zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); 523 zbr->lnum, zbr->offs, zbr->len);
525 return ret; 524 return ret;
526} 525}
527 526
@@ -996,9 +995,9 @@ static int fallible_resolve_collision(struct ubifs_info *c,
996 if (adding || !o_znode) 995 if (adding || !o_znode)
997 return 0; 996 return 0;
998 997
999 dbg_mnt("dangling match LEB %d:%d len %d %s", 998 dbg_mntk(key, "dangling match LEB %d:%d len %d key ",
1000 o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs, 999 o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs,
1001 o_znode->zbranch[o_n].len, DBGKEY(key)); 1000 o_znode->zbranch[o_n].len);
1002 *zn = o_znode; 1001 *zn = o_znode;
1003 *n = o_n; 1002 *n = o_n;
1004 return 1; 1003 return 1;
@@ -1180,7 +1179,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1180 struct ubifs_znode *znode; 1179 struct ubifs_znode *znode;
1181 unsigned long time = get_seconds(); 1180 unsigned long time = get_seconds();
1182 1181
1183 dbg_tnc("search key %s", DBGKEY(key)); 1182 dbg_tnck(key, "search key ");
1184 ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY); 1183 ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
1185 1184
1186 znode = c->zroot.znode; 1185 znode = c->zroot.znode;
@@ -1316,7 +1315,7 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
1316 struct ubifs_znode *znode; 1315 struct ubifs_znode *znode;
1317 unsigned long time = get_seconds(); 1316 unsigned long time = get_seconds();
1318 1317
1319 dbg_tnc("search and dirty key %s", DBGKEY(key)); 1318 dbg_tnck(key, "search and dirty key ");
1320 1319
1321 znode = c->zroot.znode; 1320 znode = c->zroot.znode;
1322 if (unlikely(!znode)) { 1321 if (unlikely(!znode)) {
@@ -1723,8 +1722,8 @@ static int validate_data_node(struct ubifs_info *c, void *buf,
1723 if (!keys_eq(c, &zbr->key, &key1)) { 1722 if (!keys_eq(c, &zbr->key, &key1)) {
1724 ubifs_err("bad key in node at LEB %d:%d", 1723 ubifs_err("bad key in node at LEB %d:%d",
1725 zbr->lnum, zbr->offs); 1724 zbr->lnum, zbr->offs);
1726 dbg_tnc("looked for key %s found node's key %s", 1725 dbg_tnck(&zbr->key, "looked for key ");
1727 DBGKEY(&zbr->key), DBGKEY1(&key1)); 1726 dbg_tnck(&key1, "found node's key ");
1728 goto out_err; 1727 goto out_err;
1729 } 1728 }
1730 1729
@@ -1777,7 +1776,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu)
1777 ubifs_err("failed to read from LEB %d:%d, error %d", 1776 ubifs_err("failed to read from LEB %d:%d, error %d",
1778 lnum, offs, err); 1777 lnum, offs, err);
1779 dbg_dump_stack(); 1778 dbg_dump_stack();
1780 dbg_tnc("key %s", DBGKEY(&bu->key)); 1779 dbg_tnck(&bu->key, "key ");
1781 return err; 1780 return err;
1782 } 1781 }
1783 1782
@@ -1812,7 +1811,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1812 int found, n, err; 1811 int found, n, err;
1813 struct ubifs_znode *znode; 1812 struct ubifs_znode *znode;
1814 1813
1815 dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); 1814 dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name);
1816 mutex_lock(&c->tnc_mutex); 1815 mutex_lock(&c->tnc_mutex);
1817 found = ubifs_lookup_level0(c, key, &znode, &n); 1816 found = ubifs_lookup_level0(c, key, &znode, &n);
1818 if (!found) { 1817 if (!found) {
@@ -1986,8 +1985,7 @@ again:
1986 zp = znode->parent; 1985 zp = znode->parent;
1987 if (znode->child_cnt < c->fanout) { 1986 if (znode->child_cnt < c->fanout) {
1988 ubifs_assert(n != c->fanout); 1987 ubifs_assert(n != c->fanout);
1989 dbg_tnc("inserted at %d level %d, key %s", n, znode->level, 1988 dbg_tnck(key, "inserted at %d level %d, key ", n, znode->level);
1990 DBGKEY(key));
1991 1989
1992 insert_zbranch(znode, zbr, n); 1990 insert_zbranch(znode, zbr, n);
1993 1991
@@ -2002,7 +2000,7 @@ again:
2002 * Unfortunately, @znode does not have more empty slots and we have to 2000 * Unfortunately, @znode does not have more empty slots and we have to
2003 * split it. 2001 * split it.
2004 */ 2002 */
2005 dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key)); 2003 dbg_tnck(key, "splitting level %d, key ", znode->level);
2006 2004
2007 if (znode->alt) 2005 if (znode->alt)
2008 /* 2006 /*
@@ -2096,7 +2094,7 @@ do_split:
2096 } 2094 }
2097 2095
2098 /* Insert new key and branch */ 2096 /* Insert new key and branch */
2099 dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key)); 2097 dbg_tnck(key, "inserting at %d level %d, key ", n, zn->level);
2100 2098
2101 insert_zbranch(zi, zbr, n); 2099 insert_zbranch(zi, zbr, n);
2102 2100
@@ -2172,7 +2170,7 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
2172 struct ubifs_znode *znode; 2170 struct ubifs_znode *znode;
2173 2171
2174 mutex_lock(&c->tnc_mutex); 2172 mutex_lock(&c->tnc_mutex);
2175 dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key)); 2173 dbg_tnck(key, "%d:%d, len %d, key ", lnum, offs, len);
2176 found = lookup_level0_dirty(c, key, &znode, &n); 2174 found = lookup_level0_dirty(c, key, &znode, &n);
2177 if (!found) { 2175 if (!found) {
2178 struct ubifs_zbranch zbr; 2176 struct ubifs_zbranch zbr;
@@ -2221,8 +2219,8 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
2221 struct ubifs_znode *znode; 2219 struct ubifs_znode *znode;
2222 2220
2223 mutex_lock(&c->tnc_mutex); 2221 mutex_lock(&c->tnc_mutex);
2224 dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum, 2222 dbg_tnck(key, "old LEB %d:%d, new LEB %d:%d, len %d, key ", old_lnum,
2225 old_offs, lnum, offs, len, DBGKEY(key)); 2223 old_offs, lnum, offs, len);
2226 found = lookup_level0_dirty(c, key, &znode, &n); 2224 found = lookup_level0_dirty(c, key, &znode, &n);
2227 if (found < 0) { 2225 if (found < 0) {
2228 err = found; 2226 err = found;
@@ -2304,8 +2302,8 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
2304 struct ubifs_znode *znode; 2302 struct ubifs_znode *znode;
2305 2303
2306 mutex_lock(&c->tnc_mutex); 2304 mutex_lock(&c->tnc_mutex);
2307 dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name, 2305 dbg_tnck(key, "LEB %d:%d, name '%.*s', key ",
2308 DBGKEY(key)); 2306 lnum, offs, nm->len, nm->name);
2309 found = lookup_level0_dirty(c, key, &znode, &n); 2307 found = lookup_level0_dirty(c, key, &znode, &n);
2310 if (found < 0) { 2308 if (found < 0) {
2311 err = found; 2309 err = found;
@@ -2398,7 +2396,7 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n)
2398 /* Delete without merge for now */ 2396 /* Delete without merge for now */
2399 ubifs_assert(znode->level == 0); 2397 ubifs_assert(znode->level == 0);
2400 ubifs_assert(n >= 0 && n < c->fanout); 2398 ubifs_assert(n >= 0 && n < c->fanout);
2401 dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key)); 2399 dbg_tnck(&znode->zbranch[n].key, "deleting key ");
2402 2400
2403 zbr = &znode->zbranch[n]; 2401 zbr = &znode->zbranch[n];
2404 lnc_free(zbr); 2402 lnc_free(zbr);
@@ -2508,7 +2506,7 @@ int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key)
2508 struct ubifs_znode *znode; 2506 struct ubifs_znode *znode;
2509 2507
2510 mutex_lock(&c->tnc_mutex); 2508 mutex_lock(&c->tnc_mutex);
2511 dbg_tnc("key %s", DBGKEY(key)); 2509 dbg_tnck(key, "key ");
2512 found = lookup_level0_dirty(c, key, &znode, &n); 2510 found = lookup_level0_dirty(c, key, &znode, &n);
2513 if (found < 0) { 2511 if (found < 0) {
2514 err = found; 2512 err = found;
@@ -2539,7 +2537,7 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
2539 struct ubifs_znode *znode; 2537 struct ubifs_znode *znode;
2540 2538
2541 mutex_lock(&c->tnc_mutex); 2539 mutex_lock(&c->tnc_mutex);
2542 dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key)); 2540 dbg_tnck(key, "%.*s, key ", nm->len, nm->name);
2543 err = lookup_level0_dirty(c, key, &znode, &n); 2541 err = lookup_level0_dirty(c, key, &znode, &n);
2544 if (err < 0) 2542 if (err < 0)
2545 goto out_unlock; 2543 goto out_unlock;
@@ -2654,7 +2652,7 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
2654 dbg_dump_znode(c, znode); 2652 dbg_dump_znode(c, znode);
2655 goto out_unlock; 2653 goto out_unlock;
2656 } 2654 }
2657 dbg_tnc("removing %s", DBGKEY(key)); 2655 dbg_tnck(key, "removing key ");
2658 } 2656 }
2659 if (k) { 2657 if (k) {
2660 for (i = n + 1 + k; i < znode->child_cnt; i++) 2658 for (i = n + 1 + k; i < znode->child_cnt; i++)
@@ -2774,7 +2772,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
2774 struct ubifs_zbranch *zbr; 2772 struct ubifs_zbranch *zbr;
2775 union ubifs_key *dkey; 2773 union ubifs_key *dkey;
2776 2774
2777 dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key)); 2775 dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)");
2778 ubifs_assert(is_hash_key(c, key)); 2776 ubifs_assert(is_hash_key(c, key));
2779 2777
2780 mutex_lock(&c->tnc_mutex); 2778 mutex_lock(&c->tnc_mutex);
@@ -3333,9 +3331,9 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
3333 3331
3334out_dump: 3332out_dump:
3335 block = key_block(c, key); 3333 block = key_block(c, key);
3336 ubifs_err("inode %lu has size %lld, but there are data at offset %lld " 3334 ubifs_err("inode %lu has size %lld, but there are data at offset %lld",
3337 "(data key %s)", (unsigned long)inode->i_ino, size, 3335 (unsigned long)inode->i_ino, size,
3338 ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key)); 3336 ((loff_t)block) << UBIFS_BLOCK_SHIFT);
3339 mutex_unlock(&c->tnc_mutex); 3337 mutex_unlock(&c->tnc_mutex);
3340 dbg_dump_inode(c, inode); 3338 dbg_dump_inode(c, inode);
3341 dbg_dump_stack(); 3339 dbg_dump_stack();
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index b48db999903e..dc28fe6ec07a 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -328,8 +328,8 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
328 case UBIFS_XENT_KEY: 328 case UBIFS_XENT_KEY:
329 break; 329 break;
330 default: 330 default:
331 dbg_msg("bad key type at slot %d: %s", i, 331 dbg_msg("bad key type at slot %d: %d",
332 DBGKEY(&zbr->key)); 332 i, key_type(c, &zbr->key));
333 err = 3; 333 err = 3;
334 goto out_dump; 334 goto out_dump;
335 } 335 }
@@ -475,7 +475,7 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
475 zbr->offs); 475 zbr->offs);
476 476
477 if (err) { 477 if (err) {
478 dbg_tnc("key %s", DBGKEY(key)); 478 dbg_tnck(key, "key ");
479 return err; 479 return err;
480 } 480 }
481 481
@@ -484,8 +484,8 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
484 if (!keys_eq(c, key, &key1)) { 484 if (!keys_eq(c, key, &key1)) {
485 ubifs_err("bad key in node at LEB %d:%d", 485 ubifs_err("bad key in node at LEB %d:%d",
486 zbr->lnum, zbr->offs); 486 zbr->lnum, zbr->offs);
487 dbg_tnc("looked for key %s found node's key %s", 487 dbg_tnck(key, "looked for key ");
488 DBGKEY(key), DBGKEY1(&key1)); 488 dbg_tnck(&key1, "but found node's key ");
489 dbg_dump_node(c, node); 489 dbg_dump_node(c, node);
490 return -EINVAL; 490 return -EINVAL;
491 } 491 }
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index bf18f7a04544..85b272268754 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -138,12 +138,11 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
138 ui = ubifs_inode(inode); 138 ui = ubifs_inode(inode);
139 ui->xattr = 1; 139 ui->xattr = 1;
140 ui->flags |= UBIFS_XATTR_FL; 140 ui->flags |= UBIFS_XATTR_FL;
141 ui->data = kmalloc(size, GFP_NOFS); 141 ui->data = kmemdup(value, size, GFP_NOFS);
142 if (!ui->data) { 142 if (!ui->data) {
143 err = -ENOMEM; 143 err = -ENOMEM;
144 goto out_free; 144 goto out_free;
145 } 145 }
146 memcpy(ui->data, value, size);
147 inode->i_size = ui->ui_size = size; 146 inode->i_size = ui->ui_size = size;
148 ui->data_len = size; 147 ui->data_len = size;
149 148
@@ -204,12 +203,11 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
204 return err; 203 return err;
205 204
206 kfree(ui->data); 205 kfree(ui->data);
207 ui->data = kmalloc(size, GFP_NOFS); 206 ui->data = kmemdup(value, size, GFP_NOFS);
208 if (!ui->data) { 207 if (!ui->data) {
209 err = -ENOMEM; 208 err = -ENOMEM;
210 goto out_free; 209 goto out_free;
211 } 210 }
212 memcpy(ui->data, value, size);
213 inode->i_size = ui->ui_size = size; 211 inode->i_size = ui->ui_size = size;
214 ui->data_len = size; 212 ui->data_len = size;
215 213
diff --git a/fs/udf/file.c b/fs/udf/file.c
index d8ffa7cc661d..dca0c3881e82 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -125,7 +125,6 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
125 err = udf_expand_file_adinicb(inode); 125 err = udf_expand_file_adinicb(inode);
126 if (err) { 126 if (err) {
127 udf_debug("udf_expand_adinicb: err=%d\n", err); 127 udf_debug("udf_expand_adinicb: err=%d\n", err);
128 up_write(&iinfo->i_data_sem);
129 return err; 128 return err;
130 } 129 }
131 } else { 130 } else {
@@ -133,9 +132,10 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
133 iinfo->i_lenAlloc = pos + count; 132 iinfo->i_lenAlloc = pos + count;
134 else 133 else
135 iinfo->i_lenAlloc = inode->i_size; 134 iinfo->i_lenAlloc = inode->i_size;
135 up_write(&iinfo->i_data_sem);
136 } 136 }
137 } 137 } else
138 up_write(&iinfo->i_data_sem); 138 up_write(&iinfo->i_data_sem);
139 139
140 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); 140 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos);
141 if (retval > 0) 141 if (retval > 0)
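
The error path above no longer drops i_data_sem because, as the fs/udf/inode.c hunks below document, udf_expand_file_adinicb() now releases the semaphore itself on every path. A sketch of the resulting caller contract, with write_fits_in_icb() standing in for the real capacity check (a hypothetical name, used only for illustration):

        down_write(&iinfo->i_data_sem);
        if (!write_fits_in_icb(inode, pos, count)) {
                /* releases i_data_sem itself, on success and on error */
                err = udf_expand_file_adinicb(inode);
                if (err)
                        return err;
        } else {
                /* in-ICB write fits: bump the allocation length, unlock */
                if (pos + count > inode->i_size)
                        iinfo->i_lenAlloc = pos + count;
                else
                        iinfo->i_lenAlloc = inode->i_size;
                up_write(&iinfo->i_data_sem);
        }

The point of the reshuffle is that the expansion path needs to drop and retake the semaphore internally (page lock ordering, see the inode.c comments below), so ownership is clearer when it takes charge of releasing it.
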
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 4598904be1bb..7699df7b3198 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -53,8 +53,7 @@ static int udf_update_inode(struct inode *, int);
53static void udf_fill_inode(struct inode *, struct buffer_head *); 53static void udf_fill_inode(struct inode *, struct buffer_head *);
54static int udf_sync_inode(struct inode *inode); 54static int udf_sync_inode(struct inode *inode);
55static int udf_alloc_i_data(struct inode *inode, size_t size); 55static int udf_alloc_i_data(struct inode *inode, size_t size);
56static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, 56static sector_t inode_getblk(struct inode *, sector_t, int *, int *);
57 sector_t *, int *);
58static int8_t udf_insert_aext(struct inode *, struct extent_position, 57static int8_t udf_insert_aext(struct inode *, struct extent_position,
59 struct kernel_lb_addr, uint32_t); 58 struct kernel_lb_addr, uint32_t);
60static void udf_split_extents(struct inode *, int *, int, int, 59static void udf_split_extents(struct inode *, int *, int, int,
@@ -151,6 +150,12 @@ const struct address_space_operations udf_aops = {
151 .bmap = udf_bmap, 150 .bmap = udf_bmap,
152}; 151};
153 152
153/*
154 * Expand file stored in ICB to a normal one-block-file
155 *
156 * This function requires i_data_sem held for writing and releases it.
157 * It also requires i_mutex to be held.
158 */
154int udf_expand_file_adinicb(struct inode *inode) 159int udf_expand_file_adinicb(struct inode *inode)
155{ 160{
156 struct page *page; 161 struct page *page;
@@ -169,9 +174,15 @@ int udf_expand_file_adinicb(struct inode *inode)
169 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; 174 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
170 /* from now on we have normal address_space methods */ 175 /* from now on we have normal address_space methods */
171 inode->i_data.a_ops = &udf_aops; 176 inode->i_data.a_ops = &udf_aops;
177 up_write(&iinfo->i_data_sem);
172 mark_inode_dirty(inode); 178 mark_inode_dirty(inode);
173 return 0; 179 return 0;
174 } 180 }
181 /*
182 * Release i_data_sem so that we can lock a page - page lock ranks
183 * above i_data_sem. i_mutex still protects us against file changes.
184 */
185 up_write(&iinfo->i_data_sem);
175 186
176 page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); 187 page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
177 if (!page) 188 if (!page)
@@ -187,6 +198,7 @@ int udf_expand_file_adinicb(struct inode *inode)
187 SetPageUptodate(page); 198 SetPageUptodate(page);
188 kunmap(page); 199 kunmap(page);
189 } 200 }
201 down_write(&iinfo->i_data_sem);
190 memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00, 202 memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00,
191 iinfo->i_lenAlloc); 203 iinfo->i_lenAlloc);
192 iinfo->i_lenAlloc = 0; 204 iinfo->i_lenAlloc = 0;
@@ -196,17 +208,20 @@ int udf_expand_file_adinicb(struct inode *inode)
196 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; 208 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
197 /* from now on we have normal address_space methods */ 209 /* from now on we have normal address_space methods */
198 inode->i_data.a_ops = &udf_aops; 210 inode->i_data.a_ops = &udf_aops;
211 up_write(&iinfo->i_data_sem);
199 err = inode->i_data.a_ops->writepage(page, &udf_wbc); 212 err = inode->i_data.a_ops->writepage(page, &udf_wbc);
200 if (err) { 213 if (err) {
201 /* Restore everything back so that we don't lose data... */ 214 /* Restore everything back so that we don't lose data... */
202 lock_page(page); 215 lock_page(page);
203 kaddr = kmap(page); 216 kaddr = kmap(page);
217 down_write(&iinfo->i_data_sem);
204 memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, 218 memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr,
205 inode->i_size); 219 inode->i_size);
206 kunmap(page); 220 kunmap(page);
207 unlock_page(page); 221 unlock_page(page);
208 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; 222 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
209 inode->i_data.a_ops = &udf_adinicb_aops; 223 inode->i_data.a_ops = &udf_adinicb_aops;
224 up_write(&iinfo->i_data_sem);
210 } 225 }
211 page_cache_release(page); 226 page_cache_release(page);
212 mark_inode_dirty(inode); 227 mark_inode_dirty(inode);
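
Two constraints shape the rework visible above: the page lock ranks above i_data_sem, so the semaphore must be dropped before touching pages, and a failed ->writepage() must restore the in-ICB state. Reduced to its skeleton (copy steps elided), the post-patch flow is:

        /* entered with i_data_sem held for write and i_mutex held */
        up_write(&iinfo->i_data_sem);      /* page lock ranks above it */

        page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
        /* ... copy the in-ICB data into the page ... */

        down_write(&iinfo->i_data_sem);    /* retake to flip ICB state */
        iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
        inode->i_data.a_ops = &udf_aops;
        up_write(&iinfo->i_data_sem);      /* drop again before the I/O */

        err = inode->i_data.a_ops->writepage(page, &udf_wbc);
        if (err) {
                /* undo: relock the page, retake the semaphore, and
                 * copy the data back into the ICB */
                lock_page(page);
                down_write(&iinfo->i_data_sem);
                /* ... memcpy page contents back, kunmap ... */
                unlock_page(page);
                iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
                inode->i_data.a_ops = &udf_adinicb_aops;
                up_write(&iinfo->i_data_sem);
        }

i_mutex, held throughout, is what keeps the file from changing while i_data_sem is temporarily dropped.
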
@@ -310,7 +325,6 @@ static int udf_get_block(struct inode *inode, sector_t block,
310 struct buffer_head *bh_result, int create) 325 struct buffer_head *bh_result, int create)
311{ 326{
312 int err, new; 327 int err, new;
313 struct buffer_head *bh;
314 sector_t phys = 0; 328 sector_t phys = 0;
315 struct udf_inode_info *iinfo; 329 struct udf_inode_info *iinfo;
316 330
@@ -323,7 +337,6 @@ static int udf_get_block(struct inode *inode, sector_t block,
323 337
324 err = -EIO; 338 err = -EIO;
325 new = 0; 339 new = 0;
326 bh = NULL;
327 iinfo = UDF_I(inode); 340 iinfo = UDF_I(inode);
328 341
329 down_write(&iinfo->i_data_sem); 342 down_write(&iinfo->i_data_sem);
@@ -332,13 +345,10 @@ static int udf_get_block(struct inode *inode, sector_t block,
332 iinfo->i_next_alloc_goal++; 345 iinfo->i_next_alloc_goal++;
333 } 346 }
334 347
335 err = 0;
336 348
337 bh = inode_getblk(inode, block, &err, &phys, &new); 349 phys = inode_getblk(inode, block, &err, &new);
338 BUG_ON(bh); 350 if (!phys)
339 if (err)
340 goto abort; 351 goto abort;
341 BUG_ON(!phys);
342 352
343 if (new) 353 if (new)
344 set_buffer_new(bh_result); 354 set_buffer_new(bh_result);
@@ -547,11 +557,10 @@ out:
547 return err; 557 return err;
548} 558}
549 559
550static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, 560static sector_t inode_getblk(struct inode *inode, sector_t block,
551 int *err, sector_t *phys, int *new) 561 int *err, int *new)
552{ 562{
553 static sector_t last_block; 563 static sector_t last_block;
554 struct buffer_head *result = NULL;
555 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; 564 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE];
556 struct extent_position prev_epos, cur_epos, next_epos; 565 struct extent_position prev_epos, cur_epos, next_epos;
557 int count = 0, startnum = 0, endnum = 0; 566 int count = 0, startnum = 0, endnum = 0;
@@ -566,6 +575,8 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
566 int goal = 0, pgoal = iinfo->i_location.logicalBlockNum; 575 int goal = 0, pgoal = iinfo->i_location.logicalBlockNum;
567 int lastblock = 0; 576 int lastblock = 0;
568 577
578 *err = 0;
579 *new = 0;
569 prev_epos.offset = udf_file_entry_alloc_offset(inode); 580 prev_epos.offset = udf_file_entry_alloc_offset(inode);
570 prev_epos.block = iinfo->i_location; 581 prev_epos.block = iinfo->i_location;
571 prev_epos.bh = NULL; 582 prev_epos.bh = NULL;
@@ -635,8 +646,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
635 brelse(cur_epos.bh); 646 brelse(cur_epos.bh);
636 brelse(next_epos.bh); 647 brelse(next_epos.bh);
637 newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset); 648 newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
638 *phys = newblock; 649 return newblock;
639 return NULL;
640 } 650 }
641 651
642 last_block = block; 652 last_block = block;
@@ -664,7 +674,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
664 brelse(cur_epos.bh); 674 brelse(cur_epos.bh);
665 brelse(next_epos.bh); 675 brelse(next_epos.bh);
666 *err = ret; 676 *err = ret;
667 return NULL; 677 return 0;
668 } 678 }
669 c = 0; 679 c = 0;
670 offset = 0; 680 offset = 0;
@@ -729,7 +739,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
729 if (!newblocknum) { 739 if (!newblocknum) {
730 brelse(prev_epos.bh); 740 brelse(prev_epos.bh);
731 *err = -ENOSPC; 741 *err = -ENOSPC;
732 return NULL; 742 return 0;
733 } 743 }
734 iinfo->i_lenExtents += inode->i_sb->s_blocksize; 744 iinfo->i_lenExtents += inode->i_sb->s_blocksize;
735 } 745 }
@@ -761,10 +771,10 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
761 771
762 newblock = udf_get_pblock(inode->i_sb, newblocknum, 772 newblock = udf_get_pblock(inode->i_sb, newblocknum,
763 iinfo->i_location.partitionReferenceNum, 0); 773 iinfo->i_location.partitionReferenceNum, 0);
764 if (!newblock) 774 if (!newblock) {
765 return NULL; 775 *err = -EIO;
766 *phys = newblock; 776 return 0;
767 *err = 0; 777 }
768 *new = 1; 778 *new = 1;
769 iinfo->i_next_alloc_block = block; 779 iinfo->i_next_alloc_block = block;
770 iinfo->i_next_alloc_goal = newblocknum; 780 iinfo->i_next_alloc_goal = newblocknum;
@@ -775,7 +785,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
775 else 785 else
776 mark_inode_dirty(inode); 786 mark_inode_dirty(inode);
777 787
778 return result; 788 return newblock;
779} 789}
780 790
781static void udf_split_extents(struct inode *inode, int *c, int offset, 791static void udf_split_extents(struct inode *inode, int *c, int offset,
@@ -1111,10 +1121,9 @@ int udf_setsize(struct inode *inode, loff_t newsize)
1111 if (bsize < 1121 if (bsize <
1112 (udf_file_entry_alloc_offset(inode) + newsize)) { 1122 (udf_file_entry_alloc_offset(inode) + newsize)) {
1113 err = udf_expand_file_adinicb(inode); 1123 err = udf_expand_file_adinicb(inode);
1114 if (err) { 1124 if (err)
1115 up_write(&iinfo->i_data_sem);
1116 return err; 1125 return err;
1117 } 1126 down_write(&iinfo->i_data_sem);
1118 } else 1127 } else
1119 iinfo->i_lenAlloc = newsize; 1128 iinfo->i_lenAlloc = newsize;
1120 } 1129 }
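
Alongside the locking rework, this file changes inode_getblk()'s calling convention: instead of returning an always-NULL buffer_head and passing the block back through a pointer, it now returns the physical block directly, with 0 meaning failure and *err carrying the reason. The caller side shrinks accordingly:

        int err = 0, new = 0;
        sector_t phys;

        phys = inode_getblk(inode, block, &err, &new);
        if (!phys)
                goto abort;     /* err holds the reason */

        if (new)
                set_buffer_new(bh_result);

The callee also takes over initialising *err and *new, so callers no longer have to pre-clear them.
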
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 0c33225647a0..c09a84daaf50 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1798,6 +1798,12 @@ static void udf_close_lvid(struct super_block *sb)
1798 le16_to_cpu(lvid->descTag.descCRCLength))); 1798 le16_to_cpu(lvid->descTag.descCRCLength)));
1799 1799
1800 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1800 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1801 /*
1802 * We set buffer uptodate unconditionally here to avoid spurious
1803 * warnings from mark_buffer_dirty() when a previous EIO has marked
1804 * the buffer as !uptodate
1805 */
1806 set_buffer_uptodate(bh);
1801 mark_buffer_dirty(bh); 1807 mark_buffer_dirty(bh);
1802 sbi->s_lvid_dirty = 0; 1808 sbi->s_lvid_dirty = 0;
1803 mutex_unlock(&sbi->s_alloc_mutex); 1809 mutex_unlock(&sbi->s_alloc_mutex);
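
mark_buffer_dirty() warns when asked to dirty a buffer that is not uptodate, and a past I/O error (EIO) can leave the LVID buffer in exactly that state. Because udf_close_lvid() has just rewritten the descriptor in full, the contents are valid by construction and the flag can be forced first:

        lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
        set_buffer_uptodate(bh);   /* clear stale !uptodate from old EIO */
        mark_buffer_dirty(bh);     /* would otherwise WARN on !uptodate */

The ordering matters only in that the uptodate bit must be set before the dirty call; nothing here reissues or retries the failed I/O.
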
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index b1d4488b0f14..d7c6dbe4194b 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -41,10 +41,16 @@ static void udf_pc_to_char(struct super_block *sb, unsigned char *from,
41 pc = (struct pathComponent *)(from + elen); 41 pc = (struct pathComponent *)(from + elen);
42 switch (pc->componentType) { 42 switch (pc->componentType) {
43 case 1: 43 case 1:
44 if (pc->lengthComponentIdent == 0) { 44 /*
45 p = to; 45 * Symlink points to some place which should be agreed
46 *p++ = '/'; 46 * upon between originator and receiver of the media. Ignore.
47 } 47 */
48 if (pc->lengthComponentIdent > 0)
49 break;
50 /* Fall through */
51 case 2:
52 p = to;
53 *p++ = '/';
48 break; 54 break;
49 case 3: 55 case 3:
50 memcpy(p, "../", 3); 56 memcpy(p, "../", 3);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 574d4ee9b625..74b9baf36ac3 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -111,8 +111,7 @@ xfs_ioend_new_eof(
111 xfs_fsize_t bsize; 111 xfs_fsize_t bsize;
112 112
113 bsize = ioend->io_offset + ioend->io_size; 113 bsize = ioend->io_offset + ioend->io_size;
114 isize = MAX(ip->i_size, ip->i_new_size); 114 isize = MIN(i_size_read(VFS_I(ip)), bsize);
115 isize = MIN(isize, bsize);
116 return isize > ip->i_d.di_size ? isize : 0; 115 return isize > ip->i_d.di_size ? isize : 0;
117} 116}
118 117
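
With the i_new_size field gone, the completion path no longer folds an "intended" size into the calculation; it clamps the end of this I/O against the VFS inode size and reports a new on-disk size only if that would grow it:

        xfs_fsize_t bsize = ioend->io_offset + ioend->io_size;
        xfs_fsize_t isize = MIN(i_size_read(VFS_I(ip)), bsize);

        /* non-zero only when the on-disk size actually needs updating */
        return isize > ip->i_d.di_size ? isize : 0;

This only works if in-flight writes keep the VFS i_size current, which is what the direct I/O hunk below and the xfs_file.c changes further down arrange.
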
@@ -126,11 +125,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
126} 125}
127 126
128/* 127/*
129 * Update on-disk file size now that data has been written to disk. The 128 * Update on-disk file size now that data has been written to disk.
130 * current in-memory file size is i_size. If a write is beyond eof i_new_size
131 * will be the intended file size until i_size is updated. If this write does
132 * not extend all the way to the valid file size then restrict this update to
133 * the end of the write.
134 * 129 *
135 * This function does not block as blocking on the inode lock in IO completion 130 * This function does not block as blocking on the inode lock in IO completion
136 * can lead to IO completion order dependency deadlocks.. If it can't get the 131 * can lead to IO completion order dependency deadlocks.. If it can't get the
@@ -1279,6 +1274,15 @@ xfs_end_io_direct_write(
1279 struct xfs_ioend *ioend = iocb->private; 1274 struct xfs_ioend *ioend = iocb->private;
1280 1275
1281 /* 1276 /*
1277 * While the generic direct I/O code updates the inode size, it does
1278 * so only after the end_io handler is called, which means our
1279 * end_io handler thinks the on-disk size is beyond the in-core
1280 * size. To prevent this, just update it a little earlier here.
1281 */
1282 if (offset + size > i_size_read(ioend->io_inode))
1283 i_size_write(ioend->io_inode, offset + size);
1284
1285 /*
1282 * blockdev_direct_IO can return an error even after the I/O 1286 * blockdev_direct_IO can return an error even after the I/O
1283 * completion handler was called. Thus we need to protect 1287 * completion handler was called. Thus we need to protect
1284 * against double-freeing. 1288 * against double-freeing.
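
The generic direct I/O code updates i_size only after invoking the filesystem's end_io callback, so without the early update, xfs_ioend_new_eof() above would see an on-disk size beyond the in-core one. The guard in isolation:

        if (offset + size > i_size_read(ioend->io_inode))
                i_size_write(ioend->io_inode, offset + size);

i_size_read()/i_size_write() are the torn-read-safe accessors for inode->i_size, which is why the hunk uses them rather than assigning to the field directly.
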
@@ -1340,12 +1344,11 @@ xfs_vm_write_failed(
1340 1344
1341 if (to > inode->i_size) { 1345 if (to > inode->i_size) {
1342 /* 1346 /*
1343 * punch out the delalloc blocks we have already allocated. We 1347 * Punch out the delalloc blocks we have already allocated.
1344 * don't call xfs_setattr() to do this as we may be in the 1348 *
1345 * middle of a multi-iovec write and so the vfs inode->i_size 1349 * Don't bother with xfs_setattr given that nothing can have
1346 * will not match the xfs ip->i_size and so it will zero too 1350 * made it to disk yet as the page is still locked at this
1347 * much. Hence we jus truncate the page cache to zero what is 1351 * point.
1348 * necessary and punch the delalloc blocks directly.
1349 */ 1352 */
1350 struct xfs_inode *ip = XFS_I(inode); 1353 struct xfs_inode *ip = XFS_I(inode);
1351 xfs_fileoff_t start_fsb; 1354 xfs_fileoff_t start_fsb;
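
What the function does instead of xfs_setattr() is punch the delayed allocations for the failed range directly. The hunk cuts off before that code; the sketch below is reconstructed from the surrounding kernel code of this era, so treat the exact conversions and the helper name as assumptions to verify against the full file:

        /* range: from the current EOF to the end of the failed write */
        start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size);
        end_fsb = XFS_B_TO_FSB(ip->i_mount, to);

        if (end_fsb > start_fsb) {
                xfs_ilock(ip, XFS_ILOCK_SHARED);
                error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
                                                end_fsb - start_fsb);
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
        }

Since the page is still locked, none of this data can have reached disk, which is the observation the new comment leans on.
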
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 1e5d97f86ea8..08b9ac644c31 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -827,10 +827,6 @@ xfs_attr_inactive(xfs_inode_t *dp)
827 if (error) 827 if (error)
828 goto out; 828 goto out;
829 829
830 /*
831 * Commit the last in the sequence of transactions.
832 */
833 xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
834 error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); 830 error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
835 xfs_iunlock(dp, XFS_ILOCK_EXCL); 831 xfs_iunlock(dp, XFS_ILOCK_EXCL);
836 832
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index c1b55e596551..d25eafd4d28d 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -271,10 +271,6 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
271 dp = args->dp; 271 dp = args->dp;
272 mp = dp->i_mount; 272 mp = dp->i_mount;
273 dp->i_d.di_forkoff = forkoff; 273 dp->i_d.di_forkoff = forkoff;
274 dp->i_df.if_ext_max =
275 XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
276 dp->i_afp->if_ext_max =
277 XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
278 274
279 ifp = dp->i_afp; 275 ifp = dp->i_afp;
280 ASSERT(ifp->if_flags & XFS_IFINLINE); 276 ASSERT(ifp->if_flags & XFS_IFINLINE);
@@ -326,7 +322,6 @@ xfs_attr_fork_reset(
326 ASSERT(ip->i_d.di_anextents == 0); 322 ASSERT(ip->i_d.di_anextents == 0);
327 ASSERT(ip->i_afp == NULL); 323 ASSERT(ip->i_afp == NULL);
328 324
329 ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
330 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 325 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
331} 326}
332 327
@@ -389,10 +384,6 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
389 (args->op_flags & XFS_DA_OP_ADDNAME) || 384 (args->op_flags & XFS_DA_OP_ADDNAME) ||
390 !(mp->m_flags & XFS_MOUNT_ATTR2) || 385 !(mp->m_flags & XFS_MOUNT_ATTR2) ||
391 dp->i_d.di_format == XFS_DINODE_FMT_BTREE); 386 dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
392 dp->i_afp->if_ext_max =
393 XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
394 dp->i_df.if_ext_max =
395 XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
396 xfs_trans_log_inode(args->trans, dp, 387 xfs_trans_log_inode(args->trans, dp,
397 XFS_ILOG_CORE | XFS_ILOG_ADATA); 388 XFS_ILOG_CORE | XFS_ILOG_ADATA);
398 } 389 }
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index d0ab78837057..188ef2fbd628 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -249,7 +249,27 @@ xfs_bmbt_lookup_ge(
249} 249}
250 250
251/* 251/*
252* Update the record referred to by cur to the value given 252 * Check if the inode needs to be converted to btree format.
253 */
254static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
255{
256 return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
257 XFS_IFORK_NEXTENTS(ip, whichfork) >
258 XFS_IFORK_MAXEXT(ip, whichfork);
259}
260
261/*
262 * Check if the inode should be converted to extent format.
263 */
264static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
265{
266 return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
267 XFS_IFORK_NEXTENTS(ip, whichfork) <=
268 XFS_IFORK_MAXEXT(ip, whichfork);
269}
270
271/*
272 * Update the record referred to by cur to the value given
253 * by [off, bno, len, state]. 273 * by [off, bno, len, state].
254 * This either works (return 0) or gets an EFSCORRUPTED error. 274 * This either works (return 0) or gets an EFSCORRUPTED error.
255 */ 275 */
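
These two predicates centralise a check that the rest of this patch deletes from every conversion site. Judging by the removals below, XFS_IFORK_MAXEXT() computes the same bound the per-fork if_ext_max field used to cache, i.e. XFS_IFORK_SIZE() divided by sizeof(xfs_bmbt_rec_t) (assumed here; the macro's definition is not in this hunk). The mechanical rewrite at each site:

        /* open-coded, pre-patch */
        if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
            XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
                /* ... convert extents to btree ... */
        }

        /* with the helper */
        if (xfs_bmap_needs_btree(ip, whichfork)) {
                /* ... convert extents to btree ... */
        }

Dropping the cached field is what lets the patch also delete the if_ext_max fixups scattered through xfs_attr_leaf.c and xfs_dfrag.c.
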
@@ -683,8 +703,8 @@ xfs_bmap_add_extent_delay_real(
683 goto done; 703 goto done;
684 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 704 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
685 } 705 }
686 if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 706
687 bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { 707 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
688 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, 708 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
689 bma->firstblock, bma->flist, 709 bma->firstblock, bma->flist,
690 &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); 710 &bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
@@ -767,8 +787,8 @@ xfs_bmap_add_extent_delay_real(
767 goto done; 787 goto done;
768 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 788 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
769 } 789 }
770 if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 790
771 bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { 791 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
772 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, 792 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
773 bma->firstblock, bma->flist, &bma->cur, 1, 793 bma->firstblock, bma->flist, &bma->cur, 1,
774 &tmp_rval, XFS_DATA_FORK); 794 &tmp_rval, XFS_DATA_FORK);
@@ -836,8 +856,8 @@ xfs_bmap_add_extent_delay_real(
836 goto done; 856 goto done;
837 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 857 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
838 } 858 }
839 if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 859
840 bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { 860 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
841 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, 861 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
842 bma->firstblock, bma->flist, &bma->cur, 862 bma->firstblock, bma->flist, &bma->cur,
843 1, &tmp_rval, XFS_DATA_FORK); 863 1, &tmp_rval, XFS_DATA_FORK);
@@ -884,8 +904,7 @@ xfs_bmap_add_extent_delay_real(
884 } 904 }
885 905
886 /* convert to a btree if necessary */ 906 /* convert to a btree if necessary */
887 if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && 907 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
888 XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) {
889 int tmp_logflags; /* partial log flag return val */ 908 int tmp_logflags; /* partial log flag return val */
890 909
891 ASSERT(bma->cur == NULL); 910 ASSERT(bma->cur == NULL);
@@ -1421,8 +1440,7 @@ xfs_bmap_add_extent_unwritten_real(
1421 } 1440 }
1422 1441
1423 /* convert to a btree if necessary */ 1442 /* convert to a btree if necessary */
1424 if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && 1443 if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
1425 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) {
1426 int tmp_logflags; /* partial log flag return val */ 1444 int tmp_logflags; /* partial log flag return val */
1427 1445
1428 ASSERT(cur == NULL); 1446 ASSERT(cur == NULL);
@@ -1812,8 +1830,7 @@ xfs_bmap_add_extent_hole_real(
1812 } 1830 }
1813 1831
1814 /* convert to a btree if necessary */ 1832 /* convert to a btree if necessary */
1815 if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS && 1833 if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
1816 XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) {
1817 int tmp_logflags; /* partial log flag return val */ 1834 int tmp_logflags; /* partial log flag return val */
1818 1835
1819 ASSERT(bma->cur == NULL); 1836 ASSERT(bma->cur == NULL);
@@ -3037,8 +3054,7 @@ xfs_bmap_extents_to_btree(
3037 3054
3038 ifp = XFS_IFORK_PTR(ip, whichfork); 3055 ifp = XFS_IFORK_PTR(ip, whichfork);
3039 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); 3056 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
3040 ASSERT(ifp->if_ext_max == 3057
3041 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
3042 /* 3058 /*
3043 * Make space in the inode incore. 3059 * Make space in the inode incore.
3044 */ 3060 */
@@ -3184,13 +3200,8 @@ xfs_bmap_forkoff_reset(
3184 ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { 3200 ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
3185 uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; 3201 uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
3186 3202
3187 if (dfl_forkoff > ip->i_d.di_forkoff) { 3203 if (dfl_forkoff > ip->i_d.di_forkoff)
3188 ip->i_d.di_forkoff = dfl_forkoff; 3204 ip->i_d.di_forkoff = dfl_forkoff;
3189 ip->i_df.if_ext_max =
3190 XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
3191 ip->i_afp->if_ext_max =
3192 XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
3193 }
3194 } 3205 }
3195} 3206}
3196 3207
@@ -3430,8 +3441,6 @@ xfs_bmap_add_attrfork(
3430 int error; /* error return value */ 3441 int error; /* error return value */
3431 3442
3432 ASSERT(XFS_IFORK_Q(ip) == 0); 3443 ASSERT(XFS_IFORK_Q(ip) == 0);
3433 ASSERT(ip->i_df.if_ext_max ==
3434 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3435 3444
3436 mp = ip->i_mount; 3445 mp = ip->i_mount;
3437 ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); 3446 ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
@@ -3486,12 +3495,9 @@ xfs_bmap_add_attrfork(
3486 error = XFS_ERROR(EINVAL); 3495 error = XFS_ERROR(EINVAL);
3487 goto error1; 3496 goto error1;
3488 } 3497 }
3489 ip->i_df.if_ext_max = 3498
3490 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
3491 ASSERT(ip->i_afp == NULL); 3499 ASSERT(ip->i_afp == NULL);
3492 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); 3500 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
3493 ip->i_afp->if_ext_max =
3494 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
3495 ip->i_afp->if_flags = XFS_IFEXTENTS; 3501 ip->i_afp->if_flags = XFS_IFEXTENTS;
3496 logflags = 0; 3502 logflags = 0;
3497 xfs_bmap_init(&flist, &firstblock); 3503 xfs_bmap_init(&flist, &firstblock);
@@ -3535,20 +3541,17 @@ xfs_bmap_add_attrfork(
3535 } else 3541 } else
3536 spin_unlock(&mp->m_sb_lock); 3542 spin_unlock(&mp->m_sb_lock);
3537 } 3543 }
3538 if ((error = xfs_bmap_finish(&tp, &flist, &committed))) 3544
3545 error = xfs_bmap_finish(&tp, &flist, &committed);
3546 if (error)
3539 goto error2; 3547 goto error2;
3540 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 3548 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3541 ASSERT(ip->i_df.if_ext_max ==
3542 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3543 return error;
3544error2: 3549error2:
3545 xfs_bmap_cancel(&flist); 3550 xfs_bmap_cancel(&flist);
3546error1: 3551error1:
3547 xfs_iunlock(ip, XFS_ILOCK_EXCL); 3552 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3548error0: 3553error0:
3549 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); 3554 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
3550 ASSERT(ip->i_df.if_ext_max ==
3551 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3552 return error; 3555 return error;
3553} 3556}
3554 3557
@@ -3994,11 +3997,8 @@ xfs_bmap_one_block(
3994 xfs_bmbt_irec_t s; /* internal version of extent */ 3997 xfs_bmbt_irec_t s; /* internal version of extent */
3995 3998
3996#ifndef DEBUG 3999#ifndef DEBUG
3997 if (whichfork == XFS_DATA_FORK) { 4000 if (whichfork == XFS_DATA_FORK)
3998 return S_ISREG(ip->i_d.di_mode) ? 4001 return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
3999 (ip->i_size == ip->i_mount->m_sb.sb_blocksize) :
4000 (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
4001 }
4002#endif /* !DEBUG */ 4002#endif /* !DEBUG */
4003 if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) 4003 if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
4004 return 0; 4004 return 0;
@@ -4010,7 +4010,7 @@ xfs_bmap_one_block(
4010 xfs_bmbt_get_all(ep, &s); 4010 xfs_bmbt_get_all(ep, &s);
4011 rval = s.br_startoff == 0 && s.br_blockcount == 1; 4011 rval = s.br_startoff == 0 && s.br_blockcount == 1;
4012 if (rval && whichfork == XFS_DATA_FORK) 4012 if (rval && whichfork == XFS_DATA_FORK)
4013 ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize); 4013 ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
4014 return rval; 4014 return rval;
4015} 4015}
4016 4016
@@ -4379,8 +4379,6 @@ xfs_bmapi_read(
4379 XFS_STATS_INC(xs_blk_mapr); 4379 XFS_STATS_INC(xs_blk_mapr);
4380 4380
4381 ifp = XFS_IFORK_PTR(ip, whichfork); 4381 ifp = XFS_IFORK_PTR(ip, whichfork);
4382 ASSERT(ifp->if_ext_max ==
4383 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4384 4382
4385 if (!(ifp->if_flags & XFS_IFEXTENTS)) { 4383 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
4386 error = xfs_iread_extents(NULL, ip, whichfork); 4384 error = xfs_iread_extents(NULL, ip, whichfork);
@@ -4871,8 +4869,6 @@ xfs_bmapi_write(
4871 return XFS_ERROR(EIO); 4869 return XFS_ERROR(EIO);
4872 4870
4873 ifp = XFS_IFORK_PTR(ip, whichfork); 4871 ifp = XFS_IFORK_PTR(ip, whichfork);
4874 ASSERT(ifp->if_ext_max ==
4875 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4876 4872
4877 XFS_STATS_INC(xs_blk_mapw); 4873 XFS_STATS_INC(xs_blk_mapw);
4878 4874
@@ -4981,8 +4977,7 @@ xfs_bmapi_write(
4981 /* 4977 /*
4982 * Transform from btree to extents, give it cur. 4978 * Transform from btree to extents, give it cur.
4983 */ 4979 */
4984 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && 4980 if (xfs_bmap_wants_extents(ip, whichfork)) {
4985 XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
4986 int tmp_logflags = 0; 4981 int tmp_logflags = 0;
4987 4982
4988 ASSERT(bma.cur); 4983 ASSERT(bma.cur);
@@ -4992,10 +4987,10 @@ xfs_bmapi_write(
4992 if (error) 4987 if (error)
4993 goto error0; 4988 goto error0;
4994 } 4989 }
4995 ASSERT(ifp->if_ext_max == 4990
4996 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4997 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || 4991 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
4998 XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); 4992 XFS_IFORK_NEXTENTS(ip, whichfork) >
4993 XFS_IFORK_MAXEXT(ip, whichfork));
4999 error = 0; 4994 error = 0;
5000error0: 4995error0:
5001 /* 4996 /*
@@ -5095,8 +5090,7 @@ xfs_bunmapi(
5095 5090
5096 ASSERT(len > 0); 5091 ASSERT(len > 0);
5097 ASSERT(nexts >= 0); 5092 ASSERT(nexts >= 0);
5098 ASSERT(ifp->if_ext_max == 5093
5099 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5100 if (!(ifp->if_flags & XFS_IFEXTENTS) && 5094 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
5101 (error = xfs_iread_extents(tp, ip, whichfork))) 5095 (error = xfs_iread_extents(tp, ip, whichfork)))
5102 return error; 5096 return error;
@@ -5322,7 +5316,8 @@ xfs_bunmapi(
5322 */ 5316 */
5323 if (!wasdel && xfs_trans_get_block_res(tp) == 0 && 5317 if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
5324 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && 5318 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
5325 XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max && 5319 XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
5320 XFS_IFORK_MAXEXT(ip, whichfork) &&
5326 del.br_startoff > got.br_startoff && 5321 del.br_startoff > got.br_startoff &&
5327 del.br_startoff + del.br_blockcount < 5322 del.br_startoff + del.br_blockcount <
5328 got.br_startoff + got.br_blockcount) { 5323 got.br_startoff + got.br_blockcount) {
@@ -5353,13 +5348,11 @@ nodelete:
5353 } 5348 }
5354 } 5349 }
5355 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; 5350 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
5356 ASSERT(ifp->if_ext_max == 5351
5357 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5358 /* 5352 /*
5359 * Convert to a btree if necessary. 5353 * Convert to a btree if necessary.
5360 */ 5354 */
5361 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && 5355 if (xfs_bmap_needs_btree(ip, whichfork)) {
5362 XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
5363 ASSERT(cur == NULL); 5356 ASSERT(cur == NULL);
5364 error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, 5357 error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
5365 &cur, 0, &tmp_logflags, whichfork); 5358 &cur, 0, &tmp_logflags, whichfork);
@@ -5370,8 +5363,7 @@ nodelete:
5370 /* 5363 /*
5371 * transform from btree to extents, give it cur 5364 * transform from btree to extents, give it cur
5372 */ 5365 */
5373 else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && 5366 else if (xfs_bmap_wants_extents(ip, whichfork)) {
5374 XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
5375 ASSERT(cur != NULL); 5367 ASSERT(cur != NULL);
5376 error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, 5368 error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
5377 whichfork); 5369 whichfork);
@@ -5382,8 +5374,6 @@ nodelete:
5382 /* 5374 /*
5383 * transform from extents to local? 5375 * transform from extents to local?
5384 */ 5376 */
5385 ASSERT(ifp->if_ext_max ==
5386 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5387 error = 0; 5377 error = 0;
5388error0: 5378error0:
5389 /* 5379 /*
@@ -5434,7 +5424,7 @@ xfs_getbmapx_fix_eof_hole(
5434 if (startblock == HOLESTARTBLOCK) { 5424 if (startblock == HOLESTARTBLOCK) {
5435 mp = ip->i_mount; 5425 mp = ip->i_mount;
5436 out->bmv_block = -1; 5426 out->bmv_block = -1;
5437 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size)); 5427 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
5438 fixlen -= out->bmv_offset; 5428 fixlen -= out->bmv_offset;
5439 if (prealloced && out->bmv_offset + out->bmv_length == end) { 5429 if (prealloced && out->bmv_offset + out->bmv_length == end) {
5440 /* Came to hole at EOF. Trim it. */ 5430 /* Came to hole at EOF. Trim it. */
@@ -5522,7 +5512,7 @@ xfs_getbmap(
5522 fixlen = XFS_MAXIOFFSET(mp); 5512 fixlen = XFS_MAXIOFFSET(mp);
5523 } else { 5513 } else {
5524 prealloced = 0; 5514 prealloced = 0;
5525 fixlen = ip->i_size; 5515 fixlen = XFS_ISIZE(ip);
5526 } 5516 }
5527 } 5517 }
5528 5518
@@ -5551,7 +5541,7 @@ xfs_getbmap(
5551 5541
5552 xfs_ilock(ip, XFS_IOLOCK_SHARED); 5542 xfs_ilock(ip, XFS_IOLOCK_SHARED);
5553 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { 5543 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
5554 if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { 5544 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
5555 error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); 5545 error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
5556 if (error) 5546 if (error)
5557 goto out_unlock_iolock; 5547 goto out_unlock_iolock;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 654dc6f05bac..dd974a55c77d 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -163,12 +163,14 @@ xfs_swap_extents_check_format(
163 163
164 /* Check temp in extent form to max in target */ 164 /* Check temp in extent form to max in target */
165 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 165 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
166 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) 166 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
167 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
167 return EINVAL; 168 return EINVAL;
168 169
169 /* Check target in extent form to max in temp */ 170 /* Check target in extent form to max in temp */
170 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 171 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
171 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) 172 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
173 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
172 return EINVAL; 174 return EINVAL;
173 175
174 /* 176 /*
@@ -180,18 +182,25 @@ xfs_swap_extents_check_format(
180 * (a common defrag case) which will occur when the temp inode is in 182 * (a common defrag case) which will occur when the temp inode is in
181 * extent format... 183 * extent format...
182 */ 184 */
183 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && 185 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
184 ((XFS_IFORK_BOFF(ip) && 186 if (XFS_IFORK_BOFF(ip) &&
185 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) || 187 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
186 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max)) 188 return EINVAL;
187 return EINVAL; 189 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
190 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
191 return EINVAL;
192 }
188 193
189 /* Reciprocal target->temp btree format checks */ 194 /* Reciprocal target->temp btree format checks */
190 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && 195 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
191 ((XFS_IFORK_BOFF(tip) && 196 if (XFS_IFORK_BOFF(tip) &&
192 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) || 197 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
193 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max)) 198 return EINVAL;
194 return EINVAL; 199
200 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
201 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
202 return EINVAL;
203 }
195 204
196 return 0; 205 return 0;
197} 206}
@@ -349,16 +358,6 @@ xfs_swap_extents(
349 *tifp = *tempifp; /* struct copy */ 358 *tifp = *tempifp; /* struct copy */
350 359
351 /* 360 /*
352 * Fix the in-memory data fork values that are dependent on the fork
353 * offset in the inode. We can't assume they remain the same as attr2
354 * has dynamic fork offsets.
355 */
356 ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
357 (uint)sizeof(xfs_bmbt_rec_t);
358 tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
359 (uint)sizeof(xfs_bmbt_rec_t);
360
361 /*
362 * Fix the on-disk inode values 361 * Fix the on-disk inode values
363 */ 362 */
364 tmp = (__uint64_t)ip->i_d.di_nblocks; 363 tmp = (__uint64_t)ip->i_d.di_nblocks;
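
The dfrag change is the same logic unrolled for readability: one compound condition per direction becomes a guarded block with two early returns, with XFS_IFORK_MAXEXT() replacing the cached if_ext_max. Side by side for the temp-to-target direction:

        /* before: one compound test */
        if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
            ((XFS_IFORK_BOFF(ip) &&
              tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ||
             XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
             XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)))
                return EINVAL;

        /* after: the same conditions, one per return */
        if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
                if (XFS_IFORK_BOFF(ip) &&
                    tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
                        return EINVAL;
                if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
                    XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
                        return EINVAL;
        }

(The "before" shown already substitutes XFS_IFORK_MAXEXT() for if_ext_max so the two sides compare like for like.)
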
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 8a24f0c6c860..286a051f12cf 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -68,7 +68,7 @@ xfs_trim_extents(
68 * Look up the longest btree in the AGF and start with it. 68 * Look up the longest btree in the AGF and start with it.
69 */ 69 */
70 error = xfs_alloc_lookup_le(cur, 0, 70 error = xfs_alloc_lookup_le(cur, 0,
71 XFS_BUF_TO_AGF(agbp)->agf_longest, &i); 71 be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i);
72 if (error) 72 if (error)
73 goto out_del_cursor; 73 goto out_del_cursor;
74 74
@@ -84,7 +84,7 @@ xfs_trim_extents(
84 if (error) 84 if (error)
85 goto out_del_cursor; 85 goto out_del_cursor;
86 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); 86 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
87 ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest); 87 ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
88 88
89 /* 89 /*
90 * Too small? Give up. 90 * Too small? Give up.
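
agf_longest lives on disk in big-endian byte order (a __be32), so using it raw as a CPU integer is only correct on big-endian hosts; on little-endian machines the unconverted value is byte-swapped garbage. The fix pattern, in isolation:

        /* on-disk field, big-endian */
        __be32 raw = XFS_BUF_TO_AGF(agbp)->agf_longest;

        /* convert before any arithmetic or comparison */
        xfs_extlen_t longest = be32_to_cpu(raw);

        error = xfs_alloc_lookup_le(cur, 0, longest, &i);

Both uses in this function, the lookup seed and the ASSERT, needed the same be32_to_cpu() treatment.
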
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f675f3d9d7b3..7e5bc872f2b4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -327,7 +327,7 @@ xfs_file_aio_read(
327 mp->m_rtdev_targp : mp->m_ddev_targp; 327 mp->m_rtdev_targp : mp->m_ddev_targp;
328 if ((iocb->ki_pos & target->bt_smask) || 328 if ((iocb->ki_pos & target->bt_smask) ||
329 (size & target->bt_smask)) { 329 (size & target->bt_smask)) {
330 if (iocb->ki_pos == ip->i_size) 330 if (iocb->ki_pos == i_size_read(inode))
331 return 0; 331 return 0;
332 return -XFS_ERROR(EINVAL); 332 return -XFS_ERROR(EINVAL);
333 } 333 }
@@ -412,51 +412,6 @@ xfs_file_splice_read(
412 return ret; 412 return ret;
413} 413}
414 414
415STATIC void
416xfs_aio_write_isize_update(
417 struct inode *inode,
418 loff_t *ppos,
419 ssize_t bytes_written)
420{
421 struct xfs_inode *ip = XFS_I(inode);
422 xfs_fsize_t isize = i_size_read(inode);
423
424 if (bytes_written > 0)
425 XFS_STATS_ADD(xs_write_bytes, bytes_written);
426
427 if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
428 *ppos > isize))
429 *ppos = isize;
430
431 if (*ppos > ip->i_size) {
432 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
433 if (*ppos > ip->i_size)
434 ip->i_size = *ppos;
435 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
436 }
437}
438
439/*
440 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
441 * part of the I/O may have been written to disk before the error occurred. In
442 * this case the on-disk file size may have been adjusted beyond the in-memory
443 * file size and now needs to be truncated back.
444 */
445STATIC void
446xfs_aio_write_newsize_update(
447 struct xfs_inode *ip,
448 xfs_fsize_t new_size)
449{
450 if (new_size == ip->i_new_size) {
451 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
452 if (new_size == ip->i_new_size)
453 ip->i_new_size = 0;
454 if (ip->i_d.di_size > ip->i_size)
455 ip->i_d.di_size = ip->i_size;
456 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
457 }
458}
459
460/* 415/*
461 * xfs_file_splice_write() does not use xfs_rw_ilock() because 416 * xfs_file_splice_write() does not use xfs_rw_ilock() because
462 * generic_file_splice_write() takes the i_mutex itself. This, in theory, 417 * generic_file_splice_write() takes the i_mutex itself. This, in theory,
@@ -475,7 +430,6 @@ xfs_file_splice_write(
475{ 430{
476 struct inode *inode = outfilp->f_mapping->host; 431 struct inode *inode = outfilp->f_mapping->host;
477 struct xfs_inode *ip = XFS_I(inode); 432 struct xfs_inode *ip = XFS_I(inode);
478 xfs_fsize_t new_size;
479 int ioflags = 0; 433 int ioflags = 0;
480 ssize_t ret; 434 ssize_t ret;
481 435
@@ -489,19 +443,12 @@ xfs_file_splice_write(
489 443
490 xfs_ilock(ip, XFS_IOLOCK_EXCL); 444 xfs_ilock(ip, XFS_IOLOCK_EXCL);
491 445
492 new_size = *ppos + count;
493
494 xfs_ilock(ip, XFS_ILOCK_EXCL);
495 if (new_size > ip->i_size)
496 ip->i_new_size = new_size;
497 xfs_iunlock(ip, XFS_ILOCK_EXCL);
498
499 trace_xfs_file_splice_write(ip, count, *ppos, ioflags); 446 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
500 447
501 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 448 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
449 if (ret > 0)
450 XFS_STATS_ADD(xs_write_bytes, ret);
502 451
503 xfs_aio_write_isize_update(inode, ppos, ret);
504 xfs_aio_write_newsize_update(ip, new_size);
505 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 452 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
506 return ret; 453 return ret;
507} 454}
@@ -689,28 +636,26 @@ out_lock:
689/* 636/*
690 * Common pre-write limit and setup checks. 637 * Common pre-write limit and setup checks.
691 * 638 *
692 * Returns with iolock held according to @iolock. 639 * Called with the iolock held either shared or exclusive according to
640 * @iolock, and returns with it held. Might upgrade the iolock to exclusive
641 * if called for a direct write beyond i_size.
693 */ 642 */
694STATIC ssize_t 643STATIC ssize_t
695xfs_file_aio_write_checks( 644xfs_file_aio_write_checks(
696 struct file *file, 645 struct file *file,
697 loff_t *pos, 646 loff_t *pos,
698 size_t *count, 647 size_t *count,
699 xfs_fsize_t *new_sizep,
700 int *iolock) 648 int *iolock)
701{ 649{
702 struct inode *inode = file->f_mapping->host; 650 struct inode *inode = file->f_mapping->host;
703 struct xfs_inode *ip = XFS_I(inode); 651 struct xfs_inode *ip = XFS_I(inode);
704 xfs_fsize_t new_size;
705 int error = 0; 652 int error = 0;
706 653
707 xfs_rw_ilock(ip, XFS_ILOCK_EXCL); 654 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
708 *new_sizep = 0;
709restart: 655restart:
710 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); 656 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
711 if (error) { 657 if (error) {
712 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); 658 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
713 *iolock = 0;
714 return error; 659 return error;
715 } 660 }
716 661
@@ -720,36 +665,21 @@ restart:
720 /* 665 /*
721 * If the offset is beyond the size of the file, we need to zero any 666 * If the offset is beyond the size of the file, we need to zero any
722 * blocks that fall between the existing EOF and the start of this 667 * blocks that fall between the existing EOF and the start of this
723 * write. There is no need to issue zeroing if another in-flght IO ends 668 * write. If zeroing is needed and we are currently holding the
724 * at or before this one If zeronig is needed and we are currently 669 * iolock shared, we need to update it to exclusive which involves
725 * holding the iolock shared, we need to update it to exclusive which 670 * dropping all locks and relocking to maintain correct locking order.
726 * involves dropping all locks and relocking to maintain correct locking 671 * If we do this, restart the function to ensure all checks and values
727 * order. If we do this, restart the function to ensure all checks and 672 * are still valid.
728 * values are still valid.
729 */ 673 */
730 if ((ip->i_new_size && *pos > ip->i_new_size) || 674 if (*pos > i_size_read(inode)) {
731 (!ip->i_new_size && *pos > ip->i_size)) {
732 if (*iolock == XFS_IOLOCK_SHARED) { 675 if (*iolock == XFS_IOLOCK_SHARED) {
733 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); 676 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
734 *iolock = XFS_IOLOCK_EXCL; 677 *iolock = XFS_IOLOCK_EXCL;
735 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); 678 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
736 goto restart; 679 goto restart;
737 } 680 }
738 error = -xfs_zero_eof(ip, *pos, ip->i_size); 681 error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
739 } 682 }
740
741 /*
742 * If this IO extends beyond EOF, we may need to update ip->i_new_size.
743 * We have already zeroed space beyond EOF (if necessary). Only update
744 * ip->i_new_size if this IO ends beyond any other in-flight writes.
745 */
746 new_size = *pos + *count;
747 if (new_size > ip->i_size) {
748 if (new_size > ip->i_new_size)
749 ip->i_new_size = new_size;
750 *new_sizep = new_size;
751 }
752
753 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); 683 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
754 if (error) 684 if (error)
755 return error; 685 return error;
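
The checks helper now carries less state but a subtler locking rule: if the write starts beyond EOF while only the shared iolock is held, it must upgrade to exclusive, and since that cannot be done atomically it drops every lock, retakes, and jumps back to restart so that all decisions are re-made under the new locks. The core of the pattern, condensed from the post-patch code above:

        xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
restart:
        error = generic_write_checks(file, pos, count,
                                     S_ISBLK(inode->i_mode));
        if (error) {
                xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
                return error;
        }

        if (*pos > i_size_read(inode)) {
                if (*iolock == XFS_IOLOCK_SHARED) {
                        /* drop both, retake with exclusive iolock */
                        xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
                        *iolock = XFS_IOLOCK_EXCL;
                        xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
                        goto restart;   /* nothing decided so far holds */
                }
                error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
        }
        xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);

The goto after the relock is the load-bearing part: the i_size comparison and the generic checks are only trustworthy for the lock mode they ran under.
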
@@ -794,9 +724,7 @@ xfs_file_dio_aio_write(
794 const struct iovec *iovp, 724 const struct iovec *iovp,
795 unsigned long nr_segs, 725 unsigned long nr_segs,
796 loff_t pos, 726 loff_t pos,
797 size_t ocount, 727 size_t ocount)
798 xfs_fsize_t *new_size,
799 int *iolock)
800{ 728{
801 struct file *file = iocb->ki_filp; 729 struct file *file = iocb->ki_filp;
802 struct address_space *mapping = file->f_mapping; 730 struct address_space *mapping = file->f_mapping;
@@ -806,10 +734,10 @@ xfs_file_dio_aio_write(
806 ssize_t ret = 0; 734 ssize_t ret = 0;
807 size_t count = ocount; 735 size_t count = ocount;
808 int unaligned_io = 0; 736 int unaligned_io = 0;
737 int iolock;
809 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? 738 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
810 mp->m_rtdev_targp : mp->m_ddev_targp; 739 mp->m_rtdev_targp : mp->m_ddev_targp;
811 740
812 *iolock = 0;
813 if ((pos & target->bt_smask) || (count & target->bt_smask)) 741 if ((pos & target->bt_smask) || (count & target->bt_smask))
814 return -XFS_ERROR(EINVAL); 742 return -XFS_ERROR(EINVAL);
815 743
@@ -824,31 +752,31 @@ xfs_file_dio_aio_write(
824 * EOF zeroing cases and fill out the new inode size as appropriate. 752 * EOF zeroing cases and fill out the new inode size as appropriate.
825 */ 753 */
826 if (unaligned_io || mapping->nrpages) 754 if (unaligned_io || mapping->nrpages)
827 *iolock = XFS_IOLOCK_EXCL; 755 iolock = XFS_IOLOCK_EXCL;
828 else 756 else
829 *iolock = XFS_IOLOCK_SHARED; 757 iolock = XFS_IOLOCK_SHARED;
830 xfs_rw_ilock(ip, *iolock); 758 xfs_rw_ilock(ip, iolock);
831 759
832 /* 760 /*
833 * Recheck if there are cached pages that need invalidate after we got 761 * Recheck if there are cached pages that need invalidate after we got
834 * the iolock to protect against other threads adding new pages while 762 * the iolock to protect against other threads adding new pages while
835 * we were waiting for the iolock. 763 * we were waiting for the iolock.
836 */ 764 */
837 if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { 765 if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
838 xfs_rw_iunlock(ip, *iolock); 766 xfs_rw_iunlock(ip, iolock);
839 *iolock = XFS_IOLOCK_EXCL; 767 iolock = XFS_IOLOCK_EXCL;
840 xfs_rw_ilock(ip, *iolock); 768 xfs_rw_ilock(ip, iolock);
841 } 769 }
842 770
843 ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); 771 ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
844 if (ret) 772 if (ret)
845 return ret; 773 goto out;
846 774
847 if (mapping->nrpages) { 775 if (mapping->nrpages) {
848 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, 776 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
849 FI_REMAPF_LOCKED); 777 FI_REMAPF_LOCKED);
850 if (ret) 778 if (ret)
851 return ret; 779 goto out;
852 } 780 }
853 781
854 /* 782 /*
@@ -857,15 +785,18 @@ xfs_file_dio_aio_write(
857 */ 785 */
858 if (unaligned_io) 786 if (unaligned_io)
859 inode_dio_wait(inode); 787 inode_dio_wait(inode);
860 else if (*iolock == XFS_IOLOCK_EXCL) { 788 else if (iolock == XFS_IOLOCK_EXCL) {
861 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); 789 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
862 *iolock = XFS_IOLOCK_SHARED; 790 iolock = XFS_IOLOCK_SHARED;
863 } 791 }
864 792
865 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); 793 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
866 ret = generic_file_direct_write(iocb, iovp, 794 ret = generic_file_direct_write(iocb, iovp,
867 &nr_segs, pos, &iocb->ki_pos, count, ocount); 795 &nr_segs, pos, &iocb->ki_pos, count, ocount);
868 796
797out:
798 xfs_rw_iunlock(ip, iolock);
799
869 /* No fallback to buffered IO on errors for XFS. */ 800 /* No fallback to buffered IO on errors for XFS. */
870 ASSERT(ret < 0 || ret == count); 801 ASSERT(ret < 0 || ret == count);
871 return ret; 802 return ret;
@@ -877,9 +808,7 @@ xfs_file_buffered_aio_write(
877 const struct iovec *iovp, 808 const struct iovec *iovp,
878 unsigned long nr_segs, 809 unsigned long nr_segs,
879 loff_t pos, 810 loff_t pos,
880 size_t ocount, 811 size_t ocount)
881 xfs_fsize_t *new_size,
882 int *iolock)
883{ 812{
884 struct file *file = iocb->ki_filp; 813 struct file *file = iocb->ki_filp;
885 struct address_space *mapping = file->f_mapping; 814 struct address_space *mapping = file->f_mapping;
@@ -887,14 +816,14 @@ xfs_file_buffered_aio_write(
887 struct xfs_inode *ip = XFS_I(inode); 816 struct xfs_inode *ip = XFS_I(inode);
888 ssize_t ret; 817 ssize_t ret;
889 int enospc = 0; 818 int enospc = 0;
819 int iolock = XFS_IOLOCK_EXCL;
890 size_t count = ocount; 820 size_t count = ocount;
891 821
892 *iolock = XFS_IOLOCK_EXCL; 822 xfs_rw_ilock(ip, iolock);
893 xfs_rw_ilock(ip, *iolock);
894 823
895 ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); 824 ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
896 if (ret) 825 if (ret)
897 return ret; 826 goto out;
898 827
899 /* We can write back this queue in page reclaim */ 828 /* We can write back this queue in page reclaim */
900 current->backing_dev_info = mapping->backing_dev_info; 829 current->backing_dev_info = mapping->backing_dev_info;
@@ -908,13 +837,15 @@ write_retry:
908 * page locks and retry *once* 837 * page locks and retry *once*
909 */ 838 */
910 if (ret == -ENOSPC && !enospc) { 839 if (ret == -ENOSPC && !enospc) {
911 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
912 if (ret)
913 return ret;
914 enospc = 1; 840 enospc = 1;
915 goto write_retry; 841 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
842 if (!ret)
843 goto write_retry;
916 } 844 }
845
917 current->backing_dev_info = NULL; 846 current->backing_dev_info = NULL;
847out:
848 xfs_rw_iunlock(ip, iolock);
918 return ret; 849 return ret;
919} 850}
920 851
@@ -930,9 +861,7 @@ xfs_file_aio_write(
930 struct inode *inode = mapping->host; 861 struct inode *inode = mapping->host;
931 struct xfs_inode *ip = XFS_I(inode); 862 struct xfs_inode *ip = XFS_I(inode);
932 ssize_t ret; 863 ssize_t ret;
933 int iolock;
934 size_t ocount = 0; 864 size_t ocount = 0;
935 xfs_fsize_t new_size = 0;
936 865
937 XFS_STATS_INC(xs_write_calls); 866 XFS_STATS_INC(xs_write_calls);
938 867
@@ -951,33 +880,22 @@ xfs_file_aio_write(
951 return -EIO; 880 return -EIO;
952 881
953 if (unlikely(file->f_flags & O_DIRECT)) 882 if (unlikely(file->f_flags & O_DIRECT))
954 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, 883 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
955 ocount, &new_size, &iolock);
956 else 884 else
957 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, 885 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
958 ocount, &new_size, &iolock); 886 ocount);
959
960 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
961 887
962 if (ret <= 0) 888 if (ret > 0) {
963 goto out_unlock; 889 ssize_t err;
964 890
965 /* Handle various SYNC-type writes */ 891 XFS_STATS_ADD(xs_write_bytes, ret);
966 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
967 loff_t end = pos + ret - 1;
968 int error;
969 892
970 xfs_rw_iunlock(ip, iolock); 893 /* Handle various SYNC-type writes */
971 error = xfs_file_fsync(file, pos, end, 894 err = generic_write_sync(file, pos, ret);
972 (file->f_flags & __O_SYNC) ? 0 : 1); 895 if (err < 0)
973 xfs_rw_ilock(ip, iolock); 896 ret = err;
974 if (error)
975 ret = error;
976 } 897 }
977 898
978out_unlock:
979 xfs_aio_write_newsize_update(ip, new_size);
980 xfs_rw_iunlock(ip, iolock);
981 return ret; 899 return ret;
982} 900}
983 901
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
index ed88ed16811c..652b875a9d4c 100644
--- a/fs/xfs/xfs_fs_subr.c
+++ b/fs/xfs/xfs_fs_subr.c
@@ -90,7 +90,7 @@ xfs_wait_on_pages(
90 90
91 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { 91 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
92 return -filemap_fdatawait_range(mapping, first, 92 return -filemap_fdatawait_range(mapping, first,
93 last == -1 ? ip->i_size - 1 : last); 93 last == -1 ? XFS_ISIZE(ip) - 1 : last);
94 } 94 }
95 return 0; 95 return 0;
96} 96}
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 3960a066d7ff..8c3e46394d48 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -77,7 +77,7 @@ xfs_inode_alloc(
77 77
78 ASSERT(atomic_read(&ip->i_pincount) == 0); 78 ASSERT(atomic_read(&ip->i_pincount) == 0);
79 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 79 ASSERT(!spin_is_locked(&ip->i_flags_lock));
80 ASSERT(completion_done(&ip->i_flush)); 80 ASSERT(!xfs_isiflocked(ip));
81 ASSERT(ip->i_ino == 0); 81 ASSERT(ip->i_ino == 0);
82 82
83 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 83 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
@@ -94,8 +94,6 @@ xfs_inode_alloc(
94 ip->i_update_core = 0; 94 ip->i_update_core = 0;
95 ip->i_delayed_blks = 0; 95 ip->i_delayed_blks = 0;
96 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); 96 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
97 ip->i_size = 0;
98 ip->i_new_size = 0;
99 97
100 return ip; 98 return ip;
101} 99}
@@ -150,7 +148,7 @@ xfs_inode_free(
150 /* asserts to verify all state is correct here */ 148 /* asserts to verify all state is correct here */
151 ASSERT(atomic_read(&ip->i_pincount) == 0); 149 ASSERT(atomic_read(&ip->i_pincount) == 0);
152 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 150 ASSERT(!spin_is_locked(&ip->i_flags_lock));
153 ASSERT(completion_done(&ip->i_flush)); 151 ASSERT(!xfs_isiflocked(ip));
154 152
155 /* 153 /*
156 * Because we use RCU freeing we need to ensure the inode always 154 * Because we use RCU freeing we need to ensure the inode always
@@ -450,8 +448,6 @@ again:
450 448
451 *ipp = ip; 449 *ipp = ip;
452 450
453 ASSERT(ip->i_df.if_ext_max ==
454 XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
455 /* 451 /*
456 * If we have a real type for an on-disk inode, we can set ops(&unlock) 452 * If we have a real type for an on-disk inode, we can set ops(&unlock)
457 * now. If it's a new inode being created, xfs_ialloc will handle it. 453 * now. If it's a new inode being created, xfs_ialloc will handle it.
@@ -715,3 +711,19 @@ xfs_isilocked(
715 return 0; 711 return 0;
716} 712}
717#endif 713#endif
714
715void
716__xfs_iflock(
717 struct xfs_inode *ip)
718{
719 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
720 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
721
722 do {
723 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
724 if (xfs_isiflocked(ip))
725 io_schedule();
726 } while (!xfs_iflock_nowait(ip));
727
728 finish_wait(wq, &wait.wait);
729}
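
__xfs_iflock() open-codes its wait loop because the flush "lock" is just a flag bit: whoever wins xfs_iflock_nowait() owns it, and prepare_to_wait_exclusive() ensures only one sleeper is woken per release. A userspace analogue of the same structure using pthreads, illustrative only (all names here are invented):

        #include <pthread.h>
        #include <stdbool.h>

        struct flush_lock {
                pthread_mutex_t mtx;
                pthread_cond_t  cv;
                bool            flocked;        /* the XFS_IFLOCK analogue */
        };

        static struct flush_lock fl = {
                .mtx = PTHREAD_MUTEX_INITIALIZER,
                .cv  = PTHREAD_COND_INITIALIZER,
        };

        static bool flush_trylock(struct flush_lock *l)  /* xfs_iflock_nowait() */
        {
                pthread_mutex_lock(&l->mtx);
                bool won = !l->flocked;
                l->flocked = true;              /* no-op if already set */
                pthread_mutex_unlock(&l->mtx);
                return won;
        }

        static void flush_lock(struct flush_lock *l)     /* __xfs_iflock() */
        {
                pthread_mutex_lock(&l->mtx);
                while (l->flocked)
                        pthread_cond_wait(&l->cv, &l->mtx);
                l->flocked = true;
                pthread_mutex_unlock(&l->mtx);
        }

        static void flush_unlock(struct flush_lock *l)   /* xfs_ifunlock() */
        {
                pthread_mutex_lock(&l->mtx);
                l->flocked = false;
                pthread_cond_signal(&l->cv);    /* wake one, like the exclusive
                                                   bit wait in the kernel code */
                pthread_mutex_unlock(&l->mtx);
        }
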
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9dda7cc32848..b21022499c2e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -299,11 +299,8 @@ xfs_iformat(
299{ 299{
300 xfs_attr_shortform_t *atp; 300 xfs_attr_shortform_t *atp;
301 int size; 301 int size;
302 int error; 302 int error = 0;
303 xfs_fsize_t di_size; 303 xfs_fsize_t di_size;
304 ip->i_df.if_ext_max =
305 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
306 error = 0;
307 304
308 if (unlikely(be32_to_cpu(dip->di_nextents) + 305 if (unlikely(be32_to_cpu(dip->di_nextents) +
309 be16_to_cpu(dip->di_anextents) > 306 be16_to_cpu(dip->di_anextents) >
@@ -350,7 +347,6 @@ xfs_iformat(
350 return XFS_ERROR(EFSCORRUPTED); 347 return XFS_ERROR(EFSCORRUPTED);
351 } 348 }
352 ip->i_d.di_size = 0; 349 ip->i_d.di_size = 0;
353 ip->i_size = 0;
354 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); 350 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
355 break; 351 break;
356 352
@@ -409,10 +405,10 @@ xfs_iformat(
409 } 405 }
410 if (!XFS_DFORK_Q(dip)) 406 if (!XFS_DFORK_Q(dip))
411 return 0; 407 return 0;
408
412 ASSERT(ip->i_afp == NULL); 409 ASSERT(ip->i_afp == NULL);
413 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); 410 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
414 ip->i_afp->if_ext_max = 411
415 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
416 switch (dip->di_aformat) { 412 switch (dip->di_aformat) {
417 case XFS_DINODE_FMT_LOCAL: 413 case XFS_DINODE_FMT_LOCAL:
418 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); 414 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
@@ -604,10 +600,11 @@ xfs_iformat_btree(
604 * or the number of extents is greater than the number of 600 * or the number of extents is greater than the number of
605 * blocks. 601 * blocks.
606 */ 602 */
607 if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max 603 if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
608 || XFS_BMDR_SPACE_CALC(nrecs) > 604 XFS_IFORK_MAXEXT(ip, whichfork) ||
609 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) 605 XFS_BMDR_SPACE_CALC(nrecs) >
610 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { 606 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
607 XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
611 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", 608 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
612 (unsigned long long) ip->i_ino); 609 (unsigned long long) ip->i_ino);
613 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, 610 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
@@ -835,12 +832,6 @@ xfs_iread(
835 * with the uninitialized part of it. 832 * with the uninitialized part of it.
836 */ 833 */
837 ip->i_d.di_mode = 0; 834 ip->i_d.di_mode = 0;
838 /*
839 * Initialize the per-fork minima and maxima for a new
840 * inode here. xfs_iformat will do it for old inodes.
841 */
842 ip->i_df.if_ext_max =
843 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
844 } 835 }
845 836
846 /* 837 /*
@@ -861,7 +852,6 @@ xfs_iread(
861 } 852 }
862 853
863 ip->i_delayed_blks = 0; 854 ip->i_delayed_blks = 0;
864 ip->i_size = ip->i_d.di_size;
865 855
866 /* 856 /*
867 * Mark the buffer containing the inode as something to keep 857 * Mark the buffer containing the inode as something to keep
@@ -1051,7 +1041,6 @@ xfs_ialloc(
1051 } 1041 }
1052 1042
1053 ip->i_d.di_size = 0; 1043 ip->i_d.di_size = 0;
1054 ip->i_size = 0;
1055 ip->i_d.di_nextents = 0; 1044 ip->i_d.di_nextents = 0;
1056 ASSERT(ip->i_d.di_nblocks == 0); 1045 ASSERT(ip->i_d.di_nblocks == 0);
1057 1046
@@ -1166,52 +1155,6 @@ xfs_ialloc(
1166} 1155}
1167 1156
1168/* 1157/*
1169 * Check to make sure that there are no blocks allocated to the
1170 * file beyond the size of the file. We don't check this for
1171 * files with fixed size extents or real time extents, but we
1172 * at least do it for regular files.
1173 */
1174#ifdef DEBUG
1175STATIC void
1176xfs_isize_check(
1177 struct xfs_inode *ip,
1178 xfs_fsize_t isize)
1179{
1180 struct xfs_mount *mp = ip->i_mount;
1181 xfs_fileoff_t map_first;
1182 int nimaps;
1183 xfs_bmbt_irec_t imaps[2];
1184 int error;
1185
1186 if (!S_ISREG(ip->i_d.di_mode))
1187 return;
1188
1189 if (XFS_IS_REALTIME_INODE(ip))
1190 return;
1191
1192 if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
1193 return;
1194
1195 nimaps = 2;
1196 map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
1197 /*
1198 * The filesystem could be shutting down, so bmapi may return
1199 * an error.
1200 */
1201 error = xfs_bmapi_read(ip, map_first,
1202 (XFS_B_TO_FSB(mp,
1203 (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first),
1204 imaps, &nimaps, XFS_BMAPI_ENTIRE);
1205 if (error)
1206 return;
1207 ASSERT(nimaps == 1);
1208 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
1209}
1210#else /* DEBUG */
1211#define xfs_isize_check(ip, isize)
1212#endif /* DEBUG */
1213
1214/*
1215 * Free up the underlying blocks past new_size. The new size must be smaller 1158 * Free up the underlying blocks past new_size. The new size must be smaller
1216 * than the current size. This routine can be used both for the attribute and 1159 * than the current size. This routine can be used both for the attribute and
1217 * data fork, and does not modify the inode size, which is left to the caller. 1160 * data fork, and does not modify the inode size, which is left to the caller.
@@ -1252,12 +1195,14 @@ xfs_itruncate_extents(
1252 int done = 0; 1195 int done = 0;
1253 1196
1254 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 1197 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
1255 ASSERT(new_size <= ip->i_size); 1198 ASSERT(new_size <= XFS_ISIZE(ip));
1256 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 1199 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1257 ASSERT(ip->i_itemp != NULL); 1200 ASSERT(ip->i_itemp != NULL);
1258 ASSERT(ip->i_itemp->ili_lock_flags == 0); 1201 ASSERT(ip->i_itemp->ili_lock_flags == 0);
1259 ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); 1202 ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1260 1203
1204 trace_xfs_itruncate_extents_start(ip, new_size);
1205
1261 /* 1206 /*
1262 * Since it is possible for space to become allocated beyond 1207 * Since it is possible for space to become allocated beyond
1263 * the end of the file (in a crash where the space is allocated 1208 * the end of the file (in a crash where the space is allocated
@@ -1325,6 +1270,14 @@ xfs_itruncate_extents(
1325 goto out; 1270 goto out;
1326 } 1271 }
1327 1272
1273 /*
1274 * Always re-log the inode so that our permanent transaction can keep
1275 * on rolling it forward in the log.
1276 */
1277 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1278
1279 trace_xfs_itruncate_extents_end(ip, new_size);
1280
1328out: 1281out:
1329 *tpp = tp; 1282 *tpp = tp;
1330 return error; 1283 return error;
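
The new xfs_trans_log_inode() call implements the comment above it: xfs_itruncate_extents() runs as a permanent transaction that is committed and rolled repeatedly while extents are unmapped a few at a time, and re-logging the inode each round moves its log item forward so it never pins the log tail. A condensed sketch of that shape, with hypothetical helper names (the real loop lives earlier in this function):

        do {
                error = unmap_some_extents(tp, ip, &done);      /* hypothetical */
                if (error)
                        break;
                xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);     /* keep the log
                                                                   item moving */
                error = xfs_trans_roll(&tp, ip);        /* commit, start next */
        } while (!done && !error);
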
@@ -1338,74 +1291,6 @@ out_bmap_cancel:
1338 goto out; 1291 goto out;
1339} 1292}
1340 1293
1341int
1342xfs_itruncate_data(
1343 struct xfs_trans **tpp,
1344 struct xfs_inode *ip,
1345 xfs_fsize_t new_size)
1346{
1347 int error;
1348
1349 trace_xfs_itruncate_data_start(ip, new_size);
1350
1351 /*
1352 * The first thing we do is set the size to new_size permanently on
1353 * disk. This way we don't have to worry about anyone ever being able
1354 * to look at the data being freed even in the face of a crash.
1355 * What we're getting around here is the case where we free a block, it
1356 * is allocated to another file, it is written to, and then we crash.
1357 * If the new data gets written to the file but the log buffers
1358 * containing the free and reallocation don't, then we'd end up with
1359 * garbage in the blocks being freed. As long as we make the new_size
1360 * permanent before actually freeing any blocks it doesn't matter if
1361 * they get written to.
1362 */
1363 if (ip->i_d.di_nextents > 0) {
1364 /*
1365 * If we are not changing the file size then do not update
1366 * the on-disk file size - we may be called from
1367 * xfs_inactive_free_eofblocks(). If we update the on-disk
1368 * file size and then the system crashes before the contents
1369 * of the file are flushed to disk then the files may be
1370 * full of holes (ie NULL files bug).
1371 */
1372 if (ip->i_size != new_size) {
1373 ip->i_d.di_size = new_size;
1374 ip->i_size = new_size;
1375 xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1376 }
1377 }
1378
1379 error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size);
1380 if (error)
1381 return error;
1382
1383 /*
1384 * If we are not changing the file size then do not update the on-disk
1385 * file size - we may be called from xfs_inactive_free_eofblocks().
1386 * If we update the on-disk file size and then the system crashes
1387 * before the contents of the file are flushed to disk then the files
1388 * may be full of holes (ie NULL files bug).
1389 */
1390 xfs_isize_check(ip, new_size);
1391 if (ip->i_size != new_size) {
1392 ip->i_d.di_size = new_size;
1393 ip->i_size = new_size;
1394 }
1395
1396 ASSERT(new_size != 0 || ip->i_delayed_blks == 0);
1397 ASSERT(new_size != 0 || ip->i_d.di_nextents == 0);
1398
1399 /*
1400 * Always re-log the inode so that our permanent transaction can keep
1401 * on rolling it forward in the log.
1402 */
1403 xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1404
1405 trace_xfs_itruncate_data_end(ip, new_size);
1406 return 0;
1407}
1408
1409/* 1294/*
1410 * This is called when the inode's link count goes to 0. 1295 * This is called when the inode's link count goes to 0.
1411 * We place the on-disk inode on a list in the AGI. It 1296 * We place the on-disk inode on a list in the AGI. It
@@ -1824,8 +1709,7 @@ xfs_ifree(
1824 ASSERT(ip->i_d.di_nlink == 0); 1709 ASSERT(ip->i_d.di_nlink == 0);
1825 ASSERT(ip->i_d.di_nextents == 0); 1710 ASSERT(ip->i_d.di_nextents == 0);
1826 ASSERT(ip->i_d.di_anextents == 0); 1711 ASSERT(ip->i_d.di_anextents == 0);
1827 ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || 1712 ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
1828 (!S_ISREG(ip->i_d.di_mode)));
1829 ASSERT(ip->i_d.di_nblocks == 0); 1713 ASSERT(ip->i_d.di_nblocks == 0);
1830 1714
1831 /* 1715 /*
@@ -1844,8 +1728,6 @@ xfs_ifree(
1844 ip->i_d.di_flags = 0; 1728 ip->i_d.di_flags = 0;
1845 ip->i_d.di_dmevmask = 0; 1729 ip->i_d.di_dmevmask = 0;
1846 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ 1730 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
1847 ip->i_df.if_ext_max =
1848 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
1849 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; 1731 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1850 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; 1732 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1851 /* 1733 /*
@@ -2151,7 +2033,7 @@ xfs_idestroy_fork(
2151 * once someone is waiting for it to be unpinned. 2033 * once someone is waiting for it to be unpinned.
2152 */ 2034 */
2153static void 2035static void
2154xfs_iunpin_nowait( 2036xfs_iunpin(
2155 struct xfs_inode *ip) 2037 struct xfs_inode *ip)
2156{ 2038{
2157 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2039 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
@@ -2163,14 +2045,29 @@ xfs_iunpin_nowait(
2163 2045
2164} 2046}
2165 2047
2048static void
2049__xfs_iunpin_wait(
2050 struct xfs_inode *ip)
2051{
2052 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2053 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2054
2055 xfs_iunpin(ip);
2056
2057 do {
2058 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
2059 if (xfs_ipincount(ip))
2060 io_schedule();
2061 } while (xfs_ipincount(ip));
2062 finish_wait(wq, &wait.wait);
2063}
2064
2166void 2065void
2167xfs_iunpin_wait( 2066xfs_iunpin_wait(
2168 struct xfs_inode *ip) 2067 struct xfs_inode *ip)
2169{ 2068{
2170 if (xfs_ipincount(ip)) { 2069 if (xfs_ipincount(ip))
2171 xfs_iunpin_nowait(ip); 2070 __xfs_iunpin_wait(ip);
2172 wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
2173 }
2174} 2071}
2175 2072
2176/* 2073/*
@@ -2510,9 +2407,9 @@ xfs_iflush(
2510 XFS_STATS_INC(xs_iflush_count); 2407 XFS_STATS_INC(xs_iflush_count);
2511 2408
2512 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2409 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2513 ASSERT(!completion_done(&ip->i_flush)); 2410 ASSERT(xfs_isiflocked(ip));
2514 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2411 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2515 ip->i_d.di_nextents > ip->i_df.if_ext_max); 2412 ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2516 2413
2517 iip = ip->i_itemp; 2414 iip = ip->i_itemp;
2518 mp = ip->i_mount; 2415 mp = ip->i_mount;
@@ -2529,7 +2426,7 @@ xfs_iflush(
2529 * out for us if they occur after the log force completes. 2426 * out for us if they occur after the log force completes.
2530 */ 2427 */
2531 if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { 2428 if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
2532 xfs_iunpin_nowait(ip); 2429 xfs_iunpin(ip);
2533 xfs_ifunlock(ip); 2430 xfs_ifunlock(ip);
2534 return EAGAIN; 2431 return EAGAIN;
2535 } 2432 }
@@ -2626,9 +2523,9 @@ xfs_iflush_int(
2626#endif 2523#endif
2627 2524
2628 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2525 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2629 ASSERT(!completion_done(&ip->i_flush)); 2526 ASSERT(xfs_isiflocked(ip));
2630 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2527 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2631 ip->i_d.di_nextents > ip->i_df.if_ext_max); 2528 ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2632 2529
2633 iip = ip->i_itemp; 2530 iip = ip->i_itemp;
2634 mp = ip->i_mount; 2531 mp = ip->i_mount;
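
Note the deliberate asymmetry between the two wait loops added in this file and in xfs_iget.c: __xfs_iflock() sleeps with prepare_to_wait_exclusive() because the flush lock hands off to exactly one waiter per xfs_ifunlock(), while __xfs_iunpin_wait() uses plain prepare_to_wait() because the pin count reaching zero is a broadcast event that every waiter may act on. Side by side:

        /* flush lock: hand-off, wake one waiter per release */
        prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);

        /* pin count: broadcast, all waiters proceed once it hits zero */
        prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
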
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f0e6b151ba37..2f27b7454085 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -66,7 +66,6 @@ typedef struct xfs_ifork {
66 struct xfs_btree_block *if_broot; /* file's incore btree root */ 66 struct xfs_btree_block *if_broot; /* file's incore btree root */
67 short if_broot_bytes; /* bytes allocated for root */ 67 short if_broot_bytes; /* bytes allocated for root */
68 unsigned char if_flags; /* per-fork flags */ 68 unsigned char if_flags; /* per-fork flags */
69 unsigned char if_ext_max; /* max # of extent records */
70 union { 69 union {
71 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ 70 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
72 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ 71 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
@@ -206,12 +205,12 @@ typedef struct xfs_icdinode {
206 ((w) == XFS_DATA_FORK ? \ 205 ((w) == XFS_DATA_FORK ? \
207 ((ip)->i_d.di_nextents = (n)) : \ 206 ((ip)->i_d.di_nextents = (n)) : \
208 ((ip)->i_d.di_anextents = (n))) 207 ((ip)->i_d.di_anextents = (n)))
209 208#define XFS_IFORK_MAXEXT(ip, w) \
209 (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
210 210
211 211
212#ifdef __KERNEL__ 212#ifdef __KERNEL__
213 213
214struct bhv_desc;
215struct xfs_buf; 214struct xfs_buf;
216struct xfs_bmap_free; 215struct xfs_bmap_free;
217struct xfs_bmbt_irec; 216struct xfs_bmbt_irec;
@@ -220,12 +219,6 @@ struct xfs_mount;
220struct xfs_trans; 219struct xfs_trans;
221struct xfs_dquot; 220struct xfs_dquot;
222 221
223typedef struct dm_attrs_s {
224 __uint32_t da_dmevmask; /* DMIG event mask */
225 __uint16_t da_dmstate; /* DMIG state info */
226 __uint16_t da_pad; /* DMIG extra padding */
227} dm_attrs_t;
228
229typedef struct xfs_inode { 222typedef struct xfs_inode {
230 /* Inode linking and identification information. */ 223 /* Inode linking and identification information. */
231 struct xfs_mount *i_mount; /* fs mount struct ptr */ 224 struct xfs_mount *i_mount; /* fs mount struct ptr */
@@ -244,27 +237,19 @@ typedef struct xfs_inode {
244 struct xfs_inode_log_item *i_itemp; /* logging information */ 237 struct xfs_inode_log_item *i_itemp; /* logging information */
245 mrlock_t i_lock; /* inode lock */ 238 mrlock_t i_lock; /* inode lock */
246 mrlock_t i_iolock; /* inode IO lock */ 239 mrlock_t i_iolock; /* inode IO lock */
247 struct completion i_flush; /* inode flush completion q */
248 atomic_t i_pincount; /* inode pin count */ 240 atomic_t i_pincount; /* inode pin count */
249 wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */
250 spinlock_t i_flags_lock; /* inode i_flags lock */ 241 spinlock_t i_flags_lock; /* inode i_flags lock */
251 /* Miscellaneous state. */ 242 /* Miscellaneous state. */
252 unsigned short i_flags; /* see defined flags below */ 243 unsigned long i_flags; /* see defined flags below */
253 unsigned char i_update_core; /* timestamps/size is dirty */ 244 unsigned char i_update_core; /* timestamps/size is dirty */
254 unsigned int i_delayed_blks; /* count of delay alloc blks */ 245 unsigned int i_delayed_blks; /* count of delay alloc blks */
255 246
256 xfs_icdinode_t i_d; /* most of ondisk inode */ 247 xfs_icdinode_t i_d; /* most of ondisk inode */
257 248
258 xfs_fsize_t i_size; /* in-memory size */
259 xfs_fsize_t i_new_size; /* size when write completes */
260
261 /* VFS inode */ 249 /* VFS inode */
262 struct inode i_vnode; /* embedded VFS inode */ 250 struct inode i_vnode; /* embedded VFS inode */
263} xfs_inode_t; 251} xfs_inode_t;
264 252
265#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \
266 (ip)->i_size : (ip)->i_d.di_size;
267
268/* Convert from vfs inode to xfs inode */ 253/* Convert from vfs inode to xfs inode */
269static inline struct xfs_inode *XFS_I(struct inode *inode) 254static inline struct xfs_inode *XFS_I(struct inode *inode)
270{ 255{
@@ -278,6 +263,18 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
278} 263}
279 264
280/* 265/*
266 * For regular files we only update the on-disk filesize when actually
267 * writing data back to disk. Until then only the copy in the VFS inode
268 * is uptodate.
269 */
270static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
271{
272 if (S_ISREG(ip->i_d.di_mode))
273 return i_size_read(VFS_I(ip));
274 return ip->i_d.di_size;
275}
276
277/*
281 * i_flags helper functions 278 * i_flags helper functions
282 */ 279 */
283static inline void 280static inline void
@@ -331,6 +328,19 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
331 return ret; 328 return ret;
332} 329}
333 330
331static inline int
332xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
333{
334 int ret;
335
336 spin_lock(&ip->i_flags_lock);
337 ret = ip->i_flags & flags;
338 if (!ret)
339 ip->i_flags |= flags;
340 spin_unlock(&ip->i_flags_lock);
341 return ret;
342}
343
334/* 344/*
335 * Project quota id helpers (previously projid was 16bit only 345 * Project quota id helpers (previously projid was 16bit only
336 * and using two 16bit values to hold new 32bit projid was chosen 346 * and using two 16bit values to hold new 32bit projid was chosen
@@ -351,35 +361,19 @@ xfs_set_projid(struct xfs_inode *ip,
351} 361}
352 362
353/* 363/*
354 * Manage the i_flush queue embedded in the inode. This completion
355 * queue synchronizes processes attempting to flush the in-core
356 * inode back to disk.
357 */
358static inline void xfs_iflock(xfs_inode_t *ip)
359{
360 wait_for_completion(&ip->i_flush);
361}
362
363static inline int xfs_iflock_nowait(xfs_inode_t *ip)
364{
365 return try_wait_for_completion(&ip->i_flush);
366}
367
368static inline void xfs_ifunlock(xfs_inode_t *ip)
369{
370 complete(&ip->i_flush);
371}
372
373/*
374 * In-core inode flags. 364 * In-core inode flags.
375 */ 365 */
376#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */ 366#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */
377#define XFS_ISTALE 0x0002 /* inode has been staled */ 367#define XFS_ISTALE (1 << 1) /* inode has been staled */
378#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ 368#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */
379#define XFS_INEW 0x0008 /* inode has just been allocated */ 369#define XFS_INEW (1 << 3) /* inode has just been allocated */
380#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ 370#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */
381#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ 371#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
382#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ 372#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
373#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
374#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT)
375#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
376#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
383 377
384/* 378/*
385 * Per-lifetime flags need to be reset when re-using a reclaimable inode during 379 * Per-lifetime flags need to be reset when re-using a reclaimable inode during
@@ -392,6 +386,34 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
392 XFS_IFILESTREAM); 386 XFS_IFILESTREAM);
393 387
394/* 388/*
389 * Synchronize processes attempting to flush the in-core inode back to disk.
390 */
391
392extern void __xfs_iflock(struct xfs_inode *ip);
393
394static inline int xfs_iflock_nowait(struct xfs_inode *ip)
395{
396 return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
397}
398
399static inline void xfs_iflock(struct xfs_inode *ip)
400{
401 if (!xfs_iflock_nowait(ip))
402 __xfs_iflock(ip);
403}
404
405static inline void xfs_ifunlock(struct xfs_inode *ip)
406{
407 xfs_iflags_clear(ip, XFS_IFLOCK);
408 wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
409}
410
411static inline int xfs_isiflocked(struct xfs_inode *ip)
412{
413 return xfs_iflags_test(ip, XFS_IFLOCK);
414}
415
416/*
395 * Flags for inode locking. 417 * Flags for inode locking.
396 * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) 418 * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
397 * 1<<16 - 1<<32-1 -- lockdep annotation (integers) 419 * 1<<16 - 1<<32-1 -- lockdep annotation (integers)
@@ -491,8 +513,6 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
491 struct xfs_bmap_free *); 513 struct xfs_bmap_free *);
492int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, 514int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
493 int, xfs_fsize_t); 515 int, xfs_fsize_t);
494int xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *,
495 xfs_fsize_t);
496int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); 516int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
497 517
498void xfs_iext_realloc(xfs_inode_t *, int, int); 518void xfs_iext_realloc(xfs_inode_t *, int, int);
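
Two details in this header are worth calling out. First, i_flags widens from unsigned short to unsigned long, presumably because the bit-wait helpers (DEFINE_WAIT_BIT(), bit_waitqueue(), wake_up_bit()) key their shared waitqueue hash off the address of a flag word and conventionally operate on a full unsigned long. Second, xfs_iflags_test_and_set() gives the flush lock its trylock semantics: the caller owns the lock iff its set observed the flag clear. The usage shape, using only the helpers defined above:

        xfs_iflock(ip);         /* trylock fast path, else sleep in
                                   __xfs_iflock() until we win the bit */
        /* ... flush work under the flush lock ... */
        xfs_ifunlock(ip);       /* clear the bit, wake a waiter */
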
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index cfd6c7f8cc3c..91d71dcd4852 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -79,8 +79,6 @@ xfs_inode_item_size(
79 break; 79 break;
80 80
81 case XFS_DINODE_FMT_BTREE: 81 case XFS_DINODE_FMT_BTREE:
82 ASSERT(ip->i_df.if_ext_max ==
83 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
84 iip->ili_format.ilf_fields &= 82 iip->ili_format.ilf_fields &=
85 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | 83 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
86 XFS_ILOG_DEV | XFS_ILOG_UUID); 84 XFS_ILOG_DEV | XFS_ILOG_UUID);
@@ -557,7 +555,7 @@ xfs_inode_item_unpin(
557 trace_xfs_inode_unpin(ip, _RET_IP_); 555 trace_xfs_inode_unpin(ip, _RET_IP_);
558 ASSERT(atomic_read(&ip->i_pincount) > 0); 556 ASSERT(atomic_read(&ip->i_pincount) > 0);
559 if (atomic_dec_and_test(&ip->i_pincount)) 557 if (atomic_dec_and_test(&ip->i_pincount))
560 wake_up(&ip->i_ipin_wait); 558 wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
561} 559}
562 560
563/* 561/*
@@ -719,7 +717,7 @@ xfs_inode_item_pushbuf(
719 * If a flush is not in progress anymore, chances are that the 717 * If a flush is not in progress anymore, chances are that the
720 * inode was taken off the AIL. So, just get out. 718 * inode was taken off the AIL. So, just get out.
721 */ 719 */
722 if (completion_done(&ip->i_flush) || 720 if (!xfs_isiflocked(ip) ||
723 !(lip->li_flags & XFS_LI_IN_AIL)) { 721 !(lip->li_flags & XFS_LI_IN_AIL)) {
724 xfs_iunlock(ip, XFS_ILOCK_SHARED); 722 xfs_iunlock(ip, XFS_ILOCK_SHARED);
725 return true; 723 return true;
@@ -752,7 +750,7 @@ xfs_inode_item_push(
752 struct xfs_inode *ip = iip->ili_inode; 750 struct xfs_inode *ip = iip->ili_inode;
753 751
754 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 752 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
755 ASSERT(!completion_done(&ip->i_flush)); 753 ASSERT(xfs_isiflocked(ip));
756 754
757 /* 755 /*
758 * Since we were able to lock the inode's flush lock and 756 * Since we were able to lock the inode's flush lock and
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 9afa282aa937..246c7d57c6f9 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -57,26 +57,26 @@ xfs_iomap_eof_align_last_fsb(
57 xfs_fileoff_t *last_fsb) 57 xfs_fileoff_t *last_fsb)
58{ 58{
59 xfs_fileoff_t new_last_fsb = 0; 59 xfs_fileoff_t new_last_fsb = 0;
60 xfs_extlen_t align; 60 xfs_extlen_t align = 0;
61 int eof, error; 61 int eof, error;
62 62
63 if (XFS_IS_REALTIME_INODE(ip)) 63 if (!XFS_IS_REALTIME_INODE(ip)) {
64 ; 64 /*
65 /* 65 * Round up the allocation request to a stripe unit
66 * If mounted with the "-o swalloc" option, roundup the allocation 66 * (m_dalign) boundary if the file size is >= stripe unit
67 * request to a stripe width boundary if the file size is >= 67 * size, and we are allocating past the allocation eof.
68 * stripe width and we are allocating past the allocation eof. 68 *
69 */ 69 * If mounted with the "-o swalloc" option the alignment is
70 else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && 70 * increased from the strip unit size to the stripe width.
71 (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth))) 71 */
72 new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); 72 if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
73 /* 73 align = mp->m_swidth;
74 * Roundup the allocation request to a stripe unit (m_dalign) boundary 74 else if (mp->m_dalign)
75 * if the file size is >= stripe unit size, and we are allocating past 75 align = mp->m_dalign;
76 * the allocation eof. 76
77 */ 77 if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align))
78 else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign))) 78 new_last_fsb = roundup_64(*last_fsb, align);
79 new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); 79 }
80 80
81 /* 81 /*
82 * Always round up the allocation request to an extent boundary 82 * Always round up the allocation request to an extent boundary
@@ -154,7 +154,7 @@ xfs_iomap_write_direct(
154 154
155 offset_fsb = XFS_B_TO_FSBT(mp, offset); 155 offset_fsb = XFS_B_TO_FSBT(mp, offset);
156 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 156 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
157 if ((offset + count) > ip->i_size) { 157 if ((offset + count) > XFS_ISIZE(ip)) {
158 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); 158 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
159 if (error) 159 if (error)
160 goto error_out; 160 goto error_out;
@@ -211,7 +211,7 @@ xfs_iomap_write_direct(
211 xfs_trans_ijoin(tp, ip, 0); 211 xfs_trans_ijoin(tp, ip, 0);
212 212
213 bmapi_flag = 0; 213 bmapi_flag = 0;
214 if (offset < ip->i_size || extsz) 214 if (offset < XFS_ISIZE(ip) || extsz)
215 bmapi_flag |= XFS_BMAPI_PREALLOC; 215 bmapi_flag |= XFS_BMAPI_PREALLOC;
216 216
217 /* 217 /*
@@ -286,7 +286,7 @@ xfs_iomap_eof_want_preallocate(
286 int found_delalloc = 0; 286 int found_delalloc = 0;
287 287
288 *prealloc = 0; 288 *prealloc = 0;
289 if ((offset + count) <= ip->i_size) 289 if (offset + count <= XFS_ISIZE(ip))
290 return 0; 290 return 0;
291 291
292 /* 292 /*
@@ -340,7 +340,7 @@ xfs_iomap_prealloc_size(
340 * if we pass in alloc_blocks = 0. Hence the "+ 1" to 340 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
341 * ensure we always pass in a non-zero value. 341 * ensure we always pass in a non-zero value.
342 */ 342 */
343 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1; 343 alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;
344 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, 344 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
345 rounddown_pow_of_two(alloc_blocks)); 345 rounddown_pow_of_two(alloc_blocks));
346 346
@@ -564,7 +564,7 @@ xfs_iomap_write_allocate(
564 * back.... 564 * back....
565 */ 565 */
566 nimaps = 1; 566 nimaps = 1;
567 end_fsb = XFS_B_TO_FSB(mp, ip->i_size); 567 end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
568 error = xfs_bmap_last_offset(NULL, ip, &last_block, 568 error = xfs_bmap_last_offset(NULL, ip, &last_block,
569 XFS_DATA_FORK); 569 XFS_DATA_FORK);
570 if (error) 570 if (error)
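
The first hunk in this file replaces an if/else-if chain (which also had a dangling realtime arm that did nothing) with a single alignment selection. Condensed into a standalone helper it reads as follows; the function is hypothetical, the patch open-codes this inside xfs_iomap_eof_align_last_fsb():

        /* Pick the EOF allocation alignment: stripe width when mounted
         * with -o swalloc, else stripe unit; only apply it once the file
         * is at least that large, and never for realtime inodes. */
        static xfs_extlen_t eof_align_fsb(struct xfs_mount *mp,
                                          struct xfs_inode *ip)
        {
                xfs_extlen_t align = 0;

                if (XFS_IS_REALTIME_INODE(ip))
                        return 0;
                if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
                        align = mp->m_swidth;
                else if (mp->m_dalign)
                        align = mp->m_dalign;
                if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align))
                        align = 0;
                return align;
        }
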
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f9babd179223..ab302539e5b9 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -750,6 +750,7 @@ xfs_setattr_size(
750 struct xfs_mount *mp = ip->i_mount; 750 struct xfs_mount *mp = ip->i_mount;
751 struct inode *inode = VFS_I(ip); 751 struct inode *inode = VFS_I(ip);
752 int mask = iattr->ia_valid; 752 int mask = iattr->ia_valid;
753 xfs_off_t oldsize, newsize;
753 struct xfs_trans *tp; 754 struct xfs_trans *tp;
754 int error; 755 int error;
755 uint lock_flags; 756 uint lock_flags;
@@ -777,11 +778,13 @@ xfs_setattr_size(
777 lock_flags |= XFS_IOLOCK_EXCL; 778 lock_flags |= XFS_IOLOCK_EXCL;
778 xfs_ilock(ip, lock_flags); 779 xfs_ilock(ip, lock_flags);
779 780
781 oldsize = inode->i_size;
782 newsize = iattr->ia_size;
783
780 /* 784 /*
781 * Short circuit the truncate case for zero length files. 785 * Short circuit the truncate case for zero length files.
782 */ 786 */
783 if (iattr->ia_size == 0 && 787 if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
784 ip->i_size == 0 && ip->i_d.di_nextents == 0) {
785 if (!(mask & (ATTR_CTIME|ATTR_MTIME))) 788 if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
786 goto out_unlock; 789 goto out_unlock;
787 790
@@ -807,14 +810,14 @@ xfs_setattr_size(
807 * the inode to the transaction, because the inode cannot be unlocked 810 * the inode to the transaction, because the inode cannot be unlocked
808 * once it is a part of the transaction. 811 * once it is a part of the transaction.
809 */ 812 */
810 if (iattr->ia_size > ip->i_size) { 813 if (newsize > oldsize) {
811 /* 814 /*
812 * Do the first part of growing a file: zero any data in the 815 * Do the first part of growing a file: zero any data in the
813 * last block that is beyond the old EOF. We need to do this 816 * last block that is beyond the old EOF. We need to do this
814 * before the inode is joined to the transaction to modify 817 * before the inode is joined to the transaction to modify
815 * i_size. 818 * i_size.
816 */ 819 */
817 error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); 820 error = xfs_zero_eof(ip, newsize, oldsize);
818 if (error) 821 if (error)
819 goto out_unlock; 822 goto out_unlock;
820 } 823 }
@@ -833,8 +836,8 @@ xfs_setattr_size(
833 * here and prevents waiting for other data not within the range we 836 * here and prevents waiting for other data not within the range we
834 * care about here. 837 * care about here.
835 */ 838 */
836 if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { 839 if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
837 error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, 840 error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0,
838 FI_NONE); 841 FI_NONE);
839 if (error) 842 if (error)
840 goto out_unlock; 843 goto out_unlock;
@@ -845,8 +848,7 @@ xfs_setattr_size(
845 */ 848 */
846 inode_dio_wait(inode); 849 inode_dio_wait(inode);
847 850
848 error = -block_truncate_page(inode->i_mapping, iattr->ia_size, 851 error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
849 xfs_get_blocks);
850 if (error) 852 if (error)
851 goto out_unlock; 853 goto out_unlock;
852 854
@@ -857,7 +859,7 @@ xfs_setattr_size(
857 if (error) 859 if (error)
858 goto out_trans_cancel; 860 goto out_trans_cancel;
859 861
860 truncate_setsize(inode, iattr->ia_size); 862 truncate_setsize(inode, newsize);
861 863
862 commit_flags = XFS_TRANS_RELEASE_LOG_RES; 864 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
863 lock_flags |= XFS_ILOCK_EXCL; 865 lock_flags |= XFS_ILOCK_EXCL;
@@ -876,19 +878,29 @@ xfs_setattr_size(
876 * these flags set. For all other operations the VFS set these flags 878 * these flags set. For all other operations the VFS set these flags
877 * explicitly if it wants a timestamp update. 879 * explicitly if it wants a timestamp update.
878 */ 880 */
879 if (iattr->ia_size != ip->i_size && 881 if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
880 (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
881 iattr->ia_ctime = iattr->ia_mtime = 882 iattr->ia_ctime = iattr->ia_mtime =
882 current_fs_time(inode->i_sb); 883 current_fs_time(inode->i_sb);
883 mask |= ATTR_CTIME | ATTR_MTIME; 884 mask |= ATTR_CTIME | ATTR_MTIME;
884 } 885 }
885 886
886 if (iattr->ia_size > ip->i_size) { 887 /*
887 ip->i_d.di_size = iattr->ia_size; 888 * The first thing we do is set the size to new_size permanently on
888 ip->i_size = iattr->ia_size; 889 * disk. This way we don't have to worry about anyone ever being able
889 } else if (iattr->ia_size <= ip->i_size || 890 * to look at the data being freed even in the face of a crash.
890 (iattr->ia_size == 0 && ip->i_d.di_nextents)) { 891 * What we're getting around here is the case where we free a block, it
891 error = xfs_itruncate_data(&tp, ip, iattr->ia_size); 892 * is allocated to another file, it is written to, and then we crash.
893 * If the new data gets written to the file but the log buffers
894 * containing the free and reallocation don't, then we'd end up with
895 * garbage in the blocks being freed. As long as we make the new size
896 * permanent before actually freeing any blocks it doesn't matter if
897 * they get written to.
898 */
899 ip->i_d.di_size = newsize;
900 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
901
902 if (newsize <= oldsize) {
903 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
892 if (error) 904 if (error)
893 goto out_trans_abort; 905 goto out_trans_abort;
894 906
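
With xfs_itruncate_data() gone (see its removal in xfs_inode.c above), its crash-safety contract is restated at each call site: the new size must be made permanent in the log before any blocks are freed, so a crash can never expose freed-and-reallocated blocks through a stale larger size. The three-step sequence now repeated here, in xfs_inactive() and in xfs_qm_scall_trunc_qfile():

        ip->i_d.di_size = newsize;                      /* 1. new on-disk size */
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);     /* 2. logged first     */
        error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
                                                        /* 3. then free blocks */
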
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 5cc3dde1bc90..eafbcff81f3a 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -31,6 +31,7 @@
31#include "xfs_mount.h" 31#include "xfs_mount.h"
32#include "xfs_bmap_btree.h" 32#include "xfs_bmap_btree.h"
33#include "xfs_inode.h" 33#include "xfs_inode.h"
34#include "xfs_inode_item.h"
34#include "xfs_itable.h" 35#include "xfs_itable.h"
35#include "xfs_bmap.h" 36#include "xfs_bmap.h"
36#include "xfs_rtalloc.h" 37#include "xfs_rtalloc.h"
@@ -263,13 +264,18 @@ xfs_qm_scall_trunc_qfile(
263 xfs_ilock(ip, XFS_ILOCK_EXCL); 264 xfs_ilock(ip, XFS_ILOCK_EXCL);
264 xfs_trans_ijoin(tp, ip, 0); 265 xfs_trans_ijoin(tp, ip, 0);
265 266
266 error = xfs_itruncate_data(&tp, ip, 0); 267 ip->i_d.di_size = 0;
268 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
269
270 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
267 if (error) { 271 if (error) {
268 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 272 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
269 XFS_TRANS_ABORT); 273 XFS_TRANS_ABORT);
270 goto out_unlock; 274 goto out_unlock;
271 } 275 }
272 276
277 ASSERT(ip->i_d.di_nextents == 0);
278
273 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 279 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
274 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 280 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
275 281
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 281961c1d81a..ee5b695c99a7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -828,14 +828,6 @@ xfs_fs_inode_init_once(
828 /* xfs inode */ 828 /* xfs inode */
829 atomic_set(&ip->i_pincount, 0); 829 atomic_set(&ip->i_pincount, 0);
830 spin_lock_init(&ip->i_flags_lock); 830 spin_lock_init(&ip->i_flags_lock);
831 init_waitqueue_head(&ip->i_ipin_wait);
832 /*
833 * Because we want to use a counting completion, complete
834 * the flush completion once to allow a single access to
835 * the flush completion without blocking.
836 */
837 init_completion(&ip->i_flush);
838 complete(&ip->i_flush);
839 831
840 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, 832 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
841 "xfsino", ip->i_ino); 833 "xfsino", ip->i_ino);
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 72c01a1c16e7..40b75eecd2b4 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -707,14 +707,13 @@ xfs_reclaim_inode_grab(
707 return 1; 707 return 1;
708 708
709 /* 709 /*
710 * do some unlocked checks first to avoid unnecessary lock traffic. 710 * If we are asked for non-blocking operation, do unlocked checks to
 711 * The first is a flush lock check, the second is an already-in-reclaim 711 * see if the inode is already being flushed or in reclaim to avoid
712 * check. Only do these checks if we are not going to block on locks. 712 * lock traffic.
713 */ 713 */
714 if ((flags & SYNC_TRYLOCK) && 714 if ((flags & SYNC_TRYLOCK) &&
715 (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { 715 __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
716 return 1; 716 return 1;
717 }
718 717
719 /* 718 /*
720 * The radix tree lock here protects a thread in xfs_iget from racing 719 * The radix tree lock here protects a thread in xfs_iget from racing
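
The rewritten check folds the two lockless tests into one mask: __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM) is nonzero if either bit is set, which is exactly the any-of semantics wanted here. Reading i_flags without i_flags_lock is racy by design; it only serves the SYNC_TRYLOCK fast path, and a stale answer costs at most a skipped reclaim attempt, since callers that proceed revalidate under the lock:

        /* racy fast path: either flag set means "come back later" */
        if ((flags & SYNC_TRYLOCK) &&
            __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
                return 1;
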
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a9d5b1e06efe..6b6df5802e95 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -891,7 +891,6 @@ DECLARE_EVENT_CLASS(xfs_file_class,
891 __field(dev_t, dev) 891 __field(dev_t, dev)
892 __field(xfs_ino_t, ino) 892 __field(xfs_ino_t, ino)
893 __field(xfs_fsize_t, size) 893 __field(xfs_fsize_t, size)
894 __field(xfs_fsize_t, new_size)
895 __field(loff_t, offset) 894 __field(loff_t, offset)
896 __field(size_t, count) 895 __field(size_t, count)
897 __field(int, flags) 896 __field(int, flags)
@@ -900,17 +899,15 @@ DECLARE_EVENT_CLASS(xfs_file_class,
900 __entry->dev = VFS_I(ip)->i_sb->s_dev; 899 __entry->dev = VFS_I(ip)->i_sb->s_dev;
901 __entry->ino = ip->i_ino; 900 __entry->ino = ip->i_ino;
902 __entry->size = ip->i_d.di_size; 901 __entry->size = ip->i_d.di_size;
903 __entry->new_size = ip->i_new_size;
904 __entry->offset = offset; 902 __entry->offset = offset;
905 __entry->count = count; 903 __entry->count = count;
906 __entry->flags = flags; 904 __entry->flags = flags;
907 ), 905 ),
908 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " 906 TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
909 "offset 0x%llx count 0x%zx ioflags %s", 907 "offset 0x%llx count 0x%zx ioflags %s",
910 MAJOR(__entry->dev), MINOR(__entry->dev), 908 MAJOR(__entry->dev), MINOR(__entry->dev),
911 __entry->ino, 909 __entry->ino,
912 __entry->size, 910 __entry->size,
913 __entry->new_size,
914 __entry->offset, 911 __entry->offset,
915 __entry->count, 912 __entry->count,
916 __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) 913 __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
@@ -978,7 +975,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
978 __field(dev_t, dev) 975 __field(dev_t, dev)
979 __field(xfs_ino_t, ino) 976 __field(xfs_ino_t, ino)
980 __field(loff_t, size) 977 __field(loff_t, size)
981 __field(loff_t, new_size)
982 __field(loff_t, offset) 978 __field(loff_t, offset)
983 __field(size_t, count) 979 __field(size_t, count)
984 __field(int, type) 980 __field(int, type)
@@ -990,7 +986,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
990 __entry->dev = VFS_I(ip)->i_sb->s_dev; 986 __entry->dev = VFS_I(ip)->i_sb->s_dev;
991 __entry->ino = ip->i_ino; 987 __entry->ino = ip->i_ino;
992 __entry->size = ip->i_d.di_size; 988 __entry->size = ip->i_d.di_size;
993 __entry->new_size = ip->i_new_size;
994 __entry->offset = offset; 989 __entry->offset = offset;
995 __entry->count = count; 990 __entry->count = count;
996 __entry->type = type; 991 __entry->type = type;
@@ -998,13 +993,11 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
998 __entry->startblock = irec ? irec->br_startblock : 0; 993 __entry->startblock = irec ? irec->br_startblock : 0;
999 __entry->blockcount = irec ? irec->br_blockcount : 0; 994 __entry->blockcount = irec ? irec->br_blockcount : 0;
1000 ), 995 ),
1001 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " 996 TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
1002 "offset 0x%llx count %zd type %s " 997 "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
1003 "startoff 0x%llx startblock %lld blockcount 0x%llx",
1004 MAJOR(__entry->dev), MINOR(__entry->dev), 998 MAJOR(__entry->dev), MINOR(__entry->dev),
1005 __entry->ino, 999 __entry->ino,
1006 __entry->size, 1000 __entry->size,
1007 __entry->new_size,
1008 __entry->offset, 1001 __entry->offset,
1009 __entry->count, 1002 __entry->count,
1010 __print_symbolic(__entry->type, XFS_IO_TYPES), 1003 __print_symbolic(__entry->type, XFS_IO_TYPES),
@@ -1031,26 +1024,23 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
1031 __field(xfs_ino_t, ino) 1024 __field(xfs_ino_t, ino)
1032 __field(loff_t, isize) 1025 __field(loff_t, isize)
1033 __field(loff_t, disize) 1026 __field(loff_t, disize)
1034 __field(loff_t, new_size)
1035 __field(loff_t, offset) 1027 __field(loff_t, offset)
1036 __field(size_t, count) 1028 __field(size_t, count)
1037 ), 1029 ),
1038 TP_fast_assign( 1030 TP_fast_assign(
1039 __entry->dev = VFS_I(ip)->i_sb->s_dev; 1031 __entry->dev = VFS_I(ip)->i_sb->s_dev;
1040 __entry->ino = ip->i_ino; 1032 __entry->ino = ip->i_ino;
1041 __entry->isize = ip->i_size; 1033 __entry->isize = VFS_I(ip)->i_size;
1042 __entry->disize = ip->i_d.di_size; 1034 __entry->disize = ip->i_d.di_size;
1043 __entry->new_size = ip->i_new_size;
1044 __entry->offset = offset; 1035 __entry->offset = offset;
1045 __entry->count = count; 1036 __entry->count = count;
1046 ), 1037 ),
1047 TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx " 1038 TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
1048 "offset 0x%llx count %zd", 1039 "offset 0x%llx count %zd",
1049 MAJOR(__entry->dev), MINOR(__entry->dev), 1040 MAJOR(__entry->dev), MINOR(__entry->dev),
1050 __entry->ino, 1041 __entry->ino,
1051 __entry->isize, 1042 __entry->isize,
1052 __entry->disize, 1043 __entry->disize,
1053 __entry->new_size,
1054 __entry->offset, 1044 __entry->offset,
1055 __entry->count) 1045 __entry->count)
1056); 1046);
@@ -1090,8 +1080,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class,
1090DEFINE_EVENT(xfs_itrunc_class, name, \ 1080DEFINE_EVENT(xfs_itrunc_class, name, \
1091 TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ 1081 TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
1092 TP_ARGS(ip, new_size)) 1082 TP_ARGS(ip, new_size))
1093DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start); 1083DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start);
1094DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end); 1084DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end);
1095 1085
1096TRACE_EVENT(xfs_pagecache_inval, 1086TRACE_EVENT(xfs_pagecache_inval,
1097 TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), 1087 TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
@@ -1568,7 +1558,6 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
1568 __field(xfs_ino_t, ino) 1558 __field(xfs_ino_t, ino)
1569 __field(int, format) 1559 __field(int, format)
1570 __field(int, nex) 1560 __field(int, nex)
1571 __field(int, max_nex)
1572 __field(int, broot_size) 1561 __field(int, broot_size)
1573 __field(int, fork_off) 1562 __field(int, fork_off)
1574 ), 1563 ),
@@ -1578,18 +1567,16 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
1578 __entry->ino = ip->i_ino; 1567 __entry->ino = ip->i_ino;
1579 __entry->format = ip->i_d.di_format; 1568 __entry->format = ip->i_d.di_format;
1580 __entry->nex = ip->i_d.di_nextents; 1569 __entry->nex = ip->i_d.di_nextents;
1581 __entry->max_nex = ip->i_df.if_ext_max;
1582 __entry->broot_size = ip->i_df.if_broot_bytes; 1570 __entry->broot_size = ip->i_df.if_broot_bytes;
1583 __entry->fork_off = XFS_IFORK_BOFF(ip); 1571 __entry->fork_off = XFS_IFORK_BOFF(ip);
1584 ), 1572 ),
1585 TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " 1573 TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
1586 "Max in-fork extents %d, broot size %d, fork offset %d", 1574 "broot size %d, fork offset %d",
1587 MAJOR(__entry->dev), MINOR(__entry->dev), 1575 MAJOR(__entry->dev), MINOR(__entry->dev),
1588 __entry->ino, 1576 __entry->ino,
1589 __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), 1577 __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
1590 __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), 1578 __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
1591 __entry->nex, 1579 __entry->nex,
1592 __entry->max_nex,
1593 __entry->broot_size, 1580 __entry->broot_size,
1594 __entry->fork_off) 1581 __entry->fork_off)
1595) 1582)
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index f2fea868d4db..ebdb88840a47 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -131,7 +131,8 @@ xfs_readlink(
131 __func__, (unsigned long long) ip->i_ino, 131 __func__, (unsigned long long) ip->i_ino,
132 (long long) pathlen); 132 (long long) pathlen);
133 ASSERT(0); 133 ASSERT(0);
134 return XFS_ERROR(EFSCORRUPTED); 134 error = XFS_ERROR(EFSCORRUPTED);
135 goto out;
135 } 136 }
136 137
137 138
@@ -175,7 +176,7 @@ xfs_free_eofblocks(
175 * Figure out if there are any blocks beyond the end 176 * Figure out if there are any blocks beyond the end
176 * of the file. If not, then there is nothing to do. 177 * of the file. If not, then there is nothing to do.
177 */ 178 */
178 end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size)); 179 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
179 last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); 180 last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
180 if (last_fsb <= end_fsb) 181 if (last_fsb <= end_fsb)
181 return 0; 182 return 0;
@@ -226,7 +227,14 @@ xfs_free_eofblocks(
226 xfs_ilock(ip, XFS_ILOCK_EXCL); 227 xfs_ilock(ip, XFS_ILOCK_EXCL);
227 xfs_trans_ijoin(tp, ip, 0); 228 xfs_trans_ijoin(tp, ip, 0);
228 229
229 error = xfs_itruncate_data(&tp, ip, ip->i_size); 230 /*
231 * Do not update the on-disk file size. If we update the
232 * on-disk file size and then the system crashes before the
233 * contents of the file are flushed to disk then the files
234 * may be full of holes (ie NULL files bug).
235 */
236 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
237 XFS_ISIZE(ip));
230 if (error) { 238 if (error) {
231 /* 239 /*
232 * If we get an error at this point we simply don't 240 * If we get an error at this point we simply don't
@@ -540,8 +548,8 @@ xfs_release(
                 return 0;
 
         if ((S_ISREG(ip->i_d.di_mode) &&
-             ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-               ip->i_delayed_blks > 0)) &&
+             (VFS_I(ip)->i_size > 0 ||
+              (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
             (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
             (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
 
@@ -618,7 +626,7 @@ xfs_inactive(
         * only one with a reference to the inode.
         */
        truncate = ((ip->i_d.di_nlink == 0) &&
-            ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
+            ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 ||
             (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
            S_ISREG(ip->i_d.di_mode));
 
@@ -632,12 +640,12 @@ xfs_inactive(
 
         if (ip->i_d.di_nlink != 0) {
                 if ((S_ISREG(ip->i_d.di_mode) &&
-                     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-                       ip->i_delayed_blks > 0)) &&
+                     (VFS_I(ip)->i_size > 0 ||
+                      (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
                      (ip->i_df.if_flags & XFS_IFEXTENTS) &&
                      (!(ip->i_d.di_flags &
                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
-                      (ip->i_delayed_blks != 0)))) {
+                      ip->i_delayed_blks != 0))) {
                         error = xfs_free_eofblocks(mp, ip, 0);
                         if (error)
                                 return VN_INACTIVE_CACHE;
@@ -670,13 +678,18 @@ xfs_inactive(
                 xfs_ilock(ip, XFS_ILOCK_EXCL);
                 xfs_trans_ijoin(tp, ip, 0);
 
-                error = xfs_itruncate_data(&tp, ip, 0);
+                ip->i_d.di_size = 0;
+                xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+                error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
                 if (error) {
                         xfs_trans_cancel(tp,
                                 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
                         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                         return VN_INACTIVE_CACHE;
                 }
+
+                ASSERT(ip->i_d.di_nextents == 0);
         } else if (S_ISLNK(ip->i_d.di_mode)) {
 
                 /*
@@ -1961,11 +1974,11 @@ xfs_zero_remaining_bytes(
         * since nothing can read beyond eof. The space will
         * be zeroed when the file is extended anyway.
         */
-        if (startoff >= ip->i_size)
+        if (startoff >= XFS_ISIZE(ip))
                 return 0;
 
-        if (endoff > ip->i_size)
-                endoff = ip->i_size;
+        if (endoff > XFS_ISIZE(ip))
+                endoff = XFS_ISIZE(ip);
 
         bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
                                 mp->m_rtdev_targp : mp->m_ddev_targp,
@@ -2260,7 +2273,7 @@ xfs_change_file_space(
                 bf->l_start += offset;
                 break;
         case 2: /*SEEK_END*/
-                bf->l_start += ip->i_size;
+                bf->l_start += XFS_ISIZE(ip);
                 break;
         default:
                 return XFS_ERROR(EINVAL);
@@ -2277,7 +2290,7 @@ xfs_change_file_space(
         bf->l_whence = 0;
 
         startoffset = bf->l_start;
-        fsize = ip->i_size;
+        fsize = XFS_ISIZE(ip);
 
         /*
         * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve