author	James Morris <james.l.morris@oracle.com>	2014-06-24 04:46:07 -0400
committer	James Morris <james.l.morris@oracle.com>	2014-06-24 04:46:07 -0400
commit	f01387d2693813eb5271a3448e6a082322c7d75d (patch)
tree	b591ca73c85276bae53d7db57ff1565be45a29da /fs
parent	92953ff38ba59b4f7b1a54ab28b84be35fafaecc (diff)
parent	1860e379875dfe7271c649058aeddffe5afd9d0d (diff)
Merge commit 'v3.15' into next
Diffstat (limited to 'fs')
-rw-r--r--	fs/9p/vfs_file.c	2
-rw-r--r--	fs/9p/vfs_inode.c	2
-rw-r--r--	fs/Kconfig	1
-rw-r--r--	fs/Makefile	3
-rw-r--r--	fs/adfs/super.c	3
-rw-r--r--	fs/affs/affs.h	20
-rw-r--r--	fs/affs/amigaffs.c	23
-rw-r--r--	fs/affs/dir.c	28
-rw-r--r--	fs/affs/inode.c	2
-rw-r--r--	fs/affs/namei.c	32
-rw-r--r--	fs/affs/super.c	11
-rw-r--r--	fs/afs/cmservice.c	19
-rw-r--r--	fs/afs/inode.c	2
-rw-r--r--	fs/afs/internal.h	1
-rw-r--r--	fs/afs/rxrpc.c	84
-rw-r--r--	fs/aio.c	162
-rw-r--r--	fs/autofs4/dev-ioctl.c	3
-rw-r--r--	fs/autofs4/root.c	4
-rw-r--r--	fs/befs/Makefile	2
-rw-r--r--	fs/befs/befs.h	3
-rw-r--r--	fs/befs/btree.c	93
-rw-r--r--	fs/befs/datastream.c	87
-rw-r--r--	fs/befs/debug.c	74
-rw-r--r--	fs/befs/inode.c	10
-rw-r--r--	fs/befs/io.c	24
-rw-r--r--	fs/befs/linuxvfs.c	113
-rw-r--r--	fs/bfs/inode.c	4
-rw-r--r--	fs/binfmt_elf.c	13
-rw-r--r--	fs/binfmt_misc.c	1
-rw-r--r--	fs/bio-integrity.c	100
-rw-r--r--	fs/bio.c	13
-rw-r--r--	fs/block_dev.c	8
-rw-r--r--	fs/btrfs/async-thread.c	850
-rw-r--r--	fs/btrfs/async-thread.h	121
-rw-r--r--	fs/btrfs/backref.c	117
-rw-r--r--	fs/btrfs/btrfs_inode.h	14
-rw-r--r--	fs/btrfs/compression.c	2
-rw-r--r--	fs/btrfs/ctree.c	105
-rw-r--r--	fs/btrfs/ctree.h	100
-rw-r--r--	fs/btrfs/delayed-inode.c	6
-rw-r--r--	fs/btrfs/delayed-ref.c	29
-rw-r--r--	fs/btrfs/dev-replace.c	79
-rw-r--r--	fs/btrfs/disk-io.c	314
-rw-r--r--	fs/btrfs/extent-tree.c	99
-rw-r--r--	fs/btrfs/extent_io.c	23
-rw-r--r--	fs/btrfs/extent_io.h	1
-rw-r--r--	fs/btrfs/extent_map.c	56
-rw-r--r--	fs/btrfs/extent_map.h	10
-rw-r--r--	fs/btrfs/file.c	182
-rw-r--r--	fs/btrfs/inode-map.c	38
-rw-r--r--	fs/btrfs/inode.c	151
-rw-r--r--	fs/btrfs/ioctl.c	245
-rw-r--r--	fs/btrfs/ordered-data.c	68
-rw-r--r--	fs/btrfs/ordered-data.h	6
-rw-r--r--	fs/btrfs/qgroup.c	15
-rw-r--r--	fs/btrfs/raid56.c	21
-rw-r--r--	fs/btrfs/reada.c	4
-rw-r--r--	fs/btrfs/relocation.c	23
-rw-r--r--	fs/btrfs/root-tree.c	3
-rw-r--r--	fs/btrfs/scrub.c	205
-rw-r--r--	fs/btrfs/send.c	874
-rw-r--r--	fs/btrfs/super.c	78
-rw-r--r--	fs/btrfs/sysfs.c	33
-rw-r--r--	fs/btrfs/sysfs.h	5
-rw-r--r--	fs/btrfs/transaction.c	87
-rw-r--r--	fs/btrfs/transaction.h	3
-rw-r--r--	fs/btrfs/tree-log.c	236
-rw-r--r--	fs/btrfs/tree-log.h	18
-rw-r--r--	fs/btrfs/volumes.c	81
-rw-r--r--	fs/btrfs/volumes.h	1
-rw-r--r--	fs/buffer.c	8
-rw-r--r--	fs/cachefiles/bind.c	1
-rw-r--r--	fs/cachefiles/namei.c	7
-rw-r--r--	fs/cachefiles/rdwr.c	33
-rw-r--r--	fs/ceph/cache.c	1
-rw-r--r--	fs/ceph/cache.h	10
-rw-r--r--	fs/ceph/caps.c	11
-rw-r--r--	fs/ceph/debugfs.c	5
-rw-r--r--	fs/ceph/dir.c	86
-rw-r--r--	fs/ceph/export.c	267
-rw-r--r--	fs/ceph/file.c	23
-rw-r--r--	fs/ceph/inode.c	147
-rw-r--r--	fs/ceph/ioctl.c	11
-rw-r--r--	fs/ceph/locks.c	99
-rw-r--r--	fs/ceph/mds_client.c	97
-rw-r--r--	fs/ceph/mds_client.h	4
-rw-r--r--	fs/ceph/strings.c	1
-rw-r--r--	fs/ceph/super.c	1
-rw-r--r--	fs/ceph/super.h	4
-rw-r--r--	fs/ceph/xattr.c	48
-rw-r--r--	fs/cifs/cifsfs.c	20
-rw-r--r--	fs/cifs/cifsglob.h	8
-rw-r--r--	fs/cifs/cifsproto.h	3
-rw-r--r--	fs/cifs/cifssmb.c	3
-rw-r--r--	fs/cifs/file.c	162
-rw-r--r--	fs/cifs/inode.c	3
-rw-r--r--	fs/cifs/misc.c	74
-rw-r--r--	fs/cifs/smb1ops.c	11
-rw-r--r--	fs/cifs/smb2misc.c	18
-rw-r--r--	fs/cifs/smb2ops.c	14
-rw-r--r--	fs/cifs/smb2pdu.c	2
-rw-r--r--	fs/coda/coda_int.h	2
-rw-r--r--	fs/coda/inode.c	5
-rw-r--r--	fs/compat.c	162
-rw-r--r--	fs/compat_binfmt_elf.c	5
-rw-r--r--	fs/compat_ioctl.c	5
-rw-r--r--	fs/coredump.c	7
-rw-r--r--	fs/cramfs/inode.c	4
-rw-r--r--	fs/dcache.c	484
-rw-r--r--	fs/debugfs/inode.c	7
-rw-r--r--	fs/devpts/inode.c	1
-rw-r--r--	fs/direct-io.c	19
-rw-r--r--	fs/dlm/ast.c	3
-rw-r--r--	fs/dlm/dir.c	4
-rw-r--r--	fs/dlm/dlm_internal.h	2
-rw-r--r--	fs/dlm/lock.c	7
-rw-r--r--	fs/dlm/lockspace.c	8
-rw-r--r--	fs/dlm/lowcomms.c	2
-rw-r--r--	fs/dlm/member.c	27
-rw-r--r--	fs/dlm/recover.c	10
-rw-r--r--	fs/dlm/recoverd.c	34
-rw-r--r--	fs/drop_caches.c	16
-rw-r--r--	fs/ecryptfs/inode.c	2
-rw-r--r--	fs/ecryptfs/super.c	2
-rw-r--r--	fs/efivarfs/file.c	13
-rw-r--r--	fs/efs/super.c	3
-rw-r--r--	fs/exec.c	42
-rw-r--r--	fs/exofs/inode.c	2
-rw-r--r--	fs/exofs/ore_raid.c	4
-rw-r--r--	fs/exofs/super.c	2
-rw-r--r--	fs/ext2/acl.c	1
-rw-r--r--	fs/ext2/ialloc.c	2
-rw-r--r--	fs/ext2/inode.c	2
-rw-r--r--	fs/ext2/super.c	3
-rw-r--r--	fs/ext2/xattr_security.c	4
-rw-r--r--	fs/ext3/balloc.c	5
-rw-r--r--	fs/ext3/dir.c	2
-rw-r--r--	fs/ext3/ialloc.c	2
-rw-r--r--	fs/ext3/inode.c	88
-rw-r--r--	fs/ext3/super.c	4
-rw-r--r--	fs/ext3/xattr_security.c	5
-rw-r--r--	fs/ext4/balloc.c	2
-rw-r--r--	fs/ext4/ext4.h	28
-rw-r--r--	fs/ext4/ext4_jbd2.c	10
-rw-r--r--	fs/ext4/extents.c	861
-rw-r--r--	fs/ext4/extents_status.c	28
-rw-r--r--	fs/ext4/extents_status.h	9
-rw-r--r--	fs/ext4/file.c	7
-rw-r--r--	fs/ext4/inode.c	177
-rw-r--r--	fs/ext4/ioctl.c	24
-rw-r--r--	fs/ext4/mballoc.c	25
-rw-r--r--	fs/ext4/mballoc.h	4
-rw-r--r--	fs/ext4/move_extent.c	5
-rw-r--r--	fs/ext4/namei.c	480
-rw-r--r--	fs/ext4/page-io.c	5
-rw-r--r--	fs/ext4/super.c	91
-rw-r--r--	fs/ext4/xattr.c	82
-rw-r--r--	fs/ext4/xattr.h	6
-rw-r--r--	fs/f2fs/acl.c	8
-rw-r--r--	fs/f2fs/checkpoint.c	208
-rw-r--r--	fs/f2fs/data.c	106
-rw-r--r--	fs/f2fs/debug.c	12
-rw-r--r--	fs/f2fs/dir.c	85
-rw-r--r--	fs/f2fs/f2fs.h	105
-rw-r--r--	fs/f2fs/file.c	32
-rw-r--r--	fs/f2fs/gc.c	16
-rw-r--r--	fs/f2fs/inline.c	4
-rw-r--r--	fs/f2fs/inode.c	29
-rw-r--r--	fs/f2fs/namei.c	9
-rw-r--r--	fs/f2fs/node.c	334
-rw-r--r--	fs/f2fs/node.h	25
-rw-r--r--	fs/f2fs/recovery.c	37
-rw-r--r--	fs/f2fs/segment.c	222
-rw-r--r--	fs/f2fs/segment.h	75
-rw-r--r--	fs/f2fs/super.c	99
-rw-r--r--	fs/f2fs/xattr.c	7
-rw-r--r--	fs/fat/inode.c	4
-rw-r--r--	fs/fcntl.c	37
-rw-r--r--	fs/file.c	13
-rw-r--r--	fs/file_table.c	45
-rw-r--r--	fs/filesystems.c	2
-rw-r--r--	fs/freevxfs/vxfs_inode.c	2
-rw-r--r--	fs/freevxfs/vxfs_lookup.c	2
-rw-r--r--	fs/freevxfs/vxfs_super.c	1
-rw-r--r--	fs/fs-writeback.c	33
-rw-r--r--	fs/fuse/control.c	2
-rw-r--r--	fs/fuse/cuse.c	9
-rw-r--r--	fs/fuse/dev.c	14
-rw-r--r--	fs/fuse/dir.c	193
-rw-r--r--	fs/fuse/file.c	344
-rw-r--r--	fs/fuse/fuse_i.h	26
-rw-r--r--	fs/fuse/inode.c	46
-rw-r--r--	fs/gfs2/acl.c	23
-rw-r--r--	fs/gfs2/acl.h	2
-rw-r--r--	fs/gfs2/aops.c	132
-rw-r--r--	fs/gfs2/bmap.c	115
-rw-r--r--	fs/gfs2/bmap.h	2
-rw-r--r--	fs/gfs2/dir.c	23
-rw-r--r--	fs/gfs2/file.c	14
-rw-r--r--	fs/gfs2/glock.c	28
-rw-r--r--	fs/gfs2/glops.c	2
-rw-r--r--	fs/gfs2/incore.h	37
-rw-r--r--	fs/gfs2/inode.c	75
-rw-r--r--	fs/gfs2/lock_dlm.c	10
-rw-r--r--	fs/gfs2/log.c	102
-rw-r--r--	fs/gfs2/lops.c	85
-rw-r--r--	fs/gfs2/lops.h	5
-rw-r--r--	fs/gfs2/main.c	4
-rw-r--r--	fs/gfs2/meta_io.c	14
-rw-r--r--	fs/gfs2/meta_io.h	3
-rw-r--r--	fs/gfs2/ops_fstype.c	89
-rw-r--r--	fs/gfs2/quota.c	18
-rw-r--r--	fs/gfs2/recovery.c	16
-rw-r--r--	fs/gfs2/recovery.h	6
-rw-r--r--	fs/gfs2/rgrp.c	32
-rw-r--r--	fs/gfs2/super.c	41
-rw-r--r--	fs/gfs2/sys.c	7
-rw-r--r--	fs/gfs2/trans.c	29
-rw-r--r--	fs/gfs2/util.c	101
-rw-r--r--	fs/gfs2/util.h	31
-rw-r--r--	fs/hfs/inode.c	2
-rw-r--r--	fs/hfs/super.c	1
-rw-r--r--	fs/hfsplus/attributes.c	2
-rw-r--r--	fs/hfsplus/extents.c	16
-rw-r--r--	fs/hfsplus/hfsplus_fs.h	2
-rw-r--r--	fs/hfsplus/super.c	3
-rw-r--r--	fs/hostfs/hostfs_kern.c	2
-rw-r--r--	fs/hpfs/inode.c	2
-rw-r--r--	fs/hpfs/super.c	2
-rw-r--r--	fs/hugetlbfs/inode.c	22
-rw-r--r--	fs/inode.c	60
-rw-r--r--	fs/isofs/inode.c	3
-rw-r--r--	fs/jbd2/commit.c	77
-rw-r--r--	fs/jbd2/journal.c	10
-rw-r--r--	fs/jbd2/transaction.c	46
-rw-r--r--	fs/jffs2/compr_rtime.c	4
-rw-r--r--	fs/jffs2/fs.c	13
-rw-r--r--	fs/jffs2/nodelist.h	2
-rw-r--r--	fs/jffs2/nodemgmt.c	14
-rw-r--r--	fs/jffs2/super.c	1
-rw-r--r--	fs/jfs/inode.c	4
-rw-r--r--	fs/jfs/super.c	1
-rw-r--r--	fs/kernfs/Kconfig	7
-rw-r--r--	fs/kernfs/dir.c	762
-rw-r--r--	fs/kernfs/file.c	41
-rw-r--r--	fs/kernfs/inode.c	16
-rw-r--r--	fs/kernfs/kernfs-internal.h	15
-rw-r--r--	fs/kernfs/mount.c	50
-rw-r--r--	fs/kernfs/symlink.c	6
-rw-r--r--	fs/lockd/svc.c	1
-rw-r--r--	fs/locks.c	408
-rw-r--r--	fs/logfs/readwrite.c	2
-rw-r--r--	fs/mbcache.c	540
-rw-r--r--	fs/minix/inode.c	5
-rw-r--r--	fs/mount.h	5
-rw-r--r--	fs/namei.c	390
-rw-r--r--	fs/namespace.c	56
-rw-r--r--	fs/ncpfs/dir.c	69
-rw-r--r--	fs/ncpfs/file.c	24
-rw-r--r--	fs/ncpfs/getopt.c	12
-rw-r--r--	fs/ncpfs/inode.c	85
-rw-r--r--	fs/ncpfs/ioctl.c	17
-rw-r--r--	fs/ncpfs/mmap.c	2
-rw-r--r--	fs/ncpfs/ncp_fs.h	30
-rw-r--r--	fs/ncpfs/ncp_fs_sb.h	6
-rw-r--r--	fs/ncpfs/ncplib_kernel.c	28
-rw-r--r--	fs/ncpfs/sock.c	53
-rw-r--r--	fs/ncpfs/symlink.c	2
-rw-r--r--	fs/nfs/blocklayout/blocklayout.c	2
-rw-r--r--	fs/nfs/callback_proc.c	19
-rw-r--r--	fs/nfs/dir.c	62
-rw-r--r--	fs/nfs/file.c	1
-rw-r--r--	fs/nfs/inode.c	36
-rw-r--r--	fs/nfs/internal.h	8
-rw-r--r--	fs/nfs/nfs3proc.c	36
-rw-r--r--	fs/nfs/nfs4_fs.h	11
-rw-r--r--	fs/nfs/nfs4client.c	7
-rw-r--r--	fs/nfs/nfs4proc.c	197
-rw-r--r--	fs/nfs/nfs4state.c	6
-rw-r--r--	fs/nfs/nfs4super.c	2
-rw-r--r--	fs/nfs/nfs4xdr.c	3
-rw-r--r--	fs/nfs/pnfs.c	17
-rw-r--r--	fs/nfs/proc.c	25
-rw-r--r--	fs/nfs/super.c	2
-rw-r--r--	fs/nfs/unlink.c	35
-rw-r--r--	fs/nfsd/acl.h	10
-rw-r--r--	fs/nfsd/auth.c	5
-rw-r--r--	fs/nfsd/nfs4acl.c	30
-rw-r--r--	fs/nfsd/nfs4callback.c	23
-rw-r--r--	fs/nfsd/nfs4proc.c	39
-rw-r--r--	fs/nfsd/nfs4state.c	68
-rw-r--r--	fs/nfsd/nfs4xdr.c	22
-rw-r--r--	fs/nfsd/nfsctl.c	5
-rw-r--r--	fs/nfsd/nfsd.h	2
-rw-r--r--	fs/nfsd/nfsfh.h	14
-rw-r--r--	fs/nfsd/nfsxdr.c	2
-rw-r--r--	fs/nfsd/vfs.c	17
-rw-r--r--	fs/nfsd/xdr4.h	2
-rw-r--r--	fs/nilfs2/cpfile.c	12
-rw-r--r--	fs/nilfs2/dat.c	12
-rw-r--r--	fs/nilfs2/file.c	1
-rw-r--r--	fs/nilfs2/inode.c	6
-rw-r--r--	fs/nilfs2/ioctl.c	137
-rw-r--r--	fs/nilfs2/sufile.c	295
-rw-r--r--	fs/nilfs2/sufile.h	2
-rw-r--r--	fs/nilfs2/super.c	1
-rw-r--r--	fs/nilfs2/the_nilfs.c	10
-rw-r--r--	fs/notify/fanotify/fanotify.c	63
-rw-r--r--	fs/notify/fanotify/fanotify.h	34
-rw-r--r--	fs/notify/fanotify/fanotify_user.c	199
-rw-r--r--	fs/ntfs/debug.c	58
-rw-r--r--	fs/ntfs/debug.h	7
-rw-r--r--	fs/ntfs/inode.c	4
-rw-r--r--	fs/ntfs/super.c	30
-rw-r--r--	fs/ocfs2/acl.c	1
-rw-r--r--	fs/ocfs2/alloc.c	3
-rw-r--r--	fs/ocfs2/aops.c	7
-rw-r--r--	fs/ocfs2/aops.h	5
-rw-r--r--	fs/ocfs2/buffer_head_io.c	2
-rw-r--r--	fs/ocfs2/cluster/sys.c	2
-rw-r--r--	fs/ocfs2/cluster/tcp.c	90
-rw-r--r--	fs/ocfs2/cluster/tcp_internal.h	2
-rw-r--r--	fs/ocfs2/dcache.c	61
-rw-r--r--	fs/ocfs2/dcache.h	12
-rw-r--r--	fs/ocfs2/dir.c	6
-rw-r--r--	fs/ocfs2/dlm/dlmdomain.c	27
-rw-r--r--	fs/ocfs2/dlm/dlmmaster.c	8
-rw-r--r--	fs/ocfs2/dlm/dlmrecovery.c	29
-rw-r--r--	fs/ocfs2/dlmglue.c	44
-rw-r--r--	fs/ocfs2/dlmglue.h	3
-rw-r--r--	fs/ocfs2/file.c	78
-rw-r--r--	fs/ocfs2/inode.c	61
-rw-r--r--	fs/ocfs2/inode.h	17
-rw-r--r--	fs/ocfs2/ioctl.c	5
-rw-r--r--	fs/ocfs2/journal.c	6
-rw-r--r--	fs/ocfs2/journal.h	11
-rw-r--r--	fs/ocfs2/locks.c	2
-rw-r--r--	fs/ocfs2/move_extents.c	7
-rw-r--r--	fs/ocfs2/namei.c	8
-rw-r--r--	fs/ocfs2/ocfs2.h	33
-rw-r--r--	fs/ocfs2/quota.h	2
-rw-r--r--	fs/ocfs2/quota_global.c	35
-rw-r--r--	fs/ocfs2/stackglue.c	22
-rw-r--r--	fs/ocfs2/suballoc.c	29
-rw-r--r--	fs/ocfs2/suballoc.h	4
-rw-r--r--	fs/ocfs2/super.c	55
-rw-r--r--	fs/ocfs2/sysfile.c	3
-rw-r--r--	fs/ocfs2/xattr.c	35
-rw-r--r--	fs/omfs/inode.c	2
-rw-r--r--	fs/open.c	94
-rw-r--r--	fs/openpromfs/inode.c	1
-rw-r--r--	fs/pipe.c	133
-rw-r--r--	fs/pnode.c	198
-rw-r--r--	fs/pnode.h	3
-rw-r--r--	fs/posix_acl.c	11
-rw-r--r--	fs/proc/Makefile	1
-rw-r--r--	fs/proc/array.c	4
-rw-r--r--	fs/proc/base.c	55
-rw-r--r--	fs/proc/fd.c	6
-rw-r--r--	fs/proc/inode.c	4
-rw-r--r--	fs/proc/internal.h	7
-rw-r--r--	fs/proc/meminfo.c	2
-rw-r--r--	fs/proc/namespaces.c	14
-rw-r--r--	fs/proc/proc_devtree.c	241
-rw-r--r--	fs/proc/root.c	5
-rw-r--r--	fs/proc/self.c	2
-rw-r--r--	fs/proc/stat.c	2
-rw-r--r--	fs/proc/task_mmu.c	5
-rw-r--r--	fs/proc/uptime.c	2
-rw-r--r--	fs/proc/vmcore.c	3
-rw-r--r--	fs/proc_namespace.c	1
-rw-r--r--	fs/pstore/inode.c	1
-rw-r--r--	fs/pstore/platform.c	1
-rw-r--r--	fs/pstore/ram.c	19
-rw-r--r--	fs/pstore/ram_core.c	4
-rw-r--r--	fs/qnx4/inode.c	1
-rw-r--r--	fs/qnx6/inode.c	1
-rw-r--r--	fs/quota/Kconfig	7
-rw-r--r--	fs/quota/dquot.c	4
-rw-r--r--	fs/read_write.c	36
-rw-r--r--	fs/reiserfs/dir.c	6
-rw-r--r--	fs/reiserfs/inode.c	2
-rw-r--r--	fs/reiserfs/reiserfs.h	1
-rw-r--r--	fs/reiserfs/super.c	4
-rw-r--r--	fs/romfs/super.c	1
-rw-r--r--	fs/splice.c	128
-rw-r--r--	fs/squashfs/super.c	1
-rw-r--r--	fs/super.c	7
-rw-r--r--	fs/sysfs/Kconfig	1
-rw-r--r--	fs/sysfs/dir.c	44
-rw-r--r--	fs/sysfs/file.c	118
-rw-r--r--	fs/sysfs/group.c	7
-rw-r--r--	fs/sysfs/mount.c	7
-rw-r--r--	fs/sysv/inode.c	3
-rw-r--r--	fs/timerfd.c	1
-rw-r--r--	fs/ubifs/file.c	1
-rw-r--r--	fs/ubifs/super.c	5
-rw-r--r--	fs/udf/file.c	2
-rw-r--r--	fs/udf/inode.c	4
-rw-r--r--	fs/udf/super.c	9
-rw-r--r--	fs/ufs/balloc.c	12
-rw-r--r--	fs/ufs/ialloc.c	4
-rw-r--r--	fs/ufs/inode.c	2
-rw-r--r--	fs/ufs/super.c	9
-rw-r--r--	fs/xfs/kmem.c	21
-rw-r--r--	fs/xfs/xfs_acl.c	2
-rw-r--r--	fs/xfs/xfs_ag.h	6
-rw-r--r--	fs/xfs/xfs_alloc.c	45
-rw-r--r--	fs/xfs/xfs_alloc_btree.c	16
-rw-r--r--	fs/xfs/xfs_aops.c	135
-rw-r--r--	fs/xfs/xfs_attr.c	24
-rw-r--r--	fs/xfs/xfs_attr_leaf.c	38
-rw-r--r--	fs/xfs/xfs_attr_list.c	1
-rw-r--r--	fs/xfs/xfs_attr_remote.c	23
-rw-r--r--	fs/xfs/xfs_bmap.c	198
-rw-r--r--	fs/xfs/xfs_bmap.h	15
-rw-r--r--	fs/xfs/xfs_bmap_btree.c	16
-rw-r--r--	fs/xfs/xfs_bmap_util.c	110
-rw-r--r--	fs/xfs/xfs_bmap_util.h	2
-rw-r--r--	fs/xfs/xfs_btree.c	14
-rw-r--r--	fs/xfs/xfs_buf.c	27
-rw-r--r--	fs/xfs/xfs_buf.h	14
-rw-r--r--	fs/xfs/xfs_buf_item.c	19
-rw-r--r--	fs/xfs/xfs_da_btree.c	19
-rw-r--r--	fs/xfs/xfs_da_btree.h	2
-rw-r--r--	fs/xfs/xfs_dinode.h	2
-rw-r--r--	fs/xfs/xfs_dir2.c	342
-rw-r--r--	fs/xfs/xfs_dir2_block.c	17
-rw-r--r--	fs/xfs/xfs_dir2_data.c	20
-rw-r--r--	fs/xfs/xfs_dir2_leaf.c	17
-rw-r--r--	fs/xfs/xfs_dir2_node.c	17
-rw-r--r--	fs/xfs/xfs_dquot.c	2
-rw-r--r--	fs/xfs/xfs_dquot_buf.c	11
-rw-r--r--	fs/xfs/xfs_error.c	27
-rw-r--r--	fs/xfs/xfs_error.h	1
-rw-r--r--	fs/xfs/xfs_export.c	2
-rw-r--r--	fs/xfs/xfs_file.c	54
-rw-r--r--	fs/xfs/xfs_format.h	2
-rw-r--r--	fs/xfs/xfs_ialloc.c	36
-rw-r--r--	fs/xfs/xfs_ialloc_btree.c	16
-rw-r--r--	fs/xfs/xfs_inode.c	124
-rw-r--r--	fs/xfs/xfs_inode.h	12
-rw-r--r--	fs/xfs/xfs_inode_buf.c	7
-rw-r--r--	fs/xfs/xfs_ioctl.c	28
-rw-r--r--	fs/xfs/xfs_iomap.c	10
-rw-r--r--	fs/xfs/xfs_iops.c	73
-rw-r--r--	fs/xfs/xfs_linux.h	2
-rw-r--r--	fs/xfs/xfs_log.c	63
-rw-r--r--	fs/xfs/xfs_log.h	2
-rw-r--r--	fs/xfs/xfs_log_cil.c	55
-rw-r--r--	fs/xfs/xfs_mount.c	5
-rw-r--r--	fs/xfs/xfs_qm.c	26
-rw-r--r--	fs/xfs/xfs_rtalloc.c	2
-rw-r--r--	fs/xfs/xfs_sb.c	21
-rw-r--r--	fs/xfs/xfs_sb.h	2
-rw-r--r--	fs/xfs/xfs_shared.h	4
-rw-r--r--	fs/xfs/xfs_super.c	7
-rw-r--r--	fs/xfs/xfs_symlink.c	9
-rw-r--r--	fs/xfs/xfs_symlink_remote.c	16
-rw-r--r--	fs/xfs/xfs_trace.h	2
-rw-r--r--	fs/xfs/xfs_trans.c	12
-rw-r--r--	fs/xfs/xfs_trans_buf.c	11
-rw-r--r--	fs/xfs/xfs_trans_resv.c	82
-rw-r--r--	fs/xfs/xfs_trans_resv.h	3
464 files changed, 12718 insertions, 8147 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index a16b0ff497ca..d8223209d4b1 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -832,6 +832,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
 
 static const struct vm_operations_struct v9fs_file_vm_ops = {
 	.fault = filemap_fault,
+	.map_pages = filemap_map_pages,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
 	.remap_pages = generic_file_remap_pages,
 };
@@ -839,6 +840,7 @@ static const struct vm_operations_struct v9fs_file_vm_ops = {
 static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
 	.close = v9fs_mmap_vm_close,
 	.fault = filemap_fault,
+	.map_pages = filemap_map_pages,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
 	.remap_pages = generic_file_remap_pages,
 };
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index bb7991c7e5c7..53161ec058a7 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -451,7 +451,7 @@ void v9fs_evict_inode(struct inode *inode)
 {
 	struct v9fs_inode *v9inode = V9FS_I(inode);
 
-	truncate_inode_pages(inode->i_mapping, 0);
+	truncate_inode_pages_final(inode->i_mapping);
 	clear_inode(inode);
 	filemap_fdatawrite(inode->i_mapping);
 
diff --git a/fs/Kconfig b/fs/Kconfig
index 7385e54be4b9..312393f32948 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -96,6 +96,7 @@ endif # BLOCK
 menu "Pseudo filesystems"
 
 source "fs/proc/Kconfig"
+source "fs/kernfs/Kconfig"
 source "fs/sysfs/Kconfig"
 
 config TMPFS
diff --git a/fs/Makefile b/fs/Makefile
index 47ac07bb4acc..f9cb9876e466 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -52,7 +52,8 @@ obj-$(CONFIG_FHANDLE) += fhandle.o
 obj-y				+= quota/
 
 obj-$(CONFIG_PROC_FS)		+= proc/
-obj-$(CONFIG_SYSFS)		+= sysfs/ kernfs/
+obj-$(CONFIG_KERNFS)		+= kernfs/
+obj-$(CONFIG_SYSFS)		+= sysfs/
 obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-y				+= devpts/
 
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 7b3003cb6f1b..9852bdf34d76 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -212,6 +212,7 @@ static int parse_options(struct super_block *sb, char *options)
 
 static int adfs_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_NODIRATIME;
 	return parse_options(sb, data);
 }
@@ -265,7 +266,7 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
-static int init_inodecache(void)
+static int __init init_inodecache(void)
 {
 	adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
 					     sizeof(struct adfs_inode_info),
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 3952121f2f28..25b23b1e7f22 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -5,14 +5,6 @@
 #include <linux/mutex.h>
 #include <linux/workqueue.h>
 
-/* AmigaOS allows file names with up to 30 characters length.
- * Names longer than that will be silently truncated. If you
- * want to disallow this, comment out the following #define.
- * Creating filesystem objects with longer names will then
- * result in an error (ENAMETOOLONG).
- */
-/*#define AFFS_NO_TRUNCATE */
-
 /* Ugly macros make the code more pretty. */
 
 #define GET_END_PTR(st,p,sz)		((st *)((char *)(p)+((sz)-sizeof(st))))
@@ -28,7 +20,6 @@
 
 #define AFFS_CACHE_SIZE		PAGE_SIZE
 
-#define AFFS_MAX_PREALLOC	32
 #define AFFS_LC_SIZE		(AFFS_CACHE_SIZE/sizeof(u32)/2)
 #define AFFS_AC_SIZE		(AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2)
 #define AFFS_AC_MASK		(AFFS_AC_SIZE-1)
@@ -118,6 +109,7 @@ struct affs_sb_info {
 #define SF_OFS		0x0200		/* Old filesystem */
 #define SF_PREFIX	0x0400		/* Buffer for prefix is allocated */
 #define SF_VERBOSE	0x0800		/* Talk about fs when mounting */
+#define SF_NO_TRUNCATE	0x1000		/* Don't truncate filenames */
 
 /* short cut to get to the affs specific sb data */
 static inline struct affs_sb_info *AFFS_SB(struct super_block *sb)
@@ -137,9 +129,13 @@ extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh);
 extern void secs_to_datestamp(time_t secs, struct affs_date *ds);
 extern umode_t prot_to_mode(u32 prot);
 extern void mode_to_prot(struct inode *inode);
-extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...);
-extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...);
-extern int affs_check_name(const unsigned char *name, int len);
+extern void affs_error(struct super_block *sb, const char *function,
+		       const char *fmt, ...);
+extern void affs_warning(struct super_block *sb, const char *function,
+			 const char *fmt, ...);
+extern bool affs_nofilenametruncate(const struct dentry *dentry);
+extern int affs_check_name(const unsigned char *name, int len,
+			   bool notruncate);
 extern int affs_copy_name(unsigned char *bstr, struct dentry *dentry);
 
 /* bitmap. c */
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index d9a43674cb94..533a322c41c0 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -471,20 +471,27 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...)
 		function,ErrorBuffer);
 }
 
+bool
+affs_nofilenametruncate(const struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	return AFFS_SB(inode->i_sb)->s_flags & SF_NO_TRUNCATE;
+
+}
+
 /* Check if the name is valid for a affs object. */
 
 int
-affs_check_name(const unsigned char *name, int len)
+affs_check_name(const unsigned char *name, int len, bool notruncate)
 {
 	int i;
 
-	if (len > 30)
-#ifdef AFFS_NO_TRUNCATE
-		return -ENAMETOOLONG;
-#else
-		len = 30;
-#endif
-
+	if (len > 30) {
+		if (notruncate)
+			return -ENAMETOOLONG;
+		else
+			len = 30;
+	}
 	for (i = 0; i < len; i++) {
 		if (name[i] < ' ' || name[i] == ':'
 		    || (name[i] > 0x7e && name[i] < 0xa0))
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index f1eba8c3644e..cbbda476a805 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -52,8 +52,10 @@ affs_readdir(struct file *file, struct dir_context *ctx)
 	int			 hash_pos;
 	int			 chain_pos;
 	u32			 ino;
+	int			 error = 0;
 
-	pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos);
+	pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",
+		 inode->i_ino, (unsigned long)ctx->pos);
 
 	if (ctx->pos < 2) {
 		file->private_data = (void *)0;
@@ -72,7 +74,7 @@ affs_readdir(struct file *file, struct dir_context *ctx)
 	}
 	dir_bh = affs_bread(sb, inode->i_ino);
 	if (!dir_bh)
-		goto readdir_out;
+		goto out_unlock_dir;
 
 	/* If the directory hasn't changed since the last call to readdir(),
 	 * we can jump directly to where we left off.
@@ -88,7 +90,8 @@ affs_readdir(struct file *file, struct dir_context *ctx)
 		fh_bh = affs_bread(sb, ino);
 		if (!fh_bh) {
 			affs_error(sb, "readdir","Cannot read block %d", i);
-			return -EIO;
+			error = -EIO;
+			goto out_brelse_dir;
 		}
 		ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
 		affs_brelse(fh_bh);
@@ -107,29 +110,34 @@ inside:
 		do {
 			fh_bh = affs_bread(sb, ino);
 			if (!fh_bh) {
-				affs_error(sb, "readdir","Cannot read block %d", ino);
+				affs_error(sb, "readdir",
+					   "Cannot read block %d", ino);
 				break;
 			}
 
 			namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
 			name = AFFS_TAIL(sb, fh_bh)->name + 1;
-			pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n",
+			pr_debug("AFFS: readdir(): dir_emit(\"%.*s\", "
+				 "ino=%u), hash=%d, f_pos=%x\n",
 				 namelen, name, ino, hash_pos, (u32)ctx->pos);
+
 			if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
-				goto readdir_done;
+				goto done;
 			ctx->pos++;
 			ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
 			affs_brelse(fh_bh);
 			fh_bh = NULL;
 		} while (ino);
 	}
-readdir_done:
+done:
 	file->f_version = inode->i_version;
 	file->private_data = (void *)(long)ino;
+	affs_brelse(fh_bh);
 
-readdir_out:
+out_brelse_dir:
 	affs_brelse(dir_bh);
-	affs_brelse(fh_bh);
+
+out_unlock_dir:
 	affs_unlock_dir(inode);
-	return 0;
+	return error;
 }
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 0e092d08680e..96df91e8c334 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -259,7 +259,7 @@ affs_evict_inode(struct inode *inode)
 {
 	unsigned long cache_page;
 	pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 
 	if (!inode->i_nlink) {
 		inode->i_size = 0;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index c36cbb4537a2..6dae1ccd176d 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -60,13 +60,13 @@ affs_get_toupper(struct super_block *sb)
  * Note: the dentry argument is the parent dentry.
  */
 static inline int
-__affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
+__affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
 {
 	const u8 *name = qstr->name;
 	unsigned long hash;
 	int i;
 
-	i = affs_check_name(qstr->name, qstr->len);
+	i = affs_check_name(qstr->name, qstr->len, notruncate);
 	if (i)
 		return i;
 
@@ -82,16 +82,22 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
 static int
 affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
-	return __affs_hash_dentry(qstr, affs_toupper);
+	return __affs_hash_dentry(qstr, affs_toupper,
+				  affs_nofilenametruncate(dentry));
+
 }
+
 static int
 affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
-	return __affs_hash_dentry(qstr, affs_intl_toupper);
+	return __affs_hash_dentry(qstr, affs_intl_toupper,
+				  affs_nofilenametruncate(dentry));
+
 }
 
 static inline int __affs_compare_dentry(unsigned int len,
-		const char *str, const struct qstr *name, toupper_t toupper)
+		const char *str, const struct qstr *name, toupper_t toupper,
+		bool notruncate)
 {
 	const u8 *aname = str;
 	const u8 *bname = name->name;
@@ -101,7 +107,7 @@ static inline int __affs_compare_dentry(unsigned int len,
 	 * must be valid. 'name' must be validated first.
 	 */
 
-	if (affs_check_name(name->name, name->len))
+	if (affs_check_name(name->name, name->len, notruncate))
 		return 1;
 
 	/*
@@ -126,13 +132,18 @@ static int
 affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
-	return __affs_compare_dentry(len, str, name, affs_toupper);
+
+	return __affs_compare_dentry(len, str, name, affs_toupper,
+				     affs_nofilenametruncate(parent));
 }
+
 static int
 affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
-	return __affs_compare_dentry(len, str, name, affs_intl_toupper);
+	return __affs_compare_dentry(len, str, name, affs_intl_toupper,
+				     affs_nofilenametruncate(parent));
+
 }
 
 /*
@@ -411,7 +422,10 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		(u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
 		(u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
 
-	retval = affs_check_name(new_dentry->d_name.name,new_dentry->d_name.len);
+	retval = affs_check_name(new_dentry->d_name.name,
+				 new_dentry->d_name.len,
+				 affs_nofilenametruncate(old_dentry));
+
 	if (retval)
 		return retval;
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d098731b82ff..895ac7dc9dbf 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -128,7 +128,7 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
-static int init_inodecache(void)
+static int __init init_inodecache(void)
 {
 	affs_inode_cachep = kmem_cache_create("affs_inode_cache",
 					     sizeof(struct affs_inode_info),
@@ -163,7 +163,7 @@ static const struct super_operations affs_sops = {
 };
 
 enum {
-	Opt_bs, Opt_mode, Opt_mufs, Opt_prefix, Opt_protect,
+	Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect,
 	Opt_reserved, Opt_root, Opt_setgid, Opt_setuid,
 	Opt_verbose, Opt_volume, Opt_ignore, Opt_err,
 };
@@ -172,6 +172,7 @@ static const match_table_t tokens = {
 	{Opt_bs, "bs=%u"},
 	{Opt_mode, "mode=%o"},
 	{Opt_mufs, "mufs"},
+	{Opt_notruncate, "nofilenametruncate"},
 	{Opt_prefix, "prefix=%s"},
 	{Opt_protect, "protect"},
 	{Opt_reserved, "reserved=%u"},
@@ -233,6 +234,9 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
 		case Opt_mufs:
 			*mount_opts |= SF_MUFS;
 			break;
+		case Opt_notruncate:
+			*mount_opts |= SF_NO_TRUNCATE;
+			break;
 		case Opt_prefix:
 			*prefix = match_strdup(&args[0]);
 			if (!*prefix)
@@ -336,8 +340,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 			   &blocksize,&sbi->s_prefix,
 			   sbi->s_volume, &mount_flags)) {
 		printk(KERN_ERR "AFFS: Error parsing options\n");
-		kfree(sbi->s_prefix);
-		kfree(sbi);
 		return -EINVAL;
 	}
 	/* N.B. after this point s_prefix must be released */
@@ -530,6 +532,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 
 	pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data);
 
+	sync_filesystem(sb);
 	*flags |= MS_NODIRATIME;
 
 	memcpy(volume, sbi->s_volume, 32);
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 1c8c6cc6de30..4b0eff6da674 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -130,6 +130,15 @@ static void afs_cm_destructor(struct afs_call *call)
 {
 	_enter("");
 
+	/* Break the callbacks here so that we do it after the final ACK is
+	 * received.  The step number here must match the final number in
+	 * afs_deliver_cb_callback().
+	 */
+	if (call->unmarshall == 6) {
+		ASSERT(call->server && call->count && call->request);
+		afs_break_callbacks(call->server, call->count, call->request);
+	}
+
 	afs_put_server(call->server);
 	call->server = NULL;
 	kfree(call->buffer);
@@ -272,6 +281,16 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
 		_debug("trailer");
 		if (skb->len != 0)
 			return -EBADMSG;
+
+		/* Record that the message was unmarshalled successfully so
+		 * that the call destructor can know do the callback breaking
+		 * work, even if the final ACK isn't received.
+		 *
+		 * If the step number changes, then afs_cm_destructor() must be
+		 * updated also.
+		 */
+		call->unmarshall++;
+	case 6:
 		break;
 	}
 
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index ce25d755b7aa..294671288449 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -422,7 +422,7 @@ void afs_evict_inode(struct inode *inode)
 
 	ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
 
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 
 	afs_give_up_callback(vnode);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6621f8008122..590b55f46d61 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -75,6 +75,7 @@ struct afs_call {
 	const struct afs_call_type *type;	/* type of call */
 	const struct afs_wait_mode *wait_mode;	/* completion wait mode */
 	wait_queue_head_t	waitq;		/* processes awaiting completion */
+	void (*async_workfn)(struct afs_call *call); /* asynchronous work function */
 	struct work_struct	async_work;	/* asynchronous work processor */
 	struct work_struct	work;		/* actual work processor */
 	struct sk_buff_head	rx_queue;	/* received packets */
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 8ad8c2a0703a..03a3beb17004 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -25,7 +25,7 @@ static void afs_wake_up_call_waiter(struct afs_call *);
 static int afs_wait_for_call_to_complete(struct afs_call *);
 static void afs_wake_up_async_call(struct afs_call *);
 static int afs_dont_wait_for_call_to_complete(struct afs_call *);
-static void afs_process_async_call(struct work_struct *);
+static void afs_process_async_call(struct afs_call *);
 static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *);
 static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool);
 
@@ -58,6 +58,13 @@ static void afs_collect_incoming_call(struct work_struct *);
 static struct sk_buff_head afs_incoming_calls;
 static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call);
 
+static void afs_async_workfn(struct work_struct *work)
+{
+	struct afs_call *call = container_of(work, struct afs_call, async_work);
+
+	call->async_workfn(call);
+}
+
 /*
  * open an RxRPC socket and bind it to be a server for callback notifications
  * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
@@ -184,6 +191,28 @@ static void afs_free_call(struct afs_call *call)
 }
 
 /*
+ * End a call but do not free it
+ */
+static void afs_end_call_nofree(struct afs_call *call)
+{
+	if (call->rxcall) {
+		rxrpc_kernel_end_call(call->rxcall);
+		call->rxcall = NULL;
+	}
+	if (call->type->destructor)
+		call->type->destructor(call);
+}
+
+/*
+ * End a call and free it
+ */
+static void afs_end_call(struct afs_call *call)
+{
+	afs_end_call_nofree(call);
+	afs_free_call(call);
+}
+
+/*
  * allocate a call with flat request and reply buffers
  */
 struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
@@ -326,7 +355,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 	       atomic_read(&afs_outstanding_calls));
 
 	call->wait_mode = wait_mode;
-	INIT_WORK(&call->async_work, afs_process_async_call);
+	call->async_workfn = afs_process_async_call;
+	INIT_WORK(&call->async_work, afs_async_workfn);
 
 	memset(&srx, 0, sizeof(srx));
 	srx.srx_family = AF_RXRPC;
@@ -383,11 +413,8 @@ error_do_abort:
 	rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT);
 	while ((skb = skb_dequeue(&call->rx_queue)))
 		afs_free_skb(skb);
-	rxrpc_kernel_end_call(rxcall);
-	call->rxcall = NULL;
 error_kill_call:
-	call->type->destructor(call);
-	afs_free_call(call);
+	afs_end_call(call);
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -509,12 +536,8 @@ static void afs_deliver_to_call(struct afs_call *call)
 	if (call->state >= AFS_CALL_COMPLETE) {
 		while ((skb = skb_dequeue(&call->rx_queue)))
 			afs_free_skb(skb);
-		if (call->incoming) {
-			rxrpc_kernel_end_call(call->rxcall);
-			call->rxcall = NULL;
-			call->type->destructor(call);
-			afs_free_call(call);
-		}
+		if (call->incoming)
+			afs_end_call(call);
 	}
 
 	_leave("");
@@ -564,10 +587,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call)
 	}
 
 	_debug("call complete");
-	rxrpc_kernel_end_call(call->rxcall);
-	call->rxcall = NULL;
-	call->type->destructor(call);
-	afs_free_call(call);
+	afs_end_call(call);
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -603,11 +623,8 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call)
 /*
  * delete an asynchronous call
  */
-static void afs_delete_async_call(struct work_struct *work)
+static void afs_delete_async_call(struct afs_call *call)
 {
-	struct afs_call *call =
-		container_of(work, struct afs_call, async_work);
-
 	_enter("");
 
 	afs_free_call(call);
@@ -620,11 +637,8 @@ static void afs_delete_async_call(struct work_struct *work)
  * - on a multiple-thread workqueue this work item may try to run on several
  *   CPUs at the same time
  */
-static void afs_process_async_call(struct work_struct *work)
+static void afs_process_async_call(struct afs_call *call)
 {
-	struct afs_call *call =
-		container_of(work, struct afs_call, async_work);
-
 	_enter("");
 
 	if (!skb_queue_empty(&call->rx_queue))
@@ -637,14 +651,11 @@ static void afs_process_async_call(struct work_struct *work)
 		call->reply = NULL;
 
 		/* kill the call */
-		rxrpc_kernel_end_call(call->rxcall);
-		call->rxcall = NULL;
-		if (call->type->destructor)
-			call->type->destructor(call);
+		afs_end_call_nofree(call);
 
 		/* we can't just delete the call because the work item may be
 		 * queued */
-		PREPARE_WORK(&call->async_work, afs_delete_async_call);
+		call->async_workfn = afs_delete_async_call;
 		queue_work(afs_async_calls, &call->async_work);
 	}
 
@@ -685,7 +696,8 @@ static void afs_collect_incoming_call(struct work_struct *work)
 		return;
 	}
 
-	INIT_WORK(&call->async_work, afs_process_async_call);
+	call->async_workfn = afs_process_async_call;
+	INIT_WORK(&call->async_work, afs_async_workfn);
 	call->wait_mode = &afs_async_incoming_call;
 	call->type = &afs_RXCMxxxx;
 	init_waitqueue_head(&call->waitq);
@@ -782,10 +794,7 @@ void afs_send_empty_reply(struct afs_call *call)
 		_debug("oom");
 		rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
 	default:
-		rxrpc_kernel_end_call(call->rxcall);
-		call->rxcall = NULL;
-		call->type->destructor(call);
-		afs_free_call(call);
+		afs_end_call(call);
 		_leave(" [error]");
 		return;
 	}
@@ -815,17 +824,16 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 	call->state = AFS_CALL_AWAIT_ACK;
 	n = rxrpc_kernel_send_data(call->rxcall, &msg, len);
 	if (n >= 0) {
+		/* Success */
 		_leave(" [replied]");
 		return;
 	}
+
 	if (n == -ENOMEM) {
 		_debug("oom");
 		rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
 	}
-	rxrpc_kernel_end_call(call->rxcall);
-	call->rxcall = NULL;
-	call->type->destructor(call);
-	afs_free_call(call);
+	afs_end_call(call);
 	_leave(" [error]");
 }
 
diff --git a/fs/aio.c b/fs/aio.c
index 062a5f6a1448..a0ed6c7d2cd2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -52,7 +52,8 @@
 struct aio_ring {
 	unsigned	id;	/* kernel internal index number */
 	unsigned	nr;	/* number of io_events */
-	unsigned	head;
+	unsigned	head;	/* Written to by userland or under ring_lock
+				 * mutex by aio_read_events_ring(). */
 	unsigned	tail;
 
 	unsigned	magic;
@@ -111,6 +112,11 @@ struct kioctx {
 
 	struct work_struct	free_work;
 
+	/*
+	 * signals when all in-flight requests are done
+	 */
+	struct completion *requests_done;
+
 	struct {
 		/*
 		 * This counts the number of available slots in the ringbuffer,
@@ -243,6 +249,11 @@ static void aio_free_ring(struct kioctx *ctx)
 {
 	int i;
 
+	/* Disconnect the kiotx from the ring file.  This prevents future
+	 * accesses to the kioctx from page migration.
+	 */
+	put_aio_ring_file(ctx);
+
 	for (i = 0; i < ctx->nr_pages; i++) {
 		struct page *page;
 		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
@@ -254,8 +265,6 @@ static void aio_free_ring(struct kioctx *ctx)
 		put_page(page);
 	}
 
-	put_aio_ring_file(ctx);
-
 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
 		kfree(ctx->ring_pages);
 		ctx->ring_pages = NULL;
@@ -283,29 +292,38 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 {
 	struct kioctx *ctx;
 	unsigned long flags;
+	pgoff_t idx;
 	int rc;
 
 	rc = 0;
 
-	/* Make sure the old page hasn't already been changed */
+	/* mapping->private_lock here protects against the kioctx teardown.  */
 	spin_lock(&mapping->private_lock);
 	ctx = mapping->private_data;
-	if (ctx) {
-		pgoff_t idx;
-		spin_lock_irqsave(&ctx->completion_lock, flags);
-		idx = old->index;
-		if (idx < (pgoff_t)ctx->nr_pages) {
-			if (ctx->ring_pages[idx] != old)
-				rc = -EAGAIN;
-		} else
-			rc = -EINVAL;
-		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	if (!ctx) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	/* The ring_lock mutex.  The prevents aio_read_events() from writing
+	 * to the ring's head, and prevents page migration from mucking in
+	 * a partially initialized kiotx.
+	 */
+	if (!mutex_trylock(&ctx->ring_lock)) {
+		rc = -EAGAIN;
+		goto out;
+	}
+
+	idx = old->index;
+	if (idx < (pgoff_t)ctx->nr_pages) {
+		/* Make sure the old page hasn't already been changed */
+		if (ctx->ring_pages[idx] != old)
+			rc = -EAGAIN;
 	} else
 		rc = -EINVAL;
-	spin_unlock(&mapping->private_lock);
 
 	if (rc != 0)
-		return rc;
+		goto out_unlock;
 
 	/* Writeback must be complete */
 	BUG_ON(PageWriteback(old));
@@ -314,38 +332,26 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
 	if (rc != MIGRATEPAGE_SUCCESS) {
 		put_page(new);
-		return rc;
+		goto out_unlock;
 	}
 
-	/* We can potentially race against kioctx teardown here.  Use the
-	 * address_space's private data lock to protect the mapping's
-	 * private_data.
+	/* Take completion_lock to prevent other writes to the ring buffer
+	 * while the old page is copied to the new.  This prevents new
+	 * events from being lost.
 	 */
-	spin_lock(&mapping->private_lock);
-	ctx = mapping->private_data;
-	if (ctx) {
-		pgoff_t idx;
-		spin_lock_irqsave(&ctx->completion_lock, flags);
-		migrate_page_copy(new, old);
-		idx = old->index;
-		if (idx < (pgoff_t)ctx->nr_pages) {
-			/* And only do the move if things haven't changed */
-			if (ctx->ring_pages[idx] == old)
-				ctx->ring_pages[idx] = new;
-			else
-				rc = -EAGAIN;
-		} else
-			rc = -EINVAL;
-		spin_unlock_irqrestore(&ctx->completion_lock, flags);
-	} else
-		rc = -EBUSY;
-	spin_unlock(&mapping->private_lock);
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	migrate_page_copy(new, old);
+	BUG_ON(ctx->ring_pages[idx] != old);
+	ctx->ring_pages[idx] = new;
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
-	if (rc == MIGRATEPAGE_SUCCESS)
-		put_page(old);
-	else
-		put_page(new);
+	/* The old page is no longer accessible. */
+	put_page(old);
 
+out_unlock:
+	mutex_unlock(&ctx->ring_lock);
+out:
+	spin_unlock(&mapping->private_lock);
 	return rc;
 }
 #endif
@@ -380,7 +386,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	file = aio_private_file(ctx, nr_pages);
 	if (IS_ERR(file)) {
 		ctx->aio_ring_file = NULL;
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	ctx->aio_ring_file = file;
@@ -415,7 +421,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	if (unlikely(i != nr_pages)) {
 		aio_free_ring(ctx);
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	ctx->mmap_size = nr_pages * PAGE_SIZE;
@@ -429,7 +435,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	if (IS_ERR((void *)ctx->mmap_base)) {
 		ctx->mmap_size = 0;
 		aio_free_ring(ctx);
-		return -EAGAIN;
+		return -ENOMEM;
 	}
 
 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
@@ -507,6 +513,10 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
 {
 	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
 
+	/* At this point we know that there are no any in-flight requests */
+	if (ctx->requests_done)
+		complete(ctx->requests_done);
+
 	INIT_WORK(&ctx->free_work, free_ioctx);
 	schedule_work(&ctx->free_work);
 }
@@ -556,6 +566,10 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 	rcu_read_unlock();
 	spin_unlock(&mm->ioctx_lock);
 
+	/* While kioctx setup is in progress,
+	 * we are protected from page migration
+	 * changes ring_pages by ->ring_lock.
+	 */
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	ring->id = ctx->id;
 	kunmap_atomic(ring);
@@ -640,24 +654,28 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	ctx->max_reqs = nr_events;
 
-	if (percpu_ref_init(&ctx->users, free_ioctx_users))
-		goto err;
-
-	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
-		goto err;
-
 	spin_lock_init(&ctx->ctx_lock);
 	spin_lock_init(&ctx->completion_lock);
 	mutex_init(&ctx->ring_lock);
+	/* Protect against page migration throughout kiotx setup by keeping
+	 * the ring_lock mutex held until setup is complete. */
+	mutex_lock(&ctx->ring_lock);
 	init_waitqueue_head(&ctx->wait);
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
 
+	if (percpu_ref_init(&ctx->users, free_ioctx_users))
+		goto err;
+
+	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+		goto err;
+
 	ctx->cpu = alloc_percpu(struct kioctx_cpu);
 	if (!ctx->cpu)
 		goto err;
 
-	if (aio_setup_ring(ctx) < 0)
+	err = aio_setup_ring(ctx);
+	if (err < 0)
 		goto err;
 
 	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
@@ -683,6 +701,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	if (err)
 		goto err_cleanup;
 
+	/* Release the ring_lock mutex now that all setup is complete. */
+	mutex_unlock(&ctx->ring_lock);
+
 	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		 ctx, ctx->user_id, mm, ctx->nr_events);
 	return ctx;
@@ -692,6 +713,7 @@ err_cleanup:
 err_ctx:
 	aio_free_ring(ctx);
 err:
+	mutex_unlock(&ctx->ring_lock);
 	free_percpu(ctx->cpu);
 	free_percpu(ctx->reqs.pcpu_count);
 	free_percpu(ctx->users.pcpu_count);
@@ -705,7 +727,8 @@ err:
  * when the processes owning a context have all exited to encourage
  * the rapid destruction of the kioctx.
  */
-static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
+static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
+		struct completion *requests_done)
 {
 	if (!atomic_xchg(&ctx->dead, 1)) {
 		struct kioctx_table *table;
@@ -734,7 +757,11 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
 		if (ctx->mmap_size)
 			vm_munmap(ctx->mmap_base, ctx->mmap_size);
 
+		ctx->requests_done = requests_done;
 		percpu_ref_kill(&ctx->users);
+	} else {
+		if (requests_done)
+			complete(requests_done);
 	}
 }
 
@@ -796,7 +823,7 @@ void exit_aio(struct mm_struct *mm)
 		 */
 		ctx->mmap_size = 0;
 
-		kill_ioctx(mm, ctx);
+		kill_ioctx(mm, ctx, NULL);
 	}
 }
 
@@ -1024,6 +1051,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
 
 	mutex_lock(&ctx->ring_lock);
 
+	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	head = ring->head;
 	tail = ring->tail;
@@ -1171,7 +1199,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
 	if (!IS_ERR(ioctx)) {
 		ret = put_user(ioctx->user_id, ctxp);
 		if (ret)
-			kill_ioctx(current->mm, ioctx);
+			kill_ioctx(current->mm, ioctx, NULL);
 		percpu_ref_put(&ioctx->users);
 	}
 
@@ -1189,8 +1217,22 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
1189{ 1217{
1190 struct kioctx *ioctx = lookup_ioctx(ctx); 1218 struct kioctx *ioctx = lookup_ioctx(ctx);
1191 if (likely(NULL != ioctx)) { 1219 if (likely(NULL != ioctx)) {
1192 kill_ioctx(current->mm, ioctx); 1220 struct completion requests_done =
1221 COMPLETION_INITIALIZER_ONSTACK(requests_done);
1222
1223 /* Pass requests_done to kill_ioctx() where it can be set
1224 * in a thread-safe way. If we try to set it here then we have
1225	 * a race condition if two io_destroy() calls run simultaneously.
1226 */
1227 kill_ioctx(current->mm, ioctx, &requests_done);
1193 percpu_ref_put(&ioctx->users); 1228 percpu_ref_put(&ioctx->users);
1229
1230	/* Wait until all IO for the context is done. Otherwise the kernel
1231	 * keeps using user-space buffers even if the user thinks the context
1232	 * is destroyed.
1233 */
1234 wait_for_completion(&requests_done);
1235
1194 return 0; 1236 return 0;
1195 } 1237 }
1196 pr_debug("EINVAL: io_destroy: invalid context id\n"); 1238 pr_debug("EINVAL: io_destroy: invalid context id\n");
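
io_destroy() above declares the completion on its own stack, hands it to kill_ioctx(), and blocks until the last in-flight request fires it, so the kernel stops touching user buffers before the syscall returns. A userspace analogue built on pthreads (the completion type is hand-rolled here; the kernel's <linux/completion.h> differs in detail):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

#define COMPLETION_INIT \
	{ PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0 }

static void complete(struct completion *x)
{
	pthread_mutex_lock(&x->lock);
	x->done = 1;
	pthread_cond_signal(&x->cond);
	pthread_mutex_unlock(&x->lock);
}

static void wait_for_completion(struct completion *x)
{
	pthread_mutex_lock(&x->lock);
	while (!x->done)
		pthread_cond_wait(&x->cond, &x->lock);
	pthread_mutex_unlock(&x->lock);
}

static void *last_request(void *arg)
{
	usleep(1000);		/* stand-in for in-flight aio finishing */
	complete(arg);
	return NULL;
}

int main(void)
{
	struct completion requests_done = COMPLETION_INIT;
	pthread_t t;

	pthread_create(&t, NULL, last_request, &requests_done);
	wait_for_completion(&requests_done);	/* io_destroy() blocks here */
	puts("all requests drained; safe to reuse user buffers");
	pthread_join(t, NULL);
	return 0;
}
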
@@ -1285,10 +1327,8 @@ rw_common:
1285 &iovec, compat) 1327 &iovec, compat)
1286 : aio_setup_single_vector(req, rw, buf, &nr_segs, 1328 : aio_setup_single_vector(req, rw, buf, &nr_segs,
1287 iovec); 1329 iovec);
1288 if (ret) 1330 if (!ret)
1289 return ret; 1331 ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
1290
1291 ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
1292 if (ret < 0) { 1332 if (ret < 0) {
1293 if (iovec != &inline_vec) 1333 if (iovec != &inline_vec)
1294 kfree(iovec); 1334 kfree(iovec);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 3182c0e68b42..232e03d4780d 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -103,6 +103,9 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
103 if (tmp.size < sizeof(tmp)) 103 if (tmp.size < sizeof(tmp))
104 return ERR_PTR(-EINVAL); 104 return ERR_PTR(-EINVAL);
105 105
106 if (tmp.size > (PATH_MAX + sizeof(tmp)))
107 return ERR_PTR(-ENAMETOOLONG);
108
106 return memdup_user(in, tmp.size); 109 return memdup_user(in, tmp.size);
107} 110}
108 111
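
The new check bounds the caller-supplied size to the fixed header plus PATH_MAX before memdup_user() trusts it, turning an unbounded allocation request into -ENAMETOOLONG. A standalone sketch of the same validate-then-copy pattern (the header struct and the 4096 PATH_MAX stand-in are assumptions):

#include <errno.h>
#include <stdlib.h>
#include <string.h>

#define PATH_MAX_STANDIN 4096

struct ioctl_hdr { unsigned int size; /* ... */ };

static void *copy_ioctl(const void *user_buf, unsigned int size)
{
	void *p;

	if (size < sizeof(struct ioctl_hdr)) {
		errno = EINVAL;		/* too small to hold the header */
		return NULL;
	}
	if (size > PATH_MAX_STANDIN + sizeof(struct ioctl_hdr)) {
		errno = ENAMETOOLONG;	/* claims a path longer than allowed */
		return NULL;
	}
	p = malloc(size);		/* memdup_user() analogue */
	if (p)
		memcpy(p, user_buf, size);
	return p;
}

int main(void)
{
	struct ioctl_hdr hdr = { .size = sizeof(hdr) };
	void *p = copy_ioctl(&hdr, hdr.size);

	free(p);
	return p ? 0 : 1;
}
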
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 2caf36ac3e93..cc87c1abac97 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
179 spin_lock(&active->d_lock); 179 spin_lock(&active->d_lock);
180 180
181 /* Already gone? */ 181 /* Already gone? */
182 if (!d_count(active)) 182 if ((int) d_count(active) <= 0)
183 goto next; 183 goto next;
184 184
185 qstr = &active->d_name; 185 qstr = &active->d_name;
@@ -230,7 +230,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
230 230
231 spin_lock(&expiring->d_lock); 231 spin_lock(&expiring->d_lock);
232 232
233 /* Bad luck, we've already been dentry_iput */ 233 /* We've already been dentry_iput or unlinked */
234 if (!expiring->d_inode) 234 if (!expiring->d_inode)
235 goto next; 235 goto next;
236 236
diff --git a/fs/befs/Makefile b/fs/befs/Makefile
index 2f370bd7a50d..8b9f66642a83 100644
--- a/fs/befs/Makefile
+++ b/fs/befs/Makefile
@@ -3,5 +3,5 @@
3# 3#
4 4
5obj-$(CONFIG_BEFS_FS) += befs.o 5obj-$(CONFIG_BEFS_FS) += befs.o
6 6ccflags-$(CONFIG_BEFS_DEBUG) += -DDEBUG
7befs-objs := datastream.o btree.o super.o inode.o debug.o io.o linuxvfs.o 7befs-objs := datastream.o btree.o super.o inode.o debug.o io.o linuxvfs.o
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index b26642839156..3a7813ab8c95 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -88,8 +88,11 @@ enum befs_err {
88 88
89/****************************/ 89/****************************/
90/* debug.c */ 90/* debug.c */
91__printf(2, 3)
91void befs_error(const struct super_block *sb, const char *fmt, ...); 92void befs_error(const struct super_block *sb, const char *fmt, ...);
93__printf(2, 3)
92void befs_warning(const struct super_block *sb, const char *fmt, ...); 94void befs_warning(const struct super_block *sb, const char *fmt, ...);
95__printf(2, 3)
93void befs_debug(const struct super_block *sb, const char *fmt, ...); 96void befs_debug(const struct super_block *sb, const char *fmt, ...);
94 97
95void befs_dump_super_block(const struct super_block *sb, befs_super_block *); 98void befs_dump_super_block(const struct super_block *sb, befs_super_block *);
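
__printf(a, b) is the kernel's wrapper around GCC's format attribute; with it on the prototypes above, every befs_error()/befs_warning()/befs_debug() call site has its varargs type-checked against the format string, which is what surfaces the %Lu/%u mismatches fixed throughout the befs hunks below. A compilable sketch with the attribute spelled out locally (the demo function and local macro are assumptions, not the kernel definitions):

#include <stdarg.h>
#include <stdio.h>

#define __printf(a, b) __attribute__((format(printf, a, b)))

struct super_block;	/* opaque here, just to mirror the prototype */

__printf(2, 3)
static void befs_error_demo(const struct super_block *sb,
			    const char *fmt, ...)
{
	va_list args;

	(void)sb;
	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
}

int main(void)
{
	/* befs_error_demo(NULL, "block %llu\n", 1) would now warn:
	 * int passed where %llu expects unsigned long long. */
	befs_error_demo(NULL, "block %llu\n", 1ULL);
	return 0;
}
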
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 74e397db0b8b..a2cd305a993a 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -137,7 +137,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
137 struct buffer_head *bh = NULL; 137 struct buffer_head *bh = NULL;
138 befs_disk_btree_super *od_sup = NULL; 138 befs_disk_btree_super *od_sup = NULL;
139 139
140 befs_debug(sb, "---> befs_btree_read_super()"); 140 befs_debug(sb, "---> %s", __func__);
141 141
142 bh = befs_read_datastream(sb, ds, 0, NULL); 142 bh = befs_read_datastream(sb, ds, 0, NULL);
143 143
@@ -162,11 +162,11 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
162 goto error; 162 goto error;
163 } 163 }
164 164
165 befs_debug(sb, "<--- befs_btree_read_super()"); 165 befs_debug(sb, "<--- %s", __func__);
166 return BEFS_OK; 166 return BEFS_OK;
167 167
168 error: 168 error:
169 befs_debug(sb, "<--- befs_btree_read_super() ERROR"); 169 befs_debug(sb, "<--- %s ERROR", __func__);
170 return BEFS_ERR; 170 return BEFS_ERR;
171} 171}
172 172
@@ -195,16 +195,16 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
195{ 195{
196 uint off = 0; 196 uint off = 0;
197 197
198 befs_debug(sb, "---> befs_bt_read_node()"); 198 befs_debug(sb, "---> %s", __func__);
199 199
200 if (node->bh) 200 if (node->bh)
201 brelse(node->bh); 201 brelse(node->bh);
202 202
203 node->bh = befs_read_datastream(sb, ds, node_off, &off); 203 node->bh = befs_read_datastream(sb, ds, node_off, &off);
204 if (!node->bh) { 204 if (!node->bh) {
205 befs_error(sb, "befs_bt_read_node() failed to read " 205 befs_error(sb, "%s failed to read "
206 "node at %Lu", node_off); 206 "node at %llu", __func__, node_off);
207 befs_debug(sb, "<--- befs_bt_read_node() ERROR"); 207 befs_debug(sb, "<--- %s ERROR", __func__);
208 208
209 return BEFS_ERR; 209 return BEFS_ERR;
210 } 210 }
@@ -221,7 +221,7 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
221 node->head.all_key_length = 221 node->head.all_key_length =
222 fs16_to_cpu(sb, node->od_node->all_key_length); 222 fs16_to_cpu(sb, node->od_node->all_key_length);
223 223
224 befs_debug(sb, "<--- befs_btree_read_node()"); 224 befs_debug(sb, "<--- %s", __func__);
225 return BEFS_OK; 225 return BEFS_OK;
226} 226}
227 227
@@ -252,7 +252,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
252 befs_off_t node_off; 252 befs_off_t node_off;
253 int res; 253 int res;
254 254
255 befs_debug(sb, "---> befs_btree_find() Key: %s", key); 255 befs_debug(sb, "---> %s Key: %s", __func__, key);
256 256
257 if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) { 257 if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) {
258 befs_error(sb, 258 befs_error(sb,
@@ -263,7 +263,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
263 this_node = kmalloc(sizeof (befs_btree_node), 263 this_node = kmalloc(sizeof (befs_btree_node),
264 GFP_NOFS); 264 GFP_NOFS);
265 if (!this_node) { 265 if (!this_node) {
266 befs_error(sb, "befs_btree_find() failed to allocate %u " 266 befs_error(sb, "befs_btree_find() failed to allocate %zu "
267 "bytes of memory", sizeof (befs_btree_node)); 267 "bytes of memory", sizeof (befs_btree_node));
268 goto error; 268 goto error;
269 } 269 }
@@ -274,7 +274,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
274 node_off = bt_super.root_node_ptr; 274 node_off = bt_super.root_node_ptr;
275 if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { 275 if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
276 befs_error(sb, "befs_btree_find() failed to read " 276 befs_error(sb, "befs_btree_find() failed to read "
277 "node at %Lu", node_off); 277 "node at %llu", node_off);
278 goto error_alloc; 278 goto error_alloc;
279 } 279 }
280 280
@@ -285,7 +285,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
285 /* if no match, go to overflow node */ 285 /* if no match, go to overflow node */
286 if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { 286 if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
287 befs_error(sb, "befs_btree_find() failed to read " 287 befs_error(sb, "befs_btree_find() failed to read "
288 "node at %Lu", node_off); 288 "node at %llu", node_off);
289 goto error_alloc; 289 goto error_alloc;
290 } 290 }
291 } 291 }
@@ -298,11 +298,11 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
298 kfree(this_node); 298 kfree(this_node);
299 299
300 if (res != BEFS_BT_MATCH) { 300 if (res != BEFS_BT_MATCH) {
301 befs_debug(sb, "<--- befs_btree_find() Key %s not found", key); 301 befs_debug(sb, "<--- %s Key %s not found", __func__, key);
302 *value = 0; 302 *value = 0;
303 return BEFS_BT_NOT_FOUND; 303 return BEFS_BT_NOT_FOUND;
304 } 304 }
305 befs_debug(sb, "<--- befs_btree_find() Found key %s, value %Lu", 305 befs_debug(sb, "<--- %s Found key %s, value %llu", __func__,
306 key, *value); 306 key, *value);
307 return BEFS_OK; 307 return BEFS_OK;
308 308
@@ -310,7 +310,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
310 kfree(this_node); 310 kfree(this_node);
311 error: 311 error:
312 *value = 0; 312 *value = 0;
313 befs_debug(sb, "<--- befs_btree_find() ERROR"); 313 befs_debug(sb, "<--- %s ERROR", __func__);
314 return BEFS_ERR; 314 return BEFS_ERR;
315} 315}
316 316
@@ -343,7 +343,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
343 char *thiskey; 343 char *thiskey;
344 fs64 *valarray; 344 fs64 *valarray;
345 345
346 befs_debug(sb, "---> befs_find_key() %s", findkey); 346 befs_debug(sb, "---> %s %s", __func__, findkey);
347 347
348 *value = 0; 348 *value = 0;
349 349
@@ -355,7 +355,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
355 355
356 eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len); 356 eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len);
357 if (eq < 0) { 357 if (eq < 0) {
358 befs_debug(sb, "<--- befs_find_key() %s not found", findkey); 358 befs_debug(sb, "<--- %s %s not found", __func__, findkey);
359 return BEFS_BT_NOT_FOUND; 359 return BEFS_BT_NOT_FOUND;
360 } 360 }
361 361
@@ -373,8 +373,8 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
373 findkey_len); 373 findkey_len);
374 374
375 if (eq == 0) { 375 if (eq == 0) {
376 befs_debug(sb, "<--- befs_find_key() found %s at %d", 376 befs_debug(sb, "<--- %s found %s at %d",
377 thiskey, mid); 377 __func__, thiskey, mid);
378 378
379 *value = fs64_to_cpu(sb, valarray[mid]); 379 *value = fs64_to_cpu(sb, valarray[mid]);
380 return BEFS_BT_MATCH; 380 return BEFS_BT_MATCH;
@@ -388,7 +388,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
388 *value = fs64_to_cpu(sb, valarray[mid + 1]); 388 *value = fs64_to_cpu(sb, valarray[mid + 1]);
389 else 389 else
390 *value = fs64_to_cpu(sb, valarray[mid]); 390 *value = fs64_to_cpu(sb, valarray[mid]);
391 befs_debug(sb, "<--- befs_find_key() found %s at %d", thiskey, mid); 391 befs_debug(sb, "<--- %s found %s at %d", __func__, thiskey, mid);
392 return BEFS_BT_PARMATCH; 392 return BEFS_BT_PARMATCH;
393} 393}
394 394
@@ -428,7 +428,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
428 428
429 uint key_sum = 0; 429 uint key_sum = 0;
430 430
431 befs_debug(sb, "---> befs_btree_read()"); 431 befs_debug(sb, "---> %s", __func__);
432 432
433 if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) { 433 if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) {
434 befs_error(sb, 434 befs_error(sb,
@@ -437,7 +437,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
437 } 437 }
438 438
439 if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) { 439 if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) {
440 befs_error(sb, "befs_btree_read() failed to allocate %u " 440 befs_error(sb, "befs_btree_read() failed to allocate %zu "
441 "bytes of memory", sizeof (befs_btree_node)); 441 "bytes of memory", sizeof (befs_btree_node));
442 goto error; 442 goto error;
443 } 443 }
@@ -452,7 +452,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
452 kfree(this_node); 452 kfree(this_node);
453 *value = 0; 453 *value = 0;
454 *keysize = 0; 454 *keysize = 0;
455 befs_debug(sb, "<--- befs_btree_read() Tree is EMPTY"); 455 befs_debug(sb, "<--- %s Tree is EMPTY", __func__);
456 return BEFS_BT_EMPTY; 456 return BEFS_BT_EMPTY;
457 } else if (res == BEFS_ERR) { 457 } else if (res == BEFS_ERR) {
458 goto error_alloc; 458 goto error_alloc;
@@ -467,7 +467,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
467 *keysize = 0; 467 *keysize = 0;
468 *value = 0; 468 *value = 0;
469 befs_debug(sb, 469 befs_debug(sb,
470 "<--- befs_btree_read() END of keys at %Lu", 470 "<--- %s END of keys at %llu", __func__,
471 (unsigned long long)
471 key_sum + this_node->head.all_key_count); 472 key_sum + this_node->head.all_key_count);
472 brelse(this_node->bh); 473 brelse(this_node->bh);
473 kfree(this_node); 474 kfree(this_node);
@@ -478,8 +479,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
478 node_off = this_node->head.right; 479 node_off = this_node->head.right;
479 480
480 if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { 481 if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
481 befs_error(sb, "befs_btree_read() failed to read " 482 befs_error(sb, "%s failed to read node at %llu",
482 "node at %Lu", node_off); 483 __func__, (unsigned long long)node_off);
483 goto error_alloc; 484 goto error_alloc;
484 } 485 }
485 } 486 }
@@ -492,11 +493,13 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
492 493
493 keystart = befs_bt_get_key(sb, this_node, cur_key, &keylen); 494 keystart = befs_bt_get_key(sb, this_node, cur_key, &keylen);
494 495
495 befs_debug(sb, "Read [%Lu,%d]: keysize %d", node_off, cur_key, keylen); 496 befs_debug(sb, "Read [%llu,%d]: keysize %d",
497 (long long unsigned int)node_off, (int)cur_key,
498 (int)keylen);
496 499
497 if (bufsize < keylen + 1) { 500 if (bufsize < keylen + 1) {
498 befs_error(sb, "befs_btree_read() keybuf too small (%u) " 501 befs_error(sb, "%s keybuf too small (%zu) "
499 "for key of size %d", bufsize, keylen); 502 "for key of size %d", __func__, bufsize, keylen);
500 brelse(this_node->bh); 503 brelse(this_node->bh);
501 goto error_alloc; 504 goto error_alloc;
502 }; 505 };
@@ -506,13 +509,13 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
506 *keysize = keylen; 509 *keysize = keylen;
507 keybuf[keylen] = '\0'; 510 keybuf[keylen] = '\0';
508 511
509 befs_debug(sb, "Read [%Lu,%d]: Key \"%.*s\", Value %Lu", node_off, 512 befs_debug(sb, "Read [%llu,%d]: Key \"%.*s\", Value %llu", node_off,
510 cur_key, keylen, keybuf, *value); 513 cur_key, keylen, keybuf, *value);
511 514
512 brelse(this_node->bh); 515 brelse(this_node->bh);
513 kfree(this_node); 516 kfree(this_node);
514 517
515 befs_debug(sb, "<--- befs_btree_read()"); 518 befs_debug(sb, "<--- %s", __func__);
516 519
517 return BEFS_OK; 520 return BEFS_OK;
518 521
@@ -522,7 +525,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
522 error: 525 error:
523 *keysize = 0; 526 *keysize = 0;
524 *value = 0; 527 *value = 0;
525 befs_debug(sb, "<--- befs_btree_read() ERROR"); 528 befs_debug(sb, "<--- %s ERROR", __func__);
526 return BEFS_ERR; 529 return BEFS_ERR;
527} 530}
528 531
@@ -547,26 +550,26 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
547 befs_off_t * node_off) 550 befs_off_t * node_off)
548{ 551{
549 552
550 befs_debug(sb, "---> befs_btree_seekleaf()"); 553 befs_debug(sb, "---> %s", __func__);
551 554
552 if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) { 555 if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
553 befs_error(sb, "befs_btree_seekleaf() failed to read " 556 befs_error(sb, "%s failed to read "
554 "node at %Lu", *node_off); 557 "node at %llu", __func__, *node_off);
555 goto error; 558 goto error;
556 } 559 }
557 befs_debug(sb, "Seekleaf to root node %Lu", *node_off); 560 befs_debug(sb, "Seekleaf to root node %llu", *node_off);
558 561
559 if (this_node->head.all_key_count == 0 && befs_leafnode(this_node)) { 562 if (this_node->head.all_key_count == 0 && befs_leafnode(this_node)) {
560 befs_debug(sb, "<--- befs_btree_seekleaf() Tree is EMPTY"); 563 befs_debug(sb, "<--- %s Tree is EMPTY", __func__);
561 return BEFS_BT_EMPTY; 564 return BEFS_BT_EMPTY;
562 } 565 }
563 566
564 while (!befs_leafnode(this_node)) { 567 while (!befs_leafnode(this_node)) {
565 568
566 if (this_node->head.all_key_count == 0) { 569 if (this_node->head.all_key_count == 0) {
567 befs_debug(sb, "befs_btree_seekleaf() encountered " 570 befs_debug(sb, "%s encountered "
568 "an empty interior node: %Lu. Using Overflow " 571 "an empty interior node: %llu. Using Overflow "
569 "node: %Lu", *node_off, 572 "node: %llu", __func__, *node_off,
570 this_node->head.overflow); 573 this_node->head.overflow);
571 *node_off = this_node->head.overflow; 574 *node_off = this_node->head.overflow;
572 } else { 575 } else {
@@ -574,19 +577,19 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
574 *node_off = fs64_to_cpu(sb, valarray[0]); 577 *node_off = fs64_to_cpu(sb, valarray[0]);
575 } 578 }
576 if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) { 579 if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
577 befs_error(sb, "befs_btree_seekleaf() failed to read " 580 befs_error(sb, "%s failed to read "
578 "node at %Lu", *node_off); 581 "node at %llu", __func__, *node_off);
579 goto error; 582 goto error;
580 } 583 }
581 584
582 befs_debug(sb, "Seekleaf to child node %Lu", *node_off); 585 befs_debug(sb, "Seekleaf to child node %llu", *node_off);
583 } 586 }
584 befs_debug(sb, "Node %Lu is a leaf node", *node_off); 587 befs_debug(sb, "Node %llu is a leaf node", *node_off);
585 588
586 return BEFS_OK; 589 return BEFS_OK;
587 590
588 error: 591 error:
589 befs_debug(sb, "<--- befs_btree_seekleaf() ERROR"); 592 befs_debug(sb, "<--- %s ERROR", __func__);
590 return BEFS_ERR; 593 return BEFS_ERR;
591} 594}
592 595
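
The btree.c hunks replace hard-coded function names in the trace strings with __func__, so the messages can never drift from the function they live in after a rename. The pattern in isolation (the trace macros are illustrative, not befs code):

#include <stdio.h>

#define trace_enter()	printf("---> %s\n", __func__)
#define trace_exit()	printf("<--- %s\n", __func__)

static void btree_find_demo(void)
{
	trace_enter();
	/* ... lookup work would happen here ... */
	trace_exit();
}

int main(void)
{
	btree_find_demo();
	return 0;
}
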
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index 59096b5e0fc7..c467bebd50af 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -52,26 +52,25 @@ befs_read_datastream(struct super_block *sb, befs_data_stream * ds,
52 befs_block_run run; 52 befs_block_run run;
53	befs_blocknr_t block;	/* block corresponding to pos */	53	befs_blocknr_t block;	/* block corresponding to pos */
54 54
55 befs_debug(sb, "---> befs_read_datastream() %Lu", pos); 55 befs_debug(sb, "---> %s %llu", __func__, pos);
56 block = pos >> BEFS_SB(sb)->block_shift; 56 block = pos >> BEFS_SB(sb)->block_shift;
57 if (off) 57 if (off)
58 *off = pos - (block << BEFS_SB(sb)->block_shift); 58 *off = pos - (block << BEFS_SB(sb)->block_shift);
59 59
60 if (befs_fblock2brun(sb, ds, block, &run) != BEFS_OK) { 60 if (befs_fblock2brun(sb, ds, block, &run) != BEFS_OK) {
61 befs_error(sb, "BeFS: Error finding disk addr of block %lu", 61 befs_error(sb, "BeFS: Error finding disk addr of block %lu",
62 block); 62 (unsigned long)block);
63 befs_debug(sb, "<--- befs_read_datastream() ERROR"); 63 befs_debug(sb, "<--- %s ERROR", __func__);
64 return NULL; 64 return NULL;
65 } 65 }
66 bh = befs_bread_iaddr(sb, run); 66 bh = befs_bread_iaddr(sb, run);
67 if (!bh) { 67 if (!bh) {
68 befs_error(sb, "BeFS: Error reading block %lu from datastream", 68 befs_error(sb, "BeFS: Error reading block %lu from datastream",
69 block); 69 (unsigned long)block);
70 return NULL; 70 return NULL;
71 } 71 }
72 72
73 befs_debug(sb, "<--- befs_read_datastream() read data, starting at %Lu", 73 befs_debug(sb, "<--- %s read data, starting at %llu", __func__, pos);
74 pos);
75 74
76 return bh; 75 return bh;
77} 76}
@@ -106,7 +105,8 @@ befs_fblock2brun(struct super_block *sb, befs_data_stream * data,
106 } else { 105 } else {
107 befs_error(sb, 106 befs_error(sb,
108 "befs_fblock2brun() was asked to find block %lu, " 107 "befs_fblock2brun() was asked to find block %lu, "
109 "which is not mapped by the datastream\n", fblock); 108 "which is not mapped by the datastream\n",
109 (unsigned long)fblock);
110 err = BEFS_ERR; 110 err = BEFS_ERR;
111 } 111 }
112 return err; 112 return err;
@@ -128,14 +128,14 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff,
128	befs_off_t bytes_read = 0;	/* bytes read */	128	befs_off_t bytes_read = 0;	/* bytes read */
129 u16 plen; 129 u16 plen;
130 struct buffer_head *bh = NULL; 130 struct buffer_head *bh = NULL;
131 befs_debug(sb, "---> befs_read_lsymlink() length: %Lu", len); 131 befs_debug(sb, "---> %s length: %llu", __func__, len);
132 132
133 while (bytes_read < len) { 133 while (bytes_read < len) {
134 bh = befs_read_datastream(sb, ds, bytes_read, NULL); 134 bh = befs_read_datastream(sb, ds, bytes_read, NULL);
135 if (!bh) { 135 if (!bh) {
136 befs_error(sb, "BeFS: Error reading datastream block " 136 befs_error(sb, "BeFS: Error reading datastream block "
137 "starting from %Lu", bytes_read); 137 "starting from %llu", bytes_read);
138 befs_debug(sb, "<--- befs_read_lsymlink() ERROR"); 138 befs_debug(sb, "<--- %s ERROR", __func__);
139 return bytes_read; 139 return bytes_read;
140 140
141 } 141 }
@@ -146,7 +146,8 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff,
146 bytes_read += plen; 146 bytes_read += plen;
147 } 147 }
148 148
149 befs_debug(sb, "<--- befs_read_lsymlink() read %u bytes", bytes_read); 149 befs_debug(sb, "<--- %s read %u bytes", __func__, (unsigned int)
150 bytes_read);
150 return bytes_read; 151 return bytes_read;
151} 152}
152 153
@@ -169,7 +170,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds)
169 befs_blocknr_t metablocks; /* FS metadata blocks */ 170 befs_blocknr_t metablocks; /* FS metadata blocks */
170 befs_sb_info *befs_sb = BEFS_SB(sb); 171 befs_sb_info *befs_sb = BEFS_SB(sb);
171 172
172 befs_debug(sb, "---> befs_count_blocks()"); 173 befs_debug(sb, "---> %s", __func__);
173 174
174 datablocks = ds->size >> befs_sb->block_shift; 175 datablocks = ds->size >> befs_sb->block_shift;
175 if (ds->size & (befs_sb->block_size - 1)) 176 if (ds->size & (befs_sb->block_size - 1))
@@ -206,7 +207,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds)
206 } 207 }
207 208
208 blocks = datablocks + metablocks; 209 blocks = datablocks + metablocks;
209 befs_debug(sb, "<--- befs_count_blocks() %u blocks", blocks); 210 befs_debug(sb, "<--- %s %u blocks", __func__, (unsigned int)blocks);
210 211
211 return blocks; 212 return blocks;
212} 213}
@@ -251,11 +252,11 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data,
251 befs_blocknr_t max_block = 252 befs_blocknr_t max_block =
252 data->max_direct_range >> BEFS_SB(sb)->block_shift; 253 data->max_direct_range >> BEFS_SB(sb)->block_shift;
253 254
254 befs_debug(sb, "---> befs_find_brun_direct(), find %lu", blockno); 255 befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno);
255 256
256 if (blockno > max_block) { 257 if (blockno > max_block) {
257 befs_error(sb, "befs_find_brun_direct() passed block outside of" 258 befs_error(sb, "%s passed block outside of direct region",
258 "direct region"); 259 __func__);
259 return BEFS_ERR; 260 return BEFS_ERR;
260 } 261 }
261 262
@@ -267,13 +268,14 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data,
267 run->start = array[i].start + offset; 268 run->start = array[i].start + offset;
268 run->len = array[i].len - offset; 269 run->len = array[i].len - offset;
269 270
270 befs_debug(sb, "---> befs_find_brun_direct(), " 271 befs_debug(sb, "---> %s, "
271 "found %lu at direct[%d]", blockno, i); 272 "found %lu at direct[%d]", __func__,
273 (unsigned long)blockno, i);
272 return BEFS_OK; 274 return BEFS_OK;
273 } 275 }
274 } 276 }
275 277
276 befs_debug(sb, "---> befs_find_brun_direct() ERROR"); 278 befs_debug(sb, "---> %s ERROR", __func__);
277 return BEFS_ERR; 279 return BEFS_ERR;
278} 280}
279 281
@@ -316,7 +318,7 @@ befs_find_brun_indirect(struct super_block *sb,
316 befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect); 318 befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect);
317 int arraylen = befs_iaddrs_per_block(sb); 319 int arraylen = befs_iaddrs_per_block(sb);
318 320
319 befs_debug(sb, "---> befs_find_brun_indirect(), find %lu", blockno); 321 befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno);
320 322
321 indir_start_blk = data->max_direct_range >> BEFS_SB(sb)->block_shift; 323 indir_start_blk = data->max_direct_range >> BEFS_SB(sb)->block_shift;
322 search_blk = blockno - indir_start_blk; 324 search_blk = blockno - indir_start_blk;
@@ -325,10 +327,9 @@ befs_find_brun_indirect(struct super_block *sb,
325 for (i = 0; i < indirect.len; i++) { 327 for (i = 0; i < indirect.len; i++) {
326 indirblock = befs_bread(sb, indirblockno + i); 328 indirblock = befs_bread(sb, indirblockno + i);
327 if (indirblock == NULL) { 329 if (indirblock == NULL) {
328 befs_debug(sb, 330 befs_debug(sb, "---> %s failed to read "
329 "---> befs_find_brun_indirect() failed to " 331 "disk block %lu from the indirect brun",
330 "read disk block %lu from the indirect brun", 332 __func__, (unsigned long)indirblockno + i);
331 indirblockno + i);
332 return BEFS_ERR; 333 return BEFS_ERR;
333 } 334 }
334 335
@@ -348,9 +349,10 @@ befs_find_brun_indirect(struct super_block *sb,
348 349
349 brelse(indirblock); 350 brelse(indirblock);
350 befs_debug(sb, 351 befs_debug(sb,
351 "<--- befs_find_brun_indirect() found " 352 "<--- %s found file block "
352 "file block %lu at indirect[%d]", 353 "%lu at indirect[%d]", __func__,
353 blockno, j + (i * arraylen)); 354 (unsigned long)blockno,
355 j + (i * arraylen));
354 return BEFS_OK; 356 return BEFS_OK;
355 } 357 }
356 sum += len; 358 sum += len;
@@ -360,10 +362,10 @@ befs_find_brun_indirect(struct super_block *sb,
360 } 362 }
361 363
362 /* Only fallthrough is an error */ 364 /* Only fallthrough is an error */
363 befs_error(sb, "BeFS: befs_find_brun_indirect() failed to find " 365 befs_error(sb, "BeFS: %s failed to find "
364 "file block %lu", blockno); 366 "file block %lu", __func__, (unsigned long)blockno);
365 367
366 befs_debug(sb, "<--- befs_find_brun_indirect() ERROR"); 368 befs_debug(sb, "<--- %s ERROR", __func__);
367 return BEFS_ERR; 369 return BEFS_ERR;
368} 370}
369 371
@@ -444,7 +446,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
444 size_t diblklen = iblklen * befs_iaddrs_per_block(sb) 446 size_t diblklen = iblklen * befs_iaddrs_per_block(sb)
445 * BEFS_DBLINDIR_BRUN_LEN; 447 * BEFS_DBLINDIR_BRUN_LEN;
446 448
447 befs_debug(sb, "---> befs_find_brun_dblindirect() find %lu", blockno); 449 befs_debug(sb, "---> %s find %lu", __func__, (unsigned long)blockno);
448 450
449 /* First, discover which of the double_indir->indir blocks 451 /* First, discover which of the double_indir->indir blocks
450 * contains pos. Then figure out how much of pos that 452 * contains pos. Then figure out how much of pos that
@@ -460,8 +462,9 @@ befs_find_brun_dblindirect(struct super_block *sb,
460 dbl_which_block = dblindir_indx / befs_iaddrs_per_block(sb); 462 dbl_which_block = dblindir_indx / befs_iaddrs_per_block(sb);
461 if (dbl_which_block > data->double_indirect.len) { 463 if (dbl_which_block > data->double_indirect.len) {
462 befs_error(sb, "The double-indirect index calculated by " 464 befs_error(sb, "The double-indirect index calculated by "
463 "befs_read_brun_dblindirect(), %d, is outside the range " 465 "%s, %d, is outside the range "
464 "of the double-indirect block", dblindir_indx); 466 "of the double-indirect block", __func__,
467 dblindir_indx);
465 return BEFS_ERR; 468 return BEFS_ERR;
466 } 469 }
467 470
@@ -469,10 +472,10 @@ befs_find_brun_dblindirect(struct super_block *sb,
469 befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) + 472 befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) +
470 dbl_which_block); 473 dbl_which_block);
471 if (dbl_indir_block == NULL) { 474 if (dbl_indir_block == NULL) {
472 befs_error(sb, "befs_read_brun_dblindirect() couldn't read the " 475 befs_error(sb, "%s couldn't read the "
473 "double-indirect block at blockno %lu", 476 "double-indirect block at blockno %lu", __func__,
474 iaddr2blockno(sb, 477 (unsigned long)
475 &data->double_indirect) + 478 iaddr2blockno(sb, &data->double_indirect) +
476 dbl_which_block); 479 dbl_which_block);
477 brelse(dbl_indir_block); 480 brelse(dbl_indir_block);
478 return BEFS_ERR; 481 return BEFS_ERR;
@@ -489,16 +492,16 @@ befs_find_brun_dblindirect(struct super_block *sb,
489 which_block = indir_indx / befs_iaddrs_per_block(sb); 492 which_block = indir_indx / befs_iaddrs_per_block(sb);
490 if (which_block > indir_run.len) { 493 if (which_block > indir_run.len) {
491 befs_error(sb, "The indirect index calculated by " 494 befs_error(sb, "The indirect index calculated by "
492 "befs_read_brun_dblindirect(), %d, is outside the range " 495 "%s, %d, is outside the range "
493 "of the indirect block", indir_indx); 496 "of the indirect block", __func__, indir_indx);
494 return BEFS_ERR; 497 return BEFS_ERR;
495 } 498 }
496 499
497 indir_block = 500 indir_block =
498 befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block); 501 befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block);
499 if (indir_block == NULL) { 502 if (indir_block == NULL) {
500 befs_error(sb, "befs_read_brun_dblindirect() couldn't read the " 503 befs_error(sb, "%s couldn't read the indirect block "
501 "indirect block at blockno %lu", 504 "at blockno %lu", __func__, (unsigned long)
502 iaddr2blockno(sb, &indir_run) + which_block); 505 iaddr2blockno(sb, &indir_run) + which_block);
503 brelse(indir_block); 506 brelse(indir_block);
504 return BEFS_ERR; 507 return BEFS_ERR;
@@ -519,7 +522,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
519 run->len -= offset; 522 run->len -= offset;
520 523
521 befs_debug(sb, "Found file block %lu in double_indirect[%d][%d]," 524 befs_debug(sb, "Found file block %lu in double_indirect[%d][%d],"
522 " double_indirect_leftover = %lu", 525 " double_indirect_leftover = %lu", (unsigned long)
523 blockno, dblindir_indx, indir_indx, dblindir_leftover); 526 blockno, dblindir_indx, indir_indx, dblindir_leftover);
524 527
525 return BEFS_OK; 528 return BEFS_OK;
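
datastream.c adds explicit casts because befs_blocknr_t and befs_off_t are config-dependent typedefs: each value is widened to the standard type that matches its format specifier, so the pairing stays valid even if the typedef changes width. A self-contained illustration (the typedef widths chosen here are assumptions):

#include <stdio.h>

typedef unsigned long long befs_off_t;	/* assumed 64-bit */
typedef unsigned long befs_blocknr_t;	/* assumed long-sized */

int main(void)
{
	befs_off_t pos = 123456789ULL;
	befs_blocknr_t block = 42;

	/* %llu pairs with unsigned long long, %lu with unsigned long;
	 * the casts keep that true regardless of the typedefs. */
	printf("pos %llu block %lu\n",
	       (unsigned long long)pos, (unsigned long)block);
	return 0;
}
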
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index 622e73775c83..4de7cffcd662 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -10,6 +10,7 @@
10 * debug functions 10 * debug functions
11 */ 11 */
12 12
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13#ifdef __KERNEL__ 14#ifdef __KERNEL__
14 15
15#include <stdarg.h> 16#include <stdarg.h>
@@ -23,43 +24,30 @@
23 24
24#include "befs.h" 25#include "befs.h"
25 26
26#define ERRBUFSIZE 1024
27
28void 27void
29befs_error(const struct super_block *sb, const char *fmt, ...) 28befs_error(const struct super_block *sb, const char *fmt, ...)
30{ 29{
30 struct va_format vaf;
31 va_list args; 31 va_list args;
32 char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL);
33 if (err_buf == NULL) {
34 printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE);
35 return;
36 }
37 32
38 va_start(args, fmt); 33 va_start(args, fmt);
39 vsnprintf(err_buf, ERRBUFSIZE, fmt, args); 34 vaf.fmt = fmt;
35 vaf.va = &args;
36 pr_err("(%s): %pV\n", sb->s_id, &vaf);
40 va_end(args); 37 va_end(args);
41
42 printk(KERN_ERR "BeFS(%s): %s\n", sb->s_id, err_buf);
43 kfree(err_buf);
44} 38}
45 39
46void 40void
47befs_warning(const struct super_block *sb, const char *fmt, ...) 41befs_warning(const struct super_block *sb, const char *fmt, ...)
48{ 42{
43 struct va_format vaf;
49 va_list args; 44 va_list args;
50 char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL);
51 if (err_buf == NULL) {
52 printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE);
53 return;
54 }
55 45
56 va_start(args, fmt); 46 va_start(args, fmt);
57 vsnprintf(err_buf, ERRBUFSIZE, fmt, args); 47 vaf.fmt = fmt;
48 vaf.va = &args;
49 pr_warn("(%s): %pV\n", sb->s_id, &vaf);
58 va_end(args); 50 va_end(args);
59
60 printk(KERN_WARNING "BeFS(%s): %s\n", sb->s_id, err_buf);
61
62 kfree(err_buf);
63} 51}
64 52
65void 53void
@@ -67,25 +55,13 @@ befs_debug(const struct super_block *sb, const char *fmt, ...)
67{ 55{
68#ifdef CONFIG_BEFS_DEBUG 56#ifdef CONFIG_BEFS_DEBUG
69 57
58 struct va_format vaf;
70 va_list args; 59 va_list args;
71 char *err_buf = NULL; 60 va_start(args, fmt);
72 61 vaf.fmt = fmt;
73 if (BEFS_SB(sb)->mount_opts.debug) { 62 vaf.va = &args;
74 err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL); 63 pr_debug("(%s): %pV\n", sb->s_id, &vaf);
75 if (err_buf == NULL) { 64 va_end(args);
76 printk(KERN_ERR "could not allocate %d bytes\n",
77 ERRBUFSIZE);
78 return;
79 }
80
81 va_start(args, fmt);
82 vsnprintf(err_buf, ERRBUFSIZE, fmt, args);
83 va_end(args);
84
85 printk(KERN_DEBUG "BeFS(%s): %s\n", sb->s_id, err_buf);
86
87 kfree(err_buf);
88 }
89 65
90#endif //CONFIG_BEFS_DEBUG 66#endif //CONFIG_BEFS_DEBUG
91} 67}
@@ -109,9 +85,9 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
109 befs_debug(sb, " gid %u", fs32_to_cpu(sb, inode->gid)); 85 befs_debug(sb, " gid %u", fs32_to_cpu(sb, inode->gid));
110 befs_debug(sb, " mode %08x", fs32_to_cpu(sb, inode->mode)); 86 befs_debug(sb, " mode %08x", fs32_to_cpu(sb, inode->mode));
111 befs_debug(sb, " flags %08x", fs32_to_cpu(sb, inode->flags)); 87 befs_debug(sb, " flags %08x", fs32_to_cpu(sb, inode->flags));
112 befs_debug(sb, " create_time %Lu", 88 befs_debug(sb, " create_time %llu",
113 fs64_to_cpu(sb, inode->create_time)); 89 fs64_to_cpu(sb, inode->create_time));
114 befs_debug(sb, " last_modified_time %Lu", 90 befs_debug(sb, " last_modified_time %llu",
115 fs64_to_cpu(sb, inode->last_modified_time)); 91 fs64_to_cpu(sb, inode->last_modified_time));
116 92
117 tmp_run = fsrun_to_cpu(sb, inode->parent); 93 tmp_run = fsrun_to_cpu(sb, inode->parent);
@@ -137,7 +113,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
137 tmp_run.allocation_group, tmp_run.start, 113 tmp_run.allocation_group, tmp_run.start,
138 tmp_run.len); 114 tmp_run.len);
139 } 115 }
140 befs_debug(sb, " max_direct_range %Lu", 116 befs_debug(sb, " max_direct_range %llu",
141 fs64_to_cpu(sb, 117 fs64_to_cpu(sb,
142 inode->data.datastream. 118 inode->data.datastream.
143 max_direct_range)); 119 max_direct_range));
@@ -147,7 +123,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
147 tmp_run.allocation_group, 123 tmp_run.allocation_group,
148 tmp_run.start, tmp_run.len); 124 tmp_run.start, tmp_run.len);
149 125
150 befs_debug(sb, " max_indirect_range %Lu", 126 befs_debug(sb, " max_indirect_range %llu",
151 fs64_to_cpu(sb, 127 fs64_to_cpu(sb,
152 inode->data.datastream. 128 inode->data.datastream.
153 max_indirect_range)); 129 max_indirect_range));
@@ -158,12 +134,12 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
158 tmp_run.allocation_group, tmp_run.start, 134 tmp_run.allocation_group, tmp_run.start,
159 tmp_run.len); 135 tmp_run.len);
160 136
161 befs_debug(sb, " max_double_indirect_range %Lu", 137 befs_debug(sb, " max_double_indirect_range %llu",
162 fs64_to_cpu(sb, 138 fs64_to_cpu(sb,
163 inode->data.datastream. 139 inode->data.datastream.
164 max_double_indirect_range)); 140 max_double_indirect_range));
165 141
166 befs_debug(sb, " size %Lu", 142 befs_debug(sb, " size %llu",
167 fs64_to_cpu(sb, inode->data.datastream.size)); 143 fs64_to_cpu(sb, inode->data.datastream.size));
168 } 144 }
169 145
@@ -191,8 +167,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
191 befs_debug(sb, " block_size %u", fs32_to_cpu(sb, sup->block_size)); 167 befs_debug(sb, " block_size %u", fs32_to_cpu(sb, sup->block_size));
192 befs_debug(sb, " block_shift %u", fs32_to_cpu(sb, sup->block_shift)); 168 befs_debug(sb, " block_shift %u", fs32_to_cpu(sb, sup->block_shift));
193 169
194 befs_debug(sb, " num_blocks %Lu", fs64_to_cpu(sb, sup->num_blocks)); 170 befs_debug(sb, " num_blocks %llu", fs64_to_cpu(sb, sup->num_blocks));
195 befs_debug(sb, " used_blocks %Lu", fs64_to_cpu(sb, sup->used_blocks)); 171 befs_debug(sb, " used_blocks %llu", fs64_to_cpu(sb, sup->used_blocks));
196 172
197 befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2)); 173 befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2));
198 befs_debug(sb, " blocks_per_ag %u", 174 befs_debug(sb, " blocks_per_ag %u",
@@ -206,8 +182,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
206 befs_debug(sb, " log_blocks %u, %hu, %hu", 182 befs_debug(sb, " log_blocks %u, %hu, %hu",
207 tmp_run.allocation_group, tmp_run.start, tmp_run.len); 183 tmp_run.allocation_group, tmp_run.start, tmp_run.len);
208 184
209 befs_debug(sb, " log_start %Ld", fs64_to_cpu(sb, sup->log_start)); 185 befs_debug(sb, " log_start %lld", fs64_to_cpu(sb, sup->log_start));
210 befs_debug(sb, " log_end %Ld", fs64_to_cpu(sb, sup->log_end)); 186 befs_debug(sb, " log_end %lld", fs64_to_cpu(sb, sup->log_end));
211 187
212 befs_debug(sb, " magic3 %08x", fs32_to_cpu(sb, sup->magic3)); 188 befs_debug(sb, " magic3 %08x", fs32_to_cpu(sb, sup->magic3));
213 189
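
debug.c drops the ERRBUFSIZE kmalloc()/vsnprintf()/kfree() round trip in favour of struct va_format and printk's %pV, which formats the caller's va_list in place and cannot fail on allocation. %pV is a printk-only extension, so a userspace analogue forwards the va_list to vfprintf() instead (the helper name is hypothetical):

#include <stdarg.h>
#include <stdio.h>

static void fs_error(const char *sb_id, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	/* prefix, like pr_err("(%s): %pV\n", sb->s_id, &vaf) */
	fprintf(stderr, "befs (%s): ", sb_id);
	/* forward the va_list directly: no ERRBUFSIZE buffer needed */
	vfprintf(stderr, fmt, args);
	fputc('\n', stderr);
	va_end(args);
}

int main(void)
{
	fs_error("sda1", "failed to read block %lu", 42UL);
	return 0;
}
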
diff --git a/fs/befs/inode.c b/fs/befs/inode.c
index 94c17f9a9576..fa4b718de597 100644
--- a/fs/befs/inode.c
+++ b/fs/befs/inode.c
@@ -25,7 +25,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
25 /* check magic header. */ 25 /* check magic header. */
26 if (magic1 != BEFS_INODE_MAGIC1) { 26 if (magic1 != BEFS_INODE_MAGIC1) {
27 befs_error(sb, 27 befs_error(sb,
28 "Inode has a bad magic header - inode = %lu", inode); 28 "Inode has a bad magic header - inode = %lu",
29 (unsigned long)inode);
29 return BEFS_BAD_INODE; 30 return BEFS_BAD_INODE;
30 } 31 }
31 32
@@ -34,8 +35,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
34 */ 35 */
35 if (inode != iaddr2blockno(sb, &ino_num)) { 36 if (inode != iaddr2blockno(sb, &ino_num)) {
36 befs_error(sb, "inode blocknr field disagrees with vfs " 37 befs_error(sb, "inode blocknr field disagrees with vfs "
37 "VFS: %lu, Inode %lu", 38 "VFS: %lu, Inode %lu", (unsigned long)
38 inode, iaddr2blockno(sb, &ino_num)); 39 inode, (unsigned long)iaddr2blockno(sb, &ino_num));
39 return BEFS_BAD_INODE; 40 return BEFS_BAD_INODE;
40 } 41 }
41 42
@@ -44,7 +45,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
44 */ 45 */
45 46
46 if (!(flags & BEFS_INODE_IN_USE)) { 47 if (!(flags & BEFS_INODE_IN_USE)) {
47 befs_error(sb, "inode is not used - inode = %lu", inode); 48 befs_error(sb, "inode is not used - inode = %lu",
49 (unsigned long)inode);
48 return BEFS_BAD_INODE; 50 return BEFS_BAD_INODE;
49 } 51 }
50 52
diff --git a/fs/befs/io.c b/fs/befs/io.c
index ddef98aa255d..0408a3d601d0 100644
--- a/fs/befs/io.c
+++ b/fs/befs/io.c
@@ -30,9 +30,9 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
30 befs_blocknr_t block = 0; 30 befs_blocknr_t block = 0;
31 befs_sb_info *befs_sb = BEFS_SB(sb); 31 befs_sb_info *befs_sb = BEFS_SB(sb);
32 32
33 befs_debug(sb, "---> Enter befs_read_iaddr() " 33 befs_debug(sb, "---> Enter %s "
34 "[%u, %hu, %hu]", 34 "[%u, %hu, %hu]", __func__, iaddr.allocation_group,
35 iaddr.allocation_group, iaddr.start, iaddr.len); 35 iaddr.start, iaddr.len);
36 36
37 if (iaddr.allocation_group > befs_sb->num_ags) { 37 if (iaddr.allocation_group > befs_sb->num_ags) {
38 befs_error(sb, "BEFS: Invalid allocation group %u, max is %u", 38 befs_error(sb, "BEFS: Invalid allocation group %u, max is %u",
@@ -42,20 +42,21 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
42 42
43 block = iaddr2blockno(sb, &iaddr); 43 block = iaddr2blockno(sb, &iaddr);
44 44
45 befs_debug(sb, "befs_read_iaddr: offset = %lu", block); 45 befs_debug(sb, "%s: offset = %lu", __func__, (unsigned long)block);
46 46
47 bh = sb_bread(sb, block); 47 bh = sb_bread(sb, block);
48 48
49 if (bh == NULL) { 49 if (bh == NULL) {
50 befs_error(sb, "Failed to read block %lu", block); 50 befs_error(sb, "Failed to read block %lu",
51 (unsigned long)block);
51 goto error; 52 goto error;
52 } 53 }
53 54
54 befs_debug(sb, "<--- befs_read_iaddr()"); 55 befs_debug(sb, "<--- %s", __func__);
55 return bh; 56 return bh;
56 57
57 error: 58 error:
58 befs_debug(sb, "<--- befs_read_iaddr() ERROR"); 59 befs_debug(sb, "<--- %s ERROR", __func__);
59 return NULL; 60 return NULL;
60} 61}
61 62
@@ -64,20 +65,21 @@ befs_bread(struct super_block *sb, befs_blocknr_t block)
64{ 65{
65 struct buffer_head *bh = NULL; 66 struct buffer_head *bh = NULL;
66 67
67 befs_debug(sb, "---> Enter befs_read() %Lu", block); 68 befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block);
68 69
69 bh = sb_bread(sb, block); 70 bh = sb_bread(sb, block);
70 71
71 if (bh == NULL) { 72 if (bh == NULL) {
72 befs_error(sb, "Failed to read block %lu", block); 73 befs_error(sb, "Failed to read block %lu",
74 (unsigned long)block);
73 goto error; 75 goto error;
74 } 76 }
75 77
76 befs_debug(sb, "<--- befs_read()"); 78 befs_debug(sb, "<--- %s", __func__);
77 79
78 return bh; 80 return bh;
79 81
80 error: 82 error:
81 befs_debug(sb, "<--- befs_read() ERROR"); 83 befs_debug(sb, "<--- %s ERROR", __func__);
82 return NULL; 84 return NULL;
83} 85}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 845d2d690ce2..d626756ff721 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -5,6 +5,8 @@
5 * 5 *
6 */ 6 */
7 7
8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
8#include <linux/module.h> 10#include <linux/module.h>
9#include <linux/slab.h> 11#include <linux/slab.h>
10#include <linux/fs.h> 12#include <linux/fs.h>
@@ -39,7 +41,6 @@ static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int)
39static struct inode *befs_iget(struct super_block *, unsigned long); 41static struct inode *befs_iget(struct super_block *, unsigned long);
40static struct inode *befs_alloc_inode(struct super_block *sb); 42static struct inode *befs_alloc_inode(struct super_block *sb);
41static void befs_destroy_inode(struct inode *inode); 43static void befs_destroy_inode(struct inode *inode);
42static int befs_init_inodecache(void);
43static void befs_destroy_inodecache(void); 44static void befs_destroy_inodecache(void);
44static void *befs_follow_link(struct dentry *, struct nameidata *); 45static void *befs_follow_link(struct dentry *, struct nameidata *);
45static void *befs_fast_follow_link(struct dentry *, struct nameidata *); 46static void *befs_fast_follow_link(struct dentry *, struct nameidata *);
@@ -131,26 +132,28 @@ befs_get_block(struct inode *inode, sector_t block,
131 ulong disk_off; 132 ulong disk_off;
132 133
133 befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld", 134 befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld",
134 inode->i_ino, block); 135 (unsigned long)inode->i_ino, (long)block);
135 136
136 if (block < 0) { 137 if (block < 0) {
137 befs_error(sb, "befs_get_block() was asked for a block " 138 befs_error(sb, "befs_get_block() was asked for a block "
138 "number less than zero: block %ld in inode %lu", 139 "number less than zero: block %ld in inode %lu",
139 block, inode->i_ino); 140 (long)block, (unsigned long)inode->i_ino);
140 return -EIO; 141 return -EIO;
141 } 142 }
142 143
143 if (create) { 144 if (create) {
144 befs_error(sb, "befs_get_block() was asked to write to " 145 befs_error(sb, "befs_get_block() was asked to write to "
145 "block %ld in inode %lu", block, inode->i_ino); 146 "block %ld in inode %lu", (long)block,
147 (unsigned long)inode->i_ino);
146 return -EPERM; 148 return -EPERM;
147 } 149 }
148 150
149 res = befs_fblock2brun(sb, ds, block, &run); 151 res = befs_fblock2brun(sb, ds, block, &run);
150 if (res != BEFS_OK) { 152 if (res != BEFS_OK) {
151 befs_error(sb, 153 befs_error(sb,
152 "<--- befs_get_block() for inode %lu, block " 154 "<--- %s for inode %lu, block %ld ERROR",
153 "%ld ERROR", inode->i_ino, block); 155 __func__, (unsigned long)inode->i_ino,
156 (long)block);
154 return -EFBIG; 157 return -EFBIG;
155 } 158 }
156 159
@@ -158,8 +161,9 @@ befs_get_block(struct inode *inode, sector_t block,
158 161
159 map_bh(bh_result, inode->i_sb, disk_off); 162 map_bh(bh_result, inode->i_sb, disk_off);
160 163
161 befs_debug(sb, "<--- befs_get_block() for inode %lu, block %ld, " 164 befs_debug(sb, "<--- %s for inode %lu, block %ld, disk address %lu",
162 "disk address %lu", inode->i_ino, block, disk_off); 165 __func__, (unsigned long)inode->i_ino, (long)block,
166 (unsigned long)disk_off);
163 167
164 return 0; 168 return 0;
165} 169}
@@ -176,15 +180,15 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
176 char *utfname; 180 char *utfname;
177 const char *name = dentry->d_name.name; 181 const char *name = dentry->d_name.name;
178 182
179 befs_debug(sb, "---> befs_lookup() " 183 befs_debug(sb, "---> %s name %s inode %ld", __func__,
180 "name %s inode %ld", dentry->d_name.name, dir->i_ino); 184 dentry->d_name.name, dir->i_ino);
181 185
182 /* Convert to UTF-8 */ 186 /* Convert to UTF-8 */
183 if (BEFS_SB(sb)->nls) { 187 if (BEFS_SB(sb)->nls) {
184 ret = 188 ret =
185 befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen); 189 befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen);
186 if (ret < 0) { 190 if (ret < 0) {
187 befs_debug(sb, "<--- befs_lookup() ERROR"); 191 befs_debug(sb, "<--- %s ERROR", __func__);
188 return ERR_PTR(ret); 192 return ERR_PTR(ret);
189 } 193 }
190 ret = befs_btree_find(sb, ds, utfname, &offset); 194 ret = befs_btree_find(sb, ds, utfname, &offset);
@@ -195,12 +199,12 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
195 } 199 }
196 200
197 if (ret == BEFS_BT_NOT_FOUND) { 201 if (ret == BEFS_BT_NOT_FOUND) {
198 befs_debug(sb, "<--- befs_lookup() %s not found", 202 befs_debug(sb, "<--- %s %s not found", __func__,
199 dentry->d_name.name); 203 dentry->d_name.name);
200 return ERR_PTR(-ENOENT); 204 return ERR_PTR(-ENOENT);
201 205
202 } else if (ret != BEFS_OK || offset == 0) { 206 } else if (ret != BEFS_OK || offset == 0) {
203 befs_warning(sb, "<--- befs_lookup() Error"); 207 befs_warning(sb, "<--- %s Error", __func__);
204 return ERR_PTR(-ENODATA); 208 return ERR_PTR(-ENODATA);
205 } 209 }
206 210
@@ -210,7 +214,7 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
210 214
211 d_add(dentry, inode); 215 d_add(dentry, inode);
212 216
213 befs_debug(sb, "<--- befs_lookup()"); 217 befs_debug(sb, "<--- %s", __func__);
214 218
215 return NULL; 219 return NULL;
216} 220}
@@ -228,26 +232,25 @@ befs_readdir(struct file *file, struct dir_context *ctx)
228 char keybuf[BEFS_NAME_LEN + 1]; 232 char keybuf[BEFS_NAME_LEN + 1];
229 const char *dirname = file->f_path.dentry->d_name.name; 233 const char *dirname = file->f_path.dentry->d_name.name;
230 234
231 befs_debug(sb, "---> befs_readdir() " 235 befs_debug(sb, "---> %s name %s, inode %ld, ctx->pos %lld",
232 "name %s, inode %ld, ctx->pos %Ld", 236 __func__, dirname, inode->i_ino, ctx->pos);
233 dirname, inode->i_ino, ctx->pos);
234 237
235more: 238more:
236 result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, 239 result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1,
237 keybuf, &keysize, &value); 240 keybuf, &keysize, &value);
238 241
239 if (result == BEFS_ERR) { 242 if (result == BEFS_ERR) {
240 befs_debug(sb, "<--- befs_readdir() ERROR"); 243 befs_debug(sb, "<--- %s ERROR", __func__);
241 befs_error(sb, "IO error reading %s (inode %lu)", 244 befs_error(sb, "IO error reading %s (inode %lu)",
242 dirname, inode->i_ino); 245 dirname, inode->i_ino);
243 return -EIO; 246 return -EIO;
244 247
245 } else if (result == BEFS_BT_END) { 248 } else if (result == BEFS_BT_END) {
246 befs_debug(sb, "<--- befs_readdir() END"); 249 befs_debug(sb, "<--- %s END", __func__);
247 return 0; 250 return 0;
248 251
249 } else if (result == BEFS_BT_EMPTY) { 252 } else if (result == BEFS_BT_EMPTY) {
250 befs_debug(sb, "<--- befs_readdir() Empty directory"); 253 befs_debug(sb, "<--- %s Empty directory", __func__);
251 return 0; 254 return 0;
252 } 255 }
253 256
@@ -260,7 +263,7 @@ more:
260 result = 263 result =
261 befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen); 264 befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen);
262 if (result < 0) { 265 if (result < 0) {
263 befs_debug(sb, "<--- befs_readdir() ERROR"); 266 befs_debug(sb, "<--- %s ERROR", __func__);
264 return result; 267 return result;
265 } 268 }
266 if (!dir_emit(ctx, nlsname, nlsnamelen, 269 if (!dir_emit(ctx, nlsname, nlsnamelen,
@@ -277,7 +280,7 @@ more:
277 ctx->pos++; 280 ctx->pos++;
278 goto more; 281 goto more;
279 282
280 befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos); 283 befs_debug(sb, "<--- %s pos %lld", __func__, ctx->pos);
281 284
282 return 0; 285 return 0;
283} 286}
@@ -321,7 +324,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
321 struct inode *inode; 324 struct inode *inode;
322 long ret = -EIO; 325 long ret = -EIO;
323 326
324 befs_debug(sb, "---> befs_read_inode() " "inode = %lu", ino); 327 befs_debug(sb, "---> %s inode = %lu", __func__, ino);
325 328
326 inode = iget_locked(sb, ino); 329 inode = iget_locked(sb, ino);
327 if (!inode) 330 if (!inode)
@@ -428,7 +431,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
428 } 431 }
429 432
430 brelse(bh); 433 brelse(bh);
431 befs_debug(sb, "<--- befs_read_inode()"); 434 befs_debug(sb, "<--- %s", __func__);
432 unlock_new_inode(inode); 435 unlock_new_inode(inode);
433 return inode; 436 return inode;
434 437
@@ -437,7 +440,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
437 440
438 unacquire_none: 441 unacquire_none:
439 iget_failed(inode); 442 iget_failed(inode);
440 befs_debug(sb, "<--- befs_read_inode() - Bad inode"); 443 befs_debug(sb, "<--- %s - Bad inode", __func__);
441 return ERR_PTR(ret); 444 return ERR_PTR(ret);
442} 445}
443 446
@@ -445,7 +448,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
445 * 448 *
446 * Taken from NFS implementation by Al Viro. 449 * Taken from NFS implementation by Al Viro.
447 */ 450 */
448static int 451static int __init
449befs_init_inodecache(void) 452befs_init_inodecache(void)
450{ 453{
451 befs_inode_cachep = kmem_cache_create("befs_inode_cache", 454 befs_inode_cachep = kmem_cache_create("befs_inode_cache",
@@ -454,11 +457,9 @@ befs_init_inodecache(void)
454 SLAB_MEM_SPREAD), 457 SLAB_MEM_SPREAD),
455 init_once); 458 init_once);
456 if (befs_inode_cachep == NULL) { 459 if (befs_inode_cachep == NULL) {
457 printk(KERN_ERR "befs_init_inodecache: " 460 pr_err("%s: Couldn't initialize inode slabcache\n", __func__);
458 "Couldn't initialize inode slabcache\n");
459 return -ENOMEM; 461 return -ENOMEM;
460 } 462 }
461
462 return 0; 463 return 0;
463} 464}
464 465
@@ -544,16 +545,16 @@ befs_utf2nls(struct super_block *sb, const char *in,
544 */ 545 */
545 int maxlen = in_len + 1; 546 int maxlen = in_len + 1;
546 547
547 befs_debug(sb, "---> utf2nls()"); 548 befs_debug(sb, "---> %s", __func__);
548 549
549 if (!nls) { 550 if (!nls) {
550 befs_error(sb, "befs_utf2nls called with no NLS table loaded"); 551 befs_error(sb, "%s called with no NLS table loaded", __func__);
551 return -EINVAL; 552 return -EINVAL;
552 } 553 }
553 554
554 *out = result = kmalloc(maxlen, GFP_NOFS); 555 *out = result = kmalloc(maxlen, GFP_NOFS);
555 if (!*out) { 556 if (!*out) {
556 befs_error(sb, "befs_utf2nls() cannot allocate memory"); 557 befs_error(sb, "%s cannot allocate memory", __func__);
557 *out_len = 0; 558 *out_len = 0;
558 return -ENOMEM; 559 return -ENOMEM;
559 } 560 }
@@ -575,14 +576,14 @@ befs_utf2nls(struct super_block *sb, const char *in,
575 result[o] = '\0'; 576 result[o] = '\0';
576 *out_len = o; 577 *out_len = o;
577 578
578 befs_debug(sb, "<--- utf2nls()"); 579 befs_debug(sb, "<--- %s", __func__);
579 580
580 return o; 581 return o;
581 582
582 conv_err: 583 conv_err:
583 befs_error(sb, "Name using character set %s contains a character that " 584 befs_error(sb, "Name using character set %s contains a character that "
584 "cannot be converted to unicode.", nls->charset); 585 "cannot be converted to unicode.", nls->charset);
585 befs_debug(sb, "<--- utf2nls()"); 586 befs_debug(sb, "<--- %s", __func__);
586 kfree(result); 587 kfree(result);
587 return -EILSEQ; 588 return -EILSEQ;
588} 589}
@@ -623,16 +624,17 @@ befs_nls2utf(struct super_block *sb, const char *in,
623 * in special cases */ 624 * in special cases */
624 int maxlen = (3 * in_len) + 1; 625 int maxlen = (3 * in_len) + 1;
625 626
626 befs_debug(sb, "---> nls2utf()\n"); 627 befs_debug(sb, "---> %s\n", __func__);
627 628
628 if (!nls) { 629 if (!nls) {
629 befs_error(sb, "befs_nls2utf called with no NLS table loaded."); 630 befs_error(sb, "%s called with no NLS table loaded.",
631 __func__);
630 return -EINVAL; 632 return -EINVAL;
631 } 633 }
632 634
633 *out = result = kmalloc(maxlen, GFP_NOFS); 635 *out = result = kmalloc(maxlen, GFP_NOFS);
634 if (!*out) { 636 if (!*out) {
635 befs_error(sb, "befs_nls2utf() cannot allocate memory"); 637 befs_error(sb, "%s cannot allocate memory", __func__);
636 *out_len = 0; 638 *out_len = 0;
637 return -ENOMEM; 639 return -ENOMEM;
638 } 640 }
@@ -653,14 +655,14 @@ befs_nls2utf(struct super_block *sb, const char *in,
653 result[o] = '\0'; 655 result[o] = '\0';
654 *out_len = o; 656 *out_len = o;
655 657
656 befs_debug(sb, "<--- nls2utf()"); 658 befs_debug(sb, "<--- %s", __func__);
657 659
658 return i; 660 return i;
659 661
660 conv_err: 662 conv_err:
661	befs_error(sb, "Name using character set %s contains a character that "	663	befs_error(sb, "Name using character set %s contains a character that "
662 "cannot be converted to unicode.", nls->charset); 664 "cannot be converted to unicode.", nls->charset);
663 befs_debug(sb, "<--- nls2utf()"); 665 befs_debug(sb, "<--- %s", __func__);
664 kfree(result); 666 kfree(result);
665 return -EILSEQ; 667 return -EILSEQ;
666} 668}
@@ -715,8 +717,8 @@ parse_options(char *options, befs_mount_options * opts)
715 if (option >= 0) 717 if (option >= 0)
716 uid = make_kuid(current_user_ns(), option); 718 uid = make_kuid(current_user_ns(), option);
717 if (!uid_valid(uid)) { 719 if (!uid_valid(uid)) {
718 printk(KERN_ERR "BeFS: Invalid uid %d, " 720 pr_err("Invalid uid %d, "
719 "using default\n", option); 721 "using default\n", option);
720 break; 722 break;
721 } 723 }
722 opts->uid = uid; 724 opts->uid = uid;
@@ -729,8 +731,8 @@ parse_options(char *options, befs_mount_options * opts)
729 if (option >= 0) 731 if (option >= 0)
730 gid = make_kgid(current_user_ns(), option); 732 gid = make_kgid(current_user_ns(), option);
731 if (!gid_valid(gid)) { 733 if (!gid_valid(gid)) {
732 printk(KERN_ERR "BeFS: Invalid gid %d, " 734 pr_err("Invalid gid %d, "
733 "using default\n", option); 735 "using default\n", option);
734 break; 736 break;
735 } 737 }
736 opts->gid = gid; 738 opts->gid = gid;
@@ -740,8 +742,8 @@ parse_options(char *options, befs_mount_options * opts)
740 kfree(opts->iocharset); 742 kfree(opts->iocharset);
741 opts->iocharset = match_strdup(&args[0]); 743 opts->iocharset = match_strdup(&args[0]);
742 if (!opts->iocharset) { 744 if (!opts->iocharset) {
743 printk(KERN_ERR "BeFS: allocation failure for " 745 pr_err("allocation failure for "
744 "iocharset string\n"); 746 "iocharset string\n");
745 return 0; 747 return 0;
746 } 748 }
747 break; 749 break;
@@ -749,8 +751,8 @@ parse_options(char *options, befs_mount_options * opts)
749 opts->debug = 1; 751 opts->debug = 1;
750 break; 752 break;
751 default: 753 default:
752 printk(KERN_ERR "BeFS: Unrecognized mount option \"%s\" " 754 pr_err("Unrecognized mount option \"%s\" "
753 "or missing value\n", p); 755 "or missing value\n", p);
754 return 0; 756 return 0;
755 } 757 }
756 } 758 }
@@ -791,22 +793,20 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
791 793
792 save_mount_options(sb, data); 794 save_mount_options(sb, data);
793 795
794 sb->s_fs_info = kmalloc(sizeof (*befs_sb), GFP_KERNEL); 796 sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL);
795 if (sb->s_fs_info == NULL) { 797 if (sb->s_fs_info == NULL) {
796 printk(KERN_ERR 798 pr_err("(%s): Unable to allocate memory for private "
797 "BeFS(%s): Unable to allocate memory for private "
798 "portion of superblock. Bailing.\n", sb->s_id); 799 "portion of superblock. Bailing.\n", sb->s_id);
799 goto unacquire_none; 800 goto unacquire_none;
800 } 801 }
801 befs_sb = BEFS_SB(sb); 802 befs_sb = BEFS_SB(sb);
802 memset(befs_sb, 0, sizeof(befs_sb_info));
803 803
804 if (!parse_options((char *) data, &befs_sb->mount_opts)) { 804 if (!parse_options((char *) data, &befs_sb->mount_opts)) {
805 befs_error(sb, "cannot parse mount options"); 805 befs_error(sb, "cannot parse mount options");
806 goto unacquire_priv_sbp; 806 goto unacquire_priv_sbp;
807 } 807 }
808 808
809 befs_debug(sb, "---> befs_fill_super()"); 809 befs_debug(sb, "---> %s", __func__);
810 810
811#ifndef CONFIG_BEFS_RW 811#ifndef CONFIG_BEFS_RW
812 if (!(sb->s_flags & MS_RDONLY)) { 812 if (!(sb->s_flags & MS_RDONLY)) {
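
Two changes in the befs_fill_super() hunk travel together: the kmalloc() becomes a kzalloc(), and the now-redundant memset() two lines below is deleted. Besides saving a call, this removes a latent mismatch, since the memset sized the buffer as sizeof(befs_sb_info) while the allocation used sizeof(*befs_sb), two expressions that only stay equal by discipline. The equivalence, as a standalone sketch:

    #include <linux/slab.h>
    #include <linux/string.h>

    static void *alloc_private_old(size_t size)
    {
            void *p = kmalloc(size, GFP_KERNEL);    /* old: allocate ... */

            if (p)
                    memset(p, 0, size);             /* ... then zero by hand */
            return p;
    }

    static void *alloc_private_new(size_t size)
    {
            return kzalloc(size, GFP_KERNEL);       /* new: one zeroed allocation */
    }
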
@@ -854,7 +854,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
854 goto unacquire_priv_sbp; 854 goto unacquire_priv_sbp;
855 855
856 if( befs_sb->num_blocks > ~((sector_t)0) ) { 856 if( befs_sb->num_blocks > ~((sector_t)0) ) {
857 befs_error(sb, "blocks count: %Lu " 857 befs_error(sb, "blocks count: %llu "
858 "is larger than the host can use", 858 "is larger than the host can use",
859 befs_sb->num_blocks); 859 befs_sb->num_blocks);
860 goto unacquire_priv_sbp; 860 goto unacquire_priv_sbp;
@@ -913,6 +913,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
913static int 913static int
914befs_remount(struct super_block *sb, int *flags, char *data) 914befs_remount(struct super_block *sb, int *flags, char *data)
915{ 915{
916 sync_filesystem(sb);
916 if (!(*flags & MS_RDONLY)) 917 if (!(*flags & MS_RDONLY))
917 return -EINVAL; 918 return -EINVAL;
918 return 0; 919 return 0;
@@ -924,7 +925,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
924 struct super_block *sb = dentry->d_sb; 925 struct super_block *sb = dentry->d_sb;
925 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 926 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
926 927
927 befs_debug(sb, "---> befs_statfs()"); 928 befs_debug(sb, "---> %s", __func__);
928 929
929 buf->f_type = BEFS_SUPER_MAGIC; 930 buf->f_type = BEFS_SUPER_MAGIC;
930 buf->f_bsize = sb->s_blocksize; 931 buf->f_bsize = sb->s_blocksize;
@@ -937,7 +938,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
937 buf->f_fsid.val[1] = (u32)(id >> 32); 938 buf->f_fsid.val[1] = (u32)(id >> 32);
938 buf->f_namelen = BEFS_NAME_LEN; 939 buf->f_namelen = BEFS_NAME_LEN;
939 940
940 befs_debug(sb, "<--- befs_statfs()"); 941 befs_debug(sb, "<--- %s", __func__);
941 942
942 return 0; 943 return 0;
943} 944}
@@ -963,7 +964,7 @@ init_befs_fs(void)
963{ 964{
964 int err; 965 int err;
965 966
966 printk(KERN_INFO "BeFS version: %s\n", BEFS_VERSION); 967 pr_info("version: %s\n", BEFS_VERSION);
967 968
968 err = befs_init_inodecache(); 969 err = befs_init_inodecache();
969 if (err) 970 if (err)
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 8defc6b3f9a2..7041ac35ace8 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -172,7 +172,7 @@ static void bfs_evict_inode(struct inode *inode)
172 172
173 dprintf("ino=%08lx\n", ino); 173 dprintf("ino=%08lx\n", ino);
174 174
175 truncate_inode_pages(&inode->i_data, 0); 175 truncate_inode_pages_final(&inode->i_data);
176 invalidate_inode_buffers(inode); 176 invalidate_inode_buffers(inode);
177 clear_inode(inode); 177 clear_inode(inode);
178 178
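
This bfs hunk is one instance of a tree-wide v3.15 conversion that recurs throughout this merge: ->evict_inode implementations call truncate_inode_pages_final() rather than truncate_inode_pages(..., 0). The _final variant is reserved for eviction, where no new pages can appear, and it also reclaims the shadow entries the page cache now keeps. The resulting shape of a minimal eviction handler (illustrative, not any one filesystem's code):

    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <linux/buffer_head.h>

    static void example_evict_inode(struct inode *inode)
    {
            truncate_inode_pages_final(&inode->i_data); /* the last truncate, ever */
            invalidate_inode_buffers(inode);
            clear_inode(inode);
    }
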
@@ -266,7 +266,7 @@ static void init_once(void *foo)
266 inode_init_once(&bi->vfs_inode); 266 inode_init_once(&bi->vfs_inode);
267} 267}
268 268
269static int init_inodecache(void) 269static int __init init_inodecache(void)
270{ 270{
271 bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", 271 bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
272 sizeof(struct bfs_inode_info), 272 sizeof(struct bfs_inode_info),
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 67be2951b98a..aa3cb626671e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -46,10 +46,15 @@
46#endif 46#endif
47 47
48static int load_elf_binary(struct linux_binprm *bprm); 48static int load_elf_binary(struct linux_binprm *bprm);
49static int load_elf_library(struct file *);
50static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, 49static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
51 int, int, unsigned long); 50 int, int, unsigned long);
52 51
52#ifdef CONFIG_USELIB
53static int load_elf_library(struct file *);
54#else
55#define load_elf_library NULL
56#endif
57
53/* 58/*
54 * If we don't support core dumping, then supply a NULL so we 59 * If we don't support core dumping, then supply a NULL so we
55 * don't even try. 60 * don't even try.
@@ -579,7 +584,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
579 unsigned long start_code, end_code, start_data, end_data; 584 unsigned long start_code, end_code, start_data, end_data;
580 unsigned long reloc_func_desc __maybe_unused = 0; 585 unsigned long reloc_func_desc __maybe_unused = 0;
581 int executable_stack = EXSTACK_DEFAULT; 586 int executable_stack = EXSTACK_DEFAULT;
582 unsigned long def_flags = 0;
583 struct pt_regs *regs = current_pt_regs(); 587 struct pt_regs *regs = current_pt_regs();
584 struct { 588 struct {
585 struct elfhdr elf_ex; 589 struct elfhdr elf_ex;
@@ -719,9 +723,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
719 if (retval) 723 if (retval)
720 goto out_free_dentry; 724 goto out_free_dentry;
721 725
722 /* OK, This is the point of no return */
723 current->mm->def_flags = def_flags;
724
725 /* Do this immediately, since STACK_TOP as used in setup_arg_pages 726 /* Do this immediately, since STACK_TOP as used in setup_arg_pages
726 may depend on the personality. */ 727 may depend on the personality. */
727 SET_PERSONALITY(loc->elf_ex); 728 SET_PERSONALITY(loc->elf_ex);
@@ -1005,6 +1006,7 @@ out_free_ph:
1005 goto out; 1006 goto out;
1006} 1007}
1007 1008
1009#ifdef CONFIG_USELIB
1008/* This is really simpleminded and specialized - we are loading an 1010/* This is really simpleminded and specialized - we are loading an
1009 a.out library that is given an ELF header. */ 1011 a.out library that is given an ELF header. */
1010static int load_elf_library(struct file *file) 1012static int load_elf_library(struct file *file)
@@ -1083,6 +1085,7 @@ out_free_ph:
1083out: 1085out:
1084 return error; 1086 return error;
1085} 1087}
1088#endif /* #ifdef CONFIG_USELIB */
1086 1089
1087#ifdef CONFIG_ELF_CORE 1090#ifdef CONFIG_ELF_CORE
1088/* 1091/*
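
Taken together, the binfmt_elf.c hunks let uselib(2) support be compiled out: load_elf_library() moves behind CONFIG_USELIB and its name collapses to NULL otherwise. A sketch of how that NULL plugs into the format's handler table; the table below is an assumption about the era's elf_format, abridged rather than quoted:

    #ifdef CONFIG_USELIB
    static int load_elf_library(struct file *);
    #else
    #define load_elf_library NULL   /* hook absent when uselib(2) is configured out */
    #endif

    static struct linux_binfmt elf_format = {
            .module      = THIS_MODULE,
            .load_binary = load_elf_binary,
            .load_shlib  = load_elf_library, /* NULL when !CONFIG_USELIB */
            .core_dump   = elf_core_dump,
    };
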
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 1c740e152f38..b60500300dd7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -656,6 +656,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
656 656
657 mutex_unlock(&root->d_inode->i_mutex); 657 mutex_unlock(&root->d_inode->i_mutex);
658 dput(root); 658 dput(root);
659 break;
659 default: return res; 660 default: return res;
660 } 661 }
661 return count; 662 return count;
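
The single added line here is a missing-break fix: the successful case in bm_status_write()'s switch fell through into default: and returned res where the function should fall out and report count bytes consumed. A toy reduction of the bug, with hypothetical names:

    /* Without the marked break, the success case falls through into
     * "default:" and the caller sees 'res' rather than 'count'. */
    static long toy_status_write(int cmd, long res, long count)
    {
            switch (cmd) {
            case 1:
                    /* ... apply the requested state change ... */
                    break;          /* the fix: stop the fallthrough */
            default:
                    return res;
            }
            return count;           /* success: report bytes consumed */
    }
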
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 4f70f383132c..1c2ce0c87711 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -182,6 +182,9 @@ static int bdev_integrity_enabled(struct block_device *bdev, int rw)
182 */ 182 */
183int bio_integrity_enabled(struct bio *bio) 183int bio_integrity_enabled(struct bio *bio)
184{ 184{
185 if (!bio_is_rw(bio))
186 return 0;
187
185 /* Already protected? */ 188 /* Already protected? */
186 if (bio_integrity(bio)) 189 if (bio_integrity(bio))
187 return 0; 190 return 0;
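
The new early return makes bio_integrity_enabled() answer "no" before any profile lookup for bios that carry no data payload, which is what bio_is_rw() filters out. Schematically (a simplified sketch of the function's new head, not its full body):

    static int integrity_enabled_sketch(struct bio *bio)
    {
            if (!bio_is_rw(bio))            /* no payload: nothing to protect */
                    return 0;
            if (bio_integrity(bio))         /* a bip is already attached */
                    return 0;
            /* ... continue to the device's blk_integrity profile checks ... */
            return 1;
    }
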
@@ -301,45 +304,65 @@ int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
301EXPORT_SYMBOL(bio_integrity_get_tag); 304EXPORT_SYMBOL(bio_integrity_get_tag);
302 305
303/** 306/**
304 * bio_integrity_generate - Generate integrity metadata for a bio 307 * bio_integrity_generate_verify - Generate/verify integrity metadata for a bio
305 * @bio: bio to generate integrity metadata for 308 * @bio: bio to generate/verify integrity metadata for
306 * 309 * @operate: operation selector: 1 to generate, 0 to verify
307 * Description: Generates integrity metadata for a bio by calling the
308 * block device's generation callback function. The bio must have a
309 * bip attached with enough room to accommodate the generated
310 * integrity metadata.
311 */ 310 */
312static void bio_integrity_generate(struct bio *bio) 311static int bio_integrity_generate_verify(struct bio *bio, int operate)
313{ 312{
314 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); 313 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
315 struct blk_integrity_exchg bix; 314 struct blk_integrity_exchg bix;
316 struct bio_vec bv; 315 struct bio_vec *bv;
317 struct bvec_iter iter; 316 sector_t sector;
318 sector_t sector = bio->bi_iter.bi_sector; 317 unsigned int sectors, ret = 0, i;
319 unsigned int sectors, total;
320 void *prot_buf = bio->bi_integrity->bip_buf; 318 void *prot_buf = bio->bi_integrity->bip_buf;
321 319
322 total = 0; 320 if (operate)
321 sector = bio->bi_iter.bi_sector;
322 else
323 sector = bio->bi_integrity->bip_iter.bi_sector;
324
323 bix.disk_name = bio->bi_bdev->bd_disk->disk_name; 325 bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
324 bix.sector_size = bi->sector_size; 326 bix.sector_size = bi->sector_size;
325 327
326 bio_for_each_segment(bv, bio, iter) { 328 bio_for_each_segment_all(bv, bio, i) {
327 void *kaddr = kmap_atomic(bv.bv_page); 329 void *kaddr = kmap_atomic(bv->bv_page);
328 bix.data_buf = kaddr + bv.bv_offset; 330 bix.data_buf = kaddr + bv->bv_offset;
329 bix.data_size = bv.bv_len; 331 bix.data_size = bv->bv_len;
330 bix.prot_buf = prot_buf; 332 bix.prot_buf = prot_buf;
331 bix.sector = sector; 333 bix.sector = sector;
332 334
333 bi->generate_fn(&bix); 335 if (operate)
336 bi->generate_fn(&bix);
337 else {
338 ret = bi->verify_fn(&bix);
339 if (ret) {
340 kunmap_atomic(kaddr);
341 return ret;
342 }
343 }
334 344
335 sectors = bv.bv_len / bi->sector_size; 345 sectors = bv->bv_len / bi->sector_size;
336 sector += sectors; 346 sector += sectors;
337 prot_buf += sectors * bi->tuple_size; 347 prot_buf += sectors * bi->tuple_size;
338 total += sectors * bi->tuple_size;
339 BUG_ON(total > bio->bi_integrity->bip_iter.bi_size);
340 348
341 kunmap_atomic(kaddr); 349 kunmap_atomic(kaddr);
342 } 350 }
351 return ret;
352}
353
354/**
355 * bio_integrity_generate - Generate integrity metadata for a bio
356 * @bio: bio to generate integrity metadata for
357 *
358 * Description: Generates integrity metadata for a bio by calling the
359 * block device's generation callback function. The bio must have a
360 * bip attached with enough room to accommodate the generated
361 * integrity metadata.
362 */
363static void bio_integrity_generate(struct bio *bio)
364{
365 bio_integrity_generate_verify(bio, 1);
343} 366}
344 367
345static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) 368static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
@@ -454,40 +477,7 @@ EXPORT_SYMBOL(bio_integrity_prep);
454 */ 477 */
455static int bio_integrity_verify(struct bio *bio) 478static int bio_integrity_verify(struct bio *bio)
456{ 479{
457 struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); 480 return bio_integrity_generate_verify(bio, 0);
458 struct blk_integrity_exchg bix;
459 struct bio_vec *bv;
460 sector_t sector = bio->bi_integrity->bip_iter.bi_sector;
461 unsigned int sectors, ret = 0;
462 void *prot_buf = bio->bi_integrity->bip_buf;
463 int i;
464
465 bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
466 bix.sector_size = bi->sector_size;
467
468 bio_for_each_segment_all(bv, bio, i) {
469 void *kaddr = kmap_atomic(bv->bv_page);
470
471 bix.data_buf = kaddr + bv->bv_offset;
472 bix.data_size = bv->bv_len;
473 bix.prot_buf = prot_buf;
474 bix.sector = sector;
475
476 ret = bi->verify_fn(&bix);
477
478 if (ret) {
479 kunmap_atomic(kaddr);
480 return ret;
481 }
482
483 sectors = bv->bv_len / bi->sector_size;
484 sector += sectors;
485 prot_buf += sectors * bi->tuple_size;
486
487 kunmap_atomic(kaddr);
488 }
489
490 return ret;
491} 481}
492 482
493/** 483/**
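
These two bio-integrity hunks are a de-duplication: the verify loop was a near copy of the generate loop, so both now delegate to bio_integrity_generate_verify(), with the operate flag selecting between generate_fn(), which cannot fail, and verify_fn(), whose first error is returned immediately. Note the merged loop walks segments with bio_for_each_segment_all() for both paths, where generate previously used the iterator-based bio_for_each_segment(). The shape of the refactor in miniature, every name illustrative:

    typedef int (*seg_op)(unsigned char *buf, unsigned int len);

    static int for_each_seg(unsigned char *buf, unsigned int seg_len,
                            unsigned int nsegs, seg_op gen, seg_op verify,
                            int do_generate)
    {
            unsigned int i;
            int ret = 0;

            for (i = 0; i < nsegs; i++, buf += seg_len) {
                    if (do_generate)
                            gen(buf, seg_len);              /* no failure path */
                    else if ((ret = verify(buf, seg_len)))
                            return ret;                     /* first mismatch wins */
            }
            return ret;
    }
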
diff --git a/fs/bio.c b/fs/bio.c
index 8754e7b6eb49..6f0362b77806 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -116,7 +116,6 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
116 if (!slab) 116 if (!slab)
117 goto out_unlock; 117 goto out_unlock;
118 118
119 printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry);
120 bslab->slab = slab; 119 bslab->slab = slab;
121 bslab->slab_ref = 1; 120 bslab->slab_ref = 1;
122 bslab->slab_size = sz; 121 bslab->slab_size = sz;
@@ -1003,7 +1002,7 @@ struct bio_map_data {
1003}; 1002};
1004 1003
1005static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, 1004static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
1006 struct sg_iovec *iov, int iov_count, 1005 const struct sg_iovec *iov, int iov_count,
1007 int is_our_pages) 1006 int is_our_pages)
1008{ 1007{
1009 memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); 1008 memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
@@ -1023,7 +1022,7 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs,
1023 sizeof(struct sg_iovec) * iov_count, gfp_mask); 1022 sizeof(struct sg_iovec) * iov_count, gfp_mask);
1024} 1023}
1025 1024
1026static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, 1025static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count,
1027 int to_user, int from_user, int do_free_page) 1026 int to_user, int from_user, int do_free_page)
1028{ 1027{
1029 int ret = 0, i; 1028 int ret = 0, i;
@@ -1121,7 +1120,7 @@ EXPORT_SYMBOL(bio_uncopy_user);
1121 */ 1120 */
1122struct bio *bio_copy_user_iov(struct request_queue *q, 1121struct bio *bio_copy_user_iov(struct request_queue *q,
1123 struct rq_map_data *map_data, 1122 struct rq_map_data *map_data,
1124 struct sg_iovec *iov, int iov_count, 1123 const struct sg_iovec *iov, int iov_count,
1125 int write_to_vm, gfp_t gfp_mask) 1124 int write_to_vm, gfp_t gfp_mask)
1126{ 1125{
1127 struct bio_map_data *bmd; 1126 struct bio_map_data *bmd;
@@ -1260,7 +1259,7 @@ EXPORT_SYMBOL(bio_copy_user);
1260 1259
1261static struct bio *__bio_map_user_iov(struct request_queue *q, 1260static struct bio *__bio_map_user_iov(struct request_queue *q,
1262 struct block_device *bdev, 1261 struct block_device *bdev,
1263 struct sg_iovec *iov, int iov_count, 1262 const struct sg_iovec *iov, int iov_count,
1264 int write_to_vm, gfp_t gfp_mask) 1263 int write_to_vm, gfp_t gfp_mask)
1265{ 1264{
1266 int i, j; 1265 int i, j;
@@ -1408,7 +1407,7 @@ EXPORT_SYMBOL(bio_map_user);
1408 * device. Returns an error pointer in case of error. 1407 * device. Returns an error pointer in case of error.
1409 */ 1408 */
1410struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, 1409struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
1411 struct sg_iovec *iov, int iov_count, 1410 const struct sg_iovec *iov, int iov_count,
1412 int write_to_vm, gfp_t gfp_mask) 1411 int write_to_vm, gfp_t gfp_mask)
1413{ 1412{
1414 struct bio *bio; 1413 struct bio *bio;
@@ -1970,7 +1969,7 @@ int bio_associate_current(struct bio *bio)
1970 1969
1971 /* associate blkcg if exists */ 1970 /* associate blkcg if exists */
1972 rcu_read_lock(); 1971 rcu_read_lock();
1973 css = task_css(current, blkio_subsys_id); 1972 css = task_css(current, blkio_cgrp_id);
1974 if (css && css_tryget(css)) 1973 if (css && css_tryget(css))
1975 bio->bi_css = css; 1974 bio->bi_css = css;
1976 rcu_read_unlock(); 1975 rcu_read_unlock();
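
Most of the fs/bio.c changes constify the sg_iovec parameters along the map and copy chains, promoting "these helpers only read the iovec array" from convention to a compiler-checked property; the last hunk is the mechanical blkio_subsys_id to blkio_cgrp_id rename that followed the cgroup core in the same release. The const-propagation pattern, reduced to a toy with made-up types:

    struct iov_toy {
            void *iov_base;
            unsigned long iov_len;
    };

    static unsigned long sum_lengths(const struct iov_toy *iov, int n)
    {
            unsigned long total = 0;
            int i;

            for (i = 0; i < n; i++)
                    total += iov[i].iov_len;        /* reads only, as const promises */
            return total;
    }

    unsigned long map_toy(const struct iov_toy *iov, int n)
    {
            return sum_lengths(iov, n);             /* const flows through, no casts */
    }
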
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1e86823a9cbd..552a8d13bc32 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -83,7 +83,7 @@ void kill_bdev(struct block_device *bdev)
83{ 83{
84 struct address_space *mapping = bdev->bd_inode->i_mapping; 84 struct address_space *mapping = bdev->bd_inode->i_mapping;
85 85
86 if (mapping->nrpages == 0) 86 if (mapping->nrpages == 0 && mapping->nrshadows == 0)
87 return; 87 return;
88 88
89 invalidate_bh_lrus(); 89 invalidate_bh_lrus();
@@ -419,7 +419,7 @@ static void bdev_evict_inode(struct inode *inode)
419{ 419{
420 struct block_device *bdev = &BDEV_I(inode)->bdev; 420 struct block_device *bdev = &BDEV_I(inode)->bdev;
421 struct list_head *p; 421 struct list_head *p;
422 truncate_inode_pages(&inode->i_data, 0); 422 truncate_inode_pages_final(&inode->i_data);
423 invalidate_inode_buffers(inode); /* is it needed here? */ 423 invalidate_inode_buffers(inode); /* is it needed here? */
424 clear_inode(inode); 424 clear_inode(inode);
425 spin_lock(&bdev_lock); 425 spin_lock(&bdev_lock);
@@ -1518,12 +1518,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1518 BUG_ON(iocb->ki_pos != pos); 1518 BUG_ON(iocb->ki_pos != pos);
1519 1519
1520 blk_start_plug(&plug); 1520 blk_start_plug(&plug);
1521 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 1521 ret = __generic_file_aio_write(iocb, iov, nr_segs);
1522 if (ret > 0) { 1522 if (ret > 0) {
1523 ssize_t err; 1523 ssize_t err;
1524 1524
1525 err = generic_write_sync(file, pos, ret); 1525 err = generic_write_sync(file, pos, ret);
1526 if (err < 0 && ret > 0) 1526 if (err < 0)
1527 ret = err; 1527 ret = err;
1528 } 1528 }
1529 blk_finish_plug(&plug); 1529 blk_finish_plug(&plug);
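
Three independent cleanups land in fs/block_dev.c: kill_bdev() now bails out only when the mapping holds neither pages nor shadow entries, eviction switches to truncate_inode_pages_final(), and blkdev_aio_write() is simplified twice, since __generic_file_aio_write() dropped its ppos argument (it always wrote at iocb->ki_pos) and the "&& ret > 0" half of the error test was already guaranteed by the enclosing branch. The sync tail after the cleanup, sketched with era-appropriate signatures:

    #include <linux/fs.h>
    #include <linux/aio.h>

    static ssize_t aio_write_tail(struct kiocb *iocb, const struct iovec *iov,
                                  unsigned long nr_segs, struct file *file,
                                  loff_t pos)
    {
            ssize_t ret = __generic_file_aio_write(iocb, iov, nr_segs);

            if (ret > 0) {
                    ssize_t err = generic_write_sync(file, pos, ret);

                    if (err < 0)    /* "&& ret > 0" is implied by this branch */
                            ret = err;
            }
            return ret;
    }
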
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c1e0b0caf9cc..5a201d81049c 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (C) 2007 Oracle. All rights reserved. 2 * Copyright (C) 2007 Oracle. All rights reserved.
3 * Copyright (C) 2014 Fujitsu. All rights reserved.
3 * 4 *
4 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 6 * modify it under the terms of the GNU General Public
@@ -21,708 +22,315 @@
21#include <linux/list.h> 22#include <linux/list.h>
22#include <linux/spinlock.h> 23#include <linux/spinlock.h>
23#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/workqueue.h>
24#include "async-thread.h" 26#include "async-thread.h"
27#include "ctree.h"
28
29#define WORK_DONE_BIT 0
30#define WORK_ORDER_DONE_BIT 1
31#define WORK_HIGH_PRIO_BIT 2
32
33#define NO_THRESHOLD (-1)
34#define DFT_THRESHOLD (32)
35
36struct __btrfs_workqueue {
37 struct workqueue_struct *normal_wq;
38 /* List head pointing to ordered work list */
39 struct list_head ordered_list;
40
41 /* Spinlock for ordered_list */
42 spinlock_t list_lock;
43
44 /* Thresholding related variants */
45 atomic_t pending;
46 int max_active;
47 int current_max;
48 int thresh;
49 unsigned int count;
50 spinlock_t thres_lock;
51};
25 52
26#define WORK_QUEUED_BIT 0 53struct btrfs_workqueue {
27#define WORK_DONE_BIT 1 54 struct __btrfs_workqueue *normal;
28#define WORK_ORDER_DONE_BIT 2 55 struct __btrfs_workqueue *high;
29#define WORK_HIGH_PRIO_BIT 3 56};
30
31/*
32 * container for the kthread task pointer and the list of pending work
33 * One of these is allocated per thread.
34 */
35struct btrfs_worker_thread {
36 /* pool we belong to */
37 struct btrfs_workers *workers;
38
39 /* list of struct btrfs_work that are waiting for service */
40 struct list_head pending;
41 struct list_head prio_pending;
42
43 /* list of worker threads from struct btrfs_workers */
44 struct list_head worker_list;
45
46 /* kthread */
47 struct task_struct *task;
48 57
49 /* number of things on the pending list */ 58static inline struct __btrfs_workqueue
50 atomic_t num_pending; 59*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
60 int thresh)
61{
62 struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
51 63
52 /* reference counter for this struct */ 64 if (unlikely(!ret))
53 atomic_t refs; 65 return NULL;
54 66
55 unsigned long sequence; 67 ret->max_active = max_active;
68 atomic_set(&ret->pending, 0);
69 if (thresh == 0)
70 thresh = DFT_THRESHOLD;
71 /* For low threshold, disabling threshold is a better choice */
72 if (thresh < DFT_THRESHOLD) {
73 ret->current_max = max_active;
74 ret->thresh = NO_THRESHOLD;
75 } else {
76 ret->current_max = 1;
77 ret->thresh = thresh;
78 }
56 79
57 /* protects the pending list. */ 80 if (flags & WQ_HIGHPRI)
58 spinlock_t lock; 81 ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
82 ret->max_active,
83 "btrfs", name);
84 else
85 ret->normal_wq = alloc_workqueue("%s-%s", flags,
86 ret->max_active, "btrfs",
87 name);
88 if (unlikely(!ret->normal_wq)) {
89 kfree(ret);
90 return NULL;
91 }
59 92
60 /* set to non-zero when this thread is already awake and kicking */ 93 INIT_LIST_HEAD(&ret->ordered_list);
61 int working; 94 spin_lock_init(&ret->list_lock);
95 spin_lock_init(&ret->thres_lock);
96 trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
97 return ret;
98}
62 99
63 /* are we currently idle */ 100static inline void
64 int idle; 101__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
65};
66 102
67static int __btrfs_start_workers(struct btrfs_workers *workers); 103struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
104 int flags,
105 int max_active,
106 int thresh)
107{
108 struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
68 109
69/* 110 if (unlikely(!ret))
70 * btrfs_start_workers uses kthread_run, which can block waiting for memory 111 return NULL;
71 * for a very long time. It will actually throttle on page writeback,
72 * and so it may not make progress until after our btrfs worker threads
73 * process all of the pending work structs in their queue
74 *
75 * This means we can't use btrfs_start_workers from inside a btrfs worker
76 * thread that is used as part of cleaning dirty memory, which pretty much
77 * involves all of the worker threads.
78 *
79 * Instead we have a helper queue who never has more than one thread
80 * where we scheduler thread start operations. This worker_start struct
81 * is used to contain the work and hold a pointer to the queue that needs
82 * another worker.
83 */
84struct worker_start {
85 struct btrfs_work work;
86 struct btrfs_workers *queue;
87};
88 112
89static void start_new_worker_func(struct btrfs_work *work) 113 ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
90{ 114 max_active, thresh);
91 struct worker_start *start; 115 if (unlikely(!ret->normal)) {
92 start = container_of(work, struct worker_start, work); 116 kfree(ret);
93 __btrfs_start_workers(start->queue); 117 return NULL;
94 kfree(start); 118 }
95}
96 119
97/* 120 if (flags & WQ_HIGHPRI) {
98 * helper function to move a thread onto the idle list after it 121 ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
99 * has finished some requests. 122 thresh);
100 */ 123 if (unlikely(!ret->high)) {
101static void check_idle_worker(struct btrfs_worker_thread *worker) 124 __btrfs_destroy_workqueue(ret->normal);
102{ 125 kfree(ret);
103 if (!worker->idle && atomic_read(&worker->num_pending) < 126 return NULL;
104 worker->workers->idle_thresh / 2) {
105 unsigned long flags;
106 spin_lock_irqsave(&worker->workers->lock, flags);
107 worker->idle = 1;
108
109 /* the list may be empty if the worker is just starting */
110 if (!list_empty(&worker->worker_list) &&
111 !worker->workers->stopping) {
112 list_move(&worker->worker_list,
113 &worker->workers->idle_list);
114 } 127 }
115 spin_unlock_irqrestore(&worker->workers->lock, flags);
116 } 128 }
129 return ret;
117} 130}
118 131
119/* 132/*
120 * helper function to move a thread off the idle list after new 133 * Hook for threshold which will be called in btrfs_queue_work.
121 * pending work is added. 134 * This hook WILL be called in IRQ handler context,
135 * so workqueue_set_max_active MUST NOT be called in this hook
122 */ 136 */
123static void check_busy_worker(struct btrfs_worker_thread *worker) 137static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
124{ 138{
125 if (worker->idle && atomic_read(&worker->num_pending) >= 139 if (wq->thresh == NO_THRESHOLD)
126 worker->workers->idle_thresh) { 140 return;
127 unsigned long flags; 141 atomic_inc(&wq->pending);
128 spin_lock_irqsave(&worker->workers->lock, flags);
129 worker->idle = 0;
130
131 if (!list_empty(&worker->worker_list) &&
132 !worker->workers->stopping) {
133 list_move_tail(&worker->worker_list,
134 &worker->workers->worker_list);
135 }
136 spin_unlock_irqrestore(&worker->workers->lock, flags);
137 }
138} 142}
139 143
140static void check_pending_worker_creates(struct btrfs_worker_thread *worker) 144/*
145 * Hook for threshold which will be called before executing the work.
146 * This hook is called in kthread context.
147 * So workqueue_set_max_active is called here.
148 */
149static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
141{ 150{
142 struct btrfs_workers *workers = worker->workers; 151 int new_max_active;
143 struct worker_start *start; 152 long pending;
144 unsigned long flags; 153 int need_change = 0;
145
146 rmb();
147 if (!workers->atomic_start_pending)
148 return;
149 154
150 start = kzalloc(sizeof(*start), GFP_NOFS); 155 if (wq->thresh == NO_THRESHOLD)
151 if (!start)
152 return; 156 return;
153 157
154 start->work.func = start_new_worker_func; 158 atomic_dec(&wq->pending);
155 start->queue = workers; 159 spin_lock(&wq->thres_lock);
156 160 /*
157 spin_lock_irqsave(&workers->lock, flags); 161 * Use wq->count to limit the calling frequency of
158 if (!workers->atomic_start_pending) 162 * workqueue_set_max_active.
159 goto out; 163 */
160 164 wq->count++;
161 workers->atomic_start_pending = 0; 165 wq->count %= (wq->thresh / 4);
162 if (workers->num_workers + workers->num_workers_starting >= 166 if (!wq->count)
163 workers->max_workers) 167 goto out;
164 goto out; 168 new_max_active = wq->current_max;
165
166 workers->num_workers_starting += 1;
167 spin_unlock_irqrestore(&workers->lock, flags);
168 btrfs_queue_worker(workers->atomic_worker_start, &start->work);
169 return;
170 169
170 /*
171 * pending may be changed later, but it's OK since we really
172 * don't need it to be that accurate to calculate new_max_active.
173 */
174 pending = atomic_read(&wq->pending);
175 if (pending > wq->thresh)
176 new_max_active++;
177 if (pending < wq->thresh / 2)
178 new_max_active--;
179 new_max_active = clamp_val(new_max_active, 1, wq->max_active);
180 if (new_max_active != wq->current_max) {
181 need_change = 1;
182 wq->current_max = new_max_active;
183 }
171out: 184out:
172 kfree(start); 185 spin_unlock(&wq->thres_lock);
173 spin_unlock_irqrestore(&workers->lock, flags); 186
187 if (need_change) {
188 workqueue_set_max_active(wq->normal_wq, wq->current_max);
189 }
174} 190}
175 191
176static noinline void run_ordered_completions(struct btrfs_workers *workers, 192static void run_ordered_work(struct __btrfs_workqueue *wq)
177 struct btrfs_work *work)
178{ 193{
179 if (!workers->ordered) 194 struct list_head *list = &wq->ordered_list;
180 return; 195 struct btrfs_work *work;
181 196 spinlock_t *lock = &wq->list_lock;
182 set_bit(WORK_DONE_BIT, &work->flags); 197 unsigned long flags;
183
184 spin_lock(&workers->order_lock);
185 198
186 while (1) { 199 while (1) {
187 if (!list_empty(&workers->prio_order_list)) { 200 spin_lock_irqsave(lock, flags);
188 work = list_entry(workers->prio_order_list.next, 201 if (list_empty(list))
189 struct btrfs_work, order_list);
190 } else if (!list_empty(&workers->order_list)) {
191 work = list_entry(workers->order_list.next,
192 struct btrfs_work, order_list);
193 } else {
194 break; 202 break;
195 } 203 work = list_entry(list->next, struct btrfs_work,
204 ordered_list);
196 if (!test_bit(WORK_DONE_BIT, &work->flags)) 205 if (!test_bit(WORK_DONE_BIT, &work->flags))
197 break; 206 break;
198 207
199 /* we are going to call the ordered done function, but 208 /*
209 * we are going to call the ordered done function, but
200 * we leave the work item on the list as a barrier so 210 * we leave the work item on the list as a barrier so
201 * that later work items that are done don't have their 211 * that later work items that are done don't have their
202 * functions called before this one returns 212 * functions called before this one returns
203 */ 213 */
204 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) 214 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
205 break; 215 break;
206 216 trace_btrfs_ordered_sched(work);
207 spin_unlock(&workers->order_lock); 217 spin_unlock_irqrestore(lock, flags);
208
209 work->ordered_func(work); 218 work->ordered_func(work);
210 219
211 /* now take the lock again and drop our item from the list */ 220 /* now take the lock again and drop our item from the list */
212 spin_lock(&workers->order_lock); 221 spin_lock_irqsave(lock, flags);
213 list_del(&work->order_list); 222 list_del(&work->ordered_list);
214 spin_unlock(&workers->order_lock); 223 spin_unlock_irqrestore(lock, flags);
215 224
216 /* 225 /*
217 * we don't want to call the ordered free functions 226 * we don't want to call the ordered free functions
218 * with the lock held though 227 * with the lock held though
219 */ 228 */
220 work->ordered_free(work); 229 work->ordered_free(work);
221 spin_lock(&workers->order_lock); 230 trace_btrfs_all_work_done(work);
222 }
223
224 spin_unlock(&workers->order_lock);
225}
226
227static void put_worker(struct btrfs_worker_thread *worker)
228{
229 if (atomic_dec_and_test(&worker->refs))
230 kfree(worker);
231}
232
233static int try_worker_shutdown(struct btrfs_worker_thread *worker)
234{
235 int freeit = 0;
236
237 spin_lock_irq(&worker->lock);
238 spin_lock(&worker->workers->lock);
239 if (worker->workers->num_workers > 1 &&
240 worker->idle &&
241 !worker->working &&
242 !list_empty(&worker->worker_list) &&
243 list_empty(&worker->prio_pending) &&
244 list_empty(&worker->pending) &&
245 atomic_read(&worker->num_pending) == 0) {
246 freeit = 1;
247 list_del_init(&worker->worker_list);
248 worker->workers->num_workers--;
249 } 231 }
250 spin_unlock(&worker->workers->lock); 232 spin_unlock_irqrestore(lock, flags);
251 spin_unlock_irq(&worker->lock);
252
253 if (freeit)
254 put_worker(worker);
255 return freeit;
256} 233}
257 234
258static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, 235static void normal_work_helper(struct work_struct *arg)
259 struct list_head *prio_head,
260 struct list_head *head)
261{ 236{
262 struct btrfs_work *work = NULL;
263 struct list_head *cur = NULL;
264
265 if (!list_empty(prio_head))
266 cur = prio_head->next;
267
268 smp_mb();
269 if (!list_empty(&worker->prio_pending))
270 goto refill;
271
272 if (!list_empty(head))
273 cur = head->next;
274
275 if (cur)
276 goto out;
277
278refill:
279 spin_lock_irq(&worker->lock);
280 list_splice_tail_init(&worker->prio_pending, prio_head);
281 list_splice_tail_init(&worker->pending, head);
282
283 if (!list_empty(prio_head))
284 cur = prio_head->next;
285 else if (!list_empty(head))
286 cur = head->next;
287 spin_unlock_irq(&worker->lock);
288
289 if (!cur)
290 goto out_fail;
291
292out:
293 work = list_entry(cur, struct btrfs_work, list);
294
295out_fail:
296 return work;
297}
298
299/*
300 * main loop for servicing work items
301 */
302static int worker_loop(void *arg)
303{
304 struct btrfs_worker_thread *worker = arg;
305 struct list_head head;
306 struct list_head prio_head;
307 struct btrfs_work *work; 237 struct btrfs_work *work;
238 struct __btrfs_workqueue *wq;
239 int need_order = 0;
308 240
309 INIT_LIST_HEAD(&head); 241 work = container_of(arg, struct btrfs_work, normal_work);
310 INIT_LIST_HEAD(&prio_head); 242 /*
311 243 * We should not touch things inside work in the following cases:
312 do { 244 * 1) after work->func() if it has no ordered_free
313again: 245 * Since the struct is freed in work->func().
314 while (1) { 246 * 2) after setting WORK_DONE_BIT
315 247 * The work may be freed in other threads almost instantly.
316 248 * So we save the needed things here.
317 work = get_next_work(worker, &prio_head, &head); 249 */
318 if (!work) 250 if (work->ordered_func)
319 break; 251 need_order = 1;
320 252 wq = work->wq;
321 list_del(&work->list); 253
322 clear_bit(WORK_QUEUED_BIT, &work->flags); 254 trace_btrfs_work_sched(work);
323 255 thresh_exec_hook(wq);
324 work->worker = worker; 256 work->func(work);
325 257 if (need_order) {
326 work->func(work); 258 set_bit(WORK_DONE_BIT, &work->flags);
327 259 run_ordered_work(wq);
328 atomic_dec(&worker->num_pending);
329 /*
330 * unless this is an ordered work queue,
331 * 'work' was probably freed by func above.
332 */
333 run_ordered_completions(worker->workers, work);
334
335 check_pending_worker_creates(worker);
336 cond_resched();
337 }
338
339 spin_lock_irq(&worker->lock);
340 check_idle_worker(worker);
341
342 if (freezing(current)) {
343 worker->working = 0;
344 spin_unlock_irq(&worker->lock);
345 try_to_freeze();
346 } else {
347 spin_unlock_irq(&worker->lock);
348 if (!kthread_should_stop()) {
349 cpu_relax();
350 /*
351 * we've dropped the lock, did someone else
352 * jump_in?
353 */
354 smp_mb();
355 if (!list_empty(&worker->pending) ||
356 !list_empty(&worker->prio_pending))
357 continue;
358
359 /*
360 * this short schedule allows more work to
361 * come in without the queue functions
362 * needing to go through wake_up_process()
363 *
364 * worker->working is still 1, so nobody
365 * is going to try and wake us up
366 */
367 schedule_timeout(1);
368 smp_mb();
369 if (!list_empty(&worker->pending) ||
370 !list_empty(&worker->prio_pending))
371 continue;
372
373 if (kthread_should_stop())
374 break;
375
376 /* still no more work?, sleep for real */
377 spin_lock_irq(&worker->lock);
378 set_current_state(TASK_INTERRUPTIBLE);
379 if (!list_empty(&worker->pending) ||
380 !list_empty(&worker->prio_pending)) {
381 spin_unlock_irq(&worker->lock);
382 set_current_state(TASK_RUNNING);
383 goto again;
384 }
385
386 /*
387 * this makes sure we get a wakeup when someone
388 * adds something new to the queue
389 */
390 worker->working = 0;
391 spin_unlock_irq(&worker->lock);
392
393 if (!kthread_should_stop()) {
394 schedule_timeout(HZ * 120);
395 if (!worker->working &&
396 try_worker_shutdown(worker)) {
397 return 0;
398 }
399 }
400 }
401 __set_current_state(TASK_RUNNING);
402 }
403 } while (!kthread_should_stop());
404 return 0;
405}
406
407/*
408 * this will wait for all the worker threads to shutdown
409 */
410void btrfs_stop_workers(struct btrfs_workers *workers)
411{
412 struct list_head *cur;
413 struct btrfs_worker_thread *worker;
414 int can_stop;
415
416 spin_lock_irq(&workers->lock);
417 workers->stopping = 1;
418 list_splice_init(&workers->idle_list, &workers->worker_list);
419 while (!list_empty(&workers->worker_list)) {
420 cur = workers->worker_list.next;
421 worker = list_entry(cur, struct btrfs_worker_thread,
422 worker_list);
423
424 atomic_inc(&worker->refs);
425 workers->num_workers -= 1;
426 if (!list_empty(&worker->worker_list)) {
427 list_del_init(&worker->worker_list);
428 put_worker(worker);
429 can_stop = 1;
430 } else
431 can_stop = 0;
432 spin_unlock_irq(&workers->lock);
433 if (can_stop)
434 kthread_stop(worker->task);
435 spin_lock_irq(&workers->lock);
436 put_worker(worker);
437 } 260 }
438 spin_unlock_irq(&workers->lock); 261 if (!need_order)
262 trace_btrfs_all_work_done(work);
439} 263}
440 264
441/* 265void btrfs_init_work(struct btrfs_work *work,
442 * simple init on struct btrfs_workers 266 btrfs_func_t func,
443 */ 267 btrfs_func_t ordered_func,
444void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, 268 btrfs_func_t ordered_free)
445 struct btrfs_workers *async_helper)
446{ 269{
447 workers->num_workers = 0; 270 work->func = func;
448 workers->num_workers_starting = 0; 271 work->ordered_func = ordered_func;
449 INIT_LIST_HEAD(&workers->worker_list); 272 work->ordered_free = ordered_free;
450 INIT_LIST_HEAD(&workers->idle_list); 273 INIT_WORK(&work->normal_work, normal_work_helper);
451 INIT_LIST_HEAD(&workers->order_list); 274 INIT_LIST_HEAD(&work->ordered_list);
452 INIT_LIST_HEAD(&workers->prio_order_list); 275 work->flags = 0;
453 spin_lock_init(&workers->lock);
454 spin_lock_init(&workers->order_lock);
455 workers->max_workers = max;
456 workers->idle_thresh = 32;
457 workers->name = name;
458 workers->ordered = 0;
459 workers->atomic_start_pending = 0;
460 workers->atomic_worker_start = async_helper;
461 workers->stopping = 0;
462} 276}
463 277
464/* 278static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
465 * starts new worker threads. This does not enforce the max worker 279 struct btrfs_work *work)
466 * count in case you need to temporarily go past it.
467 */
468static int __btrfs_start_workers(struct btrfs_workers *workers)
469{ 280{
470 struct btrfs_worker_thread *worker; 281 unsigned long flags;
471 int ret = 0;
472
473 worker = kzalloc(sizeof(*worker), GFP_NOFS);
474 if (!worker) {
475 ret = -ENOMEM;
476 goto fail;
477 }
478
479 INIT_LIST_HEAD(&worker->pending);
480 INIT_LIST_HEAD(&worker->prio_pending);
481 INIT_LIST_HEAD(&worker->worker_list);
482 spin_lock_init(&worker->lock);
483
484 atomic_set(&worker->num_pending, 0);
485 atomic_set(&worker->refs, 1);
486 worker->workers = workers;
487 worker->task = kthread_create(worker_loop, worker,
488 "btrfs-%s-%d", workers->name,
489 workers->num_workers + 1);
490 if (IS_ERR(worker->task)) {
491 ret = PTR_ERR(worker->task);
492 goto fail;
493 }
494 282
495 spin_lock_irq(&workers->lock); 283 work->wq = wq;
496 if (workers->stopping) { 284 thresh_queue_hook(wq);
497 spin_unlock_irq(&workers->lock); 285 if (work->ordered_func) {
498 ret = -EINVAL; 286 spin_lock_irqsave(&wq->list_lock, flags);
499 goto fail_kthread; 287 list_add_tail(&work->ordered_list, &wq->ordered_list);
288 spin_unlock_irqrestore(&wq->list_lock, flags);
500 } 289 }
501 list_add_tail(&worker->worker_list, &workers->idle_list); 290 queue_work(wq->normal_wq, &work->normal_work);
502 worker->idle = 1; 291 trace_btrfs_work_queued(work);
503 workers->num_workers++;
504 workers->num_workers_starting--;
505 WARN_ON(workers->num_workers_starting < 0);
506 spin_unlock_irq(&workers->lock);
507
508 wake_up_process(worker->task);
509 return 0;
510
511fail_kthread:
512 kthread_stop(worker->task);
513fail:
514 kfree(worker);
515 spin_lock_irq(&workers->lock);
516 workers->num_workers_starting--;
517 spin_unlock_irq(&workers->lock);
518 return ret;
519} 292}
520 293
521int btrfs_start_workers(struct btrfs_workers *workers) 294void btrfs_queue_work(struct btrfs_workqueue *wq,
522{ 295 struct btrfs_work *work)
523 spin_lock_irq(&workers->lock);
524 workers->num_workers_starting++;
525 spin_unlock_irq(&workers->lock);
526 return __btrfs_start_workers(workers);
527}
528
529/*
530 * run through the list and find a worker thread that doesn't have a lot
531 * to do right now. This can return null if we aren't yet at the thread
532 * count limit and all of the threads are busy.
533 */
534static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
535{ 296{
536 struct btrfs_worker_thread *worker; 297 struct __btrfs_workqueue *dest_wq;
537 struct list_head *next;
538 int enforce_min;
539 298
540 enforce_min = (workers->num_workers + workers->num_workers_starting) < 299 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
541 workers->max_workers; 300 dest_wq = wq->high;
542 301 else
543 /* 302 dest_wq = wq->normal;
544 * if we find an idle thread, don't move it to the end of the 303 __btrfs_queue_work(dest_wq, work);
545 * idle list. This improves the chance that the next submission
546 * will reuse the same thread, and maybe catch it while it is still
547 * working
548 */
549 if (!list_empty(&workers->idle_list)) {
550 next = workers->idle_list.next;
551 worker = list_entry(next, struct btrfs_worker_thread,
552 worker_list);
553 return worker;
554 }
555 if (enforce_min || list_empty(&workers->worker_list))
556 return NULL;
557
558 /*
559 * if we pick a busy task, move the task to the end of the list.
560 * hopefully this will keep things somewhat evenly balanced.
561 * Do the move in batches based on the sequence number. This groups
562 * requests submitted at roughly the same time onto the same worker.
563 */
564 next = workers->worker_list.next;
565 worker = list_entry(next, struct btrfs_worker_thread, worker_list);
566 worker->sequence++;
567
568 if (worker->sequence % workers->idle_thresh == 0)
569 list_move_tail(next, &workers->worker_list);
570 return worker;
571} 304}
572 305
573/* 306static inline void
574 * selects a worker thread to take the next job. This will either find 307__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
575 * an idle worker, start a new worker up to the max count, or just return
576 * one of the existing busy workers.
577 */
578static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
579{ 308{
580 struct btrfs_worker_thread *worker; 309 destroy_workqueue(wq->normal_wq);
581 unsigned long flags; 310 trace_btrfs_workqueue_destroy(wq);
582 struct list_head *fallback; 311 kfree(wq);
583 int ret;
584
585 spin_lock_irqsave(&workers->lock, flags);
586again:
587 worker = next_worker(workers);
588
589 if (!worker) {
590 if (workers->num_workers + workers->num_workers_starting >=
591 workers->max_workers) {
592 goto fallback;
593 } else if (workers->atomic_worker_start) {
594 workers->atomic_start_pending = 1;
595 goto fallback;
596 } else {
597 workers->num_workers_starting++;
598 spin_unlock_irqrestore(&workers->lock, flags);
599 /* we're below the limit, start another worker */
600 ret = __btrfs_start_workers(workers);
601 spin_lock_irqsave(&workers->lock, flags);
602 if (ret)
603 goto fallback;
604 goto again;
605 }
606 }
607 goto found;
608
609fallback:
610 fallback = NULL;
611 /*
612 * we have failed to find any workers, just
613 * return the first one we can find.
614 */
615 if (!list_empty(&workers->worker_list))
616 fallback = workers->worker_list.next;
617 if (!list_empty(&workers->idle_list))
618 fallback = workers->idle_list.next;
619 BUG_ON(!fallback);
620 worker = list_entry(fallback,
621 struct btrfs_worker_thread, worker_list);
622found:
623 /*
624 * this makes sure the worker doesn't exit before it is placed
625 * onto a busy/idle list
626 */
627 atomic_inc(&worker->num_pending);
628 spin_unlock_irqrestore(&workers->lock, flags);
629 return worker;
630} 312}
631 313
632/* 314void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
633 * btrfs_requeue_work just puts the work item back on the tail of the list
634 * it was taken from. It is intended for use with long running work functions
635 * that make some progress and want to give the cpu up for others.
636 */
637void btrfs_requeue_work(struct btrfs_work *work)
638{ 315{
639 struct btrfs_worker_thread *worker = work->worker; 316 if (!wq)
640 unsigned long flags;
641 int wake = 0;
642
643 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
644 return; 317 return;
645 318 if (wq->high)
646 spin_lock_irqsave(&worker->lock, flags); 319 __btrfs_destroy_workqueue(wq->high);
647 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) 320 __btrfs_destroy_workqueue(wq->normal);
648 list_add_tail(&work->list, &worker->prio_pending); 321 kfree(wq);
649 else
650 list_add_tail(&work->list, &worker->pending);
651 atomic_inc(&worker->num_pending);
652
653 /* by definition we're busy, take ourselves off the idle
654 * list
655 */
656 if (worker->idle) {
657 spin_lock(&worker->workers->lock);
658 worker->idle = 0;
659 list_move_tail(&worker->worker_list,
660 &worker->workers->worker_list);
661 spin_unlock(&worker->workers->lock);
662 }
663 if (!worker->working) {
664 wake = 1;
665 worker->working = 1;
666 }
667
668 if (wake)
669 wake_up_process(worker->task);
670 spin_unlock_irqrestore(&worker->lock, flags);
671} 322}
672 323
673void btrfs_set_work_high_prio(struct btrfs_work *work) 324void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
674{ 325{
675 set_bit(WORK_HIGH_PRIO_BIT, &work->flags); 326 if (!wq)
327 return;
328 wq->normal->max_active = max;
329 if (wq->high)
330 wq->high->max_active = max;
676} 331}
677 332
678/* 333void btrfs_set_work_high_priority(struct btrfs_work *work)
679 * places a struct btrfs_work into the pending queue of one of the kthreads
680 */
681void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
682{ 334{
683 struct btrfs_worker_thread *worker; 335 set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
684 unsigned long flags;
685 int wake = 0;
686
687 /* don't requeue something already on a list */
688 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
689 return;
690
691 worker = find_worker(workers);
692 if (workers->ordered) {
693 /*
694 * you're not allowed to do ordered queues from an
695 * interrupt handler
696 */
697 spin_lock(&workers->order_lock);
698 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
699 list_add_tail(&work->order_list,
700 &workers->prio_order_list);
701 } else {
702 list_add_tail(&work->order_list, &workers->order_list);
703 }
704 spin_unlock(&workers->order_lock);
705 } else {
706 INIT_LIST_HEAD(&work->order_list);
707 }
708
709 spin_lock_irqsave(&worker->lock, flags);
710
711 if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
712 list_add_tail(&work->list, &worker->prio_pending);
713 else
714 list_add_tail(&work->list, &worker->pending);
715 check_busy_worker(worker);
716
717 /*
718 * avoid calling into wake_up_process if this thread has already
719 * been kicked
720 */
721 if (!worker->working)
722 wake = 1;
723 worker->working = 1;
724
725 if (wake)
726 wake_up_process(worker->task);
727 spin_unlock_irqrestore(&worker->lock, flags);
728} 336}
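
The rewrite above retires btrfs's hand-rolled worker pools in favor of thin wrappers over the core kernel workqueue: each __btrfs_workqueue pairs an alloc_workqueue() queue with an ordered_list whose ordered_func/ordered_free callbacks complete strictly in submission order, and a btrfs_workqueue optionally adds a second high-priority queue selected via WORK_HIGH_PRIO_BIT. A hypothetical caller of the new API; the btrfs_* calls are the ones defined above, while the item type and callbacks are invented for illustration:

    #include <linux/slab.h>
    #include "async-thread.h"

    struct my_item {
            struct btrfs_work work;
            int payload;
    };

    static void my_func(struct btrfs_work *w)       /* may run concurrently */
    {
            struct my_item *it = container_of(w, struct my_item, work);

            it->payload *= 2;                       /* stand-in for real work */
    }

    static void my_ordered(struct btrfs_work *w)    /* runs in queue order */
    {
            /* publish results; all earlier items' my_ordered already ran */
    }

    static void my_free(struct btrfs_work *w)
    {
            kfree(container_of(w, struct my_item, work));
    }

    static int submit(struct btrfs_workqueue *wq, int payload)
    {
            struct my_item *it = kzalloc(sizeof(*it), GFP_NOFS);

            if (!it)
                    return -ENOMEM;
            it->payload = payload;
            btrfs_init_work(&it->work, my_func, my_ordered, my_free);
            btrfs_queue_work(wq, &it->work);
            return 0;
    }

The queue itself would come from btrfs_alloc_workqueue("example", 0, max_active, 0) and be torn down with btrfs_destroy_workqueue(); note that ordered_free, not the caller, owns the item once it has been queued.
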
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1f26792683ed..9c6b66d15fb0 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (C) 2007 Oracle. All rights reserved. 2 * Copyright (C) 2007 Oracle. All rights reserved.
3 * Copyright (C) 2014 Fujitsu. All rights reserved.
3 * 4 *
4 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public 6 * modify it under the terms of the GNU General Public
@@ -19,103 +20,35 @@
19#ifndef __BTRFS_ASYNC_THREAD_ 20#ifndef __BTRFS_ASYNC_THREAD_
20#define __BTRFS_ASYNC_THREAD_ 21#define __BTRFS_ASYNC_THREAD_
21 22
22struct btrfs_worker_thread; 23struct btrfs_workqueue;
24/* Internal use only */
25struct __btrfs_workqueue;
26struct btrfs_work;
27typedef void (*btrfs_func_t)(struct btrfs_work *arg);
23 28
24/*
25 * This is similar to a workqueue, but it is meant to spread the operations
26 * across all available cpus instead of just the CPU that was used to
27 * queue the work. There is also some batching introduced to try and
28 * cut down on context switches.
29 *
30 * By default threads are added on demand up to 2 * the number of cpus.
31 * Changing struct btrfs_workers->max_workers is one way to prevent
32 * demand creation of kthreads.
33 *
34 * the basic model of these worker threads is to embed a btrfs_work
35 * structure in your own data struct, and use container_of in a
36 * work function to get back to your data struct.
37 */
38struct btrfs_work { 29struct btrfs_work {
39 /* 30 btrfs_func_t func;
40 * func should be set to the function you want called 31 btrfs_func_t ordered_func;
41 * your work struct is passed as the only arg 32 btrfs_func_t ordered_free;
42 * 33
43 * ordered_func must be set for work sent to an ordered work queue, 34 /* Don't touch things below */
44 * and it is called to complete a given work item in the same 35 struct work_struct normal_work;
45 * order they were sent to the queue. 36 struct list_head ordered_list;
46 */ 37 struct __btrfs_workqueue *wq;
47 void (*func)(struct btrfs_work *work);
48 void (*ordered_func)(struct btrfs_work *work);
49 void (*ordered_free)(struct btrfs_work *work);
50
51 /*
52 * flags should be set to zero. It is used to make sure the
53 * struct is only inserted once into the list.
54 */
55 unsigned long flags; 38 unsigned long flags;
56
57 /* don't touch these */
58 struct btrfs_worker_thread *worker;
59 struct list_head list;
60 struct list_head order_list;
61};
62
63struct btrfs_workers {
64 /* current number of running workers */
65 int num_workers;
66
67 int num_workers_starting;
68
69 /* max number of workers allowed. changed by btrfs_start_workers */
70 int max_workers;
71
72 /* once a worker has this many requests or fewer, it is idle */
73 int idle_thresh;
74
75 /* force completions in the order they were queued */
76 int ordered;
77
78 /* more workers required, but in an interrupt handler */
79 int atomic_start_pending;
80
81 /*
82 * are we allowed to sleep while starting workers or are we required
83 * to start them at a later time? If we can't sleep, this indicates
84 * which queue we need to use to schedule thread creation.
85 */
86 struct btrfs_workers *atomic_worker_start;
87
88 /* list with all the work threads. The workers on the idle thread
89 * may be actively servicing jobs, but they haven't yet hit the
90 * idle thresh limit above.
91 */
92 struct list_head worker_list;
93 struct list_head idle_list;
94
95 /*
96 * when operating in ordered mode, this maintains the list
97 * of work items waiting for completion
98 */
99 struct list_head order_list;
100 struct list_head prio_order_list;
101
102 /* lock for finding the next worker thread to queue on */
103 spinlock_t lock;
104
105 /* lock for the ordered lists */
106 spinlock_t order_lock;
107
108 /* extra name for this worker, used for current->name */
109 char *name;
110
111 int stopping;
112}; 39};
113 40
114void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); 41struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
115int btrfs_start_workers(struct btrfs_workers *workers); 42 int flags,
116void btrfs_stop_workers(struct btrfs_workers *workers); 43 int max_active,
117void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, 44 int thresh);
118 struct btrfs_workers *async_starter); 45void btrfs_init_work(struct btrfs_work *work,
119void btrfs_requeue_work(struct btrfs_work *work); 46 btrfs_func_t func,
120void btrfs_set_work_high_prio(struct btrfs_work *work); 47 btrfs_func_t ordered_func,
48 btrfs_func_t ordered_free);
49void btrfs_queue_work(struct btrfs_workqueue *wq,
50 struct btrfs_work *work);
51void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
52void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
53void btrfs_set_work_high_priority(struct btrfs_work *work);
121#endif 54#endif
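
The thresh fields declared on the workqueue drive a simple feedback loop: thresh_queue_hook() counts each submission, and every thresh/4 executions thresh_exec_hook() steps current_max up when pending exceeds thresh, down when it falls below thresh/2, clamped to [1, max_active]. With the default thresh of 32 that means re-evaluating every 8 completions, growing while more than 32 items wait and shrinking once fewer than 16 remain. The rule restated as a pure function:

    /* Mirrors the adjustment in thresh_exec_hook(); the final line is
     * what clamp_val(cur, 1, cap) expands to. */
    static int next_max_active(int cur, int pending, int thresh, int cap)
    {
            if (pending > thresh)           /* backlog growing: one more worker */
                    cur++;
            if (pending < thresh / 2)       /* backlog draining: one fewer */
                    cur--;
            return cur < 1 ? 1 : (cur > cap ? cap : cur);
    }
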
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index aded3ef3d3d4..10db21fa0926 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -220,7 +220,8 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
220 220
221static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, 221static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
222 struct ulist *parents, struct __prelim_ref *ref, 222 struct ulist *parents, struct __prelim_ref *ref,
223 int level, u64 time_seq, const u64 *extent_item_pos) 223 int level, u64 time_seq, const u64 *extent_item_pos,
224 u64 total_refs)
224{ 225{
225 int ret = 0; 226 int ret = 0;
226 int slot; 227 int slot;
@@ -249,7 +250,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
249 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) 250 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
250 ret = btrfs_next_old_leaf(root, path, time_seq); 251 ret = btrfs_next_old_leaf(root, path, time_seq);
251 252
252 while (!ret && count < ref->count) { 253 while (!ret && count < total_refs) {
253 eb = path->nodes[0]; 254 eb = path->nodes[0];
254 slot = path->slots[0]; 255 slot = path->slots[0];
255 256
@@ -306,7 +307,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
306 struct btrfs_path *path, u64 time_seq, 307 struct btrfs_path *path, u64 time_seq,
307 struct __prelim_ref *ref, 308 struct __prelim_ref *ref,
308 struct ulist *parents, 309 struct ulist *parents,
309 const u64 *extent_item_pos) 310 const u64 *extent_item_pos, u64 total_refs)
310{ 311{
311 struct btrfs_root *root; 312 struct btrfs_root *root;
312 struct btrfs_key root_key; 313 struct btrfs_key root_key;
@@ -329,7 +330,10 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
329 goto out; 330 goto out;
330 } 331 }
331 332
332 root_level = btrfs_old_root_level(root, time_seq); 333 if (path->search_commit_root)
334 root_level = btrfs_header_level(root->commit_root);
335 else
336 root_level = btrfs_old_root_level(root, time_seq);
333 337
334 if (root_level + 1 == level) { 338 if (root_level + 1 == level) {
335 srcu_read_unlock(&fs_info->subvol_srcu, index); 339 srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -361,7 +365,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
361 } 365 }
362 366
363 ret = add_all_parents(root, path, parents, ref, level, time_seq, 367 ret = add_all_parents(root, path, parents, ref, level, time_seq,
364 extent_item_pos); 368 extent_item_pos, total_refs);
365out: 369out:
366 path->lowest_level = 0; 370 path->lowest_level = 0;
367 btrfs_release_path(path); 371 btrfs_release_path(path);
@@ -374,7 +378,7 @@ out:
374static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, 378static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
375 struct btrfs_path *path, u64 time_seq, 379 struct btrfs_path *path, u64 time_seq,
376 struct list_head *head, 380 struct list_head *head,
377 const u64 *extent_item_pos) 381 const u64 *extent_item_pos, u64 total_refs)
378{ 382{
379 int err; 383 int err;
380 int ret = 0; 384 int ret = 0;
@@ -400,7 +404,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
400 if (ref->count == 0) 404 if (ref->count == 0)
401 continue; 405 continue;
402 err = __resolve_indirect_ref(fs_info, path, time_seq, ref, 406 err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
403 parents, extent_item_pos); 407 parents, extent_item_pos,
408 total_refs);
404 /* 409 /*
405 * we can only tolerate ENOENT,otherwise,we should catch error 410 * we can only tolerate ENOENT,otherwise,we should catch error
406 * and return directly. 411 * and return directly.
@@ -557,7 +562,7 @@ static void __merge_refs(struct list_head *head, int mode)
557 * smaller or equal that seq to the list 562 * smaller or equal that seq to the list
558 */ 563 */
559static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, 564static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
560 struct list_head *prefs) 565 struct list_head *prefs, u64 *total_refs)
561{ 566{
562 struct btrfs_delayed_extent_op *extent_op = head->extent_op; 567 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
563 struct rb_node *n = &head->node.rb_node; 568 struct rb_node *n = &head->node.rb_node;
@@ -593,6 +598,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
593 default: 598 default:
594 BUG_ON(1); 599 BUG_ON(1);
595 } 600 }
601 *total_refs += (node->ref_mod * sgn);
596 switch (node->type) { 602 switch (node->type) {
597 case BTRFS_TREE_BLOCK_REF_KEY: { 603 case BTRFS_TREE_BLOCK_REF_KEY: {
598 struct btrfs_delayed_tree_ref *ref; 604 struct btrfs_delayed_tree_ref *ref;
@@ -653,7 +659,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
653 */ 659 */
654static int __add_inline_refs(struct btrfs_fs_info *fs_info, 660static int __add_inline_refs(struct btrfs_fs_info *fs_info,
655 struct btrfs_path *path, u64 bytenr, 661 struct btrfs_path *path, u64 bytenr,
656 int *info_level, struct list_head *prefs) 662 int *info_level, struct list_head *prefs,
663 u64 *total_refs)
657{ 664{
658 int ret = 0; 665 int ret = 0;
659 int slot; 666 int slot;
@@ -677,6 +684,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
677 684
678 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); 685 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
679 flags = btrfs_extent_flags(leaf, ei); 686 flags = btrfs_extent_flags(leaf, ei);
687 *total_refs += btrfs_extent_refs(leaf, ei);
680 btrfs_item_key_to_cpu(leaf, &found_key, slot); 688 btrfs_item_key_to_cpu(leaf, &found_key, slot);
681 689
682 ptr = (unsigned long)(ei + 1); 690 ptr = (unsigned long)(ei + 1);
@@ -859,6 +867,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
859 struct list_head prefs; 867 struct list_head prefs;
860 struct __prelim_ref *ref; 868 struct __prelim_ref *ref;
861 struct extent_inode_elem *eie = NULL; 869 struct extent_inode_elem *eie = NULL;
870 u64 total_refs = 0;
862 871
863 INIT_LIST_HEAD(&prefs); 872 INIT_LIST_HEAD(&prefs);
864 INIT_LIST_HEAD(&prefs_delayed); 873 INIT_LIST_HEAD(&prefs_delayed);
@@ -873,8 +882,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
873 path = btrfs_alloc_path(); 882 path = btrfs_alloc_path();
874 if (!path) 883 if (!path)
875 return -ENOMEM; 884 return -ENOMEM;
876 if (!trans) 885 if (!trans) {
877 path->search_commit_root = 1; 886 path->search_commit_root = 1;
887 path->skip_locking = 1;
888 }
878 889
879 /* 890 /*
880 * grab both a lock on the path and a lock on the delayed ref head. 891 * grab both a lock on the path and a lock on the delayed ref head.
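
When find_parent_nodes() runs without a transaction handle it walks the commit root, and this hunk now also sets skip_locking: the last committed tree cannot change underneath the walker, so extent-buffer locks are pure overhead, and the earlier hunk in __resolve_indirect_ref() reads the level straight from root->commit_root for the same reason. The setup, captured as a hypothetical helper:

    static struct btrfs_path *alloc_backref_path(struct btrfs_trans_handle *trans)
    {
            struct btrfs_path *path = btrfs_alloc_path();

            if (!path)
                    return NULL;
            if (!trans) {
                    path->search_commit_root = 1;   /* walk the committed tree */
                    path->skip_locking = 1;         /* it is immutable; skip locks */
            }
            return path;
    }
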
@@ -915,7 +926,7 @@ again:
915 } 926 }
916 spin_unlock(&delayed_refs->lock); 927 spin_unlock(&delayed_refs->lock);
917 ret = __add_delayed_refs(head, time_seq, 928 ret = __add_delayed_refs(head, time_seq,
918 &prefs_delayed); 929 &prefs_delayed, &total_refs);
919 mutex_unlock(&head->mutex); 930 mutex_unlock(&head->mutex);
920 if (ret) 931 if (ret)
921 goto out; 932 goto out;
@@ -936,7 +947,8 @@ again:
936 (key.type == BTRFS_EXTENT_ITEM_KEY || 947 (key.type == BTRFS_EXTENT_ITEM_KEY ||
937 key.type == BTRFS_METADATA_ITEM_KEY)) { 948 key.type == BTRFS_METADATA_ITEM_KEY)) {
938 ret = __add_inline_refs(fs_info, path, bytenr, 949 ret = __add_inline_refs(fs_info, path, bytenr,
939 &info_level, &prefs); 950 &info_level, &prefs,
951 &total_refs);
940 if (ret) 952 if (ret)
941 goto out; 953 goto out;
942 ret = __add_keyed_refs(fs_info, path, bytenr, 954 ret = __add_keyed_refs(fs_info, path, bytenr,
@@ -956,7 +968,7 @@ again:
956 __merge_refs(&prefs, 1); 968 __merge_refs(&prefs, 1);
957 969
958 ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, 970 ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
959 extent_item_pos); 971 extent_item_pos, total_refs);
960 if (ret) 972 if (ret)
961 goto out; 973 goto out;
962 974
@@ -965,7 +977,7 @@ again:
965 while (!list_empty(&prefs)) { 977 while (!list_empty(&prefs)) {
966 ref = list_first_entry(&prefs, struct __prelim_ref, list); 978 ref = list_first_entry(&prefs, struct __prelim_ref, list);
967 WARN_ON(ref->count < 0); 979 WARN_ON(ref->count < 0);
968 if (ref->count && ref->root_id && ref->parent == 0) { 980 if (roots && ref->count && ref->root_id && ref->parent == 0) {
969 /* no parent == root of tree */ 981 /* no parent == root of tree */
970 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); 982 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
971 if (ret < 0) 983 if (ret < 0)
@@ -1061,22 +1073,14 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
1061 u64 time_seq, struct ulist **leafs, 1073 u64 time_seq, struct ulist **leafs,
1062 const u64 *extent_item_pos) 1074 const u64 *extent_item_pos)
1063{ 1075{
1064 struct ulist *tmp;
1065 int ret; 1076 int ret;
1066 1077
1067 tmp = ulist_alloc(GFP_NOFS);
1068 if (!tmp)
1069 return -ENOMEM;
1070 *leafs = ulist_alloc(GFP_NOFS); 1078 *leafs = ulist_alloc(GFP_NOFS);
1071 if (!*leafs) { 1079 if (!*leafs)
1072 ulist_free(tmp);
1073 return -ENOMEM; 1080 return -ENOMEM;
1074 }
1075 1081
1076 ret = find_parent_nodes(trans, fs_info, bytenr, 1082 ret = find_parent_nodes(trans, fs_info, bytenr,
1077 time_seq, *leafs, tmp, extent_item_pos); 1083 time_seq, *leafs, NULL, extent_item_pos);
1078 ulist_free(tmp);
1079
1080 if (ret < 0 && ret != -ENOENT) { 1084 if (ret < 0 && ret != -ENOENT) {
1081 free_leaf_list(*leafs); 1085 free_leaf_list(*leafs);
1082 return ret; 1086 return ret;
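
The hunk above drops the throwaway ulist: find_parent_nodes() now treats a NULL roots pointer as "the caller does not want root IDs", so btrfs_find_all_leafs() no longer allocates and frees a scratch list on every call. A minimal userspace sketch of that optional-output-parameter pattern (names are illustrative, not the btrfs API):

    #include <stdio.h>

    struct list { int vals[8]; int n; };

    /* Always fill 'leafs'; fill 'roots' only if the caller asked for it. */
    static int find_refs(struct list *leafs, struct list *roots)
    {
            leafs->vals[leafs->n++] = 42;
            if (roots)                      /* optional output: skip when NULL */
                    roots->vals[roots->n++] = 5;
            return 0;
    }

    int main(void)
    {
            struct list leafs = {0};

            find_refs(&leafs, NULL);        /* no scratch allocation needed */
            printf("%d leafs\n", leafs.n);
            return 0;
    }
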
@@ -1098,9 +1102,9 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
1098 * 1102 *
1099 * returns 0 on success, < 0 on error. 1103 * returns 0 on success, < 0 on error.
1100 */ 1104 */
1101int btrfs_find_all_roots(struct btrfs_trans_handle *trans, 1105static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
1102 struct btrfs_fs_info *fs_info, u64 bytenr, 1106 struct btrfs_fs_info *fs_info, u64 bytenr,
1103 u64 time_seq, struct ulist **roots) 1107 u64 time_seq, struct ulist **roots)
1104{ 1108{
1105 struct ulist *tmp; 1109 struct ulist *tmp;
1106 struct ulist_node *node = NULL; 1110 struct ulist_node *node = NULL;
@@ -1136,6 +1140,20 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
1136 return 0; 1140 return 0;
1137} 1141}
1138 1142
1143int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
1144 struct btrfs_fs_info *fs_info, u64 bytenr,
1145 u64 time_seq, struct ulist **roots)
1146{
1147 int ret;
1148
1149 if (!trans)
1150 down_read(&fs_info->commit_root_sem);
1151 ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots);
1152 if (!trans)
1153 up_read(&fs_info->commit_root_sem);
1154 return ret;
1155}
1156
1139/* 1157/*
1140 * this makes the path point to (inum INODE_ITEM ioff) 1158 * this makes the path point to (inum INODE_ITEM ioff)
1141 */ 1159 */
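
With the hunk above, btrfs_find_all_roots() becomes a thin wrapper that takes fs_info->commit_root_sem for reading whenever no transaction pins the commit roots, while the unlocked __btrfs_find_all_roots() stays available for callers that already hold the semaphore. A sketch of the locked/unlocked pairing, assuming a pthread rwlock stands in for the kernel rw_semaphore:

    #include <pthread.h>

    static pthread_rwlock_t commit_root_sem = PTHREAD_RWLOCK_INITIALIZER;

    static int __find_all_roots(void *trans)
    {
            (void)trans;    /* walk commit roots; caller keeps them stable */
            return 0;
    }

    static int find_all_roots(void *trans)
    {
            int ret;

            if (!trans)             /* nothing pins the roots: lock them */
                    pthread_rwlock_rdlock(&commit_root_sem);
            ret = __find_all_roots(trans);
            if (!trans)
                    pthread_rwlock_unlock(&commit_root_sem);
            return ret;
    }
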
@@ -1333,38 +1351,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
1333 if (ret < 0) 1351 if (ret < 0)
1334 return ret; 1352 return ret;
1335 1353
1336 while (1) { 1354 ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
1337 u32 nritems; 1355 if (ret) {
1338 if (path->slots[0] == 0) { 1356 if (ret > 0)
1339 btrfs_set_path_blocking(path); 1357 ret = -ENOENT;
1340 ret = btrfs_prev_leaf(fs_info->extent_root, path); 1358 return ret;
1341 if (ret != 0) {
1342 if (ret > 0) {
1343 pr_debug("logical %llu is not within "
1344 "any extent\n", logical);
1345 ret = -ENOENT;
1346 }
1347 return ret;
1348 }
1349 } else {
1350 path->slots[0]--;
1351 }
1352 nritems = btrfs_header_nritems(path->nodes[0]);
1353 if (nritems == 0) {
1354 pr_debug("logical %llu is not within any extent\n",
1355 logical);
1356 return -ENOENT;
1357 }
1358 if (path->slots[0] == nritems)
1359 path->slots[0]--;
1360
1361 btrfs_item_key_to_cpu(path->nodes[0], found_key,
1362 path->slots[0]);
1363 if (found_key->type == BTRFS_EXTENT_ITEM_KEY ||
1364 found_key->type == BTRFS_METADATA_ITEM_KEY)
1365 break;
1366 } 1359 }
1367 1360 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
1368 if (found_key->type == BTRFS_METADATA_ITEM_KEY) 1361 if (found_key->type == BTRFS_METADATA_ITEM_KEY)
1369 size = fs_info->extent_root->leafsize; 1362 size = fs_info->extent_root->leafsize;
1370 else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) 1363 else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
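
The open-coded backwards leaf walk in extent_from_logical() is replaced by btrfs_previous_extent_item(), which appears to follow the usual btree-helper convention: 0 when it lands on an extent or metadata item, greater than 0 when none exists before the slot, negative on error. The caller then only has to translate "not found" into -ENOENT. A hedged toy sketch of that return-code contract (the data model is invented for illustration):

    /* contract: 0 = found, >0 = not found, <0 = error */
    static int previous_extent_item(int *slot)
    {
            while (*slot > 0) {
                    (*slot)--;
                    if (*slot % 2 == 0)     /* pretend even slots hold extent items */
                            return 0;
            }
            return 1;                       /* walked off the front: not found */
    }

    static int extent_from_logical(int slot)
    {
            int ret = previous_extent_item(&slot);

            if (ret)
                    return ret > 0 ? -2 : ret;  /* map "not found" to -ENOENT (-2) */
            return 0;
    }
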
@@ -1540,6 +1533,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1540 if (IS_ERR(trans)) 1533 if (IS_ERR(trans))
1541 return PTR_ERR(trans); 1534 return PTR_ERR(trans);
1542 btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); 1535 btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
1536 } else {
1537 down_read(&fs_info->commit_root_sem);
1543 } 1538 }
1544 1539
1545 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, 1540 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
@@ -1550,8 +1545,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
1550 1545
1551 ULIST_ITER_INIT(&ref_uiter); 1546 ULIST_ITER_INIT(&ref_uiter);
1552 while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { 1547 while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
1553 ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, 1548 ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val,
1554 tree_mod_seq_elem.seq, &roots); 1549 tree_mod_seq_elem.seq, &roots);
1555 if (ret) 1550 if (ret)
1556 break; 1551 break;
1557 ULIST_ITER_INIT(&root_uiter); 1552 ULIST_ITER_INIT(&root_uiter);
@@ -1573,6 +1568,8 @@ out:
1573 if (!search_commit_root) { 1568 if (!search_commit_root) {
1574 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); 1569 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
1575 btrfs_end_transaction(trans, fs_info->extent_root); 1570 btrfs_end_transaction(trans, fs_info->extent_root);
1571 } else {
1572 up_read(&fs_info->commit_root_sem);
1576 } 1573 }
1577 1574
1578 return ret; 1575 return ret;
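
Note the pairing in iterate_extent_inodes(): when it runs without a transaction it now takes commit_root_sem itself, and therefore calls the unlocked __btrfs_find_all_roots() inside the loop, since calling the locking wrapper with the semaphore already held would acquire it recursively. The matching up_read() sits on the common exit path so every early bail-out still releases it. A sketch of that balanced acquire/release shape (stub names are illustrative):

    #include <pthread.h>

    static pthread_rwlock_t commit_root_sem = PTHREAD_RWLOCK_INITIALIZER;

    static int step_one(void) { return 0; }
    static int step_two(void) { return 0; }

    static int iterate(int have_trans)
    {
            int ret;

            if (!have_trans)        /* no transaction: pin the commit roots */
                    pthread_rwlock_rdlock(&commit_root_sem);

            ret = step_one();
            if (ret)
                    goto out;       /* every early exit funnels through out: */
            ret = step_two();
    out:
            if (!have_trans)
                    pthread_rwlock_unlock(&commit_root_sem);
            return ret;
    }
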
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 8fed2125689e..c9a24444ec9a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -109,14 +109,17 @@ struct btrfs_inode {
109 u64 last_trans; 109 u64 last_trans;
110 110
111 /* 111 /*
112 * log transid when this inode was last modified 112 * transid that last logged this inode
113 */ 113 */
114 u64 last_sub_trans; 114 u64 logged_trans;
115 115
116 /* 116 /*
117 * transid that last logged this inode 117 * log transid when this inode was last modified
118 */ 118 */
119 u64 logged_trans; 119 int last_sub_trans;
120
121 /* a local copy of root's last_log_commit */
122 int last_log_commit;
120 123
121 /* total number of bytes pending delalloc, used by stat to calc the 124 /* total number of bytes pending delalloc, used by stat to calc the
122 * real block usage of the file 125 * real block usage of the file
@@ -155,9 +158,6 @@ struct btrfs_inode {
155 /* flags field from the on disk inode */ 158 /* flags field from the on disk inode */
156 u32 flags; 159 u32 flags;
157 160
158 /* a local copy of root's last_log_commit */
159 unsigned long last_log_commit;
160
161 /* 161 /*
162 * Counters to keep track of the number of extent item's we may use due 162 * Counters to keep track of the number of extent item's we may use due
163 * to delalloc and such. outstanding_extents is the number of extent 163 * to delalloc and such. outstanding_extents is the number of extent
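
In the btrfs_inode hunk above, logged_trans keeps its u64 while last_sub_trans and an inode-local copy of the root's last_log_commit become plain ints and move next to each other: they only ever hold per-root log transids (which become int in the btrfs_root hunk further down) and are compared against each other on the fsync path. A hedged sketch of the check these two fields feed (field names mirror the struct; the helper name is illustrative):

    struct inode_log_state {
            int last_sub_trans;     /* log transid of the last modification */
            int last_log_commit;    /* local copy of the root's last_log_commit */
    };

    /* fsync can skip the log-tree work if the inode is already logged */
    static int inode_in_log(const struct inode_log_state *s)
    {
            return s->last_sub_trans <= s->last_log_commit;
    }
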
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b01fb6c527e3..d43c544d3b68 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -472,7 +472,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
472 rcu_read_lock(); 472 rcu_read_lock();
473 page = radix_tree_lookup(&mapping->page_tree, pg_index); 473 page = radix_tree_lookup(&mapping->page_tree, pg_index);
474 rcu_read_unlock(); 474 rcu_read_unlock();
475 if (page) { 475 if (page && !radix_tree_exceptional_entry(page)) {
476 misses++; 476 misses++;
477 if (misses > 4) 477 if (misses > 4)
478 break; 478 break;
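
The compression fix above guards against shadow entries: radix_tree_lookup() on the page cache can return an "exceptional" entry that only records eviction history and is not a struct page, so counting it as a cached page would wrongly cut readahead short. Exceptional entries are distinguished by a tag in the low bits of the pointer. A minimal sketch of that tagged-pointer test (the constant mirrors the kernel's but is illustrative here):

    #include <stdint.h>
    #include <stdio.h>

    #define ENTRY_EXCEPTIONAL 0x2UL     /* low tag bit marks a non-page entry */

    static int is_exceptional(const void *entry)
    {
            return ((uintptr_t)entry & ENTRY_EXCEPTIONAL) != 0;
    }

    int main(void)
    {
            void *page = (void *)0x1000;    /* aligned pointer: tag bits clear */
            void *shadow = (void *)(0x2000 | ENTRY_EXCEPTIONAL);

            printf("%d %d\n", is_exceptional(page), is_exceptional(shadow));
            return 0;
    }
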
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cbd3a7d6fa68..1bcfcdb23cf4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2769,9 +2769,13 @@ again:
2769 * the commit roots are read only 2769 * the commit roots are read only
2770 * so we always do read locks 2770 * so we always do read locks
2771 */ 2771 */
2772 if (p->need_commit_sem)
2773 down_read(&root->fs_info->commit_root_sem);
2772 b = root->commit_root; 2774 b = root->commit_root;
2773 extent_buffer_get(b); 2775 extent_buffer_get(b);
2774 level = btrfs_header_level(b); 2776 level = btrfs_header_level(b);
2777 if (p->need_commit_sem)
2778 up_read(&root->fs_info->commit_root_sem);
2775 if (!p->skip_locking) 2779 if (!p->skip_locking)
2776 btrfs_tree_read_lock(b); 2780 btrfs_tree_read_lock(b);
2777 } else { 2781 } else {
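
The new path->need_commit_sem bit makes btrfs_search_slot() hold commit_root_sem only long enough to read the commit-root pointer and take a reference on the buffer; the extra reference then keeps the buffer alive after the unlock even if a transaction commit swaps the root. The shape is "lock, grab a reference, unlock" rather than holding the lock across the whole search. A sketch under those assumptions:

    #include <pthread.h>

    struct buffer { int refs; };

    static struct buffer commit_root_buf = { .refs = 1 };
    static pthread_rwlock_t commit_root_sem = PTHREAD_RWLOCK_INITIALIZER;

    static struct buffer *grab_commit_root(void)
    {
            struct buffer *b;

            pthread_rwlock_rdlock(&commit_root_sem);
            b = &commit_root_buf;   /* read the root pointer under the lock */
            b->refs++;              /* the ref keeps it valid after unlock */
            pthread_rwlock_unlock(&commit_root_sem);
            return b;
    }
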
@@ -5360,7 +5364,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5360{ 5364{
5361 int ret; 5365 int ret;
5362 int cmp; 5366 int cmp;
5363 struct btrfs_trans_handle *trans = NULL;
5364 struct btrfs_path *left_path = NULL; 5367 struct btrfs_path *left_path = NULL;
5365 struct btrfs_path *right_path = NULL; 5368 struct btrfs_path *right_path = NULL;
5366 struct btrfs_key left_key; 5369 struct btrfs_key left_key;
@@ -5376,9 +5379,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5376 int advance_right; 5379 int advance_right;
5377 u64 left_blockptr; 5380 u64 left_blockptr;
5378 u64 right_blockptr; 5381 u64 right_blockptr;
5379 u64 left_start_ctransid; 5382 u64 left_gen;
5380 u64 right_start_ctransid; 5383 u64 right_gen;
5381 u64 ctransid;
5382 5384
5383 left_path = btrfs_alloc_path(); 5385 left_path = btrfs_alloc_path();
5384 if (!left_path) { 5386 if (!left_path) {
@@ -5402,21 +5404,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5402 right_path->search_commit_root = 1; 5404 right_path->search_commit_root = 1;
5403 right_path->skip_locking = 1; 5405 right_path->skip_locking = 1;
5404 5406
5405 spin_lock(&left_root->root_item_lock);
5406 left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5407 spin_unlock(&left_root->root_item_lock);
5408
5409 spin_lock(&right_root->root_item_lock);
5410 right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5411 spin_unlock(&right_root->root_item_lock);
5412
5413 trans = btrfs_join_transaction(left_root);
5414 if (IS_ERR(trans)) {
5415 ret = PTR_ERR(trans);
5416 trans = NULL;
5417 goto out;
5418 }
5419
5420 /* 5407 /*
5421 * Strategy: Go to the first items of both trees. Then do 5408 * Strategy: Go to the first items of both trees. Then do
5422 * 5409 *
@@ -5453,6 +5440,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5453 * the right if possible or go up and right. 5440 * the right if possible or go up and right.
5454 */ 5441 */
5455 5442
5443 down_read(&left_root->fs_info->commit_root_sem);
5456 left_level = btrfs_header_level(left_root->commit_root); 5444 left_level = btrfs_header_level(left_root->commit_root);
5457 left_root_level = left_level; 5445 left_root_level = left_level;
5458 left_path->nodes[left_level] = left_root->commit_root; 5446 left_path->nodes[left_level] = left_root->commit_root;
@@ -5462,6 +5450,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5462 right_root_level = right_level; 5450 right_root_level = right_level;
5463 right_path->nodes[right_level] = right_root->commit_root; 5451 right_path->nodes[right_level] = right_root->commit_root;
5464 extent_buffer_get(right_path->nodes[right_level]); 5452 extent_buffer_get(right_path->nodes[right_level]);
5453 up_read(&left_root->fs_info->commit_root_sem);
5465 5454
5466 if (left_level == 0) 5455 if (left_level == 0)
5467 btrfs_item_key_to_cpu(left_path->nodes[left_level], 5456 btrfs_item_key_to_cpu(left_path->nodes[left_level],
@@ -5480,67 +5469,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5480 advance_left = advance_right = 0; 5469 advance_left = advance_right = 0;
5481 5470
5482 while (1) { 5471 while (1) {
5483 /*
5484 * We need to make sure the transaction does not get committed
5485 * while we do anything on commit roots. This means, we need to
5486 * join and leave transactions for every item that we process.
5487 */
5488 if (trans && btrfs_should_end_transaction(trans, left_root)) {
5489 btrfs_release_path(left_path);
5490 btrfs_release_path(right_path);
5491
5492 ret = btrfs_end_transaction(trans, left_root);
5493 trans = NULL;
5494 if (ret < 0)
5495 goto out;
5496 }
5497 /* now rejoin the transaction */
5498 if (!trans) {
5499 trans = btrfs_join_transaction(left_root);
5500 if (IS_ERR(trans)) {
5501 ret = PTR_ERR(trans);
5502 trans = NULL;
5503 goto out;
5504 }
5505
5506 spin_lock(&left_root->root_item_lock);
5507 ctransid = btrfs_root_ctransid(&left_root->root_item);
5508 spin_unlock(&left_root->root_item_lock);
5509 if (ctransid != left_start_ctransid)
5510 left_start_ctransid = 0;
5511
5512 spin_lock(&right_root->root_item_lock);
5513 ctransid = btrfs_root_ctransid(&right_root->root_item);
5514 spin_unlock(&right_root->root_item_lock);
5515 if (ctransid != right_start_ctransid)
5516 right_start_ctransid = 0;
5517
5518 if (!left_start_ctransid || !right_start_ctransid) {
5519 WARN(1, KERN_WARNING
5520 "BTRFS: btrfs_compare_tree detected "
5521 "a change in one of the trees while "
5522 "iterating. This is probably a "
5523 "bug.\n");
5524 ret = -EIO;
5525 goto out;
5526 }
5527
5528 /*
5529 * the commit root may have changed, so start again
5530 * where we stopped
5531 */
5532 left_path->lowest_level = left_level;
5533 right_path->lowest_level = right_level;
5534 ret = btrfs_search_slot(NULL, left_root,
5535 &left_key, left_path, 0, 0);
5536 if (ret < 0)
5537 goto out;
5538 ret = btrfs_search_slot(NULL, right_root,
5539 &right_key, right_path, 0, 0);
5540 if (ret < 0)
5541 goto out;
5542 }
5543
5544 if (advance_left && !left_end_reached) { 5472 if (advance_left && !left_end_reached) {
5545 ret = tree_advance(left_root, left_path, &left_level, 5473 ret = tree_advance(left_root, left_path, &left_level,
5546 left_root_level, 5474 left_root_level,
@@ -5640,7 +5568,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5640 right_blockptr = btrfs_node_blockptr( 5568 right_blockptr = btrfs_node_blockptr(
5641 right_path->nodes[right_level], 5569 right_path->nodes[right_level],
5642 right_path->slots[right_level]); 5570 right_path->slots[right_level]);
5643 if (left_blockptr == right_blockptr) { 5571 left_gen = btrfs_node_ptr_generation(
5572 left_path->nodes[left_level],
5573 left_path->slots[left_level]);
5574 right_gen = btrfs_node_ptr_generation(
5575 right_path->nodes[right_level],
5576 right_path->slots[right_level]);
5577 if (left_blockptr == right_blockptr &&
5578 left_gen == right_gen) {
5644 /* 5579 /*
5645 * As we're on a shared block, don't 5580 * As we're on a shared block, don't
5646 * allow to go deeper. 5581 * allow to go deeper.
@@ -5663,14 +5598,6 @@ out:
5663 btrfs_free_path(left_path); 5598 btrfs_free_path(left_path);
5664 btrfs_free_path(right_path); 5599 btrfs_free_path(right_path);
5665 kfree(tmp_buf); 5600 kfree(tmp_buf);
5666
5667 if (trans) {
5668 if (!ret)
5669 ret = btrfs_end_transaction(trans, left_root);
5670 else
5671 btrfs_end_transaction(trans, left_root);
5672 }
5673
5674 return ret; 5601 return ret;
5675} 5602}
5676 5603
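
Taken together, the ctree.c hunks above let btrfs_compare_trees() stop joining a transaction for every item just to keep the commit roots from changing underneath it: commit_root_sem is held while the two root nodes are grabbed, and a shared subtree is now detected by comparing both the block pointer and the pointer generation, since a freed block can be reallocated at the same bytenr with different contents. A sketch of the two-field identity check:

    struct node_ptr { unsigned long long blockptr, gen; };

    /* the same bytenr alone is not enough: the block may have been reallocated */
    static int same_shared_block(struct node_ptr a, struct node_ptr b)
    {
            return a.blockptr == b.blockptr && a.gen == b.gen;
    }
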
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c1a42ca519f..ba6b88528dc7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -351,6 +351,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
351#define BTRFS_FS_STATE_ERROR 0 351#define BTRFS_FS_STATE_ERROR 0
352#define BTRFS_FS_STATE_REMOUNTING 1 352#define BTRFS_FS_STATE_REMOUNTING 1
353#define BTRFS_FS_STATE_TRANS_ABORTED 2 353#define BTRFS_FS_STATE_TRANS_ABORTED 2
354#define BTRFS_FS_STATE_DEV_REPLACING 3
354 355
355/* Super block flags */ 356/* Super block flags */
356/* Errors detected */ 357/* Errors detected */
@@ -608,6 +609,7 @@ struct btrfs_path {
608 unsigned int skip_locking:1; 609 unsigned int skip_locking:1;
609 unsigned int leave_spinning:1; 610 unsigned int leave_spinning:1;
610 unsigned int search_commit_root:1; 611 unsigned int search_commit_root:1;
612 unsigned int need_commit_sem:1;
611}; 613};
612 614
613/* 615/*
@@ -985,7 +987,8 @@ struct btrfs_dev_replace_item {
985#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) 987#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
986#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) 988#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
987#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) 989#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
988#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE 990#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
991 BTRFS_SPACE_INFO_GLOBAL_RSV)
989 992
990enum btrfs_raid_types { 993enum btrfs_raid_types {
991 BTRFS_RAID_RAID10, 994 BTRFS_RAID_RAID10,
@@ -1017,6 +1020,12 @@ enum btrfs_raid_types {
1017 */ 1020 */
1018#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) 1021#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
1019 1022
1023/*
1024 * A fake block group type that is used to communicate global block reserve
1025 * size to userspace via the SPACE_INFO ioctl.
1026 */
1027#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49)
1028
1020#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ 1029#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
1021 BTRFS_AVAIL_ALLOC_BIT_SINGLE) 1030 BTRFS_AVAIL_ALLOC_BIT_SINGLE)
1022 1031
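
BTRFS_SPACE_INFO_GLOBAL_RSV is a fake block-group flag (bit 49, above the real profile bits) that exists only so the SPACE_INFO ioctl can report the global block reserve as one more space-info row; folding it into BTRFS_BLOCK_GROUP_RESERVED keeps it out of on-disk flags. A hedged userspace sketch of testing the bit in an ioctl result (the struct layout here is abbreviated and illustrative):

    #include <stdio.h>

    #define SPACE_INFO_GLOBAL_RSV (1ULL << 49)  /* mirrors the kernel constant */

    struct space_row { unsigned long long flags, total, used; };

    static void print_row(const struct space_row *r)
    {
            if (r->flags & SPACE_INFO_GLOBAL_RSV)
                    printf("GlobalReserve: total=%llu used=%llu\n",
                           r->total, r->used);
    }

    int main(void)
    {
            struct space_row r = { SPACE_INFO_GLOBAL_RSV, 512 << 20, 0 };

            print_row(&r);
            return 0;
    }
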
@@ -1439,7 +1448,7 @@ struct btrfs_fs_info {
1439 */ 1448 */
1440 struct mutex ordered_extent_flush_mutex; 1449 struct mutex ordered_extent_flush_mutex;
1441 1450
1442 struct rw_semaphore extent_commit_sem; 1451 struct rw_semaphore commit_root_sem;
1443 1452
1444 struct rw_semaphore cleanup_work_sem; 1453 struct rw_semaphore cleanup_work_sem;
1445 1454
@@ -1489,6 +1498,7 @@ struct btrfs_fs_info {
1489 */ 1498 */
1490 struct list_head ordered_roots; 1499 struct list_head ordered_roots;
1491 1500
1501 struct mutex delalloc_root_mutex;
1492 spinlock_t delalloc_root_lock; 1502 spinlock_t delalloc_root_lock;
1493 /* all fs/file tree roots that have delalloc inodes. */ 1503 /* all fs/file tree roots that have delalloc inodes. */
1494 struct list_head delalloc_roots; 1504 struct list_head delalloc_roots;
@@ -1503,28 +1513,27 @@ struct btrfs_fs_info {
1503 * A third pool does submit_bio to avoid deadlocking with the other 1513 * A third pool does submit_bio to avoid deadlocking with the other
1504 * two 1514 * two
1505 */ 1515 */
1506 struct btrfs_workers generic_worker; 1516 struct btrfs_workqueue *workers;
1507 struct btrfs_workers workers; 1517 struct btrfs_workqueue *delalloc_workers;
1508 struct btrfs_workers delalloc_workers; 1518 struct btrfs_workqueue *flush_workers;
1509 struct btrfs_workers flush_workers; 1519 struct btrfs_workqueue *endio_workers;
1510 struct btrfs_workers endio_workers; 1520 struct btrfs_workqueue *endio_meta_workers;
1511 struct btrfs_workers endio_meta_workers; 1521 struct btrfs_workqueue *endio_raid56_workers;
1512 struct btrfs_workers endio_raid56_workers; 1522 struct btrfs_workqueue *rmw_workers;
1513 struct btrfs_workers rmw_workers; 1523 struct btrfs_workqueue *endio_meta_write_workers;
1514 struct btrfs_workers endio_meta_write_workers; 1524 struct btrfs_workqueue *endio_write_workers;
1515 struct btrfs_workers endio_write_workers; 1525 struct btrfs_workqueue *endio_freespace_worker;
1516 struct btrfs_workers endio_freespace_worker; 1526 struct btrfs_workqueue *submit_workers;
1517 struct btrfs_workers submit_workers; 1527 struct btrfs_workqueue *caching_workers;
1518 struct btrfs_workers caching_workers; 1528 struct btrfs_workqueue *readahead_workers;
1519 struct btrfs_workers readahead_workers;
1520 1529
1521 /* 1530 /*
1522 * fixup workers take dirty pages that didn't properly go through 1531 * fixup workers take dirty pages that didn't properly go through
1523 * the cow mechanism and make them safe to write. It happens 1532 * the cow mechanism and make them safe to write. It happens
1524 * for the sys_munmap function call path 1533 * for the sys_munmap function call path
1525 */ 1534 */
1526 struct btrfs_workers fixup_workers; 1535 struct btrfs_workqueue *fixup_workers;
1527 struct btrfs_workers delayed_workers; 1536 struct btrfs_workqueue *delayed_workers;
1528 struct task_struct *transaction_kthread; 1537 struct task_struct *transaction_kthread;
1529 struct task_struct *cleaner_kthread; 1538 struct task_struct *cleaner_kthread;
1530 int thread_pool_size; 1539 int thread_pool_size;
@@ -1604,9 +1613,9 @@ struct btrfs_fs_info {
1604 atomic_t scrub_cancel_req; 1613 atomic_t scrub_cancel_req;
1605 wait_queue_head_t scrub_pause_wait; 1614 wait_queue_head_t scrub_pause_wait;
1606 int scrub_workers_refcnt; 1615 int scrub_workers_refcnt;
1607 struct btrfs_workers scrub_workers; 1616 struct btrfs_workqueue *scrub_workers;
1608 struct btrfs_workers scrub_wr_completion_workers; 1617 struct btrfs_workqueue *scrub_wr_completion_workers;
1609 struct btrfs_workers scrub_nocow_workers; 1618 struct btrfs_workqueue *scrub_nocow_workers;
1610 1619
1611#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1620#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1612 u32 check_integrity_print_mask; 1621 u32 check_integrity_print_mask;
@@ -1647,7 +1656,7 @@ struct btrfs_fs_info {
1647 /* qgroup rescan items */ 1656 /* qgroup rescan items */
1648 struct mutex qgroup_rescan_lock; /* protects the progress item */ 1657 struct mutex qgroup_rescan_lock; /* protects the progress item */
1649 struct btrfs_key qgroup_rescan_progress; 1658 struct btrfs_key qgroup_rescan_progress;
1650 struct btrfs_workers qgroup_rescan_workers; 1659 struct btrfs_workqueue *qgroup_rescan_workers;
1651 struct completion qgroup_rescan_completion; 1660 struct completion qgroup_rescan_completion;
1652 struct btrfs_work qgroup_rescan_work; 1661 struct btrfs_work qgroup_rescan_work;
1653 1662
@@ -1674,10 +1683,18 @@ struct btrfs_fs_info {
1674 1683
1675 atomic_t mutually_exclusive_operation_running; 1684 atomic_t mutually_exclusive_operation_running;
1676 1685
1686 struct percpu_counter bio_counter;
1687 wait_queue_head_t replace_wait;
1688
1677 struct semaphore uuid_tree_rescan_sem; 1689 struct semaphore uuid_tree_rescan_sem;
1678 unsigned int update_uuid_tree_gen:1; 1690 unsigned int update_uuid_tree_gen:1;
1679}; 1691};
1680 1692
1693struct btrfs_subvolume_writers {
1694 struct percpu_counter counter;
1695 wait_queue_head_t wait;
1696};
1697
1681/* 1698/*
1682 * in ram representation of the tree. extent_root is used for all allocations 1699 * in ram representation of the tree. extent_root is used for all allocations
1683 * and for the extent tree extent_root root. 1700 * and for the extent tree extent_root root.
@@ -1702,7 +1719,6 @@ struct btrfs_root {
1702 struct btrfs_block_rsv *block_rsv; 1719 struct btrfs_block_rsv *block_rsv;
1703 1720
1704 /* free ino cache stuff */ 1721 /* free ino cache stuff */
1705 struct mutex fs_commit_mutex;
1706 struct btrfs_free_space_ctl *free_ino_ctl; 1722 struct btrfs_free_space_ctl *free_ino_ctl;
1707 enum btrfs_caching_type cached; 1723 enum btrfs_caching_type cached;
1708 spinlock_t cache_lock; 1724 spinlock_t cache_lock;
@@ -1714,11 +1730,15 @@ struct btrfs_root {
1714 struct mutex log_mutex; 1730 struct mutex log_mutex;
1715 wait_queue_head_t log_writer_wait; 1731 wait_queue_head_t log_writer_wait;
1716 wait_queue_head_t log_commit_wait[2]; 1732 wait_queue_head_t log_commit_wait[2];
1733 struct list_head log_ctxs[2];
1717 atomic_t log_writers; 1734 atomic_t log_writers;
1718 atomic_t log_commit[2]; 1735 atomic_t log_commit[2];
1719 atomic_t log_batch; 1736 atomic_t log_batch;
1720 unsigned long log_transid; 1737 int log_transid;
1721 unsigned long last_log_commit; 1738 /* No matter the commit succeeds or not*/
1739 int log_transid_committed;
1740 /* Just be updated when the commit succeeds. */
1741 int last_log_commit;
1722 pid_t log_start_pid; 1742 pid_t log_start_pid;
1723 bool log_multiple_pids; 1743 bool log_multiple_pids;
1724 1744
@@ -1793,6 +1813,7 @@ struct btrfs_root {
1793 spinlock_t root_item_lock; 1813 spinlock_t root_item_lock;
1794 atomic_t refs; 1814 atomic_t refs;
1795 1815
1816 struct mutex delalloc_mutex;
1796 spinlock_t delalloc_lock; 1817 spinlock_t delalloc_lock;
1797 /* 1818 /*
1798 * all of the inodes that have delalloc bytes. It is possible for 1819 * all of the inodes that have delalloc bytes. It is possible for
@@ -1802,6 +1823,8 @@ struct btrfs_root {
1802 struct list_head delalloc_inodes; 1823 struct list_head delalloc_inodes;
1803 struct list_head delalloc_root; 1824 struct list_head delalloc_root;
1804 u64 nr_delalloc_inodes; 1825 u64 nr_delalloc_inodes;
1826
1827 struct mutex ordered_extent_mutex;
1805 /* 1828 /*
1806 * this is used by the balancing code to wait for all the pending 1829 * this is used by the balancing code to wait for all the pending
1807 * ordered extents 1830 * ordered extents
@@ -1822,6 +1845,8 @@ struct btrfs_root {
1822 * manipulation with the read-only status via SUBVOL_SETFLAGS 1845 * manipulation with the read-only status via SUBVOL_SETFLAGS
1823 */ 1846 */
1824 int send_in_progress; 1847 int send_in_progress;
1848 struct btrfs_subvolume_writers *subv_writers;
1849 atomic_t will_be_snapshoted;
1825}; 1850};
1826 1851
1827struct btrfs_ioctl_defrag_range_args { 1852struct btrfs_ioctl_defrag_range_args {
@@ -2033,6 +2058,20 @@ struct btrfs_ioctl_defrag_range_args {
2033#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) 2058#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
2034#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ 2059#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
2035 BTRFS_MOUNT_##opt) 2060 BTRFS_MOUNT_##opt)
2061#define btrfs_set_and_info(root, opt, fmt, args...) \
2062{ \
2063 if (!btrfs_test_opt(root, opt)) \
2064 btrfs_info(root->fs_info, fmt, ##args); \
2065 btrfs_set_opt(root->fs_info->mount_opt, opt); \
2066}
2067
2068#define btrfs_clear_and_info(root, opt, fmt, args...) \
2069{ \
2070 if (btrfs_test_opt(root, opt)) \
2071 btrfs_info(root->fs_info, fmt, ##args); \
2072 btrfs_clear_opt(root->fs_info->mount_opt, opt); \
2073}
2074
2036/* 2075/*
2037 * Inode flags 2076 * Inode flags
2038 */ 2077 */
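
btrfs_set_and_info()/btrfs_clear_and_info() change a mount option and emit a btrfs_info() message only when the state actually flips, so repeated remounts do not spam the log. The bodies above are bare brace blocks; the conventional hardening for statement-like macros is a do { } while (0) wrapper so they compose safely with if/else. A sketch of that shape (names and the logging are illustrative):

    #include <stdio.h>

    static unsigned long mount_opt;

    #define OPT_SSD (1UL << 0)

    #define set_and_info(opt, msg)                          \
    do {                                                    \
            if (!(mount_opt & (opt)))                       \
                    printf("info: %s\n", (msg));            \
            mount_opt |= (opt);                             \
    } while (0)

    int main(void)
    {
            set_and_info(OPT_SSD, "enabling ssd optimizations");
            set_and_info(OPT_SSD, "enabling ssd optimizations"); /* silent */
            return 0;
    }
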
@@ -3346,6 +3385,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
3346int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, 3385int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
3347 struct btrfs_fs_info *fs_info); 3386 struct btrfs_fs_info *fs_info);
3348int __get_raid_index(u64 flags); 3387int __get_raid_index(u64 flags);
3388
3389int btrfs_start_nocow_write(struct btrfs_root *root);
3390void btrfs_end_nocow_write(struct btrfs_root *root);
3349/* ctree.c */ 3391/* ctree.c */
3350int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 3392int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
3351 int level, int *slot); 3393 int level, int *slot);
@@ -3723,7 +3765,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3723 u32 min_type); 3765 u32 min_type);
3724 3766
3725int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 3767int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
3726int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput); 3768int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
3769 int nr);
3727int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 3770int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
3728 struct extent_state **cached_state); 3771 struct extent_state **cached_state);
3729int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 3772int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -4005,6 +4048,11 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
4005int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 4048int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
4006 struct btrfs_scrub_progress *progress); 4049 struct btrfs_scrub_progress *progress);
4007 4050
4051/* dev-replace.c */
4052void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
4053void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
4054void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
4055
4008/* reada.c */ 4056/* reada.c */
4009struct reada_control { 4057struct reada_control {
4010 struct btrfs_root *root; /* tree to prefetch */ 4058 struct btrfs_root *root; /* tree to prefetch */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 451b00c86f6c..33e561a84013 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1392,11 +1392,11 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
1392 return -ENOMEM; 1392 return -ENOMEM;
1393 1393
1394 async_work->delayed_root = delayed_root; 1394 async_work->delayed_root = delayed_root;
1395 async_work->work.func = btrfs_async_run_delayed_root; 1395 btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root,
1396 async_work->work.flags = 0; 1396 NULL, NULL);
1397 async_work->nr = nr; 1397 async_work->nr = nr;
1398 1398
1399 btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work); 1399 btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
1400 return 0; 1400 return 0;
1401} 1401}
1402 1402
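
This hunk is part of the conversion to kernel workqueues: instead of poking work.func and work.flags directly, callers go through btrfs_init_work(), which takes the main function plus the ordered-completion and free callbacks (NULL when unused), and then queue onto a btrfs_workqueue pointer with btrfs_queue_work(). A sketch of the initializer pattern, assuming a simplified work struct:

    typedef void (*work_fn)(void *);

    struct work {
            work_fn func;           /* main work */
            work_fn ordered_func;   /* runs in submission order, may be NULL */
            work_fn ordered_free;   /* frees the item, may be NULL */
    };

    static void init_work(struct work *w, work_fn func,
                          work_fn ordered_func, work_fn ordered_free)
    {
            w->func = func;
            w->ordered_func = ordered_func;
            w->ordered_free = ordered_free;
    }
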
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index f3bff89eecf0..31299646024d 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -199,44 +199,31 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
199 */ 199 */
200static struct btrfs_delayed_ref_head * 200static struct btrfs_delayed_ref_head *
201find_ref_head(struct rb_root *root, u64 bytenr, 201find_ref_head(struct rb_root *root, u64 bytenr,
202 struct btrfs_delayed_ref_head **last, int return_bigger) 202 int return_bigger)
203{ 203{
204 struct rb_node *n; 204 struct rb_node *n;
205 struct btrfs_delayed_ref_head *entry; 205 struct btrfs_delayed_ref_head *entry;
206 int cmp = 0;
207 206
208again:
209 n = root->rb_node; 207 n = root->rb_node;
210 entry = NULL; 208 entry = NULL;
211 while (n) { 209 while (n) {
212 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); 210 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
213 if (last)
214 *last = entry;
215 211
216 if (bytenr < entry->node.bytenr) 212 if (bytenr < entry->node.bytenr)
217 cmp = -1;
218 else if (bytenr > entry->node.bytenr)
219 cmp = 1;
220 else
221 cmp = 0;
222
223 if (cmp < 0)
224 n = n->rb_left; 213 n = n->rb_left;
225 else if (cmp > 0) 214 else if (bytenr > entry->node.bytenr)
226 n = n->rb_right; 215 n = n->rb_right;
227 else 216 else
228 return entry; 217 return entry;
229 } 218 }
230 if (entry && return_bigger) { 219 if (entry && return_bigger) {
231 if (cmp > 0) { 220 if (bytenr > entry->node.bytenr) {
232 n = rb_next(&entry->href_node); 221 n = rb_next(&entry->href_node);
233 if (!n) 222 if (!n)
234 n = rb_first(root); 223 n = rb_first(root);
235 entry = rb_entry(n, struct btrfs_delayed_ref_head, 224 entry = rb_entry(n, struct btrfs_delayed_ref_head,
236 href_node); 225 href_node);
237 bytenr = entry->node.bytenr; 226 return entry;
238 return_bigger = 0;
239 goto again;
240 } 227 }
241 return entry; 228 return entry;
242 } 229 }
@@ -415,12 +402,12 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans)
415 402
416again: 403again:
417 start = delayed_refs->run_delayed_start; 404 start = delayed_refs->run_delayed_start;
418 head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); 405 head = find_ref_head(&delayed_refs->href_root, start, 1);
419 if (!head && !loop) { 406 if (!head && !loop) {
420 delayed_refs->run_delayed_start = 0; 407 delayed_refs->run_delayed_start = 0;
421 start = 0; 408 start = 0;
422 loop = true; 409 loop = true;
423 head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); 410 head = find_ref_head(&delayed_refs->href_root, start, 1);
424 if (!head) 411 if (!head)
425 return NULL; 412 return NULL;
426 } else if (!head && loop) { 413 } else if (!head && loop) {
@@ -508,6 +495,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
508 ref = btrfs_delayed_node_to_head(update); 495 ref = btrfs_delayed_node_to_head(update);
509 BUG_ON(existing_ref->is_data != ref->is_data); 496 BUG_ON(existing_ref->is_data != ref->is_data);
510 497
498 spin_lock(&existing_ref->lock);
511 if (ref->must_insert_reserved) { 499 if (ref->must_insert_reserved) {
512 /* if the extent was freed and then 500 /* if the extent was freed and then
513 * reallocated before the delayed ref 501 * reallocated before the delayed ref
@@ -549,7 +537,6 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
549 * only need the lock for this case because we could be processing it 537 * only need the lock for this case because we could be processing it
550 * currently, for refs we just added we know we're a-ok. 538 * currently, for refs we just added we know we're a-ok.
551 */ 539 */
552 spin_lock(&existing_ref->lock);
553 existing->ref_mod += update->ref_mod; 540 existing->ref_mod += update->ref_mod;
554 spin_unlock(&existing_ref->lock); 541 spin_unlock(&existing_ref->lock);
555} 542}
@@ -898,7 +885,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
898 struct btrfs_delayed_ref_root *delayed_refs; 885 struct btrfs_delayed_ref_root *delayed_refs;
899 886
900 delayed_refs = &trans->transaction->delayed_refs; 887 delayed_refs = &trans->transaction->delayed_refs;
901 return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0); 888 return find_ref_head(&delayed_refs->href_root, bytenr, 0);
902} 889}
903 890
904void btrfs_delayed_ref_exit(void) 891void btrfs_delayed_ref_exit(void)
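
find_ref_head() loses its "last" out-parameter and its restart loop: with return_bigger set it now simply returns the next head after bytenr, wrapping to the first node when it falls off the end, instead of re-running the search at the bigger key. A small runnable analog of "find exact, else next bigger, else wrap" (a sorted array stands in for the rb-tree):

    #include <stdio.h>

    static int find_bigger(const int *a, int n, int key)
    {
            int i;

            for (i = 0; i < n; i++)
                    if (a[i] >= key)        /* exact match or next bigger */
                            return a[i];
            return a[0];                    /* off the end: wrap to the first */
    }

    int main(void)
    {
            int heads[] = { 10, 20, 30 };

            printf("%d %d %d\n",
                   find_bigger(heads, 3, 20),   /* 20: exact */
                   find_bigger(heads, 3, 21),   /* 30: next bigger */
                   find_bigger(heads, 3, 31));  /* 10: wrapped */
            return 0;
    }
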
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 564c92638b20..9f2290509aca 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -431,6 +431,35 @@ leave_no_lock:
431 return ret; 431 return ret;
432} 432}
433 433
434/*
435 * block until all in-flight bios have finished.
436 */
437static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
438{
439 s64 writers;
440 DEFINE_WAIT(wait);
441
442 set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
443 do {
444 prepare_to_wait(&fs_info->replace_wait, &wait,
445 TASK_UNINTERRUPTIBLE);
446 writers = percpu_counter_sum(&fs_info->bio_counter);
447 if (writers)
448 schedule();
449 finish_wait(&fs_info->replace_wait, &wait);
450 } while (writers);
451}
452
453/*
454 * the target device has been removed, so it is safe to allow new bio requests.
455 */
456static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
457{
458 clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
459 if (waitqueue_active(&fs_info->replace_wait))
460 wake_up(&fs_info->replace_wait);
461}
462
434static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, 463static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
435 int scrub_ret) 464 int scrub_ret)
436{ 465{
@@ -458,17 +487,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
458 src_device = dev_replace->srcdev; 487 src_device = dev_replace->srcdev;
459 btrfs_dev_replace_unlock(dev_replace); 488 btrfs_dev_replace_unlock(dev_replace);
460 489
461 /* replace old device with new one in mapping tree */
462 if (!scrub_ret)
463 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
464 src_device,
465 tgt_device);
466
467 /* 490 /*
468 * flush all outstanding I/O and inode extent mappings before the 491 * flush all outstanding I/O and inode extent mappings before the
469 * copy operation is declared as being finished 492 * copy operation is declared as being finished
470 */ 493 */
471 ret = btrfs_start_delalloc_roots(root->fs_info, 0); 494 ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
472 if (ret) { 495 if (ret) {
473 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 496 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
474 return ret; 497 return ret;
@@ -484,6 +507,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
484 WARN_ON(ret); 507 WARN_ON(ret);
485 508
486 /* keep away write_all_supers() during the finishing procedure */ 509 /* keep away write_all_supers() during the finishing procedure */
510 mutex_lock(&root->fs_info->chunk_mutex);
487 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 511 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
488 btrfs_dev_replace_lock(dev_replace); 512 btrfs_dev_replace_lock(dev_replace);
489 dev_replace->replace_state = 513 dev_replace->replace_state =
@@ -494,7 +518,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
494 dev_replace->time_stopped = get_seconds(); 518 dev_replace->time_stopped = get_seconds();
495 dev_replace->item_needs_writeback = 1; 519 dev_replace->item_needs_writeback = 1;
496 520
497 if (scrub_ret) { 521 /* replace old device with new one in mapping tree */
522 if (!scrub_ret) {
523 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
524 src_device,
525 tgt_device);
526 } else {
498 printk_in_rcu(KERN_ERR 527 printk_in_rcu(KERN_ERR
499 "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", 528 "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
500 src_device->missing ? "<missing disk>" : 529 src_device->missing ? "<missing disk>" :
@@ -503,6 +532,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
503 rcu_str_deref(tgt_device->name), scrub_ret); 532 rcu_str_deref(tgt_device->name), scrub_ret);
504 btrfs_dev_replace_unlock(dev_replace); 533 btrfs_dev_replace_unlock(dev_replace);
505 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 534 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
535 mutex_unlock(&root->fs_info->chunk_mutex);
506 if (tgt_device) 536 if (tgt_device)
507 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 537 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
508 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 538 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
@@ -532,8 +562,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
532 fs_info->fs_devices->latest_bdev = tgt_device->bdev; 562 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
533 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 563 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
534 564
565 btrfs_rm_dev_replace_blocked(fs_info);
566
535 btrfs_rm_dev_replace_srcdev(fs_info, src_device); 567 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
536 568
569 btrfs_rm_dev_replace_unblocked(fs_info);
570
537 /* 571 /*
538 * this is again a consistent state where no dev_replace procedure 572 * this is again a consistent state where no dev_replace procedure
539 * is running, the target device is part of the filesystem, the 573 * is running, the target device is part of the filesystem, the
@@ -543,6 +577,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
543 */ 577 */
544 btrfs_dev_replace_unlock(dev_replace); 578 btrfs_dev_replace_unlock(dev_replace);
545 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 579 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
580 mutex_unlock(&root->fs_info->chunk_mutex);
546 581
547 /* write back the superblocks */ 582 /* write back the superblocks */
548 trans = btrfs_start_transaction(root, 0); 583 trans = btrfs_start_transaction(root, 0);
@@ -862,3 +897,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
862 mutex_unlock(&dev_replace->lock_management_lock); 897 mutex_unlock(&dev_replace->lock_management_lock);
863 } 898 }
864} 899}
900
901void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
902{
903 percpu_counter_inc(&fs_info->bio_counter);
904}
905
906void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
907{
908 percpu_counter_dec(&fs_info->bio_counter);
909
910 if (waitqueue_active(&fs_info->replace_wait))
911 wake_up(&fs_info->replace_wait);
912}
913
914void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
915{
916 DEFINE_WAIT(wait);
917again:
918 percpu_counter_inc(&fs_info->bio_counter);
919 if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
920 btrfs_bio_counter_dec(fs_info);
921 wait_event(fs_info->replace_wait,
922 !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
923 &fs_info->fs_state));
924 goto again;
925 }
926
927}
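
The bio counter is a gate for device replace: regular I/O bumps a percpu counter around each bio, and btrfs_rm_dev_replace_blocked() sets the DEV_REPLACING bit and waits for the counter to drain before the target device is torn out, while blocked incrementers back off and re-wait whenever the bit is set. A compressed pthread analog of that handshake (a plain counter and condvar stand in for the percpu counter and wait queue; function names mirror the diff but this is a userspace sketch, not the kernel code):

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t wait_q = PTHREAD_COND_INITIALIZER;
    static long bio_count;
    static bool replacing;

    static void bio_counter_inc_blocked(void)
    {
            pthread_mutex_lock(&lock);
            while (replacing)               /* back off while replace runs */
                    pthread_cond_wait(&wait_q, &lock);
            bio_count++;
            pthread_mutex_unlock(&lock);
    }

    static void bio_counter_dec(void)
    {
            pthread_mutex_lock(&lock);
            if (--bio_count == 0)
                    pthread_cond_broadcast(&wait_q);  /* wake the replacer */
            pthread_mutex_unlock(&lock);
    }

    static void rm_dev_replace_blocked(void)
    {
            pthread_mutex_lock(&lock);
            replacing = true;
            while (bio_count)               /* drain in-flight bios */
                    pthread_cond_wait(&wait_q, &lock);
            pthread_mutex_unlock(&lock);
    }

    static void rm_dev_replace_unblocked(void)
    {
            pthread_mutex_lock(&lock);
            replacing = false;
            pthread_cond_broadcast(&wait_q);
            pthread_mutex_unlock(&lock);
    }
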
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81ea55314b1f..983314932af3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -329,6 +329,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
329{ 329{
330 struct extent_state *cached_state = NULL; 330 struct extent_state *cached_state = NULL;
331 int ret; 331 int ret;
332 bool need_lock = (current->journal_info ==
333 (void *)BTRFS_SEND_TRANS_STUB);
332 334
333 if (!parent_transid || btrfs_header_generation(eb) == parent_transid) 335 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
334 return 0; 336 return 0;
@@ -336,6 +338,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
336 if (atomic) 338 if (atomic)
337 return -EAGAIN; 339 return -EAGAIN;
338 340
341 if (need_lock) {
342 btrfs_tree_read_lock(eb);
343 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
344 }
345
339 lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, 346 lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
340 0, &cached_state); 347 0, &cached_state);
341 if (extent_buffer_uptodate(eb) && 348 if (extent_buffer_uptodate(eb) &&
@@ -347,10 +354,21 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
347 "found %llu\n", 354 "found %llu\n",
348 eb->start, parent_transid, btrfs_header_generation(eb)); 355 eb->start, parent_transid, btrfs_header_generation(eb));
349 ret = 1; 356 ret = 1;
350 clear_extent_buffer_uptodate(eb); 357
358 /*
359 * Things reading via commit roots that don't have normal protection,
360 * like send, can have a really old block in cache that may point at a
361 * block that has been freed and reallocated. So don't clear uptodate
362 * if we find an eb that is under IO (dirty/writeback) because we could
363 * end up reading in the stale data and then writing it back out and
364 * making everybody very sad.
365 */
366 if (!extent_buffer_under_io(eb))
367 clear_extent_buffer_uptodate(eb);
351out: 368out:
352 unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, 369 unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
353 &cached_state, GFP_NOFS); 370 &cached_state, GFP_NOFS);
371 btrfs_tree_read_unlock_blocking(eb);
354 return ret; 372 return ret;
355} 373}
356 374
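
Two fixes land in this hunk: readers that came in via a commit root (send marks itself with BTRFS_SEND_TRANS_STUB in journal_info) now take the tree read lock before touching the buffer, and a transid-mismatched buffer is only marked not-uptodate when it is not under I/O, because invalidating a dirty or writeback buffer could let stale data be read back in and rewritten. A sketch of the "don't invalidate while under I/O" guard (flag names are illustrative):

    #define EB_UPTODATE  (1U << 0)
    #define EB_DIRTY     (1U << 1)
    #define EB_WRITEBACK (1U << 2)

    static int under_io(unsigned flags)
    {
            return flags & (EB_DIRTY | EB_WRITEBACK);
    }

    static void invalidate_if_safe(unsigned *flags)
    {
            if (!under_io(*flags))          /* never yank uptodate mid-I/O */
                    *flags &= ~EB_UPTODATE;
    }
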
@@ -678,32 +696,31 @@ static void end_workqueue_bio(struct bio *bio, int err)
678 696
679 fs_info = end_io_wq->info; 697 fs_info = end_io_wq->info;
680 end_io_wq->error = err; 698 end_io_wq->error = err;
681 end_io_wq->work.func = end_workqueue_fn; 699 btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
682 end_io_wq->work.flags = 0;
683 700
684 if (bio->bi_rw & REQ_WRITE) { 701 if (bio->bi_rw & REQ_WRITE) {
685 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) 702 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
686 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 703 btrfs_queue_work(fs_info->endio_meta_write_workers,
687 &end_io_wq->work); 704 &end_io_wq->work);
688 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) 705 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
689 btrfs_queue_worker(&fs_info->endio_freespace_worker, 706 btrfs_queue_work(fs_info->endio_freespace_worker,
690 &end_io_wq->work); 707 &end_io_wq->work);
691 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 708 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
692 btrfs_queue_worker(&fs_info->endio_raid56_workers, 709 btrfs_queue_work(fs_info->endio_raid56_workers,
693 &end_io_wq->work); 710 &end_io_wq->work);
694 else 711 else
695 btrfs_queue_worker(&fs_info->endio_write_workers, 712 btrfs_queue_work(fs_info->endio_write_workers,
696 &end_io_wq->work); 713 &end_io_wq->work);
697 } else { 714 } else {
698 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 715 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
699 btrfs_queue_worker(&fs_info->endio_raid56_workers, 716 btrfs_queue_work(fs_info->endio_raid56_workers,
700 &end_io_wq->work); 717 &end_io_wq->work);
701 else if (end_io_wq->metadata) 718 else if (end_io_wq->metadata)
702 btrfs_queue_worker(&fs_info->endio_meta_workers, 719 btrfs_queue_work(fs_info->endio_meta_workers,
703 &end_io_wq->work); 720 &end_io_wq->work);
704 else 721 else
705 btrfs_queue_worker(&fs_info->endio_workers, 722 btrfs_queue_work(fs_info->endio_workers,
706 &end_io_wq->work); 723 &end_io_wq->work);
707 } 724 }
708} 725}
709 726
@@ -738,7 +755,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
738unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) 755unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
739{ 756{
740 unsigned long limit = min_t(unsigned long, 757 unsigned long limit = min_t(unsigned long,
741 info->workers.max_workers, 758 info->thread_pool_size,
742 info->fs_devices->open_devices); 759 info->fs_devices->open_devices);
743 return 256 * limit; 760 return 256 * limit;
744} 761}
@@ -811,11 +828,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
811 async->submit_bio_start = submit_bio_start; 828 async->submit_bio_start = submit_bio_start;
812 async->submit_bio_done = submit_bio_done; 829 async->submit_bio_done = submit_bio_done;
813 830
814 async->work.func = run_one_async_start; 831 btrfs_init_work(&async->work, run_one_async_start,
815 async->work.ordered_func = run_one_async_done; 832 run_one_async_done, run_one_async_free);
816 async->work.ordered_free = run_one_async_free;
817 833
818 async->work.flags = 0;
819 async->bio_flags = bio_flags; 834 async->bio_flags = bio_flags;
820 async->bio_offset = bio_offset; 835 async->bio_offset = bio_offset;
821 836
@@ -824,9 +839,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
824 atomic_inc(&fs_info->nr_async_submits); 839 atomic_inc(&fs_info->nr_async_submits);
825 840
826 if (rw & REQ_SYNC) 841 if (rw & REQ_SYNC)
827 btrfs_set_work_high_prio(&async->work); 842 btrfs_set_work_high_priority(&async->work);
828 843
829 btrfs_queue_worker(&fs_info->workers, &async->work); 844 btrfs_queue_work(fs_info->workers, &async->work);
830 845
831 while (atomic_read(&fs_info->async_submit_draining) && 846 while (atomic_read(&fs_info->async_submit_draining) &&
832 atomic_read(&fs_info->nr_async_submits)) { 847 atomic_read(&fs_info->nr_async_submits)) {
@@ -1149,6 +1164,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1149 } 1164 }
1150} 1165}
1151 1166
1167static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
1168{
1169 struct btrfs_subvolume_writers *writers;
1170 int ret;
1171
1172 writers = kmalloc(sizeof(*writers), GFP_NOFS);
1173 if (!writers)
1174 return ERR_PTR(-ENOMEM);
1175
1176 ret = percpu_counter_init(&writers->counter, 0);
1177 if (ret < 0) {
1178 kfree(writers);
1179 return ERR_PTR(ret);
1180 }
1181
1182 init_waitqueue_head(&writers->wait);
1183 return writers;
1184}
1185
1186static void
1187btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
1188{
1189 percpu_counter_destroy(&writers->counter);
1190 kfree(writers);
1191}
1192
1152static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, 1193static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1153 u32 stripesize, struct btrfs_root *root, 1194 u32 stripesize, struct btrfs_root *root,
1154 struct btrfs_fs_info *fs_info, 1195 struct btrfs_fs_info *fs_info,
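
btrfs_alloc_subvolume_writers() above follows the usual kernel error-pointer convention: on failure it returns ERR_PTR(-ENOMEM) or the percpu_counter_init() error instead of NULL, and the caller tests with IS_ERR()/PTR_ERR(). A userspace sketch of that convention (the macros are simplified stand-ins for the kernel ones):

    #include <errno.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define ERR_PTR(err)  ((void *)(intptr_t)(err))
    #define PTR_ERR(ptr)  ((int)(intptr_t)(ptr))
    #define IS_ERR(ptr)   ((uintptr_t)(ptr) >= (uintptr_t)-4095)

    struct writers { long counter; };

    static struct writers *alloc_writers(void)
    {
            struct writers *w = malloc(sizeof(*w));

            if (!w)
                    return ERR_PTR(-ENOMEM);    /* error encoded in the pointer */
            w->counter = 0;
            return w;
    }
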
@@ -1194,16 +1235,22 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1194 spin_lock_init(&root->log_extents_lock[1]); 1235 spin_lock_init(&root->log_extents_lock[1]);
1195 mutex_init(&root->objectid_mutex); 1236 mutex_init(&root->objectid_mutex);
1196 mutex_init(&root->log_mutex); 1237 mutex_init(&root->log_mutex);
1238 mutex_init(&root->ordered_extent_mutex);
1239 mutex_init(&root->delalloc_mutex);
1197 init_waitqueue_head(&root->log_writer_wait); 1240 init_waitqueue_head(&root->log_writer_wait);
1198 init_waitqueue_head(&root->log_commit_wait[0]); 1241 init_waitqueue_head(&root->log_commit_wait[0]);
1199 init_waitqueue_head(&root->log_commit_wait[1]); 1242 init_waitqueue_head(&root->log_commit_wait[1]);
1243 INIT_LIST_HEAD(&root->log_ctxs[0]);
1244 INIT_LIST_HEAD(&root->log_ctxs[1]);
1200 atomic_set(&root->log_commit[0], 0); 1245 atomic_set(&root->log_commit[0], 0);
1201 atomic_set(&root->log_commit[1], 0); 1246 atomic_set(&root->log_commit[1], 0);
1202 atomic_set(&root->log_writers, 0); 1247 atomic_set(&root->log_writers, 0);
1203 atomic_set(&root->log_batch, 0); 1248 atomic_set(&root->log_batch, 0);
1204 atomic_set(&root->orphan_inodes, 0); 1249 atomic_set(&root->orphan_inodes, 0);
1205 atomic_set(&root->refs, 1); 1250 atomic_set(&root->refs, 1);
1251 atomic_set(&root->will_be_snapshoted, 0);
1206 root->log_transid = 0; 1252 root->log_transid = 0;
1253 root->log_transid_committed = -1;
1207 root->last_log_commit = 0; 1254 root->last_log_commit = 0;
1208 if (fs_info) 1255 if (fs_info)
1209 extent_io_tree_init(&root->dirty_log_pages, 1256 extent_io_tree_init(&root->dirty_log_pages,
@@ -1417,6 +1464,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
1417 WARN_ON(root->log_root); 1464 WARN_ON(root->log_root);
1418 root->log_root = log_root; 1465 root->log_root = log_root;
1419 root->log_transid = 0; 1466 root->log_transid = 0;
1467 root->log_transid_committed = -1;
1420 root->last_log_commit = 0; 1468 root->last_log_commit = 0;
1421 return 0; 1469 return 0;
1422} 1470}
@@ -1498,6 +1546,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
1498int btrfs_init_fs_root(struct btrfs_root *root) 1546int btrfs_init_fs_root(struct btrfs_root *root)
1499{ 1547{
1500 int ret; 1548 int ret;
1549 struct btrfs_subvolume_writers *writers;
1501 1550
1502 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); 1551 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1503 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), 1552 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1507,15 +1556,24 @@ int btrfs_init_fs_root(struct btrfs_root *root)
1507 goto fail; 1556 goto fail;
1508 } 1557 }
1509 1558
1559 writers = btrfs_alloc_subvolume_writers();
1560 if (IS_ERR(writers)) {
1561 ret = PTR_ERR(writers);
1562 goto fail;
1563 }
1564 root->subv_writers = writers;
1565
1510 btrfs_init_free_ino_ctl(root); 1566 btrfs_init_free_ino_ctl(root);
1511 mutex_init(&root->fs_commit_mutex);
1512 spin_lock_init(&root->cache_lock); 1567 spin_lock_init(&root->cache_lock);
1513 init_waitqueue_head(&root->cache_wait); 1568 init_waitqueue_head(&root->cache_wait);
1514 1569
1515 ret = get_anon_bdev(&root->anon_dev); 1570 ret = get_anon_bdev(&root->anon_dev);
1516 if (ret) 1571 if (ret)
1517 goto fail; 1572 goto free_writers;
1518 return 0; 1573 return 0;
1574
1575free_writers:
1576 btrfs_free_subvolume_writers(root->subv_writers);
1519fail: 1577fail:
1520 kfree(root->free_ino_ctl); 1578 kfree(root->free_ino_ctl);
1521 kfree(root->free_ino_pinned); 1579 kfree(root->free_ino_pinned);
@@ -1990,23 +2048,22 @@ static noinline int next_root_backup(struct btrfs_fs_info *info,
1990/* helper to cleanup workers */ 2048/* helper to cleanup workers */
1991static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) 2049static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
1992{ 2050{
1993 btrfs_stop_workers(&fs_info->generic_worker); 2051 btrfs_destroy_workqueue(fs_info->fixup_workers);
1994 btrfs_stop_workers(&fs_info->fixup_workers); 2052 btrfs_destroy_workqueue(fs_info->delalloc_workers);
1995 btrfs_stop_workers(&fs_info->delalloc_workers); 2053 btrfs_destroy_workqueue(fs_info->workers);
1996 btrfs_stop_workers(&fs_info->workers); 2054 btrfs_destroy_workqueue(fs_info->endio_workers);
1997 btrfs_stop_workers(&fs_info->endio_workers); 2055 btrfs_destroy_workqueue(fs_info->endio_meta_workers);
1998 btrfs_stop_workers(&fs_info->endio_meta_workers); 2056 btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
1999 btrfs_stop_workers(&fs_info->endio_raid56_workers); 2057 btrfs_destroy_workqueue(fs_info->rmw_workers);
2000 btrfs_stop_workers(&fs_info->rmw_workers); 2058 btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
2001 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2059 btrfs_destroy_workqueue(fs_info->endio_write_workers);
2002 btrfs_stop_workers(&fs_info->endio_write_workers); 2060 btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
2003 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2061 btrfs_destroy_workqueue(fs_info->submit_workers);
2004 btrfs_stop_workers(&fs_info->submit_workers); 2062 btrfs_destroy_workqueue(fs_info->delayed_workers);
2005 btrfs_stop_workers(&fs_info->delayed_workers); 2063 btrfs_destroy_workqueue(fs_info->caching_workers);
2006 btrfs_stop_workers(&fs_info->caching_workers); 2064 btrfs_destroy_workqueue(fs_info->readahead_workers);
2007 btrfs_stop_workers(&fs_info->readahead_workers); 2065 btrfs_destroy_workqueue(fs_info->flush_workers);
2008 btrfs_stop_workers(&fs_info->flush_workers); 2066 btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
2009 btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
2010} 2067}
2011 2068
2012static void free_root_extent_buffers(struct btrfs_root *root) 2069static void free_root_extent_buffers(struct btrfs_root *root)
@@ -2097,6 +2154,8 @@ int open_ctree(struct super_block *sb,
2097 int err = -EINVAL; 2154 int err = -EINVAL;
2098 int num_backups_tried = 0; 2155 int num_backups_tried = 0;
2099 int backup_index = 0; 2156 int backup_index = 0;
2157 int max_active;
2158 int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2100 bool create_uuid_tree; 2159 bool create_uuid_tree;
2101 bool check_uuid_tree; 2160 bool check_uuid_tree;
2102 2161
@@ -2133,10 +2192,16 @@ int open_ctree(struct super_block *sb,
2133 goto fail_dirty_metadata_bytes; 2192 goto fail_dirty_metadata_bytes;
2134 } 2193 }
2135 2194
2195 ret = percpu_counter_init(&fs_info->bio_counter, 0);
2196 if (ret) {
2197 err = ret;
2198 goto fail_delalloc_bytes;
2199 }
2200
2136 fs_info->btree_inode = new_inode(sb); 2201 fs_info->btree_inode = new_inode(sb);
2137 if (!fs_info->btree_inode) { 2202 if (!fs_info->btree_inode) {
2138 err = -ENOMEM; 2203 err = -ENOMEM;
2139 goto fail_delalloc_bytes; 2204 goto fail_bio_counter;
2140 } 2205 }
2141 2206
2142 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); 2207 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -2159,6 +2224,7 @@ int open_ctree(struct super_block *sb,
2159 spin_lock_init(&fs_info->buffer_lock); 2224 spin_lock_init(&fs_info->buffer_lock);
2160 rwlock_init(&fs_info->tree_mod_log_lock); 2225 rwlock_init(&fs_info->tree_mod_log_lock);
2161 mutex_init(&fs_info->reloc_mutex); 2226 mutex_init(&fs_info->reloc_mutex);
2227 mutex_init(&fs_info->delalloc_root_mutex);
2162 seqlock_init(&fs_info->profiles_lock); 2228 seqlock_init(&fs_info->profiles_lock);
2163 2229
2164 init_completion(&fs_info->kobj_unregister); 2230 init_completion(&fs_info->kobj_unregister);
@@ -2211,6 +2277,7 @@ int open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->scrub_pause_req, 0);
 	atomic_set(&fs_info->scrubs_paused, 0);
 	atomic_set(&fs_info->scrub_cancel_req, 0);
+	init_waitqueue_head(&fs_info->replace_wait);
 	init_waitqueue_head(&fs_info->scrub_pause_wait);
 	fs_info->scrub_workers_refcnt = 0;
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
@@ -2274,7 +2341,7 @@ int open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
-	init_rwsem(&fs_info->extent_commit_sem);
+	init_rwsem(&fs_info->commit_root_sem);
 	init_rwsem(&fs_info->cleanup_work_sem);
 	init_rwsem(&fs_info->subvol_sem);
 	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
@@ -2458,104 +2525,68 @@ int open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
-	btrfs_init_workers(&fs_info->generic_worker,
-			   "genwork", 1, NULL);
-
-	btrfs_init_workers(&fs_info->workers, "worker",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-
-	btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
-			   fs_info->thread_pool_size, NULL);
-
-	btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
-			   fs_info->thread_pool_size, NULL);
-
-	btrfs_init_workers(&fs_info->submit_workers, "submit",
-			   min_t(u64, fs_devices->num_devices,
-			   fs_info->thread_pool_size), NULL);
-
-	btrfs_init_workers(&fs_info->caching_workers, "cache",
-			   fs_info->thread_pool_size, NULL);
-
-	/* a higher idle thresh on the submit workers makes it much more
+	max_active = fs_info->thread_pool_size;
+
+	fs_info->workers =
+		btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
+				      max_active, 16);
+
+	fs_info->delalloc_workers =
+		btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
+
+	fs_info->flush_workers =
+		btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
+
+	fs_info->caching_workers =
+		btrfs_alloc_workqueue("cache", flags, max_active, 0);
+
+	/*
+	 * a higher idle thresh on the submit workers makes it much more
 	 * likely that bios will be send down in a sane order to the
 	 * devices
 	 */
-	fs_info->submit_workers.idle_thresh = 64;
-
-	fs_info->workers.idle_thresh = 16;
-	fs_info->workers.ordered = 1;
-
-	fs_info->delalloc_workers.idle_thresh = 2;
-	fs_info->delalloc_workers.ordered = 1;
-
-	btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->endio_workers, "endio",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->endio_meta_write_workers,
-			   "endio-meta-write", fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->endio_raid56_workers,
-			   "endio-raid56", fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->rmw_workers,
-			   "rmw", fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
-			   1, &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->readahead_workers, "readahead",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1,
-			   &fs_info->generic_worker);
+	fs_info->submit_workers =
+		btrfs_alloc_workqueue("submit", flags,
+				      min_t(u64, fs_devices->num_devices,
+					    max_active), 64);
+
+	fs_info->fixup_workers =
+		btrfs_alloc_workqueue("fixup", flags, 1, 0);
 
 	/*
 	 * endios are largely parallel and should have a very
 	 * low idle thresh
 	 */
-	fs_info->endio_workers.idle_thresh = 4;
-	fs_info->endio_meta_workers.idle_thresh = 4;
-	fs_info->endio_raid56_workers.idle_thresh = 4;
-	fs_info->rmw_workers.idle_thresh = 2;
-
-	fs_info->endio_write_workers.idle_thresh = 2;
-	fs_info->endio_meta_write_workers.idle_thresh = 2;
-	fs_info->readahead_workers.idle_thresh = 2;
-
-	/*
-	 * btrfs_start_workers can really only fail because of ENOMEM so just
-	 * return -ENOMEM if any of these fail.
-	 */
-	ret = btrfs_start_workers(&fs_info->workers);
-	ret |= btrfs_start_workers(&fs_info->generic_worker);
-	ret |= btrfs_start_workers(&fs_info->submit_workers);
-	ret |= btrfs_start_workers(&fs_info->delalloc_workers);
-	ret |= btrfs_start_workers(&fs_info->fixup_workers);
-	ret |= btrfs_start_workers(&fs_info->endio_workers);
-	ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
-	ret |= btrfs_start_workers(&fs_info->rmw_workers);
-	ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
-	ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
-	ret |= btrfs_start_workers(&fs_info->endio_write_workers);
-	ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
-	ret |= btrfs_start_workers(&fs_info->delayed_workers);
-	ret |= btrfs_start_workers(&fs_info->caching_workers);
-	ret |= btrfs_start_workers(&fs_info->readahead_workers);
-	ret |= btrfs_start_workers(&fs_info->flush_workers);
-	ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers);
-	if (ret) {
+	fs_info->endio_workers =
+		btrfs_alloc_workqueue("endio", flags, max_active, 4);
+	fs_info->endio_meta_workers =
+		btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
+	fs_info->endio_meta_write_workers =
+		btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
+	fs_info->endio_raid56_workers =
+		btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+	fs_info->rmw_workers =
+		btrfs_alloc_workqueue("rmw", flags, max_active, 2);
+	fs_info->endio_write_workers =
+		btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
+	fs_info->endio_freespace_worker =
+		btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
+	fs_info->delayed_workers =
+		btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
+	fs_info->readahead_workers =
+		btrfs_alloc_workqueue("readahead", flags, max_active, 2);
+	fs_info->qgroup_rescan_workers =
+		btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
+
+	if (!(fs_info->workers && fs_info->delalloc_workers &&
+	      fs_info->submit_workers && fs_info->flush_workers &&
+	      fs_info->endio_workers && fs_info->endio_meta_workers &&
+	      fs_info->endio_meta_write_workers &&
+	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
+	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
+	      fs_info->caching_workers && fs_info->readahead_workers &&
+	      fs_info->fixup_workers && fs_info->delayed_workers &&
+	      fs_info->qgroup_rescan_workers)) {
 		err = -ENOMEM;
 		goto fail_sb_buffer;
 	}
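
The hunk above replaces the old btrfs_workers machinery with the btrfs_workqueue API: queues are now allocated dynamically, a NULL return signals allocation failure, and the per-queue tuning that used to be poked into fields (idle thresholds, ordering) is passed at allocation time. As an illustration only, not part of this patch, the lifecycle for one queue looks roughly like the sketch below; the signatures are inferred from the calls in this hunk, and "example_workers" is a hypothetical queue:

static struct btrfs_workqueue *example_workers;

static int example_alloc(struct btrfs_fs_info *fs_info)
{
	int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
	int max_active = fs_info->thread_pool_size;

	/* name, workqueue flags, concurrency limit, queue-depth threshold */
	example_workers = btrfs_alloc_workqueue("example", flags,
						max_active, 2);
	if (!example_workers)
		return -ENOMEM;	/* NULL return means allocation failed */
	return 0;
}

static void example_teardown(void)
{
	/* the counterpart used by btrfs_stop_all_workers() above */
	btrfs_destroy_workqueue(example_workers);
}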
@@ -2830,7 +2861,7 @@ retry_root_backup:
 			printk(KERN_ERR "BTRFS: failed to read log tree\n");
 			free_extent_buffer(log_tree_root->node);
 			kfree(log_tree_root);
-			goto fail_trans_kthread;
+			goto fail_qgroup;
 		}
 		/* returns with log_tree_root freed on success */
 		ret = btrfs_recover_log_trees(log_tree_root);
@@ -2839,24 +2870,24 @@ retry_root_backup:
2839 "Failed to recover log tree"); 2870 "Failed to recover log tree");
2840 free_extent_buffer(log_tree_root->node); 2871 free_extent_buffer(log_tree_root->node);
2841 kfree(log_tree_root); 2872 kfree(log_tree_root);
2842 goto fail_trans_kthread; 2873 goto fail_qgroup;
2843 } 2874 }
2844 2875
2845 if (sb->s_flags & MS_RDONLY) { 2876 if (sb->s_flags & MS_RDONLY) {
2846 ret = btrfs_commit_super(tree_root); 2877 ret = btrfs_commit_super(tree_root);
2847 if (ret) 2878 if (ret)
2848 goto fail_trans_kthread; 2879 goto fail_qgroup;
2849 } 2880 }
2850 } 2881 }
2851 2882
2852 ret = btrfs_find_orphan_roots(tree_root); 2883 ret = btrfs_find_orphan_roots(tree_root);
2853 if (ret) 2884 if (ret)
2854 goto fail_trans_kthread; 2885 goto fail_qgroup;
2855 2886
2856 if (!(sb->s_flags & MS_RDONLY)) { 2887 if (!(sb->s_flags & MS_RDONLY)) {
2857 ret = btrfs_cleanup_fs_roots(fs_info); 2888 ret = btrfs_cleanup_fs_roots(fs_info);
2858 if (ret) 2889 if (ret)
2859 goto fail_trans_kthread; 2890 goto fail_qgroup;
2860 2891
2861 ret = btrfs_recover_relocation(tree_root); 2892 ret = btrfs_recover_relocation(tree_root);
2862 if (ret < 0) { 2893 if (ret < 0) {
@@ -2963,6 +2994,8 @@ fail_iput:
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
 	iput(fs_info->btree_inode);
+fail_bio_counter:
+	percpu_counter_destroy(&fs_info->bio_counter);
 fail_delalloc_bytes:
 	percpu_counter_destroy(&fs_info->delalloc_bytes);
 fail_dirty_metadata_bytes:
@@ -3244,6 +3277,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 	/* send down all the barriers */
 	head = &info->fs_devices->devices;
 	list_for_each_entry_rcu(dev, head, dev_list) {
+		if (dev->missing)
+			continue;
 		if (!dev->bdev) {
 			errors_send++;
 			continue;
@@ -3258,6 +3293,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
 
 	/* wait for all the barriers */
 	list_for_each_entry_rcu(dev, head, dev_list) {
+		if (dev->missing)
+			continue;
 		if (!dev->bdev) {
 			errors_wait++;
 			continue;
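
Both loops now skip devices flagged ->missing: in a degraded mount such a device has nothing to flush, and counting it in errors_send/errors_wait could fail the barrier pass spuriously. The shared shape of the two hunks, as an illustration:

	list_for_each_entry_rcu(dev, head, dev_list) {
		if (dev->missing)	/* degraded mount: nothing to flush */
			continue;
		if (!dev->bdev) {	/* present but unusable: real error */
			errors_send++;
			continue;
		}
		/* ... send or wait for the barrier ... */
	}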
@@ -3477,6 +3514,8 @@ static void free_fs_root(struct btrfs_root *root)
 	root->orphan_block_rsv = NULL;
 	if (root->anon_dev)
 		free_anon_bdev(root->anon_dev);
+	if (root->subv_writers)
+		btrfs_free_subvolume_writers(root->subv_writers);
 	free_extent_buffer(root->node);
 	free_extent_buffer(root->commit_root);
 	kfree(root->free_ino_ctl);
@@ -3610,6 +3649,7 @@ int close_ctree(struct btrfs_root *root)
 
 	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
 	percpu_counter_destroy(&fs_info->delalloc_bytes);
+	percpu_counter_destroy(&fs_info->bio_counter);
 	bdi_destroy(&fs_info->bdi);
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 
@@ -3791,9 +3831,11 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
 		list_move_tail(&root->ordered_root,
 			       &fs_info->ordered_roots);
 
+		spin_unlock(&fs_info->ordered_root_lock);
 		btrfs_destroy_ordered_extents(root);
 
-		cond_resched_lock(&fs_info->ordered_root_lock);
+		cond_resched();
+		spin_lock(&fs_info->ordered_root_lock);
 	}
 	spin_unlock(&fs_info->ordered_root_lock);
 }
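
This change stops calling btrfs_destroy_ordered_extents() with ordered_root_lock held: the helper can block, so the spinlock is dropped around it and re-taken, with a plain cond_resched() in between. A rough sketch of the resulting loop shape, an illustration rather than patch text, with the root selection abbreviated:

	spin_lock(&fs_info->ordered_root_lock);
	while (!list_empty(&fs_info->ordered_roots)) {
		/* move the root to the tail so the walk keeps its place */
		root = list_first_entry(&fs_info->ordered_roots,
					struct btrfs_root, ordered_root);
		list_move_tail(&root->ordered_root, &fs_info->ordered_roots);

		spin_unlock(&fs_info->ordered_root_lock);
		btrfs_destroy_ordered_extents(root);	/* may block */

		cond_resched();
		spin_lock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&fs_info->ordered_root_lock);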
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32312e09f0f5..5590af92094b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -419,7 +419,7 @@ static noinline void caching_thread(struct btrfs_work *work)
 again:
 	mutex_lock(&caching_ctl->mutex);
 	/* need to make sure the commit_root doesn't disappear */
-	down_read(&fs_info->extent_commit_sem);
+	down_read(&fs_info->commit_root_sem);
 
 next:
 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
@@ -443,10 +443,10 @@ next:
 			break;
 
 		if (need_resched() ||
-		    rwsem_is_contended(&fs_info->extent_commit_sem)) {
+		    rwsem_is_contended(&fs_info->commit_root_sem)) {
 			caching_ctl->progress = last;
 			btrfs_release_path(path);
-			up_read(&fs_info->extent_commit_sem);
+			up_read(&fs_info->commit_root_sem);
 			mutex_unlock(&caching_ctl->mutex);
 			cond_resched();
 			goto again;
@@ -513,7 +513,7 @@ next:
 
 err:
 	btrfs_free_path(path);
-	up_read(&fs_info->extent_commit_sem);
+	up_read(&fs_info->commit_root_sem);
 
 	free_excluded_extents(extent_root, block_group);
 
@@ -549,7 +549,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 	caching_ctl->block_group = cache;
 	caching_ctl->progress = cache->key.objectid;
 	atomic_set(&caching_ctl->count, 1);
-	caching_ctl->work.func = caching_thread;
+	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
 
 	spin_lock(&cache->lock);
 	/*
@@ -633,14 +633,14 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 		return 0;
 	}
 
-	down_write(&fs_info->extent_commit_sem);
+	down_write(&fs_info->commit_root_sem);
 	atomic_inc(&caching_ctl->count);
 	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
-	up_write(&fs_info->extent_commit_sem);
+	up_write(&fs_info->commit_root_sem);
 
 	btrfs_get_block_group(cache);
 
-	btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
+	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 
 	return ret;
 }
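
With the old API a work item set its callback by direct assignment (work.func = ...) and was queued on a btrfs_workers pool; the new API initializes the item with btrfs_init_work() and queues it with btrfs_queue_work() on a btrfs_workqueue pointer. A minimal sketch of the pattern, modeled on the caching conversion above and illustrative only; the two NULL arguments are the ordered-completion callbacks, which caching does not use:

static void example_caching_func(struct btrfs_work *work)
{
	struct btrfs_caching_control *ctl;

	/* recover the enclosing object, as caching_thread() does */
	ctl = container_of(work, struct btrfs_caching_control, work);
	/* ... fill in the free-space cache for ctl->block_group ... */
}

static void example_queue_caching(struct btrfs_fs_info *fs_info,
				  struct btrfs_caching_control *ctl)
{
	btrfs_init_work(&ctl->work, example_caching_func, NULL, NULL);
	btrfs_queue_work(fs_info->caching_workers, &ctl->work);
}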
@@ -1542,6 +1542,7 @@ again:
 		ret = 0;
 	}
 	if (ret) {
+		key.objectid = bytenr;
 		key.type = BTRFS_EXTENT_ITEM_KEY;
 		key.offset = num_bytes;
 		btrfs_release_path(path);
@@ -2444,7 +2445,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			spin_unlock(&locked_ref->lock);
 			spin_lock(&delayed_refs->lock);
 			spin_lock(&locked_ref->lock);
-			if (rb_first(&locked_ref->ref_root)) {
+			if (rb_first(&locked_ref->ref_root) ||
+			    locked_ref->extent_op) {
 				spin_unlock(&locked_ref->lock);
 				spin_unlock(&delayed_refs->lock);
 				continue;
@@ -3541,11 +3543,13 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	return extended_to_chunk(flags | tmp);
 }
 
-static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
+static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
 {
 	unsigned seq;
+	u64 flags;
 
 	do {
+		flags = orig_flags;
 		seq = read_seqbegin(&root->fs_info->profiles_lock);
 
 		if (flags & BTRFS_BLOCK_GROUP_DATA)
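
The get_alloc_profile() change fixes a subtle seqlock bug: the body of a read_seqbegin()/read_seqretry() loop can execute more than once, so any value the body modifies must be re-initialized on every pass. Previously the profile bits OR-ed in by a discarded pass leaked into the retry. Schematically, as an illustration rather than patch text:

	do {
		flags = orig_flags;	/* reset on every retry */
		seq = read_seqbegin(&root->fs_info->profiles_lock);
		/* ... OR the avail_*_alloc_bits into flags ... */
	} while (read_seqretry(&root->fs_info->profiles_lock, seq));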
@@ -3971,7 +3975,7 @@ static int can_overcommit(struct btrfs_root *root,
 }
 
 static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
-					 unsigned long nr_pages)
+					 unsigned long nr_pages, int nr_items)
 {
 	struct super_block *sb = root->fs_info->sb;
 
@@ -3986,9 +3990,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
 		 * the filesystem is readonly(all dirty pages are written to
 		 * the disk).
 		 */
-		btrfs_start_delalloc_roots(root->fs_info, 0);
+		btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
 		if (!current->journal_info)
-			btrfs_wait_ordered_roots(root->fs_info, -1);
+			btrfs_wait_ordered_roots(root->fs_info, nr_items);
 	}
 }
 
@@ -4045,7 +4049,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	while (delalloc_bytes && loops < 3) {
 		max_reclaim = min(delalloc_bytes, to_reclaim);
 		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
-		btrfs_writeback_inodes_sb_nr(root, nr_pages);
+		btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
 		/*
 		 * We need to wait for the async pages to actually start before
 		 * we do anything.
@@ -4112,13 +4116,9 @@ static int may_commit_transaction(struct btrfs_root *root,
 		goto commit;
 
 	/* See if there is enough pinned space to make this reservation */
-	spin_lock(&space_info->lock);
 	if (percpu_counter_compare(&space_info->total_bytes_pinned,
-				   bytes) >= 0) {
-		spin_unlock(&space_info->lock);
+				   bytes) >= 0)
 		goto commit;
-	}
-	spin_unlock(&space_info->lock);
 
 	/*
 	 * See if there is some space in the delayed insertion reservation for
@@ -4127,16 +4127,13 @@ static int may_commit_transaction(struct btrfs_root *root,
 	if (space_info != delayed_rsv->space_info)
 		return -ENOSPC;
 
-	spin_lock(&space_info->lock);
 	spin_lock(&delayed_rsv->lock);
 	if (percpu_counter_compare(&space_info->total_bytes_pinned,
 				   bytes - delayed_rsv->size) >= 0) {
 		spin_unlock(&delayed_rsv->lock);
-		spin_unlock(&space_info->lock);
 		return -ENOSPC;
 	}
 	spin_unlock(&delayed_rsv->lock);
-	spin_unlock(&space_info->lock);
 
 commit:
 	trans = btrfs_join_transaction(root);
@@ -4181,7 +4178,7 @@ static int flush_space(struct btrfs_root *root,
 		break;
 	case FLUSH_DELALLOC:
 	case FLUSH_DELALLOC_WAIT:
-		shrink_delalloc(root, num_bytes, orig_bytes,
+		shrink_delalloc(root, num_bytes * 2, orig_bytes,
 				state == FLUSH_DELALLOC_WAIT);
 		break;
 	case ALLOC_CHUNK:
@@ -5477,7 +5474,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_space_info *space_info;
 
-	down_write(&fs_info->extent_commit_sem);
+	down_write(&fs_info->commit_root_sem);
 
 	list_for_each_entry_safe(caching_ctl, next,
 				 &fs_info->caching_block_groups, list) {
@@ -5496,7 +5493,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 	else
 		fs_info->pinned_extents = &fs_info->freed_extents[0];
 
-	up_write(&fs_info->extent_commit_sem);
+	up_write(&fs_info->commit_root_sem);
 
 	list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
 		percpu_counter_set(&space_info->total_bytes_pinned, 0);
@@ -5725,6 +5722,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 
 	if (ret > 0 && skinny_metadata) {
 		skinny_metadata = false;
+		key.objectid = bytenr;
 		key.type = BTRFS_EXTENT_ITEM_KEY;
 		key.offset = num_bytes;
 		btrfs_release_path(path);
@@ -5751,6 +5749,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5751 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", 5749 "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
5752 bytenr, parent, root_objectid, owner_objectid, 5750 bytenr, parent, root_objectid, owner_objectid,
5753 owner_offset); 5751 owner_offset);
5752 btrfs_abort_transaction(trans, extent_root, ret);
5753 goto out;
5754 } else { 5754 } else {
5755 btrfs_abort_transaction(trans, extent_root, ret); 5755 btrfs_abort_transaction(trans, extent_root, ret);
5756 goto out; 5756 goto out;
@@ -8262,14 +8262,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	struct btrfs_caching_control *caching_ctl;
 	struct rb_node *n;
 
-	down_write(&info->extent_commit_sem);
+	down_write(&info->commit_root_sem);
 	while (!list_empty(&info->caching_block_groups)) {
 		caching_ctl = list_entry(info->caching_block_groups.next,
 					 struct btrfs_caching_control, list);
 		list_del(&caching_ctl->list);
 		put_caching_control(caching_ctl);
 	}
-	up_write(&info->extent_commit_sem);
+	up_write(&info->commit_root_sem);
 
 	spin_lock(&info->block_group_cache_lock);
 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
@@ -8343,9 +8343,15 @@ static void __link_block_group(struct btrfs_space_info *space_info,
 				      struct btrfs_block_group_cache *cache)
 {
 	int index = get_block_group_index(cache);
+	bool first = false;
 
 	down_write(&space_info->groups_sem);
-	if (list_empty(&space_info->block_groups[index])) {
+	if (list_empty(&space_info->block_groups[index]))
+		first = true;
+	list_add_tail(&cache->list, &space_info->block_groups[index]);
+	up_write(&space_info->groups_sem);
+
+	if (first) {
 		struct kobject *kobj = &space_info->block_group_kobjs[index];
 		int ret;
 
@@ -8357,8 +8363,6 @@ static void __link_block_group(struct btrfs_space_info *space_info,
 			kobject_put(&space_info->kobj);
 		}
 	}
-	list_add_tail(&cache->list, &space_info->block_groups[index]);
-	up_write(&space_info->groups_sem);
 }
 
 static struct btrfs_block_group_cache *
@@ -8938,3 +8942,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 	range->len = trimmed;
 	return ret;
 }
+
+/*
+ * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
+ * they are used to prevent the some tasks writing data into the page cache
+ * by nocow before the subvolume is snapshoted, but flush the data into
+ * the disk after the snapshot creation.
+ */
+void btrfs_end_nocow_write(struct btrfs_root *root)
+{
+	percpu_counter_dec(&root->subv_writers->counter);
+	/*
+	 * Make sure counter is updated before we wake up
+	 * waiters.
+	 */
+	smp_mb();
+	if (waitqueue_active(&root->subv_writers->wait))
+		wake_up(&root->subv_writers->wait);
+}
+
+int btrfs_start_nocow_write(struct btrfs_root *root)
+{
+	if (unlikely(atomic_read(&root->will_be_snapshoted)))
+		return 0;
+
+	percpu_counter_inc(&root->subv_writers->counter);
+	/*
+	 * Make sure counter is updated before we check for snapshot creation.
+	 */
+	smp_mb();
+	if (unlikely(atomic_read(&root->will_be_snapshoted))) {
+		btrfs_end_nocow_write(root);
+		return 0;
+	}
+	return 1;
+}
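
A sketch of how a writer is expected to use this gate; this is illustrative only, though check_can_nocow() in fs/btrfs/file.c, changed later in this diff, follows exactly this shape:

static int example_nocow_writer(struct btrfs_root *root)
{
	if (!btrfs_start_nocow_write(root))
		return -ENOSPC;	/* snapshot pending: fall back to COW */

	/* ... nocow data into the page cache ... */

	btrfs_end_nocow_write(root);	/* unblock a waiting snapshot */
	return 0;
}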
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 85bbd01f1271..3955e475ceec 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -229,12 +229,14 @@ void free_extent_state(struct extent_state *state)
 	}
 }
 
-static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+static struct rb_node *tree_insert(struct rb_root *root,
+				   struct rb_node *search_start,
+				   u64 offset,
 				   struct rb_node *node,
 				   struct rb_node ***p_in,
 				   struct rb_node **parent_in)
 {
-	struct rb_node **p = &root->rb_node;
+	struct rb_node **p;
 	struct rb_node *parent = NULL;
 	struct tree_entry *entry;
 
@@ -244,6 +246,7 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 		goto do_insert;
 	}
 
+	p = search_start ? &search_start : &root->rb_node;
 	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct tree_entry, rb_node);
@@ -430,7 +433,7 @@ static int insert_state(struct extent_io_tree *tree,
 
 	set_state_bits(tree, state, bits);
 
-	node = tree_insert(&tree->state, end, &state->rb_node, p, parent);
+	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
 	if (node) {
 		struct extent_state *found;
 		found = rb_entry(node, struct extent_state, rb_node);
@@ -477,8 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 	prealloc->state = orig->state;
 	orig->start = split;
 
-	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node,
-			   NULL, NULL);
+	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
+			   &prealloc->rb_node, NULL, NULL);
 	if (node) {
 		free_extent_state(prealloc);
 		return -EEXIST;
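
The extra search_start parameter lets a caller that already holds a node adjacent to the insertion point start the rbtree descent there instead of at the root; `p = search_start ? &search_start : &root->rb_node` simply seeds the usual `while (*p)` walk. The two call sites, repeated here for contrast rather than as new code, show both modes:

	/* no hint: descend from the root, as before */
	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);

	/* split_state(): prealloc's slot is adjacent to orig, so start there */
	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
			   &prealloc->rb_node, NULL, NULL);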
@@ -746,6 +749,7 @@ again:
 	 * our range starts
 	 */
 	node = tree_search(tree, start);
+process_node:
 	if (!node)
 		break;
 
@@ -766,7 +770,10 @@ again:
 		if (start > end)
 			break;
 
-		cond_resched_lock(&tree->lock);
+		if (!cond_resched_lock(&tree->lock)) {
+			node = rb_next(node);
+			goto process_node;
+		}
 	}
 out:
 	spin_unlock(&tree->lock);
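
cond_resched_lock() returns nonzero only when it actually dropped the lock to reschedule, and the hunk above exploits that: if the lock was held throughout, `node` is still valid and a cheap rb_next() continues the walk; only when the lock was dropped does control fall back to the tree_search() at the top, since the tree may have changed in the meantime. In outline, as an illustration:

	if (!cond_resched_lock(&tree->lock)) {
		node = rb_next(node);	/* lock never dropped: node valid */
		goto process_node;
	}
	/* lock was dropped: loop around and re-search from 'start' */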
@@ -2757,7 +2764,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
 
 	if (em_cached && *em_cached) {
 		em = *em_cached;
-		if (em->in_tree && start >= em->start &&
+		if (extent_map_in_tree(em) && start >= em->start &&
 		    start < extent_map_end(em)) {
 			atomic_inc(&em->refs);
 			return em;
@@ -4303,7 +4310,7 @@ static void __free_extent_buffer(struct extent_buffer *eb)
 	kmem_cache_free(extent_buffer_cache, eb);
 }
 
-static int extent_buffer_under_io(struct extent_buffer *eb)
+int extent_buffer_under_io(struct extent_buffer *eb)
 {
 	return (atomic_read(&eb->io_pages) ||
 		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 58b27e5ab521..c488b45237bf 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -320,6 +320,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb);
 int set_extent_buffer_uptodate(struct extent_buffer *eb);
 int clear_extent_buffer_uptodate(struct extent_buffer *eb);
 int extent_buffer_uptodate(struct extent_buffer *eb);
+int extent_buffer_under_io(struct extent_buffer *eb);
 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
 			      unsigned long min_len, char **map,
 			      unsigned long *map_start,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 996ad56b57db..1874aee69c86 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -51,7 +51,7 @@ struct extent_map *alloc_extent_map(void)
 	em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
 	if (!em)
 		return NULL;
-	em->in_tree = 0;
+	RB_CLEAR_NODE(&em->rb_node);
 	em->flags = 0;
 	em->compress_type = BTRFS_COMPRESS_NONE;
 	em->generation = 0;
@@ -73,7 +73,7 @@ void free_extent_map(struct extent_map *em)
 		return;
 	WARN_ON(atomic_read(&em->refs) == 0);
 	if (atomic_dec_and_test(&em->refs)) {
-		WARN_ON(em->in_tree);
+		WARN_ON(extent_map_in_tree(em));
 		WARN_ON(!list_empty(&em->list));
 		kmem_cache_free(extent_map_cache, em);
 	}
@@ -99,8 +99,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
 		parent = *p;
 		entry = rb_entry(parent, struct extent_map, rb_node);
 
-		WARN_ON(!entry->in_tree);
-
 		if (em->start < entry->start)
 			p = &(*p)->rb_left;
 		else if (em->start >= extent_map_end(entry))
@@ -128,7 +126,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
 	if (end > entry->start && em->start < extent_map_end(entry))
 		return -EEXIST;
 
-	em->in_tree = 1;
 	rb_link_node(&em->rb_node, orig_parent, p);
 	rb_insert_color(&em->rb_node, root);
 	return 0;
@@ -153,8 +150,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 		prev = n;
 		prev_entry = entry;
 
-		WARN_ON(!entry->in_tree);
-
 		if (offset < entry->start)
 			n = n->rb_left;
 		else if (offset >= extent_map_end(entry))
@@ -240,12 +235,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 			em->len += merge->len;
 			em->block_len += merge->block_len;
 			em->block_start = merge->block_start;
-			merge->in_tree = 0;
 			em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
 			em->mod_start = merge->mod_start;
 			em->generation = max(em->generation, merge->generation);
 
 			rb_erase(&merge->rb_node, &tree->map);
+			RB_CLEAR_NODE(&merge->rb_node);
 			free_extent_map(merge);
 		}
 	}
@@ -257,7 +252,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 		em->len += merge->len;
 		em->block_len += merge->block_len;
 		rb_erase(&merge->rb_node, &tree->map);
-		merge->in_tree = 0;
+		RB_CLEAR_NODE(&merge->rb_node);
 		em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
 		em->generation = max(em->generation, merge->generation);
 		free_extent_map(merge);
@@ -319,7 +314,21 @@ out:
 void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
 {
 	clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
-	if (em->in_tree)
+	if (extent_map_in_tree(em))
+		try_merge_map(tree, em);
+}
+
+static inline void setup_extent_mapping(struct extent_map_tree *tree,
+					struct extent_map *em,
+					int modified)
+{
+	atomic_inc(&em->refs);
+	em->mod_start = em->start;
+	em->mod_len = em->len;
+
+	if (modified)
+		list_move(&em->list, &tree->modified_extents);
+	else
 		try_merge_map(tree, em);
 }
 
@@ -342,15 +351,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
 	if (ret)
 		goto out;
 
-	atomic_inc(&em->refs);
-
-	em->mod_start = em->start;
-	em->mod_len = em->len;
-
-	if (modified)
-		list_move(&em->list, &tree->modified_extents);
-	else
-		try_merge_map(tree, em);
+	setup_extent_mapping(tree, em, modified);
 out:
 	return ret;
 }
@@ -434,6 +435,21 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 	rb_erase(&em->rb_node, &tree->map);
 	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
 		list_del_init(&em->list);
-	em->in_tree = 0;
+	RB_CLEAR_NODE(&em->rb_node);
 	return ret;
 }
+
+void replace_extent_mapping(struct extent_map_tree *tree,
+			    struct extent_map *cur,
+			    struct extent_map *new,
+			    int modified)
+{
+	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
+	ASSERT(extent_map_in_tree(cur));
+	if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
+		list_del_init(&cur->list);
+	rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
+	RB_CLEAR_NODE(&cur->rb_node);
+
+	setup_extent_mapping(tree, new, modified);
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 93fba716d7f8..e7fd8a56a140 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -33,7 +33,6 @@ struct extent_map {
 	unsigned long flags;
 	struct block_device *bdev;
 	atomic_t refs;
-	unsigned int in_tree;
 	unsigned int compress_type;
 	struct list_head list;
 };
@@ -44,6 +43,11 @@ struct extent_map_tree {
 	rwlock_t lock;
 };
 
+static inline int extent_map_in_tree(const struct extent_map *em)
+{
+	return !RB_EMPTY_NODE(&em->rb_node);
+}
+
 static inline u64 extent_map_end(struct extent_map *em)
 {
 	if (em->start + em->len < em->start)
@@ -64,6 +68,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 int add_extent_mapping(struct extent_map_tree *tree,
 		       struct extent_map *em, int modified);
 int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void replace_extent_mapping(struct extent_map_tree *tree,
+			    struct extent_map *cur,
+			    struct extent_map *new,
+			    int modified);
 
 struct extent_map *alloc_extent_map(void);
 void free_extent_map(struct extent_map *em);
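
extent_map_in_tree() replaces the hand-maintained in_tree flag with the rbtree's own bookkeeping: a node cleared with RB_CLEAR_NODE() reports true from RB_EMPTY_NODE() until rb_link_node()/rb_insert_color() links it in. A small usage sketch, illustrative only and assuming the caller write-holds tree->lock as the real callers do:

static void example(struct extent_map_tree *tree)
{
	struct extent_map *em = alloc_extent_map();

	if (!em)
		return;
	/* freshly allocated: rb_node was RB_CLEAR_NODE'd */
	ASSERT(!extent_map_in_tree(em));

	if (add_extent_mapping(tree, em, 0) == 0)
		ASSERT(extent_map_in_tree(em));	/* now linked */

	free_extent_map(em);	/* drop our reference */
}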
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0165b8672f09..ae6af072b635 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -425,13 +425,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 		struct page *page = prepared_pages[pg];
 		/*
 		 * Copy data from userspace to the current page
-		 *
-		 * Disable pagefault to avoid recursive lock since
-		 * the pages are already locked
 		 */
-		pagefault_disable();
 		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
-		pagefault_enable();
 
 		/* Flush processor's dcache for this page */
 		flush_dcache_page(page);
@@ -591,7 +586,6 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 		clear_bit(EXTENT_FLAG_LOGGING, &flags);
 		modified = !list_empty(&em->list);
-		remove_extent_mapping(em_tree, em);
 		if (no_splits)
 			goto next;
 
@@ -622,8 +616,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			split->bdev = em->bdev;
 			split->flags = flags;
 			split->compress_type = em->compress_type;
-			ret = add_extent_mapping(em_tree, split, modified);
-			BUG_ON(ret); /* Logic error */
+			replace_extent_mapping(em_tree, em, split, modified);
 			free_extent_map(split);
 			split = split2;
 			split2 = NULL;
@@ -661,12 +654,20 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 				split->orig_block_len = 0;
 			}
 
-			ret = add_extent_mapping(em_tree, split, modified);
-			BUG_ON(ret); /* Logic error */
+			if (extent_map_in_tree(em)) {
+				replace_extent_mapping(em_tree, em, split,
+						       modified);
+			} else {
+				ret = add_extent_mapping(em_tree, split,
+							 modified);
+				ASSERT(ret == 0); /* Logic error */
+			}
 			free_extent_map(split);
 			split = NULL;
 		}
 next:
+		if (extent_map_in_tree(em))
+			remove_extent_mapping(em_tree, em);
 		write_unlock(&em_tree->lock);
 
 		/* once for us */
@@ -720,7 +721,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	if (drop_cache)
 		btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
-	if (start >= BTRFS_I(inode)->disk_i_size)
+	if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
 		modify_tree = 0;
 
 	while (1) {
@@ -798,7 +799,10 @@ next_slot:
 		 */
 		if (start > key.offset && end < extent_end) {
 			BUG_ON(del_nr > 0);
-			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
 
 			memcpy(&new_key, &key, sizeof(new_key));
 			new_key.offset = start;
@@ -841,7 +845,10 @@ next_slot:
 		 * | -------- extent -------- |
 		 */
 		if (start <= key.offset && end < extent_end) {
-			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
 
 			memcpy(&new_key, &key, sizeof(new_key));
 			new_key.offset = end;
@@ -864,7 +871,10 @@ next_slot:
 		 */
 		if (start > key.offset && end >= extent_end) {
 			BUG_ON(del_nr > 0);
-			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
 
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							start - key.offset);
@@ -938,34 +948,42 @@ next_slot:
 		 * Set path->slots[0] to first slot, so that after the delete
 		 * if items are move off from our leaf to its immediate left or
 		 * right neighbor leafs, we end up with a correct and adjusted
-		 * path->slots[0] for our insertion.
+		 * path->slots[0] for our insertion (if replace_extent != 0).
 		 */
 		path->slots[0] = del_slot;
 		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 		if (ret)
 			btrfs_abort_transaction(trans, root, ret);
+	}
 
 	leaf = path->nodes[0];
 	/*
-	 * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that
-	 * is, its contents got pushed to its neighbors), in which case
-	 * it means path->locks[0] == 0
+	 * If btrfs_del_items() was called, it might have deleted a leaf, in
+	 * which case it unlocked our path, so check path->locks[0] matches a
+	 * write lock.
 	 */
 	if (!ret && replace_extent && leafs_visited == 1 &&
-	    path->locks[0] &&
-	    btrfs_leaf_free_space(root, leaf) >=
-	    sizeof(struct btrfs_item) + extent_item_size) {
-
-		key.objectid = ino;
-		key.type = BTRFS_EXTENT_DATA_KEY;
-		key.offset = start;
-		setup_items_for_insert(root, path, &key,
-				       &extent_item_size,
-				       extent_item_size,
-				       sizeof(struct btrfs_item) +
-				       extent_item_size, 1);
-		*key_inserted = 1;
+	    (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
+	     path->locks[0] == BTRFS_WRITE_LOCK) &&
+	    btrfs_leaf_free_space(root, leaf) >=
+	    sizeof(struct btrfs_item) + extent_item_size) {
+
+		key.objectid = ino;
+		key.type = BTRFS_EXTENT_DATA_KEY;
+		key.offset = start;
+		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
+			struct btrfs_key slot_key;
+
+			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
+			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
+				path->slots[0]++;
+		}
+		setup_items_for_insert(root, path, &key,
+				       &extent_item_size,
+				       extent_item_size,
+				       sizeof(struct btrfs_item) +
+				       extent_item_size, 1);
+		*key_inserted = 1;
 	}
 
 	if (!replace_extent || !(*key_inserted))
@@ -1346,11 +1364,11 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
 		struct btrfs_ordered_extent *ordered;
 		lock_extent_bits(&BTRFS_I(inode)->io_tree,
 				 start_pos, last_pos, 0, cached_state);
-		ordered = btrfs_lookup_first_ordered_extent(inode, last_pos);
+		ordered = btrfs_lookup_ordered_range(inode, start_pos,
+						     last_pos - start_pos + 1);
 		if (ordered &&
 		    ordered->file_offset + ordered->len > start_pos &&
 		    ordered->file_offset <= last_pos) {
-			btrfs_put_ordered_extent(ordered);
 			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
 					     start_pos, last_pos,
 					     cached_state, GFP_NOFS);
@@ -1358,12 +1376,9 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
 				unlock_page(pages[i]);
 				page_cache_release(pages[i]);
 			}
-			ret = btrfs_wait_ordered_range(inode, start_pos,
-						       last_pos - start_pos + 1);
-			if (ret)
-				return ret;
-			else
-				return -EAGAIN;
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+			return -EAGAIN;
 		}
 		if (ordered)
 			btrfs_put_ordered_extent(ordered);
@@ -1396,8 +1411,12 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
 	u64 num_bytes;
 	int ret;
 
+	ret = btrfs_start_nocow_write(root);
+	if (!ret)
+		return -ENOSPC;
+
 	lockstart = round_down(pos, root->sectorsize);
-	lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
+	lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
 
 	while (1) {
 		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1415,12 +1434,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
 	ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
 	if (ret <= 0) {
 		ret = 0;
+		btrfs_end_nocow_write(root);
 	} else {
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 EXTENT_DIRTY | EXTENT_DELALLOC |
-				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
-				 NULL, GFP_NOFS);
-		*write_bytes = min_t(size_t, *write_bytes, num_bytes);
+		*write_bytes = min_t(size_t, *write_bytes ,
+				     num_bytes - pos + lockstart);
 	}
 
 	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1510,6 +1527,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 			if (!only_release_metadata)
 				btrfs_free_reserved_data_space(inode,
 							       reserve_bytes);
+			else
+				btrfs_end_nocow_write(root);
 			break;
 		}
 
@@ -1598,6 +1617,9 @@ again:
 		}
 
 		release_bytes = 0;
+		if (only_release_metadata)
+			btrfs_end_nocow_write(root);
+
 		if (only_release_metadata && copied > 0) {
 			u64 lockstart = round_down(pos, root->sectorsize);
 			u64 lockend = lockstart +
@@ -1624,10 +1646,12 @@ again:
 	kfree(pages);
 
 	if (release_bytes) {
-		if (only_release_metadata)
+		if (only_release_metadata) {
+			btrfs_end_nocow_write(root);
 			btrfs_delalloc_release_metadata(inode, release_bytes);
-		else
+		} else {
 			btrfs_delalloc_release_space(inode, release_bytes);
+		}
 	}
 
 	return num_written ? num_written : ret;
@@ -1636,7 +1660,7 @@ again:
 static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 				    const struct iovec *iov,
 				    unsigned long nr_segs, loff_t pos,
-				    loff_t *ppos, size_t count, size_t ocount)
+				    size_t count, size_t ocount)
 {
 	struct file *file = iocb->ki_filp;
 	struct iov_iter i;
@@ -1645,7 +1669,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 	loff_t endbyte;
 	int err;
 
-	written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
+	written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
 					    count, ocount);
 
 	if (written < 0 || written == count)
@@ -1664,7 +1688,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 	if (err)
 		goto out;
 	written += written_buffered;
-	*ppos = pos + written_buffered;
+	iocb->ki_pos = pos + written_buffered;
 	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
 				 endbyte >> PAGE_CACHE_SHIFT);
 out:
@@ -1696,8 +1720,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	loff_t *ppos = &iocb->ki_pos;
 	u64 start_pos;
+	u64 end_pos;
 	ssize_t num_written = 0;
 	ssize_t err = 0;
 	size_t count, ocount;
@@ -1752,7 +1776,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
 	start_pos = round_down(pos, root->sectorsize);
 	if (start_pos > i_size_read(inode)) {
-		err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
+		/* Expand hole size to cover write data, preventing empty gap */
+		end_pos = round_up(pos + count, root->sectorsize);
+		err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
 		if (err) {
 			mutex_unlock(&inode->i_mutex);
 			goto out;
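
To see what the rounded-up end buys, take a hypothetical file with sectorsize 4096 and i_size 4096, and a 100-byte write at offset 10000: start_pos = round_down(10000, 4096) = 8192 lies beyond i_size, so the hole must be expanded. Expanding only to start_pos left [8192, 10000) between the expanded hole and the first written byte as an empty gap, whereas end_pos = round_up(10000 + 100, 4096) = 12288 makes the expansion cover the whole write range before the data lands.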
@@ -1764,7 +1790,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
 	if (unlikely(file->f_flags & O_DIRECT)) {
 		num_written = __btrfs_direct_write(iocb, iov, nr_segs,
-						   pos, ppos, count, ocount);
+						   pos, count, ocount);
 	} else {
 		struct iov_iter i;
 
@@ -1772,7 +1798,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
 		num_written = __btrfs_buffered_write(file, &i, pos);
 		if (num_written > 0)
-			*ppos = pos + num_written;
+			iocb->ki_pos = pos + num_written;
 	}
 
 	mutex_unlock(&inode->i_mutex);
@@ -1797,7 +1823,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	BTRFS_I(inode)->last_sub_trans = root->log_transid;
 	if (num_written > 0) {
 		err = generic_write_sync(file, pos, num_written);
-		if (err < 0 && num_written > 0)
+		if (err < 0)
 			num_written = err;
 	}
 
@@ -1856,8 +1882,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret = 0;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_log_ctx ctx;
+	int ret = 0;
 	bool full_sync = 0;
 
 	trace_btrfs_sync_file(file, datasync);
@@ -1951,7 +1978,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	}
 	trans->sync = true;
 
-	ret = btrfs_log_dentry_safe(trans, root, dentry);
+	btrfs_init_log_ctx(&ctx);
+
+	ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
 	if (ret < 0) {
 		/* Fallthrough and commit/free transaction. */
 		ret = 1;
@@ -1971,7 +2000,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1971 2000
1972 if (ret != BTRFS_NO_LOG_SYNC) { 2001 if (ret != BTRFS_NO_LOG_SYNC) {
1973 if (!ret) { 2002 if (!ret) {
1974 ret = btrfs_sync_log(trans, root); 2003 ret = btrfs_sync_log(trans, root, &ctx);
1975 if (!ret) { 2004 if (!ret) {
1976 ret = btrfs_end_transaction(trans, root); 2005 ret = btrfs_end_transaction(trans, root);
1977 goto out; 2006 goto out;
@@ -1993,6 +2022,7 @@ out:
1993 2022
1994static const struct vm_operations_struct btrfs_file_vm_ops = { 2023static const struct vm_operations_struct btrfs_file_vm_ops = {
1995 .fault = filemap_fault, 2024 .fault = filemap_fault,
2025 .map_pages = filemap_map_pages,
1996 .page_mkwrite = btrfs_page_mkwrite, 2026 .page_mkwrite = btrfs_page_mkwrite,
1997 .remap_pages = generic_file_remap_pages, 2027 .remap_pages = generic_file_remap_pages,
1998}; 2028};
@@ -2157,6 +2187,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2157 bool same_page = ((offset >> PAGE_CACHE_SHIFT) == 2187 bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
2158 ((offset + len - 1) >> PAGE_CACHE_SHIFT)); 2188 ((offset + len - 1) >> PAGE_CACHE_SHIFT));
2159 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); 2189 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2190 u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
2160 2191
2161 ret = btrfs_wait_ordered_range(inode, offset, len); 2192 ret = btrfs_wait_ordered_range(inode, offset, len);
2162 if (ret) 2193 if (ret)
@@ -2172,14 +2203,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2172 * entire page. 2203 * entire page.
2173 */ 2204 */
2174 if (same_page && len < PAGE_CACHE_SIZE) { 2205 if (same_page && len < PAGE_CACHE_SIZE) {
2175 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) 2206 if (offset < ino_size)
2176 ret = btrfs_truncate_page(inode, offset, len, 0); 2207 ret = btrfs_truncate_page(inode, offset, len, 0);
2177 mutex_unlock(&inode->i_mutex); 2208 mutex_unlock(&inode->i_mutex);
2178 return ret; 2209 return ret;
2179 } 2210 }
2180 2211
2181 /* zero back part of the first page */ 2212 /* zero back part of the first page */
2182 if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 2213 if (offset < ino_size) {
2183 ret = btrfs_truncate_page(inode, offset, 0, 0); 2214 ret = btrfs_truncate_page(inode, offset, 0, 0);
2184 if (ret) { 2215 if (ret) {
2185 mutex_unlock(&inode->i_mutex); 2216 mutex_unlock(&inode->i_mutex);
@@ -2188,7 +2219,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2188 } 2219 }
2189 2220
2190 /* zero the front end of the last page */ 2221 /* zero the front end of the last page */
2191 if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { 2222 if (offset + len < ino_size) {
2192 ret = btrfs_truncate_page(inode, offset + len, 0, 1); 2223 ret = btrfs_truncate_page(inode, offset + len, 0, 1);
2193 if (ret) { 2224 if (ret) {
2194 mutex_unlock(&inode->i_mutex); 2225 mutex_unlock(&inode->i_mutex);
@@ -2277,10 +2308,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2277 2308
2278 trans->block_rsv = &root->fs_info->trans_block_rsv; 2309 trans->block_rsv = &root->fs_info->trans_block_rsv;
2279 2310
2280 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2311 if (cur_offset < ino_size) {
2281 if (ret) { 2312 ret = fill_holes(trans, inode, path, cur_offset,
2282 err = ret; 2313 drop_end);
2283 break; 2314 if (ret) {
2315 err = ret;
2316 break;
2317 }
2284 } 2318 }
2285 2319
2286 cur_offset = drop_end; 2320 cur_offset = drop_end;
@@ -2313,10 +2347,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2313 } 2347 }
2314 2348
2315 trans->block_rsv = &root->fs_info->trans_block_rsv; 2349 trans->block_rsv = &root->fs_info->trans_block_rsv;
2316 ret = fill_holes(trans, inode, path, cur_offset, drop_end); 2350 if (cur_offset < ino_size) {
2317 if (ret) { 2351 ret = fill_holes(trans, inode, path, cur_offset, drop_end);
2318 err = ret; 2352 if (ret) {
2319 goto out_trans; 2353 err = ret;
2354 goto out_trans;
2355 }
2320 } 2356 }
2321 2357
2322out_trans: 2358out_trans:
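
In btrfs_punch_hole the repeated round_up(inode->i_size, PAGE_CACHE_SIZE) is
hoisted into ino_size, and the fill_holes calls are now skipped once
cur_offset passes it, since there is nothing to fill beyond the last page
backed by i_size. A small sketch of the partial-page boundary checks with
illustrative values (PAGE_SIZE standing in for PAGE_CACHE_SIZE):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL
#define round_up(x, y) (((x) + (y) - 1) & ~((y) - 1))

int main(void)
{
	uint64_t i_size = 10000, offset = 500, len = 2000;
	uint64_t ino_size = round_up(i_size, PAGE_SIZE);
	int same_page = (offset / PAGE_SIZE) == ((offset + len - 1) / PAGE_SIZE);

	/* Zeroing past the last page backed by i_size is pointless. */
	printf("same_page=%d\n", same_page);
	printf("zero head page: %s\n", offset < ino_size ? "yes" : "no");
	printf("zero tail page: %s\n", offset + len < ino_size ? "yes" : "no");
	return 0;
}
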
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index ab485e57b6fe..86935f5ae291 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -55,7 +55,7 @@ static int caching_kthread(void *data)
55 key.type = BTRFS_INODE_ITEM_KEY; 55 key.type = BTRFS_INODE_ITEM_KEY;
56again: 56again:
57 /* need to make sure the commit_root doesn't disappear */ 57 /* need to make sure the commit_root doesn't disappear */
58 mutex_lock(&root->fs_commit_mutex); 58 down_read(&fs_info->commit_root_sem);
59 59
60 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 60 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
61 if (ret < 0) 61 if (ret < 0)
@@ -88,7 +88,7 @@ again:
88 btrfs_item_key_to_cpu(leaf, &key, 0); 88 btrfs_item_key_to_cpu(leaf, &key, 0);
89 btrfs_release_path(path); 89 btrfs_release_path(path);
90 root->cache_progress = last; 90 root->cache_progress = last;
91 mutex_unlock(&root->fs_commit_mutex); 91 up_read(&fs_info->commit_root_sem);
92 schedule_timeout(1); 92 schedule_timeout(1);
93 goto again; 93 goto again;
94 } else 94 } else
@@ -127,7 +127,7 @@ next:
127 btrfs_unpin_free_ino(root); 127 btrfs_unpin_free_ino(root);
128out: 128out:
129 wake_up(&root->cache_wait); 129 wake_up(&root->cache_wait);
130 mutex_unlock(&root->fs_commit_mutex); 130 up_read(&fs_info->commit_root_sem);
131 131
132 btrfs_free_path(path); 132 btrfs_free_path(path);
133 133
@@ -176,7 +176,11 @@ static void start_caching(struct btrfs_root *root)
176 176
177 tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", 177 tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n",
178 root->root_key.objectid); 178 root->root_key.objectid);
179 BUG_ON(IS_ERR(tsk)); /* -ENOMEM */ 179 if (IS_ERR(tsk)) {
180 btrfs_warn(root->fs_info, "failed to start inode caching task");
181 btrfs_clear_and_info(root, CHANGE_INODE_CACHE,
182 "disabling inode map caching");
183 }
180} 184}
181 185
182int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) 186int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
@@ -205,42 +209,28 @@ again:
205 209
206void btrfs_return_ino(struct btrfs_root *root, u64 objectid) 210void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
207{ 211{
208 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
209 struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; 212 struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
210 213
211 if (!btrfs_test_opt(root, INODE_MAP_CACHE)) 214 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
212 return; 215 return;
213
214again: 216again:
215 if (root->cached == BTRFS_CACHE_FINISHED) { 217 if (root->cached == BTRFS_CACHE_FINISHED) {
216 __btrfs_add_free_space(ctl, objectid, 1); 218 __btrfs_add_free_space(pinned, objectid, 1);
217 } else { 219 } else {
218 /* 220 down_write(&root->fs_info->commit_root_sem);
219 * If we are in the process of caching free ino chunks,
220 * to avoid adding the same inode number to the free_ino
221 * tree twice due to cross transaction, we'll leave it
222 * in the pinned tree until a transaction is committed
223 * or the caching work is done.
224 */
225
226 mutex_lock(&root->fs_commit_mutex);
227 spin_lock(&root->cache_lock); 221 spin_lock(&root->cache_lock);
228 if (root->cached == BTRFS_CACHE_FINISHED) { 222 if (root->cached == BTRFS_CACHE_FINISHED) {
229 spin_unlock(&root->cache_lock); 223 spin_unlock(&root->cache_lock);
230 mutex_unlock(&root->fs_commit_mutex); 224 up_write(&root->fs_info->commit_root_sem);
231 goto again; 225 goto again;
232 } 226 }
233 spin_unlock(&root->cache_lock); 227 spin_unlock(&root->cache_lock);
234 228
235 start_caching(root); 229 start_caching(root);
236 230
237 if (objectid <= root->cache_progress || 231 __btrfs_add_free_space(pinned, objectid, 1);
238 objectid >= root->highest_objectid)
239 __btrfs_add_free_space(ctl, objectid, 1);
240 else
241 __btrfs_add_free_space(pinned, objectid, 1);
242 232
243 mutex_unlock(&root->fs_commit_mutex); 233 up_write(&root->fs_info->commit_root_sem);
244 } 234 }
245} 235}
246 236
@@ -250,7 +240,7 @@ again:
250 * and others will just be dropped, because the commit root we were 240 * and others will just be dropped, because the commit root we were
251 * searching has changed. 241 * searching has changed.
252 * 242 *
253 * Must be called with root->fs_commit_mutex held 243 * Must be called with root->fs_info->commit_root_sem held
254 */ 244 */
255void btrfs_unpin_free_ino(struct btrfs_root *root) 245void btrfs_unpin_free_ino(struct btrfs_root *root)
256{ 246{
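
The inode-map changes retire the per-root fs_commit_mutex in favor of the
shared commit_root_sem: the caching kthread only needs the commit root held
stable, so it takes the semaphore for reading, while btrfs_return_ino excludes
the cachers by taking it for writing. A pthread sketch of that reader/writer
split (the names are illustrative, not the kernel API):

#include <pthread.h>

static pthread_rwlock_t commit_root_sem = PTHREAD_RWLOCK_INITIALIZER;

/* Several cachers may scan the commit root concurrently. */
static void *caching_thread(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&commit_root_sem);
	/* ... walk the commit root, as caching_kthread() does ... */
	pthread_rwlock_unlock(&commit_root_sem);
	return NULL;
}

/* Pinning a freed inode number excludes all readers. */
static void return_ino(void)
{
	pthread_rwlock_wrlock(&commit_root_sem);
	/* ... add the objectid to the pinned tree ... */
	pthread_rwlock_unlock(&commit_root_sem);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, caching_thread, NULL);
	return_ino();
	pthread_join(t, NULL);
	return 0;
}
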
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d3d44486290b..5f805bc944fa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -394,6 +394,14 @@ static noinline int compress_file_range(struct inode *inode,
394 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) 394 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
395 btrfs_add_inode_defrag(NULL, inode); 395 btrfs_add_inode_defrag(NULL, inode);
396 396
397 /*
 398 * skip compression for a small file range (<= blocksize) that
 399 * isn't an inline extent, since it doesn't save disk space at all.
400 */
401 if ((end - start + 1) <= blocksize &&
402 (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
403 goto cleanup_and_bail_uncompressed;
404
397 actual_end = min_t(u64, isize, end + 1); 405 actual_end = min_t(u64, isize, end + 1);
398again: 406again:
399 will_compress = 0; 407 will_compress = 0;
@@ -864,7 +872,8 @@ static noinline int cow_file_range(struct inode *inode,
864 872
865 if (btrfs_is_free_space_inode(inode)) { 873 if (btrfs_is_free_space_inode(inode)) {
866 WARN_ON_ONCE(1); 874 WARN_ON_ONCE(1);
867 return -EINVAL; 875 ret = -EINVAL;
876 goto out_unlock;
868 } 877 }
869 878
870 num_bytes = ALIGN(end - start + 1, blocksize); 879 num_bytes = ALIGN(end - start + 1, blocksize);
@@ -1075,17 +1084,15 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1075 async_cow->end = cur_end; 1084 async_cow->end = cur_end;
1076 INIT_LIST_HEAD(&async_cow->extents); 1085 INIT_LIST_HEAD(&async_cow->extents);
1077 1086
1078 async_cow->work.func = async_cow_start; 1087 btrfs_init_work(&async_cow->work, async_cow_start,
1079 async_cow->work.ordered_func = async_cow_submit; 1088 async_cow_submit, async_cow_free);
1080 async_cow->work.ordered_free = async_cow_free;
1081 async_cow->work.flags = 0;
1082 1089
1083 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> 1090 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
1084 PAGE_CACHE_SHIFT; 1091 PAGE_CACHE_SHIFT;
1085 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); 1092 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
1086 1093
1087 btrfs_queue_worker(&root->fs_info->delalloc_workers, 1094 btrfs_queue_work(root->fs_info->delalloc_workers,
1088 &async_cow->work); 1095 &async_cow->work);
1089 1096
1090 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { 1097 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
1091 wait_event(root->fs_info->async_submit_wait, 1098 wait_event(root->fs_info->async_submit_wait,
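
Several hunks in this series replace open-coded assignments to work.func,
work.ordered_func, work.ordered_free and work.flags with a single
btrfs_init_work() call, and btrfs_queue_worker() with btrfs_queue_work(). A
sketch of the helper's likely shape, inferred purely from these call sites
(the real definition lives in the reworked async-thread.{c,h}):

/* Sketch only; field names are inferred from the call sites above. */
struct btrfs_work;
typedef void (*btrfs_work_func_t)(struct btrfs_work *work);

struct btrfs_work {
	btrfs_work_func_t func;          /* the main work */
	btrfs_work_func_t ordered_func;  /* runs in queue order, may be NULL */
	btrfs_work_func_t ordered_free;  /* frees the item, may be NULL */
	unsigned long flags;
};

static inline void btrfs_init_work(struct btrfs_work *work,
				   btrfs_work_func_t func,
				   btrfs_work_func_t ordered_func,
				   btrfs_work_func_t ordered_free)
{
	work->func = func;
	work->ordered_func = ordered_func;
	work->ordered_free = ordered_free;
	work->flags = 0;
}
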
@@ -1272,6 +1279,15 @@ next_slot:
1272 disk_bytenr += cur_offset - found_key.offset; 1279 disk_bytenr += cur_offset - found_key.offset;
1273 num_bytes = min(end + 1, extent_end) - cur_offset; 1280 num_bytes = min(end + 1, extent_end) - cur_offset;
1274 /* 1281 /*
1282 * if there are pending snapshots for this root,
 1283 * we fall back to the common COW path.
1284 */
1285 if (!nolock) {
1286 err = btrfs_start_nocow_write(root);
1287 if (!err)
1288 goto out_check;
1289 }
1290 /*
1275 * force cow if csum exists in the range. 1291 * force cow if csum exists in the range.
 1276 * this ensures that csums for a given extent are 1292 * this ensures that csums for a given extent are
1277 * either valid or do not exist. 1293 * either valid or do not exist.
@@ -1290,6 +1306,8 @@ next_slot:
1290out_check: 1306out_check:
1291 if (extent_end <= start) { 1307 if (extent_end <= start) {
1292 path->slots[0]++; 1308 path->slots[0]++;
1309 if (!nolock && nocow)
1310 btrfs_end_nocow_write(root);
1293 goto next_slot; 1311 goto next_slot;
1294 } 1312 }
1295 if (!nocow) { 1313 if (!nocow) {
@@ -1307,8 +1325,11 @@ out_check:
1307 ret = cow_file_range(inode, locked_page, 1325 ret = cow_file_range(inode, locked_page,
1308 cow_start, found_key.offset - 1, 1326 cow_start, found_key.offset - 1,
1309 page_started, nr_written, 1); 1327 page_started, nr_written, 1);
1310 if (ret) 1328 if (ret) {
1329 if (!nolock && nocow)
1330 btrfs_end_nocow_write(root);
1311 goto error; 1331 goto error;
1332 }
1312 cow_start = (u64)-1; 1333 cow_start = (u64)-1;
1313 } 1334 }
1314 1335
@@ -1355,8 +1376,11 @@ out_check:
1355 BTRFS_DATA_RELOC_TREE_OBJECTID) { 1376 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1356 ret = btrfs_reloc_clone_csums(inode, cur_offset, 1377 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1357 num_bytes); 1378 num_bytes);
1358 if (ret) 1379 if (ret) {
1380 if (!nolock && nocow)
1381 btrfs_end_nocow_write(root);
1359 goto error; 1382 goto error;
1383 }
1360 } 1384 }
1361 1385
1362 extent_clear_unlock_delalloc(inode, cur_offset, 1386 extent_clear_unlock_delalloc(inode, cur_offset,
@@ -1364,6 +1388,8 @@ out_check:
1364 locked_page, EXTENT_LOCKED | 1388 locked_page, EXTENT_LOCKED |
1365 EXTENT_DELALLOC, PAGE_UNLOCK | 1389 EXTENT_DELALLOC, PAGE_UNLOCK |
1366 PAGE_SET_PRIVATE2); 1390 PAGE_SET_PRIVATE2);
1391 if (!nolock && nocow)
1392 btrfs_end_nocow_write(root);
1367 cur_offset = extent_end; 1393 cur_offset = extent_end;
1368 if (cur_offset > end) 1394 if (cur_offset > end)
1369 break; 1395 break;
@@ -1843,9 +1869,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1843 1869
1844 SetPageChecked(page); 1870 SetPageChecked(page);
1845 page_cache_get(page); 1871 page_cache_get(page);
1846 fixup->work.func = btrfs_writepage_fixup_worker; 1872 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
1847 fixup->page = page; 1873 fixup->page = page;
1848 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); 1874 btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
1849 return -EBUSY; 1875 return -EBUSY;
1850} 1876}
1851 1877
@@ -2239,6 +2265,11 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
2239 return PTR_ERR(root); 2265 return PTR_ERR(root);
2240 } 2266 }
2241 2267
2268 if (btrfs_root_readonly(root)) {
2269 srcu_read_unlock(&fs_info->subvol_srcu, index);
2270 return 0;
2271 }
2272
2242 /* step 2: get inode */ 2273 /* step 2: get inode */
2243 key.objectid = backref->inum; 2274 key.objectid = backref->inum;
2244 key.type = BTRFS_INODE_ITEM_KEY; 2275 key.type = BTRFS_INODE_ITEM_KEY;
@@ -2759,7 +2790,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2759 struct inode *inode = page->mapping->host; 2790 struct inode *inode = page->mapping->host;
2760 struct btrfs_root *root = BTRFS_I(inode)->root; 2791 struct btrfs_root *root = BTRFS_I(inode)->root;
2761 struct btrfs_ordered_extent *ordered_extent = NULL; 2792 struct btrfs_ordered_extent *ordered_extent = NULL;
2762 struct btrfs_workers *workers; 2793 struct btrfs_workqueue *workers;
2763 2794
2764 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 2795 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2765 2796
@@ -2768,14 +2799,13 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2768 end - start + 1, uptodate)) 2799 end - start + 1, uptodate))
2769 return 0; 2800 return 0;
2770 2801
2771 ordered_extent->work.func = finish_ordered_fn; 2802 btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
2772 ordered_extent->work.flags = 0;
2773 2803
2774 if (btrfs_is_free_space_inode(inode)) 2804 if (btrfs_is_free_space_inode(inode))
2775 workers = &root->fs_info->endio_freespace_worker; 2805 workers = root->fs_info->endio_freespace_worker;
2776 else 2806 else
2777 workers = &root->fs_info->endio_write_workers; 2807 workers = root->fs_info->endio_write_workers;
2778 btrfs_queue_worker(workers, &ordered_extent->work); 2808 btrfs_queue_work(workers, &ordered_extent->work);
2779 2809
2780 return 0; 2810 return 0;
2781} 2811}
@@ -4593,7 +4623,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
4593 struct rb_node *node; 4623 struct rb_node *node;
4594 4624
4595 ASSERT(inode->i_state & I_FREEING); 4625 ASSERT(inode->i_state & I_FREEING);
4596 truncate_inode_pages(&inode->i_data, 0); 4626 truncate_inode_pages_final(&inode->i_data);
4597 4627
4598 write_lock(&map_tree->lock); 4628 write_lock(&map_tree->lock);
4599 while (!RB_EMPTY_ROOT(&map_tree->map)) { 4629 while (!RB_EMPTY_ROOT(&map_tree->map)) {
@@ -4924,7 +4954,8 @@ void btrfs_invalidate_inodes(struct btrfs_root *root)
4924 struct inode *inode; 4954 struct inode *inode;
4925 u64 objectid = 0; 4955 u64 objectid = 0;
4926 4956
4927 WARN_ON(btrfs_root_refs(&root->root_item) != 0); 4957 if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
4958 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4928 4959
4929 spin_lock(&root->inode_lock); 4960 spin_lock(&root->inode_lock);
4930again: 4961again:
@@ -5799,6 +5830,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
5799 } 5830 }
5800out_unlock: 5831out_unlock:
5801 btrfs_end_transaction(trans, root); 5832 btrfs_end_transaction(trans, root);
5833 btrfs_balance_delayed_items(root);
5802 btrfs_btree_balance_dirty(root); 5834 btrfs_btree_balance_dirty(root);
5803 if (drop_inode) { 5835 if (drop_inode) {
5804 inode_dec_link_count(inode); 5836 inode_dec_link_count(inode);
@@ -5872,6 +5904,7 @@ out_unlock:
5872 inode_dec_link_count(inode); 5904 inode_dec_link_count(inode);
5873 iput(inode); 5905 iput(inode);
5874 } 5906 }
5907 btrfs_balance_delayed_items(root);
5875 btrfs_btree_balance_dirty(root); 5908 btrfs_btree_balance_dirty(root);
5876 return err; 5909 return err;
5877} 5910}
@@ -5930,6 +5963,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5930 } 5963 }
5931 5964
5932 btrfs_end_transaction(trans, root); 5965 btrfs_end_transaction(trans, root);
5966 btrfs_balance_delayed_items(root);
5933fail: 5967fail:
5934 if (drop_inode) { 5968 if (drop_inode) {
5935 inode_dec_link_count(inode); 5969 inode_dec_link_count(inode);
@@ -5996,6 +6030,7 @@ out_fail:
5996 btrfs_end_transaction(trans, root); 6030 btrfs_end_transaction(trans, root);
5997 if (drop_on_err) 6031 if (drop_on_err)
5998 iput(inode); 6032 iput(inode);
6033 btrfs_balance_delayed_items(root);
5999 btrfs_btree_balance_dirty(root); 6034 btrfs_btree_balance_dirty(root);
6000 return err; 6035 return err;
6001} 6036}
@@ -6550,6 +6585,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6550 int ret; 6585 int ret;
6551 struct extent_buffer *leaf; 6586 struct extent_buffer *leaf;
6552 struct btrfs_root *root = BTRFS_I(inode)->root; 6587 struct btrfs_root *root = BTRFS_I(inode)->root;
6588 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6553 struct btrfs_file_extent_item *fi; 6589 struct btrfs_file_extent_item *fi;
6554 struct btrfs_key key; 6590 struct btrfs_key key;
6555 u64 disk_bytenr; 6591 u64 disk_bytenr;
@@ -6626,6 +6662,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6626 6662
6627 if (btrfs_extent_readonly(root, disk_bytenr)) 6663 if (btrfs_extent_readonly(root, disk_bytenr))
6628 goto out; 6664 goto out;
6665
6666 num_bytes = min(offset + *len, extent_end) - offset;
6667 if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6668 u64 range_end;
6669
6670 range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
6671 ret = test_range_bit(io_tree, offset, range_end,
6672 EXTENT_DELALLOC, 0, NULL);
6673 if (ret) {
6674 ret = -EAGAIN;
6675 goto out;
6676 }
6677 }
6678
6629 btrfs_release_path(path); 6679 btrfs_release_path(path);
6630 6680
6631 /* 6681 /*
@@ -6654,7 +6704,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6654 */ 6704 */
6655 disk_bytenr += backref_offset; 6705 disk_bytenr += backref_offset;
6656 disk_bytenr += offset - key.offset; 6706 disk_bytenr += offset - key.offset;
6657 num_bytes = min(offset + *len, extent_end) - offset;
6658 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 6707 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
6659 goto out; 6708 goto out;
6660 /* 6709 /*
@@ -7024,10 +7073,9 @@ again:
7024 if (!ret) 7073 if (!ret)
7025 goto out_test; 7074 goto out_test;
7026 7075
7027 ordered->work.func = finish_ordered_fn; 7076 btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
7028 ordered->work.flags = 0; 7077 btrfs_queue_work(root->fs_info->endio_write_workers,
7029 btrfs_queue_worker(&root->fs_info->endio_write_workers, 7078 &ordered->work);
7030 &ordered->work);
7031out_test: 7079out_test:
7032 /* 7080 /*
7033 * our bio might span multiple ordered extents. If we haven't 7081 * our bio might span multiple ordered extents. If we haven't
@@ -7404,15 +7452,15 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7404 smp_mb__after_atomic_inc(); 7452 smp_mb__after_atomic_inc();
7405 7453
7406 /* 7454 /*
7407 * The generic stuff only does filemap_write_and_wait_range, which isn't 7455 * The generic stuff only does filemap_write_and_wait_range, which
7408 * enough if we've written compressed pages to this area, so we need to 7456 * isn't enough if we've written compressed pages to this area, so
7409 * call btrfs_wait_ordered_range to make absolutely sure that any 7457 * we need to flush the dirty pages again to make absolutely sure
7410 * outstanding dirty pages are on disk. 7458 * that any outstanding dirty pages are on disk.
7411 */ 7459 */
7412 count = iov_length(iov, nr_segs); 7460 count = iov_length(iov, nr_segs);
7413 ret = btrfs_wait_ordered_range(inode, offset, count); 7461 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7414 if (ret) 7462 &BTRFS_I(inode)->runtime_flags))
7415 return ret; 7463 filemap_fdatawrite_range(inode->i_mapping, offset, count);
7416 7464
7417 if (rw & WRITE) { 7465 if (rw & WRITE) {
7418 /* 7466 /*
@@ -8404,7 +8452,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
8404 work->inode = inode; 8452 work->inode = inode;
8405 work->wait = wait; 8453 work->wait = wait;
8406 work->delay_iput = delay_iput; 8454 work->delay_iput = delay_iput;
8407 work->work.func = btrfs_run_delalloc_work; 8455 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
8408 8456
8409 return work; 8457 return work;
8410} 8458}
@@ -8419,7 +8467,8 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8419 * some fairly slow code that needs optimization. This walks the list 8467 * some fairly slow code that needs optimization. This walks the list
8420 * of all the inodes with pending delalloc and forces them to disk. 8468 * of all the inodes with pending delalloc and forces them to disk.
8421 */ 8469 */
8422static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) 8470static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
8471 int nr)
8423{ 8472{
8424 struct btrfs_inode *binode; 8473 struct btrfs_inode *binode;
8425 struct inode *inode; 8474 struct inode *inode;
@@ -8431,6 +8480,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8431 INIT_LIST_HEAD(&works); 8480 INIT_LIST_HEAD(&works);
8432 INIT_LIST_HEAD(&splice); 8481 INIT_LIST_HEAD(&splice);
8433 8482
8483 mutex_lock(&root->delalloc_mutex);
8434 spin_lock(&root->delalloc_lock); 8484 spin_lock(&root->delalloc_lock);
8435 list_splice_init(&root->delalloc_inodes, &splice); 8485 list_splice_init(&root->delalloc_inodes, &splice);
8436 while (!list_empty(&splice)) { 8486 while (!list_empty(&splice)) {
@@ -8456,19 +8506,16 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8456 goto out; 8506 goto out;
8457 } 8507 }
8458 list_add_tail(&work->list, &works); 8508 list_add_tail(&work->list, &works);
8459 btrfs_queue_worker(&root->fs_info->flush_workers, 8509 btrfs_queue_work(root->fs_info->flush_workers,
8460 &work->work); 8510 &work->work);
8461 8511 ret++;
8512 if (nr != -1 && ret >= nr)
8513 goto out;
8462 cond_resched(); 8514 cond_resched();
8463 spin_lock(&root->delalloc_lock); 8515 spin_lock(&root->delalloc_lock);
8464 } 8516 }
8465 spin_unlock(&root->delalloc_lock); 8517 spin_unlock(&root->delalloc_lock);
8466 8518
8467 list_for_each_entry_safe(work, next, &works, list) {
8468 list_del_init(&work->list);
8469 btrfs_wait_and_free_delalloc_work(work);
8470 }
8471 return 0;
8472out: 8519out:
8473 list_for_each_entry_safe(work, next, &works, list) { 8520 list_for_each_entry_safe(work, next, &works, list) {
8474 list_del_init(&work->list); 8521 list_del_init(&work->list);
@@ -8480,6 +8527,7 @@ out:
8480 list_splice_tail(&splice, &root->delalloc_inodes); 8527 list_splice_tail(&splice, &root->delalloc_inodes);
8481 spin_unlock(&root->delalloc_lock); 8528 spin_unlock(&root->delalloc_lock);
8482 } 8529 }
8530 mutex_unlock(&root->delalloc_mutex);
8483 return ret; 8531 return ret;
8484} 8532}
8485 8533
@@ -8490,7 +8538,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8490 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 8538 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
8491 return -EROFS; 8539 return -EROFS;
8492 8540
8493 ret = __start_delalloc_inodes(root, delay_iput); 8541 ret = __start_delalloc_inodes(root, delay_iput, -1);
8542 if (ret > 0)
8543 ret = 0;
8494 /* 8544 /*
8495 * the filemap_flush will queue IO into the worker threads, but 8545 * the filemap_flush will queue IO into the worker threads, but
8496 * we have to make sure the IO is actually started and that 8546 * we have to make sure the IO is actually started and that
@@ -8507,7 +8557,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8507 return ret; 8557 return ret;
8508} 8558}
8509 8559
8510int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) 8560int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
8561 int nr)
8511{ 8562{
8512 struct btrfs_root *root; 8563 struct btrfs_root *root;
8513 struct list_head splice; 8564 struct list_head splice;
@@ -8518,9 +8569,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8518 8569
8519 INIT_LIST_HEAD(&splice); 8570 INIT_LIST_HEAD(&splice);
8520 8571
8572 mutex_lock(&fs_info->delalloc_root_mutex);
8521 spin_lock(&fs_info->delalloc_root_lock); 8573 spin_lock(&fs_info->delalloc_root_lock);
8522 list_splice_init(&fs_info->delalloc_roots, &splice); 8574 list_splice_init(&fs_info->delalloc_roots, &splice);
8523 while (!list_empty(&splice)) { 8575 while (!list_empty(&splice) && nr) {
8524 root = list_first_entry(&splice, struct btrfs_root, 8576 root = list_first_entry(&splice, struct btrfs_root,
8525 delalloc_root); 8577 delalloc_root);
8526 root = btrfs_grab_fs_root(root); 8578 root = btrfs_grab_fs_root(root);
@@ -8529,15 +8581,20 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8529 &fs_info->delalloc_roots); 8581 &fs_info->delalloc_roots);
8530 spin_unlock(&fs_info->delalloc_root_lock); 8582 spin_unlock(&fs_info->delalloc_root_lock);
8531 8583
8532 ret = __start_delalloc_inodes(root, delay_iput); 8584 ret = __start_delalloc_inodes(root, delay_iput, nr);
8533 btrfs_put_fs_root(root); 8585 btrfs_put_fs_root(root);
8534 if (ret) 8586 if (ret < 0)
8535 goto out; 8587 goto out;
8536 8588
8589 if (nr != -1) {
8590 nr -= ret;
8591 WARN_ON(nr < 0);
8592 }
8537 spin_lock(&fs_info->delalloc_root_lock); 8593 spin_lock(&fs_info->delalloc_root_lock);
8538 } 8594 }
8539 spin_unlock(&fs_info->delalloc_root_lock); 8595 spin_unlock(&fs_info->delalloc_root_lock);
8540 8596
8597 ret = 0;
8541 atomic_inc(&fs_info->async_submit_draining); 8598 atomic_inc(&fs_info->async_submit_draining);
8542 while (atomic_read(&fs_info->nr_async_submits) || 8599 while (atomic_read(&fs_info->nr_async_submits) ||
8543 atomic_read(&fs_info->async_delalloc_pages)) { 8600 atomic_read(&fs_info->async_delalloc_pages)) {
@@ -8546,13 +8603,13 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput)
8546 atomic_read(&fs_info->async_delalloc_pages) == 0)); 8603 atomic_read(&fs_info->async_delalloc_pages) == 0));
8547 } 8604 }
8548 atomic_dec(&fs_info->async_submit_draining); 8605 atomic_dec(&fs_info->async_submit_draining);
8549 return 0;
8550out: 8606out:
8551 if (!list_empty_careful(&splice)) { 8607 if (!list_empty_careful(&splice)) {
8552 spin_lock(&fs_info->delalloc_root_lock); 8608 spin_lock(&fs_info->delalloc_root_lock);
8553 list_splice_tail(&splice, &fs_info->delalloc_roots); 8609 list_splice_tail(&splice, &fs_info->delalloc_roots);
8554 spin_unlock(&fs_info->delalloc_root_lock); 8610 spin_unlock(&fs_info->delalloc_root_lock);
8555 } 8611 }
8612 mutex_unlock(&fs_info->delalloc_root_mutex);
8556 return ret; 8613 return ret;
8557} 8614}
8558 8615
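
__start_delalloc_inodes now takes an nr budget (-1 meaning unlimited) and
returns how many flush works it queued instead of waiting for them, which
lets btrfs_start_delalloc_roots spread the budget across roots. A toy sketch
of the budgeted loop:

#include <stdio.h>

/* Queue up to nr items; nr == -1 means no limit. Returns items queued. */
static int start_some(int pending, int nr)
{
	int queued = 0;

	while (pending-- > 0) {
		/* ... allocate work, btrfs_queue_work(...) ... */
		queued++;
		if (nr != -1 && queued >= nr)
			break;
	}
	return queued;
}

int main(void)
{
	printf("%d\n", start_some(10, 3));  /* 3  */
	printf("%d\n", start_some(10, -1)); /* 10 */
	return 0;
}
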
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a6d8efa46bfe..2f6d7b13b5bd 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -59,6 +59,32 @@
59#include "props.h" 59#include "props.h"
60#include "sysfs.h" 60#include "sysfs.h"
61 61
62#ifdef CONFIG_64BIT
63/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
64 * structures are incorrect, as the timespec structure from userspace
65 * is 4 bytes too small. We define these alternatives here to teach
66 * the kernel about the 32-bit struct packing.
67 */
68struct btrfs_ioctl_timespec_32 {
69 __u64 sec;
70 __u32 nsec;
71} __attribute__ ((__packed__));
72
73struct btrfs_ioctl_received_subvol_args_32 {
74 char uuid[BTRFS_UUID_SIZE]; /* in */
75 __u64 stransid; /* in */
76 __u64 rtransid; /* out */
77 struct btrfs_ioctl_timespec_32 stime; /* in */
78 struct btrfs_ioctl_timespec_32 rtime; /* out */
79 __u64 flags; /* in */
80 __u64 reserved[16]; /* in */
81} __attribute__ ((__packed__));
82
83#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
84 struct btrfs_ioctl_received_subvol_args_32)
85#endif
86
87
62static int btrfs_clone(struct inode *src, struct inode *inode, 88static int btrfs_clone(struct inode *src, struct inode *inode,
63 u64 off, u64 olen, u64 olen_aligned, u64 destoff); 89 u64 off, u64 olen, u64 olen_aligned, u64 destoff);
64 90
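
The compat structs above are needed because _IOWR() folds sizeof() of the
argument type into the ioctl command number, and a 32-bit userspace packs the
embedded timespec four bytes smaller than the 64-bit kernel's layout, so the
two ABIs compute different command numbers for the same ioctl. A minimal
userspace check of the size difference (the _64 variant here is a
hypothetical unpacked twin, shown only for comparison):

#include <stdio.h>
#include <stdint.h>

struct timespec_32 {
	uint64_t sec;
	uint32_t nsec;
} __attribute__((__packed__));

struct timespec_64 {
	uint64_t sec;
	uint32_t nsec;	/* padded to 16 bytes on a typical 64-bit ABI */
};

int main(void)
{
	/* 12 vs. 16: the 4-byte gap the comment above describes. */
	printf("packed 32-bit layout: %zu bytes\n", sizeof(struct timespec_32));
	printf("64-bit layout:        %zu bytes\n", sizeof(struct timespec_64));
	return 0;
}
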
@@ -585,6 +611,23 @@ fail:
585 return ret; 611 return ret;
586} 612}
587 613
614static void btrfs_wait_nocow_write(struct btrfs_root *root)
615{
616 s64 writers;
617 DEFINE_WAIT(wait);
618
619 do {
620 prepare_to_wait(&root->subv_writers->wait, &wait,
621 TASK_UNINTERRUPTIBLE);
622
623 writers = percpu_counter_sum(&root->subv_writers->counter);
624 if (writers)
625 schedule();
626
627 finish_wait(&root->subv_writers->wait, &wait);
628 } while (writers);
629}
630
588static int create_snapshot(struct btrfs_root *root, struct inode *dir, 631static int create_snapshot(struct btrfs_root *root, struct inode *dir,
589 struct dentry *dentry, char *name, int namelen, 632 struct dentry *dentry, char *name, int namelen,
590 u64 *async_transid, bool readonly, 633 u64 *async_transid, bool readonly,
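
btrfs_wait_nocow_write() parks snapshot creation until every in-flight nocow
writer has drained; the writers are counted by the
btrfs_start_nocow_write()/btrfs_end_nocow_write() pair visible in the inode.c
hunks above. A pthread sketch of the same drain pattern, with a condition
variable standing in for the kernel wait queue and a plain counter for the
per-cpu one:

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static long writers;	/* stands in for the per-cpu counter sum */

static void start_nocow_write(void)
{
	pthread_mutex_lock(&lock);
	writers++;
	pthread_mutex_unlock(&lock);
}

static void end_nocow_write(void)
{
	pthread_mutex_lock(&lock);
	if (--writers == 0)
		pthread_cond_broadcast(&drained);
	pthread_mutex_unlock(&lock);
}

/* Snapshot side: block until no writer is in flight. */
static void wait_nocow_write(void)
{
	pthread_mutex_lock(&lock);
	while (writers)
		pthread_cond_wait(&drained, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	start_nocow_write();
	end_nocow_write();
	wait_nocow_write();	/* returns at once: no writers left */
	return 0;
}
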
@@ -598,15 +641,21 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
598 if (!root->ref_cows) 641 if (!root->ref_cows)
599 return -EINVAL; 642 return -EINVAL;
600 643
644 atomic_inc(&root->will_be_snapshoted);
645 smp_mb__after_atomic_inc();
646 btrfs_wait_nocow_write(root);
647
601 ret = btrfs_start_delalloc_inodes(root, 0); 648 ret = btrfs_start_delalloc_inodes(root, 0);
602 if (ret) 649 if (ret)
603 return ret; 650 goto out;
604 651
605 btrfs_wait_ordered_extents(root, -1); 652 btrfs_wait_ordered_extents(root, -1);
606 653
607 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 654 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
608 if (!pending_snapshot) 655 if (!pending_snapshot) {
609 return -ENOMEM; 656 ret = -ENOMEM;
657 goto out;
658 }
610 659
611 btrfs_init_block_rsv(&pending_snapshot->block_rsv, 660 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
612 BTRFS_BLOCK_RSV_TEMP); 661 BTRFS_BLOCK_RSV_TEMP);
@@ -623,7 +672,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
623 &pending_snapshot->qgroup_reserved, 672 &pending_snapshot->qgroup_reserved,
624 false); 673 false);
625 if (ret) 674 if (ret)
626 goto out; 675 goto free;
627 676
628 pending_snapshot->dentry = dentry; 677 pending_snapshot->dentry = dentry;
629 pending_snapshot->root = root; 678 pending_snapshot->root = root;
@@ -674,8 +723,10 @@ fail:
674 btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, 723 btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
675 &pending_snapshot->block_rsv, 724 &pending_snapshot->block_rsv,
676 pending_snapshot->qgroup_reserved); 725 pending_snapshot->qgroup_reserved);
677out: 726free:
678 kfree(pending_snapshot); 727 kfree(pending_snapshot);
728out:
729 atomic_dec(&root->will_be_snapshoted);
679 return ret; 730 return ret;
680} 731}
681 732
@@ -884,12 +935,14 @@ static int find_new_extents(struct btrfs_root *root,
884 min_key.type = BTRFS_EXTENT_DATA_KEY; 935 min_key.type = BTRFS_EXTENT_DATA_KEY;
885 min_key.offset = *off; 936 min_key.offset = *off;
886 937
887 path->keep_locks = 1;
888
889 while (1) { 938 while (1) {
939 path->keep_locks = 1;
890 ret = btrfs_search_forward(root, &min_key, path, newer_than); 940 ret = btrfs_search_forward(root, &min_key, path, newer_than);
891 if (ret != 0) 941 if (ret != 0)
892 goto none; 942 goto none;
943 path->keep_locks = 0;
944 btrfs_unlock_up_safe(path, 1);
945process_slot:
893 if (min_key.objectid != ino) 946 if (min_key.objectid != ino)
894 goto none; 947 goto none;
895 if (min_key.type != BTRFS_EXTENT_DATA_KEY) 948 if (min_key.type != BTRFS_EXTENT_DATA_KEY)
@@ -908,6 +961,12 @@ static int find_new_extents(struct btrfs_root *root,
908 return 0; 961 return 0;
909 } 962 }
910 963
964 path->slots[0]++;
965 if (path->slots[0] < btrfs_header_nritems(leaf)) {
966 btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
967 goto process_slot;
968 }
969
911 if (min_key.offset == (u64)-1) 970 if (min_key.offset == (u64)-1)
912 goto none; 971 goto none;
913 972
@@ -935,10 +994,13 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
935 read_unlock(&em_tree->lock); 994 read_unlock(&em_tree->lock);
936 995
937 if (!em) { 996 if (!em) {
997 struct extent_state *cached = NULL;
998 u64 end = start + len - 1;
999
938 /* get the big lock and read metadata off disk */ 1000 /* get the big lock and read metadata off disk */
939 lock_extent(io_tree, start, start + len - 1); 1001 lock_extent_bits(io_tree, start, end, 0, &cached);
940 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 1002 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
941 unlock_extent(io_tree, start, start + len - 1); 1003 unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
942 1004
943 if (IS_ERR(em)) 1005 if (IS_ERR(em))
944 return NULL; 1006 return NULL;
@@ -957,7 +1019,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
957 return false; 1019 return false;
958 1020
959 next = defrag_lookup_extent(inode, em->start + em->len); 1021 next = defrag_lookup_extent(inode, em->start + em->len);
960 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) 1022 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
1023 (em->block_start + em->block_len == next->block_start))
961 ret = false; 1024 ret = false;
962 1025
963 free_extent_map(next); 1026 free_extent_map(next);
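
defrag_check_next_extent now also refuses to treat the next extent as a
defrag target when it is already physically adjacent on disk, since rewriting
contiguous extents gains nothing. The test is plain block arithmetic; a
sketch:

#include <stdio.h>
#include <stdint.h>

struct extent_map { uint64_t block_start, block_len; };

/* Already contiguous if one extent ends where the next one begins. */
static int already_contiguous(const struct extent_map *em,
			      const struct extent_map *next)
{
	return em->block_start + em->block_len == next->block_start;
}

int main(void)
{
	struct extent_map a = { 4096, 8192 }, b = { 12288, 4096 };

	printf("%d\n", already_contiguous(&a, &b));	/* 1: skip defrag */
	return 0;
}
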
@@ -1076,10 +1139,12 @@ again:
1076 page_start = page_offset(page); 1139 page_start = page_offset(page);
1077 page_end = page_start + PAGE_CACHE_SIZE - 1; 1140 page_end = page_start + PAGE_CACHE_SIZE - 1;
1078 while (1) { 1141 while (1) {
1079 lock_extent(tree, page_start, page_end); 1142 lock_extent_bits(tree, page_start, page_end,
1143 0, &cached_state);
1080 ordered = btrfs_lookup_ordered_extent(inode, 1144 ordered = btrfs_lookup_ordered_extent(inode,
1081 page_start); 1145 page_start);
1082 unlock_extent(tree, page_start, page_end); 1146 unlock_extent_cached(tree, page_start, page_end,
1147 &cached_state, GFP_NOFS);
1083 if (!ordered) 1148 if (!ordered)
1084 break; 1149 break;
1085 1150
@@ -1356,8 +1421,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1356 } 1421 }
1357 } 1422 }
1358 1423
1359 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) 1424 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
1360 filemap_flush(inode->i_mapping); 1425 filemap_flush(inode->i_mapping);
1426 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1427 &BTRFS_I(inode)->runtime_flags))
1428 filemap_flush(inode->i_mapping);
1429 }
1361 1430
1362 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { 1431 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
1363 /* the filemap_flush will queue IO into the worker threads, but 1432 /* the filemap_flush will queue IO into the worker threads, but
@@ -1403,6 +1472,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1403 struct btrfs_trans_handle *trans; 1472 struct btrfs_trans_handle *trans;
1404 struct btrfs_device *device = NULL; 1473 struct btrfs_device *device = NULL;
1405 char *sizestr; 1474 char *sizestr;
1475 char *retptr;
1406 char *devstr = NULL; 1476 char *devstr = NULL;
1407 int ret = 0; 1477 int ret = 0;
1408 int mod = 0; 1478 int mod = 0;
@@ -1470,8 +1540,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1470 mod = 1; 1540 mod = 1;
1471 sizestr++; 1541 sizestr++;
1472 } 1542 }
1473 new_size = memparse(sizestr, NULL); 1543 new_size = memparse(sizestr, &retptr);
1474 if (new_size == 0) { 1544 if (*retptr != '\0' || new_size == 0) {
1475 ret = -EINVAL; 1545 ret = -EINVAL;
1476 goto out_free; 1546 goto out_free;
1477 } 1547 }
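
The resize fix passes memparse a retptr and rejects any input with trailing
garbage after the number. The same guard in portable C using strtoull
(memparse additionally accepts K/M/G-style suffixes, which this sketch
omits):

#include <stdio.h>
#include <stdlib.h>

static int parse_size(const char *s, unsigned long long *out)
{
	char *end;

	*out = strtoull(s, &end, 0);
	/* Reject "" and "4096x", as the ioctl now does; 0 stays invalid. */
	if (end == s || *end != '\0' || *out == 0)
		return -1;
	return 0;
}

int main(void)
{
	unsigned long long v;

	printf("%d\n", parse_size("4096", &v));		/*  0 */
	printf("%d\n", parse_size("4096x", &v));	/* -1 */
	return 0;
}
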
@@ -1573,7 +1643,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1573 if (src_inode->i_sb != file_inode(file)->i_sb) { 1643 if (src_inode->i_sb != file_inode(file)->i_sb) {
1574 btrfs_info(BTRFS_I(src_inode)->root->fs_info, 1644 btrfs_info(BTRFS_I(src_inode)->root->fs_info,
1575 "Snapshot src from another FS"); 1645 "Snapshot src from another FS");
1576 ret = -EINVAL; 1646 ret = -EXDEV;
1577 } else if (!inode_owner_or_capable(src_inode)) { 1647 } else if (!inode_owner_or_capable(src_inode)) {
1578 /* 1648 /*
1579 * Subvolume creation is not restricted, but snapshots 1649 * Subvolume creation is not restricted, but snapshots
@@ -1797,7 +1867,9 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
1797 if (di && !IS_ERR(di)) { 1867 if (di && !IS_ERR(di)) {
1798 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 1868 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
1799 if (key.objectid == root->root_key.objectid) { 1869 if (key.objectid == root->root_key.objectid) {
1800 ret = -ENOTEMPTY; 1870 ret = -EPERM;
1871 btrfs_err(root->fs_info, "deleting default subvolume "
1872 "%llu is not allowed", key.objectid);
1801 goto out; 1873 goto out;
1802 } 1874 }
1803 btrfs_release_path(path); 1875 btrfs_release_path(path);
@@ -2994,8 +3066,9 @@ process_slot:
2994 new_key.offset + datal, 3066 new_key.offset + datal,
2995 1); 3067 1);
2996 if (ret) { 3068 if (ret) {
2997 btrfs_abort_transaction(trans, root, 3069 if (ret != -EOPNOTSUPP)
2998 ret); 3070 btrfs_abort_transaction(trans,
3071 root, ret);
2999 btrfs_end_transaction(trans, root); 3072 btrfs_end_transaction(trans, root);
3000 goto out; 3073 goto out;
3001 } 3074 }
@@ -3047,6 +3120,8 @@ process_slot:
3047 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 3120 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
3048 u64 skip = 0; 3121 u64 skip = 0;
3049 u64 trim = 0; 3122 u64 trim = 0;
3123 u64 aligned_end = 0;
3124
3050 if (off > key.offset) { 3125 if (off > key.offset) {
3051 skip = off - key.offset; 3126 skip = off - key.offset;
3052 new_key.offset += skip; 3127 new_key.offset += skip;
@@ -3063,13 +3138,16 @@ process_slot:
3063 size -= skip + trim; 3138 size -= skip + trim;
3064 datal -= skip + trim; 3139 datal -= skip + trim;
3065 3140
3141 aligned_end = ALIGN(new_key.offset + datal,
3142 root->sectorsize);
3066 ret = btrfs_drop_extents(trans, root, inode, 3143 ret = btrfs_drop_extents(trans, root, inode,
3067 new_key.offset, 3144 new_key.offset,
3068 new_key.offset + datal, 3145 aligned_end,
3069 1); 3146 1);
3070 if (ret) { 3147 if (ret) {
3071 btrfs_abort_transaction(trans, root, 3148 if (ret != -EOPNOTSUPP)
3072 ret); 3149 btrfs_abort_transaction(trans,
3150 root, ret);
3073 btrfs_end_transaction(trans, root); 3151 btrfs_end_transaction(trans, root);
3074 goto out; 3152 goto out;
3075 } 3153 }
@@ -3153,8 +3231,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
3153 * decompress into destination's address_space (the file offset 3231 * decompress into destination's address_space (the file offset
3154 * may change, so source mapping won't do), then recompress (or 3232 * may change, so source mapping won't do), then recompress (or
3155 * otherwise reinsert) a subrange. 3233 * otherwise reinsert) a subrange.
3156 * - allow ranges within the same file to be cloned (provided 3234 *
3157 * they don't overlap)? 3235 * - split destination inode's inline extents. The inline extents can
3236 * be either compressed or non-compressed.
3158 */ 3237 */
3159 3238
3160 /* the destination must be opened for writing */ 3239 /* the destination must be opened for writing */
@@ -3465,6 +3544,11 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
3465 up_read(&info->groups_sem); 3544 up_read(&info->groups_sem);
3466 } 3545 }
3467 3546
3547 /*
3548 * Global block reserve, exported as a space_info
3549 */
3550 slot_count++;
3551
3468 /* space_slots == 0 means they are asking for a count */ 3552 /* space_slots == 0 means they are asking for a count */
3469 if (space_args.space_slots == 0) { 3553 if (space_args.space_slots == 0) {
3470 space_args.total_spaces = slot_count; 3554 space_args.total_spaces = slot_count;
@@ -3523,6 +3607,21 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
3523 up_read(&info->groups_sem); 3607 up_read(&info->groups_sem);
3524 } 3608 }
3525 3609
3610 /*
3611 * Add global block reserve
3612 */
3613 if (slot_count) {
3614 struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv;
3615
3616 spin_lock(&block_rsv->lock);
3617 space.total_bytes = block_rsv->size;
3618 space.used_bytes = block_rsv->size - block_rsv->reserved;
3619 spin_unlock(&block_rsv->lock);
3620 space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
3621 memcpy(dest, &space, sizeof(space));
3622 space_args.total_spaces++;
3623 }
3624
3526 user_dest = (struct btrfs_ioctl_space_info __user *) 3625 user_dest = (struct btrfs_ioctl_space_info __user *)
3527 (arg + sizeof(struct btrfs_ioctl_space_args)); 3626 (arg + sizeof(struct btrfs_ioctl_space_args));
3528 3627
@@ -4353,10 +4452,9 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
4353 return btrfs_qgroup_wait_for_completion(root->fs_info); 4452 return btrfs_qgroup_wait_for_completion(root->fs_info);
4354} 4453}
4355 4454
4356static long btrfs_ioctl_set_received_subvol(struct file *file, 4455static long _btrfs_ioctl_set_received_subvol(struct file *file,
4357 void __user *arg) 4456 struct btrfs_ioctl_received_subvol_args *sa)
4358{ 4457{
4359 struct btrfs_ioctl_received_subvol_args *sa = NULL;
4360 struct inode *inode = file_inode(file); 4458 struct inode *inode = file_inode(file);
4361 struct btrfs_root *root = BTRFS_I(inode)->root; 4459 struct btrfs_root *root = BTRFS_I(inode)->root;
4362 struct btrfs_root_item *root_item = &root->root_item; 4460 struct btrfs_root_item *root_item = &root->root_item;
@@ -4384,13 +4482,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
4384 goto out; 4482 goto out;
4385 } 4483 }
4386 4484
4387 sa = memdup_user(arg, sizeof(*sa));
4388 if (IS_ERR(sa)) {
4389 ret = PTR_ERR(sa);
4390 sa = NULL;
4391 goto out;
4392 }
4393
4394 /* 4485 /*
4395 * 1 - root item 4486 * 1 - root item
4396 * 2 - uuid items (received uuid + subvol uuid) 4487 * 2 - uuid items (received uuid + subvol uuid)
@@ -4444,14 +4535,90 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
4444 goto out; 4535 goto out;
4445 } 4536 }
4446 4537
4538out:
4539 up_write(&root->fs_info->subvol_sem);
4540 mnt_drop_write_file(file);
4541 return ret;
4542}
4543
4544#ifdef CONFIG_64BIT
4545static long btrfs_ioctl_set_received_subvol_32(struct file *file,
4546 void __user *arg)
4547{
4548 struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
4549 struct btrfs_ioctl_received_subvol_args *args64 = NULL;
4550 int ret = 0;
4551
4552 args32 = memdup_user(arg, sizeof(*args32));
4553 if (IS_ERR(args32)) {
4554 ret = PTR_ERR(args32);
4555 args32 = NULL;
4556 goto out;
4557 }
4558
4559 args64 = kmalloc(sizeof(*args64), GFP_NOFS);
4560 if (!args64) {
4561 ret = -ENOMEM;
4562 goto out;
4563 }
4564
4565 memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
4566 args64->stransid = args32->stransid;
4567 args64->rtransid = args32->rtransid;
4568 args64->stime.sec = args32->stime.sec;
4569 args64->stime.nsec = args32->stime.nsec;
4570 args64->rtime.sec = args32->rtime.sec;
4571 args64->rtime.nsec = args32->rtime.nsec;
4572 args64->flags = args32->flags;
4573
4574 ret = _btrfs_ioctl_set_received_subvol(file, args64);
4575 if (ret)
4576 goto out;
4577
4578 memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
4579 args32->stransid = args64->stransid;
4580 args32->rtransid = args64->rtransid;
4581 args32->stime.sec = args64->stime.sec;
4582 args32->stime.nsec = args64->stime.nsec;
4583 args32->rtime.sec = args64->rtime.sec;
4584 args32->rtime.nsec = args64->rtime.nsec;
4585 args32->flags = args64->flags;
4586
4587 ret = copy_to_user(arg, args32, sizeof(*args32));
4588 if (ret)
4589 ret = -EFAULT;
4590
4591out:
4592 kfree(args32);
4593 kfree(args64);
4594 return ret;
4595}
4596#endif
4597
4598static long btrfs_ioctl_set_received_subvol(struct file *file,
4599 void __user *arg)
4600{
4601 struct btrfs_ioctl_received_subvol_args *sa = NULL;
4602 int ret = 0;
4603
4604 sa = memdup_user(arg, sizeof(*sa));
4605 if (IS_ERR(sa)) {
4606 ret = PTR_ERR(sa);
4607 sa = NULL;
4608 goto out;
4609 }
4610
4611 ret = _btrfs_ioctl_set_received_subvol(file, sa);
4612
4613 if (ret)
4614 goto out;
4615
4447 ret = copy_to_user(arg, sa, sizeof(*sa)); 4616 ret = copy_to_user(arg, sa, sizeof(*sa));
4448 if (ret) 4617 if (ret)
4449 ret = -EFAULT; 4618 ret = -EFAULT;
4450 4619
4451out: 4620out:
4452 kfree(sa); 4621 kfree(sa);
4453 up_write(&root->fs_info->subvol_sem);
4454 mnt_drop_write_file(file);
4455 return ret; 4622 return ret;
4456} 4623}
4457 4624
@@ -4746,7 +4913,7 @@ long btrfs_ioctl(struct file *file, unsigned int
4746 case BTRFS_IOC_SYNC: { 4913 case BTRFS_IOC_SYNC: {
4747 int ret; 4914 int ret;
4748 4915
4749 ret = btrfs_start_delalloc_roots(root->fs_info, 0); 4916 ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
4750 if (ret) 4917 if (ret)
4751 return ret; 4918 return ret;
4752 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); 4919 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
@@ -4770,6 +4937,10 @@ long btrfs_ioctl(struct file *file, unsigned int
4770 return btrfs_ioctl_balance_progress(root, argp); 4937 return btrfs_ioctl_balance_progress(root, argp);
4771 case BTRFS_IOC_SET_RECEIVED_SUBVOL: 4938 case BTRFS_IOC_SET_RECEIVED_SUBVOL:
4772 return btrfs_ioctl_set_received_subvol(file, argp); 4939 return btrfs_ioctl_set_received_subvol(file, argp);
4940#ifdef CONFIG_64BIT
4941 case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
4942 return btrfs_ioctl_set_received_subvol_32(file, argp);
4943#endif
4773 case BTRFS_IOC_SEND: 4944 case BTRFS_IOC_SEND:
4774 return btrfs_ioctl_send(file, argp); 4945 return btrfs_ioctl_send(file, argp);
4775 case BTRFS_IOC_GET_DEV_STATS: 4946 case BTRFS_IOC_GET_DEV_STATS:
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b16450b840e7..a94b05f72869 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -349,10 +349,13 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
349 if (!uptodate) 349 if (!uptodate)
350 set_bit(BTRFS_ORDERED_IOERR, &entry->flags); 350 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
351 351
352 if (entry->bytes_left == 0) 352 if (entry->bytes_left == 0) {
353 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 353 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
354 else 354 if (waitqueue_active(&entry->wait))
355 wake_up(&entry->wait);
356 } else {
355 ret = 1; 357 ret = 1;
358 }
356out: 359out:
357 if (!ret && cached && entry) { 360 if (!ret && cached && entry) {
358 *cached = entry; 361 *cached = entry;
@@ -410,10 +413,13 @@ have_entry:
410 if (!uptodate) 413 if (!uptodate)
411 set_bit(BTRFS_ORDERED_IOERR, &entry->flags); 414 set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
412 415
413 if (entry->bytes_left == 0) 416 if (entry->bytes_left == 0) {
414 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 417 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
415 else 418 if (waitqueue_active(&entry->wait))
419 wake_up(&entry->wait);
420 } else {
416 ret = 1; 421 ret = 1;
422 }
417out: 423out:
418 if (!ret && cached && entry) { 424 if (!ret && cached && entry) {
419 *cached = entry; 425 *cached = entry;
@@ -424,27 +430,48 @@ out:
424} 430}
425 431
426/* Needs to either be called under a log transaction or the log_mutex */ 432/* Needs to either be called under a log transaction or the log_mutex */
427void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode) 433void btrfs_get_logged_extents(struct inode *inode,
434 struct list_head *logged_list)
428{ 435{
429 struct btrfs_ordered_inode_tree *tree; 436 struct btrfs_ordered_inode_tree *tree;
430 struct btrfs_ordered_extent *ordered; 437 struct btrfs_ordered_extent *ordered;
431 struct rb_node *n; 438 struct rb_node *n;
432 int index = log->log_transid % 2;
433 439
434 tree = &BTRFS_I(inode)->ordered_tree; 440 tree = &BTRFS_I(inode)->ordered_tree;
435 spin_lock_irq(&tree->lock); 441 spin_lock_irq(&tree->lock);
436 for (n = rb_first(&tree->tree); n; n = rb_next(n)) { 442 for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
437 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); 443 ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
438 spin_lock(&log->log_extents_lock[index]); 444 if (!list_empty(&ordered->log_list))
439 if (list_empty(&ordered->log_list)) { 445 continue;
440 list_add_tail(&ordered->log_list, &log->logged_list[index]); 446 list_add_tail(&ordered->log_list, logged_list);
441 atomic_inc(&ordered->refs); 447 atomic_inc(&ordered->refs);
442 }
443 spin_unlock(&log->log_extents_lock[index]);
444 } 448 }
445 spin_unlock_irq(&tree->lock); 449 spin_unlock_irq(&tree->lock);
446} 450}
447 451
452void btrfs_put_logged_extents(struct list_head *logged_list)
453{
454 struct btrfs_ordered_extent *ordered;
455
456 while (!list_empty(logged_list)) {
457 ordered = list_first_entry(logged_list,
458 struct btrfs_ordered_extent,
459 log_list);
460 list_del_init(&ordered->log_list);
461 btrfs_put_ordered_extent(ordered);
462 }
463}
464
465void btrfs_submit_logged_extents(struct list_head *logged_list,
466 struct btrfs_root *log)
467{
468 int index = log->log_transid % 2;
469
470 spin_lock_irq(&log->log_extents_lock[index]);
471 list_splice_tail(logged_list, &log->logged_list[index]);
472 spin_unlock_irq(&log->log_extents_lock[index]);
473}
474
448void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) 475void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
449{ 476{
450 struct btrfs_ordered_extent *ordered; 477 struct btrfs_ordered_extent *ordered;
@@ -577,7 +604,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
577 INIT_LIST_HEAD(&splice); 604 INIT_LIST_HEAD(&splice);
578 INIT_LIST_HEAD(&works); 605 INIT_LIST_HEAD(&works);
579 606
580 mutex_lock(&root->fs_info->ordered_operations_mutex); 607 mutex_lock(&root->ordered_extent_mutex);
581 spin_lock(&root->ordered_extent_lock); 608 spin_lock(&root->ordered_extent_lock);
582 list_splice_init(&root->ordered_extents, &splice); 609 list_splice_init(&root->ordered_extents, &splice);
583 while (!list_empty(&splice) && nr) { 610 while (!list_empty(&splice) && nr) {
@@ -588,10 +615,11 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
588 atomic_inc(&ordered->refs); 615 atomic_inc(&ordered->refs);
589 spin_unlock(&root->ordered_extent_lock); 616 spin_unlock(&root->ordered_extent_lock);
590 617
591 ordered->flush_work.func = btrfs_run_ordered_extent_work; 618 btrfs_init_work(&ordered->flush_work,
619 btrfs_run_ordered_extent_work, NULL, NULL);
592 list_add_tail(&ordered->work_list, &works); 620 list_add_tail(&ordered->work_list, &works);
593 btrfs_queue_worker(&root->fs_info->flush_workers, 621 btrfs_queue_work(root->fs_info->flush_workers,
594 &ordered->flush_work); 622 &ordered->flush_work);
595 623
596 cond_resched(); 624 cond_resched();
597 spin_lock(&root->ordered_extent_lock); 625 spin_lock(&root->ordered_extent_lock);
@@ -608,7 +636,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
608 btrfs_put_ordered_extent(ordered); 636 btrfs_put_ordered_extent(ordered);
609 cond_resched(); 637 cond_resched();
610 } 638 }
611 mutex_unlock(&root->fs_info->ordered_operations_mutex); 639 mutex_unlock(&root->ordered_extent_mutex);
612 640
613 return count; 641 return count;
614} 642}
@@ -621,6 +649,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
621 649
622 INIT_LIST_HEAD(&splice); 650 INIT_LIST_HEAD(&splice);
623 651
652 mutex_lock(&fs_info->ordered_operations_mutex);
624 spin_lock(&fs_info->ordered_root_lock); 653 spin_lock(&fs_info->ordered_root_lock);
625 list_splice_init(&fs_info->ordered_roots, &splice); 654 list_splice_init(&fs_info->ordered_roots, &splice);
626 while (!list_empty(&splice) && nr) { 655 while (!list_empty(&splice) && nr) {
@@ -643,6 +672,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
643 } 672 }
644 list_splice_tail(&splice, &fs_info->ordered_roots); 673 list_splice_tail(&splice, &fs_info->ordered_roots);
645 spin_unlock(&fs_info->ordered_root_lock); 674 spin_unlock(&fs_info->ordered_root_lock);
675 mutex_unlock(&fs_info->ordered_operations_mutex);
646} 676}
647 677
648/* 678/*
@@ -704,8 +734,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
704 goto out; 734 goto out;
705 } 735 }
706 list_add_tail(&work->list, &works); 736 list_add_tail(&work->list, &works);
707 btrfs_queue_worker(&root->fs_info->flush_workers, 737 btrfs_queue_work(root->fs_info->flush_workers,
708 &work->work); 738 &work->work);
709 739
710 cond_resched(); 740 cond_resched();
711 spin_lock(&root->fs_info->ordered_root_lock); 741 spin_lock(&root->fs_info->ordered_root_lock);
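
btrfs_get_logged_extents now gathers ordered extents onto a caller-private
list while holding only the per-inode tree lock, and
btrfs_submit_logged_extents splices that private list into the log under
log_extents_lock afterwards, so the two locks never nest. A pthread sketch of
the collect-then-splice locking pattern (the kernel version takes references
rather than unlinking from the tree), with a plain singly linked list
standing in for list_head:

#include <pthread.h>
#include <stddef.h>

struct ordered { struct ordered *next; };

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t log_lock = PTHREAD_MUTEX_INITIALIZER;
static struct ordered *tree_list, *log_list;

/* Phase 1: under the tree lock only, move entries to a private list. */
static struct ordered *get_logged(void)
{
	struct ordered *private;

	pthread_mutex_lock(&tree_lock);
	private = tree_list;
	tree_list = NULL;
	pthread_mutex_unlock(&tree_lock);
	return private;
}

/* Phase 2: splice the private list into the log under its own lock. */
static void submit_logged(struct ordered *private)
{
	struct ordered **tail;

	pthread_mutex_lock(&log_lock);
	for (tail = &log_list; *tail; tail = &(*tail)->next)
		;
	*tail = private;
	pthread_mutex_unlock(&log_lock);
}

int main(void)
{
	submit_logged(get_logged());
	return 0;
}
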
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 9b0450f7ac20..246897058efb 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -197,7 +197,11 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
197 struct inode *inode); 197 struct inode *inode);
198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); 198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); 199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
200void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); 200void btrfs_get_logged_extents(struct inode *inode,
201 struct list_head *logged_list);
202void btrfs_put_logged_extents(struct list_head *logged_list);
203void btrfs_submit_logged_extents(struct list_head *logged_list,
204 struct btrfs_root *log);
201void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); 205void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
202void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); 206void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
203int __init ordered_data_init(void); 207int __init ordered_data_init(void);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 472302a2d745..2cf905877aaf 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1509,8 +1509,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
1509 ret = qgroup_rescan_init(fs_info, 0, 1); 1509 ret = qgroup_rescan_init(fs_info, 0, 1);
1510 if (!ret) { 1510 if (!ret) {
1511 qgroup_rescan_zero_tracking(fs_info); 1511 qgroup_rescan_zero_tracking(fs_info);
1512 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 1512 btrfs_queue_work(fs_info->qgroup_rescan_workers,
1513 &fs_info->qgroup_rescan_work); 1513 &fs_info->qgroup_rescan_work);
1514 } 1514 }
1515 ret = 0; 1515 ret = 0;
1516 } 1516 }
@@ -2095,7 +2095,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2095 2095
2096 memset(&fs_info->qgroup_rescan_work, 0, 2096 memset(&fs_info->qgroup_rescan_work, 0,
2097 sizeof(fs_info->qgroup_rescan_work)); 2097 sizeof(fs_info->qgroup_rescan_work));
2098 fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker; 2098 btrfs_init_work(&fs_info->qgroup_rescan_work,
2099 btrfs_qgroup_rescan_worker, NULL, NULL);
2099 2100
2100 if (ret) { 2101 if (ret) {
2101err: 2102err:
@@ -2158,8 +2159,8 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2158 2159
2159 qgroup_rescan_zero_tracking(fs_info); 2160 qgroup_rescan_zero_tracking(fs_info);
2160 2161
2161 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 2162 btrfs_queue_work(fs_info->qgroup_rescan_workers,
2162 &fs_info->qgroup_rescan_work); 2163 &fs_info->qgroup_rescan_work);
2163 2164
2164 return 0; 2165 return 0;
2165} 2166}
@@ -2190,6 +2191,6 @@ void
2190btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) 2191btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
2191{ 2192{
2192 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) 2193 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
2193 btrfs_queue_worker(&fs_info->qgroup_rescan_workers, 2194 btrfs_queue_work(fs_info->qgroup_rescan_workers,
2194 &fs_info->qgroup_rescan_work); 2195 &fs_info->qgroup_rescan_work);
2195} 2196}
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 9af0b25d991a..4055291a523e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1416,20 +1416,18 @@ cleanup:
1416 1416
1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1418{ 1418{
1419 rbio->work.flags = 0; 1419 btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
1420 rbio->work.func = rmw_work;
1421 1420
1422 btrfs_queue_worker(&rbio->fs_info->rmw_workers, 1421 btrfs_queue_work(rbio->fs_info->rmw_workers,
1423 &rbio->work); 1422 &rbio->work);
1424} 1423}
1425 1424
1426static void async_read_rebuild(struct btrfs_raid_bio *rbio) 1425static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1427{ 1426{
1428 rbio->work.flags = 0; 1427 btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
1429 rbio->work.func = read_rebuild_work;
1430 1428
1431 btrfs_queue_worker(&rbio->fs_info->rmw_workers, 1429 btrfs_queue_work(rbio->fs_info->rmw_workers,
1432 &rbio->work); 1430 &rbio->work);
1433} 1431}
1434 1432
1435/* 1433/*
@@ -1667,10 +1665,9 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1667 plug = container_of(cb, struct btrfs_plug_cb, cb); 1665 plug = container_of(cb, struct btrfs_plug_cb, cb);
1668 1666
1669 if (from_schedule) { 1667 if (from_schedule) {
1670 plug->work.flags = 0; 1668 btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
1671 plug->work.func = unplug_work; 1669 btrfs_queue_work(plug->info->rmw_workers,
1672 btrfs_queue_worker(&plug->info->rmw_workers, 1670 &plug->work);
1673 &plug->work);
1674 return; 1671 return;
1675 } 1672 }
1676 run_plug(plug); 1673 run_plug(plug);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 31c797c48c3e..30947f923620 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -793,10 +793,10 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
793 /* FIXME we cannot handle this properly right now */ 793 /* FIXME we cannot handle this properly right now */
794 BUG(); 794 BUG();
795 } 795 }
796 rmw->work.func = reada_start_machine_worker; 796 btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
797 rmw->fs_info = fs_info; 797 rmw->fs_info = fs_info;
798 798
799 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); 799 btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
800} 800}
801 801
802#ifdef DEBUG 802#ifdef DEBUG
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 07b3b36f40ee..7f92ab1daa87 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2317,7 +2317,6 @@ void free_reloc_roots(struct list_head *list)
2317static noinline_for_stack 2317static noinline_for_stack
2318int merge_reloc_roots(struct reloc_control *rc) 2318int merge_reloc_roots(struct reloc_control *rc)
2319{ 2319{
2320 struct btrfs_trans_handle *trans;
2321 struct btrfs_root *root; 2320 struct btrfs_root *root;
2322 struct btrfs_root *reloc_root; 2321 struct btrfs_root *reloc_root;
2323 u64 last_snap; 2322 u64 last_snap;
@@ -2375,26 +2374,6 @@ again:
2375 list_add_tail(&reloc_root->root_list, 2374 list_add_tail(&reloc_root->root_list,
2376 &reloc_roots); 2375 &reloc_roots);
2377 goto out; 2376 goto out;
2378 } else if (!ret) {
2379 /*
2380 * recover the last snapshot tranid to avoid
2381 * the space balance break NOCOW.
2382 */
2383 root = read_fs_root(rc->extent_root->fs_info,
2384 objectid);
2385 if (IS_ERR(root))
2386 continue;
2387
2388 trans = btrfs_join_transaction(root);
2389 BUG_ON(IS_ERR(trans));
2390
2391 /* Check if the fs/file tree was snapshoted or not. */
2392 if (btrfs_root_last_snapshot(&root->root_item) ==
2393 otransid - 1)
2394 btrfs_set_root_last_snapshot(&root->root_item,
2395 last_snap);
2396
2397 btrfs_end_transaction(trans, root);
2398 } 2377 }
2399 } 2378 }
2400 2379
@@ -4248,7 +4227,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4248 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", 4227 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
4249 rc->block_group->key.objectid, rc->block_group->flags); 4228 rc->block_group->key.objectid, rc->block_group->flags);
4250 4229
4251 ret = btrfs_start_delalloc_roots(fs_info, 0); 4230 ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
4252 if (ret < 0) { 4231 if (ret < 0) {
4253 err = ret; 4232 err = ret;
4254 goto out; 4233 goto out;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 1389b69059de..38bb47e7d6b1 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -16,6 +16,7 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/err.h>
19#include <linux/uuid.h> 20#include <linux/uuid.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "transaction.h" 22#include "transaction.h"
@@ -271,7 +272,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
271 key.offset++; 272 key.offset++;
272 273
273 root = btrfs_read_fs_root(tree_root, &root_key); 274 root = btrfs_read_fs_root(tree_root, &root_key);
274 err = PTR_RET(root); 275 err = PTR_ERR_OR_ZERO(root);
275 if (err && err != -ENOENT) { 276 if (err && err != -ENOENT) {
276 break; 277 break;
277 } else if (err == -ENOENT) { 278 } else if (err == -ENOENT) {
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efba5d1282ee..0be77993378e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -315,6 +315,16 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
315 atomic_inc(&fs_info->scrubs_running); 315 atomic_inc(&fs_info->scrubs_running);
316 atomic_inc(&fs_info->scrubs_paused); 316 atomic_inc(&fs_info->scrubs_paused);
317 mutex_unlock(&fs_info->scrub_lock); 317 mutex_unlock(&fs_info->scrub_lock);
318
319 /*
 320	 * The @scrubs_running == @scrubs_paused check inside
 321	 * wait_event() is not an atomic operation, which means
 322	 * we may inc/dec @scrubs_running/@scrubs_paused at any
 323	 * time. Wake up @scrub_pause_wait as often as we can so
 324	 * that a blocked transaction commit waits less.
325 */
326 wake_up(&fs_info->scrub_pause_wait);
327
318 atomic_inc(&sctx->workers_pending); 328 atomic_inc(&sctx->workers_pending);
319} 329}
320 330
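
The comment above relies on the usual wait_event()/wake_up() contract: waiters re-check the condition after every wake-up, so waking too often is harmless, while a missed wake-up could leave a transaction commit blocked. A stripped-down sketch of that pairing, with hypothetical counters standing in for @scrubs_running/@scrubs_paused:

#include <linux/wait.h>
#include <linux/atomic.h>

static atomic_t running, paused;
static DECLARE_WAIT_QUEUE_HEAD(pause_wait);

static void worker_pause(void)
{
	atomic_inc(&paused);
	/* Eager wake-up: the waiter re-evaluates the condition, so a
	 * spurious wake costs little; a missed one would stall it. */
	wake_up(&pause_wait);
}

static void commit_wait_for_pause(void)
{
	wait_event(pause_wait,
		   atomic_read(&running) == atomic_read(&paused));
}
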
@@ -418,7 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
418 sbio->index = i; 428 sbio->index = i;
419 sbio->sctx = sctx; 429 sbio->sctx = sctx;
420 sbio->page_count = 0; 430 sbio->page_count = 0;
421 sbio->work.func = scrub_bio_end_io_worker; 431 btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
432 NULL, NULL);
422 433
423 if (i != SCRUB_BIOS_PER_SCTX - 1) 434 if (i != SCRUB_BIOS_PER_SCTX - 1)
424 sctx->bios[i]->next_free = i + 1; 435 sctx->bios[i]->next_free = i + 1;
@@ -987,9 +998,10 @@ nodatasum_case:
987 fixup_nodatasum->root = fs_info->extent_root; 998 fixup_nodatasum->root = fs_info->extent_root;
988 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 999 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
989 scrub_pending_trans_workers_inc(sctx); 1000 scrub_pending_trans_workers_inc(sctx);
990 fixup_nodatasum->work.func = scrub_fixup_nodatasum; 1001 btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
991 btrfs_queue_worker(&fs_info->scrub_workers, 1002 NULL, NULL);
992 &fixup_nodatasum->work); 1003 btrfs_queue_work(fs_info->scrub_workers,
1004 &fixup_nodatasum->work);
993 goto out; 1005 goto out;
994 } 1006 }
995 1007
@@ -1603,8 +1615,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
1603 sbio->err = err; 1615 sbio->err = err;
1604 sbio->bio = bio; 1616 sbio->bio = bio;
1605 1617
1606 sbio->work.func = scrub_wr_bio_end_io_worker; 1618 btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
1607 btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); 1619 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1608} 1620}
1609 1621
1610static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) 1622static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
@@ -2072,7 +2084,7 @@ static void scrub_bio_end_io(struct bio *bio, int err)
2072 sbio->err = err; 2084 sbio->err = err;
2073 sbio->bio = bio; 2085 sbio->bio = bio;
2074 2086
2075 btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); 2087 btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2076} 2088}
2077 2089
2078static void scrub_bio_end_io_worker(struct btrfs_work *work) 2090static void scrub_bio_end_io_worker(struct btrfs_work *work)
@@ -2223,6 +2235,47 @@ behind_scrub_pages:
2223 return 0; 2235 return 0;
2224} 2236}
2225 2237
2238/*
 2239 * Given a physical address, this will calculate its
 2240 * logical offset. If this is a parity stripe, it will return
 2241 * the leftmost data stripe's logical offset.
 2242 *
 2243 * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
2244 */
2245static int get_raid56_logic_offset(u64 physical, int num,
2246 struct map_lookup *map, u64 *offset)
2247{
2248 int i;
2249 int j = 0;
2250 u64 stripe_nr;
2251 u64 last_offset;
2252 int stripe_index;
2253 int rot;
2254
2255 last_offset = (physical - map->stripes[num].physical) *
2256 nr_data_stripes(map);
2257 *offset = last_offset;
2258 for (i = 0; i < nr_data_stripes(map); i++) {
2259 *offset = last_offset + i * map->stripe_len;
2260
2261 stripe_nr = *offset;
2262 do_div(stripe_nr, map->stripe_len);
2263 do_div(stripe_nr, nr_data_stripes(map));
2264
2265 /* Work out the disk rotation on this stripe-set */
2266 rot = do_div(stripe_nr, map->num_stripes);
 2267 /* calculate which stripe this data is located on */
2268 rot += i;
2269 stripe_index = rot % map->num_stripes;
2270 if (stripe_index == num)
2271 return 0;
2272 if (stripe_index < num)
2273 j++;
2274 }
2275 *offset = last_offset + j * map->stripe_len;
2276 return 1;
2277}
2278
2226static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 2279static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2227 struct map_lookup *map, 2280 struct map_lookup *map,
2228 struct btrfs_device *scrub_dev, 2281 struct btrfs_device *scrub_dev,
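
The rotation math in get_raid56_logic_offset() can be checked in userspace. The sketch below re-derives it with plain division in place of do_div() and a chunk-relative physical offset, assuming an illustrative 3-device RAID5 layout with a 64K stripe_len: for device 2, physical 0 lands on a parity stripe, while physical 64K resolves to logical 192K, the second data stripe of full stripe 1.

#include <stdio.h>
#include <stdint.h>

#define NUM_STRIPES	3		/* devices in the chunk */
#define NR_DATA		2		/* data stripes per full stripe (RAID5) */
#define STRIPE_LEN	(64 * 1024ULL)

static int raid56_logic_offset(uint64_t physical, int num, uint64_t *offset)
{
	/* the kernel code subtracts map->stripes[num].physical first */
	uint64_t last_offset = physical * NR_DATA;
	int i, j = 0;

	for (i = 0; i < NR_DATA; i++) {
		uint64_t stripe_nr, rot;

		*offset = last_offset + i * STRIPE_LEN;
		stripe_nr = *offset / STRIPE_LEN / NR_DATA;
		rot = (stripe_nr % NUM_STRIPES + i) % NUM_STRIPES;
		if (rot == (uint64_t)num)
			return 0;	/* data stripe on this device */
		if (rot < (uint64_t)num)
			j++;
	}
	*offset = last_offset + j * STRIPE_LEN;
	return 1;			/* parity stripe on this device */
}

int main(void)
{
	uint64_t off;
	int ret;

	ret = raid56_logic_offset(0, 2, &off);
	/* parity of full stripe 0: prints "ret 1 off 131072" */
	printf("phys 0   -> ret %d off %llu\n", ret, (unsigned long long)off);

	ret = raid56_logic_offset(STRIPE_LEN, 2, &off);
	/* data stripe: prints "ret 0 off 196608" (logical 192K) */
	printf("phys 64K -> ret %d off %llu\n", ret, (unsigned long long)off);
	return 0;
}
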
@@ -2244,6 +2297,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2244 u64 physical; 2297 u64 physical;
2245 u64 logical; 2298 u64 logical;
2246 u64 logic_end; 2299 u64 logic_end;
2300 u64 physical_end;
2247 u64 generation; 2301 u64 generation;
2248 int mirror_num; 2302 int mirror_num;
2249 struct reada_control *reada1; 2303 struct reada_control *reada1;
@@ -2257,16 +2311,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2257 u64 extent_len; 2311 u64 extent_len;
2258 struct btrfs_device *extent_dev; 2312 struct btrfs_device *extent_dev;
2259 int extent_mirror_num; 2313 int extent_mirror_num;
2260 int stop_loop; 2314 int stop_loop = 0;
2261
2262 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2263 BTRFS_BLOCK_GROUP_RAID6)) {
2264 if (num >= nr_data_stripes(map)) {
2265 return 0;
2266 }
2267 }
2268 2315
2269 nstripes = length; 2316 nstripes = length;
2317 physical = map->stripes[num].physical;
2270 offset = 0; 2318 offset = 0;
2271 do_div(nstripes, map->stripe_len); 2319 do_div(nstripes, map->stripe_len);
2272 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 2320 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
@@ -2284,6 +2332,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2284 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2332 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2285 increment = map->stripe_len; 2333 increment = map->stripe_len;
2286 mirror_num = num % map->num_stripes + 1; 2334 mirror_num = num % map->num_stripes + 1;
2335 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2336 BTRFS_BLOCK_GROUP_RAID6)) {
2337 get_raid56_logic_offset(physical, num, map, &offset);
2338 increment = map->stripe_len * nr_data_stripes(map);
2339 mirror_num = 1;
2287 } else { 2340 } else {
2288 increment = map->stripe_len; 2341 increment = map->stripe_len;
2289 mirror_num = 1; 2342 mirror_num = 1;
@@ -2307,7 +2360,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2307 * to not hold off transaction commits 2360 * to not hold off transaction commits
2308 */ 2361 */
2309 logical = base + offset; 2362 logical = base + offset;
2310 2363 physical_end = physical + nstripes * map->stripe_len;
2364 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2365 BTRFS_BLOCK_GROUP_RAID6)) {
2366 get_raid56_logic_offset(physical_end, num,
2367 map, &logic_end);
2368 logic_end += base;
2369 } else {
2370 logic_end = logical + increment * nstripes;
2371 }
2311 wait_event(sctx->list_wait, 2372 wait_event(sctx->list_wait,
2312 atomic_read(&sctx->bios_in_flight) == 0); 2373 atomic_read(&sctx->bios_in_flight) == 0);
2313 scrub_blocked_if_needed(fs_info); 2374 scrub_blocked_if_needed(fs_info);
@@ -2316,7 +2377,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2316 key_start.objectid = logical; 2377 key_start.objectid = logical;
2317 key_start.type = BTRFS_EXTENT_ITEM_KEY; 2378 key_start.type = BTRFS_EXTENT_ITEM_KEY;
2318 key_start.offset = (u64)0; 2379 key_start.offset = (u64)0;
2319 key_end.objectid = base + offset + nstripes * increment; 2380 key_end.objectid = logic_end;
2320 key_end.type = BTRFS_METADATA_ITEM_KEY; 2381 key_end.type = BTRFS_METADATA_ITEM_KEY;
2321 key_end.offset = (u64)-1; 2382 key_end.offset = (u64)-1;
2322 reada1 = btrfs_reada_add(root, &key_start, &key_end); 2383 reada1 = btrfs_reada_add(root, &key_start, &key_end);
@@ -2326,7 +2387,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2326 key_start.offset = logical; 2387 key_start.offset = logical;
2327 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 2388 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2328 key_end.type = BTRFS_EXTENT_CSUM_KEY; 2389 key_end.type = BTRFS_EXTENT_CSUM_KEY;
2329 key_end.offset = base + offset + nstripes * increment; 2390 key_end.offset = logic_end;
2330 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); 2391 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
2331 2392
2332 if (!IS_ERR(reada1)) 2393 if (!IS_ERR(reada1))
@@ -2344,11 +2405,17 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2344 /* 2405 /*
2345 * now find all extents for each stripe and scrub them 2406 * now find all extents for each stripe and scrub them
2346 */ 2407 */
2347 logical = base + offset;
2348 physical = map->stripes[num].physical;
2349 logic_end = logical + increment * nstripes;
2350 ret = 0; 2408 ret = 0;
2351 while (logical < logic_end) { 2409 while (physical < physical_end) {
 2410 /* for raid56, we skip the parity stripe */
2411 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2412 BTRFS_BLOCK_GROUP_RAID6)) {
2413 ret = get_raid56_logic_offset(physical, num,
2414 map, &logical);
2415 logical += base;
2416 if (ret)
2417 goto skip;
2418 }
2352 /* 2419 /*
2353 * canceled? 2420 * canceled?
2354 */ 2421 */
@@ -2492,15 +2559,29 @@ again:
2492 scrub_free_csums(sctx); 2559 scrub_free_csums(sctx);
2493 if (extent_logical + extent_len < 2560 if (extent_logical + extent_len <
2494 key.objectid + bytes) { 2561 key.objectid + bytes) {
2495 logical += increment; 2562 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2496 physical += map->stripe_len; 2563 BTRFS_BLOCK_GROUP_RAID6)) {
2497 2564 /*
 2565 * loop until we find the next data stripe
2566 * or we have finished all stripes.
2567 */
2568 do {
2569 physical += map->stripe_len;
2570 ret = get_raid56_logic_offset(
2571 physical, num,
2572 map, &logical);
2573 logical += base;
2574 } while (physical < physical_end && ret);
2575 } else {
2576 physical += map->stripe_len;
2577 logical += increment;
2578 }
2498 if (logical < key.objectid + bytes) { 2579 if (logical < key.objectid + bytes) {
2499 cond_resched(); 2580 cond_resched();
2500 goto again; 2581 goto again;
2501 } 2582 }
2502 2583
2503 if (logical >= logic_end) { 2584 if (physical >= physical_end) {
2504 stop_loop = 1; 2585 stop_loop = 1;
2505 break; 2586 break;
2506 } 2587 }
@@ -2509,6 +2590,7 @@ next:
2509 path->slots[0]++; 2590 path->slots[0]++;
2510 } 2591 }
2511 btrfs_release_path(path); 2592 btrfs_release_path(path);
2593skip:
2512 logical += increment; 2594 logical += increment;
2513 physical += map->stripe_len; 2595 physical += map->stripe_len;
2514 spin_lock(&sctx->stat_lock); 2596 spin_lock(&sctx->stat_lock);
@@ -2686,10 +2768,23 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2686 2768
2687 wait_event(sctx->list_wait, 2769 wait_event(sctx->list_wait,
2688 atomic_read(&sctx->bios_in_flight) == 0); 2770 atomic_read(&sctx->bios_in_flight) == 0);
2689 atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 2771 atomic_inc(&fs_info->scrubs_paused);
2772 wake_up(&fs_info->scrub_pause_wait);
2773
2774 /*
 2775 * This wait must happen before we decrease @scrub_paused, to
 2776 * make sure we don't block transaction commit while we are
 2777 * waiting for pending workers to finish.
2778 */
2690 wait_event(sctx->list_wait, 2779 wait_event(sctx->list_wait,
2691 atomic_read(&sctx->workers_pending) == 0); 2780 atomic_read(&sctx->workers_pending) == 0);
2692 scrub_blocked_if_needed(fs_info); 2781 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2782
2783 mutex_lock(&fs_info->scrub_lock);
2784 __scrub_blocked_if_needed(fs_info);
2785 atomic_dec(&fs_info->scrubs_paused);
2786 mutex_unlock(&fs_info->scrub_lock);
2787 wake_up(&fs_info->scrub_pause_wait);
2693 2788
2694 btrfs_put_block_group(cache); 2789 btrfs_put_block_group(cache);
2695 if (ret) 2790 if (ret)
@@ -2757,33 +2852,35 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2757 int is_dev_replace) 2852 int is_dev_replace)
2758{ 2853{
2759 int ret = 0; 2854 int ret = 0;
2855 int flags = WQ_FREEZABLE | WQ_UNBOUND;
2856 int max_active = fs_info->thread_pool_size;
2760 2857
2761 if (fs_info->scrub_workers_refcnt == 0) { 2858 if (fs_info->scrub_workers_refcnt == 0) {
2762 if (is_dev_replace) 2859 if (is_dev_replace)
2763 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, 2860 fs_info->scrub_workers =
2764 &fs_info->generic_worker); 2861 btrfs_alloc_workqueue("btrfs-scrub", flags,
2862 1, 4);
2765 else 2863 else
2766 btrfs_init_workers(&fs_info->scrub_workers, "scrub", 2864 fs_info->scrub_workers =
2767 fs_info->thread_pool_size, 2865 btrfs_alloc_workqueue("btrfs-scrub", flags,
2768 &fs_info->generic_worker); 2866 max_active, 4);
2769 fs_info->scrub_workers.idle_thresh = 4; 2867 if (!fs_info->scrub_workers) {
2770 ret = btrfs_start_workers(&fs_info->scrub_workers); 2868 ret = -ENOMEM;
2771 if (ret)
2772 goto out; 2869 goto out;
2773 btrfs_init_workers(&fs_info->scrub_wr_completion_workers, 2870 }
2774 "scrubwrc", 2871 fs_info->scrub_wr_completion_workers =
2775 fs_info->thread_pool_size, 2872 btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
2776 &fs_info->generic_worker); 2873 max_active, 2);
2777 fs_info->scrub_wr_completion_workers.idle_thresh = 2; 2874 if (!fs_info->scrub_wr_completion_workers) {
2778 ret = btrfs_start_workers( 2875 ret = -ENOMEM;
2779 &fs_info->scrub_wr_completion_workers);
2780 if (ret)
2781 goto out; 2876 goto out;
2782 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, 2877 }
2783 &fs_info->generic_worker); 2878 fs_info->scrub_nocow_workers =
2784 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); 2879 btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
2785 if (ret) 2880 if (!fs_info->scrub_nocow_workers) {
2881 ret = -ENOMEM;
2786 goto out; 2882 goto out;
2883 }
2787 } 2884 }
2788 ++fs_info->scrub_workers_refcnt; 2885 ++fs_info->scrub_workers_refcnt;
2789out: 2886out:
@@ -2793,9 +2890,9 @@ out:
2793static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) 2890static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2794{ 2891{
2795 if (--fs_info->scrub_workers_refcnt == 0) { 2892 if (--fs_info->scrub_workers_refcnt == 0) {
2796 btrfs_stop_workers(&fs_info->scrub_workers); 2893 btrfs_destroy_workqueue(fs_info->scrub_workers);
2797 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); 2894 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
2798 btrfs_stop_workers(&fs_info->scrub_nocow_workers); 2895 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
2799 } 2896 }
2800 WARN_ON(fs_info->scrub_workers_refcnt < 0); 2897 WARN_ON(fs_info->scrub_workers_refcnt < 0);
2801} 2898}
@@ -3106,10 +3203,10 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3106 nocow_ctx->len = len; 3203 nocow_ctx->len = len;
3107 nocow_ctx->mirror_num = mirror_num; 3204 nocow_ctx->mirror_num = mirror_num;
3108 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; 3205 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3109 nocow_ctx->work.func = copy_nocow_pages_worker; 3206 btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL);
3110 INIT_LIST_HEAD(&nocow_ctx->inodes); 3207 INIT_LIST_HEAD(&nocow_ctx->inodes);
3111 btrfs_queue_worker(&fs_info->scrub_nocow_workers, 3208 btrfs_queue_work(fs_info->scrub_nocow_workers,
3112 &nocow_ctx->work); 3209 &nocow_ctx->work);
3113 3210
3114 return 0; 3211 return 0;
3115} 3212}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 9dde9717c1b9..484aacac2c89 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -51,15 +51,18 @@ struct fs_path {
51 struct { 51 struct {
52 char *start; 52 char *start;
53 char *end; 53 char *end;
54 char *prepared;
55 54
56 char *buf; 55 char *buf;
57 int buf_len; 56 unsigned short buf_len:15;
58 unsigned int reversed:1; 57 unsigned short reversed:1;
59 unsigned int virtual_mem:1;
60 char inline_buf[]; 58 char inline_buf[];
61 }; 59 };
62 char pad[PAGE_SIZE]; 60 /*
 61 * Average path length does not exceed 200 bytes, so we'll have
 62 * better packing in the slab and a higher chance of satisfying
 63 * an allocation later during send.
64 */
65 char pad[256];
63 }; 66 };
64}; 67};
65#define FS_PATH_INLINE_SIZE \ 68#define FS_PATH_INLINE_SIZE \
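
fs_path keeps short paths in inline_buf by overlaying a fixed-size pad on the header, so no second allocation happens until a path outgrows the pad; shrinking pad from PAGE_SIZE to 256 bytes is what buys the better slab packing described in the comment. A minimal userspace sketch of the same union layout (hypothetical names; the anonymous members and the flexible array inside a union follow the kernel's GNU C usage):

#include <stdio.h>
#include <stddef.h>

struct small_path {
	union {
		struct {
			char *start;
			char *end;
			char *buf;
			unsigned short buf_len:15;
			unsigned short reversed:1;
			char inline_buf[];
		};
		char pad[256];
	};
};

#define INLINE_SIZE \
	(sizeof(struct small_path) - offsetof(struct small_path, inline_buf))

int main(void)
{
	struct small_path p;

	p.buf = p.inline_buf;	/* short paths need no heap allocation */
	p.buf_len = INLINE_SIZE;
	/* on LP64 this prints: struct size 256, inline capacity 230 */
	printf("struct size %zu, inline capacity %zu\n",
	       sizeof(p), (size_t)INLINE_SIZE);
	return 0;
}
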
@@ -109,6 +112,7 @@ struct send_ctx {
109 int cur_inode_deleted; 112 int cur_inode_deleted;
110 u64 cur_inode_size; 113 u64 cur_inode_size;
111 u64 cur_inode_mode; 114 u64 cur_inode_mode;
115 u64 cur_inode_rdev;
112 u64 cur_inode_last_extent; 116 u64 cur_inode_last_extent;
113 117
114 u64 send_progress; 118 u64 send_progress;
@@ -120,6 +124,8 @@ struct send_ctx {
120 struct list_head name_cache_list; 124 struct list_head name_cache_list;
121 int name_cache_size; 125 int name_cache_size;
122 126
127 struct file_ra_state ra;
128
123 char *read_buf; 129 char *read_buf;
124 130
125 /* 131 /*
@@ -175,6 +181,47 @@ struct send_ctx {
175 * own move/rename can be performed. 181 * own move/rename can be performed.
176 */ 182 */
177 struct rb_root waiting_dir_moves; 183 struct rb_root waiting_dir_moves;
184
185 /*
186 * A directory that is going to be rm'ed might have a child directory
187 * which is in the pending directory moves index above. In this case,
188 * the directory can only be removed after the move/rename of its child
189 * is performed. Example:
190 *
191 * Parent snapshot:
192 *
193 * . (ino 256)
194 * |-- a/ (ino 257)
195 * |-- b/ (ino 258)
196 * |-- c/ (ino 259)
197 * | |-- x/ (ino 260)
198 * |
199 * |-- y/ (ino 261)
200 *
201 * Send snapshot:
202 *
203 * . (ino 256)
204 * |-- a/ (ino 257)
205 * |-- b/ (ino 258)
206 * |-- YY/ (ino 261)
207 * |-- x/ (ino 260)
208 *
209 * Sequence of steps that lead to the send snapshot:
210 * rm -f /a/b/c/foo.txt
211 * mv /a/b/y /a/b/YY
212 * mv /a/b/c/x /a/b/YY
213 * rmdir /a/b/c
214 *
215 * When the child is processed, its move/rename is delayed until its
216 * parent is processed (as explained above), but all other operations
 217 * like updating utimes, chown, chgrp, etc., are performed and the paths
218 * that it uses for those operations must use the orphanized name of
219 * its parent (the directory we're going to rm later), so we need to
220 * memorize that name.
221 *
222 * Indexed by the inode number of the directory to be deleted.
223 */
224 struct rb_root orphan_dirs;
178}; 225};
179 226
180struct pending_dir_move { 227struct pending_dir_move {
@@ -189,6 +236,18 @@ struct pending_dir_move {
189struct waiting_dir_move { 236struct waiting_dir_move {
190 struct rb_node node; 237 struct rb_node node;
191 u64 ino; 238 u64 ino;
239 /*
 240 * There might be a directory that could not be removed because it
 241 * was waiting for this directory inode to be moved first. Therefore,
 242 * after this directory is moved, we can try to rmdir the inode rmdir_ino.
243 */
244 u64 rmdir_ino;
245};
246
247struct orphan_dir_info {
248 struct rb_node node;
249 u64 ino;
250 u64 gen;
192}; 251};
193 252
194struct name_cache_entry { 253struct name_cache_entry {
@@ -214,6 +273,11 @@ struct name_cache_entry {
214 273
215static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); 274static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
216 275
276static struct waiting_dir_move *
277get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
278
279static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
280
217static int need_send_hole(struct send_ctx *sctx) 281static int need_send_hole(struct send_ctx *sctx)
218{ 282{
219 return (sctx->parent_root && !sctx->cur_inode_new && 283 return (sctx->parent_root && !sctx->cur_inode_new &&
@@ -242,7 +306,6 @@ static struct fs_path *fs_path_alloc(void)
242 if (!p) 306 if (!p)
243 return NULL; 307 return NULL;
244 p->reversed = 0; 308 p->reversed = 0;
245 p->virtual_mem = 0;
246 p->buf = p->inline_buf; 309 p->buf = p->inline_buf;
247 p->buf_len = FS_PATH_INLINE_SIZE; 310 p->buf_len = FS_PATH_INLINE_SIZE;
248 fs_path_reset(p); 311 fs_path_reset(p);
@@ -265,12 +328,8 @@ static void fs_path_free(struct fs_path *p)
265{ 328{
266 if (!p) 329 if (!p)
267 return; 330 return;
268 if (p->buf != p->inline_buf) { 331 if (p->buf != p->inline_buf)
269 if (p->virtual_mem) 332 kfree(p->buf);
270 vfree(p->buf);
271 else
272 kfree(p->buf);
273 }
274 kfree(p); 333 kfree(p);
275} 334}
276 335
@@ -290,42 +349,33 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
290 if (p->buf_len >= len) 349 if (p->buf_len >= len)
291 return 0; 350 return 0;
292 351
352 if (len > PATH_MAX) {
353 WARN_ON(1);
354 return -ENOMEM;
355 }
356
293 path_len = p->end - p->start; 357 path_len = p->end - p->start;
294 old_buf_len = p->buf_len; 358 old_buf_len = p->buf_len;
295 len = PAGE_ALIGN(len);
296 359
360 /*
 361	 * The first time the inline_buf does not suffice
362 */
297 if (p->buf == p->inline_buf) { 363 if (p->buf == p->inline_buf) {
298 tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN); 364 tmp_buf = kmalloc(len, GFP_NOFS);
299 if (!tmp_buf) { 365 if (tmp_buf)
300 tmp_buf = vmalloc(len); 366 memcpy(tmp_buf, p->buf, old_buf_len);
301 if (!tmp_buf)
302 return -ENOMEM;
303 p->virtual_mem = 1;
304 }
305 memcpy(tmp_buf, p->buf, p->buf_len);
306 p->buf = tmp_buf;
307 p->buf_len = len;
308 } else { 367 } else {
309 if (p->virtual_mem) { 368 tmp_buf = krealloc(p->buf, len, GFP_NOFS);
310 tmp_buf = vmalloc(len);
311 if (!tmp_buf)
312 return -ENOMEM;
313 memcpy(tmp_buf, p->buf, p->buf_len);
314 vfree(p->buf);
315 } else {
316 tmp_buf = krealloc(p->buf, len, GFP_NOFS);
317 if (!tmp_buf) {
318 tmp_buf = vmalloc(len);
319 if (!tmp_buf)
320 return -ENOMEM;
321 memcpy(tmp_buf, p->buf, p->buf_len);
322 kfree(p->buf);
323 p->virtual_mem = 1;
324 }
325 }
326 p->buf = tmp_buf;
327 p->buf_len = len;
328 } 369 }
370 if (!tmp_buf)
371 return -ENOMEM;
372 p->buf = tmp_buf;
373 /*
 374	 * The real size of the buffer is bigger, which will let the fast
 375	 * path happen most of the time.
376 */
377 p->buf_len = ksize(p->buf);
378
329 if (p->reversed) { 379 if (p->reversed) {
330 tmp_buf = p->buf + old_buf_len - path_len - 1; 380 tmp_buf = p->buf + old_buf_len - path_len - 1;
331 p->end = p->buf + p->buf_len - 1; 381 p->end = p->buf + p->buf_len - 1;
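
The rewritten fs_path_ensure_buf() drops the vmalloc fallback, refuses anything above PATH_MAX, and records ksize(), the slab's real allocation size, so subsequent calls usually take the early-return fast path. A userspace analogue of that growth strategy, using glibc's malloc_usable_size() in place of ksize(); the helper name and error convention are illustrative:

#include <stdlib.h>
#include <malloc.h>	/* malloc_usable_size(), glibc-specific */

#define MY_PATH_MAX 4096

/* Grow *buf to at least len bytes and record the allocator's real
 * capacity. realloc(NULL, len) behaves like malloc, so this also
 * covers the first allocation. */
static int ensure_buf(char **buf, size_t *buf_len, size_t len)
{
	char *tmp;

	if (*buf_len >= len)
		return 0;	/* fast path */
	if (len > MY_PATH_MAX)
		return -1;	/* the kernel code returns -ENOMEM here */

	tmp = realloc(*buf, len);
	if (!tmp)
		return -1;
	*buf = tmp;
	*buf_len = malloc_usable_size(tmp);	/* real capacity >= len */
	return 0;
}

The kernel version additionally handles the first growth out of inline_buf with kmalloc() plus memcpy(), since an inline buffer cannot be krealloc()'d.
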
@@ -338,7 +388,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
338 return 0; 388 return 0;
339} 389}
340 390
341static int fs_path_prepare_for_add(struct fs_path *p, int name_len) 391static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
392 char **prepared)
342{ 393{
343 int ret; 394 int ret;
344 int new_len; 395 int new_len;
@@ -354,11 +405,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
354 if (p->start != p->end) 405 if (p->start != p->end)
355 *--p->start = '/'; 406 *--p->start = '/';
356 p->start -= name_len; 407 p->start -= name_len;
357 p->prepared = p->start; 408 *prepared = p->start;
358 } else { 409 } else {
359 if (p->start != p->end) 410 if (p->start != p->end)
360 *p->end++ = '/'; 411 *p->end++ = '/';
361 p->prepared = p->end; 412 *prepared = p->end;
362 p->end += name_len; 413 p->end += name_len;
363 *p->end = 0; 414 *p->end = 0;
364 } 415 }
@@ -370,12 +421,12 @@ out:
370static int fs_path_add(struct fs_path *p, const char *name, int name_len) 421static int fs_path_add(struct fs_path *p, const char *name, int name_len)
371{ 422{
372 int ret; 423 int ret;
424 char *prepared;
373 425
374 ret = fs_path_prepare_for_add(p, name_len); 426 ret = fs_path_prepare_for_add(p, name_len, &prepared);
375 if (ret < 0) 427 if (ret < 0)
376 goto out; 428 goto out;
377 memcpy(p->prepared, name, name_len); 429 memcpy(prepared, name, name_len);
378 p->prepared = NULL;
379 430
380out: 431out:
381 return ret; 432 return ret;
@@ -384,12 +435,12 @@ out:
384static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) 435static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
385{ 436{
386 int ret; 437 int ret;
438 char *prepared;
387 439
388 ret = fs_path_prepare_for_add(p, p2->end - p2->start); 440 ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
389 if (ret < 0) 441 if (ret < 0)
390 goto out; 442 goto out;
391 memcpy(p->prepared, p2->start, p2->end - p2->start); 443 memcpy(prepared, p2->start, p2->end - p2->start);
392 p->prepared = NULL;
393 444
394out: 445out:
395 return ret; 446 return ret;
@@ -400,13 +451,13 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,
400 unsigned long off, int len) 451 unsigned long off, int len)
401{ 452{
402 int ret; 453 int ret;
454 char *prepared;
403 455
404 ret = fs_path_prepare_for_add(p, len); 456 ret = fs_path_prepare_for_add(p, len, &prepared);
405 if (ret < 0) 457 if (ret < 0)
406 goto out; 458 goto out;
407 459
408 read_extent_buffer(eb, p->prepared, off, len); 460 read_extent_buffer(eb, prepared, off, len);
409 p->prepared = NULL;
410 461
411out: 462out:
412 return ret; 463 return ret;
@@ -450,6 +501,7 @@ static struct btrfs_path *alloc_path_for_send(void)
450 return NULL; 501 return NULL;
451 path->search_commit_root = 1; 502 path->search_commit_root = 1;
452 path->skip_locking = 1; 503 path->skip_locking = 1;
504 path->need_commit_sem = 1;
453 return path; 505 return path;
454} 506}
455 507
@@ -728,29 +780,22 @@ out:
728/* 780/*
729 * Helper function to retrieve some fields from an inode item. 781 * Helper function to retrieve some fields from an inode item.
730 */ 782 */
731static int get_inode_info(struct btrfs_root *root, 783static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
732 u64 ino, u64 *size, u64 *gen, 784 u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
733 u64 *mode, u64 *uid, u64 *gid, 785 u64 *gid, u64 *rdev)
734 u64 *rdev)
735{ 786{
736 int ret; 787 int ret;
737 struct btrfs_inode_item *ii; 788 struct btrfs_inode_item *ii;
738 struct btrfs_key key; 789 struct btrfs_key key;
739 struct btrfs_path *path;
740
741 path = alloc_path_for_send();
742 if (!path)
743 return -ENOMEM;
744 790
745 key.objectid = ino; 791 key.objectid = ino;
746 key.type = BTRFS_INODE_ITEM_KEY; 792 key.type = BTRFS_INODE_ITEM_KEY;
747 key.offset = 0; 793 key.offset = 0;
748 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 794 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
749 if (ret < 0)
750 goto out;
751 if (ret) { 795 if (ret) {
752 ret = -ENOENT; 796 if (ret > 0)
753 goto out; 797 ret = -ENOENT;
798 return ret;
754 } 799 }
755 800
756 ii = btrfs_item_ptr(path->nodes[0], path->slots[0], 801 ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -768,7 +813,22 @@ static int get_inode_info(struct btrfs_root *root,
768 if (rdev) 813 if (rdev)
769 *rdev = btrfs_inode_rdev(path->nodes[0], ii); 814 *rdev = btrfs_inode_rdev(path->nodes[0], ii);
770 815
771out: 816 return ret;
817}
818
819static int get_inode_info(struct btrfs_root *root,
820 u64 ino, u64 *size, u64 *gen,
821 u64 *mode, u64 *uid, u64 *gid,
822 u64 *rdev)
823{
824 struct btrfs_path *path;
825 int ret;
826
827 path = alloc_path_for_send();
828 if (!path)
829 return -ENOMEM;
830 ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
831 rdev);
772 btrfs_free_path(path); 832 btrfs_free_path(path);
773 return ret; 833 return ret;
774} 834}
@@ -915,9 +975,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
915 struct btrfs_dir_item *di; 975 struct btrfs_dir_item *di;
916 struct btrfs_key di_key; 976 struct btrfs_key di_key;
917 char *buf = NULL; 977 char *buf = NULL;
918 char *buf2 = NULL; 978 const int buf_len = PATH_MAX;
919 int buf_len;
920 int buf_virtual = 0;
921 u32 name_len; 979 u32 name_len;
922 u32 data_len; 980 u32 data_len;
923 u32 cur; 981 u32 cur;
@@ -927,7 +985,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
927 int num; 985 int num;
928 u8 type; 986 u8 type;
929 987
930 buf_len = PAGE_SIZE;
931 buf = kmalloc(buf_len, GFP_NOFS); 988 buf = kmalloc(buf_len, GFP_NOFS);
932 if (!buf) { 989 if (!buf) {
933 ret = -ENOMEM; 990 ret = -ENOMEM;
@@ -949,30 +1006,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
949 type = btrfs_dir_type(eb, di); 1006 type = btrfs_dir_type(eb, di);
950 btrfs_dir_item_key_to_cpu(eb, di, &di_key); 1007 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
951 1008
1009 /*
1010 * Path too long
1011 */
952 if (name_len + data_len > buf_len) { 1012 if (name_len + data_len > buf_len) {
953 buf_len = PAGE_ALIGN(name_len + data_len); 1013 ret = -ENAMETOOLONG;
954 if (buf_virtual) { 1014 goto out;
955 buf2 = vmalloc(buf_len);
956 if (!buf2) {
957 ret = -ENOMEM;
958 goto out;
959 }
960 vfree(buf);
961 } else {
962 buf2 = krealloc(buf, buf_len, GFP_NOFS);
963 if (!buf2) {
964 buf2 = vmalloc(buf_len);
965 if (!buf2) {
966 ret = -ENOMEM;
967 goto out;
968 }
969 kfree(buf);
970 buf_virtual = 1;
971 }
972 }
973
974 buf = buf2;
975 buf2 = NULL;
976 } 1015 }
977 1016
978 read_extent_buffer(eb, buf, (unsigned long)(di + 1), 1017 read_extent_buffer(eb, buf, (unsigned long)(di + 1),
@@ -995,10 +1034,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
995 } 1034 }
996 1035
997out: 1036out:
998 if (buf_virtual) 1037 kfree(buf);
999 vfree(buf);
1000 else
1001 kfree(buf);
1002 return ret; 1038 return ret;
1003} 1039}
1004 1040
@@ -1066,6 +1102,7 @@ out:
1066struct backref_ctx { 1102struct backref_ctx {
1067 struct send_ctx *sctx; 1103 struct send_ctx *sctx;
1068 1104
1105 struct btrfs_path *path;
1069 /* number of total found references */ 1106 /* number of total found references */
1070 u64 found; 1107 u64 found;
1071 1108
@@ -1136,8 +1173,9 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1136 * There are inodes that have extents that lie behind its i_size. Don't 1173 * There are inodes that have extents that lie behind its i_size. Don't
1137 * accept clones from these extents. 1174 * accept clones from these extents.
1138 */ 1175 */
1139 ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL, 1176 ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL,
1140 NULL); 1177 NULL, NULL, NULL);
1178 btrfs_release_path(bctx->path);
1141 if (ret < 0) 1179 if (ret < 0)
1142 return ret; 1180 return ret;
1143 1181
@@ -1216,12 +1254,17 @@ static int find_extent_clone(struct send_ctx *sctx,
1216 if (!tmp_path) 1254 if (!tmp_path)
1217 return -ENOMEM; 1255 return -ENOMEM;
1218 1256
1257 /* We only use this path under the commit sem */
1258 tmp_path->need_commit_sem = 0;
1259
1219 backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS); 1260 backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
1220 if (!backref_ctx) { 1261 if (!backref_ctx) {
1221 ret = -ENOMEM; 1262 ret = -ENOMEM;
1222 goto out; 1263 goto out;
1223 } 1264 }
1224 1265
1266 backref_ctx->path = tmp_path;
1267
1225 if (data_offset >= ino_size) { 1268 if (data_offset >= ino_size) {
1226 /* 1269 /*
1227 * There may be extents that lie behind the file's size. 1270 * There may be extents that lie behind the file's size.
@@ -1249,8 +1292,10 @@ static int find_extent_clone(struct send_ctx *sctx,
1249 } 1292 }
1250 logical = disk_byte + btrfs_file_extent_offset(eb, fi); 1293 logical = disk_byte + btrfs_file_extent_offset(eb, fi);
1251 1294
1295 down_read(&sctx->send_root->fs_info->commit_root_sem);
1252 ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path, 1296 ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
1253 &found_key, &flags); 1297 &found_key, &flags);
1298 up_read(&sctx->send_root->fs_info->commit_root_sem);
1254 btrfs_release_path(tmp_path); 1299 btrfs_release_path(tmp_path);
1255 1300
1256 if (ret < 0) 1301 if (ret < 0)
@@ -1292,8 +1337,6 @@ static int find_extent_clone(struct send_ctx *sctx,
1292 extent_item_pos = logical - found_key.objectid; 1337 extent_item_pos = logical - found_key.objectid;
1293 else 1338 else
1294 extent_item_pos = 0; 1339 extent_item_pos = 0;
1295
1296 extent_item_pos = logical - found_key.objectid;
1297 ret = iterate_extent_inodes(sctx->send_root->fs_info, 1340 ret = iterate_extent_inodes(sctx->send_root->fs_info,
1298 found_key.objectid, extent_item_pos, 1, 1341 found_key.objectid, extent_item_pos, 1,
1299 __iterate_backrefs, backref_ctx); 1342 __iterate_backrefs, backref_ctx);
@@ -1418,11 +1461,7 @@ static int gen_unique_name(struct send_ctx *sctx,
1418 while (1) { 1461 while (1) {
1419 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", 1462 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
1420 ino, gen, idx); 1463 ino, gen, idx);
1421 if (len >= sizeof(tmp)) { 1464 ASSERT(len < sizeof(tmp));
1422 /* should really not happen */
1423 ret = -EOVERFLOW;
1424 goto out;
1425 }
1426 1465
1427 di = btrfs_lookup_dir_item(NULL, sctx->send_root, 1466 di = btrfs_lookup_dir_item(NULL, sctx->send_root,
1428 path, BTRFS_FIRST_FREE_OBJECTID, 1467 path, BTRFS_FIRST_FREE_OBJECTID,
@@ -1632,7 +1671,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
1632 goto out; 1671 goto out;
1633 } 1672 }
1634 1673
1635 if (key.type == BTRFS_INODE_REF_KEY) { 1674 if (found_key.type == BTRFS_INODE_REF_KEY) {
1636 struct btrfs_inode_ref *iref; 1675 struct btrfs_inode_ref *iref;
1637 iref = btrfs_item_ptr(path->nodes[0], path->slots[0], 1676 iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
1638 struct btrfs_inode_ref); 1677 struct btrfs_inode_ref);
@@ -1898,13 +1937,20 @@ static void name_cache_delete(struct send_ctx *sctx,
1898 1937
1899 nce_head = radix_tree_lookup(&sctx->name_cache, 1938 nce_head = radix_tree_lookup(&sctx->name_cache,
1900 (unsigned long)nce->ino); 1939 (unsigned long)nce->ino);
1901 BUG_ON(!nce_head); 1940 if (!nce_head) {
1941 btrfs_err(sctx->send_root->fs_info,
1942 "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
1943 nce->ino, sctx->name_cache_size);
1944 }
1902 1945
1903 list_del(&nce->radix_list); 1946 list_del(&nce->radix_list);
1904 list_del(&nce->list); 1947 list_del(&nce->list);
1905 sctx->name_cache_size--; 1948 sctx->name_cache_size--;
1906 1949
1907 if (list_empty(nce_head)) { 1950 /*
1951 * We may not get to the final release of nce_head if the lookup fails
1952 */
1953 if (nce_head && list_empty(nce_head)) {
1908 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); 1954 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
1909 kfree(nce_head); 1955 kfree(nce_head);
1910 } 1956 }
@@ -1977,7 +2023,6 @@ static void name_cache_free(struct send_ctx *sctx)
1977 */ 2023 */
1978static int __get_cur_name_and_parent(struct send_ctx *sctx, 2024static int __get_cur_name_and_parent(struct send_ctx *sctx,
1979 u64 ino, u64 gen, 2025 u64 ino, u64 gen,
1980 int skip_name_cache,
1981 u64 *parent_ino, 2026 u64 *parent_ino,
1982 u64 *parent_gen, 2027 u64 *parent_gen,
1983 struct fs_path *dest) 2028 struct fs_path *dest)
@@ -1987,8 +2032,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
1987 struct btrfs_path *path = NULL; 2032 struct btrfs_path *path = NULL;
1988 struct name_cache_entry *nce = NULL; 2033 struct name_cache_entry *nce = NULL;
1989 2034
1990 if (skip_name_cache)
1991 goto get_ref;
1992 /* 2035 /*
1993 * First check if we already did a call to this function with the same 2036 * First check if we already did a call to this function with the same
1994 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes 2037 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
@@ -2033,12 +2076,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
2033 goto out_cache; 2076 goto out_cache;
2034 } 2077 }
2035 2078
2036get_ref:
2037 /* 2079 /*
2038 * Depending on whether the inode was already processed or not, use 2080 * Depending on whether the inode was already processed or not, use
2039 * send_root or parent_root for ref lookup. 2081 * send_root or parent_root for ref lookup.
2040 */ 2082 */
2041 if (ino < sctx->send_progress && !skip_name_cache) 2083 if (ino < sctx->send_progress)
2042 ret = get_first_ref(sctx->send_root, ino, 2084 ret = get_first_ref(sctx->send_root, ino,
2043 parent_ino, parent_gen, dest); 2085 parent_ino, parent_gen, dest);
2044 else 2086 else
@@ -2062,8 +2104,6 @@ get_ref:
2062 goto out; 2104 goto out;
2063 ret = 1; 2105 ret = 1;
2064 } 2106 }
2065 if (skip_name_cache)
2066 goto out;
2067 2107
2068out_cache: 2108out_cache:
2069 /* 2109 /*
@@ -2131,9 +2171,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2131 u64 parent_inode = 0; 2171 u64 parent_inode = 0;
2132 u64 parent_gen = 0; 2172 u64 parent_gen = 0;
2133 int stop = 0; 2173 int stop = 0;
2134 u64 start_ino = ino;
2135 u64 start_gen = gen;
2136 int skip_name_cache = 0;
2137 2174
2138 name = fs_path_alloc(); 2175 name = fs_path_alloc();
2139 if (!name) { 2176 if (!name) {
@@ -2141,31 +2178,33 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2141 goto out; 2178 goto out;
2142 } 2179 }
2143 2180
2144 if (is_waiting_for_move(sctx, ino))
2145 skip_name_cache = 1;
2146
2147again:
2148 dest->reversed = 1; 2181 dest->reversed = 1;
2149 fs_path_reset(dest); 2182 fs_path_reset(dest);
2150 2183
2151 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { 2184 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
2152 fs_path_reset(name); 2185 fs_path_reset(name);
2153 2186
2154 ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache, 2187 if (is_waiting_for_rm(sctx, ino)) {
2155 &parent_inode, &parent_gen, name); 2188 ret = gen_unique_name(sctx, ino, gen, name);
2189 if (ret < 0)
2190 goto out;
2191 ret = fs_path_add_path(dest, name);
2192 break;
2193 }
2194
2195 if (is_waiting_for_move(sctx, ino)) {
2196 ret = get_first_ref(sctx->parent_root, ino,
2197 &parent_inode, &parent_gen, name);
2198 } else {
2199 ret = __get_cur_name_and_parent(sctx, ino, gen,
2200 &parent_inode,
2201 &parent_gen, name);
2202 if (ret)
2203 stop = 1;
2204 }
2205
2156 if (ret < 0) 2206 if (ret < 0)
2157 goto out; 2207 goto out;
2158 if (ret)
2159 stop = 1;
2160
2161 if (!skip_name_cache &&
2162 is_waiting_for_move(sctx, parent_inode)) {
2163 ino = start_ino;
2164 gen = start_gen;
2165 stop = 0;
2166 skip_name_cache = 1;
2167 goto again;
2168 }
2169 2208
2170 ret = fs_path_add_path(dest, name); 2209 ret = fs_path_add_path(dest, name);
2171 if (ret < 0) 2210 if (ret < 0)
@@ -2429,10 +2468,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2429 if (!p) 2468 if (!p)
2430 return -ENOMEM; 2469 return -ENOMEM;
2431 2470
2432 ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL, 2471 if (ino != sctx->cur_ino) {
2433 NULL, &rdev); 2472 ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
2434 if (ret < 0) 2473 NULL, NULL, &rdev);
2435 goto out; 2474 if (ret < 0)
2475 goto out;
2476 } else {
2477 gen = sctx->cur_inode_gen;
2478 mode = sctx->cur_inode_mode;
2479 rdev = sctx->cur_inode_rdev;
2480 }
2436 2481
2437 if (S_ISREG(mode)) { 2482 if (S_ISREG(mode)) {
2438 cmd = BTRFS_SEND_C_MKFILE; 2483 cmd = BTRFS_SEND_C_MKFILE;
@@ -2512,17 +2557,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
2512 key.objectid = dir; 2557 key.objectid = dir;
2513 key.type = BTRFS_DIR_INDEX_KEY; 2558 key.type = BTRFS_DIR_INDEX_KEY;
2514 key.offset = 0; 2559 key.offset = 0;
2560 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
2561 if (ret < 0)
2562 goto out;
2563
2515 while (1) { 2564 while (1) {
2516 ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, 2565 eb = path->nodes[0];
2517 1, 0); 2566 slot = path->slots[0];
2518 if (ret < 0) 2567 if (slot >= btrfs_header_nritems(eb)) {
2519 goto out; 2568 ret = btrfs_next_leaf(sctx->send_root, path);
2520 if (!ret) { 2569 if (ret < 0) {
2521 eb = path->nodes[0]; 2570 goto out;
2522 slot = path->slots[0]; 2571 } else if (ret > 0) {
2523 btrfs_item_key_to_cpu(eb, &found_key, slot); 2572 ret = 0;
2573 break;
2574 }
2575 continue;
2524 } 2576 }
2525 if (ret || found_key.objectid != key.objectid || 2577
2578 btrfs_item_key_to_cpu(eb, &found_key, slot);
2579 if (found_key.objectid != key.objectid ||
2526 found_key.type != key.type) { 2580 found_key.type != key.type) {
2527 ret = 0; 2581 ret = 0;
2528 goto out; 2582 goto out;
@@ -2537,8 +2591,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
2537 goto out; 2591 goto out;
2538 } 2592 }
2539 2593
2540 key.offset = found_key.offset + 1; 2594 path->slots[0]++;
2541 btrfs_release_path(path);
2542 } 2595 }
2543 2596
2544out: 2597out:
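
did_create_dir() above (and can_rmdir() further down) now issue a single btrfs_search_slot() and then walk items slot by slot, calling btrfs_next_leaf() only at leaf boundaries, instead of re-descending the tree with btrfs_search_slot_for_read() for every dir item. The idiom, condensed from the hunk (an in-tree sketch assuming root, path, key, and found_key are set up as in the function; not compilable outside fs/btrfs):

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		struct extent_buffer *eb = path->nodes[0];
		int slot = path->slots[0];

		if (slot >= btrfs_header_nritems(eb)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;	/* error */
			else if (ret > 0)
				break;		/* no more leaves */
			continue;		/* re-read eb/slot */
		}

		btrfs_item_key_to_cpu(eb, &found_key, slot);
		if (found_key.objectid != key.objectid ||
		    found_key.type != key.type)
			break;			/* walked past the key range */

		/* ... examine the item at (eb, slot) ... */

		path->slots[0]++;		/* next item in the same leaf */
	}
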
@@ -2590,7 +2643,7 @@ struct recorded_ref {
2590 * everything mixed. So we first record all refs and later process them. 2643 * everything mixed. So we first record all refs and later process them.
2591 * This function is a helper to record one ref. 2644 * This function is a helper to record one ref.
2592 */ 2645 */
2593static int record_ref(struct list_head *head, u64 dir, 2646static int __record_ref(struct list_head *head, u64 dir,
2594 u64 dir_gen, struct fs_path *path) 2647 u64 dir_gen, struct fs_path *path)
2595{ 2648{
2596 struct recorded_ref *ref; 2649 struct recorded_ref *ref;
@@ -2676,12 +2729,78 @@ out:
2676 return ret; 2729 return ret;
2677} 2730}
2678 2731
2732static struct orphan_dir_info *
2733add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
2734{
2735 struct rb_node **p = &sctx->orphan_dirs.rb_node;
2736 struct rb_node *parent = NULL;
2737 struct orphan_dir_info *entry, *odi;
2738
2739 odi = kmalloc(sizeof(*odi), GFP_NOFS);
2740 if (!odi)
2741 return ERR_PTR(-ENOMEM);
2742 odi->ino = dir_ino;
2743 odi->gen = 0;
2744
2745 while (*p) {
2746 parent = *p;
2747 entry = rb_entry(parent, struct orphan_dir_info, node);
2748 if (dir_ino < entry->ino) {
2749 p = &(*p)->rb_left;
2750 } else if (dir_ino > entry->ino) {
2751 p = &(*p)->rb_right;
2752 } else {
2753 kfree(odi);
2754 return entry;
2755 }
2756 }
2757
2758 rb_link_node(&odi->node, parent, p);
2759 rb_insert_color(&odi->node, &sctx->orphan_dirs);
2760 return odi;
2761}
2762
2763static struct orphan_dir_info *
2764get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
2765{
2766 struct rb_node *n = sctx->orphan_dirs.rb_node;
2767 struct orphan_dir_info *entry;
2768
2769 while (n) {
2770 entry = rb_entry(n, struct orphan_dir_info, node);
2771 if (dir_ino < entry->ino)
2772 n = n->rb_left;
2773 else if (dir_ino > entry->ino)
2774 n = n->rb_right;
2775 else
2776 return entry;
2777 }
2778 return NULL;
2779}
2780
2781static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
2782{
2783 struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
2784
2785 return odi != NULL;
2786}
2787
2788static void free_orphan_dir_info(struct send_ctx *sctx,
2789 struct orphan_dir_info *odi)
2790{
2791 if (!odi)
2792 return;
2793 rb_erase(&odi->node, &sctx->orphan_dirs);
2794 kfree(odi);
2795}
2796
2679/* 2797/*
2680 * Returns 1 if a directory can be removed at this point in time. 2798 * Returns 1 if a directory can be removed at this point in time.
2681 * We check this by iterating all dir items and checking if the inode behind 2799 * We check this by iterating all dir items and checking if the inode behind
2682 * the dir item was already processed. 2800 * the dir item was already processed.
2683 */ 2801 */
2684static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) 2802static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
2803 u64 send_progress)
2685{ 2804{
2686 int ret = 0; 2805 int ret = 0;
2687 struct btrfs_root *root = sctx->parent_root; 2806 struct btrfs_root *root = sctx->parent_root;
@@ -2704,31 +2823,52 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
2704 key.objectid = dir; 2823 key.objectid = dir;
2705 key.type = BTRFS_DIR_INDEX_KEY; 2824 key.type = BTRFS_DIR_INDEX_KEY;
2706 key.offset = 0; 2825 key.offset = 0;
2826 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2827 if (ret < 0)
2828 goto out;
2707 2829
2708 while (1) { 2830 while (1) {
2709 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 2831 struct waiting_dir_move *dm;
2710 if (ret < 0) 2832
2711 goto out; 2833 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2712 if (!ret) { 2834 ret = btrfs_next_leaf(root, path);
2713 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 2835 if (ret < 0)
2714 path->slots[0]); 2836 goto out;
2837 else if (ret > 0)
2838 break;
2839 continue;
2715 } 2840 }
2716 if (ret || found_key.objectid != key.objectid || 2841 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2717 found_key.type != key.type) { 2842 path->slots[0]);
2843 if (found_key.objectid != key.objectid ||
2844 found_key.type != key.type)
2718 break; 2845 break;
2719 }
2720 2846
2721 di = btrfs_item_ptr(path->nodes[0], path->slots[0], 2847 di = btrfs_item_ptr(path->nodes[0], path->slots[0],
2722 struct btrfs_dir_item); 2848 struct btrfs_dir_item);
2723 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); 2849 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
2724 2850
2851 dm = get_waiting_dir_move(sctx, loc.objectid);
2852 if (dm) {
2853 struct orphan_dir_info *odi;
2854
2855 odi = add_orphan_dir_info(sctx, dir);
2856 if (IS_ERR(odi)) {
2857 ret = PTR_ERR(odi);
2858 goto out;
2859 }
2860 odi->gen = dir_gen;
2861 dm->rmdir_ino = dir;
2862 ret = 0;
2863 goto out;
2864 }
2865
2725 if (loc.objectid > send_progress) { 2866 if (loc.objectid > send_progress) {
2726 ret = 0; 2867 ret = 0;
2727 goto out; 2868 goto out;
2728 } 2869 }
2729 2870
2730 btrfs_release_path(path); 2871 path->slots[0]++;
2731 key.offset = found_key.offset + 1;
2732 } 2872 }
2733 2873
2734 ret = 1; 2874 ret = 1;
@@ -2740,19 +2880,9 @@ out:
2740 2880
2741static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) 2881static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
2742{ 2882{
2743 struct rb_node *n = sctx->waiting_dir_moves.rb_node; 2883 struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
2744 struct waiting_dir_move *entry;
2745 2884
2746 while (n) { 2885 return entry != NULL;
2747 entry = rb_entry(n, struct waiting_dir_move, node);
2748 if (ino < entry->ino)
2749 n = n->rb_left;
2750 else if (ino > entry->ino)
2751 n = n->rb_right;
2752 else
2753 return 1;
2754 }
2755 return 0;
2756} 2886}
2757 2887
2758static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) 2888static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
@@ -2765,6 +2895,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2765 if (!dm) 2895 if (!dm)
2766 return -ENOMEM; 2896 return -ENOMEM;
2767 dm->ino = ino; 2897 dm->ino = ino;
2898 dm->rmdir_ino = 0;
2768 2899
2769 while (*p) { 2900 while (*p) {
2770 parent = *p; 2901 parent = *p;
@@ -2784,31 +2915,41 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2784 return 0; 2915 return 0;
2785} 2916}
2786 2917
2787static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino) 2918static struct waiting_dir_move *
2919get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
2788{ 2920{
2789 struct rb_node *n = sctx->waiting_dir_moves.rb_node; 2921 struct rb_node *n = sctx->waiting_dir_moves.rb_node;
2790 struct waiting_dir_move *entry; 2922 struct waiting_dir_move *entry;
2791 2923
2792 while (n) { 2924 while (n) {
2793 entry = rb_entry(n, struct waiting_dir_move, node); 2925 entry = rb_entry(n, struct waiting_dir_move, node);
2794 if (ino < entry->ino) { 2926 if (ino < entry->ino)
2795 n = n->rb_left; 2927 n = n->rb_left;
2796 } else if (ino > entry->ino) { 2928 else if (ino > entry->ino)
2797 n = n->rb_right; 2929 n = n->rb_right;
2798 } else { 2930 else
2799 rb_erase(&entry->node, &sctx->waiting_dir_moves); 2931 return entry;
2800 kfree(entry);
2801 return 0;
2802 }
2803 } 2932 }
2804 return -ENOENT; 2933 return NULL;
2934}
2935
2936static void free_waiting_dir_move(struct send_ctx *sctx,
2937 struct waiting_dir_move *dm)
2938{
2939 if (!dm)
2940 return;
2941 rb_erase(&dm->node, &sctx->waiting_dir_moves);
2942 kfree(dm);
2805} 2943}
2806 2944
2807static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) 2945static int add_pending_dir_move(struct send_ctx *sctx,
2946 u64 ino,
2947 u64 ino_gen,
2948 u64 parent_ino)
2808{ 2949{
2809 struct rb_node **p = &sctx->pending_dir_moves.rb_node; 2950 struct rb_node **p = &sctx->pending_dir_moves.rb_node;
2810 struct rb_node *parent = NULL; 2951 struct rb_node *parent = NULL;
2811 struct pending_dir_move *entry, *pm; 2952 struct pending_dir_move *entry = NULL, *pm;
2812 struct recorded_ref *cur; 2953 struct recorded_ref *cur;
2813 int exists = 0; 2954 int exists = 0;
2814 int ret; 2955 int ret;
@@ -2817,8 +2958,8 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino)
2817 if (!pm) 2958 if (!pm)
2818 return -ENOMEM; 2959 return -ENOMEM;
2819 pm->parent_ino = parent_ino; 2960 pm->parent_ino = parent_ino;
2820 pm->ino = sctx->cur_ino; 2961 pm->ino = ino;
2821 pm->gen = sctx->cur_inode_gen; 2962 pm->gen = ino_gen;
2822 INIT_LIST_HEAD(&pm->list); 2963 INIT_LIST_HEAD(&pm->list);
2823 INIT_LIST_HEAD(&pm->update_refs); 2964 INIT_LIST_HEAD(&pm->update_refs);
2824 RB_CLEAR_NODE(&pm->node); 2965 RB_CLEAR_NODE(&pm->node);
@@ -2888,19 +3029,52 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2888{ 3029{
2889 struct fs_path *from_path = NULL; 3030 struct fs_path *from_path = NULL;
2890 struct fs_path *to_path = NULL; 3031 struct fs_path *to_path = NULL;
3032 struct fs_path *name = NULL;
2891 u64 orig_progress = sctx->send_progress; 3033 u64 orig_progress = sctx->send_progress;
2892 struct recorded_ref *cur; 3034 struct recorded_ref *cur;
3035 u64 parent_ino, parent_gen;
3036 struct waiting_dir_move *dm = NULL;
3037 u64 rmdir_ino = 0;
2893 int ret; 3038 int ret;
2894 3039
3040 name = fs_path_alloc();
2895 from_path = fs_path_alloc(); 3041 from_path = fs_path_alloc();
2896 if (!from_path) 3042 if (!name || !from_path) {
2897 return -ENOMEM; 3043 ret = -ENOMEM;
3044 goto out;
3045 }
2898 3046
2899 sctx->send_progress = pm->ino; 3047 dm = get_waiting_dir_move(sctx, pm->ino);
2900 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); 3048 ASSERT(dm);
3049 rmdir_ino = dm->rmdir_ino;
3050 free_waiting_dir_move(sctx, dm);
3051
3052 ret = get_first_ref(sctx->parent_root, pm->ino,
3053 &parent_ino, &parent_gen, name);
2901 if (ret < 0) 3054 if (ret < 0)
2902 goto out; 3055 goto out;
2903 3056
3057 if (parent_ino == sctx->cur_ino) {
3058 /* child only renamed, not moved */
3059 ASSERT(parent_gen == sctx->cur_inode_gen);
3060 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
3061 from_path);
3062 if (ret < 0)
3063 goto out;
3064 ret = fs_path_add_path(from_path, name);
3065 if (ret < 0)
3066 goto out;
3067 } else {
3068 /* child moved and maybe renamed too */
3069 sctx->send_progress = pm->ino;
3070 ret = get_cur_path(sctx, pm->ino, pm->gen, from_path);
3071 if (ret < 0)
3072 goto out;
3073 }
3074
3075 fs_path_free(name);
3076 name = NULL;
3077
2904 to_path = fs_path_alloc(); 3078 to_path = fs_path_alloc();
2905 if (!to_path) { 3079 if (!to_path) {
2906 ret = -ENOMEM; 3080 ret = -ENOMEM;
@@ -2908,9 +3082,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2908 } 3082 }
2909 3083
2910 sctx->send_progress = sctx->cur_ino + 1; 3084 sctx->send_progress = sctx->cur_ino + 1;
2911 ret = del_waiting_dir_move(sctx, pm->ino);
2912 ASSERT(ret == 0);
2913
2914 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); 3085 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
2915 if (ret < 0) 3086 if (ret < 0)
2916 goto out; 3087 goto out;
@@ -2919,6 +3090,35 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2919 if (ret < 0) 3090 if (ret < 0)
2920 goto out; 3091 goto out;
2921 3092
3093 if (rmdir_ino) {
3094 struct orphan_dir_info *odi;
3095
3096 odi = get_orphan_dir_info(sctx, rmdir_ino);
3097 if (!odi) {
3098 /* already deleted */
3099 goto finish;
3100 }
3101 ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
3102 if (ret < 0)
3103 goto out;
3104 if (!ret)
3105 goto finish;
3106
3107 name = fs_path_alloc();
3108 if (!name) {
3109 ret = -ENOMEM;
3110 goto out;
3111 }
3112 ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
3113 if (ret < 0)
3114 goto out;
3115 ret = send_rmdir(sctx, name);
3116 if (ret < 0)
3117 goto out;
3118 free_orphan_dir_info(sctx, odi);
3119 }
3120
3121finish:
2922 ret = send_utimes(sctx, pm->ino, pm->gen); 3122 ret = send_utimes(sctx, pm->ino, pm->gen);
2923 if (ret < 0) 3123 if (ret < 0)
2924 goto out; 3124 goto out;
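The rmdir_ino block above captures why deleting an old parent directory has to trail the move that empties it. The ordering constraint itself is plain POSIX, as this small runnable sketch shows (paths are illustrative): rmdir() of a non-empty directory fails with ENOTEMPTY and only succeeds once the last child has been renamed away, which is exactly the situation send resolves by recording rmdir_ino and retrying after the pending move.

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	mkdir("old_parent", 0755);
	mkdir("new_parent", 0755);
	close(open("old_parent/child", O_CREAT | O_WRONLY, 0644));

	if (rmdir("old_parent") != 0)
		perror("rmdir before move");	/* expected: ENOTEMPTY */

	rename("old_parent/child", "new_parent/child");

	if (rmdir("old_parent") != 0)		/* now empty: succeeds */
		perror("rmdir after move");
	return 0;
}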
@@ -2928,12 +3128,15 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
2928 * and old parent(s). 3128 * and old parent(s).
2929 */ 3129 */
2930 list_for_each_entry(cur, &pm->update_refs, list) { 3130 list_for_each_entry(cur, &pm->update_refs, list) {
3131 if (cur->dir == rmdir_ino)
3132 continue;
2931 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 3133 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
2932 if (ret < 0) 3134 if (ret < 0)
2933 goto out; 3135 goto out;
2934 } 3136 }
2935 3137
2936out: 3138out:
3139 fs_path_free(name);
2937 fs_path_free(from_path); 3140 fs_path_free(from_path);
2938 fs_path_free(to_path); 3141 fs_path_free(to_path);
2939 sctx->send_progress = orig_progress; 3142 sctx->send_progress = orig_progress;
@@ -3005,17 +3208,19 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3005 int ret; 3208 int ret;
3006 u64 ino = parent_ref->dir; 3209 u64 ino = parent_ref->dir;
3007 u64 parent_ino_before, parent_ino_after; 3210 u64 parent_ino_before, parent_ino_after;
3008 u64 new_gen, old_gen; 3211 u64 old_gen;
3009 struct fs_path *path_before = NULL; 3212 struct fs_path *path_before = NULL;
3010 struct fs_path *path_after = NULL; 3213 struct fs_path *path_after = NULL;
3011 int len1, len2; 3214 int len1, len2;
3012 3215 int register_upper_dirs;
3013 if (parent_ref->dir <= sctx->cur_ino) 3216 u64 gen;
3014 return 0;
3015 3217
3016 if (is_waiting_for_move(sctx, ino)) 3218 if (is_waiting_for_move(sctx, ino))
3017 return 1; 3219 return 1;
3018 3220
3221 if (parent_ref->dir <= sctx->cur_ino)
3222 return 0;
3223
3019 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen, 3224 ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen,
3020 NULL, NULL, NULL, NULL); 3225 NULL, NULL, NULL, NULL);
3021 if (ret == -ENOENT) 3226 if (ret == -ENOENT)
@@ -3023,12 +3228,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3023 else if (ret < 0) 3228 else if (ret < 0)
3024 return ret; 3229 return ret;
3025 3230
3026 ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen, 3231 if (parent_ref->dir_gen != old_gen)
3027 NULL, NULL, NULL, NULL);
3028 if (ret < 0)
3029 return ret;
3030
3031 if (new_gen != old_gen)
3032 return 0; 3232 return 0;
3033 3233
3034 path_before = fs_path_alloc(); 3234 path_before = fs_path_alloc();
@@ -3051,7 +3251,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3051 } 3251 }
3052 3252
3053 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, 3253 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3054 NULL, path_after); 3254 &gen, path_after);
3055 if (ret == -ENOENT) { 3255 if (ret == -ENOENT) {
3056 ret = 0; 3256 ret = 0;
3057 goto out; 3257 goto out;
@@ -3061,13 +3261,67 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3061 3261
3062 len1 = fs_path_len(path_before); 3262 len1 = fs_path_len(path_before);
3063 len2 = fs_path_len(path_after); 3263 len2 = fs_path_len(path_after);
3064 if ((parent_ino_before != parent_ino_after) && (len1 != len2 || 3264 if (parent_ino_before != parent_ino_after || len1 != len2 ||
3065 memcmp(path_before->start, path_after->start, len1))) { 3265 memcmp(path_before->start, path_after->start, len1)) {
3066 ret = 1; 3266 ret = 1;
3067 goto out; 3267 goto out;
3068 } 3268 }
3069 ret = 0; 3269 ret = 0;
3070 3270
3271 /*
3272 * Ok, our new most direct ancestor has a higher inode number but
3273 * wasn't moved/renamed. So maybe some of the new ancestors higher in
 3274 * the hierarchy have a higher inode number too *and* were renamed
3275 * or moved - in this case we need to wait for the ancestor's rename
3276 * or move operation before we can do the move/rename for the current
3277 * inode.
3278 */
3279 register_upper_dirs = 0;
3280 ino = parent_ino_after;
3281again:
3282 while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) {
3283 u64 parent_gen;
3284
3285 fs_path_reset(path_before);
3286 fs_path_reset(path_after);
3287
3288 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3289 &parent_gen, path_after);
3290 if (ret < 0)
3291 goto out;
3292 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
3293 NULL, path_before);
3294 if (ret == -ENOENT) {
3295 ret = 0;
3296 break;
3297 } else if (ret < 0) {
3298 goto out;
3299 }
3300
3301 len1 = fs_path_len(path_before);
3302 len2 = fs_path_len(path_after);
3303 if (parent_ino_before != parent_ino_after || len1 != len2 ||
3304 memcmp(path_before->start, path_after->start, len1)) {
3305 ret = 1;
3306 if (register_upper_dirs) {
3307 break;
3308 } else {
3309 register_upper_dirs = 1;
3310 ino = parent_ref->dir;
3311 gen = parent_ref->dir_gen;
3312 goto again;
3313 }
3314 } else if (register_upper_dirs) {
3315 ret = add_pending_dir_move(sctx, ino, gen,
3316 parent_ino_after);
3317 if (ret < 0 && ret != -EEXIST)
3318 goto out;
3319 }
3320
3321 ino = parent_ino_after;
3322 gen = parent_gen;
3323 }
3324
3071out: 3325out:
3072 fs_path_free(path_before); 3326 fs_path_free(path_before);
3073 fs_path_free(path_after); 3327 fs_path_free(path_after);
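The loop above re-resolves the first ref of each ancestor in both trees and compares parent inode and name; a mismatch on an ancestor with a higher inode number means the current inode's rename must wait. Stripped of path handling and error cases, the walk reduces to something like this sketch (arrays indexed by inode number stand in for get_first_ref() on the two roots; all names are illustrative):

#include <stdint.h>
#include <string.h>

struct ref {
	uint64_t parent;	/* first ref's parent directory inode */
	const char *name;	/* first ref's name */
};

/* Walk up the new-tree ancestry while inode numbers stay above the
 * send progress point, comparing each ancestor's first ref between
 * the parent snapshot (before) and the send snapshot (after). */
static int ancestor_moved(const struct ref *before, const struct ref *after,
			  uint64_t ino, uint64_t cur_ino)
{
	while (ino > cur_ino) {
		if (after[ino].parent != before[ino].parent ||
		    strcmp(after[ino].name, before[ino].name) != 0)
			return 1;	/* this ancestor was moved/renamed */
		ino = after[ino].parent;
	}
	return 0;
}

The real function additionally treats ENOENT as "no conflict", compares path lengths before memcmp, and on a second pass registers pending dir moves for the whole chain, as the register_upper_dirs logic above shows.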
@@ -3089,6 +3343,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
3089 u64 ow_gen; 3343 u64 ow_gen;
3090 int did_overwrite = 0; 3344 int did_overwrite = 0;
3091 int is_orphan = 0; 3345 int is_orphan = 0;
3346 u64 last_dir_ino_rm = 0;
3092 3347
3093verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); 3348verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3094 3349
@@ -3227,9 +3482,14 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3227 * dirs, we always have one new and one deleted 3482 * dirs, we always have one new and one deleted
3228 * ref. The deleted ref is ignored later. 3483 * ref. The deleted ref is ignored later.
3229 */ 3484 */
3230 if (wait_for_parent_move(sctx, cur)) { 3485 ret = wait_for_parent_move(sctx, cur);
3486 if (ret < 0)
3487 goto out;
3488 if (ret) {
3231 ret = add_pending_dir_move(sctx, 3489 ret = add_pending_dir_move(sctx,
3232 cur->dir); 3490 sctx->cur_ino,
3491 sctx->cur_inode_gen,
3492 cur->dir);
3233 *pending_move = 1; 3493 *pending_move = 1;
3234 } else { 3494 } else {
3235 ret = send_rename(sctx, valid_path, 3495 ret = send_rename(sctx, valid_path,
@@ -3259,7 +3519,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3259 * later, we do this check again and rmdir it then if possible. 3519 * later, we do this check again and rmdir it then if possible.
3260 * See the use of check_dirs for more details. 3520 * See the use of check_dirs for more details.
3261 */ 3521 */
3262 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino); 3522 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
3523 sctx->cur_ino);
3263 if (ret < 0) 3524 if (ret < 0)
3264 goto out; 3525 goto out;
3265 if (ret) { 3526 if (ret) {
@@ -3350,8 +3611,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3350 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 3611 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
3351 if (ret < 0) 3612 if (ret < 0)
3352 goto out; 3613 goto out;
3353 } else if (ret == inode_state_did_delete) { 3614 } else if (ret == inode_state_did_delete &&
3354 ret = can_rmdir(sctx, cur->dir, sctx->cur_ino); 3615 cur->dir != last_dir_ino_rm) {
3616 ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
3617 sctx->cur_ino);
3355 if (ret < 0) 3618 if (ret < 0)
3356 goto out; 3619 goto out;
3357 if (ret) { 3620 if (ret) {
@@ -3362,6 +3625,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3362 ret = send_rmdir(sctx, valid_path); 3625 ret = send_rmdir(sctx, valid_path);
3363 if (ret < 0) 3626 if (ret < 0)
3364 goto out; 3627 goto out;
3628 last_dir_ino_rm = cur->dir;
3365 } 3629 }
3366 } 3630 }
3367 } 3631 }
@@ -3375,9 +3639,8 @@ out:
3375 return ret; 3639 return ret;
3376} 3640}
3377 3641
3378static int __record_new_ref(int num, u64 dir, int index, 3642static int record_ref(struct btrfs_root *root, int num, u64 dir, int index,
3379 struct fs_path *name, 3643 struct fs_path *name, void *ctx, struct list_head *refs)
3380 void *ctx)
3381{ 3644{
3382 int ret = 0; 3645 int ret = 0;
3383 struct send_ctx *sctx = ctx; 3646 struct send_ctx *sctx = ctx;
@@ -3388,7 +3651,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3388 if (!p) 3651 if (!p)
3389 return -ENOMEM; 3652 return -ENOMEM;
3390 3653
3391 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, 3654 ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
3392 NULL, NULL); 3655 NULL, NULL);
3393 if (ret < 0) 3656 if (ret < 0)
3394 goto out; 3657 goto out;
@@ -3400,7 +3663,7 @@ static int __record_new_ref(int num, u64 dir, int index,
3400 if (ret < 0) 3663 if (ret < 0)
3401 goto out; 3664 goto out;
3402 3665
3403 ret = record_ref(&sctx->new_refs, dir, gen, p); 3666 ret = __record_ref(refs, dir, gen, p);
3404 3667
3405out: 3668out:
3406 if (ret) 3669 if (ret)
@@ -3408,37 +3671,23 @@ out:
3408 return ret; 3671 return ret;
3409} 3672}
3410 3673
3674static int __record_new_ref(int num, u64 dir, int index,
3675 struct fs_path *name,
3676 void *ctx)
3677{
3678 struct send_ctx *sctx = ctx;
3679 return record_ref(sctx->send_root, num, dir, index, name,
3680 ctx, &sctx->new_refs);
3681}
3682
3683
3411static int __record_deleted_ref(int num, u64 dir, int index, 3684static int __record_deleted_ref(int num, u64 dir, int index,
3412 struct fs_path *name, 3685 struct fs_path *name,
3413 void *ctx) 3686 void *ctx)
3414{ 3687{
3415 int ret = 0;
3416 struct send_ctx *sctx = ctx; 3688 struct send_ctx *sctx = ctx;
3417 struct fs_path *p; 3689 return record_ref(sctx->parent_root, num, dir, index, name,
3418 u64 gen; 3690 ctx, &sctx->deleted_refs);
3419
3420 p = fs_path_alloc();
3421 if (!p)
3422 return -ENOMEM;
3423
3424 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
3425 NULL, NULL);
3426 if (ret < 0)
3427 goto out;
3428
3429 ret = get_cur_path(sctx, dir, gen, p);
3430 if (ret < 0)
3431 goto out;
3432 ret = fs_path_add_path(p, name);
3433 if (ret < 0)
3434 goto out;
3435
3436 ret = record_ref(&sctx->deleted_refs, dir, gen, p);
3437
3438out:
3439 if (ret)
3440 fs_path_free(p);
3441 return ret;
3442} 3691}
3443 3692
3444static int record_new_ref(struct send_ctx *sctx) 3693static int record_new_ref(struct send_ctx *sctx)
@@ -3619,21 +3868,31 @@ static int process_all_refs(struct send_ctx *sctx,
3619 root = sctx->parent_root; 3868 root = sctx->parent_root;
3620 cb = __record_deleted_ref; 3869 cb = __record_deleted_ref;
3621 } else { 3870 } else {
3622 BUG(); 3871 btrfs_err(sctx->send_root->fs_info,
3872 "Wrong command %d in process_all_refs", cmd);
3873 ret = -EINVAL;
3874 goto out;
3623 } 3875 }
3624 3876
3625 key.objectid = sctx->cmp_key->objectid; 3877 key.objectid = sctx->cmp_key->objectid;
3626 key.type = BTRFS_INODE_REF_KEY; 3878 key.type = BTRFS_INODE_REF_KEY;
3627 key.offset = 0; 3879 key.offset = 0;
3628 while (1) { 3880 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3629 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 3881 if (ret < 0)
3630 if (ret < 0) 3882 goto out;
3631 goto out;
3632 if (ret)
3633 break;
3634 3883
3884 while (1) {
3635 eb = path->nodes[0]; 3885 eb = path->nodes[0];
3636 slot = path->slots[0]; 3886 slot = path->slots[0];
3887 if (slot >= btrfs_header_nritems(eb)) {
3888 ret = btrfs_next_leaf(root, path);
3889 if (ret < 0)
3890 goto out;
3891 else if (ret > 0)
3892 break;
3893 continue;
3894 }
3895
3637 btrfs_item_key_to_cpu(eb, &found_key, slot); 3896 btrfs_item_key_to_cpu(eb, &found_key, slot);
3638 3897
3639 if (found_key.objectid != key.objectid || 3898 if (found_key.objectid != key.objectid ||
@@ -3642,11 +3901,10 @@ static int process_all_refs(struct send_ctx *sctx,
3642 break; 3901 break;
3643 3902
3644 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); 3903 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
3645 btrfs_release_path(path);
3646 if (ret < 0) 3904 if (ret < 0)
3647 goto out; 3905 goto out;
3648 3906
3649 key.offset = found_key.offset + 1; 3907 path->slots[0]++;
3650 } 3908 }
3651 btrfs_release_path(path); 3909 btrfs_release_path(path);
3652 3910
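process_all_refs() (and process_all_new_xattrs() below) now positions a path once with btrfs_search_slot() and walks forward, bumping path->slots[0] and calling btrfs_next_leaf() only when the slot runs off the current leaf, instead of issuing a fresh tree search per item. The shape of that iteration as a runnable toy, where fixed arrays stand in for leaves:

#include <stdio.h>

#define ITEMS_PER_LEAF 4
#define NR_LEAVES 3

/* Toy "leaves": each row is a leaf, values are item keys. */
static int leaves[NR_LEAVES][ITEMS_PER_LEAF] = {
	{ 1, 2, 3, 4 }, { 5, 6, 7, 8 }, { 9, 10, 11, 12 },
};

int main(void)
{
	int leaf = 0, slot = 0;	/* one initial "search" positions us here */

	while (1) {
		if (slot >= ITEMS_PER_LEAF) {	/* ran off the leaf */
			if (++leaf >= NR_LEAVES)
				break;		/* btrfs_next_leaf() > 0 */
			slot = 0;
			continue;
		}
		printf("item %d\n", leaves[leaf][slot]);
		slot++;				/* path->slots[0]++ */
	}
	return 0;
}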
@@ -3927,19 +4185,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3927 key.objectid = sctx->cmp_key->objectid; 4185 key.objectid = sctx->cmp_key->objectid;
3928 key.type = BTRFS_XATTR_ITEM_KEY; 4186 key.type = BTRFS_XATTR_ITEM_KEY;
3929 key.offset = 0; 4187 key.offset = 0;
3930 while (1) { 4188 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3931 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); 4189 if (ret < 0)
3932 if (ret < 0) 4190 goto out;
3933 goto out;
3934 if (ret) {
3935 ret = 0;
3936 goto out;
3937 }
3938 4191
4192 while (1) {
3939 eb = path->nodes[0]; 4193 eb = path->nodes[0];
3940 slot = path->slots[0]; 4194 slot = path->slots[0];
3941 btrfs_item_key_to_cpu(eb, &found_key, slot); 4195 if (slot >= btrfs_header_nritems(eb)) {
4196 ret = btrfs_next_leaf(root, path);
4197 if (ret < 0) {
4198 goto out;
4199 } else if (ret > 0) {
4200 ret = 0;
4201 break;
4202 }
4203 continue;
4204 }
3942 4205
4206 btrfs_item_key_to_cpu(eb, &found_key, slot);
3943 if (found_key.objectid != key.objectid || 4207 if (found_key.objectid != key.objectid ||
3944 found_key.type != key.type) { 4208 found_key.type != key.type) {
3945 ret = 0; 4209 ret = 0;
@@ -3951,8 +4215,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
3951 if (ret < 0) 4215 if (ret < 0)
3952 goto out; 4216 goto out;
3953 4217
3954 btrfs_release_path(path); 4218 path->slots[0]++;
3955 key.offset = found_key.offset + 1;
3956 } 4219 }
3957 4220
3958out: 4221out:
@@ -3991,6 +4254,13 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
3991 goto out; 4254 goto out;
3992 4255
3993 last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; 4256 last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
4257
4258 /* initial readahead */
4259 memset(&sctx->ra, 0, sizeof(struct file_ra_state));
4260 file_ra_state_init(&sctx->ra, inode->i_mapping);
4261 btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
4262 last_index - index + 1);
4263
3994 while (index <= last_index) { 4264 while (index <= last_index) {
3995 unsigned cur_len = min_t(unsigned, len, 4265 unsigned cur_len = min_t(unsigned, len,
3996 PAGE_CACHE_SIZE - pg_offset); 4266 PAGE_CACHE_SIZE - pg_offset);
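fill_read_buf() now primes readahead over the whole [index, last_index] range before the page loop, so disk I/O overlaps with the copy instead of faulting one page at a time. The userspace counterpart of that hint is posix_fadvise(POSIX_FADV_WILLNEED) issued ahead of a sequential read, as in this sketch (file name and range are illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd;
	off_t offset = 0;
	off_t len = 1 << 20;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Ask the kernel to start readahead for the range we are
	 * about to copy, before the first read() touches it. */
	posix_fadvise(fd, offset, len, POSIX_FADV_WILLNEED);
	/* ... sequential read()s of [offset, offset + len) follow ... */
	close(fd);
	return 0;
}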
@@ -4174,6 +4444,9 @@ static int send_hole(struct send_ctx *sctx, u64 end)
4174 p = fs_path_alloc(); 4444 p = fs_path_alloc();
4175 if (!p) 4445 if (!p)
4176 return -ENOMEM; 4446 return -ENOMEM;
4447 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4448 if (ret < 0)
4449 goto tlv_put_failure;
4177 memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); 4450 memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
4178 while (offset < end) { 4451 while (offset < end) {
4179 len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); 4452 len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
@@ -4181,9 +4454,6 @@ static int send_hole(struct send_ctx *sctx, u64 end)
4181 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); 4454 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
4182 if (ret < 0) 4455 if (ret < 0)
4183 break; 4456 break;
4184 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4185 if (ret < 0)
4186 break;
4187 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 4457 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
4188 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); 4458 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
4189 TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); 4459 TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
@@ -4724,7 +4994,9 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4724 4994
4725 if (S_ISREG(sctx->cur_inode_mode)) { 4995 if (S_ISREG(sctx->cur_inode_mode)) {
4726 if (need_send_hole(sctx)) { 4996 if (need_send_hole(sctx)) {
4727 if (sctx->cur_inode_last_extent == (u64)-1) { 4997 if (sctx->cur_inode_last_extent == (u64)-1 ||
4998 sctx->cur_inode_last_extent <
4999 sctx->cur_inode_size) {
4728 ret = get_last_extent(sctx, (u64)-1); 5000 ret = get_last_extent(sctx, (u64)-1);
4729 if (ret) 5001 if (ret)
4730 goto out; 5002 goto out;
@@ -4763,18 +5035,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
4763 ret = apply_children_dir_moves(sctx); 5035 ret = apply_children_dir_moves(sctx);
4764 if (ret) 5036 if (ret)
4765 goto out; 5037 goto out;
5038 /*
 5039 * We need to send the utimes every time, no matter if anything
 5040 * changed between the two trees, as we have made changes to
5041 * the inode before. If our inode is a directory and it's
5042 * waiting to be moved/renamed, we will send its utimes when
5043 * it's moved/renamed, therefore we don't need to do it here.
5044 */
5045 sctx->send_progress = sctx->cur_ino + 1;
5046 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
5047 if (ret < 0)
5048 goto out;
4766 } 5049 }
4767 5050
4768 /*
4769 * Need to send that every time, no matter if it actually
4770 * changed between the two trees as we have done changes to
4771 * the inode before.
4772 */
4773 sctx->send_progress = sctx->cur_ino + 1;
4774 ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
4775 if (ret < 0)
4776 goto out;
4777
4778out: 5051out:
4779 return ret; 5052 return ret;
4780} 5053}
@@ -4840,6 +5113,8 @@ static int changed_inode(struct send_ctx *sctx,
4840 sctx->left_path->nodes[0], left_ii); 5113 sctx->left_path->nodes[0], left_ii);
4841 sctx->cur_inode_mode = btrfs_inode_mode( 5114 sctx->cur_inode_mode = btrfs_inode_mode(
4842 sctx->left_path->nodes[0], left_ii); 5115 sctx->left_path->nodes[0], left_ii);
5116 sctx->cur_inode_rdev = btrfs_inode_rdev(
5117 sctx->left_path->nodes[0], left_ii);
4843 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) 5118 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
4844 ret = send_create_inode_if_needed(sctx); 5119 ret = send_create_inode_if_needed(sctx);
4845 } else if (result == BTRFS_COMPARE_TREE_DELETED) { 5120 } else if (result == BTRFS_COMPARE_TREE_DELETED) {
@@ -4884,6 +5159,8 @@ static int changed_inode(struct send_ctx *sctx,
4884 sctx->left_path->nodes[0], left_ii); 5159 sctx->left_path->nodes[0], left_ii);
4885 sctx->cur_inode_mode = btrfs_inode_mode( 5160 sctx->cur_inode_mode = btrfs_inode_mode(
4886 sctx->left_path->nodes[0], left_ii); 5161 sctx->left_path->nodes[0], left_ii);
5162 sctx->cur_inode_rdev = btrfs_inode_rdev(
5163 sctx->left_path->nodes[0], left_ii);
4887 ret = send_create_inode_if_needed(sctx); 5164 ret = send_create_inode_if_needed(sctx);
4888 if (ret < 0) 5165 if (ret < 0)
4889 goto out; 5166 goto out;
@@ -5124,37 +5401,15 @@ static int full_send_tree(struct send_ctx *sctx)
5124 struct btrfs_path *path; 5401 struct btrfs_path *path;
5125 struct extent_buffer *eb; 5402 struct extent_buffer *eb;
5126 int slot; 5403 int slot;
5127 u64 start_ctransid;
5128 u64 ctransid;
5129 5404
5130 path = alloc_path_for_send(); 5405 path = alloc_path_for_send();
5131 if (!path) 5406 if (!path)
5132 return -ENOMEM; 5407 return -ENOMEM;
5133 5408
5134 spin_lock(&send_root->root_item_lock);
5135 start_ctransid = btrfs_root_ctransid(&send_root->root_item);
5136 spin_unlock(&send_root->root_item_lock);
5137
5138 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 5409 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
5139 key.type = BTRFS_INODE_ITEM_KEY; 5410 key.type = BTRFS_INODE_ITEM_KEY;
5140 key.offset = 0; 5411 key.offset = 0;
5141 5412
5142 /*
5143 * Make sure the tree has not changed after re-joining. We detect this
5144 * by comparing start_ctransid and ctransid. They should always match.
5145 */
5146 spin_lock(&send_root->root_item_lock);
5147 ctransid = btrfs_root_ctransid(&send_root->root_item);
5148 spin_unlock(&send_root->root_item_lock);
5149
5150 if (ctransid != start_ctransid) {
5151 WARN(1, KERN_WARNING "BTRFS: the root that you're trying to "
5152 "send was modified in between. This is "
5153 "probably a bug.\n");
5154 ret = -EIO;
5155 goto out;
5156 }
5157
5158 ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); 5413 ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
5159 if (ret < 0) 5414 if (ret < 0)
5160 goto out; 5415 goto out;
@@ -5340,6 +5595,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5340 5595
5341 sctx->pending_dir_moves = RB_ROOT; 5596 sctx->pending_dir_moves = RB_ROOT;
5342 sctx->waiting_dir_moves = RB_ROOT; 5597 sctx->waiting_dir_moves = RB_ROOT;
5598 sctx->orphan_dirs = RB_ROOT;
5343 5599
5344 sctx->clone_roots = vzalloc(sizeof(struct clone_root) * 5600 sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
5345 (arg->clone_sources_count + 1)); 5601 (arg->clone_sources_count + 1));
@@ -5435,7 +5691,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5435 NULL); 5691 NULL);
5436 sort_clone_roots = 1; 5692 sort_clone_roots = 1;
5437 5693
5694 current->journal_info = (void *)BTRFS_SEND_TRANS_STUB;
5438 ret = send_subvol(sctx); 5695 ret = send_subvol(sctx);
5696 current->journal_info = NULL;
5439 if (ret < 0) 5697 if (ret < 0)
5440 goto out; 5698 goto out;
5441 5699
@@ -5477,6 +5735,16 @@ out:
5477 kfree(dm); 5735 kfree(dm);
5478 } 5736 }
5479 5737
5738 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
5739 while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
5740 struct rb_node *n;
5741 struct orphan_dir_info *odi;
5742
5743 n = rb_first(&sctx->orphan_dirs);
5744 odi = rb_entry(n, struct orphan_dir_info, node);
5745 free_orphan_dir_info(sctx, odi);
5746 }
5747
5480 if (sort_clone_roots) { 5748 if (sort_clone_roots) {
5481 for (i = 0; i < sctx->clone_roots_cnt; i++) 5749 for (i = 0; i < sctx->clone_roots_cnt; i++)
5482 btrfs_root_dec_send_in_progress( 5750 btrfs_root_dec_send_in_progress(
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d04db817be5c..9601d25a4607 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,6 +66,8 @@
66static const struct super_operations btrfs_super_ops; 66static const struct super_operations btrfs_super_ops;
67static struct file_system_type btrfs_fs_type; 67static struct file_system_type btrfs_fs_type;
68 68
69static int btrfs_remount(struct super_block *sb, int *flags, char *data);
70
69static const char *btrfs_decode_error(int errno) 71static const char *btrfs_decode_error(int errno)
70{ 72{
71 char *errstr = "unknown"; 73 char *errstr = "unknown";
@@ -383,20 +385,6 @@ static match_table_t tokens = {
383 {Opt_err, NULL}, 385 {Opt_err, NULL},
384}; 386};
385 387
386#define btrfs_set_and_info(root, opt, fmt, args...) \
387{ \
388 if (!btrfs_test_opt(root, opt)) \
389 btrfs_info(root->fs_info, fmt, ##args); \
390 btrfs_set_opt(root->fs_info->mount_opt, opt); \
391}
392
393#define btrfs_clear_and_info(root, opt, fmt, args...) \
394{ \
395 if (btrfs_test_opt(root, opt)) \
396 btrfs_info(root->fs_info, fmt, ##args); \
397 btrfs_clear_opt(root->fs_info->mount_opt, opt); \
398}
399
400/* 388/*
401 * Regular mount options parser. Everything that is needed only when 389 * Regular mount options parser. Everything that is needed only when
402 * reading in a new superblock is parsed here. 390 * reading in a new superblock is parsed here.
@@ -1184,7 +1172,31 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
1184 return ERR_PTR(-ENOMEM); 1172 return ERR_PTR(-ENOMEM);
1185 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, 1173 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
1186 newargs); 1174 newargs);
1175
1176 if (PTR_RET(mnt) == -EBUSY) {
1177 if (flags & MS_RDONLY) {
1178 mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name,
1179 newargs);
1180 } else {
1181 int r;
1182 mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name,
1183 newargs);
1184 if (IS_ERR(mnt)) {
1185 kfree(newargs);
1186 return ERR_CAST(mnt);
1187 }
1188
1189 r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
1190 if (r < 0) {
1191 /* FIXME: release vfsmount mnt ??*/
1192 kfree(newargs);
1193 return ERR_PTR(r);
1194 }
1195 }
1196 }
1197
1187 kfree(newargs); 1198 kfree(newargs);
1199
1188 if (IS_ERR(mnt)) 1200 if (IS_ERR(mnt))
1189 return ERR_CAST(mnt); 1201 return ERR_CAST(mnt);
1190 1202
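The new -EBUSY branch lets a subvolume mount reuse an existing superblock whose read-only state differs from the requested flags: an rw request is satisfied by mounting read-only and then remounting rw. A loose userspace analogue of that ro-then-remount upgrade via mount(2) (device and mountpoint are illustrative; this needs CAP_SYS_ADMIN and glosses over the shared-superblock details that make the kernel version necessary):

#include <errno.h>
#include <sys/mount.h>

/* Try an rw mount; if that is refused, fall back to mounting
 * read-only and upgrading with MS_REMOUNT, mirroring the shape of
 * the mount_subvol() fallback above. */
static int mount_rw_with_fallback(const char *dev, const char *dir)
{
	if (mount(dev, dir, "btrfs", 0, NULL) == 0)
		return 0;
	if (errno != EBUSY)
		return -1;
	if (mount(dev, dir, "btrfs", MS_RDONLY, NULL) != 0)
		return -1;
	return mount(dev, dir, "btrfs", MS_REMOUNT, NULL);
}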
@@ -1305,13 +1317,6 @@ error_fs_info:
1305 return ERR_PTR(error); 1317 return ERR_PTR(error);
1306} 1318}
1307 1319
1308static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
1309{
1310 spin_lock_irq(&workers->lock);
1311 workers->max_workers = new_limit;
1312 spin_unlock_irq(&workers->lock);
1313}
1314
1315static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, 1320static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1316 int new_pool_size, int old_pool_size) 1321 int new_pool_size, int old_pool_size)
1317{ 1322{
@@ -1323,21 +1328,20 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
1323 btrfs_info(fs_info, "resize thread pool %d -> %d", 1328 btrfs_info(fs_info, "resize thread pool %d -> %d",
1324 old_pool_size, new_pool_size); 1329 old_pool_size, new_pool_size);
1325 1330
1326 btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); 1331 btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
1327 btrfs_set_max_workers(&fs_info->workers, new_pool_size); 1332 btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
1328 btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size); 1333 btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
1329 btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); 1334 btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
1330 btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); 1335 btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
1331 btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); 1336 btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
1332 btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size); 1337 btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
1333 btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size); 1338 new_pool_size);
1334 btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size); 1339 btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
1335 btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size); 1340 btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
1336 btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); 1341 btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
1337 btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); 1342 btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
1338 btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); 1343 btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
1339 btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, 1344 new_pool_size);
1340 new_pool_size);
1341} 1345}
1342 1346
1343static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) 1347static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
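With the switch to the new btrfs_workqueue, resizing the thread pool becomes a per-workqueue max_active update rather than poking each btrfs_workers struct under its lock. The same runtime-resize idea with the stock kernel workqueue API, as a minimal module sketch (not from this patch; the names and numbers are arbitrary):

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *wq;

static int __init example_init(void)
{
	wq = alloc_workqueue("example", WQ_UNBOUND, 4);
	if (!wq)
		return -ENOMEM;
	/* Later, e.g. when a remount passes thread_pool=N: */
	workqueue_set_max_active(wq, 16);
	return 0;
}

static void __exit example_exit(void)
{
	destroy_workqueue(wq);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");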
@@ -1388,6 +1392,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1388 unsigned int old_metadata_ratio = fs_info->metadata_ratio; 1392 unsigned int old_metadata_ratio = fs_info->metadata_ratio;
1389 int ret; 1393 int ret;
1390 1394
1395 sync_filesystem(sb);
1391 btrfs_remount_prepare(fs_info); 1396 btrfs_remount_prepare(fs_info);
1392 1397
1393 ret = btrfs_parse_options(root, data); 1398 ret = btrfs_parse_options(root, data);
@@ -1479,6 +1484,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1479 sb->s_flags &= ~MS_RDONLY; 1484 sb->s_flags &= ~MS_RDONLY;
1480 } 1485 }
1481out: 1486out:
1487 wake_up_process(fs_info->transaction_kthread);
1482 btrfs_remount_cleanup(fs_info, old_opts); 1488 btrfs_remount_cleanup(fs_info, old_opts);
1483 return 0; 1489 return 0;
1484 1490
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 865f4cf9a769..c5eb2143dc66 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -24,6 +24,7 @@
24#include <linux/kobject.h> 24#include <linux/kobject.h>
25#include <linux/bug.h> 25#include <linux/bug.h>
26#include <linux/genhd.h> 26#include <linux/genhd.h>
27#include <linux/debugfs.h>
27 28
28#include "ctree.h" 29#include "ctree.h"
29#include "disk-io.h" 30#include "disk-io.h"
@@ -599,6 +600,12 @@ static int add_device_membership(struct btrfs_fs_info *fs_info)
599/* /sys/fs/btrfs/ entry */ 600/* /sys/fs/btrfs/ entry */
600static struct kset *btrfs_kset; 601static struct kset *btrfs_kset;
601 602
603/* /sys/kernel/debug/btrfs */
604static struct dentry *btrfs_debugfs_root_dentry;
605
606/* Debugging tunables and exported data */
607u64 btrfs_debugfs_test;
608
602int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) 609int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
603{ 610{
604 int error; 611 int error;
@@ -642,27 +649,41 @@ failure:
642 return error; 649 return error;
643} 650}
644 651
652static int btrfs_init_debugfs(void)
653{
654#ifdef CONFIG_DEBUG_FS
655 btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL);
656 if (!btrfs_debugfs_root_dentry)
657 return -ENOMEM;
658
659 debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry,
660 &btrfs_debugfs_test);
661#endif
662 return 0;
663}
664
645int btrfs_init_sysfs(void) 665int btrfs_init_sysfs(void)
646{ 666{
647 int ret; 667 int ret;
668
648 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); 669 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
649 if (!btrfs_kset) 670 if (!btrfs_kset)
650 return -ENOMEM; 671 return -ENOMEM;
651 672
652 init_feature_attrs(); 673 ret = btrfs_init_debugfs();
674 if (ret)
675 return ret;
653 676
677 init_feature_attrs();
654 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); 678 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
655 if (ret) {
656 kset_unregister(btrfs_kset);
657 return ret;
658 }
659 679
660 return 0; 680 return ret;
661} 681}
662 682
663void btrfs_exit_sysfs(void) 683void btrfs_exit_sysfs(void)
664{ 684{
665 sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); 685 sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
666 kset_unregister(btrfs_kset); 686 kset_unregister(btrfs_kset);
687 debugfs_remove_recursive(btrfs_debugfs_root_dentry);
667} 688}
668 689
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index f3cea3710d44..9ab576318a84 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -1,6 +1,11 @@
1#ifndef _BTRFS_SYSFS_H_ 1#ifndef _BTRFS_SYSFS_H_
2#define _BTRFS_SYSFS_H_ 2#define _BTRFS_SYSFS_H_
3 3
4/*
5 * Data exported through sysfs
6 */
7extern u64 btrfs_debugfs_test;
8
4enum btrfs_feature_set { 9enum btrfs_feature_set {
5 FEAT_COMPAT, 10 FEAT_COMPAT,
6 FEAT_COMPAT_RO, 11 FEAT_COMPAT_RO,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 34cd83184c4a..7579f6d0b854 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -75,10 +75,21 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
75 } 75 }
76} 76}
77 77
78static noinline void switch_commit_root(struct btrfs_root *root) 78static noinline void switch_commit_roots(struct btrfs_transaction *trans,
79 struct btrfs_fs_info *fs_info)
79{ 80{
80 free_extent_buffer(root->commit_root); 81 struct btrfs_root *root, *tmp;
81 root->commit_root = btrfs_root_node(root); 82
83 down_write(&fs_info->commit_root_sem);
84 list_for_each_entry_safe(root, tmp, &trans->switch_commits,
85 dirty_list) {
86 list_del_init(&root->dirty_list);
87 free_extent_buffer(root->commit_root);
88 root->commit_root = btrfs_root_node(root);
89 if (is_fstree(root->objectid))
90 btrfs_unpin_free_ino(root);
91 }
92 up_write(&fs_info->commit_root_sem);
82} 93}
83 94
84static inline void extwriter_counter_inc(struct btrfs_transaction *trans, 95static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
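switch_commit_roots() converts scattered per-root commit-root swaps into one batch: roots are queued on the transaction's switch_commits list while dirty, and all of them are swapped inside a single commit_root_sem write section. A userspace sketch of that gather-then-swap-under-one-lock structure (a singly linked list and a pthread rwlock stand in for the kernel list and rw semaphore; names illustrative):

#include <stddef.h>
#include <pthread.h>

struct root {
	void *commit_root, *node;
	struct root *next_dirty;	/* stands in for dirty_list */
};

static pthread_rwlock_t commit_root_sem = PTHREAD_RWLOCK_INITIALIZER;

/* Gather phase: just queue, under whatever lock the caller holds. */
static void queue_switch(struct root **list, struct root *r)
{
	r->next_dirty = *list;
	*list = r;
}

/* Swap every queued root in one critical section, like
 * switch_commit_roots() under commit_root_sem. */
static void switch_commit_roots(struct root **list)
{
	struct root *r, *next;

	pthread_rwlock_wrlock(&commit_root_sem);
	for (r = *list; r; r = next) {
		next = r->next_dirty;
		r->commit_root = r->node;
		r->next_dirty = NULL;
	}
	*list = NULL;
	pthread_rwlock_unlock(&commit_root_sem);
}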
@@ -208,6 +219,7 @@ loop:
208 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 219 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
209 INIT_LIST_HEAD(&cur_trans->ordered_operations); 220 INIT_LIST_HEAD(&cur_trans->ordered_operations);
210 INIT_LIST_HEAD(&cur_trans->pending_chunks); 221 INIT_LIST_HEAD(&cur_trans->pending_chunks);
222 INIT_LIST_HEAD(&cur_trans->switch_commits);
211 list_add_tail(&cur_trans->list, &fs_info->trans_list); 223 list_add_tail(&cur_trans->list, &fs_info->trans_list);
212 extent_io_tree_init(&cur_trans->dirty_pages, 224 extent_io_tree_init(&cur_trans->dirty_pages,
213 fs_info->btree_inode->i_mapping); 225 fs_info->btree_inode->i_mapping);
@@ -375,7 +387,8 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
375 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 387 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
376 return ERR_PTR(-EROFS); 388 return ERR_PTR(-EROFS);
377 389
378 if (current->journal_info) { 390 if (current->journal_info &&
391 current->journal_info != (void *)BTRFS_SEND_TRANS_STUB) {
379 WARN_ON(type & TRANS_EXTWRITERS); 392 WARN_ON(type & TRANS_EXTWRITERS);
380 h = current->journal_info; 393 h = current->journal_info;
381 h->use_count++; 394 h->use_count++;
@@ -683,7 +696,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
683 int lock = (trans->type != TRANS_JOIN_NOLOCK); 696 int lock = (trans->type != TRANS_JOIN_NOLOCK);
684 int err = 0; 697 int err = 0;
685 698
686 if (--trans->use_count) { 699 if (trans->use_count > 1) {
700 trans->use_count--;
687 trans->block_rsv = trans->orig_rsv; 701 trans->block_rsv = trans->orig_rsv;
688 return 0; 702 return 0;
689 } 703 }
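The use_count rework above peeks before decrementing: a nested end just drops a reference, while the outermost end keeps its still-counted handle and can pass it straight to btrfs_commit_transaction(), retiring the old throttle path's "decrement, re-increment, then commit" dance. Reduced to a runnable skeleton (illustrative types, no locking):

#include <stdio.h>

struct trans { int use_count; };

static int commit_transaction(struct trans *t)
{
	printf("commit (use_count=%d)\n", t->use_count);
	t->use_count = 0;
	return 0;
}

static int end_transaction(struct trans *t)
{
	/* Peek first: only a nested end drops a reference here; the
	 * last user hands its counted handle straight to commit. */
	if (t->use_count > 1) {
		t->use_count--;
		return 0;
	}
	return commit_transaction(t);
}

int main(void)
{
	struct trans t = { .use_count = 2 };

	end_transaction(&t);	/* nested end: just drops a reference */
	end_transaction(&t);	/* outermost end: commits */
	return 0;
}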
@@ -731,17 +745,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
731 } 745 }
732 746
733 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { 747 if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
734 if (throttle) { 748 if (throttle)
735 /*
736 * We may race with somebody else here so end up having
737 * to call end_transaction on ourselves again, so inc
738 * our use_count.
739 */
740 trans->use_count++;
741 return btrfs_commit_transaction(trans, root); 749 return btrfs_commit_transaction(trans, root);
742 } else { 750 else
743 wake_up_process(info->transaction_kthread); 751 wake_up_process(info->transaction_kthread);
744 }
745 } 752 }
746 753
747 if (trans->type & __TRANS_FREEZABLE) 754 if (trans->type & __TRANS_FREEZABLE)
@@ -925,9 +932,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
925 return ret; 932 return ret;
926 } 933 }
927 934
928 if (root != root->fs_info->extent_root)
929 switch_commit_root(root);
930
931 return 0; 935 return 0;
932} 936}
933 937
@@ -983,15 +987,16 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
983 list_del_init(next); 987 list_del_init(next);
984 root = list_entry(next, struct btrfs_root, dirty_list); 988 root = list_entry(next, struct btrfs_root, dirty_list);
985 989
990 if (root != fs_info->extent_root)
991 list_add_tail(&root->dirty_list,
992 &trans->transaction->switch_commits);
986 ret = update_cowonly_root(trans, root); 993 ret = update_cowonly_root(trans, root);
987 if (ret) 994 if (ret)
988 return ret; 995 return ret;
989 } 996 }
990 997
991 down_write(&fs_info->extent_commit_sem); 998 list_add_tail(&fs_info->extent_root->dirty_list,
992 switch_commit_root(fs_info->extent_root); 999 &trans->transaction->switch_commits);
993 up_write(&fs_info->extent_commit_sem);
994
995 btrfs_after_dev_replace_commit(fs_info); 1000 btrfs_after_dev_replace_commit(fs_info);
996 1001
997 return 0; 1002 return 0;
@@ -1048,11 +1053,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
1048 smp_wmb(); 1053 smp_wmb();
1049 1054
1050 if (root->commit_root != root->node) { 1055 if (root->commit_root != root->node) {
1051 mutex_lock(&root->fs_commit_mutex); 1056 list_add_tail(&root->dirty_list,
1052 switch_commit_root(root); 1057 &trans->transaction->switch_commits);
1053 btrfs_unpin_free_ino(root);
1054 mutex_unlock(&root->fs_commit_mutex);
1055
1056 btrfs_set_root_node(&root->root_item, 1058 btrfs_set_root_node(&root->root_item,
1057 root->node); 1059 root->node);
1058 } 1060 }
@@ -1578,10 +1580,9 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1578 1580
1579 trace_btrfs_transaction_commit(root); 1581 trace_btrfs_transaction_commit(root);
1580 1582
1581 btrfs_scrub_continue(root);
1582
1583 if (current->journal_info == trans) 1583 if (current->journal_info == trans)
1584 current->journal_info = NULL; 1584 current->journal_info = NULL;
1585 btrfs_scrub_cancel(root->fs_info);
1585 1586
1586 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1587 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1587} 1588}
@@ -1621,7 +1622,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1621static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) 1622static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1622{ 1623{
1623 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) 1624 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1624 return btrfs_start_delalloc_roots(fs_info, 1); 1625 return btrfs_start_delalloc_roots(fs_info, 1, -1);
1625 return 0; 1626 return 0;
1626} 1627}
1627 1628
@@ -1754,7 +1755,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1754 /* ->aborted might be set after the previous check, so check it */ 1755 /* ->aborted might be set after the previous check, so check it */
1755 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { 1756 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1756 ret = cur_trans->aborted; 1757 ret = cur_trans->aborted;
1757 goto cleanup_transaction; 1758 goto scrub_continue;
1758 } 1759 }
1759 /* 1760 /*
1760 * the reloc mutex makes sure that we stop 1761 * the reloc mutex makes sure that we stop
@@ -1771,7 +1772,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1771 ret = create_pending_snapshots(trans, root->fs_info); 1772 ret = create_pending_snapshots(trans, root->fs_info);
1772 if (ret) { 1773 if (ret) {
1773 mutex_unlock(&root->fs_info->reloc_mutex); 1774 mutex_unlock(&root->fs_info->reloc_mutex);
1774 goto cleanup_transaction; 1775 goto scrub_continue;
1775 } 1776 }
1776 1777
1777 /* 1778 /*
@@ -1787,13 +1788,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1787 ret = btrfs_run_delayed_items(trans, root); 1788 ret = btrfs_run_delayed_items(trans, root);
1788 if (ret) { 1789 if (ret) {
1789 mutex_unlock(&root->fs_info->reloc_mutex); 1790 mutex_unlock(&root->fs_info->reloc_mutex);
1790 goto cleanup_transaction; 1791 goto scrub_continue;
1791 } 1792 }
1792 1793
1793 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 1794 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1794 if (ret) { 1795 if (ret) {
1795 mutex_unlock(&root->fs_info->reloc_mutex); 1796 mutex_unlock(&root->fs_info->reloc_mutex);
1796 goto cleanup_transaction; 1797 goto scrub_continue;
1797 } 1798 }
1798 1799
1799 /* 1800 /*
@@ -1823,7 +1824,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1823 if (ret) { 1824 if (ret) {
1824 mutex_unlock(&root->fs_info->tree_log_mutex); 1825 mutex_unlock(&root->fs_info->tree_log_mutex);
1825 mutex_unlock(&root->fs_info->reloc_mutex); 1826 mutex_unlock(&root->fs_info->reloc_mutex);
1826 goto cleanup_transaction; 1827 goto scrub_continue;
1827 } 1828 }
1828 1829
1829 /* 1830 /*
@@ -1844,7 +1845,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1844 if (ret) { 1845 if (ret) {
1845 mutex_unlock(&root->fs_info->tree_log_mutex); 1846 mutex_unlock(&root->fs_info->tree_log_mutex);
1846 mutex_unlock(&root->fs_info->reloc_mutex); 1847 mutex_unlock(&root->fs_info->reloc_mutex);
1847 goto cleanup_transaction; 1848 goto scrub_continue;
1848 } 1849 }
1849 1850
1850 /* 1851 /*
@@ -1855,7 +1856,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1855 ret = cur_trans->aborted; 1856 ret = cur_trans->aborted;
1856 mutex_unlock(&root->fs_info->tree_log_mutex); 1857 mutex_unlock(&root->fs_info->tree_log_mutex);
1857 mutex_unlock(&root->fs_info->reloc_mutex); 1858 mutex_unlock(&root->fs_info->reloc_mutex);
1858 goto cleanup_transaction; 1859 goto scrub_continue;
1859 } 1860 }
1860 1861
1861 btrfs_prepare_extent_commit(trans, root); 1862 btrfs_prepare_extent_commit(trans, root);
@@ -1864,11 +1865,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1864 1865
1865 btrfs_set_root_node(&root->fs_info->tree_root->root_item, 1866 btrfs_set_root_node(&root->fs_info->tree_root->root_item,
1866 root->fs_info->tree_root->node); 1867 root->fs_info->tree_root->node);
1867 switch_commit_root(root->fs_info->tree_root); 1868 list_add_tail(&root->fs_info->tree_root->dirty_list,
1869 &cur_trans->switch_commits);
1868 1870
1869 btrfs_set_root_node(&root->fs_info->chunk_root->root_item, 1871 btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
1870 root->fs_info->chunk_root->node); 1872 root->fs_info->chunk_root->node);
1871 switch_commit_root(root->fs_info->chunk_root); 1873 list_add_tail(&root->fs_info->chunk_root->dirty_list,
1874 &cur_trans->switch_commits);
1875
1876 switch_commit_roots(cur_trans, root->fs_info);
1872 1877
1873 assert_qgroups_uptodate(trans); 1878 assert_qgroups_uptodate(trans);
1874 update_super_roots(root); 1879 update_super_roots(root);
@@ -1891,13 +1896,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1891 btrfs_error(root->fs_info, ret, 1896 btrfs_error(root->fs_info, ret,
1892 "Error while writing out transaction"); 1897 "Error while writing out transaction");
1893 mutex_unlock(&root->fs_info->tree_log_mutex); 1898 mutex_unlock(&root->fs_info->tree_log_mutex);
1894 goto cleanup_transaction; 1899 goto scrub_continue;
1895 } 1900 }
1896 1901
1897 ret = write_ctree_super(trans, root, 0); 1902 ret = write_ctree_super(trans, root, 0);
1898 if (ret) { 1903 if (ret) {
1899 mutex_unlock(&root->fs_info->tree_log_mutex); 1904 mutex_unlock(&root->fs_info->tree_log_mutex);
1900 goto cleanup_transaction; 1905 goto scrub_continue;
1901 } 1906 }
1902 1907
1903 /* 1908 /*
@@ -1940,6 +1945,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1940 1945
1941 return ret; 1946 return ret;
1942 1947
1948scrub_continue:
1949 btrfs_scrub_continue(root);
1943cleanup_transaction: 1950cleanup_transaction:
1944 btrfs_trans_release_metadata(trans, root); 1951 btrfs_trans_release_metadata(trans, root);
1945 trans->block_rsv = NULL; 1952 trans->block_rsv = NULL;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 6ac037e9f9f0..b57b924e8e03 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -57,6 +57,7 @@ struct btrfs_transaction {
57 struct list_head pending_snapshots; 57 struct list_head pending_snapshots;
58 struct list_head ordered_operations; 58 struct list_head ordered_operations;
59 struct list_head pending_chunks; 59 struct list_head pending_chunks;
60 struct list_head switch_commits;
60 struct btrfs_delayed_ref_root delayed_refs; 61 struct btrfs_delayed_ref_root delayed_refs;
61 int aborted; 62 int aborted;
62}; 63};
@@ -78,6 +79,8 @@ struct btrfs_transaction {
78#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ 79#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
79 __TRANS_ATTACH) 80 __TRANS_ATTACH)
80 81
82#define BTRFS_SEND_TRANS_STUB 1
83
81struct btrfs_trans_handle { 84struct btrfs_trans_handle {
82 u64 transid; 85 u64 transid;
83 u64 bytes_reserved; 86 u64 bytes_reserved;
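BTRFS_SEND_TRANS_STUB is never a real handle: send parks it in current->journal_info so start_transaction() can tell "a send is running on this task" apart from "this task already holds a transaction handle", without dereferencing anything. The sentinel-pointer idiom in isolation (a thread-local variable stands in for current->journal_info; names illustrative):

#include <stdio.h>

#define SEND_TRANS_STUB ((void *)1)	/* never dereferenced */

static __thread void *journal_info;	/* stands in for current->journal_info */

static void *start_transaction(void)
{
	/* A reusable nested handle lives here only if journal_info
	 * holds something other than the sentinel. */
	if (journal_info && journal_info != SEND_TRANS_STUB)
		return journal_info;	/* reuse the outer handle */
	return NULL;			/* caller allocates a fresh one */
}

int main(void)
{
	journal_info = SEND_TRANS_STUB;	/* send ioctl running */
	printf("reuse outer handle: %s\n",
	       start_transaction() ? "yes" : "no");
	journal_info = NULL;
	return 0;
}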
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 39d83da03e03..e2f45fc02610 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -136,13 +136,20 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
136 * syncing the tree wait for us to finish 136 * syncing the tree wait for us to finish
137 */ 137 */
138static int start_log_trans(struct btrfs_trans_handle *trans, 138static int start_log_trans(struct btrfs_trans_handle *trans,
139 struct btrfs_root *root) 139 struct btrfs_root *root,
140 struct btrfs_log_ctx *ctx)
140{ 141{
142 int index;
141 int ret; 143 int ret;
142 int err = 0;
143 144
144 mutex_lock(&root->log_mutex); 145 mutex_lock(&root->log_mutex);
145 if (root->log_root) { 146 if (root->log_root) {
147 if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
148 trans->transid) {
149 ret = -EAGAIN;
150 goto out;
151 }
152
146 if (!root->log_start_pid) { 153 if (!root->log_start_pid) {
147 root->log_start_pid = current->pid; 154 root->log_start_pid = current->pid;
148 root->log_multiple_pids = false; 155 root->log_multiple_pids = false;
@@ -152,27 +159,40 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
152 159
153 atomic_inc(&root->log_batch); 160 atomic_inc(&root->log_batch);
154 atomic_inc(&root->log_writers); 161 atomic_inc(&root->log_writers);
162 if (ctx) {
163 index = root->log_transid % 2;
164 list_add_tail(&ctx->list, &root->log_ctxs[index]);
165 ctx->log_transid = root->log_transid;
166 }
155 mutex_unlock(&root->log_mutex); 167 mutex_unlock(&root->log_mutex);
156 return 0; 168 return 0;
157 } 169 }
158 root->log_multiple_pids = false; 170
159 root->log_start_pid = current->pid; 171 ret = 0;
160 mutex_lock(&root->fs_info->tree_log_mutex); 172 mutex_lock(&root->fs_info->tree_log_mutex);
161 if (!root->fs_info->log_root_tree) { 173 if (!root->fs_info->log_root_tree)
162 ret = btrfs_init_log_root_tree(trans, root->fs_info); 174 ret = btrfs_init_log_root_tree(trans, root->fs_info);
163 if (ret) 175 mutex_unlock(&root->fs_info->tree_log_mutex);
164 err = ret; 176 if (ret)
165 } 177 goto out;
166 if (err == 0 && !root->log_root) { 178
179 if (!root->log_root) {
167 ret = btrfs_add_log_tree(trans, root); 180 ret = btrfs_add_log_tree(trans, root);
168 if (ret) 181 if (ret)
169 err = ret; 182 goto out;
170 } 183 }
171 mutex_unlock(&root->fs_info->tree_log_mutex); 184 root->log_multiple_pids = false;
185 root->log_start_pid = current->pid;
172 atomic_inc(&root->log_batch); 186 atomic_inc(&root->log_batch);
173 atomic_inc(&root->log_writers); 187 atomic_inc(&root->log_writers);
188 if (ctx) {
189 index = root->log_transid % 2;
190 list_add_tail(&ctx->list, &root->log_ctxs[index]);
191 ctx->log_transid = root->log_transid;
192 }
193out:
174 mutex_unlock(&root->log_mutex); 194 mutex_unlock(&root->log_mutex);
175 return err; 195 return ret;
176} 196}
177 197
178/* 198/*
@@ -2359,8 +2379,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
2359 return ret; 2379 return ret;
2360} 2380}
2361 2381
2362static int wait_log_commit(struct btrfs_trans_handle *trans, 2382static void wait_log_commit(struct btrfs_trans_handle *trans,
2363 struct btrfs_root *root, unsigned long transid) 2383 struct btrfs_root *root, int transid)
2364{ 2384{
2365 DEFINE_WAIT(wait); 2385 DEFINE_WAIT(wait);
2366 int index = transid % 2; 2386 int index = transid % 2;
@@ -2375,36 +2395,63 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
2375 &wait, TASK_UNINTERRUPTIBLE); 2395 &wait, TASK_UNINTERRUPTIBLE);
2376 mutex_unlock(&root->log_mutex); 2396 mutex_unlock(&root->log_mutex);
2377 2397
2378 if (root->fs_info->last_trans_log_full_commit != 2398 if (root->log_transid_committed < transid &&
2379 trans->transid && root->log_transid < transid + 2 &&
2380 atomic_read(&root->log_commit[index])) 2399 atomic_read(&root->log_commit[index]))
2381 schedule(); 2400 schedule();
2382 2401
2383 finish_wait(&root->log_commit_wait[index], &wait); 2402 finish_wait(&root->log_commit_wait[index], &wait);
2384 mutex_lock(&root->log_mutex); 2403 mutex_lock(&root->log_mutex);
2385 } while (root->fs_info->last_trans_log_full_commit != 2404 } while (root->log_transid_committed < transid &&
2386 trans->transid && root->log_transid < transid + 2 &&
2387 atomic_read(&root->log_commit[index])); 2405 atomic_read(&root->log_commit[index]));
2388 return 0;
2389} 2406}
2390 2407
2391static void wait_for_writer(struct btrfs_trans_handle *trans, 2408static void wait_for_writer(struct btrfs_trans_handle *trans,
2392 struct btrfs_root *root) 2409 struct btrfs_root *root)
2393{ 2410{
2394 DEFINE_WAIT(wait); 2411 DEFINE_WAIT(wait);
2395 while (root->fs_info->last_trans_log_full_commit != 2412
2396 trans->transid && atomic_read(&root->log_writers)) { 2413 while (atomic_read(&root->log_writers)) {
2397 prepare_to_wait(&root->log_writer_wait, 2414 prepare_to_wait(&root->log_writer_wait,
2398 &wait, TASK_UNINTERRUPTIBLE); 2415 &wait, TASK_UNINTERRUPTIBLE);
2399 mutex_unlock(&root->log_mutex); 2416 mutex_unlock(&root->log_mutex);
2400 if (root->fs_info->last_trans_log_full_commit != 2417 if (atomic_read(&root->log_writers))
2401 trans->transid && atomic_read(&root->log_writers))
2402 schedule(); 2418 schedule();
2403 mutex_lock(&root->log_mutex); 2419 mutex_lock(&root->log_mutex);
2404 finish_wait(&root->log_writer_wait, &wait); 2420 finish_wait(&root->log_writer_wait, &wait);
2405 } 2421 }
2406} 2422}
2407 2423
2424static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2425 struct btrfs_log_ctx *ctx)
2426{
2427 if (!ctx)
2428 return;
2429
2430 mutex_lock(&root->log_mutex);
2431 list_del_init(&ctx->list);
2432 mutex_unlock(&root->log_mutex);
2433}
2434
2435/*
 2436 * Invoked with the log mutex held, or from a context where no other
 2437 * task can access the list.
2438 */
2439static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2440 int index, int error)
2441{
2442 struct btrfs_log_ctx *ctx;
2443
2444 if (!error) {
2445 INIT_LIST_HEAD(&root->log_ctxs[index]);
2446 return;
2447 }
2448
2449 list_for_each_entry(ctx, &root->log_ctxs[index], list)
2450 ctx->log_ret = error;
2451
2452 INIT_LIST_HEAD(&root->log_ctxs[index]);
2453}
2454
2408/* 2455/*
 2409 * btrfs_sync_log sends a given tree log down to the disk and 2456 * btrfs_sync_log sends a given tree log down to the disk and
2410 * updates the super blocks to record it. When this call is done, 2457 * updates the super blocks to record it. When this call is done,
@@ -2418,7 +2465,7 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
2418 * that has happened. 2465 * that has happened.
2419 */ 2466 */
2420int btrfs_sync_log(struct btrfs_trans_handle *trans, 2467int btrfs_sync_log(struct btrfs_trans_handle *trans,
2421 struct btrfs_root *root) 2468 struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2422{ 2469{
2423 int index1; 2470 int index1;
2424 int index2; 2471 int index2;
@@ -2426,22 +2473,30 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2426 int ret; 2473 int ret;
2427 struct btrfs_root *log = root->log_root; 2474 struct btrfs_root *log = root->log_root;
2428 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 2475 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
2429 unsigned long log_transid = 0; 2476 int log_transid = 0;
2477 struct btrfs_log_ctx root_log_ctx;
2430 struct blk_plug plug; 2478 struct blk_plug plug;
2431 2479
 	mutex_lock(&root->log_mutex);
-	log_transid = root->log_transid;
-	index1 = root->log_transid % 2;
+	log_transid = ctx->log_transid;
+	if (root->log_transid_committed >= log_transid) {
+		mutex_unlock(&root->log_mutex);
+		return ctx->log_ret;
+	}
+
+	index1 = log_transid % 2;
 	if (atomic_read(&root->log_commit[index1])) {
-		wait_log_commit(trans, root, root->log_transid);
+		wait_log_commit(trans, root, log_transid);
 		mutex_unlock(&root->log_mutex);
-		return 0;
+		return ctx->log_ret;
 	}
+	ASSERT(log_transid == root->log_transid);
 	atomic_set(&root->log_commit[index1], 1);
 
 	/* wait for previous tree log sync to complete */
 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
-		wait_log_commit(trans, root, root->log_transid - 1);
+		wait_log_commit(trans, root, log_transid - 1);
+
 	while (1) {
 		int batch = atomic_read(&root->log_batch);
 		/* when we're on an ssd, just kick the log commit out */
@@ -2456,7 +2511,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	}
 
 	/* bail out if we need to do a full commit */
-	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+	if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
+	    trans->transid) {
 		ret = -EAGAIN;
 		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&root->log_mutex);
@@ -2477,6 +2533,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		blk_finish_plug(&plug);
 		btrfs_abort_transaction(trans, root, ret);
 		btrfs_free_logged_extents(log, log_transid);
+		ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+			trans->transid;
 		mutex_unlock(&root->log_mutex);
 		goto out;
 	}
@@ -2486,7 +2544,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	root->log_transid++;
 	log->log_transid = root->log_transid;
 	root->log_start_pid = 0;
-	smp_mb();
 	/*
 	 * IO has been started, blocks of the log tree have WRITTEN flag set
 	 * in their headers. new modifications of the log will be written to
@@ -2494,9 +2551,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 */
 	mutex_unlock(&root->log_mutex);
 
+	btrfs_init_log_ctx(&root_log_ctx);
+
 	mutex_lock(&log_root_tree->log_mutex);
 	atomic_inc(&log_root_tree->log_batch);
 	atomic_inc(&log_root_tree->log_writers);
+
+	index2 = log_root_tree->log_transid % 2;
+	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
+	root_log_ctx.log_transid = log_root_tree->log_transid;
+
 	mutex_unlock(&log_root_tree->log_mutex);
 
 	ret = update_log_root(trans, log);
@@ -2509,13 +2573,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	}
 
 	if (ret) {
+		if (!list_empty(&root_log_ctx.list))
+			list_del_init(&root_log_ctx.list);
+
 		blk_finish_plug(&plug);
+		ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+			trans->transid;
 		if (ret != -ENOSPC) {
 			btrfs_abort_transaction(trans, root, ret);
 			mutex_unlock(&log_root_tree->log_mutex);
 			goto out;
 		}
-		root->fs_info->last_trans_log_full_commit = trans->transid;
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
@@ -2523,22 +2591,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	index2 = log_root_tree->log_transid % 2;
+	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
+		mutex_unlock(&log_root_tree->log_mutex);
+		ret = root_log_ctx.log_ret;
+		goto out;
+	}
+
+	index2 = root_log_ctx.log_transid % 2;
 	if (atomic_read(&log_root_tree->log_commit[index2])) {
 		blk_finish_plug(&plug);
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 		wait_log_commit(trans, log_root_tree,
-				log_root_tree->log_transid);
+				root_log_ctx.log_transid);
 		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
-		ret = 0;
+		ret = root_log_ctx.log_ret;
 		goto out;
 	}
+	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
 	atomic_set(&log_root_tree->log_commit[index2], 1);
 
 	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
 		wait_log_commit(trans, log_root_tree,
-				log_root_tree->log_transid - 1);
+				root_log_ctx.log_transid - 1);
 	}
 
 	wait_for_writer(trans, log_root_tree);
@@ -2547,7 +2622,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * now that we've moved on to the tree of log tree roots,
 	 * check the full commit flag again
 	 */
-	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+	if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) ==
+	    trans->transid) {
 		blk_finish_plug(&plug);
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 		btrfs_free_logged_extents(log, log_transid);
@@ -2561,6 +2637,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 			 EXTENT_DIRTY | EXTENT_NEW);
 	blk_finish_plug(&plug);
 	if (ret) {
+		ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+			trans->transid;
 		btrfs_abort_transaction(trans, root, ret);
 		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
@@ -2578,8 +2656,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 			btrfs_header_level(log_root_tree->node));
 
 	log_root_tree->log_transid++;
-	smp_mb();
-
 	mutex_unlock(&log_root_tree->log_mutex);
 
 	/*
@@ -2591,6 +2667,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 */
 	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
 	if (ret) {
+		ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) =
+			trans->transid;
 		btrfs_abort_transaction(trans, root, ret);
 		goto out_wake_log_root;
 	}
@@ -2601,13 +2679,28 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	mutex_unlock(&root->log_mutex);
 
 out_wake_log_root:
+	/*
+	 * We needn't get log_mutex here because we are sure all
+	 * the other tasks are blocked.
+	 */
+	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
+
+	mutex_lock(&log_root_tree->log_mutex);
+	log_root_tree->log_transid_committed++;
 	atomic_set(&log_root_tree->log_commit[index2], 0);
-	smp_mb();
+	mutex_unlock(&log_root_tree->log_mutex);
+
 	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
 		wake_up(&log_root_tree->log_commit_wait[index2]);
out:
+	/* See above. */
+	btrfs_remove_all_log_ctxs(root, index1, ret);
+
+	mutex_lock(&root->log_mutex);
+	root->log_transid_committed++;
 	atomic_set(&root->log_commit[index1], 0);
-	smp_mb();
+	mutex_unlock(&root->log_mutex);
+
 	if (waitqueue_active(&root->log_commit_wait[index1]))
 		wake_up(&root->log_commit_wait[index1]);
 	return ret;
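
The hunks above convert the bare reads and writes of last_trans_log_full_commit, which used to pair with smp_mb(), into ACCESS_ONCE() accesses. A minimal sketch of the pattern outside the btrfs code (the struct and helpers below are illustrative stand-ins, not from the patch; ACCESS_ONCE itself is the kernel's own macro from linux/compiler.h):

#include <linux/types.h>
#include <linux/compiler.h>	/* ACCESS_ONCE(x): (*(volatile typeof(x) *)&(x)) */

/* Illustrative stand-in for the relevant btrfs_fs_info field. */
struct example_fs_info {
	u64 last_trans_log_full_commit;
};

/* One marked load: the compiler may not tear, fuse, or cache the read. */
static int example_full_commit_requested(struct example_fs_info *fs, u64 transid)
{
	return ACCESS_ONCE(fs->last_trans_log_full_commit) == transid;
}

/* One marked store; ordering against other fields still comes from the
 * surrounding log_mutex sections, not from this macro. */
static void example_request_full_commit(struct example_fs_info *fs, u64 transid)
{
	ACCESS_ONCE(fs->last_trans_log_full_commit) = transid;
}
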
@@ -3479,7 +3572,8 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
 
 static int log_one_extent(struct btrfs_trans_handle *trans,
			   struct inode *inode, struct btrfs_root *root,
-			   struct extent_map *em, struct btrfs_path *path)
+			   struct extent_map *em, struct btrfs_path *path,
+			   struct list_head *logged_list)
 {
 	struct btrfs_root *log = root->log_root;
 	struct btrfs_file_extent_item *fi;
@@ -3495,7 +3589,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	u64 extent_offset = em->start - em->orig_start;
 	u64 block_len;
 	int ret;
-	int index = log->log_transid % 2;
 	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 	int extent_inserted = 0;
 
@@ -3579,17 +3672,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 	 * First check and see if our csums are on our outstanding ordered
 	 * extents.
 	 */
-again:
-	spin_lock_irq(&log->log_extents_lock[index]);
-	list_for_each_entry(ordered, &log->logged_list[index], log_list) {
+	list_for_each_entry(ordered, logged_list, log_list) {
 		struct btrfs_ordered_sum *sum;
 
 		if (!mod_len)
 			break;
 
-		if (ordered->inode != inode)
-			continue;
-
 		if (ordered->file_offset + ordered->len <= mod_start ||
 		    mod_start + mod_len <= ordered->file_offset)
 			continue;
@@ -3632,12 +3720,6 @@ again:
 		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
 				     &ordered->flags))
 			continue;
-		atomic_inc(&ordered->refs);
-		spin_unlock_irq(&log->log_extents_lock[index]);
-		/*
-		 * we've dropped the lock, we must either break or
-		 * start over after this.
-		 */
 
 		if (ordered->csum_bytes_left) {
 			btrfs_start_ordered_extent(inode, ordered, 0);
@@ -3647,16 +3729,11 @@
 
 		list_for_each_entry(sum, &ordered->list, list) {
 			ret = btrfs_csum_file_blocks(trans, log, sum);
-			if (ret) {
-				btrfs_put_ordered_extent(ordered);
+			if (ret)
 				goto unlocked;
-			}
 		}
-		btrfs_put_ordered_extent(ordered);
-		goto again;
 
 	}
-	spin_unlock_irq(&log->log_extents_lock[index]);
unlocked:
 
 	if (!mod_len || ret)
@@ -3694,7 +3771,8 @@ unlocked:
 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct inode *inode,
-				      struct btrfs_path *path)
+				      struct btrfs_path *path,
+				      struct list_head *logged_list)
 {
 	struct extent_map *em, *n;
 	struct list_head extents;
@@ -3752,7 +3830,7 @@ process:
 
 		write_unlock(&tree->lock);
 
-		ret = log_one_extent(trans, inode, root, em, path);
+		ret = log_one_extent(trans, inode, root, em, path, logged_list);
 		write_lock(&tree->lock);
 		clear_em_logging(tree, em);
 		free_extent_map(em);
@@ -3788,6 +3866,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_key max_key;
 	struct btrfs_root *log = root->log_root;
 	struct extent_buffer *src = NULL;
+	LIST_HEAD(logged_list);
 	u64 last_extent = 0;
 	int err = 0;
 	int ret;
@@ -3836,7 +3915,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&BTRFS_I(inode)->log_mutex);
 
-	btrfs_get_logged_extents(log, inode);
+	btrfs_get_logged_extents(inode, &logged_list);
 
 	/*
 	 * a brute force approach to making sure we get the most uptodate
@@ -3962,7 +4041,8 @@ log_extents:
 	btrfs_release_path(path);
 	btrfs_release_path(dst_path);
 	if (fast_search) {
-		ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
+		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
+						&logged_list);
 		if (ret) {
 			err = ret;
 			goto out_unlock;
@@ -3987,8 +4067,10 @@ log_extents:
 	BTRFS_I(inode)->logged_trans = trans->transid;
 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
-	if (err)
-		btrfs_free_logged_extents(log, log->log_transid);
+	if (unlikely(err))
+		btrfs_put_logged_extents(&logged_list);
+	else
+		btrfs_submit_logged_extents(&logged_list, log);
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
 
 	btrfs_free_path(path);
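
Taken together, these btrfs_log_inode() hunks move ordered-extent tracking off the global per-log-transid lists and onto a per-call logged_list, which also removes the goto-again retry and the log_extents_lock round trips in log_one_extent(). The resulting lifecycle, summarized as a comment (all names are the ones introduced above):

/*
 * LIST_HEAD(logged_list);
 * btrfs_get_logged_extents(inode, &logged_list);       pin this inode's ordered extents
 *     ...
 * btrfs_log_changed_extents(..., &logged_list);        log_one_extent() walks the list
 *     ...                                              for any outstanding csums
 * out_unlock:
 * if (err)
 *         btrfs_put_logged_extents(&logged_list);      failure: just drop the references
 * else
 *         btrfs_submit_logged_extents(&logged_list, log);   success: hand them to the log
 */
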
@@ -4079,7 +4161,8 @@ out:
  */
 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root, struct inode *inode,
-				   struct dentry *parent, int exists_only)
+				   struct dentry *parent, int exists_only,
+				   struct btrfs_log_ctx *ctx)
 {
 	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
 	struct super_block *sb;
@@ -4116,9 +4199,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		goto end_no_trans;
 	}
 
-	ret = start_log_trans(trans, root);
+	ret = start_log_trans(trans, root, ctx);
 	if (ret)
-		goto end_trans;
+		goto end_no_trans;
 
 	ret = btrfs_log_inode(trans, root, inode, inode_only);
 	if (ret)
@@ -4166,6 +4249,9 @@ end_trans:
 		root->fs_info->last_trans_log_full_commit = trans->transid;
 		ret = 1;
 	}
+
+	if (ret)
+		btrfs_remove_log_ctx(root, ctx);
 	btrfs_end_log_trans(root);
end_no_trans:
 	return ret;
@@ -4178,12 +4264,14 @@ end_no_trans:
  * data on disk.
  */
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, struct dentry *dentry)
+			  struct btrfs_root *root, struct dentry *dentry,
+			  struct btrfs_log_ctx *ctx)
 {
 	struct dentry *parent = dget_parent(dentry);
 	int ret;
 
-	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
+	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
+				     0, ctx);
 	dput(parent);
 
 	return ret;
@@ -4420,6 +4508,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
			root->fs_info->last_trans_committed))
 		return 0;
 
-	return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+	return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
 }
 
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 1d4ae0d15a70..91b145fce333 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,28 @@
 /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
 #define BTRFS_NO_LOG_SYNC 256
 
+struct btrfs_log_ctx {
+	int log_ret;
+	int log_transid;
+	struct list_head list;
+};
+
+static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
+{
+	ctx->log_ret = 0;
+	ctx->log_transid = 0;
+	INIT_LIST_HEAD(&ctx->list);
+}
+
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
-		   struct btrfs_root *root);
+		   struct btrfs_root *root, struct btrfs_log_ctx *ctx);
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info);
 int btrfs_recover_log_trees(struct btrfs_root *tree_root);
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, struct dentry *dentry);
+			  struct btrfs_root *root, struct dentry *dentry,
+			  struct btrfs_log_ctx *ctx);
 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  const char *name, int name_len,
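
Under these new declarations, a caller that wants its fsync result is expected to keep a btrfs_log_ctx on its stack, thread it through the logging call, and collect the outcome from btrfs_sync_log(). A minimal sketch of that calling pattern; the real caller is btrfs_sync_file() in fs/btrfs/file.c (also touched by this merge), and the function below is illustrative only, with BTRFS_NO_LOG_SYNC handling elided:

static int example_sync_tail(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, struct dentry *dentry)
{
	struct btrfs_log_ctx ctx;
	int ret;

	btrfs_init_log_ctx(&ctx);	/* log_ret = 0, log_transid = 0, empty list node */

	ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
	if (ret)
		return ret;		/* the ctx is unlinked on the failure path */

	/*
	 * Whichever task ends up performing the commit walks the per-transid
	 * ctx lists and fans the result out; everyone else just waits on it.
	 */
	return btrfs_sync_log(trans, root, &ctx);
}
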
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bab0b84d8f80..49d7fab73360 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -415,7 +415,8 @@ loop_lock:
			device->running_pending = 1;
 
			spin_unlock(&device->io_lock);
-			btrfs_requeue_work(&device->work);
+			btrfs_queue_work(fs_info->submit_workers,
+					 &device->work);
			goto done;
		}
		/* unplug every 64 requests just for good measure */
@@ -447,6 +448,14 @@ static void pending_bios_fn(struct btrfs_work *work)
	run_scheduled_bios(device);
 }
 
+/*
+ * Add new device to list of registered devices
+ *
+ * Returns:
+ * 1   - first time device is seen
+ * 0   - device already known
+ * < 0 - error
+ */
 static noinline int device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
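
Per the comment just added, device_list_add() now reports a tri-state result rather than flat success/failure. A sketch of how a caller consumes it (the in-tree caller is btrfs_scan_one_device(), updated a few hunks below; this fragment is illustrative):

	ret = device_list_add(path, disk_super, devid, &fs_devices);
	if (ret < 0)
		goto error;	/* e.g. -ENOMEM from the allocation paths */
	if (ret > 0) {
		/* first sighting of this device: worth printing exactly once */
		ret = 0;
	}
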
@@ -454,6 +463,7 @@ static noinline int device_list_add(const char *path,
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	struct rcu_string *name;
+	int ret = 0;
	u64 found_transid = btrfs_super_generation(disk_super);
 
	fs_devices = find_fsid(disk_super->fsid);
@@ -494,6 +504,7 @@ static noinline int device_list_add(const char *path,
		fs_devices->num_devices++;
		mutex_unlock(&fs_devices->device_list_mutex);
 
+		ret = 1;
		device->fs_devices = fs_devices;
	} else if (!device->name || strcmp(device->name->str, path)) {
		name = rcu_string_strdup(path, GFP_NOFS);
@@ -512,7 +523,8 @@ static noinline int device_list_add(const char *path,
		fs_devices->latest_trans = found_transid;
	}
	*fs_devices_ret = fs_devices;
-	return 0;
+
+	return ret;
 }
 
 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
@@ -909,17 +921,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
	transid = btrfs_super_generation(disk_super);
	total_devices = btrfs_super_num_devices(disk_super);
 
-	if (disk_super->label[0]) {
-		if (disk_super->label[BTRFS_LABEL_SIZE - 1])
-			disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
-		printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
-	} else {
-		printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
-	}
-
-	printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
-
	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
+	if (ret > 0) {
+		if (disk_super->label[0]) {
+			if (disk_super->label[BTRFS_LABEL_SIZE - 1])
+				disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
+			printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
+		} else {
+			printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
+		}
+
+		printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
+		ret = 0;
+	}
	if (!ret && fs_devices_ret)
		(*fs_devices_ret)->total_devices = total_devices;
 
@@ -5263,6 +5277,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 static void btrfs_end_bio(struct bio *bio, int err)
 {
	struct btrfs_bio *bbio = bio->bi_private;
+	struct btrfs_device *dev = bbio->stripes[0].dev;
	int is_orig_bio = 0;
 
	if (err) {
@@ -5270,7 +5285,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
		if (err == -EIO || err == -EREMOTEIO) {
			unsigned int stripe_index =
				btrfs_io_bio(bio)->stripe_index;
-			struct btrfs_device *dev;
 
			BUG_ON(stripe_index >= bbio->num_stripes);
			dev = bbio->stripes[stripe_index].dev;
@@ -5292,6 +5306,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
	if (bio == bbio->orig_bio)
		is_orig_bio = 1;
 
+	btrfs_bio_counter_dec(bbio->fs_info);
+
	if (atomic_dec_and_test(&bbio->stripes_pending)) {
		if (!is_orig_bio) {
			bio_put(bio);
@@ -5328,13 +5344,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
	}
 }
 
-struct async_sched {
-	struct bio *bio;
-	int rw;
-	struct btrfs_fs_info *info;
-	struct btrfs_work work;
-};
-
 /*
  * see run_scheduled_bios for a description of why bios are collected for
  * async submit.
@@ -5391,8 +5400,8 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
	spin_unlock(&device->io_lock);
 
	if (should_queue)
-		btrfs_queue_worker(&root->fs_info->submit_workers,
-				   &device->work);
+		btrfs_queue_work(root->fs_info->submit_workers,
+				 &device->work);
 }
 
 static int bio_size_ok(struct block_device *bdev, struct bio *bio,
@@ -5447,6 +5456,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
	}
 #endif
	bio->bi_bdev = dev->bdev;
+
+	btrfs_bio_counter_inc_noblocked(root->fs_info);
+
	if (async)
		btrfs_schedule_bio(root, dev, rw, bio);
	else
@@ -5515,28 +5527,38 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
	length = bio->bi_iter.bi_size;
	map_length = length;
 
+	btrfs_bio_counter_inc_blocked(root->fs_info);
	ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
			      mirror_num, &raid_map);
-	if (ret) /* -ENOMEM */
+	if (ret) {
+		btrfs_bio_counter_dec(root->fs_info);
		return ret;
+	}
 
	total_devs = bbio->num_stripes;
	bbio->orig_bio = first_bio;
	bbio->private = first_bio->bi_private;
	bbio->end_io = first_bio->bi_end_io;
+	bbio->fs_info = root->fs_info;
	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
 
	if (raid_map) {
		/* In this case, map_length has been set to the length of
		   a single stripe; not the whole write */
		if (rw & WRITE) {
-			return raid56_parity_write(root, bio, bbio,
-						   raid_map, map_length);
+			ret = raid56_parity_write(root, bio, bbio,
+						  raid_map, map_length);
		} else {
-			return raid56_parity_recover(root, bio, bbio,
-						     raid_map, map_length,
-						     mirror_num);
+			ret = raid56_parity_recover(root, bio, bbio,
+						    raid_map, map_length,
+						    mirror_num);
		}
+		/*
+		 * FIXME, replace doesn't support raid56 yet, please fix
+		 * it in the future.
+		 */
+		btrfs_bio_counter_dec(root->fs_info);
+		return ret;
	}
 
	if (map_length < length) {
@@ -5578,6 +5600,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
					    async_submit);
		dev_nr++;
	}
+	btrfs_bio_counter_dec(root->fs_info);
	return 0;
 }
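
The btrfs_bio_counter_* calls threaded through btrfs_map_bio(), submit_stripe_bio() and btrfs_end_bio() above pair up as follows (a comment summary of the hunks, not new code; the raid56 path drops its reference before returning, as the FIXME notes):

/*
 * btrfs_map_bio()
 *     btrfs_bio_counter_inc_blocked(fs_info);             take a ref for the mapping
 *     __btrfs_map_block(...);                             on failure: dec and bail out
 *     bbio->fs_info = root->fs_info;                      record who to dec at end_io
 *     for each stripe:
 *         submit_stripe_bio()
 *             btrfs_bio_counter_inc_noblocked(fs_info);   one ref per in-flight bio
 *     btrfs_bio_counter_dec(fs_info);                     drop the mapping ref
 *
 * btrfs_end_bio()
 *     btrfs_bio_counter_dec(bbio->fs_info);               drop the per-bio ref
 */
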
@@ -5666,7 +5689,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
	else
		generate_random_uuid(dev->uuid);
 
-	dev->work.func = pending_bios_fn;
+	btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
 
	return dev;
 }
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 8b3cd142b373..80754f9dd3df 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -192,6 +192,7 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
 
 struct btrfs_bio {
	atomic_t stripes_pending;
+	struct btrfs_fs_info *fs_info;
	bio_end_io_t *end_io;
	struct bio *orig_bio;
	void *private;
diff --git a/fs/buffer.c b/fs/buffer.c
index 27265a8b43c1..9ddb9fc7d923 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2114,8 +2114,8 @@ EXPORT_SYMBOL(generic_write_end);
  * Returns true if all buffers which correspond to a file portion
  * we want to read are uptodate.
  */
-int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
-					unsigned long from)
+int block_is_partially_uptodate(struct page *page, unsigned long from,
+					unsigned long count)
 {
	unsigned block_start, block_end, blocksize;
	unsigned to;
@@ -2127,7 +2127,7 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
 
	head = page_buffers(page);
	blocksize = head->b_size;
-	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
+	to = min_t(unsigned, PAGE_CACHE_SIZE - from, count);
	to = from + to;
	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
		return 0;
@@ -3088,7 +3088,7 @@ EXPORT_SYMBOL(submit_bh);
  * until the buffer gets unlocked).
  *
  * ll_rw_block sets b_end_io to simple completion handler that marks
- * the buffer up-to-date (if approriate), unlocks the buffer and wakes
+ * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
  * any waiters.
  *
  * All of the buffers must be for the same device, and must also be a
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index 622f4696e484..5b99bafc31d1 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -124,7 +124,6 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
	/* check parameters */
	ret = -EOPNOTSUPP;
	if (!root->d_inode ||
-	    !root->d_inode->i_op ||
	    !root->d_inode->i_op->lookup ||
	    !root->d_inode->i_op->mkdir ||
	    !root->d_inode->i_op->setxattr ||
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index ca65f39dc8dc..c0a681705104 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -391,12 +391,12 @@ try_again:
	path.dentry = dir;
	path_to_graveyard.mnt = cache->mnt;
	path_to_graveyard.dentry = cache->graveyard;
-	ret = security_path_rename(&path, rep, &path_to_graveyard, grave);
+	ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0);
	if (ret < 0) {
		cachefiles_io_error(cache, "Rename security error %d", ret);
	} else {
		ret = vfs_rename(dir->d_inode, rep,
-				 cache->graveyard->d_inode, grave, NULL);
+				 cache->graveyard->d_inode, grave, NULL, 0);
		if (ret != 0 && ret != -ENOMEM)
			cachefiles_io_error(cache,
					    "Rename failed with error %d", ret);
@@ -779,8 +779,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
	}
 
	ret = -EPERM;
-	if (!subdir->d_inode->i_op ||
-	    !subdir->d_inode->i_op->setxattr ||
+	if (!subdir->d_inode->i_op->setxattr ||
	    !subdir->d_inode->i_op->getxattr ||
	    !subdir->d_inode->i_op->lookup ||
	    !subdir->d_inode->i_op->mkdir ||
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index ebaff368120d..4b1fb5ca65b8 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -265,24 +265,22 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
			goto nomem_monitor;
		}
 
-		ret = add_to_page_cache(newpage, bmapping,
-					netpage->index, cachefiles_gfp);
+		ret = add_to_page_cache_lru(newpage, bmapping,
+					    netpage->index, cachefiles_gfp);
		if (ret == 0)
			goto installed_new_backing_page;
		if (ret != -EEXIST)
			goto nomem_page;
	}
 
-	/* we've installed a new backing page, so now we need to add it
-	 * to the LRU list and start it reading */
+	/* we've installed a new backing page, so now we need to start
+	 * it reading */
installed_new_backing_page:
	_debug("- new %p", newpage);
 
	backpage = newpage;
	newpage = NULL;
 
-	lru_cache_add_file(backpage);
-
read_backing_page:
	ret = bmapping->a_ops->readpage(NULL, backpage);
	if (ret < 0)
@@ -510,24 +508,23 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
				goto nomem;
			}
 
-			ret = add_to_page_cache(newpage, bmapping,
-						netpage->index, cachefiles_gfp);
+			ret = add_to_page_cache_lru(newpage, bmapping,
+						    netpage->index,
+						    cachefiles_gfp);
			if (ret == 0)
				goto installed_new_backing_page;
			if (ret != -EEXIST)
				goto nomem;
		}
 
-		/* we've installed a new backing page, so now we need to add it
-		 * to the LRU list and start it reading */
+		/* we've installed a new backing page, so now we need
+		 * to start it reading */
	installed_new_backing_page:
		_debug("- new %p", newpage);
 
		backpage = newpage;
		newpage = NULL;
 
-		lru_cache_add_file(backpage);
-
	reread_backing_page:
		ret = bmapping->a_ops->readpage(NULL, backpage);
		if (ret < 0)
@@ -538,8 +535,8 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
	monitor_backing_page:
		_debug("- monitor add");
 
-		ret = add_to_page_cache(netpage, op->mapping, netpage->index,
-					cachefiles_gfp);
+		ret = add_to_page_cache_lru(netpage, op->mapping,
+					    netpage->index, cachefiles_gfp);
		if (ret < 0) {
			if (ret == -EEXIST) {
				page_cache_release(netpage);
@@ -549,8 +546,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
			goto nomem;
		}
 
-		lru_cache_add_file(netpage);
-
		/* install a monitor */
		page_cache_get(netpage);
		monitor->netfs_page = netpage;
@@ -613,8 +608,8 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
	backing_page_already_uptodate:
		_debug("- uptodate");
 
-		ret = add_to_page_cache(netpage, op->mapping, netpage->index,
-					cachefiles_gfp);
+		ret = add_to_page_cache_lru(netpage, op->mapping,
+					    netpage->index, cachefiles_gfp);
		if (ret < 0) {
			if (ret == -EEXIST) {
				page_cache_release(netpage);
@@ -631,8 +626,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 
		fscache_mark_page_cached(op, netpage);
 
-		lru_cache_add_file(netpage);
-
		/* the netpage is unlocked and marked up to date here */
		fscache_end_io(op, netpage, 0);
		page_cache_release(netpage);
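
All six rdwr.c hunks make the same substitution: add_to_page_cache_lru() (from mm/filemap.c) folds the page-cache insertion and the LRU add into one call, so the separate lru_cache_add_file() step, easy to miss on error paths, goes away. The shape of the change, reduced to its essentials (illustrative fragment):

	/* before: two steps, LRU add deferred and easy to skip on error */
	ret = add_to_page_cache(page, mapping, index, gfp);
	if (ret == 0)
		lru_cache_add_file(page);

	/* after: one call that also places the page on the LRU */
	ret = add_to_page_cache_lru(page, mapping, index, gfp);
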
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 8c44fdd4e1c3..834f9f3723fb 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -205,6 +205,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
	ci->fscache = fscache_acquire_cookie(fsc->fscache,
					     &ceph_fscache_inode_object_def,
					     ci, true);
+	fscache_check_consistency(ci->fscache);
done:
	mutex_unlock(&inode->i_mutex);
 
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index da95f61b7a09..5ac591bd012b 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -48,6 +48,12 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
 void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
 void ceph_queue_revalidate(struct inode *inode);
 
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	fscache_attr_changed(ci->fscache);
+}
+
 static inline void ceph_fscache_invalidate(struct inode *inode)
 {
	fscache_invalidate(ceph_inode(inode)->fscache);
@@ -135,6 +141,10 @@ static inline void ceph_readpage_to_fscache(struct inode *inode,
 {
 }
 
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+}
+
 static inline void ceph_fscache_invalidate(struct inode *inode)
 {
 }
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 17543383545c..c561b628ebce 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -622,8 +622,10 @@ retry:
 
	if (flags & CEPH_CAP_FLAG_AUTH) {
		if (ci->i_auth_cap == NULL ||
-		    ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
+		    ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
			ci->i_auth_cap = cap;
+			cap->mds_wanted = wanted;
+		}
		ci->i_cap_exporting_issued = 0;
	} else {
		WARN_ON(ci->i_auth_cap == cap);
@@ -885,7 +887,10 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (!__cap_is_valid(cap))
			continue;
-		mds_wanted |= cap->mds_wanted;
+		if (cap == ci->i_auth_cap)
+			mds_wanted |= cap->mds_wanted;
+		else
+			mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
	}
	return mds_wanted;
 }
@@ -3256,7 +3261,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
		rel->seq = cpu_to_le32(cap->seq);
		rel->issue_seq = cpu_to_le32(cap->issue_seq),
		rel->mseq = cpu_to_le32(cap->mseq);
-		rel->caps = cpu_to_le32(cap->issued);
+		rel->caps = cpu_to_le32(cap->implemented);
		rel->wanted = cpu_to_le32(cap->mds_wanted);
		rel->dname_len = 0;
		rel->dname_seq = 0;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6d59006bfa27..16b54aa31f08 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -93,6 +93,8 @@ static int mdsc_show(struct seq_file *s, void *p)
	} else if (req->r_path1) {
		seq_printf(s, " #%llx/%s", req->r_ino1.ino,
			   req->r_path1);
+	} else {
+		seq_printf(s, " #%llx", req->r_ino1.ino);
	}
 
	if (req->r_old_dentry) {
@@ -102,7 +104,8 @@ static int mdsc_show(struct seq_file *s, void *p)
			path = NULL;
		spin_lock(&req->r_old_dentry->d_lock);
		seq_printf(s, " #%llx/%.*s (%s)",
-			   ceph_ino(req->r_old_dentry_dir),
+			   req->r_old_dentry_dir ?
+			   ceph_ino(req->r_old_dentry_dir) : 0,
			   req->r_old_dentry->d_name.len,
			   req->r_old_dentry->d_name.name,
			   path ? path : "");
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 45eda6d7a40c..c29d6ae68874 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -119,7 +119,8 @@ static int fpos_cmp(loff_t l, loff_t r)
  * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
  * the MDS if/when the directory is modified).
  */
-static int __dcache_readdir(struct file *file, struct dir_context *ctx)
+static int __dcache_readdir(struct file *file, struct dir_context *ctx,
+			    u32 shared_gen)
 {
	struct ceph_file_info *fi = file->private_data;
	struct dentry *parent = file->f_dentry;
@@ -133,14 +134,14 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx)
	last = fi->dentry;
	fi->dentry = NULL;
 
-	dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos,
-	     last);
+	dout("__dcache_readdir %p v%u at %llu (last %p)\n",
+	     dir, shared_gen, ctx->pos, last);
 
	spin_lock(&parent->d_lock);
 
	/* start at beginning? */
	if (ctx->pos == 2 || last == NULL ||
-	    ctx->pos < ceph_dentry(last)->offset) {
+	    fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) {
		if (list_empty(&parent->d_subdirs))
			goto out_unlock;
		p = parent->d_subdirs.prev;
@@ -161,7 +162,8 @@ more:
		goto out_unlock;
	}
	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-	if (!d_unhashed(dentry) && dentry->d_inode &&
+	if (di->lease_shared_gen == shared_gen &&
+	    !d_unhashed(dentry) && dentry->d_inode &&
	    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
	    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
	    fpos_cmp(ctx->pos, di->offset) <= 0)
@@ -180,9 +182,16 @@ more:
	spin_unlock(&dentry->d_lock);
	spin_unlock(&parent->d_lock);
 
+	/* make sure a dentry wasn't dropped while we didn't have parent lock */
+	if (!ceph_dir_is_complete(dir)) {
+		dout(" lost dir complete on %p; falling back to mds\n", dir);
+		dput(dentry);
+		err = -EAGAIN;
+		goto out;
+	}
+
	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos,
	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
-	ctx->pos = di->offset;
	if (!dir_emit(ctx, dentry->d_name.name,
		      dentry->d_name.len,
		      ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
@@ -190,25 +199,18 @@ more:
		if (last) {
			/* remember our position */
			fi->dentry = last;
-			fi->next_offset = di->offset;
+			fi->next_offset = fpos_off(di->offset);
		}
		dput(dentry);
		return 0;
	}
 
+	ctx->pos = di->offset + 1;
+
	if (last)
		dput(last);
	last = dentry;
 
-	ctx->pos++;
-
-	/* make sure a dentry wasn't dropped while we didn't have parent lock */
-	if (!ceph_dir_is_complete(dir)) {
-		dout(" lost dir complete on %p; falling back to mds\n", dir);
-		err = -EAGAIN;
-		goto out;
-	}
-
	spin_lock(&parent->d_lock);
	p = p->prev;	/* advance to next dentry */
	goto more;
@@ -252,8 +254,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
	int err;
	u32 ftype;
	struct ceph_mds_reply_info_parsed *rinfo;
-	const int max_entries = fsc->mount_options->max_readdir;
-	const int max_bytes = fsc->mount_options->max_readdir_bytes;
 
	dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
	if (fi->flags & CEPH_F_ATEND)
@@ -291,10 +291,13 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
	    ceph_snap(inode) != CEPH_SNAPDIR &&
	    __ceph_dir_is_complete(ci) &&
	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+		u32 shared_gen = ci->i_shared_gen;
		spin_unlock(&ci->i_ceph_lock);
-		err = __dcache_readdir(file, ctx);
+		err = __dcache_readdir(file, ctx, shared_gen);
		if (err != -EAGAIN)
			return err;
+		frag = fpos_frag(ctx->pos);
+		off = fpos_off(ctx->pos);
	} else {
		spin_unlock(&ci->i_ceph_lock);
	}
@@ -322,14 +325,16 @@ more:
		fi->last_readdir = NULL;
	}
 
-	/* requery frag tree, as the frag topology may have changed */
-	frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
-
	dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
	     ceph_vinop(inode), frag, fi->last_name);
	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
+	err = ceph_alloc_readdir_reply_buffer(req, inode);
+	if (err) {
+		ceph_mdsc_put_request(req);
+		return err;
+	}
	req->r_inode = inode;
	ihold(inode);
	req->r_dentry = dget(file->f_dentry);
@@ -340,9 +345,6 @@ more:
	req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
	req->r_readdir_offset = fi->next_offset;
	req->r_args.readdir.frag = cpu_to_le32(frag);
-	req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
-	req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
-	req->r_num_caps = max_entries + 1;
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (err < 0) {
		ceph_mdsc_put_request(req);
@@ -369,9 +371,9 @@ more:
			fi->next_offset = 0;
		off = fi->next_offset;
	}
+	fi->frag = frag;
	fi->offset = fi->next_offset;
	fi->last_readdir = req;
-	fi->frag = frag;
 
	if (req->r_reply_info.dir_end) {
		kfree(fi->last_name);
@@ -446,7 +448,6 @@ more:
	if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
		dout(" marking %p complete\n", inode);
		__ceph_dir_set_complete(ci, fi->dir_release_count);
-		ci->i_max_offset = ctx->pos;
	}
	spin_unlock(&ci->i_ceph_lock);
 
@@ -454,7 +455,7 @@ more:
	return 0;
 }
 
-static void reset_readdir(struct ceph_file_info *fi)
+static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
 {
	if (fi->last_readdir) {
		ceph_mdsc_put_request(fi->last_readdir);
@@ -462,7 +463,10 @@ static void reset_readdir(struct ceph_file_info *fi)
	}
	kfree(fi->last_name);
	fi->last_name = NULL;
-	fi->next_offset = 2;  /* compensate for . and .. */
+	if (ceph_frag_is_leftmost(frag))
+		fi->next_offset = 2;  /* compensate for . and .. */
+	else
+		fi->next_offset = 0;
	if (fi->dentry) {
		dput(fi->dentry);
		fi->dentry = NULL;
@@ -474,7 +478,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
 {
	struct ceph_file_info *fi = file->private_data;
	struct inode *inode = file->f_mapping->host;
-	loff_t old_offset = offset;
+	loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
	loff_t retval;
 
	mutex_lock(&inode->i_mutex);
@@ -491,7 +495,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
		goto out;
	}
 
-	if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
+	if (offset >= 0) {
		if (offset != file->f_pos) {
			file->f_pos = offset;
			file->f_version = 0;
@@ -504,14 +508,14 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
		 * seek to new frag, or seek prior to current chunk.
		 */
		if (offset == 0 ||
-		    fpos_frag(offset) != fpos_frag(old_offset) ||
+		    fpos_frag(offset) != fi->frag ||
		    fpos_off(offset) < fi->offset) {
			dout("dir_llseek dropping %p content\n", file);
-			reset_readdir(fi);
+			reset_readdir(fi, fpos_frag(offset));
		}
 
		/* bump dir_release_count if we did a forward seek */
-		if (offset > old_offset)
+		if (fpos_cmp(offset, old_offset) > 0)
			fi->dir_release_count--;
	}
out:
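
The llseek and readdir changes above lean on the fpos helpers, which pack a directory position as (fragment, intra-frag offset). For reference, the encoding as defined in fs/ceph/super.h (reproduced here as of this merge; see the header for the authoritative copies):

static inline loff_t ceph_make_fpos(unsigned high, unsigned off)
{
	return ((loff_t)high << 32) | (loff_t)off;
}

static inline unsigned fpos_frag(loff_t p)
{
	return p >> 32;
}

static inline unsigned fpos_off(loff_t p)
{
	return p & 0xffffffff;
}
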
@@ -812,8 +816,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
-	req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
-	req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
+	req->r_old_dentry = dget(old_dentry);
	req->r_locked_dir = dir;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -911,10 +914,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
+	ihold(old_dir);
	req->r_dentry = dget(new_dentry);
	req->r_num_caps = 2;
	req->r_old_dentry = dget(old_dentry);
-	req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
+	req->r_old_dentry_dir = old_dir;
	req->r_locked_dir = new_dir;
	req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -932,14 +936,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
	 * to do it here.
	 */
 
-		/* d_move screws up d_subdirs order */
-		ceph_dir_clear_complete(new_dir);
-
		d_move(old_dentry, new_dentry);
 
		/* ensure target dentry is invalidated, despite
		   rehashing bug in vfs_rename_dir */
		ceph_invalidate_dentry_lease(new_dentry);
+
+		/* d_move screws up sibling dentries' offsets */
+		ceph_dir_clear_complete(old_dir);
+		ceph_dir_clear_complete(new_dir);
+
	}
	ceph_mdsc_put_request(req);
	return err;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 16796be53ca5..00d6af6a32ec 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -8,23 +8,6 @@
 #include "mds_client.h"
 
 /*
- * NFS export support
- *
- * NFS re-export of a ceph mount is, at present, only semireliable.
- * The basic issue is that the Ceph architectures doesn't lend itself
- * well to generating filehandles that will remain valid forever.
- *
- * So, we do our best. If you're lucky, your inode will be in the
- * client's cache. If it's not, and you have a connectable fh, then
- * the MDS server may be able to find it for you. Otherwise, you get
- * ESTALE.
- *
- * There are ways to this more reliable, but in the non-connectable fh
- * case, we won't every work perfectly, and in the connectable case,
- * some changes are needed on the MDS side to work better.
- */
-
-/*
  * Basic fh
  */
 struct ceph_nfs_fh {
@@ -32,22 +15,12 @@ struct ceph_nfs_fh {
 } __attribute__ ((packed));
 
 /*
- * Larger 'connectable' fh that includes parent ino and name hash.
- * Use this whenever possible, as it works more reliably.
+ * Larger fh that includes parent ino.
  */
 struct ceph_nfs_confh {
	u64 ino, parent_ino;
-	u32 parent_name_hash;
 } __attribute__ ((packed));
 
-/*
- * The presence of @parent_inode here tells us whether NFS wants a
- * connectable file handle. However, we want to make a connectionable
- * file handle unconditionally so that the MDS gets as much of a hint
- * as possible. That means we only use @parent_dentry to indicate
- * whether nfsd wants a connectable fh, and whether we should indicate
- * failure from a too-small @max_len.
- */
 static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
			   struct inode *parent_inode)
 {
@@ -56,54 +29,36 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
	struct ceph_nfs_confh *cfh = (void *)rawfh;
	int connected_handle_length = sizeof(*cfh)/4;
	int handle_length = sizeof(*fh)/4;
-	struct dentry *dentry;
-	struct dentry *parent;
 
	/* don't re-export snaps */
	if (ceph_snap(inode) != CEPH_NOSNAP)
		return -EINVAL;
 
-	dentry = d_find_alias(inode);
+	if (parent_inode && (*max_len < connected_handle_length)) {
+		*max_len = connected_handle_length;
+		return FILEID_INVALID;
+	} else if (*max_len < handle_length) {
+		*max_len = handle_length;
+		return FILEID_INVALID;
+	}
 
-	/* if we found an alias, generate a connectable fh */
-	if (*max_len >= connected_handle_length && dentry) {
-		dout("encode_fh %p connectable\n", dentry);
-		spin_lock(&dentry->d_lock);
-		parent = dentry->d_parent;
+	if (parent_inode) {
+		dout("encode_fh %llx with parent %llx\n",
+		     ceph_ino(inode), ceph_ino(parent_inode));
		cfh->ino = ceph_ino(inode);
-		cfh->parent_ino = ceph_ino(parent->d_inode);
-		cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode,
-							 dentry);
+		cfh->parent_ino = ceph_ino(parent_inode);
		*max_len = connected_handle_length;
-		type = 2;
-		spin_unlock(&dentry->d_lock);
-	} else if (*max_len >= handle_length) {
-		if (parent_inode) {
-			/* nfsd wants connectable */
-			*max_len = connected_handle_length;
-			type = FILEID_INVALID;
-		} else {
-			dout("encode_fh %p\n", dentry);
-			fh->ino = ceph_ino(inode);
-			*max_len = handle_length;
-			type = 1;
-		}
+		type = FILEID_INO32_GEN_PARENT;
	} else {
+		dout("encode_fh %llx\n", ceph_ino(inode));
+		fh->ino = ceph_ino(inode);
		*max_len = handle_length;
-		type = FILEID_INVALID;
+		type = FILEID_INO32_GEN;
	}
-	if (dentry)
-		dput(dentry);
	return type;
 }
 
-/*
- * convert regular fh to dentry
- *
- * FIXME: we should try harder by querying the mds for the ino.
- */
-static struct dentry *__fh_to_dentry(struct super_block *sb,
-				     struct ceph_nfs_fh *fh, int fh_len)
+static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
 {
	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
	struct inode *inode;
@@ -111,11 +66,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
111 struct ceph_vino vino; 66 struct ceph_vino vino;
112 int err; 67 int err;
113 68
114 if (fh_len < sizeof(*fh) / 4) 69 vino.ino = ino;
115 return ERR_PTR(-ESTALE);
116
117 dout("__fh_to_dentry %llx\n", fh->ino);
118 vino.ino = fh->ino;
119 vino.snap = CEPH_NOSNAP; 70 vino.snap = CEPH_NOSNAP;
120 inode = ceph_find_inode(sb, vino); 71 inode = ceph_find_inode(sb, vino);
121 if (!inode) { 72 if (!inode) {
@@ -139,139 +90,161 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
139 90
140 dentry = d_obtain_alias(inode); 91 dentry = d_obtain_alias(inode);
141 if (IS_ERR(dentry)) { 92 if (IS_ERR(dentry)) {
142 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
143 fh->ino, inode);
144 iput(inode); 93 iput(inode);
145 return dentry; 94 return dentry;
146 } 95 }
147 err = ceph_init_dentry(dentry); 96 err = ceph_init_dentry(dentry);
148 if (err < 0) { 97 if (err < 0) {
149 iput(inode); 98 dput(dentry);
150 return ERR_PTR(err); 99 return ERR_PTR(err);
151 } 100 }
152 dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry); 101 dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry);
153 return dentry; 102 return dentry;
154} 103}
155 104
156/* 105/*
157 * convert connectable fh to dentry 106 * convert regular fh to dentry
158 */ 107 */
159static struct dentry *__cfh_to_dentry(struct super_block *sb, 108static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
160 struct ceph_nfs_confh *cfh, int fh_len) 109 struct fid *fid,
110 int fh_len, int fh_type)
111{
112 struct ceph_nfs_fh *fh = (void *)fid->raw;
113
114 if (fh_type != FILEID_INO32_GEN &&
115 fh_type != FILEID_INO32_GEN_PARENT)
116 return NULL;
117 if (fh_len < sizeof(*fh) / 4)
118 return NULL;
119
120 dout("fh_to_dentry %llx\n", fh->ino);
121 return __fh_to_dentry(sb, fh->ino);
122}
123
124static struct dentry *__get_parent(struct super_block *sb,
125 struct dentry *child, u64 ino)
161{ 126{
162 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; 127 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
128 struct ceph_mds_request *req;
163 struct inode *inode; 129 struct inode *inode;
164 struct dentry *dentry; 130 struct dentry *dentry;
165 struct ceph_vino vino;
166 int err; 131 int err;
167 132
168 if (fh_len < sizeof(*cfh) / 4) 133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
169 return ERR_PTR(-ESTALE); 134 USE_ANY_MDS);
170 135 if (IS_ERR(req))
171 dout("__cfh_to_dentry %llx (%llx/%x)\n", 136 return ERR_CAST(req);
172 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
173
174 vino.ino = cfh->ino;
175 vino.snap = CEPH_NOSNAP;
176 inode = ceph_find_inode(sb, vino);
177 if (!inode) {
178 struct ceph_mds_request *req;
179
180 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
181 USE_ANY_MDS);
182 if (IS_ERR(req))
183 return ERR_CAST(req);
184 137
185 req->r_ino1 = vino; 138 if (child) {
186 req->r_ino2.ino = cfh->parent_ino; 139 req->r_inode = child->d_inode;
187 req->r_ino2.snap = CEPH_NOSNAP; 140 ihold(child->d_inode);
188 req->r_path2 = kmalloc(16, GFP_NOFS); 141 } else {
189 snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash); 142 req->r_ino1 = (struct ceph_vino) {
190 req->r_num_caps = 1; 143 .ino = ino,
191 err = ceph_mdsc_do_request(mdsc, NULL, req); 144 .snap = CEPH_NOSNAP,
192 inode = req->r_target_inode; 145 };
193 if (inode)
194 ihold(inode);
195 ceph_mdsc_put_request(req);
196 if (!inode)
197 return ERR_PTR(err ? err : -ESTALE);
198 } 146 }
147 req->r_num_caps = 1;
148 err = ceph_mdsc_do_request(mdsc, NULL, req);
149 inode = req->r_target_inode;
150 if (inode)
151 ihold(inode);
152 ceph_mdsc_put_request(req);
153 if (!inode)
154 return ERR_PTR(-ENOENT);
199 155
200 dentry = d_obtain_alias(inode); 156 dentry = d_obtain_alias(inode);
201 if (IS_ERR(dentry)) { 157 if (IS_ERR(dentry)) {
202 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
203 cfh->ino, inode);
204 iput(inode); 158 iput(inode);
205 return dentry; 159 return dentry;
206 } 160 }
207 err = ceph_init_dentry(dentry); 161 err = ceph_init_dentry(dentry);
208 if (err < 0) { 162 if (err < 0) {
209 iput(inode); 163 dput(dentry);
210 return ERR_PTR(err); 164 return ERR_PTR(err);
211 } 165 }
212 dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry); 166 dout("__get_parent ino %llx parent %p ino %llx.%llx\n",
167 child ? ceph_ino(child->d_inode) : ino,
168 dentry, ceph_vinop(inode));
213 return dentry; 169 return dentry;
214} 170}
215 171
216static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid, 172struct dentry *ceph_get_parent(struct dentry *child)
217 int fh_len, int fh_type)
218{ 173{
219 if (fh_type == 1) 174 /* don't re-export snaps */
220 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw, 175 if (ceph_snap(child->d_inode) != CEPH_NOSNAP)
221 fh_len); 176 return ERR_PTR(-EINVAL);
222 else 177
223 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw, 178 dout("get_parent %p ino %llx.%llx\n",
224 fh_len); 179 child, ceph_vinop(child->d_inode));
180 return __get_parent(child->d_sb, child, 0);
225} 181}
226 182
227/* 183/*
228 * get parent, if possible. 184 * convert regular fh to parent
229 *
230 * FIXME: we could do better by querying the mds to discover the
231 * parent.
232 */ 185 */
233static struct dentry *ceph_fh_to_parent(struct super_block *sb, 186static struct dentry *ceph_fh_to_parent(struct super_block *sb,
234 struct fid *fid, 187 struct fid *fid,
235 int fh_len, int fh_type) 188 int fh_len, int fh_type)
236{ 189{
237 struct ceph_nfs_confh *cfh = (void *)fid->raw; 190 struct ceph_nfs_confh *cfh = (void *)fid->raw;
238 struct ceph_vino vino;
239 struct inode *inode;
240 struct dentry *dentry; 191 struct dentry *dentry;
241 int err;
242 192
243 if (fh_type == 1) 193 if (fh_type != FILEID_INO32_GEN_PARENT)
244 return ERR_PTR(-ESTALE); 194 return NULL;
245 if (fh_len < sizeof(*cfh) / 4) 195 if (fh_len < sizeof(*cfh) / 4)
246 return ERR_PTR(-ESTALE); 196 return NULL;
247 197
248 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino, 198 dout("fh_to_parent %llx\n", cfh->parent_ino);
249 cfh->parent_name_hash); 199 dentry = __get_parent(sb, NULL, cfh->ino);
200 if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT)
201 dentry = __fh_to_dentry(sb, cfh->parent_ino);
202 return dentry;
203}
250 204
251 vino.ino = cfh->ino; 205static int ceph_get_name(struct dentry *parent, char *name,
252 vino.snap = CEPH_NOSNAP; 206 struct dentry *child)
253 inode = ceph_find_inode(sb, vino); 207{
254 if (!inode) 208 struct ceph_mds_client *mdsc;
255 return ERR_PTR(-ESTALE); 209 struct ceph_mds_request *req;
210 int err;
256 211
257 dentry = d_obtain_alias(inode); 212 mdsc = ceph_inode_to_client(child->d_inode)->mdsc;
258 if (IS_ERR(dentry)) { 213 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
259 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", 214 USE_ANY_MDS);
260 cfh->ino, inode); 215 if (IS_ERR(req))
261 iput(inode); 216 return PTR_ERR(req);
262 return dentry; 217
263 } 218 mutex_lock(&parent->d_inode->i_mutex);
264 err = ceph_init_dentry(dentry); 219
265 if (err < 0) { 220 req->r_inode = child->d_inode;
266 iput(inode); 221 ihold(child->d_inode);
267 return ERR_PTR(err); 222 req->r_ino2 = ceph_vino(parent->d_inode);
223 req->r_locked_dir = parent->d_inode;
224 req->r_num_caps = 2;
225 err = ceph_mdsc_do_request(mdsc, NULL, req);
226
227 mutex_unlock(&parent->d_inode->i_mutex);
228
229 if (!err) {
230 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
231 memcpy(name, rinfo->dname, rinfo->dname_len);
232 name[rinfo->dname_len] = 0;
233 dout("get_name %p ino %llx.%llx name %s\n",
234 child, ceph_vinop(child->d_inode), name);
235 } else {
236 dout("get_name %p ino %llx.%llx err %d\n",
237 child, ceph_vinop(child->d_inode), err);
268 } 238 }
269 dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry); 239
270 return dentry; 240 ceph_mdsc_put_request(req);
241 return err;
271} 242}
272 243
273const struct export_operations ceph_export_ops = { 244const struct export_operations ceph_export_ops = {
274 .encode_fh = ceph_encode_fh, 245 .encode_fh = ceph_encode_fh,
275 .fh_to_dentry = ceph_fh_to_dentry, 246 .fh_to_dentry = ceph_fh_to_dentry,
276 .fh_to_parent = ceph_fh_to_parent, 247 .fh_to_parent = ceph_fh_to_parent,
248 .get_parent = ceph_get_parent,
249 .get_name = ceph_get_name,
277}; 250};
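With the name hash dropped, both handle layouts are fixed-size, and the generic FILEID_INO32_GEN / FILEID_INO32_GEN_PARENT type codes (values 1 and 2 in include/linux/exportfs.h) replace the bare constants the old code returned. A stand-alone user-space sketch of how a consumer of these handles could decode them (struct layouts copied from the file above):

	#include <stdint.h>
	#include <stdio.h>

	struct ceph_nfs_fh { uint64_t ino; } __attribute__((packed));
	struct ceph_nfs_confh { uint64_t ino, parent_ino; } __attribute__((packed));

	#define FILEID_INO32_GEN        1   /* from include/linux/exportfs.h */
	#define FILEID_INO32_GEN_PARENT 2

	static void dump_fh(int fh_type, const void *raw, int fh_len)
	{
		/* fh_len is in 32-bit words, matching the kernel convention */
		if (fh_type == FILEID_INO32_GEN_PARENT &&
		    fh_len >= (int)(sizeof(struct ceph_nfs_confh) / 4)) {
			const struct ceph_nfs_confh *cfh = raw;
			printf("ino %llx parent %llx\n",
			       (unsigned long long)cfh->ino,
			       (unsigned long long)cfh->parent_ino);
		} else if (fh_type == FILEID_INO32_GEN &&
			   fh_len >= (int)(sizeof(struct ceph_nfs_fh) / 4)) {
			const struct ceph_nfs_fh *fh = raw;
			printf("ino %llx\n", (unsigned long long)fh->ino);
		}
	}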
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 09c7afe32e49..88a6df4cbe6d 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -210,7 +210,7 @@ int ceph_open(struct inode *inode, struct file *file)
210 ihold(inode); 210 ihold(inode);
211 211
212 req->r_num_caps = 1; 212 req->r_num_caps = 1;
213 if (flags & (O_CREAT|O_TRUNC)) 213 if (flags & O_CREAT)
214 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); 214 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
215 err = ceph_mdsc_do_request(mdsc, parent_inode, req); 215 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
216 iput(parent_inode); 216 iput(parent_inode);
@@ -291,8 +291,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
291 } 291 }
292 err = finish_open(file, dentry, ceph_open, opened); 292 err = finish_open(file, dentry, ceph_open, opened);
293 } 293 }
294
295out_err: 294out_err:
295 if (!req->r_err && req->r_target_inode)
296 ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
296 ceph_mdsc_put_request(req); 297 ceph_mdsc_put_request(req);
297 dout("atomic_open result=%d\n", err); 298 dout("atomic_open result=%d\n", err);
298 return err; 299 return err;
@@ -600,7 +601,7 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov,
600 false); 601 false);
601 if (IS_ERR(req)) { 602 if (IS_ERR(req)) {
602 ret = PTR_ERR(req); 603 ret = PTR_ERR(req);
603 goto out; 604 break;
604 } 605 }
605 606
606 num_pages = calc_pages_for(page_align, len); 607 num_pages = calc_pages_for(page_align, len);
@@ -718,7 +719,7 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov,
718 false); 719 false);
719 if (IS_ERR(req)) { 720 if (IS_ERR(req)) {
720 ret = PTR_ERR(req); 721 ret = PTR_ERR(req);
721 goto out; 722 break;
722 } 723 }
723 724
724 /* 725 /*
@@ -970,6 +971,8 @@ retry_snap:
970 goto retry_snap; 971 goto retry_snap;
971 } 972 }
972 } else { 973 } else {
974 loff_t old_size = inode->i_size;
975 struct iov_iter from;
973 /* 976 /*
974 * No need to acquire the i_truncate_mutex. Because 977 * No need to acquire the i_truncate_mutex. Because
975 * the MDS revokes Fwb caps before sending truncate 978 * the MDS revokes Fwb caps before sending truncate
@@ -977,9 +980,12 @@ retry_snap:
977 * are pending vmtruncate. So write and vmtruncate 980 * are pending vmtruncate. So write and vmtruncate
978 * can not run at the same time 981 * can not run at the same time
979 */ 982 */
980 written = generic_file_buffered_write(iocb, iov, nr_segs, 983 iov_iter_init(&from, iov, nr_segs, count, 0);
981 pos, &iocb->ki_pos, 984 written = generic_perform_write(file, &from, pos);
982 count, 0); 985 if (likely(written >= 0))
986 iocb->ki_pos = pos + written;
987 if (inode->i_size > old_size)
988 ceph_fscache_update_objectsize(inode);
983 mutex_unlock(&inode->i_mutex); 989 mutex_unlock(&inode->i_mutex);
984 } 990 }
985 991
@@ -1215,9 +1221,6 @@ static long ceph_fallocate(struct file *file, int mode,
1215 if (!S_ISREG(inode->i_mode)) 1221 if (!S_ISREG(inode->i_mode))
1216 return -EOPNOTSUPP; 1222 return -EOPNOTSUPP;
1217 1223
1218 if (IS_SWAPFILE(inode))
1219 return -ETXTBSY;
1220
1221 mutex_lock(&inode->i_mutex); 1224 mutex_lock(&inode->i_mutex);
1222 1225
1223 if (ceph_snap(inode) != CEPH_NOSNAP) { 1226 if (ceph_snap(inode) != CEPH_NOSNAP) {
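The buffered-write branch above switches from generic_file_buffered_write() to an explicit iov_iter plus generic_perform_write(), which no longer advances ki_pos on the caller's behalf. Reassembled from the hunk (a sketch of the new else-branch only):

	loff_t old_size = inode->i_size;
	struct iov_iter from;

	iov_iter_init(&from, iov, nr_segs, count, 0);       /* wrap the iovecs */
	written = generic_perform_write(file, &from, pos);  /* does not move ki_pos */
	if (likely(written >= 0))
		iocb->ki_pos = pos + written;               /* advance it ourselves */
	if (inode->i_size > old_size)                       /* file grew: tell fscache */
		ceph_fscache_update_objectsize(inode);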
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 32d519d8a2e2..233c6f96910a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -659,14 +659,6 @@ static int fill_inode(struct inode *inode,
659 le32_to_cpu(info->time_warp_seq), 659 le32_to_cpu(info->time_warp_seq),
660 &ctime, &mtime, &atime); 660 &ctime, &mtime, &atime);
661 661
662 /* only update max_size on auth cap */
663 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
664 ci->i_max_size != le64_to_cpu(info->max_size)) {
665 dout("max_size %lld -> %llu\n", ci->i_max_size,
666 le64_to_cpu(info->max_size));
667 ci->i_max_size = le64_to_cpu(info->max_size);
668 }
669
670 ci->i_layout = info->layout; 662 ci->i_layout = info->layout;
671 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 663 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
672 664
@@ -752,9 +744,16 @@ static int fill_inode(struct inode *inode,
752 !__ceph_dir_is_complete(ci)) { 744 !__ceph_dir_is_complete(ci)) {
753 dout(" marking %p complete (empty)\n", inode); 745 dout(" marking %p complete (empty)\n", inode);
754 __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); 746 __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count));
755 ci->i_max_offset = 2;
756 } 747 }
757no_change: 748no_change:
749 /* only update max_size on auth cap */
750 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
751 ci->i_max_size != le64_to_cpu(info->max_size)) {
752 dout("max_size %lld -> %llu\n", ci->i_max_size,
753 le64_to_cpu(info->max_size));
754 ci->i_max_size = le64_to_cpu(info->max_size);
755 }
756
758 spin_unlock(&ci->i_ceph_lock); 757 spin_unlock(&ci->i_ceph_lock);
759 758
760 /* queue truncate if we saw i_size decrease */ 759 /* queue truncate if we saw i_size decrease */
@@ -890,41 +889,6 @@ out_unlock:
890} 889}
891 890
892/* 891/*
893 * Set dentry's directory position based on the current dir's max, and
894 * order it in d_subdirs, so that dcache_readdir behaves.
895 *
896 * Always called under directory's i_mutex.
897 */
898static void ceph_set_dentry_offset(struct dentry *dn)
899{
900 struct dentry *dir = dn->d_parent;
901 struct inode *inode = dir->d_inode;
902 struct ceph_inode_info *ci;
903 struct ceph_dentry_info *di;
904
905 BUG_ON(!inode);
906
907 ci = ceph_inode(inode);
908 di = ceph_dentry(dn);
909
910 spin_lock(&ci->i_ceph_lock);
911 if (!__ceph_dir_is_complete(ci)) {
912 spin_unlock(&ci->i_ceph_lock);
913 return;
914 }
915 di->offset = ceph_inode(inode)->i_max_offset++;
916 spin_unlock(&ci->i_ceph_lock);
917
918 spin_lock(&dir->d_lock);
919 spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
920 list_move(&dn->d_u.d_child, &dir->d_subdirs);
921 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
922 dn->d_u.d_child.prev, dn->d_u.d_child.next);
923 spin_unlock(&dn->d_lock);
924 spin_unlock(&dir->d_lock);
925}
926
927/*
928 * splice a dentry to an inode. 892 * splice a dentry to an inode.
929 * caller must hold directory i_mutex for this to be safe. 893 * caller must hold directory i_mutex for this to be safe.
930 * 894 *
@@ -933,7 +897,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
933 * the caller) if we fail. 897 * the caller) if we fail.
934 */ 898 */
935static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, 899static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
936 bool *prehash, bool set_offset) 900 bool *prehash)
937{ 901{
938 struct dentry *realdn; 902 struct dentry *realdn;
939 903
@@ -965,8 +929,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
965 } 929 }
966 if ((!prehash || *prehash) && d_unhashed(dn)) 930 if ((!prehash || *prehash) && d_unhashed(dn))
967 d_rehash(dn); 931 d_rehash(dn);
968 if (set_offset)
969 ceph_set_dentry_offset(dn);
970out: 932out:
971 return dn; 933 return dn;
972} 934}
@@ -987,7 +949,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
987{ 949{
988 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 950 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
989 struct inode *in = NULL; 951 struct inode *in = NULL;
990 struct ceph_mds_reply_inode *ininfo;
991 struct ceph_vino vino; 952 struct ceph_vino vino;
992 struct ceph_fs_client *fsc = ceph_sb_to_client(sb); 953 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
993 int err = 0; 954 int err = 0;
@@ -1044,10 +1005,59 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1044 session, req->r_request_started, -1, 1005 session, req->r_request_started, -1,
1045 &req->r_caps_reservation); 1006 &req->r_caps_reservation);
1046 if (err < 0) 1007 if (err < 0)
1047 return err; 1008 goto done;
1048 } else { 1009 } else {
1049 WARN_ON_ONCE(1); 1010 WARN_ON_ONCE(1);
1050 } 1011 }
1012
1013 if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) {
1014 struct qstr dname;
1015 struct dentry *dn, *parent;
1016
1017 BUG_ON(!rinfo->head->is_target);
1018 BUG_ON(req->r_dentry);
1019
1020 parent = d_find_any_alias(dir);
1021 BUG_ON(!parent);
1022
1023 dname.name = rinfo->dname;
1024 dname.len = rinfo->dname_len;
1025 dname.hash = full_name_hash(dname.name, dname.len);
1026 vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1027 vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1028retry_lookup:
1029 dn = d_lookup(parent, &dname);
1030 dout("d_lookup on parent=%p name=%.*s got %p\n",
1031 parent, dname.len, dname.name, dn);
1032
1033 if (!dn) {
1034 dn = d_alloc(parent, &dname);
1035 dout("d_alloc %p '%.*s' = %p\n", parent,
1036 dname.len, dname.name, dn);
1037 if (dn == NULL) {
1038 dput(parent);
1039 err = -ENOMEM;
1040 goto done;
1041 }
1042 err = ceph_init_dentry(dn);
1043 if (err < 0) {
1044 dput(dn);
1045 dput(parent);
1046 goto done;
1047 }
1048 } else if (dn->d_inode &&
1049 (ceph_ino(dn->d_inode) != vino.ino ||
1050 ceph_snap(dn->d_inode) != vino.snap)) {
1051 dout(" dn %p points to wrong inode %p\n",
1052 dn, dn->d_inode);
1053 d_delete(dn);
1054 dput(dn);
1055 goto retry_lookup;
1056 }
1057
1058 req->r_dentry = dn;
1059 dput(parent);
1060 }
1051 } 1061 }
1052 1062
1053 if (rinfo->head->is_target) { 1063 if (rinfo->head->is_target) {
@@ -1063,7 +1073,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1063 1073
1064 err = fill_inode(in, &rinfo->targeti, NULL, 1074 err = fill_inode(in, &rinfo->targeti, NULL,
1065 session, req->r_request_started, 1075 session, req->r_request_started,
1066 (le32_to_cpu(rinfo->head->result) == 0) ? 1076 (!req->r_aborted && rinfo->head->result == 0) ?
1067 req->r_fmode : -1, 1077 req->r_fmode : -1,
1068 &req->r_caps_reservation); 1078 &req->r_caps_reservation);
1069 if (err < 0) { 1079 if (err < 0) {
@@ -1112,6 +1122,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1112 1122
1113 /* rename? */ 1123 /* rename? */
1114 if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { 1124 if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
1125 struct inode *olddir = req->r_old_dentry_dir;
1126 BUG_ON(!olddir);
1127
1115 dout(" src %p '%.*s' dst %p '%.*s'\n", 1128 dout(" src %p '%.*s' dst %p '%.*s'\n",
1116 req->r_old_dentry, 1129 req->r_old_dentry,
1117 req->r_old_dentry->d_name.len, 1130 req->r_old_dentry->d_name.len,
@@ -1131,13 +1144,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1131 rehashing bug in vfs_rename_dir */ 1144 rehashing bug in vfs_rename_dir */
1132 ceph_invalidate_dentry_lease(dn); 1145 ceph_invalidate_dentry_lease(dn);
1133 1146
1134 /* 1147 /* d_move screws up sibling dentries' offsets */
1135 * d_move() puts the renamed dentry at the end of 1148 ceph_dir_clear_complete(dir);
1136 * d_subdirs. We need to assign it an appropriate 1149 ceph_dir_clear_complete(olddir);
1137 * directory offset so we can behave when dir is 1150
1138 * complete.
1139 */
1140 ceph_set_dentry_offset(req->r_old_dentry);
1141 dout("dn %p gets new offset %lld\n", req->r_old_dentry, 1151 dout("dn %p gets new offset %lld\n", req->r_old_dentry,
1142 ceph_dentry(req->r_old_dentry)->offset); 1152 ceph_dentry(req->r_old_dentry)->offset);
1143 1153
@@ -1164,8 +1174,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1164 1174
1165 /* attach proper inode */ 1175 /* attach proper inode */
1166 if (!dn->d_inode) { 1176 if (!dn->d_inode) {
1177 ceph_dir_clear_complete(dir);
1167 ihold(in); 1178 ihold(in);
1168 dn = splice_dentry(dn, in, &have_lease, true); 1179 dn = splice_dentry(dn, in, &have_lease);
1169 if (IS_ERR(dn)) { 1180 if (IS_ERR(dn)) {
1170 err = PTR_ERR(dn); 1181 err = PTR_ERR(dn);
1171 goto done; 1182 goto done;
@@ -1186,17 +1197,16 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1186 (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || 1197 (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
1187 req->r_op == CEPH_MDS_OP_MKSNAP)) { 1198 req->r_op == CEPH_MDS_OP_MKSNAP)) {
1188 struct dentry *dn = req->r_dentry; 1199 struct dentry *dn = req->r_dentry;
1200 struct inode *dir = req->r_locked_dir;
1189 1201
1190 /* fill out a snapdir LOOKUPSNAP dentry */ 1202 /* fill out a snapdir LOOKUPSNAP dentry */
1191 BUG_ON(!dn); 1203 BUG_ON(!dn);
1192 BUG_ON(!req->r_locked_dir); 1204 BUG_ON(!dir);
1193 BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR); 1205 BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
1194 ininfo = rinfo->targeti.in;
1195 vino.ino = le64_to_cpu(ininfo->ino);
1196 vino.snap = le64_to_cpu(ininfo->snapid);
1197 dout(" linking snapped dir %p to dn %p\n", in, dn); 1206 dout(" linking snapped dir %p to dn %p\n", in, dn);
1207 ceph_dir_clear_complete(dir);
1198 ihold(in); 1208 ihold(in);
1199 dn = splice_dentry(dn, in, NULL, true); 1209 dn = splice_dentry(dn, in, NULL);
1200 if (IS_ERR(dn)) { 1210 if (IS_ERR(dn)) {
1201 err = PTR_ERR(dn); 1211 err = PTR_ERR(dn);
1202 goto done; 1212 goto done;
@@ -1358,7 +1368,7 @@ retry_lookup:
1358 } 1368 }
1359 1369
1360 if (!dn->d_inode) { 1370 if (!dn->d_inode) {
1361 dn = splice_dentry(dn, in, NULL, false); 1371 dn = splice_dentry(dn, in, NULL);
1362 if (IS_ERR(dn)) { 1372 if (IS_ERR(dn)) {
1363 err = PTR_ERR(dn); 1373 err = PTR_ERR(dn);
1364 dn = NULL; 1374 dn = NULL;
@@ -1616,8 +1626,6 @@ static const struct inode_operations ceph_symlink_iops = {
1616 .getxattr = ceph_getxattr, 1626 .getxattr = ceph_getxattr,
1617 .listxattr = ceph_listxattr, 1627 .listxattr = ceph_listxattr,
1618 .removexattr = ceph_removexattr, 1628 .removexattr = ceph_removexattr,
1619 .get_acl = ceph_get_acl,
1620 .set_acl = ceph_set_acl,
1621}; 1629};
1622 1630
1623/* 1631/*
@@ -1627,7 +1635,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1627{ 1635{
1628 struct inode *inode = dentry->d_inode; 1636 struct inode *inode = dentry->d_inode;
1629 struct ceph_inode_info *ci = ceph_inode(inode); 1637 struct ceph_inode_info *ci = ceph_inode(inode);
1630 struct inode *parent_inode;
1631 const unsigned int ia_valid = attr->ia_valid; 1638 const unsigned int ia_valid = attr->ia_valid;
1632 struct ceph_mds_request *req; 1639 struct ceph_mds_request *req;
1633 struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; 1640 struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
@@ -1819,9 +1826,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1819 req->r_inode_drop = release; 1826 req->r_inode_drop = release;
1820 req->r_args.setattr.mask = cpu_to_le32(mask); 1827 req->r_args.setattr.mask = cpu_to_le32(mask);
1821 req->r_num_caps = 1; 1828 req->r_num_caps = 1;
1822 parent_inode = ceph_get_dentry_parent_inode(dentry); 1829 err = ceph_mdsc_do_request(mdsc, NULL, req);
1823 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
1824 iput(parent_inode);
1825 } 1830 }
1826 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, 1831 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
1827 ceph_cap_string(dirtied), mask); 1832 ceph_cap_string(dirtied), mask);
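With ceph_set_dentry_offset() removed, inode.c no longer tries to hand out a fresh readdir offset when a dentry gains an inode; instead the parent directory simply stops being "complete". The attach path after this patch follows this shape (a sketch; error handling as in the hunk):

	if (!dn->d_inode) {
		ceph_dir_clear_complete(dir);   /* new entry invalidates cached offsets */
		ihold(in);
		dn = splice_dentry(dn, in, &have_lease);  /* set_offset argument is gone */
		if (IS_ERR(dn)) {
			err = PTR_ERR(dn);
			goto done;
		}
	}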
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index dc66c9e023e4..a822a6e58290 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,9 +1,8 @@
1#include <linux/ceph/ceph_debug.h>
1#include <linux/in.h> 2#include <linux/in.h>
2 3
3#include "super.h" 4#include "super.h"
4#include "mds_client.h" 5#include "mds_client.h"
5#include <linux/ceph/ceph_debug.h>
6
7#include "ioctl.h" 6#include "ioctl.h"
8 7
9 8
@@ -64,7 +63,6 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
64static long ceph_ioctl_set_layout(struct file *file, void __user *arg) 63static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
65{ 64{
66 struct inode *inode = file_inode(file); 65 struct inode *inode = file_inode(file);
67 struct inode *parent_inode;
68 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 66 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
69 struct ceph_mds_request *req; 67 struct ceph_mds_request *req;
70 struct ceph_ioctl_layout l; 68 struct ceph_ioctl_layout l;
@@ -111,6 +109,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
111 return PTR_ERR(req); 109 return PTR_ERR(req);
112 req->r_inode = inode; 110 req->r_inode = inode;
113 ihold(inode); 111 ihold(inode);
112 req->r_num_caps = 1;
113
114 req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; 114 req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
115 115
116 req->r_args.setlayout.layout.fl_stripe_unit = 116 req->r_args.setlayout.layout.fl_stripe_unit =
@@ -121,9 +121,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
121 cpu_to_le32(l.object_size); 121 cpu_to_le32(l.object_size);
122 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool); 122 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
123 123
124 parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); 124 err = ceph_mdsc_do_request(mdsc, NULL, req);
125 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
126 iput(parent_inode);
127 ceph_mdsc_put_request(req); 125 ceph_mdsc_put_request(req);
128 return err; 126 return err;
129} 127}
@@ -157,6 +155,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
157 return PTR_ERR(req); 155 return PTR_ERR(req);
158 req->r_inode = inode; 156 req->r_inode = inode;
159 ihold(inode); 157 ihold(inode);
158 req->r_num_caps = 1;
160 159
161 req->r_args.setlayout.layout.fl_stripe_unit = 160 req->r_args.setlayout.layout.fl_stripe_unit =
162 cpu_to_le32(l.stripe_unit); 161 cpu_to_le32(l.stripe_unit);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ae6d14e82b0f..191398852a2e 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -2,11 +2,31 @@
2 2
3#include <linux/file.h> 3#include <linux/file.h>
4#include <linux/namei.h> 4#include <linux/namei.h>
5#include <linux/random.h>
5 6
6#include "super.h" 7#include "super.h"
7#include "mds_client.h" 8#include "mds_client.h"
8#include <linux/ceph/pagelist.h> 9#include <linux/ceph/pagelist.h>
9 10
11static u64 lock_secret;
12
13static inline u64 secure_addr(void *addr)
14{
15 u64 v = lock_secret ^ (u64)(unsigned long)addr;
16 /*
17 * Set the most significant bit, so that MDS knows the 'owner'
18 * is sufficient to identify the owner of a lock. (old code uses
19 * both 'owner' and 'pid')
20 */
21 v |= (1ULL << 63);
22 return v;
23}
24
25void __init ceph_flock_init(void)
26{
27 get_random_bytes(&lock_secret, sizeof(lock_secret));
28}
29
10/** 30/**
11 * Implement fcntl and flock locking functions. 31 * Implement fcntl and flock locking functions.
12 */ 32 */
@@ -14,17 +34,18 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
14 int cmd, u8 wait, struct file_lock *fl) 34 int cmd, u8 wait, struct file_lock *fl)
15{ 35{
16 struct inode *inode = file_inode(file); 36 struct inode *inode = file_inode(file);
17 struct ceph_mds_client *mdsc = 37 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
18 ceph_sb_to_client(inode->i_sb)->mdsc;
19 struct ceph_mds_request *req; 38 struct ceph_mds_request *req;
20 int err; 39 int err;
21 u64 length = 0; 40 u64 length = 0;
41 u64 owner;
22 42
23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); 43 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
24 if (IS_ERR(req)) 44 if (IS_ERR(req))
25 return PTR_ERR(req); 45 return PTR_ERR(req);
26 req->r_inode = inode; 46 req->r_inode = inode;
27 ihold(inode); 47 ihold(inode);
48 req->r_num_caps = 1;
28 49
29 /* mds requires start and length rather than start and end */ 50 /* mds requires start and length rather than start and end */
30 if (LLONG_MAX == fl->fl_end) 51 if (LLONG_MAX == fl->fl_end)
@@ -32,25 +53,27 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
32 else 53 else
33 length = fl->fl_end - fl->fl_start + 1; 54 length = fl->fl_end - fl->fl_start + 1;
34 55
35 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " 56 if (lock_type == CEPH_LOCK_FCNTL)
36 "length: %llu, wait: %d, type: %d", (int)lock_type, 57 owner = secure_addr(fl->fl_owner);
37 (int)operation, (u64)fl->fl_pid, fl->fl_start, 58 else
38 length, wait, fl->fl_type); 59 owner = secure_addr(fl->fl_file);
60
61 dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
62 "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type,
63 (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
64 wait, fl->fl_type);
39 65
40 req->r_args.filelock_change.rule = lock_type; 66 req->r_args.filelock_change.rule = lock_type;
41 req->r_args.filelock_change.type = cmd; 67 req->r_args.filelock_change.type = cmd;
68 req->r_args.filelock_change.owner = cpu_to_le64(owner);
42 req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); 69 req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
43 /* This should be adjusted, but I'm not sure if
44 namespaces actually get id numbers*/
45 req->r_args.filelock_change.pid_namespace =
46 cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
47 req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); 70 req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
48 req->r_args.filelock_change.length = cpu_to_le64(length); 71 req->r_args.filelock_change.length = cpu_to_le64(length);
49 req->r_args.filelock_change.wait = wait; 72 req->r_args.filelock_change.wait = wait;
50 73
51 err = ceph_mdsc_do_request(mdsc, inode, req); 74 err = ceph_mdsc_do_request(mdsc, inode, req);
52 75
53 if ( operation == CEPH_MDS_OP_GETFILELOCK){ 76 if (operation == CEPH_MDS_OP_GETFILELOCK) {
54 fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); 77 fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
55 if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) 78 if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
56 fl->fl_type = F_RDLCK; 79 fl->fl_type = F_RDLCK;
@@ -87,14 +110,19 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
87 u8 wait = 0; 110 u8 wait = 0;
88 u16 op = CEPH_MDS_OP_SETFILELOCK; 111 u16 op = CEPH_MDS_OP_SETFILELOCK;
89 112
90 fl->fl_nspid = get_pid(task_tgid(current)); 113 if (!(fl->fl_flags & FL_POSIX))
91 dout("ceph_lock, fl_pid:%d", fl->fl_pid); 114 return -ENOLCK;
115 /* No mandatory locks */
116 if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
117 return -ENOLCK;
118
119 dout("ceph_lock, fl_owner: %p", fl->fl_owner);
92 120
93 /* set wait bit as appropriate, then build the command as Ceph expects it */ 121 /* set wait bit as appropriate, then build the command as Ceph expects it */
94 if (F_SETLKW == cmd) 122 if (IS_GETLK(cmd))
95 wait = 1;
96 if (F_GETLK == cmd)
97 op = CEPH_MDS_OP_GETFILELOCK; 123 op = CEPH_MDS_OP_GETFILELOCK;
124 else if (IS_SETLKW(cmd))
125 wait = 1;
98 126
99 if (F_RDLCK == fl->fl_type) 127 if (F_RDLCK == fl->fl_type)
100 lock_cmd = CEPH_LOCK_SHARED; 128 lock_cmd = CEPH_LOCK_SHARED;
@@ -105,7 +133,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
105 133
106 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); 134 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
107 if (!err) { 135 if (!err) {
108 if ( op != CEPH_MDS_OP_GETFILELOCK ){ 136 if (op != CEPH_MDS_OP_GETFILELOCK) {
109 dout("mds locked, locking locally"); 137 dout("mds locked, locking locally");
110 err = posix_lock_file(file, fl, NULL); 138 err = posix_lock_file(file, fl, NULL);
111 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { 139 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
@@ -131,20 +159,22 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
131{ 159{
132 u8 lock_cmd; 160 u8 lock_cmd;
133 int err; 161 int err;
134 u8 wait = 1; 162 u8 wait = 0;
135 163
136 fl->fl_nspid = get_pid(task_tgid(current)); 164 if (!(fl->fl_flags & FL_FLOCK))
137 dout("ceph_flock, fl_pid:%d", fl->fl_pid); 165 return -ENOLCK;
138 166 /* No mandatory locks */
139 /* set wait bit, then clear it out of cmd*/ 167 if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
140 if (cmd & LOCK_NB) 168 return -ENOLCK;
141 wait = 0; 169
142 cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN); 170 dout("ceph_flock, fl_file: %p", fl->fl_file);
143 /* set command sequence that Ceph wants to see: 171
144 shared lock, exclusive lock, or unlock */ 172 if (IS_SETLKW(cmd))
145 if (LOCK_SH == cmd) 173 wait = 1;
174
175 if (F_RDLCK == fl->fl_type)
146 lock_cmd = CEPH_LOCK_SHARED; 176 lock_cmd = CEPH_LOCK_SHARED;
147 else if (LOCK_EX == cmd) 177 else if (F_WRLCK == fl->fl_type)
148 lock_cmd = CEPH_LOCK_EXCL; 178 lock_cmd = CEPH_LOCK_EXCL;
149 else 179 else
150 lock_cmd = CEPH_LOCK_UNLOCK; 180 lock_cmd = CEPH_LOCK_UNLOCK;
@@ -280,13 +310,14 @@ int lock_to_ceph_filelock(struct file_lock *lock,
280 struct ceph_filelock *cephlock) 310 struct ceph_filelock *cephlock)
281{ 311{
282 int err = 0; 312 int err = 0;
283
284 cephlock->start = cpu_to_le64(lock->fl_start); 313 cephlock->start = cpu_to_le64(lock->fl_start);
285 cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); 314 cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
286 cephlock->client = cpu_to_le64(0); 315 cephlock->client = cpu_to_le64(0);
287 cephlock->pid = cpu_to_le64(lock->fl_pid); 316 cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
288 cephlock->pid_namespace = 317 if (lock->fl_flags & FL_POSIX)
289 cpu_to_le64((u64)(unsigned long)lock->fl_nspid); 318 cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
319 else
320 cephlock->owner = cpu_to_le64(secure_addr(lock->fl_file));
290 321
291 switch (lock->fl_type) { 322 switch (lock->fl_type) {
292 case F_RDLCK: 323 case F_RDLCK:
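The owner token introduced above is the lock owner's kernel address XORed with a boot-time secret, with bit 63 forced on so the MDS can distinguish new-style owners from old owner-plus-pid clients. A user-space model of the scheme (in the kernel, lock_secret is filled by get_random_bytes() in ceph_flock_init(); here any 64-bit random source would do):

	#include <stdint.h>

	static uint64_t lock_secret;    /* filled once with random bytes at init */

	static uint64_t secure_addr(void *addr)
	{
		uint64_t v = lock_secret ^ (uint64_t)(uintptr_t)addr;
		/* bit 63 set => 'owner' alone identifies the lock holder */
		return v | (1ULL << 63);
	}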
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index f4f050a69a48..2b4d093d0563 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3,6 +3,7 @@
3#include <linux/fs.h> 3#include <linux/fs.h>
4#include <linux/wait.h> 4#include <linux/wait.h>
5#include <linux/slab.h> 5#include <linux/slab.h>
6#include <linux/gfp.h>
6#include <linux/sched.h> 7#include <linux/sched.h>
7#include <linux/debugfs.h> 8#include <linux/debugfs.h>
8#include <linux/seq_file.h> 9#include <linux/seq_file.h>
@@ -165,21 +166,18 @@ static int parse_reply_info_dir(void **p, void *end,
165 if (num == 0) 166 if (num == 0)
166 goto done; 167 goto done;
167 168
168 /* alloc large array */ 169 BUG_ON(!info->dir_in);
169 info->dir_nr = num;
170 info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
171 sizeof(*info->dir_dname) +
172 sizeof(*info->dir_dname_len) +
173 sizeof(*info->dir_dlease),
174 GFP_NOFS);
175 if (info->dir_in == NULL) {
176 err = -ENOMEM;
177 goto out_bad;
178 }
179 info->dir_dname = (void *)(info->dir_in + num); 170 info->dir_dname = (void *)(info->dir_in + num);
180 info->dir_dname_len = (void *)(info->dir_dname + num); 171 info->dir_dname_len = (void *)(info->dir_dname + num);
181 info->dir_dlease = (void *)(info->dir_dname_len + num); 172 info->dir_dlease = (void *)(info->dir_dname_len + num);
173 if ((unsigned long)(info->dir_dlease + num) >
174 (unsigned long)info->dir_in + info->dir_buf_size) {
175 pr_err("dir contents are larger than expected\n");
176 WARN_ON(1);
177 goto bad;
178 }
182 179
180 info->dir_nr = num;
183 while (num) { 181 while (num) {
184 /* dentry */ 182 /* dentry */
185 ceph_decode_need(p, end, sizeof(u32)*2, bad); 183 ceph_decode_need(p, end, sizeof(u32)*2, bad);
@@ -327,7 +325,9 @@ out_bad:
327 325
328static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) 326static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
329{ 327{
330 kfree(info->dir_in); 328 if (!info->dir_in)
329 return;
330 free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size));
331} 331}
332 332
333 333
@@ -512,12 +512,11 @@ void ceph_mdsc_release_request(struct kref *kref)
512 struct ceph_mds_request *req = container_of(kref, 512 struct ceph_mds_request *req = container_of(kref,
513 struct ceph_mds_request, 513 struct ceph_mds_request,
514 r_kref); 514 r_kref);
515 destroy_reply_info(&req->r_reply_info);
515 if (req->r_request) 516 if (req->r_request)
516 ceph_msg_put(req->r_request); 517 ceph_msg_put(req->r_request);
517 if (req->r_reply) { 518 if (req->r_reply)
518 ceph_msg_put(req->r_reply); 519 ceph_msg_put(req->r_reply);
519 destroy_reply_info(&req->r_reply_info);
520 }
521 if (req->r_inode) { 520 if (req->r_inode) {
522 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); 521 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
523 iput(req->r_inode); 522 iput(req->r_inode);
@@ -528,7 +527,9 @@ void ceph_mdsc_release_request(struct kref *kref)
528 iput(req->r_target_inode); 527 iput(req->r_target_inode);
529 if (req->r_dentry) 528 if (req->r_dentry)
530 dput(req->r_dentry); 529 dput(req->r_dentry);
531 if (req->r_old_dentry) { 530 if (req->r_old_dentry)
531 dput(req->r_old_dentry);
532 if (req->r_old_dentry_dir) {
532 /* 533 /*
533 * track (and drop pins for) r_old_dentry_dir 534 * track (and drop pins for) r_old_dentry_dir
534 * separately, since r_old_dentry's d_parent may have 535 * separately, since r_old_dentry's d_parent may have
@@ -537,7 +538,6 @@ void ceph_mdsc_release_request(struct kref *kref)
537 */ 538 */
538 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), 539 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
539 CEPH_CAP_PIN); 540 CEPH_CAP_PIN);
540 dput(req->r_old_dentry);
541 iput(req->r_old_dentry_dir); 541 iput(req->r_old_dentry_dir);
542 } 542 }
543 kfree(req->r_path1); 543 kfree(req->r_path1);
@@ -1311,6 +1311,9 @@ static int trim_caps(struct ceph_mds_client *mdsc,
1311 trim_caps - session->s_trim_caps); 1311 trim_caps - session->s_trim_caps);
1312 session->s_trim_caps = 0; 1312 session->s_trim_caps = 0;
1313 } 1313 }
1314
1315 ceph_add_cap_releases(mdsc, session);
1316 ceph_send_cap_releases(mdsc, session);
1314 return 0; 1317 return 0;
1315} 1318}
1316 1319
@@ -1461,15 +1464,18 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
1461 1464
1462 dout("discard_cap_releases mds%d\n", session->s_mds); 1465 dout("discard_cap_releases mds%d\n", session->s_mds);
1463 1466
1464 /* zero out the in-progress message */ 1467 if (!list_empty(&session->s_cap_releases)) {
1465 msg = list_first_entry(&session->s_cap_releases, 1468 /* zero out the in-progress message */
1466 struct ceph_msg, list_head); 1469 msg = list_first_entry(&session->s_cap_releases,
1467 head = msg->front.iov_base; 1470 struct ceph_msg, list_head);
1468 num = le32_to_cpu(head->num); 1471 head = msg->front.iov_base;
1469 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num); 1472 num = le32_to_cpu(head->num);
1470 head->num = cpu_to_le32(0); 1473 dout("discard_cap_releases mds%d %p %u\n",
1471 msg->front.iov_len = sizeof(*head); 1474 session->s_mds, msg, num);
1472 session->s_num_cap_releases += num; 1475 head->num = cpu_to_le32(0);
1476 msg->front.iov_len = sizeof(*head);
1477 session->s_num_cap_releases += num;
1478 }
1473 1479
1474 /* requeue completed messages */ 1480 /* requeue completed messages */
1475 while (!list_empty(&session->s_cap_releases_done)) { 1481 while (!list_empty(&session->s_cap_releases_done)) {
@@ -1492,6 +1498,43 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
1492 * requests 1498 * requests
1493 */ 1499 */
1494 1500
1501int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
1502 struct inode *dir)
1503{
1504 struct ceph_inode_info *ci = ceph_inode(dir);
1505 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1506 struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
1507 size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) +
1508 sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
1509 int order, num_entries;
1510
1511 spin_lock(&ci->i_ceph_lock);
1512 num_entries = ci->i_files + ci->i_subdirs;
1513 spin_unlock(&ci->i_ceph_lock);
1514 num_entries = max(num_entries, 1);
1515 num_entries = min(num_entries, opt->max_readdir);
1516
1517 order = get_order(size * num_entries);
1518 while (order >= 0) {
1519 rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN,
1520 order);
1521 if (rinfo->dir_in)
1522 break;
1523 order--;
1524 }
1525 if (!rinfo->dir_in)
1526 return -ENOMEM;
1527
1528 num_entries = (PAGE_SIZE << order) / size;
1529 num_entries = min(num_entries, opt->max_readdir);
1530
1531 rinfo->dir_buf_size = PAGE_SIZE << order;
1532 req->r_num_caps = num_entries + 1;
1533 req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
1534 req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
1535 return 0;
1536}
1537
1495/* 1538/*
1496 * Create an mds request. 1539 * Create an mds request.
1497 */ 1540 */
@@ -2053,7 +2096,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
2053 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); 2096 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
2054 if (req->r_locked_dir) 2097 if (req->r_locked_dir)
2055 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); 2098 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
2056 if (req->r_old_dentry) 2099 if (req->r_old_dentry_dir)
2057 ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), 2100 ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
2058 CEPH_CAP_PIN); 2101 CEPH_CAP_PIN);
2059 2102
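ceph_alloc_readdir_reply_buffer() above sizes one contiguous allocation for all four per-entry arrays and degrades gracefully under memory pressure: it walks down page orders until an allocation succeeds, then caps the readdir request to what actually fits. The core of that loop, reduced to its shape (names as in the hunk):

	order = get_order(size * num_entries);
	while (order >= 0) {
		rinfo->dir_in = (void *)__get_free_pages(GFP_NOFS | __GFP_NOWARN,
							 order);
		if (rinfo->dir_in)
			break;          /* took whatever order we could get */
		order--;                /* halve the request and retry */
	}
	if (!rinfo->dir_in)
		return -ENOMEM;
	/* only ask the MDS for as many entries as the buffer can hold */
	num_entries = (PAGE_SIZE << order) / size;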
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 68288917c737..e90cfccf93bd 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -67,6 +67,7 @@ struct ceph_mds_reply_info_parsed {
67 /* for readdir results */ 67 /* for readdir results */
68 struct { 68 struct {
69 struct ceph_mds_reply_dirfrag *dir_dir; 69 struct ceph_mds_reply_dirfrag *dir_dir;
70 size_t dir_buf_size;
70 int dir_nr; 71 int dir_nr;
71 char **dir_dname; 72 char **dir_dname;
72 u32 *dir_dname_len; 73 u32 *dir_dname_len;
@@ -346,7 +347,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
346 struct dentry *dn); 347 struct dentry *dn);
347 348
348extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); 349extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
349 350extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
351 struct inode *dir);
350extern struct ceph_mds_request * 352extern struct ceph_mds_request *
351ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); 353ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
352extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, 354extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 4440f447fd3f..51cc23e48111 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -54,6 +54,7 @@ const char *ceph_mds_op_name(int op)
54 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; 54 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
55 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; 55 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
56 case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; 56 case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
57 case CEPH_MDS_OP_LOOKUPNAME: return "lookupname";
57 case CEPH_MDS_OP_GETATTR: return "getattr"; 58 case CEPH_MDS_OP_GETATTR: return "getattr";
58 case CEPH_MDS_OP_SETXATTR: return "setxattr"; 59 case CEPH_MDS_OP_SETXATTR: return "setxattr";
59 case CEPH_MDS_OP_SETATTR: return "setattr"; 60 case CEPH_MDS_OP_SETATTR: return "setattr";
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 10a4ccbf38da..06150fd745ac 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1026,6 +1026,7 @@ static int __init init_ceph(void)
1026 if (ret) 1026 if (ret)
1027 goto out; 1027 goto out;
1028 1028
1029 ceph_flock_init();
1029 ceph_xattr_init(); 1030 ceph_xattr_init();
1030 ret = register_filesystem(&ceph_fs_type); 1031 ret = register_filesystem(&ceph_fs_type);
1031 if (ret) 1032 if (ret)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index d8801a95b685..ead05cc1f447 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -266,7 +266,6 @@ struct ceph_inode_info {
266 struct timespec i_rctime; 266 struct timespec i_rctime;
267 u64 i_rbytes, i_rfiles, i_rsubdirs; 267 u64 i_rbytes, i_rfiles, i_rsubdirs;
268 u64 i_files, i_subdirs; 268 u64 i_files, i_subdirs;
269 u64 i_max_offset; /* largest readdir offset, set with complete dir */
270 269
271 struct rb_root i_fragtree; 270 struct rb_root i_fragtree;
272 struct mutex i_fragtree_mutex; 271 struct mutex i_fragtree_mutex;
@@ -577,7 +576,7 @@ struct ceph_file_info {
577 576
578 /* readdir: position within a frag */ 577 /* readdir: position within a frag */
579 unsigned offset; /* offset of last chunk, adjusted for . and .. */ 578 unsigned offset; /* offset of last chunk, adjusted for . and .. */
580 u64 next_offset; /* offset of next chunk (last_name's + 1) */ 579 unsigned next_offset; /* offset of next chunk (last_name's + 1) */
581 char *last_name; /* last entry in previous chunk */ 580 char *last_name; /* last entry in previous chunk */
582 struct dentry *dentry; /* next dentry (for dcache readdir) */ 581 struct dentry *dentry; /* next dentry (for dcache readdir) */
583 int dir_release_count; 582 int dir_release_count;
@@ -871,6 +870,7 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
871extern const struct export_operations ceph_export_ops; 870extern const struct export_operations ceph_export_ops;
872 871
873/* locks.c */ 872/* locks.c */
873extern __init void ceph_flock_init(void);
874extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); 874extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
875extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); 875extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
876extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); 876extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index a55ec37378c6..c9c2b887381e 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -64,32 +64,48 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
64} 64}
65 65
66static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, 66static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
67 size_t size) 67 size_t size)
68{ 68{
69 int ret; 69 int ret;
70 struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); 70 struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
71 struct ceph_osd_client *osdc = &fsc->client->osdc; 71 struct ceph_osd_client *osdc = &fsc->client->osdc;
72 s64 pool = ceph_file_layout_pg_pool(ci->i_layout); 72 s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
73 const char *pool_name; 73 const char *pool_name;
74 char buf[128];
74 75
75 dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); 76 dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
76 down_read(&osdc->map_sem); 77 down_read(&osdc->map_sem);
77 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); 78 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
78 if (pool_name) 79 if (pool_name) {
79 ret = snprintf(val, size, 80 size_t len = strlen(pool_name);
80 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s", 81 ret = snprintf(buf, sizeof(buf),
82 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
81 (unsigned long long)ceph_file_layout_su(ci->i_layout), 83 (unsigned long long)ceph_file_layout_su(ci->i_layout),
82 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), 84 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
83 (unsigned long long)ceph_file_layout_object_size(ci->i_layout), 85 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
84 pool_name); 86 if (!size) {
85 else 87 ret += len;
86 ret = snprintf(val, size, 88 } else if (ret + len > size) {
89 ret = -ERANGE;
90 } else {
91 memcpy(val, buf, ret);
92 memcpy(val + ret, pool_name, len);
93 ret += len;
94 }
95 } else {
96 ret = snprintf(buf, sizeof(buf),
87 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld", 97 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
88 (unsigned long long)ceph_file_layout_su(ci->i_layout), 98 (unsigned long long)ceph_file_layout_su(ci->i_layout),
89 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout), 99 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
90 (unsigned long long)ceph_file_layout_object_size(ci->i_layout), 100 (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
91 (unsigned long long)pool); 101 (unsigned long long)pool);
92 102 if (size) {
103 if (ret <= size)
104 memcpy(val, buf, ret);
105 else
106 ret = -ERANGE;
107 }
108 }
93 up_read(&osdc->map_sem); 109 up_read(&osdc->map_sem);
94 return ret; 110 return ret;
95} 111}
@@ -215,7 +231,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
215 .name_size = sizeof("ceph.dir.layout"), 231 .name_size = sizeof("ceph.dir.layout"),
216 .getxattr_cb = ceph_vxattrcb_layout, 232 .getxattr_cb = ceph_vxattrcb_layout,
217 .readonly = false, 233 .readonly = false,
218 .hidden = false, 234 .hidden = true,
219 .exists_cb = ceph_vxattrcb_layout_exists, 235 .exists_cb = ceph_vxattrcb_layout_exists,
220 }, 236 },
221 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit), 237 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
@@ -242,7 +258,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
242 .name_size = sizeof("ceph.file.layout"), 258 .name_size = sizeof("ceph.file.layout"),
243 .getxattr_cb = ceph_vxattrcb_layout, 259 .getxattr_cb = ceph_vxattrcb_layout,
244 .readonly = false, 260 .readonly = false,
245 .hidden = false, 261 .hidden = true,
246 .exists_cb = ceph_vxattrcb_layout_exists, 262 .exists_cb = ceph_vxattrcb_layout_exists,
247 }, 263 },
248 XATTR_LAYOUT_FIELD(file, layout, stripe_unit), 264 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
@@ -842,7 +858,6 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
842 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 858 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
843 struct inode *inode = dentry->d_inode; 859 struct inode *inode = dentry->d_inode;
844 struct ceph_inode_info *ci = ceph_inode(inode); 860 struct ceph_inode_info *ci = ceph_inode(inode);
845 struct inode *parent_inode;
846 struct ceph_mds_request *req; 861 struct ceph_mds_request *req;
847 struct ceph_mds_client *mdsc = fsc->mdsc; 862 struct ceph_mds_client *mdsc = fsc->mdsc;
848 int err; 863 int err;
@@ -893,9 +908,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
893 req->r_data_len = size; 908 req->r_data_len = size;
894 909
895 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); 910 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
896 parent_inode = ceph_get_dentry_parent_inode(dentry); 911 err = ceph_mdsc_do_request(mdsc, NULL, req);
897 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
898 iput(parent_inode);
899 ceph_mdsc_put_request(req); 912 ceph_mdsc_put_request(req);
900 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); 913 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
901 914
@@ -1019,7 +1032,6 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
1019 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 1032 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
1020 struct ceph_mds_client *mdsc = fsc->mdsc; 1033 struct ceph_mds_client *mdsc = fsc->mdsc;
1021 struct inode *inode = dentry->d_inode; 1034 struct inode *inode = dentry->d_inode;
1022 struct inode *parent_inode;
1023 struct ceph_mds_request *req; 1035 struct ceph_mds_request *req;
1024 int err; 1036 int err;
1025 1037
@@ -1033,9 +1045,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
1033 req->r_num_caps = 1; 1045 req->r_num_caps = 1;
1034 req->r_path2 = kstrdup(name, GFP_NOFS); 1046 req->r_path2 = kstrdup(name, GFP_NOFS);
1035 1047
1036 parent_inode = ceph_get_dentry_parent_inode(dentry); 1048 err = ceph_mdsc_do_request(mdsc, NULL, req);
1037 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
1038 iput(parent_inode);
1039 ceph_mdsc_put_request(req); 1049 ceph_mdsc_put_request(req);
1040 return err; 1050 return err;
1041} 1051}
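The rewritten ceph_vxattrcb_layout() follows the standard getxattr sizing contract: a zero size means "report the length required", and a buffer that is too small returns -ERANGE instead of silently truncating. The pool-name branch in outline (a sketch; the fixed fields are formatted into a stack buffer first, and the format string is abbreviated here):

	len = strlen(pool_name);
	ret = snprintf(buf, sizeof(buf), "stripe_unit=... pool=");  /* fixed part */
	if (!size)
		ret += len;             /* probe call: report required length */
	else if (ret + len > size)
		ret = -ERANGE;          /* caller's buffer too small */
	else {
		memcpy(val, buf, ret);              /* fixed fields... */
		memcpy(val + ret, pool_name, len);  /* ...then the pool name */
		ret += len;
	}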
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 849f6132b327..5be1f997ecde 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -253,6 +253,11 @@ cifs_alloc_inode(struct super_block *sb)
 	cifs_set_oplock_level(cifs_inode, 0);
 	cifs_inode->delete_pending = false;
 	cifs_inode->invalid_mapping = false;
+	clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cifs_inode->flags);
+	clear_bit(CIFS_INODE_PENDING_WRITERS, &cifs_inode->flags);
+	clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cifs_inode->flags);
+	spin_lock_init(&cifs_inode->writers_lock);
+	cifs_inode->writers = 0;
 	cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
 	cifs_inode->server_eof = 0;
 	cifs_inode->uniqueid = 0;
@@ -286,7 +291,7 @@ cifs_destroy_inode(struct inode *inode)
 static void
 cifs_evict_inode(struct inode *inode)
 {
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 	cifs_fscache_release_inode_cookie(inode);
 }
@@ -541,6 +546,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root)

 static int cifs_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_NODIRATIME;
 	return 0;
 }
@@ -731,19 +737,26 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 				   unsigned long nr_segs, loff_t pos)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
+	struct cifsInodeInfo *cinode = CIFS_I(inode);
 	ssize_t written;
 	int rc;

+	written = cifs_get_writer(cinode);
+	if (written)
+		return written;
+
 	written = generic_file_aio_write(iocb, iov, nr_segs, pos);

 	if (CIFS_CACHE_WRITE(CIFS_I(inode)))
-		return written;
+		goto out;

 	rc = filemap_fdatawrite(inode->i_mapping);
 	if (rc)
 		cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n",
 			 rc, inode);

+out:
+	cifs_put_writer(cinode);
 	return written;
 }

@@ -849,7 +862,6 @@ const struct inode_operations cifs_file_inode_ops = {
 /* revalidate:cifs_revalidate, */
 	.setattr = cifs_setattr,
 	.getattr = cifs_getattr, /* do we need this anymore? */
-	.rename = cifs_rename,
 	.permission = cifs_permission,
 #ifdef CONFIG_CIFS_XATTR
 	.setxattr = cifs_setxattr,
@@ -1005,7 +1017,7 @@ cifs_init_once(void *inode)
 	init_rwsem(&cifsi->lock_sem);
 }

-static int
+static int __init
 cifs_init_inodecache(void)
 {
 	cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index c0f3718b77a8..30f6e9251a4a 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -228,6 +228,8 @@ struct smb_version_operations {
 	/* verify the message */
 	int (*check_message)(char *, unsigned int);
 	bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
+	void (*downgrade_oplock)(struct TCP_Server_Info *,
+				 struct cifsInodeInfo *, bool);
 	/* process transaction2 response */
 	bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *,
 			     char *, int);
@@ -1113,6 +1115,12 @@ struct cifsInodeInfo {
 	unsigned int epoch;		/* used to track lease state changes */
 	bool delete_pending;		/* DELETE_ON_CLOSE is set */
 	bool invalid_mapping;		/* pagecache is invalid */
+	unsigned long flags;
+#define CIFS_INODE_PENDING_OPLOCK_BREAK   (0) /* oplock break in progress */
+#define CIFS_INODE_PENDING_WRITERS	  (1) /* Writes in progress */
+#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */
+	spinlock_t writers_lock;
+	unsigned int writers; /* Number of writers on this inode */
 	unsigned long time;		/* jiffies of last update of inode */
 	u64 server_eof;		/* current file size on server -- protected by i_lock */
 	u64 uniqueid;		/* server inode number */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index acc4ee8ed075..ca7980a1e303 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -127,6 +127,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
 extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
 				      int offset);
 extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
+extern int cifs_get_writer(struct cifsInodeInfo *cinode);
+extern void cifs_put_writer(struct cifsInodeInfo *cinode);
+extern void cifs_done_oplock_break(struct cifsInodeInfo *cinode);
 extern int cifs_unlock_range(struct cifsFileInfo *cfile,
 			     struct file_lock *flock, const unsigned int xid);
 extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index f3264bd7a83d..6ce4e0954b98 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -6197,6 +6197,9 @@ QAllEAsRetry:
 	cifs_dbg(FYI, "ea length %d\n", list_len);
 	if (list_len <= 8) {
 		cifs_dbg(FYI, "empty EA list returned from server\n");
+		/* didn't find the named attribute */
+		if (ea_name)
+			rc = -ENODATA;
 		goto QAllEAsOut;
 	}

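The hunk above makes an empty EA list from the server surface as -ENODATA when the caller asked for a specific attribute name, which is the errno userspace expects from getxattr(2) for a missing attribute. A minimal sketch of the caller-visible behaviour; the mount point and attribute name below are placeholders, not anything from this patch:

/* Build with: cc -o xa xa.c */
#include <errno.h>
#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[256];
	/* path and name are hypothetical examples */
	ssize_t n = getxattr("/mnt/cifs/file.txt", "user.comment",
			     buf, sizeof(buf));
	if (n < 0 && errno == ENODATA)
		printf("attribute not set\n");	/* the case fixed above */
	else if (n >= 0)
		printf("value: %.*s\n", (int)n, buf);
	else
		perror("getxattr");
	return 0;
}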
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 834fce759d80..5ed03e0b8b40 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2579,19 +2579,32 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
 	struct cifsInodeInfo *cinode = CIFS_I(inode);
 	struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
 	ssize_t rc = -EACCES;
-	loff_t lock_pos = pos;
+	loff_t lock_pos = iocb->ki_pos;

-	if (file->f_flags & O_APPEND)
-		lock_pos = i_size_read(inode);
 	/*
 	 * We need to hold the sem to be sure nobody modifies lock list
 	 * with a brlock that prevents writing.
 	 */
 	down_read(&cinode->lock_sem);
+	mutex_lock(&inode->i_mutex);
+	if (file->f_flags & O_APPEND)
+		lock_pos = i_size_read(inode);
 	if (!cifs_find_lock_conflict(cfile, lock_pos, iov_length(iov, nr_segs),
 				     server->vals->exclusive_lock_type, NULL,
-				     CIFS_WRITE_OP))
-		rc = generic_file_aio_write(iocb, iov, nr_segs, pos);
+				     CIFS_WRITE_OP)) {
+		rc = __generic_file_aio_write(iocb, iov, nr_segs);
+		mutex_unlock(&inode->i_mutex);
+
+		if (rc > 0) {
+			ssize_t err;
+
+			err = generic_write_sync(file, iocb->ki_pos - rc, rc);
+			if (err < 0)
+				rc = err;
+		}
+	} else {
+		mutex_unlock(&inode->i_mutex);
+	}
 	up_read(&cinode->lock_sem);
 	return rc;
 }
@@ -2608,12 +2621,20 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
 	ssize_t written;

+	written = cifs_get_writer(cinode);
+	if (written)
+		return written;
+
 	if (CIFS_CACHE_WRITE(cinode)) {
 		if (cap_unix(tcon->ses) &&
 		(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))
-		  && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
-			return generic_file_aio_write(iocb, iov, nr_segs, pos);
-		return cifs_writev(iocb, iov, nr_segs, pos);
+		  && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
+			written = generic_file_aio_write(
+					iocb, iov, nr_segs, pos);
+			goto out;
+		}
+		written = cifs_writev(iocb, iov, nr_segs, pos);
+		goto out;
 	}
 	/*
 	 * For non-oplocked files in strict cache mode we need to write the data
@@ -2633,6 +2654,8 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
 			 inode);
 		cinode->oplock = 0;
 	}
+out:
+	cifs_put_writer(cinode);
 	return written;
 }

@@ -2727,56 +2750,27 @@ cifs_retry_async_readv(struct cifs_readdata *rdata)
 /**
  * cifs_readdata_to_iov - copy data from pages in response to an iovec
  * @rdata:	the readdata response with list of pages holding data
- * @iov:	vector in which we should copy the data
- * @nr_segs:	number of segments in vector
- * @offset:	offset into file of the first iovec
- * @copied:	used to return the amount of data copied to the iov
+ * @iter:	destination for our data
  *
  * This function copies data from a list of pages in a readdata response into
  * an array of iovecs. It will first calculate where the data should go
  * based on the info in the readdata and then copy the data into that spot.
  */
-static ssize_t
-cifs_readdata_to_iov(struct cifs_readdata *rdata, const struct iovec *iov,
-		     unsigned long nr_segs, loff_t offset, ssize_t *copied)
+static int
+cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
 {
-	int rc = 0;
-	struct iov_iter ii;
-	size_t pos = rdata->offset - offset;
-	ssize_t remaining = rdata->bytes;
-	unsigned char *pdata;
+	size_t remaining = rdata->bytes;
 	unsigned int i;

-	/* set up iov_iter and advance to the correct offset */
-	iov_iter_init(&ii, iov, nr_segs, iov_length(iov, nr_segs), 0);
-	iov_iter_advance(&ii, pos);
-
-	*copied = 0;
 	for (i = 0; i < rdata->nr_pages; i++) {
-		ssize_t copy;
 		struct page *page = rdata->pages[i];
-
-		/* copy a whole page or whatever's left */
-		copy = min_t(ssize_t, remaining, PAGE_SIZE);
-
-		/* ...but limit it to whatever space is left in the iov */
-		copy = min_t(ssize_t, copy, iov_iter_count(&ii));
-
-		/* go while there's data to be copied and no errors */
-		if (copy && !rc) {
-			pdata = kmap(page);
-			rc = memcpy_toiovecend(ii.iov, pdata, ii.iov_offset,
-						(int)copy);
-			kunmap(page);
-			if (!rc) {
-				*copied += copy;
-				remaining -= copy;
-				iov_iter_advance(&ii, copy);
-			}
-		}
+		size_t copy = min_t(size_t, remaining, PAGE_SIZE);
+		size_t written = copy_page_to_iter(page, 0, copy, iter);
+		remaining -= written;
+		if (written < copy && iov_iter_count(iter) > 0)
+			break;
 	}
-
-	return rc;
+	return remaining ? -EFAULT : 0;
 }

 static void
@@ -2837,20 +2831,21 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
 	return total_read > 0 ? total_read : result;
 }

-static ssize_t
-cifs_iovec_read(struct file *file, const struct iovec *iov,
-		 unsigned long nr_segs, loff_t *poffset)
+ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
+			unsigned long nr_segs, loff_t pos)
 {
+	struct file *file = iocb->ki_filp;
 	ssize_t rc;
 	size_t len, cur_len;
 	ssize_t total_read = 0;
-	loff_t offset = *poffset;
+	loff_t offset = pos;
 	unsigned int npages;
 	struct cifs_sb_info *cifs_sb;
 	struct cifs_tcon *tcon;
 	struct cifsFileInfo *open_file;
 	struct cifs_readdata *rdata, *tmp;
 	struct list_head rdata_list;
+	struct iov_iter to;
 	pid_t pid;

 	if (!nr_segs)
@@ -2860,6 +2855,8 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
 	if (!len)
 		return 0;

+	iov_iter_init(&to, iov, nr_segs, len, 0);
+
 	INIT_LIST_HEAD(&rdata_list);
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	open_file = file->private_data;
@@ -2885,7 +2882,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
 					cifs_uncached_readv_complete);
 		if (!rdata) {
 			rc = -ENOMEM;
-			goto error;
+			break;
 		}

 		rc = cifs_read_allocate_pages(rdata, npages);
@@ -2917,55 +2914,44 @@ error:
 	if (!list_empty(&rdata_list))
 		rc = 0;

+	len = iov_iter_count(&to);
 	/* the loop below should proceed in the order of increasing offsets */
-restart_loop:
 	list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
+again:
 		if (!rc) {
-			ssize_t copied;
-
 			/* FIXME: freezable sleep too? */
 			rc = wait_for_completion_killable(&rdata->done);
 			if (rc)
 				rc = -EINTR;
-			else if (rdata->result)
+			else if (rdata->result) {
 				rc = rdata->result;
-			else {
-				rc = cifs_readdata_to_iov(rdata, iov,
-							nr_segs, *poffset,
-							&copied);
-				total_read += copied;
-			}
-
-			/* resend call if it's a retryable error */
-			if (rc == -EAGAIN) {
-				rc = cifs_retry_async_readv(rdata);
-				goto restart_loop;
+				/* resend call if it's a retryable error */
+				if (rc == -EAGAIN) {
+					rc = cifs_retry_async_readv(rdata);
+					goto again;
+				}
+			} else {
+				rc = cifs_readdata_to_iov(rdata, &to);
 			}
 		}
 		list_del_init(&rdata->list);
 		kref_put(&rdata->refcount, cifs_uncached_readdata_release);
 	}

+	total_read = len - iov_iter_count(&to);
+
 	cifs_stats_bytes_read(tcon, total_read);
-	*poffset += total_read;

 	/* mask nodata case */
 	if (rc == -ENODATA)
 		rc = 0;

-	return total_read ? total_read : rc;
-}
-
-ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
-			unsigned long nr_segs, loff_t pos)
-{
-	ssize_t read;
-
-	read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos);
-	if (read > 0)
-		iocb->ki_pos = pos;
-
-	return read;
+	if (total_read) {
+		iocb->ki_pos = pos + total_read;
+		return total_read;
+	}
+	return rc;
 }

 ssize_t
@@ -3113,6 +3099,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)

 static struct vm_operations_struct cifs_file_vm_ops = {
 	.fault = filemap_fault,
+	.map_pages = filemap_map_pages,
 	.page_mkwrite = cifs_page_mkwrite,
 	.remap_pages = generic_file_remap_pages,
 };
@@ -3644,6 +3631,13 @@ static int cifs_launder_page(struct page *page)
 	return rc;
 }

+static int
+cifs_pending_writers_wait(void *unused)
+{
+	schedule();
+	return 0;
+}
+
 void cifs_oplock_break(struct work_struct *work)
 {
 	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
@@ -3651,8 +3645,15 @@ void cifs_oplock_break(struct work_struct *work)
 	struct inode *inode = cfile->dentry->d_inode;
 	struct cifsInodeInfo *cinode = CIFS_I(inode);
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+	struct TCP_Server_Info *server = tcon->ses->server;
 	int rc = 0;

+	wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
+			cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE);
+
+	server->ops->downgrade_oplock(server, cinode,
+		test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags));
+
 	if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) &&
 					cifs_has_mand_locks(cinode)) {
 		cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n",
@@ -3689,6 +3690,7 @@ void cifs_oplock_break(struct work_struct *work)
 			 cinode);
 		cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
 	}
+	cifs_done_oplock_break(cinode);
 }

 /*
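The cifs_readdata_to_iov() rewrite above replaces hand-rolled kmap()/memcpy_toiovecend() bookkeeping with copy_page_to_iter(), letting the iov_iter carry the destination cursor across pages. The userspace sketch below models the same idea with an invented cursor struct over an iovec array; it is an illustration of the pattern, not the kernel API:

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

#define PAGE_SZ 8	/* tiny "page" so the demo copies page by page */

/* Toy cursor over an iovec array, standing in for struct iov_iter. */
struct iter {
	const struct iovec *iov;
	int nr_segs;
	size_t seg_off;
};

static size_t copy_to_iter(const char *src, size_t len, struct iter *it)
{
	size_t done = 0;
	while (len && it->nr_segs) {
		size_t space = it->iov->iov_len - it->seg_off;
		size_t n = len < space ? len : space;
		memcpy((char *)it->iov->iov_base + it->seg_off, src + done, n);
		done += n;
		len -= n;
		it->seg_off += n;
		if (it->seg_off == it->iov->iov_len) {	/* next segment */
			it->iov++;
			it->nr_segs--;
			it->seg_off = 0;
		}
	}
	return done;
}

int main(void)
{
	const char pages[3][PAGE_SZ] = { "aaaaaaa", "bbbbbbb", "ccccccc" };
	char b1[10], b2[16];
	struct iovec iov[2] = { { b1, sizeof(b1) }, { b2, sizeof(b2) } };
	struct iter it = { iov, 2, 0 };
	size_t remaining = 22;	/* bytes of payload across the pages */

	for (int i = 0; i < 3 && remaining; i++) {
		size_t want = remaining < PAGE_SZ ? remaining : PAGE_SZ;
		size_t got = copy_to_iter(pages[i], want, &it);
		remaining -= got;
		if (got < want)	/* destination ran out, like -EFAULT above */
			break;
	}
	printf("left uncopied: %zu\n", remaining);
	return 0;
}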
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index aadc2b68678b..a22d667f1069 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1737,6 +1737,9 @@ cifs_inode_needs_reval(struct inode *inode)
 	if (cifs_i->time == 0)
 		return true;

+	if (!cifs_sb->actimeo)
+		return true;
+
 	if (!time_in_range(jiffies, cifs_i->time,
 				cifs_i->time + cifs_sb->actimeo))
 		return true;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 2f9f3790679d..3b0c62e622da 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -466,8 +466,22 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
 				cifs_dbg(FYI, "file id match, oplock break\n");
 				pCifsInode = CIFS_I(netfile->dentry->d_inode);

-				cifs_set_oplock_level(pCifsInode,
-					pSMB->OplockLevel ? OPLOCK_READ : 0);
+				set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
+					&pCifsInode->flags);
+
+				/*
+				 * Set flag if the server downgrades the oplock
+				 * to L2 else clear.
+				 */
+				if (pSMB->OplockLevel)
+					set_bit(
+					   CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+					   &pCifsInode->flags);
+				else
+					clear_bit(
+					   CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+					   &pCifsInode->flags);
+
 				queue_work(cifsiod_wq,
 					   &netfile->oplock_break);
 				netfile->oplock_break_cancelled = false;
@@ -551,6 +565,62 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
 		cinode->oplock = 0;
 }

+static int
+cifs_oplock_break_wait(void *unused)
+{
+	schedule();
+	return signal_pending(current) ? -ERESTARTSYS : 0;
+}
+
+/*
+ * We wait for oplock breaks to be processed before we attempt to perform
+ * writes.
+ */
+int cifs_get_writer(struct cifsInodeInfo *cinode)
+{
+	int rc;
+
+start:
+	rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK,
+				cifs_oplock_break_wait, TASK_KILLABLE);
+	if (rc)
+		return rc;
+
+	spin_lock(&cinode->writers_lock);
+	if (!cinode->writers)
+		set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+	cinode->writers++;
+	/* Check to see if we have started servicing an oplock break */
+	if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) {
+		cinode->writers--;
+		if (cinode->writers == 0) {
+			clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+			wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS);
+		}
+		spin_unlock(&cinode->writers_lock);
+		goto start;
+	}
+	spin_unlock(&cinode->writers_lock);
+	return 0;
+}
+
+void cifs_put_writer(struct cifsInodeInfo *cinode)
+{
+	spin_lock(&cinode->writers_lock);
+	cinode->writers--;
+	if (cinode->writers == 0) {
+		clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+		wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS);
+	}
+	spin_unlock(&cinode->writers_lock);
+}
+
+void cifs_done_oplock_break(struct cifsInodeInfo *cinode)
+{
+	clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
+	wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK);
+}
+
 bool
 backup_cred(struct cifs_sb_info *cifs_sb)
 {
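The helpers above serialize buffered writes against oplock-break processing: writers are counted under writers_lock, a pending break gates new writers, and the break handler waits for in-flight writers to drain. Below is a rough pthreads model of that protocol, with a mutex and condition variable standing in for wait_on_bit()/wake_up_bit(); all names are invented and this is a sketch of the idea, not the kernel implementation:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int writers;		/* models cinode->writers */
static bool break_pending;	/* models CIFS_INODE_PENDING_OPLOCK_BREAK */

static void get_writer(void)
{
	pthread_mutex_lock(&lk);
	while (break_pending)	/* new writers wait out the break */
		pthread_cond_wait(&cv, &lk);
	writers++;
	pthread_mutex_unlock(&lk);
}

static void put_writer(void)
{
	pthread_mutex_lock(&lk);
	if (--writers == 0)
		pthread_cond_broadcast(&cv);	/* wake the break handler */
	pthread_mutex_unlock(&lk);
}

static void *oplock_break(void *arg)
{
	pthread_mutex_lock(&lk);
	break_pending = true;		/* gate new writers */
	while (writers)			/* drain in-flight writers */
		pthread_cond_wait(&cv, &lk);
	puts("break processed with no writer in flight");
	break_pending = false;		/* models cifs_done_oplock_break() */
	pthread_cond_broadcast(&cv);
	pthread_mutex_unlock(&lk);
	return arg;
}

static void *writer(void *arg)
{
	get_writer();
	puts("writing");
	put_writer();
	return arg;
}

int main(void)
{
	pthread_t w, b;
	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&b, NULL, oplock_break, NULL);
	pthread_join(w, NULL);
	pthread_join(b, NULL);
	return 0;
}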
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 526fb89f9230..d1fdfa848703 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -372,6 +372,16 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
 	return 0;
 }

+static void
+cifs_downgrade_oplock(struct TCP_Server_Info *server,
+			struct cifsInodeInfo *cinode, bool set_level2)
+{
+	if (set_level2)
+		cifs_set_oplock_level(cinode, OPLOCK_READ);
+	else
+		cifs_set_oplock_level(cinode, 0);
+}
+
 static bool
 cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server,
 		  char *buf, int malformed)
@@ -1019,6 +1029,7 @@ struct smb_version_operations smb1_operations = {
 	.clear_stats = cifs_clear_stats,
 	.print_stats = cifs_print_stats,
 	.is_oplock_break = is_valid_oplock_break,
+	.downgrade_oplock = cifs_downgrade_oplock,
 	.check_trans2 = cifs_check_trans2,
 	.need_neg = cifs_need_neg,
 	.negotiate = cifs_negotiate,
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index fb3966265b6e..b8021fde987d 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -575,9 +575,21 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 				else
 					cfile->oplock_break_cancelled = false;

-				server->ops->set_oplock_level(cinode,
-				  rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0,
-				  0, NULL);
+				set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
+					&cinode->flags);
+
+				/*
+				 * Set flag if the server downgrades the oplock
+				 * to L2 else clear.
+				 */
+				if (rsp->OplockLevel)
+					set_bit(
+					   CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+					   &cinode->flags);
+				else
+					clear_bit(
+					   CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+					   &cinode->flags);

 				queue_work(cifsiod_wq, &cfile->oplock_break);

diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 192f51a12cf1..35ddc3ed119d 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -905,6 +905,17 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
 }

 static void
+smb2_downgrade_oplock(struct TCP_Server_Info *server,
+			struct cifsInodeInfo *cinode, bool set_level2)
+{
+	if (set_level2)
+		server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II,
+						0, NULL);
+	else
+		server->ops->set_oplock_level(cinode, 0, 0, NULL);
+}
+
+static void
 smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
 		      unsigned int epoch, bool *purge_cache)
 {
@@ -1110,6 +1121,7 @@ struct smb_version_operations smb20_operations = {
 	.clear_stats = smb2_clear_stats,
 	.print_stats = smb2_print_stats,
 	.is_oplock_break = smb2_is_valid_oplock_break,
+	.downgrade_oplock = smb2_downgrade_oplock,
 	.need_neg = smb2_need_neg,
 	.negotiate = smb2_negotiate,
 	.negotiate_wsize = smb2_negotiate_wsize,
@@ -1184,6 +1196,7 @@ struct smb_version_operations smb21_operations = {
 	.clear_stats = smb2_clear_stats,
 	.print_stats = smb2_print_stats,
 	.is_oplock_break = smb2_is_valid_oplock_break,
+	.downgrade_oplock = smb2_downgrade_oplock,
 	.need_neg = smb2_need_neg,
 	.negotiate = smb2_negotiate,
 	.negotiate_wsize = smb2_negotiate_wsize,
@@ -1259,6 +1272,7 @@ struct smb_version_operations smb30_operations = {
 	.print_stats = smb2_print_stats,
 	.dump_share_caps = smb2_dump_share_caps,
 	.is_oplock_break = smb2_is_valid_oplock_break,
+	.downgrade_oplock = smb2_downgrade_oplock,
 	.need_neg = smb2_need_neg,
 	.negotiate = smb2_negotiate,
 	.negotiate_wsize = smb2_negotiate_wsize,
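With both dialects wired up, generic code such as cifs_oplock_break() downgrades through server->ops->downgrade_oplock() and never tests the protocol version itself. A toy model of that per-version dispatch table; the struct and function names below are illustrative only:

#include <stdbool.h>
#include <stdio.h>

struct inode_state { int level; };

struct version_ops {
	const char *name;
	void (*downgrade_oplock)(struct inode_state *, bool set_level2);
};

static void smb1_downgrade(struct inode_state *st, bool set_level2)
{
	st->level = set_level2 ? 1 : 0;	/* read oplock or none */
}

static void smb2_downgrade(struct inode_state *st, bool set_level2)
{
	st->level = set_level2 ? 2 : 0;	/* level II lease or none */
}

static const struct version_ops smb1 = { "smb1", smb1_downgrade };
static const struct version_ops smb2 = { "smb2", smb2_downgrade };

/* Dialect-neutral caller, like cifs_oplock_break() above. */
static void oplock_break(const struct version_ops *ops,
			 struct inode_state *st, bool to_level2)
{
	ops->downgrade_oplock(st, to_level2);
	printf("%s: level now %d\n", ops->name, st->level);
}

int main(void)
{
	struct inode_state st = { 3 };
	oplock_break(&smb1, &st, true);
	oplock_break(&smb2, &st, false);
	return 0;
}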
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 860344701067..3802f8c94acc 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1352,7 +1352,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
 		    u64 persistent_fid, u64 volatile_fid)
 {
 	int rc;
-	char *res_key = NULL;
 	struct compress_ioctl fsctl_input;
 	char *ret_data = NULL;

@@ -1365,7 +1364,6 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
 			2 /* in data len */, &ret_data /* out data */, NULL);

 	cifs_dbg(FYI, "set compression rc %d\n", rc);
-	kfree(res_key);

 	return rc;
 }
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index b7143cf783ac..381c993b1427 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -10,7 +10,7 @@ extern int coda_hard;
 extern int coda_fake_statfs;

 void coda_destroy_inodecache(void);
-int coda_init_inodecache(void);
+int __init coda_init_inodecache(void);
 int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync);
 void coda_sysctl_init(void);
 void coda_sysctl_clean(void);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 506de34a4ef3..d9c7751f10ac 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -73,7 +73,7 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }

-int coda_init_inodecache(void)
+int __init coda_init_inodecache(void)
 {
 	coda_inode_cachep = kmem_cache_create("coda_inode_cache",
 				sizeof(struct coda_inode_info),
@@ -96,6 +96,7 @@ void coda_destroy_inodecache(void)

 static int coda_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_NOATIME;
 	return 0;
 }
@@ -250,7 +251,7 @@ static void coda_put_super(struct super_block *sb)

 static void coda_evict_inode(struct inode *inode)
 {
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 	coda_cache_clear_inode(inode);
 }
diff --git a/fs/compat.c b/fs/compat.c
index 6af20de2c1a3..66d3d3c6b4b2 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -72,8 +72,8 @@ int compat_printk(const char *fmt, ...)
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
  */
-asmlinkage long compat_sys_utime(const char __user *filename,
-				 struct compat_utimbuf __user *t)
+COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
+		       struct compat_utimbuf __user *, t)
 {
 	struct timespec tv[2];

@@ -87,13 +87,13 @@ asmlinkage long compat_sys_utime(const char __user *filename,
 	return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
 }

-asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags)
+COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags)
 {
 	struct timespec tv[2];

 	if (t) {
-		if (get_compat_timespec(&tv[0], &t[0]) ||
-		    get_compat_timespec(&tv[1], &t[1]))
+		if (compat_get_timespec(&tv[0], &t[0]) ||
+		    compat_get_timespec(&tv[1], &t[1]))
 			return -EFAULT;

 		if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
@@ -102,7 +102,7 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filena
 	return do_utimes(dfd, filename, t ? tv : NULL, flags);
 }

-asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t)
+COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t)
 {
 	struct timespec tv[2];

@@ -121,7 +121,7 @@ asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filena
 	return do_utimes(dfd, filename, t ? tv : NULL, 0);
 }

-asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_timeval __user *t)
+COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t)
 {
 	return compat_sys_futimesat(AT_FDCWD, filename, t);
 }
@@ -159,8 +159,8 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
 	return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
 }

-asmlinkage long compat_sys_newstat(const char __user * filename,
-		struct compat_stat __user *statbuf)
+COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename,
+		       struct compat_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -171,8 +171,8 @@ asmlinkage long compat_sys_newstat(const char __user * filename,
 	return cp_compat_stat(&stat, statbuf);
 }

-asmlinkage long compat_sys_newlstat(const char __user * filename,
-		struct compat_stat __user *statbuf)
+COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename,
+		       struct compat_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error;
@@ -184,9 +184,9 @@ asmlinkage long compat_sys_newlstat(const char __user * filename,
 }

 #ifndef __ARCH_WANT_STAT64
-asmlinkage long compat_sys_newfstatat(unsigned int dfd,
-		const char __user *filename,
-		struct compat_stat __user *statbuf, int flag)
+COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd,
+		       const char __user *, filename,
+		       struct compat_stat __user *, statbuf, int, flag)
 {
 	struct kstat stat;
 	int error;
@@ -198,8 +198,8 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd,
 }
 #endif

-asmlinkage long compat_sys_newfstat(unsigned int fd,
-		struct compat_stat __user * statbuf)
+COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd,
+		       struct compat_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_fstat(fd, &stat);
@@ -247,7 +247,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
  * The following statfs calls are copies of code from fs/statfs.c and
  * should be checked against those from time to time
  */
-asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
+COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf)
 {
 	struct kstatfs tmp;
 	int error = user_statfs(pathname, &tmp);
@@ -256,7 +256,7 @@ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_sta
 	return error;
 }

-asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf)
+COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf)
 {
 	struct kstatfs tmp;
 	int error = fd_statfs(fd, &tmp);
@@ -298,7 +298,7 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
 	return 0;
 }

-asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf)
+COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf)
 {
 	struct kstatfs tmp;
 	int error;
@@ -312,7 +312,7 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t s
 	return error;
 }

-asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf)
+COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf)
 {
 	struct kstatfs tmp;
 	int error;
@@ -331,7 +331,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
  * Given how simple this syscall is that apporach is more maintainable
  * than the various conversion hacks.
  */
-asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u)
+COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u)
 {
 	struct compat_ustat tmp;
 	struct kstatfs sbuf;
@@ -399,12 +399,28 @@ static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *u
 }
 #endif

-asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
-		unsigned long arg)
+static unsigned int
+convert_fcntl_cmd(unsigned int cmd)
+{
+	switch (cmd) {
+	case F_GETLK64:
+		return F_GETLK;
+	case F_SETLK64:
+		return F_SETLK;
+	case F_SETLKW64:
+		return F_SETLKW;
+	}
+
+	return cmd;
+}
+
+COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+		       compat_ulong_t, arg)
 {
 	mm_segment_t old_fs;
 	struct flock f;
 	long ret;
+	unsigned int conv_cmd;

 	switch (cmd) {
 	case F_GETLK:
@@ -441,16 +457,18 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
 	case F_GETLK64:
 	case F_SETLK64:
 	case F_SETLKW64:
+	case F_OFD_GETLK:
+	case F_OFD_SETLK:
+	case F_OFD_SETLKW:
 		ret = get_compat_flock64(&f, compat_ptr(arg));
 		if (ret != 0)
 			break;
 		old_fs = get_fs();
 		set_fs(KERNEL_DS);
-		ret = sys_fcntl(fd, (cmd == F_GETLK64) ? F_GETLK :
-				((cmd == F_SETLK64) ? F_SETLK : F_SETLKW),
-				(unsigned long)&f);
+		conv_cmd = convert_fcntl_cmd(cmd);
+		ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f);
 		set_fs(old_fs);
-		if (cmd == F_GETLK64 && ret == 0) {
+		if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) {
 			/* need to return lock information - see above for commentary */
 			if (f.l_start > COMPAT_LOFF_T_MAX)
 				ret = -EOVERFLOW;
@@ -468,16 +486,22 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
 	return ret;
 }

-asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd,
-		unsigned long arg)
+COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
+		       compat_ulong_t, arg)
 {
-	if ((cmd == F_GETLK64) || (cmd == F_SETLK64) || (cmd == F_SETLKW64))
+	switch (cmd) {
+	case F_GETLK64:
+	case F_SETLK64:
+	case F_SETLKW64:
+	case F_OFD_GETLK:
+	case F_OFD_SETLK:
+	case F_OFD_SETLKW:
 		return -EINVAL;
+	}
 	return compat_sys_fcntl64(fd, cmd, arg);
 }

-asmlinkage long
-compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p)
+COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_reqs, u32 __user *, ctx32p)
 {
 	long ret;
 	aio_context_t ctx64;
@@ -496,32 +520,24 @@ compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p)
 	return ret;
 }

-asmlinkage long
-compat_sys_io_getevents(aio_context_t ctx_id,
-				 unsigned long min_nr,
-				 unsigned long nr,
-				 struct io_event __user *events,
-				 struct compat_timespec __user *timeout)
+COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
+		       compat_long_t, min_nr,
+		       compat_long_t, nr,
+		       struct io_event __user *, events,
+		       struct compat_timespec __user *, timeout)
 {
-	long ret;
 	struct timespec t;
 	struct timespec __user *ut = NULL;

-	ret = -EFAULT;
-	if (unlikely(!access_ok(VERIFY_WRITE, events,
-				nr * sizeof(struct io_event))))
-		goto out;
 	if (timeout) {
-		if (get_compat_timespec(&t, timeout))
-			goto out;
+		if (compat_get_timespec(&t, timeout))
+			return -EFAULT;

 		ut = compat_alloc_user_space(sizeof(*ut));
 		if (copy_to_user(ut, &t, sizeof(t)) )
-			goto out;
+			return -EFAULT;
 	}
-	ret = sys_io_getevents(ctx_id, min_nr, nr, events, ut);
-out:
-	return ret;
+	return sys_io_getevents(ctx_id, min_nr, nr, events, ut);
 }

 /* A write operation does a read from user space and vice versa */
@@ -617,8 +633,8 @@ copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)

 #define MAX_AIO_SUBMITS 	(PAGE_SIZE/sizeof(struct iocb *))

-asmlinkage long
-compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
+COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
+		       int, nr, u32 __user *, iocb)
 {
 	struct iocb __user * __user *iocb64;
 	long ret;
@@ -770,10 +786,10 @@ static int do_nfs4_super_data_conv(void *raw_data)
 #define NCPFS_NAME	"ncpfs"
 #define NFS4_NAME	"nfs4"

-asmlinkage long compat_sys_mount(const char __user * dev_name,
-				 const char __user * dir_name,
-				 const char __user * type, unsigned long flags,
-				 const void __user * data)
+COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
+		       const char __user *, dir_name,
+		       const char __user *, type, compat_ulong_t, flags,
+		       const void __user *, data)
 {
 	char *kernel_type;
 	unsigned long data_page;
@@ -869,8 +885,8 @@ efault:
 	return -EFAULT;
 }

-asmlinkage long compat_sys_old_readdir(unsigned int fd,
-	struct compat_old_linux_dirent __user *dirent, unsigned int count)
+COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
+		struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
 {
 	int error;
 	struct fd f = fdget(fd);
@@ -948,8 +964,8 @@ efault:
 	return -EFAULT;
 }

-asmlinkage long compat_sys_getdents(unsigned int fd,
-		struct compat_linux_dirent __user *dirent, unsigned int count)
+COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
+		struct compat_linux_dirent __user *, dirent, unsigned int, count)
 {
 	struct fd f;
 	struct compat_linux_dirent __user * lastdirent;
@@ -981,7 +997,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
 	return error;
 }

-#ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64
+#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64

 struct compat_getdents_callback64 {
 	struct dir_context ctx;
@@ -1033,8 +1049,8 @@ efault:
 	return -EFAULT;
 }

-asmlinkage long compat_sys_getdents64(unsigned int fd,
-		struct linux_dirent64 __user * dirent, unsigned int count)
+COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd,
+		struct linux_dirent64 __user *, dirent, unsigned int, count)
 {
 	struct fd f;
 	struct linux_dirent64 __user * lastdirent;
@@ -1066,7 +1082,7 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
 	fdput(f);
 	return error;
 }
-#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */
+#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */

 /*
  * Exactly like fs/open.c:sys_open(), except that it doesn't set the
@@ -1287,9 +1303,9 @@ out_nofds:
 	return ret;
 }

-asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
-	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
-	struct compat_timeval __user *tvp)
+COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
+	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+	struct compat_timeval __user *, tvp)
 {
 	struct timespec end_time, *to = NULL;
 	struct compat_timeval tv;
@@ -1320,7 +1336,7 @@ struct compat_sel_arg_struct {
 	compat_uptr_t tvp;
 };

-asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg)
+COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
 {
 	struct compat_sel_arg_struct a;

@@ -1381,9 +1397,9 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	return ret;
 }

-asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
-	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
-	struct compat_timespec __user *tsp, void __user *sig)
+COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
+	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+	struct compat_timespec __user *, tsp, void __user *, sig)
 {
 	compat_size_t sigsetsize = 0;
 	compat_uptr_t up = 0;
@@ -1400,9 +1416,9 @@ asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
 				 sigsetsize);
 }

-asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
-	unsigned int nfds, struct compat_timespec __user *tsp,
-	const compat_sigset_t __user *sigmask, compat_size_t sigsetsize)
+COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
+	unsigned int, nfds, struct compat_timespec __user *, tsp,
+	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
 {
 	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
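COMPAT_SYSCALL_DEFINEn takes an interleaved list of types and parameter names and expands to a long-returning entry point whose 32-bit arguments are properly extended, which is why the conversions above can also narrow types such as unsigned long to compat_ulong_t. Below is a rough userspace model of the name/type interleaving only; the real macro in include/linux/compat.h does considerably more (argument extension, symbol aliases), and everything here is an assumption-laden sketch, not the kernel definition:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for COMPAT_SYSCALL_DEFINE2(name, t1, a1, t2, a2):
 * interleaved types and names become an ordinary parameter list. */
#define MODEL_SYSCALL_DEFINE2(name, t1, a1, t2, a2)	\
	long model_sys_##name(t1 a1, t2 a2)

typedef uint32_t compat_ulong_t;	/* always 32-bit on the compat ABI */

MODEL_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, compat_ulong_t, bufptr)
{
	/* a real implementation would widen bufptr into a kernel pointer */
	printf("fstatfs(fd=%u, buf=%#x)\n", fd, (unsigned)bufptr);
	return 0;
}

int main(void)
{
	return (int)model_sys_fstatfs(3, 0xdeadbeef);
}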
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index a81147e2e4ef..4d24d17bcfc1 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -88,6 +88,11 @@ static void cputime_to_compat_timeval(const cputime_t cputime,
 #define ELF_HWCAP	COMPAT_ELF_HWCAP
 #endif

+#ifdef COMPAT_ELF_HWCAP2
+#undef ELF_HWCAP2
+#define ELF_HWCAP2	COMPAT_ELF_HWCAP2
+#endif
+
 #ifdef COMPAT_ARCH_DLINFO
 #undef ARCH_DLINFO
 #define ARCH_DLINFO	COMPAT_ARCH_DLINFO
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 3881610b6438..e82289047272 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1538,9 +1538,10 @@ static int compat_ioctl_check_table(unsigned int xcmd)
 	return ioctl_pointer[i] == xcmd;
 }

-asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
-				unsigned long arg)
+COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
+		       compat_ulong_t, arg32)
 {
+	unsigned long arg = arg32;
 	struct fd f = fdget(fd);
 	int error = -EBADF;
 	if (!f.file)
diff --git a/fs/coredump.c b/fs/coredump.c
index e3ad709a4232..0b2528fb640e 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -73,10 +73,15 @@ static int expand_corename(struct core_name *cn, int size)
 static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
 {
 	int free, need;
+	va_list arg_copy;

 again:
 	free = cn->size - cn->used;
-	need = vsnprintf(cn->corename + cn->used, free, fmt, arg);
+
+	va_copy(arg_copy, arg);
+	need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy);
+	va_end(arg_copy);
+
 	if (need < free) {
 		cn->used += need;
 		return 0;
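The va_copy() fix above matters because cn_vprintf() can run vsnprintf() more than once on the same arguments when the buffer is grown: a va_list consumed by one vsnprintf() call is indeterminate afterwards, so each retry needs its own copy. A self-contained userspace demonstration of the same grow-and-retry pattern:

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

/* Grow-and-retry vsnprintf: every attempt formats from a va_copy of
 * the caller's va_list, which is exactly the bug fixed above. */
static char *vformat(const char *fmt, va_list args)
{
	size_t size = 8;
	char *buf = malloc(size);

	while (buf) {
		va_list args_copy;
		int need;

		va_copy(args_copy, args);
		need = vsnprintf(buf, size, fmt, args_copy);
		va_end(args_copy);

		if (need < 0) {
			free(buf);
			return NULL;
		}
		if ((size_t)need < size)
			return buf;		/* it fit */
		size = (size_t)need + 1;	/* retry with enough room */
		char *tmp = realloc(buf, size);
		if (!tmp)
			free(buf);
		buf = tmp;
	}
	return NULL;
}

static char *format(const char *fmt, ...)
{
	va_list args;
	char *s;

	va_start(args, fmt);
	s = vformat(fmt, args);
	va_end(args);
	return s;
}

int main(void)
{
	char *s = format("core.%s.%d", "a-fairly-long-name", 1234);
	if (s) {
		puts(s);
		free(s);
	}
	return 0;
}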
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 06610cf94d57..ddcfe590b8a8 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -195,8 +195,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
 			struct page *page = NULL;

 			if (blocknr + i < devsize) {
-				page = read_mapping_page_async(mapping, blocknr + i,
-					NULL);
+				page = read_mapping_page(mapping, blocknr + i, NULL);
 				/* synchronous error? */
 				if (IS_ERR(page))
 					page = NULL;
@@ -244,6 +243,7 @@ static void cramfs_kill_sb(struct super_block *sb)

 static int cramfs_remount(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_RDONLY;
 	return 0;
 }
diff --git a/fs/dcache.c b/fs/dcache.c
index ca02c13a84aa..be2bea834bf4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -246,16 +246,8 @@ static void __d_free(struct rcu_head *head)
246 kmem_cache_free(dentry_cache, dentry); 246 kmem_cache_free(dentry_cache, dentry);
247} 247}
248 248
249/* 249static void dentry_free(struct dentry *dentry)
250 * no locks, please.
251 */
252static void d_free(struct dentry *dentry)
253{ 250{
254 BUG_ON((int)dentry->d_lockref.count > 0);
255 this_cpu_dec(nr_dentry);
256 if (dentry->d_op && dentry->d_op->d_release)
257 dentry->d_op->d_release(dentry);
258
259 /* if dentry was never visible to RCU, immediate free is OK */ 251 /* if dentry was never visible to RCU, immediate free is OK */
260 if (!(dentry->d_flags & DCACHE_RCUACCESS)) 252 if (!(dentry->d_flags & DCACHE_RCUACCESS))
261 __d_free(&dentry->d_u.d_rcu); 253 __d_free(&dentry->d_u.d_rcu);
@@ -403,56 +395,6 @@ static void dentry_lru_add(struct dentry *dentry)
403 d_lru_add(dentry); 395 d_lru_add(dentry);
404} 396}
405 397
406/*
407 * Remove a dentry with references from the LRU.
408 *
409 * If we are on the shrink list, then we can get to try_prune_one_dentry() and
410 * lose our last reference through the parent walk. In this case, we need to
411 * remove ourselves from the shrink list, not the LRU.
412 */
413static void dentry_lru_del(struct dentry *dentry)
414{
415 if (dentry->d_flags & DCACHE_LRU_LIST) {
416 if (dentry->d_flags & DCACHE_SHRINK_LIST)
417 return d_shrink_del(dentry);
418 d_lru_del(dentry);
419 }
420}
421
422/**
423 * d_kill - kill dentry and return parent
424 * @dentry: dentry to kill
425 * @parent: parent dentry
426 *
427 * The dentry must already be unhashed and removed from the LRU.
428 *
429 * If this is the root of the dentry tree, return NULL.
430 *
431 * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by
432 * d_kill.
433 */
434static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
435 __releases(dentry->d_lock)
436 __releases(parent->d_lock)
437 __releases(dentry->d_inode->i_lock)
438{
439 list_del(&dentry->d_u.d_child);
440 /*
441 * Inform d_walk() that we are no longer attached to the
442 * dentry tree
443 */
444 dentry->d_flags |= DCACHE_DENTRY_KILLED;
445 if (parent)
446 spin_unlock(&parent->d_lock);
447 dentry_iput(dentry);
448 /*
449 * dentry_iput drops the locks, at which point nobody (except
450 * transient RCU lookups) can reach this dentry.
451 */
452 d_free(dentry);
453 return parent;
454}
455
456/** 398/**
457 * d_drop - drop a dentry 399 * d_drop - drop a dentry
458 * @dentry: dentry to drop 400 * @dentry: dentry to drop
@@ -499,37 +441,12 @@ void d_drop(struct dentry *dentry)
499} 441}
500EXPORT_SYMBOL(d_drop); 442EXPORT_SYMBOL(d_drop);
501 443
502/* 444static void __dentry_kill(struct dentry *dentry)
503 * Finish off a dentry we've decided to kill.
504 * dentry->d_lock must be held, returns with it unlocked.
505 * If ref is non-zero, then decrement the refcount too.
506 * Returns dentry requiring refcount drop, or NULL if we're done.
507 */
508static struct dentry *
509dentry_kill(struct dentry *dentry, int unlock_on_failure)
510 __releases(dentry->d_lock)
511{ 445{
512 struct inode *inode; 446 struct dentry *parent = NULL;
513 struct dentry *parent; 447 bool can_free = true;
514 448 if (!IS_ROOT(dentry))
515 inode = dentry->d_inode;
516 if (inode && !spin_trylock(&inode->i_lock)) {
517relock:
518 if (unlock_on_failure) {
519 spin_unlock(&dentry->d_lock);
520 cpu_relax();
521 }
522 return dentry; /* try again with same dentry */
523 }
524 if (IS_ROOT(dentry))
525 parent = NULL;
526 else
527 parent = dentry->d_parent; 449 parent = dentry->d_parent;
528 if (parent && !spin_trylock(&parent->d_lock)) {
529 if (inode)
530 spin_unlock(&inode->i_lock);
531 goto relock;
532 }
533 450
534 /* 451 /*
535 * The dentry is now unrecoverably dead to the world. 452 * The dentry is now unrecoverably dead to the world.
@@ -543,10 +460,103 @@ relock:
543 if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry)) 460 if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry))
544 dentry->d_op->d_prune(dentry); 461 dentry->d_op->d_prune(dentry);
545 462
546 dentry_lru_del(dentry); 463 if (dentry->d_flags & DCACHE_LRU_LIST) {
464 if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
465 d_lru_del(dentry);
466 }
547 /* if it was on the hash then remove it */ 467 /* if it was on the hash then remove it */
548 __d_drop(dentry); 468 __d_drop(dentry);
549 return d_kill(dentry, parent); 469 list_del(&dentry->d_u.d_child);
470 /*
471 * Inform d_walk() that we are no longer attached to the
472 * dentry tree
473 */
474 dentry->d_flags |= DCACHE_DENTRY_KILLED;
475 if (parent)
476 spin_unlock(&parent->d_lock);
477 dentry_iput(dentry);
478 /*
479 * dentry_iput drops the locks, at which point nobody (except
480 * transient RCU lookups) can reach this dentry.
481 */
482 BUG_ON((int)dentry->d_lockref.count > 0);
483 this_cpu_dec(nr_dentry);
484 if (dentry->d_op && dentry->d_op->d_release)
485 dentry->d_op->d_release(dentry);
486
487 spin_lock(&dentry->d_lock);
488 if (dentry->d_flags & DCACHE_SHRINK_LIST) {
489 dentry->d_flags |= DCACHE_MAY_FREE;
490 can_free = false;
491 }
492 spin_unlock(&dentry->d_lock);
493 if (likely(can_free))
494 dentry_free(dentry);
495}
496
497/*
498 * Finish off a dentry we've decided to kill.
499 * dentry->d_lock must be held, returns with it unlocked.
500 * If ref is non-zero, then decrement the refcount too.
501 * Returns dentry requiring refcount drop, or NULL if we're done.
502 */
503static struct dentry *dentry_kill(struct dentry *dentry)
504 __releases(dentry->d_lock)
505{
506 struct inode *inode = dentry->d_inode;
507 struct dentry *parent = NULL;
508
509 if (inode && unlikely(!spin_trylock(&inode->i_lock)))
510 goto failed;
511
512 if (!IS_ROOT(dentry)) {
513 parent = dentry->d_parent;
514 if (unlikely(!spin_trylock(&parent->d_lock))) {
515 if (inode)
516 spin_unlock(&inode->i_lock);
517 goto failed;
518 }
519 }
520
521 __dentry_kill(dentry);
522 return parent;
523
524failed:
525 spin_unlock(&dentry->d_lock);
526 cpu_relax();
527 return dentry; /* try again with same dentry */
528}
529
530static inline struct dentry *lock_parent(struct dentry *dentry)
531{
532 struct dentry *parent = dentry->d_parent;
533 if (IS_ROOT(dentry))
534 return NULL;
535 if (likely(spin_trylock(&parent->d_lock)))
536 return parent;
537 spin_unlock(&dentry->d_lock);
538 rcu_read_lock();
539again:
540 parent = ACCESS_ONCE(dentry->d_parent);
541 spin_lock(&parent->d_lock);
542 /*
543 * We can't blindly lock dentry until we are sure
544 * that we won't violate the locking order.
545 * Any changes of dentry->d_parent must have
546 * been done with parent->d_lock held, so
547 * spin_lock() above is enough of a barrier
548 * for checking if it's still our child.
549 */
550 if (unlikely(parent != dentry->d_parent)) {
551 spin_unlock(&parent->d_lock);
552 goto again;
553 }
554 rcu_read_unlock();
555 if (parent != dentry)
556 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
557 else
558 parent = NULL;
559 return parent;
550} 560}
551 561
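Note: lock_parent() above is the classic snapshot/lock/re-check idiom. A minimal user-space analogue, assuming nodes are never freed (the kernel version relies on RCU and lock nesting for that part, and also handles the caller-holds-d_lock and IS_ROOT cases this sketch omits):

#include <pthread.h>

struct node {
        struct node *parent;
        pthread_mutex_t lock;
};

/* Lock n's parent without holding n->lock; retry if n was reparented
 * between the unlocked snapshot and acquiring the candidate's lock. */
static struct node *lock_parent_sketch(struct node *n)
{
        for (;;) {
                struct node *p = n->parent;     /* unlocked snapshot */
                pthread_mutex_lock(&p->lock);
                if (p == n->parent)             /* still our parent: done */
                        return p;
                pthread_mutex_unlock(&p->lock); /* lost a race: retry */
        }
}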
552/* 562/*
@@ -602,7 +612,7 @@ repeat:
602 return; 612 return;
603 613
604kill_it: 614kill_it:
605 dentry = dentry_kill(dentry, 1); 615 dentry = dentry_kill(dentry);
606 if (dentry) 616 if (dentry)
607 goto repeat; 617 goto repeat;
608} 618}
@@ -815,64 +825,15 @@ restart:
815} 825}
816EXPORT_SYMBOL(d_prune_aliases); 826EXPORT_SYMBOL(d_prune_aliases);
817 827
818/*
819 * Try to throw away a dentry - free the inode, dput the parent.
820 * Requires dentry->d_lock is held, and dentry->d_count == 0.
821 * Releases dentry->d_lock.
822 *
823 * This may fail if locks cannot be acquired; no problem, just try again.
824 */
825static struct dentry * try_prune_one_dentry(struct dentry *dentry)
826 __releases(dentry->d_lock)
827{
828 struct dentry *parent;
829
830 parent = dentry_kill(dentry, 0);
831 /*
832 * If dentry_kill returns NULL, we have nothing more to do.
833 * if it returns the same dentry, trylocks failed. In either
834 * case, just loop again.
835 *
836 * Otherwise, we need to prune ancestors too. This is necessary
837 * to prevent quadratic behavior of shrink_dcache_parent(), but
838 * is also expected to be beneficial in reducing dentry cache
839 * fragmentation.
840 */
841 if (!parent)
842 return NULL;
843 if (parent == dentry)
844 return dentry;
845
846 /* Prune ancestors. */
847 dentry = parent;
848 while (dentry) {
849 if (lockref_put_or_lock(&dentry->d_lockref))
850 return NULL;
851 dentry = dentry_kill(dentry, 1);
852 }
853 return NULL;
854}
855
856static void shrink_dentry_list(struct list_head *list) 828static void shrink_dentry_list(struct list_head *list)
857{ 829{
858 struct dentry *dentry; 830 struct dentry *dentry, *parent;
859 831
860 rcu_read_lock(); 832 while (!list_empty(list)) {
861 for (;;) { 833 struct inode *inode;
862 dentry = list_entry_rcu(list->prev, struct dentry, d_lru); 834 dentry = list_entry(list->prev, struct dentry, d_lru);
863 if (&dentry->d_lru == list)
864 break; /* empty */
865
866 /*
867 * Get the dentry lock, and re-verify that the dentry is
868 * still on the shrinking list. If it is, we know that
869 * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set.
870 */
871 spin_lock(&dentry->d_lock); 835 spin_lock(&dentry->d_lock);
872 if (dentry != list_entry(list->prev, struct dentry, d_lru)) { 836 parent = lock_parent(dentry);
873 spin_unlock(&dentry->d_lock);
874 continue;
875 }
876 837
877 /* 838 /*
878 * The dispose list is isolated and dentries are not accounted 839 * The dispose list is isolated and dentries are not accounted
@@ -885,30 +846,63 @@ static void shrink_dentry_list(struct list_head *list)
885 * We found an inuse dentry which was not removed from 846 * We found an inuse dentry which was not removed from
886 * the LRU because of laziness during lookup. Do not free it. 847 * the LRU because of laziness during lookup. Do not free it.
887 */ 848 */
888 if (dentry->d_lockref.count) { 849 if ((int)dentry->d_lockref.count > 0) {
889 spin_unlock(&dentry->d_lock); 850 spin_unlock(&dentry->d_lock);
851 if (parent)
852 spin_unlock(&parent->d_lock);
890 continue; 853 continue;
891 } 854 }
892 rcu_read_unlock();
893 855
894 /*
895 * If 'try_to_prune()' returns a dentry, it will
896 * be the same one we passed in, and d_lock will
897 * have been held the whole time, so it will not
898 * have been added to any other lists. We failed
899 * to get the inode lock.
900 *
901 * We just add it back to the shrink list.
902 */
903 dentry = try_prune_one_dentry(dentry);
904 856
905 rcu_read_lock(); 857 if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) {
906 if (dentry) { 858 bool can_free = dentry->d_flags & DCACHE_MAY_FREE;
859 spin_unlock(&dentry->d_lock);
860 if (parent)
861 spin_unlock(&parent->d_lock);
862 if (can_free)
863 dentry_free(dentry);
864 continue;
865 }
866
867 inode = dentry->d_inode;
868 if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
907 d_shrink_add(dentry, list); 869 d_shrink_add(dentry, list);
908 spin_unlock(&dentry->d_lock); 870 spin_unlock(&dentry->d_lock);
871 if (parent)
872 spin_unlock(&parent->d_lock);
873 continue;
874 }
875
876 __dentry_kill(dentry);
877
878 /*
879 * We need to prune ancestors too. This is necessary to prevent
880 * quadratic behavior of shrink_dcache_parent(), but is also
881 * expected to be beneficial in reducing dentry cache
882 * fragmentation.
883 */
884 dentry = parent;
885 while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) {
886 parent = lock_parent(dentry);
887 if (dentry->d_lockref.count != 1) {
888 dentry->d_lockref.count--;
889 spin_unlock(&dentry->d_lock);
890 if (parent)
891 spin_unlock(&parent->d_lock);
892 break;
893 }
894 inode = dentry->d_inode; /* can't be NULL */
895 if (unlikely(!spin_trylock(&inode->i_lock))) {
896 spin_unlock(&dentry->d_lock);
897 if (parent)
898 spin_unlock(&parent->d_lock);
899 cpu_relax();
900 continue;
901 }
902 __dentry_kill(dentry);
903 dentry = parent;
909 } 904 }
910 } 905 }
911 rcu_read_unlock();
912} 906}
913 907
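Note (assumed contract, not in the patch): the ancestor loop above leans on lockref_put_or_lock() semantics — it returns non-zero after merely dropping one of several references, and returns 0 with d_lock held when the caller held the last reference, i.e. the dentry is now safe to kill.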
914static enum lru_status 908static enum lru_status
@@ -1261,34 +1255,23 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
1261 if (data->start == dentry) 1255 if (data->start == dentry)
1262 goto out; 1256 goto out;
1263 1257
1264 /* 1258 if (dentry->d_flags & DCACHE_SHRINK_LIST) {
1265 * move only zero ref count dentries to the dispose list.
1266 *
1267 * Those which are presently on the shrink list, being processed
1268 * by shrink_dentry_list(), shouldn't be moved. Otherwise the
1269 * loop in shrink_dcache_parent() might not make any progress
1270 * and loop forever.
1271 */
1272 if (dentry->d_lockref.count) {
1273 dentry_lru_del(dentry);
1274 } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
1275 /*
1276 * We can't use d_lru_shrink_move() because we
1277 * need to get the global LRU lock and do the
1278 * LRU accounting.
1279 */
1280 d_lru_del(dentry);
1281 d_shrink_add(dentry, &data->dispose);
1282 data->found++; 1259 data->found++;
1283 ret = D_WALK_NORETRY; 1260 } else {
1261 if (dentry->d_flags & DCACHE_LRU_LIST)
1262 d_lru_del(dentry);
1263 if (!dentry->d_lockref.count) {
1264 d_shrink_add(dentry, &data->dispose);
1265 data->found++;
1266 }
1284 } 1267 }
1285 /* 1268 /*
1286 * We can return to the caller if we have found some (this 1269 * We can return to the caller if we have found some (this
1287 * ensures forward progress). We'll be coming back to find 1270 * ensures forward progress). We'll be coming back to find
1288 * the rest. 1271 * the rest.
1289 */ 1272 */
1290 if (data->found && need_resched()) 1273 if (!list_empty(&data->dispose))
1291 ret = D_WALK_QUIT; 1274 ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
1292out: 1275out:
1293 return ret; 1276 return ret;
1294} 1277}
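Note (annotation): the rewritten tail of select_collect() encodes a simple forward-progress policy — keep walking while the dispose list is empty; once something has been collected, quit early if a reschedule is pending, otherwise finish the pass without rename_lock retries (D_WALK_NORETRY).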
@@ -1318,45 +1301,35 @@ void shrink_dcache_parent(struct dentry *parent)
1318} 1301}
1319EXPORT_SYMBOL(shrink_dcache_parent); 1302EXPORT_SYMBOL(shrink_dcache_parent);
1320 1303
1321static enum d_walk_ret umount_collect(void *_data, struct dentry *dentry) 1304static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
1322{ 1305{
1323 struct select_data *data = _data; 1306 /* it has busy descendants; complain about those instead */
1324 enum d_walk_ret ret = D_WALK_CONTINUE; 1307 if (!list_empty(&dentry->d_subdirs))
1308 return D_WALK_CONTINUE;
1325 1309
1326 if (dentry->d_lockref.count) { 1310 /* root with refcount 1 is fine */
1327 dentry_lru_del(dentry); 1311 if (dentry == _data && dentry->d_lockref.count == 1)
1328 if (likely(!list_empty(&dentry->d_subdirs))) 1312 return D_WALK_CONTINUE;
1329 goto out; 1313
1330 if (dentry == data->start && dentry->d_lockref.count == 1) 1314 printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} "
1331 goto out; 1315 " still in use (%d) [unmount of %s %s]\n",
1332 printk(KERN_ERR
1333 "BUG: Dentry %p{i=%lx,n=%s}"
1334 " still in use (%d)"
1335 " [unmount of %s %s]\n",
1336 dentry, 1316 dentry,
1337 dentry->d_inode ? 1317 dentry->d_inode ?
1338 dentry->d_inode->i_ino : 0UL, 1318 dentry->d_inode->i_ino : 0UL,
1339 dentry->d_name.name, 1319 dentry,
1340 dentry->d_lockref.count, 1320 dentry->d_lockref.count,
1341 dentry->d_sb->s_type->name, 1321 dentry->d_sb->s_type->name,
1342 dentry->d_sb->s_id); 1322 dentry->d_sb->s_id);
1343 BUG(); 1323 WARN_ON(1);
1344 } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { 1324 return D_WALK_CONTINUE;
1345 /* 1325}
1346 * We can't use d_lru_shrink_move() because we 1326
1347 * need to get the global LRU lock and do the 1327static void do_one_tree(struct dentry *dentry)
1348 * LRU accounting. 1328{
1349 */ 1329 shrink_dcache_parent(dentry);
1350 if (dentry->d_flags & DCACHE_LRU_LIST) 1330 d_walk(dentry, dentry, umount_check, NULL);
1351 d_lru_del(dentry); 1331 d_drop(dentry);
1352 d_shrink_add(dentry, &data->dispose); 1332 dput(dentry);
1353 data->found++;
1354 ret = D_WALK_NORETRY;
1355 }
1356out:
1357 if (data->found && need_resched())
1358 ret = D_WALK_QUIT;
1359 return ret;
1360} 1333}
1361 1334
1362/* 1335/*
@@ -1366,40 +1339,15 @@ void shrink_dcache_for_umount(struct super_block *sb)
1366{ 1339{
1367 struct dentry *dentry; 1340 struct dentry *dentry;
1368 1341
1369 if (down_read_trylock(&sb->s_umount)) 1342 WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked");
1370 BUG();
1371 1343
1372 dentry = sb->s_root; 1344 dentry = sb->s_root;
1373 sb->s_root = NULL; 1345 sb->s_root = NULL;
1374 for (;;) { 1346 do_one_tree(dentry);
1375 struct select_data data;
1376
1377 INIT_LIST_HEAD(&data.dispose);
1378 data.start = dentry;
1379 data.found = 0;
1380
1381 d_walk(dentry, &data, umount_collect, NULL);
1382 if (!data.found)
1383 break;
1384
1385 shrink_dentry_list(&data.dispose);
1386 cond_resched();
1387 }
1388 d_drop(dentry);
1389 dput(dentry);
1390 1347
1391 while (!hlist_bl_empty(&sb->s_anon)) { 1348 while (!hlist_bl_empty(&sb->s_anon)) {
1392 struct select_data data; 1349 dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash));
1393 dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash); 1350 do_one_tree(dentry);
1394
1395 INIT_LIST_HEAD(&data.dispose);
1396 data.start = NULL;
1397 data.found = 0;
1398
1399 d_walk(dentry, &data, umount_collect, NULL);
1400 if (data.found)
1401 shrink_dentry_list(&data.dispose);
1402 cond_resched();
1403 } 1351 }
1404} 1352}
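Note (annotation): the dget() in the s_anon loop is there because do_one_tree() ends in d_drop()+dput() — each anchor dentry pulled off the list must carry an extra reference going in.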
1405 1353
@@ -1647,8 +1595,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
1647 unsigned add_flags = d_flags_for_inode(inode); 1595 unsigned add_flags = d_flags_for_inode(inode);
1648 1596
1649 spin_lock(&dentry->d_lock); 1597 spin_lock(&dentry->d_lock);
1650 dentry->d_flags &= ~DCACHE_ENTRY_TYPE; 1598 __d_set_type(dentry, add_flags);
1651 dentry->d_flags |= add_flags;
1652 if (inode) 1599 if (inode)
1653 hlist_add_head(&dentry->d_alias, &inode->i_dentry); 1600 hlist_add_head(&dentry->d_alias, &inode->i_dentry);
1654 dentry->d_inode = inode; 1601 dentry->d_inode = inode;
@@ -2483,12 +2430,14 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
2483 dentry->d_name.name = dentry->d_iname; 2430 dentry->d_name.name = dentry->d_iname;
2484 } else { 2431 } else {
2485 /* 2432 /*
2486 * Both are internal. Just copy target to dentry 2433 * Both are internal.
2487 */ 2434 */
2488 memcpy(dentry->d_iname, target->d_name.name, 2435 unsigned int i;
2489 target->d_name.len + 1); 2436 BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
2490 dentry->d_name.len = target->d_name.len; 2437 for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
2491 return; 2438 swap(((long *) &dentry->d_iname)[i],
2439 ((long *) &target->d_iname)[i]);
2440 }
2492 } 2441 }
2493 } 2442 }
2494 swap(dentry->d_name.len, target->d_name.len); 2443 swap(dentry->d_name.len, target->d_name.len);
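Note: the new branch swaps the two inline names long-word at a time instead of copying one over the other, since an exchange must preserve both. A self-contained sketch of that swap, assuming both buffers are DNAME_INLINE_LEN bytes and long-aligned (the BUILD_BUG_ON above enforces the kernel's version of that assumption):

#include <stddef.h>

#define DNAME_INLINE_LEN 32u    /* illustrative size only */

static void swap_inline_names(char *a, char *b)
{
        size_t i;

        for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
                long tmp = ((long *)a)[i];

                ((long *)a)[i] = ((long *)b)[i];
                ((long *)b)[i] = tmp;
        }
}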
@@ -2545,13 +2494,15 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry,
2545 * __d_move - move a dentry 2494 * __d_move - move a dentry
2546 * @dentry: entry to move 2495 * @dentry: entry to move
2547 * @target: new dentry 2496 * @target: new dentry
2497 * @exchange: exchange the two dentries
2548 * 2498 *
2549 * Update the dcache to reflect the move of a file name. Negative 2499 * Update the dcache to reflect the move of a file name. Negative
2550 * dcache entries should not be moved in this way. Caller must hold 2500 * dcache entries should not be moved in this way. Caller must hold
2551 * rename_lock, the i_mutex of the source and target directories, 2501 * rename_lock, the i_mutex of the source and target directories,
2552 * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). 2502 * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
2553 */ 2503 */
2554static void __d_move(struct dentry * dentry, struct dentry * target) 2504static void __d_move(struct dentry *dentry, struct dentry *target,
2505 bool exchange)
2555{ 2506{
2556 if (!dentry->d_inode) 2507 if (!dentry->d_inode)
2557 printk(KERN_WARNING "VFS: moving negative dcache entry\n"); 2508 printk(KERN_WARNING "VFS: moving negative dcache entry\n");
@@ -2573,8 +2524,15 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
2573 __d_drop(dentry); 2524 __d_drop(dentry);
2574 __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash)); 2525 __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
2575 2526
2576 /* Unhash the target: dput() will then get rid of it */ 2527 /*
2528 * Unhash the target (d_delete() is not usable here). If exchanging
2529 * the two dentries, then rehash onto the other's hash queue.
2530 */
2577 __d_drop(target); 2531 __d_drop(target);
2532 if (exchange) {
2533 __d_rehash(target,
2534 d_hash(dentry->d_parent, dentry->d_name.hash));
2535 }
2578 2536
2579 list_del(&dentry->d_u.d_child); 2537 list_del(&dentry->d_u.d_child);
2580 list_del(&target->d_u.d_child); 2538 list_del(&target->d_u.d_child);
@@ -2601,6 +2559,8 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
2601 write_seqcount_end(&dentry->d_seq); 2559 write_seqcount_end(&dentry->d_seq);
2602 2560
2603 dentry_unlock_parents_for_move(dentry, target); 2561 dentry_unlock_parents_for_move(dentry, target);
2562 if (exchange)
2563 fsnotify_d_move(target);
2604 spin_unlock(&target->d_lock); 2564 spin_unlock(&target->d_lock);
2605 fsnotify_d_move(dentry); 2565 fsnotify_d_move(dentry);
2606 spin_unlock(&dentry->d_lock); 2566 spin_unlock(&dentry->d_lock);
@@ -2618,11 +2578,30 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
2618void d_move(struct dentry *dentry, struct dentry *target) 2578void d_move(struct dentry *dentry, struct dentry *target)
2619{ 2579{
2620 write_seqlock(&rename_lock); 2580 write_seqlock(&rename_lock);
2621 __d_move(dentry, target); 2581 __d_move(dentry, target, false);
2622 write_sequnlock(&rename_lock); 2582 write_sequnlock(&rename_lock);
2623} 2583}
2624EXPORT_SYMBOL(d_move); 2584EXPORT_SYMBOL(d_move);
2625 2585
2586/*
2587 * d_exchange - exchange two dentries
2588 * @dentry1: first dentry
2589 * @dentry2: second dentry
2590 */
2591void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
2592{
2593 write_seqlock(&rename_lock);
2594
2595 WARN_ON(!dentry1->d_inode);
2596 WARN_ON(!dentry2->d_inode);
2597 WARN_ON(IS_ROOT(dentry1));
2598 WARN_ON(IS_ROOT(dentry2));
2599
2600 __d_move(dentry1, dentry2, true);
2601
2602 write_sequnlock(&rename_lock);
2603}
2604
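Note: d_exchange() is the dcache half of RENAME_EXCHANGE. From user space the feature is reached through renameat2(); a minimal sketch, assuming glibc 2.28+ declares it (older systems go through syscall(SYS_renameat2, ...)):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
        /* Atomically swap the two names; both paths must already exist. */
        if (renameat2(AT_FDCWD, "a", AT_FDCWD, "b", RENAME_EXCHANGE) != 0)
                perror("renameat2");
        return 0;
}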
2626/** 2605/**
2627 * d_ancestor - search for an ancestor 2606 * d_ancestor - search for an ancestor
2628 * @p1: ancestor dentry 2607 * @p1: ancestor dentry
@@ -2670,7 +2649,7 @@ static struct dentry *__d_unalias(struct inode *inode,
2670 m2 = &alias->d_parent->d_inode->i_mutex; 2649 m2 = &alias->d_parent->d_inode->i_mutex;
2671out_unalias: 2650out_unalias:
2672 if (likely(!d_mountpoint(alias))) { 2651 if (likely(!d_mountpoint(alias))) {
2673 __d_move(alias, dentry); 2652 __d_move(alias, dentry, false);
2674 ret = alias; 2653 ret = alias;
2675 } 2654 }
2676out_err: 2655out_err:
@@ -3112,6 +3091,7 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
3112 end = ERR_PTR(-ENAMETOOLONG); 3091 end = ERR_PTR(-ENAMETOOLONG);
3113 return end; 3092 return end;
3114} 3093}
3094EXPORT_SYMBOL(simple_dname);
3115 3095
3116/* 3096/*
3117 * Write full pathname from the root of the filesystem into the buffer. 3097 * Write full pathname from the root of the filesystem into the buffer.
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 9c0444cccbe1..8c41b52da358 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -218,6 +218,7 @@ static int debugfs_remount(struct super_block *sb, int *flags, char *data)
218 int err; 218 int err;
219 struct debugfs_fs_info *fsi = sb->s_fs_info; 219 struct debugfs_fs_info *fsi = sb->s_fs_info;
220 220
221 sync_filesystem(sb);
221 err = debugfs_parse_options(data, &fsi->mount_opts); 222 err = debugfs_parse_options(data, &fsi->mount_opts);
222 if (err) 223 if (err)
223 goto fail; 224 goto fail;
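Note (annotation): this is the first of several hunks in this merge adding the same line — each affected ->remount_fs instance (debugfs here; devpts, efs, and ext2 below) now begins with sync_filesystem(sb), so dirty data reaches the backing store before mount options change.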
@@ -358,7 +359,7 @@ exit:
358 * @name: a pointer to a string containing the name of the file to create. 359 * @name: a pointer to a string containing the name of the file to create.
359 * @mode: the permission that the file should have. 360 * @mode: the permission that the file should have.
360 * @parent: a pointer to the parent dentry for this file. This should be a 361 * @parent: a pointer to the parent dentry for this file. This should be a
361 * directory dentry if set. If this paramater is NULL, then the 362 * directory dentry if set. If this parameter is NULL, then the
362 * file will be created in the root of the debugfs filesystem. 363 * file will be created in the root of the debugfs filesystem.
363 * @data: a pointer to something that the caller will want to get to later 364 * @data: a pointer to something that the caller will want to get to later
364 * on. The inode.i_private pointer will point to this value on 365 * on. The inode.i_private pointer will point to this value on
@@ -400,7 +401,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
400 * @name: a pointer to a string containing the name of the directory to 401 * @name: a pointer to a string containing the name of the directory to
401 * create. 402 * create.
402 * @parent: a pointer to the parent dentry for this file. This should be a 403 * @parent: a pointer to the parent dentry for this file. This should be a
403 * directory dentry if set. If this paramater is NULL, then the 404 * directory dentry if set. If this parameter is NULL, then the
404 * directory will be created in the root of the debugfs filesystem. 405 * directory will be created in the root of the debugfs filesystem.
405 * 406 *
406 * This function creates a directory in debugfs with the given name. 407 * This function creates a directory in debugfs with the given name.
@@ -425,7 +426,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
425 * @name: a pointer to a string containing the name of the symbolic link to 426 * @name: a pointer to a string containing the name of the symbolic link to
426 * create. 427 * create.
427 * @parent: a pointer to the parent dentry for this symbolic link. This 428 * @parent: a pointer to the parent dentry for this symbolic link. This
428 * should be a directory dentry if set. If this paramater is NULL, 429 * should be a directory dentry if set. If this parameter is NULL,
429 * then the symbolic link will be created in the root of the debugfs 430 * then the symbolic link will be created in the root of the debugfs
430 * filesystem. 431 * filesystem.
431 * @target: a pointer to a string containing the path to the target of the 432 * @target: a pointer to a string containing the path to the target of the
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index a726b9f29cb7..c71038079b47 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -313,6 +313,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
313 struct pts_fs_info *fsi = DEVPTS_SB(sb); 313 struct pts_fs_info *fsi = DEVPTS_SB(sb);
314 struct pts_mount_opts *opts = &fsi->mount_opts; 314 struct pts_mount_opts *opts = &fsi->mount_opts;
315 315
316 sync_filesystem(sb);
316 err = parse_mount_options(data, PARSE_REMOUNT, opts); 317 err = parse_mount_options(data, PARSE_REMOUNT, opts);
317 318
318 /* 319 /*
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 160a5489a939..31ba0935e32e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -664,7 +664,6 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
664 goto out; 664 goto out;
665 sector = start_sector << (sdio->blkbits - 9); 665 sector = start_sector << (sdio->blkbits - 9);
666 nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); 666 nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
667 nr_pages = min(nr_pages, BIO_MAX_PAGES);
668 BUG_ON(nr_pages <= 0); 667 BUG_ON(nr_pages <= 0);
669 dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); 668 dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
670 sdio->boundary = 0; 669 sdio->boundary = 0;
@@ -1194,13 +1193,19 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1194 } 1193 }
1195 1194
1196 /* 1195 /*
1197 * For file extending writes updating i_size before data 1196 * For file extending writes updating i_size before data writeouts
1198 * writeouts complete can expose uninitialized blocks. So 1197 * complete can expose uninitialized blocks in dumb filesystems.
1199 * even for AIO, we need to wait for i/o to complete before 1198 * In that case we need to wait for I/O completion even if asked
1200 * returning in this case. 1199 * for an asynchronous write.
1201 */ 1200 */
1202 dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && 1201 if (is_sync_kiocb(iocb))
1203 (end > i_size_read(inode))); 1202 dio->is_async = false;
1203 else if (!(dio->flags & DIO_ASYNC_EXTEND) &&
1204 (rw & WRITE) && end > i_size_read(inode))
1205 dio->is_async = false;
1206 else
1207 dio->is_async = true;
1208
1204 dio->inode = inode; 1209 dio->inode = inode;
1205 dio->rw = rw; 1210 dio->rw = rw;
1206 1211
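Note: the restructured is_async selection reduces to a small predicate. A sketch with the flag test passed in as a plain bool (DIO_ASYNC_EXTEND is the fs.h flag that lets a filesystem opt in to asynchronous size-extending writes):

#include <stdbool.h>

static bool dio_write_is_async(bool sync_kiocb, bool is_write,
                               bool extends_i_size, bool fs_handles_extend)
{
        if (sync_kiocb)
                return false;   /* caller waits anyway */
        if (is_write && extends_i_size && !fs_handles_extend)
                return false;   /* don't expose uninitialized blocks */
        return true;
}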
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 0e90f0c91b93..dcea1e37a1b7 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -14,6 +14,7 @@
14#include "dlm_internal.h" 14#include "dlm_internal.h"
15#include "lock.h" 15#include "lock.h"
16#include "user.h" 16#include "user.h"
17#include "ast.h"
17 18
18static uint64_t dlm_cb_seq; 19static uint64_t dlm_cb_seq;
19static DEFINE_SPINLOCK(dlm_cb_seq_spin); 20static DEFINE_SPINLOCK(dlm_cb_seq_spin);
@@ -308,6 +309,6 @@ void dlm_callback_resume(struct dlm_ls *ls)
308 mutex_unlock(&ls->ls_cb_mutex); 309 mutex_unlock(&ls->ls_cb_mutex);
309 310
310 if (count) 311 if (count)
311 log_debug(ls, "dlm_callback_resume %d", count); 312 log_rinfo(ls, "dlm_callback_resume %d", count);
312} 313}
313 314
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 278a75cda446..d975851a7e1e 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -68,7 +68,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
68 uint16_t namelen; 68 uint16_t namelen;
69 unsigned int count = 0, count_match = 0, count_bad = 0, count_add = 0; 69 unsigned int count = 0, count_match = 0, count_bad = 0, count_add = 0;
70 70
71 log_debug(ls, "dlm_recover_directory"); 71 log_rinfo(ls, "dlm_recover_directory");
72 72
73 if (dlm_no_directory(ls)) 73 if (dlm_no_directory(ls))
74 goto out_status; 74 goto out_status;
@@ -189,7 +189,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
189 error = 0; 189 error = 0;
190 dlm_set_recover_status(ls, DLM_RS_DIR); 190 dlm_set_recover_status(ls, DLM_RS_DIR);
191 191
192 log_debug(ls, "dlm_recover_directory %u in %u new", 192 log_rinfo(ls, "dlm_recover_directory %u in %u new",
193 count, count_add); 193 count, count_add);
194 out_free: 194 out_free:
195 kfree(last_name); 195 kfree(last_name);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index e7665c31f7b1..5eff6ea3e27f 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -65,6 +65,8 @@ struct dlm_mhandle;
65 printk(KERN_ERR "dlm: "fmt"\n" , ##args) 65 printk(KERN_ERR "dlm: "fmt"\n" , ##args)
66#define log_error(ls, fmt, args...) \ 66#define log_error(ls, fmt, args...) \
67 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) 67 printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
68#define log_rinfo(ls, fmt, args...) \
69 printk(KERN_INFO "dlm: %s: " fmt "\n", (ls)->ls_name , ##args);
68 70
69#define log_debug(ls, fmt, args...) \ 71#define log_debug(ls, fmt, args...) \
70do { \ 72do { \
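Note (annotation): the dlm hunks in this merge promote recovery-progress messages from log_debug() to the new log_rinfo(), so they land at KERN_INFO without debug logging enabled. As committed, the macro body ends in a stray semicolon, so "if (x) log_rinfo(...); else ..." expands to two statements and fails to compile. A more defensive form — log_rinfo_safe is a hypothetical name, not in the tree — would wrap the body:

#define log_rinfo_safe(ls, fmt, args...) \
do { \
        printk(KERN_INFO "dlm: %s: " fmt "\n", (ls)->ls_name, ##args); \
} while (0)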
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index e223a911a834..83f3d5520307 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -687,6 +687,7 @@ static int find_rsb_dir(struct dlm_ls *ls, char *name, int len,
687 log_error(ls, "find_rsb new from_other %d dir %d our %d %s", 687 log_error(ls, "find_rsb new from_other %d dir %d our %d %s",
688 from_nodeid, dir_nodeid, our_nodeid, r->res_name); 688 from_nodeid, dir_nodeid, our_nodeid, r->res_name);
689 dlm_free_rsb(r); 689 dlm_free_rsb(r);
690 r = NULL;
690 error = -ENOTBLK; 691 error = -ENOTBLK;
691 goto out_unlock; 692 goto out_unlock;
692 } 693 }
@@ -5462,7 +5463,7 @@ void dlm_recover_purge(struct dlm_ls *ls)
5462 up_write(&ls->ls_root_sem); 5463 up_write(&ls->ls_root_sem);
5463 5464
5464 if (lkb_count) 5465 if (lkb_count)
5465 log_debug(ls, "dlm_recover_purge %u locks for %u nodes", 5466 log_rinfo(ls, "dlm_recover_purge %u locks for %u nodes",
5466 lkb_count, nodes_count); 5467 lkb_count, nodes_count);
5467} 5468}
5468 5469
@@ -5536,7 +5537,7 @@ void dlm_recover_grant(struct dlm_ls *ls)
5536 } 5537 }
5537 5538
5538 if (lkb_count) 5539 if (lkb_count)
5539 log_debug(ls, "dlm_recover_grant %u locks on %u resources", 5540 log_rinfo(ls, "dlm_recover_grant %u locks on %u resources",
5540 lkb_count, rsb_count); 5541 lkb_count, rsb_count);
5541} 5542}
5542 5543
@@ -5695,7 +5696,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
5695 put_rsb(r); 5696 put_rsb(r);
5696 out: 5697 out:
5697 if (error && error != -EEXIST) 5698 if (error && error != -EEXIST)
5698 log_debug(ls, "dlm_recover_master_copy remote %d %x error %d", 5699 log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d",
5699 from_nodeid, remid, error); 5700 from_nodeid, remid, error);
5700 rl->rl_result = cpu_to_le32(error); 5701 rl->rl_result = cpu_to_le32(error);
5701 return error; 5702 return error;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d5abafd56a6d..04d6398c1f1c 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -190,7 +190,7 @@ static int do_uevent(struct dlm_ls *ls, int in)
190 else 190 else
191 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE); 191 kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
192 192
193 log_debug(ls, "%s the lockspace group...", in ? "joining" : "leaving"); 193 log_rinfo(ls, "%s the lockspace group...", in ? "joining" : "leaving");
194 194
195 /* dlm_controld will see the uevent, do the necessary group management 195 /* dlm_controld will see the uevent, do the necessary group management
196 and then write to sysfs to wake us */ 196 and then write to sysfs to wake us */
@@ -198,7 +198,7 @@ static int do_uevent(struct dlm_ls *ls, int in)
198 error = wait_event_interruptible(ls->ls_uevent_wait, 198 error = wait_event_interruptible(ls->ls_uevent_wait,
199 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags)); 199 test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
200 200
201 log_debug(ls, "group event done %d %d", error, ls->ls_uevent_result); 201 log_rinfo(ls, "group event done %d %d", error, ls->ls_uevent_result);
202 202
203 if (error) 203 if (error)
204 goto out; 204 goto out;
@@ -640,7 +640,7 @@ static int new_lockspace(const char *name, const char *cluster,
640 640
641 dlm_create_debug_file(ls); 641 dlm_create_debug_file(ls);
642 642
643 log_debug(ls, "join complete"); 643 log_rinfo(ls, "join complete");
644 *lockspace = ls; 644 *lockspace = ls;
645 return 0; 645 return 0;
646 646
@@ -835,7 +835,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
835 dlm_clear_members(ls); 835 dlm_clear_members(ls);
836 dlm_clear_members_gone(ls); 836 dlm_clear_members_gone(ls);
837 kfree(ls->ls_node_array); 837 kfree(ls->ls_node_array);
838 log_debug(ls, "release_lockspace final free"); 838 log_rinfo(ls, "release_lockspace final free");
839 kobject_put(&ls->ls_kobj); 839 kobject_put(&ls->ls_kobj);
840 /* The ls structure will be freed when the kobject is done with */ 840 /* The ls structure will be freed when the kobject is done with */
841 841
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3190ca973dd6..1e5b45359509 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -424,7 +424,7 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
424} 424}
425 425
426/* Data available on socket or listen socket received a connect */ 426/* Data available on socket or listen socket received a connect */
427static void lowcomms_data_ready(struct sock *sk, int count_unused) 427static void lowcomms_data_ready(struct sock *sk)
428{ 428{
429 struct connection *con = sock2con(sk); 429 struct connection *con = sock2con(sk);
430 if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags)) 430 if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags))
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 476557b54921..9c47f1c14a8b 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -60,18 +60,15 @@ void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc)
60 60
61#define SLOT_DEBUG_LINE 128 61#define SLOT_DEBUG_LINE 128
62 62
63static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots, 63static void log_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
64 struct rcom_slot *ro0, struct dlm_slot *array, 64 struct rcom_slot *ro0, struct dlm_slot *array,
65 int array_size) 65 int array_size)
66{ 66{
67 char line[SLOT_DEBUG_LINE]; 67 char line[SLOT_DEBUG_LINE];
68 int len = SLOT_DEBUG_LINE - 1; 68 int len = SLOT_DEBUG_LINE - 1;
69 int pos = 0; 69 int pos = 0;
70 int ret, i; 70 int ret, i;
71 71
72 if (!dlm_config.ci_log_debug)
73 return;
74
75 memset(line, 0, sizeof(line)); 72 memset(line, 0, sizeof(line));
76 73
77 if (array) { 74 if (array) {
@@ -95,7 +92,7 @@ static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
95 } 92 }
96 } 93 }
97 94
98 log_debug(ls, "generation %u slots %d%s", gen, num_slots, line); 95 log_rinfo(ls, "generation %u slots %d%s", gen, num_slots, line);
99} 96}
100 97
101int dlm_slots_copy_in(struct dlm_ls *ls) 98int dlm_slots_copy_in(struct dlm_ls *ls)
@@ -129,7 +126,7 @@ int dlm_slots_copy_in(struct dlm_ls *ls)
129 ro->ro_slot = le16_to_cpu(ro->ro_slot); 126 ro->ro_slot = le16_to_cpu(ro->ro_slot);
130 } 127 }
131 128
132 log_debug_slots(ls, gen, num_slots, ro0, NULL, 0); 129 log_slots(ls, gen, num_slots, ro0, NULL, 0);
133 130
134 list_for_each_entry(memb, &ls->ls_nodes, list) { 131 list_for_each_entry(memb, &ls->ls_nodes, list) {
135 for (i = 0, ro = ro0; i < num_slots; i++, ro++) { 132 for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
@@ -274,7 +271,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
274 271
275 gen++; 272 gen++;
276 273
277 log_debug_slots(ls, gen, num, NULL, array, array_size); 274 log_slots(ls, gen, num, NULL, array, array_size);
278 275
279 max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) - 276 max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) -
280 sizeof(struct rcom_config)) / sizeof(struct rcom_slot); 277 sizeof(struct rcom_config)) / sizeof(struct rcom_slot);
@@ -447,7 +444,7 @@ static int ping_members(struct dlm_ls *ls)
447 break; 444 break;
448 } 445 }
449 if (error) 446 if (error)
450 log_debug(ls, "ping_members aborted %d last nodeid %d", 447 log_rinfo(ls, "ping_members aborted %d last nodeid %d",
451 error, ls->ls_recover_nodeid); 448 error, ls->ls_recover_nodeid);
452 return error; 449 return error;
453} 450}
@@ -539,7 +536,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
539 count as a negative change so the "neg" recovery steps will happen */ 536 count as a negative change so the "neg" recovery steps will happen */
540 537
541 list_for_each_entry(memb, &ls->ls_nodes_gone, list) { 538 list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
542 log_debug(ls, "prev removed member %d", memb->nodeid); 539 log_rinfo(ls, "prev removed member %d", memb->nodeid);
543 neg++; 540 neg++;
544 } 541 }
545 542
@@ -551,10 +548,10 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
551 continue; 548 continue;
552 549
553 if (!node) { 550 if (!node) {
554 log_debug(ls, "remove member %d", memb->nodeid); 551 log_rinfo(ls, "remove member %d", memb->nodeid);
555 } else { 552 } else {
556 /* removed and re-added */ 553 /* removed and re-added */
557 log_debug(ls, "remove member %d comm_seq %u %u", 554 log_rinfo(ls, "remove member %d comm_seq %u %u",
558 memb->nodeid, memb->comm_seq, node->comm_seq); 555 memb->nodeid, memb->comm_seq, node->comm_seq);
559 } 556 }
560 557
@@ -571,7 +568,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
571 if (dlm_is_member(ls, node->nodeid)) 568 if (dlm_is_member(ls, node->nodeid))
572 continue; 569 continue;
573 dlm_add_member(ls, node); 570 dlm_add_member(ls, node);
574 log_debug(ls, "add member %d", node->nodeid); 571 log_rinfo(ls, "add member %d", node->nodeid);
575 } 572 }
576 573
577 list_for_each_entry(memb, &ls->ls_nodes, list) { 574 list_for_each_entry(memb, &ls->ls_nodes, list) {
@@ -591,7 +588,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
591 complete(&ls->ls_members_done); 588 complete(&ls->ls_members_done);
592 } 589 }
593 590
594 log_debug(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); 591 log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
595 return error; 592 return error;
596} 593}
597 594
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index a6bc63f6e31b..eaea789bf97d 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -526,7 +526,7 @@ int dlm_recover_masters(struct dlm_ls *ls)
526 int nodir = dlm_no_directory(ls); 526 int nodir = dlm_no_directory(ls);
527 int error; 527 int error;
528 528
529 log_debug(ls, "dlm_recover_masters"); 529 log_rinfo(ls, "dlm_recover_masters");
530 530
531 down_read(&ls->ls_root_sem); 531 down_read(&ls->ls_root_sem);
532 list_for_each_entry(r, &ls->ls_root_list, res_root_list) { 532 list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
@@ -552,7 +552,7 @@ int dlm_recover_masters(struct dlm_ls *ls)
552 } 552 }
553 up_read(&ls->ls_root_sem); 553 up_read(&ls->ls_root_sem);
554 554
555 log_debug(ls, "dlm_recover_masters %u of %u", count, total); 555 log_rinfo(ls, "dlm_recover_masters %u of %u", count, total);
556 556
557 error = dlm_wait_function(ls, &recover_idr_empty); 557 error = dlm_wait_function(ls, &recover_idr_empty);
558 out: 558 out:
@@ -685,7 +685,7 @@ int dlm_recover_locks(struct dlm_ls *ls)
685 } 685 }
686 up_read(&ls->ls_root_sem); 686 up_read(&ls->ls_root_sem);
687 687
688 log_debug(ls, "dlm_recover_locks %d out", count); 688 log_rinfo(ls, "dlm_recover_locks %d out", count);
689 689
690 error = dlm_wait_function(ls, &recover_list_empty); 690 error = dlm_wait_function(ls, &recover_list_empty);
691 out: 691 out:
@@ -883,7 +883,7 @@ void dlm_recover_rsbs(struct dlm_ls *ls)
883 up_read(&ls->ls_root_sem); 883 up_read(&ls->ls_root_sem);
884 884
885 if (count) 885 if (count)
886 log_debug(ls, "dlm_recover_rsbs %d done", count); 886 log_rinfo(ls, "dlm_recover_rsbs %d done", count);
887} 887}
888 888
889/* Create a single list of all root rsb's to be used during recovery */ 889/* Create a single list of all root rsb's to be used during recovery */
@@ -950,6 +950,6 @@ void dlm_clear_toss(struct dlm_ls *ls)
950 } 950 }
951 951
952 if (count) 952 if (count)
953 log_debug(ls, "dlm_clear_toss %u done", count); 953 log_rinfo(ls, "dlm_clear_toss %u done", count);
954} 954}
955 955
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 32f9f8926ec3..6859b4bf971e 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -55,7 +55,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
55 unsigned long start; 55 unsigned long start;
56 int error, neg = 0; 56 int error, neg = 0;
57 57
58 log_debug(ls, "dlm_recover %llu", (unsigned long long)rv->seq); 58 log_rinfo(ls, "dlm_recover %llu", (unsigned long long)rv->seq);
59 59
60 mutex_lock(&ls->ls_recoverd_active); 60 mutex_lock(&ls->ls_recoverd_active);
61 61
@@ -76,7 +76,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
76 76
77 error = dlm_recover_members(ls, rv, &neg); 77 error = dlm_recover_members(ls, rv, &neg);
78 if (error) { 78 if (error) {
79 log_debug(ls, "dlm_recover_members error %d", error); 79 log_rinfo(ls, "dlm_recover_members error %d", error);
80 goto fail; 80 goto fail;
81 } 81 }
82 82
@@ -90,7 +90,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
90 90
91 error = dlm_recover_members_wait(ls); 91 error = dlm_recover_members_wait(ls);
92 if (error) { 92 if (error) {
93 log_debug(ls, "dlm_recover_members_wait error %d", error); 93 log_rinfo(ls, "dlm_recover_members_wait error %d", error);
94 goto fail; 94 goto fail;
95 } 95 }
96 96
@@ -103,7 +103,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
103 103
104 error = dlm_recover_directory(ls); 104 error = dlm_recover_directory(ls);
105 if (error) { 105 if (error) {
106 log_debug(ls, "dlm_recover_directory error %d", error); 106 log_rinfo(ls, "dlm_recover_directory error %d", error);
107 goto fail; 107 goto fail;
108 } 108 }
109 109
@@ -111,11 +111,11 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
111 111
112 error = dlm_recover_directory_wait(ls); 112 error = dlm_recover_directory_wait(ls);
113 if (error) { 113 if (error) {
114 log_debug(ls, "dlm_recover_directory_wait error %d", error); 114 log_rinfo(ls, "dlm_recover_directory_wait error %d", error);
115 goto fail; 115 goto fail;
116 } 116 }
117 117
118 log_debug(ls, "dlm_recover_directory %u out %u messages", 118 log_rinfo(ls, "dlm_recover_directory %u out %u messages",
119 ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg); 119 ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg);
120 120
121 /* 121 /*
@@ -144,7 +144,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
144 144
145 error = dlm_recover_masters(ls); 145 error = dlm_recover_masters(ls);
146 if (error) { 146 if (error) {
147 log_debug(ls, "dlm_recover_masters error %d", error); 147 log_rinfo(ls, "dlm_recover_masters error %d", error);
148 goto fail; 148 goto fail;
149 } 149 }
150 150
@@ -154,7 +154,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
154 154
155 error = dlm_recover_locks(ls); 155 error = dlm_recover_locks(ls);
156 if (error) { 156 if (error) {
157 log_debug(ls, "dlm_recover_locks error %d", error); 157 log_rinfo(ls, "dlm_recover_locks error %d", error);
158 goto fail; 158 goto fail;
159 } 159 }
160 160
@@ -162,11 +162,11 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
162 162
163 error = dlm_recover_locks_wait(ls); 163 error = dlm_recover_locks_wait(ls);
164 if (error) { 164 if (error) {
165 log_debug(ls, "dlm_recover_locks_wait error %d", error); 165 log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
166 goto fail; 166 goto fail;
167 } 167 }
168 168
169 log_debug(ls, "dlm_recover_locks %u in", 169 log_rinfo(ls, "dlm_recover_locks %u in",
170 ls->ls_recover_locks_in); 170 ls->ls_recover_locks_in);
171 171
172 /* 172 /*
@@ -186,7 +186,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
186 186
187 error = dlm_recover_locks_wait(ls); 187 error = dlm_recover_locks_wait(ls);
188 if (error) { 188 if (error) {
189 log_debug(ls, "dlm_recover_locks_wait error %d", error); 189 log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
190 goto fail; 190 goto fail;
191 } 191 }
192 } 192 }
@@ -205,7 +205,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
205 205
206 error = dlm_recover_done_wait(ls); 206 error = dlm_recover_done_wait(ls);
207 if (error) { 207 if (error) {
208 log_debug(ls, "dlm_recover_done_wait error %d", error); 208 log_rinfo(ls, "dlm_recover_done_wait error %d", error);
209 goto fail; 209 goto fail;
210 } 210 }
211 211
@@ -217,25 +217,25 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
217 217
218 error = enable_locking(ls, rv->seq); 218 error = enable_locking(ls, rv->seq);
219 if (error) { 219 if (error) {
220 log_debug(ls, "enable_locking error %d", error); 220 log_rinfo(ls, "enable_locking error %d", error);
221 goto fail; 221 goto fail;
222 } 222 }
223 223
224 error = dlm_process_requestqueue(ls); 224 error = dlm_process_requestqueue(ls);
225 if (error) { 225 if (error) {
226 log_debug(ls, "dlm_process_requestqueue error %d", error); 226 log_rinfo(ls, "dlm_process_requestqueue error %d", error);
227 goto fail; 227 goto fail;
228 } 228 }
229 229
230 error = dlm_recover_waiters_post(ls); 230 error = dlm_recover_waiters_post(ls);
231 if (error) { 231 if (error) {
232 log_debug(ls, "dlm_recover_waiters_post error %d", error); 232 log_rinfo(ls, "dlm_recover_waiters_post error %d", error);
233 goto fail; 233 goto fail;
234 } 234 }
235 235
236 dlm_recover_grant(ls); 236 dlm_recover_grant(ls);
237 237
238 log_debug(ls, "dlm_recover %llu generation %u done: %u ms", 238 log_rinfo(ls, "dlm_recover %llu generation %u done: %u ms",
239 (unsigned long long)rv->seq, ls->ls_generation, 239 (unsigned long long)rv->seq, ls->ls_generation,
240 jiffies_to_msecs(jiffies - start)); 240 jiffies_to_msecs(jiffies - start));
241 mutex_unlock(&ls->ls_recoverd_active); 241 mutex_unlock(&ls->ls_recoverd_active);
@@ -245,7 +245,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
245 245
246 fail: 246 fail:
247 dlm_release_root_list(ls); 247 dlm_release_root_list(ls);
248 log_debug(ls, "dlm_recover %llu error %d", 248 log_rinfo(ls, "dlm_recover %llu error %d",
249 (unsigned long long)rv->seq, error); 249 (unsigned long long)rv->seq, error);
250 mutex_unlock(&ls->ls_recoverd_active); 250 mutex_unlock(&ls->ls_recoverd_active);
251 return error; 251 return error;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 9fd702f5bfb2..9280202e488c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -59,10 +59,22 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
59 if (ret) 59 if (ret)
60 return ret; 60 return ret;
61 if (write) { 61 if (write) {
62 if (sysctl_drop_caches & 1) 62 static int stfu;
63
64 if (sysctl_drop_caches & 1) {
63 iterate_supers(drop_pagecache_sb, NULL); 65 iterate_supers(drop_pagecache_sb, NULL);
64 if (sysctl_drop_caches & 2) 66 count_vm_event(DROP_PAGECACHE);
67 }
68 if (sysctl_drop_caches & 2) {
65 drop_slab(); 69 drop_slab();
70 count_vm_event(DROP_SLAB);
71 }
72 if (!stfu) {
73 pr_info("%s (%d): drop_caches: %d\n",
74 current->comm, task_pid_nr(current),
75 sysctl_drop_caches);
76 }
77 stfu |= sysctl_drop_caches & 4;
66 } 78 }
67 return 0; 79 return 0;
68} 80}
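Note (annotation): after this change the sysctl bits read — 1 drops the page cache, 2 drops slab objects, 3 does both, and OR'ing in 4 latches the stfu flag so the pr_info() line is suppressed on later writes (the first write still logs once). Each action now also bumps a vmstat counter (DROP_PAGECACHE / DROP_SLAB).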
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index b167ca48b8ee..d4a9431ec73c 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -641,7 +641,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
641 } 641 }
642 rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, 642 rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
643 lower_new_dir_dentry->d_inode, lower_new_dentry, 643 lower_new_dir_dentry->d_inode, lower_new_dentry,
644 NULL); 644 NULL, 0);
645 if (rc) 645 if (rc)
646 goto out_lock; 646 goto out_lock;
647 if (target_inode) 647 if (target_inode)
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index e879cf8ff0b1..afa1b81c3418 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -132,7 +132,7 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
132 */ 132 */
133static void ecryptfs_evict_inode(struct inode *inode) 133static void ecryptfs_evict_inode(struct inode *inode)
134{ 134{
135 truncate_inode_pages(&inode->i_data, 0); 135 truncate_inode_pages_final(&inode->i_data);
136 clear_inode(inode); 136 clear_inode(inode);
137 iput(ecryptfs_inode_to_lower(inode)); 137 iput(ecryptfs_inode_to_lower(inode));
138} 138}
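Note (annotation): truncate_inode_pages_final() is the new helper for the last truncate in ->evict_inode(); besides emptying the mapping, it marks the address_space as exiting so pages cannot sneak back in while the inode is torn down. The same substitution appears in the exofs and ext2 hunks below.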
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 8dd524f32284..cdb2971192a5 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -21,7 +21,7 @@ static ssize_t efivarfs_file_write(struct file *file,
21 u32 attributes; 21 u32 attributes;
22 struct inode *inode = file->f_mapping->host; 22 struct inode *inode = file->f_mapping->host;
23 unsigned long datasize = count - sizeof(attributes); 23 unsigned long datasize = count - sizeof(attributes);
24 ssize_t bytes = 0; 24 ssize_t bytes;
25 bool set = false; 25 bool set = false;
26 26
27 if (count < sizeof(attributes)) 27 if (count < sizeof(attributes))
@@ -33,14 +33,9 @@ static ssize_t efivarfs_file_write(struct file *file,
33 if (attributes & ~(EFI_VARIABLE_MASK)) 33 if (attributes & ~(EFI_VARIABLE_MASK))
34 return -EINVAL; 34 return -EINVAL;
35 35
36 data = kmalloc(datasize, GFP_KERNEL); 36 data = memdup_user(userbuf + sizeof(attributes), datasize);
37 if (!data) 37 if (IS_ERR(data))
38 return -ENOMEM; 38 return PTR_ERR(data);
39
40 if (copy_from_user(data, userbuf + sizeof(attributes), datasize)) {
41 bytes = -EFAULT;
42 goto out;
43 }
44 39
45 bytes = efivar_entry_set_get_size(var, attributes, &datasize, 40 bytes = efivar_entry_set_get_size(var, attributes, &datasize,
46 data, &set); 41 data, &set);
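Note: memdup_user() collapses the kmalloc()+copy_from_user() pair this hunk removes; on failure it returns an ERR_PTR(), so callers test IS_ERR() rather than NULL. The general shape of the idiom (sketch, hypothetical caller):

static ssize_t consume_user_buf(const void __user *userbuf, size_t len)
{
        void *buf = memdup_user(userbuf, len);

        if (IS_ERR(buf))
                return PTR_ERR(buf);
        /* ... operate on buf ... */
        kfree(buf);
        return len;
}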
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 50215bbd6463..3befcc9f5d63 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -91,7 +91,7 @@ static void init_once(void *foo)
91 inode_init_once(&ei->vfs_inode); 91 inode_init_once(&ei->vfs_inode);
92} 92}
93 93
94static int init_inodecache(void) 94static int __init init_inodecache(void)
95{ 95{
96 efs_inode_cachep = kmem_cache_create("efs_inode_cache", 96 efs_inode_cachep = kmem_cache_create("efs_inode_cache",
97 sizeof(struct efs_inode_info), 97 sizeof(struct efs_inode_info),
@@ -114,6 +114,7 @@ static void destroy_inodecache(void)
114 114
115static int efs_remount(struct super_block *sb, int *flags, char *data) 115static int efs_remount(struct super_block *sb, int *flags, char *data)
116{ 116{
117 sync_filesystem(sb);
117 *flags |= MS_RDONLY; 118 *flags |= MS_RDONLY;
118 return 0; 119 return 0;
119} 120}
diff --git a/fs/exec.c b/fs/exec.c
index 3d78fccdd723..238b7aa26f68 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -26,6 +26,7 @@
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/fdtable.h> 27#include <linux/fdtable.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/vmacache.h>
29#include <linux/stat.h> 30#include <linux/stat.h>
30#include <linux/fcntl.h> 31#include <linux/fcntl.h>
31#include <linux/swap.h> 32#include <linux/swap.h>
@@ -97,6 +98,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
97 module_put(fmt->module); 98 module_put(fmt->module);
98} 99}
99 100
101#ifdef CONFIG_USELIB
100/* 102/*
101 * Note that a shared library must be both readable and executable due to 103 * Note that a shared library must be both readable and executable due to
102 * security reasons. 104 * security reasons.
@@ -156,6 +158,7 @@ exit:
156out: 158out:
157 return error; 159 return error;
158} 160}
161#endif /* #ifdef CONFIG_USELIB */
159 162
160#ifdef CONFIG_MMU 163#ifdef CONFIG_MMU
161/* 164/*
@@ -654,10 +657,10 @@ int setup_arg_pages(struct linux_binprm *bprm,
654 unsigned long rlim_stack; 657 unsigned long rlim_stack;
655 658
656#ifdef CONFIG_STACK_GROWSUP 659#ifdef CONFIG_STACK_GROWSUP
657 /* Limit stack size to 1GB */ 660 /* Limit stack size */
658 stack_base = rlimit_max(RLIMIT_STACK); 661 stack_base = rlimit_max(RLIMIT_STACK);
659 if (stack_base > (1 << 30)) 662 if (stack_base > STACK_SIZE_MAX)
660 stack_base = 1 << 30; 663 stack_base = STACK_SIZE_MAX;
661 664
662 /* Make sure we didn't let the argument array grow too large. */ 665 /* Make sure we didn't let the argument array grow too large. */
663 if (vma->vm_end - vma->vm_start > stack_base) 666 if (vma->vm_end - vma->vm_start > stack_base)
@@ -810,7 +813,7 @@ EXPORT_SYMBOL(kernel_read);
810 813
811ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) 814ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
812{ 815{
813 ssize_t res = file->f_op->read(file, (void __user *)addr, len, &pos); 816 ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
814 if (res > 0) 817 if (res > 0)
815 flush_icache_range(addr, addr + len); 818 flush_icache_range(addr, addr + len);
816 return res; 819 return res;
@@ -820,7 +823,7 @@ EXPORT_SYMBOL(read_code);
820static int exec_mmap(struct mm_struct *mm) 823static int exec_mmap(struct mm_struct *mm)
821{ 824{
822 struct task_struct *tsk; 825 struct task_struct *tsk;
823 struct mm_struct * old_mm, *active_mm; 826 struct mm_struct *old_mm, *active_mm;
824 827
825 /* Notify parent that we're no longer interested in the old VM */ 828 /* Notify parent that we're no longer interested in the old VM */
826 tsk = current; 829 tsk = current;
@@ -846,6 +849,8 @@ static int exec_mmap(struct mm_struct *mm)
846 tsk->mm = mm; 849 tsk->mm = mm;
847 tsk->active_mm = mm; 850 tsk->active_mm = mm;
848 activate_mm(active_mm, mm); 851 activate_mm(active_mm, mm);
852 tsk->mm->vmacache_seqnum = 0;
853 vmacache_flush(tsk);
849 task_unlock(tsk); 854 task_unlock(tsk);
850 if (old_mm) { 855 if (old_mm) {
851 up_read(&old_mm->mmap_sem); 856 up_read(&old_mm->mmap_sem);
@@ -1041,7 +1046,7 @@ EXPORT_SYMBOL_GPL(get_task_comm);
1041 * so that a new one can be started 1046 * so that a new one can be started
1042 */ 1047 */
1043 1048
1044void set_task_comm(struct task_struct *tsk, char *buf) 1049void set_task_comm(struct task_struct *tsk, const char *buf)
1045{ 1050{
1046 task_lock(tsk); 1051 task_lock(tsk);
1047 trace_task_rename(tsk, buf); 1052 trace_task_rename(tsk, buf);
@@ -1050,21 +1055,6 @@ void set_task_comm(struct task_struct *tsk, char *buf)
1050 perf_event_comm(tsk); 1055 perf_event_comm(tsk);
1051} 1056}
1052 1057
1053static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
1054{
1055 int i, ch;
1056
1057 /* Copies the binary name from after last slash */
1058 for (i = 0; (ch = *(fn++)) != '\0';) {
1059 if (ch == '/')
1060 i = 0; /* overwrite what we wrote */
1061 else
1062 if (i < len - 1)
1063 tcomm[i++] = ch;
1064 }
1065 tcomm[i] = '\0';
1066}
1067
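Note: the removed filename_to_taskname() open-coded a bounded basename copy; setup_new_exec() now passes kbasename(bprm->filename) straight through (see the hunk below). kbasename() returns the component after the last '/', or the whole string when there is none — a user-space sketch of the same contract:

#include <string.h>

static const char *kbasename_sketch(const char *path)
{
        const char *tail = strrchr(path, '/');

        return tail ? tail + 1 : path;
}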
1068int flush_old_exec(struct linux_binprm * bprm) 1058int flush_old_exec(struct linux_binprm * bprm)
1069{ 1059{
1070 int retval; 1060 int retval;
@@ -1078,8 +1068,6 @@ int flush_old_exec(struct linux_binprm * bprm)
1078 goto out; 1068 goto out;
1079 1069
1080 set_mm_exe_file(bprm->mm, bprm->file); 1070 set_mm_exe_file(bprm->mm, bprm->file);
1081
1082 filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm));
1083 /* 1071 /*
1084 * Release all of the old mmap stuff 1072 * Release all of the old mmap stuff
1085 */ 1073 */
@@ -1122,7 +1110,7 @@ void setup_new_exec(struct linux_binprm * bprm)
1122 else 1110 else
1123 set_dumpable(current->mm, suid_dumpable); 1111 set_dumpable(current->mm, suid_dumpable);
1124 1112
1125 set_task_comm(current, bprm->tcomm); 1113 set_task_comm(current, kbasename(bprm->filename));
1126 1114
1127 /* Set the new mm task size. We have to do that late because it may 1115 /* Set the new mm task size. We have to do that late because it may
1128 * depend on TIF_32BIT which is only updated in flush_thread() on 1116 * depend on TIF_32BIT which is only updated in flush_thread() on
@@ -1619,9 +1607,9 @@ SYSCALL_DEFINE3(execve,
1619 return do_execve(getname(filename), argv, envp); 1607 return do_execve(getname(filename), argv, envp);
1620} 1608}
1621#ifdef CONFIG_COMPAT 1609#ifdef CONFIG_COMPAT
1622asmlinkage long compat_sys_execve(const char __user * filename, 1610COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
1623 const compat_uptr_t __user * argv, 1611 const compat_uptr_t __user *, argv,
1624 const compat_uptr_t __user * envp) 1612 const compat_uptr_t __user *, envp)
1625{ 1613{
1626 return compat_do_execve(getname(filename), argv, envp); 1614 return compat_do_execve(getname(filename), argv, envp);
1627} 1615}
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index ee4317faccb1..d1c244d67667 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1486,7 +1486,7 @@ void exofs_evict_inode(struct inode *inode)
1486 struct ore_io_state *ios; 1486 struct ore_io_state *ios;
1487 int ret; 1487 int ret;
1488 1488
1489 truncate_inode_pages(&inode->i_data, 0); 1489 truncate_inode_pages_final(&inode->i_data);
1490 1490
1491 /* TODO: should do better here */ 1491 /* TODO: should do better here */
1492 if (inode->i_nlink || is_bad_inode(inode)) 1492 if (inode->i_nlink || is_bad_inode(inode))
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 7682b970d0f1..4e2c032ab8a1 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -21,12 +21,12 @@
21#undef ORE_DBGMSG2 21#undef ORE_DBGMSG2
22#define ORE_DBGMSG2 ORE_DBGMSG 22#define ORE_DBGMSG2 ORE_DBGMSG
23 23
24struct page *_raid_page_alloc(void) 24static struct page *_raid_page_alloc(void)
25{ 25{
26 return alloc_page(GFP_KERNEL); 26 return alloc_page(GFP_KERNEL);
27} 27}
28 28
29void _raid_page_free(struct page *p) 29static void _raid_page_free(struct page *p)
30{ 30{
31 __free_page(p); 31 __free_page(p);
32} 32}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 9d9763328734..ed73ed8ebbee 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -543,7 +543,7 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
543 return !(odi->systemid_len || odi->osdname_len); 543 return !(odi->systemid_len || odi->osdname_len);
544} 544}
545 545
546int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, 546static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
547 struct exofs_dev **peds) 547 struct exofs_dev **peds)
548{ 548{
549 struct __alloc_ore_devs_and_exofs_devs { 549 struct __alloc_ore_devs_and_exofs_devs {
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 1b8001bbe947..27695e6f4e46 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -4,7 +4,6 @@
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> 4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 */ 5 */
6 6
7#include <linux/capability.h>
8#include <linux/init.h> 7#include <linux/init.h>
9#include <linux/sched.h> 8#include <linux/sched.h>
10#include <linux/slab.h> 9#include <linux/slab.h>
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 7cadd823bb31..7d66fb0e4cca 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -284,7 +284,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
284 int best_ndir = inodes_per_group; 284 int best_ndir = inodes_per_group;
285 int best_group = -1; 285 int best_group = -1;
286 286
287 get_random_bytes(&group, sizeof(group)); 287 group = prandom_u32();
288 parent_group = (unsigned)group % ngroups; 288 parent_group = (unsigned)group % ngroups;
289 for (i = 0; i < ngroups; i++) { 289 for (i = 0; i < ngroups; i++) {
290 group = (parent_group + i) % ngroups; 290 group = (parent_group + i) % ngroups;
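Note (annotation): picking the starting block group for Orlov allocation only needs load spreading, not cryptographic strength, so get_random_bytes() (which draws on the CSPRNG) is replaced by the fast prandom_u32(). The identical change lands in ext3's find_group_orlov() below.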
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 94ed36849b71..b1d2a4675d42 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -78,7 +78,7 @@ void ext2_evict_inode(struct inode * inode)
78 dquot_drop(inode); 78 dquot_drop(inode);
79 } 79 }
80 80
81 truncate_inode_pages(&inode->i_data, 0); 81 truncate_inode_pages_final(&inode->i_data);
82 82
83 if (want_delete) { 83 if (want_delete) {
84 sb_start_intwrite(inode->i_sb); 84 sb_start_intwrite(inode->i_sb);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 20d6697bd638..3750031cfa2f 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -192,7 +192,7 @@ static void init_once(void *foo)
192 inode_init_once(&ei->vfs_inode); 192 inode_init_once(&ei->vfs_inode);
193} 193}
194 194
195static int init_inodecache(void) 195static int __init init_inodecache(void)
196{ 196{
197 ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", 197 ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
198 sizeof(struct ext2_inode_info), 198 sizeof(struct ext2_inode_info),
@@ -1254,6 +1254,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1254 unsigned long old_sb_flags; 1254 unsigned long old_sb_flags;
1255 int err; 1255 int err;
1256 1256
1257 sync_filesystem(sb);
1257 spin_lock(&sbi->s_lock); 1258 spin_lock(&sbi->s_lock);
1258 1259
1259 /* Store the old options */ 1260 /* Store the old options */
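The sync_filesystem(sb) call added above means every ext2 remount now begins from a fully written-back filesystem before any options change. A minimal sketch of what drives that path from userspace, assuming /mnt is an ext2 mount the caller owns:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* Remount read-only; the kernel now syncs the filesystem first. */
        if (mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_RDONLY, NULL) != 0) {
                perror("mount");
                return 1;
        }
        return 0;
}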
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index cfedb2cb0d8c..c0ebc4db8849 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -42,8 +42,8 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name,
42 value, size, flags); 42 value, size, flags);
43} 43}
44 44
45int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array, 45static int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
46 void *fs_info) 46 void *fs_info)
47{ 47{
48 const struct xattr *xattr; 48 const struct xattr *xattr;
49 int err = 0; 49 int err = 0;
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 22548f56197b..158b5d4ce067 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1727,10 +1727,7 @@ allocated:
1727 percpu_counter_sub(&sbi->s_freeblocks_counter, num); 1727 percpu_counter_sub(&sbi->s_freeblocks_counter, num);
1728 1728
1729 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); 1729 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
1730 err = ext3_journal_dirty_metadata(handle, gdp_bh); 1730 fatal = ext3_journal_dirty_metadata(handle, gdp_bh);
1731 if (!fatal)
1732 fatal = err;
1733
1734 if (fatal) 1731 if (fatal)
1735 goto out; 1732 goto out;
1736 1733
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index e66e4808719f..17742eed2c16 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -275,7 +275,7 @@ static inline loff_t ext3_get_htree_eof(struct file *filp)
275 * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) 275 * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
276 * will be invalid once the directory was converted into a dx directory 276 * will be invalid once the directory was converted into a dx directory
277 */ 277 */
278loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence) 278static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
279{ 279{
280 struct inode *inode = file->f_mapping->host; 280 struct inode *inode = file->f_mapping->host;
281 int dx_dir = is_dx_dir(inode); 281 int dx_dir = is_dx_dir(inode);
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 082afd78b107..a1b810230cc5 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -215,7 +215,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
215 int best_ndir = inodes_per_group; 215 int best_ndir = inodes_per_group;
216 int best_group = -1; 216 int best_group = -1;
217 217
218 get_random_bytes(&group, sizeof(group)); 218 group = prandom_u32();
219 parent_group = (unsigned)group % ngroups; 219 parent_group = (unsigned)group % ngroups;
220 for (i = 0; i < ngroups; i++) { 220 for (i = 0; i < ngroups; i++) {
221 group = (parent_group + i) % ngroups; 221 group = (parent_group + i) % ngroups;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 384b6ebb655f..f5157d0d1b43 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -228,7 +228,7 @@ void ext3_evict_inode (struct inode *inode)
228 log_wait_commit(journal, commit_tid); 228 log_wait_commit(journal, commit_tid);
229 filemap_write_and_wait(&inode->i_data); 229 filemap_write_and_wait(&inode->i_data);
230 } 230 }
231 truncate_inode_pages(&inode->i_data, 0); 231 truncate_inode_pages_final(&inode->i_data);
232 232
233 ext3_discard_reservation(inode); 233 ext3_discard_reservation(inode);
234 rsv = ei->i_block_alloc_info; 234 rsv = ei->i_block_alloc_info;
@@ -1559,56 +1559,17 @@ static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
1559} 1559}
1560 1560
1561/* 1561/*
1562 * Note that we always start a transaction even if we're not journalling 1562 * Note that whenever we need to map blocks we start a transaction even if
1563 * data. This is to preserve ordering: any hole instantiation within 1563 * we're not journalling data. This is to preserve ordering: any hole
1564 * __block_write_full_page -> ext3_get_block() should be journalled 1564 * instantiation within __block_write_full_page -> ext3_get_block() should be
1565 * along with the data so we don't crash and then get metadata which 1565 * journalled along with the data so we don't crash and then get metadata which
1566 * refers to old data. 1566 * refers to old data.
1567 * 1567 *
1568 * In all journalling modes block_write_full_page() will start the I/O. 1568 * In all journalling modes block_write_full_page() will start the I/O.
1569 * 1569 *
1570 * Problem:
1571 *
1572 * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1573 * ext3_writepage()
1574 *
1575 * Similar for:
1576 *
1577 * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1578 *
1579 * Same applies to ext3_get_block(). We will deadlock on various things like
1580 * lock_journal and i_truncate_mutex.
1581 *
1582 * Setting PF_MEMALLOC here doesn't work - too many internal memory
1583 * allocations fail.
1584 *
1585 * 16May01: If we're reentered then journal_current_handle() will be
1586 * non-zero. We simply *return*.
1587 *
1588 * 1 July 2001: @@@ FIXME:
1589 * In journalled data mode, a data buffer may be metadata against the
1590 * current transaction. But the same file is part of a shared mapping
1591 * and someone does a writepage() on it.
1592 *
1593 * We will move the buffer onto the async_data list, but *after* it has
1594 * been dirtied. So there's a small window where we have dirty data on
1595 * BJ_Metadata.
1596 *
1597 * Note that this only applies to the last partial page in the file. The
1598 * bit which block_write_full_page() uses prepare/commit for. (That's
1599 * broken code anyway: it's wrong for msync()).
1600 *
1601 * It's a rare case: affects the final partial page, for journalled data
1602 * where the file is subject to bith write() and writepage() in the same
1603 * transction. To fix it we'll need a custom block_write_full_page().
1604 * We'll probably need that anyway for journalling writepage() output.
1605 *
1606 * We don't honour synchronous mounts for writepage(). That would be 1570 * We don't honour synchronous mounts for writepage(). That would be
1607 * disastrous. Any write() or metadata operation will sync the fs for 1571 * disastrous. Any write() or metadata operation will sync the fs for
1608 * us. 1572 * us.
1609 *
1610 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1611 * we don't need to open a transaction here.
1612 */ 1573 */
1613static int ext3_ordered_writepage(struct page *page, 1574static int ext3_ordered_writepage(struct page *page,
1614 struct writeback_control *wbc) 1575 struct writeback_control *wbc)
@@ -1673,12 +1634,9 @@ static int ext3_ordered_writepage(struct page *page,
1673 * block_write_full_page() succeeded. Otherwise they are unmapped, 1634 * block_write_full_page() succeeded. Otherwise they are unmapped,
1674 * and generally junk. 1635 * and generally junk.
1675 */ 1636 */
1676 if (ret == 0) { 1637 if (ret == 0)
1677 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, 1638 ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1678 NULL, journal_dirty_data_fn); 1639 NULL, journal_dirty_data_fn);
1679 if (!ret)
1680 ret = err;
1681 }
1682 walk_page_buffers(handle, page_bufs, 0, 1640 walk_page_buffers(handle, page_bufs, 0,
1683 PAGE_CACHE_SIZE, NULL, bput_one); 1641 PAGE_CACHE_SIZE, NULL, bput_one);
1684 err = ext3_journal_stop(handle); 1642 err = ext3_journal_stop(handle);
@@ -1925,6 +1883,8 @@ retry:
1925 * and pretend the write failed... */ 1883 * and pretend the write failed... */
1926 ext3_truncate_failed_direct_write(inode); 1884 ext3_truncate_failed_direct_write(inode);
1927 ret = PTR_ERR(handle); 1885 ret = PTR_ERR(handle);
1886 if (inode->i_nlink)
1887 ext3_orphan_del(NULL, inode);
1928 goto out; 1888 goto out;
1929 } 1889 }
1930 if (inode->i_nlink) 1890 if (inode->i_nlink)
@@ -3212,21 +3172,20 @@ out_brelse:
3212 * 3172 *
3213 * We are called from a few places: 3173 * We are called from a few places:
3214 * 3174 *
3215 * - Within generic_file_write() for O_SYNC files. 3175 * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
3216 * Here, there will be no transaction running. We wait for any running 3176 * Here, there will be no transaction running. We wait for any running
3217 * transaction to commit. 3177 * transaction to commit.
3218 * 3178 *
3219 * - Within sys_sync(), kupdate and such. 3179 * - Within flush work (for sys_sync(), kupdate and such).
3220 * We wait on commit, if tol to. 3180 * We wait on commit, if told to.
3221 * 3181 *
3222 * - Within prune_icache() (PF_MEMALLOC == true) 3182 * - Within iput_final() -> write_inode_now()
3223 * Here we simply return. We can't afford to block kswapd on the 3183 * We wait on commit, if told to.
3224 * journal commit.
3225 * 3184 *
3226 * In all cases it is actually safe for us to return without doing anything, 3185 * In all cases it is actually safe for us to return without doing anything,
3227 * because the inode has been copied into a raw inode buffer in 3186 * because the inode has been copied into a raw inode buffer in
3228 * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for 3187 * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
3229 * knfsd. 3188 * writeback.
3230 * 3189 *
3231 * Note that we are absolutely dependent upon all inode dirtiers doing the 3190 * Note that we are absolutely dependent upon all inode dirtiers doing the
3232 * right thing: they *must* call mark_inode_dirty() after dirtying info in 3191 * right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -3238,13 +3197,13 @@ out_brelse:
3238 * stuff(); 3197 * stuff();
3239 * inode->i_size = expr; 3198 * inode->i_size = expr;
3240 * 3199 *
3241 * is in error because a kswapd-driven write_inode() could occur while 3200 * is in error because write_inode() could occur while `stuff()' is running,
3242 * `stuff()' is running, and the new i_size will be lost. Plus the inode 3201 * and the new i_size will be lost. Plus the inode will no longer be on the
3243 * will no longer be on the superblock's dirty inode list. 3202 * superblock's dirty inode list.
3244 */ 3203 */
3245int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) 3204int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
3246{ 3205{
3247 if (current->flags & PF_MEMALLOC) 3206 if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
3248 return 0; 3207 return 0;
3249 3208
3250 if (ext3_journal_current_handle()) { 3209 if (ext3_journal_current_handle()) {
@@ -3253,7 +3212,12 @@ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
3253 return -EIO; 3212 return -EIO;
3254 } 3213 }
3255 3214
3256 if (wbc->sync_mode != WB_SYNC_ALL) 3215 /*
3216 * No need to force transaction in WB_SYNC_NONE mode. Also
3217 * ext3_sync_fs() will force the commit after everything is
3218 * written.
3219 */
3220 if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
3257 return 0; 3221 return 0;
3258 3222
3259 return ext3_force_commit(inode->i_sb); 3223 return ext3_force_commit(inode->i_sb);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 37fd31ed16e7..08cdfe5461e3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -527,7 +527,7 @@ static void init_once(void *foo)
527 inode_init_once(&ei->vfs_inode); 527 inode_init_once(&ei->vfs_inode);
528} 528}
529 529
530static int init_inodecache(void) 530static int __init init_inodecache(void)
531{ 531{
532 ext3_inode_cachep = kmem_cache_create("ext3_inode_cache", 532 ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
533 sizeof(struct ext3_inode_info), 533 sizeof(struct ext3_inode_info),
@@ -2649,6 +2649,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2649 int i; 2649 int i;
2650#endif 2650#endif
2651 2651
2652 sync_filesystem(sb);
2653
2652 /* Store the original options */ 2654 /* Store the original options */
2653 old_sb_flags = sb->s_flags; 2655 old_sb_flags = sb->s_flags;
2654 old_opts.s_mount_opt = sbi->s_mount_opt; 2656 old_opts.s_mount_opt = sbi->s_mount_opt;
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 3387664ad70e..722c2bf9645d 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -43,8 +43,9 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name,
43 name, value, size, flags); 43 name, value, size, flags);
44} 44}
45 45
46int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array, 46static int ext3_initxattrs(struct inode *inode,
47 void *fs_info) 47 const struct xattr *xattr_array,
48 void *fs_info)
48{ 49{
49 const struct xattr *xattr; 50 const struct xattr *xattr;
50 handle_t *handle = fs_info; 51 handle_t *handle = fs_info;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 6ea7b1436bbc..5c56785007e0 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -667,7 +667,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
667 continue; 667 continue;
668 668
669 x = ext4_count_free(bitmap_bh->b_data, 669 x = ext4_count_free(bitmap_bh->b_data,
670 EXT4_BLOCKS_PER_GROUP(sb) / 8); 670 EXT4_CLUSTERS_PER_GROUP(sb) / 8);
671 printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", 671 printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
672 i, ext4_free_group_clusters(sb, gdp), x); 672 i, ext4_free_group_clusters(sb, gdp), x);
673 bitmap_count += x; 673 bitmap_count += x;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d3a534fdc5ff..66946aa62127 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -31,6 +31,7 @@
31#include <linux/percpu_counter.h> 31#include <linux/percpu_counter.h>
32#include <linux/ratelimit.h> 32#include <linux/ratelimit.h>
33#include <crypto/hash.h> 33#include <crypto/hash.h>
34#include <linux/falloc.h>
34#ifdef __KERNEL__ 35#ifdef __KERNEL__
35#include <linux/compat.h> 36#include <linux/compat.h>
36#endif 37#endif
@@ -567,6 +568,8 @@ enum {
567#define EXT4_GET_BLOCKS_NO_LOCK 0x0100 568#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
568 /* Do not put hole in extent cache */ 569 /* Do not put hole in extent cache */
569#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 570#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
571 /* Convert written extents to unwritten */
572#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400
570 573
571/* 574/*
572 * The bit position of these flags must not overlap with any of the 575 * The bit position of these flags must not overlap with any of the
@@ -998,6 +1001,8 @@ struct ext4_inode_info {
998#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group 1001#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group
999 size of blocksize * 8 1002 size of blocksize * 8
1000 blocks */ 1003 blocks */
1004#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
1005 file systems */
1001 1006
1002#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ 1007#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
1003 ~EXT4_MOUNT_##opt 1008 ~EXT4_MOUNT_##opt
@@ -1326,6 +1331,7 @@ struct ext4_sb_info {
1326 struct list_head s_es_lru; 1331 struct list_head s_es_lru;
1327 unsigned long s_es_last_sorted; 1332 unsigned long s_es_last_sorted;
1328 struct percpu_counter s_extent_cache_cnt; 1333 struct percpu_counter s_extent_cache_cnt;
1334 struct mb_cache *s_mb_cache;
1329 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; 1335 spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
1330 1336
1331 /* Ratelimit ext4 messages. */ 1337 /* Ratelimit ext4 messages. */
@@ -2133,8 +2139,6 @@ extern int ext4_writepage_trans_blocks(struct inode *);
2133extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); 2139extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
2134extern int ext4_block_truncate_page(handle_t *handle, 2140extern int ext4_block_truncate_page(handle_t *handle,
2135 struct address_space *mapping, loff_t from); 2141 struct address_space *mapping, loff_t from);
2136extern int ext4_block_zero_page_range(handle_t *handle,
2137 struct address_space *mapping, loff_t from, loff_t length);
2138extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 2142extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
2139 loff_t lstart, loff_t lend); 2143 loff_t lstart, loff_t lend);
2140extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2144extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
@@ -2462,23 +2466,6 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
2462 up_write(&EXT4_I(inode)->i_data_sem); 2466 up_write(&EXT4_I(inode)->i_data_sem);
2463} 2467}
2464 2468
2465/*
2466 * Update i_disksize after writeback has been started. Races with truncate
2467 * are avoided by checking i_size under i_data_sem.
2468 */
2469static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize)
2470{
2471 loff_t i_size;
2472
2473 down_write(&EXT4_I(inode)->i_data_sem);
2474 i_size = i_size_read(inode);
2475 if (newsize > i_size)
2476 newsize = i_size;
2477 if (newsize > EXT4_I(inode)->i_disksize)
2478 EXT4_I(inode)->i_disksize = newsize;
2479 up_write(&EXT4_I(inode)->i_data_sem);
2480}
2481
2482struct ext4_group_info { 2469struct ext4_group_info {
2483 unsigned long bb_state; 2470 unsigned long bb_state;
2484 struct rb_root bb_free_root; 2471 struct rb_root bb_free_root;
@@ -2757,6 +2744,7 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
2757extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2744extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2758 __u64 start, __u64 len); 2745 __u64 start, __u64 len);
2759extern int ext4_ext_precache(struct inode *inode); 2746extern int ext4_ext_precache(struct inode *inode);
2747extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
2760 2748
2761/* move_extent.c */ 2749/* move_extent.c */
2762extern void ext4_double_down_write_data_sem(struct inode *first, 2750extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -2766,6 +2754,8 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
2766extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, 2754extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
2767 __u64 start_orig, __u64 start_donor, 2755 __u64 start_orig, __u64 start_donor,
2768 __u64 len, __u64 *moved_len); 2756 __u64 len, __u64 *moved_len);
2757extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
2758 struct ext4_extent **extent);
2769 2759
2770/* page-io.c */ 2760/* page-io.c */
2771extern int __init ext4_init_pageio(void); 2761extern int __init ext4_init_pageio(void);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 3fe29de832c8..c3fb607413ed 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -259,6 +259,16 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
259 if (WARN_ON_ONCE(err)) { 259 if (WARN_ON_ONCE(err)) {
260 ext4_journal_abort_handle(where, line, __func__, bh, 260 ext4_journal_abort_handle(where, line, __func__, bh,
261 handle, err); 261 handle, err);
262 if (inode == NULL) {
263 pr_err("EXT4: jbd2_journal_dirty_metadata "
264 "failed: handle type %u started at "
265 "line %u, credits %u/%u, errcode %d",
266 handle->h_type,
267 handle->h_line_no,
268 handle->h_requested_credits,
269 handle->h_buffer_credits, err);
270 return err;
271 }
262 ext4_error_inode(inode, where, line, 272 ext4_error_inode(inode, where, line,
263 bh->b_blocknr, 273 bh->b_blocknr,
264 "journal_dirty_metadata failed: " 274 "journal_dirty_metadata failed: "
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74bc2d549c58..01b0c208f625 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -37,7 +37,6 @@
37#include <linux/quotaops.h> 37#include <linux/quotaops.h>
38#include <linux/string.h> 38#include <linux/string.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/falloc.h>
41#include <asm/uaccess.h> 40#include <asm/uaccess.h>
42#include <linux/fiemap.h> 41#include <linux/fiemap.h>
43#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
@@ -1691,7 +1690,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1691 * the extent that was written properly split out and conversion to 1690 * the extent that was written properly split out and conversion to
1692 * initialized is trivial. 1691 * initialized is trivial.
1693 */ 1692 */
1694 if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) 1693 if (ext4_ext_is_uninitialized(ex1) != ext4_ext_is_uninitialized(ex2))
1695 return 0; 1694 return 0;
1696 1695
1697 ext1_ee_len = ext4_ext_get_actual_len(ex1); 1696 ext1_ee_len = ext4_ext_get_actual_len(ex1);
@@ -1708,6 +1707,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1708 */ 1707 */
1709 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) 1708 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1710 return 0; 1709 return 0;
1710 if (ext4_ext_is_uninitialized(ex1) &&
1711 (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
1712 atomic_read(&EXT4_I(inode)->i_unwritten) ||
1713 (ext1_ee_len + ext2_ee_len > EXT_UNINIT_MAX_LEN)))
1714 return 0;
1711#ifdef AGGRESSIVE_TEST 1715#ifdef AGGRESSIVE_TEST
1712 if (ext1_ee_len >= 4) 1716 if (ext1_ee_len >= 4)
1713 return 0; 1717 return 0;
@@ -1731,7 +1735,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
1731{ 1735{
1732 struct ext4_extent_header *eh; 1736 struct ext4_extent_header *eh;
1733 unsigned int depth, len; 1737 unsigned int depth, len;
1734 int merge_done = 0; 1738 int merge_done = 0, uninit;
1735 1739
1736 depth = ext_depth(inode); 1740 depth = ext_depth(inode);
1737 BUG_ON(path[depth].p_hdr == NULL); 1741 BUG_ON(path[depth].p_hdr == NULL);
@@ -1741,8 +1745,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
1741 if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) 1745 if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1742 break; 1746 break;
1743 /* merge with next extent! */ 1747 /* merge with next extent! */
1748 uninit = ext4_ext_is_uninitialized(ex);
1744 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1749 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1745 + ext4_ext_get_actual_len(ex + 1)); 1750 + ext4_ext_get_actual_len(ex + 1));
1751 if (uninit)
1752 ext4_ext_mark_uninitialized(ex);
1746 1753
1747 if (ex + 1 < EXT_LAST_EXTENT(eh)) { 1754 if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1748 len = (EXT_LAST_EXTENT(eh) - ex - 1) 1755 len = (EXT_LAST_EXTENT(eh) - ex - 1)
@@ -1896,7 +1903,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1896 struct ext4_ext_path *npath = NULL; 1903 struct ext4_ext_path *npath = NULL;
1897 int depth, len, err; 1904 int depth, len, err;
1898 ext4_lblk_t next; 1905 ext4_lblk_t next;
1899 int mb_flags = 0; 1906 int mb_flags = 0, uninit;
1900 1907
1901 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { 1908 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1902 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); 1909 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1946,9 +1953,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1946 path + depth); 1953 path + depth);
1947 if (err) 1954 if (err)
1948 return err; 1955 return err;
1949 1956 uninit = ext4_ext_is_uninitialized(ex);
1950 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1957 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1951 + ext4_ext_get_actual_len(newext)); 1958 + ext4_ext_get_actual_len(newext));
1959 if (uninit)
1960 ext4_ext_mark_uninitialized(ex);
1952 eh = path[depth].p_hdr; 1961 eh = path[depth].p_hdr;
1953 nearex = ex; 1962 nearex = ex;
1954 goto merge; 1963 goto merge;
@@ -1971,10 +1980,13 @@ prepend:
1971 if (err) 1980 if (err)
1972 return err; 1981 return err;
1973 1982
1983 uninit = ext4_ext_is_uninitialized(ex);
1974 ex->ee_block = newext->ee_block; 1984 ex->ee_block = newext->ee_block;
1975 ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); 1985 ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
1976 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) 1986 ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1977 + ext4_ext_get_actual_len(newext)); 1987 + ext4_ext_get_actual_len(newext));
1988 if (uninit)
1989 ext4_ext_mark_uninitialized(ex);
1978 eh = path[depth].p_hdr; 1990 eh = path[depth].p_hdr;
1979 nearex = ex; 1991 nearex = ex;
1980 goto merge; 1992 goto merge;
@@ -2585,6 +2597,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2585 ex_ee_block = le32_to_cpu(ex->ee_block); 2597 ex_ee_block = le32_to_cpu(ex->ee_block);
2586 ex_ee_len = ext4_ext_get_actual_len(ex); 2598 ex_ee_len = ext4_ext_get_actual_len(ex);
2587 2599
2600 /*
2601 * If we're starting with an extent other than the last one in the
2602 * node, we need to see if it shares a cluster with the extent to
2603 * the right (towards the end of the file). If its leftmost cluster
2604 * is this extent's rightmost cluster and it is not cluster aligned,
2605 * we'll mark it as a partial that is not to be deallocated.
2606 */
2607
2608 if (ex != EXT_LAST_EXTENT(eh)) {
2609 ext4_fsblk_t current_pblk, right_pblk;
2610 long long current_cluster, right_cluster;
2611
2612 current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2613 current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
2614 right_pblk = ext4_ext_pblock(ex + 1);
2615 right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
2616 if (current_cluster == right_cluster &&
2617 EXT4_PBLK_COFF(sbi, right_pblk))
2618 *partial_cluster = -right_cluster;
2619 }
2620
2588 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); 2621 trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
2589 2622
2590 while (ex >= EXT_FIRST_EXTENT(eh) && 2623 while (ex >= EXT_FIRST_EXTENT(eh) &&
@@ -2710,10 +2743,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2710 err = ext4_ext_correct_indexes(handle, inode, path); 2743 err = ext4_ext_correct_indexes(handle, inode, path);
2711 2744
2712 /* 2745 /*
2713 * Free the partial cluster only if the current extent does not 2746 * If there's a partial cluster and at least one extent remains in
2714 * reference it. Otherwise we might free used cluster. 2747 * the leaf, free the partial cluster if it isn't shared with the
2748 * current extent. If there's a partial cluster and no extents
2749 * remain in the leaf, it can't be freed here. It can only be
2750 * freed when it's possible to determine if it's not shared with
2751 * any other extent - when the next leaf is processed or when space
2752 * removal is complete.
2715 */ 2753 */
2716 if (*partial_cluster > 0 && 2754 if (*partial_cluster > 0 && eh->eh_entries &&
2717 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != 2755 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2718 *partial_cluster)) { 2756 *partial_cluster)) {
2719 int flags = get_default_free_blocks_flags(inode); 2757 int flags = get_default_free_blocks_flags(inode);
@@ -3275,6 +3313,11 @@ static int ext4_split_extent(handle_t *handle,
3275 return PTR_ERR(path); 3313 return PTR_ERR(path);
3276 depth = ext_depth(inode); 3314 depth = ext_depth(inode);
3277 ex = path[depth].p_ext; 3315 ex = path[depth].p_ext;
3316 if (!ex) {
3317 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3318 (unsigned long) map->m_lblk);
3319 return -EIO;
3320 }
3278 uninitialized = ext4_ext_is_uninitialized(ex); 3321 uninitialized = ext4_ext_is_uninitialized(ex);
3279 split_flag1 = 0; 3322 split_flag1 = 0;
3280 3323
@@ -3569,6 +3612,8 @@ out:
3569 * b> Splits in two extents: Write is happening at either end of the extent 3612 * b> Splits in two extents: Write is happening at either end of the extent
3570 * c> Splits in three extents: Someone is writing in the middle of the extent 3613 * c> Splits in three extents: Someone is writing in the middle of the extent
3571 * 3614 *
3615 * This works the same way in the case of initialized -> unwritten conversion.
3616 *
3572 * One or more index blocks may be needed if the extent tree grows after 3617 * One or more index blocks may be needed if the extent tree grows after
3573 * the uninitialized extent split. To prevent ENOSPC occurring at IO 3618 * the uninitialized extent split. To prevent ENOSPC occurring at IO
3574 * completion, we need to split the uninitialized extent before DIO submits 3619 * completion, we need to split the uninitialized extent before DIO submits
@@ -3579,7 +3624,7 @@ out:
3579 * 3624 *
3580 * Returns the size of uninitialized extent to be written on success. 3625 * Returns the size of uninitialized extent to be written on success.
3581 */ 3626 */
3582static int ext4_split_unwritten_extents(handle_t *handle, 3627static int ext4_split_convert_extents(handle_t *handle,
3583 struct inode *inode, 3628 struct inode *inode,
3584 struct ext4_map_blocks *map, 3629 struct ext4_map_blocks *map,
3585 struct ext4_ext_path *path, 3630 struct ext4_ext_path *path,
@@ -3591,9 +3636,9 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3591 unsigned int ee_len; 3636 unsigned int ee_len;
3592 int split_flag = 0, depth; 3637 int split_flag = 0, depth;
3593 3638
3594 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 3639 ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
3595 "block %llu, max_blocks %u\n", inode->i_ino, 3640 __func__, inode->i_ino,
3596 (unsigned long long)map->m_lblk, map->m_len); 3641 (unsigned long long)map->m_lblk, map->m_len);
3597 3642
3598 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> 3643 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
3599 inode->i_sb->s_blocksize_bits; 3644 inode->i_sb->s_blocksize_bits;
@@ -3608,14 +3653,79 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3608 ee_block = le32_to_cpu(ex->ee_block); 3653 ee_block = le32_to_cpu(ex->ee_block);
3609 ee_len = ext4_ext_get_actual_len(ex); 3654 ee_len = ext4_ext_get_actual_len(ex);
3610 3655
3611 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0; 3656 /* Convert to unwritten */
3612 split_flag |= EXT4_EXT_MARK_UNINIT2; 3657 if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3613 if (flags & EXT4_GET_BLOCKS_CONVERT) 3658 split_flag |= EXT4_EXT_DATA_VALID1;
3614 split_flag |= EXT4_EXT_DATA_VALID2; 3659 /* Convert to initialized */
3660 } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3661 split_flag |= ee_block + ee_len <= eof_block ?
3662 EXT4_EXT_MAY_ZEROOUT : 0;
3663 split_flag |= (EXT4_EXT_MARK_UNINIT2 | EXT4_EXT_DATA_VALID2);
3664 }
3615 flags |= EXT4_GET_BLOCKS_PRE_IO; 3665 flags |= EXT4_GET_BLOCKS_PRE_IO;
3616 return ext4_split_extent(handle, inode, path, map, split_flag, flags); 3666 return ext4_split_extent(handle, inode, path, map, split_flag, flags);
3617} 3667}
3618 3668
3669static int ext4_convert_initialized_extents(handle_t *handle,
3670 struct inode *inode,
3671 struct ext4_map_blocks *map,
3672 struct ext4_ext_path *path)
3673{
3674 struct ext4_extent *ex;
3675 ext4_lblk_t ee_block;
3676 unsigned int ee_len;
3677 int depth;
3678 int err = 0;
3679
3680 depth = ext_depth(inode);
3681 ex = path[depth].p_ext;
3682 ee_block = le32_to_cpu(ex->ee_block);
3683 ee_len = ext4_ext_get_actual_len(ex);
3684
3685 ext_debug("%s: inode %lu, logical "
3686 "block %llu, max_blocks %u\n", __func__, inode->i_ino,
3687 (unsigned long long)ee_block, ee_len);
3688
3689 if (ee_block != map->m_lblk || ee_len > map->m_len) {
3690 err = ext4_split_convert_extents(handle, inode, map, path,
3691 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3692 if (err < 0)
3693 goto out;
3694 ext4_ext_drop_refs(path);
3695 path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
3696 if (IS_ERR(path)) {
3697 err = PTR_ERR(path);
3698 goto out;
3699 }
3700 depth = ext_depth(inode);
3701 ex = path[depth].p_ext;
3702 if (!ex) {
3703 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3704 (unsigned long) map->m_lblk);
3705 err = -EIO;
3706 goto out;
3707 }
3708 }
3709
3710 err = ext4_ext_get_access(handle, inode, path + depth);
3711 if (err)
3712 goto out;
3713 /* first mark the extent as uninitialized */
3714 ext4_ext_mark_uninitialized(ex);
3715
3716 /* note: ext4_ext_correct_indexes() isn't needed here because
3717 * borders are not changed
3718 */
3719 ext4_ext_try_to_merge(handle, inode, path, ex);
3720
3721 /* Mark modified extent as dirty */
3722 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3723out:
3724 ext4_ext_show_leaf(inode, path);
3725 return err;
3726}
3727
3728
3619static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3729static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3620 struct inode *inode, 3730 struct inode *inode,
3621 struct ext4_map_blocks *map, 3731 struct ext4_map_blocks *map,
@@ -3649,8 +3759,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3649 inode->i_ino, (unsigned long long)ee_block, ee_len, 3759 inode->i_ino, (unsigned long long)ee_block, ee_len,
3650 (unsigned long long)map->m_lblk, map->m_len); 3760 (unsigned long long)map->m_lblk, map->m_len);
3651#endif 3761#endif
3652 err = ext4_split_unwritten_extents(handle, inode, map, path, 3762 err = ext4_split_convert_extents(handle, inode, map, path,
3653 EXT4_GET_BLOCKS_CONVERT); 3763 EXT4_GET_BLOCKS_CONVERT);
3654 if (err < 0) 3764 if (err < 0)
3655 goto out; 3765 goto out;
3656 ext4_ext_drop_refs(path); 3766 ext4_ext_drop_refs(path);
@@ -3851,6 +3961,38 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3851} 3961}
3852 3962
3853static int 3963static int
3964ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
3965 struct ext4_map_blocks *map,
3966 struct ext4_ext_path *path, int flags,
3967 unsigned int allocated, ext4_fsblk_t newblock)
3968{
3969 int ret = 0;
3970 int err = 0;
3971
3972 /*
3973 * Make sure that the extent is no bigger than we support with
3974 * an uninitialized extent.
3975 */
3976 if (map->m_len > EXT_UNINIT_MAX_LEN)
3977 map->m_len = EXT_UNINIT_MAX_LEN / 2;
3978
3979 ret = ext4_convert_initialized_extents(handle, inode, map,
3980 path);
3981 if (ret >= 0) {
3982 ext4_update_inode_fsync_trans(handle, inode, 1);
3983 err = check_eofblocks_fl(handle, inode, map->m_lblk,
3984 path, map->m_len);
3985 } else
3986 err = ret;
3987 map->m_flags |= EXT4_MAP_UNWRITTEN;
3988 if (allocated > map->m_len)
3989 allocated = map->m_len;
3990 map->m_len = allocated;
3991
3992 return err ? err : allocated;
3993}
3994
3995static int
3854ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3996ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3855 struct ext4_map_blocks *map, 3997 struct ext4_map_blocks *map,
3856 struct ext4_ext_path *path, int flags, 3998 struct ext4_ext_path *path, int flags,
@@ -3877,8 +4019,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3877 4019
3878 /* get_block() before submit the IO, split the extent */ 4020 /* get_block() before submit the IO, split the extent */
3879 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 4021 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3880 ret = ext4_split_unwritten_extents(handle, inode, map, 4022 ret = ext4_split_convert_extents(handle, inode, map,
3881 path, flags); 4023 path, flags | EXT4_GET_BLOCKS_CONVERT);
3882 if (ret <= 0) 4024 if (ret <= 0)
3883 goto out; 4025 goto out;
3884 /* 4026 /*
@@ -3993,10 +4135,6 @@ out1:
3993 map->m_pblk = newblock; 4135 map->m_pblk = newblock;
3994 map->m_len = allocated; 4136 map->m_len = allocated;
3995out2: 4137out2:
3996 if (path) {
3997 ext4_ext_drop_refs(path);
3998 kfree(path);
3999 }
4000 return err ? err : allocated; 4138 return err ? err : allocated;
4001} 4139}
4002 4140
@@ -4128,7 +4266,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4128 struct ext4_extent newex, *ex, *ex2; 4266 struct ext4_extent newex, *ex, *ex2;
4129 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4267 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4130 ext4_fsblk_t newblock = 0; 4268 ext4_fsblk_t newblock = 0;
4131 int free_on_err = 0, err = 0, depth; 4269 int free_on_err = 0, err = 0, depth, ret;
4132 unsigned int allocated = 0, offset = 0; 4270 unsigned int allocated = 0, offset = 0;
4133 unsigned int allocated_clusters = 0; 4271 unsigned int allocated_clusters = 0;
4134 struct ext4_allocation_request ar; 4272 struct ext4_allocation_request ar;
@@ -4170,6 +4308,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4170 ext4_fsblk_t ee_start = ext4_ext_pblock(ex); 4308 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4171 unsigned short ee_len; 4309 unsigned short ee_len;
4172 4310
4311
4173 /* 4312 /*
4174 * Uninitialized extents are treated as holes, except that 4313 * Uninitialized extents are treated as holes, except that
4175 * we split out initialized portions during a write. 4314 * we split out initialized portions during a write.
@@ -4186,13 +4325,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4186 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 4325 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
4187 ee_block, ee_len, newblock); 4326 ee_block, ee_len, newblock);
4188 4327
4189 if (!ext4_ext_is_uninitialized(ex)) 4328 /*
4329 * If the extent is initialized, check whether the
4330 * caller wants to convert it to unwritten.
4331 */
4332 if ((!ext4_ext_is_uninitialized(ex)) &&
4333 (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4334 allocated = ext4_ext_convert_initialized_extent(
4335 handle, inode, map, path, flags,
4336 allocated, newblock);
4337 goto out2;
4338 } else if (!ext4_ext_is_uninitialized(ex))
4190 goto out; 4339 goto out;
4191 4340
4192 allocated = ext4_ext_handle_uninitialized_extents( 4341 ret = ext4_ext_handle_uninitialized_extents(
4193 handle, inode, map, path, flags, 4342 handle, inode, map, path, flags,
4194 allocated, newblock); 4343 allocated, newblock);
4195 goto out3; 4344 if (ret < 0)
4345 err = ret;
4346 else
4347 allocated = ret;
4348 goto out2;
4196 } 4349 }
4197 } 4350 }
4198 4351
@@ -4473,7 +4626,6 @@ out2:
4473 kfree(path); 4626 kfree(path);
4474 } 4627 }
4475 4628
4476out3:
4477 trace_ext4_ext_map_blocks_exit(inode, flags, map, 4629 trace_ext4_ext_map_blocks_exit(inode, flags, map,
4478 err ? err : allocated); 4630 err ? err : allocated);
4479 ext4_es_lru_add(inode); 4631 ext4_es_lru_add(inode);
@@ -4514,34 +4666,203 @@ retry:
4514 ext4_std_error(inode->i_sb, err); 4666 ext4_std_error(inode->i_sb, err);
4515} 4667}
4516 4668
4517static void ext4_falloc_update_inode(struct inode *inode, 4669static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4518 int mode, loff_t new_size, int update_ctime) 4670 ext4_lblk_t len, int flags, int mode)
4671{
4672 struct inode *inode = file_inode(file);
4673 handle_t *handle;
4674 int ret = 0;
4675 int ret2 = 0;
4676 int retries = 0;
4677 struct ext4_map_blocks map;
4678 unsigned int credits;
4679
4680 map.m_lblk = offset;
4681 /*
4682 * Don't normalize the request if it can fit in one extent so
4683 * that it doesn't get unnecessarily split into multiple
4684 * extents.
4685 */
4686 if (len <= EXT_UNINIT_MAX_LEN)
4687 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4688
4689 /*
4690 * credits to insert 1 extent into extent tree
4691 */
4692 credits = ext4_chunk_trans_blocks(inode, len);
4693
4694retry:
4695 while (ret >= 0 && ret < len) {
4696 map.m_lblk = map.m_lblk + ret;
4697 map.m_len = len = len - ret;
4698 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4699 credits);
4700 if (IS_ERR(handle)) {
4701 ret = PTR_ERR(handle);
4702 break;
4703 }
4704 ret = ext4_map_blocks(handle, inode, &map, flags);
4705 if (ret <= 0) {
4706 ext4_debug("inode #%lu: block %u: len %u: "
4707 "ext4_ext_map_blocks returned %d",
4708 inode->i_ino, map.m_lblk,
4709 map.m_len, ret);
4710 ext4_mark_inode_dirty(handle, inode);
4711 ret2 = ext4_journal_stop(handle);
4712 break;
4713 }
4714 ret2 = ext4_journal_stop(handle);
4715 if (ret2)
4716 break;
4717 }
4718 if (ret == -ENOSPC &&
4719 ext4_should_retry_alloc(inode->i_sb, &retries)) {
4720 ret = 0;
4721 goto retry;
4722 }
4723
4724 return ret > 0 ? ret2 : ret;
4725}
4726
4727static long ext4_zero_range(struct file *file, loff_t offset,
4728 loff_t len, int mode)
4519{ 4729{
4520 struct timespec now; 4730 struct inode *inode = file_inode(file);
4731 handle_t *handle = NULL;
4732 unsigned int max_blocks;
4733 loff_t new_size = 0;
4734 int ret = 0;
4735 int flags;
4736 int partial;
4737 loff_t start, end;
4738 ext4_lblk_t lblk;
4739 struct address_space *mapping = inode->i_mapping;
4740 unsigned int blkbits = inode->i_blkbits;
4741
4742 trace_ext4_zero_range(inode, offset, len, mode);
4521 4743
4522 if (update_ctime) { 4744 if (!S_ISREG(inode->i_mode))
4523 now = current_fs_time(inode->i_sb); 4745 return -EINVAL;
4524 if (!timespec_equal(&inode->i_ctime, &now)) 4746
4525 inode->i_ctime = now; 4747 /*
4748 * Write out all dirty pages to avoid race conditions.
4749 * Then release them.
4750 */
4751 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4752 ret = filemap_write_and_wait_range(mapping, offset,
4753 offset + len - 1);
4754 if (ret)
4755 return ret;
4526 } 4756 }
4757
4758 /*
4759 * Round up offset. This is not fallocate, we need to zero out
4760 * blocks, so convert interior block aligned part of the range to
4761 * unwritten and possibly manually zero out unaligned parts of the
4762 * range.
4763 */
4764 start = round_up(offset, 1 << blkbits);
4765 end = round_down((offset + len), 1 << blkbits);
4766
4767 if (start < offset || end > offset + len)
4768 return -EINVAL;
4769 partial = (offset + len) & ((1 << blkbits) - 1);
4770
4771 lblk = start >> blkbits;
4772 max_blocks = (end >> blkbits);
4773 if (max_blocks < lblk)
4774 max_blocks = 0;
4775 else
4776 max_blocks -= lblk;
4777
4778 flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
4779 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
4780 if (mode & FALLOC_FL_KEEP_SIZE)
4781 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4782
4783 mutex_lock(&inode->i_mutex);
4784
4527 /* 4785 /*
4528 * Update only when preallocation was requested beyond 4786 * Indirect files do not support unwritten extents
4529 * the file size.
4530 */ 4787 */
4531 if (!(mode & FALLOC_FL_KEEP_SIZE)) { 4788 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4789 ret = -EOPNOTSUPP;
4790 goto out_mutex;
4791 }
4792
4793 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4794 offset + len > i_size_read(inode)) {
4795 new_size = offset + len;
4796 ret = inode_newsize_ok(inode, new_size);
4797 if (ret)
4798 goto out_mutex;
4799 /*
4800 * If we have a partial block after EOF we have to allocate
4801 * the entire block.
4802 */
4803 if (partial)
4804 max_blocks += 1;
4805 }
4806
4807 if (max_blocks > 0) {
4808
4809 /* Now release the pages and zero block aligned part of pages */
4810 truncate_pagecache_range(inode, start, end - 1);
4811
4812 /* Wait for all existing dio workers; newcomers will block on i_mutex */
4813 ext4_inode_block_unlocked_dio(inode);
4814 inode_dio_wait(inode);
4815
4816 /*
4817 * Remove entire range from the extent status tree.
4818 */
4819 ret = ext4_es_remove_extent(inode, lblk, max_blocks);
4820 if (ret)
4821 goto out_dio;
4822
4823 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
4824 mode);
4825 if (ret)
4826 goto out_dio;
4827 }
4828
4829 handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
4830 if (IS_ERR(handle)) {
4831 ret = PTR_ERR(handle);
4832 ext4_std_error(inode->i_sb, ret);
4833 goto out_dio;
4834 }
4835
4836 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4837
4838 if (new_size) {
4532 if (new_size > i_size_read(inode)) 4839 if (new_size > i_size_read(inode))
4533 i_size_write(inode, new_size); 4840 i_size_write(inode, new_size);
4534 if (new_size > EXT4_I(inode)->i_disksize) 4841 if (new_size > EXT4_I(inode)->i_disksize)
4535 ext4_update_i_disksize(inode, new_size); 4842 ext4_update_i_disksize(inode, new_size);
4536 } else { 4843 } else {
4537 /* 4844 /*
4538 * Mark that we allocate beyond EOF so the subsequent truncate 4845 * Mark that we allocate beyond EOF so the subsequent truncate
4539 * can proceed even if the new size is the same as i_size. 4846 * can proceed even if the new size is the same as i_size.
4540 */ 4847 */
4541 if (new_size > i_size_read(inode)) 4848 if ((offset + len) > i_size_read(inode))
4542 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4849 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4543 } 4850 }
4544 4851
4852 ext4_mark_inode_dirty(handle, inode);
4853
4854 /* Zero out partial block at the edges of the range */
4855 ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4856
4857 if (file->f_flags & O_SYNC)
4858 ext4_handle_sync(handle);
4859
4860 ext4_journal_stop(handle);
4861out_dio:
4862 ext4_inode_resume_unlocked_dio(inode);
4863out_mutex:
4864 mutex_unlock(&inode->i_mutex);
4865 return ret;
4545} 4866}
4546 4867
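ext4_zero_range() above services the new FALLOC_FL_ZERO_RANGE mode: block-aligned parts of the range are converted to unwritten extents, while the unaligned edges are zeroed through the journal by ext4_zero_partial_blocks(). A minimal sketch of the userspace call, assuming a scratch file on an ext4 mount:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
        int fd = open("scratch", O_RDWR | O_CREAT, 0644);
        if (fd < 0) { perror("open"); return 1; }

        /* Zero 1 MiB starting at offset 4096 without changing i_size. */
        if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
                      4096, 1 << 20) != 0)
                perror("fallocate");

        close(fd);
        return 0;
}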
4547/* 4868/*
@@ -4555,17 +4876,17 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4555{ 4876{
4556 struct inode *inode = file_inode(file); 4877 struct inode *inode = file_inode(file);
4557 handle_t *handle; 4878 handle_t *handle;
4558 loff_t new_size; 4879 loff_t new_size = 0;
4559 unsigned int max_blocks; 4880 unsigned int max_blocks;
4560 int ret = 0; 4881 int ret = 0;
4561 int ret2 = 0;
4562 int retries = 0;
4563 int flags; 4882 int flags;
4564 struct ext4_map_blocks map; 4883 ext4_lblk_t lblk;
4565 unsigned int credits, blkbits = inode->i_blkbits; 4884 struct timespec tv;
4885 unsigned int blkbits = inode->i_blkbits;
4566 4886
4567 /* Return error if mode is not supported */ 4887 /* Return error if mode is not supported */
4568 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 4888 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4889 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
4569 return -EOPNOTSUPP; 4890 return -EOPNOTSUPP;
4570 4891
4571 if (mode & FALLOC_FL_PUNCH_HOLE) 4892 if (mode & FALLOC_FL_PUNCH_HOLE)
@@ -4582,83 +4903,69 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4582 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4903 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4583 return -EOPNOTSUPP; 4904 return -EOPNOTSUPP;
4584 4905
4906 if (mode & FALLOC_FL_COLLAPSE_RANGE)
4907 return ext4_collapse_range(inode, offset, len);
4908
4909 if (mode & FALLOC_FL_ZERO_RANGE)
4910 return ext4_zero_range(file, offset, len, mode);
4911
4585 trace_ext4_fallocate_enter(inode, offset, len, mode); 4912 trace_ext4_fallocate_enter(inode, offset, len, mode);
4586 map.m_lblk = offset >> blkbits; 4913 lblk = offset >> blkbits;
4587 /* 4914 /*
4588 * We can't just convert len to max_blocks because 4915 * We can't just convert len to max_blocks because
4589 * if blocksize = 4096, offset = 3072 and len = 2048 4916 * if blocksize = 4096, offset = 3072 and len = 2048
4590 */ 4917 */
4591 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 4918 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
4592 - map.m_lblk; 4919 - lblk;
4593 /* 4920
4594 * credits to insert 1 extent into extent tree
4595 */
4596 credits = ext4_chunk_trans_blocks(inode, max_blocks);
4597 mutex_lock(&inode->i_mutex);
4598 ret = inode_newsize_ok(inode, (len + offset));
4599 if (ret) {
4600 mutex_unlock(&inode->i_mutex);
4601 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4602 return ret;
4603 }
4604 flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT; 4921 flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
4605 if (mode & FALLOC_FL_KEEP_SIZE) 4922 if (mode & FALLOC_FL_KEEP_SIZE)
4606 flags |= EXT4_GET_BLOCKS_KEEP_SIZE; 4923 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4607 /*
4608 * Don't normalize the request if it can fit in one extent so
4609 * that it doesn't get unnecessarily split into multiple
4610 * extents.
4611 */
4612 if (len <= EXT_UNINIT_MAX_LEN << blkbits)
4613 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4614 4924
4615retry: 4925 mutex_lock(&inode->i_mutex);
4616 while (ret >= 0 && ret < max_blocks) {
4617 map.m_lblk = map.m_lblk + ret;
4618 map.m_len = max_blocks = max_blocks - ret;
4619 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4620 credits);
4621 if (IS_ERR(handle)) {
4622 ret = PTR_ERR(handle);
4623 break;
4624 }
4625 ret = ext4_map_blocks(handle, inode, &map, flags);
4626 if (ret <= 0) {
4627#ifdef EXT4FS_DEBUG
4628 ext4_warning(inode->i_sb,
4629 "inode #%lu: block %u: len %u: "
4630 "ext4_ext_map_blocks returned %d",
4631 inode->i_ino, map.m_lblk,
4632 map.m_len, ret);
4633#endif
4634 ext4_mark_inode_dirty(handle, inode);
4635 ret2 = ext4_journal_stop(handle);
4636 break;
4637 }
4638 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
4639 blkbits) >> blkbits))
4640 new_size = offset + len;
4641 else
4642 new_size = ((loff_t) map.m_lblk + ret) << blkbits;
4643 4926
4644 ext4_falloc_update_inode(inode, mode, new_size, 4927 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4645 (map.m_flags & EXT4_MAP_NEW)); 4928 offset + len > i_size_read(inode)) {
4646 ext4_mark_inode_dirty(handle, inode); 4929 new_size = offset + len;
4647 if ((file->f_flags & O_SYNC) && ret >= max_blocks) 4930 ret = inode_newsize_ok(inode, new_size);
4648 ext4_handle_sync(handle); 4931 if (ret)
4649 ret2 = ext4_journal_stop(handle); 4932 goto out;
4650 if (ret2)
4651 break;
4652 } 4933 }
4653 if (ret == -ENOSPC && 4934
4654 ext4_should_retry_alloc(inode->i_sb, &retries)) { 4935 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode);
4655 ret = 0; 4936 if (ret)
4656 goto retry; 4937 goto out;
4938
4939 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
4940 if (IS_ERR(handle))
4941 goto out;
4942
4943 tv = inode->i_ctime = ext4_current_time(inode);
4944
4945 if (new_size) {
4946 if (new_size > i_size_read(inode)) {
4947 i_size_write(inode, new_size);
4948 inode->i_mtime = tv;
4949 }
4950 if (new_size > EXT4_I(inode)->i_disksize)
4951 ext4_update_i_disksize(inode, new_size);
4952 } else {
4953 /*
4954 * Mark that we allocate beyond EOF so the subsequent truncate
4955 * can proceed even if the new size is the same as i_size.
4956 */
4957 if ((offset + len) > i_size_read(inode))
4958 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4657 } 4959 }
4960 ext4_mark_inode_dirty(handle, inode);
4961 if (file->f_flags & O_SYNC)
4962 ext4_handle_sync(handle);
4963
4964 ext4_journal_stop(handle);
4965out:
4658 mutex_unlock(&inode->i_mutex); 4966 mutex_unlock(&inode->i_mutex);
4659 trace_ext4_fallocate_exit(inode, offset, max_blocks, 4967 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4660 ret > 0 ? ret2 : ret); 4968 return ret;
4661 return ret > 0 ? ret2 : ret;
4662} 4969}
4663 4970
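The rewritten ext4_fallocate() above now delegates the per-chunk allocation loop to ext4_alloc_file_blocks() and only handles size and timestamp updates itself; the userspace contract is unchanged. A minimal sketch of a plain preallocation, with a hypothetical file name, showing the FALLOC_FL_KEEP_SIZE case that reserves blocks past EOF while leaving i_size alone:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
        struct stat st;
        int fd = open("data.bin", O_RDWR | O_CREAT, 0644);
        if (fd < 0) { perror("open"); return 1; }

        /* Reserve 16 MiB beyond EOF; st_size stays 0, st_blocks grows. */
        if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) != 0)
                perror("fallocate");

        if (fstat(fd, &st) == 0)
                printf("size=%lld blocks=%lld\n",
                       (long long)st.st_size, (long long)st.st_blocks);

        close(fd);
        return 0;
}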
4664/* 4971/*
@@ -4869,3 +5176,333 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4869 ext4_es_lru_add(inode); 5176 ext4_es_lru_add(inode);
4870 return error; 5177 return error;
4871} 5178}
5179
5180/*
5181 * ext4_access_path:
5182 * Function to access the path buffer for marking it dirty.
5183 * It also checks if there are sufficient credits left in the journal handle
5184 * to update path.
5185 */
5186static int
5187ext4_access_path(handle_t *handle, struct inode *inode,
5188 struct ext4_ext_path *path)
5189{
5190 int credits, err;
5191
5192 if (!ext4_handle_valid(handle))
5193 return 0;
5194
5195 /*
5196 * Check if we need to extend journal credits:
5197 * 3 for leaf, sb, and inode plus 2 (bmap and group
5198 * descriptor) for each block group; assume two block
5199 * groups
5200 */
5201 if (handle->h_buffer_credits < 7) {
5202 credits = ext4_writepage_trans_blocks(inode);
5203 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
5204 /* EAGAIN is success */
5205 if (err && err != -EAGAIN)
5206 return err;
5207 }
5208
5209 err = ext4_ext_get_access(handle, inode, path);
5210 return err;
5211}
5212
5213/*
5214 * ext4_ext_shift_path_extents:
5215 * Shift the extents of a path structure lying between path[depth].p_ext
5216 * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
5217 * from starting block for each extent.
5218 */
5219static int
5220ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5221 struct inode *inode, handle_t *handle,
5222 ext4_lblk_t *start)
5223{
5224 int depth, err = 0;
5225 struct ext4_extent *ex_start, *ex_last;
5226 bool update = 0;
5227 depth = path->p_depth;
5228
5229 while (depth >= 0) {
5230 if (depth == path->p_depth) {
5231 ex_start = path[depth].p_ext;
5232 if (!ex_start)
5233 return -EIO;
5234
5235 ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5236 if (!ex_last)
5237 return -EIO;
5238
5239 err = ext4_access_path(handle, inode, path + depth);
5240 if (err)
5241 goto out;
5242
5243 if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
5244 update = 1;
5245
5246 *start = le32_to_cpu(ex_last->ee_block) +
5247 ext4_ext_get_actual_len(ex_last);
5248
5249 while (ex_start <= ex_last) {
5250 le32_add_cpu(&ex_start->ee_block, -shift);
5251 /* Try to merge to the left. */
5252 if ((ex_start >
5253 EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
5254 ext4_ext_try_to_merge_right(inode,
5255 path, ex_start - 1))
5256 ex_last--;
5257 else
5258 ex_start++;
5259 }
5260 err = ext4_ext_dirty(handle, inode, path + depth);
5261 if (err)
5262 goto out;
5263
5264 if (--depth < 0 || !update)
5265 break;
5266 }
5267
5268 /* Update index too */
5269 err = ext4_access_path(handle, inode, path + depth);
5270 if (err)
5271 goto out;
5272
5273 le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
5274 err = ext4_ext_dirty(handle, inode, path + depth);
5275 if (err)
5276 goto out;
5277
5278 /* we are done if current index is not a starting index */
5279 if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5280 break;
5281
5282 depth--;
5283 }
5284
5285out:
5286 return err;
5287}
5288
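ext4_ext_shift_path_extents() above subtracts the shift from every extent in a leaf and opportunistically merges each shifted extent with its left neighbour. A self-contained sketch of that per-leaf pass over a flat in-memory array, with a hypothetical extent type rather than the on-disk struct ext4_extent:

#include <stdio.h>

struct ext { unsigned int block, len; };        /* logical start + length */

/* Shift every extent down by 'shift' blocks, merging neighbours that
 * become contiguous; returns the new extent count. */
static int shift_extents(struct ext *e, int n, unsigned int shift)
{
        int i, out = 0;

        for (i = 0; i < n; i++) {
                e[i].block -= shift;
                if (out > 0 &&
                    e[out - 1].block + e[out - 1].len == e[i].block)
                        e[out - 1].len += e[i].len;     /* merge left */
                else
                        e[out++] = e[i];
        }
        return out;
}

int main(void)
{
        struct ext e[] = { { 100, 8 }, { 108, 4 }, { 120, 2 } };
        int i, n = shift_extents(e, 3, 10);

        for (i = 0; i < n; i++)
                printf("extent: block %u, len %u\n", e[i].block, e[i].len);
        return 0;       /* prints (90,12) and (110,2) */
}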
5289/*
5290 * ext4_ext_shift_extents:
5291 * All the extents which lie in the range from start to the last allocated
5292 * block for the file are shifted downwards by shift blocks.
5293 * On success, 0 is returned; an error code otherwise.
5294 */
5295static int
5296ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5297 ext4_lblk_t start, ext4_lblk_t shift)
5298{
5299 struct ext4_ext_path *path;
5300 int ret = 0, depth;
5301 struct ext4_extent *extent;
5302 ext4_lblk_t stop_block, current_block;
5303 ext4_lblk_t ex_start, ex_end;
5304
5305 /* Let path point to the last extent */
5306 path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
5307 if (IS_ERR(path))
5308 return PTR_ERR(path);
5309
5310 depth = path->p_depth;
5311 extent = path[depth].p_ext;
5312 if (!extent) {
5313 ext4_ext_drop_refs(path);
5314 kfree(path);
5315 return ret;
5316 }
5317
5318 stop_block = le32_to_cpu(extent->ee_block) +
5319 ext4_ext_get_actual_len(extent);
5320 ext4_ext_drop_refs(path);
5321 kfree(path);
5322
5323 /* Nothing to shift if the hole is at the end of the file */
5324 if (start >= stop_block)
5325 return ret;
5326
5327 /*
5328 * Don't start shifting extents until we make sure the hole is big
5329 * enough to accommodate the shift.
5330 */
5331 path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
5332 if (IS_ERR(path))
5333 return PTR_ERR(path);
5334 depth = path->p_depth;
5335 extent = path[depth].p_ext;
5336 if (extent) {
5337 ex_start = le32_to_cpu(extent->ee_block);
5338 ex_end = le32_to_cpu(extent->ee_block) +
5339 ext4_ext_get_actual_len(extent);
5340 } else {
5341 ex_start = 0;
5342 ex_end = 0;
5343 }
5344 ext4_ext_drop_refs(path);
5345 kfree(path);
5346
5347 if ((start == ex_start && shift > ex_start) ||
5348 (shift > start - ex_end))
5349 return -EINVAL;
5350
5351 /* It's safe to start updating extents */
5352 while (start < stop_block) {
5353 path = ext4_ext_find_extent(inode, start, NULL, 0);
5354 if (IS_ERR(path))
5355 return PTR_ERR(path);
5356 depth = path->p_depth;
5357 extent = path[depth].p_ext;
5358 if (!extent) {
5359 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
5360 (unsigned long) start);
5361 return -EIO;
5362 }
5363
5364 current_block = le32_to_cpu(extent->ee_block);
5365 if (start > current_block) {
5366 /* Hole, move to the next extent */
5367 ret = mext_next_extent(inode, path, &extent);
5368 if (ret != 0) {
5369 ext4_ext_drop_refs(path);
5370 kfree(path);
5371 if (ret == 1)
5372 ret = 0;
5373 break;
5374 }
5375 }
5376 ret = ext4_ext_shift_path_extents(path, shift, inode,
5377 handle, &start);
5378 ext4_ext_drop_refs(path);
5379 kfree(path);
5380 if (ret)
5381 break;
5382 }
5383
5384 return ret;
5385}
5386
5387/*
5388 * ext4_collapse_range:
5389 * This implements fallocate's collapse-range functionality for ext4.
5390 * Returns 0 on success, a negative error code on failure.
5391 */
5392int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5393{
5394 struct super_block *sb = inode->i_sb;
5395 ext4_lblk_t punch_start, punch_stop;
5396 handle_t *handle;
5397 unsigned int credits;
5398 loff_t new_size, ioffset;
5399 int ret;
5400
5401 /* Collapse range works only on fs block size aligned offsets. */
5402 if (offset & (EXT4_BLOCK_SIZE(sb) - 1) ||
5403 len & (EXT4_BLOCK_SIZE(sb) - 1))
5404 return -EINVAL;
5405
5406 if (!S_ISREG(inode->i_mode))
5407 return -EINVAL;
5408
5409 if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1)
5410 return -EOPNOTSUPP;
5411
5412 trace_ext4_collapse_range(inode, offset, len);
5413
5414 punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5415 punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5416
5417 /* Call ext4_force_commit to flush all data in case of data=journal. */
5418 if (ext4_should_journal_data(inode)) {
5419 ret = ext4_force_commit(inode->i_sb);
5420 if (ret)
5421 return ret;
5422 }
5423
5424 /*
5425 * Need to round down offset to be aligned with page size boundary
5426 * for page size > block size.
5427 */
5428 ioffset = round_down(offset, PAGE_SIZE);
5429
5430 /* Write out all dirty pages */
5431 ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
5432 LLONG_MAX);
5433 if (ret)
5434 return ret;
5435
5436 /* Take mutex lock */
5437 mutex_lock(&inode->i_mutex);
5438
5439 /*
5440	 * The collapse range must not reach or cross EOF; collapsing up to
5441	 * or past EOF is effectively a truncate operation.
5442 */
5443 if (offset + len >= i_size_read(inode)) {
5444 ret = -EINVAL;
5445 goto out_mutex;
5446 }
5447
5448 /* Currently just for extent based files */
5449 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5450 ret = -EOPNOTSUPP;
5451 goto out_mutex;
5452 }
5453
5454 truncate_pagecache(inode, ioffset);
5455
5456 /* Wait for existing dio to complete */
5457 ext4_inode_block_unlocked_dio(inode);
5458 inode_dio_wait(inode);
5459
5460 credits = ext4_writepage_trans_blocks(inode);
5461 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5462 if (IS_ERR(handle)) {
5463 ret = PTR_ERR(handle);
5464 goto out_dio;
5465 }
5466
5467 down_write(&EXT4_I(inode)->i_data_sem);
5468 ext4_discard_preallocations(inode);
5469
5470 ret = ext4_es_remove_extent(inode, punch_start,
5471 EXT_MAX_BLOCKS - punch_start);
5472 if (ret) {
5473 up_write(&EXT4_I(inode)->i_data_sem);
5474 goto out_stop;
5475 }
5476
5477 ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5478 if (ret) {
5479 up_write(&EXT4_I(inode)->i_data_sem);
5480 goto out_stop;
5481 }
5482 ext4_discard_preallocations(inode);
5483
5484 ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5485 punch_stop - punch_start);
5486 if (ret) {
5487 up_write(&EXT4_I(inode)->i_data_sem);
5488 goto out_stop;
5489 }
5490
5491 new_size = i_size_read(inode) - len;
5492 i_size_write(inode, new_size);
5493 EXT4_I(inode)->i_disksize = new_size;
5494
5495 up_write(&EXT4_I(inode)->i_data_sem);
5496 if (IS_SYNC(inode))
5497 ext4_handle_sync(handle);
5498 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
5499 ext4_mark_inode_dirty(handle, inode);
5500
5501out_stop:
5502 ext4_journal_stop(handle);
5503out_dio:
5504 ext4_inode_resume_unlocked_dio(inode);
5505out_mutex:
5506 mutex_unlock(&inode->i_mutex);
5507 return ret;
5508}
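
For reference, the path above is reached from userspace through fallocate(2). A minimal sketch of a caller, assuming a v3.15+ kernel, an extent-mapped ext4 file named "testfile" (a hypothetical name), and offset/length that are multiples of the filesystem block size:

/* Sketch: exercising the collapse path from userspace. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR);

	if (fd < 0)
		return 1;
	/* Remove bytes [4096, 8192) and slide the tail of the file down. */
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 4096, 4096))
		perror("fallocate");	/* EINVAL on misalignment or a range at/past EOF */
	close(fd);
	return 0;
}
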
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 3981ff783950..0ebc21204b51 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -184,7 +184,7 @@ static void ext4_es_print_tree(struct inode *inode)
 	while (node) {
 		struct extent_status *es;
 		es = rb_entry(node, struct extent_status, rb_node);
-		printk(KERN_DEBUG " [%u/%u) %llu %llx",
+		printk(KERN_DEBUG " [%u/%u) %llu %x",
 		       es->es_lblk, es->es_len,
 		       ext4_es_pblock(es), ext4_es_status(es));
 		node = rb_next(node);
@@ -445,8 +445,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
 			pr_warn("ES insert assertion failed for "
 				"inode: %lu we can find an extent "
 				"at block [%d/%d/%llu/%c], but we "
-				"want to add an delayed/hole extent "
-				"[%d/%d/%llu/%llx]\n",
+				"want to add a delayed/hole extent "
+				"[%d/%d/%llu/%x]\n",
 				inode->i_ino, ee_block, ee_len,
 				ee_start, ee_status ? 'u' : 'w',
 				es->es_lblk, es->es_len,
@@ -486,8 +486,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
 		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
 			pr_warn("ES insert assertion failed for inode: %lu "
 				"can't find an extent at block %d but we want "
-				"to add an written/unwritten extent "
-				"[%d/%d/%llu/%llx]\n", inode->i_ino,
+				"to add a written/unwritten extent "
+				"[%d/%d/%llu/%x]\n", inode->i_ino,
 				es->es_lblk, es->es_lblk, es->es_len,
 				ext4_es_pblock(es), ext4_es_status(es));
 		}
@@ -524,7 +524,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
 			 */
 			pr_warn("ES insert assertion failed for inode: %lu "
 				"We can find blocks but we want to add a "
-				"delayed/hole extent [%d/%d/%llu/%llx]\n",
+				"delayed/hole extent [%d/%d/%llu/%x]\n",
 				inode->i_ino, es->es_lblk, es->es_len,
 				ext4_es_pblock(es), ext4_es_status(es));
 			return;
@@ -554,7 +554,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
 		if (ext4_es_is_written(es)) {
 			pr_warn("ES insert assertion failed for inode: %lu "
 				"We can't find the block but we want to add "
-				"an written extent [%d/%d/%llu/%llx]\n",
+				"a written extent [%d/%d/%llu/%x]\n",
 				inode->i_ino, es->es_lblk, es->es_len,
 				ext4_es_pblock(es), ext4_es_status(es));
 			return;
@@ -658,8 +658,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,

 	newes.es_lblk = lblk;
 	newes.es_len = len;
-	ext4_es_store_pblock(&newes, pblk);
-	ext4_es_store_status(&newes, status);
+	ext4_es_store_pblock_status(&newes, pblk, status);
 	trace_ext4_es_insert_extent(inode, &newes);

 	ext4_es_insert_extent_check(inode, &newes);
@@ -699,8 +698,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,

 	newes.es_lblk = lblk;
 	newes.es_len = len;
-	ext4_es_store_pblock(&newes, pblk);
-	ext4_es_store_status(&newes, status);
+	ext4_es_store_pblock_status(&newes, pblk, status);
 	trace_ext4_es_cache_extent(inode, &newes);

 	if (!len)
@@ -812,13 +810,13 @@ retry:

 		newes.es_lblk = end + 1;
 		newes.es_len = len2;
+		block = 0x7FDEADBEEFULL;
 		if (ext4_es_is_written(&orig_es) ||
-		    ext4_es_is_unwritten(&orig_es)) {
+		    ext4_es_is_unwritten(&orig_es))
 			block = ext4_es_pblock(&orig_es) +
 				orig_es.es_len - len2;
-			ext4_es_store_pblock(&newes, block);
-		}
-		ext4_es_store_status(&newes, ext4_es_status(&orig_es));
+		ext4_es_store_pblock_status(&newes, block,
+				ext4_es_status(&orig_es));
 		err = __es_insert_extent(inode, &newes);
 		if (err) {
 			es->es_lblk = orig_es.es_lblk;
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 167f4ab8ecc3..f1b62a419920 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -129,6 +129,15 @@ static inline void ext4_es_store_status(struct extent_status *es,
 		       (es->es_pblk & ~ES_MASK));
 }

+static inline void ext4_es_store_pblock_status(struct extent_status *es,
+					       ext4_fsblk_t pb,
+					       unsigned int status)
+{
+	es->es_pblk = (((ext4_fsblk_t)
+			(status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+		       (pb & ~ES_MASK));
+}
+
 extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
 extern void ext4_es_lru_add(struct inode *inode);
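
The new helper writes the physical block and the status flags into es_pblk with a single store, so the two fields can never be observed out of sync between two separate assignments. A standalone sketch of the same packing idea, using illustrative TOY_* shift and mask values rather than the ones defined in this header:

/* Sketch: status flags in the top bits, physical block in the low bits. */
#include <assert.h>
#include <stdint.h>

#define TOY_ES_SHIFT	60
#define TOY_ES_MASK	(~((1ULL << TOY_ES_SHIFT) - 1))

static uint64_t pack(uint64_t pblk, unsigned status)
{
	/* the kernel additionally masks status with EXTENT_STATUS_FLAGS */
	return ((uint64_t)status << TOY_ES_SHIFT) | (pblk & ~TOY_ES_MASK);
}

int main(void)
{
	uint64_t v = pack(123456, 0x8);

	assert((v & ~TOY_ES_MASK) == 123456);	/* block number survives */
	assert((v >> TOY_ES_SHIFT) == 0x8);	/* status survives */
	return 0;
}
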
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1a5073959f32..063fc1538355 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -82,7 +82,7 @@ ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
 	size_t count = iov_length(iov, nr_segs);
 	loff_t final_size = pos + count;

-	if (pos >= inode->i_size)
+	if (pos >= i_size_read(inode))
 		return 0;

 	if ((pos & blockmask) || (final_size & blockmask))
@@ -146,14 +146,14 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
 		overwrite = 1;
 	}

-	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+	ret = __generic_file_aio_write(iocb, iov, nr_segs);
 	mutex_unlock(&inode->i_mutex);

 	if (ret > 0) {
 		ssize_t err;

 		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
-		if (err < 0 && ret > 0)
+		if (err < 0)
 			ret = err;
 	}
 	blk_finish_plug(&plug);
@@ -200,6 +200,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,

 static const struct vm_operations_struct ext4_file_vm_ops = {
 	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
 	.page_mkwrite   = ext4_page_mkwrite,
 	.remap_pages    = generic_file_remap_pages,
 };
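
The i_size_read() change above touches ext4_unaligned_aio(), whose core test is a power-of-two alignment check on both ends of the write. A small standalone sketch of that predicate:

/* Sketch of the blockmask test: a write is "unaligned" if either
 * end falls inside a filesystem block. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool unaligned(uint64_t pos, uint64_t count, unsigned blocksize)
{
	uint64_t mask = blocksize - 1;	/* blocksize must be a power of two */

	return (pos & mask) || ((pos + count) & mask);
}

int main(void)
{
	printf("%d\n", unaligned(4096, 4096, 4096));	/* 0: fully aligned */
	printf("%d\n", unaligned(4096, 512, 4096));	/* 1: ends mid-block */
	return 0;
}
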
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 24bfd7ff3049..d7b7462a0e13 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -215,7 +215,7 @@ void ext4_evict_inode(struct inode *inode)
 			jbd2_complete_transaction(journal, commit_tid);
 			filemap_write_and_wait(&inode->i_data);
 		}
-		truncate_inode_pages(&inode->i_data, 0);
+		truncate_inode_pages_final(&inode->i_data);

 		WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 		goto no_delete;
@@ -226,7 +226,7 @@ void ext4_evict_inode(struct inode *inode)

 	if (ext4_should_order_data(inode))
 		ext4_begin_ordered_truncate(inode, 0);
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);

 	WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
 	if (is_bad_inode(inode))
@@ -504,6 +504,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 {
 	struct extent_status es;
 	int retval;
+	int ret = 0;
 #ifdef ES_AGGRESSIVE_TEST
 	struct ext4_map_blocks orig_map;

@@ -515,6 +516,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
 		  (unsigned long) map->m_lblk);

+	/*
+	 * ext4_map_blocks returns an int, and m_len is an unsigned int
+	 */
+	if (unlikely(map->m_len > INT_MAX))
+		map->m_len = INT_MAX;
+
+	/* We can handle the block number less than EXT_MAX_BLOCKS */
+	if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
+		return -EIO;
+
 	/* Lookup extent status tree firstly */
 	if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
 		ext4_es_lru_add(inode);
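
The INT_MAX clamp added above matters because the function reports the mapped length through its int return value while m_len is unsigned; without the clamp an oversized request could come back looking like a negative error code. A standalone sketch of that hazard:

/* Sketch: why the clamp preserves the "non-negative means length"
 * contract of an int return value. */
#include <limits.h>
#include <stdio.h>

static int map_len(unsigned int requested)
{
	if (requested > INT_MAX)	/* mirrors the kernel's clamp */
		requested = INT_MAX;
	return (int)requested;		/* now always >= 0 */
}

int main(void)
{
	printf("%d\n", map_len(3000000000u));	/* 2147483647, not a negative */
	return 0;
}
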
@@ -553,7 +564,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 					    EXT4_GET_BLOCKS_KEEP_SIZE);
 	}
 	if (retval > 0) {
-		int ret;
 		unsigned int status;

 		if (unlikely(retval != map->m_len)) {
@@ -580,7 +590,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,

 found:
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-		int ret = check_block_validity(inode, map);
+		ret = check_block_validity(inode, map);
 		if (ret != 0)
 			return ret;
 	}
@@ -597,7 +607,13 @@ found:
 	 * with buffer head unmapped.
 	 */
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
-		return retval;
+		/*
+		 * If we need to convert extent to unwritten
+		 * we continue and do the actual work in
+		 * ext4_ext_map_blocks()
+		 */
+		if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
+			return retval;

 	/*
 	 * Here we clear m_flags because after allocating an new extent,
@@ -653,7 +669,6 @@ found:
 		ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);

 	if (retval > 0) {
-		int ret;
 		unsigned int status;

 		if (unlikely(retval != map->m_len)) {
@@ -688,7 +703,7 @@ found:
 has_zeroout:
 	up_write((&EXT4_I(inode)->i_data_sem));
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-		int ret = check_block_validity(inode, map);
+		ret = check_block_validity(inode, map);
 		if (ret != 0)
 			return ret;
 	}
@@ -2232,13 +2247,23 @@ static int mpage_map_and_submit_extent(handle_t *handle,
 			return err;
 	} while (map->m_len);

-	/* Update on-disk size after IO is submitted */
+	/*
+	 * Update on-disk size after IO is submitted. Races with
+	 * truncate are avoided by checking i_size under i_data_sem.
+	 */
 	disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
 	if (disksize > EXT4_I(inode)->i_disksize) {
 		int err2;
-
-		ext4_wb_update_i_disksize(inode, disksize);
+		loff_t i_size;
+
+		down_write(&EXT4_I(inode)->i_data_sem);
+		i_size = i_size_read(inode);
+		if (disksize > i_size)
+			disksize = i_size;
+		if (disksize > EXT4_I(inode)->i_disksize)
+			EXT4_I(inode)->i_disksize = disksize;
 		err2 = ext4_mark_inode_dirty(handle, inode);
+		up_write(&EXT4_I(inode)->i_data_sem);
 		if (err2)
 			ext4_error(inode->i_sb,
 				   "Failed to mark inode %lu dirty",
@@ -3313,33 +3338,13 @@ void ext4_set_aops(struct inode *inode)
 }

 /*
- * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
- */
-int ext4_block_truncate_page(handle_t *handle,
-		struct address_space *mapping, loff_t from)
-{
-	unsigned offset = from & (PAGE_CACHE_SIZE-1);
-	unsigned length;
-	unsigned blocksize;
-	struct inode *inode = mapping->host;
-
-	blocksize = inode->i_sb->s_blocksize;
-	length = blocksize - (offset & (blocksize - 1));
-
-	return ext4_block_zero_page_range(handle, mapping, from, length);
-}
-
-/*
  * ext4_block_zero_page_range() zeros out a mapping of length 'length'
  * starting from file offset 'from'. The range to be zero'd must
  * be contained with in one block. If the specified range exceeds
  * the end of the block it will be shortened to end of the block
  * that cooresponds to 'from'
  */
-int ext4_block_zero_page_range(handle_t *handle,
+static int ext4_block_zero_page_range(handle_t *handle,
 		struct address_space *mapping, loff_t from, loff_t length)
 {
 	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -3429,6 +3434,26 @@ unlock:
 	return err;
 }

+/*
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+int ext4_block_truncate_page(handle_t *handle,
+		struct address_space *mapping, loff_t from)
+{
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned length;
+	unsigned blocksize;
+	struct inode *inode = mapping->host;
+
+	blocksize = inode->i_sb->s_blocksize;
+	length = blocksize - (offset & (blocksize - 1));
+
+	return ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
 int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
 			     loff_t lstart, loff_t length)
 {
@@ -3502,7 +3527,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	if (!S_ISREG(inode->i_mode))
 		return -EOPNOTSUPP;

-	trace_ext4_punch_hole(inode, offset, length);
+	trace_ext4_punch_hole(inode, offset, length, 0);

 	/*
 	 * Write out all dirty pages to avoid race conditions
@@ -3516,15 +3541,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	}

 	mutex_lock(&inode->i_mutex);
-	/* It's not possible punch hole on append only file */
-	if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
-		ret = -EPERM;
-		goto out_mutex;
-	}
-	if (IS_SWAPFILE(inode)) {
-		ret = -ETXTBSY;
-		goto out_mutex;
-	}

 	/* No need to punch hole beyond i_size */
 	if (offset >= inode->i_size)
@@ -3605,10 +3621,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 		ret = ext4_free_hole_blocks(handle, inode, first_block,
 					    stop_block);

-	ext4_discard_preallocations(inode);
 	up_write(&EXT4_I(inode)->i_data_sem);
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
+
+	/* Now release the pages again to reduce race window */
+	if (last_block_offset > first_block_offset)
+		truncate_pagecache_range(inode, first_block_offset,
+					 last_block_offset);
+
 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 out_stop:
@@ -3682,7 +3703,7 @@ void ext4_truncate(struct inode *inode)

 	/*
 	 * There is a possibility that we're either freeing the inode
-	 * or it completely new indode. In those cases we might not
+	 * or it's a completely new inode. In those cases we might not
 	 * have i_mutex locked because it's not necessary.
 	 */
 	if (!(inode->i_state & (I_NEW|I_FREEING)))
@@ -3934,8 +3955,8 @@ void ext4_set_inode_flags(struct inode *inode)
 		new_fl |= S_NOATIME;
 	if (flags & EXT4_DIRSYNC_FL)
 		new_fl |= S_DIRSYNC;
-	set_mask_bits(&inode->i_flags,
-		      S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl);
+	inode_set_flags(inode, new_fl,
+			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
 }

 /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4154,11 +4175,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
 	EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);

-	inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
-	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
-		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
-			inode->i_version |=
-			(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+		inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+		if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+			if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+				inode->i_version |=
+		(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+		}
 	}

 	ret = 0;
@@ -4328,8 +4351,7 @@ static int ext4_do_update_inode(handle_t *handle,
 		goto out_brelse;
 	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
 	raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
-	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
-	    cpu_to_le32(EXT4_OS_HURD))
+	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
 		raw_inode->i_file_acl_high =
 			cpu_to_le16(ei->i_file_acl >> 32);
 	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
@@ -4374,12 +4396,15 @@ static int ext4_do_update_inode(handle_t *handle,
 			raw_inode->i_block[block] = ei->i_data[block];
 	}

-	raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
-	if (ei->i_extra_isize) {
-		if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
-			raw_inode->i_version_hi =
-				cpu_to_le32(inode->i_version >> 32);
-		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+		raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
+		if (ei->i_extra_isize) {
+			if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+				raw_inode->i_version_hi =
+					cpu_to_le32(inode->i_version >> 32);
+			raw_inode->i_extra_isize =
+				cpu_to_le16(ei->i_extra_isize);
+		}
 	}

 	ext4_inode_csum_set(inode, raw_inode, ei);
@@ -4402,21 +4427,20 @@ out_brelse:
 *
 * We are called from a few places:
 *
- * - Within generic_file_write() for O_SYNC files.
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
 *   Here, there will be no transaction running. We wait for any running
 *   transaction to commit.
 *
- * - Within sys_sync(), kupdate and such.
- *   We wait on commit, if tol to.
+ * - Within flush work (sys_sync(), kupdate and such).
+ *   We wait on commit, if told to.
 *
- * - Within prune_icache() (PF_MEMALLOC == true)
- *   Here we simply return.  We can't afford to block kswapd on the
- *   journal commit.
+ * - Within iput_final() -> write_inode_now()
+ *   We wait on commit, if told to.
 *
 * In all cases it is actually safe for us to return without doing anything,
 * because the inode has been copied into a raw inode buffer in
- * ext4_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
- * knfsd.
+ * ext4_mark_inode_dirty().  This is a correctness thing for WB_SYNC_ALL
+ * writeback.
 *
 * Note that we are absolutely dependent upon all inode dirtiers doing the
 * right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -4428,15 +4452,15 @@ out_brelse:
 *	stuff();
 *	inode->i_size = expr;
 *
- * is in error because a kswapd-driven write_inode() could occur while
- * `stuff()' is running, and the new i_size will be lost.  Plus the inode
- * will no longer be on the superblock's dirty inode list.
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost.  Plus the inode will no longer be on the
+ * superblock's dirty inode list.
 */
 int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int err;

-	if (current->flags & PF_MEMALLOC)
+	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
 		return 0;

 	if (EXT4_SB(inode->i_sb)->s_journal) {
@@ -4446,7 +4470,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 			return -EIO;
 		}

-		if (wbc->sync_mode != WB_SYNC_ALL)
+		/*
+		 * No need to force transaction in WB_SYNC_NONE mode. Also
+		 * ext4_sync_fs() will force the commit after everything is
+		 * written.
+		 */
+		if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
 			return 0;

 		err = ext4_force_commit(inode->i_sb);
@@ -4456,7 +4485,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 		err = __ext4_get_inode_loc(inode, &iloc, 0);
 		if (err)
 			return err;
-		if (wbc->sync_mode == WB_SYNC_ALL)
+		/*
+		 * sync(2) will flush the whole buffer cache. No need to do
+		 * it here separately for each inode.
+		 */
+		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
 			sync_dirty_buffer(iloc.bh);
 		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
 			EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
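
Taken together, the two wbc hunks mean ext4_write_inode() only forces a commit (or a per-inode buffer flush) for WB_SYNC_ALL writeback that is not part of sync(2), since ext4_sync_fs() commits once at the end for sync. The decision reduces to a two-input predicate, sketched here:

/* Sketch of the decision the hunks above encode. */
#include <stdbool.h>
#include <stdio.h>

static bool should_force_commit(bool sync_all, bool for_sync)
{
	return sync_all && !for_sync;
}

int main(void)
{
	printf("%d\n", should_force_commit(true, false));	/* 1: fsync-style writeback */
	printf("%d\n", should_force_commit(true, true));	/* 0: sync(2) batches at the end */
	printf("%d\n", should_force_commit(false, false));	/* 0: background writeback */
	return 0;
}
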
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a2a837f00407..0f2252ec274d 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -104,21 +104,15 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	struct ext4_inode_info *ei_bl;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);

-	if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) {
-		err = -EINVAL;
-		goto swap_boot_out;
-	}
+	if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode))
+		return -EINVAL;

-	if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
-		err = -EPERM;
-		goto swap_boot_out;
-	}
+	if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
+		return -EPERM;

 	inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
-	if (IS_ERR(inode_bl)) {
-		err = PTR_ERR(inode_bl);
-		goto swap_boot_out;
-	}
+	if (IS_ERR(inode_bl))
+		return PTR_ERR(inode_bl);
 	ei_bl = EXT4_I(inode_bl);

 	filemap_flush(inode->i_mapping);
@@ -193,20 +187,14 @@ static long swap_inode_boot_loader(struct super_block *sb,
 			ext4_mark_inode_dirty(handle, inode);
 		}
 	}
-
 	ext4_journal_stop(handle);
-
 	ext4_double_up_write_data_sem(inode, inode_bl);

 journal_err_out:
 	ext4_inode_resume_unlocked_dio(inode);
 	ext4_inode_resume_unlocked_dio(inode_bl);
-
 	unlock_two_nondirectories(inode, inode_bl);
-
 	iput(inode_bl);
-
-swap_boot_out:
 	return err;
 }

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 04a5c7504be9..c8238a26818c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -989,7 +989,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
 	poff = block % blocks_per_page;
 	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
 	if (!page)
-		return -EIO;
+		return -ENOMEM;
 	BUG_ON(page->mapping != inode->i_mapping);
 	e4b->bd_bitmap_page = page;
 	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
@@ -1003,7 +1003,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
 	pnum = block / blocks_per_page;
 	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
 	if (!page)
-		return -EIO;
+		return -ENOMEM;
 	BUG_ON(page->mapping != inode->i_mapping);
 	e4b->bd_buddy_page = page;
 	return 0;
@@ -1168,7 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 			unlock_page(page);
 		}
 	}
-	if (page == NULL || !PageUptodate(page)) {
+	if (page == NULL) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	if (!PageUptodate(page)) {
 		ret = -EIO;
 		goto err;
 	}
@@ -1197,7 +1201,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 			unlock_page(page);
 		}
 	}
-	if (page == NULL || !PageUptodate(page)) {
+	if (page == NULL) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	if (!PageUptodate(page)) {
 		ret = -EIO;
 		goto err;
 	}
@@ -1808,6 +1816,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
 	ext4_lock_group(ac->ac_sb, group);
 	max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
 			     ac->ac_g_ex.fe_len, &ex);
+	ex.fe_logical = 0xDEADFA11; /* debug value */

 	if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
 		ext4_fsblk_t start;
@@ -1936,7 +1945,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 			 */
 			break;
 		}
-
+		ex.fe_logical = 0xDEADC0DE; /* debug value */
 		ext4_mb_measure_extent(ac, &ex, e4b);

 		i += ex.fe_len;
@@ -1977,6 +1986,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
 		max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
 		if (max >= sbi->s_stripe) {
 			ac->ac_found++;
+			ex.fe_logical = 0xDEADF00D; /* debug value */
 			ac->ac_b_ex = ex;
 			ext4_mb_use_best_found(ac, e4b);
 			break;
@@ -4006,8 +4016,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 			(unsigned long)ac->ac_b_ex.fe_len,
 			(unsigned long)ac->ac_b_ex.fe_logical,
 			(int)ac->ac_criteria);
-	ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found",
-		 ac->ac_ex_scanned, ac->ac_found);
+	ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
 	ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
 	ngroups = ext4_get_groups_count(sb);
 	for (i = 0; i < ngroups; i++) {
@@ -5007,6 +5016,8 @@ error_return:
 */
 static int ext4_trim_extent(struct super_block *sb, int start, int count,
 			     ext4_group_t group, struct ext4_buddy *e4b)
+__releases(bitlock)
+__acquires(bitlock)
 {
 	struct ext4_free_extent ex;
 	int ret = 0;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 08481ee84cd5..d634e183b4d4 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -48,7 +48,7 @@ extern ushort ext4_mballoc_debug;
 	}								\
 } while (0)
 #else
-#define mb_debug(n, fmt, a...)
+#define mb_debug(n, fmt, a...)		no_printk(fmt, ## a)
 #endif

 #define EXT4_MB_HISTORY_ALLOC		1	/* allocation	*/
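
Defining the disabled mb_debug() as no_printk() keeps compile-time format checking of the arguments while generating no code or output. A userspace sketch of the same idiom (using a GNU C statement expression, as the kernel's no_printk does):

/* Sketch: the dead printf still type-checks its format arguments. */
#include <stdio.h>

#define no_printk(fmt, ...)				\
({							\
	if (0)						\
		printf(fmt, ##__VA_ARGS__);		\
	0;						\
})

#ifdef DEBUG
#define dbg(fmt, ...) printf(fmt, ##__VA_ARGS__)
#else
#define dbg(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif

int main(void)
{
	dbg("group %d\n", 42);	/* compiles (and warns on bad formats) either way */
	return 0;
}
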
@@ -175,8 +175,6 @@ struct ext4_allocation_context {
 	/* copy of the best found extent taken before preallocation efforts */
 	struct ext4_free_extent ac_f_ex;

-	/* number of iterations done. we have to track to limit searching */
-	unsigned long ac_ex_scanned;
 	__u16 ac_groups_scanned;
 	__u16 ac_found;
 	__u16 ac_tail;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 773b503bd18c..58ee7dc87669 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
 * ext4_ext_path structure refers to the last extent, or a negative error
 * value on failure.
 */
-static int
+int
 mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 		 struct ext4_extent **extent)
 {
@@ -861,8 +861,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
 			}
 			if (!buffer_mapped(bh)) {
 				zero_user(page, block_start, blocksize);
-				if (!err)
-					set_buffer_uptodate(bh);
+				set_buffer_uptodate(bh);
 				continue;
 			}
 		}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index d050e043e884..1cb84f78909e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3000,6 +3000,154 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
 	return ext4_get_first_inline_block(inode, parent_de, retval);
 }

+struct ext4_renament {
+	struct inode *dir;
+	struct dentry *dentry;
+	struct inode *inode;
+	bool is_dir;
+	int dir_nlink_delta;
+
+	/* entry for "dentry" */
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
+	int inlined;
+
+	/* entry for ".." in inode if it's a directory */
+	struct buffer_head *dir_bh;
+	struct ext4_dir_entry_2 *parent_de;
+	int dir_inlined;
+};
+
+static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
+{
+	int retval;
+
+	ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode,
+					       &retval, &ent->parent_de,
+					       &ent->dir_inlined);
+	if (!ent->dir_bh)
+		return retval;
+	if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino)
+		return -EIO;
+	BUFFER_TRACE(ent->dir_bh, "get_write_access");
+	return ext4_journal_get_write_access(handle, ent->dir_bh);
+}
+
+static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
+				  unsigned dir_ino)
+{
+	int retval;
+
+	ent->parent_de->inode = cpu_to_le32(dir_ino);
+	BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata");
+	if (!ent->dir_inlined) {
+		if (is_dx(ent->inode)) {
+			retval = ext4_handle_dirty_dx_node(handle,
+							   ent->inode,
+							   ent->dir_bh);
+		} else {
+			retval = ext4_handle_dirty_dirent_node(handle,
+							       ent->inode,
+							       ent->dir_bh);
+		}
+	} else {
+		retval = ext4_mark_inode_dirty(handle, ent->inode);
+	}
+	if (retval) {
+		ext4_std_error(ent->dir->i_sb, retval);
+		return retval;
+	}
+	return 0;
+}
+
+static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
+		       unsigned ino, unsigned file_type)
+{
+	int retval;
+
+	BUFFER_TRACE(ent->bh, "get write access");
+	retval = ext4_journal_get_write_access(handle, ent->bh);
+	if (retval)
+		return retval;
+	ent->de->inode = cpu_to_le32(ino);
+	if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb,
+				      EXT4_FEATURE_INCOMPAT_FILETYPE))
+		ent->de->file_type = file_type;
+	ent->dir->i_version++;
+	ent->dir->i_ctime = ent->dir->i_mtime =
+		ext4_current_time(ent->dir);
+	ext4_mark_inode_dirty(handle, ent->dir);
+	BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
+	if (!ent->inlined) {
+		retval = ext4_handle_dirty_dirent_node(handle,
+						       ent->dir, ent->bh);
+		if (unlikely(retval)) {
+			ext4_std_error(ent->dir->i_sb, retval);
+			return retval;
+		}
+	}
+	brelse(ent->bh);
+	ent->bh = NULL;
+
+	return 0;
+}
+
+static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
+				  const struct qstr *d_name)
+{
+	int retval = -ENOENT;
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de;
+
+	bh = ext4_find_entry(dir, d_name, &de, NULL);
+	if (bh) {
+		retval = ext4_delete_entry(handle, dir, de, bh);
+		brelse(bh);
+	}
+	return retval;
+}
+
+static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent)
+{
+	int retval;
+	/*
+	 * ent->de could have moved from under us during htree split, so make
+	 * sure that we are deleting the right entry. We might also be pointing
+	 * to a stale entry in the unused part of ent->bh so just checking inum
+	 * and the name isn't enough.
+	 */
+	if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino ||
+	    ent->de->name_len != ent->dentry->d_name.len ||
+	    strncmp(ent->de->name, ent->dentry->d_name.name,
+		    ent->de->name_len)) {
+		retval = ext4_find_delete_entry(handle, ent->dir,
+						&ent->dentry->d_name);
+	} else {
+		retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh);
+		if (retval == -ENOENT) {
+			retval = ext4_find_delete_entry(handle, ent->dir,
+							&ent->dentry->d_name);
+		}
+	}
+
+	if (retval) {
+		ext4_warning(ent->dir->i_sb,
+				"Deleting old file (%lu), %d, error=%d",
+				ent->dir->i_ino, ent->dir->i_nlink, retval);
+	}
+}
+
+static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
+{
+	if (ent->dir_nlink_delta) {
+		if (ent->dir_nlink_delta == -1)
+			ext4_dec_count(handle, ent->dir);
+		else
+			ext4_inc_count(handle, ent->dir);
+		ext4_mark_inode_dirty(handle, ent->dir);
+	}
+}
+
 /*
 * Anybody can rename anything with this: the permission checks are left to the
 * higher-level routines.
@@ -3012,198 +3160,267 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3012 struct inode *new_dir, struct dentry *new_dentry) 3160 struct inode *new_dir, struct dentry *new_dentry)
3013{ 3161{
3014 handle_t *handle = NULL; 3162 handle_t *handle = NULL;
3015 struct inode *old_inode, *new_inode; 3163 struct ext4_renament old = {
3016 struct buffer_head *old_bh, *new_bh, *dir_bh; 3164 .dir = old_dir,
3017 struct ext4_dir_entry_2 *old_de, *new_de; 3165 .dentry = old_dentry,
3166 .inode = old_dentry->d_inode,
3167 };
3168 struct ext4_renament new = {
3169 .dir = new_dir,
3170 .dentry = new_dentry,
3171 .inode = new_dentry->d_inode,
3172 };
3018 int retval; 3173 int retval;
3019 int inlined = 0, new_inlined = 0;
3020 struct ext4_dir_entry_2 *parent_de;
3021 3174
3022 dquot_initialize(old_dir); 3175 dquot_initialize(old.dir);
3023 dquot_initialize(new_dir); 3176 dquot_initialize(new.dir);
3024
3025 old_bh = new_bh = dir_bh = NULL;
3026 3177
3027 /* Initialize quotas before so that eventual writes go 3178 /* Initialize quotas before so that eventual writes go
3028 * in separate transaction */ 3179 * in separate transaction */
3029 if (new_dentry->d_inode) 3180 if (new.inode)
3030 dquot_initialize(new_dentry->d_inode); 3181 dquot_initialize(new.inode);
3031 3182
3032 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); 3183 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
3033 /* 3184 /*
3034 * Check for inode number is _not_ due to possible IO errors. 3185 * Check for inode number is _not_ due to possible IO errors.
3035 * We might rmdir the source, keep it as pwd of some process 3186 * We might rmdir the source, keep it as pwd of some process
3036 * and merrily kill the link to whatever was created under the 3187 * and merrily kill the link to whatever was created under the
3037 * same name. Goodbye sticky bit ;-< 3188 * same name. Goodbye sticky bit ;-<
3038 */ 3189 */
3039 old_inode = old_dentry->d_inode;
3040 retval = -ENOENT; 3190 retval = -ENOENT;
3041 if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) 3191 if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
3042 goto end_rename; 3192 goto end_rename;
3043 3193
3044 new_inode = new_dentry->d_inode; 3194 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
3045 new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, 3195 &new.de, &new.inlined);
3046 &new_de, &new_inlined); 3196 if (new.bh) {
3047 if (new_bh) { 3197 if (!new.inode) {
3048 if (!new_inode) { 3198 brelse(new.bh);
3049 brelse(new_bh); 3199 new.bh = NULL;
3050 new_bh = NULL;
3051 } 3200 }
3052 } 3201 }
3053 if (new_inode && !test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC)) 3202 if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC))
3054 ext4_alloc_da_blocks(old_inode); 3203 ext4_alloc_da_blocks(old.inode);
3055 3204
3056 handle = ext4_journal_start(old_dir, EXT4_HT_DIR, 3205 handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
3057 (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) + 3206 (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
3058 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); 3207 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
3059 if (IS_ERR(handle)) 3208 if (IS_ERR(handle))
3060 return PTR_ERR(handle); 3209 return PTR_ERR(handle);
3061 3210
3062 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) 3211 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
3063 ext4_handle_sync(handle); 3212 ext4_handle_sync(handle);
3064 3213
3065 if (S_ISDIR(old_inode->i_mode)) { 3214 if (S_ISDIR(old.inode->i_mode)) {
3066 if (new_inode) { 3215 if (new.inode) {
3067 retval = -ENOTEMPTY; 3216 retval = -ENOTEMPTY;
3068 if (!empty_dir(new_inode)) 3217 if (!empty_dir(new.inode))
3218 goto end_rename;
3219 } else {
3220 retval = -EMLINK;
3221 if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir))
3069 goto end_rename; 3222 goto end_rename;
3070 } 3223 }
3071 retval = -EIO; 3224 retval = ext4_rename_dir_prepare(handle, &old);
3072 dir_bh = ext4_get_first_dir_block(handle, old_inode,
3073 &retval, &parent_de,
3074 &inlined);
3075 if (!dir_bh)
3076 goto end_rename;
3077 if (le32_to_cpu(parent_de->inode) != old_dir->i_ino)
3078 goto end_rename;
3079 retval = -EMLINK;
3080 if (!new_inode && new_dir != old_dir &&
3081 EXT4_DIR_LINK_MAX(new_dir))
3082 goto end_rename;
3083 BUFFER_TRACE(dir_bh, "get_write_access");
3084 retval = ext4_journal_get_write_access(handle, dir_bh);
3085 if (retval) 3225 if (retval)
3086 goto end_rename; 3226 goto end_rename;
3087 } 3227 }
3088 if (!new_bh) { 3228 if (!new.bh) {
3089 retval = ext4_add_entry(handle, new_dentry, old_inode); 3229 retval = ext4_add_entry(handle, new.dentry, old.inode);
3090 if (retval) 3230 if (retval)
3091 goto end_rename; 3231 goto end_rename;
3092 } else { 3232 } else {
3093 BUFFER_TRACE(new_bh, "get write access"); 3233 retval = ext4_setent(handle, &new,
3094 retval = ext4_journal_get_write_access(handle, new_bh); 3234 old.inode->i_ino, old.de->file_type);
3095 if (retval) 3235 if (retval)
3096 goto end_rename; 3236 goto end_rename;
3097 new_de->inode = cpu_to_le32(old_inode->i_ino);
3098 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
3099 EXT4_FEATURE_INCOMPAT_FILETYPE))
3100 new_de->file_type = old_de->file_type;
3101 new_dir->i_version++;
3102 new_dir->i_ctime = new_dir->i_mtime =
3103 ext4_current_time(new_dir);
3104 ext4_mark_inode_dirty(handle, new_dir);
3105 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
3106 if (!new_inlined) {
3107 retval = ext4_handle_dirty_dirent_node(handle,
3108 new_dir, new_bh);
3109 if (unlikely(retval)) {
3110 ext4_std_error(new_dir->i_sb, retval);
3111 goto end_rename;
3112 }
3113 }
3114 brelse(new_bh);
3115 new_bh = NULL;
3116 } 3237 }
3117 3238
3118 /* 3239 /*
3119 * Like most other Unix systems, set the ctime for inodes on a 3240 * Like most other Unix systems, set the ctime for inodes on a
3120 * rename. 3241 * rename.
3121 */ 3242 */
3122 old_inode->i_ctime = ext4_current_time(old_inode); 3243 old.inode->i_ctime = ext4_current_time(old.inode);
3123 ext4_mark_inode_dirty(handle, old_inode); 3244 ext4_mark_inode_dirty(handle, old.inode);
3124 3245
3125 /* 3246 /*
3126 * ok, that's it 3247 * ok, that's it
3127 */ 3248 */
3128 if (le32_to_cpu(old_de->inode) != old_inode->i_ino || 3249 ext4_rename_delete(handle, &old);
3129 old_de->name_len != old_dentry->d_name.len || 3250
3130 strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) || 3251 if (new.inode) {
3131 (retval = ext4_delete_entry(handle, old_dir, 3252 ext4_dec_count(handle, new.inode);
3132 old_de, old_bh)) == -ENOENT) { 3253 new.inode->i_ctime = ext4_current_time(new.inode);
3133 /* old_de could have moved from under us during htree split, so
3134 * make sure that we are deleting the right entry. We might
3135 * also be pointing to a stale entry in the unused part of
3136 * old_bh so just checking inum and the name isn't enough. */
3137 struct buffer_head *old_bh2;
3138 struct ext4_dir_entry_2 *old_de2;
3139
3140 old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name,
3141 &old_de2, NULL);
3142 if (old_bh2) {
3143 retval = ext4_delete_entry(handle, old_dir,
3144 old_de2, old_bh2);
3145 brelse(old_bh2);
3146 }
3147 } 3254 }
3148 if (retval) { 3255 old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir);
3149 ext4_warning(old_dir->i_sb, 3256 ext4_update_dx_flag(old.dir);
3150 "Deleting old file (%lu), %d, error=%d", 3257 if (old.dir_bh) {
3151 old_dir->i_ino, old_dir->i_nlink, retval); 3258 retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
3152 } 3259 if (retval)
3153
3154 if (new_inode) {
3155 ext4_dec_count(handle, new_inode);
3156 new_inode->i_ctime = ext4_current_time(new_inode);
3157 }
3158 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
3159 ext4_update_dx_flag(old_dir);
3160 if (dir_bh) {
3161 parent_de->inode = cpu_to_le32(new_dir->i_ino);
3162 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
3163 if (!inlined) {
3164 if (is_dx(old_inode)) {
3165 retval = ext4_handle_dirty_dx_node(handle,
3166 old_inode,
3167 dir_bh);
3168 } else {
3169 retval = ext4_handle_dirty_dirent_node(handle,
3170 old_inode, dir_bh);
3171 }
3172 } else {
3173 retval = ext4_mark_inode_dirty(handle, old_inode);
3174 }
3175 if (retval) {
3176 ext4_std_error(old_dir->i_sb, retval);
3177 goto end_rename; 3260 goto end_rename;
3178 } 3261
3179 ext4_dec_count(handle, old_dir); 3262 ext4_dec_count(handle, old.dir);
3180 if (new_inode) { 3263 if (new.inode) {
3181 /* checked empty_dir above, can't have another parent, 3264 /* checked empty_dir above, can't have another parent,
3182 * ext4_dec_count() won't work for many-linked dirs */ 3265 * ext4_dec_count() won't work for many-linked dirs */
3183 clear_nlink(new_inode); 3266 clear_nlink(new.inode);
3184 } else { 3267 } else {
3185 ext4_inc_count(handle, new_dir); 3268 ext4_inc_count(handle, new.dir);
3186 ext4_update_dx_flag(new_dir); 3269 ext4_update_dx_flag(new.dir);
3187 ext4_mark_inode_dirty(handle, new_dir); 3270 ext4_mark_inode_dirty(handle, new.dir);
3188 } 3271 }
3189 } 3272 }
3190 ext4_mark_inode_dirty(handle, old_dir); 3273 ext4_mark_inode_dirty(handle, old.dir);
3191 if (new_inode) { 3274 if (new.inode) {
3192 ext4_mark_inode_dirty(handle, new_inode); 3275 ext4_mark_inode_dirty(handle, new.inode);
3193 if (!new_inode->i_nlink) 3276 if (!new.inode->i_nlink)
3194 ext4_orphan_add(handle, new_inode); 3277 ext4_orphan_add(handle, new.inode);
3195 } 3278 }
3196 retval = 0; 3279 retval = 0;
3197 3280
3198end_rename: 3281end_rename:
3199 brelse(dir_bh); 3282 brelse(old.dir_bh);
3200 brelse(old_bh); 3283 brelse(old.bh);
3201 brelse(new_bh); 3284 brelse(new.bh);
3202 if (handle) 3285 if (handle)
3203 ext4_journal_stop(handle); 3286 ext4_journal_stop(handle);
3204 return retval; 3287 return retval;
3205} 3288}
3206 3289
3290static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
3291 struct inode *new_dir, struct dentry *new_dentry)
3292{
3293 handle_t *handle = NULL;
3294 struct ext4_renament old = {
3295 .dir = old_dir,
3296 .dentry = old_dentry,
3297 .inode = old_dentry->d_inode,
3298 };
3299 struct ext4_renament new = {
3300 .dir = new_dir,
3301 .dentry = new_dentry,
3302 .inode = new_dentry->d_inode,
3303 };
3304 u8 new_file_type;
3305 int retval;
3306
3307 dquot_initialize(old.dir);
3308 dquot_initialize(new.dir);
3309
3310 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
3311 &old.de, &old.inlined);
3312 /*
3313 * Check for inode number is _not_ due to possible IO errors.
3314 * We might rmdir the source, keep it as pwd of some process
3315 * and merrily kill the link to whatever was created under the
3316 * same name. Goodbye sticky bit ;-<
3317 */
3318 retval = -ENOENT;
3319 if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
3320 goto end_rename;
3321
3322 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
3323 &new.de, &new.inlined);
3324
3325 /* RENAME_EXCHANGE case: old *and* new must both exist */
3326 if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
3327 goto end_rename;
3328
3329 handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
3330 (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
3331 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
3332 if (IS_ERR(handle))
3333 return PTR_ERR(handle);
3334
3335 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
3336 ext4_handle_sync(handle);
3337
3338 if (S_ISDIR(old.inode->i_mode)) {
3339 old.is_dir = true;
3340 retval = ext4_rename_dir_prepare(handle, &old);
3341 if (retval)
3342 goto end_rename;
3343 }
3344 if (S_ISDIR(new.inode->i_mode)) {
3345 new.is_dir = true;
3346 retval = ext4_rename_dir_prepare(handle, &new);
3347 if (retval)
3348 goto end_rename;
3349 }
3350
3351 /*
3352 * Other than the special case of overwriting a directory, parents'
3353 * nlink only needs to be modified if this is a cross directory rename.
3354 */
3355 if (old.dir != new.dir && old.is_dir != new.is_dir) {
3356 old.dir_nlink_delta = old.is_dir ? -1 : 1;
3357 new.dir_nlink_delta = -old.dir_nlink_delta;
3358 retval = -EMLINK;
3359 if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) ||
3360 (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir)))
3361 goto end_rename;
3362 }
3363
3364 new_file_type = new.de->file_type;
3365 retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type);
3366 if (retval)
3367 goto end_rename;
3368
3369 retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type);
3370 if (retval)
3371 goto end_rename;
3372
3373 /*
3374 * Like most other Unix systems, set the ctime for inodes on a
3375 * rename.
3376 */
3377 old.inode->i_ctime = ext4_current_time(old.inode);
3378 new.inode->i_ctime = ext4_current_time(new.inode);
3379 ext4_mark_inode_dirty(handle, old.inode);
3380 ext4_mark_inode_dirty(handle, new.inode);
3381
3382 if (old.dir_bh) {
3383 retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
3384 if (retval)
3385 goto end_rename;
3386 }
3387 if (new.dir_bh) {
3388 retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino);
3389 if (retval)
3390 goto end_rename;
3391 }
3392 ext4_update_dir_count(handle, &old);
3393 ext4_update_dir_count(handle, &new);
3394 retval = 0;
3395
3396end_rename:
3397 brelse(old.dir_bh);
3398 brelse(new.dir_bh);
3399 brelse(old.bh);
3400 brelse(new.bh);
3401 if (handle)
3402 ext4_journal_stop(handle);
3403 return retval;
3404}
3405
3406static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
3407 struct inode *new_dir, struct dentry *new_dentry,
3408 unsigned int flags)
3409{
3410 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
3411 return -EINVAL;
3412
3413 if (flags & RENAME_EXCHANGE) {
3414 return ext4_cross_rename(old_dir, old_dentry,
3415 new_dir, new_dentry);
3416 }
3417 /*
3418	 * The VFS has already done the existence check for "RENAME_NOREPLACE",
3419	 * so here it is equivalent to a regular rename.
3420 */
3421 return ext4_rename(old_dir, old_dentry, new_dir, new_dentry);
3422}
3423
3207/* 3424/*
3208 * directories can handle most operations... 3425 * directories can handle most operations...
3209 */ 3426 */
@@ -3218,6 +3435,7 @@ const struct inode_operations ext4_dir_inode_operations = {
3218 .mknod = ext4_mknod, 3435 .mknod = ext4_mknod,
3219 .tmpfile = ext4_tmpfile, 3436 .tmpfile = ext4_tmpfile,
3220 .rename = ext4_rename, 3437 .rename = ext4_rename,
3438 .rename2 = ext4_rename2,
3221 .setattr = ext4_setattr, 3439 .setattr = ext4_setattr,
3222 .setxattr = generic_setxattr, 3440 .setxattr = generic_setxattr,
3223 .getxattr = generic_getxattr, 3441 .getxattr = generic_getxattr,
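
The ext4_cross_rename() path above is reached through the renameat2() system call with RENAME_EXCHANGE, which atomically swaps two existing directory entries. A minimal user-space sketch, assuming headers new enough to define SYS_renameat2 (the RENAME_* values below are the upstream UAPI ones, supplied only as a fallback; on a pre-3.15 kernel the call fails with ENOSYS):

/* swap two paths atomically via renameat2(RENAME_EXCHANGE) */
#include <fcntl.h>              /* AT_FDCWD */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef RENAME_NOREPLACE
#define RENAME_NOREPLACE (1 << 0)
#endif
#ifndef RENAME_EXCHANGE
#define RENAME_EXCHANGE (1 << 1)
#endif

int main(int argc, char **argv)
{
        if (argc != 3) {
                fprintf(stderr, "usage: %s <path1> <path2>\n", argv[0]);
                return 1;
        }
        /* both names must already exist, matching the check above */
        if (syscall(SYS_renameat2, AT_FDCWD, argv[1],
                    AT_FDCWD, argv[2], RENAME_EXCHANGE) < 0) {
                perror("renameat2");
                return 1;
        }
        return 0;
}
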
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index ab95508e3d40..c18d95b50540 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -308,13 +308,14 @@ static void ext4_end_bio(struct bio *bio, int error)
308 if (error) { 308 if (error) {
309 struct inode *inode = io_end->inode; 309 struct inode *inode = io_end->inode;
310 310
311 ext4_warning(inode->i_sb, "I/O error writing to inode %lu " 311 ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
312 "(offset %llu size %ld starting block %llu)", 312 "(offset %llu size %ld starting block %llu)",
313 inode->i_ino, 313 error, inode->i_ino,
314 (unsigned long long) io_end->offset, 314 (unsigned long long) io_end->offset,
315 (long) io_end->size, 315 (long) io_end->size,
316 (unsigned long long) 316 (unsigned long long)
317 bi_sector >> (inode->i_blkbits - 9)); 317 bi_sector >> (inode->i_blkbits - 9));
318 mapping_set_error(inode->i_mapping, error);
318 } 319 }
319 320
320 if (io_end->flag & EXT4_IO_END_UNWRITTEN) { 321 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
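
The mapping_set_error() call added here latches AS_EIO (or AS_ENOSPC) on the inode's address_space, so an asynchronous writeback failure is surfaced by a later fsync() instead of being lost once the warning scrolls by. A small user-space sketch of the visible effect; flush_and_check() is a hypothetical helper, not an ext4 interface:

#include <stdio.h>
#include <unistd.h>

/* the next fsync() after a failed async writeback reports the error */
int flush_and_check(int fd)
{
        if (fsync(fd) < 0) {
                perror("fsync");        /* e.g. EIO latched at end_bio time */
                return -1;
        }
        return 0;
}
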
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 710fed2377d4..6f9e6fadac04 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -59,6 +59,7 @@ static struct kset *ext4_kset;
59static struct ext4_lazy_init *ext4_li_info; 59static struct ext4_lazy_init *ext4_li_info;
60static struct mutex ext4_li_mtx; 60static struct mutex ext4_li_mtx;
61static struct ext4_features *ext4_feat; 61static struct ext4_features *ext4_feat;
62static int ext4_mballoc_ready;
62 63
63static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 64static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
64 unsigned long journal_devnum); 65 unsigned long journal_devnum);
@@ -845,6 +846,10 @@ static void ext4_put_super(struct super_block *sb)
845 invalidate_bdev(sbi->journal_bdev); 846 invalidate_bdev(sbi->journal_bdev);
846 ext4_blkdev_remove(sbi); 847 ext4_blkdev_remove(sbi);
847 } 848 }
849 if (sbi->s_mb_cache) {
850 ext4_xattr_destroy_cache(sbi->s_mb_cache);
851 sbi->s_mb_cache = NULL;
852 }
848 if (sbi->s_mmp_tsk) 853 if (sbi->s_mmp_tsk)
849 kthread_stop(sbi->s_mmp_tsk); 854 kthread_stop(sbi->s_mmp_tsk);
850 sb->s_fs_info = NULL; 855 sb->s_fs_info = NULL;
@@ -940,7 +945,7 @@ static void init_once(void *foo)
940 inode_init_once(&ei->vfs_inode); 945 inode_init_once(&ei->vfs_inode);
941} 946}
942 947
943static int init_inodecache(void) 948static int __init init_inodecache(void)
944{ 949{
945 ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", 950 ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
946 sizeof(struct ext4_inode_info), 951 sizeof(struct ext4_inode_info),
@@ -3575,6 +3580,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3575 "feature flags set on rev 0 fs, " 3580 "feature flags set on rev 0 fs, "
3576 "running e2fsck is recommended"); 3581 "running e2fsck is recommended");
3577 3582
3583 if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
3584 set_opt2(sb, HURD_COMPAT);
3585 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
3586 EXT4_FEATURE_INCOMPAT_64BIT)) {
3587 ext4_msg(sb, KERN_ERR,
3588 "The Hurd can't support 64-bit file systems");
3589 goto failed_mount;
3590 }
3591 }
3592
3578 if (IS_EXT2_SB(sb)) { 3593 if (IS_EXT2_SB(sb)) {
3579 if (ext2_feature_set_ok(sb)) 3594 if (ext2_feature_set_ok(sb))
3580 ext4_msg(sb, KERN_INFO, "mounting ext2 file system " 3595 ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
@@ -3854,19 +3869,38 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3854 goto failed_mount2; 3869 goto failed_mount2;
3855 } 3870 }
3856 } 3871 }
3872
3873 /*
3874	 * set up enough so that it can read an inode,
3875	 * and create a new inode for the buddy allocator
3876 */
3877 sbi->s_gdb_count = db_count;
3878 if (!test_opt(sb, NOLOAD) &&
3879 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
3880 sb->s_op = &ext4_sops;
3881 else
3882 sb->s_op = &ext4_nojournal_sops;
3883
3884 ext4_ext_init(sb);
3885 err = ext4_mb_init(sb);
3886 if (err) {
3887 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
3888 err);
3889 goto failed_mount2;
3890 }
3891
3857 if (!ext4_check_descriptors(sb, &first_not_zeroed)) { 3892 if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
3858 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); 3893 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
3859 goto failed_mount2; 3894 goto failed_mount2a;
3860 } 3895 }
3861 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) 3896 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
3862 if (!ext4_fill_flex_info(sb)) { 3897 if (!ext4_fill_flex_info(sb)) {
3863 ext4_msg(sb, KERN_ERR, 3898 ext4_msg(sb, KERN_ERR,
3864 "unable to initialize " 3899 "unable to initialize "
3865 "flex_bg meta info!"); 3900 "flex_bg meta info!");
3866 goto failed_mount2; 3901 goto failed_mount2a;
3867 } 3902 }
3868 3903
3869 sbi->s_gdb_count = db_count;
3870 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 3904 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
3871 spin_lock_init(&sbi->s_next_gen_lock); 3905 spin_lock_init(&sbi->s_next_gen_lock);
3872 3906
@@ -3901,14 +3935,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3901 sbi->s_stripe = ext4_get_stripe_size(sbi); 3935 sbi->s_stripe = ext4_get_stripe_size(sbi);
3902 sbi->s_extent_max_zeroout_kb = 32; 3936 sbi->s_extent_max_zeroout_kb = 32;
3903 3937
3904 /*
3905 * set up enough so that it can read an inode
3906 */
3907 if (!test_opt(sb, NOLOAD) &&
3908 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
3909 sb->s_op = &ext4_sops;
3910 else
3911 sb->s_op = &ext4_nojournal_sops;
3912 sb->s_export_op = &ext4_export_ops; 3938 sb->s_export_op = &ext4_export_ops;
3913 sb->s_xattr = ext4_xattr_handlers; 3939 sb->s_xattr = ext4_xattr_handlers;
3914#ifdef CONFIG_QUOTA 3940#ifdef CONFIG_QUOTA
@@ -4010,6 +4036,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4010 percpu_counter_set(&sbi->s_dirtyclusters_counter, 0); 4036 percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
4011 4037
4012no_journal: 4038no_journal:
4039 if (ext4_mballoc_ready) {
4040 sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
4041 if (!sbi->s_mb_cache) {
4042 ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
4043 goto failed_mount_wq;
4044 }
4045 }
4046
4013 /* 4047 /*
4014 * Get the # of file system overhead blocks from the 4048 * Get the # of file system overhead blocks from the
4015 * superblock if present. 4049 * superblock if present.
@@ -4090,21 +4124,13 @@ no_journal:
4090 if (err) { 4124 if (err) {
4091 ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " 4125 ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
4092 "reserved pool", ext4_calculate_resv_clusters(sb)); 4126 "reserved pool", ext4_calculate_resv_clusters(sb));
4093 goto failed_mount4a; 4127 goto failed_mount5;
4094 } 4128 }
4095 4129
4096 err = ext4_setup_system_zone(sb); 4130 err = ext4_setup_system_zone(sb);
4097 if (err) { 4131 if (err) {
4098 ext4_msg(sb, KERN_ERR, "failed to initialize system " 4132 ext4_msg(sb, KERN_ERR, "failed to initialize system "
4099 "zone (%d)", err); 4133 "zone (%d)", err);
4100 goto failed_mount4a;
4101 }
4102
4103 ext4_ext_init(sb);
4104 err = ext4_mb_init(sb);
4105 if (err) {
4106 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
4107 err);
4108 goto failed_mount5; 4134 goto failed_mount5;
4109 } 4135 }
4110 4136
@@ -4181,11 +4207,8 @@ failed_mount8:
4181failed_mount7: 4207failed_mount7:
4182 ext4_unregister_li_request(sb); 4208 ext4_unregister_li_request(sb);
4183failed_mount6: 4209failed_mount6:
4184 ext4_mb_release(sb);
4185failed_mount5:
4186 ext4_ext_release(sb);
4187 ext4_release_system_zone(sb); 4210 ext4_release_system_zone(sb);
4188failed_mount4a: 4211failed_mount5:
4189 dput(sb->s_root); 4212 dput(sb->s_root);
4190 sb->s_root = NULL; 4213 sb->s_root = NULL;
4191failed_mount4: 4214failed_mount4:
@@ -4209,11 +4232,14 @@ failed_mount3:
4209 percpu_counter_destroy(&sbi->s_extent_cache_cnt); 4232 percpu_counter_destroy(&sbi->s_extent_cache_cnt);
4210 if (sbi->s_mmp_tsk) 4233 if (sbi->s_mmp_tsk)
4211 kthread_stop(sbi->s_mmp_tsk); 4234 kthread_stop(sbi->s_mmp_tsk);
4235failed_mount2a:
4236 ext4_mb_release(sb);
4212failed_mount2: 4237failed_mount2:
4213 for (i = 0; i < db_count; i++) 4238 for (i = 0; i < db_count; i++)
4214 brelse(sbi->s_group_desc[i]); 4239 brelse(sbi->s_group_desc[i]);
4215 ext4_kvfree(sbi->s_group_desc); 4240 ext4_kvfree(sbi->s_group_desc);
4216failed_mount: 4241failed_mount:
4242 ext4_ext_release(sb);
4217 if (sbi->s_chksum_driver) 4243 if (sbi->s_chksum_driver)
4218 crypto_free_shash(sbi->s_chksum_driver); 4244 crypto_free_shash(sbi->s_chksum_driver);
4219 if (sbi->s_proc) { 4245 if (sbi->s_proc) {
@@ -4835,6 +4861,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4835 } 4861 }
4836 4862
4837 if (*flags & MS_RDONLY) { 4863 if (*flags & MS_RDONLY) {
4864 err = sync_filesystem(sb);
4865 if (err < 0)
4866 goto restore_opts;
4838 err = dquot_suspend(sb, -1); 4867 err = dquot_suspend(sb, -1);
4839 if (err < 0) 4868 if (err < 0)
4840 goto restore_opts; 4869 goto restore_opts;
@@ -5516,11 +5545,9 @@ static int __init ext4_init_fs(void)
5516 5545
5517 err = ext4_init_mballoc(); 5546 err = ext4_init_mballoc();
5518 if (err) 5547 if (err)
5519 goto out3;
5520
5521 err = ext4_init_xattr();
5522 if (err)
5523 goto out2; 5548 goto out2;
5549 else
5550 ext4_mballoc_ready = 1;
5524 err = init_inodecache(); 5551 err = init_inodecache();
5525 if (err) 5552 if (err)
5526 goto out1; 5553 goto out1;
@@ -5536,10 +5563,9 @@ out:
5536 unregister_as_ext3(); 5563 unregister_as_ext3();
5537 destroy_inodecache(); 5564 destroy_inodecache();
5538out1: 5565out1:
5539 ext4_exit_xattr(); 5566 ext4_mballoc_ready = 0;
5540out2:
5541 ext4_exit_mballoc(); 5567 ext4_exit_mballoc();
5542out3: 5568out2:
5543 ext4_exit_feat_adverts(); 5569 ext4_exit_feat_adverts();
5544out4: 5570out4:
5545 if (ext4_proc_root) 5571 if (ext4_proc_root)
@@ -5562,7 +5588,6 @@ static void __exit ext4_exit_fs(void)
5562 unregister_as_ext3(); 5588 unregister_as_ext3();
5563 unregister_filesystem(&ext4_fs_type); 5589 unregister_filesystem(&ext4_fs_type);
5564 destroy_inodecache(); 5590 destroy_inodecache();
5565 ext4_exit_xattr();
5566 ext4_exit_mballoc(); 5591 ext4_exit_mballoc();
5567 ext4_exit_feat_adverts(); 5592 ext4_exit_feat_adverts();
5568 remove_proc_entry("fs/ext4", NULL); 5593 remove_proc_entry("fs/ext4", NULL);
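
Taken together, the super.c hunks pull mballoc and extent-tree setup ahead of the group-descriptor checks, rework the failed_mount labels to match, and give every mount its own xattr mb_cache in place of the old module-global one. A condensed sketch of the resulting per-superblock cache lifetime (error handling trimmed; the example_* wrappers are illustrative, the real code lives in ext4_fill_super() and ext4_put_super()):

static int example_fill_super(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* only once ext4_init_mballoc() succeeded at module init */
        if (ext4_mballoc_ready) {
                sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
                if (!sbi->s_mb_cache)
                        return -ENOMEM;
        }
        return 0;
}

static void example_put_super(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        /* torn down before the journal block device goes away */
        if (sbi->s_mb_cache) {
                ext4_xattr_destroy_cache(sbi->s_mb_cache);
                sbi->s_mb_cache = NULL;
        }
}
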
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index e175e94116ac..4eec399ec807 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -81,7 +81,7 @@
81# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) 81# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
82#endif 82#endif
83 83
84static void ext4_xattr_cache_insert(struct buffer_head *); 84static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
85static struct buffer_head *ext4_xattr_cache_find(struct inode *, 85static struct buffer_head *ext4_xattr_cache_find(struct inode *,
86 struct ext4_xattr_header *, 86 struct ext4_xattr_header *,
87 struct mb_cache_entry **); 87 struct mb_cache_entry **);
@@ -90,8 +90,6 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *,
90static int ext4_xattr_list(struct dentry *dentry, char *buffer, 90static int ext4_xattr_list(struct dentry *dentry, char *buffer,
91 size_t buffer_size); 91 size_t buffer_size);
92 92
93static struct mb_cache *ext4_xattr_cache;
94
95static const struct xattr_handler *ext4_xattr_handler_map[] = { 93static const struct xattr_handler *ext4_xattr_handler_map[] = {
96 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, 94 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
97#ifdef CONFIG_EXT4_FS_POSIX_ACL 95#ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -117,6 +115,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
117 NULL 115 NULL
118}; 116};
119 117
118#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \
119 inode->i_sb->s_fs_info)->s_mb_cache)
120
120static __le32 ext4_xattr_block_csum(struct inode *inode, 121static __le32 ext4_xattr_block_csum(struct inode *inode,
121 sector_t block_nr, 122 sector_t block_nr,
122 struct ext4_xattr_header *hdr) 123 struct ext4_xattr_header *hdr)
@@ -265,6 +266,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
265 struct ext4_xattr_entry *entry; 266 struct ext4_xattr_entry *entry;
266 size_t size; 267 size_t size;
267 int error; 268 int error;
269 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
268 270
269 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", 271 ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
270 name_index, name, buffer, (long)buffer_size); 272 name_index, name, buffer, (long)buffer_size);
@@ -286,7 +288,7 @@ bad_block:
286 error = -EIO; 288 error = -EIO;
287 goto cleanup; 289 goto cleanup;
288 } 290 }
289 ext4_xattr_cache_insert(bh); 291 ext4_xattr_cache_insert(ext4_mb_cache, bh);
290 entry = BFIRST(bh); 292 entry = BFIRST(bh);
291 error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); 293 error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
292 if (error == -EIO) 294 if (error == -EIO)
@@ -409,6 +411,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
409 struct inode *inode = dentry->d_inode; 411 struct inode *inode = dentry->d_inode;
410 struct buffer_head *bh = NULL; 412 struct buffer_head *bh = NULL;
411 int error; 413 int error;
414 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
412 415
413 ea_idebug(inode, "buffer=%p, buffer_size=%ld", 416 ea_idebug(inode, "buffer=%p, buffer_size=%ld",
414 buffer, (long)buffer_size); 417 buffer, (long)buffer_size);
@@ -430,7 +433,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
430 error = -EIO; 433 error = -EIO;
431 goto cleanup; 434 goto cleanup;
432 } 435 }
433 ext4_xattr_cache_insert(bh); 436 ext4_xattr_cache_insert(ext4_mb_cache, bh);
434 error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); 437 error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
435 438
436cleanup: 439cleanup:
@@ -517,8 +520,8 @@ static void ext4_xattr_update_super_block(handle_t *handle,
517} 520}
518 521
519/* 522/*
520 * Release the xattr block BH: If the reference count is > 1, decrement 523 * Release the xattr block BH: If the reference count is > 1, decrement it;
521 * it; otherwise free the block. 524 * otherwise free the block.
522 */ 525 */
523static void 526static void
524ext4_xattr_release_block(handle_t *handle, struct inode *inode, 527ext4_xattr_release_block(handle_t *handle, struct inode *inode,
@@ -526,8 +529,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
526{ 529{
527 struct mb_cache_entry *ce = NULL; 530 struct mb_cache_entry *ce = NULL;
528 int error = 0; 531 int error = 0;
532 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
529 533
530 ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr); 534 ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
531 error = ext4_journal_get_write_access(handle, bh); 535 error = ext4_journal_get_write_access(handle, bh);
532 if (error) 536 if (error)
533 goto out; 537 goto out;
@@ -538,16 +542,31 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
538 if (ce) 542 if (ce)
539 mb_cache_entry_free(ce); 543 mb_cache_entry_free(ce);
540 get_bh(bh); 544 get_bh(bh);
545 unlock_buffer(bh);
541 ext4_free_blocks(handle, inode, bh, 0, 1, 546 ext4_free_blocks(handle, inode, bh, 0, 1,
542 EXT4_FREE_BLOCKS_METADATA | 547 EXT4_FREE_BLOCKS_METADATA |
543 EXT4_FREE_BLOCKS_FORGET); 548 EXT4_FREE_BLOCKS_FORGET);
544 unlock_buffer(bh);
545 } else { 549 } else {
546 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 550 le32_add_cpu(&BHDR(bh)->h_refcount, -1);
547 if (ce) 551 if (ce)
548 mb_cache_entry_release(ce); 552 mb_cache_entry_release(ce);
553 /*
554 * Beware of this ugliness: Releasing of xattr block references
555 * from different inodes can race and so we have to protect
556 * from a race where someone else frees the block (and releases
557 * its journal_head) before we are done dirtying the buffer. In
558 * nojournal mode this race is harmless and we actually cannot
559 * call ext4_handle_dirty_xattr_block() with locked buffer as
560 * that function can call sync_dirty_buffer() so for that case
561 * we handle the dirtying after unlocking the buffer.
562 */
563 if (ext4_handle_valid(handle))
564 error = ext4_handle_dirty_xattr_block(handle, inode,
565 bh);
549 unlock_buffer(bh); 566 unlock_buffer(bh);
550 error = ext4_handle_dirty_xattr_block(handle, inode, bh); 567 if (!ext4_handle_valid(handle))
568 error = ext4_handle_dirty_xattr_block(handle, inode,
569 bh);
551 if (IS_SYNC(inode)) 570 if (IS_SYNC(inode))
552 ext4_handle_sync(handle); 571 ext4_handle_sync(handle);
553 dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); 572 dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1));
@@ -567,12 +586,13 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
567 size_t *min_offs, void *base, int *total) 586 size_t *min_offs, void *base, int *total)
568{ 587{
569 for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { 588 for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
570 *total += EXT4_XATTR_LEN(last->e_name_len);
571 if (!last->e_value_block && last->e_value_size) { 589 if (!last->e_value_block && last->e_value_size) {
572 size_t offs = le16_to_cpu(last->e_value_offs); 590 size_t offs = le16_to_cpu(last->e_value_offs);
573 if (offs < *min_offs) 591 if (offs < *min_offs)
574 *min_offs = offs; 592 *min_offs = offs;
575 } 593 }
594 if (total)
595 *total += EXT4_XATTR_LEN(last->e_name_len);
576 } 596 }
577 return (*min_offs - ((void *)last - base) - sizeof(__u32)); 597 return (*min_offs - ((void *)last - base) - sizeof(__u32));
578} 598}
@@ -745,13 +765,14 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
745 struct ext4_xattr_search *s = &bs->s; 765 struct ext4_xattr_search *s = &bs->s;
746 struct mb_cache_entry *ce = NULL; 766 struct mb_cache_entry *ce = NULL;
747 int error = 0; 767 int error = 0;
768 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
748 769
749#define header(x) ((struct ext4_xattr_header *)(x)) 770#define header(x) ((struct ext4_xattr_header *)(x))
750 771
751 if (i->value && i->value_len > sb->s_blocksize) 772 if (i->value && i->value_len > sb->s_blocksize)
752 return -ENOSPC; 773 return -ENOSPC;
753 if (s->base) { 774 if (s->base) {
754 ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev, 775 ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
755 bs->bh->b_blocknr); 776 bs->bh->b_blocknr);
756 error = ext4_journal_get_write_access(handle, bs->bh); 777 error = ext4_journal_get_write_access(handle, bs->bh);
757 if (error) 778 if (error)
@@ -769,7 +790,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
769 if (!IS_LAST_ENTRY(s->first)) 790 if (!IS_LAST_ENTRY(s->first))
770 ext4_xattr_rehash(header(s->base), 791 ext4_xattr_rehash(header(s->base),
771 s->here); 792 s->here);
772 ext4_xattr_cache_insert(bs->bh); 793 ext4_xattr_cache_insert(ext4_mb_cache,
794 bs->bh);
773 } 795 }
774 unlock_buffer(bs->bh); 796 unlock_buffer(bs->bh);
775 if (error == -EIO) 797 if (error == -EIO)
@@ -905,7 +927,7 @@ getblk_failed:
905 memcpy(new_bh->b_data, s->base, new_bh->b_size); 927 memcpy(new_bh->b_data, s->base, new_bh->b_size);
906 set_buffer_uptodate(new_bh); 928 set_buffer_uptodate(new_bh);
907 unlock_buffer(new_bh); 929 unlock_buffer(new_bh);
908 ext4_xattr_cache_insert(new_bh); 930 ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
909 error = ext4_handle_dirty_xattr_block(handle, 931 error = ext4_handle_dirty_xattr_block(handle,
910 inode, new_bh); 932 inode, new_bh);
911 if (error) 933 if (error)
@@ -1228,7 +1250,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
1228 struct ext4_xattr_block_find *bs = NULL; 1250 struct ext4_xattr_block_find *bs = NULL;
1229 char *buffer = NULL, *b_entry_name = NULL; 1251 char *buffer = NULL, *b_entry_name = NULL;
1230 size_t min_offs, free; 1252 size_t min_offs, free;
1231 int total_ino, total_blk; 1253 int total_ino;
1232 void *base, *start, *end; 1254 void *base, *start, *end;
1233 int extra_isize = 0, error = 0, tried_min_extra_isize = 0; 1255 int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
1234 int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); 1256 int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
@@ -1286,8 +1308,7 @@ retry:
1286 first = BFIRST(bh); 1308 first = BFIRST(bh);
1287 end = bh->b_data + bh->b_size; 1309 end = bh->b_data + bh->b_size;
1288 min_offs = end - base; 1310 min_offs = end - base;
1289 free = ext4_xattr_free_space(first, &min_offs, base, 1311 free = ext4_xattr_free_space(first, &min_offs, base, NULL);
1290 &total_blk);
1291 if (free < new_extra_isize) { 1312 if (free < new_extra_isize) {
1292 if (!tried_min_extra_isize && s_min_extra_isize) { 1313 if (!tried_min_extra_isize && s_min_extra_isize) {
1293 tried_min_extra_isize++; 1314 tried_min_extra_isize++;
@@ -1495,13 +1516,13 @@ ext4_xattr_put_super(struct super_block *sb)
1495 * Returns 0, or a negative error number on failure. 1516 * Returns 0, or a negative error number on failure.
1496 */ 1517 */
1497static void 1518static void
1498ext4_xattr_cache_insert(struct buffer_head *bh) 1519ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
1499{ 1520{
1500 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); 1521 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
1501 struct mb_cache_entry *ce; 1522 struct mb_cache_entry *ce;
1502 int error; 1523 int error;
1503 1524
1504 ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS); 1525 ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS);
1505 if (!ce) { 1526 if (!ce) {
1506 ea_bdebug(bh, "out of memory"); 1527 ea_bdebug(bh, "out of memory");
1507 return; 1528 return;
@@ -1573,12 +1594,13 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
1573{ 1594{
1574 __u32 hash = le32_to_cpu(header->h_hash); 1595 __u32 hash = le32_to_cpu(header->h_hash);
1575 struct mb_cache_entry *ce; 1596 struct mb_cache_entry *ce;
1597 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
1576 1598
1577 if (!header->h_hash) 1599 if (!header->h_hash)
1578 return NULL; /* never share */ 1600 return NULL; /* never share */
1579 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); 1601 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1580again: 1602again:
1581 ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev, 1603 ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
1582 hash); 1604 hash);
1583 while (ce) { 1605 while (ce) {
1584 struct buffer_head *bh; 1606 struct buffer_head *bh;
@@ -1676,19 +1698,17 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1676 1698
1677#undef BLOCK_HASH_SHIFT 1699#undef BLOCK_HASH_SHIFT
1678 1700
1679int __init 1701#define HASH_BUCKET_BITS 10
1680ext4_init_xattr(void) 1702
1703struct mb_cache *
1704ext4_xattr_create_cache(char *name)
1681{ 1705{
1682 ext4_xattr_cache = mb_cache_create("ext4_xattr", 6); 1706 return mb_cache_create(name, HASH_BUCKET_BITS);
1683 if (!ext4_xattr_cache)
1684 return -ENOMEM;
1685 return 0;
1686} 1707}
1687 1708
1688void 1709void ext4_xattr_destroy_cache(struct mb_cache *cache)
1689ext4_exit_xattr(void)
1690{ 1710{
1691 if (ext4_xattr_cache) 1711 if (cache)
1692 mb_cache_destroy(ext4_xattr_cache); 1712 mb_cache_destroy(cache);
1693 ext4_xattr_cache = NULL;
1694} 1713}
1714
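
Every cache operation in this file now goes through the per-superblock cache returned by EXT4_GET_MB_CACHE() instead of the removed ext4_xattr_cache global. The insert path keeps the usual mb_cache shape; a hedged sketch with the signatures fs/mbcache.c exposes in this era (a failed insert only loses a sharing opportunity, it is never an error):

static void example_cache_block(struct mb_cache *cache,
                                struct block_device *bdev,
                                sector_t block, __u32 hash)
{
        struct mb_cache_entry *ce;

        ce = mb_cache_entry_alloc(cache, GFP_NOFS);
        if (!ce)
                return;                         /* cache miss is harmless */
        if (mb_cache_entry_insert(ce, bdev, block, hash))
                mb_cache_entry_free(ce);        /* e.g. -EBUSY: already cached */
        else
                mb_cache_entry_release(ce);
}
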
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 819d6398833f..29bedf5589f6 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -110,9 +110,6 @@ extern void ext4_xattr_put_super(struct super_block *);
110extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, 110extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
111 struct ext4_inode *raw_inode, handle_t *handle); 111 struct ext4_inode *raw_inode, handle_t *handle);
112 112
113extern int __init ext4_init_xattr(void);
114extern void ext4_exit_xattr(void);
115
116extern const struct xattr_handler *ext4_xattr_handlers[]; 113extern const struct xattr_handler *ext4_xattr_handlers[];
117 114
118extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, 115extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
@@ -124,6 +121,9 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
124 struct ext4_xattr_info *i, 121 struct ext4_xattr_info *i,
125 struct ext4_xattr_ibody_find *is); 122 struct ext4_xattr_ibody_find *is);
126 123
124extern struct mb_cache *ext4_xattr_create_cache(char *name);
125extern void ext4_xattr_destroy_cache(struct mb_cache *);
126
127#ifdef CONFIG_EXT4_FS_SECURITY 127#ifdef CONFIG_EXT4_FS_SECURITY
128extern int ext4_init_security(handle_t *handle, struct inode *inode, 128extern int ext4_init_security(handle_t *handle, struct inode *inode,
129 struct inode *dir, const struct qstr *qstr); 129 struct inode *dir, const struct qstr *qstr);
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index fa8da4cb8c4b..e93e4ec7d165 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -174,7 +174,7 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
174 174
175 retval = f2fs_getxattr(inode, name_index, "", NULL, 0); 175 retval = f2fs_getxattr(inode, name_index, "", NULL, 0);
176 if (retval > 0) { 176 if (retval > 0) {
177 value = kmalloc(retval, GFP_KERNEL); 177 value = kmalloc(retval, GFP_F2FS_ZERO);
178 if (!value) 178 if (!value)
179 return ERR_PTR(-ENOMEM); 179 return ERR_PTR(-ENOMEM);
180 retval = f2fs_getxattr(inode, name_index, "", value, retval); 180 retval = f2fs_getxattr(inode, name_index, "", value, retval);
@@ -203,6 +203,12 @@ static int __f2fs_set_acl(struct inode *inode, int type,
203 size_t size = 0; 203 size_t size = 0;
204 int error; 204 int error;
205 205
206 if (acl) {
207 error = posix_acl_valid(acl);
208 if (error < 0)
209 return error;
210 }
211
206 switch (type) { 212 switch (type) {
207 case ACL_TYPE_ACCESS: 213 case ACL_TYPE_ACCESS:
208 name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; 214 name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
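
The added posix_acl_valid() call rejects malformed ACLs (bad tag ordering, duplicate entries, missing mask) before they are serialized into an f2fs xattr. A minimal sketch of the guard, using the 3.15-era single-argument signature; example_set_acl() is illustrative, not the driver entry point:

static int example_set_acl(struct inode *inode, struct posix_acl *acl)
{
        int error;

        if (acl) {
                error = posix_acl_valid(acl);
                if (error < 0)
                        return error;
        }
        /* ... translate the validated ACL into an on-disk xattr ... */
        return 0;
}
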
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 293d0486a40f..4aa521aa9bc3 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -33,14 +33,12 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
33 struct address_space *mapping = META_MAPPING(sbi); 33 struct address_space *mapping = META_MAPPING(sbi);
34 struct page *page = NULL; 34 struct page *page = NULL;
35repeat: 35repeat:
36 page = grab_cache_page(mapping, index); 36 page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
37 if (!page) { 37 if (!page) {
38 cond_resched(); 38 cond_resched();
39 goto repeat; 39 goto repeat;
40 } 40 }
41 41
42 /* We wait writeback only inside grab_meta_page() */
43 wait_on_page_writeback(page);
44 SetPageUptodate(page); 42 SetPageUptodate(page);
45 return page; 43 return page;
46} 44}
@@ -75,23 +73,102 @@ out:
75 return page; 73 return page;
76} 74}
77 75
76inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
77{
78 switch (type) {
79 case META_NAT:
80 return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK;
81 case META_SIT:
82 return SIT_BLK_CNT(sbi);
83 case META_SSA:
84 case META_CP:
85 return 0;
86 default:
87 BUG();
88 }
89}
90
91/*
92 * Readahead CP/NAT/SIT/SSA pages
93 */
94int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
95{
96 block_t prev_blk_addr = 0;
97 struct page *page;
98 int blkno = start;
99 int max_blks = get_max_meta_blks(sbi, type);
100
101 struct f2fs_io_info fio = {
102 .type = META,
103 .rw = READ_SYNC | REQ_META | REQ_PRIO
104 };
105
106 for (; nrpages-- > 0; blkno++) {
107 block_t blk_addr;
108
109 switch (type) {
110 case META_NAT:
111 /* get nat block addr */
112 if (unlikely(blkno >= max_blks))
113 blkno = 0;
114 blk_addr = current_nat_addr(sbi,
115 blkno * NAT_ENTRY_PER_BLOCK);
116 break;
117 case META_SIT:
118 /* get sit block addr */
119 if (unlikely(blkno >= max_blks))
120 goto out;
121 blk_addr = current_sit_addr(sbi,
122 blkno * SIT_ENTRY_PER_BLOCK);
123 if (blkno != start && prev_blk_addr + 1 != blk_addr)
124 goto out;
125 prev_blk_addr = blk_addr;
126 break;
127 case META_SSA:
128 case META_CP:
129 /* get ssa/cp block addr */
130 blk_addr = blkno;
131 break;
132 default:
133 BUG();
134 }
135
136 page = grab_cache_page(META_MAPPING(sbi), blk_addr);
137 if (!page)
138 continue;
139 if (PageUptodate(page)) {
140 mark_page_accessed(page);
141 f2fs_put_page(page, 1);
142 continue;
143 }
144
145 f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
146 mark_page_accessed(page);
147 f2fs_put_page(page, 0);
148 }
149out:
150 f2fs_submit_merged_bio(sbi, META, READ);
151 return blkno - start;
152}
153
78static int f2fs_write_meta_page(struct page *page, 154static int f2fs_write_meta_page(struct page *page,
79 struct writeback_control *wbc) 155 struct writeback_control *wbc)
80{ 156{
81 struct inode *inode = page->mapping->host; 157 struct inode *inode = page->mapping->host;
82 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 158 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
83 159
84 /* Should not write any meta pages, if any IO error was occurred */ 160 if (unlikely(sbi->por_doing))
85 if (unlikely(sbi->por_doing ||
86 is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
87 goto redirty_out; 161 goto redirty_out;
88
89 if (wbc->for_reclaim) 162 if (wbc->for_reclaim)
90 goto redirty_out; 163 goto redirty_out;
91 164
92	wait_on_page_writeback(page);	165	/* Should not write any meta pages if an IO error has occurred */
166 if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
167 goto no_write;
93 168
169 f2fs_wait_on_page_writeback(page, META);
94 write_meta_page(sbi, page); 170 write_meta_page(sbi, page);
171no_write:
95 dec_page_count(sbi, F2FS_DIRTY_META); 172 dec_page_count(sbi, F2FS_DIRTY_META);
96 unlock_page(page); 173 unlock_page(page);
97 return 0; 174 return 0;
@@ -99,6 +176,7 @@ static int f2fs_write_meta_page(struct page *page,
99redirty_out: 176redirty_out:
100 dec_page_count(sbi, F2FS_DIRTY_META); 177 dec_page_count(sbi, F2FS_DIRTY_META);
101 wbc->pages_skipped++; 178 wbc->pages_skipped++;
179 account_page_redirty(page);
102 set_page_dirty(page); 180 set_page_dirty(page);
103 return AOP_WRITEPAGE_ACTIVATE; 181 return AOP_WRITEPAGE_ACTIVATE;
104} 182}
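
ra_meta_pages() batches consecutive meta block reads into one merged bio and submits it once at the end, so callers can prefetch a whole on-disk region before walking it page by page. A usage sketch, mirroring the call recover_orphan_inodes() gains later in this patch:

        /* prefetch all orphan blocks, then read each from the page cache */
        ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP);
        for (i = 0; i < orphan_blkaddr; i++) {
                struct page *page = get_meta_page(sbi, start_blk + i);
                /* ... parse one f2fs_orphan_block ... */
                f2fs_put_page(page, 1);
        }
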
@@ -107,21 +185,23 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
107 struct writeback_control *wbc) 185 struct writeback_control *wbc)
108{ 186{
109 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); 187 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
110 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 188 long diff, written;
111 long written;
112
113 if (wbc->for_kupdate)
114 return 0;
115 189
116 /* collect a number of dirty meta pages and write together */ 190 /* collect a number of dirty meta pages and write together */
117 if (get_pages(sbi, F2FS_DIRTY_META) < nrpages) 191 if (wbc->for_kupdate ||
118 return 0; 192 get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
193 goto skip_write;
119 194
120	/* if mounting has failed, skip writing meta pages */	195	/* if mounting has failed, skip writing meta pages */
121 mutex_lock(&sbi->cp_mutex); 196 mutex_lock(&sbi->cp_mutex);
122 written = sync_meta_pages(sbi, META, nrpages); 197 diff = nr_pages_to_write(sbi, META, wbc);
198 written = sync_meta_pages(sbi, META, wbc->nr_to_write);
123 mutex_unlock(&sbi->cp_mutex); 199 mutex_unlock(&sbi->cp_mutex);
124 wbc->nr_to_write -= written; 200 wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
201 return 0;
202
203skip_write:
204 wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META);
125 return 0; 205 return 0;
126} 206}
127 207
@@ -148,10 +228,22 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
148 228
149 for (i = 0; i < nr_pages; i++) { 229 for (i = 0; i < nr_pages; i++) {
150 struct page *page = pvec.pages[i]; 230 struct page *page = pvec.pages[i];
231
151 lock_page(page); 232 lock_page(page);
152 f2fs_bug_on(page->mapping != mapping); 233
153 f2fs_bug_on(!PageDirty(page)); 234 if (unlikely(page->mapping != mapping)) {
154 clear_page_dirty_for_io(page); 235continue_unlock:
236 unlock_page(page);
237 continue;
238 }
239 if (!PageDirty(page)) {
240 /* someone wrote it for us */
241 goto continue_unlock;
242 }
243
244 if (!clear_page_dirty_for_io(page))
245 goto continue_unlock;
246
155 if (f2fs_write_meta_page(page, &wbc)) { 247 if (f2fs_write_meta_page(page, &wbc)) {
156 unlock_page(page); 248 unlock_page(page);
157 break; 249 break;
@@ -216,16 +308,15 @@ void release_orphan_inode(struct f2fs_sb_info *sbi)
216 308
217void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 309void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
218{ 310{
219 struct list_head *head, *this; 311 struct list_head *head;
220 struct orphan_inode_entry *new = NULL, *orphan = NULL; 312 struct orphan_inode_entry *new, *orphan;
221 313
222 new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC); 314 new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
223 new->ino = ino; 315 new->ino = ino;
224 316
225 spin_lock(&sbi->orphan_inode_lock); 317 spin_lock(&sbi->orphan_inode_lock);
226 head = &sbi->orphan_inode_list; 318 head = &sbi->orphan_inode_list;
227 list_for_each(this, head) { 319 list_for_each_entry(orphan, head, list) {
228 orphan = list_entry(this, struct orphan_inode_entry, list);
229 if (orphan->ino == ino) { 320 if (orphan->ino == ino) {
230 spin_unlock(&sbi->orphan_inode_lock); 321 spin_unlock(&sbi->orphan_inode_lock);
231 kmem_cache_free(orphan_entry_slab, new); 322 kmem_cache_free(orphan_entry_slab, new);
@@ -234,14 +325,10 @@ void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
234 325
235 if (orphan->ino > ino) 326 if (orphan->ino > ino)
236 break; 327 break;
237 orphan = NULL;
238 } 328 }
239 329
240	/* add new_oentry into list which is sorted by inode number */	330	/* add the new orphan entry into the list, which is sorted by inode number */
241 if (orphan) 331 list_add_tail(&new->list, &orphan->list);
242 list_add(&new->list, this->prev);
243 else
244 list_add_tail(&new->list, head);
245 spin_unlock(&sbi->orphan_inode_lock); 332 spin_unlock(&sbi->orphan_inode_lock);
246} 333}
247 334
@@ -255,10 +342,11 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
255 list_for_each_entry(orphan, head, list) { 342 list_for_each_entry(orphan, head, list) {
256 if (orphan->ino == ino) { 343 if (orphan->ino == ino) {
257 list_del(&orphan->list); 344 list_del(&orphan->list);
258 kmem_cache_free(orphan_entry_slab, orphan);
259 f2fs_bug_on(sbi->n_orphans == 0); 345 f2fs_bug_on(sbi->n_orphans == 0);
260 sbi->n_orphans--; 346 sbi->n_orphans--;
261 break; 347 spin_unlock(&sbi->orphan_inode_lock);
348 kmem_cache_free(orphan_entry_slab, orphan);
349 return;
262 } 350 }
263 } 351 }
264 spin_unlock(&sbi->orphan_inode_lock); 352 spin_unlock(&sbi->orphan_inode_lock);
@@ -285,6 +373,8 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi)
285 start_blk = __start_cp_addr(sbi) + 1; 373 start_blk = __start_cp_addr(sbi) + 1;
286 orphan_blkaddr = __start_sum_addr(sbi) - 1; 374 orphan_blkaddr = __start_sum_addr(sbi) - 1;
287 375
376 ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP);
377
288 for (i = 0; i < orphan_blkaddr; i++) { 378 for (i = 0; i < orphan_blkaddr; i++) {
289 struct page *page = get_meta_page(sbi, start_blk + i); 379 struct page *page = get_meta_page(sbi, start_blk + i);
290 struct f2fs_orphan_block *orphan_blk; 380 struct f2fs_orphan_block *orphan_blk;
@@ -466,14 +556,12 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
466{ 556{
467 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 557 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
468 struct list_head *head = &sbi->dir_inode_list; 558 struct list_head *head = &sbi->dir_inode_list;
469 struct list_head *this; 559 struct dir_inode_entry *entry;
470 560
471 list_for_each(this, head) { 561 list_for_each_entry(entry, head, list)
472 struct dir_inode_entry *entry;
473 entry = list_entry(this, struct dir_inode_entry, list);
474 if (unlikely(entry->inode == inode)) 562 if (unlikely(entry->inode == inode))
475 return -EEXIST; 563 return -EEXIST;
476 } 564
477 list_add_tail(&new->list, head); 565 list_add_tail(&new->list, head);
478 stat_inc_dirty_dir(sbi); 566 stat_inc_dirty_dir(sbi);
479 return 0; 567 return 0;
@@ -483,6 +571,7 @@ void set_dirty_dir_page(struct inode *inode, struct page *page)
483{ 571{
484 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 572 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
485 struct dir_inode_entry *new; 573 struct dir_inode_entry *new;
574 int ret = 0;
486 575
487 if (!S_ISDIR(inode->i_mode)) 576 if (!S_ISDIR(inode->i_mode))
488 return; 577 return;
@@ -492,13 +581,13 @@ void set_dirty_dir_page(struct inode *inode, struct page *page)
492 INIT_LIST_HEAD(&new->list); 581 INIT_LIST_HEAD(&new->list);
493 582
494 spin_lock(&sbi->dir_inode_lock); 583 spin_lock(&sbi->dir_inode_lock);
495 if (__add_dirty_inode(inode, new)) 584 ret = __add_dirty_inode(inode, new);
496 kmem_cache_free(inode_entry_slab, new);
497
498 inc_page_count(sbi, F2FS_DIRTY_DENTS);
499 inode_inc_dirty_dents(inode); 585 inode_inc_dirty_dents(inode);
500 SetPagePrivate(page); 586 SetPagePrivate(page);
501 spin_unlock(&sbi->dir_inode_lock); 587 spin_unlock(&sbi->dir_inode_lock);
588
589 if (ret)
590 kmem_cache_free(inode_entry_slab, new);
502} 591}
503 592
504void add_dirty_dir_inode(struct inode *inode) 593void add_dirty_dir_inode(struct inode *inode)
@@ -506,44 +595,47 @@ void add_dirty_dir_inode(struct inode *inode)
506 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 595 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
507 struct dir_inode_entry *new = 596 struct dir_inode_entry *new =
508 f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); 597 f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
598 int ret = 0;
509 599
510 new->inode = inode; 600 new->inode = inode;
511 INIT_LIST_HEAD(&new->list); 601 INIT_LIST_HEAD(&new->list);
512 602
513 spin_lock(&sbi->dir_inode_lock); 603 spin_lock(&sbi->dir_inode_lock);
514 if (__add_dirty_inode(inode, new)) 604 ret = __add_dirty_inode(inode, new);
515 kmem_cache_free(inode_entry_slab, new);
516 spin_unlock(&sbi->dir_inode_lock); 605 spin_unlock(&sbi->dir_inode_lock);
606
607 if (ret)
608 kmem_cache_free(inode_entry_slab, new);
517} 609}
518 610
519void remove_dirty_dir_inode(struct inode *inode) 611void remove_dirty_dir_inode(struct inode *inode)
520{ 612{
521 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 613 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
522 614 struct list_head *head;
523 struct list_head *this, *head; 615 struct dir_inode_entry *entry;
524 616
525 if (!S_ISDIR(inode->i_mode)) 617 if (!S_ISDIR(inode->i_mode))
526 return; 618 return;
527 619
528 spin_lock(&sbi->dir_inode_lock); 620 spin_lock(&sbi->dir_inode_lock);
529 if (atomic_read(&F2FS_I(inode)->dirty_dents)) { 621 if (get_dirty_dents(inode)) {
530 spin_unlock(&sbi->dir_inode_lock); 622 spin_unlock(&sbi->dir_inode_lock);
531 return; 623 return;
532 } 624 }
533 625
534 head = &sbi->dir_inode_list; 626 head = &sbi->dir_inode_list;
535 list_for_each(this, head) { 627 list_for_each_entry(entry, head, list) {
536 struct dir_inode_entry *entry;
537 entry = list_entry(this, struct dir_inode_entry, list);
538 if (entry->inode == inode) { 628 if (entry->inode == inode) {
539 list_del(&entry->list); 629 list_del(&entry->list);
540 kmem_cache_free(inode_entry_slab, entry);
541 stat_dec_dirty_dir(sbi); 630 stat_dec_dirty_dir(sbi);
542 break; 631 spin_unlock(&sbi->dir_inode_lock);
632 kmem_cache_free(inode_entry_slab, entry);
633 goto done;
543 } 634 }
544 } 635 }
545 spin_unlock(&sbi->dir_inode_lock); 636 spin_unlock(&sbi->dir_inode_lock);
546 637
638done:
547 /* Only from the recovery routine */ 639 /* Only from the recovery routine */
548 if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { 640 if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
549 clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); 641 clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
@@ -554,15 +646,14 @@ void remove_dirty_dir_inode(struct inode *inode)
554struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino) 646struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
555{ 647{
556 648
557 struct list_head *this, *head; 649 struct list_head *head;
558 struct inode *inode = NULL; 650 struct inode *inode = NULL;
651 struct dir_inode_entry *entry;
559 652
560 spin_lock(&sbi->dir_inode_lock); 653 spin_lock(&sbi->dir_inode_lock);
561 654
562 head = &sbi->dir_inode_list; 655 head = &sbi->dir_inode_list;
563 list_for_each(this, head) { 656 list_for_each_entry(entry, head, list) {
564 struct dir_inode_entry *entry;
565 entry = list_entry(this, struct dir_inode_entry, list);
566 if (entry->inode->i_ino == ino) { 657 if (entry->inode->i_ino == ino) {
567 inode = entry->inode; 658 inode = entry->inode;
568 break; 659 break;
@@ -589,7 +680,7 @@ retry:
589 inode = igrab(entry->inode); 680 inode = igrab(entry->inode);
590 spin_unlock(&sbi->dir_inode_lock); 681 spin_unlock(&sbi->dir_inode_lock);
591 if (inode) { 682 if (inode) {
592 filemap_flush(inode->i_mapping); 683 filemap_fdatawrite(inode->i_mapping);
593 iput(inode); 684 iput(inode);
594 } else { 685 } else {
595 /* 686 /*
@@ -824,6 +915,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
824 unblock_operations(sbi); 915 unblock_operations(sbi);
825 mutex_unlock(&sbi->cp_mutex); 916 mutex_unlock(&sbi->cp_mutex);
826 917
918 stat_inc_cp_count(sbi->stat_info);
827 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); 919 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint");
828} 920}
829 921
@@ -845,11 +937,11 @@ void init_orphan_info(struct f2fs_sb_info *sbi)
845int __init create_checkpoint_caches(void) 937int __init create_checkpoint_caches(void)
846{ 938{
847 orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", 939 orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry",
848 sizeof(struct orphan_inode_entry), NULL); 940 sizeof(struct orphan_inode_entry));
849 if (!orphan_entry_slab) 941 if (!orphan_entry_slab)
850 return -ENOMEM; 942 return -ENOMEM;
851 inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", 943 inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
852 sizeof(struct dir_inode_entry), NULL); 944 sizeof(struct dir_inode_entry));
853 if (!inode_entry_slab) { 945 if (!inode_entry_slab) {
854 kmem_cache_destroy(orphan_entry_slab); 946 kmem_cache_destroy(orphan_entry_slab);
855 return -ENOMEM; 947 return -ENOMEM;
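
Several of the checkpoint.c hunks share one shape: convert open-coded list_for_each() walks to list_for_each_entry(), unlink under the spinlock, and push kmem_cache_free() out of the critical section. A hedged sketch of that shape with the types used above (example_remove_orphan() is illustrative):

static void example_remove_orphan(spinlock_t *lock, struct list_head *head,
                                  nid_t ino, struct kmem_cache *slab)
{
        struct orphan_inode_entry *e;

        spin_lock(lock);
        list_for_each_entry(e, head, list) {
                if (e->ino == ino) {
                        list_del(&e->list);
                        spin_unlock(lock);
                        /* free outside the lock: shorter critical section */
                        kmem_cache_free(slab, e);
                        return;
                }
        }
        spin_unlock(lock);
}
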
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 2261ccdd0b5f..45abd60e2bff 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -45,7 +45,7 @@ static void f2fs_read_end_io(struct bio *bio, int err)
45 45
46static void f2fs_write_end_io(struct bio *bio, int err) 46static void f2fs_write_end_io(struct bio *bio, int err)
47{ 47{
48 struct f2fs_sb_info *sbi = F2FS_SB(bio->bi_io_vec->bv_page->mapping->host->i_sb); 48 struct f2fs_sb_info *sbi = bio->bi_private;
49 struct bio_vec *bvec; 49 struct bio_vec *bvec;
50 int i; 50 int i;
51 51
@@ -55,15 +55,16 @@ static void f2fs_write_end_io(struct bio *bio, int err)
55 if (unlikely(err)) { 55 if (unlikely(err)) {
56 SetPageError(page); 56 SetPageError(page);
57 set_bit(AS_EIO, &page->mapping->flags); 57 set_bit(AS_EIO, &page->mapping->flags);
58 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); 58 f2fs_stop_checkpoint(sbi);
59 sbi->sb->s_flags |= MS_RDONLY;
60 } 59 }
61 end_page_writeback(page); 60 end_page_writeback(page);
62 dec_page_count(sbi, F2FS_WRITEBACK); 61 dec_page_count(sbi, F2FS_WRITEBACK);
63 } 62 }
64 63
65 if (bio->bi_private) 64 if (sbi->wait_io) {
66 complete(bio->bi_private); 65 complete(sbi->wait_io);
66 sbi->wait_io = NULL;
67 }
67 68
68 if (!get_pages(sbi, F2FS_WRITEBACK) && 69 if (!get_pages(sbi, F2FS_WRITEBACK) &&
69 !list_empty(&sbi->cp_wait.task_list)) 70 !list_empty(&sbi->cp_wait.task_list))
@@ -86,6 +87,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
86 bio->bi_bdev = sbi->sb->s_bdev; 87 bio->bi_bdev = sbi->sb->s_bdev;
87 bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); 88 bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
88 bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; 89 bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
90 bio->bi_private = sbi;
89 91
90 return bio; 92 return bio;
91} 93}
@@ -113,7 +115,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
113 */ 115 */
114 if (fio->type == META_FLUSH) { 116 if (fio->type == META_FLUSH) {
115 DECLARE_COMPLETION_ONSTACK(wait); 117 DECLARE_COMPLETION_ONSTACK(wait);
116 io->bio->bi_private = &wait; 118 io->sbi->wait_io = &wait;
117 submit_bio(rw, io->bio); 119 submit_bio(rw, io->bio);
118 wait_for_completion(&wait); 120 wait_for_completion(&wait);
119 } else { 121 } else {
@@ -132,7 +134,7 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
132 134
133 io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; 135 io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype];
134 136
135 mutex_lock(&io->io_mutex); 137 down_write(&io->io_rwsem);
136 138
137 /* change META to META_FLUSH in the checkpoint procedure */ 139 /* change META to META_FLUSH in the checkpoint procedure */
138 if (type >= META_FLUSH) { 140 if (type >= META_FLUSH) {
@@ -140,7 +142,7 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
140 io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; 142 io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
141 } 143 }
142 __submit_merged_bio(io); 144 __submit_merged_bio(io);
143 mutex_unlock(&io->io_mutex); 145 up_write(&io->io_rwsem);
144} 146}
145 147
146/* 148/*
@@ -178,7 +180,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
178 180
179 verify_block_addr(sbi, blk_addr); 181 verify_block_addr(sbi, blk_addr);
180 182
181 mutex_lock(&io->io_mutex); 183 down_write(&io->io_rwsem);
182 184
183 if (!is_read) 185 if (!is_read)
184 inc_page_count(sbi, F2FS_WRITEBACK); 186 inc_page_count(sbi, F2FS_WRITEBACK);
@@ -202,7 +204,7 @@ alloc_new:
202 204
203 io->last_block_in_bio = blk_addr; 205 io->last_block_in_bio = blk_addr;
204 206
205 mutex_unlock(&io->io_mutex); 207 up_write(&io->io_rwsem);
206 trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr); 208 trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr);
207} 209}
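
Converting io_mutex to io_rwsem keeps bio submission and merging exclusive (both paths take the semaphore for writing) while opening the door to read-side peeks at the in-flight bio without full serialization. A sketch of the writer side, assuming the f2fs_bio_info layout used above:

static void example_flush_merged(struct f2fs_bio_info *io)
{
        down_write(&io->io_rwsem);      /* exclusive: mutates io->bio */
        __submit_merged_bio(io);
        up_write(&io->io_rwsem);
}
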
208 210
@@ -797,48 +799,36 @@ static int f2fs_write_data_page(struct page *page,
797 */ 799 */
798 offset = i_size & (PAGE_CACHE_SIZE - 1); 800 offset = i_size & (PAGE_CACHE_SIZE - 1);
799 if ((page->index >= end_index + 1) || !offset) { 801 if ((page->index >= end_index + 1) || !offset) {
800 if (S_ISDIR(inode->i_mode)) { 802 inode_dec_dirty_dents(inode);
801 dec_page_count(sbi, F2FS_DIRTY_DENTS);
802 inode_dec_dirty_dents(inode);
803 }
804 goto out; 803 goto out;
805 } 804 }
806 805
807 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 806 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
808write: 807write:
809 if (unlikely(sbi->por_doing)) { 808 if (unlikely(sbi->por_doing))
810 err = AOP_WRITEPAGE_ACTIVATE;
811 goto redirty_out; 809 goto redirty_out;
812 }
813 810
814 /* Dentry blocks are controlled by checkpoint */ 811 /* Dentry blocks are controlled by checkpoint */
815 if (S_ISDIR(inode->i_mode)) { 812 if (S_ISDIR(inode->i_mode)) {
816 dec_page_count(sbi, F2FS_DIRTY_DENTS);
817 inode_dec_dirty_dents(inode); 813 inode_dec_dirty_dents(inode);
818 err = do_write_data_page(page, &fio); 814 err = do_write_data_page(page, &fio);
819 } else { 815 goto done;
820 f2fs_lock_op(sbi); 816 }
821
822 if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) {
823 err = f2fs_write_inline_data(inode, page, offset);
824 f2fs_unlock_op(sbi);
825 goto out;
826 } else {
827 err = do_write_data_page(page, &fio);
828 }
829 817
830 f2fs_unlock_op(sbi); 818 if (!wbc->for_reclaim)
831 need_balance_fs = true; 819 need_balance_fs = true;
832 } 820 else if (has_not_enough_free_secs(sbi, 0))
833 if (err == -ENOENT)
834 goto out;
835 else if (err)
836 goto redirty_out; 821 goto redirty_out;
837 822
838 if (wbc->for_reclaim) { 823 f2fs_lock_op(sbi);
839 f2fs_submit_merged_bio(sbi, DATA, WRITE); 824 if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode))
840 need_balance_fs = false; 825 err = f2fs_write_inline_data(inode, page, offset);
841 } 826 else
827 err = do_write_data_page(page, &fio);
828 f2fs_unlock_op(sbi);
829done:
830 if (err && err != -ENOENT)
831 goto redirty_out;
842 832
843 clear_cold_data(page); 833 clear_cold_data(page);
844out: 834out:
@@ -849,12 +839,11 @@ out:
849 839
850redirty_out: 840redirty_out:
851 wbc->pages_skipped++; 841 wbc->pages_skipped++;
842 account_page_redirty(page);
852 set_page_dirty(page); 843 set_page_dirty(page);
853 return err; 844 return AOP_WRITEPAGE_ACTIVATE;
854} 845}
855 846
856#define MAX_DESIRED_PAGES_WP 4096
857
858static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, 847static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
859 void *data) 848 void *data)
860{ 849{
@@ -871,17 +860,17 @@ static int f2fs_write_data_pages(struct address_space *mapping,
871 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 860 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
872 bool locked = false; 861 bool locked = false;
873 int ret; 862 int ret;
874 long excess_nrtw = 0, desired_nrtw; 863 long diff;
875 864
876 /* deal with chardevs and other special file */ 865 /* deal with chardevs and other special file */
877 if (!mapping->a_ops->writepage) 866 if (!mapping->a_ops->writepage)
878 return 0; 867 return 0;
879 868
880 if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) { 869 if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
881 desired_nrtw = MAX_DESIRED_PAGES_WP; 870 get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA))
882 excess_nrtw = desired_nrtw - wbc->nr_to_write; 871 goto skip_write;
883 wbc->nr_to_write = desired_nrtw; 872
884 } 873 diff = nr_pages_to_write(sbi, DATA, wbc);
885 874
886 if (!S_ISDIR(inode->i_mode)) { 875 if (!S_ISDIR(inode->i_mode)) {
887 mutex_lock(&sbi->writepages); 876 mutex_lock(&sbi->writepages);
@@ -895,8 +884,12 @@ static int f2fs_write_data_pages(struct address_space *mapping,
895 884
896 remove_dirty_dir_inode(inode); 885 remove_dirty_dir_inode(inode);
897 886
898 wbc->nr_to_write -= excess_nrtw; 887 wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
899 return ret; 888 return ret;
889
890skip_write:
891 wbc->pages_skipped += get_dirty_dents(inode);
892 return 0;
900} 893}
901 894
902static int f2fs_write_begin(struct file *file, struct address_space *mapping, 895static int f2fs_write_begin(struct file *file, struct address_space *mapping,
@@ -949,13 +942,19 @@ inline_data:
949 if (dn.data_blkaddr == NEW_ADDR) { 942 if (dn.data_blkaddr == NEW_ADDR) {
950 zero_user_segment(page, 0, PAGE_CACHE_SIZE); 943 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
951 } else { 944 } else {
952 if (f2fs_has_inline_data(inode)) 945 if (f2fs_has_inline_data(inode)) {
953 err = f2fs_read_inline_data(inode, page); 946 err = f2fs_read_inline_data(inode, page);
954 else 947 if (err) {
948 page_cache_release(page);
949 return err;
950 }
951 } else {
955 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, 952 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
956 READ_SYNC); 953 READ_SYNC);
957 if (err) 954 if (err)
958 return err; 955 return err;
956 }
957
959 lock_page(page); 958 lock_page(page);
960 if (unlikely(!PageUptodate(page))) { 959 if (unlikely(!PageUptodate(page))) {
961 f2fs_put_page(page, 1); 960 f2fs_put_page(page, 1);
@@ -1031,11 +1030,8 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
1031 unsigned int length) 1030 unsigned int length)
1032{ 1031{
1033 struct inode *inode = page->mapping->host; 1032 struct inode *inode = page->mapping->host;
1034 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 1033 if (PageDirty(page))
1035 if (S_ISDIR(inode->i_mode) && PageDirty(page)) {
1036 dec_page_count(sbi, F2FS_DIRTY_DENTS);
1037 inode_dec_dirty_dents(inode); 1034 inode_dec_dirty_dents(inode);
1038 }
1039 ClearPagePrivate(page); 1035 ClearPagePrivate(page);
1040} 1036}
1041 1037
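
f2fs_write_data_pages() now shares the throttling idiom introduced above for meta pages: nr_pages_to_write() bumps wbc->nr_to_write up to a batch size and returns the excess, which is charged back once writeback finishes. A condensed sketch, assuming the segment.h helpers used in this patch:

        long diff = nr_pages_to_write(sbi, DATA, wbc);  /* raise to batch size */
        ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
        /* hand back the borrowed quota so global accounting stays correct */
        wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
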
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 3de9d20d0c14..b52c12cf5873 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -86,7 +86,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 {
 	struct f2fs_stat_info *si = F2FS_STAT(sbi);
 	unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
-	struct sit_info *sit_i = SIT_I(sbi);
 	unsigned int segno, vblocks;
 	int ndirty = 0;
 
@@ -94,7 +93,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 	total_vblocks = 0;
 	blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
 	hblks_per_sec = blks_per_sec / 2;
-	mutex_lock(&sit_i->sentry_lock);
 	for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
 		vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
 		dist = abs(vblocks - hblks_per_sec);
@@ -105,7 +103,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 			ndirty++;
 		}
 	}
-	mutex_unlock(&sit_i->sentry_lock);
 	dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
 	si->bimodal = bimodal / dist;
 	if (si->dirty_count)
@@ -236,6 +233,7 @@ static int stat_show(struct seq_file *s, void *v)
 			   si->dirty_count);
 	seq_printf(s, "  - Prefree: %d\n  - Free: %d (%d)\n\n",
 		   si->prefree_count, si->free_segs, si->free_secs);
+	seq_printf(s, "CP calls: %d\n", si->cp_count);
 	seq_printf(s, "GC calls: %d (BG: %d)\n",
 		   si->call_count, si->bg_gc);
 	seq_printf(s, "  - data segments : %d\n", si->data_segs);
@@ -252,10 +250,10 @@ static int stat_show(struct seq_file *s, void *v)
 		   si->ndirty_dent, si->ndirty_dirs);
 	seq_printf(s, "  - meta: %4d in %4d\n",
 		   si->ndirty_meta, si->meta_pages);
-	seq_printf(s, "  - NATs: %5d > %lu\n",
-		   si->nats, NM_WOUT_THRESHOLD);
-	seq_printf(s, "  - SITs: %5d\n  - free_nids: %5d\n",
-		   si->sits, si->fnids);
+	seq_printf(s, "  - NATs: %9d\n  - SITs: %9d\n",
+		   si->nats, si->sits);
+	seq_printf(s, "  - free_nids: %9d\n",
+		   si->fnids);
 	seq_puts(s, "\nDistribution of User Blocks:");
 	seq_puts(s, " [ valid | invalid | free ]\n");
 	seq_puts(s, "  [");
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 2b7c255bcbdf..972fd0ef230f 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -21,12 +21,12 @@ static unsigned long dir_blocks(struct inode *inode)
 			>> PAGE_CACHE_SHIFT;
 }
 
-static unsigned int dir_buckets(unsigned int level)
+static unsigned int dir_buckets(unsigned int level, int dir_level)
 {
 	if (level < MAX_DIR_HASH_DEPTH / 2)
-		return 1 << level;
+		return 1 << (level + dir_level);
 	else
-		return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1);
+		return 1 << ((MAX_DIR_HASH_DEPTH / 2 + dir_level) - 1);
 }
 
 static unsigned int bucket_blocks(unsigned int level)
@@ -65,13 +65,14 @@ static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
 	de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
 }
 
-static unsigned long dir_block_index(unsigned int level, unsigned int idx)
+static unsigned long dir_block_index(unsigned int level,
+		int dir_level, unsigned int idx)
 {
 	unsigned long i;
 	unsigned long bidx = 0;
 
 	for (i = 0; i < level; i++)
-		bidx += dir_buckets(i) * bucket_blocks(i);
+		bidx += dir_buckets(i, dir_level) * bucket_blocks(i);
 	bidx += idx * bucket_blocks(level);
 	return bidx;
 }
@@ -93,16 +94,21 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
 			f2fs_hash_t namehash, struct page **res_page)
 {
 	struct f2fs_dir_entry *de;
-	unsigned long bit_pos, end_pos, next_pos;
+	unsigned long bit_pos = 0;
 	struct f2fs_dentry_block *dentry_blk = kmap(dentry_page);
-	int slots;
+	const void *dentry_bits = &dentry_blk->dentry_bitmap;
+	int max_len = 0;
 
-	bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
-					NR_DENTRY_IN_BLOCK, 0);
 	while (bit_pos < NR_DENTRY_IN_BLOCK) {
+		if (!test_bit_le(bit_pos, dentry_bits)) {
+			if (bit_pos == 0)
+				max_len = 1;
+			else if (!test_bit_le(bit_pos - 1, dentry_bits))
+				max_len++;
+			bit_pos++;
+			continue;
+		}
 		de = &dentry_blk->dentry[bit_pos];
-		slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
-
 		if (early_match_name(name, namelen, namehash, de)) {
 			if (!memcmp(dentry_blk->filename[bit_pos],
 							name, namelen)) {
@@ -110,20 +116,18 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
 				goto found;
 			}
 		}
-		next_pos = bit_pos + slots;
-		bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
-				NR_DENTRY_IN_BLOCK, next_pos);
-		if (bit_pos >= NR_DENTRY_IN_BLOCK)
-			end_pos = NR_DENTRY_IN_BLOCK;
-		else
-			end_pos = bit_pos;
-		if (*max_slots < end_pos - next_pos)
-			*max_slots = end_pos - next_pos;
+		if (max_len > *max_slots) {
+			*max_slots = max_len;
+			max_len = 0;
+		}
+		bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
 	}
 
 	de = NULL;
 	kunmap(dentry_page);
 found:
+	if (max_len > *max_slots)
+		*max_slots = max_len;
 	return de;
 }
 
@@ -141,10 +145,11 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
 
 	f2fs_bug_on(level > MAX_DIR_HASH_DEPTH);
 
-	nbucket = dir_buckets(level);
+	nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
 	nblock = bucket_blocks(level);
 
-	bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket);
+	bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level,
+					le32_to_cpu(namehash) % nbucket);
 	end_block = bidx + nblock;
 
 	for (; bidx < end_block; bidx++) {
@@ -248,7 +253,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
 		struct page *page, struct inode *inode)
 {
 	lock_page(page);
-	wait_on_page_writeback(page);
+	f2fs_wait_on_page_writeback(page, DATA);
 	de->ino = cpu_to_le32(inode->i_ino);
 	set_de_type(de, inode);
 	kunmap(page);
@@ -347,14 +352,11 @@ static struct page *init_inode_metadata(struct inode *inode,
 		err = f2fs_init_security(inode, dir, name, page);
 		if (err)
 			goto put_error;
-
-		wait_on_page_writeback(page);
 	} else {
 		page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
 		if (IS_ERR(page))
 			return page;
 
-		wait_on_page_writeback(page);
 		set_cold_node(inode, page);
 	}
 
@@ -372,6 +374,10 @@ static struct page *init_inode_metadata(struct inode *inode,
 
 put_error:
 	f2fs_put_page(page, 1);
+	/* once the failed inode becomes a bad inode, i_mode is S_IFREG */
+	truncate_inode_pages(&inode->i_data, 0);
+	truncate_blocks(inode, 0);
+	remove_dirty_dir_inode(inode);
 error:
 	remove_inode_page(inode);
 	return ERR_PTR(err);
@@ -395,9 +401,6 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode,
 		set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
 	}
 
-	if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
-		update_inode_page(dir);
-
 	if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
 		clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
 }
@@ -464,10 +467,11 @@ start:
 	if (level == current_depth)
 		++current_depth;
 
-	nbucket = dir_buckets(level);
+	nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
 	nblock = bucket_blocks(level);
 
-	bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
+	bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level,
+				(le32_to_cpu(dentry_hash) % nbucket));
 
 	for (block = bidx; block <= (bidx + nblock - 1); block++) {
 		dentry_page = get_new_data_page(dir, NULL, block, true);
@@ -487,8 +491,9 @@ start:
 	++level;
 	goto start;
add_dentry:
-	wait_on_page_writeback(dentry_page);
+	f2fs_wait_on_page_writeback(dentry_page, DATA);
 
+	down_write(&F2FS_I(inode)->i_sem);
 	page = init_inode_metadata(inode, dir, name);
 	if (IS_ERR(page)) {
 		err = PTR_ERR(page);
@@ -511,7 +516,12 @@ add_dentry:
 
 	update_parent_metadata(dir, inode, current_depth);
fail:
-	clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+	up_write(&F2FS_I(inode)->i_sem);
+
+	if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
+		update_inode_page(dir);
+		clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+	}
 	kunmap(dentry_page);
 	f2fs_put_page(dentry_page, 1);
 	return err;
@@ -528,13 +538,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 	unsigned int bit_pos;
 	struct address_space *mapping = page->mapping;
 	struct inode *dir = mapping->host;
-	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
 	int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
 	void *kaddr = page_address(page);
 	int i;
 
 	lock_page(page);
-	wait_on_page_writeback(page);
+	f2fs_wait_on_page_writeback(page, DATA);
 
 	dentry_blk = (struct f2fs_dentry_block *)kaddr;
 	bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry;
@@ -551,6 +560,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 
 	if (inode) {
+		struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+
+		down_write(&F2FS_I(inode)->i_sem);
+
 		if (S_ISDIR(inode->i_mode)) {
 			drop_nlink(dir);
 			update_inode_page(dir);
@@ -561,6 +574,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 			drop_nlink(inode);
 			i_size_write(inode, 0);
 		}
+		up_write(&F2FS_I(inode)->i_sem);
 		update_inode_page(inode);
 
 		if (inode->i_nlink == 0)
@@ -573,7 +587,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 		truncate_hole(dir, page->index, page->index + 1);
 		clear_page_dirty_for_io(page);
 		ClearPageUptodate(page);
-		dec_page_count(sbi, F2FS_DIRTY_DENTS);
 		inode_dec_dirty_dents(dir);
 	}
 	f2fs_put_page(page, 1);
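The dir.c changes above thread a per-inode i_dir_level into the multi-level hash geometry, so level 0 of a "large" directory can start with 1 << dir_level buckets instead of one. The following self-contained sketch reproduces the bucket arithmetic from dir_buckets() and dir_block_index(); the MAX_DIR_HASH_DEPTH value of 63 and the 2-or-4 blocks-per-bucket shape of bucket_blocks() are assumptions taken from the f2fs headers of this era, not from this diff:

	#include <stdio.h>

	#define MAX_DIR_HASH_DEPTH 63	/* assumed, from include/linux/f2fs_fs.h */

	static unsigned int dir_buckets(unsigned int level, int dir_level)
	{
		if (level < MAX_DIR_HASH_DEPTH / 2)
			return 1 << (level + dir_level);
		else
			return 1 << ((MAX_DIR_HASH_DEPTH / 2 + dir_level) - 1);
	}

	static unsigned int bucket_blocks(unsigned int level)
	{
		/* assumed shape: 2 blocks per bucket below the midpoint, 4 above */
		return level < MAX_DIR_HASH_DEPTH / 2 ? 2 : 4;
	}

	static unsigned long dir_block_index(unsigned int level,
				int dir_level, unsigned int idx)
	{
		unsigned long i, bidx = 0;

		for (i = 0; i < level; i++)
			bidx += dir_buckets(i, dir_level) * bucket_blocks(i);
		return bidx + idx * bucket_blocks(level);
	}

	int main(void)
	{
		unsigned int hash = 0xdeadbeef, level = 2, dir_level = 2;
		unsigned int nbucket = dir_buckets(level, dir_level);

		printf("buckets=%u first block=%lu\n", nbucket,
		       dir_block_index(level, dir_level, hash % nbucket));
		return 0;
	}

With dir_level = 2 the sketch reports 16 buckets already at level 2, which is how a large directory spreads dentry blocks wider before it has to grow deeper.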
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index fc3c558cb4f3..2ecac8312359 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -40,6 +40,7 @@
 #define F2FS_MOUNT_DISABLE_EXT_IDENTIFY	0x00000040
 #define F2FS_MOUNT_INLINE_XATTR		0x00000080
 #define F2FS_MOUNT_INLINE_DATA		0x00000100
+#define F2FS_MOUNT_FLUSH_MERGE		0x00000200
 
 #define clear_opt(sbi, option)	(sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
 #define set_opt(sbi, option)	(sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -88,6 +89,16 @@ enum {
 	SIT_BITMAP
 };
 
+/*
+ * For CP/NAT/SIT/SSA readahead
+ */
+enum {
+	META_CP,
+	META_NAT,
+	META_SIT,
+	META_SSA
+};
+
 /* for the list of orphan inodes */
 struct orphan_inode_entry {
 	struct list_head list;	/* list head */
@@ -187,16 +198,20 @@ struct extent_info {
 #define FADVISE_COLD_BIT	0x01
 #define FADVISE_LOST_PINO_BIT	0x02
 
+#define DEF_DIR_LEVEL		0
+
 struct f2fs_inode_info {
 	struct inode vfs_inode;		/* serve a vfs inode */
 	unsigned long i_flags;		/* keep an inode flags for ioctl */
 	unsigned char i_advise;		/* use to give file attribute hints */
+	unsigned char i_dir_level;	/* use for dentry level for large dir */
 	unsigned int i_current_depth;	/* use only in directory structure */
 	unsigned int i_pino;		/* parent inode number */
 	umode_t i_acl_mode;		/* keep file acl mode temporarily */
 
 	/* Use below internally in f2fs*/
 	unsigned long flags;		/* use to pass per-file flags */
+	struct rw_semaphore i_sem;	/* protect fi info */
 	atomic_t dirty_dents;		/* # of dirty dentry pages */
 	f2fs_hash_t chash;		/* hash value of given file name */
 	unsigned int clevel;		/* maximum level of given file name */
@@ -229,6 +244,7 @@ struct f2fs_nm_info {
 	block_t nat_blkaddr;		/* base disk address of NAT */
 	nid_t max_nid;			/* maximum possible node ids */
 	nid_t next_scan_nid;		/* the next nid to be scanned */
+	unsigned int ram_thresh;	/* control the memory footprint */
 
 	/* NAT cache management */
 	struct radix_tree_root nat_root;/* root of the nat entry cache */
@@ -238,6 +254,7 @@ struct f2fs_nm_info {
 	struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
 
 	/* free node ids management */
+	struct radix_tree_root free_nid_root;/* root of the free_nid cache */
 	struct list_head free_nid_list;	/* a list for free nids */
 	spinlock_t free_nid_list_lock;	/* protect free nid list */
 	unsigned int fcnt;		/* the number of free node id */
@@ -300,6 +317,12 @@ enum {
 	NO_CHECK_TYPE
 };
 
+struct flush_cmd {
+	struct flush_cmd *next;
+	struct completion wait;
+	int ret;
+};
+
 struct f2fs_sm_info {
 	struct sit_info *sit_info;		/* whole segment information */
 	struct free_segmap_info *free_info;	/* free segment information */
@@ -328,6 +351,14 @@ struct f2fs_sm_info {
 
 	unsigned int ipu_policy;	/* in-place-update policy */
 	unsigned int min_ipu_util;	/* in-place-update threshold */
+
+	/* for flush command control */
+	struct task_struct *f2fs_issue_flush;	/* flush thread */
+	wait_queue_head_t flush_wait_queue;	/* waiting queue for wake-up */
+	struct flush_cmd *issue_list;		/* list for command issue */
+	struct flush_cmd *dispatch_list;	/* list for command dispatch */
+	spinlock_t issue_lock;			/* for issue list lock */
+	struct flush_cmd *issue_tail;		/* list tail of issue list */
 };
 
 /*
@@ -378,7 +409,7 @@ struct f2fs_bio_info {
 	struct bio *bio;		/* bios to merge */
 	sector_t last_block_in_bio;	/* last block number */
 	struct f2fs_io_info fio;	/* store buffered io info. */
-	struct mutex io_mutex;		/* mutex for bio */
+	struct rw_semaphore io_rwsem;	/* blocking op for bio */
 };
 
 struct f2fs_sb_info {
@@ -398,6 +429,7 @@ struct f2fs_sb_info {
 	/* for bio operations */
 	struct f2fs_bio_info read_io;			/* for read bios */
 	struct f2fs_bio_info write_io[NR_PAGE_TYPE];	/* for write bios */
+	struct completion *wait_io;		/* for completion bios */
 
 	/* for checkpoint */
 	struct f2fs_checkpoint *ckpt;		/* raw checkpoint pointer */
@@ -407,7 +439,6 @@ struct f2fs_sb_info {
 	struct mutex node_write;		/* locking node writes */
 	struct mutex writepages;		/* mutex for writepages() */
 	bool por_doing;				/* recovery is doing or not */
-	bool on_build_free_nids;		/* build_free_nids is doing */
 	wait_queue_head_t cp_wait;
 
 	/* for orphan inode management */
@@ -436,6 +467,7 @@ struct f2fs_sb_info {
 	unsigned int total_valid_node_count;	/* valid node block count */
 	unsigned int total_valid_inode_count;	/* valid inode count */
 	int active_logs;			/* # of active logs */
+	int dir_level;				/* directory level */
 
 	block_t user_block_count;		/* # of user blocks */
 	block_t total_valid_block_count;	/* # of valid blocks */
@@ -622,6 +654,11 @@ static inline int F2FS_HAS_BLOCKS(struct inode *inode)
 	return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS;
 }
 
+static inline bool f2fs_has_xattr_block(unsigned int ofs)
+{
+	return ofs == XATTR_NODE_OFFSET;
+}
+
 static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
 				 struct inode *inode, blkcnt_t count)
 {
@@ -661,6 +698,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
 
 static inline void inode_inc_dirty_dents(struct inode *inode)
 {
+	inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
 	atomic_inc(&F2FS_I(inode)->dirty_dents);
 }
 
@@ -671,6 +709,10 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
 
 static inline void inode_dec_dirty_dents(struct inode *inode)
 {
+	if (!S_ISDIR(inode->i_mode))
+		return;
+
+	dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
 	atomic_dec(&F2FS_I(inode)->dirty_dents);
 }
 
@@ -679,6 +721,11 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
 	return atomic_read(&sbi->nr_pages[count_type]);
 }
 
+static inline int get_dirty_dents(struct inode *inode)
+{
+	return atomic_read(&F2FS_I(inode)->dirty_dents);
+}
+
 static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
 {
 	unsigned int pages_per_sec = sbi->segs_per_sec *
@@ -689,11 +736,7 @@ static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
 
 static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
 {
-	block_t ret;
-	spin_lock(&sbi->stat_lock);
-	ret = sbi->total_valid_block_count;
-	spin_unlock(&sbi->stat_lock);
-	return ret;
+	return sbi->total_valid_block_count;
 }
 
 static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
@@ -789,11 +832,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
 
 static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
 {
-	unsigned int ret;
-	spin_lock(&sbi->stat_lock);
-	ret = sbi->total_valid_node_count;
-	spin_unlock(&sbi->stat_lock);
-	return ret;
+	return sbi->total_valid_node_count;
 }
 
 static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
@@ -814,11 +853,7 @@ static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
 
 static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
 {
-	unsigned int ret;
-	spin_lock(&sbi->stat_lock);
-	ret = sbi->total_valid_inode_count;
-	spin_unlock(&sbi->stat_lock);
-	return ret;
+	return sbi->total_valid_inode_count;
 }
 
 static inline void f2fs_put_page(struct page *page, int unlock)
@@ -844,9 +879,9 @@ static inline void f2fs_put_dnode(struct dnode_of_data *dn)
 }
 
 static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
-					size_t size, void (*ctor)(void *))
+					size_t size)
 {
-	return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor);
+	return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL);
 }
 
 static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
@@ -983,24 +1018,28 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi,
 		ri->i_inline |= F2FS_INLINE_DATA;
 }
 
+static inline int f2fs_has_inline_xattr(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR);
+}
+
 static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi)
 {
-	if (is_inode_flag_set(fi, FI_INLINE_XATTR))
+	if (f2fs_has_inline_xattr(&fi->vfs_inode))
 		return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS;
 	return DEF_ADDRS_PER_INODE;
 }
 
 static inline void *inline_xattr_addr(struct page *page)
 {
-	struct f2fs_inode *ri;
-	ri = (struct f2fs_inode *)page_address(page);
+	struct f2fs_inode *ri = F2FS_INODE(page);
 	return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE -
 					F2FS_INLINE_XATTR_ADDRS]);
 }
 
 static inline int inline_xattr_size(struct inode *inode)
 {
-	if (is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR))
+	if (f2fs_has_inline_xattr(inode))
 		return F2FS_INLINE_XATTR_ADDRS << 2;
 	else
 		return 0;
@@ -1013,8 +1052,7 @@ static inline int f2fs_has_inline_data(struct inode *inode)
 
 static inline void *inline_data_addr(struct page *page)
 {
-	struct f2fs_inode *ri;
-	ri = (struct f2fs_inode *)page_address(page);
+	struct f2fs_inode *ri = F2FS_INODE(page);
 	return (void *)&(ri->i_addr[1]);
 }
 
@@ -1023,6 +1061,12 @@ static inline int f2fs_readonly(struct super_block *sb)
 	return sb->s_flags & MS_RDONLY;
 }
 
+static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
+{
+	set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+	sbi->sb->s_flags |= MS_RDONLY;
+}
+
 #define get_inode_mode(i) \
 	((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
 	 (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
@@ -1048,7 +1092,7 @@ void f2fs_set_inode_flags(struct inode *);
 struct inode *f2fs_iget(struct super_block *, unsigned long);
 int try_to_free_nats(struct f2fs_sb_info *, int);
 void update_inode(struct inode *, struct page *);
-int update_inode_page(struct inode *);
+void update_inode_page(struct inode *);
 int f2fs_write_inode(struct inode *, struct writeback_control *);
 void f2fs_evict_inode(struct inode *);
 
@@ -1097,6 +1141,7 @@ struct dnode_of_data;
 struct node_info;
 
 int is_checkpointed_node(struct f2fs_sb_info *, nid_t);
+bool fsync_mark_done(struct f2fs_sb_info *, nid_t);
 void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
 int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
 int truncate_inode_blocks(struct inode *, pgoff_t);
@@ -1115,6 +1160,7 @@ void alloc_nid_done(struct f2fs_sb_info *, nid_t);
 void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
 void recover_node_page(struct f2fs_sb_info *, struct page *,
 		struct f2fs_summary *, struct node_info *, block_t);
+bool recover_xattr_data(struct inode *, struct page *, block_t);
 int recover_inode_page(struct f2fs_sb_info *, struct page *);
 int restore_node_summary(struct f2fs_sb_info *, unsigned int,
 			struct f2fs_summary_block *);
@@ -1129,7 +1175,9 @@ void destroy_node_manager_caches(void);
  */
 void f2fs_balance_fs(struct f2fs_sb_info *);
 void f2fs_balance_fs_bg(struct f2fs_sb_info *);
+int f2fs_issue_flush(struct f2fs_sb_info *);
 void invalidate_blocks(struct f2fs_sb_info *, block_t);
+void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
 void clear_prefree_segments(struct f2fs_sb_info *);
 int npages_for_summary_flush(struct f2fs_sb_info *);
 void allocate_new_segments(struct f2fs_sb_info *);
@@ -1162,6 +1210,7 @@ void destroy_segment_manager_caches(void);
  */
 struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
 struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
+int ra_meta_pages(struct f2fs_sb_info *, int, int, int);
 long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
 int acquire_orphan_inode(struct f2fs_sb_info *);
 void release_orphan_inode(struct f2fs_sb_info *);
@@ -1231,7 +1280,7 @@ struct f2fs_stat_info {
 	int util_free, util_valid, util_invalid;
 	int rsvd_segs, overp_segs;
 	int dirty_count, node_pages, meta_pages;
-	int prefree_count, call_count;
+	int prefree_count, call_count, cp_count;
 	int tot_segs, node_segs, data_segs, free_segs, free_secs;
 	int tot_blks, data_blks, node_blks;
 	int curseg[NR_CURSEG_TYPE];
@@ -1248,6 +1297,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
 	return (struct f2fs_stat_info *)sbi->stat_info;
 }
 
+#define stat_inc_cp_count(si)		((si)->cp_count++)
 #define stat_inc_call_count(si)		((si)->call_count++)
 #define stat_inc_bggc_count(sbi)	((sbi)->bg_gc++)
 #define stat_inc_dirty_dir(sbi)		((sbi)->n_dirty_dirs++)
@@ -1302,6 +1352,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *);
 void __init f2fs_create_root_stats(void);
 void f2fs_destroy_root_stats(void);
 #else
+#define stat_inc_cp_count(si)
 #define stat_inc_call_count(si)
 #define stat_inc_bggc_count(si)
 #define stat_inc_dirty_dir(sbi)
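The new FLUSH_MERGE fields in f2fs_sm_info above describe a small producer/consumer engine: fsync callers append a flush_cmd to issue_list under issue_lock and sleep on its completion, while a dedicated thread snapshots the list into dispatch_list, issues one cache flush for the whole batch, and completes every waiter with the shared result. A userspace analogue of the pattern using pthreads; the kernel version (in segment.c, not shown in this section) uses a spinlock, a waitqueue, and struct completion instead:

	#include <pthread.h>
	#include <stdio.h>

	struct flush_cmd {			/* mirrors the struct above */
		struct flush_cmd *next;
		pthread_cond_t wait;		/* stands in for struct completion */
		int done, ret;
	};

	static pthread_mutex_t issue_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t flush_wait_queue = PTHREAD_COND_INITIALIZER;
	static struct flush_cmd *issue_list, *issue_tail;

	static void *issue_flush_thread(void *arg)
	{
		for (;;) {
			struct flush_cmd *dispatch_list;
			int ret;

			pthread_mutex_lock(&issue_lock);
			while (!issue_list)
				pthread_cond_wait(&flush_wait_queue, &issue_lock);
			dispatch_list = issue_list;	/* take the whole batch */
			issue_list = issue_tail = NULL;
			pthread_mutex_unlock(&issue_lock);

			ret = 0;	/* one device flush would serve everyone here */

			while (dispatch_list) {		/* complete each waiter */
				struct flush_cmd *next = dispatch_list->next;

				pthread_mutex_lock(&issue_lock);
				dispatch_list->ret = ret;
				dispatch_list->done = 1;
				pthread_cond_signal(&dispatch_list->wait);
				pthread_mutex_unlock(&issue_lock);
				dispatch_list = next;
			}
		}
		return arg;
	}

	static int issue_flush(void)	/* what f2fs_issue_flush() boils down to */
	{
		struct flush_cmd cmd = { .wait = PTHREAD_COND_INITIALIZER };

		pthread_mutex_lock(&issue_lock);
		if (issue_tail)
			issue_tail->next = &cmd;
		else
			issue_list = &cmd;
		issue_tail = &cmd;
		pthread_cond_signal(&flush_wait_queue);
		while (!cmd.done)
			pthread_cond_wait(&cmd.wait, &issue_lock);
		pthread_mutex_unlock(&issue_lock);
		return cmd.ret;
	}

	int main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, issue_flush_thread, NULL);
		printf("flush ret=%d\n", issue_flush());
		return 0;
	}

The payoff of the merge is that N concurrent fsync() callers cost one device flush instead of N.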
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 0dfcef53a6ed..60e7d5448a1d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -76,7 +76,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 	trace_f2fs_vm_page_mkwrite(page, DATA);
mapped:
 	/* fill the page */
-	wait_on_page_writeback(page);
+	f2fs_wait_on_page_writeback(page, DATA);
out:
 	sb_end_pagefault(inode->i_sb);
 	return block_page_mkwrite_return(err);
@@ -84,6 +84,7 @@ out:
 
 static const struct vm_operations_struct f2fs_file_vm_ops = {
 	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= f2fs_vm_page_mkwrite,
 	.remap_pages	= generic_file_remap_pages,
 };
@@ -111,11 +112,12 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
 int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = file->f_mapping->host;
+	struct f2fs_inode_info *fi = F2FS_I(inode);
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	int ret = 0;
 	bool need_cp = false;
 	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_NONE,
+		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = LONG_MAX,
 		.for_reclaim = 0,
 	};
@@ -133,7 +135,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	/* guarantee free sections for fsync */
 	f2fs_balance_fs(sbi);
 
-	mutex_lock(&inode->i_mutex);
+	down_read(&fi->i_sem);
 
 	/*
 	 * Both of fdatasync() and fsync() are able to be recovered from
@@ -150,25 +152,33 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
 		need_cp = true;
 
+	up_read(&fi->i_sem);
+
 	if (need_cp) {
 		nid_t pino;
 
-		F2FS_I(inode)->xattr_ver = 0;
-
 		/* all the dirty node pages should be flushed for POR */
 		ret = f2fs_sync_fs(inode->i_sb, 1);
+
+		down_write(&fi->i_sem);
+		F2FS_I(inode)->xattr_ver = 0;
 		if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
 					get_parent_ino(inode, &pino)) {
 			F2FS_I(inode)->i_pino = pino;
 			file_got_pino(inode);
+			up_write(&fi->i_sem);
 			mark_inode_dirty_sync(inode);
 			ret = f2fs_write_inode(inode, NULL);
 			if (ret)
 				goto out;
+		} else {
+			up_write(&fi->i_sem);
 		}
 	} else {
 		/* if there is no written node page, write its inode page */
 		while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
+			if (fsync_mark_done(sbi, inode->i_ino))
+				goto out;
 			mark_inode_dirty_sync(inode);
 			ret = f2fs_write_inode(inode, NULL);
 			if (ret)
@@ -177,10 +187,9 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		ret = wait_on_node_pages_writeback(sbi, inode->i_ino);
 		if (ret)
 			goto out;
-		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+		ret = f2fs_issue_flush(F2FS_SB(inode->i_sb));
 	}
out:
-	mutex_unlock(&inode->i_mutex);
 	trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
 	return ret;
 }
@@ -245,7 +254,7 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
 		f2fs_put_page(page, 1);
 		return;
 	}
-	wait_on_page_writeback(page);
+	f2fs_wait_on_page_writeback(page, DATA);
 	zero_user(page, offset, PAGE_CACHE_SIZE - offset);
 	set_page_dirty(page);
 	f2fs_put_page(page, 1);
@@ -422,7 +431,7 @@ static void fill_zero(struct inode *inode, pgoff_t index,
 	f2fs_unlock_op(sbi);
 
 	if (!IS_ERR(page)) {
-		wait_on_page_writeback(page);
+		f2fs_wait_on_page_writeback(page, DATA);
 		zero_user(page, start, len);
 		set_page_dirty(page);
 		f2fs_put_page(page, 1);
@@ -560,6 +569,8 @@ static long f2fs_fallocate(struct file *file, int mode,
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
 		return -EOPNOTSUPP;
 
+	mutex_lock(&inode->i_mutex);
+
 	if (mode & FALLOC_FL_PUNCH_HOLE)
 		ret = punch_hole(inode, offset, len);
 	else
@@ -569,6 +580,9 @@ static long f2fs_fallocate(struct file *file, int mode,
 		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		mark_inode_dirty(inode);
 	}
+
+	mutex_unlock(&inode->i_mutex);
+
 	trace_f2fs_fallocate(inode, mode, offset, len, ret);
 	return ret;
 }
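The fsync rework above also changes the locking discipline: recovery-relevant inode state is sampled under a shared i_sem, the semaphore is dropped across the heavyweight f2fs_sync_fs() call, and it is re-taken exclusively only to publish xattr_ver and i_pino. A distilled userspace analogue of that sample/drop/republish pattern (not the kernel code itself):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_rwlock_t i_sem = PTHREAD_RWLOCK_INITIALIZER;
	static int xattr_ver = 1, pino;

	int main(void)
	{
		int need_cp;

		pthread_rwlock_rdlock(&i_sem);	/* sample cheaply, shared */
		need_cp = (xattr_ver != 0);
		pthread_rwlock_unlock(&i_sem);

		if (need_cp) {
			/* the expensive sync runs with no lock held */
			pthread_rwlock_wrlock(&i_sem);	/* publish updates */
			xattr_ver = 0;
			pino = 42;	/* hypothetical parent ino */
			pthread_rwlock_unlock(&i_sem);
		}
		printf("need_cp=%d pino=%d\n", need_cp, pino);
		return 0;
	}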
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index ea0371e854b4..b90dbe55403a 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -531,15 +531,10 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
 		set_page_dirty(page);
 		set_cold_data(page);
 	} else {
-		struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-
 		f2fs_wait_on_page_writeback(page, DATA);
 
-		if (clear_page_dirty_for_io(page) &&
-			S_ISDIR(inode->i_mode)) {
-			dec_page_count(sbi, F2FS_DIRTY_DENTS);
+		if (clear_page_dirty_for_io(page))
 			inode_dec_dirty_dents(inode);
-		}
 		set_cold_data(page);
 		do_write_data_page(page, &fio);
 		clear_cold_data(page);
@@ -701,6 +696,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
gc_more:
 	if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
 		goto stop;
+	if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
+		goto stop;
 
 	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
 		gc_type = FG_GC;
@@ -711,6 +708,11 @@ gc_more:
 		goto stop;
 	ret = 0;
 
+	/* readahead multi ssa blocks those have contiguous address */
+	if (sbi->segs_per_sec > 1)
+		ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec,
+								META_SSA);
+
 	for (i = 0; i < sbi->segs_per_sec; i++)
 		do_garbage_collect(sbi, segno + i, &ilist, gc_type);
 
@@ -740,7 +742,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi)
 int __init create_gc_caches(void)
 {
 	winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes",
-			sizeof(struct inode_entry), NULL);
+			sizeof(struct inode_entry));
 	if (!winode_slab)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 31ee5b164ff9..383db1fabcf4 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -45,8 +45,10 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
 	}
 
 	ipage = get_node_page(sbi, inode->i_ino);
-	if (IS_ERR(ipage))
+	if (IS_ERR(ipage)) {
+		unlock_page(page);
 		return PTR_ERR(ipage);
+	}
 
 	zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
 
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 4d67ed736dca..ee829d360468 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -107,6 +107,7 @@ static int do_read_inode(struct inode *inode)
 	fi->flags = 0;
 	fi->i_advise = ri->i_advise;
 	fi->i_pino = le32_to_cpu(ri->i_pino);
+	fi->i_dir_level = ri->i_dir_level;
 
 	get_extent_info(&fi->ext, ri->i_ext);
 	get_inline_info(fi, ri);
@@ -204,6 +205,7 @@ void update_inode(struct inode *inode, struct page *node_page)
 	ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags);
 	ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
 	ri->i_generation = cpu_to_le32(inode->i_generation);
+	ri->i_dir_level = F2FS_I(inode)->i_dir_level;
 
 	__set_inode_rdev(inode, ri);
 	set_cold_node(inode, node_page);
@@ -212,24 +214,29 @@ void update_inode(struct inode *inode, struct page *node_page)
 	clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
 }
 
-int update_inode_page(struct inode *inode)
+void update_inode_page(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct page *node_page;
-
+retry:
 	node_page = get_node_page(sbi, inode->i_ino);
-	if (IS_ERR(node_page))
-		return PTR_ERR(node_page);
-
+	if (IS_ERR(node_page)) {
+		int err = PTR_ERR(node_page);
+		if (err == -ENOMEM) {
+			cond_resched();
+			goto retry;
+		} else if (err != -ENOENT) {
+			f2fs_stop_checkpoint(sbi);
+		}
+		return;
+	}
 	update_inode(inode, node_page);
 	f2fs_put_page(node_page, 1);
-	return 0;
 }
 
 int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-	int ret;
 
 	if (inode->i_ino == F2FS_NODE_INO(sbi) ||
 			inode->i_ino == F2FS_META_INO(sbi))
@@ -243,13 +250,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	 * during the urgent cleaning time when runing out of free sections.
 	 */
 	f2fs_lock_op(sbi);
-	ret = update_inode_page(inode);
+	update_inode_page(inode);
 	f2fs_unlock_op(sbi);
 
 	if (wbc)
 		f2fs_balance_fs(sbi);
 
-	return ret;
+	return 0;
 }
 
 /*
@@ -260,13 +267,13 @@ void f2fs_evict_inode(struct inode *inode)
 	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 
 	trace_f2fs_evict_inode(inode);
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 
 	if (inode->i_ino == F2FS_NODE_INO(sbi) ||
 			inode->i_ino == F2FS_META_INO(sbi))
 		goto no_delete;
 
-	f2fs_bug_on(atomic_read(&F2FS_I(inode)->dirty_dents));
+	f2fs_bug_on(get_dirty_dents(inode));
 	remove_dirty_dir_inode(inode);
 
 	if (inode->i_nlink || is_bad_inode(inode))
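update_inode_page() above now treats -ENOMEM from get_node_page() as transient, retrying after yielding the CPU, while any other failure except -ENOENT stops checkpointing and forces the filesystem read-only via f2fs_stop_checkpoint(). A small self-contained sketch of that retry discipline, with a stand-in allocator in place of get_node_page():

	#include <errno.h>
	#include <sched.h>
	#include <stdio.h>

	static int attempts;

	static int get_resource(void)	/* stand-in for get_node_page() */
	{
		return ++attempts < 3 ? -ENOMEM : 0;
	}

	int main(void)
	{
		int err;
	retry:
		err = get_resource();
		if (err == -ENOMEM) {
			sched_yield();	/* cond_resched() analogue */
			goto retry;
		}
		if (err && err != -ENOENT)
			fprintf(stderr, "hard error %d: stop checkpointing\n", err);
		printf("succeeded after %d attempts\n", attempts);
		return 0;
	}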
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 397d459e97bf..a9409d19dfd4 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -207,6 +207,8 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
 		inode = f2fs_iget(dir->i_sb, ino);
 		if (IS_ERR(inode))
 			return ERR_CAST(inode);
+
+		stat_inc_inline_inode(inode);
 	}
 
 	return d_splice_alias(inode, dentry);
@@ -424,12 +426,17 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		}
 
 		f2fs_set_link(new_dir, new_entry, new_page, old_inode);
+		down_write(&F2FS_I(old_inode)->i_sem);
 		F2FS_I(old_inode)->i_pino = new_dir->i_ino;
+		up_write(&F2FS_I(old_inode)->i_sem);
 
 		new_inode->i_ctime = CURRENT_TIME;
+		down_write(&F2FS_I(new_inode)->i_sem);
 		if (old_dir_entry)
 			drop_nlink(new_inode);
 		drop_nlink(new_inode);
+		up_write(&F2FS_I(new_inode)->i_sem);
+
 		mark_inode_dirty(new_inode);
 
 		if (!new_inode->i_nlink)
@@ -459,7 +466,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (old_dir != new_dir) {
 		f2fs_set_link(old_inode, old_dir_entry,
 						old_dir_page, new_dir);
+		down_write(&F2FS_I(old_inode)->i_sem);
 		F2FS_I(old_inode)->i_pino = new_dir->i_ino;
+		up_write(&F2FS_I(old_inode)->i_sem);
 		update_inode_page(old_inode);
 	} else {
 		kunmap(old_dir_page);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b0649b76eb4f..a161e955c4c8 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -21,9 +21,27 @@
21#include "segment.h" 21#include "segment.h"
22#include <trace/events/f2fs.h> 22#include <trace/events/f2fs.h>
23 23
24#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock)
25
24static struct kmem_cache *nat_entry_slab; 26static struct kmem_cache *nat_entry_slab;
25static struct kmem_cache *free_nid_slab; 27static struct kmem_cache *free_nid_slab;
26 28
29static inline bool available_free_memory(struct f2fs_nm_info *nm_i, int type)
30{
31 struct sysinfo val;
32 unsigned long mem_size = 0;
33
34 si_meminfo(&val);
35 if (type == FREE_NIDS)
36 mem_size = nm_i->fcnt * sizeof(struct free_nid);
37 else if (type == NAT_ENTRIES)
38 mem_size += nm_i->nat_cnt * sizeof(struct nat_entry);
39 mem_size >>= 12;
40
41 /* give 50:50 memory for free nids and nat caches respectively */
42 return (mem_size < ((val.totalram * nm_i->ram_thresh) >> 11));
43}
44
27static void clear_node_page_dirty(struct page *page) 45static void clear_node_page_dirty(struct page *page)
28{ 46{
29 struct address_space *mapping = page->mapping; 47 struct address_space *mapping = page->mapping;
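In available_free_memory() above, mem_size >>= 12 converts the cache footprint from bytes to 4 KiB pages, and the bound (val.totalram * nm_i->ram_thresh) >> 11 equals totalram * ram_thresh / 2048 pages for each of the two caches, which is what the 50:50 comment refers to. Assuming the default ram_thresh of 10 that the series sets elsewhere (not visible in this hunk), a 4 GiB machine allows roughly 20 MiB per cache; a quick check:

	#include <stdio.h>

	int main(void)
	{
		unsigned long totalram = 1UL << 20;	/* 4 GiB in 4 KiB pages */
		unsigned long ram_thresh = 10;		/* assumed default */
		unsigned long limit_pages = (totalram * ram_thresh) >> 11;

		/* 5120 pages == 20 MiB for each of the two caches */
		printf("%lu pages (%lu MiB) per cache\n",
		       limit_pages, limit_pages >> 8);
		return 0;
	}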
@@ -82,42 +100,6 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
82 return dst_page; 100 return dst_page;
83} 101}
84 102
85/*
86 * Readahead NAT pages
87 */
88static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
89{
90 struct address_space *mapping = META_MAPPING(sbi);
91 struct f2fs_nm_info *nm_i = NM_I(sbi);
92 struct page *page;
93 pgoff_t index;
94 int i;
95 struct f2fs_io_info fio = {
96 .type = META,
97 .rw = READ_SYNC | REQ_META | REQ_PRIO
98 };
99
100
101 for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
102 if (unlikely(nid >= nm_i->max_nid))
103 nid = 0;
104 index = current_nat_addr(sbi, nid);
105
106 page = grab_cache_page(mapping, index);
107 if (!page)
108 continue;
109 if (PageUptodate(page)) {
110 mark_page_accessed(page);
111 f2fs_put_page(page, 1);
112 continue;
113 }
114 f2fs_submit_page_mbio(sbi, page, index, &fio);
115 mark_page_accessed(page);
116 f2fs_put_page(page, 0);
117 }
118 f2fs_submit_merged_bio(sbi, META, READ);
119}
120
121static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) 103static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
122{ 104{
123 return radix_tree_lookup(&nm_i->nat_root, n); 105 return radix_tree_lookup(&nm_i->nat_root, n);
@@ -151,6 +133,20 @@ int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
151 return is_cp; 133 return is_cp;
152} 134}
153 135
136bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid)
137{
138 struct f2fs_nm_info *nm_i = NM_I(sbi);
139 struct nat_entry *e;
140 bool fsync_done = false;
141
142 read_lock(&nm_i->nat_tree_lock);
143 e = __lookup_nat_cache(nm_i, nid);
144 if (e)
145 fsync_done = e->fsync_done;
146 read_unlock(&nm_i->nat_tree_lock);
147 return fsync_done;
148}
149
154static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) 150static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
155{ 151{
156 struct nat_entry *new; 152 struct nat_entry *new;
@@ -164,6 +160,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
164 } 160 }
165 memset(new, 0, sizeof(struct nat_entry)); 161 memset(new, 0, sizeof(struct nat_entry));
166 nat_set_nid(new, nid); 162 nat_set_nid(new, nid);
163 new->checkpointed = true;
167 list_add_tail(&new->list, &nm_i->nat_entries); 164 list_add_tail(&new->list, &nm_i->nat_entries);
168 nm_i->nat_cnt++; 165 nm_i->nat_cnt++;
169 return new; 166 return new;
@@ -185,13 +182,12 @@ retry:
185 nat_set_blkaddr(e, le32_to_cpu(ne->block_addr)); 182 nat_set_blkaddr(e, le32_to_cpu(ne->block_addr));
186 nat_set_ino(e, le32_to_cpu(ne->ino)); 183 nat_set_ino(e, le32_to_cpu(ne->ino));
187 nat_set_version(e, ne->version); 184 nat_set_version(e, ne->version);
188 e->checkpointed = true;
189 } 185 }
190 write_unlock(&nm_i->nat_tree_lock); 186 write_unlock(&nm_i->nat_tree_lock);
191} 187}
192 188
193static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, 189static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
194 block_t new_blkaddr) 190 block_t new_blkaddr, bool fsync_done)
195{ 191{
196 struct f2fs_nm_info *nm_i = NM_I(sbi); 192 struct f2fs_nm_info *nm_i = NM_I(sbi);
197 struct nat_entry *e; 193 struct nat_entry *e;
@@ -205,7 +201,6 @@ retry:
205 goto retry; 201 goto retry;
206 } 202 }
207 e->ni = *ni; 203 e->ni = *ni;
208 e->checkpointed = true;
209 f2fs_bug_on(ni->blk_addr == NEW_ADDR); 204 f2fs_bug_on(ni->blk_addr == NEW_ADDR);
210 } else if (new_blkaddr == NEW_ADDR) { 205 } else if (new_blkaddr == NEW_ADDR) {
211 /* 206 /*
@@ -217,9 +212,6 @@ retry:
217 f2fs_bug_on(ni->blk_addr != NULL_ADDR); 212 f2fs_bug_on(ni->blk_addr != NULL_ADDR);
218 } 213 }
219 214
220 if (new_blkaddr == NEW_ADDR)
221 e->checkpointed = false;
222
223 /* sanity check */ 215 /* sanity check */
224 f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr); 216 f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr);
225 f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR && 217 f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR &&
@@ -239,6 +231,11 @@ retry:
239 /* change address */ 231 /* change address */
240 nat_set_blkaddr(e, new_blkaddr); 232 nat_set_blkaddr(e, new_blkaddr);
241 __set_nat_cache_dirty(nm_i, e); 233 __set_nat_cache_dirty(nm_i, e);
234
235 /* update fsync_mark if its inode nat entry is still alive */
236 e = __lookup_nat_cache(nm_i, ni->ino);
237 if (e)
238 e->fsync_done = fsync_done;
242 write_unlock(&nm_i->nat_tree_lock); 239 write_unlock(&nm_i->nat_tree_lock);
243} 240}
244 241
@@ -246,7 +243,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
246{ 243{
247 struct f2fs_nm_info *nm_i = NM_I(sbi); 244 struct f2fs_nm_info *nm_i = NM_I(sbi);
248 245
249 if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD) 246 if (available_free_memory(nm_i, NAT_ENTRIES))
250 return 0; 247 return 0;
251 248
252 write_lock(&nm_i->nat_tree_lock); 249 write_lock(&nm_i->nat_tree_lock);
@@ -505,7 +502,7 @@ static void truncate_node(struct dnode_of_data *dn)
505 /* Deallocate node address */ 502 /* Deallocate node address */
506 invalidate_blocks(sbi, ni.blk_addr); 503 invalidate_blocks(sbi, ni.blk_addr);
507 dec_valid_node_count(sbi, dn->inode); 504 dec_valid_node_count(sbi, dn->inode);
508 set_node_addr(sbi, &ni, NULL_ADDR); 505 set_node_addr(sbi, &ni, NULL_ADDR, false);
509 506
510 if (dn->nid == dn->inode->i_ino) { 507 if (dn->nid == dn->inode->i_ino) {
511 remove_orphan_inode(sbi, dn->nid); 508 remove_orphan_inode(sbi, dn->nid);
@@ -763,7 +760,7 @@ skip_partial:
763 f2fs_put_page(page, 1); 760 f2fs_put_page(page, 1);
764 goto restart; 761 goto restart;
765 } 762 }
766 wait_on_page_writeback(page); 763 f2fs_wait_on_page_writeback(page, NODE);
767 ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; 764 ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
768 set_page_dirty(page); 765 set_page_dirty(page);
769 unlock_page(page); 766 unlock_page(page);
@@ -852,7 +849,8 @@ struct page *new_node_page(struct dnode_of_data *dn,
852 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) 849 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
853 return ERR_PTR(-EPERM); 850 return ERR_PTR(-EPERM);
854 851
855 page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); 852 page = grab_cache_page_write_begin(NODE_MAPPING(sbi),
853 dn->nid, AOP_FLAG_NOFS);
856 if (!page) 854 if (!page)
857 return ERR_PTR(-ENOMEM); 855 return ERR_PTR(-ENOMEM);
858 856
@@ -867,14 +865,14 @@ struct page *new_node_page(struct dnode_of_data *dn,
867 f2fs_bug_on(old_ni.blk_addr != NULL_ADDR); 865 f2fs_bug_on(old_ni.blk_addr != NULL_ADDR);
868 new_ni = old_ni; 866 new_ni = old_ni;
869 new_ni.ino = dn->inode->i_ino; 867 new_ni.ino = dn->inode->i_ino;
870 set_node_addr(sbi, &new_ni, NEW_ADDR); 868 set_node_addr(sbi, &new_ni, NEW_ADDR, false);
871 869
872 fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); 870 fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
873 set_cold_node(dn->inode, page); 871 set_cold_node(dn->inode, page);
874 SetPageUptodate(page); 872 SetPageUptodate(page);
875 set_page_dirty(page); 873 set_page_dirty(page);
876 874
877 if (ofs == XATTR_NODE_OFFSET) 875 if (f2fs_has_xattr_block(ofs))
878 F2FS_I(dn->inode)->i_xattr_nid = dn->nid; 876 F2FS_I(dn->inode)->i_xattr_nid = dn->nid;
879 877
880 dn->node_page = page; 878 dn->node_page = page;
@@ -948,7 +946,8 @@ struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
948 struct page *page; 946 struct page *page;
949 int err; 947 int err;
950repeat: 948repeat:
951 page = grab_cache_page(NODE_MAPPING(sbi), nid); 949 page = grab_cache_page_write_begin(NODE_MAPPING(sbi),
950 nid, AOP_FLAG_NOFS);
952 if (!page) 951 if (!page)
953 return ERR_PTR(-ENOMEM); 952 return ERR_PTR(-ENOMEM);
954 953
@@ -959,7 +958,7 @@ repeat:
959 goto got_it; 958 goto got_it;
960 959
961 lock_page(page); 960 lock_page(page);
962 if (unlikely(!PageUptodate(page))) { 961 if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
963 f2fs_put_page(page, 1); 962 f2fs_put_page(page, 1);
964 return ERR_PTR(-EIO); 963 return ERR_PTR(-EIO);
965 } 964 }
@@ -968,7 +967,6 @@ repeat:
968 goto repeat; 967 goto repeat;
969 } 968 }
970got_it: 969got_it:
971 f2fs_bug_on(nid != nid_of_node(page));
972 mark_page_accessed(page); 970 mark_page_accessed(page);
973 return page; 971 return page;
974} 972}
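
The two hunks above fold the nid sanity check into the I/O error path: a page whose footer nid does not match the requested nid is now treated like a read failure (-EIO) instead of tripping f2fs_bug_on() after the lookup. A minimal userspace sketch of the same defensive pattern; all names here are illustrative, not f2fs APIs:

    #include <errno.h>
    #include <stdio.h>

    struct cached_node {
        unsigned int nid;   /* identity stamped into the object */
        int uptodate;
    };

    static int get_cached_node(struct cached_node *n, unsigned int want)
    {
        /* corruption or a stale read shows up as a mismatched id */
        if (!n->uptodate || n->nid != want)
            return -EIO;    /* report an error, don't crash the caller */
        return 0;
    }

    int main(void)
    {
        struct cached_node n = { .nid = 7, .uptodate = 1 };

        printf("lookup 7 -> %d\n", get_cached_node(&n, 7));  /* 0 */
        printf("lookup 9 -> %d\n", get_cached_node(&n, 9));  /* -5 (-EIO) */
        return 0;
    }
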
@@ -1168,7 +1166,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1168 continue; 1166 continue;
1169 1167
1170 if (ino && ino_of_node(page) == ino) { 1168 if (ino && ino_of_node(page) == ino) {
1171 wait_on_page_writeback(page); 1169 f2fs_wait_on_page_writeback(page, NODE);
1172 if (TestClearPageError(page)) 1170 if (TestClearPageError(page))
1173 ret = -EIO; 1171 ret = -EIO;
1174 } 1172 }
@@ -1201,7 +1199,7 @@ static int f2fs_write_node_page(struct page *page,
1201 if (unlikely(sbi->por_doing)) 1199 if (unlikely(sbi->por_doing))
1202 goto redirty_out; 1200 goto redirty_out;
1203 1201
1204 wait_on_page_writeback(page); 1202 f2fs_wait_on_page_writeback(page, NODE);
1205 1203
1206 /* get old block addr of this node page */ 1204 /* get old block addr of this node page */
1207 nid = nid_of_node(page); 1205 nid = nid_of_node(page);
@@ -1222,7 +1220,7 @@ static int f2fs_write_node_page(struct page *page,
1222 mutex_lock(&sbi->node_write); 1220 mutex_lock(&sbi->node_write);
1223 set_page_writeback(page); 1221 set_page_writeback(page);
1224 write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); 1222 write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
1225 set_node_addr(sbi, &ni, new_addr); 1223 set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page));
1226 dec_page_count(sbi, F2FS_DIRTY_NODES); 1224 dec_page_count(sbi, F2FS_DIRTY_NODES);
1227 mutex_unlock(&sbi->node_write); 1225 mutex_unlock(&sbi->node_write);
1228 unlock_page(page); 1226 unlock_page(page);
@@ -1231,35 +1229,32 @@ static int f2fs_write_node_page(struct page *page,
1231redirty_out: 1229redirty_out:
1232 dec_page_count(sbi, F2FS_DIRTY_NODES); 1230 dec_page_count(sbi, F2FS_DIRTY_NODES);
1233 wbc->pages_skipped++; 1231 wbc->pages_skipped++;
1232 account_page_redirty(page);
1234 set_page_dirty(page); 1233 set_page_dirty(page);
1235 return AOP_WRITEPAGE_ACTIVATE; 1234 return AOP_WRITEPAGE_ACTIVATE;
1236} 1235}
1237 1236
1238/*
1239 * It is very important to gather dirty pages and write at once, so that we can
1240 * submit a big bio without interfering other data writes.
1241 * Be default, 512 pages (2MB) * 3 node types, is more reasonable.
1242 */
1243#define COLLECT_DIRTY_NODES 1536
1244static int f2fs_write_node_pages(struct address_space *mapping, 1237static int f2fs_write_node_pages(struct address_space *mapping,
1245 struct writeback_control *wbc) 1238 struct writeback_control *wbc)
1246{ 1239{
1247 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); 1240 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
1248 long nr_to_write = wbc->nr_to_write; 1241 long diff;
1249 1242
1250 /* balancing f2fs's metadata in background */ 1243 /* balancing f2fs's metadata in background */
1251 f2fs_balance_fs_bg(sbi); 1244 f2fs_balance_fs_bg(sbi);
1252 1245
1253 /* collect a number of dirty node pages and write together */ 1246 /* collect a number of dirty node pages and write together */
1254 if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) 1247 if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
1255 return 0; 1248 goto skip_write;
1256 1249
1257 /* if mounting is failed, skip writing node pages */ 1250 diff = nr_pages_to_write(sbi, NODE, wbc);
1258 wbc->nr_to_write = 3 * max_hw_blocks(sbi);
1259 wbc->sync_mode = WB_SYNC_NONE; 1251 wbc->sync_mode = WB_SYNC_NONE;
1260 sync_node_pages(sbi, 0, wbc); 1252 sync_node_pages(sbi, 0, wbc);
1261 wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) - 1253 wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
1262 wbc->nr_to_write); 1254 return 0;
1255
1256skip_write:
1257 wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
1263 return 0; 1258 return 0;
1264} 1259}
1265 1260
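
The rewritten f2fs_write_node_pages() drops the hard-coded COLLECT_DIRTY_NODES threshold in favor of the nr_pages_to_skip()/nr_pages_to_write() helpers added to segment.h later in this patch: skip writeback entirely while too few node pages are dirty, otherwise run it with a segment-aligned budget and repay the borrowed difference afterwards. A toy model of the budget arithmetic, with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
        long nr_to_write = 1000;            /* caller's original budget */
        const long desired = 3 * 512;       /* segment-aligned node batch */

        long diff = desired - nr_to_write;  /* borrowed budget: 536 */
        nr_to_write = desired;              /* writeback sees the full batch */

        nr_to_write -= 1500;                /* pretend 1500 pages got written */

        nr_to_write -= diff;                /* repay what was borrowed */
        if (nr_to_write < 0)                /* max((long)0, ...) in the hunk */
            nr_to_write = 0;

        printf("budget handed back: %ld\n", nr_to_write);  /* 0 */
        return 0;
    }
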
@@ -1307,22 +1302,17 @@ const struct address_space_operations f2fs_node_aops = {
1307 .releasepage = f2fs_release_node_page, 1302 .releasepage = f2fs_release_node_page,
1308}; 1303};
1309 1304
1310static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head) 1305static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
1306 nid_t n)
1311{ 1307{
1312 struct list_head *this; 1308 return radix_tree_lookup(&nm_i->free_nid_root, n);
1313 struct free_nid *i;
1314 list_for_each(this, head) {
1315 i = list_entry(this, struct free_nid, list);
1316 if (i->nid == n)
1317 return i;
1318 }
1319 return NULL;
1320} 1309}
1321 1310
1322static void __del_from_free_nid_list(struct free_nid *i) 1311static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i,
1312 struct free_nid *i)
1323{ 1313{
1324 list_del(&i->list); 1314 list_del(&i->list);
1325 kmem_cache_free(free_nid_slab, i); 1315 radix_tree_delete(&nm_i->free_nid_root, i->nid);
1326} 1316}
1327 1317
1328static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build) 1318static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
@@ -1331,7 +1321,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
1331 struct nat_entry *ne; 1321 struct nat_entry *ne;
1332 bool allocated = false; 1322 bool allocated = false;
1333 1323
1334 if (nm_i->fcnt > 2 * MAX_FREE_NIDS) 1324 if (!available_free_memory(nm_i, FREE_NIDS))
1335 return -1; 1325 return -1;
1336 1326
1337 /* 0 nid should not be used */ 1327 /* 0 nid should not be used */
@@ -1342,7 +1332,8 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
1342 /* do not add allocated nids */ 1332 /* do not add allocated nids */
1343 read_lock(&nm_i->nat_tree_lock); 1333 read_lock(&nm_i->nat_tree_lock);
1344 ne = __lookup_nat_cache(nm_i, nid); 1334 ne = __lookup_nat_cache(nm_i, nid);
1345 if (ne && nat_get_blkaddr(ne) != NULL_ADDR) 1335 if (ne &&
1336 (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR))
1346 allocated = true; 1337 allocated = true;
1347 read_unlock(&nm_i->nat_tree_lock); 1338 read_unlock(&nm_i->nat_tree_lock);
1348 if (allocated) 1339 if (allocated)
@@ -1354,7 +1345,7 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
1354 i->state = NID_NEW; 1345 i->state = NID_NEW;
1355 1346
1356 spin_lock(&nm_i->free_nid_list_lock); 1347 spin_lock(&nm_i->free_nid_list_lock);
1357 if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) { 1348 if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) {
1358 spin_unlock(&nm_i->free_nid_list_lock); 1349 spin_unlock(&nm_i->free_nid_list_lock);
1359 kmem_cache_free(free_nid_slab, i); 1350 kmem_cache_free(free_nid_slab, i);
1360 return 0; 1351 return 0;
@@ -1368,13 +1359,19 @@ static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
1368static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) 1359static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
1369{ 1360{
1370 struct free_nid *i; 1361 struct free_nid *i;
1362 bool need_free = false;
1363
1371 spin_lock(&nm_i->free_nid_list_lock); 1364 spin_lock(&nm_i->free_nid_list_lock);
1372 i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); 1365 i = __lookup_free_nid_list(nm_i, nid);
1373 if (i && i->state == NID_NEW) { 1366 if (i && i->state == NID_NEW) {
1374 __del_from_free_nid_list(i); 1367 __del_from_free_nid_list(nm_i, i);
1375 nm_i->fcnt--; 1368 nm_i->fcnt--;
1369 need_free = true;
1376 } 1370 }
1377 spin_unlock(&nm_i->free_nid_list_lock); 1371 spin_unlock(&nm_i->free_nid_list_lock);
1372
1373 if (need_free)
1374 kmem_cache_free(free_nid_slab, i);
1378} 1375}
1379 1376
1380static void scan_nat_page(struct f2fs_nm_info *nm_i, 1377static void scan_nat_page(struct f2fs_nm_info *nm_i,
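
remove_free_nid() (and alloc_nid_done()/alloc_nid_failed() below) now unlink the entry under free_nid_list_lock but defer kmem_cache_free() until the lock is dropped; calling into the allocator while holding a spinlock is best avoided, and the radix tree delete already lengthens the critical section. A compilable userspace sketch of the defer-free pattern, with pthread spinlocks and malloc/free standing in for the kernel primitives:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_spinlock_t lock;

    struct free_nid { unsigned int nid; int state; };

    static struct free_nid *table[16];       /* toy lookup structure */

    static void remove_free_nid(unsigned int nid)
    {
        struct free_nid *i;
        int need_free = 0;

        pthread_spin_lock(&lock);
        i = table[nid % 16];
        if (i && i->nid == nid) {
            table[nid % 16] = NULL;          /* unlink under the lock */
            need_free = 1;                   /* ...but free later */
        }
        pthread_spin_unlock(&lock);

        if (need_free)                       /* allocator call happens */
            free(i);                         /* outside the spinlock */
    }

    int main(void)
    {
        pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
        struct free_nid *n = malloc(sizeof(*n));
        n->nid = 3;
        n->state = 0;
        table[3] = n;
        remove_free_nid(3);
        printf("slot empty: %s\n", table[3] ? "no" : "yes");
        return 0;
    }
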
@@ -1413,7 +1410,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
1413 return; 1410 return;
1414 1411
1415 /* readahead nat pages to be scanned */ 1412 /* readahead nat pages to be scanned */
1416 ra_nat_pages(sbi, nid); 1413 ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT);
1417 1414
1418 while (1) { 1415 while (1) {
1419 struct page *page = get_current_nat_page(sbi, nid); 1416 struct page *page = get_current_nat_page(sbi, nid);
@@ -1454,7 +1451,6 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
1454{ 1451{
1455 struct f2fs_nm_info *nm_i = NM_I(sbi); 1452 struct f2fs_nm_info *nm_i = NM_I(sbi);
1456 struct free_nid *i = NULL; 1453 struct free_nid *i = NULL;
1457 struct list_head *this;
1458retry: 1454retry:
1459 if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid)) 1455 if (unlikely(sbi->total_valid_node_count + 1 >= nm_i->max_nid))
1460 return false; 1456 return false;
@@ -1462,13 +1458,11 @@ retry:
1462 spin_lock(&nm_i->free_nid_list_lock); 1458 spin_lock(&nm_i->free_nid_list_lock);
1463 1459
1464 /* We should not use stale free nids created by build_free_nids */ 1460 /* We should not use stale free nids created by build_free_nids */
1465 if (nm_i->fcnt && !sbi->on_build_free_nids) { 1461 if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
1466 f2fs_bug_on(list_empty(&nm_i->free_nid_list)); 1462 f2fs_bug_on(list_empty(&nm_i->free_nid_list));
1467 list_for_each(this, &nm_i->free_nid_list) { 1463 list_for_each_entry(i, &nm_i->free_nid_list, list)
1468 i = list_entry(this, struct free_nid, list);
1469 if (i->state == NID_NEW) 1464 if (i->state == NID_NEW)
1470 break; 1465 break;
1471 }
1472 1466
1473 f2fs_bug_on(i->state != NID_NEW); 1467 f2fs_bug_on(i->state != NID_NEW);
1474 *nid = i->nid; 1468 *nid = i->nid;
@@ -1481,9 +1475,7 @@ retry:
1481 1475
1482 /* Let's scan nat pages and its caches to get free nids */ 1476 /* Let's scan nat pages and its caches to get free nids */
1483 mutex_lock(&nm_i->build_lock); 1477 mutex_lock(&nm_i->build_lock);
1484 sbi->on_build_free_nids = true;
1485 build_free_nids(sbi); 1478 build_free_nids(sbi);
1486 sbi->on_build_free_nids = false;
1487 mutex_unlock(&nm_i->build_lock); 1479 mutex_unlock(&nm_i->build_lock);
1488 goto retry; 1480 goto retry;
1489} 1481}
@@ -1497,10 +1489,12 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
1497 struct free_nid *i; 1489 struct free_nid *i;
1498 1490
1499 spin_lock(&nm_i->free_nid_list_lock); 1491 spin_lock(&nm_i->free_nid_list_lock);
1500 i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); 1492 i = __lookup_free_nid_list(nm_i, nid);
1501 f2fs_bug_on(!i || i->state != NID_ALLOC); 1493 f2fs_bug_on(!i || i->state != NID_ALLOC);
1502 __del_from_free_nid_list(i); 1494 __del_from_free_nid_list(nm_i, i);
1503 spin_unlock(&nm_i->free_nid_list_lock); 1495 spin_unlock(&nm_i->free_nid_list_lock);
1496
1497 kmem_cache_free(free_nid_slab, i);
1504} 1498}
1505 1499
1506/* 1500/*
@@ -1510,20 +1504,25 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
1510{ 1504{
1511 struct f2fs_nm_info *nm_i = NM_I(sbi); 1505 struct f2fs_nm_info *nm_i = NM_I(sbi);
1512 struct free_nid *i; 1506 struct free_nid *i;
1507 bool need_free = false;
1513 1508
1514 if (!nid) 1509 if (!nid)
1515 return; 1510 return;
1516 1511
1517 spin_lock(&nm_i->free_nid_list_lock); 1512 spin_lock(&nm_i->free_nid_list_lock);
1518 i = __lookup_free_nid_list(nid, &nm_i->free_nid_list); 1513 i = __lookup_free_nid_list(nm_i, nid);
1519 f2fs_bug_on(!i || i->state != NID_ALLOC); 1514 f2fs_bug_on(!i || i->state != NID_ALLOC);
1520 if (nm_i->fcnt > 2 * MAX_FREE_NIDS) { 1515 if (!available_free_memory(nm_i, FREE_NIDS)) {
1521 __del_from_free_nid_list(i); 1516 __del_from_free_nid_list(nm_i, i);
1517 need_free = true;
1522 } else { 1518 } else {
1523 i->state = NID_NEW; 1519 i->state = NID_NEW;
1524 nm_i->fcnt++; 1520 nm_i->fcnt++;
1525 } 1521 }
1526 spin_unlock(&nm_i->free_nid_list_lock); 1522 spin_unlock(&nm_i->free_nid_list_lock);
1523
1524 if (need_free)
1525 kmem_cache_free(free_nid_slab, i);
1527} 1526}
1528 1527
1529void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, 1528void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
@@ -1531,10 +1530,83 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
1531 block_t new_blkaddr) 1530 block_t new_blkaddr)
1532{ 1531{
1533 rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr); 1532 rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
1534 set_node_addr(sbi, ni, new_blkaddr); 1533 set_node_addr(sbi, ni, new_blkaddr, false);
1535 clear_node_page_dirty(page); 1534 clear_node_page_dirty(page);
1536} 1535}
1537 1536
1537void recover_inline_xattr(struct inode *inode, struct page *page)
1538{
1539 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1540 void *src_addr, *dst_addr;
1541 size_t inline_size;
1542 struct page *ipage;
1543 struct f2fs_inode *ri;
1544
1545 if (!f2fs_has_inline_xattr(inode))
1546 return;
1547
1548 if (!IS_INODE(page))
1549 return;
1550
1551 ri = F2FS_INODE(page);
1552 if (!(ri->i_inline & F2FS_INLINE_XATTR))
1553 return;
1554
1555 ipage = get_node_page(sbi, inode->i_ino);
1556 f2fs_bug_on(IS_ERR(ipage));
1557
1558 dst_addr = inline_xattr_addr(ipage);
1559 src_addr = inline_xattr_addr(page);
1560 inline_size = inline_xattr_size(inode);
1561
1562 memcpy(dst_addr, src_addr, inline_size);
1563
1564 update_inode(inode, ipage);
1565 f2fs_put_page(ipage, 1);
1566}
1567
1568bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
1569{
1570 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1571 nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
1572 nid_t new_xnid = nid_of_node(page);
1573 struct node_info ni;
1574
1575 recover_inline_xattr(inode, page);
1576
1577 if (!f2fs_has_xattr_block(ofs_of_node(page)))
1578 return false;
1579
1580 /* 1: invalidate the previous xattr nid */
1581 if (!prev_xnid)
1582 goto recover_xnid;
1583
1584 /* Deallocate node address */
1585 get_node_info(sbi, prev_xnid, &ni);
1586 f2fs_bug_on(ni.blk_addr == NULL_ADDR);
1587 invalidate_blocks(sbi, ni.blk_addr);
1588 dec_valid_node_count(sbi, inode);
1589 set_node_addr(sbi, &ni, NULL_ADDR, false);
1590
1591recover_xnid:
1592 /* 2: allocate new xattr nid */
1593 if (unlikely(!inc_valid_node_count(sbi, inode)))
1594 f2fs_bug_on(1);
1595
1596 remove_free_nid(NM_I(sbi), new_xnid);
1597 get_node_info(sbi, new_xnid, &ni);
1598 ni.ino = inode->i_ino;
1599 set_node_addr(sbi, &ni, NEW_ADDR, false);
1600 F2FS_I(inode)->i_xattr_nid = new_xnid;
1601
1602 /* 3: update xattr blkaddr */
1603 refresh_sit_entry(sbi, NEW_ADDR, blkaddr);
1604 set_node_addr(sbi, &ni, blkaddr, false);
1605
1606 update_inode_page(inode);
1607 return true;
1608}
1609
1538int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) 1610int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1539{ 1611{
1540 struct f2fs_inode *src, *dst; 1612 struct f2fs_inode *src, *dst;
@@ -1567,7 +1639,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1567 1639
1568 if (unlikely(!inc_valid_node_count(sbi, NULL))) 1640 if (unlikely(!inc_valid_node_count(sbi, NULL)))
1569 WARN_ON(1); 1641 WARN_ON(1);
1570 set_node_addr(sbi, &new_ni, NEW_ADDR); 1642 set_node_addr(sbi, &new_ni, NEW_ADDR, false);
1571 inc_valid_inode_count(sbi); 1643 inc_valid_inode_count(sbi);
1572 f2fs_put_page(ipage, 1); 1644 f2fs_put_page(ipage, 1);
1573 return 0; 1645 return 0;
@@ -1590,15 +1662,8 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages,
1590 for (; page_idx < start + nrpages; page_idx++) { 1662 for (; page_idx < start + nrpages; page_idx++) {
1591 /* alloc temporal page for read node summary info*/ 1663 /* alloc temporal page for read node summary info*/
1592 page = alloc_page(GFP_F2FS_ZERO); 1664 page = alloc_page(GFP_F2FS_ZERO);
1593 if (!page) { 1665 if (!page)
1594 struct page *tmp; 1666 break;
1595 list_for_each_entry_safe(page, tmp, pages, lru) {
1596 list_del(&page->lru);
1597 unlock_page(page);
1598 __free_pages(page, 0);
1599 }
1600 return -ENOMEM;
1601 }
1602 1667
1603 lock_page(page); 1668 lock_page(page);
1604 page->index = page_idx; 1669 page->index = page_idx;
@@ -1609,7 +1674,8 @@ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct list_head *pages,
1609 f2fs_submit_page_mbio(sbi, page, page->index, &fio); 1674 f2fs_submit_page_mbio(sbi, page, page->index, &fio);
1610 1675
1611 f2fs_submit_merged_bio(sbi, META, READ); 1676 f2fs_submit_merged_bio(sbi, META, READ);
1612 return 0; 1677
1678 return page_idx - start;
1613} 1679}
1614 1680
1615int restore_node_summary(struct f2fs_sb_info *sbi, 1681int restore_node_summary(struct f2fs_sb_info *sbi,
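
ra_sum_pages() changes contract here: instead of unwinding everything and returning -ENOMEM on the first failed alloc_page(), it stops early and returns the number of pages actually queued, letting restore_node_summary() consume a partial batch. A toy version of the new contract:

    #include <stdio.h>
    #include <stdlib.h>

    /* allocate up to nrpages buffers, stop at the first failure, and
     * report how many were set up so the caller can use a partial batch */
    static int alloc_batch(void **bufs, int nrpages)
    {
        int i;

        for (i = 0; i < nrpages; i++) {
            bufs[i] = malloc(4096);
            if (!bufs[i])
                break;          /* partial success, not an error */
        }
        return i;               /* 0 is when the caller returns -ENOMEM */
    }

    int main(void)
    {
        void *bufs[8];
        int got = alloc_batch(bufs, 8);

        printf("got %d of 8 pages\n", got);
        while (got--)
            free(bufs[got]);
        return 0;
    }
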
@@ -1628,15 +1694,17 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
1628 addr = START_BLOCK(sbi, segno); 1694 addr = START_BLOCK(sbi, segno);
1629 sum_entry = &sum->entries[0]; 1695 sum_entry = &sum->entries[0];
1630 1696
1631 for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { 1697 for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) {
1632 nrpages = min(last_offset - i, bio_blocks); 1698 nrpages = min(last_offset - i, bio_blocks);
1633 1699
1634 /* read ahead node pages */ 1700 /* read ahead node pages */
1635 err = ra_sum_pages(sbi, &page_list, addr, nrpages); 1701 nrpages = ra_sum_pages(sbi, &page_list, addr, nrpages);
1636 if (err) 1702 if (!nrpages)
1637 return err; 1703 return -ENOMEM;
1638 1704
1639 list_for_each_entry_safe(page, tmp, &page_list, lru) { 1705 list_for_each_entry_safe(page, tmp, &page_list, lru) {
1706 if (err)
1707 goto skip;
1640 1708
1641 lock_page(page); 1709 lock_page(page);
1642 if (unlikely(!PageUptodate(page))) { 1710 if (unlikely(!PageUptodate(page))) {
@@ -1648,9 +1716,9 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
1648 sum_entry->ofs_in_node = 0; 1716 sum_entry->ofs_in_node = 0;
1649 sum_entry++; 1717 sum_entry++;
1650 } 1718 }
1651
1652 list_del(&page->lru);
1653 unlock_page(page); 1719 unlock_page(page);
1720skip:
1721 list_del(&page->lru);
1654 __free_pages(page, 0); 1722 __free_pages(page, 0);
1655 } 1723 }
1656 } 1724 }
@@ -1709,7 +1777,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1709 struct f2fs_nm_info *nm_i = NM_I(sbi); 1777 struct f2fs_nm_info *nm_i = NM_I(sbi);
1710 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 1778 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1711 struct f2fs_summary_block *sum = curseg->sum_blk; 1779 struct f2fs_summary_block *sum = curseg->sum_blk;
1712 struct list_head *cur, *n; 1780 struct nat_entry *ne, *cur;
1713 struct page *page = NULL; 1781 struct page *page = NULL;
1714 struct f2fs_nat_block *nat_blk = NULL; 1782 struct f2fs_nat_block *nat_blk = NULL;
1715 nid_t start_nid = 0, end_nid = 0; 1783 nid_t start_nid = 0, end_nid = 0;
@@ -1721,18 +1789,17 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1721 mutex_lock(&curseg->curseg_mutex); 1789 mutex_lock(&curseg->curseg_mutex);
1722 1790
1723 /* 1) flush dirty nat caches */ 1791 /* 1) flush dirty nat caches */
1724 list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) { 1792 list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) {
1725 struct nat_entry *ne;
1726 nid_t nid; 1793 nid_t nid;
1727 struct f2fs_nat_entry raw_ne; 1794 struct f2fs_nat_entry raw_ne;
1728 int offset = -1; 1795 int offset = -1;
1729 block_t new_blkaddr; 1796 block_t new_blkaddr;
1730 1797
1731 ne = list_entry(cur, struct nat_entry, list);
1732 nid = nat_get_nid(ne);
1733
1734 if (nat_get_blkaddr(ne) == NEW_ADDR) 1798 if (nat_get_blkaddr(ne) == NEW_ADDR)
1735 continue; 1799 continue;
1800
1801 nid = nat_get_nid(ne);
1802
1736 if (flushed) 1803 if (flushed)
1737 goto to_nat_page; 1804 goto to_nat_page;
1738 1805
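
This hunk is one of several in the patch converting open-coded list_for_each()+list_entry() loops to list_for_each_entry_safe(), which fetches the next node before the body runs and so tolerates deleting the current entry. A freestanding miniature of the macros involved (kernel names, toy implementation; it relies on the GCC typeof extension, as the kernel does):

    #include <stddef.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct list_head { struct list_head *next, *prev; };

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))
    #define list_entry(ptr, type, member) container_of(ptr, type, member)
    #define list_for_each_entry_safe(pos, n, head, member)               \
        for (pos = list_entry((head)->next, typeof(*pos), member),       \
             n = list_entry(pos->member.next, typeof(*pos), member);     \
             &pos->member != (head);                                     \
             pos = n, n = list_entry(n->member.next, typeof(*n), member))

    static void list_add_tail(struct list_head *e, struct list_head *h)
    {
        e->prev = h->prev;
        e->next = h;
        h->prev->next = e;
        h->prev = e;
    }

    static void list_del(struct list_head *e)
    {
        e->prev->next = e->next;
        e->next->prev = e->prev;
    }

    struct nat_entry { struct list_head list; int nid; };

    int main(void)
    {
        struct list_head head = { &head, &head };
        struct nat_entry *ne, *cur;
        int i;

        for (i = 0; i < 3; i++) {
            ne = malloc(sizeof(*ne));
            ne->nid = i;
            list_add_tail(&ne->list, &head);
        }
        /* deleting 'ne' is safe because 'cur' was fetched beforehand */
        list_for_each_entry_safe(ne, cur, &head, list) {
            printf("flushing nid %d\n", ne->nid);
            list_del(&ne->list);
            free(ne);
        }
        return 0;
    }
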
@@ -1783,16 +1850,12 @@ flush_now:
1783 } else { 1850 } else {
1784 write_lock(&nm_i->nat_tree_lock); 1851 write_lock(&nm_i->nat_tree_lock);
1785 __clear_nat_cache_dirty(nm_i, ne); 1852 __clear_nat_cache_dirty(nm_i, ne);
1786 ne->checkpointed = true;
1787 write_unlock(&nm_i->nat_tree_lock); 1853 write_unlock(&nm_i->nat_tree_lock);
1788 } 1854 }
1789 } 1855 }
1790 if (!flushed) 1856 if (!flushed)
1791 mutex_unlock(&curseg->curseg_mutex); 1857 mutex_unlock(&curseg->curseg_mutex);
1792 f2fs_put_page(page, 1); 1858 f2fs_put_page(page, 1);
1793
1794 /* 2) shrink nat caches if necessary */
1795 try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD);
1796} 1859}
1797 1860
1798static int init_node_manager(struct f2fs_sb_info *sbi) 1861static int init_node_manager(struct f2fs_sb_info *sbi)
@@ -1807,10 +1870,14 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
1807 /* segment_count_nat includes pair segment so divide to 2. */ 1870 /* segment_count_nat includes pair segment so divide to 2. */
1808 nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; 1871 nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
1809 nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); 1872 nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
1810 nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; 1873
1874 /* not used nids: 0, node, meta, (and root counted as valid node) */
1875 nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks - 3;
1811 nm_i->fcnt = 0; 1876 nm_i->fcnt = 0;
1812 nm_i->nat_cnt = 0; 1877 nm_i->nat_cnt = 0;
1878 nm_i->ram_thresh = DEF_RAM_THRESHOLD;
1813 1879
1880 INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
1814 INIT_LIST_HEAD(&nm_i->free_nid_list); 1881 INIT_LIST_HEAD(&nm_i->free_nid_list);
1815 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); 1882 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
1816 INIT_LIST_HEAD(&nm_i->nat_entries); 1883 INIT_LIST_HEAD(&nm_i->nat_entries);
@@ -1864,8 +1931,11 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
1864 spin_lock(&nm_i->free_nid_list_lock); 1931 spin_lock(&nm_i->free_nid_list_lock);
1865 list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { 1932 list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
1866 f2fs_bug_on(i->state == NID_ALLOC); 1933 f2fs_bug_on(i->state == NID_ALLOC);
1867 __del_from_free_nid_list(i); 1934 __del_from_free_nid_list(nm_i, i);
1868 nm_i->fcnt--; 1935 nm_i->fcnt--;
1936 spin_unlock(&nm_i->free_nid_list_lock);
1937 kmem_cache_free(free_nid_slab, i);
1938 spin_lock(&nm_i->free_nid_list_lock);
1869 } 1939 }
1870 f2fs_bug_on(nm_i->fcnt); 1940 f2fs_bug_on(nm_i->fcnt);
1871 spin_unlock(&nm_i->free_nid_list_lock); 1941 spin_unlock(&nm_i->free_nid_list_lock);
@@ -1875,11 +1945,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
1875 while ((found = __gang_lookup_nat_cache(nm_i, 1945 while ((found = __gang_lookup_nat_cache(nm_i,
1876 nid, NATVEC_SIZE, natvec))) { 1946 nid, NATVEC_SIZE, natvec))) {
1877 unsigned idx; 1947 unsigned idx;
1878 for (idx = 0; idx < found; idx++) { 1948 nid = nat_get_nid(natvec[found - 1]) + 1;
1879 struct nat_entry *e = natvec[idx]; 1949 for (idx = 0; idx < found; idx++)
1880 nid = nat_get_nid(e) + 1; 1950 __del_from_nat_cache(nm_i, natvec[idx]);
1881 __del_from_nat_cache(nm_i, e);
1882 }
1883 } 1951 }
1884 f2fs_bug_on(nm_i->nat_cnt); 1952 f2fs_bug_on(nm_i->nat_cnt);
1885 write_unlock(&nm_i->nat_tree_lock); 1953 write_unlock(&nm_i->nat_tree_lock);
@@ -1892,12 +1960,12 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
1892int __init create_node_manager_caches(void) 1960int __init create_node_manager_caches(void)
1893{ 1961{
1894 nat_entry_slab = f2fs_kmem_cache_create("nat_entry", 1962 nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
1895 sizeof(struct nat_entry), NULL); 1963 sizeof(struct nat_entry));
1896 if (!nat_entry_slab) 1964 if (!nat_entry_slab)
1897 return -ENOMEM; 1965 return -ENOMEM;
1898 1966
1899 free_nid_slab = f2fs_kmem_cache_create("free_nid", 1967 free_nid_slab = f2fs_kmem_cache_create("free_nid",
1900 sizeof(struct free_nid), NULL); 1968 sizeof(struct free_nid));
1901 if (!free_nid_slab) { 1969 if (!free_nid_slab) {
1902 kmem_cache_destroy(nat_entry_slab); 1970 kmem_cache_destroy(nat_entry_slab);
1903 return -ENOMEM; 1971 return -ENOMEM;
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index c4c79885c993..5decc1a375f0 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -17,14 +17,11 @@
17/* # of pages to perform readahead before building free nids */ 17/* # of pages to perform readahead before building free nids */
18#define FREE_NID_PAGES 4 18#define FREE_NID_PAGES 4
19 19
20/* maximum # of free node ids to produce during build_free_nids */
21#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES)
22
23/* maximum readahead size for node during getting data blocks */ 20/* maximum readahead size for node during getting data blocks */
24#define MAX_RA_NODE 128 21#define MAX_RA_NODE 128
25 22
26/* maximum cached nat entries to manage memory footprint */ 23/* control the memory footprint threshold (10MB per 1GB ram) */
27#define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK) 24#define DEF_RAM_THRESHOLD 10
28 25
29/* vector size for gang look-up from nat cache that consists of radix tree */ 26/* vector size for gang look-up from nat cache that consists of radix tree */
30#define NATVEC_SIZE 64 27#define NATVEC_SIZE 64
@@ -45,6 +42,7 @@ struct node_info {
45struct nat_entry { 42struct nat_entry {
46 struct list_head list; /* for clean or dirty nat list */ 43 struct list_head list; /* for clean or dirty nat list */
47 bool checkpointed; /* whether it is checkpointed or not */ 44 bool checkpointed; /* whether it is checkpointed or not */
45 bool fsync_done; /* whether the latest node has fsync mark */
48 struct node_info ni; /* in-memory node information */ 46 struct node_info ni; /* in-memory node information */
49}; 47};
50 48
@@ -58,9 +56,15 @@ struct nat_entry {
58#define nat_set_version(nat, v) (nat->ni.version = v) 56#define nat_set_version(nat, v) (nat->ni.version = v)
59 57
60#define __set_nat_cache_dirty(nm_i, ne) \ 58#define __set_nat_cache_dirty(nm_i, ne) \
61 list_move_tail(&ne->list, &nm_i->dirty_nat_entries); 59 do { \
60 ne->checkpointed = false; \
61 list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \
62 } while (0);
62#define __clear_nat_cache_dirty(nm_i, ne) \ 63#define __clear_nat_cache_dirty(nm_i, ne) \
63 list_move_tail(&ne->list, &nm_i->nat_entries); 64 do { \
65 ne->checkpointed = true; \
66 list_move_tail(&ne->list, &nm_i->nat_entries); \
67 } while (0);
64#define inc_node_version(version) (++version) 68#define inc_node_version(version) (++version)
65 69
66static inline void node_info_from_raw_nat(struct node_info *ni, 70static inline void node_info_from_raw_nat(struct node_info *ni,
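
__set_nat_cache_dirty()/__clear_nat_cache_dirty() grow a second statement, so their bodies get wrapped in do { ... } while (0) to remain a single statement at the call site. (Incidentally, the hunk keeps a semicolon after while (0), which would still misparse under an unbraced if/else; the call sites here are plain statements, so it is harmless.) A small demo of why the wrapper matters:

    #include <stdio.h>

    static int dirty_count;

    /* two statements with no wrapper: under an unbraced if, only the
     * first statement would be conditional */
    #define MARK_DIRTY_BAD(x)  (x)->flag = 1; dirty_count++
    /* the do/while(0) form consumes the caller's semicolon and stays
     * one statement, so if/else works as expected */
    #define MARK_DIRTY_GOOD(x) do { (x)->flag = 1; dirty_count++; } while (0)

    struct obj { int flag; };

    int main(void)
    {
        struct obj o = { 0 };

        if (o.flag == 0)
            MARK_DIRTY_GOOD(&o);
        else
            printf("already dirty\n");

        printf("flag=%d count=%d\n", o.flag, dirty_count);  /* 1 1 */
        return 0;
    }
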
@@ -71,6 +75,11 @@ static inline void node_info_from_raw_nat(struct node_info *ni,
71 ni->version = raw_ne->version; 75 ni->version = raw_ne->version;
72} 76}
73 77
78enum nid_type {
79 FREE_NIDS, /* indicates the free nid list */
80 NAT_ENTRIES /* indicates the cached nat entry */
81};
82
74/* 83/*
75 * For free nid mangement 84 * For free nid mangement
76 */ 85 */
@@ -236,7 +245,7 @@ static inline bool IS_DNODE(struct page *node_page)
236{ 245{
237 unsigned int ofs = ofs_of_node(node_page); 246 unsigned int ofs = ofs_of_node(node_page);
238 247
239 if (ofs == XATTR_NODE_OFFSET) 248 if (f2fs_has_xattr_block(ofs))
240 return false; 249 return false;
241 250
242 if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || 251 if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK ||
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 976a7a934db5..b1ae89f0f44e 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -27,14 +27,12 @@ bool space_for_roll_forward(struct f2fs_sb_info *sbi)
27static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, 27static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
28 nid_t ino) 28 nid_t ino)
29{ 29{
30 struct list_head *this;
31 struct fsync_inode_entry *entry; 30 struct fsync_inode_entry *entry;
32 31
33 list_for_each(this, head) { 32 list_for_each_entry(entry, head, list)
34 entry = list_entry(this, struct fsync_inode_entry, list);
35 if (entry->inode->i_ino == ino) 33 if (entry->inode->i_ino == ino)
36 return entry; 34 return entry;
37 } 35
38 return NULL; 36 return NULL;
39} 37}
40 38
@@ -136,7 +134,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
136 134
137 /* get node pages in the current segment */ 135 /* get node pages in the current segment */
138 curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); 136 curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
139 blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff; 137 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
140 138
141 /* read node page */ 139 /* read node page */
142 page = alloc_page(GFP_F2FS_ZERO); 140 page = alloc_page(GFP_F2FS_ZERO);
@@ -218,13 +216,12 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
218{ 216{
219 struct seg_entry *sentry; 217 struct seg_entry *sentry;
220 unsigned int segno = GET_SEGNO(sbi, blkaddr); 218 unsigned int segno = GET_SEGNO(sbi, blkaddr);
221 unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & 219 unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
222 (sbi->blocks_per_seg - 1); 220 struct f2fs_summary_block *sum_node;
223 struct f2fs_summary sum; 221 struct f2fs_summary sum;
222 struct page *sum_page, *node_page;
224 nid_t ino, nid; 223 nid_t ino, nid;
225 void *kaddr;
226 struct inode *inode; 224 struct inode *inode;
227 struct page *node_page;
228 unsigned int offset; 225 unsigned int offset;
229 block_t bidx; 226 block_t bidx;
230 int i; 227 int i;
@@ -238,18 +235,15 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
238 struct curseg_info *curseg = CURSEG_I(sbi, i); 235 struct curseg_info *curseg = CURSEG_I(sbi, i);
239 if (curseg->segno == segno) { 236 if (curseg->segno == segno) {
240 sum = curseg->sum_blk->entries[blkoff]; 237 sum = curseg->sum_blk->entries[blkoff];
241 break; 238 goto got_it;
242 } 239 }
243 } 240 }
244 if (i > CURSEG_COLD_DATA) {
245 struct page *sum_page = get_sum_page(sbi, segno);
246 struct f2fs_summary_block *sum_node;
247 kaddr = page_address(sum_page);
248 sum_node = (struct f2fs_summary_block *)kaddr;
249 sum = sum_node->entries[blkoff];
250 f2fs_put_page(sum_page, 1);
251 }
252 241
242 sum_page = get_sum_page(sbi, segno);
243 sum_node = (struct f2fs_summary_block *)page_address(sum_page);
244 sum = sum_node->entries[blkoff];
245 f2fs_put_page(sum_page, 1);
246got_it:
253 /* Use the locked dnode page and inode */ 247 /* Use the locked dnode page and inode */
254 nid = le32_to_cpu(sum.nid); 248 nid = le32_to_cpu(sum.nid);
255 if (dn->inode->i_ino == nid) { 249 if (dn->inode->i_ino == nid) {
@@ -301,6 +295,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
301 if (recover_inline_data(inode, page)) 295 if (recover_inline_data(inode, page))
302 goto out; 296 goto out;
303 297
298 if (recover_xattr_data(inode, page, blkaddr))
299 goto out;
300
304 start = start_bidx_of_node(ofs_of_node(page), fi); 301 start = start_bidx_of_node(ofs_of_node(page), fi);
305 if (IS_INODE(page)) 302 if (IS_INODE(page))
306 end = start + ADDRS_PER_INODE(fi); 303 end = start + ADDRS_PER_INODE(fi);
@@ -317,7 +314,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
317 goto out; 314 goto out;
318 } 315 }
319 316
320 wait_on_page_writeback(dn.node_page); 317 f2fs_wait_on_page_writeback(dn.node_page, NODE);
321 318
322 get_node_info(sbi, dn.nid, &ni); 319 get_node_info(sbi, dn.nid, &ni);
323 f2fs_bug_on(ni.ino != ino_of_node(page)); 320 f2fs_bug_on(ni.ino != ino_of_node(page));
@@ -437,7 +434,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
437 bool need_writecp = false; 434 bool need_writecp = false;
438 435
439 fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", 436 fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
440 sizeof(struct fsync_inode_entry), NULL); 437 sizeof(struct fsync_inode_entry));
441 if (!fsync_entry_slab) 438 if (!fsync_entry_slab)
442 return -ENOMEM; 439 return -ENOMEM;
443 440
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 7caac5f2ca9e..085f548be7a3 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -13,6 +13,7 @@
13#include <linux/bio.h> 13#include <linux/bio.h>
14#include <linux/blkdev.h> 14#include <linux/blkdev.h>
15#include <linux/prefetch.h> 15#include <linux/prefetch.h>
16#include <linux/kthread.h>
16#include <linux/vmalloc.h> 17#include <linux/vmalloc.h>
17#include <linux/swap.h> 18#include <linux/swap.h>
18 19
@@ -24,6 +25,7 @@
24#define __reverse_ffz(x) __reverse_ffs(~(x)) 25#define __reverse_ffz(x) __reverse_ffs(~(x))
25 26
26static struct kmem_cache *discard_entry_slab; 27static struct kmem_cache *discard_entry_slab;
28static struct kmem_cache *flush_cmd_slab;
27 29
28/* 30/*
29 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since 31 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
@@ -195,6 +197,73 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
195 f2fs_sync_fs(sbi->sb, true); 197 f2fs_sync_fs(sbi->sb, true);
196} 198}
197 199
200static int issue_flush_thread(void *data)
201{
202 struct f2fs_sb_info *sbi = data;
203 struct f2fs_sm_info *sm_i = SM_I(sbi);
204 wait_queue_head_t *q = &sm_i->flush_wait_queue;
205repeat:
206 if (kthread_should_stop())
207 return 0;
208
209 spin_lock(&sm_i->issue_lock);
210 if (sm_i->issue_list) {
211 sm_i->dispatch_list = sm_i->issue_list;
212 sm_i->issue_list = sm_i->issue_tail = NULL;
213 }
214 spin_unlock(&sm_i->issue_lock);
215
216 if (sm_i->dispatch_list) {
217 struct bio *bio = bio_alloc(GFP_NOIO, 0);
218 struct flush_cmd *cmd, *next;
219 int ret;
220
221 bio->bi_bdev = sbi->sb->s_bdev;
222 ret = submit_bio_wait(WRITE_FLUSH, bio);
223
224 for (cmd = sm_i->dispatch_list; cmd; cmd = next) {
225 cmd->ret = ret;
226 next = cmd->next;
227 complete(&cmd->wait);
228 }
229 sm_i->dispatch_list = NULL;
230 }
231
232 wait_event_interruptible(*q, kthread_should_stop() || sm_i->issue_list);
233 goto repeat;
234}
235
236int f2fs_issue_flush(struct f2fs_sb_info *sbi)
237{
238 struct f2fs_sm_info *sm_i = SM_I(sbi);
239 struct flush_cmd *cmd;
240 int ret;
241
242 if (!test_opt(sbi, FLUSH_MERGE))
243 return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);
244
245 cmd = f2fs_kmem_cache_alloc(flush_cmd_slab, GFP_ATOMIC);
246 cmd->next = NULL;
247 cmd->ret = 0;
248 init_completion(&cmd->wait);
249
250 spin_lock(&sm_i->issue_lock);
251 if (sm_i->issue_list)
252 sm_i->issue_tail->next = cmd;
253 else
254 sm_i->issue_list = cmd;
255 sm_i->issue_tail = cmd;
256 spin_unlock(&sm_i->issue_lock);
257
258 if (!sm_i->dispatch_list)
259 wake_up(&sm_i->flush_wait_queue);
260
261 wait_for_completion(&cmd->wait);
262 ret = cmd->ret;
263 kmem_cache_free(flush_cmd_slab, cmd);
264 return ret;
265}
266
198static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, 267static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
199 enum dirty_type dirty_type) 268 enum dirty_type dirty_type)
200{ 269{
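
issue_flush_thread()/f2fs_issue_flush() above implement the new flush_merge mount option: concurrent fsync callers queue a flush_cmd and sleep on a per-command completion, while a single kernel thread drains the whole queue with one WRITE_FLUSH bio and fans the result back out. A userspace analogue of the same producer/worker/completion shape, with pthreads standing in for kthread, wait queues, and completions (illustrative only):

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    struct flush_cmd {
        struct flush_cmd *next;
        int ret;
        int done;
        pthread_mutex_t mu;
        pthread_cond_t cv;
    };

    static pthread_mutex_t issue_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t issue_cv = PTHREAD_COND_INITIALIZER;
    static struct flush_cmd *issue_list, *issue_tail;
    static int stop;

    static void *issue_flush_thread(void *arg)
    {
        (void)arg;
        for (;;) {
            struct flush_cmd *list, *cmd, *next;

            pthread_mutex_lock(&issue_lock);
            while (!issue_list && !stop)
                pthread_cond_wait(&issue_cv, &issue_lock);
            list = issue_list;
            issue_list = issue_tail = NULL;      /* grab the whole batch */
            pthread_mutex_unlock(&issue_lock);
            if (!list && stop)
                return NULL;

            usleep(1000);                        /* one "flush" per batch */
            for (cmd = list; cmd; cmd = next) {
                next = cmd->next;
                pthread_mutex_lock(&cmd->mu);
                cmd->ret = 0;
                cmd->done = 1;
                pthread_cond_signal(&cmd->cv);   /* complete(&cmd->wait) */
                pthread_mutex_unlock(&cmd->mu);
            }
        }
    }

    static int f2fs_issue_flush(void)
    {
        struct flush_cmd cmd = { .mu = PTHREAD_MUTEX_INITIALIZER,
                                 .cv = PTHREAD_COND_INITIALIZER };

        pthread_mutex_lock(&issue_lock);
        if (issue_list)
            issue_tail->next = &cmd;
        else
            issue_list = &cmd;
        issue_tail = &cmd;
        pthread_cond_signal(&issue_cv);
        pthread_mutex_unlock(&issue_lock);

        pthread_mutex_lock(&cmd.mu);
        while (!cmd.done)
            pthread_cond_wait(&cmd.cv, &cmd.mu); /* wait_for_completion() */
        pthread_mutex_unlock(&cmd.mu);
        return cmd.ret;
    }

    int main(void)
    {
        pthread_t worker;

        pthread_create(&worker, NULL, issue_flush_thread, NULL);
        printf("flush ret = %d\n", f2fs_issue_flush());
        pthread_mutex_lock(&issue_lock);
        stop = 1;
        pthread_cond_signal(&issue_cv);
        pthread_mutex_unlock(&issue_lock);
        pthread_join(worker, NULL);
        return 0;
    }
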
@@ -340,8 +409,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
340void clear_prefree_segments(struct f2fs_sb_info *sbi) 409void clear_prefree_segments(struct f2fs_sb_info *sbi)
341{ 410{
342 struct list_head *head = &(SM_I(sbi)->discard_list); 411 struct list_head *head = &(SM_I(sbi)->discard_list);
343 struct list_head *this, *next; 412 struct discard_entry *entry, *this;
344 struct discard_entry *entry;
345 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 413 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
346 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; 414 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
347 unsigned int total_segs = TOTAL_SEGS(sbi); 415 unsigned int total_segs = TOTAL_SEGS(sbi);
@@ -370,8 +438,7 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
370 mutex_unlock(&dirty_i->seglist_lock); 438 mutex_unlock(&dirty_i->seglist_lock);
371 439
372 /* send small discards */ 440 /* send small discards */
373 list_for_each_safe(this, next, head) { 441 list_for_each_entry_safe(entry, this, head, list) {
374 entry = list_entry(this, struct discard_entry, list);
375 f2fs_issue_discard(sbi, entry->blkaddr, entry->len); 442 f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
376 list_del(&entry->list); 443 list_del(&entry->list);
377 SM_I(sbi)->nr_discards -= entry->len; 444 SM_I(sbi)->nr_discards -= entry->len;
@@ -405,7 +472,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
405 472
406 se = get_seg_entry(sbi, segno); 473 se = get_seg_entry(sbi, segno);
407 new_vblocks = se->valid_blocks + del; 474 new_vblocks = se->valid_blocks + del;
408 offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1); 475 offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
409 476
410 f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) || 477 f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) ||
411 (new_vblocks > sbi->blocks_per_seg))); 478 (new_vblocks > sbi->blocks_per_seg)));
@@ -434,12 +501,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
434 get_sec_entry(sbi, segno)->valid_blocks += del; 501 get_sec_entry(sbi, segno)->valid_blocks += del;
435} 502}
436 503
437static void refresh_sit_entry(struct f2fs_sb_info *sbi, 504void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new)
438 block_t old_blkaddr, block_t new_blkaddr)
439{ 505{
440 update_sit_entry(sbi, new_blkaddr, 1); 506 update_sit_entry(sbi, new, 1);
441 if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) 507 if (GET_SEGNO(sbi, old) != NULL_SEGNO)
442 update_sit_entry(sbi, old_blkaddr, -1); 508 update_sit_entry(sbi, old, -1);
509
510 locate_dirty_segment(sbi, GET_SEGNO(sbi, old));
511 locate_dirty_segment(sbi, GET_SEGNO(sbi, new));
443} 512}
444 513
445void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) 514void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
@@ -881,17 +950,15 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
881 950
882 stat_inc_block_count(sbi, curseg); 951 stat_inc_block_count(sbi, curseg);
883 952
953 if (!__has_curseg_space(sbi, type))
954 sit_i->s_ops->allocate_segment(sbi, type, false);
884 /* 955 /*
885 * SIT information should be updated before segment allocation, 956 * SIT information should be updated before segment allocation,
886 * since SSR needs latest valid block information. 957 * since SSR needs latest valid block information.
887 */ 958 */
888 refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); 959 refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
889
890 if (!__has_curseg_space(sbi, type))
891 sit_i->s_ops->allocate_segment(sbi, type, false);
892
893 locate_dirty_segment(sbi, old_cursegno); 960 locate_dirty_segment(sbi, old_cursegno);
894 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); 961
895 mutex_unlock(&sit_i->sentry_lock); 962 mutex_unlock(&sit_i->sentry_lock);
896 963
897 if (page && IS_NODESEG(type)) 964 if (page && IS_NODESEG(type))
@@ -987,14 +1054,11 @@ void recover_data_page(struct f2fs_sb_info *sbi,
987 change_curseg(sbi, type, true); 1054 change_curseg(sbi, type, true);
988 } 1055 }
989 1056
990 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & 1057 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
991 (sbi->blocks_per_seg - 1);
992 __add_sum_entry(sbi, type, sum); 1058 __add_sum_entry(sbi, type, sum);
993 1059
994 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); 1060 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
995
996 locate_dirty_segment(sbi, old_cursegno); 1061 locate_dirty_segment(sbi, old_cursegno);
997 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
998 1062
999 mutex_unlock(&sit_i->sentry_lock); 1063 mutex_unlock(&sit_i->sentry_lock);
1000 mutex_unlock(&curseg->curseg_mutex); 1064 mutex_unlock(&curseg->curseg_mutex);
@@ -1028,8 +1092,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
1028 curseg->next_segno = segno; 1092 curseg->next_segno = segno;
1029 change_curseg(sbi, type, true); 1093 change_curseg(sbi, type, true);
1030 } 1094 }
1031 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) & 1095 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
1032 (sbi->blocks_per_seg - 1);
1033 __add_sum_entry(sbi, type, sum); 1096 __add_sum_entry(sbi, type, sum);
1034 1097
1035 /* change the current log to the next block addr in advance */ 1098 /* change the current log to the next block addr in advance */
@@ -1037,28 +1100,50 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
1037 curseg->next_segno = next_segno; 1100 curseg->next_segno = next_segno;
1038 change_curseg(sbi, type, true); 1101 change_curseg(sbi, type, true);
1039 } 1102 }
1040 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) & 1103 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr);
1041 (sbi->blocks_per_seg - 1);
1042 1104
1043 /* rewrite node page */ 1105 /* rewrite node page */
1044 set_page_writeback(page); 1106 set_page_writeback(page);
1045 f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio); 1107 f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio);
1046 f2fs_submit_merged_bio(sbi, NODE, WRITE); 1108 f2fs_submit_merged_bio(sbi, NODE, WRITE);
1047 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); 1109 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
1048
1049 locate_dirty_segment(sbi, old_cursegno); 1110 locate_dirty_segment(sbi, old_cursegno);
1050 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
1051 1111
1052 mutex_unlock(&sit_i->sentry_lock); 1112 mutex_unlock(&sit_i->sentry_lock);
1053 mutex_unlock(&curseg->curseg_mutex); 1113 mutex_unlock(&curseg->curseg_mutex);
1054} 1114}
1055 1115
1116static inline bool is_merged_page(struct f2fs_sb_info *sbi,
1117 struct page *page, enum page_type type)
1118{
1119 enum page_type btype = PAGE_TYPE_OF_BIO(type);
1120 struct f2fs_bio_info *io = &sbi->write_io[btype];
1121 struct bio_vec *bvec;
1122 int i;
1123
1124 down_read(&io->io_rwsem);
1125 if (!io->bio)
1126 goto out;
1127
1128 bio_for_each_segment_all(bvec, io->bio, i) {
1129 if (page == bvec->bv_page) {
1130 up_read(&io->io_rwsem);
1131 return true;
1132 }
1133 }
1134
1135out:
1136 up_read(&io->io_rwsem);
1137 return false;
1138}
1139
1056void f2fs_wait_on_page_writeback(struct page *page, 1140void f2fs_wait_on_page_writeback(struct page *page,
1057 enum page_type type) 1141 enum page_type type)
1058{ 1142{
1059 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 1143 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
1060 if (PageWriteback(page)) { 1144 if (PageWriteback(page)) {
1061 f2fs_submit_merged_bio(sbi, type, WRITE); 1145 if (is_merged_page(sbi, page, type))
1146 f2fs_submit_merged_bio(sbi, type, WRITE);
1062 wait_on_page_writeback(page); 1147 wait_on_page_writeback(page);
1063 } 1148 }
1064} 1149}
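
f2fs_wait_on_page_writeback() previously forced the shared write bio out unconditionally; with is_merged_page() it first checks whether the page being waited on is actually held in the pending bio, and skips the submit otherwise. A sketch of that membership test, with an array standing in for struct bio (names illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    #define BATCH 8

    static const void *pending[BATCH];   /* pages sitting in the open bio */
    static int npending;

    static bool is_merged_page(const void *page)
    {
        for (int i = 0; i < npending; i++)
            if (pending[i] == page)
                return true;
        return false;
    }

    static void wait_on_page(const void *page)
    {
        if (is_merged_page(page))
            printf("submit merged bio, then wait\n");   /* flush needed */
        else
            printf("just wait; a flush would be wasted\n");
    }

    int main(void)
    {
        int a, b;

        pending[npending++] = &a;
        wait_on_page(&a);
        wait_on_page(&b);
        return 0;
    }
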
@@ -1167,9 +1252,12 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
1167 ns->ofs_in_node = 0; 1252 ns->ofs_in_node = 0;
1168 } 1253 }
1169 } else { 1254 } else {
1170 if (restore_node_summary(sbi, segno, sum)) { 1255 int err;
1256
1257 err = restore_node_summary(sbi, segno, sum);
1258 if (err) {
1171 f2fs_put_page(new, 1); 1259 f2fs_put_page(new, 1);
1172 return -EINVAL; 1260 return err;
1173 } 1261 }
1174 } 1262 }
1175 } 1263 }
@@ -1190,6 +1278,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
1190static int restore_curseg_summaries(struct f2fs_sb_info *sbi) 1278static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
1191{ 1279{
1192 int type = CURSEG_HOT_DATA; 1280 int type = CURSEG_HOT_DATA;
1281 int err;
1193 1282
1194 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { 1283 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
1195 /* restore for compacted data summary */ 1284 /* restore for compacted data summary */
@@ -1198,9 +1287,12 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
1198 type = CURSEG_HOT_NODE; 1287 type = CURSEG_HOT_NODE;
1199 } 1288 }
1200 1289
1201 for (; type <= CURSEG_COLD_NODE; type++) 1290 for (; type <= CURSEG_COLD_NODE; type++) {
1202 if (read_normal_summaries(sbi, type)) 1291 err = read_normal_summaries(sbi, type);
1203 return -EINVAL; 1292 if (err)
1293 return err;
1294 }
1295
1204 return 0; 1296 return 0;
1205} 1297}
1206 1298
@@ -1583,47 +1675,6 @@ static int build_curseg(struct f2fs_sb_info *sbi)
1583 return restore_curseg_summaries(sbi); 1675 return restore_curseg_summaries(sbi);
1584} 1676}
1585 1677
1586static int ra_sit_pages(struct f2fs_sb_info *sbi, int start, int nrpages)
1587{
1588 struct address_space *mapping = META_MAPPING(sbi);
1589 struct page *page;
1590 block_t blk_addr, prev_blk_addr = 0;
1591 int sit_blk_cnt = SIT_BLK_CNT(sbi);
1592 int blkno = start;
1593 struct f2fs_io_info fio = {
1594 .type = META,
1595 .rw = READ_SYNC | REQ_META | REQ_PRIO
1596 };
1597
1598 for (; blkno < start + nrpages && blkno < sit_blk_cnt; blkno++) {
1599
1600 blk_addr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK);
1601
1602 if (blkno != start && prev_blk_addr + 1 != blk_addr)
1603 break;
1604 prev_blk_addr = blk_addr;
1605repeat:
1606 page = grab_cache_page(mapping, blk_addr);
1607 if (!page) {
1608 cond_resched();
1609 goto repeat;
1610 }
1611 if (PageUptodate(page)) {
1612 mark_page_accessed(page);
1613 f2fs_put_page(page, 1);
1614 continue;
1615 }
1616
1617 f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
1618
1619 mark_page_accessed(page);
1620 f2fs_put_page(page, 0);
1621 }
1622
1623 f2fs_submit_merged_bio(sbi, META, READ);
1624 return blkno - start;
1625}
1626
1627static void build_sit_entries(struct f2fs_sb_info *sbi) 1678static void build_sit_entries(struct f2fs_sb_info *sbi)
1628{ 1679{
1629 struct sit_info *sit_i = SIT_I(sbi); 1680 struct sit_info *sit_i = SIT_I(sbi);
@@ -1635,7 +1686,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
1635 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 1686 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
1636 1687
1637 do { 1688 do {
1638 readed = ra_sit_pages(sbi, start_blk, nrpages); 1689 readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
1639 1690
1640 start = start_blk * sit_i->sents_per_block; 1691 start = start_blk * sit_i->sents_per_block;
1641 end = (start_blk + readed) * sit_i->sents_per_block; 1692 end = (start_blk + readed) * sit_i->sents_per_block;
@@ -1781,6 +1832,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
1781{ 1832{
1782 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); 1833 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
1783 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); 1834 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
1835 dev_t dev = sbi->sb->s_bdev->bd_dev;
1784 struct f2fs_sm_info *sm_info; 1836 struct f2fs_sm_info *sm_info;
1785 int err; 1837 int err;
1786 1838
@@ -1799,7 +1851,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
1799 sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); 1851 sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
1800 sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); 1852 sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
1801 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); 1853 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
1802 sm_info->rec_prefree_segments = DEF_RECLAIM_PREFREE_SEGMENTS; 1854 sm_info->rec_prefree_segments = sm_info->main_segments *
1855 DEF_RECLAIM_PREFREE_SEGMENTS / 100;
1803 sm_info->ipu_policy = F2FS_IPU_DISABLE; 1856 sm_info->ipu_policy = F2FS_IPU_DISABLE;
1804 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; 1857 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
1805 1858
@@ -1807,6 +1860,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
1807 sm_info->nr_discards = 0; 1860 sm_info->nr_discards = 0;
1808 sm_info->max_discards = 0; 1861 sm_info->max_discards = 0;
1809 1862
1863 if (test_opt(sbi, FLUSH_MERGE)) {
1864 spin_lock_init(&sm_info->issue_lock);
1865 init_waitqueue_head(&sm_info->flush_wait_queue);
1866
1867 sm_info->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
1868 "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
1869 if (IS_ERR(sm_info->f2fs_issue_flush))
1870 return PTR_ERR(sm_info->f2fs_issue_flush);
1871 }
1872
1810 err = build_sit_info(sbi); 1873 err = build_sit_info(sbi);
1811 if (err) 1874 if (err)
1812 return err; 1875 return err;
@@ -1915,6 +1978,8 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
1915 struct f2fs_sm_info *sm_info = SM_I(sbi); 1978 struct f2fs_sm_info *sm_info = SM_I(sbi);
1916 if (!sm_info) 1979 if (!sm_info)
1917 return; 1980 return;
1981 if (sm_info->f2fs_issue_flush)
1982 kthread_stop(sm_info->f2fs_issue_flush);
1918 destroy_dirty_segmap(sbi); 1983 destroy_dirty_segmap(sbi);
1919 destroy_curseg(sbi); 1984 destroy_curseg(sbi);
1920 destroy_free_segmap(sbi); 1985 destroy_free_segmap(sbi);
@@ -1926,13 +1991,20 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
1926int __init create_segment_manager_caches(void) 1991int __init create_segment_manager_caches(void)
1927{ 1992{
1928 discard_entry_slab = f2fs_kmem_cache_create("discard_entry", 1993 discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
1929 sizeof(struct discard_entry), NULL); 1994 sizeof(struct discard_entry));
1930 if (!discard_entry_slab) 1995 if (!discard_entry_slab)
1931 return -ENOMEM; 1996 return -ENOMEM;
1997 flush_cmd_slab = f2fs_kmem_cache_create("flush_command",
1998 sizeof(struct flush_cmd));
1999 if (!flush_cmd_slab) {
2000 kmem_cache_destroy(discard_entry_slab);
2001 return -ENOMEM;
2002 }
1932 return 0; 2003 return 0;
1933} 2004}
1934 2005
1935void destroy_segment_manager_caches(void) 2006void destroy_segment_manager_caches(void)
1936{ 2007{
1937 kmem_cache_destroy(discard_entry_slab); 2008 kmem_cache_destroy(discard_entry_slab);
2009 kmem_cache_destroy(flush_cmd_slab);
1938} 2010}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 5731682d7516..7091204680f4 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -14,7 +14,7 @@
14#define NULL_SEGNO ((unsigned int)(~0)) 14#define NULL_SEGNO ((unsigned int)(~0))
15#define NULL_SECNO ((unsigned int)(~0)) 15#define NULL_SECNO ((unsigned int)(~0))
16 16
17#define DEF_RECLAIM_PREFREE_SEGMENTS 100 /* 200MB of prefree segments */ 17#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */
18 18
19/* L: Logical segment # in volume, R: Relative segment # in main area */ 19/* L: Logical segment # in volume, R: Relative segment # in main area */
20#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) 20#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
@@ -57,6 +57,9 @@
57 ((blk_addr) - SM_I(sbi)->seg0_blkaddr) 57 ((blk_addr) - SM_I(sbi)->seg0_blkaddr)
58#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ 58#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
59 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) 59 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
60#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \
61 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1))
62
60#define GET_SEGNO(sbi, blk_addr) \ 63#define GET_SEGNO(sbi, blk_addr) \
61 (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ 64 (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \
62 NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ 65 NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \
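
GET_BLKOFF_FROM_SEG0() centralizes the "& (blocks_per_seg - 1)" expression that segment.c previously open-coded in several places. The mask is a power-of-two modulo; a quick check of the identity it relies on:

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int blocks_per_seg = 512;   /* 2^9, as in f2fs */
        unsigned int blk_addr = 5000;

        assert((blocks_per_seg & (blocks_per_seg - 1)) == 0); /* power of 2 */

        unsigned int off_mask = blk_addr & (blocks_per_seg - 1);
        unsigned int off_mod  = blk_addr % blocks_per_seg;

        printf("mask=%u mod=%u\n", off_mask, off_mod);        /* both 392 */
        assert(off_mask == off_mod);
        return 0;
    }
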
@@ -377,26 +380,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
377 380
378static inline block_t written_block_count(struct f2fs_sb_info *sbi) 381static inline block_t written_block_count(struct f2fs_sb_info *sbi)
379{ 382{
380 struct sit_info *sit_i = SIT_I(sbi); 383 return SIT_I(sbi)->written_valid_blocks;
381 block_t vblocks;
382
383 mutex_lock(&sit_i->sentry_lock);
384 vblocks = sit_i->written_valid_blocks;
385 mutex_unlock(&sit_i->sentry_lock);
386
387 return vblocks;
388} 384}
389 385
390static inline unsigned int free_segments(struct f2fs_sb_info *sbi) 386static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
391{ 387{
392 struct free_segmap_info *free_i = FREE_I(sbi); 388 return FREE_I(sbi)->free_segments;
393 unsigned int free_segs;
394
395 read_lock(&free_i->segmap_lock);
396 free_segs = free_i->free_segments;
397 read_unlock(&free_i->segmap_lock);
398
399 return free_segs;
400} 389}
401 390
402static inline int reserved_segments(struct f2fs_sb_info *sbi) 391static inline int reserved_segments(struct f2fs_sb_info *sbi)
@@ -406,14 +395,7 @@ static inline int reserved_segments(struct f2fs_sb_info *sbi)
406 395
407static inline unsigned int free_sections(struct f2fs_sb_info *sbi) 396static inline unsigned int free_sections(struct f2fs_sb_info *sbi)
408{ 397{
409 struct free_segmap_info *free_i = FREE_I(sbi); 398 return FREE_I(sbi)->free_sections;
410 unsigned int free_secs;
411
412 read_lock(&free_i->segmap_lock);
413 free_secs = free_i->free_sections;
414 read_unlock(&free_i->segmap_lock);
415
416 return free_secs;
417} 399}
418 400
419static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi) 401static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
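
written_block_count(), free_segments() and free_sections() drop their locks and return a single counter directly: for these allocation heuristics a momentarily stale snapshot is acceptable, and the kernel relies on aligned word reads being atomic. In portable userspace code the closest analogue is a relaxed atomic load, sketched here:

    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic unsigned int free_segments;

    static unsigned int read_free_segments(void)
    {
        /* no lock: a momentarily stale snapshot is fine here */
        return atomic_load_explicit(&free_segments, memory_order_relaxed);
    }

    int main(void)
    {
        atomic_store(&free_segments, 123);
        printf("free segments: %u\n", read_free_segments());
        return 0;
    }
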
@@ -682,3 +664,46 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
682 struct request_queue *q = bdev_get_queue(bdev); 664 struct request_queue *q = bdev_get_queue(bdev);
683 return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); 665 return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q));
684} 666}
667
668/*
669 * It is very important to gather dirty pages and write at once, so that we can
670 * submit a big bio without interfering other data writes.
671 * By default, 512 pages for directory data,
672 * 512 pages (2MB) * 3 for three types of nodes, and
673 * max_bio_blocks for meta are set.
674 */
675static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
676{
677 if (type == DATA)
678 return sbi->blocks_per_seg;
679 else if (type == NODE)
680 return 3 * sbi->blocks_per_seg;
681 else if (type == META)
682 return MAX_BIO_BLOCKS(max_hw_blocks(sbi));
683 else
684 return 0;
685}
686
687/*
688 * When writing pages, it'd better align nr_to_write for segment size.
689 */
690static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
691 struct writeback_control *wbc)
692{
693 long nr_to_write, desired;
694
695 if (wbc->sync_mode != WB_SYNC_NONE)
696 return 0;
697
698 nr_to_write = wbc->nr_to_write;
699
700 if (type == DATA)
701 desired = 4096;
702 else if (type == NODE)
703 desired = 3 * max_hw_blocks(sbi);
704 else
705 desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
706
707 wbc->nr_to_write = desired;
708 return desired - nr_to_write;
709}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 1a85f83abd53..c756923a7302 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -51,6 +51,7 @@ enum {
51 Opt_disable_ext_identify, 51 Opt_disable_ext_identify,
52 Opt_inline_xattr, 52 Opt_inline_xattr,
53 Opt_inline_data, 53 Opt_inline_data,
54 Opt_flush_merge,
54 Opt_err, 55 Opt_err,
55}; 56};
56 57
@@ -67,6 +68,7 @@ static match_table_t f2fs_tokens = {
67 {Opt_disable_ext_identify, "disable_ext_identify"}, 68 {Opt_disable_ext_identify, "disable_ext_identify"},
68 {Opt_inline_xattr, "inline_xattr"}, 69 {Opt_inline_xattr, "inline_xattr"},
69 {Opt_inline_data, "inline_data"}, 70 {Opt_inline_data, "inline_data"},
71 {Opt_flush_merge, "flush_merge"},
70 {Opt_err, NULL}, 72 {Opt_err, NULL},
71}; 73};
72 74
@@ -74,6 +76,7 @@ static match_table_t f2fs_tokens = {
74enum { 76enum {
75 GC_THREAD, /* struct f2fs_gc_thread */ 77 GC_THREAD, /* struct f2fs_gc_thread */
76 SM_INFO, /* struct f2fs_sm_info */ 78 SM_INFO, /* struct f2fs_sm_info */
79 NM_INFO, /* struct f2fs_nm_info */
77 F2FS_SBI, /* struct f2fs_sb_info */ 80 F2FS_SBI, /* struct f2fs_sb_info */
78}; 81};
79 82
@@ -92,6 +95,8 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
92 return (unsigned char *)sbi->gc_thread; 95 return (unsigned char *)sbi->gc_thread;
93 else if (struct_type == SM_INFO) 96 else if (struct_type == SM_INFO)
94 return (unsigned char *)SM_I(sbi); 97 return (unsigned char *)SM_I(sbi);
98 else if (struct_type == NM_INFO)
99 return (unsigned char *)NM_I(sbi);
95 else if (struct_type == F2FS_SBI) 100 else if (struct_type == F2FS_SBI)
96 return (unsigned char *)sbi; 101 return (unsigned char *)sbi;
97 return NULL; 102 return NULL;
@@ -183,7 +188,9 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
183F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); 188F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
184F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); 189F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
185F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); 190F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
191F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
186F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); 192F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
193F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
187 194
188#define ATTR_LIST(name) (&f2fs_attr_##name.attr) 195#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
189static struct attribute *f2fs_attrs[] = { 196static struct attribute *f2fs_attrs[] = {
@@ -196,6 +203,8 @@ static struct attribute *f2fs_attrs[] = {
196 ATTR_LIST(ipu_policy), 203 ATTR_LIST(ipu_policy),
197 ATTR_LIST(min_ipu_util), 204 ATTR_LIST(min_ipu_util),
198 ATTR_LIST(max_victim_search), 205 ATTR_LIST(max_victim_search),
206 ATTR_LIST(dir_level),
207 ATTR_LIST(ram_thresh),
199 NULL, 208 NULL,
200}; 209};
201 210
@@ -256,9 +265,9 @@ static int parse_options(struct super_block *sb, char *options)
256 265
257 if (!name) 266 if (!name)
258 return -ENOMEM; 267 return -ENOMEM;
259 if (!strncmp(name, "on", 2)) 268 if (strlen(name) == 2 && !strncmp(name, "on", 2))
260 set_opt(sbi, BG_GC); 269 set_opt(sbi, BG_GC);
261 else if (!strncmp(name, "off", 3)) 270 else if (strlen(name) == 3 && !strncmp(name, "off", 3))
262 clear_opt(sbi, BG_GC); 271 clear_opt(sbi, BG_GC);
263 else { 272 else {
264 kfree(name); 273 kfree(name);
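
The added strlen() checks matter because strncmp() alone matches prefixes: a bogus option such as background_gc=online would previously have enabled BG_GC. A standalone illustration (userspace, values invented):

        #include <assert.h>
        #include <string.h>

        int main(void)
        {
                /* prefix match that the old code accepted */
                assert(strncmp("online", "on", 2) == 0);
                /* the new length guard rejects it */
                assert(!(strlen("online") == 2 && strncmp("online", "on", 2) == 0));
                return 0;
        }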
@@ -327,6 +336,9 @@ static int parse_options(struct super_block *sb, char *options)
327 case Opt_inline_data: 336 case Opt_inline_data:
328 set_opt(sbi, INLINE_DATA); 337 set_opt(sbi, INLINE_DATA);
329 break; 338 break;
339 case Opt_flush_merge:
340 set_opt(sbi, FLUSH_MERGE);
341 break;
330 default: 342 default:
331 f2fs_msg(sb, KERN_ERR, 343 f2fs_msg(sb, KERN_ERR,
332 "Unrecognized mount option \"%s\" or missing value", 344 "Unrecognized mount option \"%s\" or missing value",
@@ -353,12 +365,16 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
353 fi->i_current_depth = 1; 365 fi->i_current_depth = 1;
354 fi->i_advise = 0; 366 fi->i_advise = 0;
355 rwlock_init(&fi->ext.ext_lock); 367 rwlock_init(&fi->ext.ext_lock);
368 init_rwsem(&fi->i_sem);
356 369
357 set_inode_flag(fi, FI_NEW_INODE); 370 set_inode_flag(fi, FI_NEW_INODE);
358 371
359 if (test_opt(F2FS_SB(sb), INLINE_XATTR)) 372 if (test_opt(F2FS_SB(sb), INLINE_XATTR))
360 set_inode_flag(fi, FI_INLINE_XATTR); 373 set_inode_flag(fi, FI_INLINE_XATTR);
361 374
 375 /* Used only by directories */
376 fi->i_dir_level = F2FS_SB(sb)->dir_level;
377
362 return &fi->vfs_inode; 378 return &fi->vfs_inode;
363} 379}
364 380
@@ -526,6 +542,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
526 seq_puts(seq, ",disable_ext_identify"); 542 seq_puts(seq, ",disable_ext_identify");
527 if (test_opt(sbi, INLINE_DATA)) 543 if (test_opt(sbi, INLINE_DATA))
528 seq_puts(seq, ",inline_data"); 544 seq_puts(seq, ",inline_data");
545 if (test_opt(sbi, FLUSH_MERGE))
546 seq_puts(seq, ",flush_merge");
529 seq_printf(seq, ",active_logs=%u", sbi->active_logs); 547 seq_printf(seq, ",active_logs=%u", sbi->active_logs);
530 548
531 return 0; 549 return 0;
@@ -539,13 +557,22 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
539 le32_to_cpu(sbi->raw_super->segment_count_main); 557 le32_to_cpu(sbi->raw_super->segment_count_main);
540 int i; 558 int i;
541 559
560 seq_puts(seq, "format: segment_type|valid_blocks\n"
561 "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n");
562
542 for (i = 0; i < total_segs; i++) { 563 for (i = 0; i < total_segs; i++) {
543 seq_printf(seq, "%u", get_valid_blocks(sbi, i, 1)); 564 struct seg_entry *se = get_seg_entry(sbi, i);
544 if (i != 0 && (i % 10) == 0) 565
545 seq_puts(seq, "\n"); 566 if ((i % 10) == 0)
567 seq_printf(seq, "%-5d", i);
568 seq_printf(seq, "%d|%-3u", se->type,
569 get_valid_blocks(sbi, i, 1));
570 if ((i % 10) == 9 || i == (total_segs - 1))
571 seq_putc(seq, '\n');
546 else 572 else
547 seq_puts(seq, " "); 573 seq_putc(seq, ' ');
548 } 574 }
575
549 return 0; 576 return 0;
550} 577}
551 578
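
With the header in place, the segment_info proc file renders ten segments per row, each as segment_type|valid_blocks, with the row's starting index left-justified in five columns. A hypothetical excerpt (all values invented):

        format: segment_type|valid_blocks
        segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)
        0    0|512 0|512 0|448 2|0   2|17  5|12  5|3   0|512 1|256 1|0
        10   3|128 3|0   4|64  0|512 0|512 2|300 2|1   5|0   5|9   0|512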
@@ -568,6 +595,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
568 struct f2fs_mount_info org_mount_opt; 595 struct f2fs_mount_info org_mount_opt;
569 int err, active_logs; 596 int err, active_logs;
570 597
598 sync_filesystem(sb);
599
571 /* 600 /*
572 * Save the old mount options in case we 601 * Save the old mount options in case we
573 * need to restore them. 602 * need to restore them.
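
The sync_filesystem(sb) calls appearing at the top of the ->remount_fs() implementations in this merge (here, and in fat and freevxfs below) come from the series that pushed the sync out of the generic do_remount_sb() and into each filesystem, so that a filesystem can decide for itself whether dirty data must be flushed before the mount flags change.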
@@ -638,6 +667,8 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
638 667
639 if (unlikely(ino < F2FS_ROOT_INO(sbi))) 668 if (unlikely(ino < F2FS_ROOT_INO(sbi)))
640 return ERR_PTR(-ESTALE); 669 return ERR_PTR(-ESTALE);
670 if (unlikely(ino >= NM_I(sbi)->max_nid))
671 return ERR_PTR(-ESTALE);
641 672
642 /* 673 /*
643 * f2fs_iget isn't quite right if the inode is currently unallocated! 674 * f2fs_iget isn't quite right if the inode is currently unallocated!
@@ -785,6 +816,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
785 816
786 for (i = 0; i < NR_COUNT_TYPE; i++) 817 for (i = 0; i < NR_COUNT_TYPE; i++)
787 atomic_set(&sbi->nr_pages[i], 0); 818 atomic_set(&sbi->nr_pages[i], 0);
819
820 sbi->dir_level = DEF_DIR_LEVEL;
788} 821}
789 822
790/* 823/*
@@ -896,11 +929,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
896 sbi->por_doing = false; 929 sbi->por_doing = false;
897 spin_lock_init(&sbi->stat_lock); 930 spin_lock_init(&sbi->stat_lock);
898 931
899 mutex_init(&sbi->read_io.io_mutex); 932 init_rwsem(&sbi->read_io.io_rwsem);
900 sbi->read_io.sbi = sbi; 933 sbi->read_io.sbi = sbi;
901 sbi->read_io.bio = NULL; 934 sbi->read_io.bio = NULL;
902 for (i = 0; i < NR_PAGE_TYPE; i++) { 935 for (i = 0; i < NR_PAGE_TYPE; i++) {
903 mutex_init(&sbi->write_io[i].io_mutex); 936 init_rwsem(&sbi->write_io[i].io_rwsem);
904 sbi->write_io[i].sbi = sbi; 937 sbi->write_io[i].sbi = sbi;
905 sbi->write_io[i].bio = NULL; 938 sbi->write_io[i].bio = NULL;
906 } 939 }
@@ -989,28 +1022,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
989 goto free_root_inode; 1022 goto free_root_inode;
990 } 1023 }
991 1024
992 /* recover fsynced data */
993 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
994 err = recover_fsync_data(sbi);
995 if (err)
996 f2fs_msg(sb, KERN_ERR,
997 "Cannot recover all fsync data errno=%ld", err);
998 }
999
1000 /*
1001 * If the filesystem is not mounted read-only, then
1002 * start the gc_thread.
1003 */
1004 if (!(sb->s_flags & MS_RDONLY)) {
1005 /* After POR, we can run the background GC thread. */
1006 err = start_gc_thread(sbi);
1007 if (err)
1008 goto free_gc;
1009 }
1010
1011 err = f2fs_build_stats(sbi); 1025 err = f2fs_build_stats(sbi);
1012 if (err) 1026 if (err)
1013 goto free_gc; 1027 goto free_root_inode;
1014 1028
1015 if (f2fs_proc_root) 1029 if (f2fs_proc_root)
1016 sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); 1030 sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
@@ -1032,17 +1046,36 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
1032 err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, 1046 err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL,
1033 "%s", sb->s_id); 1047 "%s", sb->s_id);
1034 if (err) 1048 if (err)
1035 goto fail; 1049 goto free_proc;
1050
1051 /* recover fsynced data */
1052 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
1053 err = recover_fsync_data(sbi);
1054 if (err)
1055 f2fs_msg(sb, KERN_ERR,
1056 "Cannot recover all fsync data errno=%ld", err);
1057 }
1036 1058
1059 /*
 1060 * If the filesystem is not mounted read-only, then
 1061 * start the gc_thread.
1062 */
1063 if (!(sb->s_flags & MS_RDONLY)) {
 1064 /* After POR, we can run the background GC thread. */
1065 err = start_gc_thread(sbi);
1066 if (err)
1067 goto free_kobj;
1068 }
1037 return 0; 1069 return 0;
1038fail: 1070
1071free_kobj:
1072 kobject_del(&sbi->s_kobj);
1073free_proc:
1039 if (sbi->s_proc) { 1074 if (sbi->s_proc) {
1040 remove_proc_entry("segment_info", sbi->s_proc); 1075 remove_proc_entry("segment_info", sbi->s_proc);
1041 remove_proc_entry(sb->s_id, f2fs_proc_root); 1076 remove_proc_entry(sb->s_id, f2fs_proc_root);
1042 } 1077 }
1043 f2fs_destroy_stats(sbi); 1078 f2fs_destroy_stats(sbi);
1044free_gc:
1045 stop_gc_thread(sbi);
1046free_root_inode: 1079free_root_inode:
1047 dput(sb->s_root); 1080 dput(sb->s_root);
1048 sb->s_root = NULL; 1081 sb->s_root = NULL;
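
The reshuffled error labels keep the usual kernel unwind idiom intact: each failing step jumps to a label that tears down exactly what was set up before it, in reverse order. Schematically (generic names, not from this file):

        err = setup_a();
        if (err)
                goto out;
        err = setup_b();
        if (err)
                goto undo_a;
        err = setup_c();
        if (err)
                goto undo_b;
        return 0;

        undo_b:
                teardown_b();
        undo_a:
                teardown_a();
        out:
                return err;

Moving recover_fsync_data() and start_gc_thread() after the kobject setup is what lets the old free_gc label disappear: the GC thread is now the last thing started, so a failure there only needs free_kobj.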
@@ -1082,7 +1115,7 @@ MODULE_ALIAS_FS("f2fs");
1082static int __init init_inodecache(void) 1115static int __init init_inodecache(void)
1083{ 1116{
1084 f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", 1117 f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
1085 sizeof(struct f2fs_inode_info), NULL); 1118 sizeof(struct f2fs_inode_info));
1086 if (!f2fs_inode_cachep) 1119 if (!f2fs_inode_cachep)
1087 return -ENOMEM; 1120 return -ENOMEM;
1088 return 0; 1121 return 0;
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 89d0422a91a8..503c2451131e 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -275,7 +275,7 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage)
275 275
276 inline_size = inline_xattr_size(inode); 276 inline_size = inline_xattr_size(inode);
277 277
278 txattr_addr = kzalloc(inline_size + size, GFP_KERNEL); 278 txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO);
279 if (!txattr_addr) 279 if (!txattr_addr)
280 return NULL; 280 return NULL;
281 281
@@ -407,6 +407,8 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
407 if (name == NULL) 407 if (name == NULL)
408 return -EINVAL; 408 return -EINVAL;
409 name_len = strlen(name); 409 name_len = strlen(name);
410 if (name_len > F2FS_NAME_LEN)
411 return -ERANGE;
410 412
411 base_addr = read_all_xattrs(inode, NULL); 413 base_addr = read_all_xattrs(inode, NULL);
412 if (!base_addr) 414 if (!base_addr)
@@ -590,7 +592,10 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
590 f2fs_balance_fs(sbi); 592 f2fs_balance_fs(sbi);
591 593
592 f2fs_lock_op(sbi); 594 f2fs_lock_op(sbi);
595 /* protect xattr_ver */
596 down_write(&F2FS_I(inode)->i_sem);
593 err = __f2fs_setxattr(inode, name_index, name, value, value_len, ipage); 597 err = __f2fs_setxattr(inode, name_index, name, value, value_len, ipage);
598 up_write(&F2FS_I(inode)->i_sem);
594 f2fs_unlock_op(sbi); 599 f2fs_unlock_op(sbi);
595 600
596 return err; 601 return err;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 854b578f6695..b3361fe2bcb5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -490,7 +490,7 @@ EXPORT_SYMBOL_GPL(fat_build_inode);
490 490
491static void fat_evict_inode(struct inode *inode) 491static void fat_evict_inode(struct inode *inode)
492{ 492{
493 truncate_inode_pages(&inode->i_data, 0); 493 truncate_inode_pages_final(&inode->i_data);
494 if (!inode->i_nlink) { 494 if (!inode->i_nlink) {
495 inode->i_size = 0; 495 inode->i_size = 0;
496 fat_truncate_blocks(inode, 0); 496 fat_truncate_blocks(inode, 0);
@@ -635,6 +635,8 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
635 struct msdos_sb_info *sbi = MSDOS_SB(sb); 635 struct msdos_sb_info *sbi = MSDOS_SB(sb);
636 *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); 636 *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME);
637 637
638 sync_filesystem(sb);
639
638 /* make sure we update state on remount. */ 640 /* make sure we update state on remount. */
639 new_rdonly = *flags & MS_RDONLY; 641 new_rdonly = *flags & MS_RDONLY;
640 if (new_rdonly != (sb->s_flags & MS_RDONLY)) { 642 if (new_rdonly != (sb->s_flags & MS_RDONLY)) {
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ef6866592a0f..72c82f69b01b 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -272,9 +272,19 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
272 case F_SETFL: 272 case F_SETFL:
273 err = setfl(fd, filp, arg); 273 err = setfl(fd, filp, arg);
274 break; 274 break;
275#if BITS_PER_LONG != 32
276 /* 32-bit arches must use fcntl64() */
277 case F_OFD_GETLK:
278#endif
275 case F_GETLK: 279 case F_GETLK:
276 err = fcntl_getlk(filp, (struct flock __user *) arg); 280 err = fcntl_getlk(filp, cmd, (struct flock __user *) arg);
277 break; 281 break;
282#if BITS_PER_LONG != 32
283 /* 32-bit arches must use fcntl64() */
284 case F_OFD_SETLK:
285 case F_OFD_SETLKW:
286#endif
287 /* Fallthrough */
278 case F_SETLK: 288 case F_SETLK:
279 case F_SETLKW: 289 case F_SETLKW:
280 err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg); 290 err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg);
@@ -388,17 +398,20 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
388 goto out1; 398 goto out1;
389 399
390 switch (cmd) { 400 switch (cmd) {
391 case F_GETLK64: 401 case F_GETLK64:
392 err = fcntl_getlk64(f.file, (struct flock64 __user *) arg); 402 case F_OFD_GETLK:
393 break; 403 err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg);
394 case F_SETLK64: 404 break;
395 case F_SETLKW64: 405 case F_SETLK64:
396 err = fcntl_setlk64(fd, f.file, cmd, 406 case F_SETLKW64:
397 (struct flock64 __user *) arg); 407 case F_OFD_SETLK:
398 break; 408 case F_OFD_SETLKW:
399 default: 409 err = fcntl_setlk64(fd, f.file, cmd,
400 err = do_fcntl(fd, cmd, arg, f.file); 410 (struct flock64 __user *) arg);
401 break; 411 break;
412 default:
413 err = do_fcntl(fd, cmd, arg, f.file);
414 break;
402 } 415 }
403out1: 416out1:
404 fdput(f); 417 fdput(f);
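
For reference, a minimal userspace sketch of the open file description (OFD) locks these hunks route: the lock belongs to the open file description rather than the process, and l_pid must be zero on request (glibc exposes F_OFD_* under _GNU_SOURCE; the path is illustrative):

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <stdio.h>
        #include <string.h>
        #include <unistd.h>

        int main(void)
        {
                struct flock fl;
                int fd = open("/tmp/ofd-demo", O_RDWR | O_CREAT, 0644);

                if (fd < 0)
                        return 1;

                memset(&fl, 0, sizeof(fl));
                fl.l_type   = F_WRLCK;
                fl.l_whence = SEEK_SET;
                fl.l_start  = 0;
                fl.l_len    = 0;        /* whole file */
                fl.l_pid    = 0;        /* mandatory for F_OFD_* */

                /* Survives fork() with the inherited descriptor; released
                 * only when the last descriptor for this open file
                 * description is closed. */
                if (fcntl(fd, F_OFD_SETLK, &fl) == -1)
                        perror("F_OFD_SETLK");

                close(fd);
                return 0;
        }

The #if BITS_PER_LONG != 32 guards above exist because struct flock uses long offsets: a 32-bit task must go through fcntl64() and struct flock64 to express 64-bit ranges.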
diff --git a/fs/file.c b/fs/file.c
index eb56a13dab3e..8f294cfac697 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -25,7 +25,10 @@
25 25
26int sysctl_nr_open __read_mostly = 1024*1024; 26int sysctl_nr_open __read_mostly = 1024*1024;
27int sysctl_nr_open_min = BITS_PER_LONG; 27int sysctl_nr_open_min = BITS_PER_LONG;
28int sysctl_nr_open_max = 1024 * 1024; /* raised later */ 28/* our max() is unusable in constant expressions ;-/ */
29#define __const_max(x, y) ((x) < (y) ? (x) : (y))
30int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) &
31 -BITS_PER_LONG;
29 32
30static void *alloc_fdmem(size_t size) 33static void *alloc_fdmem(size_t size)
31{ 34{
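
The __const_max() comment is accurate about the why: the kernel's max() is a statement expression with typeof temporaries, which is not a C constant expression, so it cannot initialize a file-scope variable. Note also that, despite its name, the macro as written ((x) < (y) ? (x) : (y)) yields the smaller argument — which is what this initializer wants (mainline later renamed it __const_min()). A standalone illustration (GCC extensions assumed):

        #include <limits.h>
        #include <stddef.h>

        #define stmt_expr_max(x, y) ({ typeof(x) _x = (x); \
                                       typeof(y) _y = (y); \
                                       _x > _y ? _x : _y; })
        #define const_min(x, y) ((x) < (y) ? (x) : (y))

        /* OK: a plain ternary is a constant expression. */
        static long a = const_min(INT_MAX, (long)(~(size_t)0 / sizeof(void *)));

        /* Does not compile: statement expressions are not constant
         * expressions, so they cannot appear in a static initializer.
         *
         * static long b = stmt_expr_max(1L, 2L);
         */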
@@ -429,12 +432,6 @@ void exit_files(struct task_struct *tsk)
429 } 432 }
430} 433}
431 434
432void __init files_defer_init(void)
433{
434 sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) &
435 -BITS_PER_LONG;
436}
437
438struct files_struct init_files = { 435struct files_struct init_files = {
439 .count = ATOMIC_INIT(1), 436 .count = ATOMIC_INIT(1),
440 .fdt = &init_files.fdtab, 437 .fdt = &init_files.fdtab,
@@ -497,7 +494,7 @@ repeat:
497 error = fd; 494 error = fd;
498#if 1 495#if 1
499 /* Sanity check */ 496 /* Sanity check */
500 if (rcu_dereference_raw(fdt->fd[fd]) != NULL) { 497 if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
501 printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); 498 printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
502 rcu_assign_pointer(fdt->fd[fd], NULL); 499 rcu_assign_pointer(fdt->fd[fd], NULL);
503 } 500 }
diff --git a/fs/file_table.c b/fs/file_table.c
index 5b24008ea4f6..a374f5033e97 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -52,7 +52,6 @@ static void file_free_rcu(struct rcu_head *head)
52static inline void file_free(struct file *f) 52static inline void file_free(struct file *f)
53{ 53{
54 percpu_counter_dec(&nr_files); 54 percpu_counter_dec(&nr_files);
55 file_check_state(f);
56 call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); 55 call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
57} 56}
58 57
@@ -178,47 +177,12 @@ struct file *alloc_file(struct path *path, fmode_t mode,
178 file->f_mapping = path->dentry->d_inode->i_mapping; 177 file->f_mapping = path->dentry->d_inode->i_mapping;
179 file->f_mode = mode; 178 file->f_mode = mode;
180 file->f_op = fop; 179 file->f_op = fop;
181
182 /*
183 * These mounts don't really matter in practice
184 * for r/o bind mounts. They aren't userspace-
185 * visible. We do this for consistency, and so
186 * that we can do debugging checks at __fput()
187 */
188 if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) {
189 file_take_write(file);
190 WARN_ON(mnt_clone_write(path->mnt));
191 }
192 if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) 180 if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
193 i_readcount_inc(path->dentry->d_inode); 181 i_readcount_inc(path->dentry->d_inode);
194 return file; 182 return file;
195} 183}
196EXPORT_SYMBOL(alloc_file); 184EXPORT_SYMBOL(alloc_file);
197 185
198/**
199 * drop_file_write_access - give up ability to write to a file
200 * @file: the file to which we will stop writing
201 *
202 * This is a central place which will give up the ability
203 * to write to @file, along with access to write through
204 * its vfsmount.
205 */
206static void drop_file_write_access(struct file *file)
207{
208 struct vfsmount *mnt = file->f_path.mnt;
209 struct dentry *dentry = file->f_path.dentry;
210 struct inode *inode = dentry->d_inode;
211
212 put_write_access(inode);
213
214 if (special_file(inode->i_mode))
215 return;
216 if (file_check_writeable(file) != 0)
217 return;
218 __mnt_drop_write(mnt);
219 file_release_write(file);
220}
221
222/* the real guts of fput() - releasing the last reference to file 186/* the real guts of fput() - releasing the last reference to file
223 */ 187 */
224static void __fput(struct file *file) 188static void __fput(struct file *file)
@@ -235,7 +199,7 @@ static void __fput(struct file *file)
235 * in the file cleanup chain. 199 * in the file cleanup chain.
236 */ 200 */
237 eventpoll_release(file); 201 eventpoll_release(file);
238 locks_remove_flock(file); 202 locks_remove_file(file);
239 203
240 if (unlikely(file->f_flags & FASYNC)) { 204 if (unlikely(file->f_flags & FASYNC)) {
241 if (file->f_op->fasync) 205 if (file->f_op->fasync)
@@ -253,8 +217,10 @@ static void __fput(struct file *file)
253 put_pid(file->f_owner.pid); 217 put_pid(file->f_owner.pid);
254 if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) 218 if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
255 i_readcount_dec(inode); 219 i_readcount_dec(inode);
256 if (file->f_mode & FMODE_WRITE) 220 if (file->f_mode & FMODE_WRITER) {
257 drop_file_write_access(file); 221 put_write_access(inode);
222 __mnt_drop_write(mnt);
223 }
258 file->f_path.dentry = NULL; 224 file->f_path.dentry = NULL;
259 file->f_path.mnt = NULL; 225 file->f_path.mnt = NULL;
260 file->f_inode = NULL; 226 file->f_inode = NULL;
@@ -359,6 +325,5 @@ void __init files_init(unsigned long mempages)
359 325
360 n = (mempages * (PAGE_SIZE / 1024)) / 10; 326 n = (mempages * (PAGE_SIZE / 1024)) / 10;
361 files_stat.max_files = max_t(unsigned long, n, NR_FILE); 327 files_stat.max_files = max_t(unsigned long, n, NR_FILE);
362 files_defer_init();
363 percpu_counter_init(&nr_files, 0); 328 percpu_counter_init(&nr_files, 0);
364} 329}
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 92567d95ba6a..5797d45a78cb 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -121,6 +121,7 @@ int unregister_filesystem(struct file_system_type * fs)
121 121
122EXPORT_SYMBOL(unregister_filesystem); 122EXPORT_SYMBOL(unregister_filesystem);
123 123
124#ifdef CONFIG_SYSFS_SYSCALL
124static int fs_index(const char __user * __name) 125static int fs_index(const char __user * __name)
125{ 126{
126 struct file_system_type * tmp; 127 struct file_system_type * tmp;
@@ -199,6 +200,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
199 } 200 }
200 return retval; 201 return retval;
201} 202}
203#endif
202 204
203int __init get_filesystem_list(char *buf) 205int __init get_filesystem_list(char *buf)
204{ 206{
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index f47df72cef17..363e3ae25f6b 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -354,7 +354,7 @@ static void vxfs_i_callback(struct rcu_head *head)
354void 354void
355vxfs_evict_inode(struct inode *ip) 355vxfs_evict_inode(struct inode *ip)
356{ 356{
357 truncate_inode_pages(&ip->i_data, 0); 357 truncate_inode_pages_final(&ip->i_data);
358 clear_inode(ip); 358 clear_inode(ip);
359 call_rcu(&ip->i_rcu, vxfs_i_callback); 359 call_rcu(&ip->i_rcu, vxfs_i_callback);
360} 360}
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 25d4099a4aea..99c7f0a37af4 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -192,7 +192,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp)
192 * vxfs_lookup - lookup pathname component 192 * vxfs_lookup - lookup pathname component
193 * @dip: dir in which we lookup 193 * @dip: dir in which we lookup
194 * @dp: dentry we lookup 194 * @dp: dentry we lookup
195 * @nd: lookup nameidata 195 * @flags: lookup flags
196 * 196 *
197 * Description: 197 * Description:
198 * vxfs_lookup tries to lookup the pathname component described 198 * vxfs_lookup tries to lookup the pathname component described
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index e37eb274e492..7ca8c75d50d3 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -124,6 +124,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
124 124
125static int vxfs_remount(struct super_block *sb, int *flags, char *data) 125static int vxfs_remount(struct super_block *sb, int *flags, char *data)
126{ 126{
127 sync_filesystem(sb);
127 *flags |= MS_RDONLY; 128 *flags |= MS_RDONLY;
128 return 0; 129 return 0;
129} 130}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d754e3cf99a8..be568b7311d6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -89,16 +89,31 @@ static inline struct inode *wb_inode(struct list_head *head)
89#define CREATE_TRACE_POINTS 89#define CREATE_TRACE_POINTS
90#include <trace/events/writeback.h> 90#include <trace/events/writeback.h>
91 91
92EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
93
94static void bdi_wakeup_thread(struct backing_dev_info *bdi)
95{
96 spin_lock_bh(&bdi->wb_lock);
97 if (test_bit(BDI_registered, &bdi->state))
98 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
99 spin_unlock_bh(&bdi->wb_lock);
100}
101
92static void bdi_queue_work(struct backing_dev_info *bdi, 102static void bdi_queue_work(struct backing_dev_info *bdi,
93 struct wb_writeback_work *work) 103 struct wb_writeback_work *work)
94{ 104{
95 trace_writeback_queue(bdi, work); 105 trace_writeback_queue(bdi, work);
96 106
97 spin_lock_bh(&bdi->wb_lock); 107 spin_lock_bh(&bdi->wb_lock);
108 if (!test_bit(BDI_registered, &bdi->state)) {
109 if (work->done)
110 complete(work->done);
111 goto out_unlock;
112 }
98 list_add_tail(&work->list, &bdi->work_list); 113 list_add_tail(&work->list, &bdi->work_list);
99 spin_unlock_bh(&bdi->wb_lock);
100
101 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); 114 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
115out_unlock:
116 spin_unlock_bh(&bdi->wb_lock);
102} 117}
103 118
104static void 119static void
@@ -114,7 +129,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
114 work = kzalloc(sizeof(*work), GFP_ATOMIC); 129 work = kzalloc(sizeof(*work), GFP_ATOMIC);
115 if (!work) { 130 if (!work) {
116 trace_writeback_nowork(bdi); 131 trace_writeback_nowork(bdi);
117 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); 132 bdi_wakeup_thread(bdi);
118 return; 133 return;
119 } 134 }
120 135
@@ -161,7 +176,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
161 * writeback as soon as there is no other work to do. 176 * writeback as soon as there is no other work to do.
162 */ 177 */
163 trace_writeback_wake_background(bdi); 178 trace_writeback_wake_background(bdi);
164 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); 179 bdi_wakeup_thread(bdi);
165} 180}
166 181
167/* 182/*
@@ -1017,7 +1032,7 @@ void bdi_writeback_workfn(struct work_struct *work)
1017 current->flags |= PF_SWAPWRITE; 1032 current->flags |= PF_SWAPWRITE;
1018 1033
1019 if (likely(!current_is_workqueue_rescuer() || 1034 if (likely(!current_is_workqueue_rescuer() ||
1020 list_empty(&bdi->bdi_list))) { 1035 !test_bit(BDI_registered, &bdi->state))) {
1021 /* 1036 /*
1022 * The normal path. Keep writing back @bdi until its 1037 * The normal path. Keep writing back @bdi until its
1023 * work_list is empty. Note that this path is also taken 1038 * work_list is empty. Note that this path is also taken
@@ -1039,10 +1054,10 @@ void bdi_writeback_workfn(struct work_struct *work)
1039 trace_writeback_pages_written(pages_written); 1054 trace_writeback_pages_written(pages_written);
1040 } 1055 }
1041 1056
1042 if (!list_empty(&bdi->work_list) || 1057 if (!list_empty(&bdi->work_list))
1043 (wb_has_dirty_io(wb) && dirty_writeback_interval)) 1058 mod_delayed_work(bdi_wq, &wb->dwork, 0);
1044 queue_delayed_work(bdi_wq, &wb->dwork, 1059 else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
1045 msecs_to_jiffies(dirty_writeback_interval * 10)); 1060 bdi_wakeup_thread_delayed(bdi);
1046 1061
1047 current->flags &= ~PF_SWAPWRITE; 1062 current->flags &= ~PF_SWAPWRITE;
1048} 1063}
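
bdi_wakeup_thread_delayed() — the branch now taken for periodic kupdate-style writeback — lives in mm/backing-dev.c and, as of the same series, performs the mirror-image dance: take wb_lock and re-check BDI_registered before queueing delayed work. Roughly (paraphrased from memory, not part of this hunk):

        void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
        {
                unsigned long timeout;

                timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
                spin_lock_bh(&bdi->wb_lock);
                if (test_bit(BDI_registered, &bdi->state))
                        queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
                spin_unlock_bh(&bdi->wb_lock);
        }

Together with the bdi_queue_work() change above, every path that wakes the flusher now checks BDI_registered under wb_lock, closing the race with bdi_unregister() on device removal.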
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index a0b0855d00a9..205e0d5d5307 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -348,7 +348,7 @@ int __init fuse_ctl_init(void)
348 return register_filesystem(&fuse_ctl_fs_type); 348 return register_filesystem(&fuse_ctl_fs_type);
349} 349}
350 350
351void fuse_ctl_cleanup(void) 351void __exit fuse_ctl_cleanup(void)
352{ 352{
353 unregister_filesystem(&fuse_ctl_fs_type); 353 unregister_filesystem(&fuse_ctl_fs_type);
354} 354}
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index b96a49b37d66..13b691a8a7d2 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -95,7 +95,7 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
95 struct iovec iov = { .iov_base = buf, .iov_len = count }; 95 struct iovec iov = { .iov_base = buf, .iov_len = count };
96 struct fuse_io_priv io = { .async = 0, .file = file }; 96 struct fuse_io_priv io = { .async = 0, .file = file };
97 97
98 return fuse_direct_io(&io, &iov, 1, count, &pos, 0); 98 return fuse_direct_io(&io, &iov, 1, count, &pos, FUSE_DIO_CUSE);
99} 99}
100 100
101static ssize_t cuse_write(struct file *file, const char __user *buf, 101static ssize_t cuse_write(struct file *file, const char __user *buf,
@@ -109,7 +109,8 @@ static ssize_t cuse_write(struct file *file, const char __user *buf,
109 * No locking or generic_write_checks(), the server is 109 * No locking or generic_write_checks(), the server is
110 * responsible for locking and sanity checks. 110 * responsible for locking and sanity checks.
111 */ 111 */
112 return fuse_direct_io(&io, &iov, 1, count, &pos, 1); 112 return fuse_direct_io(&io, &iov, 1, count, &pos,
113 FUSE_DIO_WRITE | FUSE_DIO_CUSE);
113} 114}
114 115
115static int cuse_open(struct inode *inode, struct file *file) 116static int cuse_open(struct inode *inode, struct file *file)
@@ -568,7 +569,7 @@ static ssize_t cuse_class_waiting_show(struct device *dev,
568 569
569 return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting)); 570 return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
570} 571}
571static DEVICE_ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL); 572static DEVICE_ATTR(waiting, 0400, cuse_class_waiting_show, NULL);
572 573
573static ssize_t cuse_class_abort_store(struct device *dev, 574static ssize_t cuse_class_abort_store(struct device *dev,
574 struct device_attribute *attr, 575 struct device_attribute *attr,
@@ -579,7 +580,7 @@ static ssize_t cuse_class_abort_store(struct device *dev,
579 fuse_abort_conn(&cc->fc); 580 fuse_abort_conn(&cc->fc);
580 return count; 581 return count;
581} 582}
582static DEVICE_ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store); 583static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store);
583 584
584static struct attribute *cuse_class_dev_attrs[] = { 585static struct attribute *cuse_class_dev_attrs[] = {
585 &dev_attr_waiting.attr, 586 &dev_attr_waiting.attr,
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 0a648bb455ae..aac71ce373e4 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -667,15 +667,15 @@ static void fuse_copy_finish(struct fuse_copy_state *cs)
667 struct pipe_buffer *buf = cs->currbuf; 667 struct pipe_buffer *buf = cs->currbuf;
668 668
669 if (!cs->write) { 669 if (!cs->write) {
670 buf->ops->unmap(cs->pipe, buf, cs->mapaddr); 670 kunmap_atomic(cs->mapaddr);
671 } else { 671 } else {
672 kunmap(buf->page); 672 kunmap_atomic(cs->mapaddr);
673 buf->len = PAGE_SIZE - cs->len; 673 buf->len = PAGE_SIZE - cs->len;
674 } 674 }
675 cs->currbuf = NULL; 675 cs->currbuf = NULL;
676 cs->mapaddr = NULL; 676 cs->mapaddr = NULL;
677 } else if (cs->mapaddr) { 677 } else if (cs->mapaddr) {
678 kunmap(cs->pg); 678 kunmap_atomic(cs->mapaddr);
679 if (cs->write) { 679 if (cs->write) {
680 flush_dcache_page(cs->pg); 680 flush_dcache_page(cs->pg);
681 set_page_dirty_lock(cs->pg); 681 set_page_dirty_lock(cs->pg);
@@ -706,7 +706,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
706 706
707 BUG_ON(!cs->nr_segs); 707 BUG_ON(!cs->nr_segs);
708 cs->currbuf = buf; 708 cs->currbuf = buf;
709 cs->mapaddr = buf->ops->map(cs->pipe, buf, 0); 709 cs->mapaddr = kmap_atomic(buf->page);
710 cs->len = buf->len; 710 cs->len = buf->len;
711 cs->buf = cs->mapaddr + buf->offset; 711 cs->buf = cs->mapaddr + buf->offset;
712 cs->pipebufs++; 712 cs->pipebufs++;
@@ -726,7 +726,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
726 buf->len = 0; 726 buf->len = 0;
727 727
728 cs->currbuf = buf; 728 cs->currbuf = buf;
729 cs->mapaddr = kmap(page); 729 cs->mapaddr = kmap_atomic(page);
730 cs->buf = cs->mapaddr; 730 cs->buf = cs->mapaddr;
731 cs->len = PAGE_SIZE; 731 cs->len = PAGE_SIZE;
732 cs->pipebufs++; 732 cs->pipebufs++;
@@ -745,7 +745,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
745 return err; 745 return err;
746 BUG_ON(err != 1); 746 BUG_ON(err != 1);
747 offset = cs->addr % PAGE_SIZE; 747 offset = cs->addr % PAGE_SIZE;
748 cs->mapaddr = kmap(cs->pg); 748 cs->mapaddr = kmap_atomic(cs->pg);
749 cs->buf = cs->mapaddr + offset; 749 cs->buf = cs->mapaddr + offset;
750 cs->len = min(PAGE_SIZE - offset, cs->seglen); 750 cs->len = min(PAGE_SIZE - offset, cs->seglen);
751 cs->seglen -= cs->len; 751 cs->seglen -= cs->len;
@@ -874,7 +874,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
874out_fallback_unlock: 874out_fallback_unlock:
875 unlock_page(newpage); 875 unlock_page(newpage);
876out_fallback: 876out_fallback:
877 cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); 877 cs->mapaddr = kmap_atomic(buf->page);
878 cs->buf = cs->mapaddr + buf->offset; 878 cs->buf = cs->mapaddr + buf->offset;
879 879
880 err = lock_request(cs->fc, cs->req); 880 err = lock_request(cs->fc, cs->req);
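
All four hunks trade the sleeping kmap()/pipe map() pair for the atomic variants. The contract worth remembering: kunmap_atomic() takes the address returned by kmap_atomic(), not the page, and nothing between the two may sleep. A generic sketch of the pairing (hypothetical helper, needs <linux/highmem.h>):

        static void copy_into_page(struct page *page, const void *src,
                                   size_t off, size_t len)
        {
                char *addr = kmap_atomic(page);

                memcpy(addr + off, src, len);   /* atomic context: no sleeping */
                kunmap_atomic(addr);
                flush_dcache_page(page);
        }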
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 1d1292c581c3..42198359fa1b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -679,6 +679,14 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
679 return create_new_entry(fc, req, dir, entry, S_IFLNK); 679 return create_new_entry(fc, req, dir, entry, S_IFLNK);
680} 680}
681 681
682static inline void fuse_update_ctime(struct inode *inode)
683{
684 if (!IS_NOCMTIME(inode)) {
685 inode->i_ctime = current_fs_time(inode->i_sb);
686 mark_inode_dirty_sync(inode);
687 }
688}
689
682static int fuse_unlink(struct inode *dir, struct dentry *entry) 690static int fuse_unlink(struct inode *dir, struct dentry *entry)
683{ 691{
684 int err; 692 int err;
@@ -713,6 +721,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
713 fuse_invalidate_attr(inode); 721 fuse_invalidate_attr(inode);
714 fuse_invalidate_attr(dir); 722 fuse_invalidate_attr(dir);
715 fuse_invalidate_entry_cache(entry); 723 fuse_invalidate_entry_cache(entry);
724 fuse_update_ctime(inode);
716 } else if (err == -EINTR) 725 } else if (err == -EINTR)
717 fuse_invalidate_entry(entry); 726 fuse_invalidate_entry(entry);
718 return err; 727 return err;
@@ -743,23 +752,26 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
743 return err; 752 return err;
744} 753}
745 754
746static int fuse_rename(struct inode *olddir, struct dentry *oldent, 755static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
747 struct inode *newdir, struct dentry *newent) 756 struct inode *newdir, struct dentry *newent,
757 unsigned int flags, int opcode, size_t argsize)
748{ 758{
749 int err; 759 int err;
750 struct fuse_rename_in inarg; 760 struct fuse_rename2_in inarg;
751 struct fuse_conn *fc = get_fuse_conn(olddir); 761 struct fuse_conn *fc = get_fuse_conn(olddir);
752 struct fuse_req *req = fuse_get_req_nopages(fc); 762 struct fuse_req *req;
753 763
764 req = fuse_get_req_nopages(fc);
754 if (IS_ERR(req)) 765 if (IS_ERR(req))
755 return PTR_ERR(req); 766 return PTR_ERR(req);
756 767
757 memset(&inarg, 0, sizeof(inarg)); 768 memset(&inarg, 0, argsize);
758 inarg.newdir = get_node_id(newdir); 769 inarg.newdir = get_node_id(newdir);
759 req->in.h.opcode = FUSE_RENAME; 770 inarg.flags = flags;
771 req->in.h.opcode = opcode;
760 req->in.h.nodeid = get_node_id(olddir); 772 req->in.h.nodeid = get_node_id(olddir);
761 req->in.numargs = 3; 773 req->in.numargs = 3;
762 req->in.args[0].size = sizeof(inarg); 774 req->in.args[0].size = argsize;
763 req->in.args[0].value = &inarg; 775 req->in.args[0].value = &inarg;
764 req->in.args[1].size = oldent->d_name.len + 1; 776 req->in.args[1].size = oldent->d_name.len + 1;
765 req->in.args[1].value = oldent->d_name.name; 777 req->in.args[1].value = oldent->d_name.name;
@@ -771,15 +783,22 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
771 if (!err) { 783 if (!err) {
772 /* ctime changes */ 784 /* ctime changes */
773 fuse_invalidate_attr(oldent->d_inode); 785 fuse_invalidate_attr(oldent->d_inode);
786 fuse_update_ctime(oldent->d_inode);
787
788 if (flags & RENAME_EXCHANGE) {
789 fuse_invalidate_attr(newent->d_inode);
790 fuse_update_ctime(newent->d_inode);
791 }
774 792
775 fuse_invalidate_attr(olddir); 793 fuse_invalidate_attr(olddir);
776 if (olddir != newdir) 794 if (olddir != newdir)
777 fuse_invalidate_attr(newdir); 795 fuse_invalidate_attr(newdir);
778 796
779 /* newent will end up negative */ 797 /* newent will end up negative */
780 if (newent->d_inode) { 798 if (!(flags & RENAME_EXCHANGE) && newent->d_inode) {
781 fuse_invalidate_attr(newent->d_inode); 799 fuse_invalidate_attr(newent->d_inode);
782 fuse_invalidate_entry_cache(newent); 800 fuse_invalidate_entry_cache(newent);
801 fuse_update_ctime(newent->d_inode);
783 } 802 }
784 } else if (err == -EINTR) { 803 } else if (err == -EINTR) {
785 /* If request was interrupted, DEITY only knows if the 804 /* If request was interrupted, DEITY only knows if the
@@ -795,6 +814,36 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
795 return err; 814 return err;
796} 815}
797 816
817static int fuse_rename(struct inode *olddir, struct dentry *oldent,
818 struct inode *newdir, struct dentry *newent)
819{
820 return fuse_rename_common(olddir, oldent, newdir, newent, 0,
821 FUSE_RENAME, sizeof(struct fuse_rename_in));
822}
823
824static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
825 struct inode *newdir, struct dentry *newent,
826 unsigned int flags)
827{
828 struct fuse_conn *fc = get_fuse_conn(olddir);
829 int err;
830
831 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
832 return -EINVAL;
833
834 if (fc->no_rename2 || fc->minor < 23)
835 return -EINVAL;
836
837 err = fuse_rename_common(olddir, oldent, newdir, newent, flags,
838 FUSE_RENAME2, sizeof(struct fuse_rename2_in));
839 if (err == -ENOSYS) {
840 fc->no_rename2 = 1;
841 err = -EINVAL;
842 }
843 return err;
844
845}
846
798static int fuse_link(struct dentry *entry, struct inode *newdir, 847static int fuse_link(struct dentry *entry, struct inode *newdir,
799 struct dentry *newent) 848 struct dentry *newent)
800{ 849{
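
What fuse_rename2() ultimately serves is the renameat2() syscall; a userspace sketch (raw syscall, since glibc had no wrapper at the time — the SYS_renameat2 macro and the flag fallbacks assume reasonably current headers; paths are illustrative):

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <stdio.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        #ifndef RENAME_NOREPLACE
        #define RENAME_NOREPLACE (1 << 0)       /* fail instead of clobbering */
        #endif
        #ifndef RENAME_EXCHANGE
        #define RENAME_EXCHANGE  (1 << 1)       /* atomically swap two paths  */
        #endif

        int main(void)
        {
                if (syscall(SYS_renameat2, AT_FDCWD, "old",
                            AT_FDCWD, "new", RENAME_NOREPLACE) == -1)
                        perror("renameat2");    /* EEXIST if "new" exists */
                return 0;
        }

Returning -EINVAL (rather than passing -ENOSYS through) when the server lacks FUSE_RENAME2 makes the failure read as "flags unsupported" instead of "syscall unsupported", and fc->no_rename2 caches the probe so later calls skip the round trip.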
@@ -829,6 +878,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
829 inc_nlink(inode); 878 inc_nlink(inode);
830 spin_unlock(&fc->lock); 879 spin_unlock(&fc->lock);
831 fuse_invalidate_attr(inode); 880 fuse_invalidate_attr(inode);
881 fuse_update_ctime(inode);
832 } else if (err == -EINTR) { 882 } else if (err == -EINTR) {
833 fuse_invalidate_attr(inode); 883 fuse_invalidate_attr(inode);
834 } 884 }
@@ -839,6 +889,16 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
839 struct kstat *stat) 889 struct kstat *stat)
840{ 890{
841 unsigned int blkbits; 891 unsigned int blkbits;
892 struct fuse_conn *fc = get_fuse_conn(inode);
893
894 /* see the comment in fuse_change_attributes() */
895 if (fc->writeback_cache && S_ISREG(inode->i_mode)) {
896 attr->size = i_size_read(inode);
897 attr->mtime = inode->i_mtime.tv_sec;
898 attr->mtimensec = inode->i_mtime.tv_nsec;
899 attr->ctime = inode->i_ctime.tv_sec;
900 attr->ctimensec = inode->i_ctime.tv_nsec;
901 }
842 902
843 stat->dev = inode->i_sb->s_dev; 903 stat->dev = inode->i_sb->s_dev;
844 stat->ino = attr->ino; 904 stat->ino = attr->ino;
@@ -1477,12 +1537,16 @@ static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd,
1477 FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); 1537 FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR);
1478} 1538}
1479 1539
1480static bool update_mtime(unsigned ivalid) 1540static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
1481{ 1541{
1482 /* Always update if mtime is explicitly set */ 1542 /* Always update if mtime is explicitly set */
1483 if (ivalid & ATTR_MTIME_SET) 1543 if (ivalid & ATTR_MTIME_SET)
1484 return true; 1544 return true;
1485 1545
1546 /* Or if kernel i_mtime is the official one */
1547 if (trust_local_mtime)
1548 return true;
1549
1486 /* If it's an open(O_TRUNC) or an ftruncate(), don't update */ 1550 /* If it's an open(O_TRUNC) or an ftruncate(), don't update */
1487 if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) 1551 if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE)))
1488 return false; 1552 return false;
@@ -1491,7 +1555,8 @@ static bool update_mtime(unsigned ivalid)
1491 return true; 1555 return true;
1492} 1556}
1493 1557
1494static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) 1558static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
1559 bool trust_local_cmtime)
1495{ 1560{
1496 unsigned ivalid = iattr->ia_valid; 1561 unsigned ivalid = iattr->ia_valid;
1497 1562
@@ -1510,13 +1575,18 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
1510 if (!(ivalid & ATTR_ATIME_SET)) 1575 if (!(ivalid & ATTR_ATIME_SET))
1511 arg->valid |= FATTR_ATIME_NOW; 1576 arg->valid |= FATTR_ATIME_NOW;
1512 } 1577 }
1513 if ((ivalid & ATTR_MTIME) && update_mtime(ivalid)) { 1578 if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) {
1514 arg->valid |= FATTR_MTIME; 1579 arg->valid |= FATTR_MTIME;
1515 arg->mtime = iattr->ia_mtime.tv_sec; 1580 arg->mtime = iattr->ia_mtime.tv_sec;
1516 arg->mtimensec = iattr->ia_mtime.tv_nsec; 1581 arg->mtimensec = iattr->ia_mtime.tv_nsec;
1517 if (!(ivalid & ATTR_MTIME_SET)) 1582 if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime)
1518 arg->valid |= FATTR_MTIME_NOW; 1583 arg->valid |= FATTR_MTIME_NOW;
1519 } 1584 }
1585 if ((ivalid & ATTR_CTIME) && trust_local_cmtime) {
1586 arg->valid |= FATTR_CTIME;
1587 arg->ctime = iattr->ia_ctime.tv_sec;
1588 arg->ctimensec = iattr->ia_ctime.tv_nsec;
1589 }
1520} 1590}
1521 1591
1522/* 1592/*
@@ -1563,6 +1633,62 @@ void fuse_release_nowrite(struct inode *inode)
1563 spin_unlock(&fc->lock); 1633 spin_unlock(&fc->lock);
1564} 1634}
1565 1635
1636static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req,
1637 struct inode *inode,
1638 struct fuse_setattr_in *inarg_p,
1639 struct fuse_attr_out *outarg_p)
1640{
1641 req->in.h.opcode = FUSE_SETATTR;
1642 req->in.h.nodeid = get_node_id(inode);
1643 req->in.numargs = 1;
1644 req->in.args[0].size = sizeof(*inarg_p);
1645 req->in.args[0].value = inarg_p;
1646 req->out.numargs = 1;
1647 if (fc->minor < 9)
1648 req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
1649 else
1650 req->out.args[0].size = sizeof(*outarg_p);
1651 req->out.args[0].value = outarg_p;
1652}
1653
1654/*
1655 * Flush inode->i_mtime to the server
1656 */
1657int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
1658{
1659 struct fuse_conn *fc = get_fuse_conn(inode);
1660 struct fuse_req *req;
1661 struct fuse_setattr_in inarg;
1662 struct fuse_attr_out outarg;
1663 int err;
1664
1665 req = fuse_get_req_nopages(fc);
1666 if (IS_ERR(req))
1667 return PTR_ERR(req);
1668
1669 memset(&inarg, 0, sizeof(inarg));
1670 memset(&outarg, 0, sizeof(outarg));
1671
1672 inarg.valid = FATTR_MTIME;
1673 inarg.mtime = inode->i_mtime.tv_sec;
1674 inarg.mtimensec = inode->i_mtime.tv_nsec;
1675 if (fc->minor >= 23) {
1676 inarg.valid |= FATTR_CTIME;
1677 inarg.ctime = inode->i_ctime.tv_sec;
1678 inarg.ctimensec = inode->i_ctime.tv_nsec;
1679 }
1680 if (ff) {
1681 inarg.valid |= FATTR_FH;
1682 inarg.fh = ff->fh;
1683 }
1684 fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
1685 fuse_request_send(fc, req);
1686 err = req->out.h.error;
1687 fuse_put_request(fc, req);
1688
1689 return err;
1690}
1691
1566/* 1692/*
1567 * Set attributes, and at the same time refresh them. 1693 * Set attributes, and at the same time refresh them.
1568 * 1694 *
@@ -1580,8 +1706,10 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1580 struct fuse_setattr_in inarg; 1706 struct fuse_setattr_in inarg;
1581 struct fuse_attr_out outarg; 1707 struct fuse_attr_out outarg;
1582 bool is_truncate = false; 1708 bool is_truncate = false;
1709 bool is_wb = fc->writeback_cache;
1583 loff_t oldsize; 1710 loff_t oldsize;
1584 int err; 1711 int err;
1712 bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
1585 1713
1586 if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) 1714 if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
1587 attr->ia_valid |= ATTR_FORCE; 1715 attr->ia_valid |= ATTR_FORCE;
@@ -1606,11 +1734,13 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1606 if (is_truncate) { 1734 if (is_truncate) {
1607 fuse_set_nowrite(inode); 1735 fuse_set_nowrite(inode);
1608 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 1736 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
1737 if (trust_local_cmtime && attr->ia_size != inode->i_size)
1738 attr->ia_valid |= ATTR_MTIME | ATTR_CTIME;
1609 } 1739 }
1610 1740
1611 memset(&inarg, 0, sizeof(inarg)); 1741 memset(&inarg, 0, sizeof(inarg));
1612 memset(&outarg, 0, sizeof(outarg)); 1742 memset(&outarg, 0, sizeof(outarg));
1613 iattr_to_fattr(attr, &inarg); 1743 iattr_to_fattr(attr, &inarg, trust_local_cmtime);
1614 if (file) { 1744 if (file) {
1615 struct fuse_file *ff = file->private_data; 1745 struct fuse_file *ff = file->private_data;
1616 inarg.valid |= FATTR_FH; 1746 inarg.valid |= FATTR_FH;
@@ -1621,17 +1751,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1621 inarg.valid |= FATTR_LOCKOWNER; 1751 inarg.valid |= FATTR_LOCKOWNER;
1622 inarg.lock_owner = fuse_lock_owner_id(fc, current->files); 1752 inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
1623 } 1753 }
1624 req->in.h.opcode = FUSE_SETATTR; 1754 fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
1625 req->in.h.nodeid = get_node_id(inode);
1626 req->in.numargs = 1;
1627 req->in.args[0].size = sizeof(inarg);
1628 req->in.args[0].value = &inarg;
1629 req->out.numargs = 1;
1630 if (fc->minor < 9)
1631 req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
1632 else
1633 req->out.args[0].size = sizeof(outarg);
1634 req->out.args[0].value = &outarg;
1635 fuse_request_send(fc, req); 1755 fuse_request_send(fc, req);
1636 err = req->out.h.error; 1756 err = req->out.h.error;
1637 fuse_put_request(fc, req); 1757 fuse_put_request(fc, req);
@@ -1648,10 +1768,21 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1648 } 1768 }
1649 1769
1650 spin_lock(&fc->lock); 1770 spin_lock(&fc->lock);
1771 /* the kernel maintains i_mtime locally */
1772 if (trust_local_cmtime) {
1773 if (attr->ia_valid & ATTR_MTIME)
1774 inode->i_mtime = attr->ia_mtime;
1775 if (attr->ia_valid & ATTR_CTIME)
1776 inode->i_ctime = attr->ia_ctime;
1777 /* FIXME: clear I_DIRTY_SYNC? */
1778 }
1779
1651 fuse_change_attributes_common(inode, &outarg.attr, 1780 fuse_change_attributes_common(inode, &outarg.attr,
1652 attr_timeout(&outarg)); 1781 attr_timeout(&outarg));
1653 oldsize = inode->i_size; 1782 oldsize = inode->i_size;
1654 i_size_write(inode, outarg.attr.size); 1783 /* see the comment in fuse_change_attributes() */
1784 if (!is_wb || is_truncate || !S_ISREG(inode->i_mode))
1785 i_size_write(inode, outarg.attr.size);
1655 1786
1656 if (is_truncate) { 1787 if (is_truncate) {
1657 /* NOTE: this may release/reacquire fc->lock */ 1788 /* NOTE: this may release/reacquire fc->lock */
@@ -1663,7 +1794,8 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
1663 * Only call invalidate_inode_pages2() after removing 1794 * Only call invalidate_inode_pages2() after removing
1664 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. 1795 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
1665 */ 1796 */
1666 if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { 1797 if ((is_truncate || !is_wb) &&
1798 S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
1667 truncate_pagecache(inode, outarg.attr.size); 1799 truncate_pagecache(inode, outarg.attr.size);
1668 invalidate_inode_pages2(inode->i_mapping); 1800 invalidate_inode_pages2(inode->i_mapping);
1669 } 1801 }
@@ -1739,8 +1871,10 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
1739 fc->no_setxattr = 1; 1871 fc->no_setxattr = 1;
1740 err = -EOPNOTSUPP; 1872 err = -EOPNOTSUPP;
1741 } 1873 }
1742 if (!err) 1874 if (!err) {
1743 fuse_invalidate_attr(inode); 1875 fuse_invalidate_attr(inode);
1876 fuse_update_ctime(inode);
1877 }
1744 return err; 1878 return err;
1745} 1879}
1746 1880
@@ -1870,8 +2004,10 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
1870 fc->no_removexattr = 1; 2004 fc->no_removexattr = 1;
1871 err = -EOPNOTSUPP; 2005 err = -EOPNOTSUPP;
1872 } 2006 }
1873 if (!err) 2007 if (!err) {
1874 fuse_invalidate_attr(inode); 2008 fuse_invalidate_attr(inode);
2009 fuse_update_ctime(inode);
2010 }
1875 return err; 2011 return err;
1876} 2012}
1877 2013
@@ -1882,6 +2018,7 @@ static const struct inode_operations fuse_dir_inode_operations = {
1882 .unlink = fuse_unlink, 2018 .unlink = fuse_unlink,
1883 .rmdir = fuse_rmdir, 2019 .rmdir = fuse_rmdir,
1884 .rename = fuse_rename, 2020 .rename = fuse_rename,
2021 .rename2 = fuse_rename2,
1885 .link = fuse_link, 2022 .link = fuse_link,
1886 .setattr = fuse_setattr, 2023 .setattr = fuse_setattr,
1887 .create = fuse_create, 2024 .create = fuse_create,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 77bcc303c3ae..96d513e01a5d 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -188,6 +188,22 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
188} 188}
189EXPORT_SYMBOL_GPL(fuse_do_open); 189EXPORT_SYMBOL_GPL(fuse_do_open);
190 190
191static void fuse_link_write_file(struct file *file)
192{
193 struct inode *inode = file_inode(file);
194 struct fuse_conn *fc = get_fuse_conn(inode);
195 struct fuse_inode *fi = get_fuse_inode(inode);
196 struct fuse_file *ff = file->private_data;
197 /*
 198 * The file may be written through mmap, so chain it onto the
 199 * inode's write_files list.
200 */
201 spin_lock(&fc->lock);
202 if (list_empty(&ff->write_entry))
203 list_add(&ff->write_entry, &fi->write_files);
204 spin_unlock(&fc->lock);
205}
206
191void fuse_finish_open(struct inode *inode, struct file *file) 207void fuse_finish_open(struct inode *inode, struct file *file)
192{ 208{
193 struct fuse_file *ff = file->private_data; 209 struct fuse_file *ff = file->private_data;
@@ -207,25 +223,37 @@ void fuse_finish_open(struct inode *inode, struct file *file)
207 i_size_write(inode, 0); 223 i_size_write(inode, 0);
208 spin_unlock(&fc->lock); 224 spin_unlock(&fc->lock);
209 fuse_invalidate_attr(inode); 225 fuse_invalidate_attr(inode);
226 if (fc->writeback_cache)
227 file_update_time(file);
210 } 228 }
229 if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
230 fuse_link_write_file(file);
211} 231}
212 232
213int fuse_open_common(struct inode *inode, struct file *file, bool isdir) 233int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
214{ 234{
215 struct fuse_conn *fc = get_fuse_conn(inode); 235 struct fuse_conn *fc = get_fuse_conn(inode);
216 int err; 236 int err;
237 bool lock_inode = (file->f_flags & O_TRUNC) &&
238 fc->atomic_o_trunc &&
239 fc->writeback_cache;
217 240
218 err = generic_file_open(inode, file); 241 err = generic_file_open(inode, file);
219 if (err) 242 if (err)
220 return err; 243 return err;
221 244
245 if (lock_inode)
246 mutex_lock(&inode->i_mutex);
247
222 err = fuse_do_open(fc, get_node_id(inode), file, isdir); 248 err = fuse_do_open(fc, get_node_id(inode), file, isdir);
223 if (err)
224 return err;
225 249
226 fuse_finish_open(inode, file); 250 if (!err)
251 fuse_finish_open(inode, file);
227 252
228 return 0; 253 if (lock_inode)
254 mutex_unlock(&inode->i_mutex);
255
256 return err;
229} 257}
230 258
231static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) 259static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
@@ -292,6 +320,12 @@ static int fuse_open(struct inode *inode, struct file *file)
292 320
293static int fuse_release(struct inode *inode, struct file *file) 321static int fuse_release(struct inode *inode, struct file *file)
294{ 322{
323 struct fuse_conn *fc = get_fuse_conn(inode);
324
325 /* see fuse_vma_close() for !writeback_cache case */
326 if (fc->writeback_cache)
327 write_inode_now(inode, 1);
328
295 fuse_release_common(file, FUSE_RELEASE); 329 fuse_release_common(file, FUSE_RELEASE);
296 330
297 /* return value is ignored by VFS */ 331 /* return value is ignored by VFS */
@@ -333,12 +367,13 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
333} 367}
334 368
335/* 369/*
336 * Check if page is under writeback 370 * Check if any page in a range is under writeback
337 * 371 *
338 * This is currently done by walking the list of writepage requests 372 * This is currently done by walking the list of writepage requests
339 * for the inode, which can be pretty inefficient. 373 * for the inode, which can be pretty inefficient.
340 */ 374 */
341static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) 375static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
376 pgoff_t idx_to)
342{ 377{
343 struct fuse_conn *fc = get_fuse_conn(inode); 378 struct fuse_conn *fc = get_fuse_conn(inode);
344 struct fuse_inode *fi = get_fuse_inode(inode); 379 struct fuse_inode *fi = get_fuse_inode(inode);
@@ -351,8 +386,8 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
351 386
352 BUG_ON(req->inode != inode); 387 BUG_ON(req->inode != inode);
353 curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; 388 curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
354 if (curr_index <= index && 389 if (idx_from < curr_index + req->num_pages &&
355 index < curr_index + req->num_pages) { 390 curr_index <= idx_to) {
356 found = true; 391 found = true;
357 break; 392 break;
358 } 393 }
@@ -362,6 +397,11 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
362 return found; 397 return found;
363} 398}
364 399
400static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
401{
402 return fuse_range_is_writeback(inode, index, index);
403}
404
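
The new predicate is the standard interval-overlap test: a request covering pages [curr_index, curr_index + num_pages) intersects the query [idx_from, idx_to] exactly when idx_from < curr_index + num_pages and curr_index <= idx_to. A quick standalone check (invented values):

        #include <assert.h>
        #include <stdbool.h>

        static bool overlaps(unsigned long start, unsigned long len,
                             unsigned long from, unsigned long to)
        {
                return from < start + len && start <= to;
        }

        int main(void)
        {
                assert(overlaps(10, 4, 13, 13));        /* last page of the request */
                assert(!overlaps(10, 4, 14, 20));       /* begins just past it      */
                assert(overlaps(10, 4, 0, 10));         /* touches the first page   */
                return 0;
        }

With idx_from == idx_to this degenerates to the old single-page test, which is what the fuse_page_is_writeback() wrapper relies on.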
365/* 405/*
366 * Wait for page writeback to be completed. 406 * Wait for page writeback to be completed.
367 * 407 *
@@ -376,6 +416,21 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
376 return 0; 416 return 0;
377} 417}
378 418
419/*
420 * Wait for all pending writepages on the inode to finish.
421 *
422 * This is currently done by blocking further writes with FUSE_NOWRITE
423 * and waiting for all sent writes to complete.
424 *
425 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
426 * could conflict with truncation.
427 */
428static void fuse_sync_writes(struct inode *inode)
429{
430 fuse_set_nowrite(inode);
431 fuse_release_nowrite(inode);
432}
433
379static int fuse_flush(struct file *file, fl_owner_t id) 434static int fuse_flush(struct file *file, fl_owner_t id)
380{ 435{
381 struct inode *inode = file_inode(file); 436 struct inode *inode = file_inode(file);
@@ -391,6 +446,14 @@ static int fuse_flush(struct file *file, fl_owner_t id)
391 if (fc->no_flush) 446 if (fc->no_flush)
392 return 0; 447 return 0;
393 448
449 err = write_inode_now(inode, 1);
450 if (err)
451 return err;
452
453 mutex_lock(&inode->i_mutex);
454 fuse_sync_writes(inode);
455 mutex_unlock(&inode->i_mutex);
456
394 req = fuse_get_req_nofail_nopages(fc, file); 457 req = fuse_get_req_nofail_nopages(fc, file);
395 memset(&inarg, 0, sizeof(inarg)); 458 memset(&inarg, 0, sizeof(inarg));
396 inarg.fh = ff->fh; 459 inarg.fh = ff->fh;
@@ -411,21 +474,6 @@ static int fuse_flush(struct file *file, fl_owner_t id)
411 return err; 474 return err;
412} 475}
413 476
414/*
415 * Wait for all pending writepages on the inode to finish.
416 *
417 * This is currently done by blocking further writes with FUSE_NOWRITE
418 * and waiting for all sent writes to complete.
419 *
420 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
421 * could conflict with truncation.
422 */
423static void fuse_sync_writes(struct inode *inode)
424{
425 fuse_set_nowrite(inode);
426 fuse_release_nowrite(inode);
427}
428
429int fuse_fsync_common(struct file *file, loff_t start, loff_t end, 477int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
430 int datasync, int isdir) 478 int datasync, int isdir)
431{ 479{
@@ -439,13 +487,6 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
439 if (is_bad_inode(inode)) 487 if (is_bad_inode(inode))
440 return -EIO; 488 return -EIO;
441 489
442 err = filemap_write_and_wait_range(inode->i_mapping, start, end);
443 if (err)
444 return err;
445
446 if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
447 return 0;
448
449 mutex_lock(&inode->i_mutex); 490 mutex_lock(&inode->i_mutex);
450 491
451 /* 492 /*
@@ -453,11 +494,17 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
453 * wait for all outstanding writes, before sending the FSYNC 494 * wait for all outstanding writes, before sending the FSYNC
454 * request. 495 * request.
455 */ 496 */
456 err = write_inode_now(inode, 0); 497 err = filemap_write_and_wait_range(inode->i_mapping, start, end);
457 if (err) 498 if (err)
458 goto out; 499 goto out;
459 500
460 fuse_sync_writes(inode); 501 fuse_sync_writes(inode);
502 err = sync_inode_metadata(inode, 1);
503 if (err)
504 goto out;
505
506 if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
507 goto out;
461 508
462 req = fuse_get_req_nopages(fc); 509 req = fuse_get_req_nopages(fc);
463 if (IS_ERR(req)) { 510 if (IS_ERR(req)) {
@@ -655,7 +702,33 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
655 spin_unlock(&fc->lock); 702 spin_unlock(&fc->lock);
656} 703}
657 704
658static int fuse_readpage(struct file *file, struct page *page) 705static void fuse_short_read(struct fuse_req *req, struct inode *inode,
706 u64 attr_ver)
707{
708 size_t num_read = req->out.args[0].size;
709 struct fuse_conn *fc = get_fuse_conn(inode);
710
711 if (fc->writeback_cache) {
712 /*
 713 * A hole in the file. Some data after the hole is already in the
 714 * page cache but has not reached the client filesystem yet, so the
 715 * hole is not present there.
716 */
717 int i;
718 int start_idx = num_read >> PAGE_CACHE_SHIFT;
719 size_t off = num_read & (PAGE_CACHE_SIZE - 1);
720
721 for (i = start_idx; i < req->num_pages; i++) {
722 zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE);
723 off = 0;
724 }
725 } else {
726 loff_t pos = page_offset(req->pages[0]) + num_read;
727 fuse_read_update_size(inode, pos, attr_ver);
728 }
729}
730
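A worked example of the writeback_cache branch, assuming 4 KiB pages: a three-page read that comes back with num_read = 5000 gives start_idx = 5000 >> 12 = 1 and off = 5000 & 4095 = 904, so page 1 is zeroed from byte 904 onward and page 2 in full. The cache then shows the hole as zeroes instead of stale data, while the !writeback_cache case keeps the old behaviour of treating the short read as EOF and shrinking i_size.
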
731static int fuse_do_readpage(struct file *file, struct page *page)
659{ 732{
660 struct fuse_io_priv io = { .async = 0, .file = file }; 733 struct fuse_io_priv io = { .async = 0, .file = file };
661 struct inode *inode = page->mapping->host; 734 struct inode *inode = page->mapping->host;
@@ -667,10 +740,6 @@ static int fuse_readpage(struct file *file, struct page *page)
667 u64 attr_ver; 740 u64 attr_ver;
668 int err; 741 int err;
669 742
670 err = -EIO;
671 if (is_bad_inode(inode))
672 goto out;
673
674 /* 743 /*
675 * Page writeback can extend beyond the lifetime of the 744 * Page writeback can extend beyond the lifetime of the
676 * page-cache page, so make sure we read a properly synced 745 * page-cache page, so make sure we read a properly synced
@@ -679,9 +748,8 @@ static int fuse_readpage(struct file *file, struct page *page)
679 fuse_wait_on_page_writeback(inode, page->index); 748 fuse_wait_on_page_writeback(inode, page->index);
680 749
681 req = fuse_get_req(fc, 1); 750 req = fuse_get_req(fc, 1);
682 err = PTR_ERR(req);
683 if (IS_ERR(req)) 751 if (IS_ERR(req))
684 goto out; 752 return PTR_ERR(req);
685 753
686 attr_ver = fuse_get_attr_version(fc); 754 attr_ver = fuse_get_attr_version(fc);
687 755
@@ -692,18 +760,32 @@ static int fuse_readpage(struct file *file, struct page *page)
692 req->page_descs[0].length = count; 760 req->page_descs[0].length = count;
693 num_read = fuse_send_read(req, &io, pos, count, NULL); 761 num_read = fuse_send_read(req, &io, pos, count, NULL);
694 err = req->out.h.error; 762 err = req->out.h.error;
695 fuse_put_request(fc, req);
696 763
697 if (!err) { 764 if (!err) {
698 /* 765 /*
699 * Short read means EOF. If file size is larger, truncate it 766 * Short read means EOF. If file size is larger, truncate it
700 */ 767 */
701 if (num_read < count) 768 if (num_read < count)
702 fuse_read_update_size(inode, pos + num_read, attr_ver); 769 fuse_short_read(req, inode, attr_ver);
703 770
704 SetPageUptodate(page); 771 SetPageUptodate(page);
705 } 772 }
706 773
774 fuse_put_request(fc, req);
775
776 return err;
777}
778
779static int fuse_readpage(struct file *file, struct page *page)
780{
781 struct inode *inode = page->mapping->host;
782 int err;
783
784 err = -EIO;
785 if (is_bad_inode(inode))
786 goto out;
787
788 err = fuse_do_readpage(file, page);
707 fuse_invalidate_atime(inode); 789 fuse_invalidate_atime(inode);
708 out: 790 out:
709 unlock_page(page); 791 unlock_page(page);
@@ -726,13 +808,9 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
726 /* 808 /*
727 * Short read means EOF. If file size is larger, truncate it 809 * Short read means EOF. If file size is larger, truncate it
728 */ 810 */
729 if (!req->out.h.error && num_read < count) { 811 if (!req->out.h.error && num_read < count)
730 loff_t pos; 812 fuse_short_read(req, inode, req->misc.read.attr_ver);
731 813
732 pos = page_offset(req->pages[0]) + num_read;
733 fuse_read_update_size(inode, pos,
734 req->misc.read.attr_ver);
735 }
736 fuse_invalidate_atime(inode); 814 fuse_invalidate_atime(inode);
737 } 815 }
738 816
@@ -922,16 +1000,21 @@ static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
922 return req->misc.write.out.size; 1000 return req->misc.write.out.size;
923} 1001}
924 1002
925void fuse_write_update_size(struct inode *inode, loff_t pos) 1003bool fuse_write_update_size(struct inode *inode, loff_t pos)
926{ 1004{
927 struct fuse_conn *fc = get_fuse_conn(inode); 1005 struct fuse_conn *fc = get_fuse_conn(inode);
928 struct fuse_inode *fi = get_fuse_inode(inode); 1006 struct fuse_inode *fi = get_fuse_inode(inode);
1007 bool ret = false;
929 1008
930 spin_lock(&fc->lock); 1009 spin_lock(&fc->lock);
931 fi->attr_version = ++fc->attr_version; 1010 fi->attr_version = ++fc->attr_version;
932 if (pos > inode->i_size) 1011 if (pos > inode->i_size) {
933 i_size_write(inode, pos); 1012 i_size_write(inode, pos);
1013 ret = true;
1014 }
934 spin_unlock(&fc->lock); 1015 spin_unlock(&fc->lock);
1016
1017 return ret;
935} 1018}
936 1019
937static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, 1020static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
@@ -1003,9 +1086,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
1003 if (mapping_writably_mapped(mapping)) 1086 if (mapping_writably_mapped(mapping))
1004 flush_dcache_page(page); 1087 flush_dcache_page(page);
1005 1088
1006 pagefault_disable();
1007 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); 1089 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
1008 pagefault_enable();
1009 flush_dcache_page(page); 1090 flush_dcache_page(page);
1010 1091
1011 mark_page_accessed(page); 1092 mark_page_accessed(page);
@@ -1116,6 +1197,15 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1116 struct iov_iter i; 1197 struct iov_iter i;
1117 loff_t endbyte = 0; 1198 loff_t endbyte = 0;
1118 1199
1200 if (get_fuse_conn(inode)->writeback_cache) {
1201 /* Update size (EOF optimization) and mode (SUID clearing) */
1202 err = fuse_update_attributes(mapping->host, NULL, file, NULL);
1203 if (err)
1204 return err;
1205
1206 return generic_file_aio_write(iocb, iov, nr_segs, pos);
1207 }
1208
1119 WARN_ON(iocb->ki_pos != pos); 1209 WARN_ON(iocb->ki_pos != pos);
1120 1210
1121 ocount = 0; 1211 ocount = 0;
@@ -1145,8 +1235,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1145 goto out; 1235 goto out;
1146 1236
1147 if (file->f_flags & O_DIRECT) { 1237 if (file->f_flags & O_DIRECT) {
1148 written = generic_file_direct_write(iocb, iov, &nr_segs, 1238 written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
1149 pos, &iocb->ki_pos,
1150 count, ocount); 1239 count, ocount);
1151 if (written < 0 || written == count) 1240 if (written < 0 || written == count)
1152 goto out; 1241 goto out;
@@ -1289,13 +1378,18 @@ static inline int fuse_iter_npages(const struct iov_iter *ii_p)
1289 1378
1290ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, 1379ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
1291 unsigned long nr_segs, size_t count, loff_t *ppos, 1380 unsigned long nr_segs, size_t count, loff_t *ppos,
1292 int write) 1381 int flags)
1293{ 1382{
1383 int write = flags & FUSE_DIO_WRITE;
1384 int cuse = flags & FUSE_DIO_CUSE;
1294 struct file *file = io->file; 1385 struct file *file = io->file;
1386 struct inode *inode = file->f_mapping->host;
1295 struct fuse_file *ff = file->private_data; 1387 struct fuse_file *ff = file->private_data;
1296 struct fuse_conn *fc = ff->fc; 1388 struct fuse_conn *fc = ff->fc;
1297 size_t nmax = write ? fc->max_write : fc->max_read; 1389 size_t nmax = write ? fc->max_write : fc->max_read;
1298 loff_t pos = *ppos; 1390 loff_t pos = *ppos;
1391 pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT;
1392 pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
1299 ssize_t res = 0; 1393 ssize_t res = 0;
1300 struct fuse_req *req; 1394 struct fuse_req *req;
1301 struct iov_iter ii; 1395 struct iov_iter ii;
@@ -1309,6 +1403,14 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
1309 if (IS_ERR(req)) 1403 if (IS_ERR(req))
1310 return PTR_ERR(req); 1404 return PTR_ERR(req);
1311 1405
1406 if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
1407 if (!write)
1408 mutex_lock(&inode->i_mutex);
1409 fuse_sync_writes(inode);
1410 if (!write)
1411 mutex_unlock(&inode->i_mutex);
1412 }
1413
1312 while (count) { 1414 while (count) {
1313 size_t nres; 1415 size_t nres;
1314 fl_owner_t owner = current->files; 1416 fl_owner_t owner = current->files;
@@ -1397,7 +1499,8 @@ static ssize_t __fuse_direct_write(struct fuse_io_priv *io,
1397 1499
1398 res = generic_write_checks(file, ppos, &count, 0); 1500 res = generic_write_checks(file, ppos, &count, 0);
1399 if (!res) 1501 if (!res)
1400 res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1); 1502 res = fuse_direct_io(io, iov, nr_segs, count, ppos,
1503 FUSE_DIO_WRITE);
1401 1504
1402 fuse_invalidate_attr(inode); 1505 fuse_invalidate_attr(inode);
1403 1506
@@ -1556,13 +1659,13 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
1556 fuse_writepage_free(fc, req); 1659 fuse_writepage_free(fc, req);
1557} 1660}
1558 1661
1559static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, 1662static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
1560 struct fuse_inode *fi) 1663 struct fuse_inode *fi)
1561{ 1664{
1562 struct fuse_file *ff = NULL; 1665 struct fuse_file *ff = NULL;
1563 1666
1564 spin_lock(&fc->lock); 1667 spin_lock(&fc->lock);
1565 if (!WARN_ON(list_empty(&fi->write_files))) { 1668 if (!list_empty(&fi->write_files)) {
1566 ff = list_entry(fi->write_files.next, struct fuse_file, 1669 ff = list_entry(fi->write_files.next, struct fuse_file,
1567 write_entry); 1670 write_entry);
1568 fuse_file_get(ff); 1671 fuse_file_get(ff);
@@ -1572,6 +1675,29 @@ static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
1572 return ff; 1675 return ff;
1573} 1676}
1574 1677
1678static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
1679 struct fuse_inode *fi)
1680{
1681 struct fuse_file *ff = __fuse_write_file_get(fc, fi);
1682 WARN_ON(!ff);
1683 return ff;
1684}
1685
1686int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
1687{
1688 struct fuse_conn *fc = get_fuse_conn(inode);
1689 struct fuse_inode *fi = get_fuse_inode(inode);
1690 struct fuse_file *ff;
1691 int err;
1692
1693 ff = __fuse_write_file_get(fc, fi);
1694 err = fuse_flush_times(inode, ff);
1695 if (ff)
1696 fuse_file_put(ff, 0);
1697
1698 return err;
1699}
1700
1575static int fuse_writepage_locked(struct page *page) 1701static int fuse_writepage_locked(struct page *page)
1576{ 1702{
1577 struct address_space *mapping = page->mapping; 1703 struct address_space *mapping = page->mapping;
@@ -1885,6 +2011,77 @@ out:
1885 return err; 2011 return err;
1886} 2012}
1887 2013
+/*
+ * It would be worthwhile to make sure that space is reserved on disk for
+ * the write, but how to implement it without killing performance needs
+ * more thinking.
+ */
+static int fuse_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	struct fuse_conn *fc = get_fuse_conn(file->f_dentry->d_inode);
+	struct page *page;
+	loff_t fsize;
+	int err = -ENOMEM;
+
+	WARN_ON(!fc->writeback_cache);
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		goto error;
+
+	fuse_wait_on_page_writeback(mapping->host, page->index);
+
+	if (PageUptodate(page) || len == PAGE_CACHE_SIZE)
+		goto success;
+	/*
+	 * Check if the start of this page comes after the end of file,
+	 * in which case the readpage can be optimized away.
+	 */
+	fsize = i_size_read(mapping->host);
+	if (fsize <= (pos & PAGE_CACHE_MASK)) {
+		size_t off = pos & ~PAGE_CACHE_MASK;
+		if (off)
+			zero_user_segment(page, 0, off);
+		goto success;
+	}
+	err = fuse_do_readpage(file, page);
+	if (err)
+		goto cleanup;
+success:
+	*pagep = page;
+	return 0;
+
+cleanup:
+	unlock_page(page);
+	page_cache_release(page);
+error:
+	return err;
+}
+
+static int fuse_write_end(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned copied,
+		struct page *page, void *fsdata)
+{
+	struct inode *inode = page->mapping->host;
+
+	if (!PageUptodate(page)) {
+		/* Zero any unwritten bytes at the end of the page */
+		size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK;
+		if (endoff)
+			zero_user_segment(page, endoff, PAGE_CACHE_SIZE);
+		SetPageUptodate(page);
+	}
+
+	fuse_write_update_size(inode, pos + copied);
+	set_page_dirty(page);
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+
 static int fuse_launder_page(struct page *page)
 {
 	int err = 0;
@@ -1940,26 +2137,16 @@ static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 static const struct vm_operations_struct fuse_file_vm_ops = {
 	.close		= fuse_vma_close,
 	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= fuse_page_mkwrite,
 	.remap_pages	= generic_file_remap_pages,
 };
 
 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
-		struct inode *inode = file_inode(file);
-		struct fuse_conn *fc = get_fuse_conn(inode);
-		struct fuse_inode *fi = get_fuse_inode(inode);
-		struct fuse_file *ff = file->private_data;
-		/*
-		 * file may be written through mmap, so chain it onto the
-		 * inodes's write_file list
-		 */
-		spin_lock(&fc->lock);
-		if (list_empty(&ff->write_entry))
-			list_add(&ff->write_entry, &fi->write_files);
-		spin_unlock(&fc->lock);
-	}
+	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+		fuse_link_write_file(file);
+
 	file_accessed(file);
 	vma->vm_ops = &fuse_file_vm_ops;
 	return 0;
@@ -2606,7 +2793,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
 {
 	spin_lock(&fc->lock);
 	if (RB_EMPTY_NODE(&ff->polled_node)) {
-		struct rb_node **link, *parent;
+		struct rb_node **link, *uninitialized_var(parent);
 
 		link = fuse_find_polled_node(fc, ff->kh, &parent);
 		BUG_ON(*link);
@@ -2808,6 +2995,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
 			   (mode & FALLOC_FL_PUNCH_HOLE);
 
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
 	if (fc->no_fallocate)
 		return -EOPNOTSUPP;
 
@@ -2850,8 +3040,12 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 		goto out;
 
 	/* we could have extended the file */
-	if (!(mode & FALLOC_FL_KEEP_SIZE))
-		fuse_write_update_size(inode, offset + length);
+	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+		bool changed = fuse_write_update_size(inode, offset + length);
+
+		if (changed && fc->writeback_cache)
+			file_update_time(file);
+	}
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
 		truncate_pagecache_range(inode, offset, offset + length - 1);
@@ -2915,6 +3109,8 @@ static const struct address_space_operations fuse_file_aops = {
 	.set_page_dirty	= __set_page_dirty_nobuffers,
 	.bmap		= fuse_bmap,
 	.direct_IO	= fuse_direct_IO,
+	.write_begin	= fuse_write_begin,
+	.write_end	= fuse_write_end,
 };
 
 void fuse_init_file_inode(struct inode *inode)
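
The fuse_short_read() helper introduced above is mostly index arithmetic: everything past the bytes that actually arrived is zeroed, starting with a partial page and continuing with whole pages. A minimal userspace model of that arithmetic (not kernel code; it assumes 4 KiB pages, and zero_tail() is a hypothetical stand-in for the zero_user_segment() loop):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096u

/* Zero everything past num_read in an array of page-sized buffers,
 * the way fuse_short_read() does: partial page first, whole pages after. */
static void zero_tail(unsigned char pages[][PAGE_SIZE], size_t num_pages,
		      size_t num_read)
{
	size_t start_idx = num_read / PAGE_SIZE; /* num_read >> PAGE_CACHE_SHIFT */
	size_t off = num_read % PAGE_SIZE;       /* num_read & (PAGE_CACHE_SIZE - 1) */
	size_t i;

	for (i = start_idx; i < num_pages; i++) {
		memset(pages[i] + off, 0, PAGE_SIZE - off);
		off = 0;	/* only the first affected page is partial */
	}
}

int main(void)
{
	static unsigned char pages[3][PAGE_SIZE];

	memset(pages, 0xff, sizeof(pages));
	zero_tail(pages, 3, 5000);	/* short read: 5000 of 12288 bytes arrived */
	/* page 1 is zeroed from offset 904 (5000 % 4096), page 2 entirely */
	printf("pages[1][903]=%u pages[1][904]=%u pages[2][0]=%u\n",
	       pages[1][903], pages[1][904], pages[2][0]);
	return 0;
}
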
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 2da5db2c8bdb..7aa5c75e0de1 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -480,6 +480,9 @@ struct fuse_conn {
 	/** Set if bdi is valid */
 	unsigned bdi_initialized:1;
 
+	/** write-back cache policy (default is write-through) */
+	unsigned writeback_cache:1;
+
 	/*
 	 * The following bitfields are only for optimization purposes
 	 * and hence races in setting them will not cause malfunction
@@ -539,6 +542,9 @@ struct fuse_conn {
 	/** Is fallocate not implemented by fs? */
 	unsigned no_fallocate:1;
 
+	/** Is rename with flags implemented by fs? */
+	unsigned no_rename2:1;
+
 	/** Use enhanced/automatic page cache invalidation. */
 	unsigned auto_inval_data:1;
 
@@ -720,7 +726,7 @@ int fuse_dev_init(void);
 void fuse_dev_cleanup(void);
 
 int fuse_ctl_init(void);
-void fuse_ctl_cleanup(void);
+void __exit fuse_ctl_cleanup(void);
 
 /**
  * Allocate a request
@@ -863,9 +869,20 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
 
 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 		 bool isdir);
+
+/**
+ * fuse_direct_io() flags
+ */
+
+/** If set, it is WRITE; otherwise - READ */
+#define FUSE_DIO_WRITE (1 << 0)
+
+/** CUSE passes fuse_direct_io() a file whose f_mapping->host is not from FUSE */
+#define FUSE_DIO_CUSE  (1 << 1)
+
 ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 		       unsigned long nr_segs, size_t count, loff_t *ppos,
-		       int write);
+		       int flags);
 long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 		   unsigned int flags);
 long fuse_ioctl_common(struct file *file, unsigned int cmd,
@@ -873,7 +890,10 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd,
 unsigned fuse_file_poll(struct file *file, poll_table *wait);
 int fuse_dev_release(struct inode *inode, struct file *file);
 
-void fuse_write_update_size(struct inode *inode, loff_t pos);
+bool fuse_write_update_size(struct inode *inode, loff_t pos);
+
+int fuse_flush_times(struct inode *inode, struct fuse_file *ff);
+int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
 
 int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 		    struct file *file);
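
The FUSE_DIO_* constants above replace the old bare int write parameter of fuse_direct_io() with independent bits. A small standalone sketch of the convention (the decode mirrors what fuse_direct_io() now does; the function body here is purely illustrative):

#include <stdio.h>

#define FUSE_DIO_WRITE (1 << 0)	/* carry out a write, otherwise a read */
#define FUSE_DIO_CUSE  (1 << 1)	/* file's f_mapping->host is not from FUSE */

static void direct_io(int flags)
{
	int write = flags & FUSE_DIO_WRITE;
	int cuse = flags & FUSE_DIO_CUSE;

	printf("write=%d cuse=%d\n", !!write, !!cuse);
}

int main(void)
{
	direct_io(0);				   /* plain read */
	direct_io(FUSE_DIO_WRITE);		   /* plain write */
	direct_io(FUSE_DIO_WRITE | FUSE_DIO_CUSE); /* write on a CUSE file */
	return 0;
}
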
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index d468643a68b2..754dcf23de8a 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -123,7 +123,7 @@ static void fuse_destroy_inode(struct inode *inode)
 
 static void fuse_evict_inode(struct inode *inode)
 {
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 	if (inode->i_sb->s_flags & MS_ACTIVE) {
 		struct fuse_conn *fc = get_fuse_conn(inode);
@@ -135,6 +135,7 @@ static void fuse_evict_inode(struct inode *inode)
 
 static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
 {
+	sync_filesystem(sb);
 	if (*flags & MS_MANDLOCK)
 		return -EINVAL;
 
@@ -170,10 +171,13 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
 	inode->i_blocks  = attr->blocks;
 	inode->i_atime.tv_sec   = attr->atime;
 	inode->i_atime.tv_nsec  = attr->atimensec;
-	inode->i_mtime.tv_sec   = attr->mtime;
-	inode->i_mtime.tv_nsec  = attr->mtimensec;
-	inode->i_ctime.tv_sec   = attr->ctime;
-	inode->i_ctime.tv_nsec  = attr->ctimensec;
+	/* mtime from server may be stale due to local buffered write */
+	if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) {
+		inode->i_mtime.tv_sec   = attr->mtime;
+		inode->i_mtime.tv_nsec  = attr->mtimensec;
+		inode->i_ctime.tv_sec   = attr->ctime;
+		inode->i_ctime.tv_nsec  = attr->ctimensec;
+	}
 
 	if (attr->blksize != 0)
 		inode->i_blkbits = ilog2(attr->blksize);
@@ -197,6 +201,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
+	bool is_wb = fc->writeback_cache;
 	loff_t oldsize;
 	struct timespec old_mtime;
 
@@ -211,10 +216,16 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 	fuse_change_attributes_common(inode, attr, attr_valid);
 
 	oldsize = inode->i_size;
-	i_size_write(inode, attr->size);
+	/*
+	 * With writeback_cache enabled, cached writes beyond EOF extend the
+	 * local i_size without keeping the userspace server in sync, so
+	 * attr->size coming from the server can be stale. We cannot trust it.
+	 */
+	if (!is_wb || !S_ISREG(inode->i_mode))
+		i_size_write(inode, attr->size);
 	spin_unlock(&fc->lock);
 
-	if (S_ISREG(inode->i_mode)) {
+	if (!is_wb && S_ISREG(inode->i_mode)) {
 		bool inval = false;
 
 		if (oldsize != attr->size) {
@@ -243,6 +254,10 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
 {
 	inode->i_mode = attr->mode & S_IFMT;
 	inode->i_size = attr->size;
+	inode->i_mtime.tv_sec  = attr->mtime;
+	inode->i_mtime.tv_nsec = attr->mtimensec;
+	inode->i_ctime.tv_sec  = attr->ctime;
+	inode->i_ctime.tv_nsec = attr->ctimensec;
 	if (S_ISREG(inode->i_mode)) {
 		fuse_init_common(inode);
 		fuse_init_file_inode(inode);
@@ -289,7 +304,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 		return NULL;
 
 	if ((inode->i_state & I_NEW)) {
-		inode->i_flags |= S_NOATIME|S_NOCMTIME;
+		inode->i_flags |= S_NOATIME;
+		if (!fc->writeback_cache || !S_ISREG(attr->mode))
+			inode->i_flags |= S_NOCMTIME;
 		inode->i_generation = generation;
 		inode->i_data.backing_dev_info = &fc->bdi;
 		fuse_init_inode(inode, attr);
@@ -773,6 +790,7 @@ static const struct super_operations fuse_super_operations = {
 	.alloc_inode	= fuse_alloc_inode,
 	.destroy_inode	= fuse_destroy_inode,
 	.evict_inode	= fuse_evict_inode,
+	.write_inode	= fuse_write_inode,
 	.drop_inode	= generic_delete_inode,
 	.remount_fs	= fuse_remount_fs,
 	.put_super	= fuse_put_super,
@@ -873,6 +891,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 		}
 		if (arg->flags & FUSE_ASYNC_DIO)
 			fc->async_dio = 1;
+		if (arg->flags & FUSE_WRITEBACK_CACHE)
+			fc->writeback_cache = 1;
+		if (arg->time_gran && arg->time_gran <= 1000000000)
+			fc->sb->s_time_gran = arg->time_gran;
+		else
+			fc->sb->s_time_gran = 1000000000;
+
 	} else {
 		ra_pages = fc->max_read / PAGE_CACHE_SIZE;
 		fc->no_lock = 1;
@@ -900,7 +925,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 		FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
 		FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
 		FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
-		FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO;
+		FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
+		FUSE_WRITEBACK_CACHE;
 	req->in.h.opcode = FUSE_INIT;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(*arg);
@@ -978,7 +1004,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (sb->s_flags & MS_MANDLOCK)
 		goto err;
 
-	sb->s_flags &= ~MS_NOSEC;
+	sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION);
 
 	if (!parse_fuse_opt((char *) data, &d, is_bdev))
 		goto err;
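
One detail of the process_init_reply() changes above: the server-supplied time granularity is only honored when it lies in (0, 1s]; anything else falls back to one-second timestamps. A standalone sketch of that clamp (values in nanoseconds; sanitize_time_gran() is a hypothetical name, not a kernel function):

#include <stdio.h>

static unsigned int sanitize_time_gran(unsigned int time_gran)
{
	if (time_gran && time_gran <= 1000000000)
		return time_gran;
	return 1000000000;	/* fall back to whole-second timestamps */
}

int main(void)
{
	printf("%u\n", sanitize_time_gran(1));		/* 1 ns, accepted */
	printf("%u\n", sanitize_time_gran(0));		/* invalid -> 1 s */
	printf("%u\n", sanitize_time_gran(2000000000));	/* too coarse -> 1 s */
	return 0;
}
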
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index ba9456685f47..3088e2a38e30 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -64,18 +64,6 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
 	return acl;
 }
 
-static int gfs2_set_mode(struct inode *inode, umode_t mode)
-{
-	int error = 0;
-
-	if (mode != inode->i_mode) {
-		inode->i_mode = mode;
-		mark_inode_dirty(inode);
-	}
-
-	return error;
-}
-
 int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
 	int error;
@@ -85,8 +73,8 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 
 	BUG_ON(name == NULL);
 
-	if (acl->a_count > GFS2_ACL_MAX_ENTRIES)
-		return -EINVAL;
+	if (acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
+		return -E2BIG;
 
 	if (type == ACL_TYPE_ACCESS) {
 		umode_t mode = inode->i_mode;
@@ -98,9 +86,10 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 		if (error == 0)
 			acl = NULL;
 
-		error = gfs2_set_mode(inode, mode);
-		if (error)
-			return error;
+		if (mode != inode->i_mode) {
+			inode->i_mode = mode;
+			mark_inode_dirty(inode);
+		}
 	}
 
 	if (acl) {
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 301260c999ba..2d65ec4cd4be 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -14,7 +14,7 @@
 
 #define GFS2_POSIX_ACL_ACCESS		"posix_acl_access"
 #define GFS2_POSIX_ACL_DEFAULT		"posix_acl_default"
-#define GFS2_ACL_MAX_ENTRIES		25
+#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12)
 
 extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type);
 extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
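
The reworked GFS2_ACL_MAX_ENTRIES() scales the old fixed limit of 25 with the filesystem block size: (300 << sb_bsize_shift) >> 12 works out to 300 entries per 4 KiB of block size. A quick standalone check of the arithmetic (sb_bsize_shift is log2 of the block size):

#include <stdio.h>

#define ACL_MAX_ENTRIES(bsize_shift) ((300 << (bsize_shift)) >> 12)

int main(void)
{
	printf("512 B blocks -> %d entries\n", ACL_MAX_ENTRIES(9));	/* 37 */
	printf("1 KiB blocks -> %d entries\n", ACL_MAX_ENTRIES(10));	/* 75 */
	printf("4 KiB blocks -> %d entries\n", ACL_MAX_ENTRIES(12));	/* 300 */
	return 0;
}
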
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 49436fa7cd4f..ce62dcac90b6 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -21,6 +21,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/backing-dev.h>
 #include <linux/aio.h>
+#include <trace/events/writeback.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -230,13 +231,11 @@ static int gfs2_writepages(struct address_space *mapping,
 static int gfs2_write_jdata_pagevec(struct address_space *mapping,
 				    struct writeback_control *wbc,
 				    struct pagevec *pvec,
-				    int nr_pages, pgoff_t end)
+				    int nr_pages, pgoff_t end,
+				    pgoff_t *done_index)
 {
 	struct inode *inode = mapping->host;
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	loff_t i_size = i_size_read(inode);
-	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
-	unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
 	unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
 	int i;
 	int ret;
@@ -248,40 +247,83 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
 	for(i = 0; i < nr_pages; i++) {
 		struct page *page = pvec->pages[i];
 
+		/*
+		 * At this point, the page may be truncated or
+		 * invalidated (changing page->mapping to NULL), or
+		 * even swizzled back from swapper_space to tmpfs file
+		 * mapping. However, page->index will not change
+		 * because we have a reference on the page.
+		 */
+		if (page->index > end) {
+			/*
+			 * can't be range_cyclic (1st pass) because
+			 * end == -1 in that case.
+			 */
+			ret = 1;
+			break;
+		}
+
+		*done_index = page->index;
+
 		lock_page(page);
 
 		if (unlikely(page->mapping != mapping)) {
+continue_unlock:
 			unlock_page(page);
 			continue;
 		}
 
-		if (!wbc->range_cyclic && page->index > end) {
-			ret = 1;
-			unlock_page(page);
-			continue;
+		if (!PageDirty(page)) {
+			/* someone wrote it for us */
+			goto continue_unlock;
 		}
 
-		if (wbc->sync_mode != WB_SYNC_NONE)
-			wait_on_page_writeback(page);
-
-		if (PageWriteback(page) ||
-		    !clear_page_dirty_for_io(page)) {
-			unlock_page(page);
-			continue;
+		if (PageWriteback(page)) {
+			if (wbc->sync_mode != WB_SYNC_NONE)
+				wait_on_page_writeback(page);
+			else
+				goto continue_unlock;
 		}
 
-		/* Is the page fully outside i_size? (truncate in progress) */
-		if (page->index > end_index || (page->index == end_index && !offset)) {
-			page->mapping->a_ops->invalidatepage(page, 0,
-							     PAGE_CACHE_SIZE);
-			unlock_page(page);
-			continue;
-		}
+		BUG_ON(PageWriteback(page));
+		if (!clear_page_dirty_for_io(page))
+			goto continue_unlock;
+
+		trace_wbc_writepage(wbc, mapping->backing_dev_info);
 
 		ret = __gfs2_jdata_writepage(page, wbc);
+		if (unlikely(ret)) {
+			if (ret == AOP_WRITEPAGE_ACTIVATE) {
+				unlock_page(page);
+				ret = 0;
+			} else {
+
+				/*
+				 * done_index is set past this page,
+				 * so media errors will not choke
+				 * background writeout for the entire
+				 * file. This has consequences for
+				 * range_cyclic semantics (ie. it may
+				 * not be suitable for data integrity
+				 * writeout).
+				 */
+				*done_index = page->index + 1;
+				ret = 1;
+				break;
+			}
+		}
 
-		if (ret || (--(wbc->nr_to_write) <= 0))
+		/*
+		 * We stop writing back only if we are not doing
+		 * integrity sync. In case of integrity sync we have to
+		 * keep going until we have written all the pages
+		 * we tagged for writeback prior to entering this loop.
+		 */
+		if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) {
 			ret = 1;
+			break;
+		}
+
 	}
 	gfs2_trans_end(sdp);
 	return ret;
@@ -306,51 +348,69 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
 	int done = 0;
 	struct pagevec pvec;
 	int nr_pages;
+	pgoff_t uninitialized_var(writeback_index);
 	pgoff_t index;
 	pgoff_t end;
-	int scanned = 0;
+	pgoff_t done_index;
+	int cycled;
 	int range_whole = 0;
+	int tag;
 
 	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
-		index = mapping->writeback_index; /* Start from prev offset */
+		writeback_index = mapping->writeback_index; /* prev offset */
+		index = writeback_index;
+		if (index == 0)
+			cycled = 1;
+		else
+			cycled = 0;
 		end = -1;
 	} else {
 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 			range_whole = 1;
-		scanned = 1;
+		cycled = 1; /* ignore range_cyclic tests */
 	}
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
 
retry:
-	while (!done && (index <= end) &&
-	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					      PAGECACHE_TAG_DIRTY,
-					      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
-		scanned = 1;
-		ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end);
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+		tag_pages_for_writeback(mapping, index, end);
+	done_index = index;
+	while (!done && (index <= end)) {
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		if (nr_pages == 0)
+			break;
+
+		ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end, &done_index);
 		if (ret)
 			done = 1;
 		if (ret > 0)
 			ret = 0;
-
 		pagevec_release(&pvec);
 		cond_resched();
 	}
 
-	if (!scanned && !done) {
+	if (!cycled && !done) {
 		/*
+		 * range_cyclic:
 		 * We hit the last page and there is more work to be done: wrap
 		 * back to the start of the file
 		 */
-		scanned = 1;
+		cycled = 1;
 		index = 0;
+		end = writeback_index - 1;
 		goto retry;
 	}
 
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
-		mapping->writeback_index = index;
+		mapping->writeback_index = done_index;
+
 	return ret;
 }
 
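
The rewritten gfs2_write_cache_jdata() above adopts the two-pass range_cyclic scheme of write_cache_pages(): scan from the previous writeback_index to the end of the file, then wrap once and finish [0, writeback_index - 1]. A minimal model of just that control flow (indices stand in for page indices; no real pages are involved):

#include <stdio.h>

static void scan(unsigned long from, unsigned long to)
{
	printf("scan [%lu, %lu]\n", from, to);
}

static void cyclic_writeback(unsigned long writeback_index,
			     unsigned long last_index)
{
	int cycled = (writeback_index == 0);
	unsigned long index = writeback_index;
	unsigned long end = last_index;

retry:
	scan(index, end);
	if (!cycled) {
		/* hit the end with work left: wrap to the file's start */
		cycled = 1;
		index = 0;
		end = writeback_index - 1;
		goto retry;
	}
}

int main(void)
{
	cyclic_writeback(42, 99);	/* scans [42,99], then [0,41] */
	return 0;
}
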
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index fe0500c0af7a..c62d4b9f51dc 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1328,6 +1328,121 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
 }
 
 /**
+ * gfs2_free_journal_extents - Free cached journal bmap info
+ * @jd: The journal
+ *
+ */
+
+void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
+{
+	struct gfs2_journal_extent *jext;
+
+	while(!list_empty(&jd->extent_list)) {
+		jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
+		list_del(&jext->list);
+		kfree(jext);
+	}
+}
+
+/**
+ * gfs2_add_jextent - Add or merge a new extent to extent cache
+ * @jd: The journal descriptor
+ * @lblock: The logical block at start of new extent
+ * @dblock: The disk (physical) block at start of new extent
+ * @blocks: Size of extent in fs blocks
+ *
+ * Returns: 0 on success or -ENOMEM
+ */
+
+static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
+{
+	struct gfs2_journal_extent *jext;
+
+	if (!list_empty(&jd->extent_list)) {
+		jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
+		if ((jext->dblock + jext->blocks) == dblock) {
+			jext->blocks += blocks;
+			return 0;
+		}
+	}
+
+	jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
+	if (jext == NULL)
+		return -ENOMEM;
+	jext->dblock = dblock;
+	jext->lblock = lblock;
+	jext->blocks = blocks;
+	list_add_tail(&jext->list, &jd->extent_list);
+	jd->nr_extents++;
+	return 0;
+}
+
+/**
+ * gfs2_map_journal_extents - Cache journal bmap info
+ * @sdp: The super block
+ * @jd: The journal to map
+ *
+ * Create a reusable "extent" mapping from all logical
+ * blocks to all physical blocks for the given journal. This will save
+ * us time when writing journal blocks. Most journals will have only one
+ * extent that maps all their logical blocks. That's because gfs2.mkfs
+ * arranges the journal blocks sequentially to maximize performance.
+ * So the extent would map the first block for the entire file length.
+ * However, gfs2_jadd can happen while file activity is happening, so
+ * those journals may not be sequential. Less likely is the case where
+ * the users created their own journals by mounting the metafs and
+ * laying it out. But it's still possible. These journals might have
+ * several extents.
+ *
+ * Returns: 0 on success, or error on failure
+ */
+
+int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
+{
+	u64 lblock = 0;
+	u64 lblock_stop;
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct buffer_head bh;
+	unsigned int shift = sdp->sd_sb.sb_bsize_shift;
+	u64 size;
+	int rc;
+
+	lblock_stop = i_size_read(jd->jd_inode) >> shift;
+	size = (lblock_stop - lblock) << shift;
+	jd->nr_extents = 0;
+	WARN_ON(!list_empty(&jd->extent_list));
+
+	do {
+		bh.b_state = 0;
+		bh.b_blocknr = 0;
+		bh.b_size = size;
+		rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
+		if (rc || !buffer_mapped(&bh))
+			goto fail;
+		rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
+		if (rc)
+			goto fail;
+		size -= bh.b_size;
+		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
+	} while(size > 0);
+
+	fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
+		jd->nr_extents);
+	return 0;
+
+fail:
+	fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
+		rc, jd->jd_jid,
+		(unsigned long long)(i_size_read(jd->jd_inode) - size),
+		jd->nr_extents);
+	fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
+		rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
+		bh.b_state, (unsigned long long)bh.b_size);
+	gfs2_free_journal_extents(jd);
+	return rc;
+}
+
+/**
  * gfs2_write_alloc_required - figure out if a write will require an allocation
  * @ip: the file being written to
  * @offset: the offset to write to
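
The extent cache built above stays small because gfs2_add_jextent() folds a new run of blocks into the tail extent whenever it is physically contiguous with it. A standalone model of that merge rule (simplified: one extent instead of a list, no allocation or error paths):

#include <stdio.h>

struct extent {
	unsigned long long dblock;	/* first disk block */
	unsigned long long blocks;	/* length in fs blocks */
};

/* Returns 1 if the new run was merged into *last, 0 if a fresh extent
 * would have to be allocated. */
static int try_merge(struct extent *last, unsigned long long dblock,
		     unsigned long long blocks)
{
	if (last->dblock + last->blocks == dblock) {
		last->blocks += blocks;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct extent e = { .dblock = 1000, .blocks = 8 };

	printf("contiguous: merged=%d len=%llu\n", try_merge(&e, 1008, 4), e.blocks);
	printf("gap:        merged=%d len=%llu\n", try_merge(&e, 2000, 4), e.blocks);
	return 0;
}
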
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 42fea03e2bd9..81ded5e2aaa2 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -55,5 +55,7 @@ extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
 extern int gfs2_file_dealloc(struct gfs2_inode *ip);
 extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 				     unsigned int len);
+extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd);
+extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd);
 
 #endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index fa32655449c8..1a349f9a9685 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -53,6 +53,8 @@
  * but never before the maximum hash table size has been reached.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/buffer_head.h>
@@ -507,8 +509,8 @@ static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
 		goto error;
 	return 0;
 error:
-	printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg,
-	       first ? "first in block" : "not first in block");
+	pr_warn("%s: %s (%s)\n",
+		__func__, msg, first ? "first in block" : "not first in block");
 	return -EIO;
 }
 
@@ -531,8 +533,7 @@ static int gfs2_dirent_offset(const void *buf)
 	}
 	return offset;
wrong_type:
-	printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n",
-	       be32_to_cpu(h->mh_type));
+	pr_warn("%s: wrong block type %u\n", __func__, be32_to_cpu(h->mh_type));
 	return -1;
 }
 
@@ -728,7 +729,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
 
 	error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
 	if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
-		/* printk(KERN_INFO "block num=%llu\n", leaf_no); */
+		/* pr_info("block num=%llu\n", leaf_no); */
 		error = -EIO;
 	}
 
@@ -1006,7 +1007,8 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 	len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth));
 	half_len = len >> 1;
 	if (!half_len) {
-		printk(KERN_WARNING "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index);
+		pr_warn("i_depth %u lf_depth %u index %u\n",
+			dip->i_depth, be16_to_cpu(oleaf->lf_depth), index);
 		gfs2_consist_inode(dip);
 		error = -EIO;
 		goto fail_brelse;
@@ -1684,6 +1686,14 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
 	return 0;
 }
 
+static u16 gfs2_inode_ra_len(const struct gfs2_inode *ip)
+{
+	u64 where = ip->i_no_addr + 1;
+	if (ip->i_eattr == where)
+		return 1;
+	return 0;
+}
+
 /**
  * gfs2_dir_add - Add new filename into directory
  * @inode: The directory inode
@@ -1721,6 +1731,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
 			dent = gfs2_init_dirent(inode, dent, name, bh);
 			gfs2_inum_out(nip, dent);
 			dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
+			dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip));
 			tv = CURRENT_TIME;
 			if (ip->i_diskflags & GFS2_DIF_EXHASH) {
 				leaf = (struct gfs2_leaf *)bh->b_data;
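
The pr_fmt() define added at the top of dir.c (and glock.c below) is what lets these patches drop the hand-written "gfs2_xxx:" prefixes from each message: the printing macros paste the module name in front of every format string at compile time. A userspace sketch of the mechanism (fprintf stands in for the kernel's printk plumbing):

#include <stdio.h>

#define KBUILD_MODNAME "gfs2"
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#define pr_warn(fmt, ...) fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	/* prints "gfs2: wrong block type 7" */
	pr_warn("wrong block type %u\n", 7u);
	return 0;
}
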
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index efc078f0ee4e..80d67253623c 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -494,6 +494,7 @@ out:
 
 static const struct vm_operations_struct gfs2_vm_ops = {
 	.fault = filemap_fault,
+	.map_pages = filemap_map_pages,
 	.page_mkwrite = gfs2_page_mkwrite,
 	.remap_pages = generic_file_remap_pages,
 };
@@ -811,6 +812,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 	loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
 	loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
 	loff_t max_chunk_size = UINT_MAX & bsize_mask;
+	struct gfs2_holder gh;
+
 	next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
 
 	/* We only support the FALLOC_FL_KEEP_SIZE mode */
@@ -831,8 +834,10 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
 	if (error)
 		return error;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
-	error = gfs2_glock_nq(&ip->i_gh);
+	mutex_lock(&inode->i_mutex);
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	error = gfs2_glock_nq(&gh);
 	if (unlikely(error))
 		goto out_uninit;
 
@@ -900,9 +905,10 @@ out_trans_fail:
out_qunlock:
 	gfs2_quota_unlock(ip);
out_unlock:
-	gfs2_glock_dq(&ip->i_gh);
+	gfs2_glock_dq(&gh);
out_uninit:
-	gfs2_holder_uninit(&ip->i_gh);
+	gfs2_holder_uninit(&gh);
+	mutex_unlock(&inode->i_mutex);
 	return error;
 }
 
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ca0be6c69a26..aec7f73832f0 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -7,6 +7,8 @@
  * of the GNU General Public License version 2.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
@@ -468,7 +470,7 @@ retry:
 		do_xmote(gl, gh, LM_ST_UNLOCKED);
 		break;
 	default: /* Everything else */
-		printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state);
+		pr_err("wanted %u got %u\n", gl->gl_target, state);
 		GLOCK_BUG_ON(gl, 1);
 	}
 	spin_unlock(&gl->gl_spin);
@@ -542,7 +544,7 @@ __acquires(&gl->gl_spin)
 		/* lock_dlm */
 		ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
 		if (ret) {
-			printk(KERN_ERR "GFS2: lm_lock ret %d\n", ret);
+			pr_err("lm_lock ret %d\n", ret);
 			GLOCK_BUG_ON(gl, 1);
 		}
 	} else { /* lock_nolock */
@@ -935,7 +937,7 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
 		vaf.fmt = fmt;
 		vaf.va = &args;
 
-		printk(KERN_ERR " %pV", &vaf);
+		pr_err("%pV", &vaf);
 	}
 
 	va_end(args);
@@ -1010,13 +1012,13 @@ do_cancel:
 	return;
 
trap_recursive:
-	printk(KERN_ERR "original: %pSR\n", (void *)gh2->gh_ip);
-	printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid));
-	printk(KERN_ERR "lock type: %d req lock state : %d\n",
+	pr_err("original: %pSR\n", (void *)gh2->gh_ip);
+	pr_err("pid: %d\n", pid_nr(gh2->gh_owner_pid));
+	pr_err("lock type: %d req lock state : %d\n",
 	       gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
-	printk(KERN_ERR "new: %pSR\n", (void *)gh->gh_ip);
-	printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
-	printk(KERN_ERR "lock type: %d req lock state : %d\n",
+	pr_err("new: %pSR\n", (void *)gh->gh_ip);
+	pr_err("pid: %d\n", pid_nr(gh->gh_owner_pid));
+	pr_err("lock type: %d req lock state : %d\n",
 	       gh->gh_gl->gl_name.ln_type, gh->gh_state);
 	gfs2_dump_glock(NULL, gl);
 	BUG();
@@ -1045,9 +1047,13 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
 
 	spin_lock(&gl->gl_spin);
 	add_to_queue(gh);
-	if ((LM_FLAG_NOEXP & gh->gh_flags) &&
-	    test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+	if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) &&
+		     test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) {
 		set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+		gl->gl_lockref.count++;
+		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+			gl->gl_lockref.count--;
+	}
 	run_queue(gl, 1);
 	spin_unlock(&gl->gl_spin);
 
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 3bf0631b5d56..54b66809e818 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -82,6 +82,8 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 	struct gfs2_trans tr;
 
 	memset(&tr, 0, sizeof(tr));
+	INIT_LIST_HEAD(&tr.tr_buf);
+	INIT_LIST_HEAD(&tr.tr_databuf);
 	tr.tr_revokes = atomic_read(&gl->gl_ail_count);
 
 	if (!tr.tr_revokes)
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index cf0e34400f71..bdf70c18610c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -52,7 +52,7 @@ struct gfs2_log_header_host {
  */
 
 struct gfs2_log_operations {
-	void (*lo_before_commit) (struct gfs2_sbd *sdp);
+	void (*lo_before_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
 	void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
 	void (*lo_before_scan) (struct gfs2_jdesc *jd,
 				struct gfs2_log_header_host *head, int pass);
@@ -371,6 +371,7 @@ enum {
 	GIF_ALLOC_FAILED	= 2,
 	GIF_SW_PAGED		= 3,
 	GIF_ORDERED		= 4,
+	GIF_FREE_VFS_INODE	= 5,
 };
 
 struct gfs2_inode {
@@ -462,11 +463,11 @@ struct gfs2_trans {
 	unsigned int tr_blocks;
 	unsigned int tr_revokes;
 	unsigned int tr_reserved;
+	unsigned int tr_touched:1;
+	unsigned int tr_attached:1;
 
 	struct gfs2_holder tr_t_gh;
 
-	int tr_touched;
-	int tr_attached;
 
 	unsigned int tr_num_buf_new;
 	unsigned int tr_num_databuf_new;
@@ -476,6 +477,8 @@ struct gfs2_trans {
 	unsigned int tr_num_revoke_rm;
 
 	struct list_head tr_list;
+	struct list_head tr_databuf;
+	struct list_head tr_buf;
 
 	unsigned int tr_first;
 	struct list_head tr_ail1_list;
@@ -483,7 +486,7 @@ struct gfs2_trans {
 };
 
 struct gfs2_journal_extent {
-	struct list_head extent_list;
+	struct list_head list;
 
 	unsigned int lblock; /* First logical block */
 	u64 dblock; /* First disk block */
@@ -493,6 +496,7 @@ struct gfs2_journal_extent {
 struct gfs2_jdesc {
 	struct list_head jd_list;
 	struct list_head extent_list;
+	unsigned int nr_extents;
 	struct work_struct jd_work;
 	struct inode *jd_inode;
 	unsigned long jd_flags;
@@ -500,6 +504,15 @@ struct gfs2_jdesc {
 	unsigned int jd_jid;
 	unsigned int jd_blocks;
 	int jd_recover_error;
+	/* Replay stuff */
+
+	unsigned int jd_found_blocks;
+	unsigned int jd_found_revokes;
+	unsigned int jd_replayed_blocks;
+
+	struct list_head jd_revoke_list;
+	unsigned int jd_replay_tail;
+
 };
 
 struct gfs2_statfs_change_host {
@@ -746,19 +759,12 @@ struct gfs2_sbd {
 
 	struct gfs2_trans *sd_log_tr;
 	unsigned int sd_log_blks_reserved;
-	unsigned int sd_log_commited_buf;
-	unsigned int sd_log_commited_databuf;
 	int sd_log_commited_revoke;
 
 	atomic_t sd_log_pinned;
-	unsigned int sd_log_num_buf;
 	unsigned int sd_log_num_revoke;
-	unsigned int sd_log_num_rg;
-	unsigned int sd_log_num_databuf;
 
-	struct list_head sd_log_le_buf;
 	struct list_head sd_log_le_revoke;
-	struct list_head sd_log_le_databuf;
 	struct list_head sd_log_le_ordered;
 	spinlock_t sd_ordered_lock;
 
@@ -786,15 +792,6 @@ struct gfs2_sbd {
 	struct list_head sd_ail1_list;
 	struct list_head sd_ail2_list;
 
-	/* Replay stuff */
-
-	struct list_head sd_revoke_list;
-	unsigned int sd_replay_tail;
-
-	unsigned int sd_found_blocks;
-	unsigned int sd_found_revokes;
-	unsigned int sd_replayed_blocks;
-
 	/* For quiescing the filesystem */
 	struct gfs2_holder sd_freeze_gh;
 
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5c524180c98e..28cc7bf6575a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -376,12 +376,11 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip,
376 inode->i_gid = current_fsgid(); 376 inode->i_gid = current_fsgid();
377} 377}
378 378
379static int alloc_dinode(struct gfs2_inode *ip, u32 flags) 379static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks)
380{ 380{
381 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 381 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
382 struct gfs2_alloc_parms ap = { .target = RES_DINODE, .aflags = flags, }; 382 struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, };
383 int error; 383 int error;
384 int dblocks = 1;
385 384
386 error = gfs2_quota_lock_check(ip); 385 error = gfs2_quota_lock_check(ip);
387 if (error) 386 if (error)
@@ -391,11 +390,11 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags)
391 if (error) 390 if (error)
392 goto out_quota; 391 goto out_quota;
393 392
394 error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 0); 393 error = gfs2_trans_begin(sdp, (*dblocks * RES_RG_BIT) + RES_STATFS + RES_QUOTA, 0);
395 if (error) 394 if (error)
396 goto out_ipreserv; 395 goto out_ipreserv;
397 396
398 error = gfs2_alloc_blocks(ip, &ip->i_no_addr, &dblocks, 1, &ip->i_generation); 397 error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation);
399 ip->i_no_formal_ino = ip->i_generation; 398 ip->i_no_formal_ino = ip->i_generation;
400 ip->i_inode.i_ino = ip->i_no_addr; 399 ip->i_inode.i_ino = ip->i_no_addr;
401 ip->i_goal = ip->i_no_addr; 400 ip->i_goal = ip->i_no_addr;
@@ -428,6 +427,33 @@ static void gfs2_init_dir(struct buffer_head *dibh,
428} 427}
429 428
430/** 429/**
430 * gfs2_init_xattr - Initialise an xattr block for a new inode
431 * @ip: The inode in question
432 *
433 * This sets up an empty xattr block for a new inode, ready to
434 * take any ACLs, LSM xattrs, etc.
435 */
436
437static void gfs2_init_xattr(struct gfs2_inode *ip)
438{
439 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
440 struct buffer_head *bh;
441 struct gfs2_ea_header *ea;
442
443 bh = gfs2_meta_new(ip->i_gl, ip->i_eattr);
444 gfs2_trans_add_meta(ip->i_gl, bh);
445 gfs2_metatype_set(bh, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
446 gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
447
448 ea = GFS2_EA_BH2FIRST(bh);
449 ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
450 ea->ea_type = GFS2_EATYPE_UNUSED;
451 ea->ea_flags = GFS2_EAFLAG_LAST;
452
453 brelse(bh);
454}
455
456/**
431 * init_dinode - Fill in a new dinode structure 457 * init_dinode - Fill in a new dinode structure
432 * @dip: The directory this inode is being created in 458 * @dip: The directory this inode is being created in
433 * @ip: The inode 459 * @ip: The inode
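The added gfs2_init_xattr() pre-formats the inode's extended-attribute block as a single unused record that spans the whole usable area of the block (sd_jbsize) and carries GFS2_EAFLAG_LAST, which is the on-disk representation of "no xattrs yet". Below is a minimal userspace model of that layout; the struct fields, sizes and names here are illustrative stand-ins, not the real on-disk ABI:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define EATYPE_UNUSED 0      /* stand-in for GFS2_EATYPE_UNUSED */
#define EAFLAG_LAST   0x01   /* stand-in for GFS2_EAFLAG_LAST */

struct ea_header {           /* simplified gfs2_ea_header */
	uint32_t ea_rec_len;     /* record length (big-endian on disk) */
	uint8_t  ea_type;
	uint8_t  ea_flags;
};

int main(void)
{
	unsigned char block[4096];
	size_t mh = 64;          /* pretend metadata-header size */
	struct ea_header ea = {0};

	memset(block, 0, sizeof(block));
	/* One UNUSED record covering everything after the meta header,
	 * flagged LAST: a reader stops immediately, i.e. "empty". */
	ea.ea_rec_len = (uint32_t)(sizeof(block) - mh);
	ea.ea_type = EATYPE_UNUSED;
	ea.ea_flags = EAFLAG_LAST;
	memcpy(block + mh, &ea, sizeof(ea));
	printf("first record: len=%u last=%u\n", ea.ea_rec_len,
	       ea.ea_flags & EAFLAG_LAST);
	return 0;
}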
@@ -545,13 +571,6 @@ static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
545 return err; 571 return err;
546} 572}
547 573
548static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
549 const struct qstr *qstr)
550{
551 return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
552 &gfs2_initxattrs, NULL);
553}
554
555/** 574/**
556 * gfs2_create_inode - Create a new inode 575 * gfs2_create_inode - Create a new inode
557 * @dir: The parent directory 576 * @dir: The parent directory
@@ -578,8 +597,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
578 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 597 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
579 struct gfs2_glock *io_gl; 598 struct gfs2_glock *io_gl;
580 struct dentry *d; 599 struct dentry *d;
581 int error; 600 int error, free_vfs_inode = 0;
582 u32 aflags = 0; 601 u32 aflags = 0;
602 unsigned blocks = 1;
583 struct gfs2_diradd da = { .bh = NULL, }; 603 struct gfs2_diradd da = { .bh = NULL, };
584 604
585 if (!name->len || name->len > GFS2_FNAMESIZE) 605 if (!name->len || name->len > GFS2_FNAMESIZE)
@@ -676,10 +696,15 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
676 (dip->i_diskflags & GFS2_DIF_TOPDIR)) 696 (dip->i_diskflags & GFS2_DIF_TOPDIR))
677 aflags |= GFS2_AF_ORLOV; 697 aflags |= GFS2_AF_ORLOV;
678 698
679 error = alloc_dinode(ip, aflags); 699 if (default_acl || acl)
700 blocks++;
701
702 error = alloc_dinode(ip, aflags, &blocks);
680 if (error) 703 if (error)
681 goto fail_free_inode; 704 goto fail_free_inode;
682 705
706 gfs2_set_inode_blocks(inode, blocks);
707
683 error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); 708 error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
684 if (error) 709 if (error)
685 goto fail_free_inode; 710 goto fail_free_inode;
@@ -689,10 +714,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
689 if (error) 714 if (error)
690 goto fail_free_inode; 715 goto fail_free_inode;
691 716
692 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 717 error = gfs2_trans_begin(sdp, blocks, 0);
693 if (error) 718 if (error)
694 goto fail_gunlock2; 719 goto fail_gunlock2;
695 720
721 if (blocks > 1) {
722 ip->i_eattr = ip->i_no_addr + 1;
723 gfs2_init_xattr(ip);
724 }
696 init_dinode(dip, ip, symname); 725 init_dinode(dip, ip, symname);
697 gfs2_trans_end(sdp); 726 gfs2_trans_end(sdp);
698 727
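Placing the xattr block at ip->i_no_addr + 1 works because gfs2_alloc_blocks() returned the requested blocks as one contiguous extent, so with blocks == 2 the dinode and its xattr block are adjacent, and gfs2_set_inode_blocks() earlier accounts for both. A trivial sketch of the addressing, with a hypothetical block number:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t no_addr = 123456;	/* hypothetical extent start from the allocator */
	unsigned blocks = 2;		/* dinode plus pre-created xattr block */

	if (blocks > 1)
		printf("dinode=%llu xattr=%llu\n",
		       (unsigned long long)no_addr,
		       (unsigned long long)(no_addr + 1));
	return 0;
}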
@@ -722,7 +751,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
722 if (error) 751 if (error)
723 goto fail_gunlock3; 752 goto fail_gunlock3;
724 753
725 error = gfs2_security_init(dip, ip, name); 754 error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name,
755 &gfs2_initxattrs, NULL);
726 if (error) 756 if (error)
727 goto fail_gunlock3; 757 goto fail_gunlock3;
728 758
@@ -758,15 +788,16 @@ fail_free_acls:
758 if (acl) 788 if (acl)
759 posix_acl_release(acl); 789 posix_acl_release(acl);
760fail_free_vfs_inode: 790fail_free_vfs_inode:
761 free_inode_nonrcu(inode); 791 free_vfs_inode = 1;
762 inode = NULL;
763fail_gunlock: 792fail_gunlock:
764 gfs2_dir_no_add(&da); 793 gfs2_dir_no_add(&da);
765 gfs2_glock_dq_uninit(ghs); 794 gfs2_glock_dq_uninit(ghs);
766 if (inode && !IS_ERR(inode)) { 795 if (inode && !IS_ERR(inode)) {
767 clear_nlink(inode); 796 clear_nlink(inode);
768 mark_inode_dirty(inode); 797 if (!free_vfs_inode)
769 set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags); 798 mark_inode_dirty(inode);
799 set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED,
800 &GFS2_I(inode)->i_flags);
770 iput(inode); 801 iput(inode);
771 } 802 }
772fail: 803fail:
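The reworked failure path no longer frees the VFS inode by hand; it tags the inode and lets iput() drive the teardown. GIF_FREE_VFS_INODE means nothing was allocated on disk, so eviction only needs clear_inode(), while GIF_ALLOC_FAILED keeps the full on-disk deallocation (see the matching gfs2_evict_inode() change in fs/gfs2/super.c below). A condensed userspace model of the two flavours; the enum and helper are stand-ins:

#include <stdio.h>

enum fail_kind { ALLOC_FAILED, FREE_VFS_INODE };	/* model of the GIF_ flags */

static void evict(enum fail_kind kind)
{
	if (kind == FREE_VFS_INODE) {
		/* No disk blocks were allocated: tear down the VFS object only. */
		printf("clear_inode()\n");
		return;
	}
	/* Blocks exist on disk: run the full deallocation path first. */
	printf("deallocate dinode, then clear_inode()\n");
}

int main(void)
{
	evict(FREE_VFS_INODE);	/* failed before alloc_dinode() succeeded */
	evict(ALLOC_FAILED);	/* failed after blocks were allocated */
	return 0;
}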
@@ -1263,6 +1294,10 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
1263 } 1294 }
1264 1295
1265 tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1); 1296 tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
1297 if (!tmp) {
1298 error = -ENOENT;
1299 break;
1300 }
1266 if (IS_ERR(tmp)) { 1301 if (IS_ERR(tmp)) {
1267 error = PTR_ERR(tmp); 1302 error = PTR_ERR(tmp);
1268 break; 1303 break;
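gfs2_lookupi() has a three-way result: a valid inode, an ERR_PTR()-encoded error, or NULL when the entry is simply absent; the added check maps the previously unhandled NULL case to -ENOENT. The encoding convention, reproduced as a runnable model with the macros as defined in include/linux/err.h:

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static void *lookup(int outcome)
{
	static int object;

	if (outcome < 0)
		return ERR_PTR(outcome);	/* hard error, e.g. -EIO */
	if (outcome == 0)
		return NULL;			/* entry not found */
	return &object;				/* success */
}

int main(void)
{
	void *p = lookup(0);

	if (p == NULL)				/* the case the patch adds */
		printf("error = %d (-ENOENT)\n", -ENOENT);
	else if (IS_ERR(p))
		printf("error = %ld\n", PTR_ERR(p));
	else
		printf("found\n");
	return 0;
}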
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 2a6ba06bee6f..c1eb555dc588 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/fs.h> 12#include <linux/fs.h>
11#include <linux/dlm.h> 13#include <linux/dlm.h>
12#include <linux/slab.h> 14#include <linux/slab.h>
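The pr_fmt definition, placed before any include, gives every pr_err()/pr_warn()/pr_info() in the file an automatic KBUILD_MODNAME prefix; that is why the literal "GFS2: " strings disappear from the messages converted in the hunks below. A userspace model of the macro mechanics, with printf standing in for printk:

#include <stdio.h>

#define KBUILD_MODNAME "gfs2"			/* normally defined by Kbuild */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

/* Stand-ins for the kernel's pr_* helpers, which wrap printk(). */
#define pr_err(fmt, ...)  printf(pr_fmt(fmt), ##__VA_ARGS__)
#define pr_warn(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	pr_err("unknown bast mode %d\n", 42);	/* "gfs2: unknown bast mode 42" */
	pr_warn("not a GFS2 filesystem\n");	/* "gfs2: not a GFS2 filesystem" */
	return 0;
}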
@@ -176,7 +178,7 @@ static void gdlm_bast(void *arg, int mode)
176 gfs2_glock_cb(gl, LM_ST_SHARED); 178 gfs2_glock_cb(gl, LM_ST_SHARED);
177 break; 179 break;
178 default: 180 default:
179 printk(KERN_ERR "unknown bast mode %d", mode); 181 pr_err("unknown bast mode %d\n", mode);
180 BUG(); 182 BUG();
181 } 183 }
182} 184}
@@ -195,7 +197,7 @@ static int make_mode(const unsigned int lmstate)
195 case LM_ST_SHARED: 197 case LM_ST_SHARED:
196 return DLM_LOCK_PR; 198 return DLM_LOCK_PR;
197 } 199 }
198 printk(KERN_ERR "unknown LM state %d", lmstate); 200 pr_err("unknown LM state %d\n", lmstate);
199 BUG(); 201 BUG();
200 return -1; 202 return -1;
201} 203}
@@ -308,7 +310,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
308 error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, 310 error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
309 NULL, gl); 311 NULL, gl);
310 if (error) { 312 if (error) {
311 printk(KERN_ERR "gdlm_unlock %x,%llx err=%d\n", 313 pr_err("gdlm_unlock %x,%llx err=%d\n",
312 gl->gl_name.ln_type, 314 gl->gl_name.ln_type,
313 (unsigned long long)gl->gl_name.ln_number, error); 315 (unsigned long long)gl->gl_name.ln_number, error);
314 return; 316 return;
@@ -1102,7 +1104,7 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
1102 } 1104 }
1103 1105
1104 if (ls->ls_recover_submit[jid]) { 1106 if (ls->ls_recover_submit[jid]) {
1105 fs_info(sdp, "recover_slot jid %d gen %u prev %u", 1107 fs_info(sdp, "recover_slot jid %d gen %u prev %u\n",
1106 jid, ls->ls_recover_block, ls->ls_recover_submit[jid]); 1108 jid, ls->ls_recover_block, ls->ls_recover_submit[jid]);
1107 } 1109 }
1108 ls->ls_recover_submit[jid] = ls->ls_recover_block; 1110 ls->ls_recover_submit[jid] = ls->ls_recover_block;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 9dcb9777a5f8..4a14d504ef83 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
18#include <linux/kthread.h> 18#include <linux/kthread.h>
19#include <linux/freezer.h> 19#include <linux/freezer.h>
20#include <linux/bio.h> 20#include <linux/bio.h>
21#include <linux/blkdev.h>
21#include <linux/writeback.h> 22#include <linux/writeback.h>
22#include <linux/list_sort.h> 23#include <linux/list_sort.h>
23 24
@@ -145,8 +146,10 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
145{ 146{
146 struct list_head *head = &sdp->sd_ail1_list; 147 struct list_head *head = &sdp->sd_ail1_list;
147 struct gfs2_trans *tr; 148 struct gfs2_trans *tr;
149 struct blk_plug plug;
148 150
149 trace_gfs2_ail_flush(sdp, wbc, 1); 151 trace_gfs2_ail_flush(sdp, wbc, 1);
152 blk_start_plug(&plug);
150 spin_lock(&sdp->sd_ail_lock); 153 spin_lock(&sdp->sd_ail_lock);
151restart: 154restart:
152 list_for_each_entry_reverse(tr, head, tr_list) { 155 list_for_each_entry_reverse(tr, head, tr_list) {
@@ -156,6 +159,7 @@ restart:
156 goto restart; 159 goto restart;
157 } 160 }
158 spin_unlock(&sdp->sd_ail_lock); 161 spin_unlock(&sdp->sd_ail_lock);
162 blk_finish_plug(&plug);
159 trace_gfs2_ail_flush(sdp, wbc, 0); 163 trace_gfs2_ail_flush(sdp, wbc, 0);
160} 164}
161 165
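blk_start_plug()/blk_finish_plug() gather the I/O submitted between them on a per-task plug list, so the many small writes issued while walking sd_ail1_list reach the block layer in one batch and can be merged. The usual shape of the pattern, as a kernel-context sketch rather than runnable code:

#include <linux/blkdev.h>	/* struct blk_plug */

static void flush_batched(void)
{
	struct blk_plug plug;

	blk_start_plug(&plug);	/* queue I/O on this task's plug list */
	/* ... submit many small writes, e.g. via ->writepage() ... */
	blk_finish_plug(&plug);	/* unplug: release merged requests to the device */
}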
@@ -410,24 +414,22 @@ static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer
410static unsigned int calc_reserved(struct gfs2_sbd *sdp) 414static unsigned int calc_reserved(struct gfs2_sbd *sdp)
411{ 415{
412 unsigned int reserved = 0; 416 unsigned int reserved = 0;
413 unsigned int mbuf_limit, metabufhdrs_needed; 417 unsigned int mbuf;
414 unsigned int dbuf_limit, databufhdrs_needed; 418 unsigned int dbuf;
415 unsigned int revokes = 0; 419 struct gfs2_trans *tr = sdp->sd_log_tr;
416 420
417 mbuf_limit = buf_limit(sdp); 421 if (tr) {
418 metabufhdrs_needed = (sdp->sd_log_commited_buf + 422 mbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm;
419 (mbuf_limit - 1)) / mbuf_limit; 423 dbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm;
420 dbuf_limit = databuf_limit(sdp); 424 reserved = mbuf + dbuf;
421 databufhdrs_needed = (sdp->sd_log_commited_databuf + 425 /* Account for header blocks */
422 (dbuf_limit - 1)) / dbuf_limit; 426 reserved += DIV_ROUND_UP(mbuf, buf_limit(sdp));
427 reserved += DIV_ROUND_UP(dbuf, databuf_limit(sdp));
428 }
423 429
424 if (sdp->sd_log_commited_revoke > 0) 430 if (sdp->sd_log_commited_revoke > 0)
425 revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke, 431 reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
426 sizeof(u64)); 432 sizeof(u64));
427
428 reserved = sdp->sd_log_commited_buf + metabufhdrs_needed +
429 sdp->sd_log_commited_databuf + databufhdrs_needed +
430 revokes;
431 /* One for the overall header */ 433 /* One for the overall header */
432 if (reserved) 434 if (reserved)
433 reserved++; 435 reserved++;
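calc_reserved() now derives the reservation from the pending transaction directly: the net new metadata and data blocks, plus one log-header block per buf_limit() or databuf_limit() worth of each. Using the 503-blocks-per-header figure quoted for 4k blocks in fs/gfs2/lops.c, 1000 dirty metadata blocks need 1000 + ceil(1000/503) = 1002 blocks, plus the one overall header. A runnable check of the arithmetic:

#include <stdio.h>

/* Same definition as the kernel's DIV_ROUND_UP. */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned mbuf = 1000;			/* net new metadata blocks */
	unsigned limit = 503;			/* buf_limit() for 4k blocks */
	unsigned reserved = mbuf + DIV_ROUND_UP(mbuf, limit);

	reserved++;				/* one for the overall header */
	printf("reserved = %u\n", reserved);	/* 1003 */
	return 0;
}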
@@ -682,36 +684,25 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
682 } 684 }
683 trace_gfs2_log_flush(sdp, 1); 685 trace_gfs2_log_flush(sdp, 1);
684 686
687 sdp->sd_log_flush_head = sdp->sd_log_head;
688 sdp->sd_log_flush_wrapped = 0;
685 tr = sdp->sd_log_tr; 689 tr = sdp->sd_log_tr;
686 if (tr) { 690 if (tr) {
687 sdp->sd_log_tr = NULL; 691 sdp->sd_log_tr = NULL;
688 INIT_LIST_HEAD(&tr->tr_ail1_list); 692 INIT_LIST_HEAD(&tr->tr_ail1_list);
689 INIT_LIST_HEAD(&tr->tr_ail2_list); 693 INIT_LIST_HEAD(&tr->tr_ail2_list);
694 tr->tr_first = sdp->sd_log_flush_head;
690 } 695 }
691 696
692 if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) {
693 printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf,
694 sdp->sd_log_commited_buf);
695 gfs2_assert_withdraw(sdp, 0);
696 }
697 if (sdp->sd_log_num_databuf != sdp->sd_log_commited_databuf) {
698 printk(KERN_INFO "GFS2: log databuf %u %u\n",
699 sdp->sd_log_num_databuf, sdp->sd_log_commited_databuf);
700 gfs2_assert_withdraw(sdp, 0);
701 }
702 gfs2_assert_withdraw(sdp, 697 gfs2_assert_withdraw(sdp,
703 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); 698 sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
704 699
705 sdp->sd_log_flush_head = sdp->sd_log_head;
706 sdp->sd_log_flush_wrapped = 0;
707 if (tr)
708 tr->tr_first = sdp->sd_log_flush_head;
709
710 gfs2_ordered_write(sdp); 700 gfs2_ordered_write(sdp);
711 lops_before_commit(sdp); 701 lops_before_commit(sdp, tr);
712 gfs2_log_flush_bio(sdp, WRITE); 702 gfs2_log_flush_bio(sdp, WRITE);
713 703
714 if (sdp->sd_log_head != sdp->sd_log_flush_head) { 704 if (sdp->sd_log_head != sdp->sd_log_flush_head) {
705 log_flush_wait(sdp);
715 log_write_header(sdp, 0); 706 log_write_header(sdp, 0);
716 } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ 707 } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
717 atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ 708 atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
@@ -723,8 +714,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
723 gfs2_log_lock(sdp); 714 gfs2_log_lock(sdp);
724 sdp->sd_log_head = sdp->sd_log_flush_head; 715 sdp->sd_log_head = sdp->sd_log_flush_head;
725 sdp->sd_log_blks_reserved = 0; 716 sdp->sd_log_blks_reserved = 0;
726 sdp->sd_log_commited_buf = 0;
727 sdp->sd_log_commited_databuf = 0;
728 sdp->sd_log_commited_revoke = 0; 717 sdp->sd_log_commited_revoke = 0;
729 718
730 spin_lock(&sdp->sd_ail_lock); 719 spin_lock(&sdp->sd_ail_lock);
@@ -740,34 +729,54 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
740 kfree(tr); 729 kfree(tr);
741} 730}
742 731
732/**
733 * gfs2_merge_trans - Merge a new transaction into a cached transaction
734 * @old: Original transaction to be expanded
735 * @new: New transaction to be merged
736 */
737
738static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new)
739{
740 WARN_ON_ONCE(old->tr_attached != 1);
741
742 old->tr_num_buf_new += new->tr_num_buf_new;
743 old->tr_num_databuf_new += new->tr_num_databuf_new;
744 old->tr_num_buf_rm += new->tr_num_buf_rm;
745 old->tr_num_databuf_rm += new->tr_num_databuf_rm;
746 old->tr_num_revoke += new->tr_num_revoke;
747 old->tr_num_revoke_rm += new->tr_num_revoke_rm;
748
749 list_splice_tail_init(&new->tr_databuf, &old->tr_databuf);
750 list_splice_tail_init(&new->tr_buf, &old->tr_buf);
751}
752
743static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 753static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
744{ 754{
745 unsigned int reserved; 755 unsigned int reserved;
746 unsigned int unused; 756 unsigned int unused;
757 unsigned int maxres;
747 758
748 gfs2_log_lock(sdp); 759 gfs2_log_lock(sdp);
749 760
750 sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm; 761 if (sdp->sd_log_tr) {
751 sdp->sd_log_commited_databuf += tr->tr_num_databuf_new - 762 gfs2_merge_trans(sdp->sd_log_tr, tr);
752 tr->tr_num_databuf_rm; 763 } else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) {
753 gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) || 764 gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl);
754 (((int)sdp->sd_log_commited_databuf) >= 0)); 765 sdp->sd_log_tr = tr;
766 tr->tr_attached = 1;
767 }
768
755 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; 769 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
756 reserved = calc_reserved(sdp); 770 reserved = calc_reserved(sdp);
757 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved); 771 maxres = sdp->sd_log_blks_reserved + tr->tr_reserved;
758 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; 772 gfs2_assert_withdraw(sdp, maxres >= reserved);
773 unused = maxres - reserved;
759 atomic_add(unused, &sdp->sd_log_blks_free); 774 atomic_add(unused, &sdp->sd_log_blks_free);
760 trace_gfs2_log_blocks(sdp, unused); 775 trace_gfs2_log_blocks(sdp, unused);
761 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= 776 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
762 sdp->sd_jdesc->jd_blocks); 777 sdp->sd_jdesc->jd_blocks);
763 sdp->sd_log_blks_reserved = reserved; 778 sdp->sd_log_blks_reserved = reserved;
764 779
765 if (sdp->sd_log_tr == NULL &&
766 (tr->tr_num_buf_new || tr->tr_num_databuf_new)) {
767 gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl);
768 sdp->sd_log_tr = tr;
769 tr->tr_attached = 1;
770 }
771 gfs2_log_unlock(sdp); 780 gfs2_log_unlock(sdp);
772} 781}
773 782
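log_refund() now either attaches the committing transaction as sd_log_tr or folds it into the one already cached: gfs2_merge_trans() sums the new/removed counters and list_splice_tail_init() moves the incoming tr_buf/tr_databuf entries onto the tail of the cached lists, leaving the source lists empty. A userspace model of the counter half; the list splice itself needs the kernel list API:

#include <stdio.h>

struct trans {
	unsigned buf_new, buf_rm;
	unsigned databuf_new, databuf_rm;
	unsigned revoke, revoke_rm;
};

/* Model of gfs2_merge_trans(): fold 'new' into the cached 'old'. */
static void merge_trans(struct trans *old, const struct trans *new)
{
	old->buf_new     += new->buf_new;
	old->buf_rm      += new->buf_rm;
	old->databuf_new += new->databuf_new;
	old->databuf_rm  += new->databuf_rm;
	old->revoke      += new->revoke;
	old->revoke_rm   += new->revoke_rm;
	/* in the kernel, tr_buf/tr_databuf are then spliced onto 'old' */
}

int main(void)
{
	struct trans cached = { .buf_new = 5 };
	struct trans incoming = { .buf_new = 3, .buf_rm = 1 };

	merge_trans(&cached, &incoming);
	printf("net new bufs = %u\n", cached.buf_new - cached.buf_rm);	/* 7 */
	return 0;
}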
@@ -807,10 +816,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
807 down_write(&sdp->sd_log_flush_lock); 816 down_write(&sdp->sd_log_flush_lock);
808 817
809 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); 818 gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
810 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
811 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 819 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
812 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
813 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
814 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); 820 gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
815 821
816 sdp->sd_log_flush_head = sdp->sd_log_head; 822 sdp->sd_log_flush_head = sdp->sd_log_head;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 76693793cedd..a294d8d8bcd4 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -146,8 +146,8 @@ static u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
146 struct gfs2_journal_extent *je; 146 struct gfs2_journal_extent *je;
147 u64 block; 147 u64 block;
148 148
149 list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) { 149 list_for_each_entry(je, &sdp->sd_jdesc->extent_list, list) {
150 if (lbn >= je->lblock && lbn < je->lblock + je->blocks) { 150 if ((lbn >= je->lblock) && (lbn < (je->lblock + je->blocks))) {
151 block = je->dblock + lbn - je->lblock; 151 block = je->dblock + lbn - je->lblock;
152 gfs2_log_incr_head(sdp); 152 gfs2_log_incr_head(sdp);
153 return block; 153 return block;
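gfs2_log_bmap() resolves a logical journal block through the extent map built at mount time: each extent covers [lblock, lblock + blocks) in the journal file and maps linearly to disk starting at dblock. A runnable reduction of the lookup; the extent values are hypothetical:

#include <stdio.h>
#include <stdint.h>

struct journal_extent { uint64_t lblock, dblock, blocks; };

/* Translate a logical journal block to its disk address, or 0 if unmapped. */
static uint64_t log_bmap(const struct journal_extent *ext, int n, uint64_t lbn)
{
	for (int i = 0; i < n; i++)
		if (lbn >= ext[i].lblock && lbn < ext[i].lblock + ext[i].blocks)
			return ext[i].dblock + (lbn - ext[i].lblock);
	return 0;
}

int main(void)
{
	/* A journal laid out as two extents. */
	struct journal_extent map[] = {
		{ .lblock = 0,   .dblock = 5000, .blocks = 100 },
		{ .lblock = 100, .dblock = 9000, .blocks = 28  },
	};

	printf("lbn 110 -> dblock %llu\n",
	       (unsigned long long)log_bmap(map, 2, 110));	/* 9010 */
	return 0;
}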
@@ -491,44 +491,40 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
491 gfs2_log_unlock(sdp); 491 gfs2_log_unlock(sdp);
492} 492}
493 493
494static void buf_lo_before_commit(struct gfs2_sbd *sdp) 494static void buf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
495{ 495{
496 unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */ 496 unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */
497 497 unsigned int nbuf;
498 gfs2_before_commit(sdp, limit, sdp->sd_log_num_buf, 498 if (tr == NULL)
499 &sdp->sd_log_le_buf, 0); 499 return;
500 nbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm;
501 gfs2_before_commit(sdp, limit, nbuf, &tr->tr_buf, 0);
500} 502}
501 503
502static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 504static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
503{ 505{
504 struct list_head *head = &sdp->sd_log_le_buf; 506 struct list_head *head;
505 struct gfs2_bufdata *bd; 507 struct gfs2_bufdata *bd;
506 508
507 if (tr == NULL) { 509 if (tr == NULL)
508 gfs2_assert(sdp, list_empty(head));
509 return; 510 return;
510 }
511 511
512 head = &tr->tr_buf;
512 while (!list_empty(head)) { 513 while (!list_empty(head)) {
513 bd = list_entry(head->next, struct gfs2_bufdata, bd_list); 514 bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
514 list_del_init(&bd->bd_list); 515 list_del_init(&bd->bd_list);
515 sdp->sd_log_num_buf--;
516
517 gfs2_unpin(sdp, bd->bd_bh, tr); 516 gfs2_unpin(sdp, bd->bd_bh, tr);
518 } 517 }
519 gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
520} 518}
521 519
522static void buf_lo_before_scan(struct gfs2_jdesc *jd, 520static void buf_lo_before_scan(struct gfs2_jdesc *jd,
523 struct gfs2_log_header_host *head, int pass) 521 struct gfs2_log_header_host *head, int pass)
524{ 522{
525 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
526
527 if (pass != 0) 523 if (pass != 0)
528 return; 524 return;
529 525
530 sdp->sd_found_blocks = 0; 526 jd->jd_found_blocks = 0;
531 sdp->sd_replayed_blocks = 0; 527 jd->jd_replayed_blocks = 0;
532} 528}
533 529
534static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, 530static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -551,9 +547,9 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
551 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { 547 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
552 blkno = be64_to_cpu(*ptr++); 548 blkno = be64_to_cpu(*ptr++);
553 549
554 sdp->sd_found_blocks++; 550 jd->jd_found_blocks++;
555 551
556 if (gfs2_revoke_check(sdp, blkno, start)) 552 if (gfs2_revoke_check(jd, blkno, start))
557 continue; 553 continue;
558 554
559 error = gfs2_replay_read_block(jd, start, &bh_log); 555 error = gfs2_replay_read_block(jd, start, &bh_log);
@@ -574,7 +570,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
574 if (error) 570 if (error)
575 break; 571 break;
576 572
577 sdp->sd_replayed_blocks++; 573 jd->jd_replayed_blocks++;
578 } 574 }
579 575
580 return error; 576 return error;
@@ -617,10 +613,10 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
617 gfs2_meta_sync(ip->i_gl); 613 gfs2_meta_sync(ip->i_gl);
618 614
619 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n", 615 fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
620 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); 616 jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
621} 617}
622 618
623static void revoke_lo_before_commit(struct gfs2_sbd *sdp) 619static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
624{ 620{
625 struct gfs2_meta_header *mh; 621 struct gfs2_meta_header *mh;
626 unsigned int offset; 622 unsigned int offset;
@@ -679,13 +675,11 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
679static void revoke_lo_before_scan(struct gfs2_jdesc *jd, 675static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
680 struct gfs2_log_header_host *head, int pass) 676 struct gfs2_log_header_host *head, int pass)
681{ 677{
682 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
683
684 if (pass != 0) 678 if (pass != 0)
685 return; 679 return;
686 680
687 sdp->sd_found_revokes = 0; 681 jd->jd_found_revokes = 0;
688 sdp->sd_replay_tail = head->lh_tail; 682 jd->jd_replay_tail = head->lh_tail;
689} 683}
690 684
691static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, 685static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -717,13 +711,13 @@ static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
717 while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) { 711 while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) {
718 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); 712 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
719 713
720 error = gfs2_revoke_add(sdp, blkno, start); 714 error = gfs2_revoke_add(jd, blkno, start);
721 if (error < 0) { 715 if (error < 0) {
722 brelse(bh); 716 brelse(bh);
723 return error; 717 return error;
724 } 718 }
725 else if (error) 719 else if (error)
726 sdp->sd_found_revokes++; 720 jd->jd_found_revokes++;
727 721
728 if (!--revokes) 722 if (!--revokes)
729 break; 723 break;
@@ -743,16 +737,16 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
743 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); 737 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
744 738
745 if (error) { 739 if (error) {
746 gfs2_revoke_clean(sdp); 740 gfs2_revoke_clean(jd);
747 return; 741 return;
748 } 742 }
749 if (pass != 1) 743 if (pass != 1)
750 return; 744 return;
751 745
752 fs_info(sdp, "jid=%u: Found %u revoke tags\n", 746 fs_info(sdp, "jid=%u: Found %u revoke tags\n",
753 jd->jd_jid, sdp->sd_found_revokes); 747 jd->jd_jid, jd->jd_found_revokes);
754 748
755 gfs2_revoke_clean(sdp); 749 gfs2_revoke_clean(jd);
756} 750}
757 751
758/** 752/**
@@ -760,12 +754,14 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
760 * 754 *
761 */ 755 */
762 756
763static void databuf_lo_before_commit(struct gfs2_sbd *sdp) 757static void databuf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
764{ 758{
765 unsigned int limit = buf_limit(sdp) / 2; 759 unsigned int limit = databuf_limit(sdp);
766 760 unsigned int nbuf;
767 gfs2_before_commit(sdp, limit, sdp->sd_log_num_databuf, 761 if (tr == NULL)
768 &sdp->sd_log_le_databuf, 1); 762 return;
763 nbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm;
764 gfs2_before_commit(sdp, limit, nbuf, &tr->tr_databuf, 1);
769} 765}
770 766
771static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, 767static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -789,9 +785,9 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
789 blkno = be64_to_cpu(*ptr++); 785 blkno = be64_to_cpu(*ptr++);
790 esc = be64_to_cpu(*ptr++); 786 esc = be64_to_cpu(*ptr++);
791 787
792 sdp->sd_found_blocks++; 788 jd->jd_found_blocks++;
793 789
794 if (gfs2_revoke_check(sdp, blkno, start)) 790 if (gfs2_revoke_check(jd, blkno, start))
795 continue; 791 continue;
796 792
797 error = gfs2_replay_read_block(jd, start, &bh_log); 793 error = gfs2_replay_read_block(jd, start, &bh_log);
@@ -811,7 +807,7 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
811 brelse(bh_log); 807 brelse(bh_log);
812 brelse(bh_ip); 808 brelse(bh_ip);
813 809
814 sdp->sd_replayed_blocks++; 810 jd->jd_replayed_blocks++;
815 } 811 }
816 812
817 return error; 813 return error;
@@ -835,26 +831,23 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
835 gfs2_meta_sync(ip->i_gl); 831 gfs2_meta_sync(ip->i_gl);
836 832
837 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n", 833 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
838 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); 834 jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
839} 835}
840 836
841static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 837static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
842{ 838{
843 struct list_head *head = &sdp->sd_log_le_databuf; 839 struct list_head *head;
844 struct gfs2_bufdata *bd; 840 struct gfs2_bufdata *bd;
845 841
846 if (tr == NULL) { 842 if (tr == NULL)
847 gfs2_assert(sdp, list_empty(head));
848 return; 843 return;
849 }
850 844
845 head = &tr->tr_databuf;
851 while (!list_empty(head)) { 846 while (!list_empty(head)) {
852 bd = list_entry(head->next, struct gfs2_bufdata, bd_list); 847 bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
853 list_del_init(&bd->bd_list); 848 list_del_init(&bd->bd_list);
854 sdp->sd_log_num_databuf--;
855 gfs2_unpin(sdp, bd->bd_bh, tr); 849 gfs2_unpin(sdp, bd->bd_bh, tr);
856 } 850 }
857 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
858} 851}
859 852
860 853
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 9ca2e6438419..a65a7ba32ffd 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -46,12 +46,13 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp)
46 return limit; 46 return limit;
47} 47}
48 48
49static inline void lops_before_commit(struct gfs2_sbd *sdp) 49static inline void lops_before_commit(struct gfs2_sbd *sdp,
50 struct gfs2_trans *tr)
50{ 51{
51 int x; 52 int x;
52 for (x = 0; gfs2_log_ops[x]; x++) 53 for (x = 0; gfs2_log_ops[x]; x++)
53 if (gfs2_log_ops[x]->lo_before_commit) 54 if (gfs2_log_ops[x]->lo_before_commit)
54 gfs2_log_ops[x]->lo_before_commit(sdp); 55 gfs2_log_ops[x]->lo_before_commit(sdp, tr);
55} 56}
56 57
57static inline void lops_after_commit(struct gfs2_sbd *sdp, 58static inline void lops_after_commit(struct gfs2_sbd *sdp,
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index c272e73063de..82b6ac829656 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/slab.h> 12#include <linux/slab.h>
11#include <linux/spinlock.h> 13#include <linux/spinlock.h>
12#include <linux/completion.h> 14#include <linux/completion.h>
@@ -165,7 +167,7 @@ static int __init init_gfs2_fs(void)
165 167
166 gfs2_register_debugfs(); 168 gfs2_register_debugfs();
167 169
168 printk("GFS2 installed\n"); 170 pr_info("GFS2 installed\n");
169 171
170 return 0; 172 return 0;
171 173
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index c7f24690ed05..2cf09b63a6b4 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -97,6 +97,11 @@ const struct address_space_operations gfs2_meta_aops = {
97 .releasepage = gfs2_releasepage, 97 .releasepage = gfs2_releasepage,
98}; 98};
99 99
100const struct address_space_operations gfs2_rgrp_aops = {
101 .writepage = gfs2_aspace_writepage,
102 .releasepage = gfs2_releasepage,
103};
104
100/** 105/**
101 * gfs2_getbuf - Get a buffer with a given address space 106 * gfs2_getbuf - Get a buffer with a given address space
102 * @gl: the glock 107 * @gl: the glock
@@ -267,15 +272,10 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
267 trace_gfs2_pin(bd, 0); 272 trace_gfs2_pin(bd, 0);
268 atomic_dec(&sdp->sd_log_pinned); 273 atomic_dec(&sdp->sd_log_pinned);
269 list_del_init(&bd->bd_list); 274 list_del_init(&bd->bd_list);
270 if (meta) { 275 if (meta)
271 gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
272 sdp->sd_log_num_buf--;
273 tr->tr_num_buf_rm++; 276 tr->tr_num_buf_rm++;
274 } else { 277 else
275 gfs2_assert_warn(sdp, sdp->sd_log_num_databuf);
276 sdp->sd_log_num_databuf--;
277 tr->tr_num_databuf_rm++; 278 tr->tr_num_databuf_rm++;
278 }
279 tr->tr_touched = 1; 279 tr->tr_touched = 1;
280 was_pinned = 1; 280 was_pinned = 1;
281 brelse(bh); 281 brelse(bh);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 4823b934208a..ac5d8027d335 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -38,12 +38,15 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
38} 38}
39 39
40extern const struct address_space_operations gfs2_meta_aops; 40extern const struct address_space_operations gfs2_meta_aops;
41extern const struct address_space_operations gfs2_rgrp_aops;
41 42
42static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) 43static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
43{ 44{
44 struct inode *inode = mapping->host; 45 struct inode *inode = mapping->host;
45 if (mapping->a_ops == &gfs2_meta_aops) 46 if (mapping->a_ops == &gfs2_meta_aops)
46 return (((struct gfs2_glock *)mapping) - 1)->gl_sbd; 47 return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
48 else if (mapping->a_ops == &gfs2_rgrp_aops)
49 return container_of(mapping, struct gfs2_sbd, sd_aspace);
47 else 50 else
48 return inode->i_sb->s_fs_info; 51 return inode->i_sb->s_fs_info;
49} 52}
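The new branch works because sd_aspace is embedded directly in struct gfs2_sbd, so container_of() can step back from the address_space to its superblock; the gfs2_meta_aops case instead relies on the mapping sitting immediately after its glock. container_of() is plain pointer arithmetic over offsetof(), as this runnable model shows:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct address_space { void *host; };

struct sbd {
	int sd_fsid;
	struct address_space sd_aspace;	/* embedded, as in gfs2_sbd */
};

int main(void)
{
	struct sbd s = { .sd_fsid = 7 };
	struct address_space *mapping = &s.sd_aspace;

	/* Recover the containing sbd from the embedded mapping. */
	struct sbd *back = container_of(mapping, struct sbd, sd_aspace);
	printf("fsid = %d\n", back->sd_fsid);	/* 7 */
	return 0;
}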
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c6872d09561a..22f954051bb8 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/sched.h> 12#include <linux/sched.h>
11#include <linux/slab.h> 13#include <linux/slab.h>
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
@@ -104,7 +106,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
104 mapping = &sdp->sd_aspace; 106 mapping = &sdp->sd_aspace;
105 107
106 address_space_init_once(mapping); 108 address_space_init_once(mapping);
107 mapping->a_ops = &gfs2_meta_aops; 109 mapping->a_ops = &gfs2_rgrp_aops;
108 mapping->host = sb->s_bdev->bd_inode; 110 mapping->host = sb->s_bdev->bd_inode;
109 mapping->flags = 0; 111 mapping->flags = 0;
110 mapping_set_gfp_mask(mapping, GFP_NOFS); 112 mapping_set_gfp_mask(mapping, GFP_NOFS);
@@ -114,9 +116,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
114 116
115 spin_lock_init(&sdp->sd_log_lock); 117 spin_lock_init(&sdp->sd_log_lock);
116 atomic_set(&sdp->sd_log_pinned, 0); 118 atomic_set(&sdp->sd_log_pinned, 0);
117 INIT_LIST_HEAD(&sdp->sd_log_le_buf);
118 INIT_LIST_HEAD(&sdp->sd_log_le_revoke); 119 INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
119 INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
120 INIT_LIST_HEAD(&sdp->sd_log_le_ordered); 120 INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
121 spin_lock_init(&sdp->sd_ordered_lock); 121 spin_lock_init(&sdp->sd_ordered_lock);
122 122
@@ -130,8 +130,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
130 atomic_set(&sdp->sd_log_in_flight, 0); 130 atomic_set(&sdp->sd_log_in_flight, 0);
131 init_waitqueue_head(&sdp->sd_log_flush_wait); 131 init_waitqueue_head(&sdp->sd_log_flush_wait);
132 132
133 INIT_LIST_HEAD(&sdp->sd_revoke_list);
134
135 return sdp; 133 return sdp;
136} 134}
137 135
@@ -154,7 +152,7 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
154 if (sb->sb_magic != GFS2_MAGIC || 152 if (sb->sb_magic != GFS2_MAGIC ||
155 sb->sb_type != GFS2_METATYPE_SB) { 153 sb->sb_type != GFS2_METATYPE_SB) {
156 if (!silent) 154 if (!silent)
157 printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n"); 155 pr_warn("not a GFS2 filesystem\n");
158 return -EINVAL; 156 return -EINVAL;
159 } 157 }
160 158
@@ -176,7 +174,7 @@ static void end_bio_io_page(struct bio *bio, int error)
176 if (!error) 174 if (!error)
177 SetPageUptodate(page); 175 SetPageUptodate(page);
178 else 176 else
179 printk(KERN_WARNING "gfs2: error %d reading superblock\n", error); 177 pr_warn("error %d reading superblock\n", error);
180 unlock_page(page); 178 unlock_page(page);
181} 179}
182 180
@@ -519,67 +517,6 @@ out:
519 return ret; 517 return ret;
520} 518}
521 519
522/**
523 * map_journal_extents - create a reusable "extent" mapping from all logical
524 * blocks to all physical blocks for the given journal. This will save
525 * us time when writing journal blocks. Most journals will have only one
526 * extent that maps all their logical blocks. That's because gfs2.mkfs
527 * arranges the journal blocks sequentially to maximize performance.
528 * So the extent would map the first block for the entire file length.
529 * However, gfs2_jadd can happen while file activity is happening, so
530 * those journals may not be sequential. Less likely is the case where
531 * the users created their own journals by mounting the metafs and
532 * laying it out. But it's still possible. These journals might have
533 * several extents.
534 *
535 * TODO: This should be done in bigger chunks rather than one block at a time,
536 * but since it's only done at mount time, I'm not worried about the
537 * time it takes.
538 */
539static int map_journal_extents(struct gfs2_sbd *sdp)
540{
541 struct gfs2_jdesc *jd = sdp->sd_jdesc;
542 unsigned int lb;
543 u64 db, prev_db; /* logical block, disk block, prev disk block */
544 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
545 struct gfs2_journal_extent *jext = NULL;
546 struct buffer_head bh;
547 int rc = 0;
548
549 prev_db = 0;
550
551 for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
552 bh.b_state = 0;
553 bh.b_blocknr = 0;
554 bh.b_size = 1 << ip->i_inode.i_blkbits;
555 rc = gfs2_block_map(jd->jd_inode, lb, &bh, 0);
556 db = bh.b_blocknr;
557 if (rc || !db) {
558 printk(KERN_INFO "GFS2 journal mapping error %d: lb="
559 "%u db=%llu\n", rc, lb, (unsigned long long)db);
560 break;
561 }
562 if (!prev_db || db != prev_db + 1) {
563 jext = kzalloc(sizeof(struct gfs2_journal_extent),
564 GFP_KERNEL);
565 if (!jext) {
566 printk(KERN_INFO "GFS2 error: out of memory "
567 "mapping journal extents.\n");
568 rc = -ENOMEM;
569 break;
570 }
571 jext->dblock = db;
572 jext->lblock = lb;
573 jext->blocks = 1;
574 list_add_tail(&jext->extent_list, &jd->extent_list);
575 } else {
576 jext->blocks++;
577 }
578 prev_db = db;
579 }
580 return rc;
581}
582
583static void gfs2_others_may_mount(struct gfs2_sbd *sdp) 520static void gfs2_others_may_mount(struct gfs2_sbd *sdp)
584{ 521{
585 char *message = "FIRSTMOUNT=Done"; 522 char *message = "FIRSTMOUNT=Done";
@@ -638,6 +575,8 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
638 break; 575 break;
639 576
640 INIT_LIST_HEAD(&jd->extent_list); 577 INIT_LIST_HEAD(&jd->extent_list);
578 INIT_LIST_HEAD(&jd->jd_revoke_list);
579
641 INIT_WORK(&jd->jd_work, gfs2_recover_func); 580 INIT_WORK(&jd->jd_work, gfs2_recover_func);
642 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); 581 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
643 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { 582 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
@@ -781,7 +720,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
781 atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); 720 atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
782 721
783 /* Map the extents for this journal's blocks */ 722 /* Map the extents for this journal's blocks */
784 map_journal_extents(sdp); 723 gfs2_map_journal_extents(sdp, sdp->sd_jdesc);
785 } 724 }
786 trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free)); 725 trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free));
787 726
@@ -1008,7 +947,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1008 lm = &gfs2_dlm_ops; 947 lm = &gfs2_dlm_ops;
1009#endif 948#endif
1010 } else { 949 } else {
1011 printk(KERN_INFO "GFS2: can't find protocol %s\n", proto); 950 pr_info("can't find protocol %s\n", proto);
1012 return -ENOENT; 951 return -ENOENT;
1013 } 952 }
1014 953
@@ -1115,7 +1054,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1115 1054
1116 sdp = init_sbd(sb); 1055 sdp = init_sbd(sb);
1117 if (!sdp) { 1056 if (!sdp) {
1118 printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n"); 1057 pr_warn("can't alloc struct gfs2_sbd\n");
1119 return -ENOMEM; 1058 return -ENOMEM;
1120 } 1059 }
1121 sdp->sd_args = *args; 1060 sdp->sd_args = *args;
@@ -1363,7 +1302,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
1363 1302
1364 error = gfs2_mount_args(&args, data); 1303 error = gfs2_mount_args(&args, data);
1365 if (error) { 1304 if (error) {
1366 printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); 1305 pr_warn("can't parse mount arguments\n");
1367 goto error_super; 1306 goto error_super;
1368 } 1307 }
1369 1308
@@ -1413,15 +1352,15 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
1413 1352
1414 error = kern_path(dev_name, LOOKUP_FOLLOW, &path); 1353 error = kern_path(dev_name, LOOKUP_FOLLOW, &path);
1415 if (error) { 1354 if (error) {
1416 printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", 1355 pr_warn("path_lookup on %s returned error %d\n",
1417 dev_name, error); 1356 dev_name, error);
1418 return ERR_PTR(error); 1357 return ERR_PTR(error);
1419 } 1358 }
1420 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, 1359 s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags,
1421 path.dentry->d_inode->i_sb->s_bdev); 1360 path.dentry->d_inode->i_sb->s_bdev);
1422 path_put(&path); 1361 path_put(&path);
1423 if (IS_ERR(s)) { 1362 if (IS_ERR(s)) {
1424 printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); 1363 pr_warn("gfs2 mount does not exist\n");
1425 return ERR_CAST(s); 1364 return ERR_CAST(s);
1426 } 1365 }
1427 if ((flags ^ s->s_flags) & MS_RDONLY) { 1366 if ((flags ^ s->s_flags) & MS_RDONLY) {
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 8bec0e3192dd..c4effff7cf55 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -36,6 +36,8 @@
36 * the quota file, so it is not being constantly read. 36 * the quota file, so it is not being constantly read.
37 */ 37 */
38 38
39#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
40
39#include <linux/sched.h> 41#include <linux/sched.h>
40#include <linux/slab.h> 42#include <linux/slab.h>
41#include <linux/mm.h> 43#include <linux/mm.h>
@@ -330,6 +332,7 @@ static int slot_get(struct gfs2_quota_data *qd)
330 if (bit < sdp->sd_quota_slots) { 332 if (bit < sdp->sd_quota_slots) {
331 set_bit(bit, sdp->sd_quota_bitmap); 333 set_bit(bit, sdp->sd_quota_bitmap);
332 qd->qd_slot = bit; 334 qd->qd_slot = bit;
335 error = 0;
333out: 336out:
334 qd->qd_slot_count++; 337 qd->qd_slot_count++;
335 } 338 }
@@ -1081,10 +1084,10 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
1081{ 1084{
1082 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; 1085 struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
1083 1086
1084 printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n", 1087 fs_info(sdp, "quota %s for %s %u\n",
1085 sdp->sd_fsname, type, 1088 type,
1086 (qd->qd_id.type == USRQUOTA) ? "user" : "group", 1089 (qd->qd_id.type == USRQUOTA) ? "user" : "group",
1087 from_kqid(&init_user_ns, qd->qd_id)); 1090 from_kqid(&init_user_ns, qd->qd_id));
1088 1091
1089 return 0; 1092 return 0;
1090} 1093}
@@ -1242,14 +1245,13 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1242 bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long)); 1245 bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long));
1243 bm_size *= sizeof(unsigned long); 1246 bm_size *= sizeof(unsigned long);
1244 error = -ENOMEM; 1247 error = -ENOMEM;
1245 sdp->sd_quota_bitmap = kmalloc(bm_size, GFP_NOFS|__GFP_NOWARN); 1248 sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN);
1246 if (sdp->sd_quota_bitmap == NULL) 1249 if (sdp->sd_quota_bitmap == NULL)
1247 sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS, PAGE_KERNEL); 1250 sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS |
1251 __GFP_ZERO, PAGE_KERNEL);
1248 if (!sdp->sd_quota_bitmap) 1252 if (!sdp->sd_quota_bitmap)
1249 return error; 1253 return error;
1250 1254
1251 memset(sdp->sd_quota_bitmap, 0, bm_size);
1252
1253 for (x = 0; x < blocks; x++) { 1255 for (x = 0; x < blocks; x++) {
1254 struct buffer_head *bh; 1256 struct buffer_head *bh;
1255 const struct gfs2_quota_change *qc; 1257 const struct gfs2_quota_change *qc;
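The kmalloc/memset pair becomes a zeroed allocation at the source: kzalloc() in the common case, falling back to __vmalloc() with __GFP_ZERO (the three-argument form used in the tree at this point) when the bitmap is too big for a contiguous allocation. The fallback pattern as a kernel-context sketch, with the error handling trimmed:

#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *alloc_quota_bitmap(size_t bm_size)
{
	void *bitmap;

	/* Fast path: physically contiguous and pre-zeroed. */
	bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN);
	if (bitmap == NULL)
		/* Large bitmaps: vmalloc space, still zeroed. */
		bitmap = __vmalloc(bm_size, GFP_NOFS | __GFP_ZERO,
				   PAGE_KERNEL);
	return bitmap;
}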
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 963b2d75200c..7ad4094d68c0 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -52,9 +52,9 @@ int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
52 return error; 52 return error;
53} 53}
54 54
55int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) 55int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
56{ 56{
57 struct list_head *head = &sdp->sd_revoke_list; 57 struct list_head *head = &jd->jd_revoke_list;
58 struct gfs2_revoke_replay *rr; 58 struct gfs2_revoke_replay *rr;
59 int found = 0; 59 int found = 0;
60 60
@@ -81,13 +81,13 @@ int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
81 return 1; 81 return 1;
82} 82}
83 83
84int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where) 84int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
85{ 85{
86 struct gfs2_revoke_replay *rr; 86 struct gfs2_revoke_replay *rr;
87 int wrap, a, b, revoke; 87 int wrap, a, b, revoke;
88 int found = 0; 88 int found = 0;
89 89
90 list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) { 90 list_for_each_entry(rr, &jd->jd_revoke_list, rr_list) {
91 if (rr->rr_blkno == blkno) { 91 if (rr->rr_blkno == blkno) {
92 found = 1; 92 found = 1;
93 break; 93 break;
@@ -97,17 +97,17 @@ int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
97 if (!found) 97 if (!found)
98 return 0; 98 return 0;
99 99
100 wrap = (rr->rr_where < sdp->sd_replay_tail); 100 wrap = (rr->rr_where < jd->jd_replay_tail);
101 a = (sdp->sd_replay_tail < where); 101 a = (jd->jd_replay_tail < where);
102 b = (where < rr->rr_where); 102 b = (where < rr->rr_where);
103 revoke = (wrap) ? (a || b) : (a && b); 103 revoke = (wrap) ? (a || b) : (a && b);
104 104
105 return revoke; 105 return revoke;
106} 106}
107 107
108void gfs2_revoke_clean(struct gfs2_sbd *sdp) 108void gfs2_revoke_clean(struct gfs2_jdesc *jd)
109{ 109{
110 struct list_head *head = &sdp->sd_revoke_list; 110 struct list_head *head = &jd->jd_revoke_list;
111 struct gfs2_revoke_replay *rr; 111 struct gfs2_revoke_replay *rr;
112 112
113 while (!list_empty(head)) { 113 while (!list_empty(head)) {
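gfs2_revoke_check() has to honour the circular journal: a revoke recorded at position R cancels a block seen at position W only when W falls in the circular interval between the log tail T and R, and that interval wraps whenever R < T. That is exactly the wrap ? (a || b) : (a && b) expression above; a runnable check with sample positions:

#include <stdio.h>

/* Is 'where' inside the circular interval (tail, rr_where)? */
static int revoked(unsigned tail, unsigned rr_where, unsigned where)
{
	int wrap = rr_where < tail;
	int a = tail < where;
	int b = where < rr_where;

	return wrap ? (a || b) : (a && b);
}

int main(void)
{
	/* Journal tail at 50, revoke logged at position 70. */
	printf("%d\n", revoked(50, 70, 60));	/* 1: between tail and revoke */
	printf("%d\n", revoked(50, 70, 80));	/* 0: after the revoke */
	/* Revoke at 10 means the interval wrapped past the log end. */
	printf("%d\n", revoked(50, 10, 60));	/* 1 */
	printf("%d\n", revoked(50, 10, 30));	/* 0 */
	return 0;
}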
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 2226136c7647..6142836cce96 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -23,9 +23,9 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
23extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, 23extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
24 struct buffer_head **bh); 24 struct buffer_head **bh);
25 25
26extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); 26extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
27extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); 27extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
28extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); 28extern void gfs2_revoke_clean(struct gfs2_jdesc *jd);
29 29
30extern int gfs2_find_jhead(struct gfs2_jdesc *jd, 30extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
31 struct gfs2_log_header_host *head); 31 struct gfs2_log_header_host *head);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index a1da21349235..281a7716e3f3 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/slab.h> 12#include <linux/slab.h>
11#include <linux/spinlock.h> 13#include <linux/spinlock.h>
12#include <linux/completion.h> 14#include <linux/completion.h>
@@ -99,12 +101,12 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
99 cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; 101 cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
100 102
101 if (unlikely(!valid_change[new_state * 4 + cur_state])) { 103 if (unlikely(!valid_change[new_state * 4 + cur_state])) {
102 printk(KERN_WARNING "GFS2: buf_blk = 0x%x old_state=%d, " 104 pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n",
103 "new_state=%d\n", rbm->offset, cur_state, new_state); 105 rbm->offset, cur_state, new_state);
104 printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%x\n", 106 pr_warn("rgrp=0x%llx bi_start=0x%x\n",
105 (unsigned long long)rbm->rgd->rd_addr, bi->bi_start); 107 (unsigned long long)rbm->rgd->rd_addr, bi->bi_start);
106 printk(KERN_WARNING "GFS2: bi_offset=0x%x bi_len=0x%x\n", 108 pr_warn("bi_offset=0x%x bi_len=0x%x\n",
107 bi->bi_offset, bi->bi_len); 109 bi->bi_offset, bi->bi_len);
108 dump_stack(); 110 dump_stack();
109 gfs2_consist_rgrpd(rbm->rgd); 111 gfs2_consist_rgrpd(rbm->rgd);
110 return; 112 return;
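Resource-group bitmaps pack two bits of allocation state per block, which is why gfs2_setbit() reads the current state as (*byte1 >> bit) & GFS2_BIT_MASK and indexes the transition table as new_state * 4 + cur_state. The extraction step in runnable form, with a made-up byte value:

#include <stdio.h>

#define BIT_MASK 3	/* two bits per block: four allocation states */

int main(void)
{
	unsigned char byte = 0xB4;	/* four two-bit states, LSB first: 0, 1, 3, 2 */

	for (unsigned bit = 0; bit < 8; bit += 2)
		printf("block %u: state %u\n", bit / 2,
		       (byte >> bit) & BIT_MASK);
	return 0;
}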
@@ -736,11 +738,11 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
736 738
737static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd) 739static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
738{ 740{
739 printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr); 741 pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
740 printk(KERN_INFO " ri_length = %u\n", rgd->rd_length); 742 pr_info("ri_length = %u\n", rgd->rd_length);
741 printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0); 743 pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
742 printk(KERN_INFO " ri_data = %u\n", rgd->rd_data); 744 pr_info("ri_data = %u\n", rgd->rd_data);
743 printk(KERN_INFO " ri_bitbytes = %u\n", rgd->rd_bitbytes); 745 pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes);
744} 746}
745 747
746/** 748/**
@@ -1102,7 +1104,7 @@ static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1102 * Returns: errno 1104 * Returns: errno
1103 */ 1105 */
1104 1106
1105int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) 1107static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1106{ 1108{
1107 struct gfs2_sbd *sdp = rgd->rd_sbd; 1109 struct gfs2_sbd *sdp = rgd->rd_sbd;
1108 struct gfs2_glock *gl = rgd->rd_gl; 1110 struct gfs2_glock *gl = rgd->rd_gl;
@@ -1169,7 +1171,7 @@ fail:
1169 return error; 1171 return error;
1170} 1172}
1171 1173
1172int update_rgrp_lvb(struct gfs2_rgrpd *rgd) 1174static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1173{ 1175{
1174 u32 rl_flags; 1176 u32 rl_flags;
1175 1177
@@ -2278,7 +2280,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2278 } 2280 }
2279 } 2281 }
2280 if (rbm.rgd->rd_free < *nblocks) { 2282 if (rbm.rgd->rd_free < *nblocks) {
2281 printk(KERN_WARNING "nblocks=%u\n", *nblocks); 2283 pr_warn("nblocks=%u\n", *nblocks);
2282 goto rgrp_error; 2284 goto rgrp_error;
2283 } 2285 }
2284 2286
@@ -2296,7 +2298,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2296 2298
2297 gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); 2299 gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2298 if (dinode) 2300 if (dinode)
2299 gfs2_trans_add_unrevoke(sdp, block, 1); 2301 gfs2_trans_add_unrevoke(sdp, block, *nblocks);
2300 2302
2301 gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid); 2303 gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2302 2304
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 60f60f6181f3..de8afad89e51 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/bio.h> 12#include <linux/bio.h>
11#include <linux/sched.h> 13#include <linux/sched.h>
12#include <linux/slab.h> 14#include <linux/slab.h>
@@ -175,8 +177,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
175 break; 177 break;
176 case Opt_debug: 178 case Opt_debug:
177 if (args->ar_errors == GFS2_ERRORS_PANIC) { 179 if (args->ar_errors == GFS2_ERRORS_PANIC) {
178 printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " 180 pr_warn("-o debug and -o errors=panic are mutually exclusive\n");
179 "are mutually exclusive.\n");
180 return -EINVAL; 181 return -EINVAL;
181 } 182 }
182 args->ar_debug = 1; 183 args->ar_debug = 1;
@@ -228,21 +229,21 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
228 case Opt_commit: 229 case Opt_commit:
229 rv = match_int(&tmp[0], &args->ar_commit); 230 rv = match_int(&tmp[0], &args->ar_commit);
230 if (rv || args->ar_commit <= 0) { 231 if (rv || args->ar_commit <= 0) {
231 printk(KERN_WARNING "GFS2: commit mount option requires a positive numeric argument\n"); 232 pr_warn("commit mount option requires a positive numeric argument\n");
232 return rv ? rv : -EINVAL; 233 return rv ? rv : -EINVAL;
233 } 234 }
234 break; 235 break;
235 case Opt_statfs_quantum: 236 case Opt_statfs_quantum:
236 rv = match_int(&tmp[0], &args->ar_statfs_quantum); 237 rv = match_int(&tmp[0], &args->ar_statfs_quantum);
237 if (rv || args->ar_statfs_quantum < 0) { 238 if (rv || args->ar_statfs_quantum < 0) {
238 printk(KERN_WARNING "GFS2: statfs_quantum mount option requires a non-negative numeric argument\n"); 239 pr_warn("statfs_quantum mount option requires a non-negative numeric argument\n");
239 return rv ? rv : -EINVAL; 240 return rv ? rv : -EINVAL;
240 } 241 }
241 break; 242 break;
242 case Opt_quota_quantum: 243 case Opt_quota_quantum:
243 rv = match_int(&tmp[0], &args->ar_quota_quantum); 244 rv = match_int(&tmp[0], &args->ar_quota_quantum);
244 if (rv || args->ar_quota_quantum <= 0) { 245 if (rv || args->ar_quota_quantum <= 0) {
245 printk(KERN_WARNING "GFS2: quota_quantum mount option requires a positive numeric argument\n"); 246 pr_warn("quota_quantum mount option requires a positive numeric argument\n");
246 return rv ? rv : -EINVAL; 247 return rv ? rv : -EINVAL;
247 } 248 }
248 break; 249 break;
@@ -250,7 +251,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
250 rv = match_int(&tmp[0], &args->ar_statfs_percent); 251 rv = match_int(&tmp[0], &args->ar_statfs_percent);
251 if (rv || args->ar_statfs_percent < 0 || 252 if (rv || args->ar_statfs_percent < 0 ||
252 args->ar_statfs_percent > 100) { 253 args->ar_statfs_percent > 100) {
253 printk(KERN_WARNING "statfs_percent mount option requires a numeric argument between 0 and 100\n"); 254 pr_warn("statfs_percent mount option requires a numeric argument between 0 and 100\n");
254 return rv ? rv : -EINVAL; 255 return rv ? rv : -EINVAL;
255 } 256 }
256 break; 257 break;
@@ -259,8 +260,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
259 break; 260 break;
260 case Opt_err_panic: 261 case Opt_err_panic:
261 if (args->ar_debug) { 262 if (args->ar_debug) {
262 printk(KERN_WARNING "GFS2: -o debug and -o errors=panic " 263 pr_warn("-o debug and -o errors=panic are mutually exclusive\n");
263 "are mutually exclusive.\n");
264 return -EINVAL; 264 return -EINVAL;
265 } 265 }
266 args->ar_errors = GFS2_ERRORS_PANIC; 266 args->ar_errors = GFS2_ERRORS_PANIC;
@@ -279,7 +279,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
279 break; 279 break;
280 case Opt_error: 280 case Opt_error:
281 default: 281 default:
282 printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o); 282 pr_warn("invalid mount option: %s\n", o);
283 return -EINVAL; 283 return -EINVAL;
284 } 284 }
285 } 285 }
@@ -295,9 +295,8 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
295 295
296void gfs2_jindex_free(struct gfs2_sbd *sdp) 296void gfs2_jindex_free(struct gfs2_sbd *sdp)
297{ 297{
298 struct list_head list, *head; 298 struct list_head list;
299 struct gfs2_jdesc *jd; 299 struct gfs2_jdesc *jd;
300 struct gfs2_journal_extent *jext;
301 300
302 spin_lock(&sdp->sd_jindex_spin); 301 spin_lock(&sdp->sd_jindex_spin);
303 list_add(&list, &sdp->sd_jindex_list); 302 list_add(&list, &sdp->sd_jindex_list);
@@ -307,14 +306,7 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp)
307 306
308 while (!list_empty(&list)) { 307 while (!list_empty(&list)) {
309 jd = list_entry(list.next, struct gfs2_jdesc, jd_list); 308 jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
310 head = &jd->extent_list; 309 gfs2_free_journal_extents(jd);
311 while (!list_empty(head)) {
312 jext = list_entry(head->next,
313 struct gfs2_journal_extent,
314 extent_list);
315 list_del(&jext->extent_list);
316 kfree(jext);
317 }
318 list_del(&jd->jd_list); 310 list_del(&jd->jd_list);
319 iput(jd->jd_inode); 311 iput(jd->jd_inode);
320 kfree(jd); 312 kfree(jd);
@@ -1175,6 +1167,8 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1175 struct gfs2_tune *gt = &sdp->sd_tune; 1167 struct gfs2_tune *gt = &sdp->sd_tune;
1176 int error; 1168 int error;
1177 1169
1170 sync_filesystem(sb);
1171
1178 spin_lock(&gt->gt_spin); 1172 spin_lock(&gt->gt_spin);
1179 args.ar_commit = gt->gt_logd_secs; 1173 args.ar_commit = gt->gt_logd_secs;
1180 args.ar_quota_quantum = gt->gt_quota_quantum; 1174 args.ar_quota_quantum = gt->gt_quota_quantum;
@@ -1256,7 +1250,7 @@ static int gfs2_drop_inode(struct inode *inode)
1256{ 1250{
1257 struct gfs2_inode *ip = GFS2_I(inode); 1251 struct gfs2_inode *ip = GFS2_I(inode);
1258 1252
1259 if (inode->i_nlink) { 1253 if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) && inode->i_nlink) {
1260 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; 1254 struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
1261 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) 1255 if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
1262 clear_nlink(inode); 1256 clear_nlink(inode);
@@ -1471,6 +1465,11 @@ static void gfs2_evict_inode(struct inode *inode)
1471 struct gfs2_holder gh; 1465 struct gfs2_holder gh;
1472 int error; 1466 int error;
1473 1467
1468 if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) {
1469 clear_inode(inode);
1470 return;
1471 }
1472
1474 if (inode->i_nlink || (sb->s_flags & MS_RDONLY)) 1473 if (inode->i_nlink || (sb->s_flags & MS_RDONLY))
1475 goto out; 1474 goto out;
1476 1475
@@ -1558,7 +1557,7 @@ out_unlock:
1558 fs_warn(sdp, "gfs2_evict_inode: %d\n", error); 1557 fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
1559out: 1558out:
1560 /* Case 3 starts here */ 1559 /* Case 3 starts here */
1561 truncate_inode_pages(&inode->i_data, 0); 1560 truncate_inode_pages_final(&inode->i_data);
1562 gfs2_rs_delete(ip, NULL); 1561 gfs2_rs_delete(ip, NULL);
1563 gfs2_ordered_del_inode(ip); 1562 gfs2_ordered_del_inode(ip);
1564 clear_inode(inode); 1563 clear_inode(inode);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index d09f6edda0ff..de25d5577e5d 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/sched.h> 12#include <linux/sched.h>
11#include <linux/spinlock.h> 13#include <linux/spinlock.h>
12#include <linux/completion.h> 14#include <linux/completion.h>
@@ -138,9 +140,8 @@ static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
138 if (simple_strtol(buf, NULL, 0) != 1) 140 if (simple_strtol(buf, NULL, 0) != 1)
139 return -EINVAL; 141 return -EINVAL;
140 142
141 gfs2_lm_withdraw(sdp, 143 gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n");
142 "GFS2: fsid=%s: withdrawing from cluster at user's request\n", 144
143 sdp->sd_fsname);
144 return len; 145 return len;
145} 146}
146 147
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 2b20d7046bf3..bead90d27bad 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/sched.h> 12#include <linux/sched.h>
11#include <linux/slab.h> 13#include <linux/slab.h>
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
@@ -51,6 +53,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
51 if (revokes) 53 if (revokes)
52 tr->tr_reserved += gfs2_struct2blk(sdp, revokes, 54 tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
53 sizeof(u64)); 55 sizeof(u64));
56 INIT_LIST_HEAD(&tr->tr_databuf);
57 INIT_LIST_HEAD(&tr->tr_buf);
58
54 sb_start_intwrite(sdp->sd_vfs); 59 sb_start_intwrite(sdp->sd_vfs);
55 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); 60 gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
56 61
@@ -96,14 +101,13 @@ static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
96 101
97static void gfs2_print_trans(const struct gfs2_trans *tr) 102static void gfs2_print_trans(const struct gfs2_trans *tr)
98{ 103{
99 printk(KERN_WARNING "GFS2: Transaction created at: %pSR\n", 104 pr_warn("Transaction created at: %pSR\n", (void *)tr->tr_ip);
100 (void *)tr->tr_ip); 105 pr_warn("blocks=%u revokes=%u reserved=%u touched=%u\n",
101 printk(KERN_WARNING "GFS2: blocks=%u revokes=%u reserved=%u touched=%d\n", 106 tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched);
102 tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched); 107 pr_warn("Buf %u/%u Databuf %u/%u Revoke %u/%u\n",
103 printk(KERN_WARNING "GFS2: Buf %u/%u Databuf %u/%u Revoke %u/%u\n", 108 tr->tr_num_buf_new, tr->tr_num_buf_rm,
104 tr->tr_num_buf_new, tr->tr_num_buf_rm, 109 tr->tr_num_databuf_new, tr->tr_num_databuf_rm,
105 tr->tr_num_databuf_new, tr->tr_num_databuf_rm, 110 tr->tr_num_revoke, tr->tr_num_revoke_rm);
106 tr->tr_num_revoke, tr->tr_num_revoke_rm);
107} 111}
108 112
109void gfs2_trans_end(struct gfs2_sbd *sdp) 113void gfs2_trans_end(struct gfs2_sbd *sdp)
@@ -210,8 +214,7 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
210 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); 214 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
211 gfs2_pin(sdp, bd->bd_bh); 215 gfs2_pin(sdp, bd->bd_bh);
212 tr->tr_num_databuf_new++; 216 tr->tr_num_databuf_new++;
213 sdp->sd_log_num_databuf++; 217 list_add_tail(&bd->bd_list, &tr->tr_databuf);
214 list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf);
215 } 218 }
216 gfs2_log_unlock(sdp); 219 gfs2_log_unlock(sdp);
217 unlock_buffer(bh); 220 unlock_buffer(bh);
@@ -230,16 +233,14 @@ static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
230 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); 233 set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
231 mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; 234 mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
232 if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { 235 if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) {
233 printk(KERN_ERR 236 pr_err("Attempting to add uninitialised block to journal (inplace block=%lld)\n",
234 "Attempting to add uninitialised block to journal (inplace block=%lld)\n",
235 (unsigned long long)bd->bd_bh->b_blocknr); 237 (unsigned long long)bd->bd_bh->b_blocknr);
236 BUG(); 238 BUG();
237 } 239 }
238 gfs2_pin(sdp, bd->bd_bh); 240 gfs2_pin(sdp, bd->bd_bh);
239 mh->__pad0 = cpu_to_be64(0); 241 mh->__pad0 = cpu_to_be64(0);
240 mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); 242 mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
241 sdp->sd_log_num_buf++; 243 list_add(&bd->bd_list, &tr->tr_buf);
242 list_add(&bd->bd_list, &sdp->sd_log_le_buf);
243 tr->tr_num_buf_new++; 244 tr->tr_num_buf_new++;
244} 245}
245 246
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index f7109f689e61..86d2035ac669 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -7,6 +7,8 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
10#include <linux/spinlock.h> 12#include <linux/spinlock.h>
11#include <linux/completion.h> 13#include <linux/completion.h>
12#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
@@ -30,22 +32,27 @@ mempool_t *gfs2_page_pool __read_mostly;
30 32
31void gfs2_assert_i(struct gfs2_sbd *sdp) 33void gfs2_assert_i(struct gfs2_sbd *sdp)
32{ 34{
33 printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n", 35 fs_emerg(sdp, "fatal assertion failed\n");
34 sdp->sd_fsname);
35} 36}
36 37
37int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) 38int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...)
38{ 39{
39 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 40 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
40 const struct lm_lockops *lm = ls->ls_ops; 41 const struct lm_lockops *lm = ls->ls_ops;
41 va_list args; 42 va_list args;
43 struct va_format vaf;
42 44
43 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && 45 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
44 test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) 46 test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
45 return 0; 47 return 0;
46 48
47 va_start(args, fmt); 49 va_start(args, fmt);
48 vprintk(fmt, args); 50
51 vaf.fmt = fmt;
52 vaf.va = &args;
53
54 fs_err(sdp, "%pV", &vaf);
55
49 va_end(args); 56 va_end(args);
50 57
51 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { 58 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
@@ -66,7 +73,7 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
66 } 73 }
67 74
68 if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) 75 if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
69 panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname); 76 panic("GFS2: fsid=%s: panic requested\n", sdp->sd_fsname);
70 77
71 return -1; 78 return -1;
72} 79}
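
gfs2_lm_withdraw() now wraps its va_list in a struct va_format and prints it through fs_err()'s "%pV", so the "fsid=%s:" prefix is applied exactly once by the fs_err() macro and callers stop repeating it in every line of their format strings. Userspace printf has no %pV, but the same shape, format the caller's message once and prefix it centrally, can be sketched with vsnprintf (fs_err_demo and struct sbd_demo are illustrative names):

    #include <stdarg.h>
    #include <stdio.h>

    struct sbd_demo { const char *sd_fsname; };

    /* Prefix with the fsid, then emit the caller's formatted message.
     * The kernel avoids the intermediate buffer by handing vsnprintf a
     * struct va_format through the %pV extension instead. */
    static void fs_err_demo(const struct sbd_demo *sdp, const char *fmt, ...)
    {
        char msg[256];
        va_list args;

        va_start(args, fmt);
        vsnprintf(msg, sizeof(msg), fmt, args);
        va_end(args);

        fprintf(stderr, "gfs2: fsid=%s: %s", sdp->sd_fsname, msg);
    }

    int main(void)
    {
        struct sbd_demo sdp = { .sd_fsname = "cluster:vol0" };

        fs_err_demo(&sdp, "fatal: I/O error\n function = %s, line = %u\n",
                    "gfs2_io_error_i", 241);
        return 0;
    }
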
@@ -82,10 +89,9 @@ int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
82{ 89{
83 int me; 90 int me;
84 me = gfs2_lm_withdraw(sdp, 91 me = gfs2_lm_withdraw(sdp,
85 "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n" 92 "fatal: assertion \"%s\" failed\n"
86 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 93 " function = %s, file = %s, line = %u\n",
87 sdp->sd_fsname, assertion, 94 assertion, function, file, line);
88 sdp->sd_fsname, function, file, line);
89 dump_stack(); 95 dump_stack();
90 return (me) ? -1 : -2; 96 return (me) ? -1 : -2;
91} 97}
@@ -105,11 +111,8 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
105 return -2; 111 return -2;
106 112
107 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) 113 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
108 printk(KERN_WARNING 114 fs_warn(sdp, "warning: assertion \"%s\" failed at function = %s, file = %s, line = %u\n",
109 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" 115 assertion, function, file, line);
110 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
111 sdp->sd_fsname, assertion,
112 sdp->sd_fsname, function, file, line);
113 116
114 if (sdp->sd_args.ar_debug) 117 if (sdp->sd_args.ar_debug)
115 BUG(); 118 BUG();
@@ -138,10 +141,8 @@ int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
138{ 141{
139 int rv; 142 int rv;
140 rv = gfs2_lm_withdraw(sdp, 143 rv = gfs2_lm_withdraw(sdp,
141 "GFS2: fsid=%s: fatal: filesystem consistency error\n" 144 "fatal: filesystem consistency error - function = %s, file = %s, line = %u\n",
142 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 145 function, file, line);
143 sdp->sd_fsname,
144 sdp->sd_fsname, function, file, line);
145 return rv; 146 return rv;
146} 147}
147 148
@@ -157,13 +158,12 @@ int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
157 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 158 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
158 int rv; 159 int rv;
159 rv = gfs2_lm_withdraw(sdp, 160 rv = gfs2_lm_withdraw(sdp,
160 "GFS2: fsid=%s: fatal: filesystem consistency error\n" 161 "fatal: filesystem consistency error\n"
161 "GFS2: fsid=%s: inode = %llu %llu\n" 162 " inode = %llu %llu\n"
162 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 163 " function = %s, file = %s, line = %u\n",
163 sdp->sd_fsname, 164 (unsigned long long)ip->i_no_formal_ino,
164 sdp->sd_fsname, (unsigned long long)ip->i_no_formal_ino, 165 (unsigned long long)ip->i_no_addr,
165 (unsigned long long)ip->i_no_addr, 166 function, file, line);
166 sdp->sd_fsname, function, file, line);
167 return rv; 167 return rv;
168} 168}
169 169
@@ -179,12 +179,11 @@ int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
179 struct gfs2_sbd *sdp = rgd->rd_sbd; 179 struct gfs2_sbd *sdp = rgd->rd_sbd;
180 int rv; 180 int rv;
181 rv = gfs2_lm_withdraw(sdp, 181 rv = gfs2_lm_withdraw(sdp,
182 "GFS2: fsid=%s: fatal: filesystem consistency error\n" 182 "fatal: filesystem consistency error\n"
183 "GFS2: fsid=%s: RG = %llu\n" 183 " RG = %llu\n"
184 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 184 " function = %s, file = %s, line = %u\n",
185 sdp->sd_fsname, 185 (unsigned long long)rgd->rd_addr,
186 sdp->sd_fsname, (unsigned long long)rgd->rd_addr, 186 function, file, line);
187 sdp->sd_fsname, function, file, line);
188 return rv; 187 return rv;
189} 188}
190 189
@@ -200,12 +199,11 @@ int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
200{ 199{
201 int me; 200 int me;
202 me = gfs2_lm_withdraw(sdp, 201 me = gfs2_lm_withdraw(sdp,
203 "GFS2: fsid=%s: fatal: invalid metadata block\n" 202 "fatal: invalid metadata block\n"
204 "GFS2: fsid=%s: bh = %llu (%s)\n" 203 " bh = %llu (%s)\n"
205 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 204 " function = %s, file = %s, line = %u\n",
206 sdp->sd_fsname, 205 (unsigned long long)bh->b_blocknr, type,
207 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, 206 function, file, line);
208 sdp->sd_fsname, function, file, line);
209 return (me) ? -1 : -2; 207 return (me) ? -1 : -2;
210} 208}
211 209
@@ -221,12 +219,11 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
221{ 219{
222 int me; 220 int me;
223 me = gfs2_lm_withdraw(sdp, 221 me = gfs2_lm_withdraw(sdp,
224 "GFS2: fsid=%s: fatal: invalid metadata block\n" 222 "fatal: invalid metadata block\n"
225 "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n" 223 " bh = %llu (type: exp=%u, found=%u)\n"
226 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 224 " function = %s, file = %s, line = %u\n",
227 sdp->sd_fsname, 225 (unsigned long long)bh->b_blocknr, type, t,
228 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t, 226 function, file, line);
229 sdp->sd_fsname, function, file, line);
230 return (me) ? -1 : -2; 227 return (me) ? -1 : -2;
231} 228}
232 229
@@ -241,10 +238,9 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
241{ 238{
242 int rv; 239 int rv;
243 rv = gfs2_lm_withdraw(sdp, 240 rv = gfs2_lm_withdraw(sdp,
244 "GFS2: fsid=%s: fatal: I/O error\n" 241 "fatal: I/O error\n"
245 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 242 " function = %s, file = %s, line = %u\n",
246 sdp->sd_fsname, 243 function, file, line);
247 sdp->sd_fsname, function, file, line);
248 return rv; 244 return rv;
249} 245}
250 246
@@ -259,12 +255,11 @@ int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
259{ 255{
260 int rv; 256 int rv;
261 rv = gfs2_lm_withdraw(sdp, 257 rv = gfs2_lm_withdraw(sdp,
262 "GFS2: fsid=%s: fatal: I/O error\n" 258 "fatal: I/O error\n"
263 "GFS2: fsid=%s: block = %llu\n" 259 " block = %llu\n"
264 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 260 " function = %s, file = %s, line = %u\n",
265 sdp->sd_fsname, 261 (unsigned long long)bh->b_blocknr,
266 sdp->sd_fsname, (unsigned long long)bh->b_blocknr, 262 function, file, line);
267 sdp->sd_fsname, function, file, line);
268 return rv; 263 return rv;
269} 264}
270 265
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index b7ffb09b99ea..cbdcbdf39614 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -10,22 +10,23 @@
10#ifndef __UTIL_DOT_H__ 10#ifndef __UTIL_DOT_H__
11#define __UTIL_DOT_H__ 11#define __UTIL_DOT_H__
12 12
13#ifdef pr_fmt
14#undef pr_fmt
15#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16#endif
17
13#include <linux/mempool.h> 18#include <linux/mempool.h>
14 19
15#include "incore.h" 20#include "incore.h"
16 21
17#define fs_printk(level, fs, fmt, arg...) \ 22#define fs_emerg(fs, fmt, ...) \
18 printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg) 23 pr_emerg("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
19 24#define fs_warn(fs, fmt, ...) \
20#define fs_info(fs, fmt, arg...) \ 25 pr_warn("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
21 fs_printk(KERN_INFO , fs , fmt , ## arg) 26#define fs_err(fs, fmt, ...) \
22 27 pr_err("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
23#define fs_warn(fs, fmt, arg...) \ 28#define fs_info(fs, fmt, ...) \
24 fs_printk(KERN_WARNING , fs , fmt , ## arg) 29 pr_info("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
25
26#define fs_err(fs, fmt, arg...) \
27 fs_printk(KERN_ERR, fs , fmt , ## arg)
28
29 30
30void gfs2_assert_i(struct gfs2_sbd *sdp); 31void gfs2_assert_i(struct gfs2_sbd *sdp);
31 32
@@ -85,7 +86,7 @@ static inline int gfs2_meta_check(struct gfs2_sbd *sdp,
85 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; 86 struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
86 u32 magic = be32_to_cpu(mh->mh_magic); 87 u32 magic = be32_to_cpu(mh->mh_magic);
87 if (unlikely(magic != GFS2_MAGIC)) { 88 if (unlikely(magic != GFS2_MAGIC)) {
88 printk(KERN_ERR "GFS2: Magic number missing at %llu\n", 89 pr_err("Magic number missing at %llu\n",
89 (unsigned long long)bh->b_blocknr); 90 (unsigned long long)bh->b_blocknr);
90 return -EIO; 91 return -EIO;
91 } 92 }
@@ -164,7 +165,7 @@ static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
164#define gfs2_tune_get(sdp, field) \ 165#define gfs2_tune_get(sdp, field) \
165gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) 166gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
166 167
167int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...); 168__printf(2, 3)
169int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...);
168 170
169#endif /* __UTIL_DOT_H__ */ 171#endif /* __UTIL_DOT_H__ */
170
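
The prototype change to gfs2_lm_withdraw() adds __printf(2, 3), the kernel's wrapper around GCC's format attribute: parameter 2 is the format string and checking starts at parameter 3, so every consistency-check helper above now has its arguments type-checked against its format at compile time (which is also why fmt became const char *). The underlying attribute in a standalone C sketch (log_fs is an illustrative name):

    #include <stdarg.h>
    #include <stdio.h>

    /* Equivalent of the kernel's __printf(1, 2): callers' format strings
     * are now checked against their argument types at compile time. */
    __attribute__((format(printf, 1, 2)))
    static void log_fs(const char *fmt, ...)
    {
        va_list args;

        va_start(args, fmt);
        vfprintf(stderr, fmt, args);
        va_end(args);
    }

    int main(void)
    {
        log_fs("block = %llu\n", 12345ULL);  /* OK */
        /* log_fs("block = %llu\n", 42);        would warn: int vs %llu */
        return 0;
    }
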
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 380ab31b5e0f..9e2fecd62f62 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -547,7 +547,7 @@ out:
547 547
548void hfs_evict_inode(struct inode *inode) 548void hfs_evict_inode(struct inode *inode)
549{ 549{
550 truncate_inode_pages(&inode->i_data, 0); 550 truncate_inode_pages_final(&inode->i_data);
551 clear_inode(inode); 551 clear_inode(inode);
552 if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) { 552 if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) {
553 HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL; 553 HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 2d2039e754cd..eee7206c38d1 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -112,6 +112,7 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
112 112
113static int hfs_remount(struct super_block *sb, int *flags, char *data) 113static int hfs_remount(struct super_block *sb, int *flags, char *data)
114{ 114{
115 sync_filesystem(sb);
115 *flags |= MS_NODIRATIME; 116 *flags |= MS_NODIRATIME;
116 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 117 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
117 return 0; 118 return 0;
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index 0f47890299c4..caf89a7be0a1 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -11,7 +11,7 @@
11 11
12static struct kmem_cache *hfsplus_attr_tree_cachep; 12static struct kmem_cache *hfsplus_attr_tree_cachep;
13 13
14int hfsplus_create_attr_tree_cache(void) 14int __init hfsplus_create_attr_tree_cache(void)
15{ 15{
16 if (hfsplus_attr_tree_cachep) 16 if (hfsplus_attr_tree_cachep)
17 return -EEXIST; 17 return -EEXIST;
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index fbb212fbb1ef..a7aafb35b624 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -227,10 +227,8 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
227 u32 ablock, dblock, mask; 227 u32 ablock, dblock, mask;
228 sector_t sector; 228 sector_t sector;
229 int was_dirty = 0; 229 int was_dirty = 0;
230 int shift;
231 230
232 /* Convert inode block to disk allocation block */ 231 /* Convert inode block to disk allocation block */
233 shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
234 ablock = iblock >> sbi->fs_shift; 232 ablock = iblock >> sbi->fs_shift;
235 233
236 if (iblock >= hip->fs_blocks) { 234 if (iblock >= hip->fs_blocks) {
@@ -498,11 +496,13 @@ int hfsplus_file_extend(struct inode *inode)
498 goto insert_extent; 496 goto insert_extent;
499 } 497 }
500out: 498out:
501 mutex_unlock(&hip->extents_lock);
502 if (!res) { 499 if (!res) {
503 hip->alloc_blocks += len; 500 hip->alloc_blocks += len;
501 mutex_unlock(&hip->extents_lock);
504 hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY); 502 hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
503 return 0;
505 } 504 }
505 mutex_unlock(&hip->extents_lock);
506 return res; 506 return res;
507 507
508insert_extent: 508insert_extent:
@@ -556,11 +556,13 @@ void hfsplus_file_truncate(struct inode *inode)
556 556
557 blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >> 557 blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
558 HFSPLUS_SB(sb)->alloc_blksz_shift; 558 HFSPLUS_SB(sb)->alloc_blksz_shift;
559
560 mutex_lock(&hip->extents_lock);
561
559 alloc_cnt = hip->alloc_blocks; 562 alloc_cnt = hip->alloc_blocks;
560 if (blk_cnt == alloc_cnt) 563 if (blk_cnt == alloc_cnt)
561 goto out; 564 goto out_unlock;
562 565
563 mutex_lock(&hip->extents_lock);
564 res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd); 566 res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
565 if (res) { 567 if (res) {
566 mutex_unlock(&hip->extents_lock); 568 mutex_unlock(&hip->extents_lock);
@@ -592,10 +594,10 @@ void hfsplus_file_truncate(struct inode *inode)
592 hfs_brec_remove(&fd); 594 hfs_brec_remove(&fd);
593 } 595 }
594 hfs_find_exit(&fd); 596 hfs_find_exit(&fd);
595 mutex_unlock(&hip->extents_lock);
596 597
597 hip->alloc_blocks = blk_cnt; 598 hip->alloc_blocks = blk_cnt;
598out: 599out_unlock:
600 mutex_unlock(&hip->extents_lock);
599 hip->phys_size = inode->i_size; 601 hip->phys_size = inode->i_size;
600 hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> 602 hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >>
601 sb->s_blocksize_bits; 603 sb->s_blocksize_bits;
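
Both extents.c fixes are about lock scope: hfsplus_file_extend() used to drop extents_lock and only then bump hip->alloc_blocks, and hfsplus_file_truncate() read alloc_blocks before taking the lock, so a concurrent extend and truncate could act on a stale block count. The patch widens the critical sections so the check and the update sit under one lock hold. The general rule, reduced to a pthreads sketch (all names are illustrative):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t extents_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned int alloc_blocks;

    static void extend(unsigned int len)
    {
        pthread_mutex_lock(&extents_lock);
        alloc_blocks += len;          /* update while still holding the lock */
        pthread_mutex_unlock(&extents_lock);
    }

    static void truncate_to(unsigned int blk_cnt)
    {
        pthread_mutex_lock(&extents_lock);
        if (alloc_blocks != blk_cnt)  /* check and update in one hold, so   */
            alloc_blocks = blk_cnt;   /* no extend can slip in between them */
        pthread_mutex_unlock(&extents_lock);
    }

    int main(void)
    {
        extend(8);
        truncate_to(4);
        printf("alloc_blocks = %u\n", alloc_blocks);
        return 0;
    }
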
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 62d571eb69ba..83dc29286b10 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -367,7 +367,7 @@ typedef int (*search_strategy_t)(struct hfs_bnode *,
367 */ 367 */
368 368
369/* attributes.c */ 369/* attributes.c */
370int hfsplus_create_attr_tree_cache(void); 370int __init hfsplus_create_attr_tree_cache(void);
371void hfsplus_destroy_attr_tree_cache(void); 371void hfsplus_destroy_attr_tree_cache(void);
372hfsplus_attr_entry *hfsplus_alloc_attr_entry(void); 372hfsplus_attr_entry *hfsplus_alloc_attr_entry(void);
373void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p); 373void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 80875aa640ef..a513d2d36be9 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -161,7 +161,7 @@ static int hfsplus_write_inode(struct inode *inode,
161static void hfsplus_evict_inode(struct inode *inode) 161static void hfsplus_evict_inode(struct inode *inode)
162{ 162{
163 hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); 163 hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
164 truncate_inode_pages(&inode->i_data, 0); 164 truncate_inode_pages_final(&inode->i_data);
165 clear_inode(inode); 165 clear_inode(inode);
166 if (HFSPLUS_IS_RSRC(inode)) { 166 if (HFSPLUS_IS_RSRC(inode)) {
167 HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL; 167 HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
@@ -323,6 +323,7 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
323 323
324static int hfsplus_remount(struct super_block *sb, int *flags, char *data) 324static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
325{ 325{
326 sync_filesystem(sb);
326 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 327 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
327 return 0; 328 return 0;
328 if (!(*flags & MS_RDONLY)) { 329 if (!(*flags & MS_RDONLY)) {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index fe649d325b1f..9c470fde9878 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -230,7 +230,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
230 230
231static void hostfs_evict_inode(struct inode *inode) 231static void hostfs_evict_inode(struct inode *inode)
232{ 232{
233 truncate_inode_pages(&inode->i_data, 0); 233 truncate_inode_pages_final(&inode->i_data);
234 clear_inode(inode); 234 clear_inode(inode);
235 if (HOSTFS_I(inode)->fd != -1) { 235 if (HOSTFS_I(inode)->fd != -1) {
236 close_file(&HOSTFS_I(inode)->fd); 236 close_file(&HOSTFS_I(inode)->fd);
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 9edeeb0ea97e..50a427313835 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -304,7 +304,7 @@ void hpfs_write_if_changed(struct inode *inode)
304 304
305void hpfs_evict_inode(struct inode *inode) 305void hpfs_evict_inode(struct inode *inode)
306{ 306{
307 truncate_inode_pages(&inode->i_data, 0); 307 truncate_inode_pages_final(&inode->i_data);
308 clear_inode(inode); 308 clear_inode(inode);
309 if (!inode->i_nlink) { 309 if (!inode->i_nlink) {
310 hpfs_lock(inode->i_sb); 310 hpfs_lock(inode->i_sb);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 4534ff688b76..fe3463a43236 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -421,6 +421,8 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
421 struct hpfs_sb_info *sbi = hpfs_sb(s); 421 struct hpfs_sb_info *sbi = hpfs_sb(s);
422 char *new_opts = kstrdup(data, GFP_KERNEL); 422 char *new_opts = kstrdup(data, GFP_KERNEL);
423 423
424 sync_filesystem(s);
425
424 *flags |= MS_NOATIME; 426 *flags |= MS_NOATIME;
425 427
426 hpfs_lock(s); 428 hpfs_lock(s);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d19b30ababf1..e19d4c0cacae 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -366,7 +366,13 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
366 366
367static void hugetlbfs_evict_inode(struct inode *inode) 367static void hugetlbfs_evict_inode(struct inode *inode)
368{ 368{
369 struct resv_map *resv_map;
370
369 truncate_hugepages(inode, 0); 371 truncate_hugepages(inode, 0);
372 resv_map = (struct resv_map *)inode->i_mapping->private_data;
373 /* root inode doesn't have the resv_map, so we should check it */
374 if (resv_map)
375 resv_map_release(&resv_map->refs);
370 clear_inode(inode); 376 clear_inode(inode);
371} 377}
372 378
@@ -476,6 +482,11 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
476 umode_t mode, dev_t dev) 482 umode_t mode, dev_t dev)
477{ 483{
478 struct inode *inode; 484 struct inode *inode;
485 struct resv_map *resv_map;
486
487 resv_map = resv_map_alloc();
488 if (!resv_map)
489 return NULL;
479 490
480 inode = new_inode(sb); 491 inode = new_inode(sb);
481 if (inode) { 492 if (inode) {
@@ -487,7 +498,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
487 inode->i_mapping->a_ops = &hugetlbfs_aops; 498 inode->i_mapping->a_ops = &hugetlbfs_aops;
488 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 499 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
489 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 500 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
490 INIT_LIST_HEAD(&inode->i_mapping->private_list); 501 inode->i_mapping->private_data = resv_map;
491 info = HUGETLBFS_I(inode); 502 info = HUGETLBFS_I(inode);
492 /* 503 /*
493 * The policy is initialized here even if we are creating a 504 * The policy is initialized here even if we are creating a
@@ -517,7 +528,9 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
517 break; 528 break;
518 } 529 }
519 lockdep_annotate_inode_mutex_key(inode); 530 lockdep_annotate_inode_mutex_key(inode);
520 } 531 } else
532 kref_put(&resv_map->refs, resv_map_release);
533
521 return inode; 534 return inode;
522} 535}
523 536
@@ -1017,6 +1030,11 @@ static int __init init_hugetlbfs_fs(void)
1017 int error; 1030 int error;
1018 int i; 1031 int i;
1019 1032
1033 if (!hugepages_supported()) {
1034 pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n");
1035 return -ENOTSUPP;
1036 }
1037
1020 error = bdi_init(&hugetlbfs_backing_dev_info); 1038 error = bdi_init(&hugetlbfs_backing_dev_info);
1021 if (error) 1039 if (error)
1022 return error; 1040 return error;
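
The hugetlbfs changes attach a reference-counted reservation map to each inode: resv_map_alloc() runs before new_inode(), the creator's reference is dropped with kref_put() on the failure branch, and the eviction path releases the map (only the root inode lacks one, hence the NULL check). A minimal userspace model of that create/put-on-failure/release-on-evict ownership (struct resv_map_demo and friends are illustrative, not the kernel types):

    #include <stdlib.h>

    struct resv_map_demo {
        int refs;                  /* stands in for struct kref */
    };

    static struct resv_map_demo *resv_map_alloc_demo(void)
    {
        struct resv_map_demo *map = malloc(sizeof(*map));

        if (map)
            map->refs = 1;         /* creator holds the first reference */
        return map;
    }

    static void resv_map_put_demo(struct resv_map_demo *map)
    {
        if (map && --map->refs == 0)
            free(map);             /* release on last put, like kref_put() */
    }

    int main(void)
    {
        struct resv_map_demo *map = resv_map_alloc_demo();

        if (!map)
            return 1;
        /* If the inode allocation fails (the "} else" branch in the
         * patch), the creator must drop its reference or the map leaks. */
        resv_map_put_demo(map);
        return 0;
    }
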
diff --git a/fs/inode.c b/fs/inode.c
index 4bcdad3c9361..f96d2a6f88cc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -503,6 +503,7 @@ void clear_inode(struct inode *inode)
503 */ 503 */
504 spin_lock_irq(&inode->i_data.tree_lock); 504 spin_lock_irq(&inode->i_data.tree_lock);
505 BUG_ON(inode->i_data.nrpages); 505 BUG_ON(inode->i_data.nrpages);
506 BUG_ON(inode->i_data.nrshadows);
506 spin_unlock_irq(&inode->i_data.tree_lock); 507 spin_unlock_irq(&inode->i_data.tree_lock);
507 BUG_ON(!list_empty(&inode->i_data.private_list)); 508 BUG_ON(!list_empty(&inode->i_data.private_list));
508 BUG_ON(!(inode->i_state & I_FREEING)); 509 BUG_ON(!(inode->i_state & I_FREEING));
@@ -548,8 +549,7 @@ static void evict(struct inode *inode)
548 if (op->evict_inode) { 549 if (op->evict_inode) {
549 op->evict_inode(inode); 550 op->evict_inode(inode);
550 } else { 551 } else {
551 if (inode->i_data.nrpages) 552 truncate_inode_pages_final(&inode->i_data);
552 truncate_inode_pages(&inode->i_data, 0);
553 clear_inode(inode); 553 clear_inode(inode);
554 } 554 }
555 if (S_ISBLK(inode->i_mode) && inode->i_bdev) 555 if (S_ISBLK(inode->i_mode) && inode->i_bdev)
@@ -944,24 +944,22 @@ EXPORT_SYMBOL(unlock_new_inode);
944 944
945/** 945/**
946 * lock_two_nondirectories - take two i_mutexes on non-directory objects 946 * lock_two_nondirectories - take two i_mutexes on non-directory objects
947 *
948 * Lock any non-NULL argument that is not a directory.
949 * Zero, one or two objects may be locked by this function.
950 *
947 * @inode1: first inode to lock 951 * @inode1: first inode to lock
948 * @inode2: second inode to lock 952 * @inode2: second inode to lock
949 */ 953 */
950void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) 954void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
951{ 955{
952 WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); 956 if (inode1 > inode2)
953 if (inode1 == inode2 || !inode2) { 957 swap(inode1, inode2);
954 mutex_lock(&inode1->i_mutex); 958
955 return; 959 if (inode1 && !S_ISDIR(inode1->i_mode))
956 }
957 WARN_ON_ONCE(S_ISDIR(inode2->i_mode));
958 if (inode1 < inode2) {
959 mutex_lock(&inode1->i_mutex); 960 mutex_lock(&inode1->i_mutex);
961 if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
960 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2); 962 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2);
961 } else {
962 mutex_lock(&inode2->i_mutex);
963 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_NONDIR2);
964 }
965} 963}
966EXPORT_SYMBOL(lock_two_nondirectories); 964EXPORT_SYMBOL(lock_two_nondirectories);
967 965
@@ -972,8 +970,9 @@ EXPORT_SYMBOL(lock_two_nondirectories);
972 */ 970 */
973void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) 971void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
974{ 972{
975 mutex_unlock(&inode1->i_mutex); 973 if (inode1 && !S_ISDIR(inode1->i_mode))
976 if (inode2 && inode2 != inode1) 974 mutex_unlock(&inode1->i_mutex);
975 if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
977 mutex_unlock(&inode2->i_mutex); 976 mutex_unlock(&inode2->i_mutex);
978} 977}
979EXPORT_SYMBOL(unlock_two_nondirectories); 978EXPORT_SYMBOL(unlock_two_nondirectories);
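
lock_two_nondirectories() is rewritten around a single canonical ordering: swap() sorts the two inodes by pointer value, the lower address is locked first, and directories and duplicates are simply filtered out, which removes the old four-way branch while keeping the pairwise locking deadlock-free by construction. The address-ordering idiom in plain pthreads (illustrative names):

    #include <pthread.h>
    #include <stdio.h>

    struct obj {
        pthread_mutex_t lock;
    };

    /* Any two threads locking the same pair agree on the order (lower
     * address first), so they cannot deadlock against each other. */
    static void lock_two(struct obj *a, struct obj *b)
    {
        if (a > b) {            /* canonicalise, like swap(inode1, inode2) */
            struct obj *tmp = a;
            a = b;
            b = tmp;
        }
        if (a)
            pthread_mutex_lock(&a->lock);
        if (b && b != a)
            pthread_mutex_lock(&b->lock);
    }

    static void unlock_two(struct obj *a, struct obj *b)
    {
        if (a)
            pthread_mutex_unlock(&a->lock);
        if (b && b != a)
            pthread_mutex_unlock(&b->lock);
    }

    int main(void)
    {
        struct obj x = { PTHREAD_MUTEX_INITIALIZER };
        struct obj y = { PTHREAD_MUTEX_INITIALIZER };

        lock_two(&x, &y);
        puts("both locked in canonical order");
        unlock_two(&x, &y);
        return 0;
    }
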
@@ -1899,3 +1898,34 @@ void inode_dio_done(struct inode *inode)
1899 wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); 1898 wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
1900} 1899}
1901EXPORT_SYMBOL(inode_dio_done); 1900EXPORT_SYMBOL(inode_dio_done);
1901
1902/*
1903 * inode_set_flags - atomically set some inode flags
1904 *
1905 * Note: the caller should be holding i_mutex, or else be sure that
1906 * they have exclusive access to the inode structure (i.e., while the
1907 * inode is being instantiated). The reason for the cmpxchg() loop
1908 * --- which wouldn't be necessary if all code paths which modify
 1909 * i_flags actually followed this rule --- is that there is at least one
 1910 * code path which doesn't today: for example,
1911 * __generic_file_aio_write() calls file_remove_suid() without holding
1912 * i_mutex --- so we use cmpxchg() out of an abundance of caution.
1913 *
1914 * In the long run, i_mutex is overkill, and we should probably look
1915 * at using the i_lock spinlock to protect i_flags, and then make sure
1916 * it is so documented in include/linux/fs.h and that all code follows
1917 * the locking convention!!
1918 */
1919void inode_set_flags(struct inode *inode, unsigned int flags,
1920 unsigned int mask)
1921{
1922 unsigned int old_flags, new_flags;
1923
1924 WARN_ON_ONCE(flags & ~mask);
1925 do {
1926 old_flags = ACCESS_ONCE(inode->i_flags);
1927 new_flags = (old_flags & ~mask) | flags;
1928 } while (unlikely(cmpxchg(&inode->i_flags, old_flags,
1929 new_flags) != old_flags));
1930}
1931EXPORT_SYMBOL(inode_set_flags);
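
The new inode_set_flags() helper replaces the bits selected by @mask using the read/modify/cmpxchg loop described in its comment, so a racing writer that updates i_flags without i_mutex cannot cause a lost update: the CAS fails and the loop retries from a fresh read. The same shape with GCC/Clang builtin atomics (set_flags_demo is an illustrative name):

    #include <stdio.h>

    /* Atomically replace the bits selected by mask with flags, retrying
     * whenever another thread changed the word between read and CAS. */
    static void set_flags_demo(unsigned int *word, unsigned int flags,
                               unsigned int mask)
    {
        unsigned int old_flags, new_flags;

        do {
            old_flags = __atomic_load_n(word, __ATOMIC_RELAXED);
            new_flags = (old_flags & ~mask) | flags;
        } while (!__atomic_compare_exchange_n(word, &old_flags, new_flags,
                                              0, __ATOMIC_RELAXED,
                                              __ATOMIC_RELAXED));
    }

    int main(void)
    {
        unsigned int i_flags_demo = 0xff;

        set_flags_demo(&i_flags_demo, 0x05, 0x0f); /* low nibble -> 0x5 */
        printf("flags = 0x%x\n", i_flags_demo);    /* prints flags = 0xf5 */
        return 0;
    }
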
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 4a9e10ea13f2..4556ce1af5b0 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -93,7 +93,7 @@ static void init_once(void *foo)
93 inode_init_once(&ei->vfs_inode); 93 inode_init_once(&ei->vfs_inode);
94} 94}
95 95
96static int init_inodecache(void) 96static int __init init_inodecache(void)
97{ 97{
98 isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", 98 isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
99 sizeof(struct iso_inode_info), 99 sizeof(struct iso_inode_info),
@@ -117,6 +117,7 @@ static void destroy_inodecache(void)
117 117
118static int isofs_remount(struct super_block *sb, int *flags, char *data) 118static int isofs_remount(struct super_block *sb, int *flags, char *data)
119{ 119{
120 sync_filesystem(sb);
120 if (!(*flags & MS_RDONLY)) 121 if (!(*flags & MS_RDONLY))
121 return -EROFS; 122 return -EROFS;
122 return 0; 123 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index cf2fc0594063..5f26139a165a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -555,7 +555,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
555 blk_start_plug(&plug); 555 blk_start_plug(&plug);
556 jbd2_journal_write_revoke_records(journal, commit_transaction, 556 jbd2_journal_write_revoke_records(journal, commit_transaction,
557 &log_bufs, WRITE_SYNC); 557 &log_bufs, WRITE_SYNC);
558 blk_finish_plug(&plug);
559 558
560 jbd_debug(3, "JBD2: commit phase 2b\n"); 559 jbd_debug(3, "JBD2: commit phase 2b\n");
561 560
@@ -582,7 +581,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
582 err = 0; 581 err = 0;
583 bufs = 0; 582 bufs = 0;
584 descriptor = NULL; 583 descriptor = NULL;
585 blk_start_plug(&plug);
586 while (commit_transaction->t_buffers) { 584 while (commit_transaction->t_buffers) {
587 585
588 /* Find the next buffer to be journaled... */ 586 /* Find the next buffer to be journaled... */
@@ -1067,6 +1065,25 @@ restart_loop:
1067 goto restart_loop; 1065 goto restart_loop;
1068 } 1066 }
1069 1067
1068 /* Add the transaction to the checkpoint list
 1069 * __journal_remove_checkpoint() cannot destroy the transaction
1070 * under us because it is not marked as T_FINISHED yet */
1071 if (journal->j_checkpoint_transactions == NULL) {
1072 journal->j_checkpoint_transactions = commit_transaction;
1073 commit_transaction->t_cpnext = commit_transaction;
1074 commit_transaction->t_cpprev = commit_transaction;
1075 } else {
1076 commit_transaction->t_cpnext =
1077 journal->j_checkpoint_transactions;
1078 commit_transaction->t_cpprev =
1079 commit_transaction->t_cpnext->t_cpprev;
1080 commit_transaction->t_cpnext->t_cpprev =
1081 commit_transaction;
1082 commit_transaction->t_cpprev->t_cpnext =
1083 commit_transaction;
1084 }
1085 spin_unlock(&journal->j_list_lock);
1086
1070 /* Done with this transaction! */ 1087 /* Done with this transaction! */
1071 1088
1072 jbd_debug(3, "JBD2: commit phase 7\n"); 1089 jbd_debug(3, "JBD2: commit phase 7\n");
@@ -1085,24 +1102,7 @@ restart_loop:
1085 atomic_read(&commit_transaction->t_handle_count); 1102 atomic_read(&commit_transaction->t_handle_count);
1086 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, 1103 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1087 commit_transaction->t_tid, &stats.run); 1104 commit_transaction->t_tid, &stats.run);
1088 1105 stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1089 /*
1090 * Calculate overall stats
1091 */
1092 spin_lock(&journal->j_history_lock);
1093 journal->j_stats.ts_tid++;
1094 if (commit_transaction->t_requested)
1095 journal->j_stats.ts_requested++;
1096 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1097 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1098 journal->j_stats.run.rs_running += stats.run.rs_running;
1099 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1100 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1101 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1102 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1103 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1104 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1105 spin_unlock(&journal->j_history_lock);
1106 1106
1107 commit_transaction->t_state = T_COMMIT_CALLBACK; 1107 commit_transaction->t_state = T_COMMIT_CALLBACK;
1108 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1108 J_ASSERT(commit_transaction == journal->j_committing_transaction);
@@ -1122,24 +1122,6 @@ restart_loop:
1122 1122
1123 write_unlock(&journal->j_state_lock); 1123 write_unlock(&journal->j_state_lock);
1124 1124
1125 if (journal->j_checkpoint_transactions == NULL) {
1126 journal->j_checkpoint_transactions = commit_transaction;
1127 commit_transaction->t_cpnext = commit_transaction;
1128 commit_transaction->t_cpprev = commit_transaction;
1129 } else {
1130 commit_transaction->t_cpnext =
1131 journal->j_checkpoint_transactions;
1132 commit_transaction->t_cpprev =
1133 commit_transaction->t_cpnext->t_cpprev;
1134 commit_transaction->t_cpnext->t_cpprev =
1135 commit_transaction;
1136 commit_transaction->t_cpprev->t_cpnext =
1137 commit_transaction;
1138 }
1139 spin_unlock(&journal->j_list_lock);
1140 /* Drop all spin_locks because commit_callback may be block.
1141 * __journal_remove_checkpoint() can not destroy transaction
1142 * under us because it is not marked as T_FINISHED yet */
1143 if (journal->j_commit_callback) 1125 if (journal->j_commit_callback)
1144 journal->j_commit_callback(journal, commit_transaction); 1126 journal->j_commit_callback(journal, commit_transaction);
1145 1127
@@ -1150,7 +1132,7 @@ restart_loop:
1150 write_lock(&journal->j_state_lock); 1132 write_lock(&journal->j_state_lock);
1151 spin_lock(&journal->j_list_lock); 1133 spin_lock(&journal->j_list_lock);
1152 commit_transaction->t_state = T_FINISHED; 1134 commit_transaction->t_state = T_FINISHED;
1153 /* Recheck checkpoint lists after j_list_lock was dropped */ 1135 /* Check if the transaction can be dropped now that we are finished */
1154 if (commit_transaction->t_checkpoint_list == NULL && 1136 if (commit_transaction->t_checkpoint_list == NULL &&
1155 commit_transaction->t_checkpoint_io_list == NULL) { 1137 commit_transaction->t_checkpoint_io_list == NULL) {
1156 __jbd2_journal_drop_transaction(journal, commit_transaction); 1138 __jbd2_journal_drop_transaction(journal, commit_transaction);
@@ -1159,4 +1141,21 @@ restart_loop:
1159 spin_unlock(&journal->j_list_lock); 1141 spin_unlock(&journal->j_list_lock);
1160 write_unlock(&journal->j_state_lock); 1142 write_unlock(&journal->j_state_lock);
1161 wake_up(&journal->j_wait_done_commit); 1143 wake_up(&journal->j_wait_done_commit);
1144
1145 /*
1146 * Calculate overall stats
1147 */
1148 spin_lock(&journal->j_history_lock);
1149 journal->j_stats.ts_tid++;
1150 journal->j_stats.ts_requested += stats.ts_requested;
1151 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1152 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1153 journal->j_stats.run.rs_running += stats.run.rs_running;
1154 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1155 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1156 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1157 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1158 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1159 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1160 spin_unlock(&journal->j_history_lock);
1162} 1161}
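
Two things move in this commit path: the transaction is linked onto the checkpoint list earlier (under j_list_lock, safe because it is not yet T_FINISHED), and the folding of per-commit statistics into journal->j_stats moves to the very end, after every other lock is dropped, with ts_requested captured into the on-stack stats first. The resulting pattern, accumulate privately and publish once under a short lock hold, sketched with pthreads (illustrative names):

    #include <pthread.h>
    #include <stdio.h>

    struct stats_demo {
        unsigned long tids;
        unsigned long blocks;
    };

    static pthread_mutex_t history_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct stats_demo global_stats;

    static void commit_one(unsigned long blocks_logged)
    {
        struct stats_demo local = { 0, 0 };

        /* Hot path: only the private, on-stack copy is updated. */
        local.tids = 1;
        local.blocks = blocks_logged;

        /* Cold path: one short critical section to publish the totals. */
        pthread_mutex_lock(&history_lock);
        global_stats.tids += local.tids;
        global_stats.blocks += local.blocks;
        pthread_mutex_unlock(&history_lock);
    }

    int main(void)
    {
        commit_one(128);
        commit_one(64);
        printf("tids=%lu blocks=%lu\n", global_stats.tids,
               global_stats.blocks);
        return 0;
    }
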
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 5fa344afb49a..67b8e303946c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -122,7 +122,7 @@ EXPORT_SYMBOL(__jbd2_debug);
122#endif 122#endif
123 123
124/* Checksumming functions */ 124/* Checksumming functions */
125int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) 125static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
126{ 126{
127 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 127 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
128 return 1; 128 return 1;
@@ -143,7 +143,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
143 return cpu_to_be32(csum); 143 return cpu_to_be32(csum);
144} 144}
145 145
146int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) 146static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
147{ 147{
148 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 148 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
149 return 1; 149 return 1;
@@ -151,7 +151,7 @@ int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
151 return sb->s_checksum == jbd2_superblock_csum(j, sb); 151 return sb->s_checksum == jbd2_superblock_csum(j, sb);
152} 152}
153 153
154void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) 154static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
155{ 155{
156 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 156 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
157 return; 157 return;
@@ -302,8 +302,8 @@ static void journal_kill_thread(journal_t *journal)
302 journal->j_flags |= JBD2_UNMOUNT; 302 journal->j_flags |= JBD2_UNMOUNT;
303 303
304 while (journal->j_task) { 304 while (journal->j_task) {
305 wake_up(&journal->j_wait_commit);
306 write_unlock(&journal->j_state_lock); 305 write_unlock(&journal->j_state_lock);
306 wake_up(&journal->j_wait_commit);
307 wait_event(journal->j_wait_done_commit, journal->j_task == NULL); 307 wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
308 write_lock(&journal->j_state_lock); 308 write_lock(&journal->j_state_lock);
309 } 309 }
@@ -710,8 +710,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
710 while (tid_gt(tid, journal->j_commit_sequence)) { 710 while (tid_gt(tid, journal->j_commit_sequence)) {
711 jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n", 711 jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n",
712 tid, journal->j_commit_sequence); 712 tid, journal->j_commit_sequence);
713 wake_up(&journal->j_wait_commit);
714 read_unlock(&journal->j_state_lock); 713 read_unlock(&journal->j_state_lock);
714 wake_up(&journal->j_wait_commit);
715 wait_event(journal->j_wait_done_commit, 715 wait_event(journal->j_wait_done_commit,
716 !tid_gt(tid, journal->j_commit_sequence)); 716 !tid_gt(tid, journal->j_commit_sequence));
717 read_lock(&journal->j_state_lock); 717 read_lock(&journal->j_state_lock);
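
Both journal.c hunks reorder wake_up() to run after j_state_lock is released, so the woken commit thread is not immediately forced to block on a lock the waker still holds. The same consideration applies to condition variables in userspace: set the predicate under the mutex, drop the mutex, then signal (a hedged pthreads analogue; the kernel wait queues here involve no condvar):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t done = PTHREAD_COND_INITIALIZER;
    static int committed;

    static void *committer(void *arg)
    {
        pthread_mutex_lock(&state_lock);
        committed = 1;             /* predicate set under the lock */
        pthread_mutex_unlock(&state_lock);
        /* Signal after dropping the lock: the waiter can run at once
         * instead of waking up only to block on state_lock. */
        pthread_cond_signal(&done);
        return arg;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, committer, NULL);
        pthread_mutex_lock(&state_lock);
        while (!committed)
            pthread_cond_wait(&done, &state_lock);
        pthread_mutex_unlock(&state_lock);
        pthread_join(t, NULL);
        puts("commit observed");
        return 0;
    }
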
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 60bb365f54a5..38cfcf5f6fce 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1073,7 +1073,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
1073 * reused here. 1073 * reused here.
1074 */ 1074 */
1075 jbd_lock_bh_state(bh); 1075 jbd_lock_bh_state(bh);
1076 spin_lock(&journal->j_list_lock);
1077 J_ASSERT_JH(jh, (jh->b_transaction == transaction || 1076 J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
1078 jh->b_transaction == NULL || 1077 jh->b_transaction == NULL ||
1079 (jh->b_transaction == journal->j_committing_transaction && 1078 (jh->b_transaction == journal->j_committing_transaction &&
@@ -1096,12 +1095,14 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
1096 jh->b_modified = 0; 1095 jh->b_modified = 0;
1097 1096
1098 JBUFFER_TRACE(jh, "file as BJ_Reserved"); 1097 JBUFFER_TRACE(jh, "file as BJ_Reserved");
1098 spin_lock(&journal->j_list_lock);
1099 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); 1099 __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
1100 } else if (jh->b_transaction == journal->j_committing_transaction) { 1100 } else if (jh->b_transaction == journal->j_committing_transaction) {
1101 /* first access by this transaction */ 1101 /* first access by this transaction */
1102 jh->b_modified = 0; 1102 jh->b_modified = 0;
1103 1103
1104 JBUFFER_TRACE(jh, "set next transaction"); 1104 JBUFFER_TRACE(jh, "set next transaction");
1105 spin_lock(&journal->j_list_lock);
1105 jh->b_next_transaction = transaction; 1106 jh->b_next_transaction = transaction;
1106 } 1107 }
1107 spin_unlock(&journal->j_list_lock); 1108 spin_unlock(&journal->j_list_lock);
@@ -1312,7 +1313,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1312 journal->j_running_transaction)) { 1313 journal->j_running_transaction)) {
1313 printk(KERN_ERR "JBD2: %s: " 1314 printk(KERN_ERR "JBD2: %s: "
1314 "jh->b_transaction (%llu, %p, %u) != " 1315 "jh->b_transaction (%llu, %p, %u) != "
1315 "journal->j_running_transaction (%p, %u)", 1316 "journal->j_running_transaction (%p, %u)\n",
1316 journal->j_devname, 1317 journal->j_devname,
1317 (unsigned long long) bh->b_blocknr, 1318 (unsigned long long) bh->b_blocknr,
1318 jh->b_transaction, 1319 jh->b_transaction,
@@ -1335,30 +1336,25 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1335 */ 1336 */
1336 if (jh->b_transaction != transaction) { 1337 if (jh->b_transaction != transaction) {
1337 JBUFFER_TRACE(jh, "already on other transaction"); 1338 JBUFFER_TRACE(jh, "already on other transaction");
1338 if (unlikely(jh->b_transaction != 1339 if (unlikely(((jh->b_transaction !=
1339 journal->j_committing_transaction)) { 1340 journal->j_committing_transaction)) ||
1340 printk(KERN_ERR "JBD2: %s: " 1341 (jh->b_next_transaction != transaction))) {
1341 "jh->b_transaction (%llu, %p, %u) != " 1342 printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
1342 "journal->j_committing_transaction (%p, %u)", 1343 "bad jh for block %llu: "
1344 "transaction (%p, %u), "
1345 "jh->b_transaction (%p, %u), "
1346 "jh->b_next_transaction (%p, %u), jlist %u\n",
1343 journal->j_devname, 1347 journal->j_devname,
1344 (unsigned long long) bh->b_blocknr, 1348 (unsigned long long) bh->b_blocknr,
1349 transaction, transaction->t_tid,
1345 jh->b_transaction, 1350 jh->b_transaction,
1346 jh->b_transaction ? jh->b_transaction->t_tid : 0, 1351 jh->b_transaction ?
1347 journal->j_committing_transaction, 1352 jh->b_transaction->t_tid : 0,
1348 journal->j_committing_transaction ?
1349 journal->j_committing_transaction->t_tid : 0);
1350 ret = -EINVAL;
1351 }
1352 if (unlikely(jh->b_next_transaction != transaction)) {
1353 printk(KERN_ERR "JBD2: %s: "
1354 "jh->b_next_transaction (%llu, %p, %u) != "
1355 "transaction (%p, %u)",
1356 journal->j_devname,
1357 (unsigned long long) bh->b_blocknr,
1358 jh->b_next_transaction, 1353 jh->b_next_transaction,
1359 jh->b_next_transaction ? 1354 jh->b_next_transaction ?
1360 jh->b_next_transaction->t_tid : 0, 1355 jh->b_next_transaction->t_tid : 0,
1361 transaction, transaction->t_tid); 1356 jh->b_jlist);
1357 WARN_ON(1);
1362 ret = -EINVAL; 1358 ret = -EINVAL;
1363 } 1359 }
1364 /* And this case is illegal: we can't reuse another 1360 /* And this case is illegal: we can't reuse another
@@ -1415,7 +1411,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1415 BUFFER_TRACE(bh, "entry"); 1411 BUFFER_TRACE(bh, "entry");
1416 1412
1417 jbd_lock_bh_state(bh); 1413 jbd_lock_bh_state(bh);
1418 spin_lock(&journal->j_list_lock);
1419 1414
1420 if (!buffer_jbd(bh)) 1415 if (!buffer_jbd(bh))
1421 goto not_jbd; 1416 goto not_jbd;
@@ -1468,6 +1463,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1468 * we know to remove the checkpoint after we commit. 1463 * we know to remove the checkpoint after we commit.
1469 */ 1464 */
1470 1465
1466 spin_lock(&journal->j_list_lock);
1471 if (jh->b_cp_transaction) { 1467 if (jh->b_cp_transaction) {
1472 __jbd2_journal_temp_unlink_buffer(jh); 1468 __jbd2_journal_temp_unlink_buffer(jh);
1473 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); 1469 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
@@ -1480,6 +1476,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1480 goto drop; 1476 goto drop;
1481 } 1477 }
1482 } 1478 }
1479 spin_unlock(&journal->j_list_lock);
1483 } else if (jh->b_transaction) { 1480 } else if (jh->b_transaction) {
1484 J_ASSERT_JH(jh, (jh->b_transaction == 1481 J_ASSERT_JH(jh, (jh->b_transaction ==
1485 journal->j_committing_transaction)); 1482 journal->j_committing_transaction));
@@ -1491,7 +1488,9 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1491 1488
1492 if (jh->b_next_transaction) { 1489 if (jh->b_next_transaction) {
1493 J_ASSERT(jh->b_next_transaction == transaction); 1490 J_ASSERT(jh->b_next_transaction == transaction);
1491 spin_lock(&journal->j_list_lock);
1494 jh->b_next_transaction = NULL; 1492 jh->b_next_transaction = NULL;
1493 spin_unlock(&journal->j_list_lock);
1495 1494
1496 /* 1495 /*
1497 * only drop a reference if this transaction modified 1496 * only drop a reference if this transaction modified
@@ -1503,7 +1502,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
1503 } 1502 }
1504 1503
1505not_jbd: 1504not_jbd:
1506 spin_unlock(&journal->j_list_lock);
1507 jbd_unlock_bh_state(bh); 1505 jbd_unlock_bh_state(bh);
1508 __brelse(bh); 1506 __brelse(bh);
1509drop: 1507drop:
@@ -1821,11 +1819,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1821 if (buffer_locked(bh) || buffer_dirty(bh)) 1819 if (buffer_locked(bh) || buffer_dirty(bh))
1822 goto out; 1820 goto out;
1823 1821
1824 if (jh->b_next_transaction != NULL) 1822 if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
1825 goto out; 1823 goto out;
1826 1824
1827 spin_lock(&journal->j_list_lock); 1825 spin_lock(&journal->j_list_lock);
1828 if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { 1826 if (jh->b_cp_transaction != NULL) {
1829 /* written-back checkpointed metadata buffer */ 1827 /* written-back checkpointed metadata buffer */
1830 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1828 JBUFFER_TRACE(jh, "remove from checkpoint list");
1831 __jbd2_journal_remove_checkpoint(jh); 1829 __jbd2_journal_remove_checkpoint(jh);
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 16a5047903a6..406d9cc84ba8 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -33,7 +33,7 @@ static int jffs2_rtime_compress(unsigned char *data_in,
33 unsigned char *cpage_out, 33 unsigned char *cpage_out,
34 uint32_t *sourcelen, uint32_t *dstlen) 34 uint32_t *sourcelen, uint32_t *dstlen)
35{ 35{
36 short positions[256]; 36 unsigned short positions[256];
37 int outpos = 0; 37 int outpos = 0;
38 int pos=0; 38 int pos=0;
39 39
@@ -74,7 +74,7 @@ static int jffs2_rtime_decompress(unsigned char *data_in,
74 unsigned char *cpage_out, 74 unsigned char *cpage_out,
75 uint32_t srclen, uint32_t destlen) 75 uint32_t srclen, uint32_t destlen)
76{ 76{
77 short positions[256]; 77 unsigned short positions[256];
78 int outpos = 0; 78 int outpos = 0;
79 int pos=0; 79 int pos=0;
80 80
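
The rtime codec's positions[] array caches, for each byte value, the offset at which that value last occurred, and back-references are computed from it. Stored in a signed short, any offset above SHRT_MAX wraps negative and the next reference indexes in front of the buffer; unsigned short keeps the full 0..65535 range. The truncation in two lines (values illustrative; converting an out-of-range value to short is implementation-defined, though it wraps on common ABIs):

    #include <stdio.h>

    int main(void)
    {
        int pos = 40000;                     /* an offset past SHRT_MAX */
        short s = (short)pos;                /* old type: wraps negative */
        unsigned short u = (unsigned short)pos;

        printf("signed=%d unsigned=%u\n", s, u);
        /* prints: signed=-25536 unsigned=40000 */
        return 0;
    }
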
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index a69e426435dd..601afd1afddf 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -242,7 +242,7 @@ void jffs2_evict_inode (struct inode *inode)
242 242
243 jffs2_dbg(1, "%s(): ino #%lu mode %o\n", 243 jffs2_dbg(1, "%s(): ino #%lu mode %o\n",
244 __func__, inode->i_ino, inode->i_mode); 244 __func__, inode->i_ino, inode->i_mode);
245 truncate_inode_pages(&inode->i_data, 0); 245 truncate_inode_pages_final(&inode->i_data);
246 clear_inode(inode); 246 clear_inode(inode);
247 jffs2_do_clear_inode(c, f); 247 jffs2_do_clear_inode(c, f);
248} 248}
@@ -457,12 +457,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
457 The umask is only applied if there's no default ACL */ 457 The umask is only applied if there's no default ACL */
458 ret = jffs2_init_acl_pre(dir_i, inode, &mode); 458 ret = jffs2_init_acl_pre(dir_i, inode, &mode);
459 if (ret) { 459 if (ret) {
460 make_bad_inode(inode); 460 mutex_unlock(&f->sem);
461 iput(inode); 461 make_bad_inode(inode);
462 return ERR_PTR(ret); 462 iput(inode);
463 return ERR_PTR(ret);
463 } 464 }
464 ret = jffs2_do_new_inode (c, f, mode, ri); 465 ret = jffs2_do_new_inode (c, f, mode, ri);
465 if (ret) { 466 if (ret) {
467 mutex_unlock(&f->sem);
466 make_bad_inode(inode); 468 make_bad_inode(inode);
467 iput(inode); 469 iput(inode);
468 return ERR_PTR(ret); 470 return ERR_PTR(ret);
@@ -479,6 +481,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
479 inode->i_size = 0; 481 inode->i_size = 0;
480 482
481 if (insert_inode_locked(inode) < 0) { 483 if (insert_inode_locked(inode) < 0) {
484 mutex_unlock(&f->sem);
482 make_bad_inode(inode); 485 make_bad_inode(inode);
483 iput(inode); 486 iput(inode);
484 return ERR_PTR(-EINVAL); 487 return ERR_PTR(-EINVAL);
@@ -687,7 +690,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
687 struct inode *inode = OFNI_EDONI_2SFFJ(f); 690 struct inode *inode = OFNI_EDONI_2SFFJ(f);
688 struct page *pg; 691 struct page *pg;
689 692
690 pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT, 693 pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
691 (void *)jffs2_do_readpage_unlock, inode); 694 (void *)jffs2_do_readpage_unlock, inode);
692 if (IS_ERR(pg)) 695 if (IS_ERR(pg))
693 return (void *)pg; 696 return (void *)pg;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index e4619b00f7c5..fa35ff79ab35 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -231,7 +231,7 @@ struct jffs2_tmp_dnode_info
231 uint32_t version; 231 uint32_t version;
232 uint32_t data_crc; 232 uint32_t data_crc;
233 uint32_t partial_crc; 233 uint32_t partial_crc;
234 uint16_t csize; 234 uint32_t csize;
235 uint16_t overlapped; 235 uint16_t overlapped;
236}; 236};
237 237
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 03310721712f..b6bd4affd9ad 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -179,6 +179,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
179 spin_unlock(&c->erase_completion_lock); 179 spin_unlock(&c->erase_completion_lock);
180 180
181 schedule(); 181 schedule();
182 remove_wait_queue(&c->erase_wait, &wait);
182 } else 183 } else
183 spin_unlock(&c->erase_completion_lock); 184 spin_unlock(&c->erase_completion_lock);
184 } else if (ret) 185 } else if (ret)
@@ -211,20 +212,25 @@ out:
211int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize, 212int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
212 uint32_t *len, uint32_t sumsize) 213 uint32_t *len, uint32_t sumsize)
213{ 214{
214 int ret = -EAGAIN; 215 int ret;
215 minsize = PAD(minsize); 216 minsize = PAD(minsize);
216 217
217 jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize); 218 jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize);
218 219
219 spin_lock(&c->erase_completion_lock); 220 while (true) {
220 while(ret == -EAGAIN) { 221 spin_lock(&c->erase_completion_lock);
221 ret = jffs2_do_reserve_space(c, minsize, len, sumsize); 222 ret = jffs2_do_reserve_space(c, minsize, len, sumsize);
222 if (ret) { 223 if (ret) {
223 jffs2_dbg(1, "%s(): looping, ret is %d\n", 224 jffs2_dbg(1, "%s(): looping, ret is %d\n",
224 __func__, ret); 225 __func__, ret);
225 } 226 }
227 spin_unlock(&c->erase_completion_lock);
228
229 if (ret == -EAGAIN)
230 cond_resched();
231 else
232 break;
226 } 233 }
227 spin_unlock(&c->erase_completion_lock);
228 if (!ret) 234 if (!ret)
229 ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); 235 ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
230 236
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 0defb1cc2a35..0918f0e2e266 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -243,6 +243,7 @@ static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data)
243 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); 243 struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
244 int err; 244 int err;
245 245
246 sync_filesystem(sb);
246 err = jffs2_parse_options(c, data); 247 err = jffs2_parse_options(c, data);
247 if (err) 248 if (err)
248 return -EINVAL; 249 return -EINVAL;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index f4aab719add5..6f8fe72c2a7a 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -154,7 +154,7 @@ void jfs_evict_inode(struct inode *inode)
154 dquot_initialize(inode); 154 dquot_initialize(inode);
155 155
156 if (JFS_IP(inode)->fileset == FILESYSTEM_I) { 156 if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
157 truncate_inode_pages(&inode->i_data, 0); 157 truncate_inode_pages_final(&inode->i_data);
158 158
159 if (test_cflag(COMMIT_Freewmap, inode)) 159 if (test_cflag(COMMIT_Freewmap, inode))
160 jfs_free_zero_link(inode); 160 jfs_free_zero_link(inode);
@@ -168,7 +168,7 @@ void jfs_evict_inode(struct inode *inode)
168 dquot_free_inode(inode); 168 dquot_free_inode(inode);
169 } 169 }
170 } else { 170 } else {
171 truncate_inode_pages(&inode->i_data, 0); 171 truncate_inode_pages_final(&inode->i_data);
172 } 172 }
173 clear_inode(inode); 173 clear_inode(inode);
174 dquot_drop(inode); 174 dquot_drop(inode);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index e2b7483444fd..97f7fda51890 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -418,6 +418,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
418 int flag = JFS_SBI(sb)->flag; 418 int flag = JFS_SBI(sb)->flag;
419 int ret; 419 int ret;
420 420
421 sync_filesystem(sb);
421 if (!parse_options(data, sb, &newLVSize, &flag)) { 422 if (!parse_options(data, sb, &newLVSize, &flag)) {
422 return -EINVAL; 423 return -EINVAL;
423 } 424 }
diff --git a/fs/kernfs/Kconfig b/fs/kernfs/Kconfig
new file mode 100644
index 000000000000..397b5f7a7a16
--- /dev/null
+++ b/fs/kernfs/Kconfig
@@ -0,0 +1,7 @@
1#
2# KERNFS should be selected by its users
3#
4
5config KERNFS
6 bool
7 default n
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index bd6e18be6e1a..ac127cd008bf 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -8,6 +8,7 @@
8 * This file is released under the GPLv2. 8 * This file is released under the GPLv2.
9 */ 9 */
10 10
11#include <linux/sched.h>
11#include <linux/fs.h> 12#include <linux/fs.h>
12#include <linux/namei.h> 13#include <linux/namei.h>
13#include <linux/idr.h> 14#include <linux/idr.h>
@@ -18,9 +19,162 @@
18#include "kernfs-internal.h" 19#include "kernfs-internal.h"
19 20
20DEFINE_MUTEX(kernfs_mutex); 21DEFINE_MUTEX(kernfs_mutex);
22static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */
23static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */
21 24
22#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) 25#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
23 26
27static bool kernfs_active(struct kernfs_node *kn)
28{
29 lockdep_assert_held(&kernfs_mutex);
30 return atomic_read(&kn->active) >= 0;
31}
32
33static bool kernfs_lockdep(struct kernfs_node *kn)
34{
35#ifdef CONFIG_DEBUG_LOCK_ALLOC
36 return kn->flags & KERNFS_LOCKDEP;
37#else
38 return false;
39#endif
40}
41
42static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
43{
44 return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
45}
46
47static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
48 size_t buflen)
49{
50 char *p = buf + buflen;
51 int len;
52
53 *--p = '\0';
54
55 do {
56 len = strlen(kn->name);
57 if (p - buf < len + 1) {
58 buf[0] = '\0';
59 p = NULL;
60 break;
61 }
62 p -= len;
63 memcpy(p, kn->name, len);
64 *--p = '/';
65 kn = kn->parent;
66 } while (kn && kn->parent);
67
68 return p;
69}
70
71/**
72 * kernfs_name - obtain the name of a given node
73 * @kn: kernfs_node of interest
74 * @buf: buffer to copy @kn's name into
75 * @buflen: size of @buf
76 *
77 * Copies the name of @kn into @buf of @buflen bytes. The behavior is
78 * similar to strlcpy(). It returns the length of @kn's name and if @buf
 79 * isn't long enough, it's filled up to @buflen-1 and nul terminated.
80 *
81 * This function can be called from any context.
82 */
83int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
84{
85 unsigned long flags;
86 int ret;
87
88 spin_lock_irqsave(&kernfs_rename_lock, flags);
89 ret = kernfs_name_locked(kn, buf, buflen);
90 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
91 return ret;
92}
93
94/**
95 * kernfs_path - build full path of a given node
96 * @kn: kernfs_node of interest
97 * @buf: buffer to copy @kn's name into
98 * @buflen: size of @buf
99 *
100 * Builds and returns the full path of @kn in @buf of @buflen bytes. The
101 * path is built from the end of @buf so the returned pointer usually
102 * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated
103 * and %NULL is returned.
104 */
105char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
106{
107 unsigned long flags;
108 char *p;
109
110 spin_lock_irqsave(&kernfs_rename_lock, flags);
111 p = kernfs_path_locked(kn, buf, buflen);
112 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
113 return p;
114}
115EXPORT_SYMBOL_GPL(kernfs_path);
116
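An aside on the technique: kernfs_path_locked() above assembles the path from
the end of the buffer, which avoids a first pass just to measure the depth.
A minimal userspace sketch of the same right-to-left construction follows;
struct node, build_path() and the sample names are illustrative, not kernel API.

#include <stdio.h>
#include <string.h>

struct node {
	const char *name;
	struct node *parent;
};

/* Mirror kernfs_path_locked(): start at the end of the buffer, prepend
 * "/<name>" per ancestor, and return NULL (with buf nul-terminated) if
 * the buffer is too small. */
static char *build_path(const struct node *n, char *buf, size_t buflen)
{
	char *p = buf + buflen;
	size_t len;

	*--p = '\0';
	do {
		len = strlen(n->name);
		if ((size_t)(p - buf) < len + 1) {
			buf[0] = '\0';
			return NULL;
		}
		p -= len;
		memcpy(p, n->name, len);
		*--p = '/';
		n = n->parent;
	} while (n && n->parent);	/* the root contributes only "/" */

	return p;
}

int main(void)
{
	struct node root = { "", NULL };
	struct node dir = { "devices", &root };
	struct node sub = { "virtual", &dir };
	char buf[64];
	char *p = build_path(&sub, buf, sizeof(buf));

	printf("%s\n", p ? p : "<buffer too small>");	/* /devices/virtual */
	return 0;
}
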
117/**
118 * pr_cont_kernfs_name - pr_cont name of a kernfs_node
119 * @kn: kernfs_node of interest
120 *
121 * This function can be called from any context.
122 */
123void pr_cont_kernfs_name(struct kernfs_node *kn)
124{
125 unsigned long flags;
126
127 spin_lock_irqsave(&kernfs_rename_lock, flags);
128
129 kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
130 pr_cont("%s", kernfs_pr_cont_buf);
131
132 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
133}
134
135/**
136 * pr_cont_kernfs_path - pr_cont path of a kernfs_node
137 * @kn: kernfs_node of interest
138 *
139 * This function can be called from any context.
140 */
141void pr_cont_kernfs_path(struct kernfs_node *kn)
142{
143 unsigned long flags;
144 char *p;
145
146 spin_lock_irqsave(&kernfs_rename_lock, flags);
147
148 p = kernfs_path_locked(kn, kernfs_pr_cont_buf,
149 sizeof(kernfs_pr_cont_buf));
150 if (p)
151 pr_cont("%s", p);
152 else
153 pr_cont("<name too long>");
154
155 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
156}
157
158/**
159 * kernfs_get_parent - determine the parent node and pin it
160 * @kn: kernfs_node of interest
161 *
162 * Determines @kn's parent, pins and returns it. This function can be
163 * called from any context.
164 */
165struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
166{
167 struct kernfs_node *parent;
168 unsigned long flags;
169
170 spin_lock_irqsave(&kernfs_rename_lock, flags);
171 parent = kn->parent;
172 kernfs_get(parent);
173 spin_unlock_irqrestore(&kernfs_rename_lock, flags);
174
175 return parent;
176}
177
24/** 178/**
25 * kernfs_name_hash 179 * kernfs_name_hash
26 * @name: Null terminated string to hash 180 * @name: Null terminated string to hash
@@ -37,7 +191,7 @@ static unsigned int kernfs_name_hash(const char *name, const void *ns)
37 hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); 191 hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
38 hash &= 0x7fffffffU; 192 hash &= 0x7fffffffU;
39 /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ 193 /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
40 if (hash < 1) 194 if (hash < 2)
41 hash += 2; 195 hash += 2;
42 if (hash >= INT_MAX) 196 if (hash >= INT_MAX)
43 hash = INT_MAX - 1; 197 hash = INT_MAX - 1;
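The one-character change above (hash < 1 becomes hash < 2) matters because a
computed hash of 1 would otherwise collide with one of the reserved directory
positions named in the comment. A standalone sketch of the clamping, assuming
nothing beyond what that comment states:

#include <limits.h>
#include <stdio.h>

/* Clamp a 31-bit name hash into [2, INT_MAX - 1]; 0, 1 and INT_MAX stay
 * reserved for magic directory entries.  The old `hash < 1` test let a
 * computed value of 1 slip through. */
static unsigned int clamp_hash(unsigned int hash)
{
	hash &= 0x7fffffffU;
	if (hash < 2)
		hash += 2;
	if (hash >= INT_MAX)
		hash = INT_MAX - 1;
	return hash;
}

int main(void)
{
	printf("%u %u %u\n", clamp_hash(0), clamp_hash(1),
	       clamp_hash(INT_MAX));	/* 2 3 2147483646 */
	return 0;
}
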
@@ -78,9 +232,6 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
78 struct rb_node **node = &kn->parent->dir.children.rb_node; 232 struct rb_node **node = &kn->parent->dir.children.rb_node;
79 struct rb_node *parent = NULL; 233 struct rb_node *parent = NULL;
80 234
81 if (kernfs_type(kn) == KERNFS_DIR)
82 kn->parent->dir.subdirs++;
83
84 while (*node) { 235 while (*node) {
85 struct kernfs_node *pos; 236 struct kernfs_node *pos;
86 int result; 237 int result;
@@ -95,9 +246,15 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
95 else 246 else
96 return -EEXIST; 247 return -EEXIST;
97 } 248 }
249
98 /* add new node and rebalance the tree */ 250 /* add new node and rebalance the tree */
99 rb_link_node(&kn->rb, parent, node); 251 rb_link_node(&kn->rb, parent, node);
100 rb_insert_color(&kn->rb, &kn->parent->dir.children); 252 rb_insert_color(&kn->rb, &kn->parent->dir.children);
253
254 /* successfully added, account subdir number */
255 if (kernfs_type(kn) == KERNFS_DIR)
256 kn->parent->dir.subdirs++;
257
101 return 0; 258 return 0;
102} 259}
103 260
@@ -105,18 +262,24 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
105 * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree 262 * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
106 * @kn: kernfs_node of interest 263 * @kn: kernfs_node of interest
107 * 264 *
108 * Unlink @kn from its sibling rbtree which starts from 265 * Try to unlink @kn from its sibling rbtree which starts from
109 * kn->parent->dir.children. 266 * kn->parent->dir.children. Returns %true if @kn was actually
267 * removed, %false if @kn wasn't on the rbtree.
110 * 268 *
111 * Locking: 269 * Locking:
112 * mutex_lock(kernfs_mutex) 270 * mutex_lock(kernfs_mutex)
113 */ 271 */
114static void kernfs_unlink_sibling(struct kernfs_node *kn) 272static bool kernfs_unlink_sibling(struct kernfs_node *kn)
115{ 273{
274 if (RB_EMPTY_NODE(&kn->rb))
275 return false;
276
116 if (kernfs_type(kn) == KERNFS_DIR) 277 if (kernfs_type(kn) == KERNFS_DIR)
117 kn->parent->dir.subdirs--; 278 kn->parent->dir.subdirs--;
118 279
119 rb_erase(&kn->rb, &kn->parent->dir.children); 280 rb_erase(&kn->rb, &kn->parent->dir.children);
281 RB_CLEAR_NODE(&kn->rb);
282 return true;
120} 283}
121 284
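kernfs_unlink_sibling() now reports whether it actually unlinked the node, and
__kernfs_remove() later uses that to pick exactly one cleanup owner among
racing removers. A userspace model of the once-only semantic; struct node and
unlink_once() are illustrative:

#include <stdbool.h>
#include <stdio.h>

struct node {
	bool linked;	/* stands in for !RB_EMPTY_NODE(&kn->rb) */
};

/* Only the first caller sees linked == true and wins cleanup duty. */
static bool unlink_once(struct node *n)
{
	if (!n->linked)
		return false;
	n->linked = false;	/* rb_erase() + RB_CLEAR_NODE() */
	return true;
}

int main(void)
{
	struct node n = { .linked = true };
	int first = unlink_once(&n);
	int second = unlink_once(&n);

	printf("%d %d\n", first, second);	/* 1 0 */
	return 0;
}
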
122/** 285/**
@@ -137,7 +300,7 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
137 if (!atomic_inc_unless_negative(&kn->active)) 300 if (!atomic_inc_unless_negative(&kn->active))
138 return NULL; 301 return NULL;
139 302
140 if (kn->flags & KERNFS_LOCKDEP) 303 if (kernfs_lockdep(kn))
141 rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); 304 rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
142 return kn; 305 return kn;
143} 306}
@@ -151,59 +314,57 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
151 */ 314 */
152void kernfs_put_active(struct kernfs_node *kn) 315void kernfs_put_active(struct kernfs_node *kn)
153{ 316{
317 struct kernfs_root *root = kernfs_root(kn);
154 int v; 318 int v;
155 319
156 if (unlikely(!kn)) 320 if (unlikely(!kn))
157 return; 321 return;
158 322
159 if (kn->flags & KERNFS_LOCKDEP) 323 if (kernfs_lockdep(kn))
160 rwsem_release(&kn->dep_map, 1, _RET_IP_); 324 rwsem_release(&kn->dep_map, 1, _RET_IP_);
161 v = atomic_dec_return(&kn->active); 325 v = atomic_dec_return(&kn->active);
162 if (likely(v != KN_DEACTIVATED_BIAS)) 326 if (likely(v != KN_DEACTIVATED_BIAS))
163 return; 327 return;
164 328
165 /* 329 wake_up_all(&root->deactivate_waitq);
166 * atomic_dec_return() is a mb(), we'll always see the updated
167 * kn->u.completion.
168 */
169 complete(kn->u.completion);
170} 330}
171 331
172/** 332/**
173 * kernfs_deactivate - deactivate kernfs_node 333 * kernfs_drain - drain kernfs_node
174 * @kn: kernfs_node to deactivate 334 * @kn: kernfs_node to drain
175 * 335 *
 176 * Deny new active references and drain existing ones. 336 * Drain existing usages and nuke all existing mmaps of @kn. Multiple
337 * removers may invoke this function concurrently on @kn and all will
338 * return after draining is complete.
177 */ 339 */
178static void kernfs_deactivate(struct kernfs_node *kn) 340static void kernfs_drain(struct kernfs_node *kn)
341 __releases(&kernfs_mutex) __acquires(&kernfs_mutex)
179{ 342{
180 DECLARE_COMPLETION_ONSTACK(wait); 343 struct kernfs_root *root = kernfs_root(kn);
181 int v;
182
183 BUG_ON(!(kn->flags & KERNFS_REMOVED));
184 344
185 if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF)) 345 lockdep_assert_held(&kernfs_mutex);
186 return; 346 WARN_ON_ONCE(kernfs_active(kn));
187 347
188 kn->u.completion = (void *)&wait; 348 mutex_unlock(&kernfs_mutex);
189 349
190 if (kn->flags & KERNFS_LOCKDEP) 350 if (kernfs_lockdep(kn)) {
191 rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); 351 rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
192 /* atomic_add_return() is a mb(), put_active() will always see 352 if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS)
193 * the updated kn->u.completion.
194 */
195 v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active);
196
197 if (v != KN_DEACTIVATED_BIAS) {
198 if (kn->flags & KERNFS_LOCKDEP)
199 lock_contended(&kn->dep_map, _RET_IP_); 353 lock_contended(&kn->dep_map, _RET_IP_);
200 wait_for_completion(&wait);
201 } 354 }
202 355
203 if (kn->flags & KERNFS_LOCKDEP) { 356 /* but everyone should wait for draining */
357 wait_event(root->deactivate_waitq,
358 atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);
359
360 if (kernfs_lockdep(kn)) {
204 lock_acquired(&kn->dep_map, _RET_IP_); 361 lock_acquired(&kn->dep_map, _RET_IP_);
205 rwsem_release(&kn->dep_map, 1, _RET_IP_); 362 rwsem_release(&kn->dep_map, 1, _RET_IP_);
206 } 363 }
364
365 kernfs_unmap_bin_file(kn);
366
367 mutex_lock(&kernfs_mutex);
207} 368}
208 369
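The drain above rests on a single counting invariant: adding
KN_DEACTIVATED_BIAS makes the count negative so new getters fail, while
in-flight users are still recorded on top of the bias, and draining is done
exactly when the count falls back to the bias. A compilable C11 model of just
that invariant (the kernel sleeps on root->deactivate_waitq where this sketch
prints):

#include <limits.h>
#include <stdatomic.h>
#include <stdio.h>

#define DEACTIVATED_BIAS (INT_MIN + 1)

static atomic_int active;	/* starts at 0: active, no users */

/* atomic_inc_unless_negative() equivalent */
static int get_active(void)
{
	int v = atomic_load(&active);

	while (v >= 0)
		if (atomic_compare_exchange_weak(&active, &v, v + 1))
			return 1;	/* got a reference */
	return 0;			/* node is deactivated */
}

static void put_active(void)
{
	if (atomic_fetch_sub(&active, 1) - 1 == DEACTIVATED_BIAS)
		printf("last user gone, wake the remover\n");
}

int main(void)
{
	(void)get_active();				/* one user in flight */
	atomic_fetch_add(&active, DEACTIVATED_BIAS);	/* deactivate */
	printf("get after deactivate: %d\n", get_active());	/* 0 */
	put_active();		/* count returns to the bias: drained */
	return 0;
}
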
209/** 370/**
@@ -234,13 +395,15 @@ void kernfs_put(struct kernfs_node *kn)
234 return; 395 return;
235 root = kernfs_root(kn); 396 root = kernfs_root(kn);
236 repeat: 397 repeat:
237 /* Moving/renaming is always done while holding reference. 398 /*
399 * Moving/renaming is always done while holding reference.
238 * kn->parent won't change beneath us. 400 * kn->parent won't change beneath us.
239 */ 401 */
240 parent = kn->parent; 402 parent = kn->parent;
241 403
242 WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n", 404 WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS,
243 parent ? parent->name : "", kn->name); 405 "kernfs_put: %s/%s: released with incorrect active_ref %d\n",
406 parent ? parent->name : "", kn->name, atomic_read(&kn->active));
244 407
245 if (kernfs_type(kn) == KERNFS_LINK) 408 if (kernfs_type(kn) == KERNFS_LINK)
246 kernfs_put(kn->symlink.target_kn); 409 kernfs_put(kn->symlink.target_kn);
@@ -282,8 +445,8 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
282 kn = dentry->d_fsdata; 445 kn = dentry->d_fsdata;
283 mutex_lock(&kernfs_mutex); 446 mutex_lock(&kernfs_mutex);
284 447
285 /* The kernfs node has been deleted */ 448 /* The kernfs node has been deactivated */
286 if (kn->flags & KERNFS_REMOVED) 449 if (!kernfs_active(kn))
287 goto out_bad; 450 goto out_bad;
288 451
289 /* The kernfs node has been moved? */ 452 /* The kernfs node has been moved? */
@@ -328,6 +491,24 @@ const struct dentry_operations kernfs_dops = {
328 .d_release = kernfs_dop_release, 491 .d_release = kernfs_dop_release,
329}; 492};
330 493
494/**
495 * kernfs_node_from_dentry - determine kernfs_node associated with a dentry
496 * @dentry: the dentry in question
497 *
498 * Return the kernfs_node associated with @dentry. If @dentry is not a
499 * kernfs one, %NULL is returned.
500 *
501 * While the returned kernfs_node will stay accessible as long as @dentry
502 * is accessible, the returned node can be in any state and the caller is
503 * fully responsible for determining what's accessible.
504 */
505struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
506{
507 if (dentry->d_sb->s_op == &kernfs_sops)
508 return dentry->d_fsdata;
509 return NULL;
510}
511
331static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, 512static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
332 const char *name, umode_t mode, 513 const char *name, umode_t mode,
333 unsigned flags) 514 unsigned flags)
@@ -352,11 +533,12 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
352 kn->ino = ret; 533 kn->ino = ret;
353 534
354 atomic_set(&kn->count, 1); 535 atomic_set(&kn->count, 1);
355 atomic_set(&kn->active, 0); 536 atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
537 RB_CLEAR_NODE(&kn->rb);
356 538
357 kn->name = name; 539 kn->name = name;
358 kn->mode = mode; 540 kn->mode = mode;
359 kn->flags = flags | KERNFS_REMOVED; 541 kn->flags = flags;
360 542
361 return kn; 543 return kn;
362 544
@@ -382,69 +564,44 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
382} 564}
383 565
384/** 566/**
385 * kernfs_addrm_start - prepare for kernfs_node add/remove
386 * @acxt: pointer to kernfs_addrm_cxt to be used
387 *
388 * This function is called when the caller is about to add or remove
389 * kernfs_node. This function acquires kernfs_mutex. @acxt is used
390 * to keep and pass context to other addrm functions.
391 *
392 * LOCKING:
393 * Kernel thread context (may sleep). kernfs_mutex is locked on
394 * return.
395 */
396void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt)
397 __acquires(kernfs_mutex)
398{
399 memset(acxt, 0, sizeof(*acxt));
400
401 mutex_lock(&kernfs_mutex);
402}
403
404/**
405 * kernfs_add_one - add kernfs_node to parent without warning 567 * kernfs_add_one - add kernfs_node to parent without warning
406 * @acxt: addrm context to use
407 * @kn: kernfs_node to be added 568 * @kn: kernfs_node to be added
408 * 569 *
409 * The caller must already have initialized @kn->parent. This 570 * The caller must already have initialized @kn->parent. This
410 * function increments nlink of the parent's inode if @kn is a 571 * function increments nlink of the parent's inode if @kn is a
411 * directory and link into the children list of the parent. 572 * directory and link into the children list of the parent.
412 * 573 *
413 * This function should be called between calls to
414 * kernfs_addrm_start() and kernfs_addrm_finish() and should be passed
415 * the same @acxt as passed to kernfs_addrm_start().
416 *
417 * LOCKING:
418 * Determined by kernfs_addrm_start().
419 *
420 * RETURNS: 574 * RETURNS:
421 * 0 on success, -EEXIST if entry with the given name already 575 * 0 on success, -EEXIST if entry with the given name already
422 * exists. 576 * exists.
423 */ 577 */
424int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn) 578int kernfs_add_one(struct kernfs_node *kn)
425{ 579{
426 struct kernfs_node *parent = kn->parent; 580 struct kernfs_node *parent = kn->parent;
427 bool has_ns = kernfs_ns_enabled(parent);
428 struct kernfs_iattrs *ps_iattr; 581 struct kernfs_iattrs *ps_iattr;
582 bool has_ns;
429 int ret; 583 int ret;
430 584
431 if (has_ns != (bool)kn->ns) { 585 mutex_lock(&kernfs_mutex);
432 WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", 586
433 has_ns ? "required" : "invalid", parent->name, kn->name); 587 ret = -EINVAL;
434 return -EINVAL; 588 has_ns = kernfs_ns_enabled(parent);
435 } 589 if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
590 has_ns ? "required" : "invalid", parent->name, kn->name))
591 goto out_unlock;
436 592
437 if (kernfs_type(parent) != KERNFS_DIR) 593 if (kernfs_type(parent) != KERNFS_DIR)
438 return -EINVAL; 594 goto out_unlock;
439 595
440 if (parent->flags & KERNFS_REMOVED) 596 ret = -ENOENT;
441 return -ENOENT; 597 if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
598 goto out_unlock;
442 599
443 kn->hash = kernfs_name_hash(kn->name, kn->ns); 600 kn->hash = kernfs_name_hash(kn->name, kn->ns);
444 601
445 ret = kernfs_link_sibling(kn); 602 ret = kernfs_link_sibling(kn);
446 if (ret) 603 if (ret)
447 return ret; 604 goto out_unlock;
448 605
449 /* Update timestamps on the parent */ 606 /* Update timestamps on the parent */
450 ps_iattr = parent->iattr; 607 ps_iattr = parent->iattr;
@@ -453,82 +610,22 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn)
453 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; 610 ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
454 } 611 }
455 612
456 /* Mark the entry added into directory tree */ 613 mutex_unlock(&kernfs_mutex);
457 kn->flags &= ~KERNFS_REMOVED;
458
459 return 0;
460}
461
462/**
463 * kernfs_remove_one - remove kernfs_node from parent
464 * @acxt: addrm context to use
465 * @kn: kernfs_node to be removed
466 *
467 * Mark @kn removed and drop nlink of parent inode if @kn is a
468 * directory. @kn is unlinked from the children list.
469 *
470 * This function should be called between calls to
471 * kernfs_addrm_start() and kernfs_addrm_finish() and should be
472 * passed the same @acxt as passed to kernfs_addrm_start().
473 *
474 * LOCKING:
475 * Determined by kernfs_addrm_start().
476 */
477static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt,
478 struct kernfs_node *kn)
479{
480 struct kernfs_iattrs *ps_iattr;
481 614
482 /* 615 /*
483 * Removal can be called multiple times on the same node. Only the 616 * Activate the new node unless CREATE_DEACTIVATED is requested.
484 * first invocation is effective and puts the base ref. 617 * If not activated here, the kernfs user is responsible for
618 * activating the node with kernfs_activate(). A node which hasn't
619 * been activated is not visible to userland and its removal won't
620 * trigger deactivation.
485 */ 621 */
486 if (kn->flags & KERNFS_REMOVED) 622 if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
487 return; 623 kernfs_activate(kn);
488 624 return 0;
489 if (kn->parent) {
490 kernfs_unlink_sibling(kn);
491
492 /* Update timestamps on the parent */
493 ps_iattr = kn->parent->iattr;
494 if (ps_iattr) {
495 ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
496 ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
497 }
498 }
499
500 kn->flags |= KERNFS_REMOVED;
501 kn->u.removed_list = acxt->removed;
502 acxt->removed = kn;
503}
504 625
505/** 626out_unlock:
506 * kernfs_addrm_finish - finish up kernfs_node add/remove
507 * @acxt: addrm context to finish up
508 *
509 * Finish up kernfs_node add/remove. Resources acquired by
510 * kernfs_addrm_start() are released and removed kernfs_nodes are
511 * cleaned up.
512 *
513 * LOCKING:
514 * kernfs_mutex is released.
515 */
516void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt)
517 __releases(kernfs_mutex)
518{
519 /* release resources acquired by kernfs_addrm_start() */
520 mutex_unlock(&kernfs_mutex); 627 mutex_unlock(&kernfs_mutex);
521 628 return ret;
522 /* kill removed kernfs_nodes */
523 while (acxt->removed) {
524 struct kernfs_node *kn = acxt->removed;
525
526 acxt->removed = kn->u.removed_list;
527
528 kernfs_deactivate(kn);
529 kernfs_unmap_bin_file(kn);
530 kernfs_put(kn);
531 }
532} 629}
533 630
534/** 631/**
@@ -599,13 +696,15 @@ EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
599 696
600/** 697/**
601 * kernfs_create_root - create a new kernfs hierarchy 698 * kernfs_create_root - create a new kernfs hierarchy
602 * @kdops: optional directory syscall operations for the hierarchy 699 * @scops: optional syscall operations for the hierarchy
700 * @flags: KERNFS_ROOT_* flags
603 * @priv: opaque data associated with the new directory 701 * @priv: opaque data associated with the new directory
604 * 702 *
605 * Returns the root of the new hierarchy on success, ERR_PTR() value on 703 * Returns the root of the new hierarchy on success, ERR_PTR() value on
606 * failure. 704 * failure.
607 */ 705 */
608struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) 706struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
707 unsigned int flags, void *priv)
609{ 708{
610 struct kernfs_root *root; 709 struct kernfs_root *root;
611 struct kernfs_node *kn; 710 struct kernfs_node *kn;
@@ -624,12 +723,16 @@ struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv)
624 return ERR_PTR(-ENOMEM); 723 return ERR_PTR(-ENOMEM);
625 } 724 }
626 725
627 kn->flags &= ~KERNFS_REMOVED;
628 kn->priv = priv; 726 kn->priv = priv;
629 kn->dir.root = root; 727 kn->dir.root = root;
630 728
631 root->dir_ops = kdops; 729 root->syscall_ops = scops;
730 root->flags = flags;
632 root->kn = kn; 731 root->kn = kn;
732 init_waitqueue_head(&root->deactivate_waitq);
733
734 if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
735 kernfs_activate(kn);
633 736
634 return root; 737 return root;
635} 738}
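A hedged, kernel-context usage sketch of the reworked creation API: build the
hierarchy with every node deactivated, then publish it atomically. This is not
code from the patch; the my_-prefixed names are hypothetical, while the kernfs
calls and the flag are the ones introduced above.

#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernfs.h>

static struct kernfs_syscall_ops my_scops;	/* empty ops, illustrative */
static struct kernfs_root *my_root;

static int __init my_init(void)
{
	my_root = kernfs_create_root(&my_scops,
				     KERNFS_ROOT_CREATE_DEACTIVATED, NULL);
	if (IS_ERR(my_root))
		return PTR_ERR(my_root);

	/* ... create directories and files under my_root->kn; nothing is
	 * visible to userland yet, so an error here can bail out without
	 * worrying about half-published state ... */

	kernfs_activate(my_root->kn);	/* publish the whole tree at once */
	return 0;
}
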
@@ -660,7 +763,6 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
660 const char *name, umode_t mode, 763 const char *name, umode_t mode,
661 void *priv, const void *ns) 764 void *priv, const void *ns)
662{ 765{
663 struct kernfs_addrm_cxt acxt;
664 struct kernfs_node *kn; 766 struct kernfs_node *kn;
665 int rc; 767 int rc;
666 768
@@ -674,10 +776,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
674 kn->priv = priv; 776 kn->priv = priv;
675 777
676 /* link in */ 778 /* link in */
677 kernfs_addrm_start(&acxt); 779 rc = kernfs_add_one(kn);
678 rc = kernfs_add_one(&acxt, kn);
679 kernfs_addrm_finish(&acxt);
680
681 if (!rc) 780 if (!rc)
682 return kn; 781 return kn;
683 782
@@ -703,7 +802,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
703 kn = kernfs_find_ns(parent, dentry->d_name.name, ns); 802 kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
704 803
705 /* no such entry */ 804 /* no such entry */
706 if (!kn) { 805 if (!kn || !kernfs_active(kn)) {
707 ret = NULL; 806 ret = NULL;
708 goto out_unlock; 807 goto out_unlock;
709 } 808 }
@@ -728,23 +827,37 @@ static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
728 umode_t mode) 827 umode_t mode)
729{ 828{
730 struct kernfs_node *parent = dir->i_private; 829 struct kernfs_node *parent = dir->i_private;
731 struct kernfs_dir_ops *kdops = kernfs_root(parent)->dir_ops; 830 struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops;
831 int ret;
732 832
733 if (!kdops || !kdops->mkdir) 833 if (!scops || !scops->mkdir)
734 return -EPERM; 834 return -EPERM;
735 835
736 return kdops->mkdir(parent, dentry->d_name.name, mode); 836 if (!kernfs_get_active(parent))
837 return -ENODEV;
838
839 ret = scops->mkdir(parent, dentry->d_name.name, mode);
840
841 kernfs_put_active(parent);
842 return ret;
737} 843}
738 844
739static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) 845static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
740{ 846{
741 struct kernfs_node *kn = dentry->d_fsdata; 847 struct kernfs_node *kn = dentry->d_fsdata;
742 struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; 848 struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
849 int ret;
743 850
744 if (!kdops || !kdops->rmdir) 851 if (!scops || !scops->rmdir)
745 return -EPERM; 852 return -EPERM;
746 853
747 return kdops->rmdir(kn); 854 if (!kernfs_get_active(kn))
855 return -ENODEV;
856
857 ret = scops->rmdir(kn);
858
859 kernfs_put_active(kn);
860 return ret;
748} 861}
749 862
750static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, 863static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -752,12 +865,25 @@ static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
752{ 865{
753 struct kernfs_node *kn = old_dentry->d_fsdata; 866 struct kernfs_node *kn = old_dentry->d_fsdata;
754 struct kernfs_node *new_parent = new_dir->i_private; 867 struct kernfs_node *new_parent = new_dir->i_private;
755 struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; 868 struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
869 int ret;
756 870
757 if (!kdops || !kdops->rename) 871 if (!scops || !scops->rename)
758 return -EPERM; 872 return -EPERM;
759 873
760 return kdops->rename(kn, new_parent, new_dentry->d_name.name); 874 if (!kernfs_get_active(kn))
875 return -ENODEV;
876
877 if (!kernfs_get_active(new_parent)) {
878 kernfs_put_active(kn);
879 return -ENODEV;
880 }
881
882 ret = scops->rename(kn, new_parent, new_dentry->d_name.name);
883
884 kernfs_put_active(new_parent);
885 kernfs_put_active(kn);
886 return ret;
761} 887}
762 888
763const struct inode_operations kernfs_dir_iops = { 889const struct inode_operations kernfs_dir_iops = {
@@ -830,23 +956,104 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
830 return pos->parent; 956 return pos->parent;
831} 957}
832 958
833static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, 959/**
834 struct kernfs_node *kn) 960 * kernfs_activate - activate a node which started deactivated
961 * @kn: kernfs_node whose subtree is to be activated
962 *
963 * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node
964 * needs to be explicitly activated. A node which hasn't been activated
965 * isn't visible to userland and deactivation is skipped during its
966 * removal. This is useful to construct atomic init sequences where
967 * creation of multiple nodes should either succeed or fail atomically.
968 *
969 * The caller is responsible for ensuring that this function is not called
970 * after kernfs_remove*() is invoked on @kn.
971 */
972void kernfs_activate(struct kernfs_node *kn)
835{ 973{
836 struct kernfs_node *pos, *next; 974 struct kernfs_node *pos;
837 975
838 if (!kn) 976 mutex_lock(&kernfs_mutex);
977
978 pos = NULL;
979 while ((pos = kernfs_next_descendant_post(pos, kn))) {
980 if (!pos || (pos->flags & KERNFS_ACTIVATED))
981 continue;
982
983 WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb));
984 WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS);
985
986 atomic_sub(KN_DEACTIVATED_BIAS, &pos->active);
987 pos->flags |= KERNFS_ACTIVATED;
988 }
989
990 mutex_unlock(&kernfs_mutex);
991}
992
993static void __kernfs_remove(struct kernfs_node *kn)
994{
995 struct kernfs_node *pos;
996
997 lockdep_assert_held(&kernfs_mutex);
998
999 /*
1000 * Short-circuit if non-root @kn has already finished removal.
1001 * This is for kernfs_remove_self() which plays with active ref
1002 * after removal.
1003 */
1004 if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb)))
839 return; 1005 return;
840 1006
841 pr_debug("kernfs %s: removing\n", kn->name); 1007 pr_debug("kernfs %s: removing\n", kn->name);
842 1008
843 next = NULL; 1009 /* prevent any new usage under @kn by deactivating all nodes */
1010 pos = NULL;
1011 while ((pos = kernfs_next_descendant_post(pos, kn)))
1012 if (kernfs_active(pos))
1013 atomic_add(KN_DEACTIVATED_BIAS, &pos->active);
1014
1015 /* deactivate and unlink the subtree node-by-node */
844 do { 1016 do {
845 pos = next; 1017 pos = kernfs_leftmost_descendant(kn);
846 next = kernfs_next_descendant_post(pos, kn); 1018
847 if (pos) 1019 /*
848 kernfs_remove_one(acxt, pos); 1020 * kernfs_drain() drops kernfs_mutex temporarily and @pos's
849 } while (next); 1021 * base ref could have been put by someone else by the time
1022 * the function returns. Make sure it doesn't go away
1023 * underneath us.
1024 */
1025 kernfs_get(pos);
1026
1027 /*
1028 * Drain iff @kn was activated. This avoids draining and
1029 * its lockdep annotations for nodes which have never been
1030 * activated and allows embedding kernfs_remove() in create
1031 * error paths without worrying about draining.
1032 */
1033 if (kn->flags & KERNFS_ACTIVATED)
1034 kernfs_drain(pos);
1035 else
1036 WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
1037
1038 /*
1039 * kernfs_unlink_sibling() succeeds once per node. Use it
1040 * to decide who's responsible for cleanups.
1041 */
1042 if (!pos->parent || kernfs_unlink_sibling(pos)) {
1043 struct kernfs_iattrs *ps_iattr =
1044 pos->parent ? pos->parent->iattr : NULL;
1045
1046 /* update timestamps on the parent */
1047 if (ps_iattr) {
1048 ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
1049 ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
1050 }
1051
1052 kernfs_put(pos);
1053 }
1054
1055 kernfs_put(pos);
1056 } while (pos != kn);
850} 1057}
851 1058
852/** 1059/**
@@ -857,11 +1064,140 @@ static void __kernfs_remove(struct kernfs_addrm_cxt *acxt,
857 */ 1064 */
858void kernfs_remove(struct kernfs_node *kn) 1065void kernfs_remove(struct kernfs_node *kn)
859{ 1066{
860 struct kernfs_addrm_cxt acxt; 1067 mutex_lock(&kernfs_mutex);
1068 __kernfs_remove(kn);
1069 mutex_unlock(&kernfs_mutex);
1070}
1071
1072/**
1073 * kernfs_break_active_protection - break out of active protection
1074 * @kn: the self kernfs_node
1075 *
1076 * The caller must be running off of a kernfs operation which is invoked
1077 * with an active reference - e.g. one of kernfs_ops. Each invocation of
1078 * this function must also be matched with an invocation of
1079 * kernfs_unbreak_active_protection().
1080 *
1081 * This function releases the active reference of @kn the caller is
1082 * holding. Once this function is called, @kn may be removed at any point
1083 * and the caller is solely responsible for ensuring that the objects it
1084 * dereferences are accessible.
1085 */
1086void kernfs_break_active_protection(struct kernfs_node *kn)
1087{
1088 /*
 1089 * Take ourself out of the active ref dependency chain. If
1090 * we're called without an active ref, lockdep will complain.
1091 */
1092 kernfs_put_active(kn);
1093}
1094
1095/**
1096 * kernfs_unbreak_active_protection - undo kernfs_break_active_protection()
1097 * @kn: the self kernfs_node
1098 *
1099 * If kernfs_break_active_protection() was called, this function must be
1100 * invoked before finishing the kernfs operation. Note that while this
1101 * function restores the active reference, it doesn't and can't actually
 1102 * restore the active protection - @kn may already be removed or in the process of
1103 * being removed. Once kernfs_break_active_protection() is invoked, that
1104 * protection is irreversibly gone for the kernfs operation instance.
1105 *
1106 * While this function may be called at any point after
1107 * kernfs_break_active_protection() is invoked, its most useful location
1108 * would be right before the enclosing kernfs operation returns.
1109 */
1110void kernfs_unbreak_active_protection(struct kernfs_node *kn)
1111{
1112 /*
1113 * @kn->active could be in any state; however, the increment we do
1114 * here will be undone as soon as the enclosing kernfs operation
1115 * finishes and this temporary bump can't break anything. If @kn
1116 * is alive, nothing changes. If @kn is being deactivated, the
1117 * soon-to-follow put will either finish deactivation or restore
1118 * deactivated state. If @kn is already removed, the temporary
1119 * bump is guaranteed to be gone before @kn is released.
1120 */
1121 atomic_inc(&kn->active);
1122 if (kernfs_lockdep(kn))
1123 rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_);
1124}
1125
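As a hedged illustration of how the pair above is meant to be used: a
hypothetical kernfs_ops write handler that must take a lock which is also held
around removal elsewhere, so holding the active ref across it would deadlock.
my_write() and my_big_lock are invented names; the two kernfs calls are the
ones just added.

#include <linux/kernfs.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(my_big_lock);	/* also taken on removal paths */

static ssize_t my_write(struct kernfs_open_file *of, char *buf,
			size_t bytes, loff_t off)
{
	struct kernfs_node *kn = of->kn;

	kernfs_break_active_protection(kn);	/* drop the dependency edge */
	mutex_lock(&my_big_lock);
	/* ... kn may now be removed at any point; only touch objects whose
	 * lifetime my_big_lock itself guarantees ... */
	mutex_unlock(&my_big_lock);
	kernfs_unbreak_active_protection(kn);	/* restore before returning */
	return bytes;
}
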
1126/**
1127 * kernfs_remove_self - remove a kernfs_node from its own method
1128 * @kn: the self kernfs_node to remove
1129 *
1130 * The caller must be running off of a kernfs operation which is invoked
1131 * with an active reference - e.g. one of kernfs_ops. This can be used to
1132 * implement a file operation which deletes itself.
1133 *
1134 * For example, the "delete" file for a sysfs device directory can be
1135 * implemented by invoking kernfs_remove_self() on the "delete" file
1136 * itself. This function breaks the circular dependency of trying to
1137 * deactivate self while holding an active ref itself. It isn't necessary
1138 * to modify the usual removal path to use kernfs_remove_self(). The
1139 * "delete" implementation can simply invoke kernfs_remove_self() on self
1140 * before proceeding with the usual removal path. kernfs will ignore later
1141 * kernfs_remove() on self.
1142 *
1143 * kernfs_remove_self() can be called multiple times concurrently on the
1144 * same kernfs_node. Only the first one actually performs removal and
1145 * returns %true. All others will wait until the kernfs operation which
1146 * won self-removal finishes and return %false. Note that the losers wait
1147 * for the completion of not only the winning kernfs_remove_self() but also
1148 * the whole kernfs_ops which won the arbitration. This can be used to
 1149 * guarantee, for example, that all concurrent writes to a "delete" file
1150 * finish only after the whole operation is complete.
1151 */
1152bool kernfs_remove_self(struct kernfs_node *kn)
1153{
1154 bool ret;
1155
1156 mutex_lock(&kernfs_mutex);
1157 kernfs_break_active_protection(kn);
1158
1159 /*
1160 * SUICIDAL is used to arbitrate among competing invocations. Only
1161 * the first one will actually perform removal. When the removal
1162 * is complete, SUICIDED is set and the active ref is restored
1163 * while holding kernfs_mutex. The ones which lost arbitration
 1164 * wait for SUICIDED && drained, which can happen only after the
1165 * enclosing kernfs operation which executed the winning instance
1166 * of kernfs_remove_self() finished.
1167 */
1168 if (!(kn->flags & KERNFS_SUICIDAL)) {
1169 kn->flags |= KERNFS_SUICIDAL;
1170 __kernfs_remove(kn);
1171 kn->flags |= KERNFS_SUICIDED;
1172 ret = true;
1173 } else {
1174 wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq;
1175 DEFINE_WAIT(wait);
861 1176
862 kernfs_addrm_start(&acxt); 1177 while (true) {
863 __kernfs_remove(&acxt, kn); 1178 prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE);
864 kernfs_addrm_finish(&acxt); 1179
1180 if ((kn->flags & KERNFS_SUICIDED) &&
1181 atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
1182 break;
1183
1184 mutex_unlock(&kernfs_mutex);
1185 schedule();
1186 mutex_lock(&kernfs_mutex);
1187 }
1188 finish_wait(waitq, &wait);
1189 WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
1190 ret = false;
1191 }
1192
1193 /*
1194 * This must be done while holding kernfs_mutex; otherwise, waiting
1195 * for SUICIDED && deactivated could finish prematurely.
1196 */
1197 kernfs_unbreak_active_protection(kn);
1198
1199 mutex_unlock(&kernfs_mutex);
1200 return ret;
865} 1201}
866 1202
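Stripped of the locking and the waitqueue, the SUICIDAL/SUICIDED arbitration
is a claim-then-mark protocol: the first caller claims removal, and everyone
after it loses and (in the kernel) sleeps until the winner's whole operation
has drained. A sequential userspace model, with the flag names kept and
everything else illustrative:

#include <stdbool.h>
#include <stdio.h>

enum { SUICIDAL = 1, SUICIDED = 2 };

static bool remove_self(int *flags)
{
	if (!(*flags & SUICIDAL)) {
		*flags |= SUICIDAL;	/* claim: we do the removal */
		/* __kernfs_remove() would run here */
		*flags |= SUICIDED;	/* mark: removal is done */
		return true;
	}
	return false;	/* lost the race; the kernel waits for SUICIDED here */
}

int main(void)
{
	int flags = 0;
	int winner = remove_self(&flags);
	int loser = remove_self(&flags);

	printf("%d %d\n", winner, loser);	/* 1 0 */
	return 0;
}
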
867/** 1203/**
@@ -876,7 +1212,6 @@ void kernfs_remove(struct kernfs_node *kn)
876int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, 1212int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
877 const void *ns) 1213 const void *ns)
878{ 1214{
879 struct kernfs_addrm_cxt acxt;
880 struct kernfs_node *kn; 1215 struct kernfs_node *kn;
881 1216
882 if (!parent) { 1217 if (!parent) {
@@ -885,13 +1220,13 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
885 return -ENOENT; 1220 return -ENOENT;
886 } 1221 }
887 1222
888 kernfs_addrm_start(&acxt); 1223 mutex_lock(&kernfs_mutex);
889 1224
890 kn = kernfs_find_ns(parent, name, ns); 1225 kn = kernfs_find_ns(parent, name, ns);
891 if (kn) 1226 if (kn)
892 __kernfs_remove(&acxt, kn); 1227 __kernfs_remove(kn);
893 1228
894 kernfs_addrm_finish(&acxt); 1229 mutex_unlock(&kernfs_mutex);
895 1230
896 if (kn) 1231 if (kn)
897 return 0; 1232 return 0;
@@ -909,12 +1244,18 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
909int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, 1244int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
910 const char *new_name, const void *new_ns) 1245 const char *new_name, const void *new_ns)
911{ 1246{
1247 struct kernfs_node *old_parent;
1248 const char *old_name = NULL;
912 int error; 1249 int error;
913 1250
1251 /* can't move or rename root */
1252 if (!kn->parent)
1253 return -EINVAL;
1254
914 mutex_lock(&kernfs_mutex); 1255 mutex_lock(&kernfs_mutex);
915 1256
916 error = -ENOENT; 1257 error = -ENOENT;
917 if ((kn->flags | new_parent->flags) & KERNFS_REMOVED) 1258 if (!kernfs_active(kn) || !kernfs_active(new_parent))
918 goto out; 1259 goto out;
919 1260
920 error = 0; 1261 error = 0;
@@ -932,13 +1273,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
932 new_name = kstrdup(new_name, GFP_KERNEL); 1273 new_name = kstrdup(new_name, GFP_KERNEL);
933 if (!new_name) 1274 if (!new_name)
934 goto out; 1275 goto out;
935 1276 } else {
936 if (kn->flags & KERNFS_STATIC_NAME) 1277 new_name = NULL;
937 kn->flags &= ~KERNFS_STATIC_NAME;
938 else
939 kfree(kn->name);
940
941 kn->name = new_name;
942 } 1278 }
943 1279
944 /* 1280 /*
@@ -946,12 +1282,29 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
946 */ 1282 */
947 kernfs_unlink_sibling(kn); 1283 kernfs_unlink_sibling(kn);
948 kernfs_get(new_parent); 1284 kernfs_get(new_parent);
949 kernfs_put(kn->parent); 1285
1286 /* rename_lock protects ->parent and ->name accessors */
1287 spin_lock_irq(&kernfs_rename_lock);
1288
1289 old_parent = kn->parent;
1290 kn->parent = new_parent;
1291
950 kn->ns = new_ns; 1292 kn->ns = new_ns;
1293 if (new_name) {
1294 if (!(kn->flags & KERNFS_STATIC_NAME))
1295 old_name = kn->name;
1296 kn->flags &= ~KERNFS_STATIC_NAME;
1297 kn->name = new_name;
1298 }
1299
1300 spin_unlock_irq(&kernfs_rename_lock);
1301
951 kn->hash = kernfs_name_hash(kn->name, kn->ns); 1302 kn->hash = kernfs_name_hash(kn->name, kn->ns);
952 kn->parent = new_parent;
953 kernfs_link_sibling(kn); 1303 kernfs_link_sibling(kn);
954 1304
1305 kernfs_put(old_parent);
1306 kfree(old_name);
1307
955 error = 0; 1308 error = 0;
956 out: 1309 out:
957 mutex_unlock(&kernfs_mutex); 1310 mutex_unlock(&kernfs_mutex);
@@ -974,7 +1327,7 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
974 struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) 1327 struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
975{ 1328{
976 if (pos) { 1329 if (pos) {
977 int valid = !(pos->flags & KERNFS_REMOVED) && 1330 int valid = kernfs_active(pos) &&
978 pos->parent == parent && hash == pos->hash; 1331 pos->parent == parent && hash == pos->hash;
979 kernfs_put(pos); 1332 kernfs_put(pos);
980 if (!valid) 1333 if (!valid)
@@ -993,8 +1346,8 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
993 break; 1346 break;
994 } 1347 }
995 } 1348 }
996 /* Skip over entries in the wrong namespace */ 1349 /* Skip over entries which are dying/dead or in the wrong namespace */
997 while (pos && pos->ns != ns) { 1350 while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
998 struct rb_node *node = rb_next(&pos->rb); 1351 struct rb_node *node = rb_next(&pos->rb);
999 if (!node) 1352 if (!node)
1000 pos = NULL; 1353 pos = NULL;
@@ -1008,14 +1361,15 @@ static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
1008 struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) 1361 struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
1009{ 1362{
1010 pos = kernfs_dir_pos(ns, parent, ino, pos); 1363 pos = kernfs_dir_pos(ns, parent, ino, pos);
1011 if (pos) 1364 if (pos) {
1012 do { 1365 do {
1013 struct rb_node *node = rb_next(&pos->rb); 1366 struct rb_node *node = rb_next(&pos->rb);
1014 if (!node) 1367 if (!node)
1015 pos = NULL; 1368 pos = NULL;
1016 else 1369 else
1017 pos = rb_to_kn(node); 1370 pos = rb_to_kn(node);
1018 } while (pos && pos->ns != ns); 1371 } while (pos && (!kernfs_active(pos) || pos->ns != ns));
1372 }
1019 return pos; 1373 return pos;
1020} 1374}
1021 1375
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index dbf397bfdff2..5e9a80cfc3d8 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -252,10 +252,18 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
252 size_t count, loff_t *ppos) 252 size_t count, loff_t *ppos)
253{ 253{
254 struct kernfs_open_file *of = kernfs_of(file); 254 struct kernfs_open_file *of = kernfs_of(file);
255 ssize_t len = min_t(size_t, count, PAGE_SIZE);
256 const struct kernfs_ops *ops; 255 const struct kernfs_ops *ops;
256 size_t len;
257 char *buf; 257 char *buf;
258 258
259 if (of->atomic_write_len) {
260 len = count;
261 if (len > of->atomic_write_len)
262 return -E2BIG;
263 } else {
264 len = min_t(size_t, count, PAGE_SIZE);
265 }
266
259 buf = kmalloc(len + 1, GFP_KERNEL); 267 buf = kmalloc(len + 1, GFP_KERNEL);
260 if (!buf) 268 if (!buf)
261 return -ENOMEM; 269 return -ENOMEM;
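The hunk above separates two policies: without atomic_write_len, oversized
writes are silently capped at one page as before; with it, a write either fits
whole or fails, so the handler never sees a torn, partial update. A standalone
model of the length decision, with PAGE_SIZE fixed at 4096 and -1 standing in
for -E2BIG:

#include <stddef.h>
#include <stdio.h>

#define PAGE_SIZE 4096

static long write_len(size_t count, size_t atomic_write_len)
{
	if (atomic_write_len)
		return count <= atomic_write_len ? (long)count : -1;
	return count < PAGE_SIZE ? (long)count : PAGE_SIZE;	/* old cap */
}

int main(void)
{
	printf("%ld\n", write_len(100, 0));	/* 100: fits anyway */
	printf("%ld\n", write_len(8192, 0));	/* 4096: silently capped */
	printf("%ld\n", write_len(8192, 4096));	/* -1: rejected (-E2BIG) */
	return 0;
}
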
@@ -476,6 +484,8 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
476 484
477 ops = kernfs_ops(of->kn); 485 ops = kernfs_ops(of->kn);
478 rc = ops->mmap(of, vma); 486 rc = ops->mmap(of, vma);
487 if (rc)
488 goto out_put;
479 489
480 /* 490 /*
481 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() 491 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
@@ -600,6 +610,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn,
600static int kernfs_fop_open(struct inode *inode, struct file *file) 610static int kernfs_fop_open(struct inode *inode, struct file *file)
601{ 611{
602 struct kernfs_node *kn = file->f_path.dentry->d_fsdata; 612 struct kernfs_node *kn = file->f_path.dentry->d_fsdata;
613 struct kernfs_root *root = kernfs_root(kn);
603 const struct kernfs_ops *ops; 614 const struct kernfs_ops *ops;
604 struct kernfs_open_file *of; 615 struct kernfs_open_file *of;
605 bool has_read, has_write, has_mmap; 616 bool has_read, has_write, has_mmap;
@@ -614,14 +625,16 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
614 has_write = ops->write || ops->mmap; 625 has_write = ops->write || ops->mmap;
615 has_mmap = ops->mmap; 626 has_mmap = ops->mmap;
616 627
617 /* check perms and supported operations */ 628 /* see the flag definition for details */
618 if ((file->f_mode & FMODE_WRITE) && 629 if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) {
619 (!(inode->i_mode & S_IWUGO) || !has_write)) 630 if ((file->f_mode & FMODE_WRITE) &&
620 goto err_out; 631 (!(inode->i_mode & S_IWUGO) || !has_write))
632 goto err_out;
621 633
622 if ((file->f_mode & FMODE_READ) && 634 if ((file->f_mode & FMODE_READ) &&
623 (!(inode->i_mode & S_IRUGO) || !has_read)) 635 (!(inode->i_mode & S_IRUGO) || !has_read))
624 goto err_out; 636 goto err_out;
637 }
625 638
626 /* allocate a kernfs_open_file for the file */ 639 /* allocate a kernfs_open_file for the file */
627 error = -ENOMEM; 640 error = -ENOMEM;
@@ -653,6 +666,12 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
653 of->file = file; 666 of->file = file;
654 667
655 /* 668 /*
 669 * Write path needs to access atomic_write_len outside the active reference.
670 * Cache it in open_file. See kernfs_fop_write() for details.
671 */
672 of->atomic_write_len = ops->atomic_write_len;
673
674 /*
656 * Always instantiate seq_file even if read access doesn't use 675 * Always instantiate seq_file even if read access doesn't use
657 * seq_file or is not requested. This unifies private data access 676 * seq_file or is not requested. This unifies private data access
658 * and readable regular files are the vast majority anyway. 677 * and readable regular files are the vast majority anyway.
@@ -820,7 +839,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
820 bool name_is_static, 839 bool name_is_static,
821 struct lock_class_key *key) 840 struct lock_class_key *key)
822{ 841{
823 struct kernfs_addrm_cxt acxt;
824 struct kernfs_node *kn; 842 struct kernfs_node *kn;
825 unsigned flags; 843 unsigned flags;
826 int rc; 844 int rc;
@@ -855,10 +873,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
855 if (ops->mmap) 873 if (ops->mmap)
856 kn->flags |= KERNFS_HAS_MMAP; 874 kn->flags |= KERNFS_HAS_MMAP;
857 875
858 kernfs_addrm_start(&acxt); 876 rc = kernfs_add_one(kn);
859 rc = kernfs_add_one(&acxt, kn);
860 kernfs_addrm_finish(&acxt);
861
862 if (rc) { 877 if (rc) {
863 kernfs_put(kn); 878 kernfs_put(kn);
864 return ERR_PTR(rc); 879 return ERR_PTR(rc);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index e55126f85bd2..985217626e66 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -48,14 +48,18 @@ void __init kernfs_inode_init(void)
48 48
49static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) 49static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
50{ 50{
51 static DEFINE_MUTEX(iattr_mutex);
52 struct kernfs_iattrs *ret;
51 struct iattr *iattrs; 53 struct iattr *iattrs;
52 54
55 mutex_lock(&iattr_mutex);
56
53 if (kn->iattr) 57 if (kn->iattr)
54 return kn->iattr; 58 goto out_unlock;
55 59
56 kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL); 60 kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL);
57 if (!kn->iattr) 61 if (!kn->iattr)
58 return NULL; 62 goto out_unlock;
59 iattrs = &kn->iattr->ia_iattr; 63 iattrs = &kn->iattr->ia_iattr;
60 64
61 /* assign default attributes */ 65 /* assign default attributes */
@@ -65,8 +69,10 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
65 iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; 69 iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
66 70
67 simple_xattrs_init(&kn->iattr->xattrs); 71 simple_xattrs_init(&kn->iattr->xattrs);
68 72out_unlock:
69 return kn->iattr; 73 ret = kn->iattr;
74 mutex_unlock(&iattr_mutex);
75 return ret;
70} 76}
71 77
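The hunk above serializes the lazy allocation of kn->iattr: previously two
racing callers could both observe NULL, both allocate, and leak one of the two
buffers. A pthread-based userspace model of the fixed pattern; struct attrs
and get_attrs() are illustrative:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct attrs { int mode; };

static pthread_mutex_t attr_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Every caller takes the mutex before testing the slot, so the
 * check-then-allocate sequence can no longer interleave. */
static struct attrs *get_attrs(struct attrs **slot)
{
	struct attrs *ret;

	pthread_mutex_lock(&attr_mutex);
	if (!*slot)
		*slot = calloc(1, sizeof(**slot)); /* may stay NULL: ENOMEM */
	ret = *slot;
	pthread_mutex_unlock(&attr_mutex);
	return ret;
}

int main(void)
{
	struct attrs *a = NULL;
	struct attrs *first = get_attrs(&a);
	struct attrs *second = get_attrs(&a);

	printf("same allocation: %d\n", first == second);	/* 1 */
	free(a);
	return 0;
}
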
72static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) 78static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
@@ -355,7 +361,7 @@ void kernfs_evict_inode(struct inode *inode)
355{ 361{
356 struct kernfs_node *kn = inode->i_private; 362 struct kernfs_node *kn = inode->i_private;
357 363
358 truncate_inode_pages(&inode->i_data, 0); 364 truncate_inode_pages_final(&inode->i_data);
359 clear_inode(inode); 365 clear_inode(inode);
360 kernfs_put(kn); 366 kernfs_put(kn);
361} 367}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index eb536b76374a..8be13b2a079b 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -26,7 +26,8 @@ struct kernfs_iattrs {
26 struct simple_xattrs xattrs; 26 struct simple_xattrs xattrs;
27}; 27};
28 28
29#define KN_DEACTIVATED_BIAS INT_MIN 29/* +1 to avoid triggering overflow warning when negating it */
30#define KN_DEACTIVATED_BIAS (INT_MIN + 1)
30 31
31/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ 32/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */
32 33
@@ -45,13 +46,6 @@ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
45} 46}
46 47
47/* 48/*
48 * Context structure to be used while adding/removing nodes.
49 */
50struct kernfs_addrm_cxt {
51 struct kernfs_node *removed;
52};
53
54/*
55 * mount.c 49 * mount.c
56 */ 50 */
57struct kernfs_super_info { 51struct kernfs_super_info {
@@ -71,6 +65,7 @@ struct kernfs_super_info {
71}; 65};
72#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) 66#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
73 67
68extern const struct super_operations kernfs_sops;
74extern struct kmem_cache *kernfs_node_cache; 69extern struct kmem_cache *kernfs_node_cache;
75 70
76/* 71/*
@@ -100,9 +95,7 @@ extern const struct inode_operations kernfs_dir_iops;
100 95
101struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); 96struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
102void kernfs_put_active(struct kernfs_node *kn); 97void kernfs_put_active(struct kernfs_node *kn);
103void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt); 98int kernfs_add_one(struct kernfs_node *kn);
104int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn);
105void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt);
106struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, 99struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
107 const char *name, umode_t mode, 100 const char *name, umode_t mode,
108 unsigned flags); 101 unsigned flags);
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 0f4152defe7b..95dcd1d558bb 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -19,13 +19,50 @@
19 19
20struct kmem_cache *kernfs_node_cache; 20struct kmem_cache *kernfs_node_cache;
21 21
22static const struct super_operations kernfs_sops = { 22static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data)
23{
24 struct kernfs_root *root = kernfs_info(sb)->root;
25 struct kernfs_syscall_ops *scops = root->syscall_ops;
26
27 if (scops && scops->remount_fs)
28 return scops->remount_fs(root, flags, data);
29 return 0;
30}
31
32static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
33{
34 struct kernfs_root *root = kernfs_root(dentry->d_fsdata);
35 struct kernfs_syscall_ops *scops = root->syscall_ops;
36
37 if (scops && scops->show_options)
38 return scops->show_options(sf, root);
39 return 0;
40}
41
42const struct super_operations kernfs_sops = {
23 .statfs = simple_statfs, 43 .statfs = simple_statfs,
24 .drop_inode = generic_delete_inode, 44 .drop_inode = generic_delete_inode,
25 .evict_inode = kernfs_evict_inode, 45 .evict_inode = kernfs_evict_inode,
46
47 .remount_fs = kernfs_sop_remount_fs,
48 .show_options = kernfs_sop_show_options,
26}; 49};
27 50
28static int kernfs_fill_super(struct super_block *sb) 51/**
52 * kernfs_root_from_sb - determine kernfs_root associated with a super_block
53 * @sb: the super_block in question
54 *
55 * Return the kernfs_root associated with @sb. If @sb is not a kernfs one,
56 * %NULL is returned.
57 */
58struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
59{
60 if (sb->s_op == &kernfs_sops)
61 return kernfs_info(sb)->root;
62 return NULL;
63}
64
65static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
29{ 66{
30 struct kernfs_super_info *info = kernfs_info(sb); 67 struct kernfs_super_info *info = kernfs_info(sb);
31 struct inode *inode; 68 struct inode *inode;
@@ -33,7 +70,7 @@ static int kernfs_fill_super(struct super_block *sb)
33 70
34 sb->s_blocksize = PAGE_CACHE_SIZE; 71 sb->s_blocksize = PAGE_CACHE_SIZE;
35 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 72 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
36 sb->s_magic = SYSFS_MAGIC; 73 sb->s_magic = magic;
37 sb->s_op = &kernfs_sops; 74 sb->s_op = &kernfs_sops;
38 sb->s_time_gran = 1; 75 sb->s_time_gran = 1;
39 76
@@ -94,6 +131,7 @@ const void *kernfs_super_ns(struct super_block *sb)
94 * @fs_type: file_system_type of the fs being mounted 131 * @fs_type: file_system_type of the fs being mounted
95 * @flags: mount flags specified for the mount 132 * @flags: mount flags specified for the mount
96 * @root: kernfs_root of the hierarchy being mounted 133 * @root: kernfs_root of the hierarchy being mounted
134 * @magic: file system specific magic number
97 * @new_sb_created: tell the caller if we allocated a new superblock 135 * @new_sb_created: tell the caller if we allocated a new superblock
98 * @ns: optional namespace tag of the mount 136 * @ns: optional namespace tag of the mount
99 * 137 *
@@ -105,8 +143,8 @@ const void *kernfs_super_ns(struct super_block *sb)
105 * The return value can be passed to the vfs layer verbatim. 143 * The return value can be passed to the vfs layer verbatim.
106 */ 144 */
107struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, 145struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
108 struct kernfs_root *root, bool *new_sb_created, 146 struct kernfs_root *root, unsigned long magic,
109 const void *ns) 147 bool *new_sb_created, const void *ns)
110{ 148{
111 struct super_block *sb; 149 struct super_block *sb;
112 struct kernfs_super_info *info; 150 struct kernfs_super_info *info;
@@ -129,7 +167,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
129 *new_sb_created = !sb->s_root; 167 *new_sb_created = !sb->s_root;
130 168
131 if (!sb->s_root) { 169 if (!sb->s_root) {
132 error = kernfs_fill_super(sb); 170 error = kernfs_fill_super(sb, magic);
133 if (error) { 171 if (error) {
134 deactivate_locked_super(sb); 172 deactivate_locked_super(sb);
135 return ERR_PTR(error); 173 return ERR_PTR(error);
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 4d457055acb9..8a198898e39a 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -27,7 +27,6 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
27 struct kernfs_node *target) 27 struct kernfs_node *target)
28{ 28{
29 struct kernfs_node *kn; 29 struct kernfs_node *kn;
30 struct kernfs_addrm_cxt acxt;
31 int error; 30 int error;
32 31
33 kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK); 32 kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK);
@@ -39,10 +38,7 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
39 kn->symlink.target_kn = target; 38 kn->symlink.target_kn = target;
40 kernfs_get(target); /* ref owned by symlink */ 39 kernfs_get(target); /* ref owned by symlink */
41 40
42 kernfs_addrm_start(&acxt); 41 error = kernfs_add_one(kn);
43 error = kernfs_add_one(&acxt, kn);
44 kernfs_addrm_finish(&acxt);
45
46 if (!error) 42 if (!error)
47 return kn; 43 return kn;
48 44
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 10d6c41aecad..6bf06a07f3e0 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -235,6 +235,7 @@ out_err:
235 if (warned++ == 0) 235 if (warned++ == 0)
236 printk(KERN_WARNING 236 printk(KERN_WARNING
237 "lockd_up: makesock failed, error=%d\n", err); 237 "lockd_up: makesock failed, error=%d\n", err);
238 svc_shutdown_net(serv, net);
238 return err; 239 return err;
239} 240}
240 241
diff --git a/fs/locks.c b/fs/locks.c
index 92a0f0a52b06..e390bd9ae068 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -135,6 +135,7 @@
135#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) 135#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
136#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) 136#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
137#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) 137#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG))
138#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK)
138 139
139static bool lease_breaking(struct file_lock *fl) 140static bool lease_breaking(struct file_lock *fl)
140{ 141{
@@ -344,48 +345,43 @@ static int assign_type(struct file_lock *fl, long type)
344 return 0; 345 return 0;
345} 346}
346 347
347/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX 348static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
348 * style lock. 349 struct flock64 *l)
349 */
350static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
351 struct flock *l)
352{ 350{
353 off_t start, end;
354
355 switch (l->l_whence) { 351 switch (l->l_whence) {
356 case SEEK_SET: 352 case SEEK_SET:
357 start = 0; 353 fl->fl_start = 0;
358 break; 354 break;
359 case SEEK_CUR: 355 case SEEK_CUR:
360 start = filp->f_pos; 356 fl->fl_start = filp->f_pos;
361 break; 357 break;
362 case SEEK_END: 358 case SEEK_END:
363 start = i_size_read(file_inode(filp)); 359 fl->fl_start = i_size_read(file_inode(filp));
364 break; 360 break;
365 default: 361 default:
366 return -EINVAL; 362 return -EINVAL;
367 } 363 }
364 if (l->l_start > OFFSET_MAX - fl->fl_start)
365 return -EOVERFLOW;
366 fl->fl_start += l->l_start;
367 if (fl->fl_start < 0)
368 return -EINVAL;
368 369
369 /* POSIX-1996 leaves the case l->l_len < 0 undefined; 370 /* POSIX-1996 leaves the case l->l_len < 0 undefined;
370 POSIX-2001 defines it. */ 371 POSIX-2001 defines it. */
371 start += l->l_start;
372 if (start < 0)
373 return -EINVAL;
374 fl->fl_end = OFFSET_MAX;
375 if (l->l_len > 0) { 372 if (l->l_len > 0) {
376 end = start + l->l_len - 1; 373 if (l->l_len - 1 > OFFSET_MAX - fl->fl_start)
377 fl->fl_end = end; 374 return -EOVERFLOW;
375 fl->fl_end = fl->fl_start + l->l_len - 1;
376
378 } else if (l->l_len < 0) { 377 } else if (l->l_len < 0) {
379 end = start - 1; 378 if (fl->fl_start + l->l_len < 0)
380 fl->fl_end = end;
381 start += l->l_len;
382 if (start < 0)
383 return -EINVAL; 379 return -EINVAL;
384 } 380 fl->fl_end = fl->fl_start - 1;
385 fl->fl_start = start; /* we record the absolute position */ 381 fl->fl_start += l->l_len;
386 if (fl->fl_end < fl->fl_start) 382 } else
387 return -EOVERFLOW; 383 fl->fl_end = OFFSET_MAX;
388 384
389 fl->fl_owner = current->files; 385 fl->fl_owner = current->files;
390 fl->fl_pid = current->tgid; 386 fl->fl_pid = current->tgid;
391 fl->fl_file = filp; 387 fl->fl_file = filp;
@@ -396,52 +392,21 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
396 return assign_type(fl, l->l_type); 392 return assign_type(fl, l->l_type);
397} 393}
398 394
399#if BITS_PER_LONG == 32 395/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX
400static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, 396 * style lock.
401 struct flock64 *l) 397 */
398static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
399 struct flock *l)
402{ 400{
403 loff_t start; 401 struct flock64 ll = {
404 402 .l_type = l->l_type,
405 switch (l->l_whence) { 403 .l_whence = l->l_whence,
406 case SEEK_SET: 404 .l_start = l->l_start,
407 start = 0; 405 .l_len = l->l_len,
408 break; 406 };
409 case SEEK_CUR:
410 start = filp->f_pos;
411 break;
412 case SEEK_END:
413 start = i_size_read(file_inode(filp));
414 break;
415 default:
416 return -EINVAL;
417 }
418 407
419 start += l->l_start; 408 return flock64_to_posix_lock(filp, fl, &ll);
420 if (start < 0)
421 return -EINVAL;
422 fl->fl_end = OFFSET_MAX;
423 if (l->l_len > 0) {
424 fl->fl_end = start + l->l_len - 1;
425 } else if (l->l_len < 0) {
426 fl->fl_end = start - 1;
427 start += l->l_len;
428 if (start < 0)
429 return -EINVAL;
430 }
431 fl->fl_start = start; /* we record the absolute position */
432 if (fl->fl_end < fl->fl_start)
433 return -EOVERFLOW;
434
435 fl->fl_owner = current->files;
436 fl->fl_pid = current->tgid;
437 fl->fl_file = filp;
438 fl->fl_flags = FL_POSIX;
439 fl->fl_ops = NULL;
440 fl->fl_lmops = NULL;
441
442 return assign_type(fl, l->l_type);
443} 409}
444#endif
445 410
446/* default lease lock manager operations */ 411/* default lease lock manager operations */
447static void lease_break_callback(struct file_lock *fl) 412static void lease_break_callback(struct file_lock *fl)
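The widening conversion above means every struct flock now passes through the flock64 overflow checks. A minimal userspace sketch of what those checks govern (illustrative only, not part of this patch; the path is an assumption):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Lock from current EOF to the end of the file: the kernel computes
         * fl_start = i_size + l_start, and now rejects sums that would pass
         * OFFSET_MAX with EOVERFLOW instead of wrapping. */
        struct flock fl = {
                .l_type   = F_WRLCK,
                .l_whence = SEEK_END,
                .l_start  = 0,
                .l_len    = 0,          /* zero length: to end of file */
        };
        int fd = open("/tmp/lock-demo", O_RDWR | O_CREAT, 0600);

        if (fd < 0)
                return 1;
        if (fcntl(fd, F_SETLK, &fl) == -1)
                perror("F_SETLK");
        close(fd);
        return 0;
}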
@@ -511,8 +476,7 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
511} 476}
512 477
513/* Must be called with the i_lock held! */ 478/* Must be called with the i_lock held! */
514static inline void 479static void locks_insert_global_locks(struct file_lock *fl)
515locks_insert_global_locks(struct file_lock *fl)
516{ 480{
517 lg_local_lock(&file_lock_lglock); 481 lg_local_lock(&file_lock_lglock);
518 fl->fl_link_cpu = smp_processor_id(); 482 fl->fl_link_cpu = smp_processor_id();
@@ -521,8 +485,7 @@ locks_insert_global_locks(struct file_lock *fl)
521} 485}
522 486
523/* Must be called with the i_lock held! */ 487/* Must be called with the i_lock held! */
524static inline void 488static void locks_delete_global_locks(struct file_lock *fl)
525locks_delete_global_locks(struct file_lock *fl)
526{ 489{
527 /* 490 /*
528 * Avoid taking lock if already unhashed. This is safe since this check 491 * Avoid taking lock if already unhashed. This is safe since this check
@@ -544,14 +507,12 @@ posix_owner_key(struct file_lock *fl)
544 return (unsigned long)fl->fl_owner; 507 return (unsigned long)fl->fl_owner;
545} 508}
546 509
547static inline void 510static void locks_insert_global_blocked(struct file_lock *waiter)
548locks_insert_global_blocked(struct file_lock *waiter)
549{ 511{
550 hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter)); 512 hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
551} 513}
552 514
553static inline void 515static void locks_delete_global_blocked(struct file_lock *waiter)
554locks_delete_global_blocked(struct file_lock *waiter)
555{ 516{
556 hash_del(&waiter->fl_link); 517 hash_del(&waiter->fl_link);
557} 518}
@@ -581,7 +542,7 @@ static void locks_delete_block(struct file_lock *waiter)
581 * it seems like the reasonable thing to do. 542 * it seems like the reasonable thing to do.
582 * 543 *
583 * Must be called with both the i_lock and blocked_lock_lock held. The fl_block 544 * Must be called with both the i_lock and blocked_lock_lock held. The fl_block
584 * list itself is protected by the file_lock_list, but by ensuring that the 545 * list itself is protected by the blocked_lock_lock, but by ensuring that the
585 * i_lock is also held on insertions we can avoid taking the blocked_lock_lock 546 * i_lock is also held on insertions we can avoid taking the blocked_lock_lock
586 * in some cases when we see that the fl_block list is empty. 547 * in some cases when we see that the fl_block list is empty.
587 */ 548 */
@@ -591,7 +552,7 @@ static void __locks_insert_block(struct file_lock *blocker,
591 BUG_ON(!list_empty(&waiter->fl_block)); 552 BUG_ON(!list_empty(&waiter->fl_block));
592 waiter->fl_next = blocker; 553 waiter->fl_next = blocker;
593 list_add_tail(&waiter->fl_block, &blocker->fl_block); 554 list_add_tail(&waiter->fl_block, &blocker->fl_block);
594 if (IS_POSIX(blocker)) 555 if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))
595 locks_insert_global_blocked(waiter); 556 locks_insert_global_blocked(waiter);
596} 557}
597 558
@@ -652,15 +613,18 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
652 locks_insert_global_locks(fl); 613 locks_insert_global_locks(fl);
653} 614}
654 615
655/* 616/**
 656 * Delete a lock and then free it. 617 * locks_unlink_lock - unlink a lock from all lists
657 * Wake up processes that are blocked waiting for this lock, 618 * @thisfl_p: pointer that points to the fl_next field of the previous
658 * notify the FS that the lock has been cleared and 619 * inode->i_flock list entry
659 * finally free the lock. 620 *
621 * Unlink a lock from all lists and free the namespace reference, but don't
622 * free it yet. Wake up processes that are blocked waiting for this lock and
623 * notify the FS that the lock has been cleared.
660 * 624 *
661 * Must be called with the i_lock held! 625 * Must be called with the i_lock held!
662 */ 626 */
663static void locks_delete_lock(struct file_lock **thisfl_p) 627static void locks_unlink_lock(struct file_lock **thisfl_p)
664{ 628{
665 struct file_lock *fl = *thisfl_p; 629 struct file_lock *fl = *thisfl_p;
666 630
@@ -675,6 +639,18 @@ static void locks_delete_lock(struct file_lock **thisfl_p)
675 } 639 }
676 640
677 locks_wake_up_blocks(fl); 641 locks_wake_up_blocks(fl);
642}
643
644/*
645 * Unlink a lock from all lists and free it.
646 *
647 * Must be called with i_lock held!
648 */
649static void locks_delete_lock(struct file_lock **thisfl_p)
650{
651 struct file_lock *fl = *thisfl_p;
652
653 locks_unlink_lock(thisfl_p);
678 locks_free_lock(fl); 654 locks_free_lock(fl);
679} 655}
680 656
@@ -769,8 +745,16 @@ EXPORT_SYMBOL(posix_test_lock);
769 * Note: the above assumption may not be true when handling lock 745 * Note: the above assumption may not be true when handling lock
770 * requests from a broken NFS client. It may also fail in the presence 746 * requests from a broken NFS client. It may also fail in the presence
771 * of tasks (such as posix threads) sharing the same open file table. 747 * of tasks (such as posix threads) sharing the same open file table.
772 *
773 * To handle those cases, we just bail out after a few iterations. 748 * To handle those cases, we just bail out after a few iterations.
749 *
750 * For FL_OFDLCK locks, the owner is the filp, not the files_struct.
751 * Because the owner is not even nominally tied to a thread of
752 * execution, the deadlock detection below can't reasonably work well. Just
753 * skip it for those.
754 *
755 * In principle, we could do a more limited deadlock detection on FL_OFDLCK
756 * locks that just checks for the case where two tasks are attempting to
757 * upgrade from read to write locks on the same inode.
774 */ 758 */
775 759
776#define MAX_DEADLK_ITERATIONS 10 760#define MAX_DEADLK_ITERATIONS 10
@@ -793,6 +777,13 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
793{ 777{
794 int i = 0; 778 int i = 0;
795 779
780 /*
781 * This deadlock detector can't reasonably detect deadlocks with
 782 * FL_OFDLCK locks, since they aren't owned by a process, per se.
783 */
784 if (IS_OFDLCK(caller_fl))
785 return 0;
786
796 while ((block_fl = what_owner_is_waiting_for(block_fl))) { 787 while ((block_fl = what_owner_is_waiting_for(block_fl))) {
797 if (i++ > MAX_DEADLK_ITERATIONS) 788 if (i++ > MAX_DEADLK_ITERATIONS)
798 return 0; 789 return 0;
@@ -1152,13 +1143,14 @@ EXPORT_SYMBOL(posix_lock_file_wait);
1152 1143
1153/** 1144/**
1154 * locks_mandatory_locked - Check for an active lock 1145 * locks_mandatory_locked - Check for an active lock
1155 * @inode: the file to check 1146 * @file: the file to check
1156 * 1147 *
1157 * Searches the inode's list of locks to find any POSIX locks which conflict. 1148 * Searches the inode's list of locks to find any POSIX locks which conflict.
1158 * This function is called from locks_verify_locked() only. 1149 * This function is called from locks_verify_locked() only.
1159 */ 1150 */
1160int locks_mandatory_locked(struct inode *inode) 1151int locks_mandatory_locked(struct file *file)
1161{ 1152{
1153 struct inode *inode = file_inode(file);
1162 fl_owner_t owner = current->files; 1154 fl_owner_t owner = current->files;
1163 struct file_lock *fl; 1155 struct file_lock *fl;
1164 1156
@@ -1169,7 +1161,7 @@ int locks_mandatory_locked(struct inode *inode)
1169 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1161 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
1170 if (!IS_POSIX(fl)) 1162 if (!IS_POSIX(fl))
1171 continue; 1163 continue;
1172 if (fl->fl_owner != owner) 1164 if (fl->fl_owner != owner && fl->fl_owner != (fl_owner_t)file)
1173 break; 1165 break;
1174 } 1166 }
1175 spin_unlock(&inode->i_lock); 1167 spin_unlock(&inode->i_lock);
@@ -1195,19 +1187,30 @@ int locks_mandatory_area(int read_write, struct inode *inode,
1195{ 1187{
1196 struct file_lock fl; 1188 struct file_lock fl;
1197 int error; 1189 int error;
1190 bool sleep = false;
1198 1191
1199 locks_init_lock(&fl); 1192 locks_init_lock(&fl);
1200 fl.fl_owner = current->files;
1201 fl.fl_pid = current->tgid; 1193 fl.fl_pid = current->tgid;
1202 fl.fl_file = filp; 1194 fl.fl_file = filp;
1203 fl.fl_flags = FL_POSIX | FL_ACCESS; 1195 fl.fl_flags = FL_POSIX | FL_ACCESS;
1204 if (filp && !(filp->f_flags & O_NONBLOCK)) 1196 if (filp && !(filp->f_flags & O_NONBLOCK))
1205 fl.fl_flags |= FL_SLEEP; 1197 sleep = true;
1206 fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK; 1198 fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK;
1207 fl.fl_start = offset; 1199 fl.fl_start = offset;
1208 fl.fl_end = offset + count - 1; 1200 fl.fl_end = offset + count - 1;
1209 1201
1210 for (;;) { 1202 for (;;) {
1203 if (filp) {
1204 fl.fl_owner = (fl_owner_t)filp;
1205 fl.fl_flags &= ~FL_SLEEP;
1206 error = __posix_lock_file(inode, &fl, NULL);
1207 if (!error)
1208 break;
1209 }
1210
1211 if (sleep)
1212 fl.fl_flags |= FL_SLEEP;
1213 fl.fl_owner = current->files;
1211 error = __posix_lock_file(inode, &fl, NULL); 1214 error = __posix_lock_file(inode, &fl, NULL);
1212 if (error != FILE_LOCK_DEFERRED) 1215 if (error != FILE_LOCK_DEFERRED)
1213 break; 1216 break;
@@ -1376,11 +1379,10 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1376 1379
1377restart: 1380restart:
1378 break_time = flock->fl_break_time; 1381 break_time = flock->fl_break_time;
1379 if (break_time != 0) { 1382 if (break_time != 0)
1380 break_time -= jiffies; 1383 break_time -= jiffies;
1381 if (break_time == 0) 1384 if (break_time == 0)
1382 break_time++; 1385 break_time++;
1383 }
1384 locks_insert_block(flock, new_fl); 1386 locks_insert_block(flock, new_fl);
1385 spin_unlock(&inode->i_lock); 1387 spin_unlock(&inode->i_lock);
1386 error = wait_event_interruptible_timeout(new_fl->fl_wait, 1388 error = wait_event_interruptible_timeout(new_fl->fl_wait,
@@ -1472,6 +1474,32 @@ int fcntl_getlease(struct file *filp)
1472 return type; 1474 return type;
1473} 1475}
1474 1476
1477/**
1478 * check_conflicting_open - see if the given dentry points to a file that has
1479 * an existing open that would conflict with the
1480 * desired lease.
1481 * @dentry: dentry to check
1482 * @arg: type of lease that we're trying to acquire
1483 *
1484 * Check to see if there's an existing open fd on this file that would
1485 * conflict with the lease we're trying to set.
1486 */
1487static int
1488check_conflicting_open(const struct dentry *dentry, const long arg)
1489{
1490 int ret = 0;
1491 struct inode *inode = dentry->d_inode;
1492
1493 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1494 return -EAGAIN;
1495
1496 if ((arg == F_WRLCK) && ((d_count(dentry) > 1) ||
1497 (atomic_read(&inode->i_count) > 1)))
1498 ret = -EAGAIN;
1499
1500 return ret;
1501}
1502
1475static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) 1503static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
1476{ 1504{
1477 struct file_lock *fl, **before, **my_before = NULL, *lease; 1505 struct file_lock *fl, **before, **my_before = NULL, *lease;
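check_conflicting_open() is the test behind the familiar F_SETLEASE failure mode: a read lease is refused while any writable open exists, a write lease while any other reference to the dentry or inode exists. A hedged userspace illustration (the path is an assumption; F_SETLEASE needs _GNU_SOURCE):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd  = open("/tmp/lease-demo", O_RDONLY | O_CREAT, 0600);
        int fd2 = open("/tmp/lease-demo", O_RDWR);      /* conflicting open */

        if (fd < 0 || fd2 < 0)
                return 1;
        /* i_writecount > 0, so check_conflicting_open() yields -EAGAIN. */
        if (fcntl(fd, F_SETLEASE, F_RDLCK) == -1)
                perror("F_SETLEASE");
        close(fd2);
        close(fd);
        return 0;
}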
@@ -1499,12 +1527,8 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
1499 return -EINVAL; 1527 return -EINVAL;
1500 } 1528 }
1501 1529
1502 error = -EAGAIN; 1530 error = check_conflicting_open(dentry, arg);
1503 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1531 if (error)
1504 goto out;
1505 if ((arg == F_WRLCK)
1506 && ((d_count(dentry) > 1)
1507 || (atomic_read(&inode->i_count) > 1)))
1508 goto out; 1532 goto out;
1509 1533
1510 /* 1534 /*
@@ -1549,7 +1573,19 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
1549 goto out; 1573 goto out;
1550 1574
1551 locks_insert_lock(before, lease); 1575 locks_insert_lock(before, lease);
1552 error = 0; 1576 /*
1577 * The check in break_lease() is lockless. It's possible for another
1578 * open to race in after we did the earlier check for a conflicting
1579 * open but before the lease was inserted. Check again for a
1580 * conflicting open and cancel the lease if there is one.
1581 *
1582 * We also add a barrier here to ensure that the insertion of the lock
1583 * precedes these checks.
1584 */
1585 smp_mb();
1586 error = check_conflicting_open(dentry, arg);
1587 if (error)
1588 locks_unlink_lock(flp);
1553out: 1589out:
1554 if (is_deleg) 1590 if (is_deleg)
1555 mutex_unlock(&inode->i_mutex); 1591 mutex_unlock(&inode->i_mutex);
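The shape of that fix is a generic publish-then-recheck idiom: make the lease visible, issue a full barrier so the insertion is ordered before the second check, and roll back on conflict. A compilable sketch of the idiom (a userspace analogue; the helper names are hypothetical and __sync_synchronize() stands in for smp_mb()):

#include <stdbool.h>

/* Hypothetical stand-ins for the VFS operations. */
extern void insert_lease(void);
extern void unlink_lease(void);
extern bool conflicting_open(void);

static int add_lease_checked(void)
{
        insert_lease();                 /* publish: lockless readers can now see it */
        __sync_synchronize();           /* order the insertion before the recheck */
        if (conflicting_open()) {       /* an open may have raced in between */
                unlink_lease();         /* roll back, as generic_add_lease() does */
                return -1;              /* the kernel returns -EAGAIN here */
        }
        return 0;
}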
@@ -1842,7 +1878,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock);
1842 1878
1843static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) 1879static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
1844{ 1880{
1845 flock->l_pid = fl->fl_pid; 1881 flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid;
1846#if BITS_PER_LONG == 32 1882#if BITS_PER_LONG == 32
1847 /* 1883 /*
1848 * Make sure we can represent the posix lock via 1884 * Make sure we can represent the posix lock via
@@ -1864,7 +1900,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
1864#if BITS_PER_LONG == 32 1900#if BITS_PER_LONG == 32
1865static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) 1901static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
1866{ 1902{
1867 flock->l_pid = fl->fl_pid; 1903 flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid;
1868 flock->l_start = fl->fl_start; 1904 flock->l_start = fl->fl_start;
1869 flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : 1905 flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
1870 fl->fl_end - fl->fl_start + 1; 1906 fl->fl_end - fl->fl_start + 1;
@@ -1876,7 +1912,7 @@ static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
1876/* Report the first existing lock that would conflict with l. 1912/* Report the first existing lock that would conflict with l.
1877 * This implements the F_GETLK command of fcntl(). 1913 * This implements the F_GETLK command of fcntl().
1878 */ 1914 */
1879int fcntl_getlk(struct file *filp, struct flock __user *l) 1915int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l)
1880{ 1916{
1881 struct file_lock file_lock; 1917 struct file_lock file_lock;
1882 struct flock flock; 1918 struct flock flock;
@@ -1893,6 +1929,16 @@ int fcntl_getlk(struct file *filp, struct flock __user *l)
1893 if (error) 1929 if (error)
1894 goto out; 1930 goto out;
1895 1931
1932 if (cmd == F_OFD_GETLK) {
1933 error = -EINVAL;
1934 if (flock.l_pid != 0)
1935 goto out;
1936
1937 cmd = F_GETLK;
1938 file_lock.fl_flags |= FL_OFDLCK;
1939 file_lock.fl_owner = (fl_owner_t)filp;
1940 }
1941
1896 error = vfs_test_lock(filp, &file_lock); 1942 error = vfs_test_lock(filp, &file_lock);
1897 if (error) 1943 if (error)
1898 goto out; 1944 goto out;
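From userspace, the new command is reached through fcntl() with l_pid zeroed on input, as the EINVAL check above enforces; on a conflict with an OFD lock, l_pid comes back as -1 per the posix_lock_to_flock() change. A hedged sketch (assumes a libc that defines F_OFD_GETLK, i.e. glibc 2.20+ with _GNU_SOURCE):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>

int probe_ofd_lock(int fd)
{
        struct flock fl = {
                .l_type   = F_WRLCK,
                .l_whence = SEEK_SET,
                .l_start  = 0,
                .l_len    = 0,
                .l_pid    = 0,          /* anything else draws EINVAL */
        };

        if (fcntl(fd, F_OFD_GETLK, &fl) == -1) {
                perror("F_OFD_GETLK");
                return -1;
        }
        /* F_UNLCK means no conflict; a conflicting OFD lock reports l_pid == -1. */
        printf("l_type=%d l_pid=%d\n", fl.l_type, (int)fl.l_pid);
        return 0;
}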
@@ -1976,6 +2022,22 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
1976 return error; 2022 return error;
1977} 2023}
1978 2024
 2025/* Ensure that fl->fl_file has a compatible f_mode for F_SETLK calls */
2026static int
2027check_fmode_for_setlk(struct file_lock *fl)
2028{
2029 switch (fl->fl_type) {
2030 case F_RDLCK:
2031 if (!(fl->fl_file->f_mode & FMODE_READ))
2032 return -EBADF;
2033 break;
2034 case F_WRLCK:
2035 if (!(fl->fl_file->f_mode & FMODE_WRITE))
2036 return -EBADF;
2037 }
2038 return 0;
2039}
2040
1979/* Apply the lock described by l to an open file descriptor. 2041/* Apply the lock described by l to an open file descriptor.
1980 * This implements both the F_SETLK and F_SETLKW commands of fcntl(). 2042 * This implements both the F_SETLK and F_SETLKW commands of fcntl().
1981 */ 2043 */
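Factoring the mode check into check_fmode_for_setlk() keeps the classic behavior: requesting a lock type the open mode cannot back fails with EBADF before any owner handling. A small userspace illustration (the path is an assumption):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/etc/hostname", O_RDONLY);
        struct flock fl = {
                .l_type   = F_WRLCK,    /* needs FMODE_WRITE */
                .l_whence = SEEK_SET,
        };

        if (fd < 0)
                return 1;
        if (fcntl(fd, F_SETLK, &fl) == -1)
                perror("F_SETLK");      /* expected: Bad file descriptor */
        close(fd);
        return 0;
}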
@@ -2012,25 +2074,36 @@ again:
2012 error = flock_to_posix_lock(filp, file_lock, &flock); 2074 error = flock_to_posix_lock(filp, file_lock, &flock);
2013 if (error) 2075 if (error)
2014 goto out; 2076 goto out;
2015 if (cmd == F_SETLKW) { 2077
2016 file_lock->fl_flags |= FL_SLEEP; 2078 error = check_fmode_for_setlk(file_lock);
2017 } 2079 if (error)
2018 2080 goto out;
2019 error = -EBADF; 2081
2020 switch (flock.l_type) { 2082 /*
2021 case F_RDLCK: 2083 * If the cmd is requesting file-private locks, then set the
2022 if (!(filp->f_mode & FMODE_READ)) 2084 * FL_OFDLCK flag and override the owner.
2023 goto out; 2085 */
2024 break; 2086 switch (cmd) {
2025 case F_WRLCK: 2087 case F_OFD_SETLK:
2026 if (!(filp->f_mode & FMODE_WRITE)) 2088 error = -EINVAL;
2089 if (flock.l_pid != 0)
2027 goto out; 2090 goto out;
2091
2092 cmd = F_SETLK;
2093 file_lock->fl_flags |= FL_OFDLCK;
2094 file_lock->fl_owner = (fl_owner_t)filp;
2028 break; 2095 break;
2029 case F_UNLCK: 2096 case F_OFD_SETLKW:
2030 break;
2031 default:
2032 error = -EINVAL; 2097 error = -EINVAL;
2033 goto out; 2098 if (flock.l_pid != 0)
2099 goto out;
2100
2101 cmd = F_SETLKW;
2102 file_lock->fl_flags |= FL_OFDLCK;
2103 file_lock->fl_owner = (fl_owner_t)filp;
2104 /* Fallthrough */
2105 case F_SETLKW:
2106 file_lock->fl_flags |= FL_SLEEP;
2034 } 2107 }
2035 2108
2036 error = do_lock_file_wait(filp, cmd, file_lock); 2109 error = do_lock_file_wait(filp, cmd, file_lock);
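The owner override is what gives these locks their semantics: ownership follows the open file description (the filp), so locks taken through different descriptions of the same file conflict even within one process, and a lock persists until the last descriptor referencing its description is closed. A hedged sketch (glibc 2.20+ with _GNU_SOURCE assumed; the path is illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd1 = open("/tmp/ofd-demo", O_RDWR | O_CREAT, 0600);
        int fd2 = open("/tmp/ofd-demo", O_RDWR);        /* second description */
        struct flock fl = {
                .l_type = F_WRLCK, .l_whence = SEEK_SET, .l_pid = 0,
        };

        if (fd1 < 0 || fd2 < 0)
                return 1;
        fcntl(fd1, F_OFD_SETLK, &fl);   /* owner is fd1's file description */
        if (fcntl(fd2, F_OFD_SETLK, &fl) == -1)
                perror("fd2");          /* different owner: conflicts, same process */
        close(fd2);
        close(fd1);
        return 0;
}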
@@ -2061,7 +2134,7 @@ out:
2061/* Report the first existing lock that would conflict with l. 2134/* Report the first existing lock that would conflict with l.
2062 * This implements the F_GETLK command of fcntl(). 2135 * This implements the F_GETLK command of fcntl().
2063 */ 2136 */
2064int fcntl_getlk64(struct file *filp, struct flock64 __user *l) 2137int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
2065{ 2138{
2066 struct file_lock file_lock; 2139 struct file_lock file_lock;
2067 struct flock64 flock; 2140 struct flock64 flock;
@@ -2078,6 +2151,16 @@ int fcntl_getlk64(struct file *filp, struct flock64 __user *l)
2078 if (error) 2151 if (error)
2079 goto out; 2152 goto out;
2080 2153
2154 if (cmd == F_OFD_GETLK) {
2155 error = -EINVAL;
2156 if (flock.l_pid != 0)
2157 goto out;
2158
2159 cmd = F_GETLK64;
2160 file_lock.fl_flags |= FL_OFDLCK;
2161 file_lock.fl_owner = (fl_owner_t)filp;
2162 }
2163
2081 error = vfs_test_lock(filp, &file_lock); 2164 error = vfs_test_lock(filp, &file_lock);
2082 if (error) 2165 if (error)
2083 goto out; 2166 goto out;
@@ -2130,25 +2213,36 @@ again:
2130 error = flock64_to_posix_lock(filp, file_lock, &flock); 2213 error = flock64_to_posix_lock(filp, file_lock, &flock);
2131 if (error) 2214 if (error)
2132 goto out; 2215 goto out;
2133 if (cmd == F_SETLKW64) { 2216
2134 file_lock->fl_flags |= FL_SLEEP; 2217 error = check_fmode_for_setlk(file_lock);
2135 } 2218 if (error)
2136 2219 goto out;
2137 error = -EBADF; 2220
2138 switch (flock.l_type) { 2221 /*
2139 case F_RDLCK: 2222 * If the cmd is requesting file-private locks, then set the
2140 if (!(filp->f_mode & FMODE_READ)) 2223 * FL_OFDLCK flag and override the owner.
2141 goto out; 2224 */
2142 break; 2225 switch (cmd) {
2143 case F_WRLCK: 2226 case F_OFD_SETLK:
2144 if (!(filp->f_mode & FMODE_WRITE)) 2227 error = -EINVAL;
2228 if (flock.l_pid != 0)
2145 goto out; 2229 goto out;
2230
2231 cmd = F_SETLK64;
2232 file_lock->fl_flags |= FL_OFDLCK;
2233 file_lock->fl_owner = (fl_owner_t)filp;
2146 break; 2234 break;
2147 case F_UNLCK: 2235 case F_OFD_SETLKW:
2148 break;
2149 default:
2150 error = -EINVAL; 2236 error = -EINVAL;
2151 goto out; 2237 if (flock.l_pid != 0)
2238 goto out;
2239
2240 cmd = F_SETLKW64;
2241 file_lock->fl_flags |= FL_OFDLCK;
2242 file_lock->fl_owner = (fl_owner_t)filp;
2243 /* Fallthrough */
2244 case F_SETLKW64:
2245 file_lock->fl_flags |= FL_SLEEP;
2152 } 2246 }
2153 2247
2154 error = do_lock_file_wait(filp, cmd, file_lock); 2248 error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2209,7 +2303,7 @@ EXPORT_SYMBOL(locks_remove_posix);
2209/* 2303/*
2210 * This function is called on the last close of an open file. 2304 * This function is called on the last close of an open file.
2211 */ 2305 */
2212void locks_remove_flock(struct file *filp) 2306void locks_remove_file(struct file *filp)
2213{ 2307{
2214 struct inode * inode = file_inode(filp); 2308 struct inode * inode = file_inode(filp);
2215 struct file_lock *fl; 2309 struct file_lock *fl;
@@ -2218,6 +2312,8 @@ void locks_remove_flock(struct file *filp)
2218 if (!inode->i_flock) 2312 if (!inode->i_flock)
2219 return; 2313 return;
2220 2314
2315 locks_remove_posix(filp, (fl_owner_t)filp);
2316
2221 if (filp->f_op->flock) { 2317 if (filp->f_op->flock) {
2222 struct file_lock fl = { 2318 struct file_lock fl = {
2223 .fl_pid = current->tgid, 2319 .fl_pid = current->tgid,
@@ -2236,16 +2332,28 @@ void locks_remove_flock(struct file *filp)
2236 2332
2237 while ((fl = *before) != NULL) { 2333 while ((fl = *before) != NULL) {
2238 if (fl->fl_file == filp) { 2334 if (fl->fl_file == filp) {
2239 if (IS_FLOCK(fl)) {
2240 locks_delete_lock(before);
2241 continue;
2242 }
2243 if (IS_LEASE(fl)) { 2335 if (IS_LEASE(fl)) {
2244 lease_modify(before, F_UNLCK); 2336 lease_modify(before, F_UNLCK);
2245 continue; 2337 continue;
2246 } 2338 }
2247 /* What? */ 2339
2248 BUG(); 2340 /*
2341 * There's a leftover lock on the list of a type that
2342 * we didn't expect to see. Most likely a classic
2343 * POSIX lock that ended up not getting released
2344 * properly, or that raced onto the list somehow. Log
2345 * some info about it and then just remove it from
2346 * the list.
2347 */
2348 WARN(!IS_FLOCK(fl),
2349 "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n",
2350 MAJOR(inode->i_sb->s_dev),
2351 MINOR(inode->i_sb->s_dev), inode->i_ino,
2352 fl->fl_type, fl->fl_flags,
2353 fl->fl_start, fl->fl_end);
2354
2355 locks_delete_lock(before);
2356 continue;
2249 } 2357 }
2250 before = &fl->fl_next; 2358 before = &fl->fl_next;
2251 } 2359 }
@@ -2314,8 +2422,14 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2314 2422
2315 seq_printf(f, "%lld:%s ", id, pfx); 2423 seq_printf(f, "%lld:%s ", id, pfx);
2316 if (IS_POSIX(fl)) { 2424 if (IS_POSIX(fl)) {
2317 seq_printf(f, "%6s %s ", 2425 if (fl->fl_flags & FL_ACCESS)
2318 (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ", 2426 seq_printf(f, "ACCESS");
2427 else if (IS_OFDLCK(fl))
2428 seq_printf(f, "OFDLCK");
2429 else
2430 seq_printf(f, "POSIX ");
2431
2432 seq_printf(f, " %s ",
2319 (inode == NULL) ? "*NOINODE*" : 2433 (inode == NULL) ? "*NOINODE*" :
2320 mandatory_lock(inode) ? "MANDATORY" : "ADVISORY "); 2434 mandatory_lock(inode) ? "MANDATORY" : "ADVISORY ");
2321 } else if (IS_FLOCK(fl)) { 2435 } else if (IS_FLOCK(fl)) {
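With this change an OFD lock shows up in /proc/locks under its own tag. An illustrative line (field values are made up; only the OFDLCK tag is what this hunk adds):

1: OFDLCK ADVISORY  WRITE 1193 08:02:1048578 0 EOF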
@@ -2385,6 +2499,7 @@ static int locks_show(struct seq_file *f, void *v)
2385} 2499}
2386 2500
2387static void *locks_start(struct seq_file *f, loff_t *pos) 2501static void *locks_start(struct seq_file *f, loff_t *pos)
2502 __acquires(&blocked_lock_lock)
2388{ 2503{
2389 struct locks_iterator *iter = f->private; 2504 struct locks_iterator *iter = f->private;
2390 2505
@@ -2403,6 +2518,7 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
2403} 2518}
2404 2519
2405static void locks_stop(struct seq_file *f, void *v) 2520static void locks_stop(struct seq_file *f, void *v)
2521 __releases(&blocked_lock_lock)
2406{ 2522{
2407 spin_unlock(&blocked_lock_lock); 2523 spin_unlock(&blocked_lock_lock);
2408 lg_global_unlock(&file_lock_lglock); 2524 lg_global_unlock(&file_lock_lglock);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 9a59cbade2fb..48140315f627 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -2180,7 +2180,7 @@ void logfs_evict_inode(struct inode *inode)
2180 do_delete_inode(inode); 2180 do_delete_inode(inode);
2181 } 2181 }
2182 } 2182 }
2183 truncate_inode_pages(&inode->i_data, 0); 2183 truncate_inode_pages_final(&inode->i_data);
2184 clear_inode(inode); 2184 clear_inode(inode);
2185 2185
2186 /* Cheaper version of write_inode. All changes are concealed in 2186 /* Cheaper version of write_inode. All changes are concealed in
diff --git a/fs/mbcache.c b/fs/mbcache.c
index e519e45bf673..bf166e388f0d 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -26,6 +26,41 @@
26 * back on the lru list. 26 * back on the lru list.
27 */ 27 */
28 28
29/*
30 * Lock descriptions and usage:
31 *
32 * Each hash chain of both the block and index hash tables now contains
33 * a built-in lock used to serialize accesses to the hash chain.
34 *
35 * Accesses to global data structures mb_cache_list and mb_cache_lru_list
36 * are serialized via the global spinlock mb_cache_spinlock.
37 *
38 * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize
39 * accesses to its local data, such as e_used and e_queued.
40 *
41 * Lock ordering:
42 *
43 * Each block hash chain's lock has the highest lock order, followed by an
44 * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's
 45 * lock), and mb_cache_spinlock, with the lowest order. While holding
46 * either a block or index hash chain lock, a thread can acquire an
 47 * mb_cache_bg_lock, which in turn can also acquire mb_cache_spinlock.
48 *
49 * Synchronization:
50 *
51 * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and
 52 * index hash chain, they need to lock the corresponding hash chain. For each
 53 * mb_cache_entry within the chain, they need to lock the mb_cache_entry to
54 * prevent either any simultaneous release or free on the entry and also
55 * to serialize accesses to either the e_used or e_queued member of the entry.
56 *
57 * To avoid having a dangling reference to an already freed
58 * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
 59 * block hash chain and also no longer being referenced, with both e_used
 60 * and e_queued equal to 0. When an mb_cache_entry is explicitly freed it is
61 * first removed from a block hash chain.
62 */
63
29#include <linux/kernel.h> 64#include <linux/kernel.h>
30#include <linux/module.h> 65#include <linux/module.h>
31 66
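Expressed as code, the permitted nesting reads as follows (a sketch of lock order only, using the helpers this patch introduces; no real call site takes all four at once):

hlist_bl_lock(ce->e_block_hash_p);      /* block hash chain: highest order */
hlist_bl_lock(ce->e_index_hash_p);      /* then the index hash chain */
__spin_lock_mb_cache_entry(ce);         /* then mb_cache_bg_lock, via the entry */
spin_lock(&mb_cache_spinlock);          /* global lists: lowest order */
/* ... */
spin_unlock(&mb_cache_spinlock);
__spin_unlock_mb_cache_entry(ce);
hlist_bl_unlock(ce->e_index_hash_p);
hlist_bl_unlock(ce->e_block_hash_p);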
@@ -34,9 +69,10 @@
34#include <linux/mm.h> 69#include <linux/mm.h>
35#include <linux/slab.h> 70#include <linux/slab.h>
36#include <linux/sched.h> 71#include <linux/sched.h>
37#include <linux/init.h> 72#include <linux/list_bl.h>
38#include <linux/mbcache.h> 73#include <linux/mbcache.h>
39 74#include <linux/init.h>
75#include <linux/blockgroup_lock.h>
40 76
41#ifdef MB_CACHE_DEBUG 77#ifdef MB_CACHE_DEBUG
42# define mb_debug(f...) do { \ 78# define mb_debug(f...) do { \
@@ -57,8 +93,14 @@
57 93
58#define MB_CACHE_WRITER ((unsigned short)~0U >> 1) 94#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
59 95
96#define MB_CACHE_ENTRY_LOCK_BITS __builtin_log2(NR_BG_LOCKS)
97#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \
98 (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
99
60static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); 100static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
61 101static struct blockgroup_lock *mb_cache_bg_lock;
102static struct kmem_cache *mb_cache_kmem_cache;
103
62MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>"); 104MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
63MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); 105MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
64MODULE_LICENSE("GPL"); 106MODULE_LICENSE("GPL");
@@ -86,58 +128,110 @@ static LIST_HEAD(mb_cache_list);
86static LIST_HEAD(mb_cache_lru_list); 128static LIST_HEAD(mb_cache_lru_list);
87static DEFINE_SPINLOCK(mb_cache_spinlock); 129static DEFINE_SPINLOCK(mb_cache_spinlock);
88 130
131static inline void
132__spin_lock_mb_cache_entry(struct mb_cache_entry *ce)
133{
134 spin_lock(bgl_lock_ptr(mb_cache_bg_lock,
135 MB_CACHE_ENTRY_LOCK_INDEX(ce)));
136}
137
138static inline void
139__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce)
140{
141 spin_unlock(bgl_lock_ptr(mb_cache_bg_lock,
142 MB_CACHE_ENTRY_LOCK_INDEX(ce)));
143}
144
89static inline int 145static inline int
90__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) 146__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
91{ 147{
92 return !list_empty(&ce->e_block_list); 148 return !hlist_bl_unhashed(&ce->e_block_list);
93} 149}
94 150
95 151
96static void 152static inline void
97__mb_cache_entry_unhash(struct mb_cache_entry *ce) 153__mb_cache_entry_unhash_block(struct mb_cache_entry *ce)
98{ 154{
99 if (__mb_cache_entry_is_hashed(ce)) { 155 if (__mb_cache_entry_is_block_hashed(ce))
100 list_del_init(&ce->e_block_list); 156 hlist_bl_del_init(&ce->e_block_list);
101 list_del(&ce->e_index.o_list);
102 }
103} 157}
104 158
159static inline int
160__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce)
161{
162 return !hlist_bl_unhashed(&ce->e_index.o_list);
163}
164
165static inline void
166__mb_cache_entry_unhash_index(struct mb_cache_entry *ce)
167{
168 if (__mb_cache_entry_is_index_hashed(ce))
169 hlist_bl_del_init(&ce->e_index.o_list);
170}
171
172/*
173 * __mb_cache_entry_unhash_unlock()
174 *
 175 * This function is called to unhash the entry from both the block and
 176 * index hash chains.
 177 * It assumes both hash chains are locked upon entry.
 178 * It also unlocks both hash chains upon exit.
179 */
180static inline void
181__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce)
182{
183 __mb_cache_entry_unhash_index(ce);
184 hlist_bl_unlock(ce->e_index_hash_p);
185 __mb_cache_entry_unhash_block(ce);
186 hlist_bl_unlock(ce->e_block_hash_p);
187}
105 188
106static void 189static void
107__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) 190__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
108{ 191{
109 struct mb_cache *cache = ce->e_cache; 192 struct mb_cache *cache = ce->e_cache;
110 193
111 mb_assert(!(ce->e_used || ce->e_queued)); 194 mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
112 kmem_cache_free(cache->c_entry_cache, ce); 195 kmem_cache_free(cache->c_entry_cache, ce);
113 atomic_dec(&cache->c_entry_count); 196 atomic_dec(&cache->c_entry_count);
114} 197}
115 198
116
117static void 199static void
118__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) 200__mb_cache_entry_release(struct mb_cache_entry *ce)
119 __releases(mb_cache_spinlock)
120{ 201{
202 /* First lock the entry to serialize access to its local data. */
203 __spin_lock_mb_cache_entry(ce);
121 /* Wake up all processes queuing for this cache entry. */ 204 /* Wake up all processes queuing for this cache entry. */
122 if (ce->e_queued) 205 if (ce->e_queued)
123 wake_up_all(&mb_cache_queue); 206 wake_up_all(&mb_cache_queue);
124 if (ce->e_used >= MB_CACHE_WRITER) 207 if (ce->e_used >= MB_CACHE_WRITER)
125 ce->e_used -= MB_CACHE_WRITER; 208 ce->e_used -= MB_CACHE_WRITER;
209 /*
210 * Make sure that all cache entries on lru_list have
 211 * both e_used and e_queued equal to 0.
212 */
126 ce->e_used--; 213 ce->e_used--;
127 if (!(ce->e_used || ce->e_queued)) { 214 if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) {
128 if (!__mb_cache_entry_is_hashed(ce)) 215 if (!__mb_cache_entry_is_block_hashed(ce)) {
216 __spin_unlock_mb_cache_entry(ce);
129 goto forget; 217 goto forget;
130 mb_assert(list_empty(&ce->e_lru_list)); 218 }
131 list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); 219 /*
 220 * Need access to the lru list; per the ordering above,
 221 * mb_cache_spinlock nests inside the entry's bg lock.
222 */
223 spin_lock(&mb_cache_spinlock);
224 if (list_empty(&ce->e_lru_list))
225 list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
226 spin_unlock(&mb_cache_spinlock);
132 } 227 }
133 spin_unlock(&mb_cache_spinlock); 228 __spin_unlock_mb_cache_entry(ce);
134 return; 229 return;
135forget: 230forget:
136 spin_unlock(&mb_cache_spinlock); 231 mb_assert(list_empty(&ce->e_lru_list));
137 __mb_cache_entry_forget(ce, GFP_KERNEL); 232 __mb_cache_entry_forget(ce, GFP_KERNEL);
138} 233}
139 234
140
141/* 235/*
142 * mb_cache_shrink_scan() memory pressure callback 236 * mb_cache_shrink_scan() memory pressure callback
143 * 237 *
@@ -160,17 +254,34 @@ mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
160 254
161 mb_debug("trying to free %d entries", nr_to_scan); 255 mb_debug("trying to free %d entries", nr_to_scan);
162 spin_lock(&mb_cache_spinlock); 256 spin_lock(&mb_cache_spinlock);
163 while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) { 257 while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
164 struct mb_cache_entry *ce = 258 struct mb_cache_entry *ce =
165 list_entry(mb_cache_lru_list.next, 259 list_entry(mb_cache_lru_list.next,
166 struct mb_cache_entry, e_lru_list); 260 struct mb_cache_entry, e_lru_list);
167 list_move_tail(&ce->e_lru_list, &free_list); 261 list_del_init(&ce->e_lru_list);
168 __mb_cache_entry_unhash(ce); 262 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
169 freed++; 263 continue;
264 spin_unlock(&mb_cache_spinlock);
265 /* Prevent any find or get operation on the entry */
266 hlist_bl_lock(ce->e_block_hash_p);
267 hlist_bl_lock(ce->e_index_hash_p);
268 /* Ignore if it is touched by a find/get */
269 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
270 !list_empty(&ce->e_lru_list)) {
271 hlist_bl_unlock(ce->e_index_hash_p);
272 hlist_bl_unlock(ce->e_block_hash_p);
273 spin_lock(&mb_cache_spinlock);
274 continue;
275 }
276 __mb_cache_entry_unhash_unlock(ce);
277 list_add_tail(&ce->e_lru_list, &free_list);
278 spin_lock(&mb_cache_spinlock);
170 } 279 }
171 spin_unlock(&mb_cache_spinlock); 280 spin_unlock(&mb_cache_spinlock);
281
172 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { 282 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
173 __mb_cache_entry_forget(entry, gfp_mask); 283 __mb_cache_entry_forget(entry, gfp_mask);
284 freed++;
174 } 285 }
175 return freed; 286 return freed;
176} 287}
@@ -215,29 +326,40 @@ mb_cache_create(const char *name, int bucket_bits)
215 int n, bucket_count = 1 << bucket_bits; 326 int n, bucket_count = 1 << bucket_bits;
216 struct mb_cache *cache = NULL; 327 struct mb_cache *cache = NULL;
217 328
329 if (!mb_cache_bg_lock) {
330 mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock),
331 GFP_KERNEL);
332 if (!mb_cache_bg_lock)
333 return NULL;
334 bgl_lock_init(mb_cache_bg_lock);
335 }
336
218 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); 337 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
219 if (!cache) 338 if (!cache)
220 return NULL; 339 return NULL;
221 cache->c_name = name; 340 cache->c_name = name;
222 atomic_set(&cache->c_entry_count, 0); 341 atomic_set(&cache->c_entry_count, 0);
223 cache->c_bucket_bits = bucket_bits; 342 cache->c_bucket_bits = bucket_bits;
224 cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), 343 cache->c_block_hash = kmalloc(bucket_count *
225 GFP_KERNEL); 344 sizeof(struct hlist_bl_head), GFP_KERNEL);
226 if (!cache->c_block_hash) 345 if (!cache->c_block_hash)
227 goto fail; 346 goto fail;
228 for (n=0; n<bucket_count; n++) 347 for (n=0; n<bucket_count; n++)
229 INIT_LIST_HEAD(&cache->c_block_hash[n]); 348 INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
230 cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head), 349 cache->c_index_hash = kmalloc(bucket_count *
231 GFP_KERNEL); 350 sizeof(struct hlist_bl_head), GFP_KERNEL);
232 if (!cache->c_index_hash) 351 if (!cache->c_index_hash)
233 goto fail; 352 goto fail;
234 for (n=0; n<bucket_count; n++) 353 for (n=0; n<bucket_count; n++)
235 INIT_LIST_HEAD(&cache->c_index_hash[n]); 354 INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
236 cache->c_entry_cache = kmem_cache_create(name, 355 if (!mb_cache_kmem_cache) {
237 sizeof(struct mb_cache_entry), 0, 356 mb_cache_kmem_cache = kmem_cache_create(name,
238 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); 357 sizeof(struct mb_cache_entry), 0,
239 if (!cache->c_entry_cache) 358 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
240 goto fail2; 359 if (!mb_cache_kmem_cache)
360 goto fail2;
361 }
362 cache->c_entry_cache = mb_cache_kmem_cache;
241 363
242 /* 364 /*
243 * Set an upper limit on the number of cache entries so that the hash 365 * Set an upper limit on the number of cache entries so that the hash
@@ -273,21 +395,47 @@ void
273mb_cache_shrink(struct block_device *bdev) 395mb_cache_shrink(struct block_device *bdev)
274{ 396{
275 LIST_HEAD(free_list); 397 LIST_HEAD(free_list);
276 struct list_head *l, *ltmp; 398 struct list_head *l;
399 struct mb_cache_entry *ce, *tmp;
277 400
401 l = &mb_cache_lru_list;
278 spin_lock(&mb_cache_spinlock); 402 spin_lock(&mb_cache_spinlock);
279 list_for_each_safe(l, ltmp, &mb_cache_lru_list) { 403 while (!list_is_last(l, &mb_cache_lru_list)) {
280 struct mb_cache_entry *ce = 404 l = l->next;
281 list_entry(l, struct mb_cache_entry, e_lru_list); 405 ce = list_entry(l, struct mb_cache_entry, e_lru_list);
282 if (ce->e_bdev == bdev) { 406 if (ce->e_bdev == bdev) {
283 list_move_tail(&ce->e_lru_list, &free_list); 407 list_del_init(&ce->e_lru_list);
284 __mb_cache_entry_unhash(ce); 408 if (ce->e_used || ce->e_queued ||
409 atomic_read(&ce->e_refcnt))
410 continue;
411 spin_unlock(&mb_cache_spinlock);
412 /*
413 * Prevent any find or get operation on the entry.
414 */
415 hlist_bl_lock(ce->e_block_hash_p);
416 hlist_bl_lock(ce->e_index_hash_p);
417 /* Ignore if it is touched by a find/get */
418 if (ce->e_used || ce->e_queued ||
419 atomic_read(&ce->e_refcnt) ||
420 !list_empty(&ce->e_lru_list)) {
421 hlist_bl_unlock(ce->e_index_hash_p);
422 hlist_bl_unlock(ce->e_block_hash_p);
423 l = &mb_cache_lru_list;
424 spin_lock(&mb_cache_spinlock);
425 continue;
426 }
427 __mb_cache_entry_unhash_unlock(ce);
428 mb_assert(!(ce->e_used || ce->e_queued ||
429 atomic_read(&ce->e_refcnt)));
430 list_add_tail(&ce->e_lru_list, &free_list);
431 l = &mb_cache_lru_list;
432 spin_lock(&mb_cache_spinlock);
285 } 433 }
286 } 434 }
287 spin_unlock(&mb_cache_spinlock); 435 spin_unlock(&mb_cache_spinlock);
288 list_for_each_safe(l, ltmp, &free_list) { 436
289 __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, 437 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
290 e_lru_list), GFP_KERNEL); 438 __mb_cache_entry_forget(ce, GFP_KERNEL);
291 } 439 }
292} 440}
293 441
@@ -303,23 +451,27 @@ void
303mb_cache_destroy(struct mb_cache *cache) 451mb_cache_destroy(struct mb_cache *cache)
304{ 452{
305 LIST_HEAD(free_list); 453 LIST_HEAD(free_list);
306 struct list_head *l, *ltmp; 454 struct mb_cache_entry *ce, *tmp;
307 455
308 spin_lock(&mb_cache_spinlock); 456 spin_lock(&mb_cache_spinlock);
309 list_for_each_safe(l, ltmp, &mb_cache_lru_list) { 457 list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
310 struct mb_cache_entry *ce = 458 if (ce->e_cache == cache)
311 list_entry(l, struct mb_cache_entry, e_lru_list);
312 if (ce->e_cache == cache) {
313 list_move_tail(&ce->e_lru_list, &free_list); 459 list_move_tail(&ce->e_lru_list, &free_list);
314 __mb_cache_entry_unhash(ce);
315 }
316 } 460 }
317 list_del(&cache->c_cache_list); 461 list_del(&cache->c_cache_list);
318 spin_unlock(&mb_cache_spinlock); 462 spin_unlock(&mb_cache_spinlock);
319 463
320 list_for_each_safe(l, ltmp, &free_list) { 464 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
321 __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, 465 list_del_init(&ce->e_lru_list);
322 e_lru_list), GFP_KERNEL); 466 /*
467 * Prevent any find or get operation on the entry.
468 */
469 hlist_bl_lock(ce->e_block_hash_p);
470 hlist_bl_lock(ce->e_index_hash_p);
471 mb_assert(!(ce->e_used || ce->e_queued ||
472 atomic_read(&ce->e_refcnt)));
473 __mb_cache_entry_unhash_unlock(ce);
474 __mb_cache_entry_forget(ce, GFP_KERNEL);
323 } 475 }
324 476
325 if (atomic_read(&cache->c_entry_count) > 0) { 477 if (atomic_read(&cache->c_entry_count) > 0) {
@@ -328,8 +480,10 @@ mb_cache_destroy(struct mb_cache *cache)
328 atomic_read(&cache->c_entry_count)); 480 atomic_read(&cache->c_entry_count));
329 } 481 }
330 482
331 kmem_cache_destroy(cache->c_entry_cache); 483 if (list_empty(&mb_cache_list)) {
332 484 kmem_cache_destroy(mb_cache_kmem_cache);
485 mb_cache_kmem_cache = NULL;
486 }
333 kfree(cache->c_index_hash); 487 kfree(cache->c_index_hash);
334 kfree(cache->c_block_hash); 488 kfree(cache->c_block_hash);
335 kfree(cache); 489 kfree(cache);
@@ -346,28 +500,61 @@ mb_cache_destroy(struct mb_cache *cache)
346struct mb_cache_entry * 500struct mb_cache_entry *
347mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) 501mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
348{ 502{
349 struct mb_cache_entry *ce = NULL; 503 struct mb_cache_entry *ce;
350 504
351 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { 505 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
506 struct list_head *l;
507
508 l = &mb_cache_lru_list;
352 spin_lock(&mb_cache_spinlock); 509 spin_lock(&mb_cache_spinlock);
353 if (!list_empty(&mb_cache_lru_list)) { 510 while (!list_is_last(l, &mb_cache_lru_list)) {
354 ce = list_entry(mb_cache_lru_list.next, 511 l = l->next;
355 struct mb_cache_entry, e_lru_list); 512 ce = list_entry(l, struct mb_cache_entry, e_lru_list);
356 list_del_init(&ce->e_lru_list); 513 if (ce->e_cache == cache) {
357 __mb_cache_entry_unhash(ce); 514 list_del_init(&ce->e_lru_list);
515 if (ce->e_used || ce->e_queued ||
516 atomic_read(&ce->e_refcnt))
517 continue;
518 spin_unlock(&mb_cache_spinlock);
519 /*
520 * Prevent any find or get operation on the
521 * entry.
522 */
523 hlist_bl_lock(ce->e_block_hash_p);
524 hlist_bl_lock(ce->e_index_hash_p);
525 /* Ignore if it is touched by a find/get */
526 if (ce->e_used || ce->e_queued ||
527 atomic_read(&ce->e_refcnt) ||
528 !list_empty(&ce->e_lru_list)) {
529 hlist_bl_unlock(ce->e_index_hash_p);
530 hlist_bl_unlock(ce->e_block_hash_p);
531 l = &mb_cache_lru_list;
532 spin_lock(&mb_cache_spinlock);
533 continue;
534 }
535 mb_assert(list_empty(&ce->e_lru_list));
536 mb_assert(!(ce->e_used || ce->e_queued ||
537 atomic_read(&ce->e_refcnt)));
538 __mb_cache_entry_unhash_unlock(ce);
539 goto found;
540 }
358 } 541 }
359 spin_unlock(&mb_cache_spinlock); 542 spin_unlock(&mb_cache_spinlock);
360 } 543 }
361 if (!ce) { 544
362 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); 545 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
363 if (!ce) 546 if (!ce)
364 return NULL; 547 return NULL;
365 atomic_inc(&cache->c_entry_count); 548 atomic_inc(&cache->c_entry_count);
366 INIT_LIST_HEAD(&ce->e_lru_list); 549 INIT_LIST_HEAD(&ce->e_lru_list);
367 INIT_LIST_HEAD(&ce->e_block_list); 550 INIT_HLIST_BL_NODE(&ce->e_block_list);
368 ce->e_cache = cache; 551 INIT_HLIST_BL_NODE(&ce->e_index.o_list);
369 ce->e_queued = 0; 552 ce->e_cache = cache;
370 } 553 ce->e_queued = 0;
554 atomic_set(&ce->e_refcnt, 0);
555found:
556 ce->e_block_hash_p = &cache->c_block_hash[0];
557 ce->e_index_hash_p = &cache->c_index_hash[0];
371 ce->e_used = 1 + MB_CACHE_WRITER; 558 ce->e_used = 1 + MB_CACHE_WRITER;
372 return ce; 559 return ce;
373} 560}
@@ -393,29 +580,38 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
393{ 580{
394 struct mb_cache *cache = ce->e_cache; 581 struct mb_cache *cache = ce->e_cache;
395 unsigned int bucket; 582 unsigned int bucket;
396 struct list_head *l; 583 struct hlist_bl_node *l;
397 int error = -EBUSY; 584 struct hlist_bl_head *block_hash_p;
585 struct hlist_bl_head *index_hash_p;
586 struct mb_cache_entry *lce;
398 587
588 mb_assert(ce);
399 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 589 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
400 cache->c_bucket_bits); 590 cache->c_bucket_bits);
401 spin_lock(&mb_cache_spinlock); 591 block_hash_p = &cache->c_block_hash[bucket];
402 list_for_each_prev(l, &cache->c_block_hash[bucket]) { 592 hlist_bl_lock(block_hash_p);
403 struct mb_cache_entry *ce = 593 hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
404 list_entry(l, struct mb_cache_entry, e_block_list); 594 if (lce->e_bdev == bdev && lce->e_block == block) {
405 if (ce->e_bdev == bdev && ce->e_block == block) 595 hlist_bl_unlock(block_hash_p);
406 goto out; 596 return -EBUSY;
597 }
407 } 598 }
408 __mb_cache_entry_unhash(ce); 599 mb_assert(!__mb_cache_entry_is_block_hashed(ce));
600 __mb_cache_entry_unhash_block(ce);
601 __mb_cache_entry_unhash_index(ce);
409 ce->e_bdev = bdev; 602 ce->e_bdev = bdev;
410 ce->e_block = block; 603 ce->e_block = block;
411 list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); 604 ce->e_block_hash_p = block_hash_p;
412 ce->e_index.o_key = key; 605 ce->e_index.o_key = key;
606 hlist_bl_add_head(&ce->e_block_list, block_hash_p);
607 hlist_bl_unlock(block_hash_p);
413 bucket = hash_long(key, cache->c_bucket_bits); 608 bucket = hash_long(key, cache->c_bucket_bits);
414 list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]); 609 index_hash_p = &cache->c_index_hash[bucket];
415 error = 0; 610 hlist_bl_lock(index_hash_p);
416out: 611 ce->e_index_hash_p = index_hash_p;
417 spin_unlock(&mb_cache_spinlock); 612 hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
418 return error; 613 hlist_bl_unlock(index_hash_p);
614 return 0;
419} 615}
420 616
421 617
@@ -429,24 +625,26 @@ out:
429void 625void
430mb_cache_entry_release(struct mb_cache_entry *ce) 626mb_cache_entry_release(struct mb_cache_entry *ce)
431{ 627{
432 spin_lock(&mb_cache_spinlock); 628 __mb_cache_entry_release(ce);
433 __mb_cache_entry_release_unlock(ce);
434} 629}
435 630
436 631
437/* 632/*
438 * mb_cache_entry_free() 633 * mb_cache_entry_free()
439 * 634 *
440 * This is equivalent to the sequence mb_cache_entry_takeout() --
441 * mb_cache_entry_release().
442 */ 635 */
443void 636void
444mb_cache_entry_free(struct mb_cache_entry *ce) 637mb_cache_entry_free(struct mb_cache_entry *ce)
445{ 638{
446 spin_lock(&mb_cache_spinlock); 639 mb_assert(ce);
447 mb_assert(list_empty(&ce->e_lru_list)); 640 mb_assert(list_empty(&ce->e_lru_list));
448 __mb_cache_entry_unhash(ce); 641 hlist_bl_lock(ce->e_index_hash_p);
449 __mb_cache_entry_release_unlock(ce); 642 __mb_cache_entry_unhash_index(ce);
643 hlist_bl_unlock(ce->e_index_hash_p);
644 hlist_bl_lock(ce->e_block_hash_p);
645 __mb_cache_entry_unhash_block(ce);
646 hlist_bl_unlock(ce->e_block_hash_p);
647 __mb_cache_entry_release(ce);
450} 648}
451 649
452 650
@@ -463,84 +661,110 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
463 sector_t block) 661 sector_t block)
464{ 662{
465 unsigned int bucket; 663 unsigned int bucket;
466 struct list_head *l; 664 struct hlist_bl_node *l;
467 struct mb_cache_entry *ce; 665 struct mb_cache_entry *ce;
666 struct hlist_bl_head *block_hash_p;
468 667
469 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 668 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
470 cache->c_bucket_bits); 669 cache->c_bucket_bits);
471 spin_lock(&mb_cache_spinlock); 670 block_hash_p = &cache->c_block_hash[bucket];
472 list_for_each(l, &cache->c_block_hash[bucket]) { 671 /* First serialize access to the block corresponding hash chain. */
473 ce = list_entry(l, struct mb_cache_entry, e_block_list); 672 hlist_bl_lock(block_hash_p);
673 hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
674 mb_assert(ce->e_block_hash_p == block_hash_p);
474 if (ce->e_bdev == bdev && ce->e_block == block) { 675 if (ce->e_bdev == bdev && ce->e_block == block) {
475 DEFINE_WAIT(wait); 676 /*
677 * Prevent a free from removing the entry.
678 */
679 atomic_inc(&ce->e_refcnt);
680 hlist_bl_unlock(block_hash_p);
681 __spin_lock_mb_cache_entry(ce);
682 atomic_dec(&ce->e_refcnt);
683 if (ce->e_used > 0) {
684 DEFINE_WAIT(wait);
685 while (ce->e_used > 0) {
686 ce->e_queued++;
687 prepare_to_wait(&mb_cache_queue, &wait,
688 TASK_UNINTERRUPTIBLE);
689 __spin_unlock_mb_cache_entry(ce);
690 schedule();
691 __spin_lock_mb_cache_entry(ce);
692 ce->e_queued--;
693 }
694 finish_wait(&mb_cache_queue, &wait);
695 }
696 ce->e_used += 1 + MB_CACHE_WRITER;
697 __spin_unlock_mb_cache_entry(ce);
476 698
477 if (!list_empty(&ce->e_lru_list)) 699 if (!list_empty(&ce->e_lru_list)) {
700 spin_lock(&mb_cache_spinlock);
478 list_del_init(&ce->e_lru_list); 701 list_del_init(&ce->e_lru_list);
479
480 while (ce->e_used > 0) {
481 ce->e_queued++;
482 prepare_to_wait(&mb_cache_queue, &wait,
483 TASK_UNINTERRUPTIBLE);
484 spin_unlock(&mb_cache_spinlock); 702 spin_unlock(&mb_cache_spinlock);
485 schedule();
486 spin_lock(&mb_cache_spinlock);
487 ce->e_queued--;
488 } 703 }
489 finish_wait(&mb_cache_queue, &wait); 704 if (!__mb_cache_entry_is_block_hashed(ce)) {
490 ce->e_used += 1 + MB_CACHE_WRITER; 705 __mb_cache_entry_release(ce);
491
492 if (!__mb_cache_entry_is_hashed(ce)) {
493 __mb_cache_entry_release_unlock(ce);
494 return NULL; 706 return NULL;
495 } 707 }
496 goto cleanup; 708 return ce;
497 } 709 }
498 } 710 }
499 ce = NULL; 711 hlist_bl_unlock(block_hash_p);
500 712 return NULL;
501cleanup:
502 spin_unlock(&mb_cache_spinlock);
503 return ce;
504} 713}
505 714
506#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) 715#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
507 716
508static struct mb_cache_entry * 717static struct mb_cache_entry *
509__mb_cache_entry_find(struct list_head *l, struct list_head *head, 718__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
510 struct block_device *bdev, unsigned int key) 719 struct block_device *bdev, unsigned int key)
511{ 720{
512 while (l != head) { 721
 722 /* The index hash chain is already locked by the caller. */
723 while (l != NULL) {
513 struct mb_cache_entry *ce = 724 struct mb_cache_entry *ce =
514 list_entry(l, struct mb_cache_entry, e_index.o_list); 725 hlist_bl_entry(l, struct mb_cache_entry,
726 e_index.o_list);
727 mb_assert(ce->e_index_hash_p == head);
515 if (ce->e_bdev == bdev && ce->e_index.o_key == key) { 728 if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
516 DEFINE_WAIT(wait); 729 /*
517 730 * Prevent a free from removing the entry.
518 if (!list_empty(&ce->e_lru_list)) 731 */
519 list_del_init(&ce->e_lru_list); 732 atomic_inc(&ce->e_refcnt);
520 733 hlist_bl_unlock(head);
734 __spin_lock_mb_cache_entry(ce);
735 atomic_dec(&ce->e_refcnt);
736 ce->e_used++;
521 /* Incrementing before holding the lock gives readers 737 /* Incrementing before holding the lock gives readers
522 priority over writers. */ 738 priority over writers. */
523 ce->e_used++; 739 if (ce->e_used >= MB_CACHE_WRITER) {
524 while (ce->e_used >= MB_CACHE_WRITER) { 740 DEFINE_WAIT(wait);
525 ce->e_queued++; 741
526 prepare_to_wait(&mb_cache_queue, &wait, 742 while (ce->e_used >= MB_CACHE_WRITER) {
527 TASK_UNINTERRUPTIBLE); 743 ce->e_queued++;
528 spin_unlock(&mb_cache_spinlock); 744 prepare_to_wait(&mb_cache_queue, &wait,
529 schedule(); 745 TASK_UNINTERRUPTIBLE);
530 spin_lock(&mb_cache_spinlock); 746 __spin_unlock_mb_cache_entry(ce);
531 ce->e_queued--; 747 schedule();
748 __spin_lock_mb_cache_entry(ce);
749 ce->e_queued--;
750 }
751 finish_wait(&mb_cache_queue, &wait);
532 } 752 }
533 finish_wait(&mb_cache_queue, &wait); 753 __spin_unlock_mb_cache_entry(ce);
534 754 if (!list_empty(&ce->e_lru_list)) {
535 if (!__mb_cache_entry_is_hashed(ce)) {
536 __mb_cache_entry_release_unlock(ce);
537 spin_lock(&mb_cache_spinlock); 755 spin_lock(&mb_cache_spinlock);
756 list_del_init(&ce->e_lru_list);
757 spin_unlock(&mb_cache_spinlock);
758 }
759 if (!__mb_cache_entry_is_block_hashed(ce)) {
760 __mb_cache_entry_release(ce);
538 return ERR_PTR(-EAGAIN); 761 return ERR_PTR(-EAGAIN);
539 } 762 }
540 return ce; 763 return ce;
541 } 764 }
542 l = l->next; 765 l = l->next;
543 } 766 }
767 hlist_bl_unlock(head);
544 return NULL; 768 return NULL;
545} 769}
546 770
@@ -562,13 +786,17 @@ mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
562 unsigned int key) 786 unsigned int key)
563{ 787{
564 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 788 unsigned int bucket = hash_long(key, cache->c_bucket_bits);
565 struct list_head *l; 789 struct hlist_bl_node *l;
566 struct mb_cache_entry *ce; 790 struct mb_cache_entry *ce = NULL;
567 791 struct hlist_bl_head *index_hash_p;
568 spin_lock(&mb_cache_spinlock); 792
569 l = cache->c_index_hash[bucket].next; 793 index_hash_p = &cache->c_index_hash[bucket];
570 ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); 794 hlist_bl_lock(index_hash_p);
571 spin_unlock(&mb_cache_spinlock); 795 if (!hlist_bl_empty(index_hash_p)) {
796 l = hlist_bl_first(index_hash_p);
797 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
798 } else
799 hlist_bl_unlock(index_hash_p);
572 return ce; 800 return ce;
573} 801}
574 802
@@ -597,13 +825,17 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev,
597{ 825{
598 struct mb_cache *cache = prev->e_cache; 826 struct mb_cache *cache = prev->e_cache;
599 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 827 unsigned int bucket = hash_long(key, cache->c_bucket_bits);
600 struct list_head *l; 828 struct hlist_bl_node *l;
601 struct mb_cache_entry *ce; 829 struct mb_cache_entry *ce;
830 struct hlist_bl_head *index_hash_p;
602 831
603 spin_lock(&mb_cache_spinlock); 832 index_hash_p = &cache->c_index_hash[bucket];
833 mb_assert(prev->e_index_hash_p == index_hash_p);
834 hlist_bl_lock(index_hash_p);
835 mb_assert(!hlist_bl_empty(index_hash_p));
604 l = prev->e_index.o_list.next; 836 l = prev->e_index.o_list.next;
605 ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); 837 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
606 __mb_cache_entry_release_unlock(prev); 838 __mb_cache_entry_release(prev);
607 return ce; 839 return ce;
608} 840}
609 841
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 0332109162a5..f007a3355570 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -26,7 +26,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data);
26 26
27static void minix_evict_inode(struct inode *inode) 27static void minix_evict_inode(struct inode *inode)
28{ 28{
29 truncate_inode_pages(&inode->i_data, 0); 29 truncate_inode_pages_final(&inode->i_data);
30 if (!inode->i_nlink) { 30 if (!inode->i_nlink) {
31 inode->i_size = 0; 31 inode->i_size = 0;
32 minix_truncate(inode); 32 minix_truncate(inode);
@@ -86,7 +86,7 @@ static void init_once(void *foo)
86 inode_init_once(&ei->vfs_inode); 86 inode_init_once(&ei->vfs_inode);
87} 87}
88 88
89static int init_inodecache(void) 89static int __init init_inodecache(void)
90{ 90{
91 minix_inode_cachep = kmem_cache_create("minix_inode_cache", 91 minix_inode_cachep = kmem_cache_create("minix_inode_cache",
92 sizeof(struct minix_inode_info), 92 sizeof(struct minix_inode_info),
@@ -123,6 +123,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data)
123 struct minix_sb_info * sbi = minix_sb(sb); 123 struct minix_sb_info * sbi = minix_sb(sb);
124 struct minix_super_block * ms; 124 struct minix_super_block * ms;
125 125
126 sync_filesystem(sb);
126 ms = sbi->s_ms; 127 ms = sbi->s_ms;
127 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 128 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
128 return 0; 129 return 0;
diff --git a/fs/mount.h b/fs/mount.h
index b29e42f05f34..d55297f2fa05 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -10,7 +10,7 @@ struct mnt_namespace {
 	struct user_namespace	*user_ns;
 	u64			seq;	/* Sequence number to prevent loops */
 	wait_queue_head_t poll;
-	int event;
+	u64 event;
 };
 
 struct mnt_pcp {
@@ -104,6 +104,9 @@ struct proc_mounts {
 	struct mnt_namespace *ns;
 	struct path root;
 	int (*show)(struct seq_file *, struct vfsmount *);
+	void *cached_mount;
+	u64 cached_event;
+	loff_t cached_index;
 };
 
 #define proc_mounts(p) (container_of((p), struct proc_mounts, m))
diff --git a/fs/namei.c b/fs/namei.c
index 4b491b431990..80168273396b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -358,6 +358,7 @@ int generic_permission(struct inode *inode, int mask)
 
 	return -EACCES;
 }
+EXPORT_SYMBOL(generic_permission);
 
 /*
  * We _really_ want to just do "generic_permission()" without
@@ -455,6 +456,7 @@ int inode_permission(struct inode *inode, int mask)
 		return retval;
 	return __inode_permission(inode, mask);
 }
+EXPORT_SYMBOL(inode_permission);
 
 /**
  * path_get - get a reference to a path
@@ -924,6 +926,7 @@ int follow_up(struct path *path)
 	path->mnt = &parent->mnt;
 	return 1;
 }
+EXPORT_SYMBOL(follow_up);
 
 /*
  * Perform an automount
@@ -1085,6 +1088,7 @@ int follow_down_one(struct path *path)
 	}
 	return 0;
 }
+EXPORT_SYMBOL(follow_down_one);
 
 static inline bool managed_dentry_might_block(struct dentry *dentry)
 {
@@ -1223,6 +1227,7 @@ int follow_down(struct path *path)
 	}
 	return 0;
 }
+EXPORT_SYMBOL(follow_down);
 
 /*
  * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
@@ -1537,7 +1542,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
 		inode = path->dentry->d_inode;
 	}
 	err = -ENOENT;
-	if (!inode)
+	if (!inode || d_is_negative(path->dentry))
 		goto out_path_put;
 
 	if (should_follow_link(path->dentry, follow)) {
@@ -1796,7 +1801,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			if (err)
 				return err;
 		}
-		if (!d_is_directory(nd->path.dentry)) {
+		if (!d_can_lookup(nd->path.dentry)) {
 			err = -ENOTDIR;
 			break;
 		}
@@ -1817,7 +1822,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 		struct dentry *root = nd->root.dentry;
 		struct inode *inode = root->d_inode;
 		if (*name) {
-			if (!d_is_directory(root))
+			if (!d_can_lookup(root))
 				return -ENOTDIR;
 			retval = inode_permission(inode, MAY_EXEC);
 			if (retval)
@@ -1873,7 +1878,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 		dentry = f.file->f_path.dentry;
 
 		if (*name) {
-			if (!d_is_directory(dentry)) {
+			if (!d_can_lookup(dentry)) {
 				fdput(f);
 				return -ENOTDIR;
 			}
@@ -1955,7 +1960,7 @@ static int path_lookupat(int dfd, const char *name,
 		err = complete_walk(nd);
 
 	if (!err && nd->flags & LOOKUP_DIRECTORY) {
-		if (!d_is_directory(nd->path.dentry)) {
+		if (!d_can_lookup(nd->path.dentry)) {
 			path_put(&nd->path);
 			err = -ENOTDIR;
 		}
@@ -2025,6 +2030,7 @@ int kern_path(const char *name, unsigned int flags, struct path *path)
 		*path = nd.path;
 	return res;
 }
+EXPORT_SYMBOL(kern_path);
 
 /**
  * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
@@ -2049,6 +2055,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 		*path = nd.path;
 	return err;
 }
+EXPORT_SYMBOL(vfs_path_lookup);
 
 /*
  * Restricted form of lookup. Doesn't follow links, single-component only,
@@ -2111,6 +2118,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 
 	return __lookup_hash(&this, base, 0);
 }
+EXPORT_SYMBOL(lookup_one_len);
 
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
@@ -2135,6 +2143,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
 {
 	return user_path_at_empty(dfd, name, flags, path, NULL);
 }
+EXPORT_SYMBOL(user_path_at);
 
 /*
  * NB: most callers don't do anything directly with the reference to the
@@ -2240,7 +2249,7 @@ mountpoint_last(struct nameidata *nd, struct path *path)
 	mutex_unlock(&dir->d_inode->i_mutex);
 
 done:
-	if (!dentry->d_inode) {
+	if (!dentry->d_inode || d_is_negative(dentry)) {
 		error = -ENOENT;
 		dput(dentry);
 		goto out;
@@ -2414,11 +2423,11 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
 	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
 		return -EPERM;
 	if (isdir) {
-		if (!d_is_directory(victim) && !d_is_autodir(victim))
+		if (!d_is_dir(victim))
 			return -ENOTDIR;
 		if (IS_ROOT(victim))
 			return -EBUSY;
-	} else if (d_is_directory(victim) || d_is_autodir(victim))
+	} else if (d_is_dir(victim))
 		return -EISDIR;
 	if (IS_DEADDIR(dir))
 		return -ENOENT;
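The recurring `d_is_directory() || d_is_autodir()` pairs throughout this file collapse into a single d_is_dir() predicate. Assuming the v3.15 dcache helpers, the new helper is roughly the following (quoted from memory; see include/linux/dcache.h for the authoritative form):

/* Rough shape of the v3.15 helper, shown for context only. */
static inline bool d_is_dir(const struct dentry *dentry)
{
	return d_can_lookup(dentry) || d_is_autodir(dentry);
}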
@@ -2477,6 +2486,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
 	return NULL;
 }
+EXPORT_SYMBOL(lock_rename);
 
 void unlock_rename(struct dentry *p1, struct dentry *p2)
 {
@@ -2486,6 +2496,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
 		mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
 	}
 }
+EXPORT_SYMBOL(unlock_rename);
 
 int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		bool want_excl)
@@ -2506,6 +2517,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		fsnotify_create(dir, dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_create);
 
 static int may_open(struct path *path, int acc_mode, int flag)
 {
@@ -2569,7 +2581,7 @@ static int handle_truncate(struct file *filp)
 	/*
 	 * Refuse to truncate files with mandatory locks held on them.
 	 */
-	error = locks_verify_locked(inode);
+	error = locks_verify_locked(filp);
 	if (!error)
 		error = security_path_truncate(path);
 	if (!error) {
@@ -2982,7 +2994,7 @@ retry_lookup:
 finish_lookup:
 	/* we _can_ be in RCU mode here */
 	error = -ENOENT;
-	if (d_is_negative(path->dentry)) {
+	if (!inode || d_is_negative(path->dentry)) {
 		path_to_nameidata(path, nd);
 		goto out;
 	}
@@ -3016,11 +3028,10 @@ finish_open:
 	}
 	audit_inode(name, nd->path.dentry, 0);
 	error = -EISDIR;
-	if ((open_flag & O_CREAT) &&
-	    (d_is_directory(nd->path.dentry) || d_is_autodir(nd->path.dentry)))
+	if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
 		goto out;
 	error = -ENOTDIR;
-	if ((nd->flags & LOOKUP_DIRECTORY) && !d_is_directory(nd->path.dentry))
+	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
 		goto out;
 	if (!S_ISREG(nd->inode->i_mode))
 		will_truncate = false;
@@ -3376,6 +3387,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
 		fsnotify_create(dir, dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_mknod);
 
 static int may_mknod(umode_t mode)
 {
@@ -3465,6 +3477,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 		fsnotify_mkdir(dir, dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_mkdir);
 
 SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
 {
@@ -3519,6 +3532,7 @@ void dentry_unhash(struct dentry *dentry)
 		__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
 }
+EXPORT_SYMBOL(dentry_unhash);
 
 int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
@@ -3556,6 +3570,7 @@ out:
 		d_delete(dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_rmdir);
 
 static long do_rmdir(int dfd, const char __user *pathname)
 {
@@ -3673,6 +3688,7 @@ out:
 
 	return error;
 }
+EXPORT_SYMBOL(vfs_unlink);
 
 /*
  * Make sure that the actual truncation of the file will occur outside its
@@ -3744,7 +3760,7 @@ exit1:
 slashes:
 	if (d_is_negative(dentry))
 		error = -ENOENT;
-	else if (d_is_directory(dentry) || d_is_autodir(dentry))
+	else if (d_is_dir(dentry))
 		error = -EISDIR;
 	else
 		error = -ENOTDIR;
@@ -3786,6 +3802,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
 		fsnotify_create(dir, dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_symlink);
 
 SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
 		int, newdfd, const char __user *, newname)
@@ -3894,6 +3911,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 		fsnotify_link(dir, inode, new_dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_link);
 
 /*
  * Hardlinks are often used in delicate situations. We avoid
@@ -3974,7 +3992,28 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
 	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
 }
 
-/*
+/**
+ * vfs_rename - rename a filesystem object
+ * @old_dir:	parent of source
+ * @old_dentry:	source
+ * @new_dir:	parent of destination
+ * @new_dentry:	destination
+ * @delegated_inode: returns an inode needing a delegation break
+ * @flags:	rename flags
+ *
+ * The caller must hold multiple mutexes--see lock_rename().
+ *
+ * If vfs_rename discovers a delegation in need of breaking at either
+ * the source or destination, it will return -EWOULDBLOCK and return a
+ * reference to the inode in delegated_inode.  The caller should then
+ * break the delegation and retry.  Because breaking a delegation may
+ * take a long time, the caller should drop all locks before doing
+ * so.
+ *
+ * Alternatively, a caller may pass NULL for delegated_inode.  This may
+ * be appropriate for callers that expect the underlying filesystem not
+ * to be NFS exported.
+ *
  * The worst of all namespace operations - renaming directory. "Perverted"
  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
  * Problems:
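The kernel-doc above describes a retry protocol for NFS delegations. A hedged sketch of the caller side, modelled on the retry_deleg loop in do_renameat2() later in this diff (the rename_with_deleg name is hypothetical, and lookup/locking details are trimmed):

static int rename_with_deleg(struct inode *old_dir, struct dentry *old_dentry,
			     struct inode *new_dir, struct dentry *new_dentry,
			     unsigned int flags)
{
	struct inode *delegated_inode = NULL;
	int error;

retry_deleg:
	/* real callers take lock_rename() and redo the lookups here */
	error = vfs_rename(old_dir, old_dentry, new_dir, new_dentry,
			   &delegated_inode, flags);
	if (delegated_inode) {
		/* all locks must be dropped before waiting */
		error = break_deleg_wait(&delegated_inode);
		if (!error)
			goto retry_deleg;
	}
	return error;
}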
@@ -4002,163 +4041,140 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  *	->i_mutex on parents, which works but leads to some truly excessive
  *	locking].
  */
-static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
-			  struct inode *new_dir, struct dentry *new_dentry)
+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+	       struct inode *new_dir, struct dentry *new_dentry,
+	       struct inode **delegated_inode, unsigned int flags)
 {
-	int error = 0;
+	int error;
+	bool is_dir = d_is_dir(old_dentry);
+	const unsigned char *old_name;
+	struct inode *source = old_dentry->d_inode;
 	struct inode *target = new_dentry->d_inode;
+	bool new_is_dir = false;
 	unsigned max_links = new_dir->i_sb->s_max_links;
 
+	if (source == target)
+		return 0;
+
+	error = may_delete(old_dir, old_dentry, is_dir);
+	if (error)
+		return error;
+
+	if (!target) {
+		error = may_create(new_dir, new_dentry);
+	} else {
+		new_is_dir = d_is_dir(new_dentry);
+
+		if (!(flags & RENAME_EXCHANGE))
+			error = may_delete(new_dir, new_dentry, is_dir);
+		else
+			error = may_delete(new_dir, new_dentry, new_is_dir);
+	}
+	if (error)
+		return error;
+
+	if (!old_dir->i_op->rename)
+		return -EPERM;
+
+	if (flags && !old_dir->i_op->rename2)
+		return -EINVAL;
+
 	/*
 	 * If we are going to change the parent - check write permissions,
 	 * we'll need to flip '..'.
 	 */
 	if (new_dir != old_dir) {
-		error = inode_permission(old_dentry->d_inode, MAY_WRITE);
-		if (error)
-			return error;
+		if (is_dir) {
+			error = inode_permission(source, MAY_WRITE);
+			if (error)
+				return error;
+		}
+		if ((flags & RENAME_EXCHANGE) && new_is_dir) {
+			error = inode_permission(target, MAY_WRITE);
+			if (error)
+				return error;
+		}
 	}
 
-	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
+	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
+				      flags);
 	if (error)
 		return error;
 
+	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
 	dget(new_dentry);
-	if (target)
+	if (!is_dir || (flags & RENAME_EXCHANGE))
+		lock_two_nondirectories(source, target);
+	else if (target)
 		mutex_lock(&target->i_mutex);
 
 	error = -EBUSY;
 	if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
 		goto out;
 
-	error = -EMLINK;
-	if (max_links && !target && new_dir != old_dir &&
-	    new_dir->i_nlink >= max_links)
-		goto out;
-
-	if (target)
+	if (max_links && new_dir != old_dir) {
+		error = -EMLINK;
+		if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
+			goto out;
+		if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
+		    old_dir->i_nlink >= max_links)
+			goto out;
+	}
+	if (is_dir && !(flags & RENAME_EXCHANGE) && target)
 		shrink_dcache_parent(new_dentry);
-	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
-	if (error)
-		goto out;
-
-	if (target) {
-		target->i_flags |= S_DEAD;
-		dont_mount(new_dentry);
-	}
-out:
-	if (target)
-		mutex_unlock(&target->i_mutex);
-	dput(new_dentry);
-	if (!error)
-		if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
-			d_move(old_dentry,new_dentry);
-	return error;
-}
-
-static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
-			    struct inode *new_dir, struct dentry *new_dentry,
-			    struct inode **delegated_inode)
-{
-	struct inode *target = new_dentry->d_inode;
-	struct inode *source = old_dentry->d_inode;
-	int error;
-
-	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
-	if (error)
-		return error;
-
-	dget(new_dentry);
-	lock_two_nondirectories(source, target);
-
-	error = -EBUSY;
-	if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
-		goto out;
-
-	error = try_break_deleg(source, delegated_inode);
-	if (error)
-		goto out;
-	if (target) {
+	if (!is_dir) {
+		error = try_break_deleg(source, delegated_inode);
+		if (error)
+			goto out;
+	}
+	if (target && !new_is_dir) {
 		error = try_break_deleg(target, delegated_inode);
 		if (error)
 			goto out;
 	}
-	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+	if (!flags) {
+		error = old_dir->i_op->rename(old_dir, old_dentry,
+					      new_dir, new_dentry);
+	} else {
+		error = old_dir->i_op->rename2(old_dir, old_dentry,
+					       new_dir, new_dentry, flags);
+	}
 	if (error)
 		goto out;
 
-	if (target)
+	if (!(flags & RENAME_EXCHANGE) && target) {
+		if (is_dir)
+			target->i_flags |= S_DEAD;
 		dont_mount(new_dentry);
-	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
-		d_move(old_dentry, new_dentry);
+	}
+	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
+		if (!(flags & RENAME_EXCHANGE))
+			d_move(old_dentry, new_dentry);
+		else
+			d_exchange(old_dentry, new_dentry);
+	}
 out:
-	unlock_two_nondirectories(source, target);
+	if (!is_dir || (flags & RENAME_EXCHANGE))
+		unlock_two_nondirectories(source, target);
+	else if (target)
+		mutex_unlock(&target->i_mutex);
 	dput(new_dentry);
-	return error;
-}
-
-/**
- * vfs_rename - rename a filesystem object
- * @old_dir:	parent of source
- * @old_dentry:	source
- * @new_dir:	parent of destination
- * @new_dentry:	destination
- * @delegated_inode: returns an inode needing a delegation break
- *
- * The caller must hold multiple mutexes--see lock_rename()).
- *
- * If vfs_rename discovers a delegation in need of breaking at either
- * the source or destination, it will return -EWOULDBLOCK and return a
- * reference to the inode in delegated_inode.  The caller should then
- * break the delegation and retry.  Because breaking a delegation may
- * take a long time, the caller should drop all locks before doing
- * so.
- *
- * Alternatively, a caller may pass NULL for delegated_inode.  This may
- * be appropriate for callers that expect the underlying filesystem not
- * to be NFS exported.
- */
-int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-	       struct inode *new_dir, struct dentry *new_dentry,
-	       struct inode **delegated_inode)
-{
-	int error;
-	int is_dir = d_is_directory(old_dentry) || d_is_autodir(old_dentry);
-	const unsigned char *old_name;
-
-	if (old_dentry->d_inode == new_dentry->d_inode)
-		return 0;
-
-	error = may_delete(old_dir, old_dentry, is_dir);
-	if (error)
-		return error;
-
-	if (!new_dentry->d_inode)
-		error = may_create(new_dir, new_dentry);
-	else
-		error = may_delete(new_dir, new_dentry, is_dir);
-	if (error)
-		return error;
-
-	if (!old_dir->i_op->rename)
-		return -EPERM;
-
-	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
-
-	if (is_dir)
-		error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
-	else
-		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,delegated_inode);
-	if (!error)
+	if (!error) {
 		fsnotify_move(old_dir, new_dir, old_name, is_dir,
-			      new_dentry->d_inode, old_dentry);
+			      !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
+		if (flags & RENAME_EXCHANGE) {
+			fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
+				      new_is_dir, NULL, new_dentry);
+		}
+	}
 	fsnotify_oldname_free(old_name);
 
 	return error;
 }
+EXPORT_SYMBOL(vfs_rename);
 
-SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
-		int, newdfd, const char __user *, newname)
+SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
+		int, newdfd, const char __user *, newname, unsigned int, flags)
 {
 	struct dentry *old_dir, *new_dir;
 	struct dentry *old_dentry, *new_dentry;
@@ -4170,6 +4186,13 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
 	unsigned int lookup_flags = 0;
 	bool should_retry = false;
 	int error;
+
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+		return -EINVAL;
+
+	if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE))
+		return -EINVAL;
+
 retry:
 	from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
 	if (IS_ERR(from)) {
@@ -4193,6 +4216,8 @@ retry:
 		goto exit2;
 
 	new_dir = newnd.path.dentry;
+	if (flags & RENAME_NOREPLACE)
+		error = -EEXIST;
 	if (newnd.last_type != LAST_NORM)
 		goto exit2;
 
@@ -4202,7 +4227,8 @@ retry:
 
 	oldnd.flags &= ~LOOKUP_PARENT;
 	newnd.flags &= ~LOOKUP_PARENT;
-	newnd.flags |= LOOKUP_RENAME_TARGET;
+	if (!(flags & RENAME_EXCHANGE))
+		newnd.flags |= LOOKUP_RENAME_TARGET;
 
 retry_deleg:
 	trap = lock_rename(new_dir, old_dir);
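Userspace reaches the new syscall via syscall(2); glibc did not yet wrap renameat2() when this merge landed. A minimal sketch, assuming kernel headers new enough to define __NR_renameat2 (the fallback flag definitions mirror the values in the v3.15 uapi headers):

#define _GNU_SOURCE
#include <fcntl.h>		/* AT_FDCWD */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef RENAME_NOREPLACE
#define RENAME_NOREPLACE (1 << 0)	/* fail with EEXIST instead of overwriting */
#endif
#ifndef RENAME_EXCHANGE
#define RENAME_EXCHANGE  (1 << 1)	/* atomically swap the two paths */
#endif

int main(void)
{
	/* refuse to clobber "new" if it already exists */
	if (syscall(SYS_renameat2, AT_FDCWD, "old", AT_FDCWD, "new",
		    RENAME_NOREPLACE) == -1)
		perror("renameat2");
	return 0;
}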
@@ -4215,34 +4241,49 @@ retry_deleg:
 	error = -ENOENT;
 	if (d_is_negative(old_dentry))
 		goto exit4;
+	new_dentry = lookup_hash(&newnd);
+	error = PTR_ERR(new_dentry);
+	if (IS_ERR(new_dentry))
+		goto exit4;
+	error = -EEXIST;
+	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
+		goto exit5;
+	if (flags & RENAME_EXCHANGE) {
+		error = -ENOENT;
+		if (d_is_negative(new_dentry))
+			goto exit5;
+
+		if (!d_is_dir(new_dentry)) {
+			error = -ENOTDIR;
+			if (newnd.last.name[newnd.last.len])
+				goto exit5;
+		}
+	}
 	/* unless the source is a directory trailing slashes give -ENOTDIR */
-	if (!d_is_directory(old_dentry) && !d_is_autodir(old_dentry)) {
+	if (!d_is_dir(old_dentry)) {
 		error = -ENOTDIR;
 		if (oldnd.last.name[oldnd.last.len])
-			goto exit4;
-		if (newnd.last.name[newnd.last.len])
-			goto exit4;
+			goto exit5;
+		if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len])
+			goto exit5;
 	}
 	/* source should not be ancestor of target */
 	error = -EINVAL;
 	if (old_dentry == trap)
-		goto exit4;
-	new_dentry = lookup_hash(&newnd);
-	error = PTR_ERR(new_dentry);
-	if (IS_ERR(new_dentry))
-		goto exit4;
+		goto exit5;
 	/* target should not be an ancestor of source */
-	error = -ENOTEMPTY;
+	if (!(flags & RENAME_EXCHANGE))
+		error = -ENOTEMPTY;
 	if (new_dentry == trap)
 		goto exit5;
 
 	error = security_path_rename(&oldnd.path, old_dentry,
-				     &newnd.path, new_dentry);
+				     &newnd.path, new_dentry, flags);
 	if (error)
 		goto exit5;
 	error = vfs_rename(old_dir->d_inode, old_dentry,
 			   new_dir->d_inode, new_dentry,
-			   &delegated_inode);
+			   &delegated_inode, flags);
exit5:
 	dput(new_dentry);
exit4:
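As the vfs_rename() dispatch above shows, a filesystem opts in to rename flags by providing ->rename2; otherwise any nonzero flags fail with -EINVAL. A hedged sketch of a minimal handler, with hypothetical foofs_* names (foofs_rename is assumed to be the existing flag-less op):

static int foofs_rename2(struct inode *old_dir, struct dentry *old_dentry,
			 struct inode *new_dir, struct dentry *new_dentry,
			 unsigned int flags)
{
	if (flags & ~RENAME_NOREPLACE)
		return -EINVAL;	/* reject flags we cannot honour */
	/*
	 * RENAME_NOREPLACE is already enforced by the VFS lookup above
	 * (-EEXIST on a positive target before ->rename2 is called), so
	 * a simple local filesystem can fall through to its old path.
	 */
	return foofs_rename(old_dir, old_dentry, new_dir, new_dentry);
}

RENAME_EXCHANGE, by contrast, needs real filesystem support, which is why the sketch rejects it.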
@@ -4272,16 +4313,20 @@ exit:
 	return error;
 }
 
-SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
+SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
+		int, newdfd, const char __user *, newname)
 {
-	return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
+	return sys_renameat2(olddfd, oldname, newdfd, newname, 0);
 }
 
-int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
+SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
 {
-	int len;
+	return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
+}
 
-	len = PTR_ERR(link);
+int readlink_copy(char __user *buffer, int buflen, const char *link)
+{
+	int len = PTR_ERR(link);
 	if (IS_ERR(link))
 		goto out;
 
@@ -4293,6 +4338,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const c
 out:
 	return len;
 }
+EXPORT_SYMBOL(readlink_copy);
 
 /*
  * A helper for ->readlink().  This should be used *ONLY* for symlinks that
@@ -4310,11 +4356,12 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 	if (IS_ERR(cookie))
 		return PTR_ERR(cookie);
 
-	res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
+	res = readlink_copy(buffer, buflen, nd_get_link(&nd));
 	if (dentry->d_inode->i_op->put_link)
 		dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
 	return res;
 }
+EXPORT_SYMBOL(generic_readlink);
 
 /* get the link contents into pagecache */
 static char *page_getlink(struct dentry * dentry, struct page **ppage)
@@ -4334,14 +4381,14 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage)
 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 {
 	struct page *page = NULL;
-	char *s = page_getlink(dentry, &page);
-	int res = vfs_readlink(dentry,buffer,buflen,s);
+	int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page));
 	if (page) {
 		kunmap(page);
 		page_cache_release(page);
 	}
 	return res;
 }
+EXPORT_SYMBOL(page_readlink);
 
 void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
 {
@@ -4349,6 +4396,7 @@ void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
 	nd_set_link(nd, page_getlink(dentry, &page));
 	return page;
 }
+EXPORT_SYMBOL(page_follow_link_light);
 
 void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
 {
@@ -4359,6 +4407,7 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
 		page_cache_release(page);
 	}
 }
+EXPORT_SYMBOL(page_put_link);
 
 /*
  * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
@@ -4396,45 +4445,18 @@ retry:
 fail:
 	return err;
 }
+EXPORT_SYMBOL(__page_symlink);
 
 int page_symlink(struct inode *inode, const char *symname, int len)
 {
 	return __page_symlink(inode, symname, len,
 			!(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
 }
+EXPORT_SYMBOL(page_symlink);
 
 const struct inode_operations page_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
 };
-
-EXPORT_SYMBOL(user_path_at);
-EXPORT_SYMBOL(follow_down_one);
-EXPORT_SYMBOL(follow_down);
-EXPORT_SYMBOL(follow_up);
-EXPORT_SYMBOL(get_write_access); /* nfsd */
-EXPORT_SYMBOL(lock_rename);
-EXPORT_SYMBOL(lookup_one_len);
-EXPORT_SYMBOL(page_follow_link_light);
-EXPORT_SYMBOL(page_put_link);
-EXPORT_SYMBOL(page_readlink);
-EXPORT_SYMBOL(__page_symlink);
-EXPORT_SYMBOL(page_symlink);
 EXPORT_SYMBOL(page_symlink_inode_operations);
-EXPORT_SYMBOL(kern_path);
-EXPORT_SYMBOL(vfs_path_lookup);
-EXPORT_SYMBOL(inode_permission);
-EXPORT_SYMBOL(unlock_rename);
-EXPORT_SYMBOL(vfs_create);
-EXPORT_SYMBOL(vfs_link);
-EXPORT_SYMBOL(vfs_mkdir);
-EXPORT_SYMBOL(vfs_mknod);
-EXPORT_SYMBOL(generic_permission);
-EXPORT_SYMBOL(vfs_readlink);
-EXPORT_SYMBOL(vfs_rename);
-EXPORT_SYMBOL(vfs_rmdir);
-EXPORT_SYMBOL(vfs_symlink);
-EXPORT_SYMBOL(vfs_unlink);
-EXPORT_SYMBOL(dentry_unhash);
-EXPORT_SYMBOL(generic_readlink);
diff --git a/fs/namespace.c b/fs/namespace.c
index 2ffc5a2905d4..182bc41cd887 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -52,7 +52,7 @@ static int __init set_mphash_entries(char *str)
 }
 __setup("mphash_entries=", set_mphash_entries);
 
-static int event;
+static u64 event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
 static DEFINE_SPINLOCK(mnt_id_lock);
@@ -414,9 +414,7 @@ EXPORT_SYMBOL_GPL(mnt_clone_write);
  */
 int __mnt_want_write_file(struct file *file)
 {
-	struct inode *inode = file_inode(file);
-
-	if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
+	if (!(file->f_mode & FMODE_WRITER))
 		return __mnt_want_write(file->f_path.mnt);
 	else
 		return mnt_clone_write(file->f_path.mnt);
@@ -570,13 +568,17 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 static void free_vfsmnt(struct mount *mnt)
 {
 	kfree(mnt->mnt_devname);
-	mnt_free_id(mnt);
 #ifdef CONFIG_SMP
 	free_percpu(mnt->mnt_pcp);
 #endif
 	kmem_cache_free(mnt_cache, mnt);
 }
 
+static void delayed_free_vfsmnt(struct rcu_head *head)
+{
+	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
+}
+
 /* call under rcu_read_lock */
 bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
 {
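delayed_free_vfsmnt() is the standard call_rcu() idiom: embed an rcu_head in the object, then recover the enclosing structure with container_of() once a grace period has elapsed. A generic, hedged sketch of the same pattern (my_obj names are illustrative):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_obj {
	int data;
	struct rcu_head rcu;	/* storage for the RCU callback */
};

static void my_obj_free_rcu(struct rcu_head *head)
{
	/* runs only after every pre-existing RCU reader has finished */
	kfree(container_of(head, struct my_obj, rcu));
}

static void my_obj_put(struct my_obj *obj)
{
	call_rcu(&obj->rcu, my_obj_free_rcu);	/* never blocks */
}

This is what lets legitimize_mnt() walk mounts under rcu_read_lock() without taking a reference first.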
@@ -848,6 +850,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 
 	root = mount_fs(type, flags, name, data);
 	if (IS_ERR(root)) {
+		mnt_free_id(mnt);
 		free_vfsmnt(mnt);
 		return ERR_CAST(root);
 	}
@@ -885,7 +888,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 			goto out_free;
 	}
 
-	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD;
+	mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
 	/* Don't allow unprivileged users to change mount flags */
 	if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY))
 		mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
@@ -928,20 +931,11 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	return mnt;
 
  out_free:
+	mnt_free_id(mnt);
 	free_vfsmnt(mnt);
 	return ERR_PTR(err);
 }
 
-static void delayed_free(struct rcu_head *head)
-{
-	struct mount *mnt = container_of(head, struct mount, mnt_rcu);
-	kfree(mnt->mnt_devname);
-#ifdef CONFIG_SMP
-	free_percpu(mnt->mnt_pcp);
-#endif
-	kmem_cache_free(mnt_cache, mnt);
-}
-
 static void mntput_no_expire(struct mount *mnt)
 {
 put_again:
@@ -991,7 +985,7 @@ put_again:
 	dput(mnt->mnt.mnt_root);
 	deactivate_super(mnt->mnt.mnt_sb);
 	mnt_free_id(mnt);
-	call_rcu(&mnt->mnt_rcu, delayed_free);
+	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
 }
 
 void mntput(struct vfsmount *mnt)
@@ -1100,14 +1094,29 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	struct proc_mounts *p = proc_mounts(m);
 
 	down_read(&namespace_sem);
-	return seq_list_start(&p->ns->list, *pos);
+	if (p->cached_event == p->ns->event) {
+		void *v = p->cached_mount;
+		if (*pos == p->cached_index)
+			return v;
+		if (*pos == p->cached_index + 1) {
+			v = seq_list_next(v, &p->ns->list, &p->cached_index);
+			return p->cached_mount = v;
+		}
+	}
+
+	p->cached_event = p->ns->event;
+	p->cached_mount = seq_list_start(&p->ns->list, *pos);
+	p->cached_index = *pos;
+	return p->cached_mount;
 }
 
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct proc_mounts *p = proc_mounts(m);
 
-	return seq_list_next(v, &p->ns->list, pos);
+	p->cached_mount = seq_list_next(v, &p->ns->list, pos);
+	p->cached_index = *pos;
+	return p->cached_mount;
 }
 
 static void m_stop(struct seq_file *m, void *v)
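The motivation for the cached_mount/cached_index fields added to struct proc_mounts above: ->start() runs once per read() chunk on /proc/*/mounts, and the uncached seq_list pattern walks the list from its head every time, making a streamed read of a long mount table quadratic. For contrast, a hedged sketch of that uncached pattern (demo_list is a placeholder):

#include <linux/list.h>
#include <linux/seq_file.h>

static LIST_HEAD(demo_list);

static void *demo_start(struct seq_file *m, loff_t *pos)
{
	return seq_list_start(&demo_list, *pos);	/* O(*pos) list walk */
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	return seq_list_next(v, &demo_list, pos);	/* one step */
}

The cache is only trusted while p->cached_event matches the namespace's u64 event counter, which every mount-table change bumps; that is also why the counter was widened from int above, so wraparound cannot revalidate a stale cursor.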
@@ -1661,9 +1670,9 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		if (err)
 			goto out;
 		err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
+		lock_mount_hash();
 		if (err)
 			goto out_cleanup_ids;
-		lock_mount_hash();
 		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
 			set_mnt_shared(p);
 	} else {
@@ -1690,6 +1699,11 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	return 0;
 
  out_cleanup_ids:
+	while (!hlist_empty(&tree_list)) {
+		child = hlist_entry(tree_list.first, struct mount, mnt_hash);
+		umount_tree(child, 0);
+	}
+	unlock_mount_hash();
 	cleanup_group_ids(source_mnt, NULL);
  out:
 	return err;
@@ -2044,7 +2058,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
 	struct mount *parent;
 	int err;
 
-	mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT);
+	mnt_flags &= ~MNT_INTERNAL_FLAGS;
 
 	mp = lock_mount(path);
 	if (IS_ERR(mp))
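The open-coded mask becomes MNT_INTERNAL_FLAGS. Assuming the v3.15 include/linux/mount.h, the macro looks roughly like the following, with MNT_MARKED (the flag newly stripped in the clone_mnt hunk above) folded in; quoted from memory, so verify against the header before relying on it:

#define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)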
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index c320ac52353e..08b8ea8c353e 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -339,7 +339,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
 	if (val)
 		goto finished;
 
-	DDPRINTK("ncp_lookup_validate: %pd2 not valid, age=%ld, server lookup\n",
+	ncp_dbg(2, "%pd2 not valid, age=%ld, server lookup\n",
 		dentry, NCP_GET_AGE(dentry));
 
 	len = sizeof(__name);
@@ -358,7 +358,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
 		res = ncp_obtain_info(server, dir, __name, &(finfo.i));
 	}
 	finfo.volume = finfo.i.volNumber;
-	DDPRINTK("ncp_lookup_validate: looked for %pd/%s, res=%d\n",
+	ncp_dbg(2, "looked for %pd/%s, res=%d\n",
 		dentry->d_parent, __name, res);
 	/*
 	 * If we didn't find it, or if it has a different dirEntNum to
@@ -372,14 +372,14 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
 			ncp_new_dentry(dentry);
 			val=1;
 		} else
-			DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n");
+			ncp_dbg(2, "found, but dirEntNum changed\n");
 
 		ncp_update_inode2(inode, &finfo);
 		mutex_unlock(&inode->i_mutex);
 	}
 
 finished:
-	DDPRINTK("ncp_lookup_validate: result=%d\n", val);
+	ncp_dbg(2, "result=%d\n", val);
 	dput(parent);
 	return val;
 }
@@ -453,8 +453,7 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx)
 	ctl.page  = NULL;
 	ctl.cache = NULL;
 
-	DDPRINTK("ncp_readdir: reading %pD2, pos=%d\n", file,
-		(int) ctx->pos);
+	ncp_dbg(2, "reading %pD2, pos=%d\n", file, (int)ctx->pos);
 
 	result = -EIO;
 	/* Do not generate '.' and '..' when server is dead. */
@@ -697,8 +696,7 @@ ncp_read_volume_list(struct file *file, struct dir_context *ctx,
 	struct ncp_entry_info entry;
 	int i;
 
-	DPRINTK("ncp_read_volume_list: pos=%ld\n",
-		(unsigned long) ctx->pos);
+	ncp_dbg(1, "pos=%ld\n", (unsigned long)ctx->pos);
 
 	for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
 		int inval_dentry;
@@ -708,12 +706,11 @@ ncp_read_volume_list(struct file *file, struct dir_context *ctx,
 		if (!strlen(info.volume_name))
 			continue;
 
-		DPRINTK("ncp_read_volume_list: found vol: %s\n",
-			info.volume_name);
+		ncp_dbg(1, "found vol: %s\n", info.volume_name);
 
 		if (ncp_lookup_volume(server, info.volume_name,
 				      &entry.i)) {
-			DPRINTK("ncpfs: could not lookup vol %s\n",
+			ncp_dbg(1, "could not lookup vol %s\n",
 				info.volume_name);
 			continue;
 		}
@@ -738,14 +735,13 @@ ncp_do_readdir(struct file *file, struct dir_context *ctx,
 	int more;
 	size_t bufsize;
 
-	DPRINTK("ncp_do_readdir: %pD2, fpos=%ld\n", file,
-		(unsigned long) ctx->pos);
-	PPRINTK("ncp_do_readdir: init %pD, volnum=%d, dirent=%u\n",
-		file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum);
+	ncp_dbg(1, "%pD2, fpos=%ld\n", file, (unsigned long)ctx->pos);
+	ncp_vdbg("init %pD, volnum=%d, dirent=%u\n",
+		 file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum);
 
 	err = ncp_initialize_search(server, dir, &seq);
 	if (err) {
-		DPRINTK("ncp_do_readdir: init failed, err=%d\n", err);
+		ncp_dbg(1, "init failed, err=%d\n", err);
 		return;
 	}
 	/* We MUST NOT use server->buffer_size handshaked with server if we are
@@ -808,8 +804,7 @@ int ncp_conn_logged_in(struct super_block *sb)
 			goto out;
 		result = -ENOENT;
 		if (ncp_get_volume_root(server, __name, &volNumber, &dirEntNum, &DosDirNum)) {
-			PPRINTK("ncp_conn_logged_in: %s not found\n",
-				server->m.mounted_vol);
+			ncp_vdbg("%s not found\n", server->m.mounted_vol);
 			goto out;
 		}
 		dent = sb->s_root;
@@ -822,10 +817,10 @@ int ncp_conn_logged_in(struct super_block *sb)
 				NCP_FINFO(ino)->DosDirNum = DosDirNum;
 				result = 0;
 			} else {
-				DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n");
+				ncp_dbg(1, "sb->s_root->d_inode == NULL!\n");
 			}
 		} else {
-			DPRINTK("ncpfs: sb->s_root == NULL!\n");
+			ncp_dbg(1, "sb->s_root == NULL!\n");
 		}
 	} else
 		result = 0;
@@ -846,7 +841,7 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig
 	if (!ncp_conn_valid(server))
 		goto finished;
 
-	PPRINTK("ncp_lookup: server lookup for %pd2\n", dentry);
+	ncp_vdbg("server lookup for %pd2\n", dentry);
 
 	len = sizeof(__name);
 	if (ncp_is_server_root(dir)) {
@@ -854,15 +849,15 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig
 			dentry->d_name.len, 1);
 		if (!res)
 			res = ncp_lookup_volume(server, __name, &(finfo.i));
 		if (!res)
 			ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
 	} else {
 		res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
 				 dentry->d_name.len, !ncp_preserve_case(dir));
 		if (!res)
 			res = ncp_obtain_info(server, dir, __name, &(finfo.i));
 	}
-	PPRINTK("ncp_lookup: looked for %pd2, res=%d\n", dentry, res);
+	ncp_vdbg("looked for %pd2, res=%d\n", dentry, res);
 	/*
 	 * If we didn't find an entry, make a negative dentry.
 	 */
@@ -886,7 +881,7 @@ add_entry:
 	}
 
 finished:
-	PPRINTK("ncp_lookup: result=%d\n", error);
+	ncp_vdbg("result=%d\n", error);
 	return ERR_PTR(error);
 }
 
@@ -909,7 +904,7 @@ out:
 	return error;
 
 out_close:
-	PPRINTK("ncp_instantiate: %pd2 failed, closing file\n", dentry);
+	ncp_vdbg("%pd2 failed, closing file\n", dentry);
 	ncp_close_file(NCP_SERVER(dir), finfo->file_handle);
 	goto out;
 }
@@ -923,7 +918,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode,
 	int opmode;
 	__u8 __name[NCP_MAXPATHLEN + 1];
 
-	PPRINTK("ncp_create_new: creating %pd2, mode=%hx\n", dentry, mode);
+	ncp_vdbg("creating %pd2, mode=%hx\n", dentry, mode);
 
 	ncp_age_dentry(server, dentry);
 	len = sizeof(__name);
@@ -952,7 +947,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode,
 			error = -ENAMETOOLONG;
 		else if (result < 0)
 			error = result;
-		DPRINTK("ncp_create: %pd2 failed\n", dentry);
+		ncp_dbg(1, "%pd2 failed\n", dentry);
 		goto out;
 	}
 	opmode = O_WRONLY;
@@ -985,7 +980,7 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	int error, len;
 	__u8 __name[NCP_MAXPATHLEN + 1];
 
-	DPRINTK("ncp_mkdir: making %pd2\n", dentry);
+	ncp_dbg(1, "making %pd2\n", dentry);
 
 	ncp_age_dentry(server, dentry);
 	len = sizeof(__name);
@@ -1022,7 +1017,7 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
 	int error, result, len;
 	__u8 __name[NCP_MAXPATHLEN + 1];
 
-	DPRINTK("ncp_rmdir: removing %pd2\n", dentry);
+	ncp_dbg(1, "removing %pd2\n", dentry);
 
 	len = sizeof(__name);
 	error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -1067,13 +1062,13 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
 	int error;
 
 	server = NCP_SERVER(dir);
-	DPRINTK("ncp_unlink: unlinking %pd2\n", dentry);
+	ncp_dbg(1, "unlinking %pd2\n", dentry);
 
 	/*
 	 * Check whether to close the file ...
 	 */
 	if (inode) {
-		PPRINTK("ncp_unlink: closing file\n");
+		ncp_vdbg("closing file\n");
 		ncp_make_closed(inode);
 	}
 
@@ -1087,7 +1082,7 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
 #endif
 	switch (error) {
 		case 0x00:
-			DPRINTK("ncp: removed %pd2\n", dentry);
+			ncp_dbg(1, "removed %pd2\n", dentry);
 			break;
 		case 0x85:
 		case 0x8A:
@@ -1120,7 +1115,7 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int old_len, new_len;
 	__u8 __old_name[NCP_MAXPATHLEN + 1], __new_name[NCP_MAXPATHLEN + 1];
 
-	DPRINTK("ncp_rename: %pd2 to %pd2\n", old_dentry, new_dentry);
+	ncp_dbg(1, "%pd2 to %pd2\n", old_dentry, new_dentry);
 
 	ncp_age_dentry(server, old_dentry);
 	ncp_age_dentry(server, new_dentry);
@@ -1150,8 +1145,8 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
 #endif
 	switch (error) {
 		case 0x00:
-			DPRINTK("ncp renamed %pd -> %pd.\n",
-				old_dentry, new_dentry);
+			ncp_dbg(1, "renamed %pd -> %pd\n",
+				old_dentry, new_dentry);
 			break;
 		case 0x9E:
 			error = -ENAMETOOLONG;
@@ -1173,7 +1168,7 @@ static int ncp_mknod(struct inode * dir, struct dentry *dentry,
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 	if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) {
-		DPRINTK(KERN_DEBUG "ncp_mknod: mode = 0%ho\n", mode);
+		ncp_dbg(1, "mode = 0%ho\n", mode);
 		return ncp_create_new(dir, dentry, mode, rdev, 0);
 	}
 	return -EPERM; /* Strange, but true */
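All of the ncpfs hunks replace the ad-hoc DPRINTK/DDPRINTK/PPRINTK macros with ncp_dbg()/ncp_vdbg(), which also drop the hand-written function-name prefixes (pr_debug with dynamic debug can add those itself). The exact definitions live in fs/ncpfs/ncp_fs.h; a hedged sketch of their likely shape, where the verbosity-threshold name is illustrative:

/* Rough sketch only; consult fs/ncpfs/ncp_fs.h for the real macros. */
#define ncp_dbg(level, fmt, ...)				\
do {								\
	if ((level) <= NCPFS_DEBUG_VERBOSITY)			\
		pr_debug(fmt, ##__VA_ARGS__);			\
} while (0)

#define ncp_vdbg(fmt, ...)	ncp_dbg(2, fmt, ##__VA_ARGS__)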
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 8f5074e1ecb9..77640a8bfb87 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -6,6 +6,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <asm/uaccess.h>
 
 #include <linux/time.h>
@@ -34,11 +36,11 @@ int ncp_make_open(struct inode *inode, int right)
 
 	error = -EINVAL;
 	if (!inode) {
-		printk(KERN_ERR "ncp_make_open: got NULL inode\n");
+		pr_err("%s: got NULL inode\n", __func__);
 		goto out;
 	}
 
-	DPRINTK("ncp_make_open: opened=%d, volume # %u, dir entry # %u\n",
+	ncp_dbg(1, "opened=%d, volume # %u, dir entry # %u\n",
 		atomic_read(&NCP_FINFO(inode)->opened),
 		NCP_FINFO(inode)->volNumber,
 		NCP_FINFO(inode)->dirEntNum);
@@ -71,7 +73,7 @@ int ncp_make_open(struct inode *inode, int right)
 			break;
 	}
 	if (result) {
-		PPRINTK("ncp_make_open: failed, result=%d\n", result);
+		ncp_vdbg("failed, result=%d\n", result);
 		goto out_unlock;
 	}
 	/*
@@ -83,7 +85,7 @@ int ncp_make_open(struct inode *inode, int right)
 	}
 
 	access = NCP_FINFO(inode)->access;
-	PPRINTK("ncp_make_open: file open, access=%x\n", access);
+	ncp_vdbg("file open, access=%x\n", access);
 	if (access == right || access == O_RDWR) {
 		atomic_inc(&NCP_FINFO(inode)->opened);
 		error = 0;
@@ -107,7 +109,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	void* freepage;
 	size_t freelen;
 
-	DPRINTK("ncp_file_read: enter %pd2\n", dentry);
+	ncp_dbg(1, "enter %pd2\n", dentry);
 
 	pos = *ppos;
 
@@ -124,7 +126,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 
 	error = ncp_make_open(inode, O_RDONLY);
 	if (error) {
-		DPRINTK(KERN_ERR "ncp_file_read: open failed, error=%d\n", error);
+		ncp_dbg(1, "open failed, error=%d\n", error);
 		return error;
 	}
 
@@ -165,7 +167,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 
 	file_accessed(file);
 
-	DPRINTK("ncp_file_read: exit %pd2\n", dentry);
+	ncp_dbg(1, "exit %pd2\n", dentry);
 outrel:
 	ncp_inode_close(inode);
 	return already_read ? already_read : error;
@@ -182,7 +184,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 	int errno;
 	void* bouncebuffer;
 
-	DPRINTK("ncp_file_write: enter %pd2\n", dentry);
+	ncp_dbg(1, "enter %pd2\n", dentry);
 	if ((ssize_t) count < 0)
 		return -EINVAL;
 	pos = *ppos;
@@ -211,7 +213,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 		return 0;
 	errno = ncp_make_open(inode, O_WRONLY);
 	if (errno) {
-		DPRINTK(KERN_ERR "ncp_file_write: open failed, error=%d\n", errno);
+		ncp_dbg(1, "open failed, error=%d\n", errno);
 		return errno;
 	}
 	bufsize = NCP_SERVER(inode)->buffer_size;
@@ -261,7 +263,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 		i_size_write(inode, pos);
 		mutex_unlock(&inode->i_mutex);
 	}
-	DPRINTK("ncp_file_write: exit %pd2\n", dentry);
+	ncp_dbg(1, "exit %pd2\n", dentry);
 outrel:
 	ncp_inode_close(inode);
 	return already_written ? already_written : errno;
@@ -269,7 +271,7 @@ outrel:
 
 static int ncp_release(struct inode *inode, struct file *file) {
 	if (ncp_make_closed(inode)) {
-		DPRINTK("ncp_release: failed to close\n");
+		ncp_dbg(1, "failed to close\n");
 	}
 	return 0;
 }
diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c
index 0af3349de851..03ffde1f44d6 100644
--- a/fs/ncpfs/getopt.c
+++ b/fs/ncpfs/getopt.c
@@ -2,6 +2,8 @@
  * getopt.c
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/kernel.h>
 #include <linux/string.h>
 
@@ -46,8 +48,8 @@ int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts
 			if (opts->has_arg & OPT_NOPARAM) {
 				return opts->val;
 			}
-			printk(KERN_INFO "%s: the %s option requires an argument\n",
-			       caller, token);
+			pr_info("%s: the %s option requires an argument\n",
+				caller, token);
 			return -EINVAL;
 		}
 		if (opts->has_arg & OPT_INT) {
@@ -57,18 +59,18 @@ int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts
 			if (!*v) {
 				return opts->val;
 			}
-			printk(KERN_INFO "%s: invalid numeric value in %s=%s\n",
-			       caller, token, val);
+			pr_info("%s: invalid numeric value in %s=%s\n",
+				caller, token, val);
 			return -EDOM;
 		}
 		if (opts->has_arg & OPT_STRING) {
 			return opts->val;
 		}
-		printk(KERN_INFO "%s: unexpected argument %s to the %s option\n",
-		       caller, val, token);
+		pr_info("%s: unexpected argument %s to the %s option\n",
+			caller, val, token);
 		return -EINVAL;
 		}
 	}
-	printk(KERN_INFO "%s: Unrecognized mount option %s\n", caller, token);
+	pr_info("%s: Unrecognized mount option %s\n", caller, token);
 	return -EOPNOTSUPP;
 }
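
The pr_fmt() definition added above getopt.c's includes makes every pr_info()/pr_err() in the file carry the module name automatically; the prefix is pasted into the format string at compile time. A minimal sketch of the mechanism (standalone illustration, not part of this patch):

	/* pr_fmt() must be defined before printk.h is pulled in. */
	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

	#include <linux/kernel.h>

	static void demo(void)
	{
		/* In the ncpfs module this prints: "ncpfs: bad option 7" */
		pr_info("bad option %d\n", 7);
	}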
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 2cf2ebecb55f..e31e589369a4 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -9,6 +9,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 
 #include <asm/uaccess.h>
@@ -99,6 +101,7 @@ static void destroy_inodecache(void)
 
 static int ncp_remount(struct super_block *sb, int *flags, char* data)
 {
+	sync_filesystem(sb);
 	*flags |= MS_NODIRATIME;
 	return 0;
 }
@@ -132,7 +135,7 @@ void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo)
 	NCP_FINFO(inode)->access = nwinfo->access;
 	memcpy(NCP_FINFO(inode)->file_handle, nwinfo->file_handle,
 		sizeof(nwinfo->file_handle));
-	DPRINTK("ncp_update_inode: updated %s, volnum=%d, dirent=%u\n",
+	ncp_dbg(1, "updated %s, volnum=%d, dirent=%u\n",
 		nwinfo->i.entryName, NCP_FINFO(inode)->volNumber,
 		NCP_FINFO(inode)->dirEntNum);
 }
@@ -140,8 +143,7 @@ void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo)
 static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi)
 {
 	/* NFS namespace mode overrides others if it's set. */
-	DPRINTK(KERN_DEBUG "ncp_update_dates_and_mode: (%s) nfs.mode=0%o\n",
-		nwi->entryName, nwi->nfs.mode);
+	ncp_dbg(1, "(%s) nfs.mode=0%o\n", nwi->entryName, nwi->nfs.mode);
 	if (nwi->nfs.mode) {
 		/* XXX Security? */
 		inode->i_mode = nwi->nfs.mode;
@@ -229,7 +231,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
 
 	ncp_update_attrs(inode, nwinfo);
 
-	DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode);
+	ncp_dbg(2, "inode->i_mode = %u\n", inode->i_mode);
 
 	set_nlink(inode, 1);
 	inode->i_uid = server->m.uid;
@@ -257,7 +259,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
 	struct inode *inode;
 
 	if (info == NULL) {
-		printk(KERN_ERR "ncp_iget: info is NULL\n");
+		pr_err("%s: info is NULL\n", __func__);
 		return NULL;
 	}
 
@@ -289,23 +291,23 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
 		}
 		insert_inode_hash(inode);
 	} else
-		printk(KERN_ERR "ncp_iget: iget failed!\n");
+		pr_err("%s: iget failed!\n", __func__);
 	return inode;
 }
 
 static void
 ncp_evict_inode(struct inode *inode)
 {
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 
 	if (S_ISDIR(inode->i_mode)) {
-		DDPRINTK("ncp_evict_inode: put directory %ld\n", inode->i_ino);
+		ncp_dbg(2, "put directory %ld\n", inode->i_ino);
 	}
 
 	if (ncp_make_closed(inode) != 0) {
 		/* We can't do anything but complain. */
-		printk(KERN_ERR "ncp_evict_inode: could not close\n");
+		pr_err("%s: could not close\n", __func__);
 	}
 }
 
@@ -468,9 +470,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 {
 	struct ncp_mount_data_kernel data;
 	struct ncp_server *server;
-	struct file *ncp_filp;
 	struct inode *root_inode;
-	struct inode *sock_inode;
 	struct socket *sock;
 	int error;
 	int default_bufsize;
@@ -539,18 +539,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) ||
 	    !gid_valid(data.gid))
 		goto out;
-	error = -EBADF;
-	ncp_filp = fget(data.ncp_fd);
-	if (!ncp_filp)
-		goto out;
-	error = -ENOTSOCK;
-	sock_inode = file_inode(ncp_filp);
-	if (!S_ISSOCK(sock_inode->i_mode))
-		goto out_fput;
-	sock = SOCKET_I(sock_inode);
+	sock = sockfd_lookup(data.ncp_fd, &error);
 	if (!sock)
-		goto out_fput;
+		goto out;
 
 	if (sock->type == SOCK_STREAM)
 		default_bufsize = 0xF000;
 	else
@@ -572,27 +564,16 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (error)
 		goto out_fput;
 
-	server->ncp_filp = ncp_filp;
 	server->ncp_sock = sock;
 
 	if (data.info_fd != -1) {
-		struct socket *info_sock;
-
-		error = -EBADF;
-		server->info_filp = fget(data.info_fd);
-		if (!server->info_filp)
-			goto out_bdi;
-		error = -ENOTSOCK;
-		sock_inode = file_inode(server->info_filp);
-		if (!S_ISSOCK(sock_inode->i_mode))
-			goto out_fput2;
-		info_sock = SOCKET_I(sock_inode);
+		struct socket *info_sock = sockfd_lookup(data.info_fd, &error);
 		if (!info_sock)
-			goto out_fput2;
+			goto out_bdi;
+		server->info_sock = info_sock;
 		error = -EBADFD;
 		if (info_sock->type != SOCK_STREAM)
 			goto out_fput2;
-		server->info_sock = info_sock;
 	}
 
 /* server->lock = 0;	*/
@@ -620,7 +601,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	   now because of PATH_MAX changes.. */
 	if (server->m.time_out < 1) {
 		server->m.time_out = 10;
-		printk(KERN_INFO "You need to recompile your ncpfs utils..\n");
+		pr_info("You need to recompile your ncpfs utils..\n");
 	}
 	server->m.time_out = server->m.time_out * HZ / 100;
 	server->m.file_mode = (server->m.file_mode & S_IRWXUGO) | S_IFREG;
@@ -681,7 +662,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	ncp_unlock_server(server);
 	if (error < 0)
 		goto out_rxbuf;
-	DPRINTK("ncp_fill_super: NCP_SBP(sb) = %x\n", (int) NCP_SBP(sb));
+	ncp_dbg(1, "NCP_SBP(sb) = %p\n", NCP_SBP(sb));
 
 	error = -EMSGSIZE;	/* -EREMOTESIDEINCOMPATIBLE */
 #ifdef CONFIG_NCPFS_PACKET_SIGNING
@@ -709,7 +690,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (ncp_negotiate_buffersize(server, default_bufsize,
 		&(server->buffer_size)) != 0)
 		goto out_disconnect;
-	DPRINTK("ncpfs: bufsize = %d\n", server->buffer_size);
+	ncp_dbg(1, "bufsize = %d\n", server->buffer_size);
 
 	memset(&finfo, 0, sizeof(finfo));
 	finfo.i.attributes = aDIR;
@@ -738,7 +719,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 	root_inode = ncp_iget(sb, &finfo);
 	if (!root_inode)
 		goto out_disconnect;
-	DPRINTK("ncp_fill_super: root vol=%d\n", NCP_FINFO(root_inode)->volNumber);
+	ncp_dbg(1, "root vol=%d\n", NCP_FINFO(root_inode)->volNumber);
 	sb->s_root = d_make_root(root_inode);
 	if (!sb->s_root)
 		goto out_disconnect;
@@ -764,17 +745,12 @@ out_nls:
 	mutex_destroy(&server->root_setup_lock);
 	mutex_destroy(&server->mutex);
 out_fput2:
-	if (server->info_filp)
-		fput(server->info_filp);
+	if (server->info_sock)
+		sockfd_put(server->info_sock);
out_bdi:
 	bdi_destroy(&server->bdi);
out_fput:
-	/* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
-	 *
-	 * The previously used put_filp(ncp_filp); was bogus, since
-	 * it doesn't perform proper unlocking.
-	 */
-	fput(ncp_filp);
+	sockfd_put(sock);
out:
 	put_pid(data.wdog_pid);
 	sb->s_fs_info = NULL;
@@ -807,9 +783,9 @@ static void ncp_put_super(struct super_block *sb)
 	mutex_destroy(&server->root_setup_lock);
 	mutex_destroy(&server->mutex);
 
-	if (server->info_filp)
-		fput(server->info_filp);
-	fput(server->ncp_filp);
+	if (server->info_sock)
+		sockfd_put(server->info_sock);
+	sockfd_put(server->ncp_sock);
 	kill_pid(server->m.wdog_pid, SIGTERM, 1);
 	put_pid(server->m.wdog_pid);
 
@@ -984,8 +960,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
 	if ((attr->ia_valid & ATTR_SIZE) != 0) {
 		int written;
 
-		DPRINTK("ncpfs: trying to change size to %ld\n",
-			attr->ia_size);
+		ncp_dbg(1, "trying to change size to %llu\n", attr->ia_size);
 
 		if ((result = ncp_make_open(inode, O_WRONLY)) < 0) {
 			result = -EACCES;
@@ -1071,7 +1046,7 @@ MODULE_ALIAS_FS("ncpfs");
 static int __init init_ncp_fs(void)
 {
 	int err;
-	DPRINTK("ncpfs: init_ncp_fs called\n");
+	ncp_dbg(1, "called\n");
 
 	err = init_inodecache();
 	if (err)
@@ -1088,7 +1063,7 @@ out1:
 
 static void __exit exit_ncp_fs(void)
 {
-	DPRINTK("ncpfs: exit_ncp_fs called\n");
+	ncp_dbg(1, "called\n");
 	unregister_filesystem(&ncp_fs_type);
 	destroy_inodecache();
 }
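
The two ncp_fill_super() conversions above collapse the open-coded fget()/S_ISSOCK()/SOCKET_I() sequence into sockfd_lookup(), which resolves a file descriptor to its struct socket and reports -EBADF or -ENOTSOCK through its second argument; sockfd_put() drops the file reference the lookup took. The pairing in isolation (a sketch; the fd variable is illustrative):

	int err;
	struct socket *sock = sockfd_lookup(fd, &err);

	if (!sock)
		return err;	/* -EBADF or -ENOTSOCK */
	/* ... use the socket ... */
	sockfd_put(sock);	/* release the reference from the lookup */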
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 60426ccb3b65..d5659d96ee7f 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -41,7 +41,7 @@ ncp_get_fs_info(struct ncp_server * server, struct inode *inode,
 		return -EFAULT;
 
 	if (info.version != NCP_GET_FS_INFO_VERSION) {
-		DPRINTK("info.version invalid: %d\n", info.version);
+		ncp_dbg(1, "info.version invalid: %d\n", info.version);
 		return -EINVAL;
 	}
 	/* TODO: info.addr = server->m.serv_addr; */
@@ -66,7 +66,7 @@ ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode,
 		return -EFAULT;
 
 	if (info2.version != NCP_GET_FS_INFO_VERSION_V2) {
-		DPRINTK("info.version invalid: %d\n", info2.version);
+		ncp_dbg(1, "info.version invalid: %d\n", info2.version);
 		return -EINVAL;
 	}
 	info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
@@ -132,7 +132,7 @@ ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode,
 		return -EFAULT;
 
 	if (info2.version != NCP_GET_FS_INFO_VERSION_V2) {
-		DPRINTK("info.version invalid: %d\n", info2.version);
+		ncp_dbg(1, "info.version invalid: %d\n", info2.version);
 		return -EINVAL;
 	}
 	info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
@@ -308,8 +308,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
 			else
 				result = server->reply_size;
 			ncp_unlock_server(server);
-			DPRINTK("ncp_ioctl: copy %d bytes\n",
-				result);
+			ncp_dbg(1, "copy %d bytes\n", result);
 			if (result >= 0)
 				if (copy_to_user(request.data, bouncebuffer, result))
 					result = -EFAULT;
@@ -385,9 +384,9 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
 				sr.namespace = server->name_space[sr.volNumber];
 				result = 0;
 			} else
-				DPRINTK("ncpfs: s_root->d_inode==NULL\n");
+				ncp_dbg(1, "s_root->d_inode==NULL\n");
 		} else
-			DPRINTK("ncpfs: s_root==NULL\n");
+			ncp_dbg(1, "s_root==NULL\n");
 	} else {
 		sr.volNumber = -1;
 		sr.namespace = 0;
@@ -440,11 +439,11 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
 				NCP_FINFO(s_inode)->DosDirNum = dosde;
 				server->root_setuped = 1;
 			} else {
-				DPRINTK("ncpfs: s_root->d_inode==NULL\n");
+				ncp_dbg(1, "s_root->d_inode==NULL\n");
 				result = -EIO;
 			}
 		} else {
-			DPRINTK("ncpfs: s_root==NULL\n");
+			ncp_dbg(1, "s_root==NULL\n");
 			result = -EIO;
 		}
 	}
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 3c5dd55d284c..b359d12eb359 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -107,7 +107,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file_inode(file);
 
-	DPRINTK("ncp_mmap: called\n");
+	ncp_dbg(1, "called\n");
 
 	if (!ncp_conn_valid(NCP_SERVER(inode)))
 		return -EIO;
diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h
index 31831afe1c3b..b9f69e1b1f43 100644
--- a/fs/ncpfs/ncp_fs.h
+++ b/fs/ncpfs/ncp_fs.h
@@ -2,30 +2,32 @@
 #include "ncp_fs_i.h"
 #include "ncp_fs_sb.h"
 
-/* define because it is easy to change PRINTK to {*}PRINTK */
-#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args)
-
 #undef NCPFS_PARANOIA
 #ifdef NCPFS_PARANOIA
-#define PPRINTK(format, args...) PRINTK(format , ## args)
+#define ncp_vdbg(fmt, ...) \
+	pr_debug(fmt, ##__VA_ARGS__)
 #else
-#define PPRINTK(format, args...)
+#define ncp_vdbg(fmt, ...) \
+do { \
+	if (0) \
+		pr_debug(fmt, ##__VA_ARGS__); \
+} while (0)
 #endif
 
 #ifndef DEBUG_NCP
 #define DEBUG_NCP 0
 #endif
-#if DEBUG_NCP > 0
-#define DPRINTK(format, args...) PRINTK(format , ## args)
-#else
-#define DPRINTK(format, args...)
-#endif
-#if DEBUG_NCP > 1
-#define DDPRINTK(format, args...) PRINTK(format , ## args)
-#else
-#define DDPRINTK(format, args...)
+
+#if DEBUG_NCP > 0 && !defined(DEBUG)
+#define DEBUG
 #endif
 
+#define ncp_dbg(level, fmt, ...) \
+do { \
+	if (level <= DEBUG_NCP) \
+		pr_debug(fmt, ##__VA_ARGS__); \
+} while (0)
+
 #define NCP_MAX_RPC_TIMEOUT (6*HZ)
 
 
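
Both replacement macros above funnel into pr_debug(), so ncpfs messages now participate in dynamic debug, and the `#define DEBUG` shim keeps the old DEBUG_NCP build knob working. The `if (0)` arm of the disabled ncp_vdbg() still type-checks its arguments without generating code. Typical call sites, as converted throughout this diff:

	ncp_dbg(1, "enter %pd2\n", dentry);			/* level-1 tracing */
	ncp_dbg(2, "inode->i_mode = %u\n", inode->i_mode);	/* level-2, verbose */
	ncp_vdbg("completion code=%x\n", result);		/* NCPFS_PARANOIA only */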
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
index b81e97adc5a9..55e26fd80886 100644
--- a/fs/ncpfs/ncp_fs_sb.h
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -45,9 +45,7 @@ struct ncp_server {
 
 	__u8 name_space[NCP_NUMBER_OF_VOLUMES + 2];
 
-	struct file *ncp_filp;	/* File pointer to ncp socket */
 	struct socket *ncp_sock;/* ncp socket */
-	struct file *info_filp;
 	struct socket *info_sock;
 
 	u8 sequence;
@@ -111,7 +109,7 @@ struct ncp_server {
 
 	spinlock_t requests_lock;	/* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */
 
-	void (*data_ready)(struct sock* sk, int len);
+	void (*data_ready)(struct sock* sk);
 	void (*error_report)(struct sock* sk);
 	void (*write_space)(struct sock* sk);	/* STREAM mode only */
 	struct {
@@ -153,7 +151,7 @@ extern void ncp_tcp_tx_proc(struct work_struct *work);
 extern void ncpdgram_rcv_proc(struct work_struct *work);
 extern void ncpdgram_timeout_proc(struct work_struct *work);
 extern void ncpdgram_timeout_call(unsigned long server);
-extern void ncp_tcp_data_ready(struct sock* sk, int len);
+extern void ncp_tcp_data_ready(struct sock* sk);
 extern void ncp_tcp_write_space(struct sock* sk);
 extern void ncp_tcp_error_report(struct sock* sk);
 
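
The one-argument data_ready prototype tracks the v3.15 networking change that dropped the unused byte count from sk->sk_data_ready(). ncpfs saves the socket's original callback in server->data_ready and chains to it, so the saved pointer must use the same signature; the install side looks roughly like this (a sketch with assumed surrounding context; the save happens at mount time):

	sk->sk_user_data = server;
	server->data_ready = sk->sk_data_ready;	/* save the original callback */
	sk->sk_data_ready = ncp_tcp_data_ready;	/* new one-argument form */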
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 981a95617fc9..482387532f54 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -9,14 +9,14 @@
  *
  */
 
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include "ncp_fs.h"
 
 static inline void assert_server_locked(struct ncp_server *server)
 {
 	if (server->lock == 0) {
-		DPRINTK("ncpfs: server not locked!\n");
+		ncp_dbg(1, "server not locked!\n");
 	}
 }
 
@@ -75,7 +75,7 @@ static void ncp_add_pstring(struct ncp_server *server, const char *s)
 	int len = strlen(s);
 	assert_server_locked(server);
 	if (len > 255) {
-		DPRINTK("ncpfs: string too long: %s\n", s);
+		ncp_dbg(1, "string too long: %s\n", s);
 		len = 255;
 	}
 	ncp_add_byte(server, len);
@@ -225,7 +225,7 @@ int ncp_get_volume_info_with_number(struct ncp_server* server,
 	result = -EIO;
 	len = ncp_reply_byte(server, 29);
 	if (len > NCP_VOLNAME_LEN) {
-		DPRINTK("ncpfs: volume name too long: %d\n", len);
+		ncp_dbg(1, "volume name too long: %d\n", len);
 		goto out;
 	}
 	memcpy(&(target->volume_name), ncp_reply_data(server, 30), len);
@@ -259,7 +259,7 @@ int ncp_get_directory_info(struct ncp_server* server, __u8 n,
 	result = -EIO;
 	len = ncp_reply_byte(server, 21);
 	if (len > NCP_VOLNAME_LEN) {
-		DPRINTK("ncpfs: volume name too long: %d\n", len);
+		ncp_dbg(1, "volume name too long: %d\n", len);
 		goto out;
 	}
 	memcpy(&(target->volume_name), ncp_reply_data(server, 22), len);
@@ -295,9 +295,9 @@ ncp_make_closed(struct inode *inode)
 		err = ncp_close_file(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle);
 
 		if (!err)
-			PPRINTK("ncp_make_closed: volnum=%d, dirent=%u, error=%d\n",
-				NCP_FINFO(inode)->volNumber,
-				NCP_FINFO(inode)->dirEntNum, err);
+			ncp_vdbg("volnum=%d, dirent=%u, error=%d\n",
+				 NCP_FINFO(inode)->volNumber,
+				 NCP_FINFO(inode)->dirEntNum, err);
 	}
 	mutex_unlock(&NCP_FINFO(inode)->open_mutex);
 	return err;
@@ -394,8 +394,7 @@ int ncp_obtain_nfs_info(struct ncp_server *server,
 
 	if ((result = ncp_request(server, 87)) == 0) {
 		ncp_extract_nfs_info(ncp_reply_data(server, 0), &target->nfs);
-		DPRINTK(KERN_DEBUG
-			"ncp_obtain_nfs_info: (%s) mode=0%o, rdev=0x%x\n",
+		ncp_dbg(1, "(%s) mode=0%o, rdev=0x%x\n",
 			target->entryName, target->nfs.mode,
 			target->nfs.rdev);
 	} else {
@@ -425,7 +424,7 @@ int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *pa
 	int result;
 
 	if (target == NULL) {
-		printk(KERN_ERR "ncp_obtain_info: invalid call\n");
+		pr_err("%s: invalid call\n", __func__);
 		return -EINVAL;
 	}
 	ncp_init_request(server);
@@ -498,7 +497,7 @@ ncp_get_known_namespace(struct ncp_server *server, __u8 volume)
 	namespace = ncp_reply_data(server, 2);
 
 	while (no_namespaces > 0) {
-		DPRINTK("get_namespaces: found %d on %d\n", *namespace, volume);
+		ncp_dbg(1, "found %d on %d\n", *namespace, volume);
 
 #ifdef CONFIG_NCPFS_NFS_NS
 		if ((*namespace == NW_NS_NFS) && !(server->m.flags&NCP_MOUNT_NO_NFS))
@@ -531,8 +530,7 @@ ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns)
 	if (ret_ns)
 		*ret_ns = ns;
 
-	DPRINTK("lookup_vol: namespace[%d] = %d\n",
-		volume, server->name_space[volume]);
+	ncp_dbg(1, "namespace[%d] = %d\n", volume, server->name_space[volume]);
 
 	if (server->name_space[volume] == ns)
 		return 0;
@@ -596,7 +594,7 @@ ncp_get_volume_root(struct ncp_server *server,
 {
 	int result;
 
-	DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname);
+	ncp_dbg(1, "looking up vol %s\n", volname);
 
 	ncp_init_request(server);
 	ncp_add_byte(server, 22);	/* Subfunction: Generate dir handle */
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index 3a1587222c8a..471bc3d1139e 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -8,6 +8,7 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/time.h>
 #include <linux/errno.h>
@@ -96,11 +97,11 @@ static void ncp_req_put(struct ncp_request_reply *req)
 		kfree(req);
 }
 
-void ncp_tcp_data_ready(struct sock *sk, int len)
+void ncp_tcp_data_ready(struct sock *sk)
 {
 	struct ncp_server *server = sk->sk_user_data;
 
-	server->data_ready(sk, len);
+	server->data_ready(sk);
 	schedule_work(&server->rcv.tq);
 }
 
@@ -231,7 +232,7 @@ static void __ncptcp_try_send(struct ncp_server *server)
 		return;
 
 	if (result < 0) {
-		printk(KERN_ERR "ncpfs: tcp: Send failed: %d\n", result);
+		pr_err("tcp: Send failed: %d\n", result);
 		__ncp_abort_request(server, rq, result);
 		return;
 	}
@@ -332,7 +333,7 @@ static int ncp_add_request(struct ncp_server *server, struct ncp_request_reply *
 	mutex_lock(&server->rcv.creq_mutex);
 	if (!ncp_conn_valid(server)) {
 		mutex_unlock(&server->rcv.creq_mutex);
-		printk(KERN_ERR "ncpfs: tcp: Server died\n");
+		pr_err("tcp: Server died\n");
 		return -EIO;
 	}
 	ncp_req_get(req);
@@ -405,15 +406,15 @@ void ncpdgram_rcv_proc(struct work_struct *work)
 			}
 			result = _recv(sock, buf, sizeof(buf), MSG_DONTWAIT);
 			if (result < 0) {
-				DPRINTK("recv failed with %d\n", result);
+				ncp_dbg(1, "recv failed with %d\n", result);
 				continue;
 			}
 			if (result < 10) {
-				DPRINTK("too short (%u) watchdog packet\n", result);
+				ncp_dbg(1, "too short (%u) watchdog packet\n", result);
 				continue;
 			}
 			if (buf[9] != '?') {
-				DPRINTK("bad signature (%02X) in watchdog packet\n", buf[9]);
+				ncp_dbg(1, "bad signature (%02X) in watchdog packet\n", buf[9]);
 				continue;
 			}
 			buf[9] = 'Y';
@@ -448,7 +449,7 @@ void ncpdgram_rcv_proc(struct work_struct *work)
 					result -= 8;
 					hdrl = sock->sk->sk_family == AF_INET ? 8 : 6;
 					if (sign_verify_reply(server, server->rxbuf + hdrl, result - hdrl, cpu_to_le32(result), server->rxbuf + result)) {
-						printk(KERN_INFO "ncpfs: Signature violation\n");
+						pr_info("Signature violation\n");
 						result = -EIO;
 					}
 				}
@@ -524,7 +525,7 @@ static int do_tcp_rcv(struct ncp_server *server, void *buffer, size_t len)
 		return result;
 	}
 	if (result > len) {
-		printk(KERN_ERR "ncpfs: tcp: bug in recvmsg (%u > %Zu)\n", result, len);
+		pr_err("tcp: bug in recvmsg (%u > %Zu)\n", result, len);
 		return -EIO;
 	}
 	return result;
@@ -552,9 +553,9 @@ static int __ncptcp_rcv_proc(struct ncp_server *server)
 				__ncptcp_abort(server);
 			}
 			if (result < 0) {
-				printk(KERN_ERR "ncpfs: tcp: error in recvmsg: %d\n", result);
+				pr_err("tcp: error in recvmsg: %d\n", result);
 			} else {
-				DPRINTK(KERN_ERR "ncpfs: tcp: EOF\n");
+				ncp_dbg(1, "tcp: EOF\n");
 			}
 			return -EIO;
 		}
@@ -566,20 +567,20 @@ static int __ncptcp_rcv_proc(struct ncp_server *server)
 		switch (server->rcv.state) {
 		case 0:
 			if (server->rcv.buf.magic != htonl(NCP_TCP_RCVD_MAGIC)) {
-				printk(KERN_ERR "ncpfs: tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic));
+				pr_err("tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic));
 				__ncptcp_abort(server);
 				return -EIO;
 			}
 			datalen = ntohl(server->rcv.buf.len) & 0x0FFFFFFF;
 			if (datalen < 10) {
-				printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen);
+				pr_err("tcp: Unexpected reply len %d\n", datalen);
 				__ncptcp_abort(server);
 				return -EIO;
 			}
#ifdef CONFIG_NCPFS_PACKET_SIGNING
 			if (server->sign_active) {
 				if (datalen < 18) {
-					printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen);
+					pr_err("tcp: Unexpected reply len %d\n", datalen);
 					__ncptcp_abort(server);
 					return -EIO;
 				}
@@ -604,7 +605,7 @@ cont:;
 				server->rcv.len = datalen - 10;
 				break;
 			}
-			DPRINTK("ncpfs: tcp: Unexpected NCP type %02X\n", type);
+			ncp_dbg(1, "tcp: Unexpected NCP type %02X\n", type);
skipdata2:;
 			server->rcv.state = 2;
skipdata:;
@@ -614,11 +615,11 @@ skipdata:;
 			}
 			req = server->rcv.creq;
 			if (!req) {
-				DPRINTK(KERN_ERR "ncpfs: Reply without appropriate request\n");
+				ncp_dbg(1, "Reply without appropriate request\n");
 				goto skipdata2;
 			}
 			if (datalen > req->datalen + 8) {
-				printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8);
+				pr_err("tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8);
 				server->rcv.state = 3;
 				goto skipdata;
 			}
@@ -638,12 +639,12 @@ skipdata:;
 			req = server->rcv.creq;
 			if (req->tx_type != NCP_ALLOC_SLOT_REQUEST) {
 				if (((struct ncp_reply_header*)server->rxbuf)->sequence != server->sequence) {
-					printk(KERN_ERR "ncpfs: tcp: Bad sequence number\n");
+					pr_err("tcp: Bad sequence number\n");
 					__ncp_abort_request(server, req, -EIO);
 					return -EIO;
 				}
 				if ((((struct ncp_reply_header*)server->rxbuf)->conn_low | (((struct ncp_reply_header*)server->rxbuf)->conn_high << 8)) != server->connection) {
-					printk(KERN_ERR "ncpfs: tcp: Connection number mismatch\n");
+					pr_err("tcp: Connection number mismatch\n");
 					__ncp_abort_request(server, req, -EIO);
 					return -EIO;
 				}
@@ -651,7 +652,7 @@ skipdata:;
#ifdef CONFIG_NCPFS_PACKET_SIGNING
 			if (server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) {
 				if (sign_verify_reply(server, server->rxbuf + 6, req->datalen - 6, cpu_to_be32(req->datalen + 16), &server->rcv.buf.type)) {
-					printk(KERN_ERR "ncpfs: tcp: Signature violation\n");
+					pr_err("tcp: Signature violation\n");
 					__ncp_abort_request(server, req, -EIO);
 					return -EIO;
 				}
@@ -742,7 +743,7 @@ static int ncp_do_request(struct ncp_server *server, int size,
 	int result;
 
 	if (server->lock == 0) {
-		printk(KERN_ERR "ncpfs: Server not locked!\n");
+		pr_err("Server not locked!\n");
 		return -EIO;
 	}
 	if (!ncp_conn_valid(server)) {
@@ -781,7 +782,7 @@ static int ncp_do_request(struct ncp_server *server, int size,
 		spin_unlock_irqrestore(&current->sighand->siglock, flags);
 	}
 
-	DDPRINTK("do_ncp_rpc_call returned %d\n", result);
+	ncp_dbg(2, "do_ncp_rpc_call returned %d\n", result);
 
 	return result;
 }
@@ -811,7 +812,7 @@ int ncp_request2(struct ncp_server *server, int function,
 
 	result = ncp_do_request(server, server->current_size, reply, size);
 	if (result < 0) {
-		DPRINTK("ncp_request_error: %d\n", result);
+		ncp_dbg(1, "ncp_request_error: %d\n", result);
 		goto out;
 	}
 	server->completion = reply->completion_code;
@@ -822,7 +823,7 @@ int ncp_request2(struct ncp_server *server, int function,
 	result = reply->completion_code;
 
 	if (result != 0)
-		PPRINTK("ncp_request: completion code=%x\n", result);
+		ncp_vdbg("completion code=%x\n", result);
out:
 	return result;
 }
@@ -865,14 +866,14 @@ void ncp_lock_server(struct ncp_server *server)
 {
 	mutex_lock(&server->mutex);
 	if (server->lock)
-		printk(KERN_WARNING "ncp_lock_server: was locked!\n");
+		pr_warn("%s: was locked!\n", __func__);
 	server->lock = 1;
 }
 
 void ncp_unlock_server(struct ncp_server *server)
 {
 	if (!server->lock) {
-		printk(KERN_WARNING "ncp_unlock_server: was not locked!\n");
+		pr_warn("%s: was not locked!\n", __func__);
 		return;
 	}
 	server->lock = 0;
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index 52439ddc8de0..1a63bfdb4a65 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -112,7 +112,7 @@ int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) {
 	__le32 attr;
 	unsigned int hdr;
 
-	DPRINTK("ncp_symlink(dir=%p,dentry=%p,symname=%s)\n",dir,dentry,symname);
+	ncp_dbg(1, "dir=%p, dentry=%p, symname=%s\n", dir, dentry, symname);
 
 	if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber))
 		kludge = 0;
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 56ff823ca82e..65d849bdf77a 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1213,7 +1213,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
 	end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
 	if (end != NFS_I(inode)->npages) {
 		rcu_read_lock();
-		end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX);
+		end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
 		rcu_read_unlock();
 	}
 
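
page_cache_next_hole() is the v3.15 replacement for calling radix_tree_next_hole() directly on mapping->page_tree; the semantics are unchanged, only the interface moves to the address_space. Signature for reference (variable names illustrative):

	/* first index in [index, index + max_scan) with no page in the cache */
	pgoff_t hole = page_cache_next_hole(mapping, index, max_scan);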
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index ae2e87b95453..41db5258e7a7 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -112,7 +112,8 @@ out:
  * TODO: keep track of all layouts (and delegations) in a hash table
  * hashed by filehandle.
  */
-static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, struct nfs_fh *fh)
+static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
+		struct nfs_fh *fh, nfs4_stateid *stateid)
 {
 	struct nfs_server *server;
 	struct inode *ino;
@@ -120,17 +121,19 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
 
 	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
 		list_for_each_entry(lo, &server->layouts, plh_layouts) {
+			if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid))
+				continue;
 			if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh))
 				continue;
 			ino = igrab(lo->plh_inode);
 			if (!ino)
-				continue;
+				break;
 			spin_lock(&ino->i_lock);
 			/* Is this layout in the process of being freed? */
 			if (NFS_I(ino)->layout != lo) {
 				spin_unlock(&ino->i_lock);
 				iput(ino);
-				continue;
+				break;
 			}
 			pnfs_get_layout_hdr(lo);
 			spin_unlock(&ino->i_lock);
@@ -141,13 +144,14 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
 	return NULL;
 }
 
-static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, struct nfs_fh *fh)
+static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
+		struct nfs_fh *fh, nfs4_stateid *stateid)
 {
 	struct pnfs_layout_hdr *lo;
 
 	spin_lock(&clp->cl_lock);
 	rcu_read_lock();
-	lo = get_layout_by_fh_locked(clp, fh);
+	lo = get_layout_by_fh_locked(clp, fh, stateid);
 	rcu_read_unlock();
 	spin_unlock(&clp->cl_lock);
 
@@ -162,9 +166,9 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 	u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
 	LIST_HEAD(free_me_list);
 
-	lo = get_layout_by_fh(clp, &args->cbl_fh);
+	lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
 	if (!lo)
-		return NFS4ERR_NOMATCHING_LAYOUT;
+		goto out;
 
 	ino = lo->plh_inode;
 	spin_lock(&ino->i_lock);
@@ -179,6 +183,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 	pnfs_free_lseg_list(&free_me_list);
 	pnfs_put_layout_hdr(lo);
 	iput(ino);
+out:
 	return rv;
 }
 
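
The recall path now matches layouts on the callback's stateid as well as the filehandle. nfs4_stateid_match_other(), added in the nfs4_fs.h hunk further down this diff, deliberately ignores the seqid word and compares only the 12-byte `other` field, i.e. it asks "same state?" rather than "same state at the same sequence?":

	nfs4_stateid_match(&a, &b);		/* seqid and other must both match */
	nfs4_stateid_match_other(&a, &b);	/* only the `other' bytes compared */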
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4a48fe4b84b6..d9f3d067cd15 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -69,21 +69,28 @@ const struct address_space_operations nfs_dir_aops = {
 
 static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)
 {
+	struct nfs_inode *nfsi = NFS_I(dir);
 	struct nfs_open_dir_context *ctx;
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (ctx != NULL) {
 		ctx->duped = 0;
-		ctx->attr_gencount = NFS_I(dir)->attr_gencount;
+		ctx->attr_gencount = nfsi->attr_gencount;
 		ctx->dir_cookie = 0;
 		ctx->dup_cookie = 0;
 		ctx->cred = get_rpccred(cred);
+		spin_lock(&dir->i_lock);
+		list_add(&ctx->list, &nfsi->open_files);
+		spin_unlock(&dir->i_lock);
 		return ctx;
 	}
 	return ERR_PTR(-ENOMEM);
 }
 
-static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx)
+static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx)
 {
+	spin_lock(&dir->i_lock);
+	list_del(&ctx->list);
+	spin_unlock(&dir->i_lock);
 	put_rpccred(ctx->cred);
 	kfree(ctx);
 }
@@ -126,7 +133,7 @@ out:
 static int
 nfs_closedir(struct inode *inode, struct file *filp)
 {
-	put_nfs_open_dir_context(filp->private_data);
+	put_nfs_open_dir_context(filp->f_path.dentry->d_inode, filp->private_data);
 	return 0;
 }
 
@@ -306,10 +313,9 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 			if (printk_ratelimit()) {
 				pr_notice("NFS: directory %pD2 contains a readdir loop."
 						"Please contact your server vendor. "
-						"The file: %s has duplicate cookie %llu\n",
-						desc->file,
-						array->array[i].string.name,
-						*desc->dir_cookie);
+						"The file: %.*s has duplicate cookie %llu\n",
+						desc->file, array->array[i].string.len,
+						array->array[i].string.name, *desc->dir_cookie);
 			}
 			status = -ELOOP;
 			goto out;
@@ -437,6 +443,22 @@ void nfs_advise_use_readdirplus(struct inode *dir)
 	set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags);
 }
 
+/*
+ * This function is mainly for use by nfs_getattr().
+ *
+ * If this is an 'ls -l', we want to force use of readdirplus.
+ * Do this by checking if there is an active file descriptor
+ * and calling nfs_advise_use_readdirplus, then forcing a
+ * cache flush.
+ */
+void nfs_force_use_readdirplus(struct inode *dir)
+{
+	if (!list_empty(&NFS_I(dir)->open_files)) {
+		nfs_advise_use_readdirplus(dir);
+		nfs_zap_mapping(dir, dir->i_mapping);
+	}
+}
+
 static
 void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 {
@@ -815,6 +837,17 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc)
 	goto out;
 }
 
+static bool nfs_dir_mapping_need_revalidate(struct inode *dir)
+{
+	struct nfs_inode *nfsi = NFS_I(dir);
+
+	if (nfs_attribute_cache_expired(dir))
+		return true;
+	if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+		return true;
+	return false;
+}
+
 /* The file offset position represents the dirent entry number.  A
    last cookie cache takes care of the common case of reading the
    whole directory.
@@ -847,7 +880,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
 
 	nfs_block_sillyrename(dentry);
-	if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
+	if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode))
 		res = nfs_revalidate_mapping(inode, file->f_mapping);
 	if (res < 0)
 		goto out;
@@ -1911,6 +1944,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct inode *old_inode = old_dentry->d_inode;
 	struct inode *new_inode = new_dentry->d_inode;
 	struct dentry *dentry = NULL, *rehash = NULL;
+	struct rpc_task *task;
 	int error = -EBUSY;
 
 	dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n",
@@ -1958,8 +1992,16 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (new_inode != NULL)
 		NFS_PROTO(new_inode)->return_delegation(new_inode);
 
-	error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
-					   new_dir, &new_dentry->d_name);
+	task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
+	if (IS_ERR(task)) {
+		error = PTR_ERR(task);
+		goto out;
+	}
+
+	error = rpc_wait_for_completion_task(task);
+	if (error == 0)
+		error = task->tk_status;
+	rpc_put_task(task);
 	nfs_mark_for_revalidate(old_inode);
out:
 	if (rehash)
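
nfs_rename() now drives the operation through the same asynchronous machinery used by sillyrename and simply waits for the RPC task, so both paths share setup and completion handling. The wait pattern, extracted from the hunk above for clarity:

	task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
	if (IS_ERR(task))
		return PTR_ERR(task);

	error = rpc_wait_for_completion_task(task);	/* killable wait */
	if (error == 0)
		error = task->tk_status;	/* the RPC's own result */
	rpc_put_task(task);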
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 5bb790a69c71..284ca901fe16 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -617,6 +617,7 @@ out:
 
 static const struct vm_operations_struct nfs_file_vm_ops = {
 	.fault = filemap_fault,
+	.map_pages = filemap_map_pages,
 	.page_mkwrite = nfs_vm_page_mkwrite,
 	.remap_pages = generic_file_remap_pages,
 };
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 360114ae8b82..0c438973f3c8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -128,7 +128,7 @@ EXPORT_SYMBOL_GPL(nfs_clear_inode);
 
 void nfs_evict_inode(struct inode *inode)
 {
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	clear_inode(inode);
 	nfs_clear_inode(inode);
 }
@@ -588,6 +588,25 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
 }
 EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
 
+static void nfs_request_parent_use_readdirplus(struct dentry *dentry)
+{
+	struct dentry *parent;
+
+	parent = dget_parent(dentry);
+	nfs_force_use_readdirplus(parent->d_inode);
+	dput(parent);
+}
+
+static bool nfs_need_revalidate_inode(struct inode *inode)
+{
+	if (NFS_I(inode)->cache_validity &
+			(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
+		return true;
+	if (nfs_attribute_cache_expired(inode))
+		return true;
+	return false;
+}
+
 int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
 	struct inode *inode = dentry->d_inode;
@@ -616,10 +635,13 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	    ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
 		need_atime = 0;
 
-	if (need_atime)
-		err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
-	else
-		err = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	if (need_atime || nfs_need_revalidate_inode(inode)) {
+		struct nfs_server *server = NFS_SERVER(inode);
+
+		if (server->caps & NFS_CAP_READDIRPLUS)
+			nfs_request_parent_use_readdirplus(dentry);
+		err = __nfs_revalidate_inode(server, inode);
+	}
 	if (!err) {
 		generic_fillattr(inode, stat);
 		stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
@@ -961,9 +983,7 @@ int nfs_attribute_cache_expired(struct inode *inode)
  */
 int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 {
-	if (!(NFS_I(inode)->cache_validity &
-			(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
-			&& !nfs_attribute_cache_expired(inode))
+	if (!nfs_need_revalidate_inode(inode))
 		return NFS_STALE(inode) ? -ESTALE : 0;
 	return __nfs_revalidate_inode(server, inode);
 }
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index b46cf5a67329..dd8bfc2e2464 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -301,6 +301,7 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
 			   const char *ip_addr);
 
 /* dir.c */
+extern void nfs_force_use_readdirplus(struct inode *dir);
 extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
 					    struct shrink_control *sc);
 extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
@@ -474,6 +475,13 @@ extern int nfs_migrate_page(struct address_space *,
 #define nfs_migrate_page NULL
 #endif
 
+/* unlink.c */
+extern struct rpc_task *
+nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
+		 struct dentry *old_dentry, struct dentry *new_dentry,
+		 void (*complete)(struct rpc_task *, struct nfs_renamedata *));
+extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);
+
 /* direct.c */
 void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
 			      struct nfs_direct_req *dreq);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index a462ef0fb5d6..db60149c4579 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -479,41 +479,6 @@ nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
 }
 
 static int
-nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
-		 struct inode *new_dir, struct qstr *new_name)
-{
-	struct nfs_renameargs arg = {
-		.old_dir = NFS_FH(old_dir),
-		.old_name = old_name,
-		.new_dir = NFS_FH(new_dir),
-		.new_name = new_name,
-	};
-	struct nfs_renameres res;
-	struct rpc_message msg = {
-		.rpc_proc = &nfs3_procedures[NFS3PROC_RENAME],
-		.rpc_argp = &arg,
-		.rpc_resp = &res,
-	};
-	int status = -ENOMEM;
-
-	dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
-
-	res.old_fattr = nfs_alloc_fattr();
-	res.new_fattr = nfs_alloc_fattr();
-	if (res.old_fattr == NULL || res.new_fattr == NULL)
-		goto out;
-
-	status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
-	nfs_post_op_update_inode(old_dir, res.old_fattr);
-	nfs_post_op_update_inode(new_dir, res.new_fattr);
-out:
-	nfs_free_fattr(res.old_fattr);
-	nfs_free_fattr(res.new_fattr);
-	dprintk("NFS reply rename: %d\n", status);
-	return status;
-}
-
-static int
 nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
 {
 	struct nfs3_linkargs	arg = {
@@ -968,7 +933,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
 	.unlink_setup	= nfs3_proc_unlink_setup,
 	.unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare,
 	.unlink_done	= nfs3_proc_unlink_done,
-	.rename		= nfs3_proc_rename,
 	.rename_setup	= nfs3_proc_rename_setup,
 	.rename_rpc_prepare = nfs3_proc_rename_rpc_prepare,
 	.rename_done	= nfs3_proc_rename_done,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a5b27c2d9689..e1d1badbe53c 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -427,6 +427,7 @@ extern void nfs4_close_sync(struct nfs4_state *, fmode_t);
427extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); 427extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
428extern void nfs_inode_find_state_and_recover(struct inode *inode, 428extern void nfs_inode_find_state_and_recover(struct inode *inode,
429 const nfs4_stateid *stateid); 429 const nfs4_stateid *stateid);
430extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *, struct nfs4_state *);
430extern void nfs4_schedule_lease_recovery(struct nfs_client *); 431extern void nfs4_schedule_lease_recovery(struct nfs_client *);
431extern int nfs4_wait_clnt_recover(struct nfs_client *clp); 432extern int nfs4_wait_clnt_recover(struct nfs_client *clp);
432extern int nfs4_client_recover_expired_lease(struct nfs_client *clp); 433extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);
@@ -500,6 +501,16 @@ static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_statei
500 return memcmp(dst, src, sizeof(*dst)) == 0; 501 return memcmp(dst, src, sizeof(*dst)) == 0;
501} 502}
502 503
504static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src)
505{
506 return memcmp(dst->other, src->other, NFS4_STATEID_OTHER_SIZE) == 0;
507}
508
509static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stateid *s2)
510{
511 return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0;
512}
513
503static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) 514static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)
504{ 515{
505 return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; 516 return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0;
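The two helpers added above are the crux of several later hunks: nfs4_stateid_match_other() compares only the identifying "other" field of a stateid, while nfs4_stateid_is_newer() compares the 32-bit seqid using wraparound-safe serial arithmetic after byte-swapping with be32_to_cpu(). The idiom subtracts in unsigned space and reinterprets the difference as signed, so a seqid that has wrapped past 2^32 still compares as newer. A minimal user-space sketch of the comparison (standalone illustration, not the kernel code):

#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe "is s1 newer than s2?" for 32-bit sequence IDs.
 * Unsigned subtraction is well defined modulo 2^32; casting the
 * difference to signed makes anything up to half the space ahead
 * count as newer, even across the 0xffffffff -> 0 wrap. */
static int seqid_is_newer(uint32_t s1, uint32_t s2)
{
	return (int32_t)(s1 - s2) > 0;
}

int main(void)
{
	printf("%d\n", seqid_is_newer(2, 1));          /* 1: plainly newer */
	printf("%d\n", seqid_is_newer(0, 0xffffffff)); /* 1: newer across the wrap */
	printf("%d\n", seqid_is_newer(1, 2));          /* 0: older */
	return 0;
}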
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 0e46d3d1b6cc..aa9ef4876046 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -531,6 +531,13 @@ int nfs40_walk_client_list(struct nfs_client *new,
531 *result = pos; 531 *result = pos;
532 dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", 532 dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
533 __func__, pos, atomic_read(&pos->cl_count)); 533 __func__, pos, atomic_read(&pos->cl_count));
534 goto out;
535 case -ERESTARTSYS:
536 case -ETIMEDOUT:
537 /* The callback path may have been inadvertently
538 * changed. Schedule recovery!
539 */
540 nfs4_schedule_path_down_recovery(pos);
534 default: 541 default:
535 goto out; 542 goto out;
536 } 543 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 450bfedbe2f4..397be39c6dc8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1068,6 +1068,7 @@ static void nfs4_opendata_free(struct kref *kref)
1068 dput(p->dentry); 1068 dput(p->dentry);
1069 nfs_sb_deactive(sb); 1069 nfs_sb_deactive(sb);
1070 nfs_fattr_free_names(&p->f_attr); 1070 nfs_fattr_free_names(&p->f_attr);
1071 kfree(p->f_attr.mdsthreshold);
1071 kfree(p); 1072 kfree(p);
1072} 1073}
1073 1074
@@ -1137,12 +1138,71 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
1137 nfs4_state_set_mode_locked(state, state->state | fmode); 1138 nfs4_state_set_mode_locked(state, state->state | fmode);
1138} 1139}
1139 1140
1140static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
1141static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
1142{
1143 struct nfs_client *clp = state->owner->so_server->nfs_client;
1144 bool need_recover = false;
1145
1146 if (test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags) && state->n_rdonly)
1147 need_recover = true;
1148 if (test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags) && state->n_wronly)
1149 need_recover = true;
1150 if (test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags) && state->n_rdwr)
1151 need_recover = true;
1152 if (need_recover)
1153 nfs4_state_mark_reclaim_nograce(clp, state);
1154}
1155
1156static bool nfs_need_update_open_stateid(struct nfs4_state *state,
1157 nfs4_stateid *stateid)
1158{
1159 if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0)
1160 return true;
1161 if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) {
1162 nfs_test_and_clear_all_open_stateid(state);
1163 return true;
1164 }
1165 if (nfs4_stateid_is_newer(stateid, &state->open_stateid))
1166 return true;
1167 return false;
1168}
1169
1170static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
1171 nfs4_stateid *stateid, fmode_t fmode)
1141{ 1172{
1173 clear_bit(NFS_O_RDWR_STATE, &state->flags);
1174 switch (fmode & (FMODE_READ|FMODE_WRITE)) {
1175 case FMODE_WRITE:
1176 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1177 break;
1178 case FMODE_READ:
1179 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1180 break;
1181 case 0:
1182 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1183 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1184 clear_bit(NFS_OPEN_STATE, &state->flags);
1185 }
1186 if (stateid == NULL)
1187 return;
1188 if (!nfs_need_update_open_stateid(state, stateid))
1189 return;
1142 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) 1190 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
1143 nfs4_stateid_copy(&state->stateid, stateid); 1191 nfs4_stateid_copy(&state->stateid, stateid);
1144 nfs4_stateid_copy(&state->open_stateid, stateid); 1192 nfs4_stateid_copy(&state->open_stateid, stateid);
1145	set_bit(NFS_OPEN_STATE, &state->flags);
1193}
1194
1195static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
1196{
1197 write_seqlock(&state->seqlock);
1198 nfs_clear_open_stateid_locked(state, stateid, fmode);
1199 write_sequnlock(&state->seqlock);
1200 if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
1201 nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
1202}
1203
1204static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
1205{
1146 switch (fmode) { 1206 switch (fmode) {
1147 case FMODE_READ: 1207 case FMODE_READ:
1148 set_bit(NFS_O_RDONLY_STATE, &state->flags); 1208 set_bit(NFS_O_RDONLY_STATE, &state->flags);
@@ -1153,13 +1213,11 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *
1153 case FMODE_READ|FMODE_WRITE: 1213 case FMODE_READ|FMODE_WRITE:
1154 set_bit(NFS_O_RDWR_STATE, &state->flags); 1214 set_bit(NFS_O_RDWR_STATE, &state->flags);
1155 } 1215 }
1156}
1157
1158static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
1159{
1160	write_seqlock(&state->seqlock);
1161	nfs_set_open_stateid_locked(state, stateid, fmode);
1162	write_sequnlock(&state->seqlock);
1216	if (!nfs_need_update_open_stateid(state, stateid))
1217		return;
1218	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
1219		nfs4_stateid_copy(&state->stateid, stateid);
1220	nfs4_stateid_copy(&state->open_stateid, stateid);
1163} 1221}
1164 1222
1165static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode) 1223static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode)
@@ -1217,6 +1275,8 @@ no_delegation:
1217 __update_open_stateid(state, open_stateid, NULL, fmode); 1275 __update_open_stateid(state, open_stateid, NULL, fmode);
1218 ret = 1; 1276 ret = 1;
1219 } 1277 }
1278 if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
1279 nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
1220 1280
1221 return ret; 1281 return ret;
1222} 1282}
@@ -1450,12 +1510,15 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1450 struct nfs4_state *newstate; 1510 struct nfs4_state *newstate;
1451 int ret; 1511 int ret;
1452 1512
1513 /* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */
1514 clear_bit(NFS_O_RDWR_STATE, &state->flags);
1515 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1516 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1453 /* memory barrier prior to reading state->n_* */ 1517 /* memory barrier prior to reading state->n_* */
1454 clear_bit(NFS_DELEGATED_STATE, &state->flags); 1518 clear_bit(NFS_DELEGATED_STATE, &state->flags);
1455 clear_bit(NFS_OPEN_STATE, &state->flags); 1519 clear_bit(NFS_OPEN_STATE, &state->flags);
1456 smp_rmb(); 1520 smp_rmb();
1457 if (state->n_rdwr != 0) { 1521 if (state->n_rdwr != 0) {
1458 clear_bit(NFS_O_RDWR_STATE, &state->flags);
1459 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); 1522 ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
1460 if (ret != 0) 1523 if (ret != 0)
1461 return ret; 1524 return ret;
@@ -1463,7 +1526,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1463 return -ESTALE; 1526 return -ESTALE;
1464 } 1527 }
1465 if (state->n_wronly != 0) { 1528 if (state->n_wronly != 0) {
1466 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
1467 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); 1529 ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
1468 if (ret != 0) 1530 if (ret != 0)
1469 return ret; 1531 return ret;
@@ -1471,7 +1533,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
1471 return -ESTALE; 1533 return -ESTALE;
1472 } 1534 }
1473 if (state->n_rdonly != 0) { 1535 if (state->n_rdonly != 0) {
1474 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
1475 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); 1536 ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
1476 if (ret != 0) 1537 if (ret != 0)
1477 return ret; 1538 return ret;
@@ -2244,10 +2305,12 @@ static int _nfs4_do_open(struct inode *dir,
2244 } 2305 }
2245 } 2306 }
2246 2307
2247	if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
2248		opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
2249		if (!opendata->f_attr.mdsthreshold)
2250			goto err_free_label;
2308	if (server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
2309		if (!opendata->f_attr.mdsthreshold) {
2310			opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
2311			if (!opendata->f_attr.mdsthreshold)
2312				goto err_free_label;
2313		}
2251 opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; 2314 opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
2252 } 2315 }
2253 if (dentry->d_inode != NULL) 2316 if (dentry->d_inode != NULL)
@@ -2275,11 +2338,10 @@ static int _nfs4_do_open(struct inode *dir,
2275 if (opendata->file_created) 2338 if (opendata->file_created)
2276 *opened |= FILE_CREATED; 2339 *opened |= FILE_CREATED;
2277 2340
2278	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
2279		*ctx_th = opendata->f_attr.mdsthreshold;
2280	else
2281		kfree(opendata->f_attr.mdsthreshold);
2282	opendata->f_attr.mdsthreshold = NULL;
2341	if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {
2342		*ctx_th = opendata->f_attr.mdsthreshold;
2343		opendata->f_attr.mdsthreshold = NULL;
2344	}
2283 2345
2284 nfs4_label_free(olabel); 2346 nfs4_label_free(olabel);
2285 2347
@@ -2289,7 +2351,6 @@ static int _nfs4_do_open(struct inode *dir,
2289err_free_label: 2351err_free_label:
2290 nfs4_label_free(olabel); 2352 nfs4_label_free(olabel);
2291err_opendata_put: 2353err_opendata_put:
2292 kfree(opendata->f_attr.mdsthreshold);
2293 nfs4_opendata_put(opendata); 2354 nfs4_opendata_put(opendata);
2294err_put_state_owner: 2355err_put_state_owner:
2295 nfs4_put_state_owner(sp); 2356 nfs4_put_state_owner(sp);
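The three mdsthreshold hunks above rework ownership of the buffer: it is now allocated at most once per nfs4_opendata (so a retried open reuses it), handed to the caller by NULLing opendata->f_attr.mdsthreshold when pnfs_use_threshold() accepts it, and otherwise freed in exactly one place, nfs4_opendata_free(). A hedged user-space sketch of that alloc-once/transfer/centralized-free pattern (the type and function names are invented for illustration):

#include <stdlib.h>

struct blob { int payload; };
struct opendata { struct blob *threshold; };

/* Allocate lazily: a retried open finds the earlier allocation intact. */
static int opendata_prepare(struct opendata *od)
{
	if (!od->threshold) {
		od->threshold = calloc(1, sizeof(*od->threshold));
		if (!od->threshold)
			return -1;
	}
	return 0;
}

/* Transfer ownership: NULL the source pointer so the destructor
 * below cannot free what the caller now owns. */
static struct blob *opendata_take_threshold(struct opendata *od)
{
	struct blob *b = od->threshold;

	od->threshold = NULL;
	return b;
}

/* The one centralized free, mirroring nfs4_opendata_free(). */
static void opendata_free(struct opendata *od)
{
	free(od->threshold);
	free(od);
}

int main(void)
{
	struct opendata *od = calloc(1, sizeof(*od));
	struct blob *mine = NULL;

	if (od && opendata_prepare(od) == 0)
		mine = opendata_take_threshold(od);
	free(mine);
	if (od)
		opendata_free(od);
	return 0;
}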
@@ -2479,26 +2540,6 @@ static void nfs4_free_closedata(void *data)
2479 kfree(calldata); 2540 kfree(calldata);
2480} 2541}
2481 2542
2482static void nfs4_close_clear_stateid_flags(struct nfs4_state *state,
2483 fmode_t fmode)
2484{
2485 spin_lock(&state->owner->so_lock);
2486 clear_bit(NFS_O_RDWR_STATE, &state->flags);
2487 switch (fmode & (FMODE_READ|FMODE_WRITE)) {
2488 case FMODE_WRITE:
2489 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
2490 break;
2491 case FMODE_READ:
2492 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
2493 break;
2494 case 0:
2495 clear_bit(NFS_O_RDONLY_STATE, &state->flags);
2496 clear_bit(NFS_O_WRONLY_STATE, &state->flags);
2497 clear_bit(NFS_OPEN_STATE, &state->flags);
2498 }
2499 spin_unlock(&state->owner->so_lock);
2500}
2501
2502static void nfs4_close_done(struct rpc_task *task, void *data) 2543static void nfs4_close_done(struct rpc_task *task, void *data)
2503{ 2544{
2504 struct nfs4_closedata *calldata = data; 2545 struct nfs4_closedata *calldata = data;
@@ -2517,9 +2558,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2517 if (calldata->roc) 2558 if (calldata->roc)
2518 pnfs_roc_set_barrier(state->inode, 2559 pnfs_roc_set_barrier(state->inode,
2519 calldata->roc_barrier); 2560 calldata->roc_barrier);
2520			nfs_set_open_stateid(state, &calldata->res.stateid, 0);
2561			nfs_clear_open_stateid(state, &calldata->res.stateid, 0);
2521			renew_lease(server, calldata->timestamp); 2562			renew_lease(server, calldata->timestamp);
2522			break;
2563			goto out_release;
2523 case -NFS4ERR_ADMIN_REVOKED: 2564 case -NFS4ERR_ADMIN_REVOKED:
2524 case -NFS4ERR_STALE_STATEID: 2565 case -NFS4ERR_STALE_STATEID:
2525 case -NFS4ERR_OLD_STATEID: 2566 case -NFS4ERR_OLD_STATEID:
@@ -2533,7 +2574,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2533 goto out_release; 2574 goto out_release;
2534 } 2575 }
2535 } 2576 }
2536	nfs4_close_clear_stateid_flags(state, calldata->arg.fmode);
2577	nfs_clear_open_stateid(state, NULL, calldata->arg.fmode);
2537out_release: 2578out_release:
2538 nfs_release_seqid(calldata->arg.seqid); 2579 nfs_release_seqid(calldata->arg.seqid);
2539 nfs_refresh_inode(calldata->inode, calldata->res.fattr); 2580 nfs_refresh_inode(calldata->inode, calldata->res.fattr);
@@ -3507,49 +3548,6 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
3507 return 1; 3548 return 1;
3508} 3549}
3509 3550
3510static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
3511 struct inode *new_dir, struct qstr *new_name)
3512{
3513 struct nfs_server *server = NFS_SERVER(old_dir);
3514 struct nfs_renameargs arg = {
3515 .old_dir = NFS_FH(old_dir),
3516 .new_dir = NFS_FH(new_dir),
3517 .old_name = old_name,
3518 .new_name = new_name,
3519 };
3520 struct nfs_renameres res = {
3521 .server = server,
3522 };
3523 struct rpc_message msg = {
3524 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME],
3525 .rpc_argp = &arg,
3526 .rpc_resp = &res,
3527 };
3528 int status = -ENOMEM;
3529
3530 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
3531 if (!status) {
3532 update_changeattr(old_dir, &res.old_cinfo);
3533 update_changeattr(new_dir, &res.new_cinfo);
3534 }
3535 return status;
3536}
3537
3538static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
3539 struct inode *new_dir, struct qstr *new_name)
3540{
3541 struct nfs4_exception exception = { };
3542 int err;
3543 do {
3544 err = _nfs4_proc_rename(old_dir, old_name,
3545 new_dir, new_name);
3546 trace_nfs4_rename(old_dir, old_name, new_dir, new_name, err);
3547 err = nfs4_handle_exception(NFS_SERVER(old_dir), err,
3548 &exception);
3549 } while (exception.retry);
3550 return err;
3551}
3552
3553static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) 3551static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
3554{ 3552{
3555 struct nfs_server *server = NFS_SERVER(inode); 3553 struct nfs_server *server = NFS_SERVER(inode);
@@ -4884,6 +4882,20 @@ nfs4_init_uniform_client_string(const struct nfs_client *clp,
4884 nodename); 4882 nodename);
4885} 4883}
4886 4884
4885/*
4886 * nfs4_callback_up_net() starts only "tcp" and "tcp6" callback
4887 * services. Advertise one based on the address family of the
4888 * clientaddr.
4889 */
4890static unsigned int
4891nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len)
4892{
4893 if (strchr(clp->cl_ipaddr, ':') != NULL)
4894 return scnprintf(buf, len, "tcp6");
4895 else
4896 return scnprintf(buf, len, "tcp");
4897}
4898
4887/** 4899/**
4888 * nfs4_proc_setclientid - Negotiate client ID 4900 * nfs4_proc_setclientid - Negotiate client ID
4889 * @clp: state data structure 4901 * @clp: state data structure
@@ -4925,12 +4937,10 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
4925 setclientid.sc_name, 4937 setclientid.sc_name,
4926 sizeof(setclientid.sc_name)); 4938 sizeof(setclientid.sc_name));
4927 /* cb_client4 */ 4939 /* cb_client4 */
4928	rcu_read_lock();
4929	setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
4930				sizeof(setclientid.sc_netid), "%s",
4931				rpc_peeraddr2str(clp->cl_rpcclient,
4932							RPC_DISPLAY_NETID));
4933	rcu_read_unlock();
4940	setclientid.sc_netid_len =
4941				nfs4_init_callback_netid(clp,
4942						setclientid.sc_netid,
4943						sizeof(setclientid.sc_netid));
4934 setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr, 4944 setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,
4935 sizeof(setclientid.sc_uaddr), "%s.%u.%u", 4945 sizeof(setclientid.sc_uaddr), "%s.%u.%u",
4936 clp->cl_ipaddr, port >> 8, port & 255); 4946 clp->cl_ipaddr, port >> 8, port & 255);
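nfs4_init_callback_netid(), added above and wired into nfs4_proc_setclientid() here, stops deriving the callback netid from the forward channel's transport and instead picks "tcp6" or "tcp" from the client's own address string: only a literal IPv6 address can contain a colon. A user-space approximation of the test:

#include <stdio.h>
#include <string.h>

/* "tcp6" for a literal IPv6 client address, "tcp" otherwise - the same
 * strchr(clp->cl_ipaddr, ':') test the hunk above introduces. */
static int init_callback_netid(const char *ipaddr, char *buf, size_t len)
{
	return snprintf(buf, len, "%s", strchr(ipaddr, ':') ? "tcp6" : "tcp");
}

int main(void)
{
	char netid[8];

	init_callback_netid("192.0.2.1", netid, sizeof(netid));
	printf("%s\n", netid);  /* tcp */
	init_callback_netid("2001:db8::1", netid, sizeof(netid));
	printf("%s\n", netid);  /* tcp6 */
	return 0;
}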
@@ -8408,7 +8418,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
8408 .unlink_setup = nfs4_proc_unlink_setup, 8418 .unlink_setup = nfs4_proc_unlink_setup,
8409 .unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare, 8419 .unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare,
8410 .unlink_done = nfs4_proc_unlink_done, 8420 .unlink_done = nfs4_proc_unlink_done,
8411 .rename = nfs4_proc_rename,
8412 .rename_setup = nfs4_proc_rename_setup, 8421 .rename_setup = nfs4_proc_rename_setup,
8413 .rename_rpc_prepare = nfs4_proc_rename_rpc_prepare, 8422 .rename_rpc_prepare = nfs4_proc_rename_rpc_prepare,
8414 .rename_done = nfs4_proc_rename_done, 8423 .rename_done = nfs4_proc_rename_done,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0deb32105ccf..2349518eef2c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1316,7 +1316,7 @@ static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_st
1316 return 1; 1316 return 1;
1317} 1317}
1318 1318
1319static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
1319int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
1320{ 1320{
1321 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); 1321 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
1322 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); 1322 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -2075,8 +2075,10 @@ again:
2075 switch (status) { 2075 switch (status) {
2076 case 0: 2076 case 0:
2077 break; 2077 break;
2078 case -NFS4ERR_DELAY:
2079 case -ETIMEDOUT: 2078 case -ETIMEDOUT:
2079 if (clnt->cl_softrtry)
2080 break;
2081 case -NFS4ERR_DELAY:
2080 case -EAGAIN: 2082 case -EAGAIN:
2081 ssleep(1); 2083 ssleep(1);
2082 case -NFS4ERR_STALE_CLIENTID: 2084 case -NFS4ERR_STALE_CLIENTID:
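The case reordering above is about soft-mounted clients: with -NFS4ERR_DELAY moved below -ETIMEDOUT, a timed-out RPC on a soft mount (clnt->cl_softrtry) breaks out and surfaces the error, while a hard mount falls through into the same ssleep(1)-and-retry path shared with -NFS4ERR_DELAY and -EAGAIN. A compact sketch of that deliberate fall-through shape (the error constants are invented for illustration):

#include <unistd.h>

enum { ERR_TIMEDOUT = 1, ERR_DELAY, ERR_AGAIN };

/* Returns 1 to retry after backing off, 0 to stop. Case order
 * matters: a soft-mount timeout breaks out early, a hard-mount
 * timeout falls through into the shared sleep-and-retry path. */
static int handle_status(int status, int soft)
{
	switch (status) {
	case 0:
		return 0;
	case ERR_TIMEDOUT:
		if (soft)
			break;          /* soft mount: report the timeout */
		/* fall through */
	case ERR_DELAY:
	case ERR_AGAIN:
		sleep(1);
		return 1;               /* hard mount: back off and retry */
	}
	return 0;
}

int main(void)
{
	return handle_status(ERR_TIMEDOUT, 1);  /* soft: gives up, returns 0 */
}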
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 808f29574412..6f340f02f2ba 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -90,7 +90,7 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc)
90 */ 90 */
91static void nfs4_evict_inode(struct inode *inode) 91static void nfs4_evict_inode(struct inode *inode)
92{ 92{
93	truncate_inode_pages(&inode->i_data, 0);
93	truncate_inode_pages_final(&inode->i_data);
94 clear_inode(inode); 94 clear_inode(inode);
95 pnfs_return_layout(inode); 95 pnfs_return_layout(inode);
96 pnfs_destroy_layout(NFS_I(inode)); 96 pnfs_destroy_layout(NFS_I(inode));
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 72f3bf1754ef..73ce8d4fe2c8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -203,8 +203,7 @@ static int nfs4_stat_to_errno(int);
203 2 + encode_verifier_maxsz + 5 + \ 203 2 + encode_verifier_maxsz + 5 + \
204 nfs4_label_maxsz) 204 nfs4_label_maxsz)
205#define decode_readdir_maxsz (op_decode_hdr_maxsz + \ 205#define decode_readdir_maxsz (op_decode_hdr_maxsz + \
206				decode_verifier_maxsz + \
207				nfs4_label_maxsz + nfs4_fattr_maxsz)
206				decode_verifier_maxsz)
208#define encode_readlink_maxsz (op_encode_hdr_maxsz) 207#define encode_readlink_maxsz (op_encode_hdr_maxsz)
209#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1) 208#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1)
210#define encode_write_maxsz (op_encode_hdr_maxsz + \ 209#define encode_write_maxsz (op_encode_hdr_maxsz + \
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 4755858e37a0..cb53d450ae32 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -662,7 +662,18 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
662 */ 662 */
663static bool pnfs_seqid_is_newer(u32 s1, u32 s2) 663static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
664{ 664{
665	return (s32)s1 - (s32)s2 > 0;
665	return (s32)(s1 - s2) > 0;
666}
667
668static void
669pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo,
670 const nfs4_stateid *new,
671 struct list_head *free_me_list)
672{
673 if (nfs4_stateid_match_other(&lo->plh_stateid, new))
674 return;
675 /* Layout is new! Kill existing layout segments */
676 pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL);
677}
667 678
668/* update lo->plh_stateid with new if is more recent */ 679/* update lo->plh_stateid with new if is more recent */
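The one-line change to pnfs_seqid_is_newer() moves the signed cast: (s32)s1 - (s32)s2 subtracts two signed values and overflows (undefined behavior in C) when the seqids straddle the wrap point, whereas (s32)(s1 - s2) subtracts in unsigned arithmetic, which is defined modulo 2^32, and only then reinterprets the result. A standalone demonstration of the hazard:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t s1 = 0x80000000u; /* just past the signed midpoint */
	uint32_t s2 = 0x7fffffffu;

	/* Fixed form: unsigned subtraction first, then reinterpret. */
	printf("%d\n", (int32_t)(s1 - s2) > 0);   /* 1, well defined */

	/* The buggy form, (int32_t)s1 - (int32_t)s2, would compute
	 * INT32_MIN - INT32_MAX here: a signed overflow, i.e. undefined
	 * behavior, which -fsanitize=signed-integer-overflow flags.
	 * Deliberately not executed for that reason. */
	return 0;
}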
@@ -1315,6 +1326,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1315 struct nfs4_layoutget_res *res = &lgp->res; 1326 struct nfs4_layoutget_res *res = &lgp->res;
1316 struct pnfs_layout_segment *lseg; 1327 struct pnfs_layout_segment *lseg;
1317 struct inode *ino = lo->plh_inode; 1328 struct inode *ino = lo->plh_inode;
1329 LIST_HEAD(free_me);
1318 int status = 0; 1330 int status = 0;
1319 1331
1320 /* Inject layout blob into I/O device driver */ 1332 /* Inject layout blob into I/O device driver */
@@ -1341,6 +1353,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1341 goto out_forget_reply; 1353 goto out_forget_reply;
1342 } 1354 }
1343 1355
1356 /* Check that the new stateid matches the old stateid */
1357 pnfs_verify_layout_stateid(lo, &res->stateid, &free_me);
1344 /* Done processing layoutget. Set the layout stateid */ 1358 /* Done processing layoutget. Set the layout stateid */
1345 pnfs_set_layout_stateid(lo, &res->stateid, false); 1359 pnfs_set_layout_stateid(lo, &res->stateid, false);
1346 1360
@@ -1355,6 +1369,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1355 } 1369 }
1356 1370
1357 spin_unlock(&ino->i_lock); 1371 spin_unlock(&ino->i_lock);
1372 pnfs_free_lseg_list(&free_me);
1358 return lseg; 1373 return lseg;
1359out: 1374out:
1360 return ERR_PTR(status); 1375 return ERR_PTR(status);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index fddbba2d9eff..e55ce9e8b034 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -357,30 +357,6 @@ nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
357} 357}
358 358
359static int 359static int
360nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
361 struct inode *new_dir, struct qstr *new_name)
362{
363 struct nfs_renameargs arg = {
364 .old_dir = NFS_FH(old_dir),
365 .old_name = old_name,
366 .new_dir = NFS_FH(new_dir),
367 .new_name = new_name,
368 };
369 struct rpc_message msg = {
370 .rpc_proc = &nfs_procedures[NFSPROC_RENAME],
371 .rpc_argp = &arg,
372 };
373 int status;
374
375 dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
376 status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
377 nfs_mark_for_revalidate(old_dir);
378 nfs_mark_for_revalidate(new_dir);
379 dprintk("NFS reply rename: %d\n", status);
380 return status;
381}
382
383static int
384nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name) 360nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
385{ 361{
386 struct nfs_linkargs arg = { 362 struct nfs_linkargs arg = {
@@ -745,7 +721,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
745 .unlink_setup = nfs_proc_unlink_setup, 721 .unlink_setup = nfs_proc_unlink_setup,
746 .unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare, 722 .unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare,
747 .unlink_done = nfs_proc_unlink_done, 723 .unlink_done = nfs_proc_unlink_done,
748 .rename = nfs_proc_rename,
749 .rename_setup = nfs_proc_rename_setup, 724 .rename_setup = nfs_proc_rename_setup,
750 .rename_rpc_prepare = nfs_proc_rename_rpc_prepare, 725 .rename_rpc_prepare = nfs_proc_rename_rpc_prepare,
751 .rename_done = nfs_proc_rename_done, 726 .rename_done = nfs_proc_rename_done,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 910ed906eb82..2cb56943e232 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2215,6 +2215,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
2215 struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data; 2215 struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data;
2216 u32 nfsvers = nfss->nfs_client->rpc_ops->version; 2216 u32 nfsvers = nfss->nfs_client->rpc_ops->version;
2217 2217
2218 sync_filesystem(sb);
2219
2218 /* 2220 /*
2219 * Userspace mount programs that send binary options generally send 2221 * Userspace mount programs that send binary options generally send
2220 * them populated with default values. We have no way to know which 2222 * them populated with default values. We have no way to know which
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 11d78944de79..de54129336c6 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -14,6 +14,7 @@
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/wait.h> 15#include <linux/wait.h>
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/fsnotify.h>
17 18
18#include "internal.h" 19#include "internal.h"
19#include "nfs4_fs.h" 20#include "nfs4_fs.h"
@@ -353,8 +354,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
353 return; 354 return;
354 } 355 }
355 356
356	if (task->tk_status != 0)
357		nfs_cancel_async_unlink(old_dentry);
357	if (data->complete)
358		data->complete(task, data);
358} 359}
359 360
360/** 361/**
@@ -399,9 +400,10 @@ static const struct rpc_call_ops nfs_rename_ops = {
399 * 400 *
400 * It's expected that valid references to the dentries and inodes are held 401 * It's expected that valid references to the dentries and inodes are held
401 */ 402 */
402static struct rpc_task *
403struct rpc_task *
403nfs_async_rename(struct inode *old_dir, struct inode *new_dir, 404nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
404		 struct dentry *old_dentry, struct dentry *new_dentry)
405		 struct dentry *old_dentry, struct dentry *new_dentry,
406		 void (*complete)(struct rpc_task *, struct nfs_renamedata *))
405{ 407{
406 struct nfs_renamedata *data; 408 struct nfs_renamedata *data;
407 struct rpc_message msg = { }; 409 struct rpc_message msg = { };
@@ -438,6 +440,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
438 data->new_dentry = dget(new_dentry); 440 data->new_dentry = dget(new_dentry);
439 nfs_fattr_init(&data->old_fattr); 441 nfs_fattr_init(&data->old_fattr);
440 nfs_fattr_init(&data->new_fattr); 442 nfs_fattr_init(&data->new_fattr);
443 data->complete = complete;
441 444
442 /* set up nfs_renameargs */ 445 /* set up nfs_renameargs */
443 data->args.old_dir = NFS_FH(old_dir); 446 data->args.old_dir = NFS_FH(old_dir);
@@ -456,6 +459,27 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
456 return rpc_run_task(&task_setup_data); 459 return rpc_run_task(&task_setup_data);
457} 460}
458 461
462/*
463 * Perform tasks needed when a sillyrename is done such as cancelling the
464 * queued async unlink if it failed.
465 */
466static void
467nfs_complete_sillyrename(struct rpc_task *task, struct nfs_renamedata *data)
468{
469 struct dentry *dentry = data->old_dentry;
470
471 if (task->tk_status != 0) {
472 nfs_cancel_async_unlink(dentry);
473 return;
474 }
475
476 /*
477 * vfs_unlink and the like do not issue this when a file is
478 * sillyrenamed, so do it here.
479 */
480 fsnotify_nameremove(dentry, 0);
481}
482
459#define SILLYNAME_PREFIX ".nfs" 483#define SILLYNAME_PREFIX ".nfs"
460#define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1) 484#define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1)
461#define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1) 485#define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1)
@@ -548,7 +572,8 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
548 } 572 }
549 573
550 /* run the rename task, undo unlink if it fails */ 574 /* run the rename task, undo unlink if it fails */
551	task = nfs_async_rename(dir, dir, dentry, sdentry);
575	task = nfs_async_rename(dir, dir, dentry, sdentry,
576				nfs_complete_sillyrename);
552 if (IS_ERR(task)) { 577 if (IS_ERR(task)) {
553 error = -EBUSY; 578 error = -EBUSY;
554 nfs_cancel_async_unlink(dentry); 579 nfs_cancel_async_unlink(dentry);
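The unlink.c hunks above turn nfs_async_rename() into a policy-free mechanism: instead of hard-coding sillyrename cleanup in nfs_async_rename_done(), the caller supplies a complete callback, and the sillyrename-specific work (cancel the queued unlink on failure, emit the fsnotify delete event on success) moves into nfs_complete_sillyrename(). A simplified user-space sketch of that completion-callback shape (types reduced to the bare minimum):

#include <stdio.h>

struct renamedata;
typedef void (*complete_fn)(int status, struct renamedata *);

struct renamedata {
	const char *old_name;
	complete_fn complete;   /* policy supplied by the caller */
};

/* Generic completion: invoke whatever policy was registered. */
static void async_rename_done(int status, struct renamedata *data)
{
	if (data->complete)
		data->complete(status, data);
}

/* Sillyrename-specific policy, analogous to nfs_complete_sillyrename(). */
static void complete_sillyrename(int status, struct renamedata *data)
{
	if (status != 0) {
		printf("cancel queued unlink of %s\n", data->old_name);
		return;
	}
	printf("notify: %s removed\n", data->old_name);
}

int main(void)
{
	struct renamedata d = { ".nfs0000beef", complete_sillyrename };

	async_rename_done(0, &d);
	return 0;
}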
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index a812fd1b92a4..b481e1f5eecc 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -39,9 +39,13 @@ struct nfs4_acl;
39struct svc_fh; 39struct svc_fh;
40struct svc_rqst; 40struct svc_rqst;
41 41
42/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to
43 * fit in a page: */
44#define NFS4_ACL_MAX 170
42/*
43 * Maximum ACL we'll accept from a client; chosen (somewhat
44 * arbitrarily) so that kmalloc'ing the ACL shouldn't require a
45 * high-order allocation. This allows 204 ACEs on x86_64:
46 */
47#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \
48			/ sizeof(struct nfs4_ace))
45 49
46struct nfs4_acl *nfs4_acl_new(int); 50struct nfs4_acl *nfs4_acl_new(int);
47int nfs4_acl_get_whotype(char *, u32); 51int nfs4_acl_get_whotype(char *, u32);
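The new NFS4_ACL_MAX derives the cap from the page size rather than a hard-coded 170, so a worst-case ACL plus its header still fits a single kmalloc'd page and never needs a high-order allocation. With a 4096-byte page and struct sizes consistent with the "204 ACEs on x86_64" comment (the 4- and 20-byte figures below are illustrative assumptions, not taken from the headers), the arithmetic looks like this:

#include <stdio.h>

int main(void)
{
	/* Assumed x86_64 sizes consistent with the "204 ACEs" comment:
	 * a 4-byte nfs4_acl header (an ACE count plus a flexible array)
	 * and 20-byte nfs4_ace entries. */
	unsigned long page_size = 4096;
	unsigned long acl_hdr = 4;
	unsigned long ace_size = 20;

	printf("NFS4_ACL_MAX = %lu\n", (page_size - acl_hdr) / ace_size);
	/* prints 204: the whole ACL fits one page */
	return 0;
}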
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 06cddd572264..2645be435e75 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -71,10 +71,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
71 if (gid_eq(new->fsgid, INVALID_GID)) 71 if (gid_eq(new->fsgid, INVALID_GID))
72 new->fsgid = exp->ex_anon_gid; 72 new->fsgid = exp->ex_anon_gid;
73 73
74	ret = set_groups(new, gi);
74	set_groups(new, gi);
75 put_group_info(gi); 75 put_group_info(gi);
76 if (ret < 0)
77 goto error;
78 76
79 if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID)) 77 if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID))
80 new->cap_effective = cap_drop_nfsd_set(new->cap_effective); 78 new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
@@ -89,7 +87,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
89 87
90oom: 88oom:
91 ret = -ENOMEM; 89 ret = -ENOMEM;
92error:
93 abort_creds(new); 90 abort_creds(new);
94 return ret; 91 return ret;
95} 92}
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index d190e33d0ec2..f66c66b9f182 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -402,8 +402,10 @@ sort_pacl(struct posix_acl *pacl)
402 * by uid/gid. */ 402 * by uid/gid. */
403 int i, j; 403 int i, j;
404 404
405	if (pacl->a_count <= 4)
406		return; /* no users or groups */
405	/* no users or groups */
406	if (!pacl || pacl->a_count <= 4)
407		return;
408
407 i = 1; 409 i = 1;
408 while (pacl->a_entries[i].e_tag == ACL_USER) 410 while (pacl->a_entries[i].e_tag == ACL_USER)
409 i++; 411 i++;
@@ -530,19 +532,21 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
530 532
531 /* 533 /*
532	 * ACLs with no ACEs are treated differently in the inheritable 534	 * ACLs with no ACEs are treated differently in the inheritable
533	 * and effective cases: when there are no inheritable ACEs, we
534	 * set a zero-length default posix acl:
535	 * and effective cases: when there are no inheritable ACEs, we
536	 * call ->set_acl with a NULL ACL structure.
535	 */ 537	 */
536	if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) {
537		pacl = posix_acl_alloc(0, GFP_KERNEL);
538		return pacl ? pacl : ERR_PTR(-ENOMEM);
539	}
538	if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT))
539		return NULL;
540
540 /* 541 /*
541 * When there are no effective ACEs, the following will end 542 * When there are no effective ACEs, the following will end
542 * up setting a 3-element effective posix ACL with all 543 * up setting a 3-element effective posix ACL with all
543 * permissions zero. 544 * permissions zero.
544 */ 545 */
545	nace = 4 + state->users->n + state->groups->n;
546	if (!state->users->n && !state->groups->n)
547		nace = 3;
548	else /* Note we also include a MASK ACE in this case: */
549		nace = 4 + state->users->n + state->groups->n;
546 pacl = posix_acl_alloc(nace, GFP_KERNEL); 550 pacl = posix_acl_alloc(nace, GFP_KERNEL);
547 if (!pacl) 551 if (!pacl)
548 return ERR_PTR(-ENOMEM); 552 return ERR_PTR(-ENOMEM);
@@ -586,9 +590,11 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
586 add_to_mask(state, &state->groups->aces[i].perms); 590 add_to_mask(state, &state->groups->aces[i].perms);
587 } 591 }
588 592
589	pace++;
590	pace->e_tag = ACL_MASK;
591	low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
593	if (state->users->n || state->groups->n) {
594		pace++;
595		pace->e_tag = ACL_MASK;
596		low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
597	}
592 598
593 pace++; 599 pace++;
594 pace->e_tag = ACL_OTHER; 600 pace->e_tag = ACL_OTHER;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7f05cd140de3..2c73cae9899d 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -32,6 +32,7 @@
32 */ 32 */
33 33
34#include <linux/sunrpc/clnt.h> 34#include <linux/sunrpc/clnt.h>
35#include <linux/sunrpc/xprt.h>
35#include <linux/sunrpc/svc_xprt.h> 36#include <linux/sunrpc/svc_xprt.h>
36#include <linux/slab.h> 37#include <linux/slab.h>
37#include "nfsd.h" 38#include "nfsd.h"
@@ -635,11 +636,29 @@ static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc
635 } 636 }
636} 637}
637 638
639static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args)
640{
641 struct rpc_xprt *xprt;
642
643 if (args->protocol != XPRT_TRANSPORT_BC_TCP)
644 return rpc_create(args);
645
646 xprt = args->bc_xprt->xpt_bc_xprt;
647 if (xprt) {
648 xprt_get(xprt);
649 return rpc_create_xprt(args, xprt);
650 }
651
652 return rpc_create(args);
653}
654
638static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) 655static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
639{ 656{
657 int maxtime = max_cb_time(clp->net);
640 struct rpc_timeout timeparms = { 658 struct rpc_timeout timeparms = {
641		.to_initval = max_cb_time(clp->net),
659		.to_initval = maxtime,
642 .to_retries = 0, 660 .to_retries = 0,
661 .to_maxval = maxtime,
643 }; 662 };
644 struct rpc_create_args args = { 663 struct rpc_create_args args = {
645 .net = clp->net, 664 .net = clp->net,
@@ -674,7 +693,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
674 args.authflavor = ses->se_cb_sec.flavor; 693 args.authflavor = ses->se_cb_sec.flavor;
675 } 694 }
676 /* Create RPC client */ 695 /* Create RPC client */
677	client = rpc_create(&args);
696	client = create_backchannel_client(&args);
678 if (IS_ERR(client)) { 697 if (IS_ERR(client)) {
679 dprintk("NFSD: couldn't create callback client: %ld\n", 698 dprintk("NFSD: couldn't create callback client: %ld\n",
680 PTR_ERR(client)); 699 PTR_ERR(client));
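create_backchannel_client(), added above, prefers the transport the NFSv4.1 session already owns: for XPRT_TRANSPORT_BC_TCP it takes a reference on the existing rpc_xprt and builds the callback client on top of it via rpc_create_xprt(), falling back to rpc_create() only when no such transport exists. The reuse-or-create decision, sketched with placeholder types (error handling and teardown elided):

#include <stdlib.h>

struct xprt { int refs; };
struct client { struct xprt *x; };

static struct xprt *xprt_get(struct xprt *x) { x->refs++; return x; }

static struct client *client_on(struct xprt *x)
{
	struct client *c = calloc(1, sizeof(*c));

	if (c)
		c->x = x;
	return c;
}

/* Prefer the transport the session already has, taking a reference;
 * only build a brand-new one when none exists. */
static struct client *create_backchannel(struct xprt *existing)
{
	if (existing)
		return client_on(xprt_get(existing));
	return client_on(calloc(1, sizeof(struct xprt)));
}

int main(void)
{
	struct xprt session = { .refs = 1 };
	struct client *c = create_backchannel(&session); /* reuse path */

	free(c);
	return 0;
}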
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 82189b208af3..d543222babf3 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1273,6 +1273,8 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1273 struct nfsd4_op *op; 1273 struct nfsd4_op *op;
1274 struct nfsd4_operation *opdesc; 1274 struct nfsd4_operation *opdesc;
1275 struct nfsd4_compound_state *cstate = &resp->cstate; 1275 struct nfsd4_compound_state *cstate = &resp->cstate;
1276 struct svc_fh *current_fh = &cstate->current_fh;
1277 struct svc_fh *save_fh = &cstate->save_fh;
1276 int slack_bytes; 1278 int slack_bytes;
1277 u32 plen = 0; 1279 u32 plen = 0;
1278 __be32 status; 1280 __be32 status;
@@ -1288,11 +1290,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1288 resp->tag = args->tag; 1290 resp->tag = args->tag;
1289 resp->opcnt = 0; 1291 resp->opcnt = 0;
1290 resp->rqstp = rqstp; 1292 resp->rqstp = rqstp;
1291	resp->cstate.minorversion = args->minorversion;
1292	resp->cstate.replay_owner = NULL;
1293	resp->cstate.session = NULL;
1294	fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
1295	fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
1293	cstate->minorversion = args->minorversion;
1294	cstate->replay_owner = NULL;
1295	cstate->session = NULL;
1296	fh_init(current_fh, NFS4_FHSIZE);
1297	fh_init(save_fh, NFS4_FHSIZE);
1296 /* 1298 /*
1297 * Don't use the deferral mechanism for NFSv4; compounds make it 1299 * Don't use the deferral mechanism for NFSv4; compounds make it
1298 * too hard to avoid non-idempotency problems. 1300 * too hard to avoid non-idempotency problems.
@@ -1345,20 +1347,28 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1345 1347
1346 opdesc = OPDESC(op); 1348 opdesc = OPDESC(op);
1347 1349
1348	if (!cstate->current_fh.fh_dentry) {
1350	if (!current_fh->fh_dentry) {
1349 if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) { 1351 if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) {
1350 op->status = nfserr_nofilehandle; 1352 op->status = nfserr_nofilehandle;
1351 goto encode_op; 1353 goto encode_op;
1352 } 1354 }
1353	} else if (cstate->current_fh.fh_export->ex_fslocs.migrated &&
1355	} else if (current_fh->fh_export->ex_fslocs.migrated &&
1354 !(opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) { 1356 !(opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) {
1355 op->status = nfserr_moved; 1357 op->status = nfserr_moved;
1356 goto encode_op; 1358 goto encode_op;
1357 } 1359 }
1358 1360
1361 fh_clear_wcc(current_fh);
1362
1359 /* If op is non-idempotent */ 1363 /* If op is non-idempotent */
1360 if (opdesc->op_flags & OP_MODIFIES_SOMETHING) { 1364 if (opdesc->op_flags & OP_MODIFIES_SOMETHING) {
1361 plen = opdesc->op_rsize_bop(rqstp, op); 1365 plen = opdesc->op_rsize_bop(rqstp, op);
1366 /*
1367 * If there's still another operation, make sure
1368 * we'll have space to at least encode an error:
1369 */
1370 if (resp->opcnt < args->opcnt)
1371 plen += COMPOUND_ERR_SLACK_SPACE;
1362 op->status = nfsd4_check_resp_size(resp, plen); 1372 op->status = nfsd4_check_resp_size(resp, plen);
1363 } 1373 }
1364 1374
@@ -1377,12 +1387,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1377 clear_current_stateid(cstate); 1387 clear_current_stateid(cstate);
1378 1388
1379 if (need_wrongsec_check(rqstp)) 1389 if (need_wrongsec_check(rqstp))
1380			op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp);
1390			op->status = check_nfsd_access(current_fh->fh_export, rqstp);
1381 } 1391 }
1382 1392
1383encode_op: 1393encode_op:
1384 /* Only from SEQUENCE */ 1394 /* Only from SEQUENCE */
1385	if (resp->cstate.status == nfserr_replay_cache) {
1395	if (cstate->status == nfserr_replay_cache) {
1386 dprintk("%s NFS4.1 replay from cache\n", __func__); 1396 dprintk("%s NFS4.1 replay from cache\n", __func__);
1387 status = op->status; 1397 status = op->status;
1388 goto out; 1398 goto out;
@@ -1411,10 +1421,10 @@ encode_op:
1411 nfsd4_increment_op_stats(op->opnum); 1421 nfsd4_increment_op_stats(op->opnum);
1412 } 1422 }
1413 1423
1414	resp->cstate.status = status;
1415	fh_put(&resp->cstate.current_fh);
1416	fh_put(&resp->cstate.save_fh);
1417	BUG_ON(resp->cstate.replay_owner);
1424	cstate->status = status;
1425	fh_put(current_fh);
1426	fh_put(save_fh);
1427	BUG_ON(cstate->replay_owner);
1418out: 1428out:
1419 /* Reset deferral mechanism for RPC deferrals */ 1429 /* Reset deferral mechanism for RPC deferrals */
1420 rqstp->rq_usedeferral = 1; 1430 rqstp->rq_usedeferral = 1;
@@ -1523,7 +1533,8 @@ static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *o
1523 1533
1524static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1534static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1525{ 1535{
1526	return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32);
1536	return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) *
1537								sizeof(__be32);
1527} 1538}
1528 1539
1529static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1540static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d5d070fbeb35..9a77a5a21557 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1078,6 +1078,18 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
1078 return NULL; 1078 return NULL;
1079 } 1079 }
1080 clp->cl_name.len = name.len; 1080 clp->cl_name.len = name.len;
1081 INIT_LIST_HEAD(&clp->cl_sessions);
1082 idr_init(&clp->cl_stateids);
1083 atomic_set(&clp->cl_refcount, 0);
1084 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
1085 INIT_LIST_HEAD(&clp->cl_idhash);
1086 INIT_LIST_HEAD(&clp->cl_openowners);
1087 INIT_LIST_HEAD(&clp->cl_delegations);
1088 INIT_LIST_HEAD(&clp->cl_lru);
1089 INIT_LIST_HEAD(&clp->cl_callbacks);
1090 INIT_LIST_HEAD(&clp->cl_revoked);
1091 spin_lock_init(&clp->cl_lock);
1092 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
1081 return clp; 1093 return clp;
1082} 1094}
1083 1095
@@ -1095,6 +1107,7 @@ free_client(struct nfs4_client *clp)
1095 WARN_ON_ONCE(atomic_read(&ses->se_ref)); 1107 WARN_ON_ONCE(atomic_read(&ses->se_ref));
1096 free_session(ses); 1108 free_session(ses);
1097 } 1109 }
1110 rpc_destroy_wait_queue(&clp->cl_cb_waitq);
1098 free_svc_cred(&clp->cl_cred); 1111 free_svc_cred(&clp->cl_cred);
1099 kfree(clp->cl_name.data); 1112 kfree(clp->cl_name.data);
1100 idr_destroy(&clp->cl_stateids); 1113 idr_destroy(&clp->cl_stateids);
@@ -1347,7 +1360,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
1347 if (clp == NULL) 1360 if (clp == NULL)
1348 return NULL; 1361 return NULL;
1349 1362
1350 INIT_LIST_HEAD(&clp->cl_sessions);
1351 ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); 1363 ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
1352 if (ret) { 1364 if (ret) {
1353 spin_lock(&nn->client_lock); 1365 spin_lock(&nn->client_lock);
@@ -1355,20 +1367,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
1355 spin_unlock(&nn->client_lock); 1367 spin_unlock(&nn->client_lock);
1356 return NULL; 1368 return NULL;
1357 } 1369 }
1358 idr_init(&clp->cl_stateids);
1359 atomic_set(&clp->cl_refcount, 0);
1360 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
1361 INIT_LIST_HEAD(&clp->cl_idhash);
1362 INIT_LIST_HEAD(&clp->cl_openowners);
1363 INIT_LIST_HEAD(&clp->cl_delegations);
1364 INIT_LIST_HEAD(&clp->cl_lru);
1365 INIT_LIST_HEAD(&clp->cl_callbacks);
1366 INIT_LIST_HEAD(&clp->cl_revoked);
1367 spin_lock_init(&clp->cl_lock);
1368 nfsd4_init_callback(&clp->cl_cb_null); 1370 nfsd4_init_callback(&clp->cl_cb_null);
1369 clp->cl_time = get_seconds(); 1371 clp->cl_time = get_seconds();
1370 clear_bit(0, &clp->cl_cb_slot_busy); 1372 clear_bit(0, &clp->cl_cb_slot_busy);
1371 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
1372 copy_verf(clp, verf); 1373 copy_verf(clp, verf);
1373 rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); 1374 rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
1374 gen_confirm(clp); 1375 gen_confirm(clp);
@@ -1538,7 +1539,7 @@ out_err:
1538} 1539}
1539 1540
1540/* 1541/*
1541 * Cache a reply. nfsd4_check_drc_limit() has bounded the cache size.
1542 * Cache a reply. nfsd4_check_resp_size() has bounded the cache size.
1542 */ 1543 */
1543void 1544void
1544nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) 1545nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
@@ -1596,7 +1597,7 @@ nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args,
1596 * The sequence operation is not cached because we can use the slot and 1597 * The sequence operation is not cached because we can use the slot and
1597 * session values. 1598 * session values.
1598 */ 1599 */
1599__be32
1600static __be32
1600nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, 1601nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
1601 struct nfsd4_sequence *seq) 1602 struct nfsd4_sequence *seq)
1602{ 1603{
@@ -1605,9 +1606,8 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
1605 1606
1606 dprintk("--> %s slot %p\n", __func__, slot); 1607 dprintk("--> %s slot %p\n", __func__, slot);
1607 1608
1608 /* Either returns 0 or nfserr_retry_uncached */
1609 status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp); 1609 status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp);
1610	if (status == nfserr_retry_uncached_rep)
1610	if (status)
1611 return status; 1611 return status;
1612 1612
1613 /* The sequence operation has been encoded, cstate->datap set. */ 1613 /* The sequence operation has been encoded, cstate->datap set. */
@@ -2287,7 +2287,8 @@ out:
2287 if (!list_empty(&clp->cl_revoked)) 2287 if (!list_empty(&clp->cl_revoked))
2288 seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; 2288 seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED;
2289out_no_session: 2289out_no_session:
2290	kfree(conn);
2290	if (conn)
2291 free_conn(conn);
2291 spin_unlock(&nn->client_lock); 2292 spin_unlock(&nn->client_lock);
2292 return status; 2293 return status;
2293out_put_session: 2294out_put_session:
@@ -3627,8 +3628,11 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask,
3627 return nfserr_bad_stateid; 3628 return nfserr_bad_stateid;
3628 status = lookup_clientid(&stateid->si_opaque.so_clid, sessions, 3629 status = lookup_clientid(&stateid->si_opaque.so_clid, sessions,
3629 nn, &cl); 3630 nn, &cl);
3630	if (status == nfserr_stale_clientid)
3631	if (status == nfserr_stale_clientid) {
3632 if (sessions)
3633 return nfserr_bad_stateid;
3631 return nfserr_stale_stateid; 3634 return nfserr_stale_stateid;
3635 }
3632 if (status) 3636 if (status)
3633 return status; 3637 return status;
3634 *s = find_stateid_by_type(cl, stateid, typemask); 3638 *s = find_stateid_by_type(cl, stateid, typemask);
@@ -3713,9 +3717,16 @@ out:
3713static __be32 3717static __be32
3714nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp) 3718nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp)
3715{ 3719{
3716	if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner)))
3720	struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
3721
3722	if (check_for_locks(stp->st_file, lo))
3717		return nfserr_locks_held; 3723		return nfserr_locks_held;
3718	release_lock_stateid(stp);
3724	/*
3725	 * Currently there's a 1-1 lock stateid<->lockowner
3726	 * correspondence, and we have to delete the lockowner when we
3727	 * delete the lock stateid:
3728	 */
3729	unhash_lockowner(lo);
3719 return nfs_ok; 3730 return nfs_ok;
3720} 3731}
3721 3732
@@ -4155,6 +4166,10 @@ static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, c
4155 4166
4156 if (!same_owner_str(&lo->lo_owner, owner, clid)) 4167 if (!same_owner_str(&lo->lo_owner, owner, clid))
4157 return false; 4168 return false;
4169 if (list_empty(&lo->lo_owner.so_stateids)) {
4170 WARN_ON_ONCE(1);
4171 return false;
4172 }
4158 lst = list_first_entry(&lo->lo_owner.so_stateids, 4173 lst = list_first_entry(&lo->lo_owner.so_stateids,
4159 struct nfs4_ol_stateid, st_perstateowner); 4174 struct nfs4_ol_stateid, st_perstateowner);
4160 return lst->st_file->fi_inode == inode; 4175 return lst->st_file->fi_inode == inode;
@@ -5062,7 +5077,6 @@ nfs4_state_destroy_net(struct net *net)
5062 int i; 5077 int i;
5063 struct nfs4_client *clp = NULL; 5078 struct nfs4_client *clp = NULL;
5064 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 5079 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
5065 struct rb_node *node, *tmp;
5066 5080
5067 for (i = 0; i < CLIENT_HASH_SIZE; i++) { 5081 for (i = 0; i < CLIENT_HASH_SIZE; i++) {
5068 while (!list_empty(&nn->conf_id_hashtbl[i])) { 5082 while (!list_empty(&nn->conf_id_hashtbl[i])) {
@@ -5071,13 +5085,11 @@ nfs4_state_destroy_net(struct net *net)
5071 } 5085 }
5072 } 5086 }
5073 5087
5074	node = rb_first(&nn->unconf_name_tree);
5075	while (node != NULL) {
5076		tmp = node;
5077		node = rb_next(tmp);
5078		clp = rb_entry(tmp, struct nfs4_client, cl_namenode);
5079		rb_erase(tmp, &nn->unconf_name_tree);
5080		destroy_client(clp);
5088	for (i = 0; i < CLIENT_HASH_SIZE; i++) {
5089		while (!list_empty(&nn->unconf_id_hashtbl[i])) {
5090			clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
5091			destroy_client(clp);
5092		}
5081 } 5093 }
5082 5094
5083 kfree(nn->sessionid_hashtbl); 5095 kfree(nn->sessionid_hashtbl);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 63f2395c57ed..18881f34737a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -294,7 +294,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
294 READ32(nace); 294 READ32(nace);
295 295
296 if (nace > NFS4_ACL_MAX) 296 if (nace > NFS4_ACL_MAX)
297		return nfserr_resource;
297		return nfserr_fbig;
298 298
299 *acl = nfs4_acl_new(nace); 299 *acl = nfs4_acl_new(nace);
300 if (*acl == NULL) 300 if (*acl == NULL)
@@ -1222,7 +1222,6 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
1222 } 1222 }
1223 write->wr_head.iov_base = p; 1223 write->wr_head.iov_base = p;
1224 write->wr_head.iov_len = avail; 1224 write->wr_head.iov_len = avail;
1225 WARN_ON(avail != (XDR_QUADLEN(avail) << 2));
1226 write->wr_pagelist = argp->pagelist; 1225 write->wr_pagelist = argp->pagelist;
1227 1226
1228 len = XDR_QUADLEN(write->wr_buflen) << 2; 1227 len = XDR_QUADLEN(write->wr_buflen) << 2;
@@ -2483,6 +2482,8 @@ out_acl:
2483 goto out; 2482 goto out;
2484 } 2483 }
2485 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { 2484 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
2485 if ((buflen -= 16) < 0)
2486 goto out_resource;
2486 WRITE32(3); 2487 WRITE32(3);
2487 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); 2488 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
2488 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1); 2489 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
@@ -2499,8 +2500,10 @@ out:
2499 security_release_secctx(context, contextlen); 2500 security_release_secctx(context, contextlen);
2500#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ 2501#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
2501 kfree(acl); 2502 kfree(acl);
2502	if (tempfh)
2503	if (tempfh) {
2503 fh_put(tempfh); 2504 fh_put(tempfh);
2505 kfree(tempfh);
2506 }
2504 return status; 2507 return status;
2505out_nfserr: 2508out_nfserr:
2506 status = nfserrno(err); 2509 status = nfserrno(err);
@@ -3471,6 +3474,9 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
3471 struct nfsd4_test_stateid_id *stateid, *next; 3474 struct nfsd4_test_stateid_id *stateid, *next;
3472 __be32 *p; 3475 __be32 *p;
3473 3476
3477 if (nfserr)
3478 return nfserr;
3479
3474 RESERVE_SPACE(4 + (4 * test_stateid->ts_num_ids)); 3480 RESERVE_SPACE(4 + (4 * test_stateid->ts_num_ids));
3475 *p++ = htonl(test_stateid->ts_num_ids); 3481 *p++ = htonl(test_stateid->ts_num_ids);
3476 3482
@@ -3579,8 +3585,6 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
3579 return 0; 3585 return 0;
3580 3586
3581 session = resp->cstate.session; 3587 session = resp->cstate.session;
3582 if (session == NULL)
3583 return 0;
3584 3588
3585 if (xb->page_len == 0) { 3589 if (xb->page_len == 0) {
3586 length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; 3590 length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
@@ -3620,7 +3624,7 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
3620 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || 3624 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
3621 !nfsd4_enc_ops[op->opnum]); 3625 !nfsd4_enc_ops[op->opnum]);
3622 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); 3626 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
3623	/* nfsd4_check_drc_limit guarantees enough room for error status */
3627	/* nfsd4_check_resp_size guarantees enough room for error status */
3624 if (!op->status) 3628 if (!op->status)
3625 op->status = nfsd4_check_resp_size(resp, 0); 3629 op->status = nfsd4_check_resp_size(resp, 0);
3626 if (so) { 3630 if (so) {
@@ -3691,6 +3695,12 @@ int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp)
3691int 3695int
3692nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args) 3696nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args)
3693{ 3697{
3698 if (rqstp->rq_arg.head[0].iov_len % 4) {
3699 /* client is nuts */
3700 dprintk("%s: compound not properly padded! (peeraddr=%pISc xid=0x%x)",
3701 __func__, svc_addr(rqstp), be32_to_cpu(rqstp->rq_xid));
3702 return 0;
3703 }
3694 args->p = p; 3704 args->p = p;
3695 args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len; 3705 args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len;
3696 args->pagelist = rqstp->rq_arg.pages; 3706 args->pagelist = rqstp->rq_arg.pages;
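The guard added to nfs4svc_decode_compoundargs() rejects a COMPOUND whose request head is not a whole number of 4-byte XDR units, since every XDR item is padded to a 4-byte boundary and an unaligned head means the client's encoding is broken. The check itself is a simple modulus test:

#include <stdio.h>
#include <stddef.h>

/* XDR items are always padded to 4-byte boundaries, so a legal
 * request head has iov_len % 4 == 0 (equivalently, (len & 3) == 0). */
static int xdr_head_is_sane(size_t iov_len)
{
	return (iov_len % 4) == 0;
}

int main(void)
{
	printf("%d\n", xdr_head_is_sane(128)); /* 1: properly padded */
	printf("%d\n", xdr_head_is_sane(130)); /* 0: "client is nuts" */
	return 0;
}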
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7f555179bf81..f34d9de802ab 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -699,6 +699,11 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net)
699 if (err != 0 || fd < 0) 699 if (err != 0 || fd < 0)
700 return -EINVAL; 700 return -EINVAL;
701 701
702 if (svc_alien_sock(net, fd)) {
703 printk(KERN_ERR "%s: socket net is different to NFSd's one\n", __func__);
704 return -EINVAL;
705 }
706
702 err = nfsd_create_serv(net); 707 err = nfsd_create_serv(net);
703 if (err != 0) 708 if (err != 0)
704 return err; 709 return err;
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 30f34ab02137..479eb681c27c 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -282,7 +282,7 @@ void nfsd_lockd_shutdown(void);
282 * reason. 282 * reason.
283 */ 283 */
284#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */ 284#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
285#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */ 285#define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */
286 286
287#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */ 287#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */
288 288
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 4775bc4896c8..ad67964d0bb1 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -133,6 +133,17 @@ fh_init(struct svc_fh *fhp, int maxsize)
133 133
134#ifdef CONFIG_NFSD_V3 134#ifdef CONFIG_NFSD_V3
135/* 135/*
136 * The wcc data stored in current_fh should be cleared
137 * between compound ops.
138 */
139static inline void
140fh_clear_wcc(struct svc_fh *fhp)
141{
142 fhp->fh_post_saved = 0;
143 fhp->fh_pre_saved = 0;
144}
145
146/*
136 * Fill in the pre_op attr for the wcc data 147 * Fill in the pre_op attr for the wcc data
137 */ 148 */
138static inline void 149static inline void
@@ -152,7 +163,8 @@ fill_pre_wcc(struct svc_fh *fhp)
152 163
153extern void fill_post_wcc(struct svc_fh *); 164extern void fill_post_wcc(struct svc_fh *);
154#else 165#else
155#define fill_pre_wcc(ignored) 166#define fh_clear_wcc(ignored)
166#define fh_clear_wcc(ignored)
155#define fill_pre_wcc(ignored) 167#define fill_pre_wcc(ignored)
157#endif /* CONFIG_NFSD_V3 */ 169#endif /* CONFIG_NFSD_V3 */
158 170
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index b17d93214d01..9c769a47ac5a 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -152,7 +152,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
152 type = (stat->mode & S_IFMT); 152 type = (stat->mode & S_IFMT);
153 153
154 *p++ = htonl(nfs_ftypes[type >> 12]); 154 *p++ = htonl(nfs_ftypes[type >> 12]);
155	*p++ = htonl((u32) (stat->mode & S_IALLUGO));
155	*p++ = htonl((u32) stat->mode);
156 *p++ = htonl((u32) stat->nlink); 156 *p++ = htonl((u32) stat->nlink);
157 *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid)); 157 *p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid));
158 *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid)); 158 *p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6d7be3f80356..16f0673a423c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -404,6 +404,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
404 umode_t ftype = 0; 404 umode_t ftype = 0;
405 __be32 err; 405 __be32 err;
406 int host_err; 406 int host_err;
407 bool get_write_count;
407 int size_change = 0; 408 int size_change = 0;
408 409
409 if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) 410 if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
@@ -411,10 +412,18 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
411 if (iap->ia_valid & ATTR_SIZE) 412 if (iap->ia_valid & ATTR_SIZE)
412 ftype = S_IFREG; 413 ftype = S_IFREG;
413 414
415 /* Callers that do fh_verify should do the fh_want_write: */
416 get_write_count = !fhp->fh_dentry;
417
414 /* Get inode */ 418 /* Get inode */
415 err = fh_verify(rqstp, fhp, ftype, accmode); 419 err = fh_verify(rqstp, fhp, ftype, accmode);
416 if (err) 420 if (err)
417 goto out; 421 goto out;
422 if (get_write_count) {
423 host_err = fh_want_write(fhp);
424 if (host_err)
425 return nfserrno(host_err);
426 }
418 427
419 dentry = fhp->fh_dentry; 428 dentry = fhp->fh_dentry;
420 inode = dentry->d_inode; 429 inode = dentry->d_inode;
@@ -1694,7 +1703,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1694 if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) 1703 if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
1695 goto out_dput_new; 1704 goto out_dput_new;
1696 1705
1697	host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL);
1706	host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
1698 if (!host_err) { 1707 if (!host_err) {
1699 host_err = commit_metadata(tfhp); 1708 host_err = commit_metadata(tfhp);
1700 if (!host_err) 1709 if (!host_err)
@@ -1706,10 +1715,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1706 dput(odentry); 1715 dput(odentry);
1707 out_nfserr: 1716 out_nfserr:
1708 err = nfserrno(host_err); 1717 err = nfserrno(host_err);
1709
1710	/* we cannot reply on fh_unlock on the two filehandles,
1711	 * as that would do the wrong thing if the two directories
1712	 * were the same, so again we do it by hand
1718	/*
1719	 * We cannot rely on fh_unlock on the two filehandles,
1720	 * as that would do the wrong thing if the two directories
1721	 * were the same, so again we do it by hand.
1713 */ 1722 */
1714 fill_post_wcc(ffhp); 1723 fill_post_wcc(ffhp);
1715 fill_post_wcc(tfhp); 1724 fill_post_wcc(tfhp);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index d278a0d03496..5ea7df305083 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -574,8 +574,6 @@ extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
574 struct nfsd4_compound_state *, 574 struct nfsd4_compound_state *,
575 struct nfsd4_setclientid_confirm *setclientid_confirm); 575 struct nfsd4_setclientid_confirm *setclientid_confirm);
576extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp); 576extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
577extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
578 struct nfsd4_sequence *seq);
579extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, 577extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
580 struct nfsd4_compound_state *, struct nfsd4_exchange_id *); 578 struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
581extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *); 579extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *);
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index deaa3d33a0aa..0d58075f34e2 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -942,6 +942,18 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
942 struct inode *cpfile; 942 struct inode *cpfile;
943 int err; 943 int err;
944 944
945 if (cpsize > sb->s_blocksize) {
946 printk(KERN_ERR
947 "NILFS: too large checkpoint size: %zu bytes.\n",
948 cpsize);
949 return -EINVAL;
950 } else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) {
951 printk(KERN_ERR
952 "NILFS: too small checkpoint size: %zu bytes.\n",
953 cpsize);
954 return -EINVAL;
955 }
956
945 cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO); 957 cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO);
946 if (unlikely(!cpfile)) 958 if (unlikely(!cpfile))
947 return -ENOMEM; 959 return -ENOMEM;
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index fa0f80308c2d..0d5fada91191 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -484,6 +484,18 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size,
484 struct nilfs_dat_info *di; 484 struct nilfs_dat_info *di;
485 int err; 485 int err;
486 486
487 if (entry_size > sb->s_blocksize) {
488 printk(KERN_ERR
489 "NILFS: too large DAT entry size: %zu bytes.\n",
490 entry_size);
491 return -EINVAL;
492 } else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) {
493 printk(KERN_ERR
494 "NILFS: too small DAT entry size: %zu bytes.\n",
495 entry_size);
496 return -EINVAL;
497 }
498
487 dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO); 499 dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO);
488 if (unlikely(!dat)) 500 if (unlikely(!dat))
489 return -ENOMEM; 501 return -ENOMEM;
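
This bounds check on DAT entries, the checkpoint-size check above, and the segment-usage and inode-size checks later in the merge all follow one pattern. A condensed stand-alone sketch of what they share; the helper name and the sizes in main() are made up, and the patch open-codes the check at each site:

	#include <stdio.h>

	/* Sketch: validate an on-disk metadata entry size against sane bounds.
	 * Returns 0 if ok, -1 (-EINVAL in the kernel) otherwise. */
	static int check_entry_size(size_t size, size_t min_size,
				    size_t blocksize, const char *name)
	{
		if (size > blocksize) {
			fprintf(stderr, "NILFS: too large %s size: %zu bytes.\n",
				name, size);
			return -1;
		}
		if (size < min_size) {
			fprintf(stderr, "NILFS: too small %s size: %zu bytes.\n",
				name, size);
			return -1;
		}
		return 0;
	}

	int main(void)
	{
		/* e.g. a 192-byte entry against a 4 KiB block */
		return check_entry_size(192, 8, 4096, "DAT entry") ? 1 : 0;
	}
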
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 08fdb77852ac..f3a82fbcae02 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -134,6 +134,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
134 134
135static const struct vm_operations_struct nilfs_file_vm_ops = { 135static const struct vm_operations_struct nilfs_file_vm_ops = {
136 .fault = filemap_fault, 136 .fault = filemap_fault,
137 .map_pages = filemap_map_pages,
137 .page_mkwrite = nilfs_page_mkwrite, 138 .page_mkwrite = nilfs_page_mkwrite,
138 .remap_pages = generic_file_remap_pages, 139 .remap_pages = generic_file_remap_pages,
139}; 140};
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7e350c562e0e..b9c5726120e3 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -783,16 +783,14 @@ void nilfs_evict_inode(struct inode *inode)
783 int ret; 783 int ret;
784 784
785 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { 785 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
786 if (inode->i_data.nrpages) 786 truncate_inode_pages_final(&inode->i_data);
787 truncate_inode_pages(&inode->i_data, 0);
788 clear_inode(inode); 787 clear_inode(inode);
789 nilfs_clear_inode(inode); 788 nilfs_clear_inode(inode);
790 return; 789 return;
791 } 790 }
792 nilfs_transaction_begin(sb, &ti, 0); /* never fails */ 791 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
793 792
794 if (inode->i_data.nrpages) 793 truncate_inode_pages_final(&inode->i_data);
795 truncate_inode_pages(&inode->i_data, 0);
796 794
797 /* TODO: some of the following operations may fail. */ 795 /* TODO: some of the following operations may fail. */
798 nilfs_truncate_bmap(ii, 0); 796 nilfs_truncate_bmap(ii, 0);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 2b34021948e4..422fb54b7377 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1072,6 +1072,48 @@ out:
1072} 1072}
1073 1073
1074/** 1074/**
1075 * nilfs_ioctl_trim_fs() - trim ioctl handle function
1076 * @inode: inode object
1077 * @argp: pointer to argument from userspace

1078 *
1079 * Description: nilfs_ioctl_trim_fs is the FITRIM ioctl handler. It
1080 * checks the arguments from userspace and calls nilfs_sufile_trim_fs, which
1081 * performs the actual trim operation.
1082 *
1083 * Return Value: On success, 0 is returned; on error, a negative error code is returned.
1084 */
1085static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp)
1086{
1087 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
1088 struct request_queue *q = bdev_get_queue(nilfs->ns_bdev);
1089 struct fstrim_range range;
1090 int ret;
1091
1092 if (!capable(CAP_SYS_ADMIN))
1093 return -EPERM;
1094
1095 if (!blk_queue_discard(q))
1096 return -EOPNOTSUPP;
1097
1098 if (copy_from_user(&range, argp, sizeof(range)))
1099 return -EFAULT;
1100
1101 range.minlen = max_t(u64, range.minlen, q->limits.discard_granularity);
1102
1103 down_read(&nilfs->ns_segctor_sem);
1104 ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range);
1105 up_read(&nilfs->ns_segctor_sem);
1106
1107 if (ret < 0)
1108 return ret;
1109
1110 if (copy_to_user(argp, &range, sizeof(range)))
1111 return -EFAULT;
1112
1113 return 0;
1114}
1115
1116/**
1075 * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated 1117 * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated
1076 * @inode: inode object 1118 * @inode: inode object
1077 * @argp: pointer on argument from userspace 1119 * @argp: pointer on argument from userspace
@@ -1163,6 +1205,95 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
1163 return ret; 1205 return ret;
1164} 1206}
1165 1207
1208/**
1209 * nilfs_ioctl_set_suinfo - set segment usage info
1210 * @inode: inode object
1211 * @filp: file object
1212 * @cmd: ioctl's request code
1213 * @argp: pointer to argument from userspace
1214 *
1215 * Description: Expects an array of nilfs_suinfo_update structures
1216 * encapsulated in nilfs_argv and updates the segment usage info
1217 * according to the flags in nilfs_suinfo_update.
1218 *
1219 * Return Value: On success, 0 is returned. On error, one of the
1220 * following negative error codes is returned.
1221 *
1222 * %-EPERM - Not enough permissions
1223 *
1224 * %-EFAULT - Error copying input data
1225 *
1226 * %-EIO - I/O error.
1227 *
1228 * %-ENOMEM - Insufficient amount of memory available.
1229 *
1230 * %-EINVAL - Invalid values in input (segment number, flags or nblocks)
1231 */
1232static int nilfs_ioctl_set_suinfo(struct inode *inode, struct file *filp,
1233 unsigned int cmd, void __user *argp)
1234{
1235 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
1236 struct nilfs_transaction_info ti;
1237 struct nilfs_argv argv;
1238 size_t len;
1239 void __user *base;
1240 void *kbuf;
1241 int ret;
1242
1243 if (!capable(CAP_SYS_ADMIN))
1244 return -EPERM;
1245
1246 ret = mnt_want_write_file(filp);
1247 if (ret)
1248 return ret;
1249
1250 ret = -EFAULT;
1251 if (copy_from_user(&argv, argp, sizeof(argv)))
1252 goto out;
1253
1254 ret = -EINVAL;
1255 if (argv.v_size < sizeof(struct nilfs_suinfo_update))
1256 goto out;
1257
1258 if (argv.v_nmembs > nilfs->ns_nsegments)
1259 goto out;
1260
1261 if (argv.v_nmembs >= UINT_MAX / argv.v_size)
1262 goto out;
1263
1264 len = argv.v_size * argv.v_nmembs;
1265 if (!len) {
1266 ret = 0;
1267 goto out;
1268 }
1269
1270 base = (void __user *)(unsigned long)argv.v_base;
1271 kbuf = vmalloc(len);
1272 if (!kbuf) {
1273 ret = -ENOMEM;
1274 goto out;
1275 }
1276
1277 if (copy_from_user(kbuf, base, len)) {
1278 ret = -EFAULT;
1279 goto out_free;
1280 }
1281
1282 nilfs_transaction_begin(inode->i_sb, &ti, 0);
1283 ret = nilfs_sufile_set_suinfo(nilfs->ns_sufile, kbuf, argv.v_size,
1284 argv.v_nmembs);
1285 if (unlikely(ret < 0))
1286 nilfs_transaction_abort(inode->i_sb);
1287 else
1288 nilfs_transaction_commit(inode->i_sb); /* never fails */
1289
1290out_free:
1291 vfree(kbuf);
1292out:
1293 mnt_drop_write_file(filp);
1294 return ret;
1295}
1296
1166long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 1297long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1167{ 1298{
1168 struct inode *inode = file_inode(filp); 1299 struct inode *inode = file_inode(filp);
@@ -1189,6 +1320,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1189 return nilfs_ioctl_get_info(inode, filp, cmd, argp, 1320 return nilfs_ioctl_get_info(inode, filp, cmd, argp,
1190 sizeof(struct nilfs_suinfo), 1321 sizeof(struct nilfs_suinfo),
1191 nilfs_ioctl_do_get_suinfo); 1322 nilfs_ioctl_do_get_suinfo);
1323 case NILFS_IOCTL_SET_SUINFO:
1324 return nilfs_ioctl_set_suinfo(inode, filp, cmd, argp);
1192 case NILFS_IOCTL_GET_SUSTAT: 1325 case NILFS_IOCTL_GET_SUSTAT:
1193 return nilfs_ioctl_get_sustat(inode, filp, cmd, argp); 1326 return nilfs_ioctl_get_sustat(inode, filp, cmd, argp);
1194 case NILFS_IOCTL_GET_VINFO: 1327 case NILFS_IOCTL_GET_VINFO:
@@ -1205,6 +1338,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1205 return nilfs_ioctl_resize(inode, filp, argp); 1338 return nilfs_ioctl_resize(inode, filp, argp);
1206 case NILFS_IOCTL_SET_ALLOC_RANGE: 1339 case NILFS_IOCTL_SET_ALLOC_RANGE:
1207 return nilfs_ioctl_set_alloc_range(inode, argp); 1340 return nilfs_ioctl_set_alloc_range(inode, argp);
1341 case FITRIM:
1342 return nilfs_ioctl_trim_fs(inode, argp);
1208 default: 1343 default:
1209 return -ENOTTY; 1344 return -ENOTTY;
1210 } 1345 }
@@ -1228,6 +1363,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1228 case NILFS_IOCTL_GET_CPINFO: 1363 case NILFS_IOCTL_GET_CPINFO:
1229 case NILFS_IOCTL_GET_CPSTAT: 1364 case NILFS_IOCTL_GET_CPSTAT:
1230 case NILFS_IOCTL_GET_SUINFO: 1365 case NILFS_IOCTL_GET_SUINFO:
1366 case NILFS_IOCTL_SET_SUINFO:
1231 case NILFS_IOCTL_GET_SUSTAT: 1367 case NILFS_IOCTL_GET_SUSTAT:
1232 case NILFS_IOCTL_GET_VINFO: 1368 case NILFS_IOCTL_GET_VINFO:
1233 case NILFS_IOCTL_GET_BDESCS: 1369 case NILFS_IOCTL_GET_BDESCS:
@@ -1235,6 +1371,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1235 case NILFS_IOCTL_SYNC: 1371 case NILFS_IOCTL_SYNC:
1236 case NILFS_IOCTL_RESIZE: 1372 case NILFS_IOCTL_RESIZE:
1237 case NILFS_IOCTL_SET_ALLOC_RANGE: 1373 case NILFS_IOCTL_SET_ALLOC_RANGE:
1374 case FITRIM:
1238 break; 1375 break;
1239 default: 1376 default:
1240 return -ENOIOCTLCMD; 1377 return -ENOIOCTLCMD;
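
With the two ioctl.c hunks above, nilfs2 now accepts the generic FITRIM ioctl. A minimal userspace sketch of driving it, using only the standard linux/fs.h definitions; the mount point is illustrative:

	#include <stdio.h>
	#include <string.h>
	#include <limits.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

	int main(void)
	{
		struct fstrim_range range;
		int fd = open("/mnt/nilfs2", O_RDONLY);

		if (fd < 0)
			return 1;

		memset(&range, 0, sizeof(range));
		range.len = ULLONG_MAX;	/* trim the whole filesystem */
		range.minlen = 0;	/* the kernel raises this to the discard granularity */

		if (ioctl(fd, FITRIM, &range) < 0) {
			perror("FITRIM");
			close(fd);
			return 1;
		}
		/* On return, range.len holds the number of bytes actually trimmed. */
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);
		close(fd);
		return 0;
	}
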
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3127e9f438a7..2a869c35c362 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -870,6 +870,289 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
870} 870}
871 871
872/** 872/**
873 * nilfs_sufile_set_suinfo - sets segment usage info
874 * @sufile: inode of segment usage file
875 * @buf: array of suinfo_update
876 * @supsz: byte size of suinfo_update
877 * @nsup: size of suinfo_update array
878 *
879 * Description: Takes an array of nilfs_suinfo_update structs and updates
880 * segment usage accordingly. Only the fields indicated by the sup_flags
881 * are updated.
882 *
883 * Return Value: On success, 0 is returned. On error, one of the
884 * following negative error codes is returned.
885 *
886 * %-EIO - I/O error.
887 *
888 * %-ENOMEM - Insufficient amount of memory available.
889 *
890 * %-EINVAL - Invalid values in input (segment number, flags or nblocks)
891 */
892ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
893 unsigned supsz, size_t nsup)
894{
895 struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
896 struct buffer_head *header_bh, *bh;
897 struct nilfs_suinfo_update *sup, *supend = buf + supsz * nsup;
898 struct nilfs_segment_usage *su;
899 void *kaddr;
900 unsigned long blkoff, prev_blkoff;
901 int cleansi, cleansu, dirtysi, dirtysu;
902 long ncleaned = 0, ndirtied = 0;
903 int ret = 0;
904
905 if (unlikely(nsup == 0))
906 return ret;
907
908 for (sup = buf; sup < supend; sup = (void *)sup + supsz) {
909 if (sup->sup_segnum >= nilfs->ns_nsegments
910 || (sup->sup_flags &
911 (~0UL << __NR_NILFS_SUINFO_UPDATE_FIELDS))
912 || (nilfs_suinfo_update_nblocks(sup) &&
913 sup->sup_sui.sui_nblocks >
914 nilfs->ns_blocks_per_segment))
915 return -EINVAL;
916 }
917
918 down_write(&NILFS_MDT(sufile)->mi_sem);
919
920 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
921 if (ret < 0)
922 goto out_sem;
923
924 sup = buf;
925 blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum);
926 ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh);
927 if (ret < 0)
928 goto out_header;
929
930 for (;;) {
931 kaddr = kmap_atomic(bh->b_page);
932 su = nilfs_sufile_block_get_segment_usage(
933 sufile, sup->sup_segnum, bh, kaddr);
934
935 if (nilfs_suinfo_update_lastmod(sup))
936 su->su_lastmod = cpu_to_le64(sup->sup_sui.sui_lastmod);
937
938 if (nilfs_suinfo_update_nblocks(sup))
939 su->su_nblocks = cpu_to_le32(sup->sup_sui.sui_nblocks);
940
941 if (nilfs_suinfo_update_flags(sup)) {
942 /*
943 * Active flag is a virtual flag projected by running
944 * nilfs kernel code - drop it so that it is
945 * not written to disk.
946 */
947 sup->sup_sui.sui_flags &=
948 ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
949
950 cleansi = nilfs_suinfo_clean(&sup->sup_sui);
951 cleansu = nilfs_segment_usage_clean(su);
952 dirtysi = nilfs_suinfo_dirty(&sup->sup_sui);
953 dirtysu = nilfs_segment_usage_dirty(su);
954
955 if (cleansi && !cleansu)
956 ++ncleaned;
957 else if (!cleansi && cleansu)
958 --ncleaned;
959
960 if (dirtysi && !dirtysu)
961 ++ndirtied;
962 else if (!dirtysi && dirtysu)
963 --ndirtied;
964
965 su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags);
966 }
967
968 kunmap_atomic(kaddr);
969
970 sup = (void *)sup + supsz;
971 if (sup >= supend)
972 break;
973
974 prev_blkoff = blkoff;
975 blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum);
976 if (blkoff == prev_blkoff)
977 continue;
978
979 /* get different block */
980 mark_buffer_dirty(bh);
981 put_bh(bh);
982 ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh);
983 if (unlikely(ret < 0))
984 goto out_mark;
985 }
986 mark_buffer_dirty(bh);
987 put_bh(bh);
988
989 out_mark:
990 if (ncleaned || ndirtied) {
991 nilfs_sufile_mod_counter(header_bh, (u64)ncleaned,
992 (u64)ndirtied);
993 NILFS_SUI(sufile)->ncleansegs += ncleaned;
994 }
995 nilfs_mdt_mark_dirty(sufile);
996 out_header:
997 put_bh(header_bh);
998 out_sem:
999 up_write(&NILFS_MDT(sufile)->mi_sem);
1000 return ret;
1001}
1002
1003/**
1004 * nilfs_sufile_trim_fs() - trim ioctl helper function
1005 * @sufile: inode of segment usage file
1006 * @range: fstrim_range structure
1007 *
1008 * start: first byte to trim
1009 * len: number of bytes to trim from start
1010 * minlen: minimum extent length in bytes
1011 *
1012 * Description: nilfs_sufile_trim_fs goes through all segments containing bytes
1013 * from start to start+len. start is rounded up to the next block boundary
1014 * and start+len is rounded down. blkdev_issue_discard is invoked for each
1015 * clean segment.
1016 *
1017 * Return Value: On success, 0 is returned; on error, a negative error code is returned.
1018 */
1019int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
1020{
1021 struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
1022 struct buffer_head *su_bh;
1023 struct nilfs_segment_usage *su;
1024 void *kaddr;
1025 size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size;
1026 sector_t seg_start, seg_end, start_block, end_block;
1027 sector_t start = 0, nblocks = 0;
1028 u64 segnum, segnum_end, minlen, len, max_blocks, ndiscarded = 0;
1029 int ret = 0;
1030 unsigned int sects_per_block;
1031
1032 sects_per_block = (1 << nilfs->ns_blocksize_bits) /
1033 bdev_logical_block_size(nilfs->ns_bdev);
1034 len = range->len >> nilfs->ns_blocksize_bits;
1035 minlen = range->minlen >> nilfs->ns_blocksize_bits;
1036 max_blocks = ((u64)nilfs->ns_nsegments * nilfs->ns_blocks_per_segment);
1037
1038 if (!len || range->start >= max_blocks << nilfs->ns_blocksize_bits)
1039 return -EINVAL;
1040
1041 start_block = (range->start + nilfs->ns_blocksize - 1) >>
1042 nilfs->ns_blocksize_bits;
1043
1044 /*
1045 * range->len can be very large (actually, it is set to
1046 * ULLONG_MAX by default) - truncate upper end of the range
1047 * carefully so as not to overflow.
1048 */
1049 if (max_blocks - start_block < len)
1050 end_block = max_blocks - 1;
1051 else
1052 end_block = start_block + len - 1;
1053
1054 segnum = nilfs_get_segnum_of_block(nilfs, start_block);
1055 segnum_end = nilfs_get_segnum_of_block(nilfs, end_block);
1056
1057 down_read(&NILFS_MDT(sufile)->mi_sem);
1058
1059 while (segnum <= segnum_end) {
1060 n = nilfs_sufile_segment_usages_in_block(sufile, segnum,
1061 segnum_end);
1062
1063 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
1064 &su_bh);
1065 if (ret < 0) {
1066 if (ret != -ENOENT)
1067 goto out_sem;
1068 /* hole */
1069 segnum += n;
1070 continue;
1071 }
1072
1073 kaddr = kmap_atomic(su_bh->b_page);
1074 su = nilfs_sufile_block_get_segment_usage(sufile, segnum,
1075 su_bh, kaddr);
1076 for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
1077 if (!nilfs_segment_usage_clean(su))
1078 continue;
1079
1080 nilfs_get_segment_range(nilfs, segnum, &seg_start,
1081 &seg_end);
1082
1083 if (!nblocks) {
1084 /* start new extent */
1085 start = seg_start;
1086 nblocks = seg_end - seg_start + 1;
1087 continue;
1088 }
1089
1090 if (start + nblocks == seg_start) {
1091 /* add to previous extent */
1092 nblocks += seg_end - seg_start + 1;
1093 continue;
1094 }
1095
1096 /* discard previous extent */
1097 if (start < start_block) {
1098 nblocks -= start_block - start;
1099 start = start_block;
1100 }
1101
1102 if (nblocks >= minlen) {
1103 kunmap_atomic(kaddr);
1104
1105 ret = blkdev_issue_discard(nilfs->ns_bdev,
1106 start * sects_per_block,
1107 nblocks * sects_per_block,
1108 GFP_NOFS, 0);
1109 if (ret < 0) {
1110 put_bh(su_bh);
1111 goto out_sem;
1112 }
1113
1114 ndiscarded += nblocks;
1115 kaddr = kmap_atomic(su_bh->b_page);
1116 su = nilfs_sufile_block_get_segment_usage(
1117 sufile, segnum, su_bh, kaddr);
1118 }
1119
1120 /* start new extent */
1121 start = seg_start;
1122 nblocks = seg_end - seg_start + 1;
1123 }
1124 kunmap_atomic(kaddr);
1125 put_bh(su_bh);
1126 }
1127
1128
1129 if (nblocks) {
1130 /* discard last extent */
1131 if (start < start_block) {
1132 nblocks -= start_block - start;
1133 start = start_block;
1134 }
1135 if (start + nblocks > end_block + 1)
1136 nblocks = end_block - start + 1;
1137
1138 if (nblocks >= minlen) {
1139 ret = blkdev_issue_discard(nilfs->ns_bdev,
1140 start * sects_per_block,
1141 nblocks * sects_per_block,
1142 GFP_NOFS, 0);
1143 if (!ret)
1144 ndiscarded += nblocks;
1145 }
1146 }
1147
1148out_sem:
1149 up_read(&NILFS_MDT(sufile)->mi_sem);
1150
1151 range->len = ndiscarded << nilfs->ns_blocksize_bits;
1152 return ret;
1153}
1154
1155/**
873 * nilfs_sufile_read - read or get sufile inode 1156 * nilfs_sufile_read - read or get sufile inode
874 * @sb: super block instance 1157 * @sb: super block instance
875 * @susize: size of a segment usage entry 1158 * @susize: size of a segment usage entry
@@ -886,6 +1169,18 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
886 void *kaddr; 1169 void *kaddr;
887 int err; 1170 int err;
888 1171
1172 if (susize > sb->s_blocksize) {
1173 printk(KERN_ERR
1174 "NILFS: too large segment usage size: %zu bytes.\n",
1175 susize);
1176 return -EINVAL;
1177 } else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) {
1178 printk(KERN_ERR
1179 "NILFS: too small segment usage size: %zu bytes.\n",
1180 susize);
1181 return -EINVAL;
1182 }
1183
889 sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO); 1184 sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
890 if (unlikely(!sufile)) 1185 if (unlikely(!sufile))
891 return -ENOMEM; 1186 return -ENOMEM;
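
Stripped of the sufile locking and buffer handling, the discard loop in nilfs_sufile_trim_fs above reduces to the following stand-alone sketch. issue_discard() is a stand-in for blkdev_issue_discard(), and all the numbers in main() are made up:

	#include <stdio.h>

	/* Stand-in for blkdev_issue_discard(); purely illustrative. */
	static void issue_discard(unsigned long start, unsigned long nblocks)
	{
		printf("discard blocks %lu..%lu\n", start, start + nblocks - 1);
	}

	/* Consecutive clean segments are coalesced into one extent, which is
	 * discarded once it can no longer grow and meets minlen blocks. */
	static void trim_extents(const int *clean, unsigned long nsegs,
				 unsigned long seg_blocks, unsigned long minlen)
	{
		unsigned long i, start = 0, nblocks = 0;

		for (i = 0; i < nsegs; i++) {
			if (!clean[i])
				continue;	/* dirty segments are skipped */
			if (nblocks && start + nblocks == i * seg_blocks) {
				nblocks += seg_blocks;	/* add to previous extent */
				continue;
			}
			if (nblocks >= minlen)
				issue_discard(start, nblocks);
			start = i * seg_blocks;		/* start new extent */
			nblocks = seg_blocks;
		}
		if (nblocks >= minlen)
			issue_discard(start, nblocks);	/* flush the last extent */
	}

	int main(void)
	{
		int clean[] = { 1, 1, 0, 1, 1, 1, 0, 1 };

		trim_extents(clean, 8, 128, 256);
		return 0;
	}
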
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index e84bc5b51fc1..b8afd72f2379 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -44,6 +44,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
44int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); 44int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
45ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, 45ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
46 size_t); 46 size_t);
47ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned, size_t);
47 48
48int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *, 49int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *,
49 void (*dofunc)(struct inode *, __u64, 50 void (*dofunc)(struct inode *, __u64,
@@ -65,6 +66,7 @@ void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
65int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs); 66int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs);
66int nilfs_sufile_read(struct super_block *sb, size_t susize, 67int nilfs_sufile_read(struct super_block *sb, size_t susize,
67 struct nilfs_inode *raw_inode, struct inode **inodep); 68 struct nilfs_inode *raw_inode, struct inode **inodep);
69int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range);
68 70
69/** 71/**
70 * nilfs_sufile_scrap - make a segment garbage 72 * nilfs_sufile_scrap - make a segment garbage
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 7ac2a122ca1d..8c532b2ca3ab 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1129,6 +1129,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1129 unsigned long old_mount_opt; 1129 unsigned long old_mount_opt;
1130 int err; 1130 int err;
1131 1131
1132 sync_filesystem(sb);
1132 old_sb_flags = sb->s_flags; 1133 old_sb_flags = sb->s_flags;
1133 old_mount_opt = nilfs->ns_mount_opt; 1134 old_mount_opt = nilfs->ns_mount_opt;
1134 1135
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 94c451ce6d24..8ba8229ba076 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -399,6 +399,16 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
399 return -EINVAL; 399 return -EINVAL;
400 400
401 nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); 401 nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
402 if (nilfs->ns_inode_size > nilfs->ns_blocksize) {
403 printk(KERN_ERR "NILFS: too large inode size: %d bytes.\n",
404 nilfs->ns_inode_size);
405 return -EINVAL;
406 } else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) {
407 printk(KERN_ERR "NILFS: too small inode size: %d bytes.\n",
408 nilfs->ns_inode_size);
409 return -EINVAL;
410 }
411
402 nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); 412 nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
403 413
404 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); 414 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index dc638f786d5c..ee9cb3795c2b 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -60,8 +60,8 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
60} 60}
61 61
62#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 62#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
63static int fanotify_get_response_from_access(struct fsnotify_group *group, 63static int fanotify_get_response(struct fsnotify_group *group,
64 struct fanotify_event_info *event) 64 struct fanotify_perm_event_info *event)
65{ 65{
66 int ret; 66 int ret;
67 67
@@ -142,6 +142,40 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
142 return false; 142 return false;
143} 143}
144 144
145struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
146 struct path *path)
147{
148 struct fanotify_event_info *event;
149
150#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
151 if (mask & FAN_ALL_PERM_EVENTS) {
152 struct fanotify_perm_event_info *pevent;
153
154 pevent = kmem_cache_alloc(fanotify_perm_event_cachep,
155 GFP_KERNEL);
156 if (!pevent)
157 return NULL;
158 event = &pevent->fae;
159 pevent->response = 0;
160 goto init;
161 }
162#endif
163 event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
164 if (!event)
165 return NULL;
166init: __maybe_unused
167 fsnotify_init_event(&event->fse, inode, mask);
168 event->tgid = get_pid(task_tgid(current));
169 if (path) {
170 event->path = *path;
171 path_get(&event->path);
172 } else {
173 event->path.mnt = NULL;
174 event->path.dentry = NULL;
175 }
176 return event;
177}
178
145static int fanotify_handle_event(struct fsnotify_group *group, 179static int fanotify_handle_event(struct fsnotify_group *group,
146 struct inode *inode, 180 struct inode *inode,
147 struct fsnotify_mark *inode_mark, 181 struct fsnotify_mark *inode_mark,
@@ -171,25 +205,11 @@ static int fanotify_handle_event(struct fsnotify_group *group,
171 pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode, 205 pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
172 mask); 206 mask);
173 207
174 event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); 208 event = fanotify_alloc_event(inode, mask, data);
175 if (unlikely(!event)) 209 if (unlikely(!event))
176 return -ENOMEM; 210 return -ENOMEM;
177 211
178 fsn_event = &event->fse; 212 fsn_event = &event->fse;
179 fsnotify_init_event(fsn_event, inode, mask);
180 event->tgid = get_pid(task_tgid(current));
181 if (data_type == FSNOTIFY_EVENT_PATH) {
182 struct path *path = data;
183 event->path = *path;
184 path_get(&event->path);
185 } else {
186 event->path.mnt = NULL;
187 event->path.dentry = NULL;
188 }
189#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
190 event->response = 0;
191#endif
192
193 ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); 213 ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge);
194 if (ret) { 214 if (ret) {
195 /* Permission events shouldn't be merged */ 215 /* Permission events shouldn't be merged */
@@ -202,7 +222,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
202 222
203#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 223#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
204 if (mask & FAN_ALL_PERM_EVENTS) { 224 if (mask & FAN_ALL_PERM_EVENTS) {
205 ret = fanotify_get_response_from_access(group, event); 225 ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event));
206 fsnotify_destroy_event(group, fsn_event); 226 fsnotify_destroy_event(group, fsn_event);
207 } 227 }
208#endif 228#endif
@@ -225,6 +245,13 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
225 event = FANOTIFY_E(fsn_event); 245 event = FANOTIFY_E(fsn_event);
226 path_put(&event->path); 246 path_put(&event->path);
227 put_pid(event->tgid); 247 put_pid(event->tgid);
248#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
249 if (fsn_event->mask & FAN_ALL_PERM_EVENTS) {
250 kmem_cache_free(fanotify_perm_event_cachep,
251 FANOTIFY_PE(fsn_event));
252 return;
253 }
254#endif
228 kmem_cache_free(fanotify_event_cachep, event); 255 kmem_cache_free(fanotify_event_cachep, event);
229} 256}
230 257
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 32a2f034fb94..2a5fb14115df 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -3,13 +3,12 @@
3#include <linux/slab.h> 3#include <linux/slab.h>
4 4
5extern struct kmem_cache *fanotify_event_cachep; 5extern struct kmem_cache *fanotify_event_cachep;
6extern struct kmem_cache *fanotify_perm_event_cachep;
6 7
7/* 8/*
8 * Lifetime of the structure differs for normal and permission events. In both 9 * Structure for normal fanotify events. It gets allocated in
9 * cases the structure is allocated in fanotify_handle_event(). For normal 10 * fanotify_handle_event() and freed when the information is retrieved by
10 * events the structure is freed immediately after reporting it to userspace. 11 * userspace.
11 * For permission events we free it only after we receive response from
12 * userspace.
13 */ 12 */
14struct fanotify_event_info { 13struct fanotify_event_info {
15 struct fsnotify_event fse; 14 struct fsnotify_event fse;
@@ -19,12 +18,33 @@ struct fanotify_event_info {
19 */ 18 */
20 struct path path; 19 struct path path;
21 struct pid *tgid; 20 struct pid *tgid;
21};
22
22#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 23#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
23 u32 response; /* userspace answer to question */ 24/*
24#endif 25 * Structure for permission fanotify events. It gets allocated and freed in
26 * fanotify_handle_event() since we wait there for user response. When the
27 * information is retrieved by userspace the structure is moved from
28 * group->notification_list to group->fanotify_data.access_list to wait for
29 * user response.
30 */
31struct fanotify_perm_event_info {
32 struct fanotify_event_info fae;
33 int response; /* userspace answer to question */
34 int fd; /* fd we passed to userspace for this event */
25}; 35};
26 36
37static inline struct fanotify_perm_event_info *
38FANOTIFY_PE(struct fsnotify_event *fse)
39{
40 return container_of(fse, struct fanotify_perm_event_info, fae.fse);
41}
42#endif
43
27static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) 44static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
28{ 45{
29 return container_of(fse, struct fanotify_event_info, fse); 46 return container_of(fse, struct fanotify_event_info, fse);
30} 47}
48
49struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
50 struct path *path);
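
FANOTIFY_PE() above works because the permission event embeds the generic event (fae.fse) and recovers the wrapper via container_of. The idiom in isolation, with stand-in structure names:

	#include <stddef.h>
	#include <stdio.h>

	/* Same definition the kernel uses, modulo type checking. */
	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct base_event { int mask; };	/* stands in for fsnotify_event */

	struct perm_event {			/* stands in for fanotify_perm_event_info */
		struct base_event base;		/* embedded, like fae.fse */
		int response;
		int fd;
	};

	int main(void)
	{
		struct perm_event pe = { .base = { .mask = 1 }, .fd = 42 };
		struct base_event *e = &pe.base;

		/* Valid only because e really lives inside a struct perm_event. */
		printf("fd = %d\n", container_of(e, struct perm_event, base)->fd);
		return 0;
	}
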
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 287a22c04149..732648b270dc 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -28,14 +28,8 @@
28extern const struct fsnotify_ops fanotify_fsnotify_ops; 28extern const struct fsnotify_ops fanotify_fsnotify_ops;
29 29
30static struct kmem_cache *fanotify_mark_cache __read_mostly; 30static struct kmem_cache *fanotify_mark_cache __read_mostly;
31static struct kmem_cache *fanotify_response_event_cache __read_mostly;
32struct kmem_cache *fanotify_event_cachep __read_mostly; 31struct kmem_cache *fanotify_event_cachep __read_mostly;
33 32struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
34struct fanotify_response_event {
35 struct list_head list;
36 __s32 fd;
37 struct fanotify_event_info *event;
38};
39 33
40/* 34/*
41 * Get an fsnotify notification event if one exists and is small 35 * Get an fsnotify notification event if one exists and is small
@@ -135,33 +129,34 @@ static int fill_event_metadata(struct fsnotify_group *group,
135} 129}
136 130
137#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 131#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
138static struct fanotify_response_event *dequeue_re(struct fsnotify_group *group, 132static struct fanotify_perm_event_info *dequeue_event(
139 __s32 fd) 133 struct fsnotify_group *group, int fd)
140{ 134{
141 struct fanotify_response_event *re, *return_re = NULL; 135 struct fanotify_perm_event_info *event, *return_e = NULL;
142 136
143 mutex_lock(&group->fanotify_data.access_mutex); 137 spin_lock(&group->fanotify_data.access_lock);
144 list_for_each_entry(re, &group->fanotify_data.access_list, list) { 138 list_for_each_entry(event, &group->fanotify_data.access_list,
145 if (re->fd != fd) 139 fae.fse.list) {
140 if (event->fd != fd)
146 continue; 141 continue;
147 142
148 list_del_init(&re->list); 143 list_del_init(&event->fae.fse.list);
149 return_re = re; 144 return_e = event;
150 break; 145 break;
151 } 146 }
152 mutex_unlock(&group->fanotify_data.access_mutex); 147 spin_unlock(&group->fanotify_data.access_lock);
153 148
154 pr_debug("%s: found return_re=%p\n", __func__, return_re); 149 pr_debug("%s: found return_e=%p\n", __func__, return_e);
155 150
156 return return_re; 151 return return_e;
157} 152}
158 153
159static int process_access_response(struct fsnotify_group *group, 154static int process_access_response(struct fsnotify_group *group,
160 struct fanotify_response *response_struct) 155 struct fanotify_response *response_struct)
161{ 156{
162 struct fanotify_response_event *re; 157 struct fanotify_perm_event_info *event;
163 __s32 fd = response_struct->fd; 158 int fd = response_struct->fd;
164 __u32 response = response_struct->response; 159 int response = response_struct->response;
165 160
166 pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group, 161 pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
167 fd, response); 162 fd, response);
@@ -181,58 +176,15 @@ static int process_access_response(struct fsnotify_group *group,
181 if (fd < 0) 176 if (fd < 0)
182 return -EINVAL; 177 return -EINVAL;
183 178
184 re = dequeue_re(group, fd); 179 event = dequeue_event(group, fd);
185 if (!re) 180 if (!event)
186 return -ENOENT; 181 return -ENOENT;
187 182
188 re->event->response = response; 183 event->response = response;
189
190 wake_up(&group->fanotify_data.access_waitq); 184 wake_up(&group->fanotify_data.access_waitq);
191 185
192 kmem_cache_free(fanotify_response_event_cache, re);
193
194 return 0;
195}
196
197static int prepare_for_access_response(struct fsnotify_group *group,
198 struct fsnotify_event *event,
199 __s32 fd)
200{
201 struct fanotify_response_event *re;
202
203 if (!(event->mask & FAN_ALL_PERM_EVENTS))
204 return 0;
205
206 re = kmem_cache_alloc(fanotify_response_event_cache, GFP_KERNEL);
207 if (!re)
208 return -ENOMEM;
209
210 re->event = FANOTIFY_E(event);
211 re->fd = fd;
212
213 mutex_lock(&group->fanotify_data.access_mutex);
214
215 if (atomic_read(&group->fanotify_data.bypass_perm)) {
216 mutex_unlock(&group->fanotify_data.access_mutex);
217 kmem_cache_free(fanotify_response_event_cache, re);
218 FANOTIFY_E(event)->response = FAN_ALLOW;
219 return 0;
220 }
221
222 list_add_tail(&re->list, &group->fanotify_data.access_list);
223 mutex_unlock(&group->fanotify_data.access_mutex);
224
225 return 0;
226}
227
228#else
229static int prepare_for_access_response(struct fsnotify_group *group,
230 struct fsnotify_event *event,
231 __s32 fd)
232{
233 return 0; 186 return 0;
234} 187}
235
236#endif 188#endif
237 189
238static ssize_t copy_event_to_user(struct fsnotify_group *group, 190static ssize_t copy_event_to_user(struct fsnotify_group *group,
@@ -247,7 +199,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
247 199
248 ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f); 200 ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f);
249 if (ret < 0) 201 if (ret < 0)
250 goto out; 202 return ret;
251 203
252 fd = fanotify_event_metadata.fd; 204 fd = fanotify_event_metadata.fd;
253 ret = -EFAULT; 205 ret = -EFAULT;
@@ -255,9 +207,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
255 fanotify_event_metadata.event_len)) 207 fanotify_event_metadata.event_len))
256 goto out_close_fd; 208 goto out_close_fd;
257 209
258 ret = prepare_for_access_response(group, event, fd); 210#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
259 if (ret) 211 if (event->mask & FAN_ALL_PERM_EVENTS)
260 goto out_close_fd; 212 FANOTIFY_PE(event)->fd = fd;
213#endif
261 214
262 if (fd != FAN_NOFD) 215 if (fd != FAN_NOFD)
263 fd_install(fd, f); 216 fd_install(fd, f);
@@ -268,13 +221,6 @@ out_close_fd:
268 put_unused_fd(fd); 221 put_unused_fd(fd);
269 fput(f); 222 fput(f);
270 } 223 }
271out:
272#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
273 if (event->mask & FAN_ALL_PERM_EVENTS) {
274 FANOTIFY_E(event)->response = FAN_DENY;
275 wake_up(&group->fanotify_data.access_waitq);
276 }
277#endif
278 return ret; 224 return ret;
279} 225}
280 226
@@ -314,35 +260,50 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
314 kevent = get_one_event(group, count); 260 kevent = get_one_event(group, count);
315 mutex_unlock(&group->notification_mutex); 261 mutex_unlock(&group->notification_mutex);
316 262
317 if (kevent) { 263 if (IS_ERR(kevent)) {
318 ret = PTR_ERR(kevent); 264 ret = PTR_ERR(kevent);
319 if (IS_ERR(kevent)) 265 break;
266 }
267
268 if (!kevent) {
269 ret = -EAGAIN;
270 if (file->f_flags & O_NONBLOCK)
320 break; 271 break;
321 ret = copy_event_to_user(group, kevent, buf); 272
322 /* 273 ret = -ERESTARTSYS;
323 * Permission events get destroyed after we 274 if (signal_pending(current))
324 * receive response 275 break;
325 */ 276
326 if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) 277 if (start != buf)
327 fsnotify_destroy_event(group, kevent);
328 if (ret < 0)
329 break; 278 break;
330 buf += ret; 279 schedule();
331 count -= ret;
332 continue; 280 continue;
333 } 281 }
334 282
335 ret = -EAGAIN; 283 ret = copy_event_to_user(group, kevent, buf);
336 if (file->f_flags & O_NONBLOCK) 284 /*
337 break; 285 * Permission events get queued to wait for response. Other
338 ret = -ERESTARTSYS; 286 * events can be destroyed now.
339 if (signal_pending(current)) 287 */
340 break; 288 if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) {
341 289 fsnotify_destroy_event(group, kevent);
342 if (start != buf) 290 if (ret < 0)
343 break; 291 break;
344 292 } else {
345 schedule(); 293#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
294 if (ret < 0) {
295 FANOTIFY_PE(kevent)->response = FAN_DENY;
296 wake_up(&group->fanotify_data.access_waitq);
297 break;
298 }
299 spin_lock(&group->fanotify_data.access_lock);
300 list_add_tail(&kevent->list,
301 &group->fanotify_data.access_list);
302 spin_unlock(&group->fanotify_data.access_lock);
303#endif
304 }
305 buf += ret;
306 count -= ret;
346 } 307 }
347 308
348 finish_wait(&group->notification_waitq, &wait); 309 finish_wait(&group->notification_waitq, &wait);
@@ -383,22 +344,21 @@ static int fanotify_release(struct inode *ignored, struct file *file)
383 struct fsnotify_group *group = file->private_data; 344 struct fsnotify_group *group = file->private_data;
384 345
385#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 346#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
386 struct fanotify_response_event *re, *lre; 347 struct fanotify_perm_event_info *event, *next;
387 348
388 mutex_lock(&group->fanotify_data.access_mutex); 349 spin_lock(&group->fanotify_data.access_lock);
389 350
390 atomic_inc(&group->fanotify_data.bypass_perm); 351 atomic_inc(&group->fanotify_data.bypass_perm);
391 352
392 list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { 353 list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
393 pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, 354 fae.fse.list) {
394 re, re->event); 355 pr_debug("%s: found group=%p event=%p\n", __func__, group,
356 event);
395 357
396 list_del_init(&re->list); 358 list_del_init(&event->fae.fse.list);
397 re->event->response = FAN_ALLOW; 359 event->response = FAN_ALLOW;
398
399 kmem_cache_free(fanotify_response_event_cache, re);
400 } 360 }
401 mutex_unlock(&group->fanotify_data.access_mutex); 361 spin_unlock(&group->fanotify_data.access_lock);
402 362
403 wake_up(&group->fanotify_data.access_waitq); 363 wake_up(&group->fanotify_data.access_waitq);
404#endif 364#endif
@@ -731,21 +691,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
731 group->fanotify_data.user = user; 691 group->fanotify_data.user = user;
732 atomic_inc(&user->fanotify_listeners); 692 atomic_inc(&user->fanotify_listeners);
733 693
734 oevent = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL); 694 oevent = fanotify_alloc_event(NULL, FS_Q_OVERFLOW, NULL);
735 if (unlikely(!oevent)) { 695 if (unlikely(!oevent)) {
736 fd = -ENOMEM; 696 fd = -ENOMEM;
737 goto out_destroy_group; 697 goto out_destroy_group;
738 } 698 }
739 group->overflow_event = &oevent->fse; 699 group->overflow_event = &oevent->fse;
740 fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW);
741 oevent->tgid = get_pid(task_tgid(current));
742 oevent->path.mnt = NULL;
743 oevent->path.dentry = NULL;
744 700
701 if (force_o_largefile())
702 event_f_flags |= O_LARGEFILE;
745 group->fanotify_data.f_flags = event_f_flags; 703 group->fanotify_data.f_flags = event_f_flags;
746#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 704#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
747 oevent->response = 0; 705 spin_lock_init(&group->fanotify_data.access_lock);
748 mutex_init(&group->fanotify_data.access_mutex);
749 init_waitqueue_head(&group->fanotify_data.access_waitq); 706 init_waitqueue_head(&group->fanotify_data.access_waitq);
750 INIT_LIST_HEAD(&group->fanotify_data.access_list); 707 INIT_LIST_HEAD(&group->fanotify_data.access_list);
751 atomic_set(&group->fanotify_data.bypass_perm, 0); 708 atomic_set(&group->fanotify_data.bypass_perm, 0);
@@ -920,9 +877,11 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
920static int __init fanotify_user_setup(void) 877static int __init fanotify_user_setup(void)
921{ 878{
922 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC); 879 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
923 fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
924 SLAB_PANIC);
925 fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); 880 fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
881#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
882 fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event_info,
883 SLAB_PANIC);
884#endif
926 885
927 return 0; 886 return 0;
928} 887}
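
On the userspace side, the reworked permission-event path is exercised roughly as follows; a minimal sketch against the standard sys/fanotify.h API (needs CAP_SYS_ADMIN, and the watched path is illustrative). The write of struct fanotify_response is what process_access_response() above consumes to wake the blocked opener:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/fanotify.h>

	int main(void)
	{
		char buf[4096];
		int fan = fanotify_init(FAN_CLASS_CONTENT, O_RDONLY);

		if (fan < 0)
			return 1;
		if (fanotify_mark(fan, FAN_MARK_ADD, FAN_OPEN_PERM,
				  AT_FDCWD, "/tmp") < 0)
			return 1;

		for (;;) {
			ssize_t len = read(fan, buf, sizeof(buf));
			struct fanotify_event_metadata *md = (void *)buf;

			for (; FAN_EVENT_OK(md, len); md = FAN_EVENT_NEXT(md, len)) {
				if (md->mask & FAN_OPEN_PERM) {
					struct fanotify_response resp = {
						.fd = md->fd,
						.response = FAN_ALLOW,
					};
					/* Answer the permission event. */
					write(fan, &resp, sizeof(resp));
				}
				if (md->fd >= 0)
					close(md->fd);
			}
		}
	}
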
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
index 807150e2c2b9..dd6103cc93c1 100644
--- a/fs/ntfs/debug.c
+++ b/fs/ntfs/debug.c
@@ -18,16 +18,9 @@
18 * distribution in the file COPYING); if not, write to the Free Software 18 * distribution in the file COPYING); if not, write to the Free Software
19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22#include "debug.h" 22#include "debug.h"
23 23
24/*
25 * A static buffer to hold the error string being displayed and a spinlock
26 * to protect concurrent accesses to it.
27 */
28static char err_buf[1024];
29static DEFINE_SPINLOCK(err_buf_lock);
30
31/** 24/**
32 * __ntfs_warning - output a warning to the syslog 25 * __ntfs_warning - output a warning to the syslog
33 * @function: name of function outputting the warning 26 * @function: name of function outputting the warning
@@ -50,6 +43,7 @@ static DEFINE_SPINLOCK(err_buf_lock);
50void __ntfs_warning(const char *function, const struct super_block *sb, 43void __ntfs_warning(const char *function, const struct super_block *sb,
51 const char *fmt, ...) 44 const char *fmt, ...)
52{ 45{
46 struct va_format vaf;
53 va_list args; 47 va_list args;
54 int flen = 0; 48 int flen = 0;
55 49
@@ -59,17 +53,15 @@ void __ntfs_warning(const char *function, const struct super_block *sb,
59#endif 53#endif
60 if (function) 54 if (function)
61 flen = strlen(function); 55 flen = strlen(function);
62 spin_lock(&err_buf_lock);
63 va_start(args, fmt); 56 va_start(args, fmt);
64 vsnprintf(err_buf, sizeof(err_buf), fmt, args); 57 vaf.fmt = fmt;
65 va_end(args); 58 vaf.va = &args;
66 if (sb) 59 if (sb)
67 printk(KERN_ERR "NTFS-fs warning (device %s): %s(): %s\n", 60 pr_warn("(device %s): %s(): %pV\n",
68 sb->s_id, flen ? function : "", err_buf); 61 sb->s_id, flen ? function : "", &vaf);
69 else 62 else
70 printk(KERN_ERR "NTFS-fs warning: %s(): %s\n", 63 pr_warn("%s(): %pV\n", flen ? function : "", &vaf);
71 flen ? function : "", err_buf); 64 va_end(args);
72 spin_unlock(&err_buf_lock);
73} 65}
74 66
75/** 67/**
@@ -94,6 +86,7 @@ void __ntfs_warning(const char *function, const struct super_block *sb,
94void __ntfs_error(const char *function, const struct super_block *sb, 86void __ntfs_error(const char *function, const struct super_block *sb,
95 const char *fmt, ...) 87 const char *fmt, ...)
96{ 88{
89 struct va_format vaf;
97 va_list args; 90 va_list args;
98 int flen = 0; 91 int flen = 0;
99 92
@@ -103,17 +96,15 @@ void __ntfs_error(const char *function, const struct super_block *sb,
103#endif 96#endif
104 if (function) 97 if (function)
105 flen = strlen(function); 98 flen = strlen(function);
106 spin_lock(&err_buf_lock);
107 va_start(args, fmt); 99 va_start(args, fmt);
108 vsnprintf(err_buf, sizeof(err_buf), fmt, args); 100 vaf.fmt = fmt;
109 va_end(args); 101 vaf.va = &args;
110 if (sb) 102 if (sb)
111 printk(KERN_ERR "NTFS-fs error (device %s): %s(): %s\n", 103 pr_err("(device %s): %s(): %pV\n",
112 sb->s_id, flen ? function : "", err_buf); 104 sb->s_id, flen ? function : "", &vaf);
113 else 105 else
114 printk(KERN_ERR "NTFS-fs error: %s(): %s\n", 106 pr_err("%s(): %pV\n", flen ? function : "", &vaf);
115 flen ? function : "", err_buf); 107 va_end(args);
116 spin_unlock(&err_buf_lock);
117} 108}
118 109
119#ifdef DEBUG 110#ifdef DEBUG
@@ -124,6 +115,7 @@ int debug_msgs = 0;
124void __ntfs_debug (const char *file, int line, const char *function, 115void __ntfs_debug (const char *file, int line, const char *function,
125 const char *fmt, ...) 116 const char *fmt, ...)
126{ 117{
118 struct va_format vaf;
127 va_list args; 119 va_list args;
128 int flen = 0; 120 int flen = 0;
129 121
@@ -131,13 +123,11 @@ void __ntfs_debug (const char *file, int line, const char *function,
131 return; 123 return;
132 if (function) 124 if (function)
133 flen = strlen(function); 125 flen = strlen(function);
134 spin_lock(&err_buf_lock);
135 va_start(args, fmt); 126 va_start(args, fmt);
136 vsnprintf(err_buf, sizeof(err_buf), fmt, args); 127 vaf.fmt = fmt;
128 vaf.va = &args;
129 pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf);
137 va_end(args); 130 va_end(args);
138 printk(KERN_DEBUG "NTFS-fs DEBUG (%s, %d): %s(): %s\n", file, line,
139 flen ? function : "", err_buf);
140 spin_unlock(&err_buf_lock);
141} 131}
142 132
143/* Dump a runlist. Caller has to provide synchronisation for @rl. */ 133/* Dump a runlist. Caller has to provide synchronisation for @rl. */
@@ -149,12 +139,12 @@ void ntfs_debug_dump_runlist(const runlist_element *rl)
149 139
150 if (!debug_msgs) 140 if (!debug_msgs)
151 return; 141 return;
152 printk(KERN_DEBUG "NTFS-fs DEBUG: Dumping runlist (values in hex):\n"); 142 pr_debug("Dumping runlist (values in hex):\n");
153 if (!rl) { 143 if (!rl) {
154 printk(KERN_DEBUG "Run list not present.\n"); 144 pr_debug("Run list not present.\n");
155 return; 145 return;
156 } 146 }
157 printk(KERN_DEBUG "VCN LCN Run length\n"); 147 pr_debug("VCN LCN Run length\n");
158 for (i = 0; ; i++) { 148 for (i = 0; ; i++) {
159 LCN lcn = (rl + i)->lcn; 149 LCN lcn = (rl + i)->lcn;
160 150
@@ -163,13 +153,13 @@ void ntfs_debug_dump_runlist(const runlist_element *rl)
163 153
164 if (index > -LCN_ENOENT - 1) 154 if (index > -LCN_ENOENT - 1)
165 index = 3; 155 index = 3;
166 printk(KERN_DEBUG "%-16Lx %s %-16Lx%s\n", 156 pr_debug("%-16Lx %s %-16Lx%s\n",
167 (long long)(rl + i)->vcn, lcn_str[index], 157 (long long)(rl + i)->vcn, lcn_str[index],
168 (long long)(rl + i)->length, 158 (long long)(rl + i)->length,
169 (rl + i)->length ? "" : 159 (rl + i)->length ? "" :
170 " (runlist end)"); 160 " (runlist end)");
171 } else 161 } else
172 printk(KERN_DEBUG "%-16Lx %-16Lx %-16Lx%s\n", 162 pr_debug("%-16Lx %-16Lx %-16Lx%s\n",
173 (long long)(rl + i)->vcn, 163 (long long)(rl + i)->vcn,
174 (long long)(rl + i)->lcn, 164 (long long)(rl + i)->lcn,
175 (long long)(rl + i)->length, 165 (long long)(rl + i)->length,
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
index 53c27eaf2307..61bf091e32a8 100644
--- a/fs/ntfs/debug.h
+++ b/fs/ntfs/debug.h
@@ -48,7 +48,12 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl);
48 48
49#else /* !DEBUG */ 49#else /* !DEBUG */
50 50
51#define ntfs_debug(f, a...) do {} while (0) 51#define ntfs_debug(fmt, ...) \
52do { \
53 if (0) \
54 no_printk(fmt, ##__VA_ARGS__); \
55} while (0)
56
52#define ntfs_debug_dump_runlist(rl) do {} while (0) 57#define ntfs_debug_dump_runlist(rl) do {} while (0)
53 58
54#endif /* !DEBUG */ 59#endif /* !DEBUG */
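
Two idioms carry the debug.c conversion above: struct va_format plus the %pV specifier let a single printk forward a caller's varargs format, and the if (0) no_printk() stub keeps format-string type checking alive in non-DEBUG builds. A kernel-style sketch of the first; the function name is made up, not patch code:

	/* Forward a caller's varargs format through one printk via %pV. */
	static void my_fs_warn(const struct super_block *sb, const char *fmt, ...)
	{
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
		/* printk expands %pV by formatting vaf.fmt against vaf.va. */
		pr_warn("(device %s): %pV\n", sb ? sb->s_id : "?", &vaf);
		va_end(args);
	}
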
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index ffb9b3675736..f47af5e6e230 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1704,8 +1704,6 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
1704 iput(bvi); 1704 iput(bvi);
1705skip_large_index_stuff: 1705skip_large_index_stuff:
1706 /* Setup the operations for this index inode. */ 1706 /* Setup the operations for this index inode. */
1707 vi->i_op = NULL;
1708 vi->i_fop = NULL;
1709 vi->i_mapping->a_ops = &ntfs_mst_aops; 1707 vi->i_mapping->a_ops = &ntfs_mst_aops;
1710 vi->i_blocks = ni->allocated_size >> 9; 1708 vi->i_blocks = ni->allocated_size >> 9;
1711 /* 1709 /*
@@ -2259,7 +2257,7 @@ void ntfs_evict_big_inode(struct inode *vi)
2259{ 2257{
2260 ntfs_inode *ni = NTFS_I(vi); 2258 ntfs_inode *ni = NTFS_I(vi);
2261 2259
2262 truncate_inode_pages(&vi->i_data, 0); 2260 truncate_inode_pages_final(&vi->i_data);
2263 clear_inode(vi); 2261 clear_inode(vi);
2264 2262
2265#ifdef NTFS_RW 2263#ifdef NTFS_RW
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 82650d52d916..9de2491f2926 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -19,6 +19,7 @@
19 * distribution in the file COPYING); if not, write to the Free Software 19 * distribution in the file COPYING); if not, write to the Free Software
20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 20 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */ 21 */
22#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22 23
23#include <linux/stddef.h> 24#include <linux/stddef.h>
24#include <linux/init.h> 25#include <linux/init.h>
@@ -468,6 +469,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
468 469
469 ntfs_debug("Entering with remount options string: %s", opt); 470 ntfs_debug("Entering with remount options string: %s", opt);
470 471
472 sync_filesystem(sb);
473
471#ifndef NTFS_RW 474#ifndef NTFS_RW
472 /* For read-only compiled driver, enforce read-only flag. */ 475 /* For read-only compiled driver, enforce read-only flag. */
473 *flags |= MS_RDONLY; 476 *flags |= MS_RDONLY;
@@ -1894,7 +1897,7 @@ get_ctx_vol_failed:
1894 vol->minor_ver = vi->minor_ver; 1897 vol->minor_ver = vi->minor_ver;
1895 ntfs_attr_put_search_ctx(ctx); 1898 ntfs_attr_put_search_ctx(ctx);
1896 unmap_mft_record(NTFS_I(vol->vol_ino)); 1899 unmap_mft_record(NTFS_I(vol->vol_ino));
1897 printk(KERN_INFO "NTFS volume version %i.%i.\n", vol->major_ver, 1900 pr_info("volume version %i.%i.\n", vol->major_ver,
1898 vol->minor_ver); 1901 vol->minor_ver);
1899 if (vol->major_ver < 3 && NVolSparseEnabled(vol)) { 1902 if (vol->major_ver < 3 && NVolSparseEnabled(vol)) {
1900 ntfs_warning(vol->sb, "Disabling sparse support due to NTFS " 1903 ntfs_warning(vol->sb, "Disabling sparse support due to NTFS "
@@ -3093,7 +3096,7 @@ static int __init init_ntfs_fs(void)
3093 int err = 0; 3096 int err = 0;
3094 3097
3095 /* This may be ugly but it results in pretty output so who cares. (-8 */ 3098 /* This may be ugly but it results in pretty output so who cares. (-8 */
3096 printk(KERN_INFO "NTFS driver " NTFS_VERSION " [Flags: R/" 3099 pr_info("driver " NTFS_VERSION " [Flags: R/"
3097#ifdef NTFS_RW 3100#ifdef NTFS_RW
3098 "W" 3101 "W"
3099#else 3102#else
@@ -3113,16 +3116,15 @@ static int __init init_ntfs_fs(void)
3113 sizeof(ntfs_index_context), 0 /* offset */, 3116 sizeof(ntfs_index_context), 0 /* offset */,
3114 SLAB_HWCACHE_ALIGN, NULL /* ctor */); 3117 SLAB_HWCACHE_ALIGN, NULL /* ctor */);
3115 if (!ntfs_index_ctx_cache) { 3118 if (!ntfs_index_ctx_cache) {
3116 printk(KERN_CRIT "NTFS: Failed to create %s!\n", 3119 pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name);
3117 ntfs_index_ctx_cache_name);
3118 goto ictx_err_out; 3120 goto ictx_err_out;
3119 } 3121 }
3120 ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name, 3122 ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name,
3121 sizeof(ntfs_attr_search_ctx), 0 /* offset */, 3123 sizeof(ntfs_attr_search_ctx), 0 /* offset */,
3122 SLAB_HWCACHE_ALIGN, NULL /* ctor */); 3124 SLAB_HWCACHE_ALIGN, NULL /* ctor */);
3123 if (!ntfs_attr_ctx_cache) { 3125 if (!ntfs_attr_ctx_cache) {
3124 printk(KERN_CRIT "NTFS: Failed to create %s!\n", 3126 pr_crit("Failed to create %s!\n",
3125 ntfs_attr_ctx_cache_name); 3127 ntfs_attr_ctx_cache_name);
3126 goto actx_err_out; 3128 goto actx_err_out;
3127 } 3129 }
3128 3130
@@ -3130,8 +3132,7 @@ static int __init init_ntfs_fs(void)
3130 (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0, 3132 (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0,
3131 SLAB_HWCACHE_ALIGN, NULL); 3133 SLAB_HWCACHE_ALIGN, NULL);
3132 if (!ntfs_name_cache) { 3134 if (!ntfs_name_cache) {
3133 printk(KERN_CRIT "NTFS: Failed to create %s!\n", 3135 pr_crit("Failed to create %s!\n", ntfs_name_cache_name);
3134 ntfs_name_cache_name);
3135 goto name_err_out; 3136 goto name_err_out;
3136 } 3137 }
3137 3138
@@ -3139,8 +3140,7 @@ static int __init init_ntfs_fs(void)
3139 sizeof(ntfs_inode), 0, 3140 sizeof(ntfs_inode), 0,
3140 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); 3141 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
3141 if (!ntfs_inode_cache) { 3142 if (!ntfs_inode_cache) {
3142 printk(KERN_CRIT "NTFS: Failed to create %s!\n", 3143 pr_crit("Failed to create %s!\n", ntfs_inode_cache_name);
3143 ntfs_inode_cache_name);
3144 goto inode_err_out; 3144 goto inode_err_out;
3145 } 3145 }
3146 3146
@@ -3149,15 +3149,14 @@ static int __init init_ntfs_fs(void)
3149 SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, 3149 SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
3150 ntfs_big_inode_init_once); 3150 ntfs_big_inode_init_once);
3151 if (!ntfs_big_inode_cache) { 3151 if (!ntfs_big_inode_cache) {
3152 printk(KERN_CRIT "NTFS: Failed to create %s!\n", 3152 pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
3153 ntfs_big_inode_cache_name);
3154 goto big_inode_err_out; 3153 goto big_inode_err_out;
3155 } 3154 }
3156 3155
3157 /* Register the ntfs sysctls. */ 3156 /* Register the ntfs sysctls. */
3158 err = ntfs_sysctl(1); 3157 err = ntfs_sysctl(1);
3159 if (err) { 3158 if (err) {
3160 printk(KERN_CRIT "NTFS: Failed to register NTFS sysctls!\n"); 3159 pr_crit("Failed to register NTFS sysctls!\n");
3161 goto sysctl_err_out; 3160 goto sysctl_err_out;
3162 } 3161 }
3163 3162
@@ -3166,7 +3165,7 @@ static int __init init_ntfs_fs(void)
3166 ntfs_debug("NTFS driver registered successfully."); 3165 ntfs_debug("NTFS driver registered successfully.");
3167 return 0; /* Success! */ 3166 return 0; /* Success! */
3168 } 3167 }
3169 printk(KERN_CRIT "NTFS: Failed to register NTFS filesystem driver!\n"); 3168 pr_crit("Failed to register NTFS filesystem driver!\n");
3170 3169
3171 /* Unregister the ntfs sysctls. */ 3170 /* Unregister the ntfs sysctls. */
3172 ntfs_sysctl(0); 3171 ntfs_sysctl(0);
@@ -3182,8 +3181,7 @@ actx_err_out:
3182 kmem_cache_destroy(ntfs_index_ctx_cache); 3181 kmem_cache_destroy(ntfs_index_ctx_cache);
3183ictx_err_out: 3182ictx_err_out:
3184 if (!err) { 3183 if (!err) {
3185 printk(KERN_CRIT "NTFS: Aborting NTFS filesystem driver " 3184 pr_crit("Aborting NTFS filesystem driver registration...\n");
3186 "registration...\n");
3187 err = -ENOMEM; 3185 err = -ENOMEM;
3188 } 3186 }
3189 return err; 3187 return err;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 555f4cddefe3..7e8282dcea2a 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -205,6 +205,7 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
 	di->i_mode = cpu_to_le16(inode->i_mode);
 	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
 	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 
 	ocfs2_journal_dirty(handle, di_bh);
 
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index e2edff38be52..b4deb5f750d9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5728,6 +5728,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
 	}
 
 	ocfs2_et_update_clusters(et, -len);
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 
 	ocfs2_journal_dirty(handle, et->et_root_bh);
 
@@ -6932,6 +6933,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
 	spin_unlock(&oi->ip_lock);
 
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 	ocfs2_dinode_new_extent_list(inode, di);
 
 	ocfs2_journal_dirty(handle, di_bh);
@@ -7208,6 +7210,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
 	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 	ocfs2_journal_dirty(handle, di_bh);
 
 out_commit:
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index aeb44e879c51..d310d12a9adc 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -571,7 +571,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 	int level;
-	wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
 
 	/* this io's submitter should not have unlocked this before we could */
 	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
@@ -582,10 +581,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 	if (ocfs2_iocb_is_unaligned_aio(iocb)) {
 		ocfs2_iocb_clear_unaligned_aio(iocb);
 
-		if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
-		    waitqueue_active(wq)) {
-			wake_up_all(wq);
-		}
+		mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
 	}
 
 	ocfs2_iocb_clear_rw_locked(iocb);
@@ -2043,6 +2039,7 @@ out_write_size:
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
 	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 	ocfs2_journal_dirty(handle, wc->w_di_bh);
 
 	ocfs2_commit_trans(osb, handle);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index f671e49beb34..6cae155d54df 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -102,9 +102,4 @@ enum ocfs2_iocb_lock_bits {
 #define ocfs2_iocb_is_unaligned_aio(iocb) \
 	test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
 
-#define OCFS2_IOEND_WQ_HASH_SZ 37
-#define ocfs2_ioend_wq(v)   (&ocfs2__ioend_wq[((unsigned long)(v)) %\
-					    OCFS2_IOEND_WQ_HASH_SZ])
-extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
-
 #endif /* OCFS2_FILE_H */
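
Note on the aops.c and aops.h hunks above: the per-inode serialization of unaligned AIO moves from an atomic counter plus a hashed wait queue to a plain mutex (ip_unaligned_aio becomes a struct mutex in the fs/ocfs2/inode.h hunk later in this diff). For reference, the deleted macro implemented the hashing idiom sketched below, which avoids embedding a wait queue in every inode at the cost of unrelated inodes occasionally sharing a queue; the names here mirror the removed definitions.

	#include <linux/wait.h>

	/* Sketch of the removed pattern: a fixed array of wait queues,
	 * indexed by hashing the inode pointer. */
	#define IOEND_WQ_HASH_SZ 37
	static wait_queue_head_t ioend_wq[IOEND_WQ_HASH_SZ];

	static inline wait_queue_head_t *ioend_wq_for(const void *inode)
	{
		return &ioend_wq[(unsigned long)inode % IOEND_WQ_HASH_SZ];
	}
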
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 5b704c63a103..1edcb141f639 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -90,7 +90,6 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 		 * information for this bh as it's not marked locally
 		 * uptodate. */
 		ret = -EIO;
-		put_bh(bh);
 		mlog_errno(ret);
 	}
 
@@ -420,7 +419,6 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 
 	if (!buffer_uptodate(bh)) {
 		ret = -EIO;
-		put_bh(bh);
 		mlog_errno(ret);
 	}
 
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index a4b07730b2e1..b7f57271d49c 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -41,7 +41,7 @@ static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
 	return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
 }
 static struct kobj_attribute attr_version =
-	__ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL);
+	__ATTR(interface_revision, S_IRUGO, version_show, NULL);
 
 static struct attribute *o2cb_attrs[] = {
 	&attr_version.attr,
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2cd2406b4140..c6b90e670389 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -137,7 +137,7 @@ static int o2net_sys_err_translations[O2NET_ERR_MAX] =
 static void o2net_sc_connect_completed(struct work_struct *work);
 static void o2net_rx_until_empty(struct work_struct *work);
 static void o2net_shutdown_sc(struct work_struct *work);
-static void o2net_listen_data_ready(struct sock *sk, int bytes);
+static void o2net_listen_data_ready(struct sock *sk);
 static void o2net_sc_send_keep_req(struct work_struct *work);
 static void o2net_idle_timer(unsigned long data);
 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
@@ -262,17 +262,17 @@ static void o2net_update_recv_stats(struct o2net_sock_container *sc)
 
 #endif /* CONFIG_OCFS2_FS_STATS */
 
-static inline int o2net_reconnect_delay(void)
+static inline unsigned int o2net_reconnect_delay(void)
 {
 	return o2nm_single_cluster->cl_reconnect_delay_ms;
 }
 
-static inline int o2net_keepalive_delay(void)
+static inline unsigned int o2net_keepalive_delay(void)
 {
 	return o2nm_single_cluster->cl_keepalive_delay_ms;
 }
 
-static inline int o2net_idle_timeout(void)
+static inline unsigned int o2net_idle_timeout(void)
 {
 	return o2nm_single_cluster->cl_idle_timeout_ms;
 }
@@ -597,9 +597,9 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 }
 
 /* see o2net_register_callbacks() */
-static void o2net_data_ready(struct sock *sk, int bytes)
+static void o2net_data_ready(struct sock *sk)
 {
-	void (*ready)(struct sock *sk, int bytes);
+	void (*ready)(struct sock *sk);
 
 	read_lock(&sk->sk_callback_lock);
 	if (sk->sk_user_data) {
@@ -613,7 +613,7 @@ static void o2net_data_ready(struct sock *sk, int bytes)
 	}
 	read_unlock(&sk->sk_callback_lock);
 
-	ready(sk, bytes);
+	ready(sk);
 }
 
 /* see o2net_register_callbacks() */
@@ -916,57 +916,30 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key)
 
 static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
 {
-	int ret;
-	mm_segment_t oldfs;
-	struct kvec vec = {
-		.iov_len = len,
-		.iov_base = data,
-	};
-	struct msghdr msg = {
-		.msg_iovlen = 1,
-		.msg_iov = (struct iovec *)&vec,
-		.msg_flags = MSG_DONTWAIT,
-	};
-
-	oldfs = get_fs();
-	set_fs(get_ds());
-	ret = sock_recvmsg(sock, &msg, len, msg.msg_flags);
-	set_fs(oldfs);
-
-	return ret;
+	struct kvec vec = { .iov_len = len, .iov_base = data, };
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT, };
+	return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags);
 }
 
 static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
 			      size_t veclen, size_t total)
 {
 	int ret;
-	mm_segment_t oldfs;
-	struct msghdr msg = {
-		.msg_iov = (struct iovec *)vec,
-		.msg_iovlen = veclen,
-	};
+	struct msghdr msg;
 
 	if (sock == NULL) {
 		ret = -EINVAL;
 		goto out;
 	}
 
-	oldfs = get_fs();
-	set_fs(get_ds());
-	ret = sock_sendmsg(sock, &msg, total);
-	set_fs(oldfs);
-	if (ret != total) {
-		mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret,
-		     total);
-		if (ret >= 0)
-			ret = -EPIPE; /* should be smarter, I bet */
-		goto out;
-	}
-
-	ret = 0;
+	ret = kernel_sendmsg(sock, &msg, vec, veclen, total);
+	if (likely(ret == total))
+		return 0;
+	mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total);
+	if (ret >= 0)
+		ret = -EPIPE; /* should be smarter, I bet */
 out:
-	if (ret < 0)
-		mlog(0, "returning error: %d\n", ret);
+	mlog(0, "returning error: %d\n", ret);
 	return ret;
 }
 
@@ -1953,9 +1926,9 @@ static void o2net_accept_many(struct work_struct *work)
 	cond_resched();
 }
 
-static void o2net_listen_data_ready(struct sock *sk, int bytes)
+static void o2net_listen_data_ready(struct sock *sk)
 {
-	void (*ready)(struct sock *sk, int bytes);
+	void (*ready)(struct sock *sk);
 
 	read_lock(&sk->sk_callback_lock);
 	ready = sk->sk_user_data;
@@ -1964,18 +1937,29 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes)
 		goto out;
 	}
 
-	/* ->sk_data_ready is also called for a newly established child socket
-	 * before it has been accepted and the acceptor has set up their
-	 * data_ready.. we only want to queue listen work for our listening
-	 * socket */
+	/* This callback may called twice when a new connection
+	 * is being established as a child socket inherits everything
+	 * from a parent LISTEN socket, including the data_ready cb of
+	 * the parent. This leads to a hazard. In o2net_accept_one()
+	 * we are still initializing the child socket but have not
+	 * changed the inherited data_ready callback yet when
+	 * data starts arriving.
+	 * We avoid this hazard by checking the state.
+	 * For the listening socket,  the state will be TCP_LISTEN; for the new
+	 * socket, will be  TCP_ESTABLISHED. Also, in this case,
+	 * sk->sk_user_data is not a valid function pointer.
+	 */
+
 	if (sk->sk_state == TCP_LISTEN) {
-		mlog(ML_TCP, "bytes: %d\n", bytes);
 		queue_work(o2net_wq, &o2net_listen_work);
+	} else {
+		ready = NULL;
 	}
 
 out:
 	read_unlock(&sk->sk_callback_lock);
-	ready(sk, bytes);
+	if (ready != NULL)
+		ready(sk);
 }
 
 static int o2net_open_listening_sock(__be32 addr, __be16 port)
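
Note on the o2net_recv_tcp_msg()/o2net_send_tcp_msg() conversion above: kernel_recvmsg() and kernel_sendmsg() take kvec arrays pointing at kernel memory, so the open-coded get_fs()/set_fs(get_ds()) address-limit switching becomes unnecessary. A condensed sketch of the resulting pattern (hypothetical helper names; the bodies mirror the hunks above):

	#include <linux/net.h>
	#include <linux/socket.h>

	static int example_recv(struct socket *sock, void *data, size_t len)
	{
		struct kvec vec = { .iov_base = data, .iov_len = len };
		struct msghdr msg = { .msg_flags = MSG_DONTWAIT };

		/* Non-blocking receive into a kernel buffer. */
		return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags);
	}

	static int example_send(struct socket *sock, struct kvec *vec,
				size_t veclen, size_t total)
	{
		struct msghdr msg = { };

		/* Returns bytes sent, or a negative errno. */
		return kernel_sendmsg(sock, &msg, vec, veclen, total);
	}
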
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 4cbcb65784a3..dc024367110a 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -165,7 +165,7 @@ struct o2net_sock_container {
 
 	/* original handlers for the sockets */
 	void			(*sc_state_change)(struct sock *sk);
-	void			(*sc_data_ready)(struct sock *sk, int bytes);
+	void			(*sc_data_ready)(struct sock *sk);
 
 	u32			sc_msg_key;
 	u16			sc_msg_type;
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 0d3a97d2d5f6..e2e05a106beb 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -37,7 +37,6 @@
 #include "dlmglue.h"
 #include "file.h"
 #include "inode.h"
-#include "super.h"
 #include "ocfs2_trace.h"
 
 void ocfs2_dentry_attach_gen(struct dentry *dentry)
@@ -346,52 +345,6 @@ out_attach:
 	return ret;
 }
 
-DEFINE_SPINLOCK(dentry_list_lock);
-
-/* We limit the number of dentry locks to drop in one go. We have
- * this limit so that we don't starve other users of ocfs2_wq. */
-#define DL_INODE_DROP_COUNT 64
-
-/* Drop inode references from dentry locks */
-static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count)
-{
-	struct ocfs2_dentry_lock *dl;
-
-	spin_lock(&dentry_list_lock);
-	while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) {
-		dl = osb->dentry_lock_list;
-		osb->dentry_lock_list = dl->dl_next;
-		spin_unlock(&dentry_list_lock);
-		iput(dl->dl_inode);
-		kfree(dl);
-		spin_lock(&dentry_list_lock);
-	}
-	spin_unlock(&dentry_list_lock);
-}
-
-void ocfs2_drop_dl_inodes(struct work_struct *work)
-{
-	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
-					       dentry_lock_work);
-
-	__ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT);
-	/*
-	 * Don't queue dropping if umount is in progress. We flush the
-	 * list in ocfs2_dismount_volume
-	 */
-	spin_lock(&dentry_list_lock);
-	if (osb->dentry_lock_list &&
-	    !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
-		queue_work(ocfs2_wq, &osb->dentry_lock_work);
-	spin_unlock(&dentry_list_lock);
-}
-
-/* Flush the whole work queue */
-void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
-{
-	__ocfs2_drop_dl_inodes(osb, -1);
-}
-
 /*
  * ocfs2_dentry_iput() and friends.
  *
@@ -416,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
 static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
 				   struct ocfs2_dentry_lock *dl)
 {
+	iput(dl->dl_inode);
 	ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
 	ocfs2_lock_res_free(&dl->dl_lockres);
-
-	/* We leave dropping of inode reference to ocfs2_wq as that can
-	 * possibly lead to inode deletion which gets tricky */
-	spin_lock(&dentry_list_lock);
-	if (!osb->dentry_lock_list &&
-	    !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
-		queue_work(ocfs2_wq, &osb->dentry_lock_work);
-	dl->dl_next = osb->dentry_lock_list;
-	osb->dentry_lock_list = dl;
-	spin_unlock(&dentry_list_lock);
+	kfree(dl);
 }
 
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
 			   struct ocfs2_dentry_lock *dl)
 {
-	int unlock;
+	int unlock = 0;
 
 	BUG_ON(dl->dl_count == 0);
 
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index b79eff709958..55f58892b153 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -29,13 +29,8 @@
 extern const struct dentry_operations ocfs2_dentry_ops;
 
 struct ocfs2_dentry_lock {
-	/* Use count of dentry lock */
 	unsigned int		dl_count;
-	union {
-		/* Linked list of dentry locks to release */
-		struct ocfs2_dentry_lock *dl_next;
-		u64			dl_parent_blkno;
-	};
+	u64			dl_parent_blkno;
 
 	/*
 	 * The ocfs2_dentry_lock keeps an inode reference until
@@ -49,14 +44,9 @@ struct ocfs2_dentry_lock {
 int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
 			     u64 parent_blkno);
 
-extern spinlock_t dentry_list_lock;
-
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
 			   struct ocfs2_dentry_lock *dl);
 
-void ocfs2_drop_dl_inodes(struct work_struct *work);
-void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb);
-
 struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
 				      int skip_unhashed);
 
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 91a7e85ac8fd..0717662b4aef 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2957,6 +2957,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		ocfs2_init_dir_trailer(dir, dirdata_bh, i);
 	}
 
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
 	ocfs2_journal_dirty(handle, dirdata_bh);
 
 	if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
@@ -3005,6 +3006,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	di->i_size = cpu_to_le64(sb->s_blocksize);
 	di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
 
 	/*
 	 * This should never fail as our extent list is empty and all
@@ -3338,6 +3340,7 @@ do_extend:
 	} else {
 		de->rec_len = cpu_to_le16(sb->s_blocksize);
 	}
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
 	ocfs2_journal_dirty(handle, new_bh);
 
 	dir_i_size += dir->i_sb->s_blocksize;
@@ -3896,6 +3899,7 @@ out_commit:
 		dquot_free_space_nodirty(dir,
 				ocfs2_clusters_to_bytes(dir->i_sb, 1));
 
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
 	ocfs2_commit_trans(osb, handle);
 
 out:
@@ -4134,6 +4138,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
 		mlog_errno(ret);
 	did_quota = 0;
 
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
 	ocfs2_journal_dirty(handle, dx_root_bh);
 
 out_commit:
@@ -4401,6 +4406,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
 	di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
 	spin_unlock(&OCFS2_I(dir)->ip_lock);
 	di->i_dx_root = cpu_to_le64(0ULL);
+	ocfs2_update_inode_fsync_trans(handle, dir, 1);
 
 	ocfs2_journal_dirty(handle, di_bh);
 
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 33660a4a52fa..c973690dc0bc 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1123,7 +1123,6 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
 	struct dlm_ctxt *dlm = NULL;
 	char *local = NULL;
 	int status = 0;
-	int locked = 0;
 
 	qr = (struct dlm_query_region *) msg->buf;
 
@@ -1132,10 +1131,8 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
 
 	/* buffer used in dlm_mast_regions() */
 	local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
-	if (!local) {
-		status = -ENOMEM;
-		goto bail;
-	}
+	if (!local)
+		return -ENOMEM;
 
 	status = -EINVAL;
 
@@ -1144,16 +1141,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
 	if (!dlm) {
 		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
 		     "before join domain\n", qr->qr_node, qr->qr_domain);
-		goto bail;
+		goto out_domain_lock;
 	}
 
 	spin_lock(&dlm->spinlock);
-	locked = 1;
 	if (dlm->joining_node != qr->qr_node) {
 		mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
 		     "but joining node is %d\n", qr->qr_node, qr->qr_domain,
 		     dlm->joining_node);
-		goto bail;
+		goto out_dlm_lock;
 	}
 
 	/* Support for global heartbeat was added in 1.1 */
@@ -1163,14 +1159,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
 		     "but active dlm protocol is %d.%d\n", qr->qr_node,
 		     qr->qr_domain, dlm->dlm_locking_proto.pv_major,
 		     dlm->dlm_locking_proto.pv_minor);
-		goto bail;
+		goto out_dlm_lock;
 	}
 
 	status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions));
 
-bail:
-	if (locked)
-		spin_unlock(&dlm->spinlock);
+out_dlm_lock:
+	spin_unlock(&dlm->spinlock);
+
+out_domain_lock:
 	spin_unlock(&dlm_domain_lock);
 
 	kfree(local);
@@ -1877,19 +1874,19 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
 		goto bail;
 	}
 
-	status = dlm_debug_init(dlm);
+	status = dlm_launch_thread(dlm);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 
-	status = dlm_launch_thread(dlm);
+	status = dlm_launch_recovery_thread(dlm);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 
-	status = dlm_launch_recovery_thread(dlm);
+	status = dlm_debug_init(dlm);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
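
Note on the dlm_query_region_handler() cleanup above: replacing the single "bail" label plus a "locked" flag with one unwind label per lock level is the standard kernel error-handling idiom - each goto target releases exactly the locks held at that point, in reverse acquisition order. A self-contained sketch of the idiom (userspace pthreads stand in for the spinlocks; names are illustrative only):

	#include <pthread.h>

	static pthread_mutex_t domain_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t ctxt_lock = PTHREAD_MUTEX_INITIALIZER;

	static int example(int fail_before_inner, int fail_after_inner)
	{
		int status = 0;

		pthread_mutex_lock(&domain_lock);
		if (fail_before_inner) {
			status = -1;
			goto out_domain_lock;	/* only the outer lock held */
		}

		pthread_mutex_lock(&ctxt_lock);
		if (fail_after_inner) {
			status = -1;
			goto out_ctxt_lock;	/* both locks held */
		}

		/* ... work under both locks; success falls through ... */

	out_ctxt_lock:
		pthread_mutex_unlock(&ctxt_lock);
	out_domain_lock:
		pthread_mutex_unlock(&domain_lock);
		return status;
	}
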
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index af3f7aa73e13..ee1f88419cb0 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -472,11 +472,15 @@ bail:
 
 void dlm_destroy_master_caches(void)
 {
-	if (dlm_lockname_cache)
+	if (dlm_lockname_cache) {
 		kmem_cache_destroy(dlm_lockname_cache);
+		dlm_lockname_cache = NULL;
+	}
 
-	if (dlm_lockres_cache)
+	if (dlm_lockres_cache) {
 		kmem_cache_destroy(dlm_lockres_cache);
+		dlm_lockres_cache = NULL;
+	}
 }
 
 static void dlm_lockres_release(struct kref *kref)
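
Note on the dlm_destroy_master_caches() change above: NULLing the cache pointer after kmem_cache_destroy() makes the teardown helper idempotent, which matters when it can be reached more than once (for instance from an init-failure path and again from module exit); the NULL check then turns the second call into a no-op instead of a double destroy. In sketch form:

	#include <linux/slab.h>

	static struct kmem_cache *example_cache;

	/* Safe to call any number of times. */
	static void example_destroy_caches(void)
	{
		if (example_cache) {
			kmem_cache_destroy(example_cache);
			example_cache = NULL;
		}
	}
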
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 7035af09cc03..fe29f7978f81 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -537,7 +537,10 @@ master_here:
 		/* success!  see if any other nodes need recovery */
 		mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
 		     dlm->name, dlm->reco.dead_node, dlm->node_num);
-		dlm_reset_recovery(dlm);
+		spin_lock(&dlm->spinlock);
+		__dlm_reset_recovery(dlm);
+		dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+		spin_unlock(&dlm->spinlock);
 	}
 	dlm_end_recovery(dlm);
 
@@ -695,6 +698,14 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 		if (all_nodes_done) {
 			int ret;
 
+			/* Set this flag on recovery master to avoid
+			 * a new recovery for another dead node start
+			 * before the recovery is not done. That may
+			 * cause recovery hung.*/
+			spin_lock(&dlm->spinlock);
+			dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+			spin_unlock(&dlm->spinlock);
+
 			/* all nodes are now in DLM_RECO_NODE_DATA_DONE state
 			 * just send a finalize message to everyone and
 			 * clean up */
@@ -1750,13 +1761,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 				     struct dlm_migratable_lockres *mres)
 {
 	struct dlm_migratable_lock *ml;
-	struct list_head *queue;
+	struct list_head *queue, *iter;
 	struct list_head *tmpq = NULL;
 	struct dlm_lock *newlock = NULL;
 	struct dlm_lockstatus *lksb = NULL;
 	int ret = 0;
 	int i, j, bad;
-	struct dlm_lock *lock = NULL;
+	struct dlm_lock *lock;
 	u8 from = O2NM_MAX_NODES;
 	unsigned int added = 0;
 	__be64 c;
@@ -1791,14 +1802,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
 			/* MIGRATION ONLY! */
 			BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
 
+			lock = NULL;
 			spin_lock(&res->spinlock);
 			for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
 				tmpq = dlm_list_idx_to_ptr(res, j);
-				list_for_each_entry(lock, tmpq, list) {
-					if (lock->ml.cookie != ml->cookie)
-						lock = NULL;
-					else
+				list_for_each(iter, tmpq) {
+					lock = list_entry(iter,
+						  struct dlm_lock, list);
+					if (lock->ml.cookie == ml->cookie)
 						break;
+					lock = NULL;
 				}
 				if (lock)
 					break;
@@ -2882,8 +2895,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
 			BUG();
 		}
 		dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+		__dlm_reset_recovery(dlm);
 		spin_unlock(&dlm->spinlock);
-		dlm_reset_recovery(dlm);
 		dlm_kick_recovery_thread(dlm);
 		break;
 	default:
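
Note on the dlm_process_recovery_data() loop rewrite above: when list_for_each_entry() runs to completion without a break, its cursor is not NULL - it points at a fake entry computed from the list head itself - so assigning NULL to the cursor inside the loop and testing it afterwards is unreliable. Iterating over the raw list_head and keeping the element pointer separate makes "found or not found" explicit. A self-contained sketch of the fixed shape, with minimal local definitions of the list macros so it compiles on its own:

	#include <stddef.h>

	struct list_head { struct list_head *next, *prev; };

	#define list_entry(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))
	#define list_for_each(pos, head) \
		for ((pos) = (head)->next; (pos) != (head); (pos) = (pos)->next)

	struct item {
		unsigned long cookie;
		struct list_head list;
	};

	static struct item *find_cookie(struct list_head *head,
					unsigned long cookie)
	{
		struct list_head *iter;
		struct item *it = NULL;

		list_for_each(iter, head) {
			it = list_entry(iter, struct item, list);
			if (it->cookie == cookie)
				break;	/* "it" is a real element here */
			it = NULL;	/* stays NULL if the loop finishes */
		}
		return it;		/* NULL means not found */
	}
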
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 19986959d149..6bd690b5a061 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3144,22 +3144,60 @@ out:
 	return 0;
 }
 
+static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+				       struct ocfs2_lock_res *lockres);
+
 /* Mark the lockres as being dropped. It will no longer be
  * queued if blocking, but we still may have to wait on it
  * being dequeued from the downconvert thread before we can consider
  * it safe to drop.
  *
  * You can *not* attempt to call cluster_lock on this lockres anymore. */
-void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres)
 {
 	int status;
 	struct ocfs2_mask_waiter mw;
-	unsigned long flags;
+	unsigned long flags, flags2;
 
 	ocfs2_init_mask_waiter(&mw);
 
 	spin_lock_irqsave(&lockres->l_lock, flags);
 	lockres->l_flags |= OCFS2_LOCK_FREEING;
+	if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) {
+		/*
+		 * We know the downconvert is queued but not in progress
+		 * because we are the downconvert thread and processing
+		 * different lock. So we can just remove the lock from the
+		 * queue. This is not only an optimization but also a way
+		 * to avoid the following deadlock:
+		 *   ocfs2_dentry_post_unlock()
+		 *     ocfs2_dentry_lock_put()
+		 *       ocfs2_drop_dentry_lock()
+		 *         iput()
+		 *           ocfs2_evict_inode()
+		 *             ocfs2_clear_inode()
+		 *               ocfs2_mark_lockres_freeing()
+		 *                 ... blocks waiting for OCFS2_LOCK_QUEUED
+		 *                 since we are the downconvert thread which
+		 *                 should clear the flag.
+		 */
+		spin_unlock_irqrestore(&lockres->l_lock, flags);
+		spin_lock_irqsave(&osb->dc_task_lock, flags2);
+		list_del_init(&lockres->l_blocked_list);
+		osb->blocked_lock_count--;
+		spin_unlock_irqrestore(&osb->dc_task_lock, flags2);
+		/*
+		 * Warn if we recurse into another post_unlock call.  Strictly
+		 * speaking it isn't a problem but we need to be careful if
+		 * that happens (stack overflow, deadlocks, ...) so warn if
+		 * ocfs2 grows a path for which this can happen.
+		 */
+		WARN_ON_ONCE(lockres->l_ops->post_unlock);
+		/* Since the lock is freeing we don't do much in the fn below */
+		ocfs2_process_blocked_lock(osb, lockres);
+		return;
+	}
 	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
 		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
 		spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -3180,7 +3218,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
 {
 	int ret;
 
-	ocfs2_mark_lockres_freeing(lockres);
+	ocfs2_mark_lockres_freeing(osb, lockres);
 	ret = ocfs2_drop_lock(osb, lockres);
 	if (ret)
 		mlog_errno(ret);
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 1d596d8c4a4a..d293a22c32c5 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -157,7 +157,8 @@ int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
 void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
 
 
-void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+				struct ocfs2_lock_res *lockres);
 void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
 			       struct ocfs2_lock_res *lockres);
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 51632c40e896..8970dcf74de5 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -175,9 +175,13 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
 			   int datasync)
 {
 	int err = 0;
-	journal_t *journal;
 	struct inode *inode = file->f_mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	journal_t *journal = osb->journal->j_journal;
+	int ret;
+	tid_t commit_tid;
+	bool needs_barrier = false;
 
 	trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
 			      OCFS2_I(inode)->ip_blkno,
@@ -192,29 +196,19 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
 	if (err)
 		return err;
 
-	/*
-	 * Probably don't need the i_mutex at all in here, just putting it here
-	 * to be consistent with how fsync used to be called, someone more
-	 * familiar with the fs could possibly remove it.
-	 */
-	mutex_lock(&inode->i_mutex);
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
-		/*
-		 * We still have to flush drive's caches to get data to the
-		 * platter
-		 */
-		if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
-			blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
-		goto bail;
+	commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
+	if (journal->j_flags & JBD2_BARRIER &&
+	    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+		needs_barrier = true;
+	err = jbd2_complete_transaction(journal, commit_tid);
+	if (needs_barrier) {
+		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+		if (!err)
+			err = ret;
 	}
 
-	journal = osb->journal->j_journal;
-	err = jbd2_journal_force_commit(journal);
-
-bail:
 	if (err)
 		mlog_errno(err);
-	mutex_unlock(&inode->i_mutex);
 
 	return (err < 0) ? -EIO : 0;
 }
@@ -292,6 +286,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
 	inode->i_atime = CURRENT_TIME;
 	di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
 	di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 	ocfs2_journal_dirty(handle, bh);
 
 out_commit:
@@ -341,6 +336,7 @@ int ocfs2_simple_size_update(struct inode *inode,
 	if (ret < 0)
 		mlog_errno(ret);
 
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 	ocfs2_commit_trans(osb, handle);
 out:
 	return ret;
@@ -435,6 +431,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 	di->i_size = cpu_to_le64(new_i_size);
 	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 
 	ocfs2_journal_dirty(handle, fe_bh);
 
@@ -650,7 +647,7 @@ restarted_transaction:
 		mlog_errno(status);
 		goto leave;
 	}
-
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 	ocfs2_journal_dirty(handle, bh);
 
 	spin_lock(&OCFS2_I(inode)->ip_lock);
@@ -743,6 +740,7 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret)
 		mlog_errno(ret);
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 
 out:
 	if (ret) {
@@ -840,6 +838,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 		di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
 		di->i_mtime_nsec = di->i_ctime_nsec;
 		ocfs2_journal_dirty(handle, di_bh);
+		ocfs2_update_inode_fsync_trans(handle, inode, 1);
 		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 	}
 
@@ -1344,6 +1343,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
 
 	di = (struct ocfs2_dinode *) bh->b_data;
 	di->i_mode = cpu_to_le16(inode->i_mode);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 
 	ocfs2_journal_dirty(handle, bh);
 
@@ -1576,6 +1576,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
 		if (ret)
 			mlog_errno(ret);
 	}
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 
 	ocfs2_commit_trans(osb, handle);
 out:
@@ -2061,13 +2062,6 @@ out:
 	return ret;
 }
 
-static void ocfs2_aiodio_wait(struct inode *inode)
-{
-	wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
-
-	wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
-}
-
 static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
 {
 	int blockmask = inode->i_sb->s_blocksize - 1;
@@ -2345,10 +2339,8 @@ relock:
 		 * Wait on previous unaligned aio to complete before
 		 * proceeding.
		 */
-		ocfs2_aiodio_wait(inode);
-
-		/* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
-		atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
+		mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio);
+		/* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
 		ocfs2_iocb_set_unaligned_aio(iocb);
 	}
 
@@ -2375,15 +2367,18 @@ relock:
 
 	if (direct_io) {
 		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
-						    ppos, count, ocount);
+						    count, ocount);
 		if (written < 0) {
 			ret = written;
 			goto out_dio;
 		}
 	} else {
+		struct iov_iter from;
+		iov_iter_init(&from, iov, nr_segs, count, 0);
 		current->backing_dev_info = file->f_mapping->backing_dev_info;
-		written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
-						      ppos, count, 0);
+		written = generic_perform_write(file, &from, *ppos);
+		if (likely(written >= 0))
+			iocb->ki_pos = *ppos + written;
 		current->backing_dev_info = NULL;
 	}
 
@@ -2428,7 +2423,7 @@ out_dio:
 
 	if (unaligned_dio) {
 		ocfs2_iocb_clear_unaligned_aio(iocb);
-		atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
+		mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
 	}
 
 out:
@@ -2645,7 +2640,16 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
 	case SEEK_SET:
 		break;
 	case SEEK_END:
-		offset += inode->i_size;
+		/* SEEK_END requires the OCFS2 inode lock for the file
+		 * because it references the file's size.
+		 */
+		ret = ocfs2_inode_lock(inode, NULL, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		offset += i_size_read(inode);
+		ocfs2_inode_unlock(inode, 0);
 		break;
 	case SEEK_CUR:
 		if (offset == 0) {
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index f29a90fde619..437de7f768c6 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -130,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
 	struct inode *inode = NULL;
 	struct super_block *sb = osb->sb;
 	struct ocfs2_find_inode_args args;
+	journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
 
 	trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
 			       sysfile_type);
@@ -169,6 +170,32 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
 		goto bail;
 	}
 
+	/*
+	 * Set transaction id's of transactions that have to be committed
+	 * to finish f[data]sync. We set them to currently running transaction
+	 * as we cannot be sure that the inode or some of its metadata isn't
+	 * part of the transaction - the inode could have been reclaimed and
+	 * now it is reread from disk.
+	 */
+	if (journal) {
+		transaction_t *transaction;
+		tid_t tid;
+		struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+		read_lock(&journal->j_state_lock);
+		if (journal->j_running_transaction)
+			transaction = journal->j_running_transaction;
+		else
+			transaction = journal->j_committing_transaction;
+		if (transaction)
+			tid = transaction->t_tid;
+		else
+			tid = journal->j_commit_sequence;
+		read_unlock(&journal->j_state_lock);
+		oi->i_sync_tid = tid;
+		oi->i_datasync_tid = tid;
+	}
+
 bail:
 	if (!IS_ERR(inode)) {
 		trace_ocfs2_iget_end(inode,
@@ -804,11 +831,13 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
 		goto bail;
 	}
 
-	/* If we're coming from downconvert_thread we can't go into our own
-	 * voting [hello, deadlock city!], so unforuntately we just
-	 * have to skip deleting this guy. That's OK though because
-	 * the node who's doing the actual deleting should handle it
-	 * anyway. */
+	/*
+	 * If we're coming from downconvert_thread we can't go into our own
+	 * voting [hello, deadlock city!] so we cannot delete the inode. But
+	 * since we dropped last inode ref when downconverting dentry lock,
+	 * we cannot have the file open and thus the node doing unlink will
+	 * take care of deleting the inode.
+	 */
 	if (current == osb->dc_task)
 		goto bail;
 
@@ -822,12 +851,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
 		goto bail_unlock;
 	}
 
-	/* If we have allowd wipe of this inode for another node, it
-	 * will be marked here so we can safely skip it. Recovery will
-	 * cleanup any inodes we might inadvertently skip here. */
-	if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE)
-		goto bail_unlock;
-
 	ret = 1;
 bail_unlock:
 	spin_unlock(&oi->ip_lock);
@@ -941,7 +964,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
 		(unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
 	if (sync_data)
 		filemap_write_and_wait(inode->i_mapping);
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 }
 
 static void ocfs2_delete_inode(struct inode *inode)
@@ -960,8 +983,6 @@ static void ocfs2_delete_inode(struct inode *inode)
 	if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno)
 		goto bail;
 
-	dquot_initialize(inode);
-
 	if (!ocfs2_inode_is_valid_to_delete(inode)) {
 		/* It's probably not necessary to truncate_inode_pages
 		 * here but we do it for safety anyway (it will most
@@ -970,6 +991,8 @@ static void ocfs2_delete_inode(struct inode *inode)
 		goto bail;
 	}
 
+	dquot_initialize(inode);
+
 	/* We want to block signals in delete_inode as the lock and
 	 * messaging paths may return us -ERESTARTSYS. Which would
 	 * cause us to exit early, resulting in inodes being orphaned
@@ -1057,6 +1080,7 @@ static void ocfs2_clear_inode(struct inode *inode)
 {
 	int status;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	clear_inode(inode);
 	trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
@@ -1073,9 +1097,9 @@ static void ocfs2_clear_inode(struct inode *inode)
 
 	/* Do these before all the other work so that we don't bounce
 	 * the downconvert thread while waiting to destroy the locks. */
-	ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
-	ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
-	ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres);
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
 
 	ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
 			   &oi->ip_la_data_resv);
@@ -1157,7 +1181,7 @@ void ocfs2_evict_inode(struct inode *inode)
 	    (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
 		ocfs2_delete_inode(inode);
 	} else {
-		truncate_inode_pages(&inode->i_data, 0);
+		truncate_inode_pages_final(&inode->i_data);
 	}
 	ocfs2_clear_inode(inode);
 }
@@ -1260,6 +1284,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 	fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
 
 	ocfs2_journal_dirty(handle, bh);
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 leave:
 	return status;
 }
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 621fc73bf23d..a6c991c0fc98 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -44,7 +44,7 @@ struct ocfs2_inode_info
 	struct rw_semaphore		ip_xattr_sem;
 
 	/* Number of outstanding AIO's which are not page aligned */
-	atomic_t			ip_unaligned_aio;
+	struct mutex			ip_unaligned_aio;
 
 	/* These fields are protected by ip_lock */
 	spinlock_t			ip_lock;
@@ -73,6 +73,13 @@ struct ocfs2_inode_info
 	u32				ip_dir_lock_gen;
 
 	struct ocfs2_alloc_reservation	ip_la_data_resv;
+
+	/*
+	 * Transactions that contain inode's metadata needed to complete
+	 * fsync and fdatasync, respectively.
+	 */
+	tid_t i_sync_tid;
+	tid_t i_datasync_tid;
 };
 
 /*
@@ -84,8 +91,6 @@ struct ocfs2_inode_info
 #define OCFS2_INODE_BITMAP		0x00000004
 /* This inode has been wiped from disk */
 #define OCFS2_INODE_DELETED		0x00000008
-/* Another node is deleting, so our delete is a nop */
-#define OCFS2_INODE_SKIP_DELETE		0x00000010
 /* Has the inode been orphaned on another node?
 *
 * This hints to ocfs2_drop_inode that it should clear i_nlink before
@@ -100,11 +105,11 @@ struct ocfs2_inode_info
 * rely on ocfs2_delete_inode to sort things out under the proper
 * cluster locks.
 */
-#define OCFS2_INODE_MAYBE_ORPHANED	0x00000020
+#define OCFS2_INODE_MAYBE_ORPHANED	0x00000010
 /* Does someone have the file open O_DIRECT */
-#define OCFS2_INODE_OPEN_DIRECT		0x00000040
+#define OCFS2_INODE_OPEN_DIRECT		0x00000020
 /* Tell the inode wipe code it's not in orphan dir */
-#define OCFS2_INODE_SKIP_ORPHAN_DIR	0x00000080
+#define OCFS2_INODE_SKIP_ORPHAN_DIR	0x00000040
 
 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 {
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 8ca3c29accbf..490229f43731 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -413,11 +413,12 @@ int ocfs2_info_handle_freeinode(struct inode *inode,
 		}
 
 		status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
-		if (status < 0)
-			goto bail;
 
 		iput(inode_alloc);
 		inode_alloc = NULL;
+
+		if (status < 0)
+			goto bail;
 	}
 
 	o2info_set_request_filled(&oifi->ifi_req);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 44fc3e530c3d..03ea9314fecd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -2132,12 +2132,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 		iter = oi->ip_next_orphan;
 
 		spin_lock(&oi->ip_lock);
-		/* The remote delete code may have set these on the
-		 * assumption that the other node would wipe them
-		 * successfully.  If they are still in the node's
-		 * orphan dir, we need to reset that state. */
-		oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
-
 		/* Set the proper information to get us going into
 		 * ocfs2_delete_inode. */
 		oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 9ff4e8cf9d97..7f8cde94abfe 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -626,4 +626,15 @@ static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
 						    new_size);
 }
 
+static inline void ocfs2_update_inode_fsync_trans(handle_t *handle,
+						  struct inode *inode,
+						  int datasync)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	oi->i_sync_tid = handle->h_transaction->t_tid;
+	if (datasync)
+		oi->i_datasync_tid = handle->h_transaction->t_tid;
+}
+
 #endif /* OCFS2_JOURNAL_H */
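
Note on the helper added above: together with the ocfs2_sync_file() rewrite in the fs/ocfs2/file.c hunks earlier in this diff, this implements a common jbd2 fsync optimization - every transaction that touches an inode stamps the inode with its tid, and fsync then waits only for that specific transaction (jbd2_complete_transaction() is cheap if it has already committed) instead of forcing a brand-new journal commit. A condensed sketch of the consumer side, using the fields this diff introduces and omitting the barrier handling shown in the file.c hunk:

	/* Kernel-context sketch; mirrors the new ocfs2_sync_file() logic. */
	static int wait_for_inode_commit(journal_t *journal,
					 struct ocfs2_inode_info *oi,
					 int datasync)
	{
		tid_t tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;

		/* Starts/wakes the commit if needed, then waits for it. */
		return jbd2_complete_transaction(journal, tid);
	}
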
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index e57c804069ea..6b6d092b0998 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -82,6 +82,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
 	}
 
 	ret = flock_lock_file_wait(file, fl);
+	if (ret)
+		ocfs2_file_unlock(file);
 
 out:
 	mutex_unlock(&fp->fp_mutex);
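
Note on the ocfs2_do_flock() fix above: it applies the usual undo-on-failure rule - if the local flock_lock_file_wait() bookkeeping fails after the cluster-level lock was already taken, that cluster lock must be released, otherwise the file stays locked cluster-wide with no local owner left to ever unlock it. The shape of the fix, with hypothetical helper names standing in for the surrounding ocfs2 calls:

	static int example_flock(struct file *file, struct file_lock *fl)
	{
		int ret;

		ret = take_cluster_lock(file);		/* hypothetical */
		if (ret)
			return ret;

		ret = flock_lock_file_wait(file, fl);
		if (ret)
			drop_cluster_lock(file);	/* undo on failure */

		return ret;
	}
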
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 64c304d668f0..599eb4c4c8be 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -151,6 +151,7 @@ static int __ocfs2_move_extent(handle_t *handle,
 					old_blkno, len);
 	}
 
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 out:
 	ocfs2_free_path(path);
 	return ret;
@@ -690,8 +691,11 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
 
 	ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
 					 goal_bit, len);
-	if (ret)
+	if (ret) {
+		ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
+					       le16_to_cpu(gd->bg_chain));
 		mlog_errno(ret);
+	}
 
 	/*
 	 * Here we should write the new page out first if we are
@@ -957,6 +961,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
 	inode->i_ctime = CURRENT_TIME;
 	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
 	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	ocfs2_update_inode_fsync_trans(handle, inode, 0);
 
 	ocfs2_journal_dirty(handle, di_bh);
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 3683643f3f0e..2060fc398445 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -450,7 +450,6 @@ leave:
 
 	brelse(new_fe_bh);
 	brelse(parent_fe_bh);
-	kfree(si.name);
 	kfree(si.value);
 
 	ocfs2_free_dir_lookup_result(&lookup);
@@ -495,6 +494,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_extent_list *fel;
 	u16 feat;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
 	*new_fe_bh = NULL;
 
@@ -576,8 +576,8 @@ static int __ocfs2_mknod_locked(struct inode *dir,
 			mlog_errno(status);
 	}
 
-	status = 0; /* error in ocfs2_create_new_inode_locks is not
-		     * critical */
+	oi->i_sync_tid = handle->h_transaction->t_tid;
+	oi->i_datasync_tid = handle->h_transaction->t_tid;
 
 leave:
 	if (status < 0) {
@@ -1855,7 +1855,6 @@ bail:
 
 	brelse(new_fe_bh);
 	brelse(parent_fe_bh);
-	kfree(si.name);
 	kfree(si.value);
 	ocfs2_free_dir_lookup_result(&lookup);
 	if (inode_ac)
@@ -2481,6 +2480,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
 	di->i_orphaned_slot = 0;
 	set_nlink(inode, 1);
 	ocfs2_set_links_count(di, inode->i_nlink);
+	ocfs2_update_inode_fsync_trans(handle, inode, 1);
 	ocfs2_journal_dirty(handle, di_bh);
 
 	status = ocfs2_add_entry(handle, dentry, inode,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 553f53cc73ae..8d64a97a9d5e 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -30,6 +30,7 @@
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/wait.h> 31#include <linux/wait.h>
32#include <linux/list.h> 32#include <linux/list.h>
33#include <linux/llist.h>
33#include <linux/rbtree.h> 34#include <linux/rbtree.h>
34#include <linux/workqueue.h> 35#include <linux/workqueue.h>
35#include <linux/kref.h> 36#include <linux/kref.h>
@@ -274,19 +275,16 @@ enum ocfs2_mount_options
274 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ 275 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
275}; 276};
276 277
277#define OCFS2_OSB_SOFT_RO 0x0001 278#define OCFS2_OSB_SOFT_RO 0x0001
278#define OCFS2_OSB_HARD_RO 0x0002 279#define OCFS2_OSB_HARD_RO 0x0002
279#define OCFS2_OSB_ERROR_FS 0x0004 280#define OCFS2_OSB_ERROR_FS 0x0004
280#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 281#define OCFS2_DEFAULT_ATIME_QUANTUM 60
281
282#define OCFS2_DEFAULT_ATIME_QUANTUM 60
283 282
284struct ocfs2_journal; 283struct ocfs2_journal;
285struct ocfs2_slot_info; 284struct ocfs2_slot_info;
286struct ocfs2_recovery_map; 285struct ocfs2_recovery_map;
287struct ocfs2_replay_map; 286struct ocfs2_replay_map;
288struct ocfs2_quota_recovery; 287struct ocfs2_quota_recovery;
289struct ocfs2_dentry_lock;
290struct ocfs2_super 288struct ocfs2_super
291{ 289{
292 struct task_struct *commit_task; 290 struct task_struct *commit_task;
@@ -414,10 +412,9 @@ struct ocfs2_super
414 struct list_head blocked_lock_list; 412 struct list_head blocked_lock_list;
415 unsigned long blocked_lock_count; 413 unsigned long blocked_lock_count;
416 414
417 /* List of dentry locks to release. Anyone can add locks to 415 /* List of dquot structures to drop last reference to */
418 * the list, ocfs2_wq processes the list */ 416 struct llist_head dquot_drop_list;
419 struct ocfs2_dentry_lock *dentry_lock_list; 417 struct work_struct dquot_drop_work;
420 struct work_struct dentry_lock_work;
421 418
422 wait_queue_head_t osb_mount_event; 419 wait_queue_head_t osb_mount_event;
423 420
@@ -449,6 +446,8 @@ struct ocfs2_super
449 /* rb tree root for refcount lock. */ 446 /* rb tree root for refcount lock. */
450 struct rb_root osb_rf_lock_tree; 447 struct rb_root osb_rf_lock_tree;
451 struct ocfs2_refcount_tree *osb_ref_tree_lru; 448 struct ocfs2_refcount_tree *osb_ref_tree_lru;
449
450 struct mutex system_file_mutex;
452}; 451};
453 452
454#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 453#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -579,18 +578,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
579 spin_unlock(&osb->osb_lock); 578 spin_unlock(&osb->osb_lock);
580} 579}
581 580
582
583static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb,
584 unsigned long flag)
585{
586 unsigned long ret;
587
588 spin_lock(&osb->osb_lock);
589 ret = osb->osb_flags & flag;
590 spin_unlock(&osb->osb_lock);
591 return ret;
592}
593
594static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, 581static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
595 int hard) 582 int hard)
596{ 583{
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index d5ab56cbe5c5..f266d67df3c6 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -28,6 +28,7 @@ struct ocfs2_dquot {
28 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ 28 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
29 s64 dq_origspace; /* Last globally synced space usage */ 29 s64 dq_origspace; /* Last globally synced space usage */
30 s64 dq_originodes; /* Last globally synced inode usage */ 30 s64 dq_originodes; /* Last globally synced inode usage */
31 struct llist_node list; /* Member of list of dquots to drop */
31}; 32};
32 33
33/* Description of one chunk to recover in memory */ 34/* Description of one chunk to recover in memory */
@@ -110,6 +111,7 @@ int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
110int ocfs2_create_local_dquot(struct dquot *dquot); 111int ocfs2_create_local_dquot(struct dquot *dquot);
111int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot); 112int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
112int ocfs2_local_write_dquot(struct dquot *dquot); 113int ocfs2_local_write_dquot(struct dquot *dquot);
114void ocfs2_drop_dquot_refs(struct work_struct *work);
113 115
114extern const struct dquot_operations ocfs2_quota_operations; 116extern const struct dquot_operations ocfs2_quota_operations;
115extern struct quota_format_type ocfs2_quota_format; 117extern struct quota_format_type ocfs2_quota_format;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index d7b5108789e2..b990a62cff50 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -10,6 +10,7 @@
10#include <linux/jiffies.h> 10#include <linux/jiffies.h>
11#include <linux/writeback.h> 11#include <linux/writeback.h>
12#include <linux/workqueue.h> 12#include <linux/workqueue.h>
13#include <linux/llist.h>
13 14
14#include <cluster/masklog.h> 15#include <cluster/masklog.h>
15 16
@@ -679,6 +680,27 @@ static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
679 OCFS2_INODE_UPDATE_CREDITS; 680 OCFS2_INODE_UPDATE_CREDITS;
680} 681}
681 682
683void ocfs2_drop_dquot_refs(struct work_struct *work)
684{
685 struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
686 dquot_drop_work);
687 struct llist_node *list;
688 struct ocfs2_dquot *odquot, *next_odquot;
689
690 list = llist_del_all(&osb->dquot_drop_list);
691 llist_for_each_entry_safe(odquot, next_odquot, list, list) {
692 /* Drop the reference we acquired in ocfs2_dquot_release() */
693 dqput(&odquot->dq_dquot);
694 }
695}
696
697/*
698 * Called when the last reference to dquot is dropped. If we are called from
699 * downconvert thread, we cannot do all the handling here because grabbing
700 * quota lock could deadlock (the node holding the quota lock could need some
701 * other cluster lock to proceed but with blocked downconvert thread we cannot
702 * release any lock).
703 */
682static int ocfs2_release_dquot(struct dquot *dquot) 704static int ocfs2_release_dquot(struct dquot *dquot)
683{ 705{
684 handle_t *handle; 706 handle_t *handle;
@@ -694,6 +716,19 @@ static int ocfs2_release_dquot(struct dquot *dquot)
694 /* Check whether we are not racing with some other dqget() */ 716 /* Check whether we are not racing with some other dqget() */
695 if (atomic_read(&dquot->dq_count) > 1) 717 if (atomic_read(&dquot->dq_count) > 1)
696 goto out; 718 goto out;
719 /* Running from downconvert thread? Postpone quota processing to wq */
720 if (current == osb->dc_task) {
721 /*
722 * Grab our own reference to dquot and queue it for delayed
723 * dropping. Quota code rechecks after calling
724 * ->release_dquot() and won't free dquot structure.
725 */
726 dqgrab(dquot);
727 /* First entry on list -> queue work */
728 if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
729 queue_work(ocfs2_wq, &osb->dquot_drop_work);
730 goto out;
731 }
697 status = ocfs2_lock_global_qf(oinfo, 1); 732 status = ocfs2_lock_global_qf(oinfo, 1);
698 if (status < 0) 733 if (status < 0)
699 goto out; 734 goto out;
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index ca5ce14cbddc..83f1a665ae97 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -496,7 +496,7 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
496} 496}
497 497
498static struct kobj_attribute ocfs2_attr_max_locking_protocol = 498static struct kobj_attribute ocfs2_attr_max_locking_protocol =
499 __ATTR(max_locking_protocol, S_IFREG | S_IRUGO, 499 __ATTR(max_locking_protocol, S_IRUGO,
500 ocfs2_max_locking_protocol_show, NULL); 500 ocfs2_max_locking_protocol_show, NULL);
501 501
502static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, 502static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
@@ -528,7 +528,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
528} 528}
529 529
530static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = 530static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
531 __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO, 531 __ATTR(loaded_cluster_plugins, S_IRUGO,
532 ocfs2_loaded_cluster_plugins_show, NULL); 532 ocfs2_loaded_cluster_plugins_show, NULL);
533 533
534static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, 534static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
@@ -550,7 +550,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
550} 550}
551 551
552static struct kobj_attribute ocfs2_attr_active_cluster_plugin = 552static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
553 __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO, 553 __ATTR(active_cluster_plugin, S_IRUGO,
554 ocfs2_active_cluster_plugin_show, NULL); 554 ocfs2_active_cluster_plugin_show, NULL);
555 555
556static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, 556static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
@@ -599,15 +599,29 @@ static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
599 599
600 600
601static struct kobj_attribute ocfs2_attr_cluster_stack = 601static struct kobj_attribute ocfs2_attr_cluster_stack =
602 __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR, 602 __ATTR(cluster_stack, S_IRUGO | S_IWUSR,
603 ocfs2_cluster_stack_show, 603 ocfs2_cluster_stack_show,
604 ocfs2_cluster_stack_store); 604 ocfs2_cluster_stack_store);
605 605
606
607
608static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj,
609 struct kobj_attribute *attr,
610 char *buf)
611{
612 return snprintf(buf, PAGE_SIZE, "1\n");
613}
614
615static struct kobj_attribute ocfs2_attr_dlm_recover_support =
616 __ATTR(dlm_recover_callback_support, S_IRUGO,
617 ocfs2_dlm_recover_show, NULL);
618
606static struct attribute *ocfs2_attrs[] = { 619static struct attribute *ocfs2_attrs[] = {
607 &ocfs2_attr_max_locking_protocol.attr, 620 &ocfs2_attr_max_locking_protocol.attr,
608 &ocfs2_attr_loaded_cluster_plugins.attr, 621 &ocfs2_attr_loaded_cluster_plugins.attr,
609 &ocfs2_attr_active_cluster_plugin.attr, 622 &ocfs2_attr_active_cluster_plugin.attr,
610 &ocfs2_attr_cluster_stack.attr, 623 &ocfs2_attr_cluster_stack.attr,
624 &ocfs2_attr_dlm_recover_support.attr,
611 NULL, 625 NULL,
612}; 626};
613 627
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 47ae2663a6f5..0cb889a17ae1 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -771,6 +771,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
771 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); 771 spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
772 i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); 772 i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
773 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); 773 alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
774 ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
774 775
775 status = 0; 776 status = 0;
776 777
@@ -1607,6 +1608,21 @@ out:
1607 return ret; 1608 return ret;
1608} 1609}
1609 1610
1611void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
1612 struct buffer_head *di_bh,
1613 u32 num_bits,
1614 u16 chain)
1615{
1616 u32 tmp_used;
1617 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1618 struct ocfs2_chain_list *cl;
1619
1620 cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
1621 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1622 di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
1623 le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
1624}
1625
1610static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, 1626static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
1611 struct ocfs2_extent_rec *rec, 1627 struct ocfs2_extent_rec *rec,
1612 struct ocfs2_chain_list *cl) 1628 struct ocfs2_chain_list *cl)
@@ -1707,8 +1723,12 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1707 1723
1708 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, 1724 ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1709 res->sr_bit_offset, res->sr_bits); 1725 res->sr_bit_offset, res->sr_bits);
1710 if (ret < 0) 1726 if (ret < 0) {
1727 ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
1728 res->sr_bits,
1729 le16_to_cpu(gd->bg_chain));
1711 mlog_errno(ret); 1730 mlog_errno(ret);
1731 }
1712 1732
1713out_loc_only: 1733out_loc_only:
1714 *bits_left = le16_to_cpu(gd->bg_free_bits_count); 1734 *bits_left = le16_to_cpu(gd->bg_free_bits_count);
@@ -1838,6 +1858,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1838 res->sr_bit_offset, 1858 res->sr_bit_offset,
1839 res->sr_bits); 1859 res->sr_bits);
1840 if (status < 0) { 1860 if (status < 0) {
1861 ocfs2_rollback_alloc_dinode_counts(alloc_inode,
1862 ac->ac_bh, res->sr_bits, chain);
1841 mlog_errno(status); 1863 mlog_errno(status);
1842 goto bail; 1864 goto bail;
1843 } 1865 }
@@ -2091,7 +2113,7 @@ int ocfs2_find_new_inode_loc(struct inode *dir,
2091 2113
2092 ac->ac_find_loc_priv = res; 2114 ac->ac_find_loc_priv = res;
2093 *fe_blkno = res->sr_blkno; 2115 *fe_blkno = res->sr_blkno;
2094 2116 ocfs2_update_inode_fsync_trans(handle, dir, 0);
2095out: 2117out:
2096 if (handle) 2118 if (handle)
2097 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); 2119 ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
@@ -2149,6 +2171,8 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2149 res->sr_bit_offset, 2171 res->sr_bit_offset,
2150 res->sr_bits); 2172 res->sr_bits);
2151 if (ret < 0) { 2173 if (ret < 0) {
2174 ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
2175 ac->ac_bh, res->sr_bits, chain);
2152 mlog_errno(ret); 2176 mlog_errno(ret);
2153 goto out; 2177 goto out;
2154 } 2178 }
@@ -2870,6 +2894,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2870 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); 2894 status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
2871 if (status < 0) { 2895 if (status < 0) {
2872 mutex_unlock(&inode_alloc_inode->i_mutex); 2896 mutex_unlock(&inode_alloc_inode->i_mutex);
2897 iput(inode_alloc_inode);
2873 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", 2898 mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
2874 (u32)suballoc_slot, status); 2899 (u32)suballoc_slot, status);
2875 goto bail; 2900 goto bail;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 218d8036b3e7..2d2501767c0c 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -91,6 +91,10 @@ int ocfs2_alloc_dinode_update_counts(struct inode *inode,
91 struct buffer_head *di_bh, 91 struct buffer_head *di_bh,
92 u32 num_bits, 92 u32 num_bits,
93 u16 chain); 93 u16 chain);
94void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
95 struct buffer_head *di_bh,
96 u32 num_bits,
97 u16 chain);
94int ocfs2_block_group_set_bits(handle_t *handle, 98int ocfs2_block_group_set_bits(handle_t *handle,
95 struct inode *alloc_inode, 99 struct inode *alloc_inode,
96 struct ocfs2_group_desc *bg, 100 struct ocfs2_group_desc *bg,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 49d84f80f36c..a7cdd56f4c79 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -561,6 +561,9 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
561 if (!oi) 561 if (!oi)
562 return NULL; 562 return NULL;
563 563
564 oi->i_sync_tid = 0;
565 oi->i_datasync_tid = 0;
566
564 jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); 567 jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
565 return &oi->vfs_inode; 568 return &oi->vfs_inode;
566} 569}
@@ -631,6 +634,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
631 struct ocfs2_super *osb = OCFS2_SB(sb); 634 struct ocfs2_super *osb = OCFS2_SB(sb);
632 u32 tmp; 635 u32 tmp;
633 636
637 sync_filesystem(sb);
638
634 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || 639 if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
635 !ocfs2_check_set_options(sb, &parsed_options)) { 640 !ocfs2_check_set_options(sb, &parsed_options)) {
636 ret = -EINVAL; 641 ret = -EINVAL;
@@ -1238,30 +1243,11 @@ static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
1238 return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); 1243 return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
1239} 1244}
1240 1245
1241static void ocfs2_kill_sb(struct super_block *sb)
1242{
1243 struct ocfs2_super *osb = OCFS2_SB(sb);
1244
1245 /* Failed mount? */
1246 if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED)
1247 goto out;
1248
1249 /* Prevent further queueing of inode drop events */
1250 spin_lock(&dentry_list_lock);
1251 ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED);
1252 spin_unlock(&dentry_list_lock);
1253 /* Wait for work to finish and/or remove it */
1254 cancel_work_sync(&osb->dentry_lock_work);
1255out:
1256 kill_block_super(sb);
1257}
1258
1259static struct file_system_type ocfs2_fs_type = { 1246static struct file_system_type ocfs2_fs_type = {
1260 .owner = THIS_MODULE, 1247 .owner = THIS_MODULE,
1261 .name = "ocfs2", 1248 .name = "ocfs2",
1262 .mount = ocfs2_mount, 1249 .mount = ocfs2_mount,
1263 .kill_sb = ocfs2_kill_sb, 1250 .kill_sb = kill_block_super,
1264
1265 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, 1251 .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
1266 .next = NULL 1252 .next = NULL
1267}; 1253};
@@ -1612,14 +1598,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
1612 return 0; 1598 return 0;
1613} 1599}
1614 1600
1615wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
1616
1617static int __init ocfs2_init(void) 1601static int __init ocfs2_init(void)
1618{ 1602{
1619 int status, i; 1603 int status;
1620
1621 for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
1622 init_waitqueue_head(&ocfs2__ioend_wq[i]);
1623 1604
1624 status = init_ocfs2_uptodate_cache(); 1605 status = init_ocfs2_uptodate_cache();
1625 if (status < 0) 1606 if (status < 0)
@@ -1761,7 +1742,7 @@ static void ocfs2_inode_init_once(void *data)
1761 ocfs2_extent_map_init(&oi->vfs_inode); 1742 ocfs2_extent_map_init(&oi->vfs_inode);
1762 INIT_LIST_HEAD(&oi->ip_io_markers); 1743 INIT_LIST_HEAD(&oi->ip_io_markers);
1763 oi->ip_dir_start_lookup = 0; 1744 oi->ip_dir_start_lookup = 0;
1764 atomic_set(&oi->ip_unaligned_aio, 0); 1745 mutex_init(&oi->ip_unaligned_aio);
1765 init_rwsem(&oi->ip_alloc_sem); 1746 init_rwsem(&oi->ip_alloc_sem);
1766 init_rwsem(&oi->ip_xattr_sem); 1747 init_rwsem(&oi->ip_xattr_sem);
1767 mutex_init(&oi->ip_io_mutex); 1748 mutex_init(&oi->ip_io_mutex);
@@ -1932,17 +1913,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1932 1913
1933 debugfs_remove(osb->osb_ctxt); 1914 debugfs_remove(osb->osb_ctxt);
1934 1915
1935 /*
1936 * Flush inode dropping work queue so that deletes are
1937 * performed while the filesystem is still working
1938 */
1939 ocfs2_drop_all_dl_inodes(osb);
1940
1941 /* Orphan scan should be stopped as early as possible */ 1916 /* Orphan scan should be stopped as early as possible */
1942 ocfs2_orphan_scan_stop(osb); 1917 ocfs2_orphan_scan_stop(osb);
1943 1918
1944 ocfs2_disable_quotas(osb); 1919 ocfs2_disable_quotas(osb);
1945 1920
1921 /* All dquots should be freed by now */
1922 WARN_ON(!llist_empty(&osb->dquot_drop_list));
1923 /* Wait for worker to be done with the work structure in osb */
1924 cancel_work_sync(&osb->dquot_drop_work);
1925
1946 ocfs2_shutdown_local_alloc(osb); 1926 ocfs2_shutdown_local_alloc(osb);
1947 1927
1948 /* This will disable recovery and flush any recovery work. */ 1928 /* This will disable recovery and flush any recovery work. */
@@ -2077,7 +2057,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2077 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 2057 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
2078 struct inode *inode = NULL; 2058 struct inode *inode = NULL;
2079 struct ocfs2_journal *journal; 2059 struct ocfs2_journal *journal;
2080 __le32 uuid_net_key;
2081 struct ocfs2_super *osb; 2060 struct ocfs2_super *osb;
2082 u64 total_blocks; 2061 u64 total_blocks;
2083 2062
@@ -2123,6 +2102,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
2123 spin_lock_init(&osb->osb_xattr_lock); 2102 spin_lock_init(&osb->osb_xattr_lock);
2124 ocfs2_init_steal_slots(osb); 2103 ocfs2_init_steal_slots(osb);
2125 2104
2105 mutex_init(&osb->system_file_mutex);
2106
2126 atomic_set(&osb->alloc_stats.moves, 0); 2107 atomic_set(&osb->alloc_stats.moves, 0);
2127 atomic_set(&osb->alloc_stats.local_data, 0); 2108 atomic_set(&osb->alloc_stats.local_data, 0);
2128 atomic_set(&osb->alloc_stats.bitmap_data, 0); 2109 atomic_set(&osb->alloc_stats.bitmap_data, 0);
@@ -2276,8 +2257,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
2276 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); 2257 INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
2277 journal->j_state = OCFS2_JOURNAL_FREE; 2258 journal->j_state = OCFS2_JOURNAL_FREE;
2278 2259
2279 INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); 2260 INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
2280 osb->dentry_lock_list = NULL; 2261 init_llist_head(&osb->dquot_drop_list);
2281 2262
2282 /* get some pseudo constants for clustersize bits */ 2263 /* get some pseudo constants for clustersize bits */
2283 osb->s_clustersize_bits = 2264 osb->s_clustersize_bits =
@@ -2311,8 +2292,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2311 goto bail; 2292 goto bail;
2312 } 2293 }
2313 2294
2314 memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
2315
2316 strncpy(osb->vol_label, di->id2.i_super.s_label, 63); 2295 strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
2317 osb->vol_label[63] = '\0'; 2296 osb->vol_label[63] = '\0';
2318 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); 2297 osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index f053688d22a3..af155c183123 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -113,9 +113,11 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
113 } else 113 } else
114 arr = get_local_system_inode(osb, type, slot); 114 arr = get_local_system_inode(osb, type, slot);
115 115
116 mutex_lock(&osb->system_file_mutex);
116 if (arr && ((inode = *arr) != NULL)) { 117 if (arr && ((inode = *arr) != NULL)) {
117 /* get a ref in addition to the array ref */ 118 /* get a ref in addition to the array ref */
118 inode = igrab(inode); 119 inode = igrab(inode);
120 mutex_unlock(&osb->system_file_mutex);
119 BUG_ON(!inode); 121 BUG_ON(!inode);
120 122
121 return inode; 123 return inode;
@@ -129,6 +131,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
129 *arr = igrab(inode); 131 *arr = igrab(inode);
130 BUG_ON(!*arr); 132 BUG_ON(!*arr);
131 } 133 }
134 mutex_unlock(&osb->system_file_mutex);
132 return inode; 135 return inode;
133} 136}
134 137
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 185fa3b7f962..016f01df3825 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -369,7 +369,7 @@ static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
369 * them fully. 369 * them fully.
370 */ 370 */
371static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, 371static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
372 u64 xb_blkno) 372 u64 xb_blkno, int new)
373{ 373{
374 int i, rc = 0; 374 int i, rc = 0;
375 375
@@ -383,9 +383,16 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
383 } 383 }
384 384
385 if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode), 385 if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
386 bucket->bu_bhs[i])) 386 bucket->bu_bhs[i])) {
387 ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode), 387 if (new)
388 bucket->bu_bhs[i]); 388 ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
389 bucket->bu_bhs[i]);
390 else {
391 set_buffer_uptodate(bucket->bu_bhs[i]);
392 ocfs2_set_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
393 bucket->bu_bhs[i]);
394 }
395 }
389 } 396 }
390 397
391 if (rc) 398 if (rc)
@@ -2602,6 +2609,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
2602 oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL); 2609 oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
2603 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 2610 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
2604 spin_unlock(&oi->ip_lock); 2611 spin_unlock(&oi->ip_lock);
2612 ocfs2_update_inode_fsync_trans(handle, inode, 0);
2605 2613
2606 ocfs2_journal_dirty(handle, di_bh); 2614 ocfs2_journal_dirty(handle, di_bh);
2607out_commit: 2615out_commit:
@@ -3200,8 +3208,15 @@ meta_guess:
3200 clusters_add += 1; 3208 clusters_add += 1;
3201 } 3209 }
3202 } else { 3210 } else {
3203 meta_add += 1;
3204 credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; 3211 credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
3212 if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
3213 struct ocfs2_extent_list *el = &def_xv.xv.xr_list;
3214 meta_add += ocfs2_extend_meta_needed(el);
3215 credits += ocfs2_calc_extend_credits(inode->i_sb,
3216 el);
3217 } else {
3218 meta_add += 1;
3219 }
3205 } 3220 }
3206out: 3221out:
3207 if (clusters_need) 3222 if (clusters_need)
@@ -3614,6 +3629,7 @@ int ocfs2_xattr_set(struct inode *inode,
3614 } 3629 }
3615 3630
3616 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); 3631 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
3632 ocfs2_update_inode_fsync_trans(ctxt.handle, inode, 0);
3617 3633
3618 ocfs2_commit_trans(osb, ctxt.handle); 3634 ocfs2_commit_trans(osb, ctxt.handle);
3619 3635
@@ -4294,7 +4310,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
4294 4310
4295 trace_ocfs2_xattr_create_index_block((unsigned long long)blkno); 4311 trace_ocfs2_xattr_create_index_block((unsigned long long)blkno);
4296 4312
4297 ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); 4313 ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1);
4298 if (ret) { 4314 if (ret) {
4299 mlog_errno(ret); 4315 mlog_errno(ret);
4300 goto out; 4316 goto out;
@@ -4638,7 +4654,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4638 * Even if !new_bucket_head, we're overwriting t_bucket. Thus, 4654 * Even if !new_bucket_head, we're overwriting t_bucket. Thus,
4639 * there's no need to read it. 4655 * there's no need to read it.
4640 */ 4656 */
4641 ret = ocfs2_init_xattr_bucket(t_bucket, new_blk); 4657 ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head);
4642 if (ret) { 4658 if (ret) {
4643 mlog_errno(ret); 4659 mlog_errno(ret);
4644 goto out; 4660 goto out;
@@ -4804,7 +4820,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
4804 * Even if !t_is_new, we're overwriting t_bucket. Thus, 4820 * Even if !t_is_new, we're overwriting t_bucket. Thus,
4805 * there's no need to read it. 4821 * there's no need to read it.
4806 */ 4822 */
4807 ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno); 4823 ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new);
4808 if (ret) 4824 if (ret)
4809 goto out; 4825 goto out;
4810 4826
@@ -5476,6 +5492,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
5476 ret = ocfs2_truncate_log_append(osb, handle, blkno, len); 5492 ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
5477 if (ret) 5493 if (ret)
5478 mlog_errno(ret); 5494 mlog_errno(ret);
5495 ocfs2_update_inode_fsync_trans(handle, inode, 0);
5479 5496
5480out_commit: 5497out_commit:
5481 ocfs2_commit_trans(osb, handle); 5498 ocfs2_commit_trans(osb, handle);
@@ -6830,7 +6847,7 @@ static int ocfs2_reflink_xattr_bucket(handle_t *handle,
6830 break; 6847 break;
6831 } 6848 }
6832 6849
6833 ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno); 6850 ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno, 1);
6834 if (ret) { 6851 if (ret) {
6835 mlog_errno(ret); 6852 mlog_errno(ret);
6836 break; 6853 break;
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index d8b0afde2179..ec58c7659183 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -183,7 +183,7 @@ int omfs_sync_inode(struct inode *inode)
183 */ 183 */
184static void omfs_evict_inode(struct inode *inode) 184static void omfs_evict_inode(struct inode *inode)
185{ 185{
186 truncate_inode_pages(&inode->i_data, 0); 186 truncate_inode_pages_final(&inode->i_data);
187 clear_inode(inode); 187 clear_inode(inode);
188 188
189 if (inode->i_nlink) 189 if (inode->i_nlink)
diff --git a/fs/open.c b/fs/open.c
index b9ed8b25c108..9d64679cec73 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -231,7 +231,13 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
231 return -EINVAL; 231 return -EINVAL;
232 232
233 /* Return error if mode is not supported */ 233 /* Return error if mode is not supported */
234 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 234 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
235 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
236 return -EOPNOTSUPP;
237
238 /* Punch hole and zero range are mutually exclusive */
239 if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
240 (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
235 return -EOPNOTSUPP; 241 return -EOPNOTSUPP;
236 242
237 /* Punch hole must have keep size set */ 243 /* Punch hole must have keep size set */
@@ -239,17 +245,31 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
239 !(mode & FALLOC_FL_KEEP_SIZE)) 245 !(mode & FALLOC_FL_KEEP_SIZE))
240 return -EOPNOTSUPP; 246 return -EOPNOTSUPP;
241 247
248 /* Collapse range should only be used exclusively. */
249 if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
250 (mode & ~FALLOC_FL_COLLAPSE_RANGE))
251 return -EINVAL;
252
242 if (!(file->f_mode & FMODE_WRITE)) 253 if (!(file->f_mode & FMODE_WRITE))
243 return -EBADF; 254 return -EBADF;
244 255
245 /* It's not possible punch hole on append only file */ 256 /*
246 if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode)) 257 * We can only allow pure fallocate on append only files
258 */
259 if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
247 return -EPERM; 260 return -EPERM;
248 261
249 if (IS_IMMUTABLE(inode)) 262 if (IS_IMMUTABLE(inode))
250 return -EPERM; 263 return -EPERM;
251 264
252 /* 265 /*
266 * We can not allow to do any fallocate operation on an active
267 * swapfile
268 */
269 if (IS_SWAPFILE(inode))
270 ret = -ETXTBSY;
271
272 /*
253 * Revalidate the write permissions, in case security policy has 273 * Revalidate the write permissions, in case security policy has
254 * changed since the files were opened. 274 * changed since the files were opened.
255 */ 275 */
@@ -632,35 +652,6 @@ out:
632 return error; 652 return error;
633} 653}
634 654
635/*
636 * You have to be very careful that these write
637 * counts get cleaned up in error cases and
638 * upon __fput(). This should probably never
639 * be called outside of __dentry_open().
640 */
641static inline int __get_file_write_access(struct inode *inode,
642 struct vfsmount *mnt)
643{
644 int error;
645 error = get_write_access(inode);
646 if (error)
647 return error;
648 /*
649 * Do not take mount writer counts on
650 * special files since no writes to
651 * the mount itself will occur.
652 */
653 if (!special_file(inode->i_mode)) {
654 /*
655 * Balanced in __fput()
656 */
657 error = __mnt_want_write(mnt);
658 if (error)
659 put_write_access(inode);
660 }
661 return error;
662}
663
664int open_check_o_direct(struct file *f) 655int open_check_o_direct(struct file *f)
665{ 656{
666 /* NB: we're sure to have correct a_ops only after f_op->open */ 657 /* NB: we're sure to have correct a_ops only after f_op->open */
@@ -685,26 +676,28 @@ static int do_dentry_open(struct file *f,
685 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | 676 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
686 FMODE_PREAD | FMODE_PWRITE; 677 FMODE_PREAD | FMODE_PWRITE;
687 678
688 if (unlikely(f->f_flags & O_PATH))
689 f->f_mode = FMODE_PATH;
690
691 path_get(&f->f_path); 679 path_get(&f->f_path);
692 inode = f->f_inode = f->f_path.dentry->d_inode; 680 inode = f->f_inode = f->f_path.dentry->d_inode;
693 if (f->f_mode & FMODE_WRITE) {
694 error = __get_file_write_access(inode, f->f_path.mnt);
695 if (error)
696 goto cleanup_file;
697 if (!special_file(inode->i_mode))
698 file_take_write(f);
699 }
700
701 f->f_mapping = inode->i_mapping; 681 f->f_mapping = inode->i_mapping;
702 682
703 if (unlikely(f->f_mode & FMODE_PATH)) { 683 if (unlikely(f->f_flags & O_PATH)) {
684 f->f_mode = FMODE_PATH;
704 f->f_op = &empty_fops; 685 f->f_op = &empty_fops;
705 return 0; 686 return 0;
706 } 687 }
707 688
689 if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
690 error = get_write_access(inode);
691 if (unlikely(error))
692 goto cleanup_file;
693 error = __mnt_want_write(f->f_path.mnt);
694 if (unlikely(error)) {
695 put_write_access(inode);
696 goto cleanup_file;
697 }
698 f->f_mode |= FMODE_WRITER;
699 }
700
708 /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ 701 /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
709 if (S_ISREG(inode->i_mode)) 702 if (S_ISREG(inode->i_mode))
710 f->f_mode |= FMODE_ATOMIC_POS; 703 f->f_mode |= FMODE_ATOMIC_POS;
@@ -741,18 +734,9 @@ static int do_dentry_open(struct file *f,
741 734
742cleanup_all: 735cleanup_all:
743 fops_put(f->f_op); 736 fops_put(f->f_op);
744 if (f->f_mode & FMODE_WRITE) { 737 if (f->f_mode & FMODE_WRITER) {
745 put_write_access(inode); 738 put_write_access(inode);
746 if (!special_file(inode->i_mode)) { 739 __mnt_drop_write(f->f_path.mnt);
747 /*
748 * We don't consider this a real
749 * mnt_want/drop_write() pair
750 * because it all happenend right
751 * here, so just reset the state.
752 */
753 file_reset_write(f);
754 __mnt_drop_write(f->f_path.mnt);
755 }
756 } 740 }
757cleanup_file: 741cleanup_file:
758 path_put(&f->f_path); 742 path_put(&f->f_path);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 8c0ceb8dd1f7..15e4500cda3e 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -368,6 +368,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
368 368
369static int openprom_remount(struct super_block *sb, int *flags, char *data) 369static int openprom_remount(struct super_block *sb, int *flags, char *data)
370{ 370{
371 sync_filesystem(sb);
371 *flags |= MS_NOATIME; 372 *flags |= MS_NOATIME;
372 return 0; 373 return 0;
373} 374}
diff --git a/fs/pipe.c b/fs/pipe.c
index 78fd0d0788db..034bffac3f97 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -142,55 +142,6 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
142 return 0; 142 return 0;
143} 143}
144 144
145static int
146pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
147 int atomic)
148{
149 unsigned long copy;
150
151 while (len > 0) {
152 while (!iov->iov_len)
153 iov++;
154 copy = min_t(unsigned long, len, iov->iov_len);
155
156 if (atomic) {
157 if (__copy_to_user_inatomic(iov->iov_base, from, copy))
158 return -EFAULT;
159 } else {
160 if (copy_to_user(iov->iov_base, from, copy))
161 return -EFAULT;
162 }
163 from += copy;
164 len -= copy;
165 iov->iov_base += copy;
166 iov->iov_len -= copy;
167 }
168 return 0;
169}
170
171/*
172 * Attempt to pre-fault in the user memory, so we can use atomic copies.
173 * Returns the number of bytes not faulted in.
174 */
175static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
176{
177 while (!iov->iov_len)
178 iov++;
179
180 while (len > 0) {
181 unsigned long this_len;
182
183 this_len = min_t(unsigned long, len, iov->iov_len);
184 if (fault_in_pages_writeable(iov->iov_base, this_len))
185 break;
186
187 len -= this_len;
188 iov++;
189 }
190
191 return len;
192}
193
194/* 145/*
195 * Pre-fault in the user memory, so we can use atomic copies. 146 * Pre-fault in the user memory, so we can use atomic copies.
196 */ 147 */
@@ -226,52 +177,6 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
226} 177}
227 178
228/** 179/**
229 * generic_pipe_buf_map - virtually map a pipe buffer
230 * @pipe: the pipe that the buffer belongs to
231 * @buf: the buffer that should be mapped
232 * @atomic: whether to use an atomic map
233 *
234 * Description:
235 * This function returns a kernel virtual address mapping for the
236 * pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided
237 * and the caller has to be careful not to fault before calling
238 * the unmap function.
239 *
240 * Note that this function calls kmap_atomic() if @atomic != 0.
241 */
242void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
243 struct pipe_buffer *buf, int atomic)
244{
245 if (atomic) {
246 buf->flags |= PIPE_BUF_FLAG_ATOMIC;
247 return kmap_atomic(buf->page);
248 }
249
250 return kmap(buf->page);
251}
252EXPORT_SYMBOL(generic_pipe_buf_map);
253
254/**
255 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
256 * @pipe: the pipe that the buffer belongs to
257 * @buf: the buffer that should be unmapped
258 * @map_data: the data that the mapping function returned
259 *
260 * Description:
261 * This function undoes the mapping that ->map() provided.
262 */
263void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
264 struct pipe_buffer *buf, void *map_data)
265{
266 if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
267 buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
268 kunmap_atomic(map_data);
269 } else
270 kunmap(buf->page);
271}
272EXPORT_SYMBOL(generic_pipe_buf_unmap);
273
274/**
275 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer 180 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
276 * @pipe: the pipe that the buffer belongs to 181 * @pipe: the pipe that the buffer belongs to
277 * @buf: the buffer to attempt to steal 182 * @buf: the buffer to attempt to steal
@@ -351,8 +256,6 @@ EXPORT_SYMBOL(generic_pipe_buf_release);
351 256
352static const struct pipe_buf_operations anon_pipe_buf_ops = { 257static const struct pipe_buf_operations anon_pipe_buf_ops = {
353 .can_merge = 1, 258 .can_merge = 1,
354 .map = generic_pipe_buf_map,
355 .unmap = generic_pipe_buf_unmap,
356 .confirm = generic_pipe_buf_confirm, 259 .confirm = generic_pipe_buf_confirm,
357 .release = anon_pipe_buf_release, 260 .release = anon_pipe_buf_release,
358 .steal = generic_pipe_buf_steal, 261 .steal = generic_pipe_buf_steal,
@@ -361,8 +264,6 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
361 264
362static const struct pipe_buf_operations packet_pipe_buf_ops = { 265static const struct pipe_buf_operations packet_pipe_buf_ops = {
363 .can_merge = 0, 266 .can_merge = 0,
364 .map = generic_pipe_buf_map,
365 .unmap = generic_pipe_buf_unmap,
366 .confirm = generic_pipe_buf_confirm, 267 .confirm = generic_pipe_buf_confirm,
367 .release = anon_pipe_buf_release, 268 .release = anon_pipe_buf_release,
368 .steal = generic_pipe_buf_steal, 269 .steal = generic_pipe_buf_steal,
@@ -379,12 +280,15 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
379 ssize_t ret; 280 ssize_t ret;
380 struct iovec *iov = (struct iovec *)_iov; 281 struct iovec *iov = (struct iovec *)_iov;
381 size_t total_len; 282 size_t total_len;
283 struct iov_iter iter;
382 284
383 total_len = iov_length(iov, nr_segs); 285 total_len = iov_length(iov, nr_segs);
384 /* Null read succeeds. */ 286 /* Null read succeeds. */
385 if (unlikely(total_len == 0)) 287 if (unlikely(total_len == 0))
386 return 0; 288 return 0;
387 289
290 iov_iter_init(&iter, iov, nr_segs, total_len, 0);
291
388 do_wakeup = 0; 292 do_wakeup = 0;
389 ret = 0; 293 ret = 0;
390 __pipe_lock(pipe); 294 __pipe_lock(pipe);
@@ -394,9 +298,9 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
394 int curbuf = pipe->curbuf; 298 int curbuf = pipe->curbuf;
395 struct pipe_buffer *buf = pipe->bufs + curbuf; 299 struct pipe_buffer *buf = pipe->bufs + curbuf;
396 const struct pipe_buf_operations *ops = buf->ops; 300 const struct pipe_buf_operations *ops = buf->ops;
397 void *addr;
398 size_t chars = buf->len; 301 size_t chars = buf->len;
399 int error, atomic; 302 size_t written;
303 int error;
400 304
401 if (chars > total_len) 305 if (chars > total_len)
402 chars = total_len; 306 chars = total_len;
@@ -408,21 +312,10 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
408 break; 312 break;
409 } 313 }
410 314
411 atomic = !iov_fault_in_pages_write(iov, chars); 315 written = copy_page_to_iter(buf->page, buf->offset, chars, &iter);
412redo: 316 if (unlikely(written < chars)) {
413 addr = ops->map(pipe, buf, atomic);
414 error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
415 ops->unmap(pipe, buf, addr);
416 if (unlikely(error)) {
417 /*
418 * Just retry with the slow path if we failed.
419 */
420 if (atomic) {
421 atomic = 0;
422 goto redo;
423 }
424 if (!ret) 317 if (!ret)
425 ret = error; 318 ret = -EFAULT;
426 break; 319 break;
427 } 320 }
428 ret += chars; 321 ret += chars;
@@ -538,10 +431,16 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
538 431
539 iov_fault_in_pages_read(iov, chars); 432 iov_fault_in_pages_read(iov, chars);
540redo1: 433redo1:
541 addr = ops->map(pipe, buf, atomic); 434 if (atomic)
435 addr = kmap_atomic(buf->page);
436 else
437 addr = kmap(buf->page);
542 error = pipe_iov_copy_from_user(offset + addr, iov, 438 error = pipe_iov_copy_from_user(offset + addr, iov,
543 chars, atomic); 439 chars, atomic);
544 ops->unmap(pipe, buf, addr); 440 if (atomic)
441 kunmap_atomic(addr);
442 else
443 kunmap(buf->page);
545 ret = error; 444 ret = error;
546 do_wakeup = 1; 445 do_wakeup = 1;
547 if (error) { 446 if (error) {
diff --git a/fs/pnode.c b/fs/pnode.c
index 88396df725b4..302bf22c4a30 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -164,46 +164,94 @@ static struct mount *propagation_next(struct mount *m,
164 } 164 }
165} 165}
166 166
167/* 167static struct mount *next_group(struct mount *m, struct mount *origin)
168 * return the source mount to be used for cloning
169 *
170 * @dest the current destination mount
171 * @last_dest the last seen destination mount
172 * @last_src the last seen source mount
173 * @type return CL_SLAVE if the new mount has to be
174 * cloned as a slave.
175 */
176static struct mount *get_source(struct mount *dest,
177 struct mount *last_dest,
178 struct mount *last_src,
179 int *type)
180{ 168{
181 struct mount *p_last_src = NULL; 169 while (1) {
182 struct mount *p_last_dest = NULL; 170 while (1) {
183 171 struct mount *next;
184 while (last_dest != dest->mnt_master) { 172 if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
185 p_last_dest = last_dest; 173 return first_slave(m);
186 p_last_src = last_src; 174 next = next_peer(m);
187 last_dest = last_dest->mnt_master; 175 if (m->mnt_group_id == origin->mnt_group_id) {
188 last_src = last_src->mnt_master; 176 if (next == origin)
177 return NULL;
178 } else if (m->mnt_slave.next != &next->mnt_slave)
179 break;
180 m = next;
181 }
182 /* m is the last peer */
183 while (1) {
184 struct mount *master = m->mnt_master;
185 if (m->mnt_slave.next != &master->mnt_slave_list)
186 return next_slave(m);
187 m = next_peer(master);
188 if (master->mnt_group_id == origin->mnt_group_id)
189 break;
190 if (master->mnt_slave.next == &m->mnt_slave)
191 break;
192 m = master;
193 }
194 if (m == origin)
195 return NULL;
189 } 196 }
197}
190 198
191 if (p_last_dest) { 199/* all accesses are serialized by namespace_sem */
192 do { 200static struct user_namespace *user_ns;
193 p_last_dest = next_peer(p_last_dest); 201static struct mount *last_dest, *last_source, *dest_master;
194 } while (IS_MNT_NEW(p_last_dest)); 202static struct mountpoint *mp;
195 /* is that a peer of the earlier? */ 203static struct hlist_head *list;
196 if (dest == p_last_dest) { 204
197 *type = CL_MAKE_SHARED; 205static int propagate_one(struct mount *m)
198 return p_last_src; 206{
207 struct mount *child;
208 int type;
209 /* skip ones added by this propagate_mnt() */
210 if (IS_MNT_NEW(m))
211 return 0;
212 /* skip if mountpoint isn't covered by it */
213 if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
214 return 0;
215 if (m->mnt_group_id == last_dest->mnt_group_id) {
216 type = CL_MAKE_SHARED;
217 } else {
218 struct mount *n, *p;
219 for (n = m; ; n = p) {
220 p = n->mnt_master;
221 if (p == dest_master || IS_MNT_MARKED(p)) {
222 while (last_dest->mnt_master != p) {
223 last_source = last_source->mnt_master;
224 last_dest = last_source->mnt_parent;
225 }
226 if (n->mnt_group_id != last_dest->mnt_group_id) {
227 last_source = last_source->mnt_master;
228 last_dest = last_source->mnt_parent;
229 }
230 break;
231 }
199 } 232 }
233 type = CL_SLAVE;
234 /* beginning of peer group among the slaves? */
235 if (IS_MNT_SHARED(m))
236 type |= CL_MAKE_SHARED;
200 } 237 }
201 /* slave of the earlier, then */ 238
202 *type = CL_SLAVE; 239 /* Notice when we are propagating across user namespaces */
203 /* beginning of peer group among the slaves? */ 240 if (m->mnt_ns->user_ns != user_ns)
204 if (IS_MNT_SHARED(dest)) 241 type |= CL_UNPRIVILEGED;
205 *type |= CL_MAKE_SHARED; 242 child = copy_tree(last_source, last_source->mnt.mnt_root, type);
206 return last_src; 243 if (IS_ERR(child))
244 return PTR_ERR(child);
245 mnt_set_mountpoint(m, mp, child);
246 last_dest = m;
247 last_source = child;
248 if (m->mnt_master != dest_master) {
249 read_seqlock_excl(&mount_lock);
250 SET_MNT_MARK(m->mnt_master);
251 read_sequnlock_excl(&mount_lock);
252 }
253 hlist_add_head(&child->mnt_hash, list);
254 return 0;
207} 255}
208 256
209/* 257/*
@@ -222,56 +270,48 @@ static struct mount *get_source(struct mount *dest,
222int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, 270int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
223 struct mount *source_mnt, struct hlist_head *tree_list) 271 struct mount *source_mnt, struct hlist_head *tree_list)
224{ 272{
225 struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; 273 struct mount *m, *n;
226 struct mount *m, *child;
227 int ret = 0; 274 int ret = 0;
228 struct mount *prev_dest_mnt = dest_mnt; 275
229 struct mount *prev_src_mnt = source_mnt; 276 /*
230 HLIST_HEAD(tmp_list); 277 * we don't want to bother passing tons of arguments to
231 278 * propagate_one(); everything is serialized by namespace_sem,
232 for (m = propagation_next(dest_mnt, dest_mnt); m; 279 * so globals will do just fine.
233 m = propagation_next(m, dest_mnt)) { 280 */
234 int type; 281 user_ns = current->nsproxy->mnt_ns->user_ns;
235 struct mount *source; 282 last_dest = dest_mnt;
236 283 last_source = source_mnt;
237 if (IS_MNT_NEW(m)) 284 mp = dest_mp;
238 continue; 285 list = tree_list;
239 286 dest_master = dest_mnt->mnt_master;
240 source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); 287
241 288 /* all peers of dest_mnt, except dest_mnt itself */
242 /* Notice when we are propagating across user namespaces */ 289 for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) {
243 if (m->mnt_ns->user_ns != user_ns) 290 ret = propagate_one(n);
244 type |= CL_UNPRIVILEGED; 291 if (ret)
245
246 child = copy_tree(source, source->mnt.mnt_root, type);
247 if (IS_ERR(child)) {
248 ret = PTR_ERR(child);
249 tmp_list = *tree_list;
250 tmp_list.first->pprev = &tmp_list.first;
251 INIT_HLIST_HEAD(tree_list);
252 goto out; 292 goto out;
253 } 293 }
254 294
255 if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) { 295 /* all slave groups */
256 mnt_set_mountpoint(m, dest_mp, child); 296 for (m = next_group(dest_mnt, dest_mnt); m;
257 hlist_add_head(&child->mnt_hash, tree_list); 297 m = next_group(m, dest_mnt)) {
258 } else { 298 /* everything in that slave group */
259 /* 299 n = m;
260 * This can happen if the parent mount was bind mounted 300 do {
261 * on some subdirectory of a shared/slave mount. 301 ret = propagate_one(n);
262 */ 302 if (ret)
263 hlist_add_head(&child->mnt_hash, &tmp_list); 303 goto out;
264 } 304 n = next_peer(n);
265 prev_dest_mnt = m; 305 } while (n != m);
266 prev_src_mnt = child;
267 } 306 }
268out: 307out:
269 lock_mount_hash(); 308 read_seqlock_excl(&mount_lock);
270 while (!hlist_empty(&tmp_list)) { 309 hlist_for_each_entry(n, tree_list, mnt_hash) {
271 child = hlist_entry(tmp_list.first, struct mount, mnt_hash); 310 m = n->mnt_parent;
272 umount_tree(child, 0); 311 if (m->mnt_master != dest_mnt->mnt_master)
312 CLEAR_MNT_MARK(m->mnt_master);
273 } 313 }
274 unlock_mount_hash(); 314 read_sequnlock_excl(&mount_lock);
275 return ret; 315 return ret;
276} 316}
277 317
diff --git a/fs/pnode.h b/fs/pnode.h
index fc28a27fa892..4a246358b031 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -16,6 +16,9 @@
16#define IS_MNT_NEW(m) (!(m)->mnt_ns) 16#define IS_MNT_NEW(m) (!(m)->mnt_ns)
17#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) 17#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
18#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) 18#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
19#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
20#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
21#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
19 22
20#define CL_EXPIRE 0x01 23#define CL_EXPIRE 0x01
21#define CL_SLAVE 0x02 24#define CL_SLAVE 0x02
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 11c54fd51e16..0855f772cd41 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -246,6 +246,12 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
246 umode_t mode = 0; 246 umode_t mode = 0;
247 int not_equiv = 0; 247 int not_equiv = 0;
248 248
249 /*
250 * A null ACL can always be presented as mode bits.
251 */
252 if (!acl)
253 return 0;
254
249 FOREACH_ACL_ENTRY(pa, acl, pe) { 255 FOREACH_ACL_ENTRY(pa, acl, pe) {
250 switch (pa->e_tag) { 256 switch (pa->e_tag) {
251 case ACL_USER_OBJ: 257 case ACL_USER_OBJ:
@@ -723,7 +729,7 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
723 void *buffer, size_t size) 729 void *buffer, size_t size)
724{ 730{
725 posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer; 731 posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer;
726 posix_acl_xattr_entry *ext_entry = ext_acl->a_entries; 732 posix_acl_xattr_entry *ext_entry;
727 int real_size, n; 733 int real_size, n;
728 734
729 real_size = posix_acl_xattr_size(acl->a_count); 735 real_size = posix_acl_xattr_size(acl->a_count);
@@ -731,7 +737,8 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
731 return real_size; 737 return real_size;
732 if (real_size > size) 738 if (real_size > size)
733 return -ERANGE; 739 return -ERANGE;
734 740
741 ext_entry = ext_acl->a_entries;
735 ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); 742 ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
736 743
737 for (n=0; n < acl->a_count; n++, ext_entry++) { 744 for (n=0; n < acl->a_count; n++, ext_entry++) {
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index ab30716584f5..239493ec718e 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -27,6 +27,5 @@ proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
27proc-$(CONFIG_NET) += proc_net.o 27proc-$(CONFIG_NET) += proc_net.o
28proc-$(CONFIG_PROC_KCORE) += kcore.o 28proc-$(CONFIG_PROC_KCORE) += kcore.o
29proc-$(CONFIG_PROC_VMCORE) += vmcore.o 29proc-$(CONFIG_PROC_VMCORE) += vmcore.o
30proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o
31proc-$(CONFIG_PRINTK) += kmsg.o 30proc-$(CONFIG_PRINTK) += kmsg.o
32proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o 31proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 656e401794de..64db2bceac59 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -138,8 +138,8 @@ static const char * const task_state_array[] = {
138 "D (disk sleep)", /* 2 */ 138 "D (disk sleep)", /* 2 */
139 "T (stopped)", /* 4 */ 139 "T (stopped)", /* 4 */
140 "t (tracing stop)", /* 8 */ 140 "t (tracing stop)", /* 8 */
141 "Z (zombie)", /* 16 */ 141 "X (dead)", /* 16 */
142 "X (dead)", /* 32 */ 142 "Z (zombie)", /* 32 */
143}; 143};
144 144
145static inline const char *get_task_state(struct task_struct *tsk) 145static inline const char *get_task_state(struct task_struct *tsk)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b9760628e1fd..2d696b0c93bf 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -200,41 +200,9 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
200 return result; 200 return result;
201} 201}
202 202
203static int proc_pid_cmdline(struct task_struct *task, char * buffer) 203static int proc_pid_cmdline(struct task_struct *task, char *buffer)
204{ 204{
205 int res = 0; 205 return get_cmdline(task, buffer, PAGE_SIZE);
206 unsigned int len;
207 struct mm_struct *mm = get_task_mm(task);
208 if (!mm)
209 goto out;
210 if (!mm->arg_end)
211 goto out_mm; /* Shh! No looking before we're done */
212
213 len = mm->arg_end - mm->arg_start;
214
215 if (len > PAGE_SIZE)
216 len = PAGE_SIZE;
217
218 res = access_process_vm(task, mm->arg_start, buffer, len, 0);
219
220 // If the nul at the end of args has been overwritten, then
221 // assume application is using setproctitle(3).
222 if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) {
223 len = strnlen(buffer, res);
224 if (len < res) {
225 res = len;
226 } else {
227 len = mm->env_end - mm->env_start;
228 if (len > PAGE_SIZE - res)
229 len = PAGE_SIZE - res;
230 res += access_process_vm(task, mm->env_start, buffer+res, len, 0);
231 res = strnlen(buffer, res);
232 }
233 }
234out_mm:
235 mmput(mm);
236out:
237 return res;
238} 206}
239 207
240static int proc_pid_auxv(struct task_struct *task, char *buffer) 208static int proc_pid_auxv(struct task_struct *task, char *buffer)
@@ -1236,6 +1204,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
1236 make_it_fail = simple_strtol(strstrip(buffer), &end, 0); 1204 make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
1237 if (*end) 1205 if (*end)
1238 return -EINVAL; 1206 return -EINVAL;
1207 if (make_it_fail < 0 || make_it_fail > 1)
1208 return -EINVAL;
1209
1239 task = get_proc_task(file_inode(file)); 1210 task = get_proc_task(file_inode(file));
1240 if (!task) 1211 if (!task)
1241 return -ESRCH; 1212 return -ESRCH;
@@ -2588,7 +2559,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2588 REG("environ", S_IRUSR, proc_environ_operations), 2559 REG("environ", S_IRUSR, proc_environ_operations),
2589 INF("auxv", S_IRUSR, proc_pid_auxv), 2560 INF("auxv", S_IRUSR, proc_pid_auxv),
2590 ONE("status", S_IRUGO, proc_pid_status), 2561 ONE("status", S_IRUGO, proc_pid_status),
2591 ONE("personality", S_IRUGO, proc_pid_personality), 2562 ONE("personality", S_IRUSR, proc_pid_personality),
2592 INF("limits", S_IRUGO, proc_pid_limits), 2563 INF("limits", S_IRUGO, proc_pid_limits),
2593#ifdef CONFIG_SCHED_DEBUG 2564#ifdef CONFIG_SCHED_DEBUG
2594 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2565 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
@@ -2598,7 +2569,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2598#endif 2569#endif
2599 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2570 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2600#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2571#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2601 INF("syscall", S_IRUGO, proc_pid_syscall), 2572 INF("syscall", S_IRUSR, proc_pid_syscall),
2602#endif 2573#endif
2603 INF("cmdline", S_IRUGO, proc_pid_cmdline), 2574 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2604 ONE("stat", S_IRUGO, proc_tgid_stat), 2575 ONE("stat", S_IRUGO, proc_tgid_stat),
@@ -2617,7 +2588,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2617#ifdef CONFIG_PROC_PAGE_MONITOR 2588#ifdef CONFIG_PROC_PAGE_MONITOR
2618 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 2589 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2619 REG("smaps", S_IRUGO, proc_pid_smaps_operations), 2590 REG("smaps", S_IRUGO, proc_pid_smaps_operations),
2620 REG("pagemap", S_IRUGO, proc_pagemap_operations), 2591 REG("pagemap", S_IRUSR, proc_pagemap_operations),
2621#endif 2592#endif
2622#ifdef CONFIG_SECURITY 2593#ifdef CONFIG_SECURITY
2623 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 2594 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -2626,7 +2597,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2626 INF("wchan", S_IRUGO, proc_pid_wchan), 2597 INF("wchan", S_IRUGO, proc_pid_wchan),
2627#endif 2598#endif
2628#ifdef CONFIG_STACKTRACE 2599#ifdef CONFIG_STACKTRACE
2629 ONE("stack", S_IRUGO, proc_pid_stack), 2600 ONE("stack", S_IRUSR, proc_pid_stack),
2630#endif 2601#endif
2631#ifdef CONFIG_SCHEDSTATS 2602#ifdef CONFIG_SCHEDSTATS
2632 INF("schedstat", S_IRUGO, proc_pid_schedstat), 2603 INF("schedstat", S_IRUGO, proc_pid_schedstat),
@@ -2927,14 +2898,14 @@ static const struct pid_entry tid_base_stuff[] = {
2927 REG("environ", S_IRUSR, proc_environ_operations), 2898 REG("environ", S_IRUSR, proc_environ_operations),
2928 INF("auxv", S_IRUSR, proc_pid_auxv), 2899 INF("auxv", S_IRUSR, proc_pid_auxv),
2929 ONE("status", S_IRUGO, proc_pid_status), 2900 ONE("status", S_IRUGO, proc_pid_status),
2930 ONE("personality", S_IRUGO, proc_pid_personality), 2901 ONE("personality", S_IRUSR, proc_pid_personality),
2931 INF("limits", S_IRUGO, proc_pid_limits), 2902 INF("limits", S_IRUGO, proc_pid_limits),
2932#ifdef CONFIG_SCHED_DEBUG 2903#ifdef CONFIG_SCHED_DEBUG
2933 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2904 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2934#endif 2905#endif
2935 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2906 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2936#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2907#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2937 INF("syscall", S_IRUGO, proc_pid_syscall), 2908 INF("syscall", S_IRUSR, proc_pid_syscall),
2938#endif 2909#endif
2939 INF("cmdline", S_IRUGO, proc_pid_cmdline), 2910 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2940 ONE("stat", S_IRUGO, proc_tid_stat), 2911 ONE("stat", S_IRUGO, proc_tid_stat),
@@ -2955,7 +2926,7 @@ static const struct pid_entry tid_base_stuff[] = {
2955#ifdef CONFIG_PROC_PAGE_MONITOR 2926#ifdef CONFIG_PROC_PAGE_MONITOR
2956 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 2927 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2957 REG("smaps", S_IRUGO, proc_tid_smaps_operations), 2928 REG("smaps", S_IRUGO, proc_tid_smaps_operations),
2958 REG("pagemap", S_IRUGO, proc_pagemap_operations), 2929 REG("pagemap", S_IRUSR, proc_pagemap_operations),
2959#endif 2930#endif
2960#ifdef CONFIG_SECURITY 2931#ifdef CONFIG_SECURITY
2961 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 2932 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -2964,7 +2935,7 @@ static const struct pid_entry tid_base_stuff[] = {
2964 INF("wchan", S_IRUGO, proc_pid_wchan), 2935 INF("wchan", S_IRUGO, proc_pid_wchan),
2965#endif 2936#endif
2966#ifdef CONFIG_STACKTRACE 2937#ifdef CONFIG_STACKTRACE
2967 ONE("stack", S_IRUGO, proc_pid_stack), 2938 ONE("stack", S_IRUSR, proc_pid_stack),
2968#endif 2939#endif
2969#ifdef CONFIG_SCHEDSTATS 2940#ifdef CONFIG_SCHEDSTATS
2970 INF("schedstat", S_IRUGO, proc_pid_schedstat), 2941 INF("schedstat", S_IRUGO, proc_pid_schedstat),
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 985ea881b5bc..0788d093f5d8 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/proc_fs.h> 12#include <linux/proc_fs.h>
13 13
14#include "../mount.h"
14#include "internal.h" 15#include "internal.h"
15#include "fd.h" 16#include "fd.h"
16 17
@@ -48,8 +49,9 @@ static int seq_show(struct seq_file *m, void *v)
48 } 49 }
49 50
50 if (!ret) { 51 if (!ret) {
51 seq_printf(m, "pos:\t%lli\nflags:\t0%o\n", 52 seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
52 (long long)file->f_pos, f_flags); 53 (long long)file->f_pos, f_flags,
54 real_mount(file->f_path.mnt)->mnt_id);
53 if (file->f_op->show_fdinfo) 55 if (file->f_op->show_fdinfo)
54 ret = file->f_op->show_fdinfo(m, file); 56 ret = file->f_op->show_fdinfo(m, file);
55 fput(file); 57 fput(file);
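With the extra field, user space can map an open descriptor to the mount it came from by matching mnt_id against the first column of /proc/self/mountinfo. A small illustrative reader (the file name is arbitrary; assumes a kernel with this patch):

    /* Illustrative only: dump pos/flags/mnt_id for an open fd. */
    #include <fcntl.h>
    #include <stdio.h>

    int main(void)
    {
        char path[64], line[128];
        int fd = open("/etc/hostname", O_RDONLY);
        FILE *f;

        if (fd < 0)
            return 1;
        snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
        f = fopen(path, "r");
        while (f && fgets(line, sizeof(line), f))
            fputs(line, stdout);    /* pos:, flags:, mnt_id: */
        if (f)
            fclose(f);
        return 0;
    }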
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 124fc43c7090..0adbc02d60e3 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -35,7 +35,7 @@ static void proc_evict_inode(struct inode *inode)
35 const struct proc_ns_operations *ns_ops; 35 const struct proc_ns_operations *ns_ops;
36 void *ns; 36 void *ns;
37 37
38 truncate_inode_pages(&inode->i_data, 0); 38 truncate_inode_pages_final(&inode->i_data);
39 clear_inode(inode); 39 clear_inode(inode);
40 40
41 /* Stop tracking associated processes */ 41 /* Stop tracking associated processes */
@@ -47,7 +47,7 @@ static void proc_evict_inode(struct inode *inode)
47 pde_put(de); 47 pde_put(de);
48 head = PROC_I(inode)->sysctl; 48 head = PROC_I(inode)->sysctl;
49 if (head) { 49 if (head) {
50 rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); 50 RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
51 sysctl_head_put(head); 51 sysctl_head_put(head);
52 } 52 }
53 /* Release any associated namespace */ 53 /* Release any associated namespace */
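Swapping rcu_assign_pointer() for RCU_INIT_POINTER() here is safe because the stored value is NULL: the assign variant exists to order the pointee's initialization before publication, and a NULL store publishes nothing, so the barrier can be dropped.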
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 651d09a11dde..3ab6d14e71c5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -211,13 +211,6 @@ extern int proc_fill_super(struct super_block *);
211extern void proc_entry_rundown(struct proc_dir_entry *); 211extern void proc_entry_rundown(struct proc_dir_entry *);
212 212
213/* 213/*
214 * proc_devtree.c
215 */
216#ifdef CONFIG_PROC_DEVICETREE
217extern void proc_device_tree_init(void);
218#endif
219
220/*
221 * proc_namespaces.c 214 * proc_namespaces.c
222 */ 215 */
223extern const struct inode_operations proc_ns_dir_inode_operations; 216extern const struct inode_operations proc_ns_dir_inode_operations;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 136e548d9567..7445af0b1aa3 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -73,7 +73,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
73 available += pagecache; 73 available += pagecache;
74 74
75 /* 75 /*
76 * Part of the reclaimable swap consists of items that are in use, 76 * Part of the reclaimable slab consists of items that are in use,
77 * and cannot be freed. Cap this estimate at the low watermark. 77 * and cannot be freed. Cap this estimate at the low watermark.
78 */ 78 */
79 available += global_page_state(NR_SLAB_RECLAIMABLE) - 79 available += global_page_state(NR_SLAB_RECLAIMABLE) -
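For context, this comment sits in the MemAvailable estimate, which (roughly) starts from free memory minus the watermark reserve and then credits page cache and reclaimable slab, discounting each by min(half of it, low watermark) as likely unfreeable. The one-word fix matters because the line being described really is about NR_SLAB_RECLAIMABLE, not swap.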
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 9ae46b87470d..89026095f2b5 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -146,7 +146,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
146 struct task_struct *task; 146 struct task_struct *task;
147 void *ns; 147 void *ns;
148 char name[50]; 148 char name[50];
149 int len = -EACCES; 149 int res = -EACCES;
150 150
151 task = get_proc_task(inode); 151 task = get_proc_task(inode);
152 if (!task) 152 if (!task)
@@ -155,24 +155,18 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
155 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 155 if (!ptrace_may_access(task, PTRACE_MODE_READ))
156 goto out_put_task; 156 goto out_put_task;
157 157
158 len = -ENOENT; 158 res = -ENOENT;
159 ns = ns_ops->get(task); 159 ns = ns_ops->get(task);
160 if (!ns) 160 if (!ns)
161 goto out_put_task; 161 goto out_put_task;
162 162
163 snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); 163 snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
164 len = strlen(name); 164 res = readlink_copy(buffer, buflen, name);
165
166 if (len > buflen)
167 len = buflen;
168 if (copy_to_user(buffer, name, len))
169 len = -EFAULT;
170
171 ns_ops->put(ns); 165 ns_ops->put(ns);
172out_put_task: 166out_put_task:
173 put_task_struct(task); 167 put_task_struct(task);
174out: 168out:
175 return len; 169 return res;
176} 170}
177 171
178static const struct inode_operations proc_ns_link_inode_operations = { 172static const struct inode_operations proc_ns_link_inode_operations = {
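readlink_copy() is the old vfs_readlink() copy-out logic under a name that says what it does. A rough reconstruction for context (not guaranteed verbatim):

    int readlink_copy(char __user *buffer, int buflen, const char *link)
    {
        int len = PTR_ERR(link);

        if (IS_ERR(link))
            goto out;
        len = strlen(link);
        if (len > (unsigned int) buflen)
            len = buflen;
        if (copy_to_user(buffer, link, len))
            len = -EFAULT;
    out:
        return len;
    }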
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
deleted file mode 100644
index c82dd5147845..000000000000
--- a/fs/proc/proc_devtree.c
+++ /dev/null
@@ -1,241 +0,0 @@
1/*
2 * proc_devtree.c - handles /proc/device-tree
3 *
4 * Copyright 1997 Paul Mackerras
5 */
6#include <linux/errno.h>
7#include <linux/init.h>
8#include <linux/time.h>
9#include <linux/proc_fs.h>
10#include <linux/seq_file.h>
11#include <linux/printk.h>
12#include <linux/stat.h>
13#include <linux/string.h>
14#include <linux/of.h>
15#include <linux/export.h>
16#include <linux/slab.h>
17#include <asm/uaccess.h>
18#include "internal.h"
19
20static inline void set_node_proc_entry(struct device_node *np,
21 struct proc_dir_entry *de)
22{
23 np->pde = de;
24}
25
26static struct proc_dir_entry *proc_device_tree;
27
28/*
29 * Supply data on a read from /proc/device-tree/node/property.
30 */
31static int property_proc_show(struct seq_file *m, void *v)
32{
33 struct property *pp = m->private;
34
35 seq_write(m, pp->value, pp->length);
36 return 0;
37}
38
39static int property_proc_open(struct inode *inode, struct file *file)
40{
41 return single_open(file, property_proc_show, __PDE_DATA(inode));
42}
43
44static const struct file_operations property_proc_fops = {
45 .owner = THIS_MODULE,
46 .open = property_proc_open,
47 .read = seq_read,
48 .llseek = seq_lseek,
49 .release = single_release,
50};
51
52/*
53 * For a node with a name like "gc@10", we make symlinks called "gc"
54 * and "@10" to it.
55 */
56
57/*
58 * Add a property to a node
59 */
60static struct proc_dir_entry *
61__proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp,
62 const char *name)
63{
64 struct proc_dir_entry *ent;
65
66 /*
67 * Unfortunately proc_register puts each new entry
68 * at the beginning of the list. So we rearrange them.
69 */
70 ent = proc_create_data(name,
71 strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR,
72 de, &property_proc_fops, pp);
73 if (ent == NULL)
74 return NULL;
75
76 if (!strncmp(name, "security-", 9))
77 proc_set_size(ent, 0); /* don't leak number of password chars */
78 else
79 proc_set_size(ent, pp->length);
80
81 return ent;
82}
83
84
85void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop)
86{
87 __proc_device_tree_add_prop(pde, prop, prop->name);
88}
89
90void proc_device_tree_remove_prop(struct proc_dir_entry *pde,
91 struct property *prop)
92{
93 remove_proc_entry(prop->name, pde);
94}
95
96void proc_device_tree_update_prop(struct proc_dir_entry *pde,
97 struct property *newprop,
98 struct property *oldprop)
99{
100 struct proc_dir_entry *ent;
101
102 if (!oldprop) {
103 proc_device_tree_add_prop(pde, newprop);
104 return;
105 }
106
107 for (ent = pde->subdir; ent != NULL; ent = ent->next)
108 if (ent->data == oldprop)
109 break;
110 if (ent == NULL) {
111 pr_warn("device-tree: property \"%s\" does not exist\n",
112 oldprop->name);
113 } else {
114 ent->data = newprop;
115 ent->size = newprop->length;
116 }
117}
118
119/*
120 * Various dodgy firmware might give us nodes and/or properties with
121 * conflicting names. That's generally ok, except for exporting via /proc,
122 * so munge names here to ensure they're unique.
123 */
124
125static int duplicate_name(struct proc_dir_entry *de, const char *name)
126{
127 struct proc_dir_entry *ent;
128 int found = 0;
129
130 spin_lock(&proc_subdir_lock);
131
132 for (ent = de->subdir; ent != NULL; ent = ent->next) {
133 if (strcmp(ent->name, name) == 0) {
134 found = 1;
135 break;
136 }
137 }
138
139 spin_unlock(&proc_subdir_lock);
140
141 return found;
142}
143
144static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de,
145 const char *name)
146{
147 char *fixed_name;
148 int fixup_len = strlen(name) + 2 + 1; /* name + #x + \0 */
149 int i = 1, size;
150
151realloc:
152 fixed_name = kmalloc(fixup_len, GFP_KERNEL);
153 if (fixed_name == NULL) {
154 pr_err("device-tree: Out of memory trying to fixup "
155 "name \"%s\"\n", name);
156 return name;
157 }
158
159retry:
160 size = snprintf(fixed_name, fixup_len, "%s#%d", name, i);
161 size++; /* account for NULL */
162
163 if (size > fixup_len) {
164 /* We ran out of space, free and reallocate. */
165 kfree(fixed_name);
166 fixup_len = size;
167 goto realloc;
168 }
169
170 if (duplicate_name(de, fixed_name)) {
171 /* Multiple duplicates. Retry with a different offset. */
172 i++;
173 goto retry;
174 }
175
176 pr_warn("device-tree: Duplicate name in %s, renamed to \"%s\"\n",
177 np->full_name, fixed_name);
178
179 return fixed_name;
180}
181
182/*
183 * Process a node, adding entries for its children and its properties.
184 */
185void proc_device_tree_add_node(struct device_node *np,
186 struct proc_dir_entry *de)
187{
188 struct property *pp;
189 struct proc_dir_entry *ent;
190 struct device_node *child;
191 const char *p;
192
193 set_node_proc_entry(np, de);
194 for (child = NULL; (child = of_get_next_child(np, child));) {
195 /* Use everything after the last slash, or the full name */
196 p = kbasename(child->full_name);
197
198 if (duplicate_name(de, p))
199 p = fixup_name(np, de, p);
200
201 ent = proc_mkdir(p, de);
202 if (ent == NULL)
203 break;
204 proc_device_tree_add_node(child, ent);
205 }
206 of_node_put(child);
207
208 for (pp = np->properties; pp != NULL; pp = pp->next) {
209 p = pp->name;
210
211 if (strchr(p, '/'))
212 continue;
213
214 if (duplicate_name(de, p))
215 p = fixup_name(np, de, p);
216
217 ent = __proc_device_tree_add_prop(de, pp, p);
218 if (ent == NULL)
219 break;
220 }
221}
222
223/*
224 * Called on initialization to set up the /proc/device-tree subtree
225 */
226void __init proc_device_tree_init(void)
227{
228 struct device_node *root;
229
230 proc_device_tree = proc_mkdir("device-tree", NULL);
231 if (proc_device_tree == NULL)
232 return;
233 root = of_find_node_by_path("/");
234 if (root == NULL) {
235 remove_proc_entry("device-tree", NULL);
236 pr_debug("/proc/device-tree: can't find root\n");
237 return;
238 }
239 proc_device_tree_add_node(root, proc_device_tree);
240 of_node_put(root);
241}
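Deleting this file does not remove the interface: as of this merge, /proc/device-tree is expected to be a compatibility symlink to /sys/firmware/devicetree/base, created by the OF core, so the flattened-tree view survives without any proc-specific plumbing.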
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 87dbcbef7fe4..5dbadecb234d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -92,6 +92,8 @@ static int proc_parse_options(char *options, struct pid_namespace *pid)
92int proc_remount(struct super_block *sb, int *flags, char *data) 92int proc_remount(struct super_block *sb, int *flags, char *data)
93{ 93{
94 struct pid_namespace *pid = sb->s_fs_info; 94 struct pid_namespace *pid = sb->s_fs_info;
95
96 sync_filesystem(sb);
95 return !proc_parse_options(data, pid); 97 return !proc_parse_options(data, pid);
96} 98}
97 99
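This is the first of many hunks in this merge with the same shape: do_remount_sb() no longer calls sync_filesystem() for every filesystem (see the fs/super.c hunk further down), so each ->remount_fs() that needs dirty data written out before options change now does the sync itself.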
@@ -183,9 +185,6 @@ void __init proc_root_init(void)
183 proc_mkdir("openprom", NULL); 185 proc_mkdir("openprom", NULL);
184#endif 186#endif
185 proc_tty_init(); 187 proc_tty_init();
186#ifdef CONFIG_PROC_DEVICETREE
187 proc_device_tree_init();
188#endif
189 proc_mkdir("bus", NULL); 188 proc_mkdir("bus", NULL);
190 proc_sys_init(); 189 proc_sys_init();
191} 190}
diff --git a/fs/proc/self.c b/fs/proc/self.c
index ffeb202ec942..4348bb8907c2 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -16,7 +16,7 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
16 if (!tgid) 16 if (!tgid)
17 return -ENOENT; 17 return -ENOENT;
18 sprintf(tmp, "%d", tgid); 18 sprintf(tmp, "%d", tgid);
19 return vfs_readlink(dentry,buffer,buflen,tmp); 19 return readlink_copy(buffer, buflen, tmp);
20} 20}
21 21
22static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) 22static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 6f599c62f0cc..9d231e9e5f0e 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -9,7 +9,7 @@
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/irqnr.h> 11#include <linux/irqnr.h>
12#include <asm/cputime.h> 12#include <linux/cputime.h>
13#include <linux/tick.h> 13#include <linux/tick.h>
14 14
15#ifndef arch_irq_stat_cpu 15#ifndef arch_irq_stat_cpu
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fb52b548080d..c4b2646b6d7c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,4 +1,5 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/vmacache.h>
2#include <linux/hugetlb.h> 3#include <linux/hugetlb.h>
3#include <linux/huge_mm.h> 4#include <linux/huge_mm.h>
4#include <linux/mount.h> 5#include <linux/mount.h>
@@ -152,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
152 153
153 /* 154 /*
154 * We remember last_addr rather than next_addr to hit with 155 * We remember last_addr rather than next_addr to hit with
155 * mmap_cache most of the time. We have zero last_addr at 156 * vmacache most of the time. We have zero last_addr at
156 * the beginning and also after lseek. We will have -1 last_addr 157 * the beginning and also after lseek. We will have -1 last_addr
157 * after the end of the vmas. 158 * after the end of the vmas.
158 */ 159 */
@@ -1350,7 +1351,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
1350 struct numa_maps *md; 1351 struct numa_maps *md;
1351 struct page *page; 1352 struct page *page;
1352 1353
1353 if (pte_none(*pte)) 1354 if (!pte_present(*pte))
1354 return 0; 1355 return 0;
1355 1356
1356 page = pte_page(*pte); 1357 page = pte_page(*pte);
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 7141b8d0ca9e..33de567c25af 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -5,7 +5,7 @@
5#include <linux/seq_file.h> 5#include <linux/seq_file.h>
6#include <linux/time.h> 6#include <linux/time.h>
7#include <linux/kernel_stat.h> 7#include <linux/kernel_stat.h>
8#include <asm/cputime.h> 8#include <linux/cputime.h>
9 9
10static int uptime_proc_show(struct seq_file *m, void *v) 10static int uptime_proc_show(struct seq_file *m, void *v)
11{ 11{
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 88d4585b30f1..6a8e785b29da 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -484,7 +484,6 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
484 phdr_ptr->p_memsz = real_sz; 484 phdr_ptr->p_memsz = real_sz;
485 if (real_sz == 0) { 485 if (real_sz == 0) {
486 pr_warn("Warning: Zero PT_NOTE entries found\n"); 486 pr_warn("Warning: Zero PT_NOTE entries found\n");
487 return -EINVAL;
488 } 487 }
489 } 488 }
490 489
@@ -671,7 +670,6 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
671 phdr_ptr->p_memsz = real_sz; 670 phdr_ptr->p_memsz = real_sz;
672 if (real_sz == 0) { 671 if (real_sz == 0) {
673 pr_warn("Warning: Zero PT_NOTE entries found\n"); 672 pr_warn("Warning: Zero PT_NOTE entries found\n");
674 return -EINVAL;
675 } 673 }
676 } 674 }
677 675
@@ -1118,4 +1116,3 @@ void vmcore_cleanup(void)
1118 } 1116 }
1119 free_elfcorebuf(); 1117 free_elfcorebuf();
1120} 1118}
1121EXPORT_SYMBOL_GPL(vmcore_cleanup);
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 7be26f03a3f5..1a81373947f3 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -267,6 +267,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
267 p->root = root; 267 p->root = root;
268 p->m.poll_event = ns->event; 268 p->m.poll_event = ns->event;
269 p->show = show; 269 p->show = show;
270 p->cached_event = ~0ULL;
270 271
271 return 0; 272 return 0;
272 273
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 12823845d324..192297b0090d 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -249,6 +249,7 @@ static void parse_options(char *options)
249 249
250static int pstore_remount(struct super_block *sb, int *flags, char *data) 250static int pstore_remount(struct super_block *sb, int *flags, char *data)
251{ 251{
252 sync_filesystem(sb);
252 parse_options(data); 253 parse_options(data);
253 254
254 return 0; 255 return 0;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 78c3c2097787..46d269e38706 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -497,6 +497,7 @@ void pstore_get_records(int quiet)
497 big_oops_buf_sz); 497 big_oops_buf_sz);
498 498
499 if (unzipped_len > 0) { 499 if (unzipped_len > 0) {
500 kfree(buf);
500 buf = big_oops_buf; 501 buf = big_oops_buf;
501 size = unzipped_len; 502 size = unzipped_len;
502 compressed = false; 503 compressed = false;
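The added kfree(buf) plugs a leak: pstore_get_records() had just received buf from the backend's ->read(), and when decompression succeeded the pointer was simply replaced with big_oops_buf, dropping the original compressed record on every iteration.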
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index fa8cef2cca3a..3b5744306ed8 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -86,6 +86,7 @@ struct ramoops_context {
86 struct persistent_ram_ecc_info ecc_info; 86 struct persistent_ram_ecc_info ecc_info;
87 unsigned int max_dump_cnt; 87 unsigned int max_dump_cnt;
88 unsigned int dump_write_cnt; 88 unsigned int dump_write_cnt;
 89 /* the *_read_cnt fields are reset in ramoops_pstore_open() */
89 unsigned int dump_read_cnt; 90 unsigned int dump_read_cnt;
90 unsigned int console_read_cnt; 91 unsigned int console_read_cnt;
91 unsigned int ftrace_read_cnt; 92 unsigned int ftrace_read_cnt;
@@ -101,6 +102,7 @@ static int ramoops_pstore_open(struct pstore_info *psi)
101 102
102 cxt->dump_read_cnt = 0; 103 cxt->dump_read_cnt = 0;
103 cxt->console_read_cnt = 0; 104 cxt->console_read_cnt = 0;
105 cxt->ftrace_read_cnt = 0;
104 return 0; 106 return 0;
105} 107}
106 108
@@ -117,13 +119,15 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
117 return NULL; 119 return NULL;
118 120
119 prz = przs[i]; 121 prz = przs[i];
122 if (!prz)
123 return NULL;
120 124
121 if (update) { 125 /* Update old/shadowed buffer. */
122 /* Update old/shadowed buffer. */ 126 if (update)
123 persistent_ram_save_old(prz); 127 persistent_ram_save_old(prz);
124 if (!persistent_ram_old_size(prz)) 128
125 return NULL; 129 if (!persistent_ram_old_size(prz))
126 } 130 return NULL;
127 131
128 *typep = type; 132 *typep = type;
129 *id = i; 133 *id = i;
@@ -316,6 +320,7 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
316{ 320{
317 int i; 321 int i;
318 322
323 cxt->max_dump_cnt = 0;
319 if (!cxt->przs) 324 if (!cxt->przs)
320 return; 325 return;
321 326
@@ -346,7 +351,7 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
346 GFP_KERNEL); 351 GFP_KERNEL);
347 if (!cxt->przs) { 352 if (!cxt->przs) {
348 dev_err(dev, "failed to initialize a prz array for dumps\n"); 353 dev_err(dev, "failed to initialize a prz array for dumps\n");
349 return -ENOMEM; 354 goto fail_prz;
350 } 355 }
351 356
352 for (i = 0; i < cxt->max_dump_cnt; i++) { 357 for (i = 0; i < cxt->max_dump_cnt; i++) {
@@ -428,7 +433,6 @@ static int ramoops_probe(struct platform_device *pdev)
428 if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size)) 433 if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
429 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); 434 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
430 435
431 cxt->dump_read_cnt = 0;
432 cxt->size = pdata->mem_size; 436 cxt->size = pdata->mem_size;
433 cxt->phys_addr = pdata->mem_address; 437 cxt->phys_addr = pdata->mem_address;
434 cxt->record_size = pdata->record_size; 438 cxt->record_size = pdata->record_size;
@@ -505,7 +509,6 @@ fail_buf:
505 kfree(cxt->pstore.buf); 509 kfree(cxt->pstore.buf);
506fail_clear: 510fail_clear:
507 cxt->pstore.bufsize = 0; 511 cxt->pstore.bufsize = 0;
508 cxt->max_dump_cnt = 0;
509fail_cnt: 512fail_cnt:
510 kfree(cxt->fprz); 513 kfree(cxt->fprz);
511fail_init_fprz: 514fail_init_fprz:
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index de272d426763..ff7e3d4df5a1 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -54,7 +54,7 @@ static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a)
54 do { 54 do {
55 old = atomic_read(&prz->buffer->start); 55 old = atomic_read(&prz->buffer->start);
56 new = old + a; 56 new = old + a;
57 while (unlikely(new > prz->buffer_size)) 57 while (unlikely(new >= prz->buffer_size))
58 new -= prz->buffer_size; 58 new -= prz->buffer_size;
59 } while (atomic_cmpxchg(&prz->buffer->start, old, new) != old); 59 } while (atomic_cmpxchg(&prz->buffer->start, old, new) != old);
60 60
@@ -91,7 +91,7 @@ static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a)
91 91
92 old = atomic_read(&prz->buffer->start); 92 old = atomic_read(&prz->buffer->start);
93 new = old + a; 93 new = old + a;
94 while (unlikely(new > prz->buffer_size)) 94 while (unlikely(new >= prz->buffer_size))
95 new -= prz->buffer_size; 95 new -= prz->buffer_size;
96 atomic_set(&prz->buffer->start, new); 96 atomic_set(&prz->buffer->start, new);
97 97
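Both hunks fix the same off-by-one in the persistent-ram ring buffer: with the old test, a sum landing exactly on buffer_size escaped the loop, leaving start one past the last valid offset. A standalone sketch of the wrap:

    /* Sketch: wrap an offset into [0, size). */
    static size_t buffer_wrap(size_t old, size_t add, size_t size)
    {
        size_t new = old + add;

        while (new >= size)     /* "new > size" let new == size leak */
            new -= size;
        return new;
    }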
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 89558810381c..c4bcb778886e 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -44,6 +44,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data)
44{ 44{
45 struct qnx4_sb_info *qs; 45 struct qnx4_sb_info *qs;
46 46
47 sync_filesystem(sb);
47 qs = qnx4_sb(sb); 48 qs = qnx4_sb(sb);
48 qs->Version = QNX4_VERSION; 49 qs->Version = QNX4_VERSION;
49 *flags |= MS_RDONLY; 50 *flags |= MS_RDONLY;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 8d941edfefa1..65cdaab3ed49 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -55,6 +55,7 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root)
55 55
56static int qnx6_remount(struct super_block *sb, int *flags, char *data) 56static int qnx6_remount(struct super_block *sb, int *flags, char *data)
57{ 57{
58 sync_filesystem(sb);
58 *flags |= MS_RDONLY; 59 *flags |= MS_RDONLY;
59 return 0; 60 return 0;
60} 61}
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 880fd9884366..c51df1dd237e 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -8,9 +8,10 @@ config QUOTA
8 help 8 help
9 If you say Y here, you will be able to set per user limits for disk 9 If you say Y here, you will be able to set per user limits for disk
10 usage (also called disk quotas). Currently, it works for the 10 usage (also called disk quotas). Currently, it works for the
11 ext2, ext3, and reiserfs file system. ext3 also supports journalled 11 ext2, ext3, ext4, jfs, ocfs2 and reiserfs file systems.
12 quotas for which you don't need to run quotacheck(8) after an unclean 12 Note that gfs2 and xfs use their own quota system.
13 shutdown. 13 Ext3, ext4 and reiserfs also support journaled quotas for which
14 you don't need to run quotacheck(8) after an unclean shutdown.
14 For further details, read the Quota mini-HOWTO, available from 15 For further details, read the Quota mini-HOWTO, available from
15 <http://www.tldp.org/docs.html#howto>, or the documentation provided 16 <http://www.tldp.org/docs.html#howto>, or the documentation provided
16 with the quota tools. Probably the quota support is only useful for 17 with the quota tools. Probably the quota support is only useful for
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index cfc8dcc16043..9cd5f63715c0 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -528,7 +528,7 @@ restart:
528 if (atomic_read(&dquot->dq_count)) { 528 if (atomic_read(&dquot->dq_count)) {
529 DEFINE_WAIT(wait); 529 DEFINE_WAIT(wait);
530 530
531 atomic_inc(&dquot->dq_count); 531 dqgrab(dquot);
532 prepare_to_wait(&dquot->dq_wait_unused, &wait, 532 prepare_to_wait(&dquot->dq_wait_unused, &wait,
533 TASK_UNINTERRUPTIBLE); 533 TASK_UNINTERRUPTIBLE);
534 spin_unlock(&dq_list_lock); 534 spin_unlock(&dq_list_lock);
@@ -632,7 +632,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
632 /* Now we have active dquot from which someone is 632 /* Now we have active dquot from which someone is
633 * holding reference so we can safely just increase 633 * holding reference so we can safely just increase
634 * use count */ 634 * use count */
635 atomic_inc(&dquot->dq_count); 635 dqgrab(dquot);
636 spin_unlock(&dq_list_lock); 636 spin_unlock(&dq_list_lock);
637 dqstats_inc(DQST_LOOKUPS); 637 dqstats_inc(DQST_LOOKUPS);
638 err = sb->dq_op->write_dquot(dquot); 638 err = sb->dq_op->write_dquot(dquot);
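dqgrab() is a thin wrapper for taking an extra reference on a dquot that is already referenced, making the locking rule explicit at the call site. Its assumed shape (see include/linux/quotaops.h):

    /* Assumed definition: only valid while the caller already holds a
     * reference keeping dq_count above zero.
     */
    static inline void dqgrab(struct dquot *dquot)
    {
        atomic_inc(&dquot->dq_count);
    }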
diff --git a/fs/read_write.c b/fs/read_write.c
index 28cc9c810744..31c6efa43183 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -994,9 +994,9 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
994 return ret; 994 return ret;
995} 995}
996 996
997COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, 997static long __compat_sys_preadv64(unsigned long fd,
998 const struct compat_iovec __user *,vec, 998 const struct compat_iovec __user *vec,
999 unsigned long, vlen, loff_t, pos) 999 unsigned long vlen, loff_t pos)
1000{ 1000{
1001 struct fd f; 1001 struct fd f;
1002 ssize_t ret; 1002 ssize_t ret;
@@ -1013,12 +1013,22 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1013 return ret; 1013 return ret;
1014} 1014}
1015 1015
1016#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
1017COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1018 const struct compat_iovec __user *,vec,
1019 unsigned long, vlen, loff_t, pos)
1020{
1021 return __compat_sys_preadv64(fd, vec, vlen, pos);
1022}
1023#endif
1024
1016COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, 1025COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1017 const struct compat_iovec __user *,vec, 1026 const struct compat_iovec __user *,vec,
1018 compat_ulong_t, vlen, u32, pos_low, u32, pos_high) 1027 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1019{ 1028{
1020 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1029 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1021 return compat_sys_preadv64(fd, vec, vlen, pos); 1030
1031 return __compat_sys_preadv64(fd, vec, vlen, pos);
1022} 1032}
1023 1033
1024static size_t compat_writev(struct file *file, 1034static size_t compat_writev(struct file *file,
@@ -1061,9 +1071,9 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1061 return ret; 1071 return ret;
1062} 1072}
1063 1073
1064COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, 1074static long __compat_sys_pwritev64(unsigned long fd,
1065 const struct compat_iovec __user *,vec, 1075 const struct compat_iovec __user *vec,
1066 unsigned long, vlen, loff_t, pos) 1076 unsigned long vlen, loff_t pos)
1067{ 1077{
1068 struct fd f; 1078 struct fd f;
1069 ssize_t ret; 1079 ssize_t ret;
@@ -1080,12 +1090,22 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1080 return ret; 1090 return ret;
1081} 1091}
1082 1092
1093#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
1094COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1095 const struct compat_iovec __user *,vec,
1096 unsigned long, vlen, loff_t, pos)
1097{
1098 return __compat_sys_pwritev64(fd, vec, vlen, pos);
1099}
1100#endif
1101
1083COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, 1102COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1084 const struct compat_iovec __user *,vec, 1103 const struct compat_iovec __user *,vec,
1085 compat_ulong_t, vlen, u32, pos_low, u32, pos_high) 1104 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1086{ 1105{
1087 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1106 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1088 return compat_sys_pwritev64(fd, vec, vlen, pos); 1107
1108 return __compat_sys_pwritev64(fd, vec, vlen, pos);
1089} 1109}
1090#endif 1110#endif
1091 1111
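Both rewrites in this file follow one pattern: the body moves into a static __compat_sys_*64() helper, the COMPAT_SYSCALL_DEFINE4 entry point is emitted only on architectures that opt in via __ARCH_WANT_COMPAT_SYS_PREADV64 / __ARCH_WANT_COMPAT_SYS_PWRITEV64, and the 32-bit preadv/pwritev paths call the helper directly instead of a syscall symbol that may not exist.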
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 1fd2051109a3..af677353a3f5 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -125,6 +125,7 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
125 int d_reclen; 125 int d_reclen;
126 char *d_name; 126 char *d_name;
127 ino_t d_ino; 127 ino_t d_ino;
128 loff_t cur_pos = deh_offset(deh);
128 129
129 if (!de_visible(deh)) 130 if (!de_visible(deh))
130 /* it is hidden entry */ 131 /* it is hidden entry */
@@ -196,8 +197,9 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
196 if (local_buf != small_buf) { 197 if (local_buf != small_buf) {
197 kfree(local_buf); 198 kfree(local_buf);
198 } 199 }
199 // next entry should be looked for with such offset 200
200 next_pos = deh_offset(deh) + 1; 201 /* deh_offset(deh) may be invalid now. */
202 next_pos = cur_pos + 1;
201 203
202 if (item_moved(&tmp_ih, &path_to_entry)) { 204 if (item_moved(&tmp_ih, &path_to_entry)) {
203 set_cpu_key_k_offset(&pos_key, 205 set_cpu_key_k_offset(&pos_key,
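The cached cur_pos is the point of the fix: the directory-emit callback made between reading the entry and computing next_pos can drop the tree lock and release the path, so deh may no longer point at valid memory by the time the old code re-read deh_offset(deh).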
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ad62bdbb451e..bc8b8009897d 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -35,7 +35,7 @@ void reiserfs_evict_inode(struct inode *inode)
35 if (!inode->i_nlink && !is_bad_inode(inode)) 35 if (!inode->i_nlink && !is_bad_inode(inode))
36 dquot_initialize(inode); 36 dquot_initialize(inode);
37 37
38 truncate_inode_pages(&inode->i_data, 0); 38 truncate_inode_pages_final(&inode->i_data);
39 if (inode->i_nlink) 39 if (inode->i_nlink)
40 goto no_delete; 40 goto no_delete;
41 41
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 8d06adf89948..83d4eac8059a 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2831,6 +2831,7 @@ void reiserfs_init_alloc_options(struct super_block *s);
2831 */ 2831 */
2832__le32 reiserfs_choose_packing(struct inode *dir); 2832__le32 reiserfs_choose_packing(struct inode *dir);
2833 2833
2834void show_alloc_options(struct seq_file *seq, struct super_block *s);
2834int reiserfs_init_bitmap_cache(struct super_block *sb); 2835int reiserfs_init_bitmap_cache(struct super_block *sb);
2835void reiserfs_free_bitmap_cache(struct super_block *sb); 2836void reiserfs_free_bitmap_cache(struct super_block *sb);
2836void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info); 2837void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 2c803353f8ac..9fb20426005e 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -62,7 +62,6 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
62 62
63static int reiserfs_remount(struct super_block *s, int *flags, char *data); 63static int reiserfs_remount(struct super_block *s, int *flags, char *data);
64static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); 64static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
65void show_alloc_options(struct seq_file *seq, struct super_block *s);
66 65
67static int reiserfs_sync_fs(struct super_block *s, int wait) 66static int reiserfs_sync_fs(struct super_block *s, int wait)
68{ 67{
@@ -597,7 +596,7 @@ static void init_once(void *foo)
597 inode_init_once(&ei->vfs_inode); 596 inode_init_once(&ei->vfs_inode);
598} 597}
599 598
600static int init_inodecache(void) 599static int __init init_inodecache(void)
601{ 600{
602 reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache", 601 reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
603 sizeof(struct 602 sizeof(struct
@@ -1319,6 +1318,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1319 int i; 1318 int i;
1320#endif 1319#endif
1321 1320
1321 sync_filesystem(s);
1322 reiserfs_write_lock(s); 1322 reiserfs_write_lock(s);
1323 1323
1324#ifdef CONFIG_QUOTA 1324#ifdef CONFIG_QUOTA
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index d8418782862b..ef90e8bca95a 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -432,6 +432,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
432 */ 432 */
433static int romfs_remount(struct super_block *sb, int *flags, char *data) 433static int romfs_remount(struct super_block *sb, int *flags, char *data)
434{ 434{
435 sync_filesystem(sb);
435 *flags |= MS_RDONLY; 436 *flags |= MS_RDONLY;
436 return 0; 437 return 0;
437} 438}
diff --git a/fs/splice.c b/fs/splice.c
index 12028fa41def..e246954ea48c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -136,8 +136,6 @@ error:
136 136
137const struct pipe_buf_operations page_cache_pipe_buf_ops = { 137const struct pipe_buf_operations page_cache_pipe_buf_ops = {
138 .can_merge = 0, 138 .can_merge = 0,
139 .map = generic_pipe_buf_map,
140 .unmap = generic_pipe_buf_unmap,
141 .confirm = page_cache_pipe_buf_confirm, 139 .confirm = page_cache_pipe_buf_confirm,
142 .release = page_cache_pipe_buf_release, 140 .release = page_cache_pipe_buf_release,
143 .steal = page_cache_pipe_buf_steal, 141 .steal = page_cache_pipe_buf_steal,
@@ -156,8 +154,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
156 154
157static const struct pipe_buf_operations user_page_pipe_buf_ops = { 155static const struct pipe_buf_operations user_page_pipe_buf_ops = {
158 .can_merge = 0, 156 .can_merge = 0,
159 .map = generic_pipe_buf_map,
160 .unmap = generic_pipe_buf_unmap,
161 .confirm = generic_pipe_buf_confirm, 157 .confirm = generic_pipe_buf_confirm,
162 .release = page_cache_pipe_buf_release, 158 .release = page_cache_pipe_buf_release,
163 .steal = user_page_pipe_buf_steal, 159 .steal = user_page_pipe_buf_steal,
@@ -547,8 +543,6 @@ EXPORT_SYMBOL(generic_file_splice_read);
547 543
548static const struct pipe_buf_operations default_pipe_buf_ops = { 544static const struct pipe_buf_operations default_pipe_buf_ops = {
549 .can_merge = 0, 545 .can_merge = 0,
550 .map = generic_pipe_buf_map,
551 .unmap = generic_pipe_buf_unmap,
552 .confirm = generic_pipe_buf_confirm, 546 .confirm = generic_pipe_buf_confirm,
553 .release = generic_pipe_buf_release, 547 .release = generic_pipe_buf_release,
554 .steal = generic_pipe_buf_steal, 548 .steal = generic_pipe_buf_steal,
@@ -564,8 +558,6 @@ static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
564/* Pipe buffer operations for a socket and similar. */ 558/* Pipe buffer operations for a socket and similar. */
565const struct pipe_buf_operations nosteal_pipe_buf_ops = { 559const struct pipe_buf_operations nosteal_pipe_buf_ops = {
566 .can_merge = 0, 560 .can_merge = 0,
567 .map = generic_pipe_buf_map,
568 .unmap = generic_pipe_buf_unmap,
569 .confirm = generic_pipe_buf_confirm, 561 .confirm = generic_pipe_buf_confirm,
570 .release = generic_pipe_buf_release, 562 .release = generic_pipe_buf_release,
571 .steal = generic_pipe_buf_nosteal, 563 .steal = generic_pipe_buf_nosteal,
@@ -767,13 +759,13 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
767 goto out; 759 goto out;
768 760
769 if (buf->page != page) { 761 if (buf->page != page) {
770 char *src = buf->ops->map(pipe, buf, 1); 762 char *src = kmap_atomic(buf->page);
771 char *dst = kmap_atomic(page); 763 char *dst = kmap_atomic(page);
772 764
773 memcpy(dst + offset, src + buf->offset, this_len); 765 memcpy(dst + offset, src + buf->offset, this_len);
774 flush_dcache_page(page); 766 flush_dcache_page(page);
775 kunmap_atomic(dst); 767 kunmap_atomic(dst);
776 buf->ops->unmap(pipe, buf, src); 768 kunmap_atomic(src);
777 } 769 }
778 ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len, 770 ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
779 page, fsdata); 771 page, fsdata);
@@ -1067,9 +1059,9 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1067 void *data; 1059 void *data;
1068 loff_t tmp = sd->pos; 1060 loff_t tmp = sd->pos;
1069 1061
1070 data = buf->ops->map(pipe, buf, 0); 1062 data = kmap(buf->page);
1071 ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); 1063 ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
1072 buf->ops->unmap(pipe, buf, data); 1064 kunmap(buf->page);
1073 1065
1074 return ret; 1066 return ret;
1075} 1067}
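The .map/.unmap removals in the four ops tables above go together with these call-site changes: every pipe_buf_operations instance used the generic kmap helpers, so the indirection was dropped and callers now use kmap_atomic()/kmap() on buf->page directly.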
@@ -1528,116 +1520,50 @@ static int get_iovec_page_array(const struct iovec __user *iov,
1528static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 1520static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1529 struct splice_desc *sd) 1521 struct splice_desc *sd)
1530{ 1522{
1531 char *src; 1523 int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1532 int ret; 1524 return n == sd->len ? n : -EFAULT;
1533
1534 /*
1535 * See if we can use the atomic maps, by prefaulting in the
1536 * pages and doing an atomic copy
1537 */
1538 if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
1539 src = buf->ops->map(pipe, buf, 1);
1540 ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
1541 sd->len);
1542 buf->ops->unmap(pipe, buf, src);
1543 if (!ret) {
1544 ret = sd->len;
1545 goto out;
1546 }
1547 }
1548
1549 /*
1550 * No dice, use slow non-atomic map and copy
1551 */
1552 src = buf->ops->map(pipe, buf, 0);
1553
1554 ret = sd->len;
1555 if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
1556 ret = -EFAULT;
1557
1558 buf->ops->unmap(pipe, buf, src);
1559out:
1560 if (ret > 0)
1561 sd->u.userptr += ret;
1562 return ret;
1563} 1525}
1564 1526
1565/* 1527/*
1566 * For lack of a better implementation, implement vmsplice() to userspace 1528 * For lack of a better implementation, implement vmsplice() to userspace
1567 * as a simple copy of the pipes pages to the user iov. 1529 * as a simple copy of the pipes pages to the user iov.
1568 */ 1530 */
1569static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, 1531static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
1570 unsigned long nr_segs, unsigned int flags) 1532 unsigned long nr_segs, unsigned int flags)
1571{ 1533{
1572 struct pipe_inode_info *pipe; 1534 struct pipe_inode_info *pipe;
1573 struct splice_desc sd; 1535 struct splice_desc sd;
1574 ssize_t size;
1575 int error;
1576 long ret; 1536 long ret;
1537 struct iovec iovstack[UIO_FASTIOV];
1538 struct iovec *iov = iovstack;
1539 struct iov_iter iter;
1540 ssize_t count;
1577 1541
1578 pipe = get_pipe_info(file); 1542 pipe = get_pipe_info(file);
1579 if (!pipe) 1543 if (!pipe)
1580 return -EBADF; 1544 return -EBADF;
1581 1545
1582 pipe_lock(pipe); 1546 ret = rw_copy_check_uvector(READ, uiov, nr_segs,
1583 1547 ARRAY_SIZE(iovstack), iovstack, &iov);
1584 error = ret = 0; 1548 if (ret <= 0)
1585 while (nr_segs) { 1549 goto out;
1586 void __user *base;
1587 size_t len;
1588
1589 /*
1590 * Get user address base and length for this iovec.
1591 */
1592 error = get_user(base, &iov->iov_base);
1593 if (unlikely(error))
1594 break;
1595 error = get_user(len, &iov->iov_len);
1596 if (unlikely(error))
1597 break;
1598
1599 /*
1600 * Sanity check this iovec. 0 read succeeds.
1601 */
1602 if (unlikely(!len))
1603 break;
1604 if (unlikely(!base)) {
1605 error = -EFAULT;
1606 break;
1607 }
1608
1609 if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
1610 error = -EFAULT;
1611 break;
1612 }
1613
1614 sd.len = 0;
1615 sd.total_len = len;
1616 sd.flags = flags;
1617 sd.u.userptr = base;
1618 sd.pos = 0;
1619
1620 size = __splice_from_pipe(pipe, &sd, pipe_to_user);
1621 if (size < 0) {
1622 if (!ret)
1623 ret = size;
1624
1625 break;
1626 }
1627
1628 ret += size;
1629 1550
1630 if (size < len) 1551 count = ret;
1631 break; 1552 iov_iter_init(&iter, iov, nr_segs, count, 0);
1632 1553
1633 nr_segs--; 1554 sd.len = 0;
1634 iov++; 1555 sd.total_len = count;
1635 } 1556 sd.flags = flags;
1557 sd.u.data = &iter;
1558 sd.pos = 0;
1636 1559
1560 pipe_lock(pipe);
1561 ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1637 pipe_unlock(pipe); 1562 pipe_unlock(pipe);
1638 1563
1639 if (!ret) 1564out:
1640 ret = error; 1565 if (iov != iovstack)
1566 kfree(iov);
1641 1567
1642 return ret; 1568 return ret;
1643} 1569}
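The rewrite funnels all iovec validation through rw_copy_check_uvector() and an iov_iter, and pipe_to_user() collapses to a single copy_page_to_iter() call that advances the iterator itself. An illustrative user-space exercise of this path (nothing below is from the patch):

    /* Illustrative: drain a pipe through vmsplice(2) into two iovecs. */
    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/uio.h>
    #include <unistd.h>

    int main(void)
    {
        int p[2];
        char a[8] = "", b[8] = "";
        struct iovec iov[2] = { { a, sizeof(a) }, { b, sizeof(b) } };

        if (pipe(p) || write(p[1], "hello, world", 12) != 12)
            return 1;
        /* Reading end: goes through vmsplice_to_user()/pipe_to_user(). */
        printf("copied %zd bytes\n", vmsplice(p[0], iov, 2, 0));
        return 0;
    }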
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 202df6312d4e..031c8d67fd51 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -371,6 +371,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
371 371
372static int squashfs_remount(struct super_block *sb, int *flags, char *data) 372static int squashfs_remount(struct super_block *sb, int *flags, char *data)
373{ 373{
374 sync_filesystem(sb);
374 *flags |= MS_RDONLY; 375 *flags |= MS_RDONLY;
375 return 0; 376 return 0;
376} 377}
diff --git a/fs/super.c b/fs/super.c
index 80d5cf2ca765..48377f7463c0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -719,8 +719,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
719 } 719 }
720 } 720 }
721 721
722 sync_filesystem(sb);
723
724 if (sb->s_op->remount_fs) { 722 if (sb->s_op->remount_fs) {
725 retval = sb->s_op->remount_fs(sb, &flags, data); 723 retval = sb->s_op->remount_fs(sb, &flags, data);
726 if (retval) { 724 if (retval) {
@@ -802,7 +800,10 @@ void emergency_remount(void)
802 800
803static DEFINE_IDA(unnamed_dev_ida); 801static DEFINE_IDA(unnamed_dev_ida);
804static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ 802static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
805static int unnamed_dev_start = 0; /* don't bother trying below it */ 803/* Many userspace utilities consider an FSID of 0 invalid.
804 * Always return at least 1 from get_anon_bdev.
805 */
806static int unnamed_dev_start = 1;
806 807
807int get_anon_bdev(dev_t *p) 808int get_anon_bdev(dev_t *p)
808{ 809{
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
index 8c41feacbac5..b2756014508c 100644
--- a/fs/sysfs/Kconfig
+++ b/fs/sysfs/Kconfig
@@ -1,6 +1,7 @@
1config SYSFS 1config SYSFS
2 bool "sysfs file system support" if EXPERT 2 bool "sysfs file system support" if EXPERT
3 default y 3 default y
4 select KERNFS
4 help 5 help
5 The sysfs filesystem is a virtual filesystem that the kernel uses to 6 The sysfs filesystem is a virtual filesystem that the kernel uses to
6 export internal kernel objects, their attributes, and their 7 export internal kernel objects, their attributes, and their
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index ee0d761c3179..0b45ff42f374 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -19,39 +19,18 @@
19 19
20DEFINE_SPINLOCK(sysfs_symlink_target_lock); 20DEFINE_SPINLOCK(sysfs_symlink_target_lock);
21 21
22/**
23 * sysfs_pathname - return full path to sysfs dirent
24 * @kn: kernfs_node whose path we want
25 * @path: caller allocated buffer of size PATH_MAX
26 *
27 * Gives the name "/" to the sysfs_root entry; any path returned
28 * is relative to wherever sysfs is mounted.
29 */
30static char *sysfs_pathname(struct kernfs_node *kn, char *path)
31{
32 if (kn->parent) {
33 sysfs_pathname(kn->parent, path);
34 strlcat(path, "/", PATH_MAX);
35 }
36 strlcat(path, kn->name, PATH_MAX);
37 return path;
38}
39
40void sysfs_warn_dup(struct kernfs_node *parent, const char *name) 22void sysfs_warn_dup(struct kernfs_node *parent, const char *name)
41{ 23{
42 char *path; 24 char *buf, *path = NULL;
43 25
44 path = kzalloc(PATH_MAX, GFP_KERNEL); 26 buf = kzalloc(PATH_MAX, GFP_KERNEL);
45 if (path) { 27 if (buf)
46 sysfs_pathname(parent, path); 28 path = kernfs_path(parent, buf, PATH_MAX);
47 strlcat(path, "/", PATH_MAX);
48 strlcat(path, name, PATH_MAX);
49 }
50 29
51 WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s'\n", 30 WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s/%s'\n",
52 path ? path : name); 31 path, name);
53 32
54 kfree(path); 33 kfree(buf);
55} 34}
56 35
57/** 36/**
@@ -122,9 +101,13 @@ void sysfs_remove_dir(struct kobject *kobj)
122int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, 101int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
123 const void *new_ns) 102 const void *new_ns)
124{ 103{
125 struct kernfs_node *parent = kobj->sd->parent; 104 struct kernfs_node *parent;
105 int ret;
126 106
127 return kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); 107 parent = kernfs_get_parent(kobj->sd);
108 ret = kernfs_rename_ns(kobj->sd, parent, new_name, new_ns);
109 kernfs_put(parent);
110 return ret;
128} 111}
129 112
130int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, 113int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
@@ -133,7 +116,6 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
133 struct kernfs_node *kn = kobj->sd; 116 struct kernfs_node *kn = kobj->sd;
134 struct kernfs_node *new_parent; 117 struct kernfs_node *new_parent;
135 118
136 BUG_ON(!kn->parent);
137 new_parent = new_parent_kobj && new_parent_kobj->sd ? 119 new_parent = new_parent_kobj && new_parent_kobj->sd ?
138 new_parent_kobj->sd : sysfs_root_kn; 120 new_parent_kobj->sd : sysfs_root_kn;
139 121
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 810cf6e613e5..e9ef59b3abb1 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -47,12 +47,13 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
47 ssize_t count; 47 ssize_t count;
48 char *buf; 48 char *buf;
49 49
50 /* acquire buffer and ensure that it's >= PAGE_SIZE */ 50 /* acquire buffer and ensure that it's >= PAGE_SIZE and clear */
51 count = seq_get_buf(sf, &buf); 51 count = seq_get_buf(sf, &buf);
52 if (count < PAGE_SIZE) { 52 if (count < PAGE_SIZE) {
53 seq_commit(sf, -1); 53 seq_commit(sf, -1);
54 return 0; 54 return 0;
55 } 55 }
56 memset(buf, 0, PAGE_SIZE);
56 57
57 /* 58 /*
58 * Invoke show(). Control may reach here via seq file lseek even 59 * Invoke show(). Control may reach here via seq file lseek even
@@ -372,6 +373,29 @@ void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
372} 373}
373EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); 374EXPORT_SYMBOL_GPL(sysfs_remove_file_ns);
374 375
376/**
377 * sysfs_remove_file_self - remove an object attribute from its own method
378 * @kobj: object we're acting for
379 * @attr: attribute descriptor
380 *
381 * See kernfs_remove_self() for details.
382 */
383bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr)
384{
385 struct kernfs_node *parent = kobj->sd;
386 struct kernfs_node *kn;
387 bool ret;
388
389 kn = kernfs_find_and_get(parent, attr->name);
390 if (WARN_ON_ONCE(!kn))
391 return false;
392
393 ret = kernfs_remove_self(kn);
394
395 kernfs_put(kn);
396 return ret;
397}
398
375void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr) 399void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr)
376{ 400{
377 int i; 401 int i;
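The new helper serves the classic "delete me" attribute, where a store() method must remove its own file without deadlocking against active-reference draining; it replaces the sysfs_schedule_callback() workqueue machinery removed further down. A hypothetical store() method using it (the surrounding kobject handling is illustrative, not from this patch):

    /* Hypothetical usage sketch for sysfs_remove_file_self(). */
    static ssize_t remove_store(struct kobject *kobj,
                                struct kobj_attribute *attr,
                                const char *buf, size_t count)
    {
        if (sysfs_remove_file_self(kobj, &attr->attr))
            kobject_put(kobj);      /* illustrative cleanup only */
        return count;
    }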
@@ -430,95 +454,3 @@ void sysfs_remove_bin_file(struct kobject *kobj,
430 kernfs_remove_by_name(kobj->sd, attr->attr.name); 454 kernfs_remove_by_name(kobj->sd, attr->attr.name);
431} 455}
432EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); 456EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
433
434struct sysfs_schedule_callback_struct {
435 struct list_head workq_list;
436 struct kobject *kobj;
437 void (*func)(void *);
438 void *data;
439 struct module *owner;
440 struct work_struct work;
441};
442
443static struct workqueue_struct *sysfs_workqueue;
444static DEFINE_MUTEX(sysfs_workq_mutex);
445static LIST_HEAD(sysfs_workq);
446static void sysfs_schedule_callback_work(struct work_struct *work)
447{
448 struct sysfs_schedule_callback_struct *ss = container_of(work,
449 struct sysfs_schedule_callback_struct, work);
450
451 (ss->func)(ss->data);
452 kobject_put(ss->kobj);
453 module_put(ss->owner);
454 mutex_lock(&sysfs_workq_mutex);
455 list_del(&ss->workq_list);
456 mutex_unlock(&sysfs_workq_mutex);
457 kfree(ss);
458}
459
460/**
461 * sysfs_schedule_callback - helper to schedule a callback for a kobject
462 * @kobj: object we're acting for.
463 * @func: callback function to invoke later.
464 * @data: argument to pass to @func.
465 * @owner: module owning the callback code
466 *
467 * sysfs attribute methods must not unregister themselves or their parent
468 * kobject (which would amount to the same thing). Attempts to do so will
469 * deadlock, since unregistration is mutually exclusive with driver
470 * callbacks.
471 *
472 * Instead methods can call this routine, which will attempt to allocate
473 * and schedule a workqueue request to call back @func with @data as its
474 * argument in the workqueue's process context. @kobj will be pinned
475 * until @func returns.
476 *
477 * Returns 0 if the request was submitted, -ENOMEM if storage could not
478 * be allocated, -ENODEV if a reference to @owner isn't available,
479 * -EAGAIN if a callback has already been scheduled for @kobj.
480 */
481int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
482 void *data, struct module *owner)
483{
484 struct sysfs_schedule_callback_struct *ss, *tmp;
485
486 if (!try_module_get(owner))
487 return -ENODEV;
488
489 mutex_lock(&sysfs_workq_mutex);
490 list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list)
491 if (ss->kobj == kobj) {
492 module_put(owner);
493 mutex_unlock(&sysfs_workq_mutex);
494 return -EAGAIN;
495 }
496 mutex_unlock(&sysfs_workq_mutex);
497
498 if (sysfs_workqueue == NULL) {
499 sysfs_workqueue = create_singlethread_workqueue("sysfsd");
500 if (sysfs_workqueue == NULL) {
501 module_put(owner);
502 return -ENOMEM;
503 }
504 }
505
506 ss = kmalloc(sizeof(*ss), GFP_KERNEL);
507 if (!ss) {
508 module_put(owner);
509 return -ENOMEM;
510 }
511 kobject_get(kobj);
512 ss->kobj = kobj;
513 ss->func = func;
514 ss->data = data;
515 ss->owner = owner;
516 INIT_WORK(&ss->work, sysfs_schedule_callback_work);
517 INIT_LIST_HEAD(&ss->workq_list);
518 mutex_lock(&sysfs_workq_mutex);
519 list_add_tail(&ss->workq_list, &sysfs_workq);
520 mutex_unlock(&sysfs_workq_mutex);
521 queue_work(sysfs_workqueue, &ss->work);
522 return 0;
523}
524EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 6b579387c67a..aa0406895b53 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -70,8 +70,11 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
70 if (grp->bin_attrs) { 70 if (grp->bin_attrs) {
71 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) { 71 for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
72 if (update) 72 if (update)
73 sysfs_remove_bin_file(kobj, *bin_attr); 73 kernfs_remove_by_name(parent,
74 error = sysfs_create_bin_file(kobj, *bin_attr); 74 (*bin_attr)->attr.name);
75 error = sysfs_add_file_mode_ns(parent,
76 &(*bin_attr)->attr, true,
77 (*bin_attr)->attr.mode, NULL);
75 if (error) 78 if (error)
76 break; 79 break;
77 } 80 }
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 3eaf5c6622eb..8a49486bf30c 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -13,6 +13,7 @@
13#define DEBUG 13#define DEBUG
14 14
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/magic.h>
16#include <linux/mount.h> 17#include <linux/mount.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/user_namespace.h> 19#include <linux/user_namespace.h>
@@ -38,7 +39,8 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
38 } 39 }
39 40
40 ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); 41 ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
41 root = kernfs_mount_ns(fs_type, flags, sysfs_root, &new_sb, ns); 42 root = kernfs_mount_ns(fs_type, flags, sysfs_root,
43 SYSFS_MAGIC, &new_sb, ns);
42 if (IS_ERR(root) || !new_sb) 44 if (IS_ERR(root) || !new_sb)
43 kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); 45 kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
44 return root; 46 return root;
@@ -63,7 +65,8 @@ int __init sysfs_init(void)
63{ 65{
64 int err; 66 int err;
65 67
66 sysfs_root = kernfs_create_root(NULL, NULL); 68 sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
69 NULL);
67 if (IS_ERR(sysfs_root)) 70 if (IS_ERR(sysfs_root))
68 return PTR_ERR(sysfs_root); 71 return PTR_ERR(sysfs_root);
69 72
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index c327d4ee1235..88956309cc86 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -60,6 +60,7 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 
+	sync_filesystem(sb);
 	if (sbi->s_forced_ro)
 		*flags |= MS_RDONLY;
 	return 0;
@@ -295,7 +296,7 @@ int sysv_sync_inode(struct inode *inode)
 
 static void sysv_evict_inode(struct inode *inode)
 {
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	if (!inode->i_nlink) {
 		inode->i_size = 0;
 		sysv_truncate(inode);
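Both sysv hunks follow tree-wide conversions in this merge: sync_filesystem() is now called from each filesystem's ->remount_fs() rather than by the VFS, and ->evict_inode() uses truncate_inode_pages_final(), which also marks the mapping so no new pages can slip in behind the final truncate. A minimal sketch of the resulting ->evict_inode() shape, with hypothetical myfs_ names:

/* Sketch of the converged ->evict_inode() pattern; "myfs_" is hypothetical. */
static void myfs_evict_inode(struct inode *inode)
{
	/* final truncate; also forbids adding new pages to the mapping */
	truncate_inode_pages_final(&inode->i_data);
	if (!inode->i_nlink) {
		/* release the on-disk blocks of an unlinked inode here */
	}
	invalidate_inode_buffers(inode);
	clear_inode(inode);
}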
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 929312180dd0..0013142c0475 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -317,6 +317,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 	    (clockid != CLOCK_MONOTONIC &&
 	     clockid != CLOCK_REALTIME &&
 	     clockid != CLOCK_REALTIME_ALARM &&
+	     clockid != CLOCK_BOOTTIME &&
 	     clockid != CLOCK_BOOTTIME_ALARM))
 		return -EINVAL;
 
322 323
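With this hunk timerfd_create() accepts CLOCK_BOOTTIME, giving a timer that keeps counting across suspend without the CAP_WAKE_ALARM requirement that CLOCK_BOOTTIME_ALARM carries. A small self-contained userspace sketch (timeout value and output are illustrative):

/* Build with: cc -o boottime_timer boottime_timer.c */
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/timerfd.h>

int main(void)
{
	struct itimerspec its = { .it_value = { .tv_sec = 5 } };
	uint64_t expirations;
	int fd = timerfd_create(CLOCK_BOOTTIME, 0);

	if (fd < 0 || timerfd_settime(fd, 0, &its, NULL) < 0) {
		perror("timerfd");
		return 1;
	}
	/* time spent suspended counts toward the 5s, unlike CLOCK_MONOTONIC */
	read(fd, &expirations, sizeof(expirations));
	printf("expired %llu time(s)\n", (unsigned long long)expirations);
	return 0;
}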
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 123c79b7261e..4f34dbae823d 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1538,6 +1538,7 @@ out_unlock:
 
 static const struct vm_operations_struct ubifs_file_vm_ops = {
 	.fault        = filemap_fault,
+	.map_pages = filemap_map_pages,
 	.page_mkwrite = ubifs_vm_page_mkwrite,
 	.remap_pages = generic_file_remap_pages,
 };
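Setting .map_pages = filemap_map_pages opts ubifs into faultaround: on a page fault the VM can map a batch of pages that are already in the page cache around the faulting address, without one ->fault call per page. The shape filesystems converge on, sketched with hypothetical myfs_ names:

/* Sketch only; "myfs_page_mkwrite" is a hypothetical fs-specific hook. */
static const struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,		/* read pages on demand */
	.map_pages	= filemap_map_pages,		/* batch-map cached pages */
	.page_mkwrite	= myfs_page_mkwrite,		/* fs hook before first write */
	.remap_pages	= generic_file_remap_pages,
};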
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 5ded8490c0c6..a81c7b556896 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -351,7 +351,7 @@ static void ubifs_evict_inode(struct inode *inode)
 	dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
 	ubifs_assert(!atomic_read(&inode->i_count));
 
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 
 	if (inode->i_nlink)
 		goto done;
@@ -1556,7 +1556,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	if (c->space_fixup) {
 		err = ubifs_fixup_free_space(c);
 		if (err)
-			return err;
+			goto out;
 	}
 
 	err = check_free_space(c);
@@ -1827,6 +1827,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 	int err;
 	struct ubifs_info *c = sb->s_fs_info;
 
+	sync_filesystem(sb);
 	dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags);
 
 	err = ubifs_parse_options(c, data, 1);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 1037637957c7..d2c170f8b035 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -171,7 +171,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	} else
 		up_write(&iinfo->i_data_sem);
 
-	retval = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+	retval = __generic_file_aio_write(iocb, iov, nr_segs);
 	mutex_unlock(&inode->i_mutex);
 
 	if (retval > 0) {
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 982ce05c87ed..5d643706212f 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -146,8 +146,8 @@ void udf_evict_inode(struct inode *inode)
 		want_delete = 1;
 		udf_setsize(inode, 0);
 		udf_update_inode(inode, IS_SYNC(inode));
-	} else
-		truncate_inode_pages(&inode->i_data, 0);
+	}
+	truncate_inode_pages_final(&inode->i_data);
 	invalidate_inode_buffers(inode);
 	clear_inode(inode);
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 3306b9f69bed..3286db047a40 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -175,7 +175,7 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
-static int init_inodecache(void)
+static int __init init_inodecache(void)
 {
 	udf_inode_cachep = kmem_cache_create("udf_inode_cache",
 					     sizeof(struct udf_inode_info),
@@ -505,6 +505,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
 	while ((p = strsep(&options, ",")) != NULL) {
 		substring_t args[MAX_OPT_ARGS];
 		int token;
+		unsigned n;
 		if (!*p)
 			continue;
 
@@ -516,7 +517,10 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
 		case Opt_bs:
 			if (match_int(&args[0], &option))
 				return 0;
-			uopt->blocksize = option;
+			n = option;
+			if (n != 512 && n != 1024 && n != 2048 && n != 4096)
+				return 0;
+			uopt->blocksize = n;
 			uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
 			break;
 		case Opt_unhide:
@@ -646,6 +650,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 	int error = 0;
 	struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);
 
+	sync_filesystem(sb);
 	if (lvidiu) {
 		int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev);
 		if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY))
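The Opt_bs hunk rejects impossible block sizes at parse time instead of storing whatever integer userspace passed, so a bad bs= value fails the mount cleanly. The check restated as a sketch helper (name hypothetical):

/* Sketch: only the sector sizes UDF media can actually use are accepted. */
static int udf_valid_blocksize(unsigned int n)
{
	return n == 512 || n == 1024 || n == 2048 || n == 4096;
}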
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index a7ea492ae660..0ab1de4b39a5 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -38,7 +38,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
 {
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
 	struct ufs_cg_private_info * ucpi;
 	struct ufs_cylinder_group * ucg;
 	unsigned cgno, bit, end_bit, bbase, blkmap, i;
@@ -46,7 +45,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
-	usb1 = ubh_get_usb_first(uspi);
 
 	UFSD("ENTER, fragment %llu, count %u\n",
 	     (unsigned long long)fragment, count);
@@ -135,7 +133,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
 {
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
 	struct ufs_cg_private_info * ucpi;
 	struct ufs_cylinder_group * ucg;
 	unsigned overflow, cgno, bit, end_bit, i;
@@ -143,7 +140,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
-	usb1 = ubh_get_usb_first(uspi);
 
 	UFSD("ENTER, fragment %llu, count %u\n",
 	     (unsigned long long)fragment, count);
@@ -499,7 +495,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
 {
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
 	struct ufs_cg_private_info * ucpi;
 	struct ufs_cylinder_group * ucg;
 	unsigned cgno, fragno, fragoff, count, fragsize, i;
@@ -509,7 +504,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
-	usb1 = ubh_get_usb_first (uspi);
 	count = newcount - oldcount;
 
 	cgno = ufs_dtog(uspi, fragment);
@@ -577,7 +571,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
 {
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
 	struct ufs_cg_private_info * ucpi;
 	struct ufs_cylinder_group * ucg;
 	unsigned oldcg, i, j, k, allocsize;
@@ -588,7 +581,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
-	usb1 = ubh_get_usb_first(uspi);
 	oldcg = cgno;
 
 	/*
@@ -690,7 +682,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
 {
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
 	struct ufs_cylinder_group * ucg;
 	u64 result, blkno;
 
@@ -698,7 +689,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
-	usb1 = ubh_get_usb_first(uspi);
 	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 
 	if (goal == 0) {
@@ -794,7 +784,6 @@ static u64 ufs_bitmap_search(struct super_block *sb,
 		0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
 	};
 	struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
-	struct ufs_super_block_first *usb1;
 	struct ufs_cylinder_group *ucg;
 	unsigned start, length, loc;
 	unsigned pos, want, blockmap, mask, end;
@@ -803,7 +792,6 @@ static u64 ufs_bitmap_search(struct super_block *sb,
 	UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx,
 	     (unsigned long long)goal, count);
 
-	usb1 = ubh_get_usb_first (uspi);
 	ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 
 	if (goal)
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index d0426d74817b..98f7211599ff 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -57,7 +57,6 @@ void ufs_free_inode (struct inode * inode)
 {
 	struct super_block * sb;
 	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
 	struct ufs_cg_private_info * ucpi;
 	struct ufs_cylinder_group * ucg;
 	int is_directory;
@@ -67,7 +66,6 @@ void ufs_free_inode (struct inode * inode)
 
 	sb = inode->i_sb;
 	uspi = UFS_SB(sb)->s_uspi;
-	usb1 = ubh_get_usb_first(uspi);
 
 	ino = inode->i_ino;
 
@@ -175,7 +173,6 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
 	struct super_block * sb;
 	struct ufs_sb_info * sbi;
 	struct ufs_sb_private_info * uspi;
-	struct ufs_super_block_first * usb1;
 	struct ufs_cg_private_info * ucpi;
 	struct ufs_cylinder_group * ucg;
 	struct inode * inode;
@@ -195,7 +192,6 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
 	ufsi = UFS_I(inode);
 	sbi = UFS_SB(sb);
 	uspi = sbi->s_uspi;
-	usb1 = ubh_get_usb_first(uspi);
 
 	mutex_lock(&sbi->s_lock);
 
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index c8ca96086784..61e8a9b021dd 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -885,7 +885,7 @@ void ufs_evict_inode(struct inode * inode)
 	if (!inode->i_nlink && !is_bad_inode(inode))
 		want_delete = 1;
 
-	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages_final(&inode->i_data);
 	if (want_delete) {
 		loff_t old_i_size;
 		/*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 329f2f53b7ed..c1183f9f69dc 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -524,11 +524,9 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
 	struct ufs_buffer_head * ubh;
 	unsigned char * base, * space;
 	unsigned size, blks, i;
-	struct ufs_super_block_third *usb3;
 
 	UFSD("ENTER\n");
 
-	usb3 = ubh_get_usb_third(uspi);
 	/*
 	 * Read cs structures from (usually) first data block
 	 * on the device.
@@ -1280,6 +1278,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	unsigned new_mount_opt, ufstype;
 	unsigned flags;
 
+	sync_filesystem(sb);
 	lock_ufs(sb);
 	mutex_lock(&UFS_SB(sb)->s_lock);
 	uspi = UFS_SB(sb)->s_uspi;
@@ -1389,15 +1388,11 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct super_block *sb = dentry->d_sb;
 	struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi;
 	unsigned  flags = UFS_SB(sb)->s_flags;
-	struct ufs_super_block_first *usb1;
-	struct ufs_super_block_second *usb2;
 	struct ufs_super_block_third *usb3;
 	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	lock_ufs(sb);
 
-	usb1 = ubh_get_usb_first(uspi);
-	usb2 = ubh_get_usb_second(uspi);
 	usb3 = ubh_get_usb_third(uspi);
 
 	if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
@@ -1453,7 +1448,7 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
-static int init_inodecache(void)
+static int __init init_inodecache(void)
 {
 	ufs_inode_cachep = kmem_cache_create("ufs_inode_cache",
 					     sizeof(struct ufs_inode_info),
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 66a36befc5c0..844e288b9576 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -65,12 +65,31 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
 void *
 kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 {
+	unsigned noio_flag = 0;
 	void	*ptr;
+	gfp_t	lflags;
 
 	ptr = kmem_zalloc(size, flags | KM_MAYFAIL);
 	if (ptr)
 		return ptr;
-	return vzalloc(size);
+
+	/*
+	 * __vmalloc() will allocate data pages and auxillary structures (e.g.
+	 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
+	 * here. Hence we need to tell memory reclaim that we are in such a
+	 * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
+	 * the filesystem here and potentially deadlocking.
+	 */
+	if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+		noio_flag = memalloc_noio_save();
+
+	lflags = kmem_flags_convert(flags);
+	ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+
+	if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+		memalloc_noio_restore(noio_flag);
+
+	return ptr;
 }
 
 void
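The kmem_zalloc_large() rewrite is an instance of the PF_MEMALLOC_NOIO pattern: a caller that may already hold filesystem locks or sit inside a transaction brackets its allocation so that any reclaim the allocation triggers performs no I/O, which prevents reclaim from recursing back into the filesystem. A generic sketch of the bracket (caller name hypothetical; the 3-argument __vmalloc() form matches this kernel's API):

/* Sketch only, assuming the caller already holds fs locks/transactions. */
static void *my_alloc_in_transaction(unsigned long size)
{
	unsigned noio_flag;
	void *p;

	noio_flag = memalloc_noio_save();	/* sets PF_MEMALLOC_NOIO */
	p = __vmalloc(size, GFP_NOFS | __GFP_ZERO, PAGE_KERNEL);
	memalloc_noio_restore(noio_flag);	/* restores the previous state */
	return p;
}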
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 0ecec1896f25..6888ad886ff6 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -281,7 +281,7 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	if (!acl)
 		goto set_acl;
 
-	error = -EINVAL;
+	error = -E2BIG;
 	if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
 		return error;
 
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 3fc109819c34..0fdd4109c624 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -89,6 +89,8 @@ typedef struct xfs_agf {
 	/* structure must be padded to 64 bit alignment */
 } xfs_agf_t;
 
+#define XFS_AGF_CRC_OFF		offsetof(struct xfs_agf, agf_crc)
+
 #define	XFS_AGF_MAGICNUM	0x00000001
 #define	XFS_AGF_VERSIONNUM	0x00000002
 #define	XFS_AGF_SEQNO		0x00000004
@@ -167,6 +169,8 @@ typedef struct xfs_agi {
 	/* structure must be padded to 64 bit alignment */
 } xfs_agi_t;
 
+#define XFS_AGI_CRC_OFF		offsetof(struct xfs_agi, agi_crc)
+
 #define	XFS_AGI_MAGICNUM	0x00000001
 #define	XFS_AGI_VERSIONNUM	0x00000002
 #define	XFS_AGI_SEQNO		0x00000004
@@ -222,6 +226,8 @@ typedef struct xfs_agfl {
 	__be32		agfl_bno[];	/* actually XFS_AGFL_SIZE(mp) */
 } xfs_agfl_t;
 
+#define XFS_AGFL_CRC_OFF	offsetof(struct xfs_agfl, agfl_crc)
+
 /*
  * tags for inode radix tree
  */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 9eab2dfdcbb5..c1cf6a336a72 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -474,7 +474,6 @@ xfs_agfl_read_verify(
 	struct xfs_buf	*bp)
 {
 	struct xfs_mount *mp = bp->b_target->bt_mount;
-	int		agfl_ok = 1;
 
 	/*
 	 * There is no verification of non-crc AGFLs because mkfs does not
@@ -485,15 +484,13 @@ xfs_agfl_read_verify(
 	if (!xfs_sb_version_hascrc(&mp->m_sb))
 		return;
 
-	agfl_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
-				   offsetof(struct xfs_agfl, agfl_crc));
-
-	agfl_ok = agfl_ok && xfs_agfl_verify(bp);
-
-	if (!agfl_ok) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_agfl_verify(bp))
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	}
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
 }
 
 static void
@@ -508,16 +505,15 @@ xfs_agfl_write_verify(
 		return;
 
 	if (!xfs_agfl_verify(bp)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
 		return;
 	}
 
 	if (bip)
 		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 
-	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
-			 offsetof(struct xfs_agfl, agfl_crc));
+	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
 }
 
 const struct xfs_buf_ops xfs_agfl_buf_ops = {
@@ -2238,19 +2234,17 @@ xfs_agf_read_verify(
 	struct xfs_buf	*bp)
 {
 	struct xfs_mount *mp = bp->b_target->bt_mount;
-	int		agf_ok = 1;
-
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		agf_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
-					  offsetof(struct xfs_agf, agf_crc));
 
-	agf_ok = agf_ok && xfs_agf_verify(mp, bp);
-
-	if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
-			XFS_RANDOM_ALLOC_READ_AGF))) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	    !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
+				XFS_ERRTAG_ALLOC_READ_AGF,
+				XFS_RANDOM_ALLOC_READ_AGF))
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	}
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
 }
 
 static void
@@ -2261,8 +2255,8 @@ xfs_agf_write_verify(
 	struct xfs_buf_log_item	*bip = bp->b_fspriv;
 
 	if (!xfs_agf_verify(mp, bp)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
 		return;
 	}
 
@@ -2272,8 +2266,7 @@ xfs_agf_write_verify(
 	if (bip)
 		XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 
-	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
-			 offsetof(struct xfs_agf, agf_crc));
+	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
 }
 
 const struct xfs_buf_ops xfs_agf_buf_ops = {
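These xfs_alloc.c hunks establish the buffer-verifier shape used across the rest of the XFS changes in this merge: a CRC mismatch is reported as EFSBADCRC, a structural check failure as EFSCORRUPTED, and a single xfs_verifier_error() call replaces the per-site XFS_CORRUPTION_ERROR() logging. As a template (sketch only; "foo" stands in for agf/agfl/allocbt/attr3 and so on):

/* Template of the converged read verifier; xfs_foo_verify() and
 * XFS_FOO_CRC_OFF are placeholders for the per-structure pieces.
 */
STATIC void
xfs_foo_read_verify(
	struct xfs_buf	*bp)
{
	if (!xfs_buf_verify_cksum(bp, XFS_FOO_CRC_OFF))
		xfs_buf_ioerror(bp, EFSBADCRC);		/* media/checksum problem */
	else if (!xfs_foo_verify(bp))
		xfs_buf_ioerror(bp, EFSCORRUPTED);	/* structure problem */

	if (bp->b_error)
		xfs_verifier_error(bp);	/* one place logs and dumps the buffer */
}

Distinguishing the two error codes tells the administrator whether to suspect the storage (bad CRC) or a filesystem bug/corruption (bad structure).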
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 13085429e523..cc1eadcbb049 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -355,12 +355,14 @@ static void
 xfs_allocbt_read_verify(
 	struct xfs_buf	*bp)
 {
-	if (!(xfs_btree_sblock_verify_crc(bp) &&
-	      xfs_allocbt_verify(bp))) {
-		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
-				     bp->b_target->bt_mount, bp->b_addr);
+	if (!xfs_btree_sblock_verify_crc(bp))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_allocbt_verify(bp))
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	if (bp->b_error) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_verifier_error(bp);
 	}
 }
 
@@ -370,9 +372,9 @@ xfs_allocbt_write_verify(
 {
 	if (!xfs_allocbt_verify(bp)) {
 		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
-				     bp->b_target->bt_mount, bp->b_addr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
 	}
 	xfs_btree_sblock_calc_crc(bp);
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index db2cfb067d0b..0479c32c5eb1 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -632,38 +632,46 @@ xfs_map_at_offset(
 }
 
 /*
- * Test if a given page is suitable for writing as part of an unwritten
- * or delayed allocate extent.
+ * Test if a given page contains at least one buffer of a given @type.
+ * If @check_all_buffers is true, then we walk all the buffers in the page to
+ * try to find one of the type passed in. If it is not set, then the caller only
+ * needs to check the first buffer on the page for a match.
  */
-STATIC int
+STATIC bool
 xfs_check_page_type(
 	struct page		*page,
-	unsigned int		type)
+	unsigned int		type,
+	bool			check_all_buffers)
 {
-	if (PageWriteback(page))
-		return 0;
+	struct buffer_head	*bh;
+	struct buffer_head	*head;
 
-	if (page->mapping && page_has_buffers(page)) {
-		struct buffer_head	*bh, *head;
-		int			acceptable = 0;
+	if (PageWriteback(page))
+		return false;
+	if (!page->mapping)
+		return false;
+	if (!page_has_buffers(page))
+		return false;
 
-		bh = head = page_buffers(page);
-		do {
-			if (buffer_unwritten(bh))
-				acceptable += (type == XFS_IO_UNWRITTEN);
-			else if (buffer_delay(bh))
-				acceptable += (type == XFS_IO_DELALLOC);
-			else if (buffer_dirty(bh) && buffer_mapped(bh))
-				acceptable += (type == XFS_IO_OVERWRITE);
-			else
-				break;
-		} while ((bh = bh->b_this_page) != head);
+	bh = head = page_buffers(page);
+	do {
+		if (buffer_unwritten(bh)) {
+			if (type == XFS_IO_UNWRITTEN)
+				return true;
+		} else if (buffer_delay(bh)) {
+			if (type == XFS_IO_DELALLOC)
+				return true;
+		} else if (buffer_dirty(bh) && buffer_mapped(bh)) {
+			if (type == XFS_IO_OVERWRITE)
+				return true;
+		}
 
-		if (acceptable)
-			return 1;
-	}
+		/* If we are only checking the first buffer, we are done now. */
+		if (!check_all_buffers)
+			break;
+	} while ((bh = bh->b_this_page) != head);
 
-	return 0;
+	return false;
 }
 
 /*
@@ -697,7 +705,7 @@ xfs_convert_page(
 		goto fail_unlock_page;
 	if (page->mapping != inode->i_mapping)
 		goto fail_unlock_page;
-	if (!xfs_check_page_type(page, (*ioendp)->io_type))
+	if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
 		goto fail_unlock_page;
 
 	/*
@@ -742,6 +750,15 @@ xfs_convert_page(
 	p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
 	page_dirty = p_offset / len;
 
+	/*
+	 * The moment we find a buffer that doesn't match our current type
+	 * specification or can't be written, abort the loop and start
+	 * writeback. As per the above xfs_imap_valid() check, only
+	 * xfs_vm_writepage() can handle partial page writeback fully - we are
+	 * limited here to the buffers that are contiguous with the current
+	 * ioend, and hence a buffer we can't write breaks that contiguity and
+	 * we have to defer the rest of the IO to xfs_vm_writepage().
+	 */
 	bh = head = page_buffers(page);
 	do {
 		if (offset >= end_offset)
@@ -750,7 +767,7 @@ xfs_convert_page(
 			uptodate = 0;
 		if (!(PageUptodate(page) || buffer_uptodate(bh))) {
 			done = 1;
-			continue;
+			break;
 		}
 
 		if (buffer_unwritten(bh) || buffer_delay(bh) ||
@@ -762,10 +779,11 @@ xfs_convert_page(
 			else
 				type = XFS_IO_OVERWRITE;
 
-			if (!xfs_imap_valid(inode, imap, offset)) {
-				done = 1;
-				continue;
-			}
+			/*
+			 * imap should always be valid because of the above
+			 * partial page end_offset check on the imap.
+			 */
+			ASSERT(xfs_imap_valid(inode, imap, offset));
 
 			lock_buffer(bh);
 			if (type != XFS_IO_OVERWRITE)
@@ -777,6 +795,7 @@ xfs_convert_page(
 			count++;
 		} else {
 			done = 1;
+			break;
 		}
 	} while (offset += len, (bh = bh->b_this_page) != head);
 
@@ -868,7 +887,7 @@ xfs_aops_discard_page(
 	struct buffer_head	*bh, *head;
 	loff_t			offset = page_offset(page);
 
-	if (!xfs_check_page_type(page, XFS_IO_DELALLOC))
+	if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
 		goto out_invalidate;
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -1325,6 +1344,14 @@ __xfs_get_blocks(
 	/*
 	 * If this is O_DIRECT or the mpage code calling tell them how large
 	 * the mapping is, so that we can avoid repeated get_blocks calls.
+	 *
+	 * If the mapping spans EOF, then we have to break the mapping up as the
+	 * mapping for blocks beyond EOF must be marked new so that sub block
+	 * regions can be correctly zeroed. We can't do this for mappings within
+	 * EOF unless the mapping was just allocated or is unwritten, otherwise
+	 * the callers would overwrite existing data with zeros. Hence we have
+	 * to split the mapping into a range up to and including EOF, and a
+	 * second mapping for beyond EOF.
 	 */
 	if (direct || size > (1 << inode->i_blkbits)) {
 		xfs_off_t		mapping_size;
@@ -1335,6 +1362,12 @@ __xfs_get_blocks(
 		ASSERT(mapping_size > 0);
 		if (mapping_size > size)
 			mapping_size = size;
+		if (offset < i_size_read(inode) &&
+		    offset + mapping_size >= i_size_read(inode)) {
+			/* limit mapping to block that spans EOF */
+			mapping_size = roundup_64(i_size_read(inode) - offset,
+						  1 << inode->i_blkbits);
+		}
 		if (mapping_size > LONG_MAX)
 			mapping_size = LONG_MAX;
 
@@ -1441,7 +1474,8 @@ xfs_vm_direct_IO(
 		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
 					    offset, nr_segs,
 					    xfs_get_blocks_direct,
-					    xfs_end_io_direct_write, NULL, 0);
+					    xfs_end_io_direct_write, NULL,
+					    DIO_ASYNC_EXTEND);
 		if (ret != -EIOCBQUEUED && iocb->private)
 			goto out_destroy_ioend;
 	} else {
@@ -1546,6 +1580,16 @@ xfs_vm_write_failed(
 
 		xfs_vm_kill_delalloc_range(inode, block_offset,
 					   block_offset + bh->b_size);
+
+		/*
+		 * This buffer does not contain data anymore. make sure anyone
+		 * who finds it knows that for certain.
+		 */
+		clear_buffer_delay(bh);
+		clear_buffer_uptodate(bh);
+		clear_buffer_mapped(bh);
+		clear_buffer_new(bh);
+		clear_buffer_dirty(bh);
 	}
 
 }
@@ -1579,12 +1623,21 @@ xfs_vm_write_begin(
 	status = __block_write_begin(page, pos, len, xfs_get_blocks);
 	if (unlikely(status)) {
 		struct inode	*inode = mapping->host;
+		size_t		isize = i_size_read(inode);
 
 		xfs_vm_write_failed(inode, page, pos, len);
 		unlock_page(page);
 
-		if (pos + len > i_size_read(inode))
-			truncate_pagecache(inode, i_size_read(inode));
+		/*
+		 * If the write is beyond EOF, we only want to kill blocks
+		 * allocated in this write, not blocks that were previously
+		 * written successfully.
+		 */
+		if (pos + len > isize) {
+			ssize_t start = max_t(ssize_t, pos, isize);
+
+			truncate_pagecache_range(inode, start, pos + len);
+		}
 
 		page_cache_release(page);
 		page = NULL;
@@ -1595,9 +1648,12 @@ xfs_vm_write_begin(
 }
 
 /*
- * On failure, we only need to kill delalloc blocks beyond EOF because they
- * will never be written. For blocks within EOF, generic_write_end() zeros them
- * so they are safe to leave alone and be written with all the other valid data.
+ * On failure, we only need to kill delalloc blocks beyond EOF in the range of
+ * this specific write because they will never be written. Previous writes
+ * beyond EOF where block allocation succeeded do not need to be trashed, so
+ * only new blocks from this write should be trashed. For blocks within
+ * EOF, generic_write_end() zeros them so they are safe to leave alone and be
+ * written with all the other valid data.
  */
 STATIC int
 xfs_vm_write_end(
@@ -1620,8 +1676,11 @@ xfs_vm_write_end(
 		loff_t		to = pos + len;
 
 		if (to > isize) {
-			truncate_pagecache(inode, isize);
+			/* only kill blocks in this write beyond EOF */
+			if (pos > isize)
+				isize = pos;
 			xfs_vm_kill_delalloc_range(inode, isize, to);
+			truncate_pagecache_range(inode, isize, to);
 		}
 	}
 	return ret;
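A worked example of the narrowed trim in the write_begin/write_end failure paths (numbers illustrative): with i_size at 100k, suppose an earlier write has already successfully allocated blocks out past EOF, and a new write at pos = 200k, len = 4k then fails. The old truncate_pagecache(inode, 100k) discarded everything beyond EOF, including the earlier write's good blocks; the new code computes start = max(pos, isize) = 200k and calls truncate_pagecache_range(inode, 200k, 204k), so only the failed write's own range is thrown away.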
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 01b6a0102fbd..abda1124a70f 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -213,7 +213,7 @@ xfs_attr_calc_size(
 		 * Out of line attribute, cannot double split, but
 		 * make room for the attribute value itself.
 		 */
-		uint	dblocks = XFS_B_TO_FSB(mp, valuelen);
+		uint	dblocks = xfs_attr3_rmt_blocks(mp, valuelen);
 		nblks += dblocks;
 		nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
 	}
@@ -698,11 +698,22 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 
 		trace_xfs_attr_leaf_replace(args);
 
+		/* save the attribute state for later removal*/
 		args->op_flags |= XFS_DA_OP_RENAME;	/* an atomic rename */
 		args->blkno2 = args->blkno;		/* set 2nd entry info*/
 		args->index2 = args->index;
 		args->rmtblkno2 = args->rmtblkno;
 		args->rmtblkcnt2 = args->rmtblkcnt;
+		args->rmtvaluelen2 = args->rmtvaluelen;
+
+		/*
+		 * clear the remote attr state now that it is saved so that the
+		 * values reflect the state of the attribute we are about to
+		 * add, not the attribute we just found and will remove later.
+		 */
+		args->rmtblkno = 0;
+		args->rmtblkcnt = 0;
+		args->rmtvaluelen = 0;
 	}
 
 	/*
@@ -794,6 +805,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		args->blkno = args->blkno2;
 		args->rmtblkno = args->rmtblkno2;
 		args->rmtblkcnt = args->rmtblkcnt2;
+		args->rmtvaluelen = args->rmtvaluelen2;
 		if (args->rmtblkno) {
 			error = xfs_attr_rmtval_remove(args);
 			if (error)
@@ -999,13 +1011,22 @@ restart:
 
 		trace_xfs_attr_node_replace(args);
 
+		/* save the attribute state for later removal*/
 		args->op_flags |= XFS_DA_OP_RENAME;	/* atomic rename op */
 		args->blkno2 = args->blkno;		/* set 2nd entry info*/
 		args->index2 = args->index;
 		args->rmtblkno2 = args->rmtblkno;
 		args->rmtblkcnt2 = args->rmtblkcnt;
+		args->rmtvaluelen2 = args->rmtvaluelen;
+
+		/*
+		 * clear the remote attr state now that it is saved so that the
+		 * values reflect the state of the attribute we are about to
+		 * add, not the attribute we just found and will remove later.
+		 */
 		args->rmtblkno = 0;
 		args->rmtblkcnt = 0;
+		args->rmtvaluelen = 0;
 	}
 
 	retval = xfs_attr3_leaf_add(blk->bp, state->args);
@@ -1133,6 +1154,7 @@ restart:
 		args->blkno = args->blkno2;
 		args->rmtblkno = args->rmtblkno2;
 		args->rmtblkcnt = args->rmtblkcnt2;
+		args->rmtvaluelen = args->rmtvaluelen2;
 		if (args->rmtblkno) {
 			error = xfs_attr_rmtval_remove(args);
 			if (error)
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 7b126f46a2f9..511c283459b1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -213,8 +213,8 @@ xfs_attr3_leaf_write_verify(
 	struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
 
 	if (!xfs_attr3_leaf_verify(bp)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_verifier_error(bp);
 		return;
 	}
 
@@ -224,7 +224,7 @@ xfs_attr3_leaf_write_verify(
 	if (bip)
 		hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
 
-	xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_ATTR3_LEAF_CRC_OFF);
+	xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
 }
 
 /*
@@ -239,13 +239,14 @@ xfs_attr3_leaf_read_verify(
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 
-	if ((xfs_sb_version_hascrc(&mp->m_sb) &&
-	     !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
-					XFS_ATTR3_LEAF_CRC_OFF)) ||
-	    !xfs_attr3_leaf_verify(bp)) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+	if (xfs_sb_version_hascrc(&mp->m_sb) &&
+	     !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
+		xfs_buf_ioerror(bp, EFSBADCRC);
+	else if (!xfs_attr3_leaf_verify(bp))
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	}
+
+	if (bp->b_error)
+		xfs_verifier_error(bp);
 }
 
 const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
@@ -1228,6 +1229,7 @@ xfs_attr3_leaf_add_work(
 		name_rmt->valueblk = 0;
 		args->rmtblkno = 1;
 		args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+		args->rmtvaluelen = args->valuelen;
 	}
 	xfs_trans_log_buf(args->trans, bp,
 	     XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
@@ -2166,11 +2168,11 @@ xfs_attr3_leaf_lookup_int(
 			if (!xfs_attr_namesp_match(args->flags, entry->flags))
 				continue;
 			args->index = probe;
-			args->valuelen = be32_to_cpu(name_rmt->valuelen);
+			args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
 			args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
 			args->rmtblkcnt = xfs_attr3_rmt_blocks(
 							args->dp->i_mount,
-						       args->valuelen);
+						       args->rmtvaluelen);
 			return XFS_ERROR(EEXIST);
 		}
 	}
@@ -2219,19 +2221,19 @@ xfs_attr3_leaf_getvalue(
 		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
 		ASSERT(name_rmt->namelen == args->namelen);
 		ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
-		valuelen = be32_to_cpu(name_rmt->valuelen);
+		args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
 		args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
 		args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
-						       valuelen);
+						       args->rmtvaluelen);
 		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = valuelen;
+			args->valuelen = args->rmtvaluelen;
 			return 0;
 		}
-		if (args->valuelen < valuelen) {
-			args->valuelen = valuelen;
+		if (args->valuelen < args->rmtvaluelen) {
+			args->valuelen = args->rmtvaluelen;
 			return XFS_ERROR(ERANGE);
 		}
-		args->valuelen = valuelen;
+		args->valuelen = args->rmtvaluelen;
 	}
 	return 0;
 }
@@ -2518,7 +2520,7 @@ xfs_attr3_leaf_clearflag(
 		ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
 		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
 		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
-		name_rmt->valuelen = cpu_to_be32(args->valuelen);
+		name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
 		xfs_trans_log_buf(args->trans, bp,
 			 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
 	}
@@ -2676,7 +2678,7 @@ xfs_attr3_leaf_flipflags(
 		ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
 		name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
 		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
-		name_rmt->valuelen = cpu_to_be32(args->valuelen);
+		name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
 		xfs_trans_log_buf(args->trans, bp1,
 			 XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
 	}
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 01db96f60cf0..833fe5d98d80 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -447,6 +447,7 @@ xfs_attr3_leaf_list_int(
 			args.dp = context->dp;
 			args.whichfork = XFS_ATTR_FORK;
 			args.valuelen = valuelen;
+			args.rmtvaluelen = valuelen;
 			args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
 			args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
 			args.rmtblkcnt = xfs_attr3_rmt_blocks(
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index 5549d69ddb45..d2e6e948cec7 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -125,7 +125,6 @@ xfs_attr3_rmt_read_verify(
 	struct xfs_mount *mp = bp->b_target->bt_mount;
 	char		*ptr;
 	int		len;
-	bool		corrupt = false;
 	xfs_daddr_t	bno;
 
 	/* no verification of non-crc buffers */
@@ -140,11 +139,11 @@ xfs_attr3_rmt_read_verify(
 	while (len > 0) {
 		if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp),
 				      XFS_ATTR3_RMT_CRC_OFF)) {
-			corrupt = true;
+			xfs_buf_ioerror(bp, EFSBADCRC);
 			break;
 		}
 		if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
-			corrupt = true;
+			xfs_buf_ioerror(bp, EFSCORRUPTED);
 			break;
 		}
 		len -= XFS_LBSIZE(mp);
@@ -152,10 +151,9 @@ xfs_attr3_rmt_read_verify(
 		ptr += XFS_LBSIZE(mp);
 		bno += mp->m_bsize;
 	}
-	if (corrupt) {
-		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
-	} else
+	if (bp->b_error)
+		xfs_verifier_error(bp);
+	else
 		ASSERT(len == 0);
 }
 
@@ -180,9 +178,8 @@ xfs_attr3_rmt_write_verify(
 
 	while (len > 0) {
 		if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
-			XFS_CORRUPTION_ERROR(__func__,
-			    XFS_ERRLEVEL_LOW, mp, bp->b_addr);
 			xfs_buf_ioerror(bp, EFSCORRUPTED);
+			xfs_verifier_error(bp);
 			return;
 		}
 		if (bip) {
@@ -340,7 +337,7 @@ xfs_attr_rmtval_get(
 	struct xfs_buf	*bp;
 	xfs_dablk_t	lblkno = args->rmtblkno;
 	__uint8_t	*dst = args->value;
-	int		valuelen = args->valuelen;
+	int		valuelen;
 	int		nmap;
 	int		error;
 	int		blkcnt = args->rmtblkcnt;
@@ -350,7 +347,9 @@ xfs_attr_rmtval_get(
 	trace_xfs_attr_rmtval_get(args);
 
 	ASSERT(!(args->flags & ATTR_KERNOVAL));
+	ASSERT(args->rmtvaluelen == args->valuelen);
 
+	valuelen = args->rmtvaluelen;
 	while (valuelen > 0) {
 		nmap = ATTR_RMTVALUE_MAPSIZE;
 		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
@@ -418,7 +417,7 @@ xfs_attr_rmtval_set(
 	 * attributes have headers, we can't just do a straight byte to FSB
 	 * conversion and have to take the header space into account.
 	 */
-	blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+	blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
 	error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
 						   XFS_ATTR_FORK);
 	if (error)
@@ -483,7 +482,7 @@ xfs_attr_rmtval_set(
 	 */
 	lblkno = args->rmtblkno;
 	blkcnt = args->rmtblkcnt;
-	valuelen = args->valuelen;
+	valuelen = args->rmtvaluelen;
 	while (valuelen > 0) {
 		struct xfs_buf	*bp;
 		xfs_daddr_t	dblkno;
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 152543c4ca70..f0efc7e970ef 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5378,3 +5378,201 @@ error0:
 	}
 	return error;
 }
+
+/*
+ * Shift extent records to the left to cover a hole.
+ *
+ * The maximum number of extents to be shifted in a single operation
+ * is @num_exts, and @current_ext keeps track of the current extent
+ * index we have shifted. @offset_shift_fsb is the length by which each
+ * extent is shifted. If there is no hole to shift the extents
+ * into, this will be considered invalid operation and we abort immediately.
+ */
+int
+xfs_bmap_shift_extents(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			*done,
+	xfs_fileoff_t		start_fsb,
+	xfs_fileoff_t		offset_shift_fsb,
+	xfs_extnum_t		*current_ext,
+	xfs_fsblock_t		*firstblock,
+	struct xfs_bmap_free	*flist,
+	int			num_exts)
+{
+	struct xfs_btree_cur		*cur;
+	struct xfs_bmbt_rec_host	*gotp;
+	struct xfs_bmbt_irec		got;
+	struct xfs_bmbt_irec		left;
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_ifork		*ifp;
+	xfs_extnum_t			nexts = 0;
+	xfs_fileoff_t			startoff;
+	int				error = 0;
+	int				i;
+	int				whichfork = XFS_DATA_FORK;
+	int				logflags;
+	xfs_filblks_t			blockcount = 0;
+	int				total_extents;
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmap_shift_extents",
+				 XFS_ERRLEVEL_LOW, mp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	ASSERT(current_ext != NULL);
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		/* Read in all the extents */
+		error = xfs_iread_extents(tp, ip, whichfork);
+		if (error)
+			return error;
+	}
+
+	/*
+	 * If *current_ext is 0, we would need to lookup the extent
+	 * from where we would start shifting and store it in gotp.
+	 */
+	if (!*current_ext) {
+		gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
+		/*
+		 * gotp can be null in 2 cases: 1) if there are no extents
+		 * or 2) start_fsb lies in a hole beyond which there are
+		 * no extents. Either way, we are done.
+		 */
+		if (!gotp) {
+			*done = 1;
+			return 0;
+		}
+	}
+
+	/* We are going to change core inode */
+	logflags = XFS_ILOG_CORE;
+	if (ifp->if_flags & XFS_IFBROOT) {
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+		cur->bc_private.b.firstblock = *firstblock;
+		cur->bc_private.b.flist = flist;
+		cur->bc_private.b.flags = 0;
+	} else {
+		cur = NULL;
+		logflags |= XFS_ILOG_DEXT;
+	}
+
+	/*
+	 * There may be delalloc extents in the data fork before the range we
+	 * are collapsing out, so we cannot
+	 * use the count of real extents here. Instead we have to calculate it
+	 * from the incore fork.
+	 */
+	total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	while (nexts++ < num_exts && *current_ext < total_extents) {
+
+		gotp = xfs_iext_get_ext(ifp, *current_ext);
+		xfs_bmbt_get_all(gotp, &got);
+		startoff = got.br_startoff - offset_shift_fsb;
+
+		/*
+		 * Before shifting extent into hole, make sure that the hole
+		 * is large enough to accomodate the shift.
+		 */
+		if (*current_ext) {
+			xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+						*current_ext - 1), &left);
+
+			if (startoff < left.br_startoff + left.br_blockcount)
+				error = XFS_ERROR(EINVAL);
+		} else if (offset_shift_fsb > got.br_startoff) {
+			/*
+			 * When first extent is shifted, offset_shift_fsb
+			 * should be less than the stating offset of
+			 * the first extent.
+			 */
+			error = XFS_ERROR(EINVAL);
+		}
+
+		if (error)
+			goto del_cursor;
+
+		if (cur) {
+			error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+						   got.br_startblock,
+						   got.br_blockcount,
+						   &i);
+			if (error)
+				goto del_cursor;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+		}
+
+		/* Check if we can merge 2 adjacent extents */
+		if (*current_ext &&
+		    left.br_startoff + left.br_blockcount == startoff &&
+		    left.br_startblock + left.br_blockcount ==
+				got.br_startblock &&
+		    left.br_state == got.br_state &&
+		    left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
+			blockcount = left.br_blockcount +
+				got.br_blockcount;
+			xfs_iext_remove(ip, *current_ext, 1, 0);
+			if (cur) {
+				error = xfs_btree_delete(cur, &i);
+				if (error)
+					goto del_cursor;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+			}
+			XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+			gotp = xfs_iext_get_ext(ifp, --*current_ext);
+			xfs_bmbt_get_all(gotp, &got);
+
+			/* Make cursor point to the extent we will update */
+			if (cur) {
+				error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+							   got.br_startblock,
+							   got.br_blockcount,
+							   &i);
+				if (error)
+					goto del_cursor;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+			}
+
+			xfs_bmbt_set_blockcount(gotp, blockcount);
+			got.br_blockcount = blockcount;
+		} else {
+			/* We have to update the startoff */
+			xfs_bmbt_set_startoff(gotp, startoff);
+			got.br_startoff = startoff;
+		}
+
+		if (cur) {
+			error = xfs_bmbt_update(cur, got.br_startoff,
+						got.br_startblock,
+						got.br_blockcount,
+						got.br_state);
+			if (error)
+				goto del_cursor;
+		}
+
+		(*current_ext)++;
+		total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	}
+
+	/* Check if we are done */
+	if (*current_ext == total_extents)
+		*done = 1;
+
+del_cursor:
+	if (cur)
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+
+	xfs_trans_log_inode(tp, ip, logflags);
+	return error;
+}
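The core of the shift loop is its merge test: after an extent is moved left by offset_shift_fsb, it is folded into its left neighbour only when the two become logically and physically contiguous, share the same unwritten/normal state, and the combined length still fits one record. A self-contained sketch of that predicate (types simplified; in the kernel these are xfs_bmbt_irec fields and MAXEXTLEN):

struct irec { unsigned long long startoff, startblock, blockcount; int state; };

static int can_merge(const struct irec *left, const struct irec *got,
		     unsigned long long shift, unsigned long long maxextlen)
{
	unsigned long long startoff = got->startoff - shift;	/* post-shift offset */

	return left->startoff + left->blockcount == startoff &&	/* logically adjacent */
	       left->startblock + left->blockcount == got->startblock && /* physically adjacent */
	       left->state == got->state &&				/* same unwritten state */
	       left->blockcount + got->blockcount <= maxextlen;	/* fits one record */
}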
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 33b41f351225..f84bd7af43be 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -127,6 +127,16 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
 	{ BMAP_RIGHT_FILLING,	"RF" }, \
 	{ BMAP_ATTRFORK,	"ATTR" }
 
+
+/*
+ * This macro is used to determine how many extents will be shifted
+ * in one write transaction. We could require two splits,
+ * an extent move on the first and an extent merge on the second,
+ * So it is proper that one extent is shifted inside write transaction
+ * at a time.
+ */
+#define XFS_BMAP_MAX_SHIFT_EXTENTS	1
+
 #ifdef DEBUG
 void	xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
 		int whichfork, unsigned long caller_ip);
@@ -169,5 +179,10 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
 int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
 		xfs_extnum_t num);
 uint	xfs_default_attroffset(struct xfs_inode *ip);
+int	xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+		int *done, xfs_fileoff_t start_fsb,
+		xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
+		xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist,
+		int num_exts);
 
 #endif	/* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 706bc3f777cb..818d546664e7 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -780,12 +780,14 @@ static void
780xfs_bmbt_read_verify( 780xfs_bmbt_read_verify(
781 struct xfs_buf *bp) 781 struct xfs_buf *bp)
782{ 782{
783 if (!(xfs_btree_lblock_verify_crc(bp) && 783 if (!xfs_btree_lblock_verify_crc(bp))
784 xfs_bmbt_verify(bp))) { 784 xfs_buf_ioerror(bp, EFSBADCRC);
785 trace_xfs_btree_corrupt(bp, _RET_IP_); 785 else if (!xfs_bmbt_verify(bp))
786 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
787 bp->b_target->bt_mount, bp->b_addr);
788 xfs_buf_ioerror(bp, EFSCORRUPTED); 786 xfs_buf_ioerror(bp, EFSCORRUPTED);
787
788 if (bp->b_error) {
789 trace_xfs_btree_corrupt(bp, _RET_IP_);
790 xfs_verifier_error(bp);
789 } 791 }
790} 792}
791 793
@@ -794,11 +796,9 @@ xfs_bmbt_write_verify(
794 struct xfs_buf *bp) 796 struct xfs_buf *bp)
795{ 797{
796 if (!xfs_bmbt_verify(bp)) { 798 if (!xfs_bmbt_verify(bp)) {
797 xfs_warn(bp->b_target->bt_mount, "bmbt daddr 0x%llx failed", bp->b_bn);
798 trace_xfs_btree_corrupt(bp, _RET_IP_); 799 trace_xfs_btree_corrupt(bp, _RET_IP_);
799 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
800 bp->b_target->bt_mount, bp->b_addr);
801 xfs_buf_ioerror(bp, EFSCORRUPTED); 800 xfs_buf_ioerror(bp, EFSCORRUPTED);
801 xfs_verifier_error(bp);
802 return; 802 return;
803 } 803 }
804 xfs_btree_lblock_calc_crc(bp); 804 xfs_btree_lblock_calc_crc(bp);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index f264616080ca..296160b8e78c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1349,7 +1349,6 @@ xfs_free_file_space(
1349 * the freeing of the space succeeds at ENOSPC. 1349 * the freeing of the space succeeds at ENOSPC.
1350 */ 1350 */
1351 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); 1351 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1352 tp->t_flags |= XFS_TRANS_RESERVE;
1353 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); 1352 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
1354 1353
1355 /* 1354 /*
@@ -1419,6 +1418,8 @@ xfs_zero_file_space(
1419 xfs_off_t end_boundary; 1418 xfs_off_t end_boundary;
1420 int error; 1419 int error;
1421 1420
1421 trace_xfs_zero_file_space(ip);
1422
1422 granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); 1423 granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
1423 1424
1424 /* 1425 /*
@@ -1433,9 +1434,18 @@ xfs_zero_file_space(
1433 ASSERT(end_boundary <= offset + len); 1434 ASSERT(end_boundary <= offset + len);
1434 1435
1435 if (start_boundary < end_boundary - 1) { 1436 if (start_boundary < end_boundary - 1) {
1436 /* punch out the page cache over the conversion range */ 1437 /*
1438 * punch out delayed allocation blocks and the page cache over
1439 * the conversion range
1440 */
1441 xfs_ilock(ip, XFS_ILOCK_EXCL);
1442 error = xfs_bmap_punch_delalloc_range(ip,
1443 XFS_B_TO_FSBT(mp, start_boundary),
1444 XFS_B_TO_FSB(mp, end_boundary - start_boundary));
1445 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1437 truncate_pagecache_range(VFS_I(ip), start_boundary, 1446 truncate_pagecache_range(VFS_I(ip), start_boundary,
1438 end_boundary - 1); 1447 end_boundary - 1);
1448
1439 /* convert the blocks */ 1449 /* convert the blocks */
1440 error = xfs_alloc_file_space(ip, start_boundary, 1450 error = xfs_alloc_file_space(ip, start_boundary,
1441 end_boundary - start_boundary - 1, 1451 end_boundary - start_boundary - 1,
@@ -1468,6 +1478,102 @@ out:
1468} 1478}
1469 1479
1470/* 1480/*
1481 * xfs_collapse_file_space()
1482 * This routine frees disk space and shifts extents for the given file.
1483 * It first frees the data blocks in the specified range by calling
1484 * xfs_free_file_space(), which also syncs dirty data and invalidates
1485 * the page cache over the region being collapsed. It then shifts the
1486 * extent records to the left to cover the resulting hole.
1487 * RETURNS:
1488 * 0 on success
1489 * errno on error
1490 *
1491 */
1492int
1493xfs_collapse_file_space(
1494 struct xfs_inode *ip,
1495 xfs_off_t offset,
1496 xfs_off_t len)
1497{
1498 int done = 0;
1499 struct xfs_mount *mp = ip->i_mount;
1500 struct xfs_trans *tp;
1501 int error;
1502 xfs_extnum_t current_ext = 0;
1503 struct xfs_bmap_free free_list;
1504 xfs_fsblock_t first_block;
1505 int committed;
1506 xfs_fileoff_t start_fsb;
1507 xfs_fileoff_t shift_fsb;
1508
1509 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1510
1511 trace_xfs_collapse_file_space(ip);
1512
1513 start_fsb = XFS_B_TO_FSB(mp, offset + len);
1514 shift_fsb = XFS_B_TO_FSB(mp, len);
1515
1516 error = xfs_free_file_space(ip, offset, len);
1517 if (error)
1518 return error;
1519
1520 while (!error && !done) {
1521 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1522 tp->t_flags |= XFS_TRANS_RESERVE;
1523 /*
1524		 * We need to reserve a permanent block count for the transaction.
1525		 * This comes into play when, after shifting an extent into the
1526		 * hole, we find that adjacent extents can be merged, which may
1527		 * free a block during the record update.
1528 */
1529 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
1530 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
1531 if (error) {
1532 ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1533 xfs_trans_cancel(tp, 0);
1534 break;
1535 }
1536
1537 xfs_ilock(ip, XFS_ILOCK_EXCL);
1538 error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
1539 ip->i_gdquot, ip->i_pdquot,
1540 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
1541 XFS_QMOPT_RES_REGBLKS);
1542 if (error)
1543 goto out;
1544
1545 xfs_trans_ijoin(tp, ip, 0);
1546
1547 xfs_bmap_init(&free_list, &first_block);
1548
1549 /*
1550		 * We are using the write transaction, which allows a
1551		 * maximum of two bmbt updates.
1552 */
1553 error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
1554 shift_fsb, &current_ext,
1555 &first_block, &free_list,
1556 XFS_BMAP_MAX_SHIFT_EXTENTS);
1557 if (error)
1558 goto out;
1559
1560 error = xfs_bmap_finish(&tp, &free_list, &committed);
1561 if (error)
1562 goto out;
1563
1564 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1565 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1566 }
1567
1568 return error;
1569
1570out:
1571 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1572 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1573 return error;
1574}
1575
1576/*
1471 * We need to check that the format of the data fork in the temporary inode is 1577 * We need to check that the format of the data fork in the temporary inode is
1472 * valid for the target inode before doing the swap. This is not a problem with 1578 * valid for the target inode before doing the swap. This is not a problem with
1473 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized 1579 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
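For reference, this path is reached from userspace through fallocate(2). A hedged sketch, assuming a kernel carrying this patch and a linux/falloc.h that exposes FALLOC_FL_COLLAPSE_RANGE (the file name and sizes are whatever the caller chooses): both offset and len must be multiples of the filesystem block size, and the range must end before EOF, or the call fails with EINVAL.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc != 4) {
		fprintf(stderr, "usage: %s file offset len\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* removes [offset, offset+len) and shifts the tail of the file left */
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE,
		      atoll(argv[2]), atoll(argv[3])) < 0)
		perror("fallocate(FALLOC_FL_COLLAPSE_RANGE)");

	close(fd);
	return 0;
}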
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 900747b25772..935ed2b24edf 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -99,6 +99,8 @@ int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
99 xfs_off_t len); 99 xfs_off_t len);
100int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset, 100int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
101 xfs_off_t len); 101 xfs_off_t len);
102int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
103 xfs_off_t len);
102 104
103/* EOF block manipulation functions */ 105/* EOF block manipulation functions */
104bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); 106bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 9adaae4f3e2f..e80d59fdf89a 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -234,8 +234,7 @@ xfs_btree_lblock_calc_crc(
234 return; 234 return;
235 if (bip) 235 if (bip)
236 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 236 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
237 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 237 xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
238 XFS_BTREE_LBLOCK_CRC_OFF);
239} 238}
240 239
241bool 240bool
@@ -243,8 +242,8 @@ xfs_btree_lblock_verify_crc(
243 struct xfs_buf *bp) 242 struct xfs_buf *bp)
244{ 243{
245 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) 244 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
246 return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 245 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
247 XFS_BTREE_LBLOCK_CRC_OFF); 246
248 return true; 247 return true;
249} 248}
250 249
@@ -267,8 +266,7 @@ xfs_btree_sblock_calc_crc(
267 return; 266 return;
268 if (bip) 267 if (bip)
269 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 268 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
270 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 269 xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
271 XFS_BTREE_SBLOCK_CRC_OFF);
272} 270}
273 271
274bool 272bool
@@ -276,8 +274,8 @@ xfs_btree_sblock_verify_crc(
276 struct xfs_buf *bp) 274 struct xfs_buf *bp)
277{ 275{
278 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) 276 if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
279 return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 277 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
280 XFS_BTREE_SBLOCK_CRC_OFF); 278
281 return true; 279 return true;
282} 280}
283 281
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 9c061ef2b0d9..cb10a0aaab3a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -396,7 +396,17 @@ _xfs_buf_map_pages(
396 bp->b_addr = NULL; 396 bp->b_addr = NULL;
397 } else { 397 } else {
398 int retried = 0; 398 int retried = 0;
399 unsigned noio_flag;
399 400
401 /*
 402		 * vm_map_ram() will allocate auxiliary structures (e.g.
403 * pagetables) with GFP_KERNEL, yet we are likely to be under
404 * GFP_NOFS context here. Hence we need to tell memory reclaim
405 * that we are in such a context via PF_MEMALLOC_NOIO to prevent
406 * memory reclaim re-entering the filesystem here and
407 * potentially deadlocking.
408 */
409 noio_flag = memalloc_noio_save();
400 do { 410 do {
401 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 411 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
402 -1, PAGE_KERNEL); 412 -1, PAGE_KERNEL);
@@ -404,6 +414,7 @@ _xfs_buf_map_pages(
404 break; 414 break;
405 vm_unmap_aliases(); 415 vm_unmap_aliases();
406 } while (retried++ <= 1); 416 } while (retried++ <= 1);
417 memalloc_noio_restore(noio_flag);
407 418
408 if (!bp->b_addr) 419 if (!bp->b_addr)
409 return -ENOMEM; 420 return -ENOMEM;
@@ -1361,21 +1372,29 @@ xfs_buf_iorequest(
1361 xfs_buf_wait_unpin(bp); 1372 xfs_buf_wait_unpin(bp);
1362 xfs_buf_hold(bp); 1373 xfs_buf_hold(bp);
1363 1374
1364 /* Set the count to 1 initially, this will stop an I/O 1375 /*
1376 * Set the count to 1 initially, this will stop an I/O
1365 * completion callout which happens before we have started 1377 * completion callout which happens before we have started
1366 * all the I/O from calling xfs_buf_ioend too early. 1378 * all the I/O from calling xfs_buf_ioend too early.
1367 */ 1379 */
1368 atomic_set(&bp->b_io_remaining, 1); 1380 atomic_set(&bp->b_io_remaining, 1);
1369 _xfs_buf_ioapply(bp); 1381 _xfs_buf_ioapply(bp);
1370 _xfs_buf_ioend(bp, 1); 1382 /*
1383 * If _xfs_buf_ioapply failed, we'll get back here with
1384 * only the reference we took above. _xfs_buf_ioend will
1385 * drop it to zero, so we'd better not queue it for later,
1386 * or we'll free it before it's done.
1387 */
1388 _xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
1371 1389
1372 xfs_buf_rele(bp); 1390 xfs_buf_rele(bp);
1373} 1391}
1374 1392
1375/* 1393/*
1376 * Waits for I/O to complete on the buffer supplied. It returns immediately if 1394 * Waits for I/O to complete on the buffer supplied. It returns immediately if
1377 * no I/O is pending or there is already a pending error on the buffer. It 1395 * no I/O is pending or there is already a pending error on the buffer, in which
1378 * returns the I/O error code, if any, or 0 if there was no error. 1396 * case nothing will ever complete. It returns the I/O error code, if any, or
1397 * 0 if there was no error.
1379 */ 1398 */
1380int 1399int
1381xfs_buf_iowait( 1400xfs_buf_iowait(
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 995339534db6..b8a3abf6cf47 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -369,6 +369,20 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
369 xfs_buf_rele(bp); 369 xfs_buf_rele(bp);
370} 370}
371 371
372static inline int
373xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
374{
375 return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
376 cksum_offset);
377}
378
379static inline void
380xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
381{
382 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
383 cksum_offset);
384}
385
372/* 386/*
373 * Handling of buftargs. 387 * Handling of buftargs.
374 */ 388 */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 33149113e333..8752821443be 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -796,20 +796,6 @@ xfs_buf_item_init(
796 bip->bli_formats[i].blf_map_size = map_size; 796 bip->bli_formats[i].blf_map_size = map_size;
797 } 797 }
798 798
799#ifdef XFS_TRANS_DEBUG
800 /*
801 * Allocate the arrays for tracking what needs to be logged
802 * and what our callers request to be logged. bli_orig
803 * holds a copy of the original, clean buffer for comparison
804 * against, and bli_logged keeps a 1 bit flag per byte in
805 * the buffer to indicate which bytes the callers have asked
806 * to have logged.
807 */
808 bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP);
809 memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length));
810 bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP);
811#endif
812
813 /* 799 /*
814 * Put the buf item into the list of items attached to the 800 * Put the buf item into the list of items attached to the
815 * buffer at the front. 801 * buffer at the front.
@@ -957,11 +943,6 @@ STATIC void
957xfs_buf_item_free( 943xfs_buf_item_free(
958 xfs_buf_log_item_t *bip) 944 xfs_buf_log_item_t *bip)
959{ 945{
960#ifdef XFS_TRANS_DEBUG
961 kmem_free(bip->bli_orig);
962 kmem_free(bip->bli_logged);
963#endif /* XFS_TRANS_DEBUG */
964
965 xfs_buf_item_free_format(bip); 946 xfs_buf_item_free_format(bip);
966 kmem_zone_free(xfs_buf_item_zone, bip); 947 kmem_zone_free(xfs_buf_item_zone, bip);
967} 948}
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 796272a2e129..6cc5f6785a77 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -185,8 +185,8 @@ xfs_da3_node_write_verify(
185 struct xfs_da3_node_hdr *hdr3 = bp->b_addr; 185 struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
186 186
187 if (!xfs_da3_node_verify(bp)) { 187 if (!xfs_da3_node_verify(bp)) {
188 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
189 xfs_buf_ioerror(bp, EFSCORRUPTED); 188 xfs_buf_ioerror(bp, EFSCORRUPTED);
189 xfs_verifier_error(bp);
190 return; 190 return;
191 } 191 }
192 192
@@ -196,7 +196,7 @@ xfs_da3_node_write_verify(
196 if (bip) 196 if (bip)
197 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); 197 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
198 198
199 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DA3_NODE_CRC_OFF); 199 xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
200} 200}
201 201
202/* 202/*
@@ -209,18 +209,20 @@ static void
209xfs_da3_node_read_verify( 209xfs_da3_node_read_verify(
210 struct xfs_buf *bp) 210 struct xfs_buf *bp)
211{ 211{
212 struct xfs_mount *mp = bp->b_target->bt_mount;
213 struct xfs_da_blkinfo *info = bp->b_addr; 212 struct xfs_da_blkinfo *info = bp->b_addr;
214 213
215 switch (be16_to_cpu(info->magic)) { 214 switch (be16_to_cpu(info->magic)) {
216 case XFS_DA3_NODE_MAGIC: 215 case XFS_DA3_NODE_MAGIC:
217 if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 216 if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
218 XFS_DA3_NODE_CRC_OFF)) 217 xfs_buf_ioerror(bp, EFSBADCRC);
219 break; 218 break;
219 }
220 /* fall through */ 220 /* fall through */
221 case XFS_DA_NODE_MAGIC: 221 case XFS_DA_NODE_MAGIC:
222 if (!xfs_da3_node_verify(bp)) 222 if (!xfs_da3_node_verify(bp)) {
223 xfs_buf_ioerror(bp, EFSCORRUPTED);
223 break; 224 break;
225 }
224 return; 226 return;
225 case XFS_ATTR_LEAF_MAGIC: 227 case XFS_ATTR_LEAF_MAGIC:
226 case XFS_ATTR3_LEAF_MAGIC: 228 case XFS_ATTR3_LEAF_MAGIC:
@@ -237,8 +239,7 @@ xfs_da3_node_read_verify(
237 } 239 }
238 240
239 /* corrupt block */ 241 /* corrupt block */
240 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); 242 xfs_verifier_error(bp);
241 xfs_buf_ioerror(bp, EFSCORRUPTED);
242} 243}
243 244
244const struct xfs_buf_ops xfs_da3_node_buf_ops = { 245const struct xfs_buf_ops xfs_da3_node_buf_ops = {
@@ -1295,7 +1296,7 @@ xfs_da3_fixhashpath(
1295 node = blk->bp->b_addr; 1296 node = blk->bp->b_addr;
1296 dp->d_ops->node_hdr_from_disk(&nodehdr, node); 1297 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1297 btree = dp->d_ops->node_tree_p(node); 1298 btree = dp->d_ops->node_tree_p(node);
1298 if (be32_to_cpu(btree->hashval) == lasthash) 1299 if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
1299 break; 1300 break;
1300 blk->hashval = lasthash; 1301 blk->hashval = lasthash;
1301 btree[blk->index].hashval = cpu_to_be32(lasthash); 1302 btree[blk->index].hashval = cpu_to_be32(lasthash);
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 6e95ea79f5d7..201c6091d26a 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -60,10 +60,12 @@ typedef struct xfs_da_args {
60 int index; /* index of attr of interest in blk */ 60 int index; /* index of attr of interest in blk */
61 xfs_dablk_t rmtblkno; /* remote attr value starting blkno */ 61 xfs_dablk_t rmtblkno; /* remote attr value starting blkno */
62 int rmtblkcnt; /* remote attr value block count */ 62 int rmtblkcnt; /* remote attr value block count */
63 int rmtvaluelen; /* remote attr value length in bytes */
63 xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */ 64 xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */
64 int index2; /* index of 2nd attr in blk */ 65 int index2; /* index of 2nd attr in blk */
65 xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ 66 xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
66 int rmtblkcnt2; /* remote attr value block count */ 67 int rmtblkcnt2; /* remote attr value block count */
68 int rmtvaluelen2; /* remote attr value length in bytes */
67 int op_flags; /* operation flags */ 69 int op_flags; /* operation flags */
68 enum xfs_dacmp cmpresult; /* name compare result for lookups */ 70 enum xfs_dacmp cmpresult; /* name compare result for lookups */
69} xfs_da_args_t; 71} xfs_da_args_t;
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index e5869b50dc41..623bbe8fd921 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -89,6 +89,8 @@ typedef struct xfs_dinode {
89 /* structure must be padded to 64 bit alignment */ 89 /* structure must be padded to 64 bit alignment */
90} xfs_dinode_t; 90} xfs_dinode_t;
91 91
92#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc)
93
92#define DI_MAX_FLUSH 0xffff 94#define DI_MAX_FLUSH 0xffff
93 95
94/* 96/*
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index ce16ef02997a..fda46253966a 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -180,16 +180,23 @@ xfs_dir_init(
180 xfs_inode_t *dp, 180 xfs_inode_t *dp,
181 xfs_inode_t *pdp) 181 xfs_inode_t *pdp)
182{ 182{
183 xfs_da_args_t args; 183 struct xfs_da_args *args;
184 int error; 184 int error;
185 185
186 memset((char *)&args, 0, sizeof(args));
187 args.dp = dp;
188 args.trans = tp;
189 ASSERT(S_ISDIR(dp->i_d.di_mode)); 186 ASSERT(S_ISDIR(dp->i_d.di_mode));
190 if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) 187 error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
188 if (error)
191 return error; 189 return error;
192 return xfs_dir2_sf_create(&args, pdp->i_ino); 190
191 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
192 if (!args)
193 return ENOMEM;
194
195 args->dp = dp;
196 args->trans = tp;
197 error = xfs_dir2_sf_create(args, pdp->i_ino);
198 kmem_free(args);
199 return error;
193} 200}
194 201
195/* 202/*
@@ -205,41 +212,56 @@ xfs_dir_createname(
205 xfs_bmap_free_t *flist, /* bmap's freeblock list */ 212 xfs_bmap_free_t *flist, /* bmap's freeblock list */
206 xfs_extlen_t total) /* bmap's total block count */ 213 xfs_extlen_t total) /* bmap's total block count */
207{ 214{
208 xfs_da_args_t args; 215 struct xfs_da_args *args;
209 int rval; 216 int rval;
210 int v; /* type-checking value */ 217 int v; /* type-checking value */
211 218
212 ASSERT(S_ISDIR(dp->i_d.di_mode)); 219 ASSERT(S_ISDIR(dp->i_d.di_mode));
213 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) 220 rval = xfs_dir_ino_validate(tp->t_mountp, inum);
221 if (rval)
214 return rval; 222 return rval;
215 XFS_STATS_INC(xs_dir_create); 223 XFS_STATS_INC(xs_dir_create);
216 224
217 memset(&args, 0, sizeof(xfs_da_args_t)); 225 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
218 args.name = name->name; 226 if (!args)
219 args.namelen = name->len; 227 return ENOMEM;
220 args.filetype = name->type; 228
221 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 229 args->name = name->name;
222 args.inumber = inum; 230 args->namelen = name->len;
223 args.dp = dp; 231 args->filetype = name->type;
224 args.firstblock = first; 232 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
225 args.flist = flist; 233 args->inumber = inum;
226 args.total = total; 234 args->dp = dp;
227 args.whichfork = XFS_DATA_FORK; 235 args->firstblock = first;
228 args.trans = tp; 236 args->flist = flist;
229 args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; 237 args->total = total;
230 238 args->whichfork = XFS_DATA_FORK;
231 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 239 args->trans = tp;
232 rval = xfs_dir2_sf_addname(&args); 240 args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
233 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 241
234 return rval; 242 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
235 else if (v) 243 rval = xfs_dir2_sf_addname(args);
236 rval = xfs_dir2_block_addname(&args); 244 goto out_free;
237 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 245 }
238 return rval; 246
239 else if (v) 247 rval = xfs_dir2_isblock(tp, dp, &v);
240 rval = xfs_dir2_leaf_addname(&args); 248 if (rval)
249 goto out_free;
250 if (v) {
251 rval = xfs_dir2_block_addname(args);
252 goto out_free;
253 }
254
255 rval = xfs_dir2_isleaf(tp, dp, &v);
256 if (rval)
257 goto out_free;
258 if (v)
259 rval = xfs_dir2_leaf_addname(args);
241 else 260 else
242 rval = xfs_dir2_node_addname(&args); 261 rval = xfs_dir2_node_addname(args);
262
263out_free:
264 kmem_free(args);
243 return rval; 265 return rval;
244} 266}
245 267
@@ -282,46 +304,66 @@ xfs_dir_lookup(
282 xfs_ino_t *inum, /* out: inode number */ 304 xfs_ino_t *inum, /* out: inode number */
283 struct xfs_name *ci_name) /* out: actual name if CI match */ 305 struct xfs_name *ci_name) /* out: actual name if CI match */
284{ 306{
285 xfs_da_args_t args; 307 struct xfs_da_args *args;
286 int rval; 308 int rval;
287 int v; /* type-checking value */ 309 int v; /* type-checking value */
288 310
289 ASSERT(S_ISDIR(dp->i_d.di_mode)); 311 ASSERT(S_ISDIR(dp->i_d.di_mode));
290 XFS_STATS_INC(xs_dir_lookup); 312 XFS_STATS_INC(xs_dir_lookup);
291 313
292 memset(&args, 0, sizeof(xfs_da_args_t)); 314 /*
293 args.name = name->name; 315 * We need to use KM_NOFS here so that lockdep will not throw false
294 args.namelen = name->len; 316 * positive deadlock warnings on a non-transactional lookup path. It is
 295	args.filetype = name->type;	 317	 * safe to recurse into inode reclaim in that case, but lockdep can't
 296	args.hashval = dp->i_mount->m_dirnameops->hashname(name);	 318	 * easily be taught about it. Hence KM_NOFS avoids having to
 297	args.dp = dp;	 319	 * add a bunch of lockdep class
298 args.whichfork = XFS_DATA_FORK; 320 * annotations into the reclaim path for the ilock.
299 args.trans = tp; 321 */
300 args.op_flags = XFS_DA_OP_OKNOENT; 322 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
323 args->name = name->name;
324 args->namelen = name->len;
325 args->filetype = name->type;
326 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
327 args->dp = dp;
328 args->whichfork = XFS_DATA_FORK;
329 args->trans = tp;
330 args->op_flags = XFS_DA_OP_OKNOENT;
301 if (ci_name) 331 if (ci_name)
302 args.op_flags |= XFS_DA_OP_CILOOKUP; 332 args->op_flags |= XFS_DA_OP_CILOOKUP;
303 333
304 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 334 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
305 rval = xfs_dir2_sf_lookup(&args); 335 rval = xfs_dir2_sf_lookup(args);
306 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 336 goto out_check_rval;
307 return rval; 337 }
308 else if (v) 338
309 rval = xfs_dir2_block_lookup(&args); 339 rval = xfs_dir2_isblock(tp, dp, &v);
310 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 340 if (rval)
311 return rval; 341 goto out_free;
312 else if (v) 342 if (v) {
313 rval = xfs_dir2_leaf_lookup(&args); 343 rval = xfs_dir2_block_lookup(args);
344 goto out_check_rval;
345 }
346
347 rval = xfs_dir2_isleaf(tp, dp, &v);
348 if (rval)
349 goto out_free;
350 if (v)
351 rval = xfs_dir2_leaf_lookup(args);
314 else 352 else
315 rval = xfs_dir2_node_lookup(&args); 353 rval = xfs_dir2_node_lookup(args);
354
355out_check_rval:
316 if (rval == EEXIST) 356 if (rval == EEXIST)
317 rval = 0; 357 rval = 0;
318 if (!rval) { 358 if (!rval) {
319 *inum = args.inumber; 359 *inum = args->inumber;
320 if (ci_name) { 360 if (ci_name) {
321 ci_name->name = args.value; 361 ci_name->name = args->value;
322 ci_name->len = args.valuelen; 362 ci_name->len = args->valuelen;
323 } 363 }
324 } 364 }
365out_free:
366 kmem_free(args);
325 return rval; 367 return rval;
326} 368}
327 369
@@ -338,38 +380,51 @@ xfs_dir_removename(
338 xfs_bmap_free_t *flist, /* bmap's freeblock list */ 380 xfs_bmap_free_t *flist, /* bmap's freeblock list */
339 xfs_extlen_t total) /* bmap's total block count */ 381 xfs_extlen_t total) /* bmap's total block count */
340{ 382{
341 xfs_da_args_t args; 383 struct xfs_da_args *args;
342 int rval; 384 int rval;
343 int v; /* type-checking value */ 385 int v; /* type-checking value */
344 386
345 ASSERT(S_ISDIR(dp->i_d.di_mode)); 387 ASSERT(S_ISDIR(dp->i_d.di_mode));
346 XFS_STATS_INC(xs_dir_remove); 388 XFS_STATS_INC(xs_dir_remove);
347 389
348 memset(&args, 0, sizeof(xfs_da_args_t)); 390 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
349 args.name = name->name; 391 if (!args)
350 args.namelen = name->len; 392 return ENOMEM;
351 args.filetype = name->type; 393
352 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 394 args->name = name->name;
353 args.inumber = ino; 395 args->namelen = name->len;
354 args.dp = dp; 396 args->filetype = name->type;
355 args.firstblock = first; 397 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
356 args.flist = flist; 398 args->inumber = ino;
357 args.total = total; 399 args->dp = dp;
358 args.whichfork = XFS_DATA_FORK; 400 args->firstblock = first;
359 args.trans = tp; 401 args->flist = flist;
360 402 args->total = total;
361 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 403 args->whichfork = XFS_DATA_FORK;
362 rval = xfs_dir2_sf_removename(&args); 404 args->trans = tp;
363 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 405
364 return rval; 406 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
365 else if (v) 407 rval = xfs_dir2_sf_removename(args);
366 rval = xfs_dir2_block_removename(&args); 408 goto out_free;
367 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 409 }
368 return rval; 410
369 else if (v) 411 rval = xfs_dir2_isblock(tp, dp, &v);
370 rval = xfs_dir2_leaf_removename(&args); 412 if (rval)
413 goto out_free;
414 if (v) {
415 rval = xfs_dir2_block_removename(args);
416 goto out_free;
417 }
418
419 rval = xfs_dir2_isleaf(tp, dp, &v);
420 if (rval)
421 goto out_free;
422 if (v)
423 rval = xfs_dir2_leaf_removename(args);
371 else 424 else
372 rval = xfs_dir2_node_removename(&args); 425 rval = xfs_dir2_node_removename(args);
426out_free:
427 kmem_free(args);
373 return rval; 428 return rval;
374} 429}
375 430
@@ -386,40 +441,54 @@ xfs_dir_replace(
386 xfs_bmap_free_t *flist, /* bmap's freeblock list */ 441 xfs_bmap_free_t *flist, /* bmap's freeblock list */
387 xfs_extlen_t total) /* bmap's total block count */ 442 xfs_extlen_t total) /* bmap's total block count */
388{ 443{
389 xfs_da_args_t args; 444 struct xfs_da_args *args;
390 int rval; 445 int rval;
391 int v; /* type-checking value */ 446 int v; /* type-checking value */
392 447
393 ASSERT(S_ISDIR(dp->i_d.di_mode)); 448 ASSERT(S_ISDIR(dp->i_d.di_mode));
394 449
395 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) 450 rval = xfs_dir_ino_validate(tp->t_mountp, inum);
451 if (rval)
396 return rval; 452 return rval;
397 453
398 memset(&args, 0, sizeof(xfs_da_args_t)); 454 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
399 args.name = name->name; 455 if (!args)
400 args.namelen = name->len; 456 return ENOMEM;
401 args.filetype = name->type; 457
402 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 458 args->name = name->name;
403 args.inumber = inum; 459 args->namelen = name->len;
404 args.dp = dp; 460 args->filetype = name->type;
405 args.firstblock = first; 461 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
406 args.flist = flist; 462 args->inumber = inum;
407 args.total = total; 463 args->dp = dp;
408 args.whichfork = XFS_DATA_FORK; 464 args->firstblock = first;
409 args.trans = tp; 465 args->flist = flist;
410 466 args->total = total;
411 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 467 args->whichfork = XFS_DATA_FORK;
412 rval = xfs_dir2_sf_replace(&args); 468 args->trans = tp;
413 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 469
414 return rval; 470 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
415 else if (v) 471 rval = xfs_dir2_sf_replace(args);
416 rval = xfs_dir2_block_replace(&args); 472 goto out_free;
417 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 473 }
418 return rval; 474
419 else if (v) 475 rval = xfs_dir2_isblock(tp, dp, &v);
420 rval = xfs_dir2_leaf_replace(&args); 476 if (rval)
477 goto out_free;
478 if (v) {
479 rval = xfs_dir2_block_replace(args);
480 goto out_free;
481 }
482
483 rval = xfs_dir2_isleaf(tp, dp, &v);
484 if (rval)
485 goto out_free;
486 if (v)
487 rval = xfs_dir2_leaf_replace(args);
421 else 488 else
422 rval = xfs_dir2_node_replace(&args); 489 rval = xfs_dir2_node_replace(args);
490out_free:
491 kmem_free(args);
423 return rval; 492 return rval;
424} 493}
425 494
@@ -434,7 +503,7 @@ xfs_dir_canenter(
434 struct xfs_name *name, /* name of entry to add */ 503 struct xfs_name *name, /* name of entry to add */
435 uint resblks) 504 uint resblks)
436{ 505{
437 xfs_da_args_t args; 506 struct xfs_da_args *args;
438 int rval; 507 int rval;
439 int v; /* type-checking value */ 508 int v; /* type-checking value */
440 509
@@ -443,29 +512,42 @@ xfs_dir_canenter(
443 512
444 ASSERT(S_ISDIR(dp->i_d.di_mode)); 513 ASSERT(S_ISDIR(dp->i_d.di_mode));
445 514
446 memset(&args, 0, sizeof(xfs_da_args_t)); 515 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
447 args.name = name->name; 516 if (!args)
448 args.namelen = name->len; 517 return ENOMEM;
449 args.filetype = name->type; 518
450 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 519 args->name = name->name;
451 args.dp = dp; 520 args->namelen = name->len;
452 args.whichfork = XFS_DATA_FORK; 521 args->filetype = name->type;
453 args.trans = tp; 522 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
454 args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | 523 args->dp = dp;
524 args->whichfork = XFS_DATA_FORK;
525 args->trans = tp;
526 args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
455 XFS_DA_OP_OKNOENT; 527 XFS_DA_OP_OKNOENT;
456 528
457 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 529 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
458 rval = xfs_dir2_sf_addname(&args); 530 rval = xfs_dir2_sf_addname(args);
459 else if ((rval = xfs_dir2_isblock(tp, dp, &v))) 531 goto out_free;
460 return rval; 532 }
461 else if (v) 533
462 rval = xfs_dir2_block_addname(&args); 534 rval = xfs_dir2_isblock(tp, dp, &v);
463 else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) 535 if (rval)
464 return rval; 536 goto out_free;
465 else if (v) 537 if (v) {
466 rval = xfs_dir2_leaf_addname(&args); 538 rval = xfs_dir2_block_addname(args);
539 goto out_free;
540 }
541
542 rval = xfs_dir2_isleaf(tp, dp, &v);
543 if (rval)
544 goto out_free;
545 if (v)
546 rval = xfs_dir2_leaf_addname(args);
467 else 547 else
468 rval = xfs_dir2_node_addname(&args); 548 rval = xfs_dir2_node_addname(args);
549out_free:
550 kmem_free(args);
469 return rval; 551 return rval;
470} 552}
471 553
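All five xfs_dir_* entry points above undergo the same conversion. A distilled sketch of the shared pattern (xfs_dir_foo and xfs_dir2_foo_op are placeholder names, not real functions): struct xfs_da_args is too large to keep on the kernel stack in deep directory call chains, so it is allocated with kmem_zalloc() instead, and every exit path funnels through a single kmem_free().

int
xfs_dir_foo(
	struct xfs_trans	*tp,
	struct xfs_inode	*dp)
{
	struct xfs_da_args	*args;
	int			rval;

	args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
	if (!args)
		return ENOMEM;

	args->dp = dp;
	args->trans = tp;
	/* ... fill in the remaining fields, dispatch on directory format ... */
	rval = xfs_dir2_foo_op(args);

	kmem_free(args);
	return rval;
}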
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 90cdbf4b5f19..4f6a38cb83a4 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -89,13 +89,14 @@ xfs_dir3_block_read_verify(
89{ 89{
90 struct xfs_mount *mp = bp->b_target->bt_mount; 90 struct xfs_mount *mp = bp->b_target->bt_mount;
91 91
92 if ((xfs_sb_version_hascrc(&mp->m_sb) && 92 if (xfs_sb_version_hascrc(&mp->m_sb) &&
93 !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 93 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
94 XFS_DIR3_DATA_CRC_OFF)) || 94 xfs_buf_ioerror(bp, EFSBADCRC);
95 !xfs_dir3_block_verify(bp)) { 95 else if (!xfs_dir3_block_verify(bp))
96 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
97 xfs_buf_ioerror(bp, EFSCORRUPTED); 96 xfs_buf_ioerror(bp, EFSCORRUPTED);
98 } 97
98 if (bp->b_error)
99 xfs_verifier_error(bp);
99} 100}
100 101
101static void 102static void
@@ -107,8 +108,8 @@ xfs_dir3_block_write_verify(
107 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 108 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
108 109
109 if (!xfs_dir3_block_verify(bp)) { 110 if (!xfs_dir3_block_verify(bp)) {
110 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
111 xfs_buf_ioerror(bp, EFSCORRUPTED); 111 xfs_buf_ioerror(bp, EFSCORRUPTED);
112 xfs_verifier_error(bp);
112 return; 113 return;
113 } 114 }
114 115
@@ -118,7 +119,7 @@ xfs_dir3_block_write_verify(
118 if (bip) 119 if (bip)
119 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); 120 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
120 121
121 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); 122 xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
122} 123}
123 124
124const struct xfs_buf_ops xfs_dir3_block_buf_ops = { 125const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 70acff4ee173..afa4ad523f3f 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -241,7 +241,6 @@ static void
241xfs_dir3_data_reada_verify( 241xfs_dir3_data_reada_verify(
242 struct xfs_buf *bp) 242 struct xfs_buf *bp)
243{ 243{
244 struct xfs_mount *mp = bp->b_target->bt_mount;
245 struct xfs_dir2_data_hdr *hdr = bp->b_addr; 244 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
246 245
247 switch (hdr->magic) { 246 switch (hdr->magic) {
@@ -255,8 +254,8 @@ xfs_dir3_data_reada_verify(
255 xfs_dir3_data_verify(bp); 254 xfs_dir3_data_verify(bp);
256 return; 255 return;
257 default: 256 default:
258 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
259 xfs_buf_ioerror(bp, EFSCORRUPTED); 257 xfs_buf_ioerror(bp, EFSCORRUPTED);
258 xfs_verifier_error(bp);
260 break; 259 break;
261 } 260 }
262} 261}
@@ -267,13 +266,14 @@ xfs_dir3_data_read_verify(
267{ 266{
268 struct xfs_mount *mp = bp->b_target->bt_mount; 267 struct xfs_mount *mp = bp->b_target->bt_mount;
269 268
270 if ((xfs_sb_version_hascrc(&mp->m_sb) && 269 if (xfs_sb_version_hascrc(&mp->m_sb) &&
271 !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 270 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
272 XFS_DIR3_DATA_CRC_OFF)) || 271 xfs_buf_ioerror(bp, EFSBADCRC);
273 !xfs_dir3_data_verify(bp)) { 272 else if (!xfs_dir3_data_verify(bp))
274 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
275 xfs_buf_ioerror(bp, EFSCORRUPTED); 273 xfs_buf_ioerror(bp, EFSCORRUPTED);
276 } 274
275 if (bp->b_error)
276 xfs_verifier_error(bp);
277} 277}
278 278
279static void 279static void
@@ -285,8 +285,8 @@ xfs_dir3_data_write_verify(
285 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 285 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
286 286
287 if (!xfs_dir3_data_verify(bp)) { 287 if (!xfs_dir3_data_verify(bp)) {
288 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
289 xfs_buf_ioerror(bp, EFSCORRUPTED); 288 xfs_buf_ioerror(bp, EFSCORRUPTED);
289 xfs_verifier_error(bp);
290 return; 290 return;
291 } 291 }
292 292
@@ -296,7 +296,7 @@ xfs_dir3_data_write_verify(
296 if (bip) 296 if (bip)
297 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); 297 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
298 298
299 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF); 299 xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
300} 300}
301 301
302const struct xfs_buf_ops xfs_dir3_data_buf_ops = { 302const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index ae47ec6e16c4..d36e97df1187 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -179,13 +179,14 @@ __read_verify(
179{ 179{
180 struct xfs_mount *mp = bp->b_target->bt_mount; 180 struct xfs_mount *mp = bp->b_target->bt_mount;
181 181
182 if ((xfs_sb_version_hascrc(&mp->m_sb) && 182 if (xfs_sb_version_hascrc(&mp->m_sb) &&
183 !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 183 !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
184 XFS_DIR3_LEAF_CRC_OFF)) || 184 xfs_buf_ioerror(bp, EFSBADCRC);
185 !xfs_dir3_leaf_verify(bp, magic)) { 185 else if (!xfs_dir3_leaf_verify(bp, magic))
186 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
187 xfs_buf_ioerror(bp, EFSCORRUPTED); 186 xfs_buf_ioerror(bp, EFSCORRUPTED);
188 } 187
188 if (bp->b_error)
189 xfs_verifier_error(bp);
189} 190}
190 191
191static void 192static void
@@ -198,8 +199,8 @@ __write_verify(
198 struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; 199 struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
199 200
200 if (!xfs_dir3_leaf_verify(bp, magic)) { 201 if (!xfs_dir3_leaf_verify(bp, magic)) {
201 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
202 xfs_buf_ioerror(bp, EFSCORRUPTED); 202 xfs_buf_ioerror(bp, EFSCORRUPTED);
203 xfs_verifier_error(bp);
203 return; 204 return;
204 } 205 }
205 206
@@ -209,7 +210,7 @@ __write_verify(
209 if (bip) 210 if (bip)
210 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); 211 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
211 212
212 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_LEAF_CRC_OFF); 213 xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
213} 214}
214 215
215static void 216static void
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 48c7d18f68c3..cb434d732681 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -115,13 +115,14 @@ xfs_dir3_free_read_verify(
115{ 115{
116 struct xfs_mount *mp = bp->b_target->bt_mount; 116 struct xfs_mount *mp = bp->b_target->bt_mount;
117 117
118 if ((xfs_sb_version_hascrc(&mp->m_sb) && 118 if (xfs_sb_version_hascrc(&mp->m_sb) &&
119 !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 119 !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
120 XFS_DIR3_FREE_CRC_OFF)) || 120 xfs_buf_ioerror(bp, EFSBADCRC);
121 !xfs_dir3_free_verify(bp)) { 121 else if (!xfs_dir3_free_verify(bp))
122 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
123 xfs_buf_ioerror(bp, EFSCORRUPTED); 122 xfs_buf_ioerror(bp, EFSCORRUPTED);
124 } 123
124 if (bp->b_error)
125 xfs_verifier_error(bp);
125} 126}
126 127
127static void 128static void
@@ -133,8 +134,8 @@ xfs_dir3_free_write_verify(
133 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 134 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
134 135
135 if (!xfs_dir3_free_verify(bp)) { 136 if (!xfs_dir3_free_verify(bp)) {
136 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
137 xfs_buf_ioerror(bp, EFSCORRUPTED); 137 xfs_buf_ioerror(bp, EFSCORRUPTED);
138 xfs_verifier_error(bp);
138 return; 139 return;
139 } 140 }
140 141
@@ -144,7 +145,7 @@ xfs_dir3_free_write_verify(
144 if (bip) 145 if (bip)
145 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); 146 hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
146 147
147 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_FREE_CRC_OFF); 148 xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF);
148} 149}
149 150
150const struct xfs_buf_ops xfs_dir3_free_buf_ops = { 151const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 7aeb4c895b32..868b19f096bf 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -615,7 +615,7 @@ xfs_qm_dqread(
615 615
616 if (flags & XFS_QMOPT_DQALLOC) { 616 if (flags & XFS_QMOPT_DQALLOC) {
617 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); 617 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
618 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_attrsetm, 618 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc,
619 XFS_QM_DQALLOC_SPACE_RES(mp), 0); 619 XFS_QM_DQALLOC_SPACE_RES(mp), 0);
620 if (error) 620 if (error)
621 goto error1; 621 goto error1;
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c
index d401457d2f25..610da8177737 100644
--- a/fs/xfs/xfs_dquot_buf.c
+++ b/fs/xfs/xfs_dquot_buf.c
@@ -257,10 +257,13 @@ xfs_dquot_buf_read_verify(
257{ 257{
258 struct xfs_mount *mp = bp->b_target->bt_mount; 258 struct xfs_mount *mp = bp->b_target->bt_mount;
259 259
260 if (!xfs_dquot_buf_verify_crc(mp, bp) || !xfs_dquot_buf_verify(mp, bp)) { 260 if (!xfs_dquot_buf_verify_crc(mp, bp))
261 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); 261 xfs_buf_ioerror(bp, EFSBADCRC);
262 else if (!xfs_dquot_buf_verify(mp, bp))
262 xfs_buf_ioerror(bp, EFSCORRUPTED); 263 xfs_buf_ioerror(bp, EFSCORRUPTED);
263 } 264
265 if (bp->b_error)
266 xfs_verifier_error(bp);
264} 267}
265 268
266/* 269/*
@@ -275,8 +278,8 @@ xfs_dquot_buf_write_verify(
275 struct xfs_mount *mp = bp->b_target->bt_mount; 278 struct xfs_mount *mp = bp->b_target->bt_mount;
276 279
277 if (!xfs_dquot_buf_verify(mp, bp)) { 280 if (!xfs_dquot_buf_verify(mp, bp)) {
278 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
279 xfs_buf_ioerror(bp, EFSCORRUPTED); 281 xfs_buf_ioerror(bp, EFSCORRUPTED);
282 xfs_verifier_error(bp);
280 return; 283 return;
281 } 284 }
282} 285}
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 9995b807d627..edac5b057d28 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -156,7 +156,7 @@ xfs_error_report(
156{ 156{
157 if (level <= xfs_error_level) { 157 if (level <= xfs_error_level) {
158 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, 158 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
159 "Internal error %s at line %d of file %s. Caller 0x%p", 159 "Internal error %s at line %d of file %s. Caller %pF",
160 tag, linenum, filename, ra); 160 tag, linenum, filename, ra);
161 161
162 xfs_stack_trace(); 162 xfs_stack_trace();
@@ -178,3 +178,28 @@ xfs_corruption_error(
178 xfs_error_report(tag, level, mp, filename, linenum, ra); 178 xfs_error_report(tag, level, mp, filename, linenum, ra);
179 xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); 179 xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
180} 180}
181
182/*
183 * Warnings specifically for verifier errors. Differentiate CRC vs. invalid
184 * values, and omit the stack trace unless the error level is tuned high.
185 */
186void
187xfs_verifier_error(
188 struct xfs_buf *bp)
189{
190 struct xfs_mount *mp = bp->b_target->bt_mount;
191
192 xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
193 bp->b_error == EFSBADCRC ? "CRC error" : "corruption",
194 __return_address, bp->b_bn);
195
196 xfs_alert(mp, "Unmount and run xfs_repair");
197
198 if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
199 xfs_alert(mp, "First 64 bytes of corrupted metadata buffer:");
200 xfs_hex_dump(xfs_buf_offset(bp, 0), 64);
201 }
202
203 if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
204 xfs_stack_trace();
205}
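Every read verifier touched by this patch converges on the same shape. A distilled template (xfs_foo_verify and XFS_FOO_CRC_OFF are placeholders for the per-structure verifier and CRC offset): a CRC mismatch is reported as EFSBADCRC, a structural failure as EFSCORRUPTED, and xfs_verifier_error() then emits one uniform diagnostic for whichever error was set.

static void
xfs_foo_read_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_target->bt_mount;

	if (xfs_sb_version_hascrc(&mp->m_sb) &&
	    !xfs_buf_verify_cksum(bp, XFS_FOO_CRC_OFF))
		xfs_buf_ioerror(bp, EFSBADCRC);
	else if (!xfs_foo_verify(bp))
		xfs_buf_ioerror(bp, EFSCORRUPTED);

	if (bp->b_error)
		xfs_verifier_error(bp);
}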
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 079a367f44ee..c1c57d4a4b5d 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -34,6 +34,7 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
34extern void xfs_corruption_error(const char *tag, int level, 34extern void xfs_corruption_error(const char *tag, int level,
35 struct xfs_mount *mp, void *p, const char *filename, 35 struct xfs_mount *mp, void *p, const char *filename,
36 int linenum, inst_t *ra); 36 int linenum, inst_t *ra);
37extern void xfs_verifier_error(struct xfs_buf *bp);
37 38
38#define XFS_ERROR_REPORT(e, lvl, mp) \ 39#define XFS_ERROR_REPORT(e, lvl, mp) \
39 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) 40 xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 1399e187d425..753e467aa1a5 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata(
237 237
238 if (!lsn) 238 if (!lsn)
239 return 0; 239 return 0;
240 return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); 240 return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
241} 241}
242 242
243const struct export_operations xfs_export_operations = { 243const struct export_operations xfs_export_operations = {
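The added negation reflects the sign convention at this boundary: XFS internals return positive errnos while the VFS expects negative ones, so values are flipped exactly once where the two meet. A minimal illustration (both function names are hypothetical):

#include <errno.h>

static int xfs_internal_op(void)	/* stand-in: returns 0 or a positive errno */
{
	return EIO;
}

static int vfs_facing_op(void)
{
	return -xfs_internal_op();	/* 0 stays 0, EIO becomes -EIO */
}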
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 64b48eade91d..830c1c937b88 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -155,7 +155,7 @@ xfs_dir_fsync(
155 155
156 if (!lsn) 156 if (!lsn)
157 return 0; 157 return 0;
158 return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); 158 return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
159} 159}
160 160
161STATIC int 161STATIC int
@@ -295,7 +295,7 @@ xfs_file_aio_read(
295 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); 295 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
296 296
297 if (inode->i_mapping->nrpages) { 297 if (inode->i_mapping->nrpages) {
298 ret = -filemap_write_and_wait_range( 298 ret = filemap_write_and_wait_range(
299 VFS_I(ip)->i_mapping, 299 VFS_I(ip)->i_mapping,
300 pos, -1); 300 pos, -1);
301 if (ret) { 301 if (ret) {
@@ -679,7 +679,7 @@ xfs_file_dio_aio_write(
679 goto out; 679 goto out;
680 680
681 if (mapping->nrpages) { 681 if (mapping->nrpages) {
682 ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 682 ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
683 pos, -1); 683 pos, -1);
684 if (ret) 684 if (ret)
685 goto out; 685 goto out;
@@ -699,7 +699,7 @@ xfs_file_dio_aio_write(
699 699
700 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); 700 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
701 ret = generic_file_direct_write(iocb, iovp, 701 ret = generic_file_direct_write(iocb, iovp,
702 &nr_segs, pos, &iocb->ki_pos, count, ocount); 702 &nr_segs, pos, count, ocount);
703 703
704out: 704out:
705 xfs_rw_iunlock(ip, iolock); 705 xfs_rw_iunlock(ip, iolock);
@@ -715,7 +715,7 @@ xfs_file_buffered_aio_write(
715 const struct iovec *iovp, 715 const struct iovec *iovp,
716 unsigned long nr_segs, 716 unsigned long nr_segs,
717 loff_t pos, 717 loff_t pos,
718 size_t ocount) 718 size_t count)
719{ 719{
720 struct file *file = iocb->ki_filp; 720 struct file *file = iocb->ki_filp;
721 struct address_space *mapping = file->f_mapping; 721 struct address_space *mapping = file->f_mapping;
@@ -724,7 +724,7 @@ xfs_file_buffered_aio_write(
724 ssize_t ret; 724 ssize_t ret;
725 int enospc = 0; 725 int enospc = 0;
726 int iolock = XFS_IOLOCK_EXCL; 726 int iolock = XFS_IOLOCK_EXCL;
727 size_t count = ocount; 727 struct iov_iter from;
728 728
729 xfs_rw_ilock(ip, iolock); 729 xfs_rw_ilock(ip, iolock);
730 730
@@ -732,14 +732,15 @@ xfs_file_buffered_aio_write(
732 if (ret) 732 if (ret)
733 goto out; 733 goto out;
734 734
735 iov_iter_init(&from, iovp, nr_segs, count, 0);
735 /* We can write back this queue in page reclaim */ 736 /* We can write back this queue in page reclaim */
736 current->backing_dev_info = mapping->backing_dev_info; 737 current->backing_dev_info = mapping->backing_dev_info;
737 738
738write_retry: 739write_retry:
739 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); 740 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
740 ret = generic_file_buffered_write(iocb, iovp, nr_segs, 741 ret = generic_perform_write(file, &from, pos);
741 pos, &iocb->ki_pos, count, 0); 742 if (likely(ret >= 0))
742 743 iocb->ki_pos = pos + ret;
743 /* 744 /*
744 * If we just got an ENOSPC, try to write back all dirty inodes to 745 * If we just got an ENOSPC, try to write back all dirty inodes to
745 * convert delalloc space to free up some of the excess reserved 746 * convert delalloc space to free up some of the excess reserved
@@ -823,7 +824,8 @@ xfs_file_fallocate(
823 824
824 if (!S_ISREG(inode->i_mode)) 825 if (!S_ISREG(inode->i_mode))
825 return -EINVAL; 826 return -EINVAL;
826 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 827 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
828 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
827 return -EOPNOTSUPP; 829 return -EOPNOTSUPP;
828 830
829 xfs_ilock(ip, XFS_IOLOCK_EXCL); 831 xfs_ilock(ip, XFS_IOLOCK_EXCL);
@@ -831,6 +833,28 @@ xfs_file_fallocate(
831 error = xfs_free_file_space(ip, offset, len); 833 error = xfs_free_file_space(ip, offset, len);
832 if (error) 834 if (error)
833 goto out_unlock; 835 goto out_unlock;
836 } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
837 unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
838
839 if (offset & blksize_mask || len & blksize_mask) {
840 error = EINVAL;
841 goto out_unlock;
842 }
843
844 /*
 845		 * The collapse range may not reach or extend beyond EOF;
 846		 * such a range would effectively be a truncate operation
847 */
848 if (offset + len >= i_size_read(inode)) {
849 error = EINVAL;
850 goto out_unlock;
851 }
852
853 new_size = i_size_read(inode) - len;
854
855 error = xfs_collapse_file_space(ip, offset, len);
856 if (error)
857 goto out_unlock;
834 } else { 858 } else {
835 if (!(mode & FALLOC_FL_KEEP_SIZE) && 859 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
836 offset + len > i_size_read(inode)) { 860 offset + len > i_size_read(inode)) {
@@ -840,8 +864,11 @@ xfs_file_fallocate(
840 goto out_unlock; 864 goto out_unlock;
841 } 865 }
842 866
843 error = xfs_alloc_file_space(ip, offset, len, 867 if (mode & FALLOC_FL_ZERO_RANGE)
844 XFS_BMAPI_PREALLOC); 868 error = xfs_zero_file_space(ip, offset, len);
869 else
870 error = xfs_alloc_file_space(ip, offset, len,
871 XFS_BMAPI_PREALLOC);
845 if (error) 872 if (error)
846 goto out_unlock; 873 goto out_unlock;
847 } 874 }
@@ -859,7 +886,7 @@ xfs_file_fallocate(
859 if (ip->i_d.di_mode & S_IXGRP) 886 if (ip->i_d.di_mode & S_IXGRP)
860 ip->i_d.di_mode &= ~S_ISGID; 887 ip->i_d.di_mode &= ~S_ISGID;
861 888
862 if (!(mode & FALLOC_FL_PUNCH_HOLE)) 889 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)))
863 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; 890 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
864 891
865 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 892 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -1465,6 +1492,7 @@ const struct file_operations xfs_dir_file_operations = {
1465 1492
1466static const struct vm_operations_struct xfs_file_vm_ops = { 1493static const struct vm_operations_struct xfs_file_vm_ops = {
1467 .fault = filemap_fault, 1494 .fault = filemap_fault,
1495 .map_pages = filemap_map_pages,
1468 .page_mkwrite = xfs_vm_page_mkwrite, 1496 .page_mkwrite = xfs_vm_page_mkwrite,
1469 .remap_pages = generic_file_remap_pages, 1497 .remap_pages = generic_file_remap_pages,
1470}; 1498};
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h
index b6ab5a3cfa12..9898f31d05d8 100644
--- a/fs/xfs/xfs_format.h
+++ b/fs/xfs/xfs_format.h
@@ -145,6 +145,8 @@ struct xfs_dsymlink_hdr {
145 __be64 sl_lsn; 145 __be64 sl_lsn;
146}; 146};
147 147
148#define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc)
149
148/* 150/*
149 * The maximum pathlen is 1024 bytes. Since the minimum file system 151 * The maximum pathlen is 1024 bytes. Since the minimum file system
150 * blocksize is 512 bytes, we can get a max of 3 extents back from 152 * blocksize is 512 bytes, we can get a max of 3 extents back from
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 5d7f105a1c82..8f711db61a0c 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -363,6 +363,18 @@ xfs_ialloc_ag_alloc(
363 args.minleft = args.mp->m_in_maxlevels - 1; 363 args.minleft = args.mp->m_in_maxlevels - 1;
364 if ((error = xfs_alloc_vextent(&args))) 364 if ((error = xfs_alloc_vextent(&args)))
365 return error; 365 return error;
366
367 /*
368 * This request might have dirtied the transaction if the AG can
369 * satisfy the request, but the exact block was not available.
370 * If the allocation did fail, subsequent requests will relax
371 * the exact agbno requirement and increase the alignment
372 * instead. It is critical that the total size of the request
373 * (len + alignment + slop) does not increase from this point
374 * on, so reset minalignslop to ensure it is not included in
375 * subsequent requests.
376 */
377 args.minalignslop = 0;
366 } else 378 } else
367 args.fsbno = NULLFSBLOCK; 379 args.fsbno = NULLFSBLOCK;
368 380
@@ -1568,18 +1580,17 @@ xfs_agi_read_verify(
1568 struct xfs_buf *bp) 1580 struct xfs_buf *bp)
1569{ 1581{
1570 struct xfs_mount *mp = bp->b_target->bt_mount; 1582 struct xfs_mount *mp = bp->b_target->bt_mount;
1571 int agi_ok = 1;
1572
1573 if (xfs_sb_version_hascrc(&mp->m_sb))
1574 agi_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
1575 offsetof(struct xfs_agi, agi_crc));
1576 agi_ok = agi_ok && xfs_agi_verify(bp);
1577 1583
1578 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, 1584 if (xfs_sb_version_hascrc(&mp->m_sb) &&
1579 XFS_RANDOM_IALLOC_READ_AGI))) { 1585 !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
1580 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); 1586 xfs_buf_ioerror(bp, EFSBADCRC);
1587 else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
1588 XFS_ERRTAG_IALLOC_READ_AGI,
1589 XFS_RANDOM_IALLOC_READ_AGI))
1581 xfs_buf_ioerror(bp, EFSCORRUPTED); 1590 xfs_buf_ioerror(bp, EFSCORRUPTED);
1582 } 1591
1592 if (bp->b_error)
1593 xfs_verifier_error(bp);
1583} 1594}
1584 1595
1585static void 1596static void
@@ -1590,8 +1601,8 @@ xfs_agi_write_verify(
1590 struct xfs_buf_log_item *bip = bp->b_fspriv; 1601 struct xfs_buf_log_item *bip = bp->b_fspriv;
1591 1602
1592 if (!xfs_agi_verify(bp)) { 1603 if (!xfs_agi_verify(bp)) {
1593 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
1594 xfs_buf_ioerror(bp, EFSCORRUPTED); 1604 xfs_buf_ioerror(bp, EFSCORRUPTED);
1605 xfs_verifier_error(bp);
1595 return; 1606 return;
1596 } 1607 }
1597 1608
@@ -1600,8 +1611,7 @@ xfs_agi_write_verify(
1600 1611
1601 if (bip) 1612 if (bip)
1602 XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); 1613 XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
1603 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 1614 xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
1604 offsetof(struct xfs_agi, agi_crc));
1605} 1615}
1606 1616
1607const struct xfs_buf_ops xfs_agi_buf_ops = { 1617const struct xfs_buf_ops xfs_agi_buf_ops = {
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index c8fa5bbb36de..7e309b11e87d 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -243,12 +243,14 @@ static void
243xfs_inobt_read_verify( 243xfs_inobt_read_verify(
244 struct xfs_buf *bp) 244 struct xfs_buf *bp)
245{ 245{
246 if (!(xfs_btree_sblock_verify_crc(bp) && 246 if (!xfs_btree_sblock_verify_crc(bp))
247 xfs_inobt_verify(bp))) { 247 xfs_buf_ioerror(bp, EFSBADCRC);
248 trace_xfs_btree_corrupt(bp, _RET_IP_); 248 else if (!xfs_inobt_verify(bp))
249 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
250 bp->b_target->bt_mount, bp->b_addr);
251 xfs_buf_ioerror(bp, EFSCORRUPTED); 249 xfs_buf_ioerror(bp, EFSCORRUPTED);
250
251 if (bp->b_error) {
252 trace_xfs_btree_corrupt(bp, _RET_IP_);
253 xfs_verifier_error(bp);
252 } 254 }
253} 255}
254 256
@@ -258,9 +260,9 @@ xfs_inobt_write_verify(
258{ 260{
259 if (!xfs_inobt_verify(bp)) { 261 if (!xfs_inobt_verify(bp)) {
260 trace_xfs_btree_corrupt(bp, _RET_IP_); 262 trace_xfs_btree_corrupt(bp, _RET_IP_);
261 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
262 bp->b_target->bt_mount, bp->b_addr);
263 xfs_buf_ioerror(bp, EFSCORRUPTED); 263 xfs_buf_ioerror(bp, EFSCORRUPTED);
264 xfs_verifier_error(bp);
265 return;
264 } 266 }
265 xfs_btree_sblock_calc_crc(bp); 267 xfs_btree_sblock_calc_crc(bp);
266 268
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3a137e9f9a7d..768087bedbac 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -42,7 +42,6 @@
42#include "xfs_bmap_util.h" 42#include "xfs_bmap_util.h"
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_quota.h" 44#include "xfs_quota.h"
45#include "xfs_dinode.h"
46#include "xfs_filestream.h" 45#include "xfs_filestream.h"
47#include "xfs_cksum.h" 46#include "xfs_cksum.h"
48#include "xfs_trace.h" 47#include "xfs_trace.h"
@@ -62,6 +61,8 @@ kmem_zone_t *xfs_inode_zone;
62 61
63STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); 62STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
64 63
64STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *);
65
65/* 66/*
66 * helper function to extract extent size hint from inode 67 * helper function to extract extent size hint from inode
67 */ 68 */
@@ -1115,7 +1116,7 @@ xfs_bumplink(
1115{ 1116{
1116 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); 1117 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1117 1118
1118 ASSERT(ip->i_d.di_nlink > 0); 1119 ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE));
1119 ip->i_d.di_nlink++; 1120 ip->i_d.di_nlink++;
1120 inc_nlink(VFS_I(ip)); 1121 inc_nlink(VFS_I(ip));
1121 if ((ip->i_d.di_version == 1) && 1122 if ((ip->i_d.di_version == 1) &&
@@ -1165,10 +1166,7 @@ xfs_create(
1165 if (XFS_FORCED_SHUTDOWN(mp)) 1166 if (XFS_FORCED_SHUTDOWN(mp))
1166 return XFS_ERROR(EIO); 1167 return XFS_ERROR(EIO);
1167 1168
1168 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) 1169 prid = xfs_get_initial_prid(dp);
1169 prid = xfs_get_projid(dp);
1170 else
1171 prid = XFS_PROJID_DEFAULT;
1172 1170
1173 /* 1171 /*
1174 * Make sure that we have allocated dquot(s) on disk. 1172 * Make sure that we have allocated dquot(s) on disk.
@@ -1333,6 +1331,114 @@ xfs_create(
1333} 1331}
1334 1332
1335int 1333int
1334xfs_create_tmpfile(
1335 struct xfs_inode *dp,
1336 struct dentry *dentry,
1337 umode_t mode,
1338 struct xfs_inode **ipp)
1339{
1340 struct xfs_mount *mp = dp->i_mount;
1341 struct xfs_inode *ip = NULL;
1342 struct xfs_trans *tp = NULL;
1343 int error;
1344 uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1345 prid_t prid;
1346 struct xfs_dquot *udqp = NULL;
1347 struct xfs_dquot *gdqp = NULL;
1348 struct xfs_dquot *pdqp = NULL;
1349 struct xfs_trans_res *tres;
1350 uint resblks;
1351
1352 if (XFS_FORCED_SHUTDOWN(mp))
1353 return XFS_ERROR(EIO);
1354
1355 prid = xfs_get_initial_prid(dp);
1356
1357 /*
1358 * Make sure that we have allocated dquot(s) on disk.
1359 */
1360 error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
1361 xfs_kgid_to_gid(current_fsgid()), prid,
1362 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
1363 &udqp, &gdqp, &pdqp);
1364 if (error)
1365 return error;
1366
1367 resblks = XFS_IALLOC_SPACE_RES(mp);
1368 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE);
1369
1370 tres = &M_RES(mp)->tr_create_tmpfile;
1371 error = xfs_trans_reserve(tp, tres, resblks, 0);
1372 if (error == ENOSPC) {
1373 /* No space at all so try a "no-allocation" reservation */
1374 resblks = 0;
1375 error = xfs_trans_reserve(tp, tres, 0, 0);
1376 }
1377 if (error) {
1378 cancel_flags = 0;
1379 goto out_trans_cancel;
1380 }
1381
1382 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
1383 pdqp, resblks, 1, 0);
1384 if (error)
1385 goto out_trans_cancel;
1386
1387 error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
1388 prid, resblks > 0, &ip, NULL);
1389 if (error) {
1390 if (error == ENOSPC)
1391 goto out_trans_cancel;
1392 goto out_trans_abort;
1393 }
1394
1395 if (mp->m_flags & XFS_MOUNT_WSYNC)
1396 xfs_trans_set_sync(tp);
1397
1398 /*
1399 * Attach the dquot(s) to the inodes and modify them incore.
1400 * These ids of the inode couldn't have changed since the new
1401 * inode has been locked ever since it was created.
1402 */
1403 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1404
1405 ip->i_d.di_nlink--;
1406 error = xfs_iunlink(tp, ip);
1407 if (error)
1408 goto out_trans_abort;
1409
1410 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1411 if (error)
1412 goto out_release_inode;
1413
1414 xfs_qm_dqrele(udqp);
1415 xfs_qm_dqrele(gdqp);
1416 xfs_qm_dqrele(pdqp);
1417
1418 *ipp = ip;
1419 return 0;
1420
1421 out_trans_abort:
1422 cancel_flags |= XFS_TRANS_ABORT;
1423 out_trans_cancel:
1424 xfs_trans_cancel(tp, cancel_flags);
1425 out_release_inode:
1426 /*
1427 * Wait until after the current transaction is aborted to
1428 * release the inode. This prevents recursive transactions
1429 * and deadlocks from xfs_inactive.
1430 */
1431 if (ip)
1432 IRELE(ip);
1433
1434 xfs_qm_dqrele(udqp);
1435 xfs_qm_dqrele(gdqp);
1436 xfs_qm_dqrele(pdqp);
1437
1438 return error;
1439}
1440
1441int
1336xfs_link( 1442xfs_link(
1337 xfs_inode_t *tdp, 1443 xfs_inode_t *tdp,
1338 xfs_inode_t *sip, 1444 xfs_inode_t *sip,
@@ -1397,6 +1503,12 @@ xfs_link(
1397 1503
1398 xfs_bmap_init(&free_list, &first_block); 1504 xfs_bmap_init(&free_list, &first_block);
1399 1505
1506 if (sip->i_d.di_nlink == 0) {
1507 error = xfs_iunlink_remove(tp, sip);
1508 if (error)
1509 goto abort_return;
1510 }
1511
1400 error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, 1512 error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1401 &first_block, &free_list, resblks); 1513 &first_block, &free_list, resblks);
1402 if (error) 1514 if (error)
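
xfs_create_tmpfile() and the new xfs_iunlink_remove() call in xfs_link() exist to back the VFS O_TMPFILE path: the inode is born with no directory entry and parked on the AGI unlinked list, and a later link must first pull it off that list (the di_nlink == 0 case above). A userspace view of the lifecycle this enables, assuming /tmp sits on a filesystem with .tmpfile support:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char path[64];
    int fd = open("/tmp", O_TMPFILE | O_WRONLY, 0600);
    if (fd < 0) { perror("O_TMPFILE"); return 1; }

    if (write(fd, "data\n", 5) != 5) { perror("write"); close(fd); return 1; }

    /* linking the unnamed inode is what drives xfs_link() down the new
     * xfs_iunlink_remove() path */
    snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
    if (linkat(AT_FDCWD, path, AT_FDCWD, "/tmp/now-visible",
               AT_SYMLINK_FOLLOW) < 0)
        perror("linkat");

    close(fd);
    return 0;
}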
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 65e2350f449c..f2fcde52b66d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -20,6 +20,7 @@
20 20
21#include "xfs_inode_buf.h" 21#include "xfs_inode_buf.h"
22#include "xfs_inode_fork.h" 22#include "xfs_inode_fork.h"
23#include "xfs_dinode.h"
23 24
24/* 25/*
25 * Kernel only inode definitions 26 * Kernel only inode definitions
@@ -192,6 +193,15 @@ xfs_set_projid(struct xfs_inode *ip,
192 ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff); 193 ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
193} 194}
194 195
196static inline prid_t
197xfs_get_initial_prid(struct xfs_inode *dp)
198{
199 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
200 return xfs_get_projid(dp);
201
202 return XFS_PROJID_DEFAULT;
203}
204
195/* 205/*
196 * In-core inode flags. 206 * In-core inode flags.
197 */ 207 */
@@ -323,6 +333,8 @@ int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
323 struct xfs_inode **ipp, struct xfs_name *ci_name); 333 struct xfs_inode **ipp, struct xfs_name *ci_name);
324int xfs_create(struct xfs_inode *dp, struct xfs_name *name, 334int xfs_create(struct xfs_inode *dp, struct xfs_name *name,
325 umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); 335 umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
336int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry,
337 umode_t mode, struct xfs_inode **ipp);
326int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, 338int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
327 struct xfs_inode *ip); 339 struct xfs_inode *ip);
328int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, 340int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
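
The new xfs_get_initial_prid() helper centralises the project-ID inheritance check that xfs_create(), xfs_create_tmpfile() and xfs_symlink() previously open-coded. A toy model of the behaviour; the flag value and types here are for illustration only:

#include <stdint.h>
#include <stdio.h>

#define XFS_DIFLAG_PROJINHERIT 0x200  /* illustrative value */
#define XFS_PROJID_DEFAULT     0

struct inode { uint16_t flags; uint32_t projid; };

static uint32_t get_initial_prid(const struct inode *dp)
{
    /* children inherit the parent's project ID only when the parent
     * directory carries PROJINHERIT */
    if (dp->flags & XFS_DIFLAG_PROJINHERIT)
        return dp->projid;
    return XFS_PROJID_DEFAULT;
}

int main(void)
{
    struct inode dir = { .flags = XFS_DIFLAG_PROJINHERIT, .projid = 42 };
    printf("child prid = %u\n", get_initial_prid(&dir)); /* prints 42 */
    return 0;
}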
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
index 4fc9f39dd89e..24e993996bdc 100644
--- a/fs/xfs/xfs_inode_buf.c
+++ b/fs/xfs/xfs_inode_buf.c
@@ -102,8 +102,7 @@ xfs_inode_buf_verify(
102 } 102 }
103 103
104 xfs_buf_ioerror(bp, EFSCORRUPTED); 104 xfs_buf_ioerror(bp, EFSCORRUPTED);
105 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, 105 xfs_verifier_error(bp);
106 mp, dip);
107#ifdef DEBUG 106#ifdef DEBUG
108 xfs_alert(mp, 107 xfs_alert(mp,
109 "bad inode magic/vsn daddr %lld #%d (magic=%x)", 108 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
@@ -306,7 +305,7 @@ xfs_dinode_verify(
306 if (!xfs_sb_version_hascrc(&mp->m_sb)) 305 if (!xfs_sb_version_hascrc(&mp->m_sb))
307 return false; 306 return false;
308 if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, 307 if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
309 offsetof(struct xfs_dinode, di_crc))) 308 XFS_DINODE_CRC_OFF))
310 return false; 309 return false;
311 if (be64_to_cpu(dip->di_ino) != ip->i_ino) 310 if (be64_to_cpu(dip->di_ino) != ip->i_ino)
312 return false; 311 return false;
@@ -327,7 +326,7 @@ xfs_dinode_calc_crc(
327 326
328 ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); 327 ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
329 crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, 328 crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
330 offsetof(struct xfs_dinode, di_crc)); 329 XFS_DINODE_CRC_OFF);
331 dip->di_crc = xfs_end_cksum(crc); 330 dip->di_crc = xfs_end_cksum(crc);
332} 331}
333 332
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index bcfe61202115..0b18776b075e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -271,32 +271,6 @@ xfs_open_by_handle(
271 return error; 271 return error;
272} 272}
273 273
274/*
275 * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's
276 * unused first argument.
277 */
278STATIC int
279do_readlink(
280 char __user *buffer,
281 int buflen,
282 const char *link)
283{
284 int len;
285
286 len = PTR_ERR(link);
287 if (IS_ERR(link))
288 goto out;
289
290 len = strlen(link);
291 if (len > (unsigned) buflen)
292 len = buflen;
293 if (copy_to_user(buffer, link, len))
294 len = -EFAULT;
295 out:
296 return len;
297}
298
299
300int 274int
301xfs_readlink_by_handle( 275xfs_readlink_by_handle(
302 struct file *parfilp, 276 struct file *parfilp,
@@ -334,7 +308,7 @@ xfs_readlink_by_handle(
334 error = -xfs_readlink(XFS_I(dentry->d_inode), link); 308 error = -xfs_readlink(XFS_I(dentry->d_inode), link);
335 if (error) 309 if (error)
336 goto out_kfree; 310 goto out_kfree;
337 error = do_readlink(hreq->ohandle, olen, link); 311 error = readlink_copy(hreq->ohandle, olen, link);
338 if (error) 312 if (error)
339 goto out_kfree; 313 goto out_kfree;
340 314
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 22d1cbea283d..3b80ebae05f5 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -128,7 +128,6 @@ xfs_iomap_write_direct(
128 xfs_fsblock_t firstfsb; 128 xfs_fsblock_t firstfsb;
129 xfs_extlen_t extsz, temp; 129 xfs_extlen_t extsz, temp;
130 int nimaps; 130 int nimaps;
131 int bmapi_flag;
132 int quota_flag; 131 int quota_flag;
133 int rt; 132 int rt;
134 xfs_trans_t *tp; 133 xfs_trans_t *tp;
@@ -200,18 +199,15 @@ xfs_iomap_write_direct(
200 199
201 xfs_trans_ijoin(tp, ip, 0); 200 xfs_trans_ijoin(tp, ip, 0);
202 201
203 bmapi_flag = 0;
204 if (offset < XFS_ISIZE(ip) || extsz)
205 bmapi_flag |= XFS_BMAPI_PREALLOC;
206
207 /* 202 /*
208 * From this point onwards we overwrite the imap pointer that the 203 * From this point onwards we overwrite the imap pointer that the
209 * caller gave to us. 204 * caller gave to us.
210 */ 205 */
211 xfs_bmap_init(&free_list, &firstfsb); 206 xfs_bmap_init(&free_list, &firstfsb);
212 nimaps = 1; 207 nimaps = 1;
213 error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag, 208 error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
214 &firstfsb, 0, imap, &nimaps, &free_list); 209 XFS_BMAPI_PREALLOC, &firstfsb, 0,
210 imap, &nimaps, &free_list);
215 if (error) 211 if (error)
216 goto out_bmap_cancel; 212 goto out_bmap_cancel;
217 213
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 9ddfb8190ca1..36d630319a27 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -39,6 +39,7 @@
39#include "xfs_da_btree.h" 39#include "xfs_da_btree.h"
40#include "xfs_dir2_priv.h" 40#include "xfs_dir2_priv.h"
41#include "xfs_dinode.h" 41#include "xfs_dinode.h"
42#include "xfs_trans_space.h"
42 43
43#include <linux/capability.h> 44#include <linux/capability.h>
44#include <linux/xattr.h> 45#include <linux/xattr.h>
@@ -48,6 +49,18 @@
48#include <linux/fiemap.h> 49#include <linux/fiemap.h>
49#include <linux/slab.h> 50#include <linux/slab.h>
50 51
52/*
53 * Directories have different lock order w.r.t. mmap_sem compared to regular
54 * files. This is due to readdir potentially triggering page faults on a user
55 * buffer inside filldir(), and this happens with the ilock on the directory
56 * held. For regular files, the lock order is the other way around - the
57 * mmap_sem is taken during the page fault, and then we lock the ilock to do
58 * block mapping. Hence we need a different class for the directory ilock so
59 * that lockdep can tell them apart.
60 */
61static struct lock_class_key xfs_nondir_ilock_class;
62static struct lock_class_key xfs_dir_ilock_class;
63
51static int 64static int
52xfs_initxattrs( 65xfs_initxattrs(
53 struct inode *inode, 66 struct inode *inode,
@@ -59,8 +72,8 @@ xfs_initxattrs(
59 int error = 0; 72 int error = 0;
60 73
61 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 74 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
62 error = xfs_attr_set(ip, xattr->name, xattr->value, 75 error = -xfs_attr_set(ip, xattr->name, xattr->value,
63 xattr->value_len, ATTR_SECURE); 76 xattr->value_len, ATTR_SECURE);
64 if (error < 0) 77 if (error < 0)
65 break; 78 break;
66 } 79 }
@@ -80,8 +93,8 @@ xfs_init_security(
80 struct inode *dir, 93 struct inode *dir,
81 const struct qstr *qstr) 94 const struct qstr *qstr)
82{ 95{
83 return security_inode_init_security(inode, dir, qstr, 96 return -security_inode_init_security(inode, dir, qstr,
84 &xfs_initxattrs, NULL); 97 &xfs_initxattrs, NULL);
85} 98}
86 99
87static void 100static void
@@ -111,15 +124,15 @@ xfs_cleanup_inode(
111 xfs_dentry_to_name(&teardown, dentry, 0); 124 xfs_dentry_to_name(&teardown, dentry, 0);
112 125
113 xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); 126 xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
114 iput(inode);
115} 127}
116 128
117STATIC int 129STATIC int
118xfs_vn_mknod( 130xfs_generic_create(
119 struct inode *dir, 131 struct inode *dir,
120 struct dentry *dentry, 132 struct dentry *dentry,
121 umode_t mode, 133 umode_t mode,
122 dev_t rdev) 134 dev_t rdev,
135 bool tmpfile) /* unnamed file */
123{ 136{
124 struct inode *inode; 137 struct inode *inode;
125 struct xfs_inode *ip = NULL; 138 struct xfs_inode *ip = NULL;
@@ -143,8 +156,12 @@ xfs_vn_mknod(
143 if (error) 156 if (error)
144 return error; 157 return error;
145 158
146 xfs_dentry_to_name(&name, dentry, mode); 159 if (!tmpfile) {
147 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); 160 xfs_dentry_to_name(&name, dentry, mode);
161 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
162 } else {
163 error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip);
164 }
148 if (unlikely(error)) 165 if (unlikely(error))
149 goto out_free_acl; 166 goto out_free_acl;
150 167
@@ -156,18 +173,22 @@ xfs_vn_mknod(
156 173
157#ifdef CONFIG_XFS_POSIX_ACL 174#ifdef CONFIG_XFS_POSIX_ACL
158 if (default_acl) { 175 if (default_acl) {
159 error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); 176 error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
160 if (error) 177 if (error)
161 goto out_cleanup_inode; 178 goto out_cleanup_inode;
162 } 179 }
163 if (acl) { 180 if (acl) {
164 error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); 181 error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
165 if (error) 182 if (error)
166 goto out_cleanup_inode; 183 goto out_cleanup_inode;
167 } 184 }
168#endif 185#endif
169 186
170 d_instantiate(dentry, inode); 187 if (tmpfile)
188 d_tmpfile(dentry, inode);
189 else
190 d_instantiate(dentry, inode);
191
171 out_free_acl: 192 out_free_acl:
172 if (default_acl) 193 if (default_acl)
173 posix_acl_release(default_acl); 194 posix_acl_release(default_acl);
@@ -176,11 +197,23 @@ xfs_vn_mknod(
176 return -error; 197 return -error;
177 198
178 out_cleanup_inode: 199 out_cleanup_inode:
179 xfs_cleanup_inode(dir, inode, dentry); 200 if (!tmpfile)
201 xfs_cleanup_inode(dir, inode, dentry);
202 iput(inode);
180 goto out_free_acl; 203 goto out_free_acl;
181} 204}
182 205
183STATIC int 206STATIC int
207xfs_vn_mknod(
208 struct inode *dir,
209 struct dentry *dentry,
210 umode_t mode,
211 dev_t rdev)
212{
213 return xfs_generic_create(dir, dentry, mode, rdev, false);
214}
215
216STATIC int
184xfs_vn_create( 217xfs_vn_create(
185 struct inode *dir, 218 struct inode *dir,
186 struct dentry *dentry, 219 struct dentry *dentry,
@@ -340,6 +373,7 @@ xfs_vn_symlink(
340 373
341 out_cleanup_inode: 374 out_cleanup_inode:
342 xfs_cleanup_inode(dir, inode, dentry); 375 xfs_cleanup_inode(dir, inode, dentry);
376 iput(inode);
343 out: 377 out:
344 return -error; 378 return -error;
345} 379}
@@ -1034,6 +1068,15 @@ xfs_vn_fiemap(
1034 return 0; 1068 return 0;
1035} 1069}
1036 1070
1071STATIC int
1072xfs_vn_tmpfile(
1073 struct inode *dir,
1074 struct dentry *dentry,
1075 umode_t mode)
1076{
1077 return xfs_generic_create(dir, dentry, mode, 0, true);
1078}
1079
1037static const struct inode_operations xfs_inode_operations = { 1080static const struct inode_operations xfs_inode_operations = {
1038 .get_acl = xfs_get_acl, 1081 .get_acl = xfs_get_acl,
1039 .set_acl = xfs_set_acl, 1082 .set_acl = xfs_set_acl,
@@ -1072,6 +1115,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
1072 .removexattr = generic_removexattr, 1115 .removexattr = generic_removexattr,
1073 .listxattr = xfs_vn_listxattr, 1116 .listxattr = xfs_vn_listxattr,
1074 .update_time = xfs_vn_update_time, 1117 .update_time = xfs_vn_update_time,
1118 .tmpfile = xfs_vn_tmpfile,
1075}; 1119};
1076 1120
1077static const struct inode_operations xfs_dir_ci_inode_operations = { 1121static const struct inode_operations xfs_dir_ci_inode_operations = {
@@ -1099,6 +1143,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
1099 .removexattr = generic_removexattr, 1143 .removexattr = generic_removexattr,
1100 .listxattr = xfs_vn_listxattr, 1144 .listxattr = xfs_vn_listxattr,
1101 .update_time = xfs_vn_update_time, 1145 .update_time = xfs_vn_update_time,
1146 .tmpfile = xfs_vn_tmpfile,
1102}; 1147};
1103 1148
1104static const struct inode_operations xfs_symlink_inode_operations = { 1149static const struct inode_operations xfs_symlink_inode_operations = {
@@ -1191,6 +1236,7 @@ xfs_setup_inode(
1191 xfs_diflags_to_iflags(inode, ip); 1236 xfs_diflags_to_iflags(inode, ip);
1192 1237
1193 ip->d_ops = ip->i_mount->m_nondir_inode_ops; 1238 ip->d_ops = ip->i_mount->m_nondir_inode_ops;
1239 lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
1194 switch (inode->i_mode & S_IFMT) { 1240 switch (inode->i_mode & S_IFMT) {
1195 case S_IFREG: 1241 case S_IFREG:
1196 inode->i_op = &xfs_inode_operations; 1242 inode->i_op = &xfs_inode_operations;
@@ -1198,6 +1244,7 @@ xfs_setup_inode(
1198 inode->i_mapping->a_ops = &xfs_address_space_operations; 1244 inode->i_mapping->a_ops = &xfs_address_space_operations;
1199 break; 1245 break;
1200 case S_IFDIR: 1246 case S_IFDIR:
1247 lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
1201 if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) 1248 if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
1202 inode->i_op = &xfs_dir_ci_inode_operations; 1249 inode->i_op = &xfs_dir_ci_inode_operations;
1203 else 1250 else
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index f9bb590acc0e..825249d2dfc1 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -119,6 +119,7 @@ typedef __uint64_t __psunsigned_t;
119#include "xfs_iops.h" 119#include "xfs_iops.h"
120#include "xfs_aops.h" 120#include "xfs_aops.h"
121#include "xfs_super.h" 121#include "xfs_super.h"
122#include "xfs_cksum.h"
122#include "xfs_buf.h" 123#include "xfs_buf.h"
123#include "xfs_message.h" 124#include "xfs_message.h"
124 125
@@ -178,6 +179,7 @@ typedef __uint64_t __psunsigned_t;
178#define ENOATTR ENODATA /* Attribute not found */ 179#define ENOATTR ENODATA /* Attribute not found */
179#define EWRONGFS EINVAL /* Mount with wrong filesystem type */ 180#define EWRONGFS EINVAL /* Mount with wrong filesystem type */
180#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ 181#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
182#define EFSBADCRC EBADMSG /* Bad CRC detected */
181 183
182#define SYNCHRONIZE() barrier() 184#define SYNCHRONIZE() barrier()
183#define __return_address __builtin_return_address(0) 185#define __return_address __builtin_return_address(0)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 8497a00e399d..a5f8bd9899d3 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -616,11 +616,13 @@ xfs_log_mount(
616 int error = 0; 616 int error = 0;
617 int min_logfsbs; 617 int min_logfsbs;
618 618
619 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 619 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
620 xfs_notice(mp, "Mounting Filesystem"); 620 xfs_notice(mp, "Mounting V%d Filesystem",
621 else { 621 XFS_SB_VERSION_NUM(&mp->m_sb));
622 } else {
622 xfs_notice(mp, 623 xfs_notice(mp,
623"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent."); 624"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.",
625 XFS_SB_VERSION_NUM(&mp->m_sb));
624 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 626 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
625 } 627 }
626 628
@@ -1181,11 +1183,14 @@ xlog_iodone(xfs_buf_t *bp)
1181 /* log I/O is always issued ASYNC */ 1183 /* log I/O is always issued ASYNC */
1182 ASSERT(XFS_BUF_ISASYNC(bp)); 1184 ASSERT(XFS_BUF_ISASYNC(bp));
1183 xlog_state_done_syncing(iclog, aborted); 1185 xlog_state_done_syncing(iclog, aborted);
1186
1184 /* 1187 /*
1185 * do not reference the buffer (bp) here as we could race 1188 * drop the buffer lock now that we are done. Nothing references
1186 * with it being freed after writing the unmount record to the 1189 * the buffer after this, so an unmount waiting on this lock can now
1187 * log. 1190 * tear it down safely. As such, it is unsafe to reference the buffer
1191 * (bp) after the unlock as we could race with it being freed.
1188 */ 1192 */
1193 xfs_buf_unlock(bp);
1189} 1194}
1190 1195
1191/* 1196/*
@@ -1368,8 +1373,16 @@ xlog_alloc_log(
1368 bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); 1373 bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
1369 if (!bp) 1374 if (!bp)
1370 goto out_free_log; 1375 goto out_free_log;
1371 bp->b_iodone = xlog_iodone; 1376
1377 /*
1378 * The iclogbuf buffer locks are held over IO but we are not going to do
1379 * IO yet. Hence unlock the buffer so that the log IO path can grab it
1380 * when appropriate.
1381 */
1372 ASSERT(xfs_buf_islocked(bp)); 1382 ASSERT(xfs_buf_islocked(bp));
1383 xfs_buf_unlock(bp);
1384
1385 bp->b_iodone = xlog_iodone;
1373 log->l_xbuf = bp; 1386 log->l_xbuf = bp;
1374 1387
1375 spin_lock_init(&log->l_icloglock); 1388 spin_lock_init(&log->l_icloglock);
@@ -1398,6 +1411,9 @@ xlog_alloc_log(
1398 if (!bp) 1411 if (!bp)
1399 goto out_free_iclog; 1412 goto out_free_iclog;
1400 1413
1414 ASSERT(xfs_buf_islocked(bp));
1415 xfs_buf_unlock(bp);
1416
1401 bp->b_iodone = xlog_iodone; 1417 bp->b_iodone = xlog_iodone;
1402 iclog->ic_bp = bp; 1418 iclog->ic_bp = bp;
1403 iclog->ic_data = bp->b_addr; 1419 iclog->ic_data = bp->b_addr;
@@ -1422,7 +1438,6 @@ xlog_alloc_log(
1422 iclog->ic_callback_tail = &(iclog->ic_callback); 1438 iclog->ic_callback_tail = &(iclog->ic_callback);
1423 iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; 1439 iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
1424 1440
1425 ASSERT(xfs_buf_islocked(iclog->ic_bp));
1426 init_waitqueue_head(&iclog->ic_force_wait); 1441 init_waitqueue_head(&iclog->ic_force_wait);
1427 init_waitqueue_head(&iclog->ic_write_wait); 1442 init_waitqueue_head(&iclog->ic_write_wait);
1428 1443
@@ -1631,6 +1646,12 @@ xlog_cksum(
1631 * we transition the iclogs to IOERROR state *after* flushing all existing 1646 * we transition the iclogs to IOERROR state *after* flushing all existing
1632 * iclogs to disk. This is because we don't want any more new transactions to be 1647
1633 * started or completed afterwards. 1648 * started or completed afterwards.
1649 *
1650 * We lock the iclogbufs here so that we can serialise against IO completion
1651 * during unmount. We might be processing a shutdown triggered during unmount,
1652 * and that can occur asynchronously to the unmount thread, and hence we need to
1653 * ensure that it completes before tearing down the iclogbufs. Hence we need to
1654 * hold the buffer lock across the log IO to achieve that.
1634 */ 1655 */
1635STATIC int 1656STATIC int
1636xlog_bdstrat( 1657xlog_bdstrat(
@@ -1638,6 +1659,7 @@ xlog_bdstrat(
1638{ 1659{
1639 struct xlog_in_core *iclog = bp->b_fspriv; 1660 struct xlog_in_core *iclog = bp->b_fspriv;
1640 1661
1662 xfs_buf_lock(bp);
1641 if (iclog->ic_state & XLOG_STATE_IOERROR) { 1663 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1642 xfs_buf_ioerror(bp, EIO); 1664 xfs_buf_ioerror(bp, EIO);
1643 xfs_buf_stale(bp); 1665 xfs_buf_stale(bp);
@@ -1645,7 +1667,8 @@ xlog_bdstrat(
1645 /* 1667 /*
1646 * It would seem logical to return EIO here, but we rely on 1668 * It would seem logical to return EIO here, but we rely on
1647 * the log state machine to propagate I/O errors instead of 1669 * the log state machine to propagate I/O errors instead of
1648 * doing it here. 1670 * doing it here. Similarly, IO completion will unlock the
1671 * buffer, so we don't do it here.
1649 */ 1672 */
1650 return 0; 1673 return 0;
1651 } 1674 }
@@ -1847,14 +1870,28 @@ xlog_dealloc_log(
1847 xlog_cil_destroy(log); 1870 xlog_cil_destroy(log);
1848 1871
1849 /* 1872 /*
1850 * always need to ensure that the extra buffer does not point to memory 1873 * Cycle all the iclogbuf locks to make sure all log IO completion
1851 * owned by another log buffer before we free it. 1874 * is done before we tear down these buffers.
1852 */ 1875 */
1876 iclog = log->l_iclog;
1877 for (i = 0; i < log->l_iclog_bufs; i++) {
1878 xfs_buf_lock(iclog->ic_bp);
1879 xfs_buf_unlock(iclog->ic_bp);
1880 iclog = iclog->ic_next;
1881 }
1882
1883 /*
1884 * Always need to ensure that the extra buffer does not point to memory
1885 * owned by another log buffer before we free it. Also, cycle the lock
1886 * first to ensure we've completed IO on it.
1887 */
1888 xfs_buf_lock(log->l_xbuf);
1889 xfs_buf_unlock(log->l_xbuf);
1853 xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); 1890 xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
1854 xfs_buf_free(log->l_xbuf); 1891 xfs_buf_free(log->l_xbuf);
1855 1892
1856 iclog = log->l_iclog; 1893 iclog = log->l_iclog;
1857 for (i=0; i<log->l_iclog_bufs; i++) { 1894 for (i = 0; i < log->l_iclog_bufs; i++) {
1858 xfs_buf_free(iclog->ic_bp); 1895 xfs_buf_free(iclog->ic_bp);
1859 next_iclog = iclog->ic_next; 1896 next_iclog = iclog->ic_next;
1860 kmem_free(iclog); 1897 kmem_free(iclog);
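
The xfs_buf lock is a semaphore in the kernel, which is why it can be taken before IO submission in xlog_bdstrat() and released from the completion side in xlog_iodone(). A userspace analogue of the lock-cycling teardown added to xlog_dealloc_log(), sketched with a POSIX semaphore:

/* "cycle the lock" idiom: taking and dropping the lock cannot succeed
 * until the current holder (here, IO completion) has released it */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <unistd.h>

static sem_t buf_lock; /* binary semaphore standing in for the buffer lock */

static void *io_completion(void *arg)
{
    (void)arg;
    sleep(1);            /* simulated IO, done with the buffer "held" */
    sem_post(&buf_lock); /* the xfs_buf_unlock() in xlog_iodone() */
    return NULL;
}

int main(void)
{
    pthread_t t;

    sem_init(&buf_lock, 0, 0);   /* start held, as if locked over IO */
    pthread_create(&t, NULL, io_completion, NULL);

    /* teardown: cycle the lock so the buffer cannot be freed while
     * IO completion still references it */
    sem_wait(&buf_lock);
    sem_post(&buf_lock);
    puts("IO completion done, safe to free");

    pthread_join(t, NULL);
    sem_destroy(&buf_lock);
    return 0;
}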
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index b0f4ef77fa70..2c4004475e71 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -175,7 +175,7 @@ void xlog_iodone(struct xfs_buf *);
175struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); 175struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
176void xfs_log_ticket_put(struct xlog_ticket *ticket); 176void xfs_log_ticket_put(struct xlog_ticket *ticket);
177 177
178int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, 178void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
179 xfs_lsn_t *commit_lsn, int flags); 179 xfs_lsn_t *commit_lsn, int flags);
180bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 180bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
181 181
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 4ef6fdbced78..7e5455391176 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -499,13 +499,6 @@ xlog_cil_push(
499 cil->xc_ctx = new_ctx; 499 cil->xc_ctx = new_ctx;
500 500
501 /* 501 /*
502 * mirror the new sequence into the cil structure so that we can do
503 * unlocked checks against the current sequence in log forces without
504 * risking dereferencing a freed context pointer.
505 */
506 cil->xc_current_sequence = new_ctx->sequence;
507
508 /*
509 * The switch is now done, so we can drop the context lock and move out 502 * The switch is now done, so we can drop the context lock and move out
510 * of a shared context. We can't just go straight to the commit record, 503 * of a shared context. We can't just go straight to the commit record,
511 * though - we need to synchronise with previous and future commits so 504 * though - we need to synchronise with previous and future commits so
@@ -523,8 +516,15 @@ xlog_cil_push(
523 * Hence we need to add this context to the committing context list so 516 * Hence we need to add this context to the committing context list so
524 * that higher sequences will wait for us to write out a commit record 517 * that higher sequences will wait for us to write out a commit record
525 * before they do. 518 * before they do.
519 *
520 * xfs_log_force_lsn requires us to mirror the new sequence into the cil
521 * structure atomically with the addition of this sequence to the
522 * committing list. This also ensures that we can do unlocked checks
523 * against the current sequence in log forces without risking
524 * dereferencing a freed context pointer.
526 */ 525 */
527 spin_lock(&cil->xc_push_lock); 526 spin_lock(&cil->xc_push_lock);
527 cil->xc_current_sequence = new_ctx->sequence;
528 list_add(&ctx->committing, &cil->xc_committing); 528 list_add(&ctx->committing, &cil->xc_committing);
529 spin_unlock(&cil->xc_push_lock); 529 spin_unlock(&cil->xc_push_lock);
530 up_write(&cil->xc_ctx_lock); 530 up_write(&cil->xc_ctx_lock);
@@ -662,8 +662,14 @@ xlog_cil_push_background(
662 662
663} 663}
664 664
665/*
666 * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence
667 * number that is passed. When it returns, the work will be queued for
668 * @push_seq, but it won't be completed. The caller is expected to do any
669 * waiting for push_seq to complete if it is required.
670 */
665static void 671static void
666xlog_cil_push_foreground( 672xlog_cil_push_now(
667 struct xlog *log, 673 struct xlog *log,
668 xfs_lsn_t push_seq) 674 xfs_lsn_t push_seq)
669{ 675{
@@ -688,10 +694,8 @@ xlog_cil_push_foreground(
688 } 694 }
689 695
690 cil->xc_push_seq = push_seq; 696 cil->xc_push_seq = push_seq;
697 queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
691 spin_unlock(&cil->xc_push_lock); 698 spin_unlock(&cil->xc_push_lock);
692
693 /* do the push now */
694 xlog_cil_push(log);
695} 699}
696 700
697bool 701bool
@@ -721,7 +725,7 @@ xlog_cil_empty(
721 * background commit, returns without it held once background commits are 725 * background commit, returns without it held once background commits are
722 * allowed again. 726 * allowed again.
723 */ 727 */
724int 728void
725xfs_log_commit_cil( 729xfs_log_commit_cil(
726 struct xfs_mount *mp, 730 struct xfs_mount *mp,
727 struct xfs_trans *tp, 731 struct xfs_trans *tp,
@@ -767,7 +771,6 @@ xfs_log_commit_cil(
767 xlog_cil_push_background(log); 771 xlog_cil_push_background(log);
768 772
769 up_read(&cil->xc_ctx_lock); 773 up_read(&cil->xc_ctx_lock);
770 return 0;
771} 774}
772 775
773/* 776/*
@@ -796,7 +799,8 @@ xlog_cil_force_lsn(
796 * xlog_cil_push() handles racing pushes for the same sequence, 799 * xlog_cil_push() handles racing pushes for the same sequence,
797 * so no need to deal with it here. 800 * so no need to deal with it here.
798 */ 801 */
799 xlog_cil_push_foreground(log, sequence); 802restart:
803 xlog_cil_push_now(log, sequence);
800 804
801 /* 805 /*
802 * See if we can find a previous sequence still committing. 806 * See if we can find a previous sequence still committing.
@@ -804,7 +808,6 @@ xlog_cil_force_lsn(
804 * before allowing the force of push_seq to go ahead. Hence block 808 * before allowing the force of push_seq to go ahead. Hence block
805 * on commits for those as well. 809 * on commits for those as well.
806 */ 810 */
807restart:
808 spin_lock(&cil->xc_push_lock); 811 spin_lock(&cil->xc_push_lock);
809 list_for_each_entry(ctx, &cil->xc_committing, committing) { 812 list_for_each_entry(ctx, &cil->xc_committing, committing) {
810 if (ctx->sequence > sequence) 813 if (ctx->sequence > sequence)
@@ -822,6 +825,28 @@ restart:
822 /* found it! */ 825 /* found it! */
823 commit_lsn = ctx->commit_lsn; 826 commit_lsn = ctx->commit_lsn;
824 } 827 }
828
829 /*
830 * The call to xlog_cil_push_now() executes the push in the background.
831 * Hence by the time we have got here our sequence may not have been
832 * pushed yet. This is true if the current sequence still matches the
833 * push sequence after the above wait loop and the CIL still contains
834 * dirty objects.
835 *
836 * When the push occurs, it will empty the CIL and
837 * atomically increment the current sequence past the push sequence and
838 * move it into the committing list. Of course, if the CIL is clean at
839 * the time of the push, it won't have pushed the CIL at all, so in that
840 * case we should try the push for this sequence again from the start
841 * just in case.
842 */
843
844 if (sequence == cil->xc_current_sequence &&
845 !list_empty(&cil->xc_cil)) {
846 spin_unlock(&cil->xc_push_lock);
847 goto restart;
848 }
849
825 spin_unlock(&cil->xc_push_lock); 850 spin_unlock(&cil->xc_push_lock);
826 return commit_lsn; 851 return commit_lsn;
827} 852}
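
Since xlog_cil_push_foreground() became the asynchronous xlog_cil_push_now(), the force path has to tolerate the push not having run yet, which is what the relocated restart label handles. A hedged pthreads sketch of that kick-then-recheck loop; the names and structure are illustrative, not the kernel API:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t push_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  push_cv   = PTHREAD_COND_INITIALIZER;
static unsigned long current_sequence = 1;
static bool cil_dirty = true;
static bool push_queued;

static void *push_worker(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&push_lock);
    while (!push_queued)
        pthread_cond_wait(&push_cv, &push_lock);
    /* the push empties the CIL and bumps the sequence atomically
     * under push_lock, as the hunk above requires */
    cil_dirty = false;
    current_sequence++;
    pthread_mutex_unlock(&push_lock);
    return NULL;
}

static void force_sequence(unsigned long seq)
{
restart:
    pthread_mutex_lock(&push_lock);
    push_queued = true;                /* xlog_cil_push_now() */
    pthread_cond_broadcast(&push_cv);

    if (seq == current_sequence && cil_dirty) {
        /* push not done yet: drop the lock and try again */
        pthread_mutex_unlock(&push_lock);
        goto restart;
    }
    pthread_mutex_unlock(&push_lock);
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, push_worker, NULL);
    force_sequence(1);
    pthread_join(t, NULL);
    puts("sequence pushed");
    return 0;
}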
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index f96c05669a9e..944f3d9456a8 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -314,6 +314,9 @@ reread:
314 error = bp->b_error; 314 error = bp->b_error;
315 if (loud) 315 if (loud)
316 xfs_warn(mp, "SB validate failed with error %d.", error); 316 xfs_warn(mp, "SB validate failed with error %d.", error);
317 /* bad CRC means corrupted metadata */
318 if (error == EFSBADCRC)
319 error = EFSCORRUPTED;
317 goto release_buf; 320 goto release_buf;
318 } 321 }
319 322
@@ -740,8 +743,6 @@ xfs_mountfs(
740 new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; 743 new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
741 if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) 744 if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
742 mp->m_inode_cluster_size = new_size; 745 mp->m_inode_cluster_size = new_size;
743 xfs_info(mp, "Using inode cluster size of %d bytes",
744 mp->m_inode_cluster_size);
745 } 746 }
746 747
747 /* 748 /*
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 348e4d2ed6e6..dc977b6e6a36 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -843,22 +843,17 @@ xfs_qm_init_quotainfo(
843 843
844 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); 844 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
845 845
846 if ((error = list_lru_init(&qinf->qi_lru))) { 846 error = -list_lru_init(&qinf->qi_lru);
847 kmem_free(qinf); 847 if (error)
848 mp->m_quotainfo = NULL; 848 goto out_free_qinf;
849 return error;
850 }
851 849
852 /* 850 /*
853 * See if quotainodes are setup, and if not, allocate them, 851 * See if quotainodes are setup, and if not, allocate them,
854 * and change the superblock accordingly. 852 * and change the superblock accordingly.
855 */ 853 */
856 if ((error = xfs_qm_init_quotainos(mp))) { 854 error = xfs_qm_init_quotainos(mp);
857 list_lru_destroy(&qinf->qi_lru); 855 if (error)
858 kmem_free(qinf); 856 goto out_free_lru;
859 mp->m_quotainfo = NULL;
860 return error;
861 }
862 857
863 INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); 858 INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
864 INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); 859 INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
@@ -918,7 +913,7 @@ xfs_qm_init_quotainfo(
918 qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); 913 qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
919 qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); 914 qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
920 qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); 915 qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
921 916
922 xfs_qm_dqdestroy(dqp); 917 xfs_qm_dqdestroy(dqp);
923 } else { 918 } else {
924 qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; 919 qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
@@ -935,6 +930,13 @@ xfs_qm_init_quotainfo(
935 qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; 930 qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
936 register_shrinker(&qinf->qi_shrinker); 931 register_shrinker(&qinf->qi_shrinker);
937 return 0; 932 return 0;
933
934out_free_lru:
935 list_lru_destroy(&qinf->qi_lru);
936out_free_qinf:
937 kmem_free(qinf);
938 mp->m_quotainfo = NULL;
939 return error;
938} 940}
939 941
940 942
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index a6a76b2b6a85..ec5ca65c6211 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -842,7 +842,7 @@ xfs_growfs_rt_alloc(
842 /* 842 /*
843 * Reserve space & log for one extent added to the file. 843 * Reserve space & log for one extent added to the file.
844 */ 844 */
845 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, 845 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc,
846 resblks, 0); 846 resblks, 0);
847 if (error) 847 if (error)
848 goto error_cancel; 848 goto error_cancel;
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index 1e116794bb66..8baf61afae1d 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -201,10 +201,6 @@ xfs_mount_validate_sb(
201 * write validation, we don't need to check feature masks. 201 * write validation, we don't need to check feature masks.
202 */ 202 */
203 if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { 203 if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
204 xfs_alert(mp,
205"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
206"Use of these features in this kernel is at your own risk!");
207
208 if (xfs_sb_has_compat_feature(sbp, 204 if (xfs_sb_has_compat_feature(sbp,
209 XFS_SB_FEAT_COMPAT_UNKNOWN)) { 205 XFS_SB_FEAT_COMPAT_UNKNOWN)) {
210 xfs_warn(mp, 206 xfs_warn(mp,
@@ -288,6 +284,7 @@ xfs_mount_validate_sb(
288 sbp->sb_inodelog < XFS_DINODE_MIN_LOG || 284 sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
289 sbp->sb_inodelog > XFS_DINODE_MAX_LOG || 285 sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
290 sbp->sb_inodesize != (1 << sbp->sb_inodelog) || 286 sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
287 sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
291 (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || 288 (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
292 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || 289 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
293 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || 290 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
@@ -610,12 +607,11 @@ xfs_sb_read_verify(
610 XFS_SB_VERSION_5) || 607 XFS_SB_VERSION_5) ||
611 dsb->sb_crc != 0)) { 608 dsb->sb_crc != 0)) {
612 609
613 if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 610 if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) {
614 offsetof(struct xfs_sb, sb_crc))) {
615 /* Only fail bad secondaries on a known V5 filesystem */ 611 /* Only fail bad secondaries on a known V5 filesystem */
616 if (bp->b_bn == XFS_SB_DADDR || 612 if (bp->b_bn == XFS_SB_DADDR ||
617 xfs_sb_version_hascrc(&mp->m_sb)) { 613 xfs_sb_version_hascrc(&mp->m_sb)) {
618 error = EFSCORRUPTED; 614 error = EFSBADCRC;
619 goto out_error; 615 goto out_error;
620 } 616 }
621 } 617 }
@@ -624,10 +620,9 @@ xfs_sb_read_verify(
624 620
625out_error: 621out_error:
626 if (error) { 622 if (error) {
627 if (error == EFSCORRUPTED)
628 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
629 mp, bp->b_addr);
630 xfs_buf_ioerror(bp, error); 623 xfs_buf_ioerror(bp, error);
624 if (error == EFSCORRUPTED || error == EFSBADCRC)
625 xfs_verifier_error(bp);
631 } 626 }
632} 627}
633 628
@@ -662,9 +657,8 @@ xfs_sb_write_verify(
662 657
663 error = xfs_sb_verify(bp, false); 658 error = xfs_sb_verify(bp, false);
664 if (error) { 659 if (error) {
665 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
666 mp, bp->b_addr);
667 xfs_buf_ioerror(bp, error); 660 xfs_buf_ioerror(bp, error);
661 xfs_verifier_error(bp);
668 return; 662 return;
669 } 663 }
670 664
@@ -674,8 +668,7 @@ xfs_sb_write_verify(
674 if (bip) 668 if (bip)
675 XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); 669 XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
676 670
677 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 671 xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF);
678 offsetof(struct xfs_sb, sb_crc));
679} 672}
680 673
681const struct xfs_buf_ops xfs_sb_buf_ops = { 674const struct xfs_buf_ops xfs_sb_buf_ops = {
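
The new sb_inopblock check above cross-validates the superblock geometry fields against each other rather than trusting sb_inopblock in isolation. For example, using the same howmany() rounding the kernel uses:

/* quick check of the sb_inopblock validation: 4096-byte blocks with
 * 512-byte inodes must give 8 inodes per block */
#include <stdio.h>

#define howmany(x, y) (((x) + ((y) - 1)) / (y))

int main(void)
{
    unsigned blocksize = 4096, inodesize = 512; /* illustrative geometry */
    printf("inopblock = %u\n", howmany(blocksize, inodesize)); /* 8 */
    return 0;
}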
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 35061d4b614c..f7b2fe77c5a5 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -182,6 +182,8 @@ typedef struct xfs_sb {
182 /* must be padded to 64 bit alignment */ 182 /* must be padded to 64 bit alignment */
183} xfs_sb_t; 183} xfs_sb_t;
184 184
185#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc)
186
185/* 187/*
186 * Superblock - on disk version. Must match the in core version above. 188 * Superblock - on disk version. Must match the in core version above.
187 * Must be padded to 64 bit alignment. 189 * Must be padded to 64 bit alignment.
diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/xfs_shared.h
index 8c5035a13df1..4484e5151395 100644
--- a/fs/xfs/xfs_shared.h
+++ b/fs/xfs/xfs_shared.h
@@ -104,7 +104,8 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
104#define XFS_TRANS_SB_COUNT 41 104#define XFS_TRANS_SB_COUNT 41
105#define XFS_TRANS_CHECKPOINT 42 105#define XFS_TRANS_CHECKPOINT 42
106#define XFS_TRANS_ICREATE 43 106#define XFS_TRANS_ICREATE 43
107#define XFS_TRANS_TYPE_MAX 43 107#define XFS_TRANS_CREATE_TMPFILE 44
108#define XFS_TRANS_TYPE_MAX 44
108/* new transaction types need to be reflected in xfs_logprint(8) */ 109/* new transaction types need to be reflected in xfs_logprint(8) */
109 110
110#define XFS_TRANS_TYPES \ 111#define XFS_TRANS_TYPES \
@@ -112,6 +113,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
112 { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ 113 { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \
113 { XFS_TRANS_INACTIVE, "INACTIVE" }, \ 114 { XFS_TRANS_INACTIVE, "INACTIVE" }, \
114 { XFS_TRANS_CREATE, "CREATE" }, \ 115 { XFS_TRANS_CREATE, "CREATE" }, \
116 { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \
115 { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ 117 { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \
116 { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ 118 { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \
117 { XFS_TRANS_REMOVE, "REMOVE" }, \ 119 { XFS_TRANS_REMOVE, "REMOVE" }, \
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d971f4932b5d..3494eff8e4eb 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -996,7 +996,7 @@ xfs_fs_evict_inode(
996 996
997 trace_xfs_evict_inode(ip); 997 trace_xfs_evict_inode(ip);
998 998
999 truncate_inode_pages(&inode->i_data, 0); 999 truncate_inode_pages_final(&inode->i_data);
1000 clear_inode(inode); 1000 clear_inode(inode);
1001 XFS_STATS_INC(vn_rele); 1001 XFS_STATS_INC(vn_rele);
1002 XFS_STATS_INC(vn_remove); 1002 XFS_STATS_INC(vn_remove);
@@ -1197,6 +1197,7 @@ xfs_fs_remount(
1197 char *p; 1197 char *p;
1198 int error; 1198 int error;
1199 1199
1200 sync_filesystem(sb);
1200 while ((p = strsep(&options, ",")) != NULL) { 1201 while ((p = strsep(&options, ",")) != NULL) {
1201 int token; 1202 int token;
1202 1203
@@ -1432,11 +1433,11 @@ xfs_fs_fill_super(
1432 if (error) 1433 if (error)
1433 goto out_free_fsname; 1434 goto out_free_fsname;
1434 1435
1435 error = xfs_init_mount_workqueues(mp); 1436 error = -xfs_init_mount_workqueues(mp);
1436 if (error) 1437 if (error)
1437 goto out_close_devices; 1438 goto out_close_devices;
1438 1439
1439 error = xfs_icsb_init_counters(mp); 1440 error = -xfs_icsb_init_counters(mp);
1440 if (error) 1441 if (error)
1441 goto out_destroy_workqueues; 1442 goto out_destroy_workqueues;
1442 1443
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 14e58f2c96bd..52979aa90986 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -80,6 +80,10 @@ xfs_readlink_bmap(
80 if (error) { 80 if (error) {
81 xfs_buf_ioerror_alert(bp, __func__); 81 xfs_buf_ioerror_alert(bp, __func__);
82 xfs_buf_relse(bp); 82 xfs_buf_relse(bp);
83
84 /* bad CRC means corrupted metadata */
85 if (error == EFSBADCRC)
86 error = EFSCORRUPTED;
83 goto out; 87 goto out;
84 } 88 }
85 byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); 89 byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
@@ -208,10 +212,7 @@ xfs_symlink(
208 return XFS_ERROR(ENAMETOOLONG); 212 return XFS_ERROR(ENAMETOOLONG);
209 213
210 udqp = gdqp = NULL; 214 udqp = gdqp = NULL;
211 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) 215 prid = xfs_get_initial_prid(dp);
212 prid = xfs_get_projid(dp);
213 else
214 prid = XFS_PROJID_DEFAULT;
215 216
216 /* 217 /*
217 * Make sure that we have allocated dquot(s) on disk. 218 * Make sure that we have allocated dquot(s) on disk.
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c
index bf59a2b45f8c..9b32052ff65e 100644
--- a/fs/xfs/xfs_symlink_remote.c
+++ b/fs/xfs/xfs_symlink_remote.c
@@ -133,12 +133,13 @@ xfs_symlink_read_verify(
133 if (!xfs_sb_version_hascrc(&mp->m_sb)) 133 if (!xfs_sb_version_hascrc(&mp->m_sb))
134 return; 134 return;
135 135
136 if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), 136 if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
137 offsetof(struct xfs_dsymlink_hdr, sl_crc)) || 137 xfs_buf_ioerror(bp, EFSBADCRC);
138 !xfs_symlink_verify(bp)) { 138 else if (!xfs_symlink_verify(bp))
139 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
140 xfs_buf_ioerror(bp, EFSCORRUPTED); 139 xfs_buf_ioerror(bp, EFSCORRUPTED);
141 } 140
141 if (bp->b_error)
142 xfs_verifier_error(bp);
142} 143}
143 144
144static void 145static void
@@ -153,8 +154,8 @@ xfs_symlink_write_verify(
153 return; 154 return;
154 155
155 if (!xfs_symlink_verify(bp)) { 156 if (!xfs_symlink_verify(bp)) {
156 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
157 xfs_buf_ioerror(bp, EFSCORRUPTED); 157 xfs_buf_ioerror(bp, EFSCORRUPTED);
158 xfs_verifier_error(bp);
158 return; 159 return;
159 } 160 }
160 161
@@ -162,8 +163,7 @@ xfs_symlink_write_verify(
162 struct xfs_dsymlink_hdr *dsl = bp->b_addr; 163 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
163 dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn); 164 dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
164 } 165 }
165 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), 166 xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF);
166 offsetof(struct xfs_dsymlink_hdr, sl_crc));
167} 167}
168 168
169const struct xfs_buf_ops xfs_symlink_buf_ops = { 169const struct xfs_buf_ops xfs_symlink_buf_ops = {
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 425dfa45b9a0..65d8c793a25c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -603,6 +603,8 @@ DEFINE_INODE_EVENT(xfs_readlink);
603DEFINE_INODE_EVENT(xfs_inactive_symlink); 603DEFINE_INODE_EVENT(xfs_inactive_symlink);
604DEFINE_INODE_EVENT(xfs_alloc_file_space); 604DEFINE_INODE_EVENT(xfs_alloc_file_space);
605DEFINE_INODE_EVENT(xfs_free_file_space); 605DEFINE_INODE_EVENT(xfs_free_file_space);
606DEFINE_INODE_EVENT(xfs_zero_file_space);
607DEFINE_INODE_EVENT(xfs_collapse_file_space);
606DEFINE_INODE_EVENT(xfs_readdir); 608DEFINE_INODE_EVENT(xfs_readdir);
607#ifdef CONFIG_XFS_POSIX_ACL 609#ifdef CONFIG_XFS_POSIX_ACL
608DEFINE_INODE_EVENT(xfs_get_acl); 610DEFINE_INODE_EVENT(xfs_get_acl);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index c812c5c060de..54a57326d85b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -887,12 +887,7 @@ xfs_trans_commit(
887 xfs_trans_apply_sb_deltas(tp); 887 xfs_trans_apply_sb_deltas(tp);
888 xfs_trans_apply_dquot_deltas(tp); 888 xfs_trans_apply_dquot_deltas(tp);
889 889
890 error = xfs_log_commit_cil(mp, tp, &commit_lsn, flags); 890 xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
891 if (error == ENOMEM) {
892 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
893 error = XFS_ERROR(EIO);
894 goto out_unreserve;
895 }
896 891
897 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 892 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
898 xfs_trans_free(tp); 893 xfs_trans_free(tp);
@@ -902,10 +897,7 @@ xfs_trans_commit(
902 * log out now and wait for it. 897 * log out now and wait for it.
903 */ 898 */
904 if (sync) { 899 if (sync) {
905 if (!error) { 900 error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
906 error = _xfs_log_force_lsn(mp, commit_lsn,
907 XFS_LOG_SYNC, NULL);
908 }
909 XFS_STATS_INC(xs_trans_sync); 901 XFS_STATS_INC(xs_trans_sync);
910 } else { 902 } else {
911 XFS_STATS_INC(xs_trans_async); 903 XFS_STATS_INC(xs_trans_async);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 647b6f1d8923..b8eef0549f3f 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -275,6 +275,10 @@ xfs_trans_read_buf_map(
275 XFS_BUF_UNDONE(bp); 275 XFS_BUF_UNDONE(bp);
276 xfs_buf_stale(bp); 276 xfs_buf_stale(bp);
277 xfs_buf_relse(bp); 277 xfs_buf_relse(bp);
278
279 /* bad CRC means corrupted metadata */
280 if (error == EFSBADCRC)
281 error = EFSCORRUPTED;
278 return error; 282 return error;
279 } 283 }
280#ifdef DEBUG 284#ifdef DEBUG
@@ -338,6 +342,9 @@ xfs_trans_read_buf_map(
338 if (tp->t_flags & XFS_TRANS_DIRTY) 342 if (tp->t_flags & XFS_TRANS_DIRTY)
339 xfs_force_shutdown(tp->t_mountp, 343 xfs_force_shutdown(tp->t_mountp,
340 SHUTDOWN_META_IO_ERROR); 344 SHUTDOWN_META_IO_ERROR);
345 /* bad CRC means corrupted metadata */
346 if (error == EFSBADCRC)
347 error = EFSCORRUPTED;
341 return error; 348 return error;
342 } 349 }
343 } 350 }
@@ -375,6 +382,10 @@ xfs_trans_read_buf_map(
375 if (tp->t_flags & XFS_TRANS_DIRTY) 382 if (tp->t_flags & XFS_TRANS_DIRTY)
376 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); 383 xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
377 xfs_buf_relse(bp); 384 xfs_buf_relse(bp);
385
386 /* bad CRC means corrupted metadata */
387 if (error == EFSBADCRC)
388 error = EFSCORRUPTED;
378 return error; 389 return error;
379 } 390 }
380#ifdef DEBUG 391#ifdef DEBUG
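
All three xfs_trans_read_buf_map() error paths above apply the same translation, as do the xfs_mount.c and xfs_symlink.c hunks earlier: EFSBADCRC is a buffer-layer detail, and callers of a failed metadata read see generic corruption. A hypothetical helper capturing the pattern; the constants mirror the xfs_linux.h hunk:

#include <errno.h>
#include <stdio.h>

#define EFSBADCRC    EBADMSG   /* Bad CRC detected */
#define EFSCORRUPTED EUCLEAN   /* Filesystem is corrupted */

/* once a metadata read has failed, a bad CRC is reported onwards as
 * generic corruption */
static int normalize_read_error(int error)
{
    if (error == EFSBADCRC)
        error = EFSCORRUPTED;
    return error;
}

int main(void)
{
    printf("%d -> %d\n", EFSBADCRC, normalize_read_error(EFSBADCRC));
    return 0;
}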
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
index 2ffd3e331b49..ae368165244d 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/xfs_trans_resv.c
@@ -81,20 +81,28 @@ xfs_calc_buf_res(
81 * on disk. Hence we need an inode reservation function that calculates all this 81 * on disk. Hence we need an inode reservation function that calculates all this
82 * correctly. So, we log: 82 * correctly. So, we log:
83 * 83 *
84 * - log op headers for object 84 * - 4 log op headers for object
85 * - for the ilf, the inode core and 2 forks
85 * - inode log format object 86 * - inode log format object
86 * - the entire inode contents (core + 2 forks) 87 * - the inode core
87 * - two bmap btree block headers 88 * - two inode forks containing bmap btree root blocks.
89 * - the btree data contained by both forks will fit into the inode size,
90 * hence when combined with the inode core above, we have a total of the
91 * actual inode size.
92 * - the BMBT headers need to be accounted separately, as they are
93 * additional to the records and pointers that fit inside the inode
94 * forks.
88 */ 95 */
89STATIC uint 96STATIC uint
90xfs_calc_inode_res( 97xfs_calc_inode_res(
91 struct xfs_mount *mp, 98 struct xfs_mount *mp,
92 uint ninodes) 99 uint ninodes)
93{ 100{
94 return ninodes * (sizeof(struct xlog_op_header) + 101 return ninodes *
95 sizeof(struct xfs_inode_log_format) + 102 (4 * sizeof(struct xlog_op_header) +
96 mp->m_sb.sb_inodesize + 103 sizeof(struct xfs_inode_log_format) +
97 2 * XFS_BMBT_BLOCK_LEN(mp)); 104 mp->m_sb.sb_inodesize +
105 2 * XFS_BMBT_BLOCK_LEN(mp));
98} 106}
99 107
100/* 108/*
@@ -204,6 +212,19 @@ xfs_calc_rename_reservation(
204} 212}
205 213
206/* 214/*
215 * For removing an inode from the unlinked list, we can modify:
216 * the agi hash list and counters: sector size
217 * the on disk inode before ours in the agi hash list: inode cluster size
218 */
219STATIC uint
220xfs_calc_iunlink_remove_reservation(
221 struct xfs_mount *mp)
222{
223 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
224 max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
225}
226
227/*
207 * For creating a link to an inode: 228 * For creating a link to an inode:
208 * the parent directory inode: inode size 229 * the parent directory inode: inode size
209 * the linked inode: inode size 230 * the linked inode: inode size
@@ -220,6 +241,7 @@ xfs_calc_link_reservation(
220 struct xfs_mount *mp) 241 struct xfs_mount *mp)
221{ 242{
222 return XFS_DQUOT_LOGRES(mp) + 243 return XFS_DQUOT_LOGRES(mp) +
244 xfs_calc_iunlink_remove_reservation(mp) +
223 MAX((xfs_calc_inode_res(mp, 2) + 245 MAX((xfs_calc_inode_res(mp, 2) +
224 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), 246 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
225 XFS_FSB_TO_B(mp, 1))), 247 XFS_FSB_TO_B(mp, 1))),
@@ -229,6 +251,18 @@ xfs_calc_link_reservation(
229} 251}
230 252
231/* 253/*
254 * For adding an inode to the unlinked list we can modify:
255 * the agi hash list: sector size
256 * the unlinked inode: inode size
257 */
258STATIC uint
259xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
260{
261 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
262 xfs_calc_inode_res(mp, 1);
263}
264
265/*
232 * For removing a directory entry we can modify: 266 * For removing a directory entry we can modify:
233 * the parent directory inode: inode size 267 * the parent directory inode: inode size
234 * the removed inode: inode size 268 * the removed inode: inode size
@@ -245,10 +279,11 @@ xfs_calc_remove_reservation(
245 struct xfs_mount *mp) 279 struct xfs_mount *mp)
246{ 280{
247 return XFS_DQUOT_LOGRES(mp) + 281 return XFS_DQUOT_LOGRES(mp) +
248 MAX((xfs_calc_inode_res(mp, 2) + 282 xfs_calc_iunlink_add_reservation(mp) +
283 MAX((xfs_calc_inode_res(mp, 1) +
249 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), 284 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
250 XFS_FSB_TO_B(mp, 1))), 285 XFS_FSB_TO_B(mp, 1))),
251 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + 286 (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
252 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), 287 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
253 XFS_FSB_TO_B(mp, 1)))); 288 XFS_FSB_TO_B(mp, 1))));
254} 289}
@@ -343,6 +378,20 @@ xfs_calc_create_reservation(
343 378
344} 379}
345 380
381STATIC uint
382xfs_calc_create_tmpfile_reservation(
383 struct xfs_mount *mp)
384{
385 uint res = XFS_DQUOT_LOGRES(mp);
386
387 if (xfs_sb_version_hascrc(&mp->m_sb))
388 res += xfs_calc_icreate_resv_alloc(mp);
389 else
390 res += xfs_calc_create_resv_alloc(mp);
391
392 return res + xfs_calc_iunlink_add_reservation(mp);
393}
394
346/* 395/*
347 * Making a new directory is the same as creating a new file. 396 * Making a new directory is the same as creating a new file.
348 */ 397 */
@@ -383,9 +432,9 @@ xfs_calc_ifree_reservation(
 {
 	return XFS_DQUOT_LOGRES(mp) +
 		xfs_calc_inode_res(mp, 1) +
-		xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+		xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
 		xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
-		max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size) +
+		xfs_calc_iunlink_remove_reservation(mp) +
 		xfs_calc_buf_res(1, 0) +
 		xfs_calc_buf_res(2 + mp->m_ialloc_blks +
 				 mp->m_in_maxlevels, 0) +
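
This ifree hunk is pure factoring: one of the two sectors plus the open-coded max_t() cluster term is exactly what xfs_calc_iunlink_remove_reservation() returns, so the total reservation is unchanged:

	old: 2 sectors + max(fs block, inode cluster) + ...
	new: 1 sector + (1 sector + max(fs block, inode cluster)) + ...
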
@@ -644,15 +693,14 @@ xfs_calc_qm_setqlim_reservation(
 
 /*
  * Allocating quota on disk if needed.
- *	the write transaction log space: M_RES(mp)->tr_write.tr_logres
+ *	the write transaction log space for quota file extent allocation
  *	the unit of quota allocation: one system block size
  */
 STATIC uint
 xfs_calc_qm_dqalloc_reservation(
 	struct xfs_mount	*mp)
 {
-	ASSERT(M_RES(mp)->tr_write.tr_logres);
-	return M_RES(mp)->tr_write.tr_logres +
+	return xfs_calc_write_reservation(mp) +
 		xfs_calc_buf_res(1,
 			XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
 }
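
Calling xfs_calc_write_reservation() directly, instead of reading the cached M_RES(mp)->tr_write.tr_logres, yields the same number but removes the implicit requirement that the write reservation be computed first, which is presumably what the dropped ASSERT was policing. The remaining term rounds the allocation up to one dquot cluster:

	tr_qm_dqalloc = write reservation
	              + buf_res(1, FSB_TO_B(XFS_DQUOT_CLUSTER_SIZE_FSB) - 1)
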
@@ -729,6 +777,11 @@ xfs_trans_resv_calc(
 	resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
 	resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
 
+	resp->tr_create_tmpfile.tr_logres =
+			xfs_calc_create_tmpfile_reservation(mp);
+	resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT;
+	resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
 	resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
 	resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
 	resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
@@ -784,7 +837,6 @@ xfs_trans_resv_calc(
 	/* The following transaction are logged in logical format */
 	resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
 	resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
-	resp->tr_swrite.tr_logres = xfs_calc_swrite_reservation(mp);
 	resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
 	resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
 	resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/xfs_trans_resv.h
index de7de9aaad8a..1097d14cd583 100644
--- a/fs/xfs/xfs_trans_resv.h
+++ b/fs/xfs/xfs_trans_resv.h
@@ -38,11 +38,11 @@ struct xfs_trans_resv {
 	struct xfs_trans_res	tr_remove;	/* unlink trans */
 	struct xfs_trans_res	tr_symlink;	/* symlink trans */
 	struct xfs_trans_res	tr_create;	/* create trans */
+	struct xfs_trans_res	tr_create_tmpfile; /* create O_TMPFILE trans */
 	struct xfs_trans_res	tr_mkdir;	/* mkdir trans */
 	struct xfs_trans_res	tr_ifree;	/* inode free trans */
 	struct xfs_trans_res	tr_ichange;	/* inode update trans */
 	struct xfs_trans_res	tr_growdata;	/* fs data section grow trans */
-	struct xfs_trans_res	tr_swrite;	/* sync write inode trans */
 	struct xfs_trans_res	tr_addafork;	/* add inode attr fork trans */
 	struct xfs_trans_res	tr_writeid;	/* write setuid/setgid file */
 	struct xfs_trans_res	tr_attrinval;	/* attr fork buffer
@@ -100,6 +100,7 @@ struct xfs_trans_resv {
 #define	XFS_ITRUNCATE_LOG_COUNT		2
 #define	XFS_INACTIVE_LOG_COUNT		2
 #define	XFS_CREATE_LOG_COUNT		2
+#define	XFS_CREATE_TMPFILE_LOG_COUNT	2
 #define	XFS_MKDIR_LOG_COUNT		3
 #define	XFS_SYMLINK_LOG_COUNT		3
 #define	XFS_REMOVE_LOG_COUNT		2