aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2014-08-22 04:04:15 -0400
committerIngo Molnar <mingo@kernel.org>2014-08-22 04:04:15 -0400
commit80b304fd00e8b667775ff791121b61ecd7cd0c03 (patch)
treeb4f2ec59fe062c43343ee4c2f10a6bcd0e4dcd1b /fs
parentfb21b84e7f809ef04b1e5aed5d463cf0d4866638 (diff)
parent6a7519e81321343165f89abb8b616df186d3e57a (diff)
Merge tag 'efi-urgent' of git://git.kernel.org/pub/scm/linux/kernel/git/mfleming/efi into x86/urgent
Pull EFI fixes from Matt Fleming: * WARN_ON(!spin_is_locked()) always triggers on non-SMP machines. Swap it for the more canonical lockdep_assert_held() which always does the right thing - Guenter Roeck * Assign the correct value to efi.runtime_version on arm64 so that all the runtime services can be invoked - Semen Protsenko Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/Makefile2
-rw-r--r--fs/adfs/adfs.h1
-rw-r--r--fs/adfs/dir.c2
-rw-r--r--fs/adfs/dir_fplus.c9
-rw-r--r--fs/aio.c86
-rw-r--r--fs/autofs4/autofs_i.h63
-rw-r--r--fs/autofs4/expire.c1
-rw-r--r--fs/autofs4/root.c10
-rw-r--r--fs/bad_inode.c7
-rw-r--r--fs/befs/linuxvfs.c8
-rw-r--r--fs/bfs/bfs.h1
-rw-r--r--fs/bfs/dir.c4
-rw-r--r--fs/bfs/inode.c8
-rw-r--r--fs/btrfs/backref.c14
-rw-r--r--fs/btrfs/btrfs_inode.h6
-rw-r--r--fs/btrfs/ctree.c20
-rw-r--r--fs/btrfs/ctree.h4
-rw-r--r--fs/btrfs/disk-io.c32
-rw-r--r--fs/btrfs/extent-tree.c285
-rw-r--r--fs/btrfs/file-item.c2
-rw-r--r--fs/btrfs/file.c26
-rw-r--r--fs/btrfs/inode.c71
-rw-r--r--fs/btrfs/ordered-data.c123
-rw-r--r--fs/btrfs/ordered-data.h5
-rw-r--r--fs/btrfs/qgroup.c169
-rw-r--r--fs/btrfs/qgroup.h1
-rw-r--r--fs/btrfs/super.c60
-rw-r--r--fs/btrfs/transaction.c33
-rw-r--r--fs/btrfs/transaction.h1
-rw-r--r--fs/btrfs/ulist.h15
-rw-r--r--fs/ceph/acl.c14
-rw-r--r--fs/ceph/caps.c2
-rw-r--r--fs/ceph/file.c24
-rw-r--r--fs/ceph/mds_client.c16
-rw-r--r--fs/ceph/super.c2
-rw-r--r--fs/ceph/xattr.c4
-rw-r--r--fs/cifs/cifs_debug.c2
-rw-r--r--fs/cifs/cifsfs.c2
-rw-r--r--fs/cifs/cifsfs.h6
-rw-r--r--fs/cifs/cifsglob.h19
-rw-r--r--fs/cifs/cifsproto.h4
-rw-r--r--fs/cifs/cifssmb.c119
-rw-r--r--fs/cifs/connect.c8
-rw-r--r--fs/cifs/file.c872
-rw-r--r--fs/cifs/inode.c14
-rw-r--r--fs/cifs/misc.c13
-rw-r--r--fs/cifs/sess.c1192
-rw-r--r--fs/cifs/smb1ops.c8
-rw-r--r--fs/cifs/smb2inode.c2
-rw-r--r--fs/cifs/smb2maperror.c2
-rw-r--r--fs/cifs/smb2misc.c6
-rw-r--r--fs/cifs/smb2ops.c73
-rw-r--r--fs/cifs/smb2pdu.c94
-rw-r--r--fs/cifs/smb2proto.h2
-rw-r--r--fs/cifs/smb2transport.c5
-rw-r--r--fs/cifs/transport.c25
-rw-r--r--fs/coda/cache.c2
-rw-r--r--fs/coda/coda_linux.c2
-rw-r--r--fs/coda/dir.c3
-rw-r--r--fs/coda/file.c2
-rw-r--r--fs/coda/inode.c4
-rw-r--r--fs/coda/pioctl.c2
-rw-r--r--fs/coda/psdev.c2
-rw-r--r--fs/coda/upcall.c2
-rw-r--r--fs/compat_ioctl.c2
-rw-r--r--fs/cramfs/inode.c45
-rw-r--r--fs/cramfs/uncompress.c10
-rw-r--r--fs/dcache.c196
-rw-r--r--fs/debugfs/file.c2
-rw-r--r--fs/debugfs/inode.c39
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/dlm/debug_fs.c15
-rw-r--r--fs/efs/namei.c11
-rw-r--r--fs/exec.c10
-rw-r--r--fs/exofs/ore_raid.c2
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext4/balloc.c1
-rw-r--r--fs/ext4/dir.c25
-rw-r--r--fs/ext4/ext4.h14
-rw-r--r--fs/ext4/extents.c14
-rw-r--r--fs/ext4/file.c4
-rw-r--r--fs/ext4/indirect.c281
-rw-r--r--fs/ext4/inline.c18
-rw-r--r--fs/ext4/inode.c130
-rw-r--r--fs/ext4/mballoc.c41
-rw-r--r--fs/ext4/migrate.c7
-rw-r--r--fs/ext4/move_extent.c3
-rw-r--r--fs/ext4/namei.c1
-rw-r--r--fs/ext4/super.c88
-rw-r--r--fs/f2fs/acl.c6
-rw-r--r--fs/f2fs/checkpoint.c178
-rw-r--r--fs/f2fs/data.c59
-rw-r--r--fs/f2fs/debug.c19
-rw-r--r--fs/f2fs/dir.c87
-rw-r--r--fs/f2fs/f2fs.h50
-rw-r--r--fs/f2fs/file.c45
-rw-r--r--fs/f2fs/gc.c7
-rw-r--r--fs/f2fs/hash.c4
-rw-r--r--fs/f2fs/inline.c1
-rw-r--r--fs/f2fs/inode.c12
-rw-r--r--fs/f2fs/namei.c246
-rw-r--r--fs/f2fs/node.c273
-rw-r--r--fs/f2fs/node.h7
-rw-r--r--fs/f2fs/recovery.c22
-rw-r--r--fs/f2fs/segment.c38
-rw-r--r--fs/f2fs/segment.h8
-rw-r--r--fs/f2fs/super.c21
-rw-r--r--fs/fcntl.c5
-rw-r--r--fs/fs_pin.c78
-rw-r--r--fs/fscache/main.c4
-rw-r--r--fs/fuse/dir.c7
-rw-r--r--fs/fuse/file.c4
-rw-r--r--fs/hostfs/hostfs.h1
-rw-r--r--fs/hostfs/hostfs_kern.c30
-rw-r--r--fs/hostfs/hostfs_user.c28
-rw-r--r--fs/hpfs/dnode.c17
-rw-r--r--fs/inode.c1
-rw-r--r--fs/internal.h7
-rw-r--r--fs/isofs/compress.c4
-rw-r--r--fs/jffs2/acl.c3
-rw-r--r--fs/jffs2/compr_zlib.c7
-rw-r--r--fs/jffs2/xattr.c3
-rw-r--r--fs/kernfs/file.c2
-rw-r--r--fs/lockd/mon.c4
-rw-r--r--fs/locks.c86
-rw-r--r--fs/logfs/readwrite.c15
-rw-r--r--fs/minix/bitmap.c2
-rw-r--r--fs/minix/inode.c4
-rw-r--r--fs/mount.h2
-rw-r--r--fs/namei.c34
-rw-r--r--fs/namespace.c132
-rw-r--r--fs/nfs/blocklayout/blocklayout.c101
-rw-r--r--fs/nfs/callback.c12
-rw-r--r--fs/nfs/client.c113
-rw-r--r--fs/nfs/delegation.c34
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/dir.c208
-rw-r--r--fs/nfs/direct.c33
-rw-r--r--fs/nfs/filelayout/filelayout.c298
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c2
-rw-r--r--fs/nfs/getroot.c2
-rw-r--r--fs/nfs/idmap.c10
-rw-r--r--fs/nfs/inode.c12
-rw-r--r--fs/nfs/internal.h20
-rw-r--r--fs/nfs/netns.h3
-rw-r--r--fs/nfs/nfs3acl.c2
-rw-r--r--fs/nfs/nfs3proc.c21
-rw-r--r--fs/nfs/nfs4_fs.h32
-rw-r--r--fs/nfs/nfs4client.c5
-rw-r--r--fs/nfs/nfs4proc.c248
-rw-r--r--fs/nfs/nfs4state.c69
-rw-r--r--fs/nfs/nfs4trace.h28
-rw-r--r--fs/nfs/nfs4xdr.c2
-rw-r--r--fs/nfs/objlayout/objio_osd.c24
-rw-r--r--fs/nfs/objlayout/objlayout.c81
-rw-r--r--fs/nfs/objlayout/objlayout.h8
-rw-r--r--fs/nfs/pagelist.c276
-rw-r--r--fs/nfs/pnfs.c178
-rw-r--r--fs/nfs/pnfs.h45
-rw-r--r--fs/nfs/proc.c27
-rw-r--r--fs/nfs/read.c54
-rw-r--r--fs/nfs/super.c12
-rw-r--r--fs/nfs/write.c150
-rw-r--r--fs/nfs_common/nfsacl.c5
-rw-r--r--fs/nfsd/acl.h2
-rw-r--r--fs/nfsd/auth.c2
-rw-r--r--fs/nfsd/export.c6
-rw-r--r--fs/nfsd/export.h3
-rw-r--r--fs/nfsd/fault_inject.c138
-rw-r--r--fs/nfsd/netns.h23
-rw-r--r--fs/nfsd/nfs2acl.c8
-rw-r--r--fs/nfsd/nfs3acl.c8
-rw-r--r--fs/nfsd/nfs3proc.c9
-rw-r--r--fs/nfsd/nfs3xdr.c30
-rw-r--r--fs/nfsd/nfs4acl.c39
-rw-r--r--fs/nfsd/nfs4callback.c32
-rw-r--r--fs/nfsd/nfs4proc.c53
-rw-r--r--fs/nfsd/nfs4state.c3096
-rw-r--r--fs/nfsd/nfs4xdr.c128
-rw-r--r--fs/nfsd/nfscache.c13
-rw-r--r--fs/nfsd/nfsctl.c51
-rw-r--r--fs/nfsd/nfsfh.c12
-rw-r--r--fs/nfsd/nfsfh.h15
-rw-r--r--fs/nfsd/nfsproc.c13
-rw-r--r--fs/nfsd/nfssvc.c21
-rw-r--r--fs/nfsd/nfsxdr.c14
-rw-r--r--fs/nfsd/state.h220
-rw-r--r--fs/nfsd/vfs.c48
-rw-r--r--fs/nfsd/vfs.h8
-rw-r--r--fs/nfsd/xdr4.h30
-rw-r--r--fs/nilfs2/Makefile2
-rw-r--r--fs/nilfs2/nilfs.h8
-rw-r--r--fs/nilfs2/super.c11
-rw-r--r--fs/nilfs2/sysfs.c1137
-rw-r--r--fs/nilfs2/sysfs.h176
-rw-r--r--fs/nilfs2/the_nilfs.c17
-rw-r--r--fs/nilfs2/the_nilfs.h20
-rw-r--r--fs/notify/fanotify/fanotify.c11
-rw-r--r--fs/notify/fanotify/fanotify_user.c14
-rw-r--r--fs/notify/inode_mark.c2
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c2
-rw-r--r--fs/notify/inotify/inotify_user.c4
-rw-r--r--fs/notify/notification.c37
-rw-r--r--fs/notify/vfsmount_mark.c2
-rw-r--r--fs/ntfs/file.c3
-rw-r--r--fs/ocfs2/alloc.c15
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c5
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c4
-rw-r--r--fs/ocfs2/move_extents.c2
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/ocfs2/slot_map.c2
-rw-r--r--fs/omfs/inode.c2
-rw-r--r--fs/proc/Makefile1
-rw-r--r--fs/proc/array.c18
-rw-r--r--fs/proc/base.c199
-rw-r--r--fs/proc/fd.c2
-rw-r--r--fs/proc/generic.c32
-rw-r--r--fs/proc/inode.c7
-rw-r--r--fs/proc/internal.h15
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/meminfo.c2
-rw-r--r--fs/proc/proc_net.c4
-rw-r--r--fs/proc/proc_sysctl.c2
-rw-r--r--fs/proc/proc_tty.c4
-rw-r--r--fs/proc/root.c7
-rw-r--r--fs/proc/task_mmu.c27
-rw-r--r--fs/proc/thread_self.c85
-rw-r--r--fs/proc/vmcore.c82
-rw-r--r--fs/proc_namespace.c8
-rw-r--r--fs/pstore/ram_core.c2
-rw-r--r--fs/qnx6/Makefile1
-rw-r--r--fs/qnx6/dir.c26
-rw-r--r--fs/qnx6/inode.c99
-rw-r--r--fs/qnx6/namei.c6
-rw-r--r--fs/qnx6/qnx6.h12
-rw-r--r--fs/qnx6/super_mmi.c22
-rw-r--r--fs/quota/dquot.c180
-rw-r--r--fs/quota/kqid.c2
-rw-r--r--fs/quota/netlink.c3
-rw-r--r--fs/quota/quota.c6
-rw-r--r--fs/ramfs/file-nommu.c2
-rw-r--r--fs/reiserfs/dir.c2
-rw-r--r--fs/reiserfs/do_balan.c113
-rw-r--r--fs/reiserfs/file.c2
-rw-r--r--fs/reiserfs/ibalance.c2
-rw-r--r--fs/reiserfs/inode.c2
-rw-r--r--fs/reiserfs/ioctl.c2
-rw-r--r--fs/reiserfs/item_ops.c4
-rw-r--r--fs/reiserfs/journal.c22
-rw-r--r--fs/reiserfs/lbalance.c7
-rw-r--r--fs/reiserfs/prints.c4
-rw-r--r--fs/reiserfs/procfs.c2
-rw-r--r--fs/reiserfs/reiserfs.h9
-rw-r--r--fs/reiserfs/stree.c2
-rw-r--r--fs/reiserfs/super.c15
-rw-r--r--fs/reiserfs/xattr.c22
-rw-r--r--fs/reiserfs/xattr_acl.c2
-rw-r--r--fs/reiserfs/xattr_security.c2
-rw-r--r--fs/reiserfs/xattr_trusted.c2
-rw-r--r--fs/reiserfs/xattr_user.c2
-rw-r--r--fs/romfs/super.c23
-rw-r--r--fs/squashfs/file_direct.c2
-rw-r--r--fs/squashfs/super.c5
-rw-r--r--fs/super.c20
-rw-r--r--fs/timerfd.c77
-rw-r--r--fs/ubifs/commit.c2
-rw-r--r--fs/ubifs/io.c2
-rw-r--r--fs/ubifs/log.c12
-rw-r--r--fs/ubifs/lpt.c5
-rw-r--r--fs/ubifs/lpt_commit.c7
-rw-r--r--fs/ubifs/master.c7
-rw-r--r--fs/ubifs/orphan.c1
-rw-r--r--fs/ubifs/recovery.c5
-rw-r--r--fs/ubifs/sb.c4
-rw-r--r--fs/ubifs/scan.c14
-rw-r--r--fs/ubifs/super.c19
-rw-r--r--fs/ubifs/tnc.c1
-rw-r--r--fs/ubifs/tnc_commit.c1
-rw-r--r--fs/ubifs/ubifs.h4
-rw-r--r--fs/udf/file.c22
-rw-r--r--fs/udf/lowlevel.c2
-rw-r--r--fs/udf/super.c2
-rw-r--r--fs/udf/symlink.c2
-rw-r--r--fs/udf/unicode.c9
-rw-r--r--fs/ufs/Makefile1
-rw-r--r--fs/ufs/inode.c32
-rw-r--r--fs/ufs/super.c304
-rw-r--r--fs/ufs/ufs.h10
-rw-r--r--fs/xfs/Kconfig1
-rw-r--r--fs/xfs/Makefile71
-rw-r--r--fs/xfs/libxfs/xfs_ag.h (renamed from fs/xfs/xfs_ag.h)0
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c (renamed from fs/xfs/xfs_alloc.c)20
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h (renamed from fs/xfs/xfs_alloc.h)0
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c (renamed from fs/xfs/xfs_alloc_btree.c)6
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.h (renamed from fs/xfs/xfs_alloc_btree.h)0
-rw-r--r--fs/xfs/libxfs/xfs_attr.c (renamed from fs/xfs/xfs_attr.c)92
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c (renamed from fs/xfs/xfs_attr_leaf.c)78
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h (renamed from fs/xfs/xfs_attr_leaf.h)0
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c (renamed from fs/xfs/xfs_attr_remote.c)22
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.h (renamed from fs/xfs/xfs_attr_remote.h)0
-rw-r--r--fs/xfs/libxfs/xfs_attr_sf.h (renamed from fs/xfs/xfs_attr_sf.h)0
-rw-r--r--fs/xfs/libxfs/xfs_bit.h (renamed from fs/xfs/xfs_bit.h)0
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c (renamed from fs/xfs/xfs_bmap.c)60
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h (renamed from fs/xfs/xfs_bmap.h)0
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c (renamed from fs/xfs/xfs_bmap_btree.c)99
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h (renamed from fs/xfs/xfs_bmap_btree.h)0
-rw-r--r--fs/xfs/libxfs/xfs_btree.c (renamed from fs/xfs/xfs_btree.c)46
-rw-r--r--fs/xfs/libxfs/xfs_btree.h (renamed from fs/xfs/xfs_btree.h)2
-rw-r--r--fs/xfs/libxfs/xfs_cksum.h (renamed from fs/xfs/xfs_cksum.h)0
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c (renamed from fs/xfs/xfs_da_btree.c)112
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h (renamed from fs/xfs/xfs_da_btree.h)0
-rw-r--r--fs/xfs/libxfs/xfs_da_format.c (renamed from fs/xfs/xfs_da_format.c)0
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h (renamed from fs/xfs/xfs_da_format.h)0
-rw-r--r--fs/xfs/libxfs/xfs_dinode.h (renamed from fs/xfs/xfs_dinode.h)0
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c (renamed from fs/xfs/xfs_dir2.c)24
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h (renamed from fs/xfs/xfs_dir2.h)0
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c (renamed from fs/xfs/xfs_dir2_block.c)18
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c (renamed from fs/xfs/xfs_dir2_data.c)10
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c (renamed from fs/xfs/xfs_dir2_leaf.c)24
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c (renamed from fs/xfs/xfs_dir2_node.c)40
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h (renamed from fs/xfs/xfs_dir2_priv.h)0
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c (renamed from fs/xfs/xfs_dir2_sf.c)75
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c (renamed from fs/xfs/xfs_dquot_buf.c)6
-rw-r--r--fs/xfs/libxfs/xfs_format.h (renamed from fs/xfs/xfs_format.h)14
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c (renamed from fs/xfs/xfs_ialloc.c)34
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h (renamed from fs/xfs/xfs_ialloc.h)0
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c (renamed from fs/xfs/xfs_ialloc_btree.c)6
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h (renamed from fs/xfs/xfs_ialloc_btree.h)0
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c (renamed from fs/xfs/xfs_inode_buf.c)10
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h (renamed from fs/xfs/xfs_inode_buf.h)0
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c (renamed from fs/xfs/xfs_inode_fork.c)36
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h (renamed from fs/xfs/xfs_inode_fork.h)0
-rw-r--r--fs/xfs/libxfs/xfs_inum.h (renamed from fs/xfs/xfs_inum.h)4
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h (renamed from fs/xfs/xfs_log_format.h)4
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h (renamed from fs/xfs/xfs_log_recover.h)0
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c (renamed from fs/xfs/xfs_log_rlimit.c)0
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h (renamed from fs/xfs/xfs_quota_defs.h)2
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c (renamed from fs/xfs/xfs_rtbitmap.c)0
-rw-r--r--fs/xfs/libxfs/xfs_sb.c (renamed from fs/xfs/xfs_sb.c)56
-rw-r--r--fs/xfs/libxfs/xfs_sb.h (renamed from fs/xfs/xfs_sb.h)8
-rw-r--r--fs/xfs/libxfs/xfs_shared.h (renamed from fs/xfs/xfs_shared.h)0
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c (renamed from fs/xfs/xfs_symlink_remote.c)6
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c (renamed from fs/xfs/xfs_trans_resv.c)0
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h (renamed from fs/xfs/xfs_trans_resv.h)0
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h (renamed from fs/xfs/xfs_trans_space.h)0
-rw-r--r--fs/xfs/xfs_acl.c8
-rw-r--r--fs/xfs/xfs_aops.c18
-rw-r--r--fs/xfs/xfs_attr_inactive.c22
-rw-r--r--fs/xfs/xfs_attr_list.c38
-rw-r--r--fs/xfs/xfs_bmap_util.c174
-rw-r--r--fs/xfs/xfs_buf.c40
-rw-r--r--fs/xfs/xfs_buf.h2
-rw-r--r--fs/xfs/xfs_buf_item.c4
-rw-r--r--fs/xfs/xfs_dir2_readdir.c4
-rw-r--r--fs/xfs/xfs_discard.c18
-rw-r--r--fs/xfs/xfs_dquot.c41
-rw-r--r--fs/xfs/xfs_dquot.h15
-rw-r--r--fs/xfs/xfs_error.c25
-rw-r--r--fs/xfs/xfs_error.h13
-rw-r--r--fs/xfs/xfs_export.c10
-rw-r--r--fs/xfs/xfs_extfree_item.c2
-rw-r--r--fs/xfs/xfs_file.c75
-rw-r--r--fs/xfs/xfs_filestream.c4
-rw-r--r--fs/xfs/xfs_fs.h7
-rw-r--r--fs/xfs/xfs_fsops.c42
-rw-r--r--fs/xfs/xfs_icache.c148
-rw-r--r--fs/xfs/xfs_icache.h13
-rw-r--r--fs/xfs/xfs_inode.c68
-rw-r--r--fs/xfs/xfs_inode.h10
-rw-r--r--fs/xfs/xfs_inode_item.c2
-rw-r--r--fs/xfs/xfs_ioctl.c266
-rw-r--r--fs/xfs/xfs_ioctl32.c111
-rw-r--r--fs/xfs/xfs_iomap.c54
-rw-r--r--fs/xfs/xfs_iops.c72
-rw-r--r--fs/xfs/xfs_itable.c579
-rw-r--r--fs/xfs/xfs_itable.h23
-rw-r--r--fs/xfs/xfs_linux.h27
-rw-r--r--fs/xfs/xfs_log.c69
-rw-r--r--fs/xfs/xfs_log_cil.c8
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_log_recover.c284
-rw-r--r--fs/xfs/xfs_mount.c97
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_mru_cache.c14
-rw-r--r--fs/xfs/xfs_qm.c229
-rw-r--r--fs/xfs/xfs_qm.h1
-rw-r--r--fs/xfs/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c46
-rw-r--r--fs/xfs/xfs_quotaops.c20
-rw-r--r--fs/xfs/xfs_rtalloc.c24
-rw-r--r--fs/xfs/xfs_rtalloc.h2
-rw-r--r--fs/xfs/xfs_super.c132
-rw-r--r--fs/xfs/xfs_super.h15
-rw-r--r--fs/xfs/xfs_symlink.c30
-rw-r--r--fs/xfs/xfs_sysfs.c165
-rw-r--r--fs/xfs/xfs_sysfs.h59
-rw-r--r--fs/xfs/xfs_trans.c10
-rw-r--r--fs/xfs/xfs_trans_ail.c4
-rw-r--r--fs/xfs/xfs_trans_buf.c37
-rw-r--r--fs/xfs/xfs_trans_dquot.c4
-rw-r--r--fs/xfs/xfs_types.h29
-rw-r--r--fs/xfs/xfs_vnode.h46
-rw-r--r--fs/xfs/xfs_xattr.c6
403 files changed, 12433 insertions, 7255 deletions
diff --git a/fs/Makefile b/fs/Makefile
index 4030cbfbc9af..90c88529892b 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
11 attr.o bad_inode.o file.o filesystems.o namespace.o \ 11 attr.o bad_inode.o file.o filesystems.o namespace.o \
12 seq_file.o xattr.o libfs.o fs-writeback.o \ 12 seq_file.o xattr.o libfs.o fs-writeback.o \
13 pnode.o splice.o sync.o utimes.o \ 13 pnode.o splice.o sync.o utimes.o \
14 stack.o fs_struct.o statfs.o 14 stack.o fs_struct.o statfs.o fs_pin.o
15 15
16ifeq ($(CONFIG_BLOCK),y) 16ifeq ($(CONFIG_BLOCK),y)
17obj-y += buffer.o block_dev.o direct-io.o mpage.o 17obj-y += buffer.o block_dev.o direct-io.o mpage.o
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index c770337c4b45..24575d9d882d 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -153,6 +153,7 @@ extern int adfs_map_lookup(struct super_block *sb, unsigned int frag_id, unsigne
153extern unsigned int adfs_map_free(struct super_block *sb); 153extern unsigned int adfs_map_free(struct super_block *sb);
154 154
155/* Misc */ 155/* Misc */
156__printf(3, 4)
156void __adfs_error(struct super_block *sb, const char *function, 157void __adfs_error(struct super_block *sb, const char *function,
157 const char *fmt, ...); 158 const char *fmt, ...);
158#define adfs_error(sb, fmt...) __adfs_error(sb, __func__, fmt) 159#define adfs_error(sb, fmt...) __adfs_error(sb, __func__, fmt)
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 0d138c0de293..51c279a29845 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -138,7 +138,7 @@ adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_inf
138 goto out; 138 goto out;
139 139
140 if (ADFS_I(inode)->parent_id != dir.parent_id) { 140 if (ADFS_I(inode)->parent_id != dir.parent_id) {
141 adfs_error(sb, "parent directory changed under me! (%lx but got %lx)\n", 141 adfs_error(sb, "parent directory changed under me! (%lx but got %x)\n",
142 ADFS_I(inode)->parent_id, dir.parent_id); 142 ADFS_I(inode)->parent_id, dir.parent_id);
143 ret = -EIO; 143 ret = -EIO;
144 goto free_out; 144 goto free_out;
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index d9e3bee4e653..f2ba88ab4aed 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -55,10 +55,10 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
55 } 55 }
56 56
57 size >>= sb->s_blocksize_bits; 57 size >>= sb->s_blocksize_bits;
58 if (size > sizeof(dir->bh)/sizeof(dir->bh[0])) { 58 if (size > ARRAY_SIZE(dir->bh)) {
59 /* this directory is too big for fixed bh set, must allocate */ 59 /* this directory is too big for fixed bh set, must allocate */
60 struct buffer_head **bh_fplus = 60 struct buffer_head **bh_fplus =
61 kzalloc(size * sizeof(struct buffer_head *), 61 kcalloc(size, sizeof(struct buffer_head *),
62 GFP_KERNEL); 62 GFP_KERNEL);
63 if (!bh_fplus) { 63 if (!bh_fplus) {
64 adfs_error(sb, "not enough memory for" 64 adfs_error(sb, "not enough memory for"
@@ -79,9 +79,8 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
79 79
80 dir->bh_fplus[blk] = sb_bread(sb, block); 80 dir->bh_fplus[blk] = sb_bread(sb, block);
81 if (!dir->bh_fplus[blk]) { 81 if (!dir->bh_fplus[blk]) {
82 adfs_error(sb, "dir object %X failed read for" 82 adfs_error(sb, "dir object %x failed read for offset %d, mapped block %lX",
83 " offset %d, mapped block %X", 83 id, blk, block);
84 id, blk, block);
85 goto out; 84 goto out;
86 } 85 }
87 86
diff --git a/fs/aio.c b/fs/aio.c
index bd7ec2cc2674..ae635872affb 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -192,7 +192,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
192 } 192 }
193 193
194 file->f_flags = O_RDWR; 194 file->f_flags = O_RDWR;
195 file->private_data = ctx;
196 return file; 195 return file;
197} 196}
198 197
@@ -202,7 +201,7 @@ static struct dentry *aio_mount(struct file_system_type *fs_type,
202 static const struct dentry_operations ops = { 201 static const struct dentry_operations ops = {
203 .d_dname = simple_dname, 202 .d_dname = simple_dname,
204 }; 203 };
205 return mount_pseudo(fs_type, "aio:", NULL, &ops, 0xa10a10a1); 204 return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);
206} 205}
207 206
208/* aio_setup 207/* aio_setup
@@ -556,8 +555,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
556 struct aio_ring *ring; 555 struct aio_ring *ring;
557 556
558 spin_lock(&mm->ioctx_lock); 557 spin_lock(&mm->ioctx_lock);
559 rcu_read_lock(); 558 table = rcu_dereference_raw(mm->ioctx_table);
560 table = rcu_dereference(mm->ioctx_table);
561 559
562 while (1) { 560 while (1) {
563 if (table) 561 if (table)
@@ -565,7 +563,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
565 if (!table->table[i]) { 563 if (!table->table[i]) {
566 ctx->id = i; 564 ctx->id = i;
567 table->table[i] = ctx; 565 table->table[i] = ctx;
568 rcu_read_unlock();
569 spin_unlock(&mm->ioctx_lock); 566 spin_unlock(&mm->ioctx_lock);
570 567
571 /* While kioctx setup is in progress, 568 /* While kioctx setup is in progress,
@@ -579,8 +576,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
579 } 576 }
580 577
581 new_nr = (table ? table->nr : 1) * 4; 578 new_nr = (table ? table->nr : 1) * 4;
582
583 rcu_read_unlock();
584 spin_unlock(&mm->ioctx_lock); 579 spin_unlock(&mm->ioctx_lock);
585 580
586 table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * 581 table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
@@ -591,8 +586,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
591 table->nr = new_nr; 586 table->nr = new_nr;
592 587
593 spin_lock(&mm->ioctx_lock); 588 spin_lock(&mm->ioctx_lock);
594 rcu_read_lock(); 589 old = rcu_dereference_raw(mm->ioctx_table);
595 old = rcu_dereference(mm->ioctx_table);
596 590
597 if (!old) { 591 if (!old) {
598 rcu_assign_pointer(mm->ioctx_table, table); 592 rcu_assign_pointer(mm->ioctx_table, table);
@@ -739,12 +733,9 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
739 733
740 734
741 spin_lock(&mm->ioctx_lock); 735 spin_lock(&mm->ioctx_lock);
742 rcu_read_lock(); 736 table = rcu_dereference_raw(mm->ioctx_table);
743 table = rcu_dereference(mm->ioctx_table);
744
745 WARN_ON(ctx != table->table[ctx->id]); 737 WARN_ON(ctx != table->table[ctx->id]);
746 table->table[ctx->id] = NULL; 738 table->table[ctx->id] = NULL;
747 rcu_read_unlock();
748 spin_unlock(&mm->ioctx_lock); 739 spin_unlock(&mm->ioctx_lock);
749 740
750 /* percpu_ref_kill() will do the necessary call_rcu() */ 741 /* percpu_ref_kill() will do the necessary call_rcu() */
@@ -793,40 +784,30 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
793 */ 784 */
794void exit_aio(struct mm_struct *mm) 785void exit_aio(struct mm_struct *mm)
795{ 786{
796 struct kioctx_table *table; 787 struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
797 struct kioctx *ctx; 788 int i;
798 unsigned i = 0;
799
800 while (1) {
801 rcu_read_lock();
802 table = rcu_dereference(mm->ioctx_table);
803
804 do {
805 if (!table || i >= table->nr) {
806 rcu_read_unlock();
807 rcu_assign_pointer(mm->ioctx_table, NULL);
808 if (table)
809 kfree(table);
810 return;
811 }
812 789
813 ctx = table->table[i++]; 790 if (!table)
814 } while (!ctx); 791 return;
815 792
816 rcu_read_unlock(); 793 for (i = 0; i < table->nr; ++i) {
794 struct kioctx *ctx = table->table[i];
817 795
796 if (!ctx)
797 continue;
818 /* 798 /*
819 * We don't need to bother with munmap() here - 799 * We don't need to bother with munmap() here - exit_mmap(mm)
820 * exit_mmap(mm) is coming and it'll unmap everything. 800 * is coming and it'll unmap everything. And we simply can't,
821 * Since aio_free_ring() uses non-zero ->mmap_size 801 * this is not necessarily our ->mm.
822 * as indicator that it needs to unmap the area, 802 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
823 * just set it to 0; aio_free_ring() is the only 803 * that it needs to unmap the area, just set it to 0.
824 * place that uses ->mmap_size, so it's safe.
825 */ 804 */
826 ctx->mmap_size = 0; 805 ctx->mmap_size = 0;
827
828 kill_ioctx(mm, ctx, NULL); 806 kill_ioctx(mm, ctx, NULL);
829 } 807 }
808
809 RCU_INIT_POINTER(mm->ioctx_table, NULL);
810 kfree(table);
830} 811}
831 812
832static void put_reqs_available(struct kioctx *ctx, unsigned nr) 813static void put_reqs_available(struct kioctx *ctx, unsigned nr)
@@ -834,10 +815,8 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
834 struct kioctx_cpu *kcpu; 815 struct kioctx_cpu *kcpu;
835 unsigned long flags; 816 unsigned long flags;
836 817
837 preempt_disable();
838 kcpu = this_cpu_ptr(ctx->cpu);
839
840 local_irq_save(flags); 818 local_irq_save(flags);
819 kcpu = this_cpu_ptr(ctx->cpu);
841 kcpu->reqs_available += nr; 820 kcpu->reqs_available += nr;
842 821
843 while (kcpu->reqs_available >= ctx->req_batch * 2) { 822 while (kcpu->reqs_available >= ctx->req_batch * 2) {
@@ -846,7 +825,6 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
846 } 825 }
847 826
848 local_irq_restore(flags); 827 local_irq_restore(flags);
849 preempt_enable();
850} 828}
851 829
852static bool get_reqs_available(struct kioctx *ctx) 830static bool get_reqs_available(struct kioctx *ctx)
@@ -855,10 +833,8 @@ static bool get_reqs_available(struct kioctx *ctx)
855 bool ret = false; 833 bool ret = false;
856 unsigned long flags; 834 unsigned long flags;
857 835
858 preempt_disable();
859 kcpu = this_cpu_ptr(ctx->cpu);
860
861 local_irq_save(flags); 836 local_irq_save(flags);
837 kcpu = this_cpu_ptr(ctx->cpu);
862 if (!kcpu->reqs_available) { 838 if (!kcpu->reqs_available) {
863 int old, avail = atomic_read(&ctx->reqs_available); 839 int old, avail = atomic_read(&ctx->reqs_available);
864 840
@@ -878,7 +854,6 @@ static bool get_reqs_available(struct kioctx *ctx)
878 kcpu->reqs_available--; 854 kcpu->reqs_available--;
879out: 855out:
880 local_irq_restore(flags); 856 local_irq_restore(flags);
881 preempt_enable();
882 return ret; 857 return ret;
883} 858}
884 859
@@ -1047,7 +1022,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
1047} 1022}
1048EXPORT_SYMBOL(aio_complete); 1023EXPORT_SYMBOL(aio_complete);
1049 1024
1050/* aio_read_events 1025/* aio_read_events_ring
1051 * Pull an event off of the ioctx's event ring. Returns the number of 1026 * Pull an event off of the ioctx's event ring. Returns the number of
1052 * events fetched 1027 * events fetched
1053 */ 1028 */
@@ -1270,12 +1245,12 @@ static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
1270 if (compat) 1245 if (compat)
1271 ret = compat_rw_copy_check_uvector(rw, 1246 ret = compat_rw_copy_check_uvector(rw,
1272 (struct compat_iovec __user *)buf, 1247 (struct compat_iovec __user *)buf,
1273 *nr_segs, 1, *iovec, iovec); 1248 *nr_segs, UIO_FASTIOV, *iovec, iovec);
1274 else 1249 else
1275#endif 1250#endif
1276 ret = rw_copy_check_uvector(rw, 1251 ret = rw_copy_check_uvector(rw,
1277 (struct iovec __user *)buf, 1252 (struct iovec __user *)buf,
1278 *nr_segs, 1, *iovec, iovec); 1253 *nr_segs, UIO_FASTIOV, *iovec, iovec);
1279 if (ret < 0) 1254 if (ret < 0)
1280 return ret; 1255 return ret;
1281 1256
@@ -1299,9 +1274,8 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
1299} 1274}
1300 1275
1301/* 1276/*
1302 * aio_setup_iocb: 1277 * aio_run_iocb:
1303 * Performs the initial checks and aio retry method 1278 * Performs the initial checks and io submission.
1304 * setup for the kiocb at the time of io submission.
1305 */ 1279 */
1306static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, 1280static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
1307 char __user *buf, bool compat) 1281 char __user *buf, bool compat)
@@ -1313,7 +1287,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
1313 fmode_t mode; 1287 fmode_t mode;
1314 aio_rw_op *rw_op; 1288 aio_rw_op *rw_op;
1315 rw_iter_op *iter_op; 1289 rw_iter_op *iter_op;
1316 struct iovec inline_vec, *iovec = &inline_vec; 1290 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
1317 struct iov_iter iter; 1291 struct iov_iter iter;
1318 1292
1319 switch (opcode) { 1293 switch (opcode) {
@@ -1348,7 +1322,7 @@ rw_common:
1348 if (!ret) 1322 if (!ret)
1349 ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); 1323 ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
1350 if (ret < 0) { 1324 if (ret < 0) {
1351 if (iovec != &inline_vec) 1325 if (iovec != inline_vecs)
1352 kfree(iovec); 1326 kfree(iovec);
1353 return ret; 1327 return ret;
1354 } 1328 }
@@ -1395,7 +1369,7 @@ rw_common:
1395 return -EINVAL; 1369 return -EINVAL;
1396 } 1370 }
1397 1371
1398 if (iovec != &inline_vec) 1372 if (iovec != inline_vecs)
1399 kfree(iovec); 1373 kfree(iovec);
1400 1374
1401 if (ret != -EIOCBQUEUED) { 1375 if (ret != -EIOCBQUEUED) {
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index acf32054edd8..9e359fb20c0a 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -143,20 +143,6 @@ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
143 return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; 143 return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
144} 144}
145 145
146/* Does a dentry have some pending activity? */
147static inline int autofs4_ispending(struct dentry *dentry)
148{
149 struct autofs_info *inf = autofs4_dentry_ino(dentry);
150
151 if (inf->flags & AUTOFS_INF_PENDING)
152 return 1;
153
154 if (inf->flags & AUTOFS_INF_EXPIRING)
155 return 1;
156
157 return 0;
158}
159
160struct inode *autofs4_get_inode(struct super_block *, umode_t); 146struct inode *autofs4_get_inode(struct super_block *, umode_t);
161void autofs4_free_ino(struct autofs_info *); 147void autofs4_free_ino(struct autofs_info *);
162 148
@@ -191,55 +177,6 @@ extern const struct file_operations autofs4_root_operations;
191extern const struct dentry_operations autofs4_dentry_operations; 177extern const struct dentry_operations autofs4_dentry_operations;
192 178
193/* VFS automount flags management functions */ 179/* VFS automount flags management functions */
194
195static inline void __managed_dentry_set_automount(struct dentry *dentry)
196{
197 dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
198}
199
200static inline void managed_dentry_set_automount(struct dentry *dentry)
201{
202 spin_lock(&dentry->d_lock);
203 __managed_dentry_set_automount(dentry);
204 spin_unlock(&dentry->d_lock);
205}
206
207static inline void __managed_dentry_clear_automount(struct dentry *dentry)
208{
209 dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT;
210}
211
212static inline void managed_dentry_clear_automount(struct dentry *dentry)
213{
214 spin_lock(&dentry->d_lock);
215 __managed_dentry_clear_automount(dentry);
216 spin_unlock(&dentry->d_lock);
217}
218
219static inline void __managed_dentry_set_transit(struct dentry *dentry)
220{
221 dentry->d_flags |= DCACHE_MANAGE_TRANSIT;
222}
223
224static inline void managed_dentry_set_transit(struct dentry *dentry)
225{
226 spin_lock(&dentry->d_lock);
227 __managed_dentry_set_transit(dentry);
228 spin_unlock(&dentry->d_lock);
229}
230
231static inline void __managed_dentry_clear_transit(struct dentry *dentry)
232{
233 dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT;
234}
235
236static inline void managed_dentry_clear_transit(struct dentry *dentry)
237{
238 spin_lock(&dentry->d_lock);
239 __managed_dentry_clear_transit(dentry);
240 spin_unlock(&dentry->d_lock);
241}
242
243static inline void __managed_dentry_set_managed(struct dentry *dentry) 180static inline void __managed_dentry_set_managed(struct dentry *dentry)
244{ 181{
245 dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); 182 dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 394e90b02c5e..a7be57e39be7 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -333,7 +333,6 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
333 if (ino->flags & AUTOFS_INF_PENDING) 333 if (ino->flags & AUTOFS_INF_PENDING)
334 goto out; 334 goto out;
335 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { 335 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
336 struct autofs_info *ino = autofs4_dentry_ino(root);
337 ino->flags |= AUTOFS_INF_EXPIRING; 336 ino->flags |= AUTOFS_INF_EXPIRING;
338 init_completion(&ino->expire_complete); 337 init_completion(&ino->expire_complete);
339 spin_unlock(&sbi->fs_lock); 338 spin_unlock(&sbi->fs_lock);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index cc87c1abac97..cdb25ebccc4c 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -166,8 +166,10 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
166 const unsigned char *str = name->name; 166 const unsigned char *str = name->name;
167 struct list_head *p, *head; 167 struct list_head *p, *head;
168 168
169 spin_lock(&sbi->lookup_lock);
170 head = &sbi->active_list; 169 head = &sbi->active_list;
170 if (list_empty(head))
171 return NULL;
172 spin_lock(&sbi->lookup_lock);
171 list_for_each(p, head) { 173 list_for_each(p, head) {
172 struct autofs_info *ino; 174 struct autofs_info *ino;
173 struct dentry *active; 175 struct dentry *active;
@@ -218,8 +220,10 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
218 const unsigned char *str = name->name; 220 const unsigned char *str = name->name;
219 struct list_head *p, *head; 221 struct list_head *p, *head;
220 222
221 spin_lock(&sbi->lookup_lock);
222 head = &sbi->expiring_list; 223 head = &sbi->expiring_list;
224 if (list_empty(head))
225 return NULL;
226 spin_lock(&sbi->lookup_lock);
223 list_for_each(p, head) { 227 list_for_each(p, head) {
224 struct autofs_info *ino; 228 struct autofs_info *ino;
225 struct dentry *expiring; 229 struct dentry *expiring;
@@ -373,7 +377,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
373 * this because the leaves of the directory tree under the 377 * this because the leaves of the directory tree under the
374 * mount never trigger mounts themselves (they have an autofs 378 * mount never trigger mounts themselves (they have an autofs
375 * trigger mount mounted on them). But v4 pseudo direct mounts 379 * trigger mount mounted on them). But v4 pseudo direct mounts
376 * do need the leaves to to trigger mounts. In this case we 380 * do need the leaves to trigger mounts. In this case we
377 * have no choice but to use the list_empty() check and 381 * have no choice but to use the list_empty() check and
378 * require user space behave. 382 * require user space behave.
379 */ 383 */
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 7c93953030fb..afd2b4408adf 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -218,8 +218,9 @@ static int bad_inode_mknod (struct inode *dir, struct dentry *dentry,
218 return -EIO; 218 return -EIO;
219} 219}
220 220
221static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry, 221static int bad_inode_rename2(struct inode *old_dir, struct dentry *old_dentry,
222 struct inode *new_dir, struct dentry *new_dentry) 222 struct inode *new_dir, struct dentry *new_dentry,
223 unsigned int flags)
223{ 224{
224 return -EIO; 225 return -EIO;
225} 226}
@@ -279,7 +280,7 @@ static const struct inode_operations bad_inode_ops =
279 .mkdir = bad_inode_mkdir, 280 .mkdir = bad_inode_mkdir,
280 .rmdir = bad_inode_rmdir, 281 .rmdir = bad_inode_rmdir,
281 .mknod = bad_inode_mknod, 282 .mknod = bad_inode_mknod,
282 .rename = bad_inode_rename, 283 .rename2 = bad_inode_rename2,
283 .readlink = bad_inode_readlink, 284 .readlink = bad_inode_readlink,
284 /* follow_link must be no-op, otherwise unmounting this inode 285 /* follow_link must be no-op, otherwise unmounting this inode
285 won't work */ 286 won't work */
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index a16fbd4e8241..4cf61ec6b7a8 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -799,13 +799,11 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
799 799
800 befs_debug(sb, "---> %s", __func__); 800 befs_debug(sb, "---> %s", __func__);
801 801
802#ifndef CONFIG_BEFS_RW
803 if (!(sb->s_flags & MS_RDONLY)) { 802 if (!(sb->s_flags & MS_RDONLY)) {
804 befs_warning(sb, 803 befs_warning(sb,
805 "No write support. Marking filesystem read-only"); 804 "No write support. Marking filesystem read-only");
806 sb->s_flags |= MS_RDONLY; 805 sb->s_flags |= MS_RDONLY;
807 } 806 }
808#endif /* CONFIG_BEFS_RW */
809 807
810 /* 808 /*
811 * Set dummy blocksize to read super block. 809 * Set dummy blocksize to read super block.
@@ -834,16 +832,14 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
834 (befs_super_block *) ((void *) bh->b_data + x86_sb_off); 832 (befs_super_block *) ((void *) bh->b_data + x86_sb_off);
835 } 833 }
836 834
837 if (befs_load_sb(sb, disk_sb) != BEFS_OK) 835 if ((befs_load_sb(sb, disk_sb) != BEFS_OK) ||
836 (befs_check_sb(sb) != BEFS_OK))
838 goto unacquire_bh; 837 goto unacquire_bh;
839 838
840 befs_dump_super_block(sb, disk_sb); 839 befs_dump_super_block(sb, disk_sb);
841 840
842 brelse(bh); 841 brelse(bh);
843 842
844 if (befs_check_sb(sb) != BEFS_OK)
845 goto unacquire_priv_sbp;
846
847 if( befs_sb->num_blocks > ~((sector_t)0) ) { 843 if( befs_sb->num_blocks > ~((sector_t)0) ) {
848 befs_error(sb, "blocks count: %llu " 844 befs_error(sb, "blocks count: %llu "
849 "is larger than the host can use", 845 "is larger than the host can use",
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index f7f87e233dd9..f40006db36df 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -46,6 +46,7 @@ static inline struct bfs_inode_info *BFS_I(struct inode *inode)
46 46
47/* inode.c */ 47/* inode.c */
48extern struct inode *bfs_iget(struct super_block *sb, unsigned long ino); 48extern struct inode *bfs_iget(struct super_block *sb, unsigned long ino);
49extern void bfs_dump_imap(const char *, struct super_block *);
49 50
50/* file.c */ 51/* file.c */
51extern const struct inode_operations bfs_file_inops; 52extern const struct inode_operations bfs_file_inops;
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index a399e6d9dc74..08063ae0a17c 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -75,8 +75,6 @@ const struct file_operations bfs_dir_operations = {
75 .llseek = generic_file_llseek, 75 .llseek = generic_file_llseek,
76}; 76};
77 77
78extern void dump_imap(const char *, struct super_block *);
79
80static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, 78static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
81 bool excl) 79 bool excl)
82{ 80{
@@ -110,7 +108,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
110 BFS_I(inode)->i_eblock = 0; 108 BFS_I(inode)->i_eblock = 0;
111 insert_inode_hash(inode); 109 insert_inode_hash(inode);
112 mark_inode_dirty(inode); 110 mark_inode_dirty(inode);
113 dump_imap("create", s); 111 bfs_dump_imap("create", s);
114 112
115 err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len, 113 err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len,
116 inode->i_ino); 114 inode->i_ino);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 7041ac35ace8..90bc079d9982 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -30,8 +30,6 @@ MODULE_LICENSE("GPL");
30#define dprintf(x...) 30#define dprintf(x...)
31#endif 31#endif
32 32
33void dump_imap(const char *prefix, struct super_block *s);
34
35struct inode *bfs_iget(struct super_block *sb, unsigned long ino) 33struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
36{ 34{
37 struct bfs_inode *di; 35 struct bfs_inode *di;
@@ -194,7 +192,7 @@ static void bfs_evict_inode(struct inode *inode)
194 info->si_freeb += bi->i_eblock + 1 - bi->i_sblock; 192 info->si_freeb += bi->i_eblock + 1 - bi->i_sblock;
195 info->si_freei++; 193 info->si_freei++;
196 clear_bit(ino, info->si_imap); 194 clear_bit(ino, info->si_imap);
197 dump_imap("delete_inode", s); 195 bfs_dump_imap("delete_inode", s);
198 } 196 }
199 197
200 /* 198 /*
@@ -297,7 +295,7 @@ static const struct super_operations bfs_sops = {
297 .statfs = bfs_statfs, 295 .statfs = bfs_statfs,
298}; 296};
299 297
300void dump_imap(const char *prefix, struct super_block *s) 298void bfs_dump_imap(const char *prefix, struct super_block *s)
301{ 299{
302#ifdef DEBUG 300#ifdef DEBUG
303 int i; 301 int i;
@@ -443,7 +441,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
443 } 441 }
444 brelse(bh); 442 brelse(bh);
445 brelse(sbh); 443 brelse(sbh);
446 dump_imap("read_super", s); 444 bfs_dump_imap("read_super", s);
447 return 0; 445 return 0;
448 446
449out3: 447out3:
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index e25564bfcb46..54a201dac7f9 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -276,9 +276,8 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
276 } 276 }
277 if (ret > 0) 277 if (ret > 0)
278 goto next; 278 goto next;
279 ret = ulist_add_merge(parents, eb->start, 279 ret = ulist_add_merge_ptr(parents, eb->start,
280 (uintptr_t)eie, 280 eie, (void **)&old, GFP_NOFS);
281 (u64 *)&old, GFP_NOFS);
282 if (ret < 0) 281 if (ret < 0)
283 break; 282 break;
284 if (!ret && extent_item_pos) { 283 if (!ret && extent_item_pos) {
@@ -1001,16 +1000,19 @@ again:
1001 ret = -EIO; 1000 ret = -EIO;
1002 goto out; 1001 goto out;
1003 } 1002 }
1003 btrfs_tree_read_lock(eb);
1004 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1004 ret = find_extent_in_eb(eb, bytenr, 1005 ret = find_extent_in_eb(eb, bytenr,
1005 *extent_item_pos, &eie); 1006 *extent_item_pos, &eie);
1007 btrfs_tree_read_unlock_blocking(eb);
1006 free_extent_buffer(eb); 1008 free_extent_buffer(eb);
1007 if (ret < 0) 1009 if (ret < 0)
1008 goto out; 1010 goto out;
1009 ref->inode_list = eie; 1011 ref->inode_list = eie;
1010 } 1012 }
1011 ret = ulist_add_merge(refs, ref->parent, 1013 ret = ulist_add_merge_ptr(refs, ref->parent,
1012 (uintptr_t)ref->inode_list, 1014 ref->inode_list,
1013 (u64 *)&eie, GFP_NOFS); 1015 (void **)&eie, GFP_NOFS);
1014 if (ret < 0) 1016 if (ret < 0)
1015 goto out; 1017 goto out;
1016 if (!ret && extent_item_pos) { 1018 if (!ret && extent_item_pos) {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4794923c410c..43527fd78825 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -84,12 +84,6 @@ struct btrfs_inode {
84 */ 84 */
85 struct list_head delalloc_inodes; 85 struct list_head delalloc_inodes;
86 86
87 /*
88 * list for tracking inodes that must be sent to disk before a
89 * rename or truncate commit
90 */
91 struct list_head ordered_operations;
92
93 /* node for the red-black tree that links inodes in subvolume root */ 87 /* node for the red-black tree that links inodes in subvolume root */
94 struct rb_node rb_node; 88 struct rb_node rb_node;
95 89
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index aeab453b8e24..44ee5d2e52a4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -280,9 +280,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
280 280
281 WARN_ON(btrfs_header_generation(buf) > trans->transid); 281 WARN_ON(btrfs_header_generation(buf) > trans->transid);
282 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) 282 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
283 ret = btrfs_inc_ref(trans, root, cow, 1, 1); 283 ret = btrfs_inc_ref(trans, root, cow, 1);
284 else 284 else
285 ret = btrfs_inc_ref(trans, root, cow, 0, 1); 285 ret = btrfs_inc_ref(trans, root, cow, 0);
286 286
287 if (ret) 287 if (ret)
288 return ret; 288 return ret;
@@ -1035,14 +1035,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
1035 if ((owner == root->root_key.objectid || 1035 if ((owner == root->root_key.objectid ||
1036 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && 1036 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
1037 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { 1037 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
1038 ret = btrfs_inc_ref(trans, root, buf, 1, 1); 1038 ret = btrfs_inc_ref(trans, root, buf, 1);
1039 BUG_ON(ret); /* -ENOMEM */ 1039 BUG_ON(ret); /* -ENOMEM */
1040 1040
1041 if (root->root_key.objectid == 1041 if (root->root_key.objectid ==
1042 BTRFS_TREE_RELOC_OBJECTID) { 1042 BTRFS_TREE_RELOC_OBJECTID) {
1043 ret = btrfs_dec_ref(trans, root, buf, 0, 1); 1043 ret = btrfs_dec_ref(trans, root, buf, 0);
1044 BUG_ON(ret); /* -ENOMEM */ 1044 BUG_ON(ret); /* -ENOMEM */
1045 ret = btrfs_inc_ref(trans, root, cow, 1, 1); 1045 ret = btrfs_inc_ref(trans, root, cow, 1);
1046 BUG_ON(ret); /* -ENOMEM */ 1046 BUG_ON(ret); /* -ENOMEM */
1047 } 1047 }
1048 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 1048 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -1050,9 +1050,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
1050 1050
1051 if (root->root_key.objectid == 1051 if (root->root_key.objectid ==
1052 BTRFS_TREE_RELOC_OBJECTID) 1052 BTRFS_TREE_RELOC_OBJECTID)
1053 ret = btrfs_inc_ref(trans, root, cow, 1, 1); 1053 ret = btrfs_inc_ref(trans, root, cow, 1);
1054 else 1054 else
1055 ret = btrfs_inc_ref(trans, root, cow, 0, 1); 1055 ret = btrfs_inc_ref(trans, root, cow, 0);
1056 BUG_ON(ret); /* -ENOMEM */ 1056 BUG_ON(ret); /* -ENOMEM */
1057 } 1057 }
1058 if (new_flags != 0) { 1058 if (new_flags != 0) {
@@ -1069,11 +1069,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
1069 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 1069 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
1070 if (root->root_key.objectid == 1070 if (root->root_key.objectid ==
1071 BTRFS_TREE_RELOC_OBJECTID) 1071 BTRFS_TREE_RELOC_OBJECTID)
1072 ret = btrfs_inc_ref(trans, root, cow, 1, 1); 1072 ret = btrfs_inc_ref(trans, root, cow, 1);
1073 else 1073 else
1074 ret = btrfs_inc_ref(trans, root, cow, 0, 1); 1074 ret = btrfs_inc_ref(trans, root, cow, 0);
1075 BUG_ON(ret); /* -ENOMEM */ 1075 BUG_ON(ret); /* -ENOMEM */
1076 ret = btrfs_dec_ref(trans, root, buf, 1, 1); 1076 ret = btrfs_dec_ref(trans, root, buf, 1);
1077 BUG_ON(ret); /* -ENOMEM */ 1077 BUG_ON(ret); /* -ENOMEM */
1078 } 1078 }
1079 clean_tree_block(trans, root, buf); 1079 clean_tree_block(trans, root, buf);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index be91397f4e92..8e29b614fe93 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3326,9 +3326,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
3326 u64 min_alloc_size, u64 empty_size, u64 hint_byte, 3326 u64 min_alloc_size, u64 empty_size, u64 hint_byte,
3327 struct btrfs_key *ins, int is_data, int delalloc); 3327 struct btrfs_key *ins, int is_data, int delalloc);
3328int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3328int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3329 struct extent_buffer *buf, int full_backref, int no_quota); 3329 struct extent_buffer *buf, int full_backref);
3330int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3330int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3331 struct extent_buffer *buf, int full_backref, int no_quota); 3331 struct extent_buffer *buf, int full_backref);
3332int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 3332int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
3333 struct btrfs_root *root, 3333 struct btrfs_root *root,
3334 u64 bytenr, u64 num_bytes, u64 flags, 3334 u64 bytenr, u64 num_bytes, u64 flags,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 08e65e9cf2aa..d0ed9e664f7d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -60,8 +60,6 @@ static void end_workqueue_fn(struct btrfs_work *work);
60static void free_fs_root(struct btrfs_root *root); 60static void free_fs_root(struct btrfs_root *root);
61static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, 61static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
62 int read_only); 62 int read_only);
63static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
64 struct btrfs_root *root);
65static void btrfs_destroy_ordered_extents(struct btrfs_root *root); 63static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
66static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, 64static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
67 struct btrfs_root *root); 65 struct btrfs_root *root);
@@ -3829,34 +3827,6 @@ static void btrfs_error_commit_super(struct btrfs_root *root)
3829 btrfs_cleanup_transaction(root); 3827 btrfs_cleanup_transaction(root);
3830} 3828}
3831 3829
3832static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
3833 struct btrfs_root *root)
3834{
3835 struct btrfs_inode *btrfs_inode;
3836 struct list_head splice;
3837
3838 INIT_LIST_HEAD(&splice);
3839
3840 mutex_lock(&root->fs_info->ordered_operations_mutex);
3841 spin_lock(&root->fs_info->ordered_root_lock);
3842
3843 list_splice_init(&t->ordered_operations, &splice);
3844 while (!list_empty(&splice)) {
3845 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
3846 ordered_operations);
3847
3848 list_del_init(&btrfs_inode->ordered_operations);
3849 spin_unlock(&root->fs_info->ordered_root_lock);
3850
3851 btrfs_invalidate_inodes(btrfs_inode->root);
3852
3853 spin_lock(&root->fs_info->ordered_root_lock);
3854 }
3855
3856 spin_unlock(&root->fs_info->ordered_root_lock);
3857 mutex_unlock(&root->fs_info->ordered_operations_mutex);
3858}
3859
3860static void btrfs_destroy_ordered_extents(struct btrfs_root *root) 3830static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
3861{ 3831{
3862 struct btrfs_ordered_extent *ordered; 3832 struct btrfs_ordered_extent *ordered;
@@ -4093,8 +4063,6 @@ again:
4093void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, 4063void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4094 struct btrfs_root *root) 4064 struct btrfs_root *root)
4095{ 4065{
4096 btrfs_destroy_ordered_operations(cur_trans, root);
4097
4098 btrfs_destroy_delayed_refs(cur_trans, root); 4066 btrfs_destroy_delayed_refs(cur_trans, root);
4099 4067
4100 cur_trans->state = TRANS_STATE_COMMIT_START; 4068 cur_trans->state = TRANS_STATE_COMMIT_START;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 813537f362f9..102ed3143976 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3057,7 +3057,7 @@ out:
3057static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 3057static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3058 struct btrfs_root *root, 3058 struct btrfs_root *root,
3059 struct extent_buffer *buf, 3059 struct extent_buffer *buf,
3060 int full_backref, int inc, int no_quota) 3060 int full_backref, int inc)
3061{ 3061{
3062 u64 bytenr; 3062 u64 bytenr;
3063 u64 num_bytes; 3063 u64 num_bytes;
@@ -3111,7 +3111,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3111 key.offset -= btrfs_file_extent_offset(buf, fi); 3111 key.offset -= btrfs_file_extent_offset(buf, fi);
3112 ret = process_func(trans, root, bytenr, num_bytes, 3112 ret = process_func(trans, root, bytenr, num_bytes,
3113 parent, ref_root, key.objectid, 3113 parent, ref_root, key.objectid,
3114 key.offset, no_quota); 3114 key.offset, 1);
3115 if (ret) 3115 if (ret)
3116 goto fail; 3116 goto fail;
3117 } else { 3117 } else {
@@ -3119,7 +3119,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3119 num_bytes = btrfs_level_size(root, level - 1); 3119 num_bytes = btrfs_level_size(root, level - 1);
3120 ret = process_func(trans, root, bytenr, num_bytes, 3120 ret = process_func(trans, root, bytenr, num_bytes,
3121 parent, ref_root, level - 1, 0, 3121 parent, ref_root, level - 1, 0,
3122 no_quota); 3122 1);
3123 if (ret) 3123 if (ret)
3124 goto fail; 3124 goto fail;
3125 } 3125 }
@@ -3130,15 +3130,15 @@ fail:
3130} 3130}
3131 3131
3132int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3132int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3133 struct extent_buffer *buf, int full_backref, int no_quota) 3133 struct extent_buffer *buf, int full_backref)
3134{ 3134{
3135 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota); 3135 return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3136} 3136}
3137 3137
3138int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3138int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3139 struct extent_buffer *buf, int full_backref, int no_quota) 3139 struct extent_buffer *buf, int full_backref)
3140{ 3140{
3141 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota); 3141 return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3142} 3142}
3143 3143
3144static int write_one_cache_group(struct btrfs_trans_handle *trans, 3144static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -7478,6 +7478,220 @@ reada:
7478 wc->reada_slot = slot; 7478 wc->reada_slot = slot;
7479} 7479}
7480 7480
7481static int account_leaf_items(struct btrfs_trans_handle *trans,
7482 struct btrfs_root *root,
7483 struct extent_buffer *eb)
7484{
7485 int nr = btrfs_header_nritems(eb);
7486 int i, extent_type, ret;
7487 struct btrfs_key key;
7488 struct btrfs_file_extent_item *fi;
7489 u64 bytenr, num_bytes;
7490
7491 for (i = 0; i < nr; i++) {
7492 btrfs_item_key_to_cpu(eb, &key, i);
7493
7494 if (key.type != BTRFS_EXTENT_DATA_KEY)
7495 continue;
7496
7497 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
7498 /* filter out non qgroup-accountable extents */
7499 extent_type = btrfs_file_extent_type(eb, fi);
7500
7501 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
7502 continue;
7503
7504 bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
7505 if (!bytenr)
7506 continue;
7507
7508 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
7509
7510 ret = btrfs_qgroup_record_ref(trans, root->fs_info,
7511 root->objectid,
7512 bytenr, num_bytes,
7513 BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
7514 if (ret)
7515 return ret;
7516 }
7517 return 0;
7518}
7519
7520/*
7521 * Walk up the tree from the bottom, freeing leaves and any interior
7522 * nodes which have had all slots visited. If a node (leaf or
7523 * interior) is freed, the node above it will have it's slot
7524 * incremented. The root node will never be freed.
7525 *
7526 * At the end of this function, we should have a path which has all
7527 * slots incremented to the next position for a search. If we need to
7528 * read a new node it will be NULL and the node above it will have the
7529 * correct slot selected for a later read.
7530 *
7531 * If we increment the root nodes slot counter past the number of
7532 * elements, 1 is returned to signal completion of the search.
7533 */
7534static int adjust_slots_upwards(struct btrfs_root *root,
7535 struct btrfs_path *path, int root_level)
7536{
7537 int level = 0;
7538 int nr, slot;
7539 struct extent_buffer *eb;
7540
7541 if (root_level == 0)
7542 return 1;
7543
7544 while (level <= root_level) {
7545 eb = path->nodes[level];
7546 nr = btrfs_header_nritems(eb);
7547 path->slots[level]++;
7548 slot = path->slots[level];
7549 if (slot >= nr || level == 0) {
7550 /*
7551 * Don't free the root - we will detect this
7552 * condition after our loop and return a
7553 * positive value for caller to stop walking the tree.
7554 */
7555 if (level != root_level) {
7556 btrfs_tree_unlock_rw(eb, path->locks[level]);
7557 path->locks[level] = 0;
7558
7559 free_extent_buffer(eb);
7560 path->nodes[level] = NULL;
7561 path->slots[level] = 0;
7562 }
7563 } else {
7564 /*
7565 * We have a valid slot to walk back down
7566 * from. Stop here so caller can process these
7567 * new nodes.
7568 */
7569 break;
7570 }
7571
7572 level++;
7573 }
7574
7575 eb = path->nodes[root_level];
7576 if (path->slots[root_level] >= btrfs_header_nritems(eb))
7577 return 1;
7578
7579 return 0;
7580}
7581
7582/*
7583 * root_eb is the subtree root and is locked before this function is called.
7584 */
7585static int account_shared_subtree(struct btrfs_trans_handle *trans,
7586 struct btrfs_root *root,
7587 struct extent_buffer *root_eb,
7588 u64 root_gen,
7589 int root_level)
7590{
7591 int ret = 0;
7592 int level;
7593 struct extent_buffer *eb = root_eb;
7594 struct btrfs_path *path = NULL;
7595
7596 BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
7597 BUG_ON(root_eb == NULL);
7598
7599 if (!root->fs_info->quota_enabled)
7600 return 0;
7601
7602 if (!extent_buffer_uptodate(root_eb)) {
7603 ret = btrfs_read_buffer(root_eb, root_gen);
7604 if (ret)
7605 goto out;
7606 }
7607
7608 if (root_level == 0) {
7609 ret = account_leaf_items(trans, root, root_eb);
7610 goto out;
7611 }
7612
7613 path = btrfs_alloc_path();
7614 if (!path)
7615 return -ENOMEM;
7616
7617 /*
7618 * Walk down the tree. Missing extent blocks are filled in as
7619 * we go. Metadata is accounted every time we read a new
7620 * extent block.
7621 *
7622 * When we reach a leaf, we account for file extent items in it,
7623 * walk back up the tree (adjusting slot pointers as we go)
7624 * and restart the search process.
7625 */
7626 extent_buffer_get(root_eb); /* For path */
7627 path->nodes[root_level] = root_eb;
7628 path->slots[root_level] = 0;
7629 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
7630walk_down:
7631 level = root_level;
7632 while (level >= 0) {
7633 if (path->nodes[level] == NULL) {
7634 int child_bsize = root->nodesize;
7635 int parent_slot;
7636 u64 child_gen;
7637 u64 child_bytenr;
7638
7639 /* We need to get child blockptr/gen from
7640 * parent before we can read it. */
7641 eb = path->nodes[level + 1];
7642 parent_slot = path->slots[level + 1];
7643 child_bytenr = btrfs_node_blockptr(eb, parent_slot);
7644 child_gen = btrfs_node_ptr_generation(eb, parent_slot);
7645
7646 eb = read_tree_block(root, child_bytenr, child_bsize,
7647 child_gen);
7648 if (!eb || !extent_buffer_uptodate(eb)) {
7649 ret = -EIO;
7650 goto out;
7651 }
7652
7653 path->nodes[level] = eb;
7654 path->slots[level] = 0;
7655
7656 btrfs_tree_read_lock(eb);
7657 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
7658 path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
7659
7660 ret = btrfs_qgroup_record_ref(trans, root->fs_info,
7661 root->objectid,
7662 child_bytenr,
7663 child_bsize,
7664 BTRFS_QGROUP_OPER_SUB_SUBTREE,
7665 0);
7666 if (ret)
7667 goto out;
7668
7669 }
7670
7671 if (level == 0) {
7672 ret = account_leaf_items(trans, root, path->nodes[level]);
7673 if (ret)
7674 goto out;
7675
7676 /* Nonzero return here means we completed our search */
7677 ret = adjust_slots_upwards(root, path, root_level);
7678 if (ret)
7679 break;
7680
7681 /* Restart search with new slots */
7682 goto walk_down;
7683 }
7684
7685 level--;
7686 }
7687
7688 ret = 0;
7689out:
7690 btrfs_free_path(path);
7691
7692 return ret;
7693}
7694
7481/* 7695/*
7482 * helper to process tree block while walking down the tree. 7696 * helper to process tree block while walking down the tree.
7483 * 7697 *
@@ -7532,9 +7746,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
7532 /* wc->stage == UPDATE_BACKREF */ 7746 /* wc->stage == UPDATE_BACKREF */
7533 if (!(wc->flags[level] & flag)) { 7747 if (!(wc->flags[level] & flag)) {
7534 BUG_ON(!path->locks[level]); 7748 BUG_ON(!path->locks[level]);
7535 ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); 7749 ret = btrfs_inc_ref(trans, root, eb, 1);
7536 BUG_ON(ret); /* -ENOMEM */ 7750 BUG_ON(ret); /* -ENOMEM */
7537 ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); 7751 ret = btrfs_dec_ref(trans, root, eb, 0);
7538 BUG_ON(ret); /* -ENOMEM */ 7752 BUG_ON(ret); /* -ENOMEM */
7539 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 7753 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
7540 eb->len, flag, 7754 eb->len, flag,
@@ -7581,6 +7795,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7581 int level = wc->level; 7795 int level = wc->level;
7582 int reada = 0; 7796 int reada = 0;
7583 int ret = 0; 7797 int ret = 0;
7798 bool need_account = false;
7584 7799
7585 generation = btrfs_node_ptr_generation(path->nodes[level], 7800 generation = btrfs_node_ptr_generation(path->nodes[level],
7586 path->slots[level]); 7801 path->slots[level]);
@@ -7626,6 +7841,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7626 7841
7627 if (wc->stage == DROP_REFERENCE) { 7842 if (wc->stage == DROP_REFERENCE) {
7628 if (wc->refs[level - 1] > 1) { 7843 if (wc->refs[level - 1] > 1) {
7844 need_account = true;
7629 if (level == 1 && 7845 if (level == 1 &&
7630 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) 7846 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7631 goto skip; 7847 goto skip;
@@ -7689,6 +7905,16 @@ skip:
7689 parent = 0; 7905 parent = 0;
7690 } 7906 }
7691 7907
7908 if (need_account) {
7909 ret = account_shared_subtree(trans, root, next,
7910 generation, level - 1);
7911 if (ret) {
7912 printk_ratelimited(KERN_ERR "BTRFS: %s Error "
7913 "%d accounting shared subtree. Quota "
7914 "is out of sync, rescan required.\n",
7915 root->fs_info->sb->s_id, ret);
7916 }
7917 }
7692 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 7918 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
7693 root->root_key.objectid, level - 1, 0, 0); 7919 root->root_key.objectid, level - 1, 0, 0);
7694 BUG_ON(ret); /* -ENOMEM */ 7920 BUG_ON(ret); /* -ENOMEM */
@@ -7769,12 +7995,17 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
7769 if (wc->refs[level] == 1) { 7995 if (wc->refs[level] == 1) {
7770 if (level == 0) { 7996 if (level == 0) {
7771 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 7997 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7772 ret = btrfs_dec_ref(trans, root, eb, 1, 7998 ret = btrfs_dec_ref(trans, root, eb, 1);
7773 wc->for_reloc);
7774 else 7999 else
7775 ret = btrfs_dec_ref(trans, root, eb, 0, 8000 ret = btrfs_dec_ref(trans, root, eb, 0);
7776 wc->for_reloc);
7777 BUG_ON(ret); /* -ENOMEM */ 8001 BUG_ON(ret); /* -ENOMEM */
8002 ret = account_leaf_items(trans, root, eb);
8003 if (ret) {
8004 printk_ratelimited(KERN_ERR "BTRFS: %s Error "
8005 "%d accounting leaf items. Quota "
8006 "is out of sync, rescan required.\n",
8007 root->fs_info->sb->s_id, ret);
8008 }
7778 } 8009 }
7779 /* make block locked assertion in clean_tree_block happy */ 8010 /* make block locked assertion in clean_tree_block happy */
7780 if (!path->locks[level] && 8011 if (!path->locks[level] &&
@@ -7900,6 +8131,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
7900 int level; 8131 int level;
7901 bool root_dropped = false; 8132 bool root_dropped = false;
7902 8133
8134 btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
8135
7903 path = btrfs_alloc_path(); 8136 path = btrfs_alloc_path();
7904 if (!path) { 8137 if (!path) {
7905 err = -ENOMEM; 8138 err = -ENOMEM;
@@ -8025,6 +8258,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
8025 goto out_end_trans; 8258 goto out_end_trans;
8026 } 8259 }
8027 8260
8261 /*
8262 * Qgroup update accounting is run from
8263 * delayed ref handling. This usually works
8264 * out because delayed refs are normally the
8265 * only way qgroup updates are added. However,
8266 * we may have added updates during our tree
8267 * walk so run qgroups here to make sure we
8268 * don't lose any updates.
8269 */
8270 ret = btrfs_delayed_qgroup_accounting(trans,
8271 root->fs_info);
8272 if (ret)
8273 printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
8274 "running qgroup updates "
8275 "during snapshot delete. "
8276 "Quota is out of sync, "
8277 "rescan required.\n", ret);
8278
8028 btrfs_end_transaction_throttle(trans, tree_root); 8279 btrfs_end_transaction_throttle(trans, tree_root);
8029 if (!for_reloc && btrfs_need_cleaner_sleep(root)) { 8280 if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
8030 pr_debug("BTRFS: drop snapshot early exit\n"); 8281 pr_debug("BTRFS: drop snapshot early exit\n");
@@ -8078,6 +8329,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
8078 } 8329 }
8079 root_dropped = true; 8330 root_dropped = true;
8080out_end_trans: 8331out_end_trans:
8332 ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
8333 if (ret)
8334 printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
8335 "running qgroup updates "
8336 "during snapshot delete. "
8337 "Quota is out of sync, "
8338 "rescan required.\n", ret);
8339
8081 btrfs_end_transaction_throttle(trans, tree_root); 8340 btrfs_end_transaction_throttle(trans, tree_root);
8082out_free: 8341out_free:
8083 kfree(wc); 8342 kfree(wc);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index f46cfe45d686..54c84daec9b5 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -756,7 +756,7 @@ again:
756 found_next = 1; 756 found_next = 1;
757 if (ret != 0) 757 if (ret != 0)
758 goto insert; 758 goto insert;
759 slot = 0; 759 slot = path->slots[0];
760 } 760 }
761 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); 761 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
762 if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || 762 if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1f2b99cb55ea..d3afac292d67 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1838,33 +1838,9 @@ out:
1838 1838
1839int btrfs_release_file(struct inode *inode, struct file *filp) 1839int btrfs_release_file(struct inode *inode, struct file *filp)
1840{ 1840{
1841 /*
1842 * ordered_data_close is set by settattr when we are about to truncate
1843 * a file from a non-zero size to a zero size. This tries to
1844 * flush down new bytes that may have been written if the
1845 * application were using truncate to replace a file in place.
1846 */
1847 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1848 &BTRFS_I(inode)->runtime_flags)) {
1849 struct btrfs_trans_handle *trans;
1850 struct btrfs_root *root = BTRFS_I(inode)->root;
1851
1852 /*
1853 * We need to block on a committing transaction to keep us from
1854 * throwing a ordered operation on to the list and causing
1855 * something like sync to deadlock trying to flush out this
1856 * inode.
1857 */
1858 trans = btrfs_start_transaction(root, 0);
1859 if (IS_ERR(trans))
1860 return PTR_ERR(trans);
1861 btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
1862 btrfs_end_transaction(trans, root);
1863 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1864 filemap_flush(inode->i_mapping);
1865 }
1866 if (filp->private_data) 1841 if (filp->private_data)
1867 btrfs_ioctl_trans_end(filp); 1842 btrfs_ioctl_trans_end(filp);
1843 filemap_flush(inode->i_mapping);
1868 return 0; 1844 return 0;
1869} 1845}
1870 1846
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3668048e16f8..03708ef3deef 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -709,6 +709,18 @@ retry:
709 unlock_extent(io_tree, async_extent->start, 709 unlock_extent(io_tree, async_extent->start,
710 async_extent->start + 710 async_extent->start +
711 async_extent->ram_size - 1); 711 async_extent->ram_size - 1);
712
713 /*
714 * we need to redirty the pages if we decide to
715 * fallback to uncompressed IO, otherwise we
716 * will not submit these pages down to lower
717 * layers.
718 */
719 extent_range_redirty_for_io(inode,
720 async_extent->start,
721 async_extent->start +
722 async_extent->ram_size - 1);
723
712 goto retry; 724 goto retry;
713 } 725 }
714 goto out_free; 726 goto out_free;
@@ -7939,27 +7951,6 @@ static int btrfs_truncate(struct inode *inode)
7939 BUG_ON(ret); 7951 BUG_ON(ret);
7940 7952
7941 /* 7953 /*
7942 * setattr is responsible for setting the ordered_data_close flag,
7943 * but that is only tested during the last file release. That
7944 * could happen well after the next commit, leaving a great big
7945 * window where new writes may get lost if someone chooses to write
7946 * to this file after truncating to zero
7947 *
7948 * The inode doesn't have any dirty data here, and so if we commit
7949 * this is a noop. If someone immediately starts writing to the inode
7950 * it is very likely we'll catch some of their writes in this
7951 * transaction, and the commit will find this file on the ordered
7952 * data list with good things to send down.
7953 *
7954 * This is a best effort solution, there is still a window where
7955 * using truncate to replace the contents of the file will
7956 * end up with a zero length file after a crash.
7957 */
7958 if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
7959 &BTRFS_I(inode)->runtime_flags))
7960 btrfs_add_ordered_operation(trans, root, inode);
7961
7962 /*
7963 * So if we truncate and then write and fsync we normally would just 7954 * So if we truncate and then write and fsync we normally would just
7964 * write the extents that changed, which is a problem if we need to 7955 * write the extents that changed, which is a problem if we need to
7965 * first truncate that entire inode. So set this flag so we write out 7956 * first truncate that entire inode. So set this flag so we write out
@@ -8106,7 +8097,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
8106 mutex_init(&ei->delalloc_mutex); 8097 mutex_init(&ei->delalloc_mutex);
8107 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 8098 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8108 INIT_LIST_HEAD(&ei->delalloc_inodes); 8099 INIT_LIST_HEAD(&ei->delalloc_inodes);
8109 INIT_LIST_HEAD(&ei->ordered_operations);
8110 RB_CLEAR_NODE(&ei->rb_node); 8100 RB_CLEAR_NODE(&ei->rb_node);
8111 8101
8112 return inode; 8102 return inode;
@@ -8146,17 +8136,6 @@ void btrfs_destroy_inode(struct inode *inode)
8146 if (!root) 8136 if (!root)
8147 goto free; 8137 goto free;
8148 8138
8149 /*
8150 * Make sure we're properly removed from the ordered operation
8151 * lists.
8152 */
8153 smp_mb();
8154 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
8155 spin_lock(&root->fs_info->ordered_root_lock);
8156 list_del_init(&BTRFS_I(inode)->ordered_operations);
8157 spin_unlock(&root->fs_info->ordered_root_lock);
8158 }
8159
8160 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 8139 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
8161 &BTRFS_I(inode)->runtime_flags)) { 8140 &BTRFS_I(inode)->runtime_flags)) {
8162 btrfs_info(root->fs_info, "inode %llu still on the orphan list", 8141 btrfs_info(root->fs_info, "inode %llu still on the orphan list",
@@ -8338,12 +8317,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8338 ret = 0; 8317 ret = 0;
8339 8318
8340 /* 8319 /*
8341 * we're using rename to replace one file with another. 8320 * we're using rename to replace one file with another. Start IO on it
8342 * and the replacement file is large. Start IO on it now so 8321 * now so we don't add too much work to the end of the transaction
8343 * we don't add too much work to the end of the transaction
8344 */ 8322 */
8345 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && 8323 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
8346 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
8347 filemap_flush(old_inode->i_mapping); 8324 filemap_flush(old_inode->i_mapping);
8348 8325
8349 /* close the racy window with snapshot create/destroy ioctl */ 8326 /* close the racy window with snapshot create/destroy ioctl */
@@ -8391,12 +8368,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8391 */ 8368 */
8392 btrfs_pin_log_trans(root); 8369 btrfs_pin_log_trans(root);
8393 } 8370 }
8394 /*
8395 * make sure the inode gets flushed if it is replacing
8396 * something.
8397 */
8398 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
8399 btrfs_add_ordered_operation(trans, root, old_inode);
8400 8371
8401 inode_inc_iversion(old_dir); 8372 inode_inc_iversion(old_dir);
8402 inode_inc_iversion(new_dir); 8373 inode_inc_iversion(new_dir);
@@ -8476,6 +8447,16 @@ out_notrans:
8476 return ret; 8447 return ret;
8477} 8448}
8478 8449
8450static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
8451 struct inode *new_dir, struct dentry *new_dentry,
8452 unsigned int flags)
8453{
8454 if (flags & ~RENAME_NOREPLACE)
8455 return -EINVAL;
8456
8457 return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
8458}
8459
8479static void btrfs_run_delalloc_work(struct btrfs_work *work) 8460static void btrfs_run_delalloc_work(struct btrfs_work *work)
8480{ 8461{
8481 struct btrfs_delalloc_work *delalloc_work; 8462 struct btrfs_delalloc_work *delalloc_work;
@@ -9019,7 +9000,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
9019 .link = btrfs_link, 9000 .link = btrfs_link,
9020 .mkdir = btrfs_mkdir, 9001 .mkdir = btrfs_mkdir,
9021 .rmdir = btrfs_rmdir, 9002 .rmdir = btrfs_rmdir,
9022 .rename = btrfs_rename, 9003 .rename2 = btrfs_rename2,
9023 .symlink = btrfs_symlink, 9004 .symlink = btrfs_symlink,
9024 .setattr = btrfs_setattr, 9005 .setattr = btrfs_setattr,
9025 .mknod = btrfs_mknod, 9006 .mknod = btrfs_mknod,
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7187b14faa6c..963895c1f801 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -571,18 +571,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,
571 571
572 trace_btrfs_ordered_extent_remove(inode, entry); 572 trace_btrfs_ordered_extent_remove(inode, entry);
573 573
574 /*
575 * we have no more ordered extents for this inode and
576 * no dirty pages. We can safely remove it from the
577 * list of ordered extents
578 */
579 if (RB_EMPTY_ROOT(&tree->tree) &&
580 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
581 spin_lock(&root->fs_info->ordered_root_lock);
582 list_del_init(&BTRFS_I(inode)->ordered_operations);
583 spin_unlock(&root->fs_info->ordered_root_lock);
584 }
585
586 if (!root->nr_ordered_extents) { 574 if (!root->nr_ordered_extents) {
587 spin_lock(&root->fs_info->ordered_root_lock); 575 spin_lock(&root->fs_info->ordered_root_lock);
588 BUG_ON(list_empty(&root->ordered_root)); 576 BUG_ON(list_empty(&root->ordered_root));
@@ -687,81 +675,6 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
687} 675}
688 676
689/* 677/*
690 * this is used during transaction commit to write all the inodes
691 * added to the ordered operation list. These files must be fully on
692 * disk before the transaction commits.
693 *
694 * we have two modes here, one is to just start the IO via filemap_flush
695 * and the other is to wait for all the io. When we wait, we have an
696 * extra check to make sure the ordered operation list really is empty
697 * before we return
698 */
699int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
700 struct btrfs_root *root, int wait)
701{
702 struct btrfs_inode *btrfs_inode;
703 struct inode *inode;
704 struct btrfs_transaction *cur_trans = trans->transaction;
705 struct list_head splice;
706 struct list_head works;
707 struct btrfs_delalloc_work *work, *next;
708 int ret = 0;
709
710 INIT_LIST_HEAD(&splice);
711 INIT_LIST_HEAD(&works);
712
713 mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
714 spin_lock(&root->fs_info->ordered_root_lock);
715 list_splice_init(&cur_trans->ordered_operations, &splice);
716 while (!list_empty(&splice)) {
717 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
718 ordered_operations);
719 inode = &btrfs_inode->vfs_inode;
720
721 list_del_init(&btrfs_inode->ordered_operations);
722
723 /*
724 * the inode may be getting freed (in sys_unlink path).
725 */
726 inode = igrab(inode);
727 if (!inode)
728 continue;
729
730 if (!wait)
731 list_add_tail(&BTRFS_I(inode)->ordered_operations,
732 &cur_trans->ordered_operations);
733 spin_unlock(&root->fs_info->ordered_root_lock);
734
735 work = btrfs_alloc_delalloc_work(inode, wait, 1);
736 if (!work) {
737 spin_lock(&root->fs_info->ordered_root_lock);
738 if (list_empty(&BTRFS_I(inode)->ordered_operations))
739 list_add_tail(&btrfs_inode->ordered_operations,
740 &splice);
741 list_splice_tail(&splice,
742 &cur_trans->ordered_operations);
743 spin_unlock(&root->fs_info->ordered_root_lock);
744 ret = -ENOMEM;
745 goto out;
746 }
747 list_add_tail(&work->list, &works);
748 btrfs_queue_work(root->fs_info->flush_workers,
749 &work->work);
750
751 cond_resched();
752 spin_lock(&root->fs_info->ordered_root_lock);
753 }
754 spin_unlock(&root->fs_info->ordered_root_lock);
755out:
756 list_for_each_entry_safe(work, next, &works, list) {
757 list_del_init(&work->list);
758 btrfs_wait_and_free_delalloc_work(work);
759 }
760 mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
761 return ret;
762}
763
764/*
765 * Used to start IO or wait for a given ordered extent to finish. 678 * Used to start IO or wait for a given ordered extent to finish.
766 * 679 *
767 * If wait is one, this effectively waits on page writeback for all the pages 680 * If wait is one, this effectively waits on page writeback for all the pages
@@ -1120,42 +1033,6 @@ out:
1120 return index; 1033 return index;
1121} 1034}
1122 1035
1123
1124/*
1125 * add a given inode to the list of inodes that must be fully on
1126 * disk before a transaction commit finishes.
1127 *
1128 * This basically gives us the ext3 style data=ordered mode, and it is mostly
1129 * used to make sure renamed files are fully on disk.
1130 *
1131 * It is a noop if the inode is already fully on disk.
1132 *
1133 * If trans is not null, we'll do a friendly check for a transaction that
1134 * is already flushing things and force the IO down ourselves.
1135 */
1136void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
1137 struct btrfs_root *root, struct inode *inode)
1138{
1139 struct btrfs_transaction *cur_trans = trans->transaction;
1140 u64 last_mod;
1141
1142 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
1143
1144 /*
1145 * if this file hasn't been changed since the last transaction
1146 * commit, we can safely return without doing anything
1147 */
1148 if (last_mod <= root->fs_info->last_trans_committed)
1149 return;
1150
1151 spin_lock(&root->fs_info->ordered_root_lock);
1152 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
1153 list_add_tail(&BTRFS_I(inode)->ordered_operations,
1154 &cur_trans->ordered_operations);
1155 }
1156 spin_unlock(&root->fs_info->ordered_root_lock);
1157}
1158
1159int __init ordered_data_init(void) 1036int __init ordered_data_init(void)
1160{ 1037{
1161 btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", 1038 btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 246897058efb..d81a274d621e 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -190,11 +190,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
190 struct btrfs_ordered_extent *ordered); 190 struct btrfs_ordered_extent *ordered);
191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, 191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
192 u32 *sum, int len); 192 u32 *sum, int len);
193int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
194 struct btrfs_root *root, int wait);
195void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
196 struct btrfs_root *root,
197 struct inode *inode);
198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); 193int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); 194void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
200void btrfs_get_logged_extents(struct inode *inode, 195void btrfs_get_logged_extents(struct inode *inode,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 98cb6b2630f9..b497498484be 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1201,6 +1201,50 @@ out:
1201 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1201 mutex_unlock(&fs_info->qgroup_ioctl_lock);
1202 return ret; 1202 return ret;
1203} 1203}
1204
1205static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
1206 struct btrfs_qgroup_operation *oper2)
1207{
1208 /*
1209 * Ignore seq and type here, we're looking for any operation
1210 * at all related to this extent on that root.
1211 */
1212 if (oper1->bytenr < oper2->bytenr)
1213 return -1;
1214 if (oper1->bytenr > oper2->bytenr)
1215 return 1;
1216 if (oper1->ref_root < oper2->ref_root)
1217 return -1;
1218 if (oper1->ref_root > oper2->ref_root)
1219 return 1;
1220 return 0;
1221}
1222
1223static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
1224 struct btrfs_qgroup_operation *oper)
1225{
1226 struct rb_node *n;
1227 struct btrfs_qgroup_operation *cur;
1228 int cmp;
1229
1230 spin_lock(&fs_info->qgroup_op_lock);
1231 n = fs_info->qgroup_op_tree.rb_node;
1232 while (n) {
1233 cur = rb_entry(n, struct btrfs_qgroup_operation, n);
1234 cmp = comp_oper_exist(cur, oper);
1235 if (cmp < 0) {
1236 n = n->rb_right;
1237 } else if (cmp) {
1238 n = n->rb_left;
1239 } else {
1240 spin_unlock(&fs_info->qgroup_op_lock);
1241 return -EEXIST;
1242 }
1243 }
1244 spin_unlock(&fs_info->qgroup_op_lock);
1245 return 0;
1246}
1247
1204static int comp_oper(struct btrfs_qgroup_operation *oper1, 1248static int comp_oper(struct btrfs_qgroup_operation *oper1,
1205 struct btrfs_qgroup_operation *oper2) 1249 struct btrfs_qgroup_operation *oper2)
1206{ 1250{
@@ -1290,6 +1334,23 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
1290 oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); 1334 oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
1291 INIT_LIST_HEAD(&oper->elem.list); 1335 INIT_LIST_HEAD(&oper->elem.list);
1292 oper->elem.seq = 0; 1336 oper->elem.seq = 0;
1337
1338 if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
1339 /*
1340 * If any operation for this bytenr/ref_root combo
1341 * exists, then we know it's not exclusively owned and
1342 * shouldn't be queued up.
1343 *
1344 * This also catches the case where we have a cloned
1345 * extent that gets queued up multiple times during
1346 * drop snapshot.
1347 */
1348 if (qgroup_oper_exists(fs_info, oper)) {
1349 kfree(oper);
1350 return 0;
1351 }
1352 }
1353
1293 ret = insert_qgroup_oper(fs_info, oper); 1354 ret = insert_qgroup_oper(fs_info, oper);
1294 if (ret) { 1355 if (ret) {
1295 /* Shouldn't happen so have an assert for developers */ 1356 /* Shouldn't happen so have an assert for developers */
@@ -1884,6 +1945,111 @@ out:
1884} 1945}
1885 1946
1886/* 1947/*
1948 * Process a reference to a shared subtree. This type of operation is
1949 * queued during snapshot removal when we encounter extents which are
1950 * shared between more than one root.
1951 */
1952static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
1953 struct btrfs_fs_info *fs_info,
1954 struct btrfs_qgroup_operation *oper)
1955{
1956 struct ulist *roots = NULL;
1957 struct ulist_node *unode;
1958 struct ulist_iterator uiter;
1959 struct btrfs_qgroup_list *glist;
1960 struct ulist *parents;
1961 int ret = 0;
1962 int err;
1963 struct btrfs_qgroup *qg;
1964 u64 root_obj = 0;
1965 struct seq_list elem = {};
1966
1967 parents = ulist_alloc(GFP_NOFS);
1968 if (!parents)
1969 return -ENOMEM;
1970
1971 btrfs_get_tree_mod_seq(fs_info, &elem);
1972 ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
1973 elem.seq, &roots);
1974 btrfs_put_tree_mod_seq(fs_info, &elem);
1975 if (ret < 0)
1976 return ret;
1977
1978 if (roots->nnodes != 1)
1979 goto out;
1980
1981 ULIST_ITER_INIT(&uiter);
1982 unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
1983 /*
1984 * If we find our ref root then that means all refs
1985 * this extent has to the root have not yet been
1986 * deleted. In that case, we do nothing and let the
1987 * last ref for this bytenr drive our update.
1988 *
1989 * This can happen for example if an extent is
1990 * referenced multiple times in a snapshot (clone,
1991 * etc). If we are in the middle of snapshot removal,
1992 * queued updates for such an extent will find the
1993 * root if we have not yet finished removing the
1994 * snapshot.
1995 */
1996 if (unode->val == oper->ref_root)
1997 goto out;
1998
1999 root_obj = unode->val;
2000 BUG_ON(!root_obj);
2001
2002 spin_lock(&fs_info->qgroup_lock);
2003 qg = find_qgroup_rb(fs_info, root_obj);
2004 if (!qg)
2005 goto out_unlock;
2006
2007 qg->excl += oper->num_bytes;
2008 qg->excl_cmpr += oper->num_bytes;
2009 qgroup_dirty(fs_info, qg);
2010
2011 /*
2012 * Adjust counts for parent groups. First we find all
2013 * parents, then in the 2nd loop we do the adjustment
2014 * while adding parents of the parents to our ulist.
2015 */
2016 list_for_each_entry(glist, &qg->groups, next_group) {
2017 err = ulist_add(parents, glist->group->qgroupid,
2018 ptr_to_u64(glist->group), GFP_ATOMIC);
2019 if (err < 0) {
2020 ret = err;
2021 goto out_unlock;
2022 }
2023 }
2024
2025 ULIST_ITER_INIT(&uiter);
2026 while ((unode = ulist_next(parents, &uiter))) {
2027 qg = u64_to_ptr(unode->aux);
2028 qg->excl += oper->num_bytes;
2029 qg->excl_cmpr += oper->num_bytes;
2030 qgroup_dirty(fs_info, qg);
2031
2032 /* Add any parents of the parents */
2033 list_for_each_entry(glist, &qg->groups, next_group) {
2034 err = ulist_add(parents, glist->group->qgroupid,
2035 ptr_to_u64(glist->group), GFP_ATOMIC);
2036 if (err < 0) {
2037 ret = err;
2038 goto out_unlock;
2039 }
2040 }
2041 }
2042
2043out_unlock:
2044 spin_unlock(&fs_info->qgroup_lock);
2045
2046out:
2047 ulist_free(roots);
2048 ulist_free(parents);
2049 return ret;
2050}
2051
2052/*
1887 * btrfs_qgroup_account_ref is called for every ref that is added to or deleted 2053 * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
1888 * from the fs. First, all roots referencing the extent are searched, and 2054 * from the fs. First, all roots referencing the extent are searched, and
1889 * then the space is accounted accordingly to the different roots. The 2055 * then the space is accounted accordingly to the different roots. The
@@ -1920,6 +2086,9 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
1920 case BTRFS_QGROUP_OPER_SUB_SHARED: 2086 case BTRFS_QGROUP_OPER_SUB_SHARED:
1921 ret = qgroup_shared_accounting(trans, fs_info, oper); 2087 ret = qgroup_shared_accounting(trans, fs_info, oper);
1922 break; 2088 break;
2089 case BTRFS_QGROUP_OPER_SUB_SUBTREE:
2090 ret = qgroup_subtree_accounting(trans, fs_info, oper);
2091 break;
1923 default: 2092 default:
1924 ASSERT(0); 2093 ASSERT(0);
1925 } 2094 }
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 5952ff1fbd7a..18cc68ca3090 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -44,6 +44,7 @@ enum btrfs_qgroup_operation_type {
44 BTRFS_QGROUP_OPER_ADD_SHARED, 44 BTRFS_QGROUP_OPER_ADD_SHARED,
45 BTRFS_QGROUP_OPER_SUB_EXCL, 45 BTRFS_QGROUP_OPER_SUB_EXCL,
46 BTRFS_QGROUP_OPER_SUB_SHARED, 46 BTRFS_QGROUP_OPER_SUB_SHARED,
47 BTRFS_QGROUP_OPER_SUB_SUBTREE,
47}; 48};
48 49
49struct btrfs_qgroup_operation { 50struct btrfs_qgroup_operation {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8e16bca69c56..c4124de4435b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -851,7 +851,6 @@ static struct dentry *get_default_root(struct super_block *sb,
851 struct btrfs_path *path; 851 struct btrfs_path *path;
852 struct btrfs_key location; 852 struct btrfs_key location;
853 struct inode *inode; 853 struct inode *inode;
854 struct dentry *dentry;
855 u64 dir_id; 854 u64 dir_id;
856 int new = 0; 855 int new = 0;
857 856
@@ -922,13 +921,7 @@ setup_root:
922 return dget(sb->s_root); 921 return dget(sb->s_root);
923 } 922 }
924 923
925 dentry = d_obtain_alias(inode); 924 return d_obtain_root(inode);
926 if (!IS_ERR(dentry)) {
927 spin_lock(&dentry->d_lock);
928 dentry->d_flags &= ~DCACHE_DISCONNECTED;
929 spin_unlock(&dentry->d_lock);
930 }
931 return dentry;
932} 925}
933 926
934static int btrfs_fill_super(struct super_block *sb, 927static int btrfs_fill_super(struct super_block *sb,
@@ -1672,6 +1665,21 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1672 return 0; 1665 return 0;
1673} 1666}
1674 1667
1668/*
1669 * Calculate numbers for 'df', pessimistic in case of mixed raid profiles.
1670 *
1671 * If there's a redundant raid level at DATA block groups, use the respective
1672 * multiplier to scale the sizes.
1673 *
1674 * Unused device space usage is based on simulating the chunk allocator
1675 * algorithm that respects the device sizes, order of allocations and the
1676 * 'alloc_start' value, this is a close approximation of the actual use but
1677 * there are other factors that may change the result (like a new metadata
1678 * chunk).
1679 *
1680 * FIXME: not accurate for mixed block groups, total and free/used are ok,
1681 * available appears slightly larger.
1682 */
1675static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1683static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1676{ 1684{
1677 struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); 1685 struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
@@ -1682,6 +1690,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1682 u64 total_free_data = 0; 1690 u64 total_free_data = 0;
1683 int bits = dentry->d_sb->s_blocksize_bits; 1691 int bits = dentry->d_sb->s_blocksize_bits;
1684 __be32 *fsid = (__be32 *)fs_info->fsid; 1692 __be32 *fsid = (__be32 *)fs_info->fsid;
1693 unsigned factor = 1;
1694 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
1685 int ret; 1695 int ret;
1686 1696
1687 /* holding chunk_muext to avoid allocating new chunks */ 1697 /* holding chunk_muext to avoid allocating new chunks */
@@ -1689,30 +1699,52 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1689 rcu_read_lock(); 1699 rcu_read_lock();
1690 list_for_each_entry_rcu(found, head, list) { 1700 list_for_each_entry_rcu(found, head, list) {
1691 if (found->flags & BTRFS_BLOCK_GROUP_DATA) { 1701 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
1702 int i;
1703
1692 total_free_data += found->disk_total - found->disk_used; 1704 total_free_data += found->disk_total - found->disk_used;
1693 total_free_data -= 1705 total_free_data -=
1694 btrfs_account_ro_block_groups_free_space(found); 1706 btrfs_account_ro_block_groups_free_space(found);
1707
1708 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1709 if (!list_empty(&found->block_groups[i])) {
1710 switch (i) {
1711 case BTRFS_RAID_DUP:
1712 case BTRFS_RAID_RAID1:
1713 case BTRFS_RAID_RAID10:
1714 factor = 2;
1715 }
1716 }
1717 }
1695 } 1718 }
1696 1719
1697 total_used += found->disk_used; 1720 total_used += found->disk_used;
1698 } 1721 }
1722
1699 rcu_read_unlock(); 1723 rcu_read_unlock();
1700 1724
1701 buf->f_namelen = BTRFS_NAME_LEN; 1725 buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
1702 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 1726 buf->f_blocks >>= bits;
1703 buf->f_bfree = buf->f_blocks - (total_used >> bits); 1727 buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
1704 buf->f_bsize = dentry->d_sb->s_blocksize; 1728
1705 buf->f_type = BTRFS_SUPER_MAGIC; 1729 /* Account global block reserve as used, it's in logical size already */
1730 spin_lock(&block_rsv->lock);
1731 buf->f_bfree -= block_rsv->size >> bits;
1732 spin_unlock(&block_rsv->lock);
1733
1706 buf->f_bavail = total_free_data; 1734 buf->f_bavail = total_free_data;
1707 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); 1735 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
1708 if (ret) { 1736 if (ret) {
1709 mutex_unlock(&fs_info->chunk_mutex); 1737 mutex_unlock(&fs_info->chunk_mutex);
1710 return ret; 1738 return ret;
1711 } 1739 }
1712 buf->f_bavail += total_free_data; 1740 buf->f_bavail += div_u64(total_free_data, factor);
1713 buf->f_bavail = buf->f_bavail >> bits; 1741 buf->f_bavail = buf->f_bavail >> bits;
1714 mutex_unlock(&fs_info->chunk_mutex); 1742 mutex_unlock(&fs_info->chunk_mutex);
1715 1743
1744 buf->f_type = BTRFS_SUPER_MAGIC;
1745 buf->f_bsize = dentry->d_sb->s_blocksize;
1746 buf->f_namelen = BTRFS_NAME_LEN;
1747
1716 /* We treat it as constant endianness (it doesn't matter _which_) 1748 /* We treat it as constant endianness (it doesn't matter _which_)
1717 because we want the fsid to come out the same whether mounted 1749 because we want the fsid to come out the same whether mounted
1718 on a big-endian or little-endian host */ 1750 on a big-endian or little-endian host */
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5f379affdf23..d89c6d3542ca 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -218,7 +218,6 @@ loop:
218 spin_lock_init(&cur_trans->delayed_refs.lock); 218 spin_lock_init(&cur_trans->delayed_refs.lock);
219 219
220 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 220 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
221 INIT_LIST_HEAD(&cur_trans->ordered_operations);
222 INIT_LIST_HEAD(&cur_trans->pending_chunks); 221 INIT_LIST_HEAD(&cur_trans->pending_chunks);
223 INIT_LIST_HEAD(&cur_trans->switch_commits); 222 INIT_LIST_HEAD(&cur_trans->switch_commits);
224 list_add_tail(&cur_trans->list, &fs_info->trans_list); 223 list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@ -1612,27 +1611,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1612 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1611 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1613} 1612}
1614 1613
1615static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1616 struct btrfs_root *root)
1617{
1618 int ret;
1619
1620 ret = btrfs_run_delayed_items(trans, root);
1621 if (ret)
1622 return ret;
1623
1624 /*
1625 * rename don't use btrfs_join_transaction, so, once we
1626 * set the transaction to blocked above, we aren't going
1627 * to get any new ordered operations. We can safely run
1628 * it here and no for sure that nothing new will be added
1629 * to the list
1630 */
1631 ret = btrfs_run_ordered_operations(trans, root, 1);
1632
1633 return ret;
1634}
1635
1636static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) 1614static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1637{ 1615{
1638 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) 1616 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
@@ -1653,13 +1631,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1653 struct btrfs_transaction *prev_trans = NULL; 1631 struct btrfs_transaction *prev_trans = NULL;
1654 int ret; 1632 int ret;
1655 1633
1656 ret = btrfs_run_ordered_operations(trans, root, 0);
1657 if (ret) {
1658 btrfs_abort_transaction(trans, root, ret);
1659 btrfs_end_transaction(trans, root);
1660 return ret;
1661 }
1662
1663 /* Stop the commit early if ->aborted is set */ 1634 /* Stop the commit early if ->aborted is set */
1664 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { 1635 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1665 ret = cur_trans->aborted; 1636 ret = cur_trans->aborted;
@@ -1740,7 +1711,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1740 if (ret) 1711 if (ret)
1741 goto cleanup_transaction; 1712 goto cleanup_transaction;
1742 1713
1743 ret = btrfs_flush_all_pending_stuffs(trans, root); 1714 ret = btrfs_run_delayed_items(trans, root);
1744 if (ret) 1715 if (ret)
1745 goto cleanup_transaction; 1716 goto cleanup_transaction;
1746 1717
@@ -1748,7 +1719,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1748 extwriter_counter_read(cur_trans) == 0); 1719 extwriter_counter_read(cur_trans) == 0);
1749 1720
1750 /* some pending stuffs might be added after the previous flush. */ 1721 /* some pending stuffs might be added after the previous flush. */
1751 ret = btrfs_flush_all_pending_stuffs(trans, root); 1722 ret = btrfs_run_delayed_items(trans, root);
1752 if (ret) 1723 if (ret)
1753 goto cleanup_transaction; 1724 goto cleanup_transaction;
1754 1725
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 7dd558ed0716..579be51b27e5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -55,7 +55,6 @@ struct btrfs_transaction {
55 wait_queue_head_t writer_wait; 55 wait_queue_head_t writer_wait;
56 wait_queue_head_t commit_wait; 56 wait_queue_head_t commit_wait;
57 struct list_head pending_snapshots; 57 struct list_head pending_snapshots;
58 struct list_head ordered_operations;
59 struct list_head pending_chunks; 58 struct list_head pending_chunks;
60 struct list_head switch_commits; 59 struct list_head switch_commits;
61 struct btrfs_delayed_ref_root delayed_refs; 60 struct btrfs_delayed_ref_root delayed_refs;
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 7f78cbf5cf41..4c29db604bbe 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -57,6 +57,21 @@ void ulist_free(struct ulist *ulist);
57int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); 57int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
58int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, 58int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
59 u64 *old_aux, gfp_t gfp_mask); 59 u64 *old_aux, gfp_t gfp_mask);
60
61/* just like ulist_add_merge() but take a pointer for the aux data */
62static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
63 void **old_aux, gfp_t gfp_mask)
64{
65#if BITS_PER_LONG == 32
66 u64 old64 = (uintptr_t)*old_aux;
67 int ret = ulist_add_merge(ulist, val, (uintptr_t)aux, &old64, gfp_mask);
68 *old_aux = (void *)((uintptr_t)old64);
69 return ret;
70#else
71 return ulist_add_merge(ulist, val, (u64)aux, (u64 *)old_aux, gfp_mask);
72#endif
73}
74
60struct ulist_node *ulist_next(struct ulist *ulist, 75struct ulist_node *ulist_next(struct ulist *ulist,
61 struct ulist_iterator *uiter); 76 struct ulist_iterator *uiter);
62 77
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 469f2e8657e8..cebf2ebefb55 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -172,14 +172,24 @@ out:
172int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) 172int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir)
173{ 173{
174 struct posix_acl *default_acl, *acl; 174 struct posix_acl *default_acl, *acl;
175 umode_t new_mode = inode->i_mode;
175 int error; 176 int error;
176 177
177 error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); 178 error = posix_acl_create(dir, &new_mode, &default_acl, &acl);
178 if (error) 179 if (error)
179 return error; 180 return error;
180 181
181 if (!default_acl && !acl) 182 if (!default_acl && !acl) {
182 cache_no_acl(inode); 183 cache_no_acl(inode);
184 if (new_mode != inode->i_mode) {
185 struct iattr newattrs = {
186 .ia_mode = new_mode,
187 .ia_valid = ATTR_MODE,
188 };
189 error = ceph_setattr(dentry, &newattrs);
190 }
191 return error;
192 }
183 193
184 if (default_acl) { 194 if (default_acl) {
185 error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); 195 error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 1fde164b74b5..6d1cd45dca89 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3277,7 +3277,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
3277 rel->ino = cpu_to_le64(ceph_ino(inode)); 3277 rel->ino = cpu_to_le64(ceph_ino(inode));
3278 rel->cap_id = cpu_to_le64(cap->cap_id); 3278 rel->cap_id = cpu_to_le64(cap->cap_id);
3279 rel->seq = cpu_to_le32(cap->seq); 3279 rel->seq = cpu_to_le32(cap->seq);
3280 rel->issue_seq = cpu_to_le32(cap->issue_seq), 3280 rel->issue_seq = cpu_to_le32(cap->issue_seq);
3281 rel->mseq = cpu_to_le32(cap->mseq); 3281 rel->mseq = cpu_to_le32(cap->mseq);
3282 rel->caps = cpu_to_le32(cap->implemented); 3282 rel->caps = cpu_to_le32(cap->implemented);
3283 rel->wanted = cpu_to_le32(cap->mds_wanted); 3283 rel->wanted = cpu_to_le32(cap->mds_wanted);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 302085100c28..2eb02f80a0ab 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -423,6 +423,9 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
423 dout("sync_read on file %p %llu~%u %s\n", file, off, 423 dout("sync_read on file %p %llu~%u %s\n", file, off,
424 (unsigned)len, 424 (unsigned)len,
425 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 425 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
426
427 if (!len)
428 return 0;
426 /* 429 /*
427 * flush any page cache pages in this range. this 430 * flush any page cache pages in this range. this
428 * will make concurrent normal and sync io slow, 431 * will make concurrent normal and sync io slow,
@@ -470,8 +473,11 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
470 size_t left = ret; 473 size_t left = ret;
471 474
472 while (left) { 475 while (left) {
473 int copy = min_t(size_t, PAGE_SIZE, left); 476 size_t page_off = off & ~PAGE_MASK;
474 l = copy_page_to_iter(pages[k++], 0, copy, i); 477 size_t copy = min_t(size_t,
478 PAGE_SIZE - page_off, left);
479 l = copy_page_to_iter(pages[k++], page_off,
480 copy, i);
475 off += l; 481 off += l;
476 left -= l; 482 left -= l;
477 if (l < copy) 483 if (l < copy)
@@ -531,7 +537,7 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
531 * objects, rollback on failure, etc.) 537 * objects, rollback on failure, etc.)
532 */ 538 */
533static ssize_t 539static ssize_t
534ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from) 540ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
535{ 541{
536 struct file *file = iocb->ki_filp; 542 struct file *file = iocb->ki_filp;
537 struct inode *inode = file_inode(file); 543 struct inode *inode = file_inode(file);
@@ -547,7 +553,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
547 int check_caps = 0; 553 int check_caps = 0;
548 int ret; 554 int ret;
549 struct timespec mtime = CURRENT_TIME; 555 struct timespec mtime = CURRENT_TIME;
550 loff_t pos = iocb->ki_pos;
551 size_t count = iov_iter_count(from); 556 size_t count = iov_iter_count(from);
552 557
553 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 558 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
@@ -646,7 +651,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
646 * correct atomic write, we should e.g. take write locks on all 651 * correct atomic write, we should e.g. take write locks on all
647 * objects, rollback on failure, etc.) 652 * objects, rollback on failure, etc.)
648 */ 653 */
649static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from) 654static ssize_t
655ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
650{ 656{
651 struct file *file = iocb->ki_filp; 657 struct file *file = iocb->ki_filp;
652 struct inode *inode = file_inode(file); 658 struct inode *inode = file_inode(file);
@@ -663,7 +669,6 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from)
663 int check_caps = 0; 669 int check_caps = 0;
664 int ret; 670 int ret;
665 struct timespec mtime = CURRENT_TIME; 671 struct timespec mtime = CURRENT_TIME;
666 loff_t pos = iocb->ki_pos;
667 size_t count = iov_iter_count(from); 672 size_t count = iov_iter_count(from);
668 673
669 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 674 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
@@ -918,9 +923,9 @@ retry_snap:
918 /* we might need to revert back to that point */ 923 /* we might need to revert back to that point */
919 data = *from; 924 data = *from;
920 if (file->f_flags & O_DIRECT) 925 if (file->f_flags & O_DIRECT)
921 written = ceph_sync_direct_write(iocb, &data); 926 written = ceph_sync_direct_write(iocb, &data, pos);
922 else 927 else
923 written = ceph_sync_write(iocb, &data); 928 written = ceph_sync_write(iocb, &data, pos);
924 if (written == -EOLDSNAPC) { 929 if (written == -EOLDSNAPC) {
925 dout("aio_write %p %llx.%llx %llu~%u" 930 dout("aio_write %p %llx.%llx %llu~%u"
926 "got EOLDSNAPC, retrying\n", 931 "got EOLDSNAPC, retrying\n",
@@ -1177,6 +1182,9 @@ static long ceph_fallocate(struct file *file, int mode,
1177 loff_t endoff = 0; 1182 loff_t endoff = 0;
1178 loff_t size; 1183 loff_t size;
1179 1184
1185 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
1186 return -EOPNOTSUPP;
1187
1180 if (!S_ISREG(inode->i_mode)) 1188 if (!S_ISREG(inode->i_mode))
1181 return -EOPNOTSUPP; 1189 return -EOPNOTSUPP;
1182 1190
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 92a2548278fc..bad07c09f91e 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1904,6 +1904,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1904 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); 1904 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1905 1905
1906 if (req->r_got_unsafe) { 1906 if (req->r_got_unsafe) {
1907 void *p;
1907 /* 1908 /*
1908 * Replay. Do not regenerate message (and rebuild 1909 * Replay. Do not regenerate message (and rebuild
1909 * paths, etc.); just use the original message. 1910 * paths, etc.); just use the original message.
@@ -1924,8 +1925,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1924 1925
1925 /* remove cap/dentry releases from message */ 1926 /* remove cap/dentry releases from message */
1926 rhead->num_releases = 0; 1927 rhead->num_releases = 0;
1927 msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset); 1928
1928 msg->front.iov_len = req->r_request_release_offset; 1929 /* time stamp */
1930 p = msg->front.iov_base + req->r_request_release_offset;
1931 ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
1932
1933 msg->front.iov_len = p - msg->front.iov_base;
1934 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1929 return 0; 1935 return 0;
1930 } 1936 }
1931 1937
@@ -2061,11 +2067,12 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
2061static void kick_requests(struct ceph_mds_client *mdsc, int mds) 2067static void kick_requests(struct ceph_mds_client *mdsc, int mds)
2062{ 2068{
2063 struct ceph_mds_request *req; 2069 struct ceph_mds_request *req;
2064 struct rb_node *p; 2070 struct rb_node *p = rb_first(&mdsc->request_tree);
2065 2071
2066 dout("kick_requests mds%d\n", mds); 2072 dout("kick_requests mds%d\n", mds);
2067 for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) { 2073 while (p) {
2068 req = rb_entry(p, struct ceph_mds_request, r_node); 2074 req = rb_entry(p, struct ceph_mds_request, r_node);
2075 p = rb_next(p);
2069 if (req->r_got_unsafe) 2076 if (req->r_got_unsafe)
2070 continue; 2077 continue;
2071 if (req->r_session && 2078 if (req->r_session &&
@@ -2248,6 +2255,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2248 */ 2255 */
2249 if (result == -ESTALE) { 2256 if (result == -ESTALE) {
2250 dout("got ESTALE on request %llu", req->r_tid); 2257 dout("got ESTALE on request %llu", req->r_tid);
2258 req->r_resend_mds = -1;
2251 if (req->r_direct_mode != USE_AUTH_MDS) { 2259 if (req->r_direct_mode != USE_AUTH_MDS) {
2252 dout("not using auth, setting for that now"); 2260 dout("not using auth, setting for that now");
2253 req->r_direct_mode = USE_AUTH_MDS; 2261 req->r_direct_mode = USE_AUTH_MDS;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 06150fd745ac..f6e12377335c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -755,7 +755,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
755 goto out; 755 goto out;
756 } 756 }
757 } else { 757 } else {
758 root = d_obtain_alias(inode); 758 root = d_obtain_root(inode);
759 } 759 }
760 ceph_init_dentry(root); 760 ceph_init_dentry(root);
761 dout("open_root_inode success, root dentry is %p\n", root); 761 dout("open_root_inode success, root dentry is %p\n", root);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index c9c2b887381e..12f58d22e017 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -592,12 +592,12 @@ start:
592 xattr_version = ci->i_xattrs.version; 592 xattr_version = ci->i_xattrs.version;
593 spin_unlock(&ci->i_ceph_lock); 593 spin_unlock(&ci->i_ceph_lock);
594 594
595 xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), 595 xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
596 GFP_NOFS); 596 GFP_NOFS);
597 err = -ENOMEM; 597 err = -ENOMEM;
598 if (!xattrs) 598 if (!xattrs)
599 goto bad_lock; 599 goto bad_lock;
600 memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *)); 600
601 for (i = 0; i < numattr; i++) { 601 for (i = 0; i < numattr; i++) {
602 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr), 602 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
603 GFP_NOFS); 603 GFP_NOFS);
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index f3ac4154cbb6..44ec72684df5 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -213,7 +213,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
213 tcon->nativeFileSystem); 213 tcon->nativeFileSystem);
214 } 214 }
215 seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x" 215 seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
216 "\n\tPathComponentMax: %d Status: 0x%d", 216 "\n\tPathComponentMax: %d Status: %d",
217 le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), 217 le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
218 le32_to_cpu(tcon->fsAttrInfo.Attributes), 218 le32_to_cpu(tcon->fsAttrInfo.Attributes),
219 le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), 219 le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 888398067420..ac4f260155c8 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -848,7 +848,7 @@ const struct inode_operations cifs_dir_inode_ops = {
848 .link = cifs_hardlink, 848 .link = cifs_hardlink,
849 .mkdir = cifs_mkdir, 849 .mkdir = cifs_mkdir,
850 .rmdir = cifs_rmdir, 850 .rmdir = cifs_rmdir,
851 .rename = cifs_rename, 851 .rename2 = cifs_rename2,
852 .permission = cifs_permission, 852 .permission = cifs_permission,
853/* revalidate:cifs_revalidate, */ 853/* revalidate:cifs_revalidate, */
854 .setattr = cifs_setattr, 854 .setattr = cifs_setattr,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 70f178a7c759..b0fafa499505 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -68,8 +68,8 @@ extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
68extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t); 68extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
69extern int cifs_mkdir(struct inode *, struct dentry *, umode_t); 69extern int cifs_mkdir(struct inode *, struct dentry *, umode_t);
70extern int cifs_rmdir(struct inode *, struct dentry *); 70extern int cifs_rmdir(struct inode *, struct dentry *);
71extern int cifs_rename(struct inode *, struct dentry *, struct inode *, 71extern int cifs_rename2(struct inode *, struct dentry *, struct inode *,
72 struct dentry *); 72 struct dentry *, unsigned int);
73extern int cifs_revalidate_file_attr(struct file *filp); 73extern int cifs_revalidate_file_attr(struct file *filp);
74extern int cifs_revalidate_dentry_attr(struct dentry *); 74extern int cifs_revalidate_dentry_attr(struct dentry *);
75extern int cifs_revalidate_file(struct file *filp); 75extern int cifs_revalidate_file(struct file *filp);
@@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
136extern const struct export_operations cifs_export_ops; 136extern const struct export_operations cifs_export_ops;
137#endif /* CONFIG_CIFS_NFSD_EXPORT */ 137#endif /* CONFIG_CIFS_NFSD_EXPORT */
138 138
139#define CIFS_VERSION "2.03" 139#define CIFS_VERSION "2.04"
140#endif /* _CIFSFS_H */ 140#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index de6aed8c78e5..0012e1e291d4 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -404,6 +404,11 @@ struct smb_version_operations {
404 const struct cifs_fid *, u32 *); 404 const struct cifs_fid *, u32 *);
405 int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *, 405 int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *,
406 int); 406 int);
407 /* writepages retry size */
408 unsigned int (*wp_retry_size)(struct inode *);
409 /* get mtu credits */
410 int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int,
411 unsigned int *, unsigned int *);
407}; 412};
408 413
409struct smb_version_values { 414struct smb_version_values {
@@ -640,6 +645,16 @@ add_credits(struct TCP_Server_Info *server, const unsigned int add,
640} 645}
641 646
642static inline void 647static inline void
648add_credits_and_wake_if(struct TCP_Server_Info *server, const unsigned int add,
649 const int optype)
650{
651 if (add) {
652 server->ops->add_credits(server, add, optype);
653 wake_up(&server->request_q);
654 }
655}
656
657static inline void
643set_credits(struct TCP_Server_Info *server, const int val) 658set_credits(struct TCP_Server_Info *server, const int val)
644{ 659{
645 server->ops->set_credits(server, val); 660 server->ops->set_credits(server, val);
@@ -1044,6 +1059,7 @@ struct cifs_readdata {
1044 struct address_space *mapping; 1059 struct address_space *mapping;
1045 __u64 offset; 1060 __u64 offset;
1046 unsigned int bytes; 1061 unsigned int bytes;
1062 unsigned int got_bytes;
1047 pid_t pid; 1063 pid_t pid;
1048 int result; 1064 int result;
1049 struct work_struct work; 1065 struct work_struct work;
@@ -1053,6 +1069,7 @@ struct cifs_readdata {
1053 struct kvec iov; 1069 struct kvec iov;
1054 unsigned int pagesz; 1070 unsigned int pagesz;
1055 unsigned int tailsz; 1071 unsigned int tailsz;
1072 unsigned int credits;
1056 unsigned int nr_pages; 1073 unsigned int nr_pages;
1057 struct page *pages[]; 1074 struct page *pages[];
1058}; 1075};
@@ -1073,6 +1090,7 @@ struct cifs_writedata {
1073 int result; 1090 int result;
1074 unsigned int pagesz; 1091 unsigned int pagesz;
1075 unsigned int tailsz; 1092 unsigned int tailsz;
1093 unsigned int credits;
1076 unsigned int nr_pages; 1094 unsigned int nr_pages;
1077 struct page *pages[]; 1095 struct page *pages[];
1078}; 1096};
@@ -1398,6 +1416,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
1398#define CIFS_OBREAK_OP 0x0100 /* oplock break request */ 1416#define CIFS_OBREAK_OP 0x0100 /* oplock break request */
1399#define CIFS_NEG_OP 0x0200 /* negotiate request */ 1417#define CIFS_NEG_OP 0x0200 /* negotiate request */
1400#define CIFS_OP_MASK 0x0380 /* mask request type */ 1418#define CIFS_OP_MASK 0x0380 /* mask request type */
1419#define CIFS_HAS_CREDITS 0x0400 /* already has credits */
1401 1420
1402/* Security Flags: indicate type of session setup needed */ 1421/* Security Flags: indicate type of session setup needed */
1403#define CIFSSEC_MAY_SIGN 0x00001 1422#define CIFSSEC_MAY_SIGN 0x00001
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index ca7980a1e303..c31ce98c1704 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -36,6 +36,7 @@ extern struct smb_hdr *cifs_buf_get(void);
36extern void cifs_buf_release(void *); 36extern void cifs_buf_release(void *);
37extern struct smb_hdr *cifs_small_buf_get(void); 37extern struct smb_hdr *cifs_small_buf_get(void);
38extern void cifs_small_buf_release(void *); 38extern void cifs_small_buf_release(void *);
39extern void free_rsp_buf(int, void *);
39extern void cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx, 40extern void cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx,
40 struct kvec *iov); 41 struct kvec *iov);
41extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *, 42extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
@@ -89,6 +90,9 @@ extern struct mid_q_entry *cifs_setup_async_request(struct TCP_Server_Info *,
89 struct smb_rqst *); 90 struct smb_rqst *);
90extern int cifs_check_receive(struct mid_q_entry *mid, 91extern int cifs_check_receive(struct mid_q_entry *mid,
91 struct TCP_Server_Info *server, bool log_error); 92 struct TCP_Server_Info *server, bool log_error);
93extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server,
94 unsigned int size, unsigned int *num,
95 unsigned int *credits);
92extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, 96extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
93 struct kvec *, int /* nvec to send */, 97 struct kvec *, int /* nvec to send */,
94 int * /* type of buf returned */ , const int flags); 98 int * /* type of buf returned */ , const int flags);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 6ce4e0954b98..66f65001a6d8 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -196,10 +196,6 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
196 if (rc) 196 if (rc)
197 goto out; 197 goto out;
198 198
199 /*
200 * FIXME: check if wsize needs updated due to negotiated smb buffer
201 * size shrinking
202 */
203 atomic_inc(&tconInfoReconnectCount); 199 atomic_inc(&tconInfoReconnectCount);
204 200
205 /* tell server Unix caps we support */ 201 /* tell server Unix caps we support */
@@ -1517,7 +1513,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
1517 return length; 1513 return length;
1518 1514
1519 server->total_read += length; 1515 server->total_read += length;
1520 rdata->bytes = length;
1521 1516
1522 cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n", 1517 cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n",
1523 server->total_read, buflen, data_len); 1518 server->total_read, buflen, data_len);
@@ -1560,12 +1555,18 @@ cifs_readv_callback(struct mid_q_entry *mid)
1560 rc); 1555 rc);
1561 } 1556 }
1562 /* FIXME: should this be counted toward the initiating task? */ 1557 /* FIXME: should this be counted toward the initiating task? */
1563 task_io_account_read(rdata->bytes); 1558 task_io_account_read(rdata->got_bytes);
1564 cifs_stats_bytes_read(tcon, rdata->bytes); 1559 cifs_stats_bytes_read(tcon, rdata->got_bytes);
1565 break; 1560 break;
1566 case MID_REQUEST_SUBMITTED: 1561 case MID_REQUEST_SUBMITTED:
1567 case MID_RETRY_NEEDED: 1562 case MID_RETRY_NEEDED:
1568 rdata->result = -EAGAIN; 1563 rdata->result = -EAGAIN;
1564 if (server->sign && rdata->got_bytes)
1565 /* reset bytes number since we can not check a sign */
1566 rdata->got_bytes = 0;
1567 /* FIXME: should this be counted toward the initiating task? */
1568 task_io_account_read(rdata->got_bytes);
1569 cifs_stats_bytes_read(tcon, rdata->got_bytes);
1569 break; 1570 break;
1570 default: 1571 default:
1571 rdata->result = -EIO; 1572 rdata->result = -EIO;
@@ -1734,10 +1735,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
1734 1735
1735/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ 1736/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
1736 if (*buf) { 1737 if (*buf) {
1737 if (resp_buf_type == CIFS_SMALL_BUFFER) 1738 free_rsp_buf(resp_buf_type, iov[0].iov_base);
1738 cifs_small_buf_release(iov[0].iov_base);
1739 else if (resp_buf_type == CIFS_LARGE_BUFFER)
1740 cifs_buf_release(iov[0].iov_base);
1741 } else if (resp_buf_type != CIFS_NO_BUFFER) { 1739 } else if (resp_buf_type != CIFS_NO_BUFFER) {
1742 /* return buffer to caller to free */ 1740 /* return buffer to caller to free */
1743 *buf = iov[0].iov_base; 1741 *buf = iov[0].iov_base;
@@ -1899,28 +1897,80 @@ cifs_writedata_release(struct kref *refcount)
1899static void 1897static void
1900cifs_writev_requeue(struct cifs_writedata *wdata) 1898cifs_writev_requeue(struct cifs_writedata *wdata)
1901{ 1899{
1902 int i, rc; 1900 int i, rc = 0;
1903 struct inode *inode = wdata->cfile->dentry->d_inode; 1901 struct inode *inode = wdata->cfile->dentry->d_inode;
1904 struct TCP_Server_Info *server; 1902 struct TCP_Server_Info *server;
1903 unsigned int rest_len;
1905 1904
1906 for (i = 0; i < wdata->nr_pages; i++) { 1905 server = tlink_tcon(wdata->cfile->tlink)->ses->server;
1907 lock_page(wdata->pages[i]); 1906 i = 0;
1908 clear_page_dirty_for_io(wdata->pages[i]); 1907 rest_len = wdata->bytes;
1909 }
1910
1911 do { 1908 do {
1912 server = tlink_tcon(wdata->cfile->tlink)->ses->server; 1909 struct cifs_writedata *wdata2;
1913 rc = server->ops->async_writev(wdata, cifs_writedata_release); 1910 unsigned int j, nr_pages, wsize, tailsz, cur_len;
1914 } while (rc == -EAGAIN); 1911
1912 wsize = server->ops->wp_retry_size(inode);
1913 if (wsize < rest_len) {
1914 nr_pages = wsize / PAGE_CACHE_SIZE;
1915 if (!nr_pages) {
1916 rc = -ENOTSUPP;
1917 break;
1918 }
1919 cur_len = nr_pages * PAGE_CACHE_SIZE;
1920 tailsz = PAGE_CACHE_SIZE;
1921 } else {
1922 nr_pages = DIV_ROUND_UP(rest_len, PAGE_CACHE_SIZE);
1923 cur_len = rest_len;
1924 tailsz = rest_len - (nr_pages - 1) * PAGE_CACHE_SIZE;
1925 }
1915 1926
1916 for (i = 0; i < wdata->nr_pages; i++) { 1927 wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete);
1917 unlock_page(wdata->pages[i]); 1928 if (!wdata2) {
1918 if (rc != 0) { 1929 rc = -ENOMEM;
1919 SetPageError(wdata->pages[i]); 1930 break;
1920 end_page_writeback(wdata->pages[i]);
1921 page_cache_release(wdata->pages[i]);
1922 } 1931 }
1923 } 1932
1933 for (j = 0; j < nr_pages; j++) {
1934 wdata2->pages[j] = wdata->pages[i + j];
1935 lock_page(wdata2->pages[j]);
1936 clear_page_dirty_for_io(wdata2->pages[j]);
1937 }
1938
1939 wdata2->sync_mode = wdata->sync_mode;
1940 wdata2->nr_pages = nr_pages;
1941 wdata2->offset = page_offset(wdata2->pages[0]);
1942 wdata2->pagesz = PAGE_CACHE_SIZE;
1943 wdata2->tailsz = tailsz;
1944 wdata2->bytes = cur_len;
1945
1946 wdata2->cfile = find_writable_file(CIFS_I(inode), false);
1947 if (!wdata2->cfile) {
1948 cifs_dbg(VFS, "No writable handles for inode\n");
1949 rc = -EBADF;
1950 break;
1951 }
1952 wdata2->pid = wdata2->cfile->pid;
1953 rc = server->ops->async_writev(wdata2, cifs_writedata_release);
1954
1955 for (j = 0; j < nr_pages; j++) {
1956 unlock_page(wdata2->pages[j]);
1957 if (rc != 0 && rc != -EAGAIN) {
1958 SetPageError(wdata2->pages[j]);
1959 end_page_writeback(wdata2->pages[j]);
1960 page_cache_release(wdata2->pages[j]);
1961 }
1962 }
1963
1964 if (rc) {
1965 kref_put(&wdata2->refcount, cifs_writedata_release);
1966 if (rc == -EAGAIN)
1967 continue;
1968 break;
1969 }
1970
1971 rest_len -= cur_len;
1972 i += nr_pages;
1973 } while (i < wdata->nr_pages);
1924 1974
1925 mapping_set_error(inode->i_mapping, rc); 1975 mapping_set_error(inode->i_mapping, rc);
1926 kref_put(&wdata->refcount, cifs_writedata_release); 1976 kref_put(&wdata->refcount, cifs_writedata_release);
@@ -2203,10 +2253,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
2203 } 2253 }
2204 2254
2205/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ 2255/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
2206 if (resp_buf_type == CIFS_SMALL_BUFFER) 2256 free_rsp_buf(resp_buf_type, iov[0].iov_base);
2207 cifs_small_buf_release(iov[0].iov_base);
2208 else if (resp_buf_type == CIFS_LARGE_BUFFER)
2209 cifs_buf_release(iov[0].iov_base);
2210 2257
2211 /* Note: On -EAGAIN error only caller can retry on handle based calls 2258 /* Note: On -EAGAIN error only caller can retry on handle based calls
2212 since file handle passed in no longer valid */ 2259 since file handle passed in no longer valid */
@@ -2451,10 +2498,7 @@ plk_err_exit:
2451 if (pSMB) 2498 if (pSMB)
2452 cifs_small_buf_release(pSMB); 2499 cifs_small_buf_release(pSMB);
2453 2500
2454 if (resp_buf_type == CIFS_SMALL_BUFFER) 2501 free_rsp_buf(resp_buf_type, iov[0].iov_base);
2455 cifs_small_buf_release(iov[0].iov_base);
2456 else if (resp_buf_type == CIFS_LARGE_BUFFER)
2457 cifs_buf_release(iov[0].iov_base);
2458 2502
2459 /* Note: On -EAGAIN error only caller can retry on handle based calls 2503 /* Note: On -EAGAIN error only caller can retry on handle based calls
2460 since file handle passed in no longer valid */ 2504 since file handle passed in no longer valid */
@@ -3838,10 +3882,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
3838 } 3882 }
3839 } 3883 }
3840qsec_out: 3884qsec_out:
3841 if (buf_type == CIFS_SMALL_BUFFER) 3885 free_rsp_buf(buf_type, iov[0].iov_base);
3842 cifs_small_buf_release(iov[0].iov_base);
3843 else if (buf_type == CIFS_LARGE_BUFFER)
3844 cifs_buf_release(iov[0].iov_base);
3845/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ 3886/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
3846 return rc; 3887 return rc;
3847} 3888}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index b98366f21f9e..03ed8a09581c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -557,7 +557,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
557 try_to_freeze(); 557 try_to_freeze();
558 558
559 if (server_unresponsive(server)) { 559 if (server_unresponsive(server)) {
560 total_read = -EAGAIN; 560 total_read = -ECONNABORTED;
561 break; 561 break;
562 } 562 }
563 563
@@ -571,7 +571,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
571 break; 571 break;
572 } else if (server->tcpStatus == CifsNeedReconnect) { 572 } else if (server->tcpStatus == CifsNeedReconnect) {
573 cifs_reconnect(server); 573 cifs_reconnect(server);
574 total_read = -EAGAIN; 574 total_read = -ECONNABORTED;
575 break; 575 break;
576 } else if (length == -ERESTARTSYS || 576 } else if (length == -ERESTARTSYS ||
577 length == -EAGAIN || 577 length == -EAGAIN ||
@@ -588,7 +588,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
588 cifs_dbg(FYI, "Received no data or error: expecting %d\n" 588 cifs_dbg(FYI, "Received no data or error: expecting %d\n"
589 "got %d", to_read, length); 589 "got %d", to_read, length);
590 cifs_reconnect(server); 590 cifs_reconnect(server);
591 total_read = -EAGAIN; 591 total_read = -ECONNABORTED;
592 break; 592 break;
593 } 593 }
594 } 594 }
@@ -786,7 +786,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
786 cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length); 786 cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length);
787 cifs_reconnect(server); 787 cifs_reconnect(server);
788 wake_up(&server->response_q); 788 wake_up(&server->response_q);
789 return -EAGAIN; 789 return -ECONNABORTED;
790 } 790 }
791 791
792 /* switch to large buffer if too big for a small one */ 792 /* switch to large buffer if too big for a small one */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b88b1ade4d3d..4ab2f79ffa7a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1670,8 +1670,8 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
1670 break; 1670 break;
1671 } 1671 }
1672 1672
1673 len = min((size_t)cifs_sb->wsize, 1673 len = min(server->ops->wp_retry_size(dentry->d_inode),
1674 write_size - total_written); 1674 (unsigned int)write_size - total_written);
1675 /* iov[0] is reserved for smb header */ 1675 /* iov[0] is reserved for smb header */
1676 iov[1].iov_base = (char *)write_data + total_written; 1676 iov[1].iov_base = (char *)write_data + total_written;
1677 iov[1].iov_len = len; 1677 iov[1].iov_len = len;
@@ -1878,15 +1878,163 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1878 return rc; 1878 return rc;
1879} 1879}
1880 1880
1881static struct cifs_writedata *
1882wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping,
1883 pgoff_t end, pgoff_t *index,
1884 unsigned int *found_pages)
1885{
1886 unsigned int nr_pages;
1887 struct page **pages;
1888 struct cifs_writedata *wdata;
1889
1890 wdata = cifs_writedata_alloc((unsigned int)tofind,
1891 cifs_writev_complete);
1892 if (!wdata)
1893 return NULL;
1894
1895 /*
1896 * find_get_pages_tag seems to return a max of 256 on each
1897 * iteration, so we must call it several times in order to
1898 * fill the array or the wsize is effectively limited to
1899 * 256 * PAGE_CACHE_SIZE.
1900 */
1901 *found_pages = 0;
1902 pages = wdata->pages;
1903 do {
1904 nr_pages = find_get_pages_tag(mapping, index,
1905 PAGECACHE_TAG_DIRTY, tofind,
1906 pages);
1907 *found_pages += nr_pages;
1908 tofind -= nr_pages;
1909 pages += nr_pages;
1910 } while (nr_pages && tofind && *index <= end);
1911
1912 return wdata;
1913}
1914
1915static unsigned int
1916wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages,
1917 struct address_space *mapping,
1918 struct writeback_control *wbc,
1919 pgoff_t end, pgoff_t *index, pgoff_t *next, bool *done)
1920{
1921 unsigned int nr_pages = 0, i;
1922 struct page *page;
1923
1924 for (i = 0; i < found_pages; i++) {
1925 page = wdata->pages[i];
1926 /*
1927 * At this point we hold neither mapping->tree_lock nor
1928 * lock on the page itself: the page may be truncated or
1929 * invalidated (changing page->mapping to NULL), or even
1930 * swizzled back from swapper_space to tmpfs file
1931 * mapping
1932 */
1933
1934 if (nr_pages == 0)
1935 lock_page(page);
1936 else if (!trylock_page(page))
1937 break;
1938
1939 if (unlikely(page->mapping != mapping)) {
1940 unlock_page(page);
1941 break;
1942 }
1943
1944 if (!wbc->range_cyclic && page->index > end) {
1945 *done = true;
1946 unlock_page(page);
1947 break;
1948 }
1949
1950 if (*next && (page->index != *next)) {
1951 /* Not next consecutive page */
1952 unlock_page(page);
1953 break;
1954 }
1955
1956 if (wbc->sync_mode != WB_SYNC_NONE)
1957 wait_on_page_writeback(page);
1958
1959 if (PageWriteback(page) ||
1960 !clear_page_dirty_for_io(page)) {
1961 unlock_page(page);
1962 break;
1963 }
1964
1965 /*
1966 * This actually clears the dirty bit in the radix tree.
1967 * See cifs_writepage() for more commentary.
1968 */
1969 set_page_writeback(page);
1970 if (page_offset(page) >= i_size_read(mapping->host)) {
1971 *done = true;
1972 unlock_page(page);
1973 end_page_writeback(page);
1974 break;
1975 }
1976
1977 wdata->pages[i] = page;
1978 *next = page->index + 1;
1979 ++nr_pages;
1980 }
1981
1982 /* reset index to refind any pages skipped */
1983 if (nr_pages == 0)
1984 *index = wdata->pages[0]->index + 1;
1985
1986 /* put any pages we aren't going to use */
1987 for (i = nr_pages; i < found_pages; i++) {
1988 page_cache_release(wdata->pages[i]);
1989 wdata->pages[i] = NULL;
1990 }
1991
1992 return nr_pages;
1993}
1994
1995static int
1996wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
1997 struct address_space *mapping, struct writeback_control *wbc)
1998{
1999 int rc = 0;
2000 struct TCP_Server_Info *server;
2001 unsigned int i;
2002
2003 wdata->sync_mode = wbc->sync_mode;
2004 wdata->nr_pages = nr_pages;
2005 wdata->offset = page_offset(wdata->pages[0]);
2006 wdata->pagesz = PAGE_CACHE_SIZE;
2007 wdata->tailsz = min(i_size_read(mapping->host) -
2008 page_offset(wdata->pages[nr_pages - 1]),
2009 (loff_t)PAGE_CACHE_SIZE);
2010 wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz;
2011
2012 if (wdata->cfile != NULL)
2013 cifsFileInfo_put(wdata->cfile);
2014 wdata->cfile = find_writable_file(CIFS_I(mapping->host), false);
2015 if (!wdata->cfile) {
2016 cifs_dbg(VFS, "No writable handles for inode\n");
2017 rc = -EBADF;
2018 } else {
2019 wdata->pid = wdata->cfile->pid;
2020 server = tlink_tcon(wdata->cfile->tlink)->ses->server;
2021 rc = server->ops->async_writev(wdata, cifs_writedata_release);
2022 }
2023
2024 for (i = 0; i < nr_pages; ++i)
2025 unlock_page(wdata->pages[i]);
2026
2027 return rc;
2028}
2029
1881static int cifs_writepages(struct address_space *mapping, 2030static int cifs_writepages(struct address_space *mapping,
1882 struct writeback_control *wbc) 2031 struct writeback_control *wbc)
1883{ 2032{
1884 struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb); 2033 struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb);
2034 struct TCP_Server_Info *server;
1885 bool done = false, scanned = false, range_whole = false; 2035 bool done = false, scanned = false, range_whole = false;
1886 pgoff_t end, index; 2036 pgoff_t end, index;
1887 struct cifs_writedata *wdata; 2037 struct cifs_writedata *wdata;
1888 struct TCP_Server_Info *server;
1889 struct page *page;
1890 int rc = 0; 2038 int rc = 0;
1891 2039
1892 /* 2040 /*
@@ -1906,152 +2054,50 @@ static int cifs_writepages(struct address_space *mapping,
1906 range_whole = true; 2054 range_whole = true;
1907 scanned = true; 2055 scanned = true;
1908 } 2056 }
2057 server = cifs_sb_master_tcon(cifs_sb)->ses->server;
1909retry: 2058retry:
1910 while (!done && index <= end) { 2059 while (!done && index <= end) {
1911 unsigned int i, nr_pages, found_pages; 2060 unsigned int i, nr_pages, found_pages, wsize, credits;
1912 pgoff_t next = 0, tofind; 2061 pgoff_t next = 0, tofind, saved_index = index;
1913 struct page **pages; 2062
2063 rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
2064 &wsize, &credits);
2065 if (rc)
2066 break;
1914 2067
1915 tofind = min((cifs_sb->wsize / PAGE_CACHE_SIZE) - 1, 2068 tofind = min((wsize / PAGE_CACHE_SIZE) - 1, end - index) + 1;
1916 end - index) + 1;
1917 2069
1918 wdata = cifs_writedata_alloc((unsigned int)tofind, 2070 wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index,
1919 cifs_writev_complete); 2071 &found_pages);
1920 if (!wdata) { 2072 if (!wdata) {
1921 rc = -ENOMEM; 2073 rc = -ENOMEM;
2074 add_credits_and_wake_if(server, credits, 0);
1922 break; 2075 break;
1923 } 2076 }
1924 2077
1925 /*
1926 * find_get_pages_tag seems to return a max of 256 on each
1927 * iteration, so we must call it several times in order to
1928 * fill the array or the wsize is effectively limited to
1929 * 256 * PAGE_CACHE_SIZE.
1930 */
1931 found_pages = 0;
1932 pages = wdata->pages;
1933 do {
1934 nr_pages = find_get_pages_tag(mapping, &index,
1935 PAGECACHE_TAG_DIRTY,
1936 tofind, pages);
1937 found_pages += nr_pages;
1938 tofind -= nr_pages;
1939 pages += nr_pages;
1940 } while (nr_pages && tofind && index <= end);
1941
1942 if (found_pages == 0) { 2078 if (found_pages == 0) {
1943 kref_put(&wdata->refcount, cifs_writedata_release); 2079 kref_put(&wdata->refcount, cifs_writedata_release);
2080 add_credits_and_wake_if(server, credits, 0);
1944 break; 2081 break;
1945 } 2082 }
1946 2083
1947 nr_pages = 0; 2084 nr_pages = wdata_prepare_pages(wdata, found_pages, mapping, wbc,
1948 for (i = 0; i < found_pages; i++) { 2085 end, &index, &next, &done);
1949 page = wdata->pages[i];
1950 /*
1951 * At this point we hold neither mapping->tree_lock nor
1952 * lock on the page itself: the page may be truncated or
1953 * invalidated (changing page->mapping to NULL), or even
1954 * swizzled back from swapper_space to tmpfs file
1955 * mapping
1956 */
1957
1958 if (nr_pages == 0)
1959 lock_page(page);
1960 else if (!trylock_page(page))
1961 break;
1962
1963 if (unlikely(page->mapping != mapping)) {
1964 unlock_page(page);
1965 break;
1966 }
1967
1968 if (!wbc->range_cyclic && page->index > end) {
1969 done = true;
1970 unlock_page(page);
1971 break;
1972 }
1973
1974 if (next && (page->index != next)) {
1975 /* Not next consecutive page */
1976 unlock_page(page);
1977 break;
1978 }
1979
1980 if (wbc->sync_mode != WB_SYNC_NONE)
1981 wait_on_page_writeback(page);
1982
1983 if (PageWriteback(page) ||
1984 !clear_page_dirty_for_io(page)) {
1985 unlock_page(page);
1986 break;
1987 }
1988
1989 /*
1990 * This actually clears the dirty bit in the radix tree.
1991 * See cifs_writepage() for more commentary.
1992 */
1993 set_page_writeback(page);
1994
1995 if (page_offset(page) >= i_size_read(mapping->host)) {
1996 done = true;
1997 unlock_page(page);
1998 end_page_writeback(page);
1999 break;
2000 }
2001
2002 wdata->pages[i] = page;
2003 next = page->index + 1;
2004 ++nr_pages;
2005 }
2006
2007 /* reset index to refind any pages skipped */
2008 if (nr_pages == 0)
2009 index = wdata->pages[0]->index + 1;
2010
2011 /* put any pages we aren't going to use */
2012 for (i = nr_pages; i < found_pages; i++) {
2013 page_cache_release(wdata->pages[i]);
2014 wdata->pages[i] = NULL;
2015 }
2016 2086
2017 /* nothing to write? */ 2087 /* nothing to write? */
2018 if (nr_pages == 0) { 2088 if (nr_pages == 0) {
2019 kref_put(&wdata->refcount, cifs_writedata_release); 2089 kref_put(&wdata->refcount, cifs_writedata_release);
2090 add_credits_and_wake_if(server, credits, 0);
2020 continue; 2091 continue;
2021 } 2092 }
2022 2093
2023 wdata->sync_mode = wbc->sync_mode; 2094 wdata->credits = credits;
2024 wdata->nr_pages = nr_pages;
2025 wdata->offset = page_offset(wdata->pages[0]);
2026 wdata->pagesz = PAGE_CACHE_SIZE;
2027 wdata->tailsz =
2028 min(i_size_read(mapping->host) -
2029 page_offset(wdata->pages[nr_pages - 1]),
2030 (loff_t)PAGE_CACHE_SIZE);
2031 wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) +
2032 wdata->tailsz;
2033
2034 do {
2035 if (wdata->cfile != NULL)
2036 cifsFileInfo_put(wdata->cfile);
2037 wdata->cfile = find_writable_file(CIFS_I(mapping->host),
2038 false);
2039 if (!wdata->cfile) {
2040 cifs_dbg(VFS, "No writable handles for inode\n");
2041 rc = -EBADF;
2042 break;
2043 }
2044 wdata->pid = wdata->cfile->pid;
2045 server = tlink_tcon(wdata->cfile->tlink)->ses->server;
2046 rc = server->ops->async_writev(wdata,
2047 cifs_writedata_release);
2048 } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN);
2049 2095
2050 for (i = 0; i < nr_pages; ++i) 2096 rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
2051 unlock_page(wdata->pages[i]);
2052 2097
2053 /* send failure -- clean up the mess */ 2098 /* send failure -- clean up the mess */
2054 if (rc != 0) { 2099 if (rc != 0) {
2100 add_credits_and_wake_if(server, wdata->credits, 0);
2055 for (i = 0; i < nr_pages; ++i) { 2101 for (i = 0; i < nr_pages; ++i) {
2056 if (rc == -EAGAIN) 2102 if (rc == -EAGAIN)
2057 redirty_page_for_writepage(wbc, 2103 redirty_page_for_writepage(wbc,
@@ -2066,6 +2112,11 @@ retry:
2066 } 2112 }
2067 kref_put(&wdata->refcount, cifs_writedata_release); 2113 kref_put(&wdata->refcount, cifs_writedata_release);
2068 2114
2115 if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) {
2116 index = saved_index;
2117 continue;
2118 }
2119
2069 wbc->nr_to_write -= nr_pages; 2120 wbc->nr_to_write -= nr_pages;
2070 if (wbc->nr_to_write <= 0) 2121 if (wbc->nr_to_write <= 0)
2071 done = true; 2122 done = true;
@@ -2362,123 +2413,109 @@ cifs_uncached_writev_complete(struct work_struct *work)
2362 kref_put(&wdata->refcount, cifs_uncached_writedata_release); 2413 kref_put(&wdata->refcount, cifs_uncached_writedata_release);
2363} 2414}
2364 2415
2365/* attempt to send write to server, retry on any -EAGAIN errors */
2366static int 2416static int
2367cifs_uncached_retry_writev(struct cifs_writedata *wdata) 2417wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
2418 size_t *len, unsigned long *num_pages)
2368{ 2419{
2369 int rc; 2420 size_t save_len, copied, bytes, cur_len = *len;
2370 struct TCP_Server_Info *server; 2421 unsigned long i, nr_pages = *num_pages;
2371 2422
2372 server = tlink_tcon(wdata->cfile->tlink)->ses->server; 2423 save_len = cur_len;
2424 for (i = 0; i < nr_pages; i++) {
2425 bytes = min_t(const size_t, cur_len, PAGE_SIZE);
2426 copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from);
2427 cur_len -= copied;
2428 /*
2429 * If we didn't copy as much as we expected, then that
2430 * may mean we trod into an unmapped area. Stop copying
2431 * at that point. On the next pass through the big
2432 * loop, we'll likely end up getting a zero-length
2433 * write and bailing out of it.
2434 */
2435 if (copied < bytes)
2436 break;
2437 }
2438 cur_len = save_len - cur_len;
2439 *len = cur_len;
2373 2440
2374 do { 2441 /*
2375 if (wdata->cfile->invalidHandle) { 2442 * If we have no data to send, then that probably means that
2376 rc = cifs_reopen_file(wdata->cfile, false); 2443 * the copy above failed altogether. That's most likely because
2377 if (rc != 0) 2444 * the address in the iovec was bogus. Return -EFAULT and let
2378 continue; 2445 * the caller free anything we allocated and bail out.
2379 } 2446 */
2380 rc = server->ops->async_writev(wdata, 2447 if (!cur_len)
2381 cifs_uncached_writedata_release); 2448 return -EFAULT;
2382 } while (rc == -EAGAIN);
2383 2449
2384 return rc; 2450 /*
2451 * i + 1 now represents the number of pages we actually used in
2452 * the copy phase above.
2453 */
2454 *num_pages = i + 1;
2455 return 0;
2385} 2456}
2386 2457
2387static ssize_t 2458static int
2388cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) 2459cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
2460 struct cifsFileInfo *open_file,
2461 struct cifs_sb_info *cifs_sb, struct list_head *wdata_list)
2389{ 2462{
2390 unsigned long nr_pages, i; 2463 int rc = 0;
2391 size_t bytes, copied, len, cur_len; 2464 size_t cur_len;
2392 ssize_t total_written = 0; 2465 unsigned long nr_pages, num_pages, i;
2393 loff_t offset; 2466 struct cifs_writedata *wdata;
2394 struct cifsFileInfo *open_file; 2467 struct iov_iter saved_from;
2395 struct cifs_tcon *tcon; 2468 loff_t saved_offset = offset;
2396 struct cifs_sb_info *cifs_sb;
2397 struct cifs_writedata *wdata, *tmp;
2398 struct list_head wdata_list;
2399 int rc;
2400 pid_t pid; 2469 pid_t pid;
2401 2470 struct TCP_Server_Info *server;
2402 len = iov_iter_count(from);
2403 rc = generic_write_checks(file, poffset, &len, 0);
2404 if (rc)
2405 return rc;
2406
2407 if (!len)
2408 return 0;
2409
2410 iov_iter_truncate(from, len);
2411
2412 INIT_LIST_HEAD(&wdata_list);
2413 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
2414 open_file = file->private_data;
2415 tcon = tlink_tcon(open_file->tlink);
2416
2417 if (!tcon->ses->server->ops->async_writev)
2418 return -ENOSYS;
2419
2420 offset = *poffset;
2421 2471
2422 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) 2472 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
2423 pid = open_file->pid; 2473 pid = open_file->pid;
2424 else 2474 else
2425 pid = current->tgid; 2475 pid = current->tgid;
2426 2476
2477 server = tlink_tcon(open_file->tlink)->ses->server;
2478 memcpy(&saved_from, from, sizeof(struct iov_iter));
2479
2427 do { 2480 do {
2428 size_t save_len; 2481 unsigned int wsize, credits;
2482
2483 rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
2484 &wsize, &credits);
2485 if (rc)
2486 break;
2429 2487
2430 nr_pages = get_numpages(cifs_sb->wsize, len, &cur_len); 2488 nr_pages = get_numpages(wsize, len, &cur_len);
2431 wdata = cifs_writedata_alloc(nr_pages, 2489 wdata = cifs_writedata_alloc(nr_pages,
2432 cifs_uncached_writev_complete); 2490 cifs_uncached_writev_complete);
2433 if (!wdata) { 2491 if (!wdata) {
2434 rc = -ENOMEM; 2492 rc = -ENOMEM;
2493 add_credits_and_wake_if(server, credits, 0);
2435 break; 2494 break;
2436 } 2495 }
2437 2496
2438 rc = cifs_write_allocate_pages(wdata->pages, nr_pages); 2497 rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
2439 if (rc) { 2498 if (rc) {
2440 kfree(wdata); 2499 kfree(wdata);
2500 add_credits_and_wake_if(server, credits, 0);
2441 break; 2501 break;
2442 } 2502 }
2443 2503
2444 save_len = cur_len; 2504 num_pages = nr_pages;
2445 for (i = 0; i < nr_pages; i++) { 2505 rc = wdata_fill_from_iovec(wdata, from, &cur_len, &num_pages);
2446 bytes = min_t(size_t, cur_len, PAGE_SIZE); 2506 if (rc) {
2447 copied = copy_page_from_iter(wdata->pages[i], 0, bytes,
2448 from);
2449 cur_len -= copied;
2450 /*
2451 * If we didn't copy as much as we expected, then that
2452 * may mean we trod into an unmapped area. Stop copying
2453 * at that point. On the next pass through the big
2454 * loop, we'll likely end up getting a zero-length
2455 * write and bailing out of it.
2456 */
2457 if (copied < bytes)
2458 break;
2459 }
2460 cur_len = save_len - cur_len;
2461
2462 /*
2463 * If we have no data to send, then that probably means that
2464 * the copy above failed altogether. That's most likely because
2465 * the address in the iovec was bogus. Set the rc to -EFAULT,
2466 * free anything we allocated and bail out.
2467 */
2468 if (!cur_len) {
2469 for (i = 0; i < nr_pages; i++) 2507 for (i = 0; i < nr_pages; i++)
2470 put_page(wdata->pages[i]); 2508 put_page(wdata->pages[i]);
2471 kfree(wdata); 2509 kfree(wdata);
2472 rc = -EFAULT; 2510 add_credits_and_wake_if(server, credits, 0);
2473 break; 2511 break;
2474 } 2512 }
2475 2513
2476 /* 2514 /*
2477 * i + 1 now represents the number of pages we actually used in 2515 * Bring nr_pages down to the number of pages we actually used,
2478 * the copy phase above. Bring nr_pages down to that, and free 2516 * and free any pages that we didn't use.
2479 * any pages that we didn't use.
2480 */ 2517 */
2481 for ( ; nr_pages > i + 1; nr_pages--) 2518 for ( ; nr_pages > num_pages; nr_pages--)
2482 put_page(wdata->pages[nr_pages - 1]); 2519 put_page(wdata->pages[nr_pages - 1]);
2483 2520
2484 wdata->sync_mode = WB_SYNC_ALL; 2521 wdata->sync_mode = WB_SYNC_ALL;
@@ -2489,18 +2526,69 @@ cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
2489 wdata->bytes = cur_len; 2526 wdata->bytes = cur_len;
2490 wdata->pagesz = PAGE_SIZE; 2527 wdata->pagesz = PAGE_SIZE;
2491 wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); 2528 wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
2492 rc = cifs_uncached_retry_writev(wdata); 2529 wdata->credits = credits;
2530
2531 if (!wdata->cfile->invalidHandle ||
2532 !cifs_reopen_file(wdata->cfile, false))
2533 rc = server->ops->async_writev(wdata,
2534 cifs_uncached_writedata_release);
2493 if (rc) { 2535 if (rc) {
2536 add_credits_and_wake_if(server, wdata->credits, 0);
2494 kref_put(&wdata->refcount, 2537 kref_put(&wdata->refcount,
2495 cifs_uncached_writedata_release); 2538 cifs_uncached_writedata_release);
2539 if (rc == -EAGAIN) {
2540 memcpy(from, &saved_from,
2541 sizeof(struct iov_iter));
2542 iov_iter_advance(from, offset - saved_offset);
2543 continue;
2544 }
2496 break; 2545 break;
2497 } 2546 }
2498 2547
2499 list_add_tail(&wdata->list, &wdata_list); 2548 list_add_tail(&wdata->list, wdata_list);
2500 offset += cur_len; 2549 offset += cur_len;
2501 len -= cur_len; 2550 len -= cur_len;
2502 } while (len > 0); 2551 } while (len > 0);
2503 2552
2553 return rc;
2554}
2555
2556static ssize_t
2557cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
2558{
2559 size_t len;
2560 ssize_t total_written = 0;
2561 struct cifsFileInfo *open_file;
2562 struct cifs_tcon *tcon;
2563 struct cifs_sb_info *cifs_sb;
2564 struct cifs_writedata *wdata, *tmp;
2565 struct list_head wdata_list;
2566 struct iov_iter saved_from;
2567 int rc;
2568
2569 len = iov_iter_count(from);
2570 rc = generic_write_checks(file, poffset, &len, 0);
2571 if (rc)
2572 return rc;
2573
2574 if (!len)
2575 return 0;
2576
2577 iov_iter_truncate(from, len);
2578
2579 INIT_LIST_HEAD(&wdata_list);
2580 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
2581 open_file = file->private_data;
2582 tcon = tlink_tcon(open_file->tlink);
2583
2584 if (!tcon->ses->server->ops->async_writev)
2585 return -ENOSYS;
2586
2587 memcpy(&saved_from, from, sizeof(struct iov_iter));
2588
2589 rc = cifs_write_from_iter(*poffset, len, from, open_file, cifs_sb,
2590 &wdata_list);
2591
2504 /* 2592 /*
2505 * If at least one write was successfully sent, then discard any rc 2593 * If at least one write was successfully sent, then discard any rc
2506 * value from the later writes. If the other write succeeds, then 2594 * value from the later writes. If the other write succeeds, then
@@ -2529,7 +2617,25 @@ restart_loop:
2529 2617
2530 /* resend call if it's a retryable error */ 2618 /* resend call if it's a retryable error */
2531 if (rc == -EAGAIN) { 2619 if (rc == -EAGAIN) {
2532 rc = cifs_uncached_retry_writev(wdata); 2620 struct list_head tmp_list;
2621 struct iov_iter tmp_from;
2622
2623 INIT_LIST_HEAD(&tmp_list);
2624 list_del_init(&wdata->list);
2625
2626 memcpy(&tmp_from, &saved_from,
2627 sizeof(struct iov_iter));
2628 iov_iter_advance(&tmp_from,
2629 wdata->offset - *poffset);
2630
2631 rc = cifs_write_from_iter(wdata->offset,
2632 wdata->bytes, &tmp_from,
2633 open_file, cifs_sb, &tmp_list);
2634
2635 list_splice(&tmp_list, &wdata_list);
2636
2637 kref_put(&wdata->refcount,
2638 cifs_uncached_writedata_release);
2533 goto restart_loop; 2639 goto restart_loop;
2534 } 2640 }
2535 } 2641 }
@@ -2722,26 +2828,6 @@ cifs_uncached_readdata_release(struct kref *refcount)
2722 cifs_readdata_release(refcount); 2828 cifs_readdata_release(refcount);
2723} 2829}
2724 2830
2725static int
2726cifs_retry_async_readv(struct cifs_readdata *rdata)
2727{
2728 int rc;
2729 struct TCP_Server_Info *server;
2730
2731 server = tlink_tcon(rdata->cfile->tlink)->ses->server;
2732
2733 do {
2734 if (rdata->cfile->invalidHandle) {
2735 rc = cifs_reopen_file(rdata->cfile, true);
2736 if (rc != 0)
2737 continue;
2738 }
2739 rc = server->ops->async_readv(rdata);
2740 } while (rc == -EAGAIN);
2741
2742 return rc;
2743}
2744
2745/** 2831/**
2746 * cifs_readdata_to_iov - copy data from pages in response to an iovec 2832 * cifs_readdata_to_iov - copy data from pages in response to an iovec
2747 * @rdata: the readdata response with list of pages holding data 2833 * @rdata: the readdata response with list of pages holding data
@@ -2754,7 +2840,7 @@ cifs_retry_async_readv(struct cifs_readdata *rdata)
2754static int 2840static int
2755cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) 2841cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
2756{ 2842{
2757 size_t remaining = rdata->bytes; 2843 size_t remaining = rdata->got_bytes;
2758 unsigned int i; 2844 unsigned int i;
2759 2845
2760 for (i = 0; i < rdata->nr_pages; i++) { 2846 for (i = 0; i < rdata->nr_pages; i++) {
@@ -2782,11 +2868,12 @@ static int
2782cifs_uncached_read_into_pages(struct TCP_Server_Info *server, 2868cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
2783 struct cifs_readdata *rdata, unsigned int len) 2869 struct cifs_readdata *rdata, unsigned int len)
2784{ 2870{
2785 int total_read = 0, result = 0; 2871 int result = 0;
2786 unsigned int i; 2872 unsigned int i;
2787 unsigned int nr_pages = rdata->nr_pages; 2873 unsigned int nr_pages = rdata->nr_pages;
2788 struct kvec iov; 2874 struct kvec iov;
2789 2875
2876 rdata->got_bytes = 0;
2790 rdata->tailsz = PAGE_SIZE; 2877 rdata->tailsz = PAGE_SIZE;
2791 for (i = 0; i < nr_pages; i++) { 2878 for (i = 0; i < nr_pages; i++) {
2792 struct page *page = rdata->pages[i]; 2879 struct page *page = rdata->pages[i];
@@ -2820,55 +2907,45 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
2820 if (result < 0) 2907 if (result < 0)
2821 break; 2908 break;
2822 2909
2823 total_read += result; 2910 rdata->got_bytes += result;
2824 } 2911 }
2825 2912
2826 return total_read > 0 ? total_read : result; 2913 return rdata->got_bytes > 0 && result != -ECONNABORTED ?
2914 rdata->got_bytes : result;
2827} 2915}
2828 2916
2829ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) 2917static int
2918cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
2919 struct cifs_sb_info *cifs_sb, struct list_head *rdata_list)
2830{ 2920{
2831 struct file *file = iocb->ki_filp; 2921 struct cifs_readdata *rdata;
2832 ssize_t rc; 2922 unsigned int npages, rsize, credits;
2833 size_t len, cur_len; 2923 size_t cur_len;
2834 ssize_t total_read = 0; 2924 int rc;
2835 loff_t offset = iocb->ki_pos;
2836 unsigned int npages;
2837 struct cifs_sb_info *cifs_sb;
2838 struct cifs_tcon *tcon;
2839 struct cifsFileInfo *open_file;
2840 struct cifs_readdata *rdata, *tmp;
2841 struct list_head rdata_list;
2842 pid_t pid; 2925 pid_t pid;
2926 struct TCP_Server_Info *server;
2843 2927
2844 len = iov_iter_count(to); 2928 server = tlink_tcon(open_file->tlink)->ses->server;
2845 if (!len)
2846 return 0;
2847
2848 INIT_LIST_HEAD(&rdata_list);
2849 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
2850 open_file = file->private_data;
2851 tcon = tlink_tcon(open_file->tlink);
2852
2853 if (!tcon->ses->server->ops->async_readv)
2854 return -ENOSYS;
2855 2929
2856 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) 2930 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
2857 pid = open_file->pid; 2931 pid = open_file->pid;
2858 else 2932 else
2859 pid = current->tgid; 2933 pid = current->tgid;
2860 2934
2861 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
2862 cifs_dbg(FYI, "attempting read on write only file instance\n");
2863
2864 do { 2935 do {
2865 cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); 2936 rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
2937 &rsize, &credits);
2938 if (rc)
2939 break;
2940
2941 cur_len = min_t(const size_t, len, rsize);
2866 npages = DIV_ROUND_UP(cur_len, PAGE_SIZE); 2942 npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
2867 2943
2868 /* allocate a readdata struct */ 2944 /* allocate a readdata struct */
2869 rdata = cifs_readdata_alloc(npages, 2945 rdata = cifs_readdata_alloc(npages,
2870 cifs_uncached_readv_complete); 2946 cifs_uncached_readv_complete);
2871 if (!rdata) { 2947 if (!rdata) {
2948 add_credits_and_wake_if(server, credits, 0);
2872 rc = -ENOMEM; 2949 rc = -ENOMEM;
2873 break; 2950 break;
2874 } 2951 }
@@ -2884,44 +2961,113 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
2884 rdata->pid = pid; 2961 rdata->pid = pid;
2885 rdata->pagesz = PAGE_SIZE; 2962 rdata->pagesz = PAGE_SIZE;
2886 rdata->read_into_pages = cifs_uncached_read_into_pages; 2963 rdata->read_into_pages = cifs_uncached_read_into_pages;
2964 rdata->credits = credits;
2887 2965
2888 rc = cifs_retry_async_readv(rdata); 2966 if (!rdata->cfile->invalidHandle ||
2967 !cifs_reopen_file(rdata->cfile, true))
2968 rc = server->ops->async_readv(rdata);
2889error: 2969error:
2890 if (rc) { 2970 if (rc) {
2971 add_credits_and_wake_if(server, rdata->credits, 0);
2891 kref_put(&rdata->refcount, 2972 kref_put(&rdata->refcount,
2892 cifs_uncached_readdata_release); 2973 cifs_uncached_readdata_release);
2974 if (rc == -EAGAIN)
2975 continue;
2893 break; 2976 break;
2894 } 2977 }
2895 2978
2896 list_add_tail(&rdata->list, &rdata_list); 2979 list_add_tail(&rdata->list, rdata_list);
2897 offset += cur_len; 2980 offset += cur_len;
2898 len -= cur_len; 2981 len -= cur_len;
2899 } while (len > 0); 2982 } while (len > 0);
2900 2983
2984 return rc;
2985}
2986
2987ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
2988{
2989 struct file *file = iocb->ki_filp;
2990 ssize_t rc;
2991 size_t len;
2992 ssize_t total_read = 0;
2993 loff_t offset = iocb->ki_pos;
2994 struct cifs_sb_info *cifs_sb;
2995 struct cifs_tcon *tcon;
2996 struct cifsFileInfo *open_file;
2997 struct cifs_readdata *rdata, *tmp;
2998 struct list_head rdata_list;
2999
3000 len = iov_iter_count(to);
3001 if (!len)
3002 return 0;
3003
3004 INIT_LIST_HEAD(&rdata_list);
3005 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
3006 open_file = file->private_data;
3007 tcon = tlink_tcon(open_file->tlink);
3008
3009 if (!tcon->ses->server->ops->async_readv)
3010 return -ENOSYS;
3011
3012 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
3013 cifs_dbg(FYI, "attempting read on write only file instance\n");
3014
3015 rc = cifs_send_async_read(offset, len, open_file, cifs_sb, &rdata_list);
3016
2901 /* if at least one read request send succeeded, then reset rc */ 3017 /* if at least one read request send succeeded, then reset rc */
2902 if (!list_empty(&rdata_list)) 3018 if (!list_empty(&rdata_list))
2903 rc = 0; 3019 rc = 0;
2904 3020
2905 len = iov_iter_count(to); 3021 len = iov_iter_count(to);
2906 /* the loop below should proceed in the order of increasing offsets */ 3022 /* the loop below should proceed in the order of increasing offsets */
3023again:
2907 list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { 3024 list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
2908 again:
2909 if (!rc) { 3025 if (!rc) {
2910 /* FIXME: freezable sleep too? */ 3026 /* FIXME: freezable sleep too? */
2911 rc = wait_for_completion_killable(&rdata->done); 3027 rc = wait_for_completion_killable(&rdata->done);
2912 if (rc) 3028 if (rc)
2913 rc = -EINTR; 3029 rc = -EINTR;
2914 else if (rdata->result) { 3030 else if (rdata->result == -EAGAIN) {
2915 rc = rdata->result;
2916 /* resend call if it's a retryable error */ 3031 /* resend call if it's a retryable error */
2917 if (rc == -EAGAIN) { 3032 struct list_head tmp_list;
2918 rc = cifs_retry_async_readv(rdata); 3033 unsigned int got_bytes = rdata->got_bytes;
2919 goto again; 3034
3035 list_del_init(&rdata->list);
3036 INIT_LIST_HEAD(&tmp_list);
3037
3038 /*
3039 * Got a part of data and then reconnect has
3040 * happened -- fill the buffer and continue
3041 * reading.
3042 */
3043 if (got_bytes && got_bytes < rdata->bytes) {
3044 rc = cifs_readdata_to_iov(rdata, to);
3045 if (rc) {
3046 kref_put(&rdata->refcount,
3047 cifs_uncached_readdata_release);
3048 continue;
3049 }
2920 } 3050 }
2921 } else { 3051
3052 rc = cifs_send_async_read(
3053 rdata->offset + got_bytes,
3054 rdata->bytes - got_bytes,
3055 rdata->cfile, cifs_sb,
3056 &tmp_list);
3057
3058 list_splice(&tmp_list, &rdata_list);
3059
3060 kref_put(&rdata->refcount,
3061 cifs_uncached_readdata_release);
3062 goto again;
3063 } else if (rdata->result)
3064 rc = rdata->result;
3065 else
2922 rc = cifs_readdata_to_iov(rdata, to); 3066 rc = cifs_readdata_to_iov(rdata, to);
2923 }
2924 3067
3068 /* if there was a short read -- discard anything left */
3069 if (rdata->got_bytes && rdata->got_bytes < rdata->bytes)
3070 rc = -ENODATA;
2925 } 3071 }
2926 list_del_init(&rdata->list); 3072 list_del_init(&rdata->list);
2927 kref_put(&rdata->refcount, cifs_uncached_readdata_release); 3073 kref_put(&rdata->refcount, cifs_uncached_readdata_release);
@@ -3030,18 +3176,19 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
3030 3176
3031 for (total_read = 0, cur_offset = read_data; read_size > total_read; 3177 for (total_read = 0, cur_offset = read_data; read_size > total_read;
3032 total_read += bytes_read, cur_offset += bytes_read) { 3178 total_read += bytes_read, cur_offset += bytes_read) {
3033 current_read_size = min_t(uint, read_size - total_read, rsize); 3179 do {
3034 /* 3180 current_read_size = min_t(uint, read_size - total_read,
3035 * For windows me and 9x we do not want to request more than it 3181 rsize);
3036 * negotiated since it will refuse the read then. 3182 /*
3037 */ 3183 * For windows me and 9x we do not want to request more
3038 if ((tcon->ses) && !(tcon->ses->capabilities & 3184 * than it negotiated since it will refuse the read
3185 * then.
3186 */
3187 if ((tcon->ses) && !(tcon->ses->capabilities &
3039 tcon->ses->server->vals->cap_large_files)) { 3188 tcon->ses->server->vals->cap_large_files)) {
3040 current_read_size = min_t(uint, current_read_size, 3189 current_read_size = min_t(uint,
3041 CIFSMaxBufSize); 3190 current_read_size, CIFSMaxBufSize);
3042 } 3191 }
3043 rc = -EAGAIN;
3044 while (rc == -EAGAIN) {
3045 if (open_file->invalidHandle) { 3192 if (open_file->invalidHandle) {
3046 rc = cifs_reopen_file(open_file, true); 3193 rc = cifs_reopen_file(open_file, true);
3047 if (rc != 0) 3194 if (rc != 0)
@@ -3054,7 +3201,8 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
3054 rc = server->ops->sync_read(xid, open_file, &io_parms, 3201 rc = server->ops->sync_read(xid, open_file, &io_parms,
3055 &bytes_read, &cur_offset, 3202 &bytes_read, &cur_offset,
3056 &buf_type); 3203 &buf_type);
3057 } 3204 } while (rc == -EAGAIN);
3205
3058 if (rc || (bytes_read == 0)) { 3206 if (rc || (bytes_read == 0)) {
3059 if (total_read) { 3207 if (total_read) {
3060 break; 3208 break;
@@ -3133,25 +3281,30 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
3133static void 3281static void
3134cifs_readv_complete(struct work_struct *work) 3282cifs_readv_complete(struct work_struct *work)
3135{ 3283{
3136 unsigned int i; 3284 unsigned int i, got_bytes;
3137 struct cifs_readdata *rdata = container_of(work, 3285 struct cifs_readdata *rdata = container_of(work,
3138 struct cifs_readdata, work); 3286 struct cifs_readdata, work);
3139 3287
3288 got_bytes = rdata->got_bytes;
3140 for (i = 0; i < rdata->nr_pages; i++) { 3289 for (i = 0; i < rdata->nr_pages; i++) {
3141 struct page *page = rdata->pages[i]; 3290 struct page *page = rdata->pages[i];
3142 3291
3143 lru_cache_add_file(page); 3292 lru_cache_add_file(page);
3144 3293
3145 if (rdata->result == 0) { 3294 if (rdata->result == 0 ||
3295 (rdata->result == -EAGAIN && got_bytes)) {
3146 flush_dcache_page(page); 3296 flush_dcache_page(page);
3147 SetPageUptodate(page); 3297 SetPageUptodate(page);
3148 } 3298 }
3149 3299
3150 unlock_page(page); 3300 unlock_page(page);
3151 3301
3152 if (rdata->result == 0) 3302 if (rdata->result == 0 ||
3303 (rdata->result == -EAGAIN && got_bytes))
3153 cifs_readpage_to_fscache(rdata->mapping->host, page); 3304 cifs_readpage_to_fscache(rdata->mapping->host, page);
3154 3305
3306 got_bytes -= min_t(unsigned int, PAGE_CACHE_SIZE, got_bytes);
3307
3155 page_cache_release(page); 3308 page_cache_release(page);
3156 rdata->pages[i] = NULL; 3309 rdata->pages[i] = NULL;
3157 } 3310 }
@@ -3162,7 +3315,7 @@ static int
3162cifs_readpages_read_into_pages(struct TCP_Server_Info *server, 3315cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
3163 struct cifs_readdata *rdata, unsigned int len) 3316 struct cifs_readdata *rdata, unsigned int len)
3164{ 3317{
3165 int total_read = 0, result = 0; 3318 int result = 0;
3166 unsigned int i; 3319 unsigned int i;
3167 u64 eof; 3320 u64 eof;
3168 pgoff_t eof_index; 3321 pgoff_t eof_index;
@@ -3174,6 +3327,7 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
3174 eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; 3327 eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
3175 cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index); 3328 cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index);
3176 3329
3330 rdata->got_bytes = 0;
3177 rdata->tailsz = PAGE_CACHE_SIZE; 3331 rdata->tailsz = PAGE_CACHE_SIZE;
3178 for (i = 0; i < nr_pages; i++) { 3332 for (i = 0; i < nr_pages; i++) {
3179 struct page *page = rdata->pages[i]; 3333 struct page *page = rdata->pages[i];
@@ -3228,10 +3382,70 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
3228 if (result < 0) 3382 if (result < 0)
3229 break; 3383 break;
3230 3384
3231 total_read += result; 3385 rdata->got_bytes += result;
3232 } 3386 }
3233 3387
3234 return total_read > 0 ? total_read : result; 3388 return rdata->got_bytes > 0 && result != -ECONNABORTED ?
3389 rdata->got_bytes : result;
3390}
3391
3392static int
3393readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
3394 unsigned int rsize, struct list_head *tmplist,
3395 unsigned int *nr_pages, loff_t *offset, unsigned int *bytes)
3396{
3397 struct page *page, *tpage;
3398 unsigned int expected_index;
3399 int rc;
3400
3401 INIT_LIST_HEAD(tmplist);
3402
3403 page = list_entry(page_list->prev, struct page, lru);
3404
3405 /*
3406 * Lock the page and put it in the cache. Since no one else
3407 * should have access to this page, we're safe to simply set
3408 * PG_locked without checking it first.
3409 */
3410 __set_page_locked(page);
3411 rc = add_to_page_cache_locked(page, mapping,
3412 page->index, GFP_KERNEL);
3413
3414 /* give up if we can't stick it in the cache */
3415 if (rc) {
3416 __clear_page_locked(page);
3417 return rc;
3418 }
3419
3420 /* move first page to the tmplist */
3421 *offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
3422 *bytes = PAGE_CACHE_SIZE;
3423 *nr_pages = 1;
3424 list_move_tail(&page->lru, tmplist);
3425
3426 /* now try and add more pages onto the request */
3427 expected_index = page->index + 1;
3428 list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
3429 /* discontinuity ? */
3430 if (page->index != expected_index)
3431 break;
3432
3433 /* would this page push the read over the rsize? */
3434 if (*bytes + PAGE_CACHE_SIZE > rsize)
3435 break;
3436
3437 __set_page_locked(page);
3438 if (add_to_page_cache_locked(page, mapping, page->index,
3439 GFP_KERNEL)) {
3440 __clear_page_locked(page);
3441 break;
3442 }
3443 list_move_tail(&page->lru, tmplist);
3444 (*bytes) += PAGE_CACHE_SIZE;
3445 expected_index++;
3446 (*nr_pages)++;
3447 }
3448 return rc;
3235} 3449}
3236 3450
3237static int cifs_readpages(struct file *file, struct address_space *mapping, 3451static int cifs_readpages(struct file *file, struct address_space *mapping,
@@ -3241,19 +3455,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
3241 struct list_head tmplist; 3455 struct list_head tmplist;
3242 struct cifsFileInfo *open_file = file->private_data; 3456 struct cifsFileInfo *open_file = file->private_data;
3243 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 3457 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
3244 unsigned int rsize = cifs_sb->rsize; 3458 struct TCP_Server_Info *server;
3245 pid_t pid; 3459 pid_t pid;
3246 3460
3247 /* 3461 /*
3248 * Give up immediately if rsize is too small to read an entire page.
3249 * The VFS will fall back to readpage. We should never reach this
3250 * point however since we set ra_pages to 0 when the rsize is smaller
3251 * than a cache page.
3252 */
3253 if (unlikely(rsize < PAGE_CACHE_SIZE))
3254 return 0;
3255
3256 /*
3257 * Reads as many pages as possible from fscache. Returns -ENOBUFS 3462 * Reads as many pages as possible from fscache. Returns -ENOBUFS
3258 * immediately if the cookie is negative 3463 * immediately if the cookie is negative
3259 * 3464 *
@@ -3271,7 +3476,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
3271 pid = current->tgid; 3476 pid = current->tgid;
3272 3477
3273 rc = 0; 3478 rc = 0;
3274 INIT_LIST_HEAD(&tmplist); 3479 server = tlink_tcon(open_file->tlink)->ses->server;
3275 3480
3276 cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n", 3481 cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n",
3277 __func__, file, mapping, num_pages); 3482 __func__, file, mapping, num_pages);
@@ -3288,58 +3493,35 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
3288 * the rdata->pages, then we want them in increasing order. 3493 * the rdata->pages, then we want them in increasing order.
3289 */ 3494 */
3290 while (!list_empty(page_list)) { 3495 while (!list_empty(page_list)) {
3291 unsigned int i; 3496 unsigned int i, nr_pages, bytes, rsize;
3292 unsigned int bytes = PAGE_CACHE_SIZE;
3293 unsigned int expected_index;
3294 unsigned int nr_pages = 1;
3295 loff_t offset; 3497 loff_t offset;
3296 struct page *page, *tpage; 3498 struct page *page, *tpage;
3297 struct cifs_readdata *rdata; 3499 struct cifs_readdata *rdata;
3500 unsigned credits;
3298 3501
3299 page = list_entry(page_list->prev, struct page, lru); 3502 rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
3503 &rsize, &credits);
3504 if (rc)
3505 break;
3300 3506
3301 /* 3507 /*
3302 * Lock the page and put it in the cache. Since no one else 3508 * Give up immediately if rsize is too small to read an entire
3303 * should have access to this page, we're safe to simply set 3509 * page. The VFS will fall back to readpage. We should never
3304 * PG_locked without checking it first. 3510 * reach this point however since we set ra_pages to 0 when the
3511 * rsize is smaller than a cache page.
3305 */ 3512 */
3306 __set_page_locked(page); 3513 if (unlikely(rsize < PAGE_CACHE_SIZE)) {
3307 rc = add_to_page_cache_locked(page, mapping, 3514 add_credits_and_wake_if(server, credits, 0);
3308 page->index, GFP_KERNEL); 3515 return 0;
3516 }
3309 3517
3310 /* give up if we can't stick it in the cache */ 3518 rc = readpages_get_pages(mapping, page_list, rsize, &tmplist,
3519 &nr_pages, &offset, &bytes);
3311 if (rc) { 3520 if (rc) {
3312 __clear_page_locked(page); 3521 add_credits_and_wake_if(server, credits, 0);
3313 break; 3522 break;
3314 } 3523 }
3315 3524
3316 /* move first page to the tmplist */
3317 offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
3318 list_move_tail(&page->lru, &tmplist);
3319
3320 /* now try and add more pages onto the request */
3321 expected_index = page->index + 1;
3322 list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
3323 /* discontinuity ? */
3324 if (page->index != expected_index)
3325 break;
3326
3327 /* would this page push the read over the rsize? */
3328 if (bytes + PAGE_CACHE_SIZE > rsize)
3329 break;
3330
3331 __set_page_locked(page);
3332 if (add_to_page_cache_locked(page, mapping,
3333 page->index, GFP_KERNEL)) {
3334 __clear_page_locked(page);
3335 break;
3336 }
3337 list_move_tail(&page->lru, &tmplist);
3338 bytes += PAGE_CACHE_SIZE;
3339 expected_index++;
3340 nr_pages++;
3341 }
3342
3343 rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete); 3525 rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete);
3344 if (!rdata) { 3526 if (!rdata) {
3345 /* best to give up if we're out of mem */ 3527 /* best to give up if we're out of mem */
@@ -3350,6 +3532,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
3350 page_cache_release(page); 3532 page_cache_release(page);
3351 } 3533 }
3352 rc = -ENOMEM; 3534 rc = -ENOMEM;
3535 add_credits_and_wake_if(server, credits, 0);
3353 break; 3536 break;
3354 } 3537 }
3355 3538
@@ -3360,21 +3543,32 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
3360 rdata->pid = pid; 3543 rdata->pid = pid;
3361 rdata->pagesz = PAGE_CACHE_SIZE; 3544 rdata->pagesz = PAGE_CACHE_SIZE;
3362 rdata->read_into_pages = cifs_readpages_read_into_pages; 3545 rdata->read_into_pages = cifs_readpages_read_into_pages;
3546 rdata->credits = credits;
3363 3547
3364 list_for_each_entry_safe(page, tpage, &tmplist, lru) { 3548 list_for_each_entry_safe(page, tpage, &tmplist, lru) {
3365 list_del(&page->lru); 3549 list_del(&page->lru);
3366 rdata->pages[rdata->nr_pages++] = page; 3550 rdata->pages[rdata->nr_pages++] = page;
3367 } 3551 }
3368 3552
3369 rc = cifs_retry_async_readv(rdata); 3553 if (!rdata->cfile->invalidHandle ||
3370 if (rc != 0) { 3554 !cifs_reopen_file(rdata->cfile, true))
3555 rc = server->ops->async_readv(rdata);
3556 if (rc) {
3557 add_credits_and_wake_if(server, rdata->credits, 0);
3371 for (i = 0; i < rdata->nr_pages; i++) { 3558 for (i = 0; i < rdata->nr_pages; i++) {
3372 page = rdata->pages[i]; 3559 page = rdata->pages[i];
3373 lru_cache_add_file(page); 3560 lru_cache_add_file(page);
3374 unlock_page(page); 3561 unlock_page(page);
3375 page_cache_release(page); 3562 page_cache_release(page);
3563 if (rc == -EAGAIN)
3564 list_add_tail(&page->lru, &tmplist);
3376 } 3565 }
3377 kref_put(&rdata->refcount, cifs_readdata_release); 3566 kref_put(&rdata->refcount, cifs_readdata_release);
3567 if (rc == -EAGAIN) {
3568 /* Re-add pages to the page_list and retry */
3569 list_splice(&tmplist, page_list);
3570 continue;
3571 }
3378 break; 3572 break;
3379 } 3573 }
3380 3574
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 41de3935caa0..426d6c6ad8bf 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1627,8 +1627,9 @@ do_rename_exit:
1627} 1627}
1628 1628
1629int 1629int
1630cifs_rename(struct inode *source_dir, struct dentry *source_dentry, 1630cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
1631 struct inode *target_dir, struct dentry *target_dentry) 1631 struct inode *target_dir, struct dentry *target_dentry,
1632 unsigned int flags)
1632{ 1633{
1633 char *from_name = NULL; 1634 char *from_name = NULL;
1634 char *to_name = NULL; 1635 char *to_name = NULL;
@@ -1640,6 +1641,9 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1640 unsigned int xid; 1641 unsigned int xid;
1641 int rc, tmprc; 1642 int rc, tmprc;
1642 1643
1644 if (flags & ~RENAME_NOREPLACE)
1645 return -EINVAL;
1646
1643 cifs_sb = CIFS_SB(source_dir->i_sb); 1647 cifs_sb = CIFS_SB(source_dir->i_sb);
1644 tlink = cifs_sb_tlink(cifs_sb); 1648 tlink = cifs_sb_tlink(cifs_sb);
1645 if (IS_ERR(tlink)) 1649 if (IS_ERR(tlink))
@@ -1667,6 +1671,12 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1667 rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, 1671 rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
1668 to_name); 1672 to_name);
1669 1673
1674 /*
1675 * No-replace is the natural behavior for CIFS, so skip unlink hacks.
1676 */
1677 if (flags & RENAME_NOREPLACE)
1678 goto cifs_rename_exit;
1679
1670 if (rc == -EEXIST && tcon->unix_ext) { 1680 if (rc == -EEXIST && tcon->unix_ext) {
1671 /* 1681 /*
1672 * Are src and dst hardlinks of same inode? We can only tell 1682 * Are src and dst hardlinks of same inode? We can only tell
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 6bf55d0ed494..81340c6253eb 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -226,6 +226,15 @@ cifs_small_buf_release(void *buf_to_free)
226 return; 226 return;
227} 227}
228 228
229void
230free_rsp_buf(int resp_buftype, void *rsp)
231{
232 if (resp_buftype == CIFS_SMALL_BUFFER)
233 cifs_small_buf_release(rsp);
234 else if (resp_buftype == CIFS_LARGE_BUFFER)
235 cifs_buf_release(rsp);
236}
237
229/* NB: MID can not be set if treeCon not passed in, in that 238/* NB: MID can not be set if treeCon not passed in, in that
230 case it is responsbility of caller to set the mid */ 239 case it is responsbility of caller to set the mid */
231void 240void
@@ -414,7 +423,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
414 return true; 423 return true;
415 } 424 }
416 if (pSMBr->hdr.Status.CifsError) { 425 if (pSMBr->hdr.Status.CifsError) {
417 cifs_dbg(FYI, "notify err 0x%d\n", 426 cifs_dbg(FYI, "notify err 0x%x\n",
418 pSMBr->hdr.Status.CifsError); 427 pSMBr->hdr.Status.CifsError);
419 return true; 428 return true;
420 } 429 }
@@ -441,7 +450,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
441 if (pSMB->hdr.WordCount != 8) 450 if (pSMB->hdr.WordCount != 8)
442 return false; 451 return false;
443 452
444 cifs_dbg(FYI, "oplock type 0x%d level 0x%d\n", 453 cifs_dbg(FYI, "oplock type 0x%x level 0x%x\n",
445 pSMB->LockType, pSMB->OplockLevel); 454 pSMB->LockType, pSMB->OplockLevel);
446 if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)) 455 if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
447 return false; 456 return false;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index e87387dbf39f..39ee32688eac 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -520,382 +520,559 @@ select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
520 } 520 }
521} 521}
522 522
523int 523struct sess_data {
524CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, 524 unsigned int xid;
525 const struct nls_table *nls_cp) 525 struct cifs_ses *ses;
526 struct nls_table *nls_cp;
527 void (*func)(struct sess_data *);
528 int result;
529
530 /* we will send the SMB in three pieces:
531 * a fixed length beginning part, an optional
532 * SPNEGO blob (which can be zero length), and a
533 * last part which will include the strings
534 * and rest of bcc area. This allows us to avoid
535 * a large buffer 17K allocation
536 */
537 int buf0_type;
538 struct kvec iov[3];
539};
540
541static int
542sess_alloc_buffer(struct sess_data *sess_data, int wct)
526{ 543{
527 int rc = 0; 544 int rc;
528 int wct; 545 struct cifs_ses *ses = sess_data->ses;
529 struct smb_hdr *smb_buf; 546 struct smb_hdr *smb_buf;
530 char *bcc_ptr;
531 char *str_area;
532 SESSION_SETUP_ANDX *pSMB;
533 __u32 capabilities;
534 __u16 count;
535 int resp_buf_type;
536 struct kvec iov[3];
537 enum securityEnum type;
538 __u16 action, bytes_remaining;
539 struct key *spnego_key = NULL;
540 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
541 u16 blob_len;
542 char *ntlmsspblob = NULL;
543 547
544 if (ses == NULL) { 548 rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses,
545 WARN(1, "%s: ses == NULL!", __func__); 549 (void **)&smb_buf);
546 return -EINVAL;
547 }
548 550
549 type = select_sectype(ses->server, ses->sectype); 551 if (rc)
550 cifs_dbg(FYI, "sess setup type %d\n", type); 552 return rc;
551 if (type == Unspecified) { 553
552 cifs_dbg(VFS, 554 sess_data->iov[0].iov_base = (char *)smb_buf;
553 "Unable to select appropriate authentication method!"); 555 sess_data->iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4;
554 return -EINVAL; 556 /*
557 * This variable will be used to clear the buffer
558 * allocated above in case of any error in the calling function.
559 */
560 sess_data->buf0_type = CIFS_SMALL_BUFFER;
561
562 /* 2000 big enough to fit max user, domain, NOS name etc. */
563 sess_data->iov[2].iov_base = kmalloc(2000, GFP_KERNEL);
564 if (!sess_data->iov[2].iov_base) {
565 rc = -ENOMEM;
566 goto out_free_smb_buf;
555 } 567 }
556 568
557 if (type == RawNTLMSSP) { 569 return 0;
558 /* if memory allocation is successful, caller of this function 570
559 * frees it. 571out_free_smb_buf:
560 */ 572 kfree(smb_buf);
561 ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); 573 sess_data->iov[0].iov_base = NULL;
562 if (!ses->ntlmssp) 574 sess_data->iov[0].iov_len = 0;
563 return -ENOMEM; 575 sess_data->buf0_type = CIFS_NO_BUFFER;
564 ses->ntlmssp->sesskey_per_smbsess = false; 576 return rc;
577}
578
579static void
580sess_free_buffer(struct sess_data *sess_data)
581{
565 582
583 free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base);
584 sess_data->buf0_type = CIFS_NO_BUFFER;
585 kfree(sess_data->iov[2].iov_base);
586}
587
588static int
589sess_establish_session(struct sess_data *sess_data)
590{
591 struct cifs_ses *ses = sess_data->ses;
592
593 mutex_lock(&ses->server->srv_mutex);
594 if (!ses->server->session_estab) {
595 if (ses->server->sign) {
596 ses->server->session_key.response =
597 kmemdup(ses->auth_key.response,
598 ses->auth_key.len, GFP_KERNEL);
599 if (!ses->server->session_key.response) {
600 mutex_unlock(&ses->server->srv_mutex);
601 return -ENOMEM;
602 }
603 ses->server->session_key.len =
604 ses->auth_key.len;
605 }
606 ses->server->sequence_number = 0x2;
607 ses->server->session_estab = true;
566 } 608 }
609 mutex_unlock(&ses->server->srv_mutex);
567 610
568ssetup_ntlmssp_authenticate: 611 cifs_dbg(FYI, "CIFS session established successfully\n");
569 if (phase == NtLmChallenge) 612 spin_lock(&GlobalMid_Lock);
570 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ 613 ses->status = CifsGood;
614 ses->need_reconnect = false;
615 spin_unlock(&GlobalMid_Lock);
571 616
572 if (type == LANMAN) { 617 return 0;
573#ifndef CONFIG_CIFS_WEAK_PW_HASH 618}
574 /* LANMAN and plaintext are less secure and off by default.
575 So we make this explicitly be turned on in kconfig (in the
576 build) and turned on at runtime (changed from the default)
577 in proc/fs/cifs or via mount parm. Unfortunately this is
578 needed for old Win (e.g. Win95), some obscure NAS and OS/2 */
579 return -EOPNOTSUPP;
580#endif
581 wct = 10; /* lanman 2 style sessionsetup */
582 } else if ((type == NTLM) || (type == NTLMv2)) {
583 /* For NTLMv2 failures eventually may need to retry NTLM */
584 wct = 13; /* old style NTLM sessionsetup */
585 } else /* same size: negotiate or auth, NTLMSSP or extended security */
586 wct = 12;
587 619
588 rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, 620static int
589 (void **)&smb_buf); 621sess_sendreceive(struct sess_data *sess_data)
590 if (rc) 622{
591 return rc; 623 int rc;
624 struct smb_hdr *smb_buf = (struct smb_hdr *) sess_data->iov[0].iov_base;
625 __u16 count;
592 626
593 pSMB = (SESSION_SETUP_ANDX *)smb_buf; 627 count = sess_data->iov[1].iov_len + sess_data->iov[2].iov_len;
628 smb_buf->smb_buf_length =
629 cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count);
630 put_bcc(count, smb_buf);
631
632 rc = SendReceive2(sess_data->xid, sess_data->ses,
633 sess_data->iov, 3 /* num_iovecs */,
634 &sess_data->buf0_type,
635 CIFS_LOG_ERROR);
636
637 return rc;
638}
594 639
640/*
641 * LANMAN and plaintext are less secure and off by default.
642 * So we make this explicitly be turned on in kconfig (in the
643 * build) and turned on at runtime (changed from the default)
644 * in proc/fs/cifs or via mount parm. Unfortunately this is
645 * needed for old Win (e.g. Win95), some obscure NAS and OS/2
646 */
647#ifdef CONFIG_CIFS_WEAK_PW_HASH
648static void
649sess_auth_lanman(struct sess_data *sess_data)
650{
651 int rc = 0;
652 struct smb_hdr *smb_buf;
653 SESSION_SETUP_ANDX *pSMB;
654 char *bcc_ptr;
655 struct cifs_ses *ses = sess_data->ses;
656 char lnm_session_key[CIFS_AUTH_RESP_SIZE];
657 __u32 capabilities;
658 __u16 bytes_remaining;
659
660 /* lanman 2 style sessionsetup */
661 /* wct = 10 */
662 rc = sess_alloc_buffer(sess_data, 10);
663 if (rc)
664 goto out;
665
666 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
667 bcc_ptr = sess_data->iov[2].iov_base;
595 capabilities = cifs_ssetup_hdr(ses, pSMB); 668 capabilities = cifs_ssetup_hdr(ses, pSMB);
596 669
597 /* we will send the SMB in three pieces: 670 pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
598 a fixed length beginning part, an optional
599 SPNEGO blob (which can be zero length), and a
600 last part which will include the strings
601 and rest of bcc area. This allows us to avoid
602 a large buffer 17K allocation */
603 iov[0].iov_base = (char *)pSMB;
604 iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4;
605
606 /* setting this here allows the code at the end of the function
607 to free the request buffer if there's an error */
608 resp_buf_type = CIFS_SMALL_BUFFER;
609 671
610 /* 2000 big enough to fit max user, domain, NOS name etc. */ 672 /* no capabilities flags in old lanman negotiation */
611 str_area = kmalloc(2000, GFP_KERNEL); 673 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
612 if (str_area == NULL) {
613 rc = -ENOMEM;
614 goto ssetup_exit;
615 }
616 bcc_ptr = str_area;
617 674
618 iov[1].iov_base = NULL; 675 /* Calculate hash with password and copy into bcc_ptr.
619 iov[1].iov_len = 0; 676 * Encryption Key (stored as in cryptkey) gets used if the
677 * security mode bit in Negottiate Protocol response states
678 * to use challenge/response method (i.e. Password bit is 1).
679 */
680 rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
681 ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
682 true : false, lnm_session_key);
620 683
621 if (type == LANMAN) { 684 memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
622#ifdef CONFIG_CIFS_WEAK_PW_HASH 685 bcc_ptr += CIFS_AUTH_RESP_SIZE;
623 char lnm_session_key[CIFS_AUTH_RESP_SIZE]; 686
687 /*
688 * can not sign if LANMAN negotiated so no need
689 * to calculate signing key? but what if server
690 * changed to do higher than lanman dialect and
691 * we reconnected would we ever calc signing_key?
692 */
624 693
625 pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; 694 cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n");
695 /* Unicode not allowed for LANMAN dialects */
696 ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
626 697
627 /* no capabilities flags in old lanman negotiation */ 698 sess_data->iov[2].iov_len = (long) bcc_ptr -
699 (long) sess_data->iov[2].iov_base;
628 700
629 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); 701 rc = sess_sendreceive(sess_data);
702 if (rc)
703 goto out;
630 704
631 /* Calculate hash with password and copy into bcc_ptr. 705 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
632 * Encryption Key (stored as in cryptkey) gets used if the 706 smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
633 * security mode bit in Negottiate Protocol response states
634 * to use challenge/response method (i.e. Password bit is 1).
635 */
636 707
637 rc = calc_lanman_hash(ses->password, ses->server->cryptkey, 708 /* lanman response has a word count of 3 */
638 ses->server->sec_mode & SECMODE_PW_ENCRYPT ? 709 if (smb_buf->WordCount != 3) {
639 true : false, lnm_session_key); 710 rc = -EIO;
711 cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
712 goto out;
713 }
640 714
641 memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); 715 if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
642 bcc_ptr += CIFS_AUTH_RESP_SIZE; 716 cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
717
718 ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
719 cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
643 720
644 /* can not sign if LANMAN negotiated so no need 721 bytes_remaining = get_bcc(smb_buf);
645 to calculate signing key? but what if server 722 bcc_ptr = pByteArea(smb_buf);
646 changed to do higher than lanman dialect and
647 we reconnected would we ever calc signing_key? */
648 723
649 cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n"); 724 /* BB check if Unicode and decode strings */
650 /* Unicode not allowed for LANMAN dialects */ 725 if (bytes_remaining == 0) {
651 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); 726 /* no string area to decode, do nothing */
727 } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
728 /* unicode string area must be word-aligned */
729 if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
730 ++bcc_ptr;
731 --bytes_remaining;
732 }
733 decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
734 sess_data->nls_cp);
735 } else {
736 decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
737 sess_data->nls_cp);
738 }
739
740 rc = sess_establish_session(sess_data);
741out:
742 sess_data->result = rc;
743 sess_data->func = NULL;
744 sess_free_buffer(sess_data);
745}
746
747#else
748
749static void
750sess_auth_lanman(struct sess_data *sess_data)
751{
752 sess_data->result = -EOPNOTSUPP;
753 sess_data->func = NULL;
754}
652#endif 755#endif
653 } else if (type == NTLM) { 756
654 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); 757static void
655 pSMB->req_no_secext.CaseInsensitivePasswordLength = 758sess_auth_ntlm(struct sess_data *sess_data)
759{
760 int rc = 0;
761 struct smb_hdr *smb_buf;
762 SESSION_SETUP_ANDX *pSMB;
763 char *bcc_ptr;
764 struct cifs_ses *ses = sess_data->ses;
765 __u32 capabilities;
766 __u16 bytes_remaining;
767
768 /* old style NTLM sessionsetup */
769 /* wct = 13 */
770 rc = sess_alloc_buffer(sess_data, 13);
771 if (rc)
772 goto out;
773
774 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
775 bcc_ptr = sess_data->iov[2].iov_base;
776 capabilities = cifs_ssetup_hdr(ses, pSMB);
777
778 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
779 pSMB->req_no_secext.CaseInsensitivePasswordLength =
656 cpu_to_le16(CIFS_AUTH_RESP_SIZE); 780 cpu_to_le16(CIFS_AUTH_RESP_SIZE);
657 pSMB->req_no_secext.CaseSensitivePasswordLength = 781 pSMB->req_no_secext.CaseSensitivePasswordLength =
658 cpu_to_le16(CIFS_AUTH_RESP_SIZE); 782 cpu_to_le16(CIFS_AUTH_RESP_SIZE);
659 783
660 /* calculate ntlm response and session key */ 784 /* calculate ntlm response and session key */
661 rc = setup_ntlm_response(ses, nls_cp); 785 rc = setup_ntlm_response(ses, sess_data->nls_cp);
662 if (rc) { 786 if (rc) {
663 cifs_dbg(VFS, "Error %d during NTLM authentication\n", 787 cifs_dbg(VFS, "Error %d during NTLM authentication\n",
664 rc); 788 rc);
665 goto ssetup_exit; 789 goto out;
666 } 790 }
667 791
668 /* copy ntlm response */ 792 /* copy ntlm response */
669 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, 793 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
670 CIFS_AUTH_RESP_SIZE); 794 CIFS_AUTH_RESP_SIZE);
671 bcc_ptr += CIFS_AUTH_RESP_SIZE; 795 bcc_ptr += CIFS_AUTH_RESP_SIZE;
672 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, 796 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
673 CIFS_AUTH_RESP_SIZE); 797 CIFS_AUTH_RESP_SIZE);
674 bcc_ptr += CIFS_AUTH_RESP_SIZE; 798 bcc_ptr += CIFS_AUTH_RESP_SIZE;
675 799
676 if (ses->capabilities & CAP_UNICODE) { 800 if (ses->capabilities & CAP_UNICODE) {
677 /* unicode strings must be word aligned */ 801 /* unicode strings must be word aligned */
678 if (iov[0].iov_len % 2) { 802 if (sess_data->iov[0].iov_len % 2) {
679 *bcc_ptr = 0; 803 *bcc_ptr = 0;
680 bcc_ptr++; 804 bcc_ptr++;
681 }
682 unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
683 } else
684 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
685 } else if (type == NTLMv2) {
686 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
687
688 /* LM2 password would be here if we supported it */
689 pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
690
691 /* calculate nlmv2 response and session key */
692 rc = setup_ntlmv2_rsp(ses, nls_cp);
693 if (rc) {
694 cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n",
695 rc);
696 goto ssetup_exit;
697 } 805 }
698 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, 806 unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
699 ses->auth_key.len - CIFS_SESS_KEY_SIZE); 807 } else {
700 bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; 808 ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
701 809 }
702 /* set case sensitive password length after tilen may get
703 * assigned, tilen is 0 otherwise.
704 */
705 pSMB->req_no_secext.CaseSensitivePasswordLength =
706 cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
707 810
708 if (ses->capabilities & CAP_UNICODE) {
709 if (iov[0].iov_len % 2) {
710 *bcc_ptr = 0;
711 bcc_ptr++;
712 }
713 unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
714 } else
715 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
716 } else if (type == Kerberos) {
717#ifdef CONFIG_CIFS_UPCALL
718 struct cifs_spnego_msg *msg;
719 811
720 spnego_key = cifs_get_spnego_key(ses); 812 sess_data->iov[2].iov_len = (long) bcc_ptr -
721 if (IS_ERR(spnego_key)) { 813 (long) sess_data->iov[2].iov_base;
722 rc = PTR_ERR(spnego_key);
723 spnego_key = NULL;
724 goto ssetup_exit;
725 }
726 814
727 msg = spnego_key->payload.data; 815 rc = sess_sendreceive(sess_data);
728 /* check version field to make sure that cifs.upcall is 816 if (rc)
729 sending us a response in an expected form */ 817 goto out;
730 if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
731 cifs_dbg(VFS, "incorrect version of cifs.upcall "
732 "expected %d but got %d)",
733 CIFS_SPNEGO_UPCALL_VERSION, msg->version);
734 rc = -EKEYREJECTED;
735 goto ssetup_exit;
736 }
737 818
738 ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, 819 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
739 GFP_KERNEL); 820 smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
740 if (!ses->auth_key.response) {
741 cifs_dbg(VFS,
742 "Kerberos can't allocate (%u bytes) memory",
743 msg->sesskey_len);
744 rc = -ENOMEM;
745 goto ssetup_exit;
746 }
747 ses->auth_key.len = msg->sesskey_len;
748
749 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
750 capabilities |= CAP_EXTENDED_SECURITY;
751 pSMB->req.Capabilities = cpu_to_le32(capabilities);
752 iov[1].iov_base = msg->data + msg->sesskey_len;
753 iov[1].iov_len = msg->secblob_len;
754 pSMB->req.SecurityBlobLength = cpu_to_le16(iov[1].iov_len);
755
756 if (ses->capabilities & CAP_UNICODE) {
757 /* unicode strings must be word aligned */
758 if ((iov[0].iov_len + iov[1].iov_len) % 2) {
759 *bcc_ptr = 0;
760 bcc_ptr++;
761 }
762 unicode_oslm_strings(&bcc_ptr, nls_cp);
763 unicode_domain_string(&bcc_ptr, ses, nls_cp);
764 } else
765 /* BB: is this right? */
766 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
767#else /* ! CONFIG_CIFS_UPCALL */
768 cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
769 rc = -ENOSYS;
770 goto ssetup_exit;
771#endif /* CONFIG_CIFS_UPCALL */
772 } else if (type == RawNTLMSSP) {
773 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
774 cifs_dbg(VFS, "NTLMSSP requires Unicode support\n");
775 rc = -ENOSYS;
776 goto ssetup_exit;
777 }
778 821
779 cifs_dbg(FYI, "ntlmssp session setup phase %d\n", phase); 822 if (smb_buf->WordCount != 3) {
780 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; 823 rc = -EIO;
781 capabilities |= CAP_EXTENDED_SECURITY; 824 cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
782 pSMB->req.Capabilities |= cpu_to_le32(capabilities); 825 goto out;
783 switch(phase) { 826 }
784 case NtLmNegotiate:
785 build_ntlmssp_negotiate_blob(
786 pSMB->req.SecurityBlob, ses);
787 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
788 iov[1].iov_base = pSMB->req.SecurityBlob;
789 pSMB->req.SecurityBlobLength =
790 cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
791 break;
792 case NtLmAuthenticate:
793 /*
794 * 5 is an empirical value, large enough to hold
795 * authenticate message plus max 10 of av paris,
796 * domain, user, workstation names, flags, etc.
797 */
798 ntlmsspblob = kzalloc(
799 5*sizeof(struct _AUTHENTICATE_MESSAGE),
800 GFP_KERNEL);
801 if (!ntlmsspblob) {
802 rc = -ENOMEM;
803 goto ssetup_exit;
804 }
805 827
806 rc = build_ntlmssp_auth_blob(ntlmsspblob, 828 if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
807 &blob_len, ses, nls_cp); 829 cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
808 if (rc) 830
809 goto ssetup_exit; 831 ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
810 iov[1].iov_len = blob_len; 832 cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
811 iov[1].iov_base = ntlmsspblob; 833
812 pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len); 834 bytes_remaining = get_bcc(smb_buf);
813 /* 835 bcc_ptr = pByteArea(smb_buf);
814 * Make sure that we tell the server that we are using 836
815 * the uid that it just gave us back on the response 837 /* BB check if Unicode and decode strings */
816 * (challenge) 838 if (bytes_remaining == 0) {
817 */ 839 /* no string area to decode, do nothing */
818 smb_buf->Uid = ses->Suid; 840 } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
819 break; 841 /* unicode string area must be word-aligned */
820 default: 842 if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
821 cifs_dbg(VFS, "invalid phase %d\n", phase); 843 ++bcc_ptr;
822 rc = -ENOSYS; 844 --bytes_remaining;
823 goto ssetup_exit;
824 } 845 }
825 /* unicode strings must be word aligned */ 846 decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
826 if ((iov[0].iov_len + iov[1].iov_len) % 2) { 847 sess_data->nls_cp);
848 } else {
849 decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
850 sess_data->nls_cp);
851 }
852
853 rc = sess_establish_session(sess_data);
854out:
855 sess_data->result = rc;
856 sess_data->func = NULL;
857 sess_free_buffer(sess_data);
858 kfree(ses->auth_key.response);
859 ses->auth_key.response = NULL;
860}
861
862static void
863sess_auth_ntlmv2(struct sess_data *sess_data)
864{
865 int rc = 0;
866 struct smb_hdr *smb_buf;
867 SESSION_SETUP_ANDX *pSMB;
868 char *bcc_ptr;
869 struct cifs_ses *ses = sess_data->ses;
870 __u32 capabilities;
871 __u16 bytes_remaining;
872
873 /* old style NTLM sessionsetup */
874 /* wct = 13 */
875 rc = sess_alloc_buffer(sess_data, 13);
876 if (rc)
877 goto out;
878
879 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
880 bcc_ptr = sess_data->iov[2].iov_base;
881 capabilities = cifs_ssetup_hdr(ses, pSMB);
882
883 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
884
885 /* LM2 password would be here if we supported it */
886 pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
887
888 /* calculate nlmv2 response and session key */
889 rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp);
890 if (rc) {
891 cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc);
892 goto out;
893 }
894
895 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
896 ses->auth_key.len - CIFS_SESS_KEY_SIZE);
897 bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
898
899 /* set case sensitive password length after tilen may get
900 * assigned, tilen is 0 otherwise.
901 */
902 pSMB->req_no_secext.CaseSensitivePasswordLength =
903 cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
904
905 if (ses->capabilities & CAP_UNICODE) {
906 if (sess_data->iov[0].iov_len % 2) {
827 *bcc_ptr = 0; 907 *bcc_ptr = 0;
828 bcc_ptr++; 908 bcc_ptr++;
829 } 909 }
830 unicode_oslm_strings(&bcc_ptr, nls_cp); 910 unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
831 } else { 911 } else {
832 cifs_dbg(VFS, "secType %d not supported!\n", type); 912 ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
833 rc = -ENOSYS;
834 goto ssetup_exit;
835 } 913 }
836 914
837 iov[2].iov_base = str_area;
838 iov[2].iov_len = (long) bcc_ptr - (long) str_area;
839 915
840 count = iov[1].iov_len + iov[2].iov_len; 916 sess_data->iov[2].iov_len = (long) bcc_ptr -
841 smb_buf->smb_buf_length = 917 (long) sess_data->iov[2].iov_base;
842 cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count);
843 918
844 put_bcc(count, smb_buf); 919 rc = sess_sendreceive(sess_data);
920 if (rc)
921 goto out;
845 922
846 rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, 923 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
847 CIFS_LOG_ERROR); 924 smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
848 /* SMB request buf freed in SendReceive2 */ 925
926 if (smb_buf->WordCount != 3) {
927 rc = -EIO;
928 cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
929 goto out;
930 }
931
932 if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
933 cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
934
935 ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
936 cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
849 937
850 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; 938 bytes_remaining = get_bcc(smb_buf);
851 smb_buf = (struct smb_hdr *)iov[0].iov_base; 939 bcc_ptr = pByteArea(smb_buf);
852 940
853 if ((type == RawNTLMSSP) && (resp_buf_type != CIFS_NO_BUFFER) && 941 /* BB check if Unicode and decode strings */
854 (smb_buf->Status.CifsError == 942 if (bytes_remaining == 0) {
855 cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) { 943 /* no string area to decode, do nothing */
856 if (phase != NtLmNegotiate) { 944 } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
857 cifs_dbg(VFS, "Unexpected more processing error\n"); 945 /* unicode string area must be word-aligned */
858 goto ssetup_exit; 946 if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
947 ++bcc_ptr;
948 --bytes_remaining;
859 } 949 }
860 /* NTLMSSP Negotiate sent now processing challenge (response) */ 950 decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
861 phase = NtLmChallenge; /* process ntlmssp challenge */ 951 sess_data->nls_cp);
862 rc = 0; /* MORE_PROC rc is not an error here, but expected */ 952 } else {
953 decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
954 sess_data->nls_cp);
863 } 955 }
956
957 rc = sess_establish_session(sess_data);
958out:
959 sess_data->result = rc;
960 sess_data->func = NULL;
961 sess_free_buffer(sess_data);
962 kfree(ses->auth_key.response);
963 ses->auth_key.response = NULL;
964}
965
966#ifdef CONFIG_CIFS_UPCALL
967static void
968sess_auth_kerberos(struct sess_data *sess_data)
969{
970 int rc = 0;
971 struct smb_hdr *smb_buf;
972 SESSION_SETUP_ANDX *pSMB;
973 char *bcc_ptr;
974 struct cifs_ses *ses = sess_data->ses;
975 __u32 capabilities;
976 __u16 bytes_remaining;
977 struct key *spnego_key = NULL;
978 struct cifs_spnego_msg *msg;
979 u16 blob_len;
980
981 /* extended security */
982 /* wct = 12 */
983 rc = sess_alloc_buffer(sess_data, 12);
864 if (rc) 984 if (rc)
865 goto ssetup_exit; 985 goto out;
866 986
867 if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) { 987 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
988 bcc_ptr = sess_data->iov[2].iov_base;
989 capabilities = cifs_ssetup_hdr(ses, pSMB);
990
991 spnego_key = cifs_get_spnego_key(ses);
992 if (IS_ERR(spnego_key)) {
993 rc = PTR_ERR(spnego_key);
994 spnego_key = NULL;
995 goto out;
996 }
997
998 msg = spnego_key->payload.data;
999 /*
1000 * check version field to make sure that cifs.upcall is
1001 * sending us a response in an expected form
1002 */
1003 if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
1004 cifs_dbg(VFS,
1005 "incorrect version of cifs.upcall (expected %d but got %d)",
1006 CIFS_SPNEGO_UPCALL_VERSION, msg->version);
1007 rc = -EKEYREJECTED;
1008 goto out_put_spnego_key;
1009 }
1010
1011 ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
1012 GFP_KERNEL);
1013 if (!ses->auth_key.response) {
1014 cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory",
1015 msg->sesskey_len);
1016 rc = -ENOMEM;
1017 goto out_put_spnego_key;
1018 }
1019 ses->auth_key.len = msg->sesskey_len;
1020
1021 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
1022 capabilities |= CAP_EXTENDED_SECURITY;
1023 pSMB->req.Capabilities = cpu_to_le32(capabilities);
1024 sess_data->iov[1].iov_base = msg->data + msg->sesskey_len;
1025 sess_data->iov[1].iov_len = msg->secblob_len;
1026 pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len);
1027
1028 if (ses->capabilities & CAP_UNICODE) {
1029 /* unicode strings must be word aligned */
1030 if ((sess_data->iov[0].iov_len
1031 + sess_data->iov[1].iov_len) % 2) {
1032 *bcc_ptr = 0;
1033 bcc_ptr++;
1034 }
1035 unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
1036 unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp);
1037 } else {
1038 /* BB: is this right? */
1039 ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
1040 }
1041
1042 sess_data->iov[2].iov_len = (long) bcc_ptr -
1043 (long) sess_data->iov[2].iov_base;
1044
1045 rc = sess_sendreceive(sess_data);
1046 if (rc)
1047 goto out_put_spnego_key;
1048
1049 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
1050 smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
1051
1052 if (smb_buf->WordCount != 4) {
868 rc = -EIO; 1053 rc = -EIO;
869 cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); 1054 cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
870 goto ssetup_exit; 1055 goto out_put_spnego_key;
871 } 1056 }
872 action = le16_to_cpu(pSMB->resp.Action); 1057
873 if (action & GUEST_LOGIN) 1058 if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
874 cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ 1059 cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
1060
875 ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ 1061 ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
876 cifs_dbg(FYI, "UID = %llu\n", ses->Suid); 1062 cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
877 /* response can have either 3 or 4 word count - Samba sends 3 */ 1063
878 /* and lanman response is 3 */
879 bytes_remaining = get_bcc(smb_buf); 1064 bytes_remaining = get_bcc(smb_buf);
880 bcc_ptr = pByteArea(smb_buf); 1065 bcc_ptr = pByteArea(smb_buf);
881 1066
882 if (smb_buf->WordCount == 4) { 1067 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
883 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); 1068 if (blob_len > bytes_remaining) {
884 if (blob_len > bytes_remaining) { 1069 cifs_dbg(VFS, "bad security blob length %d\n",
885 cifs_dbg(VFS, "bad security blob length %d\n", 1070 blob_len);
886 blob_len); 1071 rc = -EINVAL;
887 rc = -EINVAL; 1072 goto out_put_spnego_key;
888 goto ssetup_exit;
889 }
890 if (phase == NtLmChallenge) {
891 rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);
892 /* now goto beginning for ntlmssp authenticate phase */
893 if (rc)
894 goto ssetup_exit;
895 }
896 bcc_ptr += blob_len;
897 bytes_remaining -= blob_len;
898 } 1073 }
1074 bcc_ptr += blob_len;
1075 bytes_remaining -= blob_len;
899 1076
900 /* BB check if Unicode and decode strings */ 1077 /* BB check if Unicode and decode strings */
901 if (bytes_remaining == 0) { 1078 if (bytes_remaining == 0) {
@@ -906,60 +1083,371 @@ ssetup_ntlmssp_authenticate:
906 ++bcc_ptr; 1083 ++bcc_ptr;
907 --bytes_remaining; 1084 --bytes_remaining;
908 } 1085 }
909 decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); 1086 decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
1087 sess_data->nls_cp);
910 } else { 1088 } else {
911 decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); 1089 decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
1090 sess_data->nls_cp);
912 } 1091 }
913 1092
914ssetup_exit: 1093 rc = sess_establish_session(sess_data);
915 if (spnego_key) { 1094out_put_spnego_key:
916 key_invalidate(spnego_key); 1095 key_invalidate(spnego_key);
917 key_put(spnego_key); 1096 key_put(spnego_key);
1097out:
1098 sess_data->result = rc;
1099 sess_data->func = NULL;
1100 sess_free_buffer(sess_data);
1101 kfree(ses->auth_key.response);
1102 ses->auth_key.response = NULL;
1103}
1104
1105#else
1106
1107static void
1108sess_auth_kerberos(struct sess_data *sess_data)
1109{
1110 cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
1111 sess_data->result = -ENOSYS;
1112 sess_data->func = NULL;
1113}
1114#endif /* ! CONFIG_CIFS_UPCALL */
1115
1116/*
1117 * The required kvec buffers have to be allocated before calling this
1118 * function.
1119 */
1120static int
1121_sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
1122{
1123 struct smb_hdr *smb_buf;
1124 SESSION_SETUP_ANDX *pSMB;
1125 struct cifs_ses *ses = sess_data->ses;
1126 __u32 capabilities;
1127 char *bcc_ptr;
1128
1129 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
1130 smb_buf = (struct smb_hdr *)pSMB;
1131
1132 capabilities = cifs_ssetup_hdr(ses, pSMB);
1133 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
1134 cifs_dbg(VFS, "NTLMSSP requires Unicode support\n");
1135 return -ENOSYS;
918 } 1136 }
919 kfree(str_area);
920 kfree(ntlmsspblob);
921 ntlmsspblob = NULL;
922 if (resp_buf_type == CIFS_SMALL_BUFFER) {
923 cifs_dbg(FYI, "ssetup freeing small buf %p\n", iov[0].iov_base);
924 cifs_small_buf_release(iov[0].iov_base);
925 } else if (resp_buf_type == CIFS_LARGE_BUFFER)
926 cifs_buf_release(iov[0].iov_base);
927 1137
928 /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ 1138 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
929 if ((phase == NtLmChallenge) && (rc == 0)) 1139 capabilities |= CAP_EXTENDED_SECURITY;
930 goto ssetup_ntlmssp_authenticate; 1140 pSMB->req.Capabilities |= cpu_to_le32(capabilities);
1141
1142 bcc_ptr = sess_data->iov[2].iov_base;
1143 /* unicode strings must be word aligned */
1144 if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) {
1145 *bcc_ptr = 0;
1146 bcc_ptr++;
1147 }
1148 unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
1149
1150 sess_data->iov[2].iov_len = (long) bcc_ptr -
1151 (long) sess_data->iov[2].iov_base;
1152
1153 return 0;
1154}
1155
1156static void
1157sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data);
1158
1159static void
1160sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data)
1161{
1162 int rc;
1163 struct smb_hdr *smb_buf;
1164 SESSION_SETUP_ANDX *pSMB;
1165 struct cifs_ses *ses = sess_data->ses;
1166 __u16 bytes_remaining;
1167 char *bcc_ptr;
1168 u16 blob_len;
1169
1170 cifs_dbg(FYI, "rawntlmssp session setup negotiate phase\n");
1171
1172 /*
1173 * if memory allocation is successful, caller of this function
1174 * frees it.
1175 */
1176 ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
1177 if (!ses->ntlmssp) {
1178 rc = -ENOMEM;
1179 goto out;
1180 }
1181 ses->ntlmssp->sesskey_per_smbsess = false;
1182
1183 /* wct = 12 */
1184 rc = sess_alloc_buffer(sess_data, 12);
1185 if (rc)
1186 goto out;
1187
1188 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
1189
1190 /* Build security blob before we assemble the request */
1191 build_ntlmssp_negotiate_blob(pSMB->req.SecurityBlob, ses);
1192 sess_data->iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
1193 sess_data->iov[1].iov_base = pSMB->req.SecurityBlob;
1194 pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
1195
1196 rc = _sess_auth_rawntlmssp_assemble_req(sess_data);
1197 if (rc)
1198 goto out;
1199
1200 rc = sess_sendreceive(sess_data);
1201
1202 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
1203 smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
1204
1205 /* If true, rc here is expected and not an error */
1206 if (sess_data->buf0_type != CIFS_NO_BUFFER &&
1207 smb_buf->Status.CifsError ==
1208 cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))
1209 rc = 0;
1210
1211 if (rc)
1212 goto out;
1213
1214 cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n");
1215
1216 if (smb_buf->WordCount != 4) {
1217 rc = -EIO;
1218 cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
1219 goto out;
1220 }
1221
1222 ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
1223 cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
1224
1225 bytes_remaining = get_bcc(smb_buf);
1226 bcc_ptr = pByteArea(smb_buf);
1227
1228 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
1229 if (blob_len > bytes_remaining) {
1230 cifs_dbg(VFS, "bad security blob length %d\n",
1231 blob_len);
1232 rc = -EINVAL;
1233 goto out;
1234 }
1235
1236 rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);
1237out:
1238 sess_free_buffer(sess_data);
931 1239
932 if (!rc) { 1240 if (!rc) {
933 mutex_lock(&ses->server->srv_mutex); 1241 sess_data->func = sess_auth_rawntlmssp_authenticate;
934 if (!ses->server->session_estab) { 1242 return;
935 if (ses->server->sign) { 1243 }
936 ses->server->session_key.response = 1244
937 kmemdup(ses->auth_key.response, 1245 /* Else error. Cleanup */
938 ses->auth_key.len, GFP_KERNEL); 1246 kfree(ses->auth_key.response);
939 if (!ses->server->session_key.response) { 1247 ses->auth_key.response = NULL;
940 rc = -ENOMEM; 1248 kfree(ses->ntlmssp);
941 mutex_unlock(&ses->server->srv_mutex); 1249 ses->ntlmssp = NULL;
942 goto keycp_exit; 1250
943 } 1251 sess_data->func = NULL;
944 ses->server->session_key.len = 1252 sess_data->result = rc;
945 ses->auth_key.len; 1253}
946 }
947 ses->server->sequence_number = 0x2;
948 ses->server->session_estab = true;
949 }
950 mutex_unlock(&ses->server->srv_mutex);
951 1254
952 cifs_dbg(FYI, "CIFS session established successfully\n"); 1255static void
953 spin_lock(&GlobalMid_Lock); 1256sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
954 ses->status = CifsGood; 1257{
955 ses->need_reconnect = false; 1258 int rc;
956 spin_unlock(&GlobalMid_Lock); 1259 struct smb_hdr *smb_buf;
1260 SESSION_SETUP_ANDX *pSMB;
1261 struct cifs_ses *ses = sess_data->ses;
1262 __u16 bytes_remaining;
1263 char *bcc_ptr;
1264 char *ntlmsspblob = NULL;
1265 u16 blob_len;
1266
1267 cifs_dbg(FYI, "rawntlmssp session setup authenticate phase\n");
1268
1269 /* wct = 12 */
1270 rc = sess_alloc_buffer(sess_data, 12);
1271 if (rc)
1272 goto out;
1273
1274 /* Build security blob before we assemble the request */
1275 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
1276 smb_buf = (struct smb_hdr *)pSMB;
1277 /*
1278 * 5 is an empirical value, large enough to hold
1279 * authenticate message plus max 10 of av paris,
1280 * domain, user, workstation names, flags, etc.
1281 */
1282 ntlmsspblob = kzalloc(5*sizeof(struct _AUTHENTICATE_MESSAGE),
1283 GFP_KERNEL);
1284 if (!ntlmsspblob) {
1285 rc = -ENOMEM;
1286 goto out;
957 } 1287 }
958 1288
959keycp_exit: 1289 rc = build_ntlmssp_auth_blob(ntlmsspblob,
1290 &blob_len, ses, sess_data->nls_cp);
1291 if (rc)
1292 goto out_free_ntlmsspblob;
1293 sess_data->iov[1].iov_len = blob_len;
1294 sess_data->iov[1].iov_base = ntlmsspblob;
1295 pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
1296 /*
1297 * Make sure that we tell the server that we are using
1298 * the uid that it just gave us back on the response
1299 * (challenge)
1300 */
1301 smb_buf->Uid = ses->Suid;
1302
1303 rc = _sess_auth_rawntlmssp_assemble_req(sess_data);
1304 if (rc)
1305 goto out_free_ntlmsspblob;
1306
1307 rc = sess_sendreceive(sess_data);
1308 if (rc)
1309 goto out_free_ntlmsspblob;
1310
1311 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
1312 smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
1313 if (smb_buf->WordCount != 4) {
1314 rc = -EIO;
1315 cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
1316 goto out_free_ntlmsspblob;
1317 }
1318
1319 if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
1320 cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
1321
1322 bytes_remaining = get_bcc(smb_buf);
1323 bcc_ptr = pByteArea(smb_buf);
1324 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
1325 if (blob_len > bytes_remaining) {
1326 cifs_dbg(VFS, "bad security blob length %d\n",
1327 blob_len);
1328 rc = -EINVAL;
1329 goto out_free_ntlmsspblob;
1330 }
1331 bcc_ptr += blob_len;
1332 bytes_remaining -= blob_len;
1333
1334
1335 /* BB check if Unicode and decode strings */
1336 if (bytes_remaining == 0) {
1337 /* no string area to decode, do nothing */
1338 } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
1339 /* unicode string area must be word-aligned */
1340 if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
1341 ++bcc_ptr;
1342 --bytes_remaining;
1343 }
1344 decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
1345 sess_data->nls_cp);
1346 } else {
1347 decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
1348 sess_data->nls_cp);
1349 }
1350
1351out_free_ntlmsspblob:
1352 kfree(ntlmsspblob);
1353out:
1354 sess_free_buffer(sess_data);
1355
1356 if (!rc)
1357 rc = sess_establish_session(sess_data);
1358
1359 /* Cleanup */
960 kfree(ses->auth_key.response); 1360 kfree(ses->auth_key.response);
961 ses->auth_key.response = NULL; 1361 ses->auth_key.response = NULL;
962 kfree(ses->ntlmssp); 1362 kfree(ses->ntlmssp);
1363 ses->ntlmssp = NULL;
1364
1365 sess_data->func = NULL;
1366 sess_data->result = rc;
1367}
1368
1369static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data)
1370{
1371 int type;
1372
1373 type = select_sectype(ses->server, ses->sectype);
1374 cifs_dbg(FYI, "sess setup type %d\n", type);
1375 if (type == Unspecified) {
1376 cifs_dbg(VFS,
1377 "Unable to select appropriate authentication method!");
1378 return -EINVAL;
1379 }
1380
1381 switch (type) {
1382 case LANMAN:
1383 /* LANMAN and plaintext are less secure and off by default.
1384 * So we make this explicitly be turned on in kconfig (in the
1385 * build) and turned on at runtime (changed from the default)
1386 * in proc/fs/cifs or via mount parm. Unfortunately this is
1387 * needed for old Win (e.g. Win95), some obscure NAS and OS/2 */
1388#ifdef CONFIG_CIFS_WEAK_PW_HASH
1389 sess_data->func = sess_auth_lanman;
1390 break;
1391#else
1392 return -EOPNOTSUPP;
1393#endif
1394 case NTLM:
1395 sess_data->func = sess_auth_ntlm;
1396 break;
1397 case NTLMv2:
1398 sess_data->func = sess_auth_ntlmv2;
1399 break;
1400 case Kerberos:
1401#ifdef CONFIG_CIFS_UPCALL
1402 sess_data->func = sess_auth_kerberos;
1403 break;
1404#else
1405 cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
1406 return -ENOSYS;
1407 break;
1408#endif /* CONFIG_CIFS_UPCALL */
1409 case RawNTLMSSP:
1410 sess_data->func = sess_auth_rawntlmssp_negotiate;
1411 break;
1412 default:
1413 cifs_dbg(VFS, "secType %d not supported!\n", type);
1414 return -ENOSYS;
1415 }
1416
1417 return 0;
1418}
1419
1420int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
1421 const struct nls_table *nls_cp)
1422{
1423 int rc = 0;
1424 struct sess_data *sess_data;
1425
1426 if (ses == NULL) {
1427 WARN(1, "%s: ses == NULL!", __func__);
1428 return -EINVAL;
1429 }
1430
1431 sess_data = kzalloc(sizeof(struct sess_data), GFP_KERNEL);
1432 if (!sess_data)
1433 return -ENOMEM;
1434
1435 rc = select_sec(ses, sess_data);
1436 if (rc)
1437 goto out;
1438
1439 sess_data->xid = xid;
1440 sess_data->ses = ses;
1441 sess_data->buf0_type = CIFS_NO_BUFFER;
1442 sess_data->nls_cp = (struct nls_table *) nls_cp;
1443
1444 while (sess_data->func)
1445 sess_data->func(sess_data);
1446
1447 /* Store result before we free sess_data */
1448 rc = sess_data->result;
963 1449
1450out:
1451 kfree(sess_data);
964 return rc; 1452 return rc;
965} 1453}
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index d1fdfa848703..5e8c22d6c7b9 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -1009,6 +1009,12 @@ cifs_is_read_op(__u32 oplock)
1009 return oplock == OPLOCK_READ; 1009 return oplock == OPLOCK_READ;
1010} 1010}
1011 1011
1012static unsigned int
1013cifs_wp_retry_size(struct inode *inode)
1014{
1015 return CIFS_SB(inode->i_sb)->wsize;
1016}
1017
1012struct smb_version_operations smb1_operations = { 1018struct smb_version_operations smb1_operations = {
1013 .send_cancel = send_nt_cancel, 1019 .send_cancel = send_nt_cancel,
1014 .compare_fids = cifs_compare_fids, 1020 .compare_fids = cifs_compare_fids,
@@ -1019,6 +1025,7 @@ struct smb_version_operations smb1_operations = {
1019 .set_credits = cifs_set_credits, 1025 .set_credits = cifs_set_credits,
1020 .get_credits_field = cifs_get_credits_field, 1026 .get_credits_field = cifs_get_credits_field,
1021 .get_credits = cifs_get_credits, 1027 .get_credits = cifs_get_credits,
1028 .wait_mtu_credits = cifs_wait_mtu_credits,
1022 .get_next_mid = cifs_get_next_mid, 1029 .get_next_mid = cifs_get_next_mid,
1023 .read_data_offset = cifs_read_data_offset, 1030 .read_data_offset = cifs_read_data_offset,
1024 .read_data_length = cifs_read_data_length, 1031 .read_data_length = cifs_read_data_length,
@@ -1078,6 +1085,7 @@ struct smb_version_operations smb1_operations = {
1078 .query_mf_symlink = cifs_query_mf_symlink, 1085 .query_mf_symlink = cifs_query_mf_symlink,
1079 .create_mf_symlink = cifs_create_mf_symlink, 1086 .create_mf_symlink = cifs_create_mf_symlink,
1080 .is_read_op = cifs_is_read_op, 1087 .is_read_op = cifs_is_read_op,
1088 .wp_retry_size = cifs_wp_retry_size,
1081#ifdef CONFIG_CIFS_XATTR 1089#ifdef CONFIG_CIFS_XATTR
1082 .query_all_EAs = CIFSSMBQAllEAs, 1090 .query_all_EAs = CIFSSMBQAllEAs,
1083 .set_EA = CIFSSMBSetEA, 1091 .set_EA = CIFSSMBSetEA,
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 84c012a6aba0..0150182a4494 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -91,7 +91,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
91 case SMB2_OP_SET_EOF: 91 case SMB2_OP_SET_EOF:
92 tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid, 92 tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid,
93 fid.volatile_fid, current->tgid, 93 fid.volatile_fid, current->tgid,
94 (__le64 *)data); 94 (__le64 *)data, false);
95 break; 95 break;
96 case SMB2_OP_SET_INFO: 96 case SMB2_OP_SET_INFO:
97 tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid, 97 tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid,
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 94bd4fbb13d3..e31a9dfdcd39 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -605,7 +605,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
605 {STATUS_MAPPED_FILE_SIZE_ZERO, -EIO, "STATUS_MAPPED_FILE_SIZE_ZERO"}, 605 {STATUS_MAPPED_FILE_SIZE_ZERO, -EIO, "STATUS_MAPPED_FILE_SIZE_ZERO"},
606 {STATUS_TOO_MANY_OPENED_FILES, -EMFILE, "STATUS_TOO_MANY_OPENED_FILES"}, 606 {STATUS_TOO_MANY_OPENED_FILES, -EMFILE, "STATUS_TOO_MANY_OPENED_FILES"},
607 {STATUS_CANCELLED, -EIO, "STATUS_CANCELLED"}, 607 {STATUS_CANCELLED, -EIO, "STATUS_CANCELLED"},
608 {STATUS_CANNOT_DELETE, -EIO, "STATUS_CANNOT_DELETE"}, 608 {STATUS_CANNOT_DELETE, -EACCES, "STATUS_CANNOT_DELETE"},
609 {STATUS_INVALID_COMPUTER_NAME, -EIO, "STATUS_INVALID_COMPUTER_NAME"}, 609 {STATUS_INVALID_COMPUTER_NAME, -EIO, "STATUS_INVALID_COMPUTER_NAME"},
610 {STATUS_FILE_DELETED, -EIO, "STATUS_FILE_DELETED"}, 610 {STATUS_FILE_DELETED, -EIO, "STATUS_FILE_DELETED"},
611 {STATUS_SPECIAL_ACCOUNT, -EIO, "STATUS_SPECIAL_ACCOUNT"}, 611 {STATUS_SPECIAL_ACCOUNT, -EIO, "STATUS_SPECIAL_ACCOUNT"},
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index b8021fde987d..f2e6ac29a8d6 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -437,7 +437,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
437 continue; 437 continue;
438 438
439 cifs_dbg(FYI, "found in the open list\n"); 439 cifs_dbg(FYI, "found in the open list\n");
440 cifs_dbg(FYI, "lease key match, lease break 0x%d\n", 440 cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
441 le32_to_cpu(rsp->NewLeaseState)); 441 le32_to_cpu(rsp->NewLeaseState));
442 442
443 server->ops->set_oplock_level(cinode, lease_state, 0, NULL); 443 server->ops->set_oplock_level(cinode, lease_state, 0, NULL);
@@ -467,7 +467,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
467 } 467 }
468 468
469 cifs_dbg(FYI, "found in the pending open list\n"); 469 cifs_dbg(FYI, "found in the pending open list\n");
470 cifs_dbg(FYI, "lease key match, lease break 0x%d\n", 470 cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
471 le32_to_cpu(rsp->NewLeaseState)); 471 le32_to_cpu(rsp->NewLeaseState));
472 472
473 open->oplock = lease_state; 473 open->oplock = lease_state;
@@ -546,7 +546,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
546 return false; 546 return false;
547 } 547 }
548 548
549 cifs_dbg(FYI, "oplock level 0x%d\n", rsp->OplockLevel); 549 cifs_dbg(FYI, "oplock level 0x%x\n", rsp->OplockLevel);
550 550
551 /* look up tcon based on tid & uid */ 551 /* look up tcon based on tid & uid */
552 spin_lock(&cifs_tcp_ses_lock); 552 spin_lock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 787844bde384..77f8aeb9c2fc 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/pagemap.h> 20#include <linux/pagemap.h>
21#include <linux/vfs.h> 21#include <linux/vfs.h>
22#include <linux/falloc.h>
22#include "cifsglob.h" 23#include "cifsglob.h"
23#include "smb2pdu.h" 24#include "smb2pdu.h"
24#include "smb2proto.h" 25#include "smb2proto.h"
@@ -112,6 +113,53 @@ smb2_get_credits(struct mid_q_entry *mid)
112 return le16_to_cpu(((struct smb2_hdr *)mid->resp_buf)->CreditRequest); 113 return le16_to_cpu(((struct smb2_hdr *)mid->resp_buf)->CreditRequest);
113} 114}
114 115
116static int
117smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
118 unsigned int *num, unsigned int *credits)
119{
120 int rc = 0;
121 unsigned int scredits;
122
123 spin_lock(&server->req_lock);
124 while (1) {
125 if (server->credits <= 0) {
126 spin_unlock(&server->req_lock);
127 cifs_num_waiters_inc(server);
128 rc = wait_event_killable(server->request_q,
129 has_credits(server, &server->credits));
130 cifs_num_waiters_dec(server);
131 if (rc)
132 return rc;
133 spin_lock(&server->req_lock);
134 } else {
135 if (server->tcpStatus == CifsExiting) {
136 spin_unlock(&server->req_lock);
137 return -ENOENT;
138 }
139
140 scredits = server->credits;
141 /* can deadlock with reopen */
142 if (scredits == 1) {
143 *num = SMB2_MAX_BUFFER_SIZE;
144 *credits = 0;
145 break;
146 }
147
148 /* leave one credit for a possible reopen */
149 scredits--;
150 *num = min_t(unsigned int, size,
151 scredits * SMB2_MAX_BUFFER_SIZE);
152
153 *credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
154 server->credits -= *credits;
155 server->in_flight++;
156 break;
157 }
158 }
159 spin_unlock(&server->req_lock);
160 return rc;
161}
162
115static __u64 163static __u64
116smb2_get_next_mid(struct TCP_Server_Info *server) 164smb2_get_next_mid(struct TCP_Server_Info *server)
117{ 165{
@@ -182,8 +230,9 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
182 /* start with specified wsize, or default */ 230 /* start with specified wsize, or default */
183 wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; 231 wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
184 wsize = min_t(unsigned int, wsize, server->max_write); 232 wsize = min_t(unsigned int, wsize, server->max_write);
185 /* set it to the maximum buffer size value we can send with 1 credit */ 233
186 wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); 234 if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
235 wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
187 236
188 return wsize; 237 return wsize;
189} 238}
@@ -197,8 +246,9 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
197 /* start with specified rsize, or default */ 246 /* start with specified rsize, or default */
198 rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; 247 rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
199 rsize = min_t(unsigned int, rsize, server->max_read); 248 rsize = min_t(unsigned int, rsize, server->max_read);
200 /* set it to the maximum buffer size value we can send with 1 credit */ 249
201 rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); 250 if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
251 rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
202 252
203 return rsize; 253 return rsize;
204} 254}
@@ -687,7 +737,7 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
687{ 737{
688 __le64 eof = cpu_to_le64(size); 738 __le64 eof = cpu_to_le64(size);
689 return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, 739 return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
690 cfile->fid.volatile_fid, cfile->pid, &eof); 740 cfile->fid.volatile_fid, cfile->pid, &eof, false);
691} 741}
692 742
693static int 743static int
@@ -1104,6 +1154,13 @@ smb3_parse_lease_buf(void *buf, unsigned int *epoch)
1104 return le32_to_cpu(lc->lcontext.LeaseState); 1154 return le32_to_cpu(lc->lcontext.LeaseState);
1105} 1155}
1106 1156
1157static unsigned int
1158smb2_wp_retry_size(struct inode *inode)
1159{
1160 return min_t(unsigned int, CIFS_SB(inode->i_sb)->wsize,
1161 SMB2_MAX_BUFFER_SIZE);
1162}
1163
1107struct smb_version_operations smb20_operations = { 1164struct smb_version_operations smb20_operations = {
1108 .compare_fids = smb2_compare_fids, 1165 .compare_fids = smb2_compare_fids,
1109 .setup_request = smb2_setup_request, 1166 .setup_request = smb2_setup_request,
@@ -1113,6 +1170,7 @@ struct smb_version_operations smb20_operations = {
1113 .set_credits = smb2_set_credits, 1170 .set_credits = smb2_set_credits,
1114 .get_credits_field = smb2_get_credits_field, 1171 .get_credits_field = smb2_get_credits_field,
1115 .get_credits = smb2_get_credits, 1172 .get_credits = smb2_get_credits,
1173 .wait_mtu_credits = cifs_wait_mtu_credits,
1116 .get_next_mid = smb2_get_next_mid, 1174 .get_next_mid = smb2_get_next_mid,
1117 .read_data_offset = smb2_read_data_offset, 1175 .read_data_offset = smb2_read_data_offset,
1118 .read_data_length = smb2_read_data_length, 1176 .read_data_length = smb2_read_data_length,
@@ -1177,6 +1235,7 @@ struct smb_version_operations smb20_operations = {
1177 .create_lease_buf = smb2_create_lease_buf, 1235 .create_lease_buf = smb2_create_lease_buf,
1178 .parse_lease_buf = smb2_parse_lease_buf, 1236 .parse_lease_buf = smb2_parse_lease_buf,
1179 .clone_range = smb2_clone_range, 1237 .clone_range = smb2_clone_range,
1238 .wp_retry_size = smb2_wp_retry_size,
1180}; 1239};
1181 1240
1182struct smb_version_operations smb21_operations = { 1241struct smb_version_operations smb21_operations = {
@@ -1188,6 +1247,7 @@ struct smb_version_operations smb21_operations = {
1188 .set_credits = smb2_set_credits, 1247 .set_credits = smb2_set_credits,
1189 .get_credits_field = smb2_get_credits_field, 1248 .get_credits_field = smb2_get_credits_field,
1190 .get_credits = smb2_get_credits, 1249 .get_credits = smb2_get_credits,
1250 .wait_mtu_credits = smb2_wait_mtu_credits,
1191 .get_next_mid = smb2_get_next_mid, 1251 .get_next_mid = smb2_get_next_mid,
1192 .read_data_offset = smb2_read_data_offset, 1252 .read_data_offset = smb2_read_data_offset,
1193 .read_data_length = smb2_read_data_length, 1253 .read_data_length = smb2_read_data_length,
@@ -1252,6 +1312,7 @@ struct smb_version_operations smb21_operations = {
1252 .create_lease_buf = smb2_create_lease_buf, 1312 .create_lease_buf = smb2_create_lease_buf,
1253 .parse_lease_buf = smb2_parse_lease_buf, 1313 .parse_lease_buf = smb2_parse_lease_buf,
1254 .clone_range = smb2_clone_range, 1314 .clone_range = smb2_clone_range,
1315 .wp_retry_size = smb2_wp_retry_size,
1255}; 1316};
1256 1317
1257struct smb_version_operations smb30_operations = { 1318struct smb_version_operations smb30_operations = {
@@ -1263,6 +1324,7 @@ struct smb_version_operations smb30_operations = {
1263 .set_credits = smb2_set_credits, 1324 .set_credits = smb2_set_credits,
1264 .get_credits_field = smb2_get_credits_field, 1325 .get_credits_field = smb2_get_credits_field,
1265 .get_credits = smb2_get_credits, 1326 .get_credits = smb2_get_credits,
1327 .wait_mtu_credits = smb2_wait_mtu_credits,
1266 .get_next_mid = smb2_get_next_mid, 1328 .get_next_mid = smb2_get_next_mid,
1267 .read_data_offset = smb2_read_data_offset, 1329 .read_data_offset = smb2_read_data_offset,
1268 .read_data_length = smb2_read_data_length, 1330 .read_data_length = smb2_read_data_length,
@@ -1330,6 +1392,7 @@ struct smb_version_operations smb30_operations = {
1330 .parse_lease_buf = smb3_parse_lease_buf, 1392 .parse_lease_buf = smb3_parse_lease_buf,
1331 .clone_range = smb2_clone_range, 1393 .clone_range = smb2_clone_range,
1332 .validate_negotiate = smb3_validate_negotiate, 1394 .validate_negotiate = smb3_validate_negotiate,
1395 .wp_retry_size = smb2_wp_retry_size,
1333}; 1396};
1334 1397
1335struct smb_version_values smb20_values = { 1398struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index b0b260dbb19d..42ebc1a8be6c 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -108,7 +108,6 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
108 if (!tcon) 108 if (!tcon)
109 goto out; 109 goto out;
110 110
111 /* BB FIXME when we do write > 64K add +1 for every 64K in req or rsp */
112 /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */ 111 /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */
113 /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */ 112 /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */
114 if ((tcon->ses) && 113 if ((tcon->ses) &&
@@ -245,10 +244,6 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
245 if (rc) 244 if (rc)
246 goto out; 245 goto out;
247 atomic_inc(&tconInfoReconnectCount); 246 atomic_inc(&tconInfoReconnectCount);
248 /*
249 * BB FIXME add code to check if wsize needs update due to negotiated
250 * smb buffer size shrinking.
251 */
252out: 247out:
253 /* 248 /*
254 * Check if handle based operation so we know whether we can continue 249 * Check if handle based operation so we know whether we can continue
@@ -309,16 +304,6 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon,
309 return rc; 304 return rc;
310} 305}
311 306
312static void
313free_rsp_buf(int resp_buftype, void *rsp)
314{
315 if (resp_buftype == CIFS_SMALL_BUFFER)
316 cifs_small_buf_release(rsp);
317 else if (resp_buftype == CIFS_LARGE_BUFFER)
318 cifs_buf_release(rsp);
319}
320
321
322/* 307/*
323 * 308 *
324 * SMB2 Worker functions follow: 309 * SMB2 Worker functions follow:
@@ -1738,12 +1723,18 @@ smb2_readv_callback(struct mid_q_entry *mid)
1738 rc); 1723 rc);
1739 } 1724 }
1740 /* FIXME: should this be counted toward the initiating task? */ 1725 /* FIXME: should this be counted toward the initiating task? */
1741 task_io_account_read(rdata->bytes); 1726 task_io_account_read(rdata->got_bytes);
1742 cifs_stats_bytes_read(tcon, rdata->bytes); 1727 cifs_stats_bytes_read(tcon, rdata->got_bytes);
1743 break; 1728 break;
1744 case MID_REQUEST_SUBMITTED: 1729 case MID_REQUEST_SUBMITTED:
1745 case MID_RETRY_NEEDED: 1730 case MID_RETRY_NEEDED:
1746 rdata->result = -EAGAIN; 1731 rdata->result = -EAGAIN;
1732 if (server->sign && rdata->got_bytes)
1733 /* reset bytes number since we can not check a sign */
1734 rdata->got_bytes = 0;
1735 /* FIXME: should this be counted toward the initiating task? */
1736 task_io_account_read(rdata->got_bytes);
1737 cifs_stats_bytes_read(tcon, rdata->got_bytes);
1747 break; 1738 break;
1748 default: 1739 default:
1749 if (rdata->result != -ENODATA) 1740 if (rdata->result != -ENODATA)
@@ -1762,11 +1753,12 @@ smb2_readv_callback(struct mid_q_entry *mid)
1762int 1753int
1763smb2_async_readv(struct cifs_readdata *rdata) 1754smb2_async_readv(struct cifs_readdata *rdata)
1764{ 1755{
1765 int rc; 1756 int rc, flags = 0;
1766 struct smb2_hdr *buf; 1757 struct smb2_hdr *buf;
1767 struct cifs_io_parms io_parms; 1758 struct cifs_io_parms io_parms;
1768 struct smb_rqst rqst = { .rq_iov = &rdata->iov, 1759 struct smb_rqst rqst = { .rq_iov = &rdata->iov,
1769 .rq_nvec = 1 }; 1760 .rq_nvec = 1 };
1761 struct TCP_Server_Info *server;
1770 1762
1771 cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", 1763 cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
1772 __func__, rdata->offset, rdata->bytes); 1764 __func__, rdata->offset, rdata->bytes);
@@ -1777,18 +1769,41 @@ smb2_async_readv(struct cifs_readdata *rdata)
1777 io_parms.persistent_fid = rdata->cfile->fid.persistent_fid; 1769 io_parms.persistent_fid = rdata->cfile->fid.persistent_fid;
1778 io_parms.volatile_fid = rdata->cfile->fid.volatile_fid; 1770 io_parms.volatile_fid = rdata->cfile->fid.volatile_fid;
1779 io_parms.pid = rdata->pid; 1771 io_parms.pid = rdata->pid;
1772
1773 server = io_parms.tcon->ses->server;
1774
1780 rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0); 1775 rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0);
1781 if (rc) 1776 if (rc) {
1777 if (rc == -EAGAIN && rdata->credits) {
1778 /* credits was reset by reconnect */
1779 rdata->credits = 0;
1780 /* reduce in_flight value since we won't send the req */
1781 spin_lock(&server->req_lock);
1782 server->in_flight--;
1783 spin_unlock(&server->req_lock);
1784 }
1782 return rc; 1785 return rc;
1786 }
1783 1787
1784 buf = (struct smb2_hdr *)rdata->iov.iov_base; 1788 buf = (struct smb2_hdr *)rdata->iov.iov_base;
1785 /* 4 for rfc1002 length field */ 1789 /* 4 for rfc1002 length field */
1786 rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4; 1790 rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4;
1787 1791
1792 if (rdata->credits) {
1793 buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
1794 SMB2_MAX_BUFFER_SIZE));
1795 spin_lock(&server->req_lock);
1796 server->credits += rdata->credits -
1797 le16_to_cpu(buf->CreditCharge);
1798 spin_unlock(&server->req_lock);
1799 wake_up(&server->request_q);
1800 flags = CIFS_HAS_CREDITS;
1801 }
1802
1788 kref_get(&rdata->refcount); 1803 kref_get(&rdata->refcount);
1789 rc = cifs_call_async(io_parms.tcon->ses->server, &rqst, 1804 rc = cifs_call_async(io_parms.tcon->ses->server, &rqst,
1790 cifs_readv_receive, smb2_readv_callback, 1805 cifs_readv_receive, smb2_readv_callback,
1791 rdata, 0); 1806 rdata, flags);
1792 if (rc) { 1807 if (rc) {
1793 kref_put(&rdata->refcount, cifs_readdata_release); 1808 kref_put(&rdata->refcount, cifs_readdata_release);
1794 cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); 1809 cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
@@ -1906,15 +1921,25 @@ int
1906smb2_async_writev(struct cifs_writedata *wdata, 1921smb2_async_writev(struct cifs_writedata *wdata,
1907 void (*release)(struct kref *kref)) 1922 void (*release)(struct kref *kref))
1908{ 1923{
1909 int rc = -EACCES; 1924 int rc = -EACCES, flags = 0;
1910 struct smb2_write_req *req = NULL; 1925 struct smb2_write_req *req = NULL;
1911 struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); 1926 struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
1927 struct TCP_Server_Info *server = tcon->ses->server;
1912 struct kvec iov; 1928 struct kvec iov;
1913 struct smb_rqst rqst; 1929 struct smb_rqst rqst;
1914 1930
1915 rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req); 1931 rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req);
1916 if (rc) 1932 if (rc) {
1933 if (rc == -EAGAIN && wdata->credits) {
1934 /* credits was reset by reconnect */
1935 wdata->credits = 0;
1936 /* reduce in_flight value since we won't send the req */
1937 spin_lock(&server->req_lock);
1938 server->in_flight--;
1939 spin_unlock(&server->req_lock);
1940 }
1917 goto async_writev_out; 1941 goto async_writev_out;
1942 }
1918 1943
1919 req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid); 1944 req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid);
1920 1945
@@ -1947,9 +1972,20 @@ smb2_async_writev(struct cifs_writedata *wdata,
1947 1972
1948 inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */); 1973 inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */);
1949 1974
1975 if (wdata->credits) {
1976 req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
1977 SMB2_MAX_BUFFER_SIZE));
1978 spin_lock(&server->req_lock);
1979 server->credits += wdata->credits -
1980 le16_to_cpu(req->hdr.CreditCharge);
1981 spin_unlock(&server->req_lock);
1982 wake_up(&server->request_q);
1983 flags = CIFS_HAS_CREDITS;
1984 }
1985
1950 kref_get(&wdata->refcount); 1986 kref_get(&wdata->refcount);
1951 rc = cifs_call_async(tcon->ses->server, &rqst, NULL, 1987 rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, wdata,
1952 smb2_writev_callback, wdata, 0); 1988 flags);
1953 1989
1954 if (rc) { 1990 if (rc) {
1955 kref_put(&wdata->refcount, release); 1991 kref_put(&wdata->refcount, release);
@@ -2325,7 +2361,7 @@ SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
2325 2361
2326int 2362int
2327SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, 2363SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
2328 u64 volatile_fid, u32 pid, __le64 *eof) 2364 u64 volatile_fid, u32 pid, __le64 *eof, bool is_falloc)
2329{ 2365{
2330 struct smb2_file_eof_info info; 2366 struct smb2_file_eof_info info;
2331 void *data; 2367 void *data;
@@ -2336,8 +2372,12 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
2336 data = &info; 2372 data = &info;
2337 size = sizeof(struct smb2_file_eof_info); 2373 size = sizeof(struct smb2_file_eof_info);
2338 2374
2339 return send_set_info(xid, tcon, persistent_fid, volatile_fid, pid, 2375 if (is_falloc)
2340 FILE_END_OF_FILE_INFORMATION, 1, &data, &size); 2376 return send_set_info(xid, tcon, persistent_fid, volatile_fid,
2377 pid, FILE_ALLOCATION_INFORMATION, 1, &data, &size);
2378 else
2379 return send_set_info(xid, tcon, persistent_fid, volatile_fid,
2380 pid, FILE_END_OF_FILE_INFORMATION, 1, &data, &size);
2341} 2381}
2342 2382
2343int 2383int
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 0ce48db20a65..67e8ce8055de 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -139,7 +139,7 @@ extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
139 __le16 *target_file); 139 __le16 *target_file);
140extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, 140extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon,
141 u64 persistent_fid, u64 volatile_fid, u32 pid, 141 u64 persistent_fid, u64 volatile_fid, u32 pid,
142 __le64 *eof); 142 __le64 *eof, bool is_fallocate);
143extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon, 143extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
144 u64 persistent_fid, u64 volatile_fid, 144 u64 persistent_fid, u64 volatile_fid,
145 FILE_BASIC_INFO *buf); 145 FILE_BASIC_INFO *buf);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 59c748ce872f..5111e7272db6 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -466,7 +466,12 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
466static inline void 466static inline void
467smb2_seq_num_into_buf(struct TCP_Server_Info *server, struct smb2_hdr *hdr) 467smb2_seq_num_into_buf(struct TCP_Server_Info *server, struct smb2_hdr *hdr)
468{ 468{
469 unsigned int i, num = le16_to_cpu(hdr->CreditCharge);
470
469 hdr->MessageId = get_next_mid64(server); 471 hdr->MessageId = get_next_mid64(server);
472 /* skip message numbers according to CreditCharge field */
473 for (i = 1; i < num; i++)
474 get_next_mid(server);
470} 475}
471 476
472static struct mid_q_entry * 477static struct mid_q_entry *
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 18cd5650a5fc..9d087f4e7d4e 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -448,6 +448,15 @@ wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
448 return wait_for_free_credits(server, timeout, val); 448 return wait_for_free_credits(server, timeout, val);
449} 449}
450 450
451int
452cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
453 unsigned int *num, unsigned int *credits)
454{
455 *num = size;
456 *credits = 0;
457 return 0;
458}
459
451static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, 460static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
452 struct mid_q_entry **ppmidQ) 461 struct mid_q_entry **ppmidQ)
453{ 462{
@@ -531,20 +540,23 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
531{ 540{
532 int rc, timeout, optype; 541 int rc, timeout, optype;
533 struct mid_q_entry *mid; 542 struct mid_q_entry *mid;
543 unsigned int credits = 0;
534 544
535 timeout = flags & CIFS_TIMEOUT_MASK; 545 timeout = flags & CIFS_TIMEOUT_MASK;
536 optype = flags & CIFS_OP_MASK; 546 optype = flags & CIFS_OP_MASK;
537 547
538 rc = wait_for_free_request(server, timeout, optype); 548 if ((flags & CIFS_HAS_CREDITS) == 0) {
539 if (rc) 549 rc = wait_for_free_request(server, timeout, optype);
540 return rc; 550 if (rc)
551 return rc;
552 credits = 1;
553 }
541 554
542 mutex_lock(&server->srv_mutex); 555 mutex_lock(&server->srv_mutex);
543 mid = server->ops->setup_async_request(server, rqst); 556 mid = server->ops->setup_async_request(server, rqst);
544 if (IS_ERR(mid)) { 557 if (IS_ERR(mid)) {
545 mutex_unlock(&server->srv_mutex); 558 mutex_unlock(&server->srv_mutex);
546 add_credits(server, 1, optype); 559 add_credits_and_wake_if(server, credits, optype);
547 wake_up(&server->request_q);
548 return PTR_ERR(mid); 560 return PTR_ERR(mid);
549 } 561 }
550 562
@@ -572,8 +584,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
572 return 0; 584 return 0;
573 585
574 cifs_delete_mid(mid); 586 cifs_delete_mid(mid);
575 add_credits(server, 1, optype); 587 add_credits_and_wake_if(server, credits, optype);
576 wake_up(&server->request_q);
577 return rc; 588 return rc;
578} 589}
579 590
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 1da168c61d35..278f8fdeb9ef 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -13,7 +13,7 @@
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <asm/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/string.h> 17#include <linux/string.h>
18#include <linux/list.h> 18#include <linux/list.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 2849f41e72a2..1326d38960db 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -13,7 +13,7 @@
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <asm/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/string.h> 17#include <linux/string.h>
18 18
19#include <linux/coda.h> 19#include <linux/coda.h>
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index cd8a63238b11..9c3dedc000d1 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -19,8 +19,7 @@
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/spinlock.h> 20#include <linux/spinlock.h>
21#include <linux/namei.h> 21#include <linux/namei.h>
22 22#include <linux/uaccess.h>
23#include <asm/uaccess.h>
24 23
25#include <linux/coda.h> 24#include <linux/coda.h>
26#include <linux/coda_psdev.h> 25#include <linux/coda_psdev.h>
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 9e83b7790212..d244d743a232 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -18,7 +18,7 @@
18#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <asm/uaccess.h> 21#include <linux/uaccess.h>
22 22
23#include <linux/coda.h> 23#include <linux/coda.h>
24#include <linux/coda_psdev.h> 24#include <linux/coda_psdev.h>
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index fe3afb2de880..b945410bfcd5 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -21,9 +21,7 @@
21#include <linux/vfs.h> 21#include <linux/vfs.h>
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/pid_namespace.h> 23#include <linux/pid_namespace.h>
24 24#include <linux/uaccess.h>
25#include <asm/uaccess.h>
26
27#include <linux/fs.h> 25#include <linux/fs.h>
28#include <linux/vmalloc.h> 26#include <linux/vmalloc.h>
29 27
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 3f5de96bbb58..4326d172fc27 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -16,7 +16,7 @@
16#include <linux/string.h> 16#include <linux/string.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <asm/uaccess.h> 19#include <linux/uaccess.h>
20 20
21#include <linux/coda.h> 21#include <linux/coda.h>
22#include <linux/coda_psdev.h> 22#include <linux/coda_psdev.h>
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 5c1e4242368b..822629126e89 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -40,7 +40,7 @@
40#include <linux/pid_namespace.h> 40#include <linux/pid_namespace.h>
41#include <asm/io.h> 41#include <asm/io.h>
42#include <asm/poll.h> 42#include <asm/poll.h>
43#include <asm/uaccess.h> 43#include <linux/uaccess.h>
44 44
45#include <linux/coda.h> 45#include <linux/coda.h>
46#include <linux/coda_psdev.h> 46#include <linux/coda_psdev.h>
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 21fcf8dcb9cd..5bb6e27298a4 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -27,7 +27,7 @@
27#include <linux/string.h> 27#include <linux/string.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/mutex.h> 29#include <linux/mutex.h>
30#include <asm/uaccess.h> 30#include <linux/uaccess.h>
31#include <linux/vmalloc.h> 31#include <linux/vmalloc.h>
32#include <linux/vfs.h> 32#include <linux/vfs.h>
33 33
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index e82289047272..afec6450450f 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -59,7 +59,7 @@
59#include <linux/gfp.h> 59#include <linux/gfp.h>
60 60
61#include <net/bluetooth/bluetooth.h> 61#include <net/bluetooth/bluetooth.h>
62#include <net/bluetooth/hci.h> 62#include <net/bluetooth/hci_sock.h>
63#include <net/bluetooth/rfcomm.h> 63#include <net/bluetooth/rfcomm.h>
64 64
65#include <linux/capi.h> 65#include <linux/capi.h>
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index ddcfe590b8a8..355c522f3585 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -11,6 +11,8 @@
11 * The actual compression is based on zlib, see the other files. 11 * The actual compression is based on zlib, see the other files.
12 */ 12 */
13 13
14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15
14#include <linux/module.h> 16#include <linux/module.h>
15#include <linux/fs.h> 17#include <linux/fs.h>
16#include <linux/pagemap.h> 18#include <linux/pagemap.h>
@@ -21,7 +23,7 @@
21#include <linux/vfs.h> 23#include <linux/vfs.h>
22#include <linux/mutex.h> 24#include <linux/mutex.h>
23#include <uapi/linux/cramfs_fs.h> 25#include <uapi/linux/cramfs_fs.h>
24#include <asm/uaccess.h> 26#include <linux/uaccess.h>
25 27
26#include "internal.h" 28#include "internal.h"
27 29
@@ -153,7 +155,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
153 155
154static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE]; 156static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE];
155static unsigned buffer_blocknr[READ_BUFFERS]; 157static unsigned buffer_blocknr[READ_BUFFERS];
156static struct super_block * buffer_dev[READ_BUFFERS]; 158static struct super_block *buffer_dev[READ_BUFFERS];
157static int next_buffer; 159static int next_buffer;
158 160
159/* 161/*
@@ -205,6 +207,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
205 207
206 for (i = 0; i < BLKS_PER_BUF; i++) { 208 for (i = 0; i < BLKS_PER_BUF; i++) {
207 struct page *page = pages[i]; 209 struct page *page = pages[i];
210
208 if (page) { 211 if (page) {
209 wait_on_page_locked(page); 212 wait_on_page_locked(page);
210 if (!PageUptodate(page)) { 213 if (!PageUptodate(page)) {
@@ -223,6 +226,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
223 data = read_buffers[buffer]; 226 data = read_buffers[buffer];
224 for (i = 0; i < BLKS_PER_BUF; i++) { 227 for (i = 0; i < BLKS_PER_BUF; i++) {
225 struct page *page = pages[i]; 228 struct page *page = pages[i];
229
226 if (page) { 230 if (page) {
227 memcpy(data, kmap(page), PAGE_CACHE_SIZE); 231 memcpy(data, kmap(page), PAGE_CACHE_SIZE);
228 kunmap(page); 232 kunmap(page);
@@ -237,6 +241,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
237static void cramfs_kill_sb(struct super_block *sb) 241static void cramfs_kill_sb(struct super_block *sb)
238{ 242{
239 struct cramfs_sb_info *sbi = CRAMFS_SB(sb); 243 struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
244
240 kill_block_super(sb); 245 kill_block_super(sb);
241 kfree(sbi); 246 kfree(sbi);
242} 247}
@@ -277,7 +282,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
277 /* check for wrong endianness */ 282 /* check for wrong endianness */
278 if (super.magic == CRAMFS_MAGIC_WEND) { 283 if (super.magic == CRAMFS_MAGIC_WEND) {
279 if (!silent) 284 if (!silent)
280 printk(KERN_ERR "cramfs: wrong endianness\n"); 285 pr_err("wrong endianness\n");
281 return -EINVAL; 286 return -EINVAL;
282 } 287 }
283 288
@@ -287,22 +292,22 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
287 mutex_unlock(&read_mutex); 292 mutex_unlock(&read_mutex);
288 if (super.magic != CRAMFS_MAGIC) { 293 if (super.magic != CRAMFS_MAGIC) {
289 if (super.magic == CRAMFS_MAGIC_WEND && !silent) 294 if (super.magic == CRAMFS_MAGIC_WEND && !silent)
290 printk(KERN_ERR "cramfs: wrong endianness\n"); 295 pr_err("wrong endianness\n");
291 else if (!silent) 296 else if (!silent)
292 printk(KERN_ERR "cramfs: wrong magic\n"); 297 pr_err("wrong magic\n");
293 return -EINVAL; 298 return -EINVAL;
294 } 299 }
295 } 300 }
296 301
297 /* get feature flags first */ 302 /* get feature flags first */
298 if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) { 303 if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) {
299 printk(KERN_ERR "cramfs: unsupported filesystem features\n"); 304 pr_err("unsupported filesystem features\n");
300 return -EINVAL; 305 return -EINVAL;
301 } 306 }
302 307
303 /* Check that the root inode is in a sane state */ 308 /* Check that the root inode is in a sane state */
304 if (!S_ISDIR(super.root.mode)) { 309 if (!S_ISDIR(super.root.mode)) {
305 printk(KERN_ERR "cramfs: root is not a directory\n"); 310 pr_err("root is not a directory\n");
306 return -EINVAL; 311 return -EINVAL;
307 } 312 }
308 /* correct strange, hard-coded permissions of mkcramfs */ 313 /* correct strange, hard-coded permissions of mkcramfs */
@@ -310,23 +315,23 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
310 315
311 root_offset = super.root.offset << 2; 316 root_offset = super.root.offset << 2;
312 if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { 317 if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) {
313 sbi->size=super.size; 318 sbi->size = super.size;
314 sbi->blocks=super.fsid.blocks; 319 sbi->blocks = super.fsid.blocks;
315 sbi->files=super.fsid.files; 320 sbi->files = super.fsid.files;
316 } else { 321 } else {
317 sbi->size=1<<28; 322 sbi->size = 1<<28;
318 sbi->blocks=0; 323 sbi->blocks = 0;
319 sbi->files=0; 324 sbi->files = 0;
320 } 325 }
321 sbi->magic=super.magic; 326 sbi->magic = super.magic;
322 sbi->flags=super.flags; 327 sbi->flags = super.flags;
323 if (root_offset == 0) 328 if (root_offset == 0)
324 printk(KERN_INFO "cramfs: empty filesystem"); 329 pr_info("empty filesystem");
325 else if (!(super.flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && 330 else if (!(super.flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) &&
326 ((root_offset != sizeof(struct cramfs_super)) && 331 ((root_offset != sizeof(struct cramfs_super)) &&
327 (root_offset != 512 + sizeof(struct cramfs_super)))) 332 (root_offset != 512 + sizeof(struct cramfs_super))))
328 { 333 {
329 printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset); 334 pr_err("bad root offset %lu\n", root_offset);
330 return -EINVAL; 335 return -EINVAL;
331 } 336 }
332 337
@@ -425,7 +430,7 @@ static int cramfs_readdir(struct file *file, struct dir_context *ctx)
425/* 430/*
426 * Lookup and fill in the inode data.. 431 * Lookup and fill in the inode data..
427 */ 432 */
428static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 433static struct dentry *cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
429{ 434{
430 unsigned int offset = 0; 435 unsigned int offset = 0;
431 struct inode *inode = NULL; 436 struct inode *inode = NULL;
@@ -483,7 +488,7 @@ out:
483 return NULL; 488 return NULL;
484} 489}
485 490
486static int cramfs_readpage(struct file *file, struct page * page) 491static int cramfs_readpage(struct file *file, struct page *page)
487{ 492{
488 struct inode *inode = page->mapping->host; 493 struct inode *inode = page->mapping->host;
489 u32 maxblock; 494 u32 maxblock;
@@ -511,7 +516,7 @@ static int cramfs_readpage(struct file *file, struct page * page)
511 if (compr_len == 0) 516 if (compr_len == 0)
512 ; /* hole */ 517 ; /* hole */
513 else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) { 518 else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) {
514 pr_err("cramfs: bad compressed blocksize %u\n", 519 pr_err("bad compressed blocksize %u\n",
515 compr_len); 520 compr_len);
516 goto err; 521 goto err;
517 } else { 522 } else {
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index 1760c1b84d97..ec4f1d4fdad0 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -15,6 +15,8 @@
15 * then is used by multiple filesystems. 15 * then is used by multiple filesystems.
16 */ 16 */
17 17
18#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
19
18#include <linux/kernel.h> 20#include <linux/kernel.h>
19#include <linux/errno.h> 21#include <linux/errno.h>
20#include <linux/vmalloc.h> 22#include <linux/vmalloc.h>
@@ -37,7 +39,7 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen)
37 39
38 err = zlib_inflateReset(&stream); 40 err = zlib_inflateReset(&stream);
39 if (err != Z_OK) { 41 if (err != Z_OK) {
40 printk("zlib_inflateReset error %d\n", err); 42 pr_err("zlib_inflateReset error %d\n", err);
41 zlib_inflateEnd(&stream); 43 zlib_inflateEnd(&stream);
42 zlib_inflateInit(&stream); 44 zlib_inflateInit(&stream);
43 } 45 }
@@ -48,8 +50,8 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen)
48 return stream.total_out; 50 return stream.total_out;
49 51
50err: 52err:
51 printk("Error %d while decompressing!\n", err); 53 pr_err("Error %d while decompressing!\n", err);
52 printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); 54 pr_err("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen);
53 return -EIO; 55 return -EIO;
54} 56}
55 57
@@ -57,7 +59,7 @@ int cramfs_uncompress_init(void)
57{ 59{
58 if (!initialized++) { 60 if (!initialized++) {
59 stream.workspace = vmalloc(zlib_inflate_workspacesize()); 61 stream.workspace = vmalloc(zlib_inflate_workspacesize());
60 if ( !stream.workspace ) { 62 if (!stream.workspace) {
61 initialized = 0; 63 initialized = 0;
62 return -ENOMEM; 64 return -ENOMEM;
63 } 65 }
diff --git a/fs/dcache.c b/fs/dcache.c
index 06f65857a855..d30ce699ae4b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -731,8 +731,6 @@ EXPORT_SYMBOL(dget_parent);
731/** 731/**
732 * d_find_alias - grab a hashed alias of inode 732 * d_find_alias - grab a hashed alias of inode
733 * @inode: inode in question 733 * @inode: inode in question
734 * @want_discon: flag, used by d_splice_alias, to request
735 * that only a DISCONNECTED alias be returned.
736 * 734 *
737 * If inode has a hashed alias, or is a directory and has any alias, 735 * If inode has a hashed alias, or is a directory and has any alias,
738 * acquire the reference to alias and return it. Otherwise return NULL. 736 * acquire the reference to alias and return it. Otherwise return NULL.
@@ -741,10 +739,9 @@ EXPORT_SYMBOL(dget_parent);
741 * of a filesystem. 739 * of a filesystem.
742 * 740 *
743 * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer 741 * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
744 * any other hashed alias over that one unless @want_discon is set, 742 * any other hashed alias over that one.
745 * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
746 */ 743 */
747static struct dentry *__d_find_alias(struct inode *inode, int want_discon) 744static struct dentry *__d_find_alias(struct inode *inode)
748{ 745{
749 struct dentry *alias, *discon_alias; 746 struct dentry *alias, *discon_alias;
750 747
@@ -756,7 +753,7 @@ again:
756 if (IS_ROOT(alias) && 753 if (IS_ROOT(alias) &&
757 (alias->d_flags & DCACHE_DISCONNECTED)) { 754 (alias->d_flags & DCACHE_DISCONNECTED)) {
758 discon_alias = alias; 755 discon_alias = alias;
759 } else if (!want_discon) { 756 } else {
760 __dget_dlock(alias); 757 __dget_dlock(alias);
761 spin_unlock(&alias->d_lock); 758 spin_unlock(&alias->d_lock);
762 return alias; 759 return alias;
@@ -768,12 +765,9 @@ again:
768 alias = discon_alias; 765 alias = discon_alias;
769 spin_lock(&alias->d_lock); 766 spin_lock(&alias->d_lock);
770 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { 767 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
771 if (IS_ROOT(alias) && 768 __dget_dlock(alias);
772 (alias->d_flags & DCACHE_DISCONNECTED)) { 769 spin_unlock(&alias->d_lock);
773 __dget_dlock(alias); 770 return alias;
774 spin_unlock(&alias->d_lock);
775 return alias;
776 }
777 } 771 }
778 spin_unlock(&alias->d_lock); 772 spin_unlock(&alias->d_lock);
779 goto again; 773 goto again;
@@ -787,7 +781,7 @@ struct dentry *d_find_alias(struct inode *inode)
787 781
788 if (!hlist_empty(&inode->i_dentry)) { 782 if (!hlist_empty(&inode->i_dentry)) {
789 spin_lock(&inode->i_lock); 783 spin_lock(&inode->i_lock);
790 de = __d_find_alias(inode, 0); 784 de = __d_find_alias(inode);
791 spin_unlock(&inode->i_lock); 785 spin_unlock(&inode->i_lock);
792 } 786 }
793 return de; 787 return de;
@@ -1781,25 +1775,7 @@ struct dentry *d_find_any_alias(struct inode *inode)
1781} 1775}
1782EXPORT_SYMBOL(d_find_any_alias); 1776EXPORT_SYMBOL(d_find_any_alias);
1783 1777
1784/** 1778static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
1785 * d_obtain_alias - find or allocate a dentry for a given inode
1786 * @inode: inode to allocate the dentry for
1787 *
1788 * Obtain a dentry for an inode resulting from NFS filehandle conversion or
1789 * similar open by handle operations. The returned dentry may be anonymous,
1790 * or may have a full name (if the inode was already in the cache).
1791 *
1792 * When called on a directory inode, we must ensure that the inode only ever
1793 * has one dentry. If a dentry is found, that is returned instead of
1794 * allocating a new one.
1795 *
1796 * On successful return, the reference to the inode has been transferred
1797 * to the dentry. In case of an error the reference on the inode is released.
1798 * To make it easier to use in export operations a %NULL or IS_ERR inode may
1799 * be passed in and will be the error will be propagate to the return value,
1800 * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
1801 */
1802struct dentry *d_obtain_alias(struct inode *inode)
1803{ 1779{
1804 static const struct qstr anonstring = QSTR_INIT("/", 1); 1780 static const struct qstr anonstring = QSTR_INIT("/", 1);
1805 struct dentry *tmp; 1781 struct dentry *tmp;
@@ -1830,7 +1806,10 @@ struct dentry *d_obtain_alias(struct inode *inode)
1830 } 1806 }
1831 1807
1832 /* attach a disconnected dentry */ 1808 /* attach a disconnected dentry */
1833 add_flags = d_flags_for_inode(inode) | DCACHE_DISCONNECTED; 1809 add_flags = d_flags_for_inode(inode);
1810
1811 if (disconnected)
1812 add_flags |= DCACHE_DISCONNECTED;
1834 1813
1835 spin_lock(&tmp->d_lock); 1814 spin_lock(&tmp->d_lock);
1836 tmp->d_inode = inode; 1815 tmp->d_inode = inode;
@@ -1851,59 +1830,51 @@ struct dentry *d_obtain_alias(struct inode *inode)
1851 iput(inode); 1830 iput(inode);
1852 return res; 1831 return res;
1853} 1832}
1854EXPORT_SYMBOL(d_obtain_alias);
1855 1833
1856/** 1834/**
1857 * d_splice_alias - splice a disconnected dentry into the tree if one exists 1835 * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
1858 * @inode: the inode which may have a disconnected dentry 1836 * @inode: inode to allocate the dentry for
1859 * @dentry: a negative dentry which we want to point to the inode.
1860 *
1861 * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and
1862 * DCACHE_DISCONNECTED), then d_move that in place of the given dentry
1863 * and return it, else simply d_add the inode to the dentry and return NULL.
1864 * 1837 *
1865 * This is needed in the lookup routine of any filesystem that is exportable 1838 * Obtain a dentry for an inode resulting from NFS filehandle conversion or
1866 * (via knfsd) so that we can build dcache paths to directories effectively. 1839 * similar open by handle operations. The returned dentry may be anonymous,
1840 * or may have a full name (if the inode was already in the cache).
1867 * 1841 *
1868 * If a dentry was found and moved, then it is returned. Otherwise NULL 1842 * When called on a directory inode, we must ensure that the inode only ever
1869 * is returned. This matches the expected return value of ->lookup. 1843 * has one dentry. If a dentry is found, that is returned instead of
1844 * allocating a new one.
1870 * 1845 *
1871 * Cluster filesystems may call this function with a negative, hashed dentry. 1846 * On successful return, the reference to the inode has been transferred
1872 * In that case, we know that the inode will be a regular file, and also this 1847 * to the dentry. In case of an error the reference on the inode is released.
1873 * will only occur during atomic_open. So we need to check for the dentry 1848 * To make it easier to use in export operations a %NULL or IS_ERR inode may
1874 * being already hashed only in the final case. 1849 * be passed in and the error will be propagated to the return value,
1850 * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
1875 */ 1851 */
1876struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) 1852struct dentry *d_obtain_alias(struct inode *inode)
1877{ 1853{
1878 struct dentry *new = NULL; 1854 return __d_obtain_alias(inode, 1);
1879 1855}
1880 if (IS_ERR(inode)) 1856EXPORT_SYMBOL(d_obtain_alias);
1881 return ERR_CAST(inode);
1882 1857
1883 if (inode && S_ISDIR(inode->i_mode)) { 1858/**
1884 spin_lock(&inode->i_lock); 1859 * d_obtain_root - find or allocate a dentry for a given inode
1885 new = __d_find_alias(inode, 1); 1860 * @inode: inode to allocate the dentry for
1886 if (new) { 1861 *
1887 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); 1862 * Obtain an IS_ROOT dentry for the root of a filesystem.
1888 spin_unlock(&inode->i_lock); 1863 *
1889 security_d_instantiate(new, inode); 1864 * We must ensure that directory inodes only ever have one dentry. If a
1890 d_move(new, dentry); 1865 * dentry is found, that is returned instead of allocating a new one.
1891 iput(inode); 1866 *
1892 } else { 1867 * On successful return, the reference to the inode has been transferred
1893 /* already taking inode->i_lock, so d_add() by hand */ 1868 * to the dentry. In case of an error the reference on the inode is
1894 __d_instantiate(dentry, inode); 1869 * released. A %NULL or IS_ERR inode may be passed in and will be the
1895 spin_unlock(&inode->i_lock); 1870 * error will be propagate to the return value, with a %NULL @inode
1896 security_d_instantiate(dentry, inode); 1871 * replaced by ERR_PTR(-ESTALE).
1897 d_rehash(dentry); 1872 */
1898 } 1873struct dentry *d_obtain_root(struct inode *inode)
1899 } else { 1874{
1900 d_instantiate(dentry, inode); 1875 return __d_obtain_alias(inode, 0);
1901 if (d_unhashed(dentry))
1902 d_rehash(dentry);
1903 }
1904 return new;
1905} 1876}
1906EXPORT_SYMBOL(d_splice_alias); 1877EXPORT_SYMBOL(d_obtain_root);
1907 1878
1908/** 1879/**
1909 * d_add_ci - lookup or allocate new dentry with case-exact name 1880 * d_add_ci - lookup or allocate new dentry with case-exact name
@@ -2697,6 +2668,75 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
2697} 2668}
2698 2669
2699/** 2670/**
2671 * d_splice_alias - splice a disconnected dentry into the tree if one exists
2672 * @inode: the inode which may have a disconnected dentry
2673 * @dentry: a negative dentry which we want to point to the inode.
2674 *
2675 * If inode is a directory and has an IS_ROOT alias, then d_move that in
2676 * place of the given dentry and return it, else simply d_add the inode
2677 * to the dentry and return NULL.
2678 *
2679 * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
2680 * we should error out: directories can't have multiple aliases.
2681 *
2682 * This is needed in the lookup routine of any filesystem that is exportable
2683 * (via knfsd) so that we can build dcache paths to directories effectively.
2684 *
2685 * If a dentry was found and moved, then it is returned. Otherwise NULL
2686 * is returned. This matches the expected return value of ->lookup.
2687 *
2688 * Cluster filesystems may call this function with a negative, hashed dentry.
2689 * In that case, we know that the inode will be a regular file, and also this
2690 * will only occur during atomic_open. So we need to check for the dentry
2691 * being already hashed only in the final case.
2692 */
2693struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
2694{
2695 struct dentry *new = NULL;
2696
2697 if (IS_ERR(inode))
2698 return ERR_CAST(inode);
2699
2700 if (inode && S_ISDIR(inode->i_mode)) {
2701 spin_lock(&inode->i_lock);
2702 new = __d_find_any_alias(inode);
2703 if (new) {
2704 if (!IS_ROOT(new)) {
2705 spin_unlock(&inode->i_lock);
2706 dput(new);
2707 return ERR_PTR(-EIO);
2708 }
2709 if (d_ancestor(new, dentry)) {
2710 spin_unlock(&inode->i_lock);
2711 dput(new);
2712 return ERR_PTR(-EIO);
2713 }
2714 write_seqlock(&rename_lock);
2715 __d_materialise_dentry(dentry, new);
2716 write_sequnlock(&rename_lock);
2717 __d_drop(new);
2718 _d_rehash(new);
2719 spin_unlock(&new->d_lock);
2720 spin_unlock(&inode->i_lock);
2721 security_d_instantiate(new, inode);
2722 iput(inode);
2723 } else {
2724 /* already taking inode->i_lock, so d_add() by hand */
2725 __d_instantiate(dentry, inode);
2726 spin_unlock(&inode->i_lock);
2727 security_d_instantiate(dentry, inode);
2728 d_rehash(dentry);
2729 }
2730 } else {
2731 d_instantiate(dentry, inode);
2732 if (d_unhashed(dentry))
2733 d_rehash(dentry);
2734 }
2735 return new;
2736}
2737EXPORT_SYMBOL(d_splice_alias);
2738
2739/**
2700 * d_materialise_unique - introduce an inode into the tree 2740 * d_materialise_unique - introduce an inode into the tree
2701 * @dentry: candidate dentry 2741 * @dentry: candidate dentry
2702 * @inode: inode to bind to the dentry, to which aliases may be attached 2742 * @inode: inode to bind to the dentry, to which aliases may be attached
@@ -2724,7 +2764,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
2724 struct dentry *alias; 2764 struct dentry *alias;
2725 2765
2726 /* Does an aliased dentry already exist? */ 2766 /* Does an aliased dentry already exist? */
2727 alias = __d_find_alias(inode, 0); 2767 alias = __d_find_alias(inode);
2728 if (alias) { 2768 if (alias) {
2729 actual = alias; 2769 actual = alias;
2730 write_seqlock(&rename_lock); 2770 write_seqlock(&rename_lock);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 63146295153b..76c08c2beb2f 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -451,7 +451,7 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf,
451{ 451{
452 char buf[3]; 452 char buf[3];
453 u32 *val = file->private_data; 453 u32 *val = file->private_data;
454 454
455 if (*val) 455 if (*val)
456 buf[0] = 'Y'; 456 buf[0] = 'Y';
457 else 457 else
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 8c41b52da358..1e3b99d3db0d 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -66,7 +66,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev
66 break; 66 break;
67 } 67 }
68 } 68 }
69 return inode; 69 return inode;
70} 70}
71 71
72/* SMP-safe */ 72/* SMP-safe */
@@ -317,7 +317,7 @@ static struct dentry *__create_file(const char *name, umode_t mode,
317 goto exit; 317 goto exit;
318 318
319 /* If the parent is not specified, we create it in the root. 319 /* If the parent is not specified, we create it in the root.
320 * We need the root dentry to do this, which is in the super 320 * We need the root dentry to do this, which is in the super
321 * block. A pointer to that is in the struct vfsmount that we 321 * block. A pointer to that is in the struct vfsmount that we
322 * have around. 322 * have around.
323 */ 323 */
@@ -330,7 +330,7 @@ static struct dentry *__create_file(const char *name, umode_t mode,
330 switch (mode & S_IFMT) { 330 switch (mode & S_IFMT) {
331 case S_IFDIR: 331 case S_IFDIR:
332 error = debugfs_mkdir(parent->d_inode, dentry, mode); 332 error = debugfs_mkdir(parent->d_inode, dentry, mode);
333 333
334 break; 334 break;
335 case S_IFLNK: 335 case S_IFLNK:
336 error = debugfs_link(parent->d_inode, dentry, mode, 336 error = debugfs_link(parent->d_inode, dentry, mode,
@@ -534,7 +534,7 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
534 */ 534 */
535void debugfs_remove_recursive(struct dentry *dentry) 535void debugfs_remove_recursive(struct dentry *dentry)
536{ 536{
537 struct dentry *child, *next, *parent; 537 struct dentry *child, *parent;
538 538
539 if (IS_ERR_OR_NULL(dentry)) 539 if (IS_ERR_OR_NULL(dentry))
540 return; 540 return;
@@ -546,30 +546,49 @@ void debugfs_remove_recursive(struct dentry *dentry)
546 parent = dentry; 546 parent = dentry;
547 down: 547 down:
548 mutex_lock(&parent->d_inode->i_mutex); 548 mutex_lock(&parent->d_inode->i_mutex);
549 list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) { 549 loop:
550 /*
551 * The parent->d_subdirs is protected by the d_lock. Outside that
552 * lock, the child can be unlinked and set to be freed which can
553 * use the d_u.d_child as the rcu head and corrupt this list.
554 */
555 spin_lock(&parent->d_lock);
556 list_for_each_entry(child, &parent->d_subdirs, d_u.d_child) {
550 if (!debugfs_positive(child)) 557 if (!debugfs_positive(child))
551 continue; 558 continue;
552 559
553 /* perhaps simple_empty(child) makes more sense */ 560 /* perhaps simple_empty(child) makes more sense */
554 if (!list_empty(&child->d_subdirs)) { 561 if (!list_empty(&child->d_subdirs)) {
562 spin_unlock(&parent->d_lock);
555 mutex_unlock(&parent->d_inode->i_mutex); 563 mutex_unlock(&parent->d_inode->i_mutex);
556 parent = child; 564 parent = child;
557 goto down; 565 goto down;
558 } 566 }
559 up: 567
568 spin_unlock(&parent->d_lock);
569
560 if (!__debugfs_remove(child, parent)) 570 if (!__debugfs_remove(child, parent))
561 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 571 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
572
573 /*
574 * The parent->d_lock protects agaist child from unlinking
575 * from d_subdirs. When releasing the parent->d_lock we can
576 * no longer trust that the next pointer is valid.
577 * Restart the loop. We'll skip this one with the
578 * debugfs_positive() check.
579 */
580 goto loop;
562 } 581 }
582 spin_unlock(&parent->d_lock);
563 583
564 mutex_unlock(&parent->d_inode->i_mutex); 584 mutex_unlock(&parent->d_inode->i_mutex);
565 child = parent; 585 child = parent;
566 parent = parent->d_parent; 586 parent = parent->d_parent;
567 mutex_lock(&parent->d_inode->i_mutex); 587 mutex_lock(&parent->d_inode->i_mutex);
568 588
569 if (child != dentry) { 589 if (child != dentry)
570 next = list_next_entry(child, d_u.d_child); 590 /* go up */
571 goto up; 591 goto loop;
572 }
573 592
574 if (!__debugfs_remove(child, parent)) 593 if (!__debugfs_remove(child, parent))
575 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 594 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 17e39b047de5..c3116404ab49 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -158,7 +158,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
158{ 158{
159 ssize_t ret; 159 ssize_t ret;
160 160
161 ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES * PAGE_SIZE, 161 ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES,
162 &sdio->from); 162 &sdio->from);
163 163
164 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { 164 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 8d77ba7b1756..1323c568e362 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -718,16 +718,11 @@ static const struct file_operations waiters_fops = {
718 718
719void dlm_delete_debug_file(struct dlm_ls *ls) 719void dlm_delete_debug_file(struct dlm_ls *ls)
720{ 720{
721 if (ls->ls_debug_rsb_dentry) 721 debugfs_remove(ls->ls_debug_rsb_dentry);
722 debugfs_remove(ls->ls_debug_rsb_dentry); 722 debugfs_remove(ls->ls_debug_waiters_dentry);
723 if (ls->ls_debug_waiters_dentry) 723 debugfs_remove(ls->ls_debug_locks_dentry);
724 debugfs_remove(ls->ls_debug_waiters_dentry); 724 debugfs_remove(ls->ls_debug_all_dentry);
725 if (ls->ls_debug_locks_dentry) 725 debugfs_remove(ls->ls_debug_toss_dentry);
726 debugfs_remove(ls->ls_debug_locks_dentry);
727 if (ls->ls_debug_all_dentry)
728 debugfs_remove(ls->ls_debug_all_dentry);
729 if (ls->ls_debug_toss_dentry)
730 debugfs_remove(ls->ls_debug_toss_dentry);
731} 726}
732 727
733int dlm_create_debug_file(struct dlm_ls *ls) 728int dlm_create_debug_file(struct dlm_ls *ls)
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index 356c044e2cd3..bbee8f063dfa 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -12,7 +12,8 @@
12#include "efs.h" 12#include "efs.h"
13 13
14 14
15static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) { 15static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len)
16{
16 struct buffer_head *bh; 17 struct buffer_head *bh;
17 18
18 int slot, namelen; 19 int slot, namelen;
@@ -40,10 +41,10 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len)
40 if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { 41 if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) {
41 pr_err("%s(): invalid directory block\n", __func__); 42 pr_err("%s(): invalid directory block\n", __func__);
42 brelse(bh); 43 brelse(bh);
43 return(0); 44 return 0;
44 } 45 }
45 46
46 for(slot = 0; slot < dirblock->slots; slot++) { 47 for (slot = 0; slot < dirblock->slots; slot++) {
47 dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot)); 48 dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot));
48 49
49 namelen = dirslot->namelen; 50 namelen = dirslot->namelen;
@@ -52,12 +53,12 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len)
52 if ((namelen == len) && (!memcmp(name, nameptr, len))) { 53 if ((namelen == len) && (!memcmp(name, nameptr, len))) {
53 inodenum = be32_to_cpu(dirslot->inode); 54 inodenum = be32_to_cpu(dirslot->inode);
54 brelse(bh); 55 brelse(bh);
55 return(inodenum); 56 return inodenum;
56 } 57 }
57 } 58 }
58 brelse(bh); 59 brelse(bh);
59 } 60 }
60 return(0); 61 return 0;
61} 62}
62 63
63struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 64struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
diff --git a/fs/exec.c b/fs/exec.c
index a3d33fe592d6..a2b42a98c743 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -368,10 +368,6 @@ static int bprm_mm_init(struct linux_binprm *bprm)
368 if (!mm) 368 if (!mm)
369 goto err; 369 goto err;
370 370
371 err = init_new_context(current, mm);
372 if (err)
373 goto err;
374
375 err = __bprm_mm_init(bprm); 371 err = __bprm_mm_init(bprm);
376 if (err) 372 if (err)
377 goto err; 373 goto err;
@@ -1216,7 +1212,7 @@ EXPORT_SYMBOL(install_exec_creds);
1216/* 1212/*
1217 * determine how safe it is to execute the proposed program 1213 * determine how safe it is to execute the proposed program
1218 * - the caller must hold ->cred_guard_mutex to protect against 1214 * - the caller must hold ->cred_guard_mutex to protect against
1219 * PTRACE_ATTACH 1215 * PTRACE_ATTACH or seccomp thread-sync
1220 */ 1216 */
1221static void check_unsafe_exec(struct linux_binprm *bprm) 1217static void check_unsafe_exec(struct linux_binprm *bprm)
1222{ 1218{
@@ -1234,7 +1230,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
1234 * This isn't strictly necessary, but it makes it harder for LSMs to 1230 * This isn't strictly necessary, but it makes it harder for LSMs to
1235 * mess up. 1231 * mess up.
1236 */ 1232 */
1237 if (current->no_new_privs) 1233 if (task_no_new_privs(current))
1238 bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; 1234 bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
1239 1235
1240 t = p; 1236 t = p;
@@ -1272,7 +1268,7 @@ int prepare_binprm(struct linux_binprm *bprm)
1272 bprm->cred->egid = current_egid(); 1268 bprm->cred->egid = current_egid();
1273 1269
1274 if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) && 1270 if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
1275 !current->no_new_privs && 1271 !task_no_new_privs(current) &&
1276 kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) && 1272 kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) &&
1277 kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) { 1273 kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) {
1278 /* Set-uid? */ 1274 /* Set-uid? */
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 7f20f25c232c..84529b8a331b 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -116,7 +116,7 @@ static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
116 num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, 116 num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa,
117 pages_in_unit - i); 117 pages_in_unit - i);
118 118
119 __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); 119 __a1pa = kcalloc(num_a1pa, sizeof__a1pa, GFP_KERNEL);
120 if (unlikely(!__a1pa)) { 120 if (unlikely(!__a1pa)) {
121 ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", 121 ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
122 num_a1pa); 122 num_a1pa);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 3750031cfa2f..b88edc05c230 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -161,7 +161,7 @@ static struct kmem_cache * ext2_inode_cachep;
161static struct inode *ext2_alloc_inode(struct super_block *sb) 161static struct inode *ext2_alloc_inode(struct super_block *sb)
162{ 162{
163 struct ext2_inode_info *ei; 163 struct ext2_inode_info *ei;
164 ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL); 164 ei = kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL);
165 if (!ei) 165 if (!ei)
166 return NULL; 166 return NULL;
167 ei->i_block_alloc_info = NULL; 167 ei->i_block_alloc_info = NULL;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index fca382037ddd..581ef40fbe90 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -639,7 +639,6 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
639 if (!(*errp) && 639 if (!(*errp) &&
640 ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { 640 ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
641 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 641 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
642 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
643 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 642 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
644 dquot_alloc_block_nofail(inode, 643 dquot_alloc_block_nofail(inode,
645 EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); 644 EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ef1bed66c14f..0bb3f9ea0832 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -571,6 +571,31 @@ static int ext4_release_dir(struct inode *inode, struct file *filp)
571 return 0; 571 return 0;
572} 572}
573 573
574int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
575 int buf_size)
576{
577 struct ext4_dir_entry_2 *de;
578 int nlen, rlen;
579 unsigned int offset = 0;
580 char *top;
581
582 de = (struct ext4_dir_entry_2 *)buf;
583 top = buf + buf_size;
584 while ((char *) de < top) {
585 if (ext4_check_dir_entry(dir, NULL, de, bh,
586 buf, buf_size, offset))
587 return -EIO;
588 nlen = EXT4_DIR_REC_LEN(de->name_len);
589 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
590 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
591 offset += rlen;
592 }
593 if ((char *) de > top)
594 return -EIO;
595
596 return 0;
597}
598
574const struct file_operations ext4_dir_operations = { 599const struct file_operations ext4_dir_operations = {
575 .llseek = ext4_dir_llseek, 600 .llseek = ext4_dir_llseek,
576 .read = generic_read_dir, 601 .read = generic_read_dir,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 7cc5a0e23688..5b19760b1de5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -591,7 +591,6 @@ enum {
591#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 591#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
592#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 592#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
593#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 593#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
594#define EXT4_FREE_BLOCKS_RESERVE 0x0040
595 594
596/* 595/*
597 * ioctl commands 596 * ioctl commands
@@ -2029,6 +2028,8 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype)
2029 2028
2030 return ext4_filetype_table[filetype]; 2029 return ext4_filetype_table[filetype];
2031} 2030}
2031extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
2032 void *buf, int buf_size);
2032 2033
2033/* fsync.c */ 2034/* fsync.c */
2034extern int ext4_sync_file(struct file *, loff_t, loff_t, int); 2035extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
@@ -2144,8 +2145,8 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
2144extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); 2145extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
2145extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); 2146extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
2146extern void ext4_ind_truncate(handle_t *, struct inode *inode); 2147extern void ext4_ind_truncate(handle_t *, struct inode *inode);
2147extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, 2148extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
2148 ext4_lblk_t first, ext4_lblk_t stop); 2149 ext4_lblk_t start, ext4_lblk_t end);
2149 2150
2150/* ioctl.c */ 2151/* ioctl.c */
2151extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 2152extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -2560,7 +2561,6 @@ extern const struct file_operations ext4_file_operations;
2560extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); 2561extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
2561 2562
2562/* inline.c */ 2563/* inline.c */
2563extern int ext4_has_inline_data(struct inode *inode);
2564extern int ext4_get_max_inline_size(struct inode *inode); 2564extern int ext4_get_max_inline_size(struct inode *inode);
2565extern int ext4_find_inline_data_nolock(struct inode *inode); 2565extern int ext4_find_inline_data_nolock(struct inode *inode);
2566extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, 2566extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
@@ -2626,6 +2626,12 @@ extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
2626 2626
2627extern int ext4_convert_inline_data(struct inode *inode); 2627extern int ext4_convert_inline_data(struct inode *inode);
2628 2628
2629static inline int ext4_has_inline_data(struct inode *inode)
2630{
2631 return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
2632 EXT4_I(inode)->i_inline_off;
2633}
2634
2629/* namei.c */ 2635/* namei.c */
2630extern const struct inode_operations ext4_dir_inode_operations; 2636extern const struct inode_operations ext4_dir_inode_operations;
2631extern const struct inode_operations ext4_special_inode_operations; 2637extern const struct inode_operations ext4_special_inode_operations;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4da228a0e6d0..76c2df382b7d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -161,6 +161,8 @@ int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
161 struct inode *inode, struct ext4_ext_path *path) 161 struct inode *inode, struct ext4_ext_path *path)
162{ 162{
163 int err; 163 int err;
164
165 WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
164 if (path->p_bh) { 166 if (path->p_bh) {
165 ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); 167 ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
166 /* path points to block */ 168 /* path points to block */
@@ -1808,8 +1810,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
1808 1810
1809 brelse(path[1].p_bh); 1811 brelse(path[1].p_bh);
1810 ext4_free_blocks(handle, inode, NULL, blk, 1, 1812 ext4_free_blocks(handle, inode, NULL, blk, 1,
1811 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET | 1813 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1812 EXT4_FREE_BLOCKS_RESERVE);
1813} 1814}
1814 1815
1815/* 1816/*
@@ -3253,7 +3254,7 @@ out:
3253 3254
3254fix_extent_len: 3255fix_extent_len:
3255 ex->ee_len = orig_ex.ee_len; 3256 ex->ee_len = orig_ex.ee_len;
3256 ext4_ext_dirty(handle, inode, path + depth); 3257 ext4_ext_dirty(handle, inode, path + path->p_depth);
3257 return err; 3258 return err;
3258} 3259}
3259 3260
@@ -5403,16 +5404,13 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5403 int ret; 5404 int ret;
5404 5405
5405 /* Collapse range works only on fs block size aligned offsets. */ 5406 /* Collapse range works only on fs block size aligned offsets. */
5406 if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || 5407 if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
5407 len & (EXT4_BLOCK_SIZE(sb) - 1)) 5408 len & (EXT4_CLUSTER_SIZE(sb) - 1))
5408 return -EINVAL; 5409 return -EINVAL;
5409 5410
5410 if (!S_ISREG(inode->i_mode)) 5411 if (!S_ISREG(inode->i_mode))
5411 return -EINVAL; 5412 return -EINVAL;
5412 5413
5413 if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1)
5414 return -EOPNOTSUPP;
5415
5416 trace_ext4_collapse_range(inode, offset, len); 5414 trace_ext4_collapse_range(inode, offset, len);
5417 5415
5418 punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); 5416 punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8695f70af1ef..aca7b24a4432 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -200,10 +200,6 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
200 200
201static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) 201static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
202{ 202{
203 struct address_space *mapping = file->f_mapping;
204
205 if (!mapping->a_ops->readpage)
206 return -ENOEXEC;
207 file_accessed(file); 203 file_accessed(file);
208 vma->vm_ops = &ext4_file_vm_ops; 204 vma->vm_ops = &ext4_file_vm_ops;
209 return 0; 205 return 0;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index fd69da194826..e75f840000a0 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1295,97 +1295,220 @@ do_indirects:
1295 } 1295 }
1296} 1296}
1297 1297
1298static int free_hole_blocks(handle_t *handle, struct inode *inode, 1298/**
1299 struct buffer_head *parent_bh, __le32 *i_data, 1299 * ext4_ind_remove_space - remove space from the range
1300 int level, ext4_lblk_t first, 1300 * @handle: JBD handle for this transaction
1301 ext4_lblk_t count, int max) 1301 * @inode: inode we are dealing with
1302 * @start: First block to remove
1303 * @end: One block after the last block to remove (exclusive)
1304 *
1305 * Free the blocks in the defined range (end is exclusive endpoint of
1306 * range). This is used by ext4_punch_hole().
1307 */
1308int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
1309 ext4_lblk_t start, ext4_lblk_t end)
1302{ 1310{
1303 struct buffer_head *bh = NULL; 1311 struct ext4_inode_info *ei = EXT4_I(inode);
1312 __le32 *i_data = ei->i_data;
1304 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 1313 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
1305 int ret = 0; 1314 ext4_lblk_t offsets[4], offsets2[4];
1306 int i, inc; 1315 Indirect chain[4], chain2[4];
1307 ext4_lblk_t offset; 1316 Indirect *partial, *partial2;
1308 __le32 blk; 1317 ext4_lblk_t max_block;
1309 1318 __le32 nr = 0, nr2 = 0;
1310 inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level); 1319 int n = 0, n2 = 0;
1311 for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) { 1320 unsigned blocksize = inode->i_sb->s_blocksize;
1312 if (offset >= count + first)
1313 break;
1314 if (*i_data == 0 || (offset + inc) <= first)
1315 continue;
1316 blk = *i_data;
1317 if (level > 0) {
1318 ext4_lblk_t first2;
1319 ext4_lblk_t count2;
1320 1321
1321 bh = sb_bread(inode->i_sb, le32_to_cpu(blk)); 1322 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
1322 if (!bh) { 1323 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
1323 EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk), 1324 if (end >= max_block)
1324 "Read failure"); 1325 end = max_block;
1325 return -EIO; 1326 if ((start >= end) || (start > max_block))
1326 } 1327 return 0;
1327 if (first > offset) { 1328
1328 first2 = first - offset; 1329 n = ext4_block_to_path(inode, start, offsets, NULL);
1329 count2 = count; 1330 n2 = ext4_block_to_path(inode, end, offsets2, NULL);
1331
1332 BUG_ON(n > n2);
1333
1334 if ((n == 1) && (n == n2)) {
1335 /* We're punching only within direct block range */
1336 ext4_free_data(handle, inode, NULL, i_data + offsets[0],
1337 i_data + offsets2[0]);
1338 return 0;
1339 } else if (n2 > n) {
1340 /*
1341 * Start and end are on a different levels so we're going to
1342 * free partial block at start, and partial block at end of
1343 * the range. If there are some levels in between then
1344 * do_indirects label will take care of that.
1345 */
1346
1347 if (n == 1) {
1348 /*
1349 * Start is at the direct block level, free
1350 * everything to the end of the level.
1351 */
1352 ext4_free_data(handle, inode, NULL, i_data + offsets[0],
1353 i_data + EXT4_NDIR_BLOCKS);
1354 goto end_range;
1355 }
1356
1357
1358 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
1359 if (nr) {
1360 if (partial == chain) {
1361 /* Shared branch grows from the inode */
1362 ext4_free_branches(handle, inode, NULL,
1363 &nr, &nr+1, (chain+n-1) - partial);
1364 *partial->p = 0;
1330 } else { 1365 } else {
1331 first2 = 0; 1366 /* Shared branch grows from an indirect block */
1332 count2 = count - (offset - first); 1367 BUFFER_TRACE(partial->bh, "get_write_access");
1368 ext4_free_branches(handle, inode, partial->bh,
1369 partial->p,
1370 partial->p+1, (chain+n-1) - partial);
1333 } 1371 }
1334 ret = free_hole_blocks(handle, inode, bh, 1372 }
1335 (__le32 *)bh->b_data, level - 1, 1373
1336 first2, count2, 1374 /*
1337 inode->i_sb->s_blocksize >> 2); 1375 * Clear the ends of indirect blocks on the shared branch
1338 if (ret) { 1376 * at the start of the range
1339 brelse(bh); 1377 */
1340 goto err; 1378 while (partial > chain) {
1379 ext4_free_branches(handle, inode, partial->bh,
1380 partial->p + 1,
1381 (__le32 *)partial->bh->b_data+addr_per_block,
1382 (chain+n-1) - partial);
1383 BUFFER_TRACE(partial->bh, "call brelse");
1384 brelse(partial->bh);
1385 partial--;
1386 }
1387
1388end_range:
1389 partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
1390 if (nr2) {
1391 if (partial2 == chain2) {
1392 /*
1393 * Remember, end is exclusive so here we're at
1394 * the start of the next level we're not going
1395 * to free. Everything was covered by the start
1396 * of the range.
1397 */
1398 return 0;
1399 } else {
1400 /* Shared branch grows from an indirect block */
1401 partial2--;
1341 } 1402 }
1403 } else {
1404 /*
1405 * ext4_find_shared returns Indirect structure which
1406 * points to the last element which should not be
1407 * removed by truncate. But this is end of the range
1408 * in punch_hole so we need to point to the next element
1409 */
1410 partial2->p++;
1342 } 1411 }
1343 if (level == 0 || 1412
1344 (bh && all_zeroes((__le32 *)bh->b_data, 1413 /*
1345 (__le32 *)bh->b_data + addr_per_block))) { 1414 * Clear the ends of indirect blocks on the shared branch
1346 ext4_free_data(handle, inode, parent_bh, 1415 * at the end of the range
1347 i_data, i_data + 1); 1416 */
1417 while (partial2 > chain2) {
1418 ext4_free_branches(handle, inode, partial2->bh,
1419 (__le32 *)partial2->bh->b_data,
1420 partial2->p,
1421 (chain2+n2-1) - partial2);
1422 BUFFER_TRACE(partial2->bh, "call brelse");
1423 brelse(partial2->bh);
1424 partial2--;
1348 } 1425 }
1349 brelse(bh); 1426 goto do_indirects;
1350 bh = NULL;
1351 } 1427 }
1352 1428
1353err: 1429 /* Punch happened within the same level (n == n2) */
1354 return ret; 1430 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
1355} 1431 partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
1356 1432 /*
1357int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, 1433 * ext4_find_shared returns Indirect structure which
1358 ext4_lblk_t first, ext4_lblk_t stop) 1434 * points to the last element which should not be
1359{ 1435 * removed by truncate. But this is end of the range
1360 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 1436 * in punch_hole so we need to point to the next element
1361 int level, ret = 0; 1437 */
1362 int num = EXT4_NDIR_BLOCKS; 1438 partial2->p++;
1363 ext4_lblk_t count, max = EXT4_NDIR_BLOCKS; 1439 while ((partial > chain) || (partial2 > chain2)) {
1364 __le32 *i_data = EXT4_I(inode)->i_data; 1440 /* We're at the same block, so we're almost finished */
1365 1441 if ((partial->bh && partial2->bh) &&
1366 count = stop - first; 1442 (partial->bh->b_blocknr == partial2->bh->b_blocknr)) {
1367 for (level = 0; level < 4; level++, max *= addr_per_block) { 1443 if ((partial > chain) && (partial2 > chain2)) {
1368 if (first < max) { 1444 ext4_free_branches(handle, inode, partial->bh,
1369 ret = free_hole_blocks(handle, inode, NULL, i_data, 1445 partial->p + 1,
1370 level, first, count, num); 1446 partial2->p,
1371 if (ret) 1447 (chain+n-1) - partial);
1372 goto err; 1448 BUFFER_TRACE(partial->bh, "call brelse");
1373 if (count > max - first) 1449 brelse(partial->bh);
1374 count -= max - first; 1450 BUFFER_TRACE(partial2->bh, "call brelse");
1375 else 1451 brelse(partial2->bh);
1376 break; 1452 }
1377 first = 0; 1453 return 0;
1378 } else {
1379 first -= max;
1380 } 1454 }
1381 i_data += num; 1455 /*
1382 if (level == 0) { 1456 * Clear the ends of indirect blocks on the shared branch
1383 num = 1; 1457 * at the start of the range
1384 max = 1; 1458 */
1459 if (partial > chain) {
1460 ext4_free_branches(handle, inode, partial->bh,
1461 partial->p + 1,
1462 (__le32 *)partial->bh->b_data+addr_per_block,
1463 (chain+n-1) - partial);
1464 BUFFER_TRACE(partial->bh, "call brelse");
1465 brelse(partial->bh);
1466 partial--;
1467 }
1468 /*
1469 * Clear the ends of indirect blocks on the shared branch
1470 * at the end of the range
1471 */
1472 if (partial2 > chain2) {
1473 ext4_free_branches(handle, inode, partial2->bh,
1474 (__le32 *)partial2->bh->b_data,
1475 partial2->p,
1476 (chain2+n-1) - partial2);
1477 BUFFER_TRACE(partial2->bh, "call brelse");
1478 brelse(partial2->bh);
1479 partial2--;
1385 } 1480 }
1386 } 1481 }
1387 1482
1388err: 1483do_indirects:
1389 return ret; 1484 /* Kill the remaining (whole) subtrees */
1485 switch (offsets[0]) {
1486 default:
1487 if (++n >= n2)
1488 return 0;
1489 nr = i_data[EXT4_IND_BLOCK];
1490 if (nr) {
1491 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
1492 i_data[EXT4_IND_BLOCK] = 0;
1493 }
1494 case EXT4_IND_BLOCK:
1495 if (++n >= n2)
1496 return 0;
1497 nr = i_data[EXT4_DIND_BLOCK];
1498 if (nr) {
1499 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
1500 i_data[EXT4_DIND_BLOCK] = 0;
1501 }
1502 case EXT4_DIND_BLOCK:
1503 if (++n >= n2)
1504 return 0;
1505 nr = i_data[EXT4_TIND_BLOCK];
1506 if (nr) {
1507 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
1508 i_data[EXT4_TIND_BLOCK] = 0;
1509 }
1510 case EXT4_TIND_BLOCK:
1511 ;
1512 }
1513 return 0;
1390} 1514}
1391
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 645205d8ada6..bea662bd0ca6 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -120,12 +120,6 @@ int ext4_get_max_inline_size(struct inode *inode)
120 return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; 120 return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE;
121} 121}
122 122
123int ext4_has_inline_data(struct inode *inode)
124{
125 return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
126 EXT4_I(inode)->i_inline_off;
127}
128
129/* 123/*
130 * this function does not take xattr_sem, which is OK because it is 124 * this function does not take xattr_sem, which is OK because it is
131 * currently only used in a code path coming form ext4_iget, before 125 * currently only used in a code path coming form ext4_iget, before
@@ -1178,6 +1172,18 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
1178 if (error < 0) 1172 if (error < 0)
1179 goto out; 1173 goto out;
1180 1174
1175 /*
1176 * Make sure the inline directory entries pass checks before we try to
1177 * convert them, so that we avoid touching stuff that needs fsck.
1178 */
1179 if (S_ISDIR(inode->i_mode)) {
1180 error = ext4_check_all_de(inode, iloc->bh,
1181 buf + EXT4_INLINE_DOTDOT_SIZE,
1182 inline_size - EXT4_INLINE_DOTDOT_SIZE);
1183 if (error)
1184 goto out;
1185 }
1186
1181 error = ext4_destroy_inline_data_nolock(handle, inode); 1187 error = ext4_destroy_inline_data_nolock(handle, inode);
1182 if (error) 1188 if (error)
1183 goto out; 1189 goto out;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8a064734e6eb..367a60c07cf0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -325,18 +325,6 @@ qsize_t *ext4_get_reserved_space(struct inode *inode)
325#endif 325#endif
326 326
327/* 327/*
328 * Calculate the number of metadata blocks need to reserve
329 * to allocate a block located at @lblock
330 */
331static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
332{
333 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
334 return ext4_ext_calc_metadata_amount(inode, lblock);
335
336 return ext4_ind_calc_metadata_amount(inode, lblock);
337}
338
339/*
340 * Called with i_data_sem down, which is important since we can call 328 * Called with i_data_sem down, which is important since we can call
341 * ext4_discard_preallocations() from here. 329 * ext4_discard_preallocations() from here.
342 */ 330 */
@@ -357,35 +345,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
357 used = ei->i_reserved_data_blocks; 345 used = ei->i_reserved_data_blocks;
358 } 346 }
359 347
360 if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
361 ext4_warning(inode->i_sb, "ino %lu, allocated %d "
362 "with only %d reserved metadata blocks "
363 "(releasing %d blocks with reserved %d data blocks)",
364 inode->i_ino, ei->i_allocated_meta_blocks,
365 ei->i_reserved_meta_blocks, used,
366 ei->i_reserved_data_blocks);
367 WARN_ON(1);
368 ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
369 }
370
371 /* Update per-inode reservations */ 348 /* Update per-inode reservations */
372 ei->i_reserved_data_blocks -= used; 349 ei->i_reserved_data_blocks -= used;
373 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 350 percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);
374 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
375 used + ei->i_allocated_meta_blocks);
376 ei->i_allocated_meta_blocks = 0;
377 351
378 if (ei->i_reserved_data_blocks == 0) {
379 /*
380 * We can release all of the reserved metadata blocks
381 * only when we have written all of the delayed
382 * allocation blocks.
383 */
384 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
385 ei->i_reserved_meta_blocks);
386 ei->i_reserved_meta_blocks = 0;
387 ei->i_da_metadata_calc_len = 0;
388 }
389 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 352 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
390 353
391 /* Update quota subsystem for data blocks */ 354 /* Update quota subsystem for data blocks */
@@ -1222,49 +1185,6 @@ static int ext4_journalled_write_end(struct file *file,
1222} 1185}
1223 1186
1224/* 1187/*
1225 * Reserve a metadata for a single block located at lblock
1226 */
1227static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
1228{
1229 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1230 struct ext4_inode_info *ei = EXT4_I(inode);
1231 unsigned int md_needed;
1232 ext4_lblk_t save_last_lblock;
1233 int save_len;
1234
1235 /*
1236 * recalculate the amount of metadata blocks to reserve
1237 * in order to allocate nrblocks
1238 * worse case is one extent per block
1239 */
1240 spin_lock(&ei->i_block_reservation_lock);
1241 /*
1242 * ext4_calc_metadata_amount() has side effects, which we have
1243 * to be prepared undo if we fail to claim space.
1244 */
1245 save_len = ei->i_da_metadata_calc_len;
1246 save_last_lblock = ei->i_da_metadata_calc_last_lblock;
1247 md_needed = EXT4_NUM_B2C(sbi,
1248 ext4_calc_metadata_amount(inode, lblock));
1249 trace_ext4_da_reserve_space(inode, md_needed);
1250
1251 /*
1252 * We do still charge estimated metadata to the sb though;
1253 * we cannot afford to run out of free blocks.
1254 */
1255 if (ext4_claim_free_clusters(sbi, md_needed, 0)) {
1256 ei->i_da_metadata_calc_len = save_len;
1257 ei->i_da_metadata_calc_last_lblock = save_last_lblock;
1258 spin_unlock(&ei->i_block_reservation_lock);
1259 return -ENOSPC;
1260 }
1261 ei->i_reserved_meta_blocks += md_needed;
1262 spin_unlock(&ei->i_block_reservation_lock);
1263
1264 return 0; /* success */
1265}
1266
1267/*
1268 * Reserve a single cluster located at lblock 1188 * Reserve a single cluster located at lblock
1269 */ 1189 */
1270static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) 1190static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
@@ -1273,8 +1193,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1273 struct ext4_inode_info *ei = EXT4_I(inode); 1193 struct ext4_inode_info *ei = EXT4_I(inode);
1274 unsigned int md_needed; 1194 unsigned int md_needed;
1275 int ret; 1195 int ret;
1276 ext4_lblk_t save_last_lblock;
1277 int save_len;
1278 1196
1279 /* 1197 /*
1280 * We will charge metadata quota at writeout time; this saves 1198 * We will charge metadata quota at writeout time; this saves
@@ -1295,25 +1213,15 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1295 * ext4_calc_metadata_amount() has side effects, which we have 1213 * ext4_calc_metadata_amount() has side effects, which we have
1296 * to be prepared undo if we fail to claim space. 1214 * to be prepared undo if we fail to claim space.
1297 */ 1215 */
1298 save_len = ei->i_da_metadata_calc_len; 1216 md_needed = 0;
1299 save_last_lblock = ei->i_da_metadata_calc_last_lblock; 1217 trace_ext4_da_reserve_space(inode, 0);
1300 md_needed = EXT4_NUM_B2C(sbi,
1301 ext4_calc_metadata_amount(inode, lblock));
1302 trace_ext4_da_reserve_space(inode, md_needed);
1303 1218
1304 /* 1219 if (ext4_claim_free_clusters(sbi, 1, 0)) {
1305 * We do still charge estimated metadata to the sb though;
1306 * we cannot afford to run out of free blocks.
1307 */
1308 if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) {
1309 ei->i_da_metadata_calc_len = save_len;
1310 ei->i_da_metadata_calc_last_lblock = save_last_lblock;
1311 spin_unlock(&ei->i_block_reservation_lock); 1220 spin_unlock(&ei->i_block_reservation_lock);
1312 dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); 1221 dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
1313 return -ENOSPC; 1222 return -ENOSPC;
1314 } 1223 }
1315 ei->i_reserved_data_blocks++; 1224 ei->i_reserved_data_blocks++;
1316 ei->i_reserved_meta_blocks += md_needed;
1317 spin_unlock(&ei->i_block_reservation_lock); 1225 spin_unlock(&ei->i_block_reservation_lock);
1318 1226
1319 return 0; /* success */ 1227 return 0; /* success */
@@ -1346,20 +1254,6 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1346 } 1254 }
1347 ei->i_reserved_data_blocks -= to_free; 1255 ei->i_reserved_data_blocks -= to_free;
1348 1256
1349 if (ei->i_reserved_data_blocks == 0) {
1350 /*
1351 * We can release all of the reserved metadata blocks
1352 * only when we have written all of the delayed
1353 * allocation blocks.
1354 * Note that in case of bigalloc, i_reserved_meta_blocks,
1355 * i_reserved_data_blocks, etc. refer to number of clusters.
1356 */
1357 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
1358 ei->i_reserved_meta_blocks);
1359 ei->i_reserved_meta_blocks = 0;
1360 ei->i_da_metadata_calc_len = 0;
1361 }
1362
1363 /* update fs dirty data blocks counter */ 1257 /* update fs dirty data blocks counter */
1364 percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); 1258 percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
1365 1259
@@ -1500,10 +1394,6 @@ static void ext4_print_free_blocks(struct inode *inode)
1500 ext4_msg(sb, KERN_CRIT, "Block reservation details"); 1394 ext4_msg(sb, KERN_CRIT, "Block reservation details");
1501 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", 1395 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
1502 ei->i_reserved_data_blocks); 1396 ei->i_reserved_data_blocks);
1503 ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
1504 ei->i_reserved_meta_blocks);
1505 ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u",
1506 ei->i_allocated_meta_blocks);
1507 return; 1397 return;
1508} 1398}
1509 1399
@@ -1620,13 +1510,6 @@ add_delayed:
1620 retval = ret; 1510 retval = ret;
1621 goto out_unlock; 1511 goto out_unlock;
1622 } 1512 }
1623 } else {
1624 ret = ext4_da_reserve_metadata(inode, iblock);
1625 if (ret) {
1626 /* not enough space to reserve */
1627 retval = ret;
1628 goto out_unlock;
1629 }
1630 } 1513 }
1631 1514
1632 ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, 1515 ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
@@ -2843,8 +2726,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
2843{ 2726{
2844 trace_ext4_alloc_da_blocks(inode); 2727 trace_ext4_alloc_da_blocks(inode);
2845 2728
2846 if (!EXT4_I(inode)->i_reserved_data_blocks && 2729 if (!EXT4_I(inode)->i_reserved_data_blocks)
2847 !EXT4_I(inode)->i_reserved_meta_blocks)
2848 return 0; 2730 return 0;
2849 2731
2850 /* 2732 /*
@@ -3624,7 +3506,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
3624 ret = ext4_ext_remove_space(inode, first_block, 3506 ret = ext4_ext_remove_space(inode, first_block,
3625 stop_block - 1); 3507 stop_block - 1);
3626 else 3508 else
3627 ret = ext4_free_hole_blocks(handle, inode, first_block, 3509 ret = ext4_ind_remove_space(handle, inode, first_block,
3628 stop_block); 3510 stop_block);
3629 3511
3630 up_write(&EXT4_I(inode)->i_data_sem); 3512 up_write(&EXT4_I(inode)->i_data_sem);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 2dcb936be90e..956027711faf 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3075,8 +3075,9 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3075 (23 - bsbits)) << 23; 3075 (23 - bsbits)) << 23;
3076 size = 8 * 1024 * 1024; 3076 size = 8 * 1024 * 1024;
3077 } else { 3077 } else {
3078 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; 3078 start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
3079 size = ac->ac_o_ex.fe_len << bsbits; 3079 size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
3080 ac->ac_o_ex.fe_len) << bsbits;
3080 } 3081 }
3081 size = size >> bsbits; 3082 size = size >> bsbits;
3082 start = start_off >> bsbits; 3083 start = start_off >> bsbits;
@@ -3216,8 +3217,27 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3216static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) 3217static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3217{ 3218{
3218 struct ext4_prealloc_space *pa = ac->ac_pa; 3219 struct ext4_prealloc_space *pa = ac->ac_pa;
3220 struct ext4_buddy e4b;
3221 int err;
3219 3222
3220 if (pa && pa->pa_type == MB_INODE_PA) 3223 if (pa == NULL) {
3224 err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
3225 if (err) {
3226 /*
3227 * This should never happen since we pin the
3228 * pages in the ext4_allocation_context so
3229 * ext4_mb_load_buddy() should never fail.
3230 */
3231 WARN(1, "mb_load_buddy failed (%d)", err);
3232 return;
3233 }
3234 ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3235 mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
3236 ac->ac_f_ex.fe_len);
3237 ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3238 return;
3239 }
3240 if (pa->pa_type == MB_INODE_PA)
3221 pa->pa_free += ac->ac_b_ex.fe_len; 3241 pa->pa_free += ac->ac_b_ex.fe_len;
3222} 3242}
3223 3243
@@ -4627,7 +4647,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4627 struct buffer_head *gd_bh; 4647 struct buffer_head *gd_bh;
4628 ext4_group_t block_group; 4648 ext4_group_t block_group;
4629 struct ext4_sb_info *sbi; 4649 struct ext4_sb_info *sbi;
4630 struct ext4_inode_info *ei = EXT4_I(inode);
4631 struct ext4_buddy e4b; 4650 struct ext4_buddy e4b;
4632 unsigned int count_clusters; 4651 unsigned int count_clusters;
4633 int err = 0; 4652 int err = 0;
@@ -4838,19 +4857,7 @@ do_more:
4838 &sbi->s_flex_groups[flex_group].free_clusters); 4857 &sbi->s_flex_groups[flex_group].free_clusters);
4839 } 4858 }
4840 4859
4841 if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) { 4860 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4842 percpu_counter_add(&sbi->s_dirtyclusters_counter,
4843 count_clusters);
4844 spin_lock(&ei->i_block_reservation_lock);
4845 if (flags & EXT4_FREE_BLOCKS_METADATA)
4846 ei->i_reserved_meta_blocks += count_clusters;
4847 else
4848 ei->i_reserved_data_blocks += count_clusters;
4849 spin_unlock(&ei->i_block_reservation_lock);
4850 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4851 dquot_reclaim_block(inode,
4852 EXT4_C2B(sbi, count_clusters));
4853 } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4854 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); 4861 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
4855 percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); 4862 percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
4856 4863
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index ec092437d3e0..d3567f27bae7 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -39,6 +39,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
39 newext.ee_block = cpu_to_le32(lb->first_block); 39 newext.ee_block = cpu_to_le32(lb->first_block);
40 newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); 40 newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
41 ext4_ext_store_pblock(&newext, lb->first_pblock); 41 ext4_ext_store_pblock(&newext, lb->first_pblock);
42 /* Locking only for convinience since we are operating on temp inode */
43 down_write(&EXT4_I(inode)->i_data_sem);
42 path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0); 44 path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0);
43 45
44 if (IS_ERR(path)) { 46 if (IS_ERR(path)) {
@@ -61,7 +63,9 @@ static int finish_range(handle_t *handle, struct inode *inode,
61 */ 63 */
62 if (needed && ext4_handle_has_enough_credits(handle, 64 if (needed && ext4_handle_has_enough_credits(handle,
63 EXT4_RESERVE_TRANS_BLOCKS)) { 65 EXT4_RESERVE_TRANS_BLOCKS)) {
66 up_write((&EXT4_I(inode)->i_data_sem));
64 retval = ext4_journal_restart(handle, needed); 67 retval = ext4_journal_restart(handle, needed);
68 down_write((&EXT4_I(inode)->i_data_sem));
65 if (retval) 69 if (retval)
66 goto err_out; 70 goto err_out;
67 } else if (needed) { 71 } else if (needed) {
@@ -70,13 +74,16 @@ static int finish_range(handle_t *handle, struct inode *inode,
70 /* 74 /*
71 * IF not able to extend the journal restart the journal 75 * IF not able to extend the journal restart the journal
72 */ 76 */
77 up_write((&EXT4_I(inode)->i_data_sem));
73 retval = ext4_journal_restart(handle, needed); 78 retval = ext4_journal_restart(handle, needed);
79 down_write((&EXT4_I(inode)->i_data_sem));
74 if (retval) 80 if (retval)
75 goto err_out; 81 goto err_out;
76 } 82 }
77 } 83 }
78 retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); 84 retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
79err_out: 85err_out:
86 up_write((&EXT4_I(inode)->i_data_sem));
80 if (path) { 87 if (path) {
81 ext4_ext_drop_refs(path); 88 ext4_ext_drop_refs(path);
82 kfree(path); 89 kfree(path);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 2484c7ec6a72..671a74b14fd7 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -1013,10 +1013,11 @@ data_copy:
1013 *err = -EBUSY; 1013 *err = -EBUSY;
1014 goto unlock_pages; 1014 goto unlock_pages;
1015 } 1015 }
1016 1016 ext4_double_down_write_data_sem(orig_inode, donor_inode);
1017 replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, 1017 replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
1018 orig_blk_offset, 1018 orig_blk_offset,
1019 block_len_in_page, err); 1019 block_len_in_page, err);
1020 ext4_double_up_write_data_sem(orig_inode, donor_inode);
1020 if (*err) { 1021 if (*err) {
1021 if (replaced_count) { 1022 if (replaced_count) {
1022 block_len_in_page = replaced_count; 1023 block_len_in_page = replaced_count;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3520ab8a6639..b147a67baa0d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3455,7 +3455,6 @@ const struct inode_operations ext4_dir_inode_operations = {
3455 .rmdir = ext4_rmdir, 3455 .rmdir = ext4_rmdir,
3456 .mknod = ext4_mknod, 3456 .mknod = ext4_mknod,
3457 .tmpfile = ext4_tmpfile, 3457 .tmpfile = ext4_tmpfile,
3458 .rename = ext4_rename,
3459 .rename2 = ext4_rename2, 3458 .rename2 = ext4_rename2,
3460 .setattr = ext4_setattr, 3459 .setattr = ext4_setattr,
3461 .setxattr = generic_setxattr, 3460 .setxattr = generic_setxattr,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6df7bc611dbd..32b43ad154b9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2142,10 +2142,6 @@ static int ext4_check_descriptors(struct super_block *sb,
2142 } 2142 }
2143 if (NULL != first_not_zeroed) 2143 if (NULL != first_not_zeroed)
2144 *first_not_zeroed = grp; 2144 *first_not_zeroed = grp;
2145
2146 ext4_free_blocks_count_set(sbi->s_es,
2147 EXT4_C2B(sbi, ext4_count_free_clusters(sb)));
2148 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
2149 return 1; 2145 return 1;
2150} 2146}
2151 2147
@@ -3883,13 +3879,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3883 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); 3879 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
3884 goto failed_mount2; 3880 goto failed_mount2;
3885 } 3881 }
3886 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
3887 if (!ext4_fill_flex_info(sb)) {
3888 ext4_msg(sb, KERN_ERR,
3889 "unable to initialize "
3890 "flex_bg meta info!");
3891 goto failed_mount2;
3892 }
3893 3882
3894 sbi->s_gdb_count = db_count; 3883 sbi->s_gdb_count = db_count;
3895 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 3884 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
@@ -3902,23 +3891,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3902 /* Register extent status tree shrinker */ 3891 /* Register extent status tree shrinker */
3903 ext4_es_register_shrinker(sbi); 3892 ext4_es_register_shrinker(sbi);
3904 3893
3905 err = percpu_counter_init(&sbi->s_freeclusters_counter, 3894 if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) {
3906 ext4_count_free_clusters(sb));
3907 if (!err) {
3908 err = percpu_counter_init(&sbi->s_freeinodes_counter,
3909 ext4_count_free_inodes(sb));
3910 }
3911 if (!err) {
3912 err = percpu_counter_init(&sbi->s_dirs_counter,
3913 ext4_count_dirs(sb));
3914 }
3915 if (!err) {
3916 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
3917 }
3918 if (!err) {
3919 err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0);
3920 }
3921 if (err) {
3922 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3895 ext4_msg(sb, KERN_ERR, "insufficient memory");
3923 goto failed_mount3; 3896 goto failed_mount3;
3924 } 3897 }
@@ -4022,18 +3995,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4022 3995
4023 sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; 3996 sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
4024 3997
4025 /*
4026 * The journal may have updated the bg summary counts, so we
4027 * need to update the global counters.
4028 */
4029 percpu_counter_set(&sbi->s_freeclusters_counter,
4030 ext4_count_free_clusters(sb));
4031 percpu_counter_set(&sbi->s_freeinodes_counter,
4032 ext4_count_free_inodes(sb));
4033 percpu_counter_set(&sbi->s_dirs_counter,
4034 ext4_count_dirs(sb));
4035 percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
4036
4037no_journal: 3998no_journal:
4038 if (ext4_mballoc_ready) { 3999 if (ext4_mballoc_ready) {
4039 sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); 4000 sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
@@ -4141,6 +4102,33 @@ no_journal:
4141 goto failed_mount5; 4102 goto failed_mount5;
4142 } 4103 }
4143 4104
4105 block = ext4_count_free_clusters(sb);
4106 ext4_free_blocks_count_set(sbi->s_es,
4107 EXT4_C2B(sbi, block));
4108 err = percpu_counter_init(&sbi->s_freeclusters_counter, block);
4109 if (!err) {
4110 unsigned long freei = ext4_count_free_inodes(sb);
4111 sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
4112 err = percpu_counter_init(&sbi->s_freeinodes_counter, freei);
4113 }
4114 if (!err)
4115 err = percpu_counter_init(&sbi->s_dirs_counter,
4116 ext4_count_dirs(sb));
4117 if (!err)
4118 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
4119 if (err) {
4120 ext4_msg(sb, KERN_ERR, "insufficient memory");
4121 goto failed_mount6;
4122 }
4123
4124 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
4125 if (!ext4_fill_flex_info(sb)) {
4126 ext4_msg(sb, KERN_ERR,
4127 "unable to initialize "
4128 "flex_bg meta info!");
4129 goto failed_mount6;
4130 }
4131
4144 err = ext4_register_li_request(sb, first_not_zeroed); 4132 err = ext4_register_li_request(sb, first_not_zeroed);
4145 if (err) 4133 if (err)
4146 goto failed_mount6; 4134 goto failed_mount6;
@@ -4215,6 +4203,12 @@ failed_mount7:
4215 ext4_unregister_li_request(sb); 4203 ext4_unregister_li_request(sb);
4216failed_mount6: 4204failed_mount6:
4217 ext4_mb_release(sb); 4205 ext4_mb_release(sb);
4206 if (sbi->s_flex_groups)
4207 ext4_kvfree(sbi->s_flex_groups);
4208 percpu_counter_destroy(&sbi->s_freeclusters_counter);
4209 percpu_counter_destroy(&sbi->s_freeinodes_counter);
4210 percpu_counter_destroy(&sbi->s_dirs_counter);
4211 percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
4218failed_mount5: 4212failed_mount5:
4219 ext4_ext_release(sb); 4213 ext4_ext_release(sb);
4220 ext4_release_system_zone(sb); 4214 ext4_release_system_zone(sb);
@@ -4233,12 +4227,6 @@ failed_mount_wq:
4233failed_mount3: 4227failed_mount3:
4234 ext4_es_unregister_shrinker(sbi); 4228 ext4_es_unregister_shrinker(sbi);
4235 del_timer_sync(&sbi->s_err_report); 4229 del_timer_sync(&sbi->s_err_report);
4236 if (sbi->s_flex_groups)
4237 ext4_kvfree(sbi->s_flex_groups);
4238 percpu_counter_destroy(&sbi->s_freeclusters_counter);
4239 percpu_counter_destroy(&sbi->s_freeinodes_counter);
4240 percpu_counter_destroy(&sbi->s_dirs_counter);
4241 percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
4242 percpu_counter_destroy(&sbi->s_extent_cache_cnt); 4230 percpu_counter_destroy(&sbi->s_extent_cache_cnt);
4243 if (sbi->s_mmp_tsk) 4231 if (sbi->s_mmp_tsk)
4244 kthread_stop(sbi->s_mmp_tsk); 4232 kthread_stop(sbi->s_mmp_tsk);
@@ -4556,11 +4544,13 @@ static int ext4_commit_super(struct super_block *sb, int sync)
4556 else 4544 else
4557 es->s_kbytes_written = 4545 es->s_kbytes_written =
4558 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); 4546 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
4559 ext4_free_blocks_count_set(es, 4547 if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter))
4548 ext4_free_blocks_count_set(es,
4560 EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( 4549 EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
4561 &EXT4_SB(sb)->s_freeclusters_counter))); 4550 &EXT4_SB(sb)->s_freeclusters_counter)));
4562 es->s_free_inodes_count = 4551 if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
4563 cpu_to_le32(percpu_counter_sum_positive( 4552 es->s_free_inodes_count =
4553 cpu_to_le32(percpu_counter_sum_positive(
4564 &EXT4_SB(sb)->s_freeinodes_counter)); 4554 &EXT4_SB(sb)->s_freeinodes_counter));
4565 BUFFER_TRACE(sbh, "marking dirty"); 4555 BUFFER_TRACE(sbh, "marking dirty");
4566 ext4_superblock_csum_set(sb); 4556 ext4_superblock_csum_set(sb);
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index dbe2141d10ad..83b9b5a8d112 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -203,12 +203,6 @@ static int __f2fs_set_acl(struct inode *inode, int type,
203 size_t size = 0; 203 size_t size = 0;
204 int error; 204 int error;
205 205
206 if (acl) {
207 error = posix_acl_valid(acl);
208 if (error < 0)
209 return error;
210 }
211
212 switch (type) { 206 switch (type) {
213 case ACL_TYPE_ACCESS: 207 case ACL_TYPE_ACCESS:
214 name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; 208 name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 0b4710c1d370..6aeed5bada52 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -22,7 +22,7 @@
22#include "segment.h" 22#include "segment.h"
23#include <trace/events/f2fs.h> 23#include <trace/events/f2fs.h>
24 24
25static struct kmem_cache *orphan_entry_slab; 25static struct kmem_cache *ino_entry_slab;
26static struct kmem_cache *inode_entry_slab; 26static struct kmem_cache *inode_entry_slab;
27 27
28/* 28/*
@@ -282,72 +282,120 @@ const struct address_space_operations f2fs_meta_aops = {
282 .set_page_dirty = f2fs_set_meta_page_dirty, 282 .set_page_dirty = f2fs_set_meta_page_dirty,
283}; 283};
284 284
285static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
286{
287 struct ino_entry *e;
288retry:
289 spin_lock(&sbi->ino_lock[type]);
290
291 e = radix_tree_lookup(&sbi->ino_root[type], ino);
292 if (!e) {
293 e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC);
294 if (!e) {
295 spin_unlock(&sbi->ino_lock[type]);
296 goto retry;
297 }
298 if (radix_tree_insert(&sbi->ino_root[type], ino, e)) {
299 spin_unlock(&sbi->ino_lock[type]);
300 kmem_cache_free(ino_entry_slab, e);
301 goto retry;
302 }
303 memset(e, 0, sizeof(struct ino_entry));
304 e->ino = ino;
305
306 list_add_tail(&e->list, &sbi->ino_list[type]);
307 }
308 spin_unlock(&sbi->ino_lock[type]);
309}
310
311static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
312{
313 struct ino_entry *e;
314
315 spin_lock(&sbi->ino_lock[type]);
316 e = radix_tree_lookup(&sbi->ino_root[type], ino);
317 if (e) {
318 list_del(&e->list);
319 radix_tree_delete(&sbi->ino_root[type], ino);
320 if (type == ORPHAN_INO)
321 sbi->n_orphans--;
322 spin_unlock(&sbi->ino_lock[type]);
323 kmem_cache_free(ino_entry_slab, e);
324 return;
325 }
326 spin_unlock(&sbi->ino_lock[type]);
327}
328
329void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
330{
331 /* add new dirty ino entry into list */
332 __add_ino_entry(sbi, ino, type);
333}
334
335void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
336{
337 /* remove dirty ino entry from list */
338 __remove_ino_entry(sbi, ino, type);
339}
340
341/* mode should be APPEND_INO or UPDATE_INO */
342bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
343{
344 struct ino_entry *e;
345 spin_lock(&sbi->ino_lock[mode]);
346 e = radix_tree_lookup(&sbi->ino_root[mode], ino);
347 spin_unlock(&sbi->ino_lock[mode]);
348 return e ? true : false;
349}
350
351static void release_dirty_inode(struct f2fs_sb_info *sbi)
352{
353 struct ino_entry *e, *tmp;
354 int i;
355
356 for (i = APPEND_INO; i <= UPDATE_INO; i++) {
357 spin_lock(&sbi->ino_lock[i]);
358 list_for_each_entry_safe(e, tmp, &sbi->ino_list[i], list) {
359 list_del(&e->list);
360 radix_tree_delete(&sbi->ino_root[i], e->ino);
361 kmem_cache_free(ino_entry_slab, e);
362 }
363 spin_unlock(&sbi->ino_lock[i]);
364 }
365}
366
285int acquire_orphan_inode(struct f2fs_sb_info *sbi) 367int acquire_orphan_inode(struct f2fs_sb_info *sbi)
286{ 368{
287 int err = 0; 369 int err = 0;
288 370
289 spin_lock(&sbi->orphan_inode_lock); 371 spin_lock(&sbi->ino_lock[ORPHAN_INO]);
290 if (unlikely(sbi->n_orphans >= sbi->max_orphans)) 372 if (unlikely(sbi->n_orphans >= sbi->max_orphans))
291 err = -ENOSPC; 373 err = -ENOSPC;
292 else 374 else
293 sbi->n_orphans++; 375 sbi->n_orphans++;
294 spin_unlock(&sbi->orphan_inode_lock); 376 spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
295 377
296 return err; 378 return err;
297} 379}
298 380
299void release_orphan_inode(struct f2fs_sb_info *sbi) 381void release_orphan_inode(struct f2fs_sb_info *sbi)
300{ 382{
301 spin_lock(&sbi->orphan_inode_lock); 383 spin_lock(&sbi->ino_lock[ORPHAN_INO]);
302 f2fs_bug_on(sbi->n_orphans == 0); 384 f2fs_bug_on(sbi->n_orphans == 0);
303 sbi->n_orphans--; 385 sbi->n_orphans--;
304 spin_unlock(&sbi->orphan_inode_lock); 386 spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
305} 387}
306 388
307void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 389void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
308{ 390{
309 struct list_head *head; 391 /* add new orphan ino entry into list */
310 struct orphan_inode_entry *new, *orphan; 392 __add_ino_entry(sbi, ino, ORPHAN_INO);
311
312 new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
313 new->ino = ino;
314
315 spin_lock(&sbi->orphan_inode_lock);
316 head = &sbi->orphan_inode_list;
317 list_for_each_entry(orphan, head, list) {
318 if (orphan->ino == ino) {
319 spin_unlock(&sbi->orphan_inode_lock);
320 kmem_cache_free(orphan_entry_slab, new);
321 return;
322 }
323
324 if (orphan->ino > ino)
325 break;
326 }
327
328 /* add new orphan entry into list which is sorted by inode number */
329 list_add_tail(&new->list, &orphan->list);
330 spin_unlock(&sbi->orphan_inode_lock);
331} 393}
332 394
333void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 395void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
334{ 396{
335 struct list_head *head; 397 /* remove orphan entry from orphan list */
336 struct orphan_inode_entry *orphan; 398 __remove_ino_entry(sbi, ino, ORPHAN_INO);
337
338 spin_lock(&sbi->orphan_inode_lock);
339 head = &sbi->orphan_inode_list;
340 list_for_each_entry(orphan, head, list) {
341 if (orphan->ino == ino) {
342 list_del(&orphan->list);
343 f2fs_bug_on(sbi->n_orphans == 0);
344 sbi->n_orphans--;
345 spin_unlock(&sbi->orphan_inode_lock);
346 kmem_cache_free(orphan_entry_slab, orphan);
347 return;
348 }
349 }
350 spin_unlock(&sbi->orphan_inode_lock);
351} 399}
352 400
353static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 401static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
@@ -401,14 +449,14 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
401 unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans + 449 unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans +
402 (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); 450 (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
403 struct page *page = NULL; 451 struct page *page = NULL;
404 struct orphan_inode_entry *orphan = NULL; 452 struct ino_entry *orphan = NULL;
405 453
406 for (index = 0; index < orphan_blocks; index++) 454 for (index = 0; index < orphan_blocks; index++)
407 grab_meta_page(sbi, start_blk + index); 455 grab_meta_page(sbi, start_blk + index);
408 456
409 index = 1; 457 index = 1;
410 spin_lock(&sbi->orphan_inode_lock); 458 spin_lock(&sbi->ino_lock[ORPHAN_INO]);
411 head = &sbi->orphan_inode_list; 459 head = &sbi->ino_list[ORPHAN_INO];
412 460
413 /* loop for each orphan inode entry and write them in Jornal block */ 461 /* loop for each orphan inode entry and write them in Jornal block */
414 list_for_each_entry(orphan, head, list) { 462 list_for_each_entry(orphan, head, list) {
@@ -448,7 +496,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
448 f2fs_put_page(page, 1); 496 f2fs_put_page(page, 1);
449 } 497 }
450 498
451 spin_unlock(&sbi->orphan_inode_lock); 499 spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
452} 500}
453 501
454static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, 502static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
@@ -714,10 +762,10 @@ retry_flush_dents:
714 * until finishing nat/sit flush. 762 * until finishing nat/sit flush.
715 */ 763 */
716retry_flush_nodes: 764retry_flush_nodes:
717 mutex_lock(&sbi->node_write); 765 down_write(&sbi->node_write);
718 766
719 if (get_pages(sbi, F2FS_DIRTY_NODES)) { 767 if (get_pages(sbi, F2FS_DIRTY_NODES)) {
720 mutex_unlock(&sbi->node_write); 768 up_write(&sbi->node_write);
721 sync_node_pages(sbi, 0, &wbc); 769 sync_node_pages(sbi, 0, &wbc);
722 goto retry_flush_nodes; 770 goto retry_flush_nodes;
723 } 771 }
@@ -726,7 +774,7 @@ retry_flush_nodes:
726 774
727static void unblock_operations(struct f2fs_sb_info *sbi) 775static void unblock_operations(struct f2fs_sb_info *sbi)
728{ 776{
729 mutex_unlock(&sbi->node_write); 777 up_write(&sbi->node_write);
730 f2fs_unlock_all(sbi); 778 f2fs_unlock_all(sbi);
731} 779}
732 780
@@ -748,6 +796,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
748static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) 796static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
749{ 797{
750 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); 798 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
799 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
751 nid_t last_nid = 0; 800 nid_t last_nid = 0;
752 block_t start_blk; 801 block_t start_blk;
753 struct page *cp_page; 802 struct page *cp_page;
@@ -761,7 +810,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
761 * This avoids to conduct wrong roll-forward operations and uses 810 * This avoids to conduct wrong roll-forward operations and uses
762 * metapages, so should be called prior to sync_meta_pages below. 811 * metapages, so should be called prior to sync_meta_pages below.
763 */ 812 */
764 discard_next_dnode(sbi); 813 discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg));
765 814
766 /* Flush all the NAT/SIT pages */ 815 /* Flush all the NAT/SIT pages */
767 while (get_pages(sbi, F2FS_DIRTY_META)) 816 while (get_pages(sbi, F2FS_DIRTY_META))
@@ -885,8 +934,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
885 /* Here, we only have one bio having CP pack */ 934 /* Here, we only have one bio having CP pack */
886 sync_meta_pages(sbi, META_FLUSH, LONG_MAX); 935 sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
887 936
888 if (unlikely(!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { 937 if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
889 clear_prefree_segments(sbi); 938 clear_prefree_segments(sbi);
939 release_dirty_inode(sbi);
890 F2FS_RESET_SB_DIRT(sbi); 940 F2FS_RESET_SB_DIRT(sbi);
891 } 941 }
892} 942}
@@ -932,31 +982,37 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
932 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); 982 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint");
933} 983}
934 984
935void init_orphan_info(struct f2fs_sb_info *sbi) 985void init_ino_entry_info(struct f2fs_sb_info *sbi)
936{ 986{
937 spin_lock_init(&sbi->orphan_inode_lock); 987 int i;
938 INIT_LIST_HEAD(&sbi->orphan_inode_list); 988
939 sbi->n_orphans = 0; 989 for (i = 0; i < MAX_INO_ENTRY; i++) {
990 INIT_RADIX_TREE(&sbi->ino_root[i], GFP_ATOMIC);
991 spin_lock_init(&sbi->ino_lock[i]);
992 INIT_LIST_HEAD(&sbi->ino_list[i]);
993 }
994
940 /* 995 /*
941 * considering 512 blocks in a segment 8 blocks are needed for cp 996 * considering 512 blocks in a segment 8 blocks are needed for cp
942 * and log segment summaries. Remaining blocks are used to keep 997 * and log segment summaries. Remaining blocks are used to keep
943 * orphan entries with the limitation one reserved segment 998 * orphan entries with the limitation one reserved segment
944 * for cp pack we can have max 1020*504 orphan entries 999 * for cp pack we can have max 1020*504 orphan entries
945 */ 1000 */
1001 sbi->n_orphans = 0;
946 sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) 1002 sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE)
947 * F2FS_ORPHANS_PER_BLOCK; 1003 * F2FS_ORPHANS_PER_BLOCK;
948} 1004}
949 1005
950int __init create_checkpoint_caches(void) 1006int __init create_checkpoint_caches(void)
951{ 1007{
952 orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", 1008 ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry",
953 sizeof(struct orphan_inode_entry)); 1009 sizeof(struct ino_entry));
954 if (!orphan_entry_slab) 1010 if (!ino_entry_slab)
955 return -ENOMEM; 1011 return -ENOMEM;
956 inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", 1012 inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
957 sizeof(struct dir_inode_entry)); 1013 sizeof(struct dir_inode_entry));
958 if (!inode_entry_slab) { 1014 if (!inode_entry_slab) {
959 kmem_cache_destroy(orphan_entry_slab); 1015 kmem_cache_destroy(ino_entry_slab);
960 return -ENOMEM; 1016 return -ENOMEM;
961 } 1017 }
962 return 0; 1018 return 0;
@@ -964,6 +1020,6 @@ int __init create_checkpoint_caches(void)
964 1020
965void destroy_checkpoint_caches(void) 1021void destroy_checkpoint_caches(void)
966{ 1022{
967 kmem_cache_destroy(orphan_entry_slab); 1023 kmem_cache_destroy(ino_entry_slab);
968 kmem_cache_destroy(inode_entry_slab); 1024 kmem_cache_destroy(inode_entry_slab);
969} 1025}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index f8cf619edb5f..03313099c51c 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -139,7 +139,10 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
139 /* change META to META_FLUSH in the checkpoint procedure */ 139 /* change META to META_FLUSH in the checkpoint procedure */
140 if (type >= META_FLUSH) { 140 if (type >= META_FLUSH) {
141 io->fio.type = META_FLUSH; 141 io->fio.type = META_FLUSH;
142 io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; 142 if (test_opt(sbi, NOBARRIER))
143 io->fio.rw = WRITE_FLUSH | REQ_META | REQ_PRIO;
144 else
145 io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
143 } 146 }
144 __submit_merged_bio(io); 147 __submit_merged_bio(io);
145 up_write(&io->io_rwsem); 148 up_write(&io->io_rwsem);
@@ -626,8 +629,10 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
626 if (check_extent_cache(inode, pgofs, bh_result)) 629 if (check_extent_cache(inode, pgofs, bh_result))
627 goto out; 630 goto out;
628 631
629 if (create) 632 if (create) {
633 f2fs_balance_fs(sbi);
630 f2fs_lock_op(sbi); 634 f2fs_lock_op(sbi);
635 }
631 636
632 /* When reading holes, we need its node page */ 637 /* When reading holes, we need its node page */
633 set_new_dnode(&dn, inode, NULL, NULL, 0); 638 set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -784,9 +789,11 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
784 !is_cold_data(page) && 789 !is_cold_data(page) &&
785 need_inplace_update(inode))) { 790 need_inplace_update(inode))) {
786 rewrite_data_page(page, old_blkaddr, fio); 791 rewrite_data_page(page, old_blkaddr, fio);
792 set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
787 } else { 793 } else {
788 write_data_page(page, &dn, &new_blkaddr, fio); 794 write_data_page(page, &dn, &new_blkaddr, fio);
789 update_extent_cache(new_blkaddr, &dn); 795 update_extent_cache(new_blkaddr, &dn);
796 set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
790 } 797 }
791out_writepage: 798out_writepage:
792 f2fs_put_dnode(&dn); 799 f2fs_put_dnode(&dn);
@@ -914,6 +921,16 @@ skip_write:
914 return 0; 921 return 0;
915} 922}
916 923
924static void f2fs_write_failed(struct address_space *mapping, loff_t to)
925{
926 struct inode *inode = mapping->host;
927
928 if (to > inode->i_size) {
929 truncate_pagecache(inode, inode->i_size);
930 truncate_blocks(inode, inode->i_size);
931 }
932}
933
917static int f2fs_write_begin(struct file *file, struct address_space *mapping, 934static int f2fs_write_begin(struct file *file, struct address_space *mapping,
918 loff_t pos, unsigned len, unsigned flags, 935 loff_t pos, unsigned len, unsigned flags,
919 struct page **pagep, void **fsdata) 936 struct page **pagep, void **fsdata)
@@ -931,11 +948,13 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
931repeat: 948repeat:
932 err = f2fs_convert_inline_data(inode, pos + len); 949 err = f2fs_convert_inline_data(inode, pos + len);
933 if (err) 950 if (err)
934 return err; 951 goto fail;
935 952
936 page = grab_cache_page_write_begin(mapping, index, flags); 953 page = grab_cache_page_write_begin(mapping, index, flags);
937 if (!page) 954 if (!page) {
938 return -ENOMEM; 955 err = -ENOMEM;
956 goto fail;
957 }
939 958
940 /* to avoid latency during memory pressure */ 959 /* to avoid latency during memory pressure */
941 unlock_page(page); 960 unlock_page(page);
@@ -949,10 +968,9 @@ repeat:
949 set_new_dnode(&dn, inode, NULL, NULL, 0); 968 set_new_dnode(&dn, inode, NULL, NULL, 0);
950 err = f2fs_reserve_block(&dn, index); 969 err = f2fs_reserve_block(&dn, index);
951 f2fs_unlock_op(sbi); 970 f2fs_unlock_op(sbi);
952
953 if (err) { 971 if (err) {
954 f2fs_put_page(page, 0); 972 f2fs_put_page(page, 0);
955 return err; 973 goto fail;
956 } 974 }
957inline_data: 975inline_data:
958 lock_page(page); 976 lock_page(page);
@@ -982,19 +1000,20 @@ inline_data:
982 err = f2fs_read_inline_data(inode, page); 1000 err = f2fs_read_inline_data(inode, page);
983 if (err) { 1001 if (err) {
984 page_cache_release(page); 1002 page_cache_release(page);
985 return err; 1003 goto fail;
986 } 1004 }
987 } else { 1005 } else {
988 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, 1006 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
989 READ_SYNC); 1007 READ_SYNC);
990 if (err) 1008 if (err)
991 return err; 1009 goto fail;
992 } 1010 }
993 1011
994 lock_page(page); 1012 lock_page(page);
995 if (unlikely(!PageUptodate(page))) { 1013 if (unlikely(!PageUptodate(page))) {
996 f2fs_put_page(page, 1); 1014 f2fs_put_page(page, 1);
997 return -EIO; 1015 err = -EIO;
1016 goto fail;
998 } 1017 }
999 if (unlikely(page->mapping != mapping)) { 1018 if (unlikely(page->mapping != mapping)) {
1000 f2fs_put_page(page, 1); 1019 f2fs_put_page(page, 1);
@@ -1005,6 +1024,9 @@ out:
1005 SetPageUptodate(page); 1024 SetPageUptodate(page);
1006 clear_cold_data(page); 1025 clear_cold_data(page);
1007 return 0; 1026 return 0;
1027fail:
1028 f2fs_write_failed(mapping, pos + len);
1029 return err;
1008} 1030}
1009 1031
1010static int f2fs_write_end(struct file *file, 1032static int f2fs_write_end(struct file *file,
@@ -1016,7 +1038,6 @@ static int f2fs_write_end(struct file *file,
1016 1038
1017 trace_f2fs_write_end(inode, pos, len, copied); 1039 trace_f2fs_write_end(inode, pos, len, copied);
1018 1040
1019 SetPageUptodate(page);
1020 set_page_dirty(page); 1041 set_page_dirty(page);
1021 1042
1022 if (pos + copied > i_size_read(inode)) { 1043 if (pos + copied > i_size_read(inode)) {
@@ -1050,7 +1071,10 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
1050 struct iov_iter *iter, loff_t offset) 1071 struct iov_iter *iter, loff_t offset)
1051{ 1072{
1052 struct file *file = iocb->ki_filp; 1073 struct file *file = iocb->ki_filp;
1053 struct inode *inode = file->f_mapping->host; 1074 struct address_space *mapping = file->f_mapping;
1075 struct inode *inode = mapping->host;
1076 size_t count = iov_iter_count(iter);
1077 int err;
1054 1078
1055 /* Let buffer I/O handle the inline data case. */ 1079 /* Let buffer I/O handle the inline data case. */
1056 if (f2fs_has_inline_data(inode)) 1080 if (f2fs_has_inline_data(inode))
@@ -1062,8 +1086,15 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
1062 /* clear fsync mark to recover these blocks */ 1086 /* clear fsync mark to recover these blocks */
1063 fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino); 1087 fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino);
1064 1088
1065 return blockdev_direct_IO(rw, iocb, inode, iter, offset, 1089 trace_f2fs_direct_IO_enter(inode, offset, count, rw);
1066 get_data_block); 1090
1091 err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block);
1092 if (err < 0 && (rw & WRITE))
1093 f2fs_write_failed(mapping, offset + count);
1094
1095 trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
1096
1097 return err;
1067} 1098}
1068 1099
1069static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, 1100static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index b52c12cf5873..a441ba33be11 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -167,7 +167,7 @@ get_cache:
167 si->cache_mem += npages << PAGE_CACHE_SHIFT; 167 si->cache_mem += npages << PAGE_CACHE_SHIFT;
168 npages = META_MAPPING(sbi)->nrpages; 168 npages = META_MAPPING(sbi)->nrpages;
169 si->cache_mem += npages << PAGE_CACHE_SHIFT; 169 si->cache_mem += npages << PAGE_CACHE_SHIFT;
170 si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry); 170 si->cache_mem += sbi->n_orphans * sizeof(struct ino_entry);
171 si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); 171 si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
172} 172}
173 173
@@ -345,21 +345,14 @@ void __init f2fs_create_root_stats(void)
345 345
346 f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); 346 f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
347 if (!f2fs_debugfs_root) 347 if (!f2fs_debugfs_root)
348 goto bail; 348 return;
349 349
350 file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, 350 file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
351 NULL, &stat_fops); 351 NULL, &stat_fops);
352 if (!file) 352 if (!file) {
353 goto free_debugfs_dir; 353 debugfs_remove(f2fs_debugfs_root);
354 354 f2fs_debugfs_root = NULL;
355 return; 355 }
356
357free_debugfs_dir:
358 debugfs_remove(f2fs_debugfs_root);
359
360bail:
361 f2fs_debugfs_root = NULL;
362 return;
363} 356}
364 357
365void f2fs_destroy_root_stats(void) 358void f2fs_destroy_root_stats(void)
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index a4addd72ebbd..bcf893c3d903 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -77,8 +77,8 @@ static unsigned long dir_block_index(unsigned int level,
77 return bidx; 77 return bidx;
78} 78}
79 79
80static bool early_match_name(const char *name, size_t namelen, 80static bool early_match_name(size_t namelen, f2fs_hash_t namehash,
81 f2fs_hash_t namehash, struct f2fs_dir_entry *de) 81 struct f2fs_dir_entry *de)
82{ 82{
83 if (le16_to_cpu(de->name_len) != namelen) 83 if (le16_to_cpu(de->name_len) != namelen)
84 return false; 84 return false;
@@ -90,7 +90,7 @@ static bool early_match_name(const char *name, size_t namelen,
90} 90}
91 91
92static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, 92static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
93 const char *name, size_t namelen, int *max_slots, 93 struct qstr *name, int *max_slots,
94 f2fs_hash_t namehash, struct page **res_page) 94 f2fs_hash_t namehash, struct page **res_page)
95{ 95{
96 struct f2fs_dir_entry *de; 96 struct f2fs_dir_entry *de;
@@ -109,9 +109,10 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
109 continue; 109 continue;
110 } 110 }
111 de = &dentry_blk->dentry[bit_pos]; 111 de = &dentry_blk->dentry[bit_pos];
112 if (early_match_name(name, namelen, namehash, de)) { 112 if (early_match_name(name->len, namehash, de)) {
113 if (!memcmp(dentry_blk->filename[bit_pos], 113 if (!memcmp(dentry_blk->filename[bit_pos],
114 name, namelen)) { 114 name->name,
115 name->len)) {
115 *res_page = dentry_page; 116 *res_page = dentry_page;
116 goto found; 117 goto found;
117 } 118 }
@@ -120,6 +121,13 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
120 *max_slots = max_len; 121 *max_slots = max_len;
121 max_len = 0; 122 max_len = 0;
122 } 123 }
124
125 /*
126 * For the most part, it should be a bug when name_len is zero.
127 * We stop here for figuring out where the bugs are occurred.
128 */
129 f2fs_bug_on(!de->name_len);
130
123 bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); 131 bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
124 } 132 }
125 133
@@ -132,10 +140,10 @@ found:
132} 140}
133 141
134static struct f2fs_dir_entry *find_in_level(struct inode *dir, 142static struct f2fs_dir_entry *find_in_level(struct inode *dir,
135 unsigned int level, const char *name, size_t namelen, 143 unsigned int level, struct qstr *name,
136 f2fs_hash_t namehash, struct page **res_page) 144 f2fs_hash_t namehash, struct page **res_page)
137{ 145{
138 int s = GET_DENTRY_SLOTS(namelen); 146 int s = GET_DENTRY_SLOTS(name->len);
139 unsigned int nbucket, nblock; 147 unsigned int nbucket, nblock;
140 unsigned int bidx, end_block; 148 unsigned int bidx, end_block;
141 struct page *dentry_page; 149 struct page *dentry_page;
@@ -160,8 +168,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
160 continue; 168 continue;
161 } 169 }
162 170
163 de = find_in_block(dentry_page, name, namelen, 171 de = find_in_block(dentry_page, name, &max_slots,
164 &max_slots, namehash, res_page); 172 namehash, res_page);
165 if (de) 173 if (de)
166 break; 174 break;
167 175
@@ -187,8 +195,6 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
187struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, 195struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
188 struct qstr *child, struct page **res_page) 196 struct qstr *child, struct page **res_page)
189{ 197{
190 const char *name = child->name;
191 size_t namelen = child->len;
192 unsigned long npages = dir_blocks(dir); 198 unsigned long npages = dir_blocks(dir);
193 struct f2fs_dir_entry *de = NULL; 199 struct f2fs_dir_entry *de = NULL;
194 f2fs_hash_t name_hash; 200 f2fs_hash_t name_hash;
@@ -200,12 +206,11 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
200 206
201 *res_page = NULL; 207 *res_page = NULL;
202 208
203 name_hash = f2fs_dentry_hash(name, namelen); 209 name_hash = f2fs_dentry_hash(child);
204 max_depth = F2FS_I(dir)->i_current_depth; 210 max_depth = F2FS_I(dir)->i_current_depth;
205 211
206 for (level = 0; level < max_depth; level++) { 212 for (level = 0; level < max_depth; level++) {
207 de = find_in_level(dir, level, name, 213 de = find_in_level(dir, level, child, name_hash, res_page);
208 namelen, name_hash, res_page);
209 if (de) 214 if (de)
210 break; 215 break;
211 } 216 }
@@ -298,14 +303,13 @@ static int make_empty_dir(struct inode *inode,
298 struct page *dentry_page; 303 struct page *dentry_page;
299 struct f2fs_dentry_block *dentry_blk; 304 struct f2fs_dentry_block *dentry_blk;
300 struct f2fs_dir_entry *de; 305 struct f2fs_dir_entry *de;
301 void *kaddr;
302 306
303 dentry_page = get_new_data_page(inode, page, 0, true); 307 dentry_page = get_new_data_page(inode, page, 0, true);
304 if (IS_ERR(dentry_page)) 308 if (IS_ERR(dentry_page))
305 return PTR_ERR(dentry_page); 309 return PTR_ERR(dentry_page);
306 310
307 kaddr = kmap_atomic(dentry_page); 311
308 dentry_blk = (struct f2fs_dentry_block *)kaddr; 312 dentry_blk = kmap_atomic(dentry_page);
309 313
310 de = &dentry_blk->dentry[0]; 314 de = &dentry_blk->dentry[0];
311 de->name_len = cpu_to_le16(1); 315 de->name_len = cpu_to_le16(1);
@@ -323,7 +327,7 @@ static int make_empty_dir(struct inode *inode,
323 327
324 test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); 328 test_and_set_bit_le(0, &dentry_blk->dentry_bitmap);
325 test_and_set_bit_le(1, &dentry_blk->dentry_bitmap); 329 test_and_set_bit_le(1, &dentry_blk->dentry_bitmap);
326 kunmap_atomic(kaddr); 330 kunmap_atomic(dentry_blk);
327 331
328 set_page_dirty(dentry_page); 332 set_page_dirty(dentry_page);
329 f2fs_put_page(dentry_page, 1); 333 f2fs_put_page(dentry_page, 1);
@@ -333,11 +337,12 @@ static int make_empty_dir(struct inode *inode,
333static struct page *init_inode_metadata(struct inode *inode, 337static struct page *init_inode_metadata(struct inode *inode,
334 struct inode *dir, const struct qstr *name) 338 struct inode *dir, const struct qstr *name)
335{ 339{
340 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
336 struct page *page; 341 struct page *page;
337 int err; 342 int err;
338 343
339 if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { 344 if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
340 page = new_inode_page(inode, name); 345 page = new_inode_page(inode);
341 if (IS_ERR(page)) 346 if (IS_ERR(page))
342 return page; 347 return page;
343 348
@@ -362,7 +367,8 @@ static struct page *init_inode_metadata(struct inode *inode,
362 set_cold_node(inode, page); 367 set_cold_node(inode, page);
363 } 368 }
364 369
365 init_dent_inode(name, page); 370 if (name)
371 init_dent_inode(name, page);
366 372
367 /* 373 /*
368 * This file should be checkpointed during fsync. 374 * This file should be checkpointed during fsync.
@@ -370,6 +376,12 @@ static struct page *init_inode_metadata(struct inode *inode,
370 */ 376 */
371 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { 377 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
372 file_lost_pino(inode); 378 file_lost_pino(inode);
379 /*
380 * If link the tmpfile to alias through linkat path,
381 * we should remove this inode from orphan list.
382 */
383 if (inode->i_nlink == 0)
384 remove_orphan_inode(sbi, inode->i_ino);
373 inc_nlink(inode); 385 inc_nlink(inode);
374 } 386 }
375 return page; 387 return page;
@@ -453,7 +465,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
453 int err = 0; 465 int err = 0;
454 int i; 466 int i;
455 467
456 dentry_hash = f2fs_dentry_hash(name->name, name->len); 468 dentry_hash = f2fs_dentry_hash(name);
457 level = 0; 469 level = 0;
458 current_depth = F2FS_I(dir)->i_current_depth; 470 current_depth = F2FS_I(dir)->i_current_depth;
459 if (F2FS_I(dir)->chash == dentry_hash) { 471 if (F2FS_I(dir)->chash == dentry_hash) {
@@ -529,6 +541,27 @@ fail:
529 return err; 541 return err;
530} 542}
531 543
544int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
545{
546 struct page *page;
547 int err = 0;
548
549 down_write(&F2FS_I(inode)->i_sem);
550 page = init_inode_metadata(inode, dir, NULL);
551 if (IS_ERR(page)) {
552 err = PTR_ERR(page);
553 goto fail;
554 }
555 /* we don't need to mark_inode_dirty now */
556 update_inode(inode, page);
557 f2fs_put_page(page, 1);
558
559 clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
560fail:
561 up_write(&F2FS_I(inode)->i_sem);
562 return err;
563}
564
532/* 565/*
533 * It only removes the dentry from the dentry page,corresponding name 566 * It only removes the dentry from the dentry page,corresponding name
534 * entry in name page does not need to be touched during deletion. 567 * entry in name page does not need to be touched during deletion.
@@ -541,14 +574,13 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
541 struct address_space *mapping = page->mapping; 574 struct address_space *mapping = page->mapping;
542 struct inode *dir = mapping->host; 575 struct inode *dir = mapping->host;
543 int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); 576 int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
544 void *kaddr = page_address(page);
545 int i; 577 int i;
546 578
547 lock_page(page); 579 lock_page(page);
548 f2fs_wait_on_page_writeback(page, DATA); 580 f2fs_wait_on_page_writeback(page, DATA);
549 581
550 dentry_blk = (struct f2fs_dentry_block *)kaddr; 582 dentry_blk = page_address(page);
551 bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry; 583 bit_pos = dentry - dentry_blk->dentry;
552 for (i = 0; i < slots; i++) 584 for (i = 0; i < slots; i++)
553 test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); 585 test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
554 586
@@ -603,7 +635,6 @@ bool f2fs_empty_dir(struct inode *dir)
603 unsigned long nblock = dir_blocks(dir); 635 unsigned long nblock = dir_blocks(dir);
604 636
605 for (bidx = 0; bidx < nblock; bidx++) { 637 for (bidx = 0; bidx < nblock; bidx++) {
606 void *kaddr;
607 dentry_page = get_lock_data_page(dir, bidx); 638 dentry_page = get_lock_data_page(dir, bidx);
608 if (IS_ERR(dentry_page)) { 639 if (IS_ERR(dentry_page)) {
609 if (PTR_ERR(dentry_page) == -ENOENT) 640 if (PTR_ERR(dentry_page) == -ENOENT)
@@ -612,8 +643,8 @@ bool f2fs_empty_dir(struct inode *dir)
612 return false; 643 return false;
613 } 644 }
614 645
615 kaddr = kmap_atomic(dentry_page); 646
616 dentry_blk = (struct f2fs_dentry_block *)kaddr; 647 dentry_blk = kmap_atomic(dentry_page);
617 if (bidx == 0) 648 if (bidx == 0)
618 bit_pos = 2; 649 bit_pos = 2;
619 else 650 else
@@ -621,7 +652,7 @@ bool f2fs_empty_dir(struct inode *dir)
621 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, 652 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
622 NR_DENTRY_IN_BLOCK, 653 NR_DENTRY_IN_BLOCK,
623 bit_pos); 654 bit_pos);
624 kunmap_atomic(kaddr); 655 kunmap_atomic(dentry_blk);
625 656
626 f2fs_put_page(dentry_page, 1); 657 f2fs_put_page(dentry_page, 1);
627 658
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 58df97e174d0..4dab5338a97a 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -41,6 +41,7 @@
41#define F2FS_MOUNT_INLINE_XATTR 0x00000080 41#define F2FS_MOUNT_INLINE_XATTR 0x00000080
42#define F2FS_MOUNT_INLINE_DATA 0x00000100 42#define F2FS_MOUNT_INLINE_DATA 0x00000100
43#define F2FS_MOUNT_FLUSH_MERGE 0x00000200 43#define F2FS_MOUNT_FLUSH_MERGE 0x00000200
44#define F2FS_MOUNT_NOBARRIER 0x00000400
44 45
45#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) 46#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
46#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) 47#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -99,8 +100,15 @@ enum {
99 META_SSA 100 META_SSA
100}; 101};
101 102
102/* for the list of orphan inodes */ 103/* for the list of ino */
103struct orphan_inode_entry { 104enum {
105 ORPHAN_INO, /* for orphan ino list */
106 APPEND_INO, /* for append ino list */
107 UPDATE_INO, /* for update ino list */
108 MAX_INO_ENTRY, /* max. list */
109};
110
111struct ino_entry {
104 struct list_head list; /* list head */ 112 struct list_head list; /* list head */
105 nid_t ino; /* inode number */ 113 nid_t ino; /* inode number */
106}; 114};
@@ -256,6 +264,8 @@ struct f2fs_nm_info {
256 unsigned int nat_cnt; /* the # of cached nat entries */ 264 unsigned int nat_cnt; /* the # of cached nat entries */
257 struct list_head nat_entries; /* cached nat entry list (clean) */ 265 struct list_head nat_entries; /* cached nat entry list (clean) */
258 struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ 266 struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
267 struct list_head nat_entry_set; /* nat entry set list */
268 unsigned int dirty_nat_cnt; /* total num of nat entries in set */
259 269
260 /* free node ids management */ 270 /* free node ids management */
261 struct radix_tree_root free_nid_root;/* root of the free_nid cache */ 271 struct radix_tree_root free_nid_root;/* root of the free_nid cache */
@@ -442,14 +452,17 @@ struct f2fs_sb_info {
442 struct inode *meta_inode; /* cache meta blocks */ 452 struct inode *meta_inode; /* cache meta blocks */
443 struct mutex cp_mutex; /* checkpoint procedure lock */ 453 struct mutex cp_mutex; /* checkpoint procedure lock */
444 struct rw_semaphore cp_rwsem; /* blocking FS operations */ 454 struct rw_semaphore cp_rwsem; /* blocking FS operations */
445 struct mutex node_write; /* locking node writes */ 455 struct rw_semaphore node_write; /* locking node writes */
446 struct mutex writepages; /* mutex for writepages() */ 456 struct mutex writepages; /* mutex for writepages() */
447 bool por_doing; /* recovery is doing or not */ 457 bool por_doing; /* recovery is doing or not */
448 wait_queue_head_t cp_wait; 458 wait_queue_head_t cp_wait;
449 459
450 /* for orphan inode management */ 460 /* for inode management */
451 struct list_head orphan_inode_list; /* orphan inode list */ 461 struct radix_tree_root ino_root[MAX_INO_ENTRY]; /* ino entry array */
452 spinlock_t orphan_inode_lock; /* for orphan inode list */ 462 spinlock_t ino_lock[MAX_INO_ENTRY]; /* for ino entry lock */
463 struct list_head ino_list[MAX_INO_ENTRY]; /* inode list head */
464
465 /* for orphan inode, use 0'th array */
453 unsigned int n_orphans; /* # of orphan inodes */ 466 unsigned int n_orphans; /* # of orphan inodes */
454 unsigned int max_orphans; /* max orphan inodes */ 467 unsigned int max_orphans; /* max orphan inodes */
455 468
@@ -768,7 +781,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
768 if (flag == NAT_BITMAP) 781 if (flag == NAT_BITMAP)
769 return &ckpt->sit_nat_version_bitmap; 782 return &ckpt->sit_nat_version_bitmap;
770 else 783 else
771 return ((unsigned char *)ckpt + F2FS_BLKSIZE); 784 return (unsigned char *)ckpt + F2FS_BLKSIZE;
772 } else { 785 } else {
773 offset = (flag == NAT_BITMAP) ? 786 offset = (flag == NAT_BITMAP) ?
774 le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; 787 le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0;
@@ -983,11 +996,15 @@ enum {
983 FI_NO_EXTENT, /* not to use the extent cache */ 996 FI_NO_EXTENT, /* not to use the extent cache */
984 FI_INLINE_XATTR, /* used for inline xattr */ 997 FI_INLINE_XATTR, /* used for inline xattr */
985 FI_INLINE_DATA, /* used for inline data*/ 998 FI_INLINE_DATA, /* used for inline data*/
999 FI_APPEND_WRITE, /* inode has appended data */
1000 FI_UPDATE_WRITE, /* inode has in-place-update data */
1001 FI_NEED_IPU, /* used fo ipu for fdatasync */
986}; 1002};
987 1003
988static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) 1004static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
989{ 1005{
990 set_bit(flag, &fi->flags); 1006 if (!test_bit(flag, &fi->flags))
1007 set_bit(flag, &fi->flags);
991} 1008}
992 1009
993static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) 1010static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag)
@@ -997,7 +1014,8 @@ static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag)
997 1014
998static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag) 1015static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag)
999{ 1016{
1000 clear_bit(flag, &fi->flags); 1017 if (test_bit(flag, &fi->flags))
1018 clear_bit(flag, &fi->flags);
1001} 1019}
1002 1020
1003static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) 1021static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode)
@@ -1136,6 +1154,7 @@ void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
1136int update_dent_inode(struct inode *, const struct qstr *); 1154int update_dent_inode(struct inode *, const struct qstr *);
1137int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); 1155int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
1138void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); 1156void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
1157int f2fs_do_tmpfile(struct inode *, struct inode *);
1139int f2fs_make_empty(struct inode *, struct inode *); 1158int f2fs_make_empty(struct inode *, struct inode *);
1140bool f2fs_empty_dir(struct inode *); 1159bool f2fs_empty_dir(struct inode *);
1141 1160
@@ -1155,7 +1174,7 @@ void f2fs_msg(struct super_block *, const char *, const char *, ...);
1155/* 1174/*
1156 * hash.c 1175 * hash.c
1157 */ 1176 */
1158f2fs_hash_t f2fs_dentry_hash(const char *, size_t); 1177f2fs_hash_t f2fs_dentry_hash(const struct qstr *);
1159 1178
1160/* 1179/*
1161 * node.c 1180 * node.c
@@ -1173,7 +1192,7 @@ int truncate_inode_blocks(struct inode *, pgoff_t);
1173int truncate_xattr_node(struct inode *, struct page *); 1192int truncate_xattr_node(struct inode *, struct page *);
1174int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); 1193int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t);
1175void remove_inode_page(struct inode *); 1194void remove_inode_page(struct inode *);
1176struct page *new_inode_page(struct inode *, const struct qstr *); 1195struct page *new_inode_page(struct inode *);
1177struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); 1196struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
1178void ra_node_page(struct f2fs_sb_info *, nid_t); 1197void ra_node_page(struct f2fs_sb_info *, nid_t);
1179struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); 1198struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
@@ -1185,6 +1204,7 @@ void alloc_nid_done(struct f2fs_sb_info *, nid_t);
1185void alloc_nid_failed(struct f2fs_sb_info *, nid_t); 1204void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
1186void recover_node_page(struct f2fs_sb_info *, struct page *, 1205void recover_node_page(struct f2fs_sb_info *, struct page *,
1187 struct f2fs_summary *, struct node_info *, block_t); 1206 struct f2fs_summary *, struct node_info *, block_t);
1207void recover_inline_xattr(struct inode *, struct page *);
1188bool recover_xattr_data(struct inode *, struct page *, block_t); 1208bool recover_xattr_data(struct inode *, struct page *, block_t);
1189int recover_inode_page(struct f2fs_sb_info *, struct page *); 1209int recover_inode_page(struct f2fs_sb_info *, struct page *);
1190int restore_node_summary(struct f2fs_sb_info *, unsigned int, 1210int restore_node_summary(struct f2fs_sb_info *, unsigned int,
@@ -1206,7 +1226,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *);
1206void invalidate_blocks(struct f2fs_sb_info *, block_t); 1226void invalidate_blocks(struct f2fs_sb_info *, block_t);
1207void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); 1227void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
1208void clear_prefree_segments(struct f2fs_sb_info *); 1228void clear_prefree_segments(struct f2fs_sb_info *);
1209void discard_next_dnode(struct f2fs_sb_info *); 1229void discard_next_dnode(struct f2fs_sb_info *, block_t);
1210int npages_for_summary_flush(struct f2fs_sb_info *); 1230int npages_for_summary_flush(struct f2fs_sb_info *);
1211void allocate_new_segments(struct f2fs_sb_info *); 1231void allocate_new_segments(struct f2fs_sb_info *);
1212struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); 1232struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
@@ -1240,6 +1260,9 @@ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
1240struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); 1260struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
1241int ra_meta_pages(struct f2fs_sb_info *, int, int, int); 1261int ra_meta_pages(struct f2fs_sb_info *, int, int, int);
1242long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); 1262long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
1263void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
1264void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
1265bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
1243int acquire_orphan_inode(struct f2fs_sb_info *); 1266int acquire_orphan_inode(struct f2fs_sb_info *);
1244void release_orphan_inode(struct f2fs_sb_info *); 1267void release_orphan_inode(struct f2fs_sb_info *);
1245void add_orphan_inode(struct f2fs_sb_info *, nid_t); 1268void add_orphan_inode(struct f2fs_sb_info *, nid_t);
@@ -1251,7 +1274,7 @@ void add_dirty_dir_inode(struct inode *);
1251void remove_dirty_dir_inode(struct inode *); 1274void remove_dirty_dir_inode(struct inode *);
1252void sync_dirty_dir_inodes(struct f2fs_sb_info *); 1275void sync_dirty_dir_inodes(struct f2fs_sb_info *);
1253void write_checkpoint(struct f2fs_sb_info *, bool); 1276void write_checkpoint(struct f2fs_sb_info *, bool);
1254void init_orphan_info(struct f2fs_sb_info *); 1277void init_ino_entry_info(struct f2fs_sb_info *);
1255int __init create_checkpoint_caches(void); 1278int __init create_checkpoint_caches(void);
1256void destroy_checkpoint_caches(void); 1279void destroy_checkpoint_caches(void);
1257 1280
@@ -1295,7 +1318,6 @@ bool space_for_roll_forward(struct f2fs_sb_info *);
1295struct f2fs_stat_info { 1318struct f2fs_stat_info {
1296 struct list_head stat_list; 1319 struct list_head stat_list;
1297 struct f2fs_sb_info *sbi; 1320 struct f2fs_sb_info *sbi;
1298 struct mutex stat_lock;
1299 int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; 1321 int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
1300 int main_area_segs, main_area_sections, main_area_zones; 1322 int main_area_segs, main_area_sections, main_area_zones;
1301 int hit_ext, total_ext; 1323 int hit_ext, total_ext;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 7d8b96275092..208f1a9bd569 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -127,12 +127,30 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
127 return 0; 127 return 0;
128 128
129 trace_f2fs_sync_file_enter(inode); 129 trace_f2fs_sync_file_enter(inode);
130
131 /* if fdatasync is triggered, let's do in-place-update */
132 if (datasync)
133 set_inode_flag(fi, FI_NEED_IPU);
134
130 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 135 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
136 if (datasync)
137 clear_inode_flag(fi, FI_NEED_IPU);
131 if (ret) { 138 if (ret) {
132 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); 139 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
133 return ret; 140 return ret;
134 } 141 }
135 142
143 /*
144 * if there is no written data, don't waste time to write recovery info.
145 */
146 if (!is_inode_flag_set(fi, FI_APPEND_WRITE) &&
147 !exist_written_data(sbi, inode->i_ino, APPEND_INO)) {
148 if (is_inode_flag_set(fi, FI_UPDATE_WRITE) ||
149 exist_written_data(sbi, inode->i_ino, UPDATE_INO))
150 goto flush_out;
151 goto out;
152 }
153
136 /* guarantee free sections for fsync */ 154 /* guarantee free sections for fsync */
137 f2fs_balance_fs(sbi); 155 f2fs_balance_fs(sbi);
138 156
@@ -188,6 +206,13 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
188 ret = wait_on_node_pages_writeback(sbi, inode->i_ino); 206 ret = wait_on_node_pages_writeback(sbi, inode->i_ino);
189 if (ret) 207 if (ret)
190 goto out; 208 goto out;
209
210 /* once recovery info is written, don't need to tack this */
211 remove_dirty_inode(sbi, inode->i_ino, APPEND_INO);
212 clear_inode_flag(fi, FI_APPEND_WRITE);
213flush_out:
214 remove_dirty_inode(sbi, inode->i_ino, UPDATE_INO);
215 clear_inode_flag(fi, FI_UPDATE_WRITE);
191 ret = f2fs_issue_flush(F2FS_SB(inode->i_sb)); 216 ret = f2fs_issue_flush(F2FS_SB(inode->i_sb));
192 } 217 }
193out: 218out:
@@ -206,8 +231,9 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping,
206 231
207 /* find first dirty page index */ 232 /* find first dirty page index */
208 pagevec_init(&pvec, 0); 233 pagevec_init(&pvec, 0);
209 nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1); 234 nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs,
210 pgofs = nr_pages ? pvec.pages[0]->index: LONG_MAX; 235 PAGECACHE_TAG_DIRTY, 1);
236 pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX;
211 pagevec_release(&pvec); 237 pagevec_release(&pvec);
212 return pgofs; 238 return pgofs;
213} 239}
@@ -272,8 +298,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
272 } 298 }
273 } 299 }
274 300
275 end_offset = IS_INODE(dn.node_page) ? 301 end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
276 ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
277 302
278 /* find data/hole in dnode block */ 303 /* find data/hole in dnode block */
279 for (; dn.ofs_in_node < end_offset; 304 for (; dn.ofs_in_node < end_offset;
@@ -380,13 +405,15 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
380 return; 405 return;
381 406
382 lock_page(page); 407 lock_page(page);
383 if (unlikely(page->mapping != inode->i_mapping)) { 408 if (unlikely(!PageUptodate(page) ||
384 f2fs_put_page(page, 1); 409 page->mapping != inode->i_mapping))
385 return; 410 goto out;
386 } 411
387 f2fs_wait_on_page_writeback(page, DATA); 412 f2fs_wait_on_page_writeback(page, DATA);
388 zero_user(page, offset, PAGE_CACHE_SIZE - offset); 413 zero_user(page, offset, PAGE_CACHE_SIZE - offset);
389 set_page_dirty(page); 414 set_page_dirty(page);
415
416out:
390 f2fs_put_page(page, 1); 417 f2fs_put_page(page, 1);
391} 418}
392 419
@@ -645,6 +672,8 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
645 loff_t off_start, off_end; 672 loff_t off_start, off_end;
646 int ret = 0; 673 int ret = 0;
647 674
675 f2fs_balance_fs(sbi);
676
648 ret = inode_newsize_ok(inode, (len + offset)); 677 ret = inode_newsize_ok(inode, (len + offset));
649 if (ret) 678 if (ret)
650 return ret; 679 return ret;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index b90dbe55403a..d7947d90ccc3 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -186,7 +186,6 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
186static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) 186static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
187{ 187{
188 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 188 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
189 unsigned int hint = 0;
190 unsigned int secno; 189 unsigned int secno;
191 190
192 /* 191 /*
@@ -194,11 +193,9 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
194 * selected by background GC before. 193 * selected by background GC before.
195 * Those segments guarantee they have small valid blocks. 194 * Those segments guarantee they have small valid blocks.
196 */ 195 */
197next: 196 for_each_set_bit(secno, dirty_i->victim_secmap, TOTAL_SECS(sbi)) {
198 secno = find_next_bit(dirty_i->victim_secmap, TOTAL_SECS(sbi), hint++);
199 if (secno < TOTAL_SECS(sbi)) {
200 if (sec_usage_check(sbi, secno)) 197 if (sec_usage_check(sbi, secno))
201 goto next; 198 continue;
202 clear_bit(secno, dirty_i->victim_secmap); 199 clear_bit(secno, dirty_i->victim_secmap);
203 return secno * sbi->segs_per_sec; 200 return secno * sbi->segs_per_sec;
204 } 201 }
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index 6eb8d269b53b..948d17bf7281 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -69,12 +69,14 @@ static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num)
69 *buf++ = pad; 69 *buf++ = pad;
70} 70}
71 71
72f2fs_hash_t f2fs_dentry_hash(const char *name, size_t len) 72f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info)
73{ 73{
74 __u32 hash; 74 __u32 hash;
75 f2fs_hash_t f2fs_hash; 75 f2fs_hash_t f2fs_hash;
76 const char *p; 76 const char *p;
77 __u32 in[8], buf[4]; 77 __u32 in[8], buf[4];
78 const char *name = name_info->name;
79 size_t len = name_info->len;
78 80
79 if ((len <= 2) && (name[0] == '.') && 81 if ((len <= 2) && (name[0] == '.') &&
80 (name[1] == '.' || name[1] == '\0')) 82 (name[1] == '.' || name[1] == '\0'))
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 1bba5228c197..5beeccef9ae1 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -172,6 +172,7 @@ int f2fs_write_inline_data(struct inode *inode,
172 stat_inc_inline_inode(inode); 172 stat_inc_inline_inode(inode);
173 } 173 }
174 174
175 set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
175 sync_inode_page(&dn); 176 sync_inode_page(&dn);
176 f2fs_put_dnode(&dn); 177 f2fs_put_dnode(&dn);
177 178
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2cf6962f6cc8..2c39999f3868 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -267,13 +267,14 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
267void f2fs_evict_inode(struct inode *inode) 267void f2fs_evict_inode(struct inode *inode)
268{ 268{
269 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 269 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
270 nid_t xnid = F2FS_I(inode)->i_xattr_nid;
270 271
271 trace_f2fs_evict_inode(inode); 272 trace_f2fs_evict_inode(inode);
272 truncate_inode_pages_final(&inode->i_data); 273 truncate_inode_pages_final(&inode->i_data);
273 274
274 if (inode->i_ino == F2FS_NODE_INO(sbi) || 275 if (inode->i_ino == F2FS_NODE_INO(sbi) ||
275 inode->i_ino == F2FS_META_INO(sbi)) 276 inode->i_ino == F2FS_META_INO(sbi))
276 goto no_delete; 277 goto out_clear;
277 278
278 f2fs_bug_on(get_dirty_dents(inode)); 279 f2fs_bug_on(get_dirty_dents(inode));
279 remove_dirty_dir_inode(inode); 280 remove_dirty_dir_inode(inode);
@@ -295,6 +296,13 @@ void f2fs_evict_inode(struct inode *inode)
295 296
296 sb_end_intwrite(inode->i_sb); 297 sb_end_intwrite(inode->i_sb);
297no_delete: 298no_delete:
298 clear_inode(inode);
299 invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); 299 invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
300 if (xnid)
301 invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
302 if (is_inode_flag_set(F2FS_I(inode), FI_APPEND_WRITE))
303 add_dirty_inode(sbi, inode->i_ino, APPEND_INO);
304 if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE))
305 add_dirty_inode(sbi, inode->i_ino, UPDATE_INO);
306out_clear:
307 clear_inode(inode);
300} 308}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index a6bdddc33ce2..27b03776ffd2 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -13,6 +13,7 @@
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/ctype.h> 15#include <linux/ctype.h>
16#include <linux/dcache.h>
16 17
17#include "f2fs.h" 18#include "f2fs.h"
18#include "node.h" 19#include "node.h"
@@ -22,14 +23,13 @@
22 23
23static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) 24static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
24{ 25{
25 struct super_block *sb = dir->i_sb; 26 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
26 struct f2fs_sb_info *sbi = F2FS_SB(sb);
27 nid_t ino; 27 nid_t ino;
28 struct inode *inode; 28 struct inode *inode;
29 bool nid_free = false; 29 bool nid_free = false;
30 int err; 30 int err;
31 31
32 inode = new_inode(sb); 32 inode = new_inode(dir->i_sb);
33 if (!inode) 33 if (!inode)
34 return ERR_PTR(-ENOMEM); 34 return ERR_PTR(-ENOMEM);
35 35
@@ -102,8 +102,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
102static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, 102static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
103 bool excl) 103 bool excl)
104{ 104{
105 struct super_block *sb = dir->i_sb; 105 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
106 struct f2fs_sb_info *sbi = F2FS_SB(sb);
107 struct inode *inode; 106 struct inode *inode;
108 nid_t ino = 0; 107 nid_t ino = 0;
109 int err; 108 int err;
@@ -146,8 +145,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
146 struct dentry *dentry) 145 struct dentry *dentry)
147{ 146{
148 struct inode *inode = old_dentry->d_inode; 147 struct inode *inode = old_dentry->d_inode;
149 struct super_block *sb = dir->i_sb; 148 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
150 struct f2fs_sb_info *sbi = F2FS_SB(sb);
151 int err; 149 int err;
152 150
153 f2fs_balance_fs(sbi); 151 f2fs_balance_fs(sbi);
@@ -207,8 +205,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
207 205
208static int f2fs_unlink(struct inode *dir, struct dentry *dentry) 206static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
209{ 207{
210 struct super_block *sb = dir->i_sb; 208 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
211 struct f2fs_sb_info *sbi = F2FS_SB(sb);
212 struct inode *inode = dentry->d_inode; 209 struct inode *inode = dentry->d_inode;
213 struct f2fs_dir_entry *de; 210 struct f2fs_dir_entry *de;
214 struct page *page; 211 struct page *page;
@@ -242,8 +239,7 @@ fail:
242static int f2fs_symlink(struct inode *dir, struct dentry *dentry, 239static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
243 const char *symname) 240 const char *symname)
244{ 241{
245 struct super_block *sb = dir->i_sb; 242 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
246 struct f2fs_sb_info *sbi = F2FS_SB(sb);
247 struct inode *inode; 243 struct inode *inode;
248 size_t symlen = strlen(symname) + 1; 244 size_t symlen = strlen(symname) + 1;
249 int err; 245 int err;
@@ -330,8 +326,7 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
330static int f2fs_mknod(struct inode *dir, struct dentry *dentry, 326static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
331 umode_t mode, dev_t rdev) 327 umode_t mode, dev_t rdev)
332{ 328{
333 struct super_block *sb = dir->i_sb; 329 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
334 struct f2fs_sb_info *sbi = F2FS_SB(sb);
335 struct inode *inode; 330 struct inode *inode;
336 int err = 0; 331 int err = 0;
337 332
@@ -369,8 +364,7 @@ out:
369static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, 364static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
370 struct inode *new_dir, struct dentry *new_dentry) 365 struct inode *new_dir, struct dentry *new_dentry)
371{ 366{
372 struct super_block *sb = old_dir->i_sb; 367 struct f2fs_sb_info *sbi = F2FS_SB(old_dir->i_sb);
373 struct f2fs_sb_info *sbi = F2FS_SB(sb);
374 struct inode *old_inode = old_dentry->d_inode; 368 struct inode *old_inode = old_dentry->d_inode;
375 struct inode *new_inode = new_dentry->d_inode; 369 struct inode *new_inode = new_dentry->d_inode;
376 struct page *old_dir_page; 370 struct page *old_dir_page;
@@ -393,8 +387,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
393 goto out_old; 387 goto out_old;
394 } 388 }
395 389
396 f2fs_lock_op(sbi);
397
398 if (new_inode) { 390 if (new_inode) {
399 391
400 err = -ENOTEMPTY; 392 err = -ENOTEMPTY;
@@ -407,6 +399,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
407 if (!new_entry) 399 if (!new_entry)
408 goto out_dir; 400 goto out_dir;
409 401
402 f2fs_lock_op(sbi);
403
410 err = acquire_orphan_inode(sbi); 404 err = acquire_orphan_inode(sbi);
411 if (err) 405 if (err)
412 goto put_out_dir; 406 goto put_out_dir;
@@ -435,9 +429,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
435 update_inode_page(old_inode); 429 update_inode_page(old_inode);
436 update_inode_page(new_inode); 430 update_inode_page(new_inode);
437 } else { 431 } else {
432 f2fs_lock_op(sbi);
433
438 err = f2fs_add_link(new_dentry, old_inode); 434 err = f2fs_add_link(new_dentry, old_inode);
439 if (err) 435 if (err) {
436 f2fs_unlock_op(sbi);
440 goto out_dir; 437 goto out_dir;
438 }
441 439
442 if (old_dir_entry) { 440 if (old_dir_entry) {
443 inc_nlink(new_dir); 441 inc_nlink(new_dir);
@@ -472,6 +470,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
472 return 0; 470 return 0;
473 471
474put_out_dir: 472put_out_dir:
473 f2fs_unlock_op(sbi);
475 kunmap(new_page); 474 kunmap(new_page);
476 f2fs_put_page(new_page, 0); 475 f2fs_put_page(new_page, 0);
477out_dir: 476out_dir:
@@ -479,7 +478,151 @@ out_dir:
479 kunmap(old_dir_page); 478 kunmap(old_dir_page);
480 f2fs_put_page(old_dir_page, 0); 479 f2fs_put_page(old_dir_page, 0);
481 } 480 }
481out_old:
482 kunmap(old_page);
483 f2fs_put_page(old_page, 0);
484out:
485 return err;
486}
487
488static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
489 struct inode *new_dir, struct dentry *new_dentry)
490{
491 struct super_block *sb = old_dir->i_sb;
492 struct f2fs_sb_info *sbi = F2FS_SB(sb);
493 struct inode *old_inode = old_dentry->d_inode;
494 struct inode *new_inode = new_dentry->d_inode;
495 struct page *old_dir_page, *new_dir_page;
496 struct page *old_page, *new_page;
497 struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL;
498 struct f2fs_dir_entry *old_entry, *new_entry;
499 int old_nlink = 0, new_nlink = 0;
500 int err = -ENOENT;
501
502 f2fs_balance_fs(sbi);
503
504 old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
505 if (!old_entry)
506 goto out;
507
508 new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page);
509 if (!new_entry)
510 goto out_old;
511
512 /* prepare for updating ".." directory entry info later */
513 if (old_dir != new_dir) {
514 if (S_ISDIR(old_inode->i_mode)) {
515 err = -EIO;
516 old_dir_entry = f2fs_parent_dir(old_inode,
517 &old_dir_page);
518 if (!old_dir_entry)
519 goto out_new;
520 }
521
522 if (S_ISDIR(new_inode->i_mode)) {
523 err = -EIO;
524 new_dir_entry = f2fs_parent_dir(new_inode,
525 &new_dir_page);
526 if (!new_dir_entry)
527 goto out_old_dir;
528 }
529 }
530
531 /*
532 * If cross rename between file and directory those are not
533 * in the same directory, we will inc nlink of file's parent
534 * later, so we should check upper boundary of its nlink.
535 */
536 if ((!old_dir_entry || !new_dir_entry) &&
537 old_dir_entry != new_dir_entry) {
538 old_nlink = old_dir_entry ? -1 : 1;
539 new_nlink = -old_nlink;
540 err = -EMLINK;
541 if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) ||
542 (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX))
543 goto out_new_dir;
544 }
545
546 f2fs_lock_op(sbi);
547
548 err = update_dent_inode(old_inode, &new_dentry->d_name);
549 if (err)
550 goto out_unlock;
551
552 err = update_dent_inode(new_inode, &old_dentry->d_name);
553 if (err)
554 goto out_undo;
555
556 /* update ".." directory entry info of old dentry */
557 if (old_dir_entry)
558 f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir);
559
560 /* update ".." directory entry info of new dentry */
561 if (new_dir_entry)
562 f2fs_set_link(new_inode, new_dir_entry, new_dir_page, old_dir);
563
564 /* update directory entry info of old dir inode */
565 f2fs_set_link(old_dir, old_entry, old_page, new_inode);
566
567 down_write(&F2FS_I(old_inode)->i_sem);
568 file_lost_pino(old_inode);
569 up_write(&F2FS_I(old_inode)->i_sem);
570
571 update_inode_page(old_inode);
572
573 old_dir->i_ctime = CURRENT_TIME;
574 if (old_nlink) {
575 down_write(&F2FS_I(old_dir)->i_sem);
576 if (old_nlink < 0)
577 drop_nlink(old_dir);
578 else
579 inc_nlink(old_dir);
580 up_write(&F2FS_I(old_dir)->i_sem);
581 }
582 mark_inode_dirty(old_dir);
583 update_inode_page(old_dir);
584
585 /* update directory entry info of new dir inode */
586 f2fs_set_link(new_dir, new_entry, new_page, old_inode);
587
588 down_write(&F2FS_I(new_inode)->i_sem);
589 file_lost_pino(new_inode);
590 up_write(&F2FS_I(new_inode)->i_sem);
591
592 update_inode_page(new_inode);
593
594 new_dir->i_ctime = CURRENT_TIME;
595 if (new_nlink) {
596 down_write(&F2FS_I(new_dir)->i_sem);
597 if (new_nlink < 0)
598 drop_nlink(new_dir);
599 else
600 inc_nlink(new_dir);
601 up_write(&F2FS_I(new_dir)->i_sem);
602 }
603 mark_inode_dirty(new_dir);
604 update_inode_page(new_dir);
605
606 f2fs_unlock_op(sbi);
607 return 0;
608out_undo:
609 /* Still we may fail to recover name info of f2fs_inode here */
610 update_dent_inode(old_inode, &old_dentry->d_name);
611out_unlock:
482 f2fs_unlock_op(sbi); 612 f2fs_unlock_op(sbi);
613out_new_dir:
614 if (new_dir_entry) {
615 kunmap(new_dir_page);
616 f2fs_put_page(new_dir_page, 0);
617 }
618out_old_dir:
619 if (old_dir_entry) {
620 kunmap(old_dir_page);
621 f2fs_put_page(old_dir_page, 0);
622 }
623out_new:
624 kunmap(new_page);
625 f2fs_put_page(new_page, 0);
483out_old: 626out_old:
484 kunmap(old_page); 627 kunmap(old_page);
485 f2fs_put_page(old_page, 0); 628 f2fs_put_page(old_page, 0);
@@ -487,6 +630,71 @@ out:
487 return err; 630 return err;
488} 631}
489 632
633static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
634 struct inode *new_dir, struct dentry *new_dentry,
635 unsigned int flags)
636{
637 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
638 return -EINVAL;
639
640 if (flags & RENAME_EXCHANGE) {
641 return f2fs_cross_rename(old_dir, old_dentry,
642 new_dir, new_dentry);
643 }
644 /*
645 * VFS has already handled the new dentry existence case,
646 * here, we just deal with "RENAME_NOREPLACE" as regular rename.
647 */
648 return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry);
649}
650
651static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
652{
653 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
654 struct inode *inode;
655 int err;
656
657 inode = f2fs_new_inode(dir, mode);
658 if (IS_ERR(inode))
659 return PTR_ERR(inode);
660
661 inode->i_op = &f2fs_file_inode_operations;
662 inode->i_fop = &f2fs_file_operations;
663 inode->i_mapping->a_ops = &f2fs_dblock_aops;
664
665 f2fs_lock_op(sbi);
666 err = acquire_orphan_inode(sbi);
667 if (err)
668 goto out;
669
670 err = f2fs_do_tmpfile(inode, dir);
671 if (err)
672 goto release_out;
673
674 /*
675 * add this non-linked tmpfile to orphan list, in this way we could
676 * remove all unused data of tmpfile after abnormal power-off.
677 */
678 add_orphan_inode(sbi, inode->i_ino);
679 f2fs_unlock_op(sbi);
680
681 alloc_nid_done(sbi, inode->i_ino);
682 d_tmpfile(dentry, inode);
683 unlock_new_inode(inode);
684 return 0;
685
686release_out:
687 release_orphan_inode(sbi);
688out:
689 f2fs_unlock_op(sbi);
690 clear_nlink(inode);
691 unlock_new_inode(inode);
692 make_bad_inode(inode);
693 iput(inode);
694 alloc_nid_failed(sbi, inode->i_ino);
695 return err;
696}
697
490const struct inode_operations f2fs_dir_inode_operations = { 698const struct inode_operations f2fs_dir_inode_operations = {
491 .create = f2fs_create, 699 .create = f2fs_create,
492 .lookup = f2fs_lookup, 700 .lookup = f2fs_lookup,
@@ -497,6 +705,8 @@ const struct inode_operations f2fs_dir_inode_operations = {
497 .rmdir = f2fs_rmdir, 705 .rmdir = f2fs_rmdir,
498 .mknod = f2fs_mknod, 706 .mknod = f2fs_mknod,
499 .rename = f2fs_rename, 707 .rename = f2fs_rename,
708 .rename2 = f2fs_rename2,
709 .tmpfile = f2fs_tmpfile,
500 .getattr = f2fs_getattr, 710 .getattr = f2fs_getattr,
501 .setattr = f2fs_setattr, 711 .setattr = f2fs_setattr,
502 .get_acl = f2fs_get_acl, 712 .get_acl = f2fs_get_acl,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 4b697ccc9b0c..d3d90d284631 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -25,6 +25,7 @@
25 25
26static struct kmem_cache *nat_entry_slab; 26static struct kmem_cache *nat_entry_slab;
27static struct kmem_cache *free_nid_slab; 27static struct kmem_cache *free_nid_slab;
28static struct kmem_cache *nat_entry_set_slab;
28 29
29bool available_free_memory(struct f2fs_sb_info *sbi, int type) 30bool available_free_memory(struct f2fs_sb_info *sbi, int type)
30{ 31{
@@ -90,12 +91,8 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
90 91
91 /* get current nat block page with lock */ 92 /* get current nat block page with lock */
92 src_page = get_meta_page(sbi, src_off); 93 src_page = get_meta_page(sbi, src_off);
93
94 /* Dirty src_page means that it is already the new target NAT page. */
95 if (PageDirty(src_page))
96 return src_page;
97
98 dst_page = grab_meta_page(sbi, dst_off); 94 dst_page = grab_meta_page(sbi, dst_off);
95 f2fs_bug_on(PageDirty(src_page));
99 96
100 src_addr = page_address(src_page); 97 src_addr = page_address(src_page);
101 dst_addr = page_address(dst_page); 98 dst_addr = page_address(dst_page);
@@ -845,7 +842,7 @@ void remove_inode_page(struct inode *inode)
845 truncate_node(&dn); 842 truncate_node(&dn);
846} 843}
847 844
848struct page *new_inode_page(struct inode *inode, const struct qstr *name) 845struct page *new_inode_page(struct inode *inode)
849{ 846{
850 struct dnode_of_data dn; 847 struct dnode_of_data dn;
851 848
@@ -1234,12 +1231,12 @@ static int f2fs_write_node_page(struct page *page,
1234 if (wbc->for_reclaim) 1231 if (wbc->for_reclaim)
1235 goto redirty_out; 1232 goto redirty_out;
1236 1233
1237 mutex_lock(&sbi->node_write); 1234 down_read(&sbi->node_write);
1238 set_page_writeback(page); 1235 set_page_writeback(page);
1239 write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); 1236 write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
1240 set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page)); 1237 set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page));
1241 dec_page_count(sbi, F2FS_DIRTY_NODES); 1238 dec_page_count(sbi, F2FS_DIRTY_NODES);
1242 mutex_unlock(&sbi->node_write); 1239 up_read(&sbi->node_write);
1243 unlock_page(page); 1240 unlock_page(page);
1244 return 0; 1241 return 0;
1245 1242
@@ -1552,7 +1549,7 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
1552 clear_node_page_dirty(page); 1549 clear_node_page_dirty(page);
1553} 1550}
1554 1551
1555static void recover_inline_xattr(struct inode *inode, struct page *page) 1552void recover_inline_xattr(struct inode *inode, struct page *page)
1556{ 1553{
1557 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 1554 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1558 void *src_addr, *dst_addr; 1555 void *src_addr, *dst_addr;
@@ -1591,8 +1588,6 @@ bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
1591 nid_t new_xnid = nid_of_node(page); 1588 nid_t new_xnid = nid_of_node(page);
1592 struct node_info ni; 1589 struct node_info ni;
1593 1590
1594 recover_inline_xattr(inode, page);
1595
1596 if (!f2fs_has_xattr_block(ofs_of_node(page))) 1591 if (!f2fs_has_xattr_block(ofs_of_node(page)))
1597 return false; 1592 return false;
1598 1593
@@ -1744,7 +1739,90 @@ skip:
1744 return err; 1739 return err;
1745} 1740}
1746 1741
1747static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) 1742static struct nat_entry_set *grab_nat_entry_set(void)
1743{
1744 struct nat_entry_set *nes =
1745 f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
1746
1747 nes->entry_cnt = 0;
1748 INIT_LIST_HEAD(&nes->set_list);
1749 INIT_LIST_HEAD(&nes->entry_list);
1750 return nes;
1751}
1752
1753static void release_nat_entry_set(struct nat_entry_set *nes,
1754 struct f2fs_nm_info *nm_i)
1755{
1756 f2fs_bug_on(!list_empty(&nes->entry_list));
1757
1758 nm_i->dirty_nat_cnt -= nes->entry_cnt;
1759 list_del(&nes->set_list);
1760 kmem_cache_free(nat_entry_set_slab, nes);
1761}
1762
1763static void adjust_nat_entry_set(struct nat_entry_set *nes,
1764 struct list_head *head)
1765{
1766 struct nat_entry_set *next = nes;
1767
1768 if (list_is_last(&nes->set_list, head))
1769 return;
1770
1771 list_for_each_entry_continue(next, head, set_list)
1772 if (nes->entry_cnt <= next->entry_cnt)
1773 break;
1774
1775 list_move_tail(&nes->set_list, &next->set_list);
1776}
1777
1778static void add_nat_entry(struct nat_entry *ne, struct list_head *head)
1779{
1780 struct nat_entry_set *nes;
1781 nid_t start_nid = START_NID(ne->ni.nid);
1782
1783 list_for_each_entry(nes, head, set_list) {
1784 if (nes->start_nid == start_nid) {
1785 list_move_tail(&ne->list, &nes->entry_list);
1786 nes->entry_cnt++;
1787 adjust_nat_entry_set(nes, head);
1788 return;
1789 }
1790 }
1791
1792 nes = grab_nat_entry_set();
1793
1794 nes->start_nid = start_nid;
1795 list_move_tail(&ne->list, &nes->entry_list);
1796 nes->entry_cnt++;
1797 list_add(&nes->set_list, head);
1798}
1799
1800static void merge_nats_in_set(struct f2fs_sb_info *sbi)
1801{
1802 struct f2fs_nm_info *nm_i = NM_I(sbi);
1803 struct list_head *dirty_list = &nm_i->dirty_nat_entries;
1804 struct list_head *set_list = &nm_i->nat_entry_set;
1805 struct nat_entry *ne, *tmp;
1806
1807 write_lock(&nm_i->nat_tree_lock);
1808 list_for_each_entry_safe(ne, tmp, dirty_list, list) {
1809 if (nat_get_blkaddr(ne) == NEW_ADDR)
1810 continue;
1811 add_nat_entry(ne, set_list);
1812 nm_i->dirty_nat_cnt++;
1813 }
1814 write_unlock(&nm_i->nat_tree_lock);
1815}
1816
1817static bool __has_cursum_space(struct f2fs_summary_block *sum, int size)
1818{
1819 if (nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES)
1820 return true;
1821 else
1822 return false;
1823}
1824
1825static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
1748{ 1826{
1749 struct f2fs_nm_info *nm_i = NM_I(sbi); 1827 struct f2fs_nm_info *nm_i = NM_I(sbi);
1750 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 1828 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
@@ -1752,12 +1830,6 @@ static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
1752 int i; 1830 int i;
1753 1831
1754 mutex_lock(&curseg->curseg_mutex); 1832 mutex_lock(&curseg->curseg_mutex);
1755
1756 if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) {
1757 mutex_unlock(&curseg->curseg_mutex);
1758 return false;
1759 }
1760
1761 for (i = 0; i < nats_in_cursum(sum); i++) { 1833 for (i = 0; i < nats_in_cursum(sum); i++) {
1762 struct nat_entry *ne; 1834 struct nat_entry *ne;
1763 struct f2fs_nat_entry raw_ne; 1835 struct f2fs_nat_entry raw_ne;
@@ -1767,23 +1839,21 @@ static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
1767retry: 1839retry:
1768 write_lock(&nm_i->nat_tree_lock); 1840 write_lock(&nm_i->nat_tree_lock);
1769 ne = __lookup_nat_cache(nm_i, nid); 1841 ne = __lookup_nat_cache(nm_i, nid);
1770 if (ne) { 1842 if (ne)
1771 __set_nat_cache_dirty(nm_i, ne); 1843 goto found;
1772 write_unlock(&nm_i->nat_tree_lock); 1844
1773 continue;
1774 }
1775 ne = grab_nat_entry(nm_i, nid); 1845 ne = grab_nat_entry(nm_i, nid);
1776 if (!ne) { 1846 if (!ne) {
1777 write_unlock(&nm_i->nat_tree_lock); 1847 write_unlock(&nm_i->nat_tree_lock);
1778 goto retry; 1848 goto retry;
1779 } 1849 }
1780 node_info_from_raw_nat(&ne->ni, &raw_ne); 1850 node_info_from_raw_nat(&ne->ni, &raw_ne);
1851found:
1781 __set_nat_cache_dirty(nm_i, ne); 1852 __set_nat_cache_dirty(nm_i, ne);
1782 write_unlock(&nm_i->nat_tree_lock); 1853 write_unlock(&nm_i->nat_tree_lock);
1783 } 1854 }
1784 update_nats_in_cursum(sum, -i); 1855 update_nats_in_cursum(sum, -i);
1785 mutex_unlock(&curseg->curseg_mutex); 1856 mutex_unlock(&curseg->curseg_mutex);
1786 return true;
1787} 1857}
1788 1858
1789/* 1859/*
@@ -1794,80 +1864,91 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1794 struct f2fs_nm_info *nm_i = NM_I(sbi); 1864 struct f2fs_nm_info *nm_i = NM_I(sbi);
1795 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 1865 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1796 struct f2fs_summary_block *sum = curseg->sum_blk; 1866 struct f2fs_summary_block *sum = curseg->sum_blk;
1797 struct nat_entry *ne, *cur; 1867 struct nat_entry_set *nes, *tmp;
1798 struct page *page = NULL; 1868 struct list_head *head = &nm_i->nat_entry_set;
1799 struct f2fs_nat_block *nat_blk = NULL; 1869 bool to_journal = true;
1800 nid_t start_nid = 0, end_nid = 0;
1801 bool flushed;
1802 1870
1803 flushed = flush_nats_in_journal(sbi); 1871 /* merge nat entries of dirty list to nat entry set temporarily */
1804 1872 merge_nats_in_set(sbi);
1805 if (!flushed)
1806 mutex_lock(&curseg->curseg_mutex);
1807
1808 /* 1) flush dirty nat caches */
1809 list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) {
1810 nid_t nid;
1811 struct f2fs_nat_entry raw_ne;
1812 int offset = -1;
1813
1814 if (nat_get_blkaddr(ne) == NEW_ADDR)
1815 continue;
1816 1873
1817 nid = nat_get_nid(ne); 1874 /*
1875 * if there are no enough space in journal to store dirty nat
1876 * entries, remove all entries from journal and merge them
1877 * into nat entry set.
1878 */
1879 if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt)) {
1880 remove_nats_in_journal(sbi);
1818 1881
1819 if (flushed) 1882 /*
1820 goto to_nat_page; 1883 * merge nat entries of dirty list to nat entry set temporarily
1884 */
1885 merge_nats_in_set(sbi);
1886 }
1821 1887
1822 /* if there is room for nat enries in curseg->sumpage */ 1888 if (!nm_i->dirty_nat_cnt)
1823 offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1); 1889 return;
1824 if (offset >= 0) {
1825 raw_ne = nat_in_journal(sum, offset);
1826 goto flush_now;
1827 }
1828to_nat_page:
1829 if (!page || (start_nid > nid || nid > end_nid)) {
1830 if (page) {
1831 f2fs_put_page(page, 1);
1832 page = NULL;
1833 }
1834 start_nid = START_NID(nid);
1835 end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1;
1836 1890
1837 /* 1891 /*
1838 * get nat block with dirty flag, increased reference 1892 * there are two steps to flush nat entries:
1839 * count, mapped and lock 1893 * #1, flush nat entries to journal in current hot data summary block.
1840 */ 1894 * #2, flush nat entries to nat page.
1895 */
1896 list_for_each_entry_safe(nes, tmp, head, set_list) {
1897 struct f2fs_nat_block *nat_blk;
1898 struct nat_entry *ne, *cur;
1899 struct page *page;
1900 nid_t start_nid = nes->start_nid;
1901
1902 if (to_journal && !__has_cursum_space(sum, nes->entry_cnt))
1903 to_journal = false;
1904
1905 if (to_journal) {
1906 mutex_lock(&curseg->curseg_mutex);
1907 } else {
1841 page = get_next_nat_page(sbi, start_nid); 1908 page = get_next_nat_page(sbi, start_nid);
1842 nat_blk = page_address(page); 1909 nat_blk = page_address(page);
1910 f2fs_bug_on(!nat_blk);
1843 } 1911 }
1844 1912
1845 f2fs_bug_on(!nat_blk); 1913 /* flush dirty nats in nat entry set */
1846 raw_ne = nat_blk->entries[nid - start_nid]; 1914 list_for_each_entry_safe(ne, cur, &nes->entry_list, list) {
1847flush_now: 1915 struct f2fs_nat_entry *raw_ne;
1848 raw_nat_from_node_info(&raw_ne, &ne->ni); 1916 nid_t nid = nat_get_nid(ne);
1849 1917 int offset;
1850 if (offset < 0) { 1918
1851 nat_blk->entries[nid - start_nid] = raw_ne; 1919 if (to_journal) {
1852 } else { 1920 offset = lookup_journal_in_cursum(sum,
1853 nat_in_journal(sum, offset) = raw_ne; 1921 NAT_JOURNAL, nid, 1);
1854 nid_in_journal(sum, offset) = cpu_to_le32(nid); 1922 f2fs_bug_on(offset < 0);
1855 } 1923 raw_ne = &nat_in_journal(sum, offset);
1924 nid_in_journal(sum, offset) = cpu_to_le32(nid);
1925 } else {
1926 raw_ne = &nat_blk->entries[nid - start_nid];
1927 }
1928 raw_nat_from_node_info(raw_ne, &ne->ni);
1856 1929
1857 if (nat_get_blkaddr(ne) == NULL_ADDR && 1930 if (nat_get_blkaddr(ne) == NULL_ADDR &&
1858 add_free_nid(sbi, nid, false) <= 0) { 1931 add_free_nid(sbi, nid, false) <= 0) {
1859 write_lock(&nm_i->nat_tree_lock); 1932 write_lock(&nm_i->nat_tree_lock);
1860 __del_from_nat_cache(nm_i, ne); 1933 __del_from_nat_cache(nm_i, ne);
1861 write_unlock(&nm_i->nat_tree_lock); 1934 write_unlock(&nm_i->nat_tree_lock);
1862 } else { 1935 } else {
1863 write_lock(&nm_i->nat_tree_lock); 1936 write_lock(&nm_i->nat_tree_lock);
1864 __clear_nat_cache_dirty(nm_i, ne); 1937 __clear_nat_cache_dirty(nm_i, ne);
1865 write_unlock(&nm_i->nat_tree_lock); 1938 write_unlock(&nm_i->nat_tree_lock);
1939 }
1866 } 1940 }
1941
1942 if (to_journal)
1943 mutex_unlock(&curseg->curseg_mutex);
1944 else
1945 f2fs_put_page(page, 1);
1946
1947 release_nat_entry_set(nes, nm_i);
1867 } 1948 }
1868 if (!flushed) 1949
1869 mutex_unlock(&curseg->curseg_mutex); 1950 f2fs_bug_on(!list_empty(head));
1870 f2fs_put_page(page, 1); 1951 f2fs_bug_on(nm_i->dirty_nat_cnt);
1871} 1952}
1872 1953
1873static int init_node_manager(struct f2fs_sb_info *sbi) 1954static int init_node_manager(struct f2fs_sb_info *sbi)
@@ -1896,6 +1977,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
1896 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); 1977 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
1897 INIT_LIST_HEAD(&nm_i->nat_entries); 1978 INIT_LIST_HEAD(&nm_i->nat_entries);
1898 INIT_LIST_HEAD(&nm_i->dirty_nat_entries); 1979 INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
1980 INIT_LIST_HEAD(&nm_i->nat_entry_set);
1899 1981
1900 mutex_init(&nm_i->build_lock); 1982 mutex_init(&nm_i->build_lock);
1901 spin_lock_init(&nm_i->free_nid_list_lock); 1983 spin_lock_init(&nm_i->free_nid_list_lock);
@@ -1976,19 +2058,30 @@ int __init create_node_manager_caches(void)
1976 nat_entry_slab = f2fs_kmem_cache_create("nat_entry", 2058 nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
1977 sizeof(struct nat_entry)); 2059 sizeof(struct nat_entry));
1978 if (!nat_entry_slab) 2060 if (!nat_entry_slab)
1979 return -ENOMEM; 2061 goto fail;
1980 2062
1981 free_nid_slab = f2fs_kmem_cache_create("free_nid", 2063 free_nid_slab = f2fs_kmem_cache_create("free_nid",
1982 sizeof(struct free_nid)); 2064 sizeof(struct free_nid));
1983 if (!free_nid_slab) { 2065 if (!free_nid_slab)
1984 kmem_cache_destroy(nat_entry_slab); 2066 goto destory_nat_entry;
1985 return -ENOMEM; 2067
1986 } 2068 nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set",
2069 sizeof(struct nat_entry_set));
2070 if (!nat_entry_set_slab)
2071 goto destory_free_nid;
1987 return 0; 2072 return 0;
2073
2074destory_free_nid:
2075 kmem_cache_destroy(free_nid_slab);
2076destory_nat_entry:
2077 kmem_cache_destroy(nat_entry_slab);
2078fail:
2079 return -ENOMEM;
1988} 2080}
1989 2081
1990void destroy_node_manager_caches(void) 2082void destroy_node_manager_caches(void)
1991{ 2083{
2084 kmem_cache_destroy(nat_entry_set_slab);
1992 kmem_cache_destroy(free_nid_slab); 2085 kmem_cache_destroy(free_nid_slab);
1993 kmem_cache_destroy(nat_entry_slab); 2086 kmem_cache_destroy(nat_entry_slab);
1994} 2087}
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 7281112cd1c8..8a116a407599 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -89,6 +89,13 @@ enum mem_type {
89 DIRTY_DENTS /* indicates dirty dentry pages */ 89 DIRTY_DENTS /* indicates dirty dentry pages */
90}; 90};
91 91
92struct nat_entry_set {
93 struct list_head set_list; /* link with all nat sets */
94 struct list_head entry_list; /* link with dirty nat entries */
95 nid_t start_nid; /* start nid of nats in set */
96 unsigned int entry_cnt; /* the # of nat entries in set */
97};
98
92/* 99/*
93 * For free nid mangement 100 * For free nid mangement
94 */ 101 */
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index a112368a4a86..fe1c6d921ba2 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -300,6 +300,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
300 struct node_info ni; 300 struct node_info ni;
301 int err = 0, recovered = 0; 301 int err = 0, recovered = 0;
302 302
303 recover_inline_xattr(inode, page);
304
303 if (recover_inline_data(inode, page)) 305 if (recover_inline_data(inode, page))
304 goto out; 306 goto out;
305 307
@@ -434,7 +436,9 @@ next:
434 436
435int recover_fsync_data(struct f2fs_sb_info *sbi) 437int recover_fsync_data(struct f2fs_sb_info *sbi)
436{ 438{
439 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
437 struct list_head inode_list; 440 struct list_head inode_list;
441 block_t blkaddr;
438 int err; 442 int err;
439 bool need_writecp = false; 443 bool need_writecp = false;
440 444
@@ -447,6 +451,9 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
447 451
448 /* step #1: find fsynced inode numbers */ 452 /* step #1: find fsynced inode numbers */
449 sbi->por_doing = true; 453 sbi->por_doing = true;
454
455 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
456
450 err = find_fsync_dnodes(sbi, &inode_list); 457 err = find_fsync_dnodes(sbi, &inode_list);
451 if (err) 458 if (err)
452 goto out; 459 goto out;
@@ -462,8 +469,21 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
462out: 469out:
463 destroy_fsync_dnodes(&inode_list); 470 destroy_fsync_dnodes(&inode_list);
464 kmem_cache_destroy(fsync_entry_slab); 471 kmem_cache_destroy(fsync_entry_slab);
472
473 if (err) {
474 truncate_inode_pages_final(NODE_MAPPING(sbi));
475 truncate_inode_pages_final(META_MAPPING(sbi));
476 }
477
465 sbi->por_doing = false; 478 sbi->por_doing = false;
466 if (!err && need_writecp) 479 if (err) {
480 discard_next_dnode(sbi, blkaddr);
481
482 /* Flush all the NAT/SIT pages */
483 while (get_pages(sbi, F2FS_DIRTY_META))
484 sync_meta_pages(sbi, META, LONG_MAX);
485 } else if (need_writecp) {
467 write_checkpoint(sbi, false); 486 write_checkpoint(sbi, false);
487 }
468 return err; 488 return err;
469} 489}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index d04613df710a..0dfeebae2a50 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -239,6 +239,12 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
239 struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; 239 struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info;
240 struct flush_cmd cmd; 240 struct flush_cmd cmd;
241 241
242 trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER),
243 test_opt(sbi, FLUSH_MERGE));
244
245 if (test_opt(sbi, NOBARRIER))
246 return 0;
247
242 if (!test_opt(sbi, FLUSH_MERGE)) 248 if (!test_opt(sbi, FLUSH_MERGE))
243 return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); 249 return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);
244 250
@@ -272,13 +278,13 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
272 return -ENOMEM; 278 return -ENOMEM;
273 spin_lock_init(&fcc->issue_lock); 279 spin_lock_init(&fcc->issue_lock);
274 init_waitqueue_head(&fcc->flush_wait_queue); 280 init_waitqueue_head(&fcc->flush_wait_queue);
275 sbi->sm_info->cmd_control_info = fcc; 281 SM_I(sbi)->cmd_control_info = fcc;
276 fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, 282 fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
277 "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); 283 "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
278 if (IS_ERR(fcc->f2fs_issue_flush)) { 284 if (IS_ERR(fcc->f2fs_issue_flush)) {
279 err = PTR_ERR(fcc->f2fs_issue_flush); 285 err = PTR_ERR(fcc->f2fs_issue_flush);
280 kfree(fcc); 286 kfree(fcc);
281 sbi->sm_info->cmd_control_info = NULL; 287 SM_I(sbi)->cmd_control_info = NULL;
282 return err; 288 return err;
283 } 289 }
284 290
@@ -287,13 +293,12 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
287 293
288void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) 294void destroy_flush_cmd_control(struct f2fs_sb_info *sbi)
289{ 295{
290 struct flush_cmd_control *fcc = 296 struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info;
291 sbi->sm_info->cmd_control_info;
292 297
293 if (fcc && fcc->f2fs_issue_flush) 298 if (fcc && fcc->f2fs_issue_flush)
294 kthread_stop(fcc->f2fs_issue_flush); 299 kthread_stop(fcc->f2fs_issue_flush);
295 kfree(fcc); 300 kfree(fcc);
296 sbi->sm_info->cmd_control_info = NULL; 301 SM_I(sbi)->cmd_control_info = NULL;
297} 302}
298 303
299static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, 304static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
@@ -377,11 +382,8 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
377 return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); 382 return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
378} 383}
379 384
380void discard_next_dnode(struct f2fs_sb_info *sbi) 385void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
381{ 386{
382 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
383 block_t blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
384
385 if (f2fs_issue_discard(sbi, blkaddr, 1)) { 387 if (f2fs_issue_discard(sbi, blkaddr, 1)) {
386 struct page *page = grab_meta_page(sbi, blkaddr); 388 struct page *page = grab_meta_page(sbi, blkaddr);
387 /* zero-filled page */ 389 /* zero-filled page */
@@ -437,17 +439,12 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi,
437static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) 439static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
438{ 440{
439 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 441 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
440 unsigned int segno = -1; 442 unsigned int segno;
441 unsigned int total_segs = TOTAL_SEGS(sbi); 443 unsigned int total_segs = TOTAL_SEGS(sbi);
442 444
443 mutex_lock(&dirty_i->seglist_lock); 445 mutex_lock(&dirty_i->seglist_lock);
444 while (1) { 446 for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], total_segs)
445 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
446 segno + 1);
447 if (segno >= total_segs)
448 break;
449 __set_test_and_free(sbi, segno); 447 __set_test_and_free(sbi, segno);
450 }
451 mutex_unlock(&dirty_i->seglist_lock); 448 mutex_unlock(&dirty_i->seglist_lock);
452} 449}
453 450
@@ -974,14 +971,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
974{ 971{
975 struct sit_info *sit_i = SIT_I(sbi); 972 struct sit_info *sit_i = SIT_I(sbi);
976 struct curseg_info *curseg; 973 struct curseg_info *curseg;
977 unsigned int old_cursegno;
978 974
979 curseg = CURSEG_I(sbi, type); 975 curseg = CURSEG_I(sbi, type);
980 976
981 mutex_lock(&curseg->curseg_mutex); 977 mutex_lock(&curseg->curseg_mutex);
982 978
983 *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 979 *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
984 old_cursegno = curseg->segno;
985 980
986 /* 981 /*
987 * __add_sum_entry should be resided under the curseg_mutex 982 * __add_sum_entry should be resided under the curseg_mutex
@@ -1002,7 +997,6 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
1002 * since SSR needs latest valid block information. 997 * since SSR needs latest valid block information.
1003 */ 998 */
1004 refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); 999 refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
1005 locate_dirty_segment(sbi, old_cursegno);
1006 1000
1007 mutex_unlock(&sit_i->sentry_lock); 1001 mutex_unlock(&sit_i->sentry_lock);
1008 1002
@@ -1532,7 +1526,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi)
1532 struct page *page = NULL; 1526 struct page *page = NULL;
1533 struct f2fs_sit_block *raw_sit = NULL; 1527 struct f2fs_sit_block *raw_sit = NULL;
1534 unsigned int start = 0, end = 0; 1528 unsigned int start = 0, end = 0;
1535 unsigned int segno = -1; 1529 unsigned int segno;
1536 bool flushed; 1530 bool flushed;
1537 1531
1538 mutex_lock(&curseg->curseg_mutex); 1532 mutex_lock(&curseg->curseg_mutex);
@@ -1544,7 +1538,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi)
1544 */ 1538 */
1545 flushed = flush_sits_in_journal(sbi); 1539 flushed = flush_sits_in_journal(sbi);
1546 1540
1547 while ((segno = find_next_bit(bitmap, nsegs, segno + 1)) < nsegs) { 1541 for_each_set_bit(segno, bitmap, nsegs) {
1548 struct seg_entry *se = get_seg_entry(sbi, segno); 1542 struct seg_entry *se = get_seg_entry(sbi, segno);
1549 int sit_offset, offset; 1543 int sit_offset, offset;
1550 1544
@@ -1703,7 +1697,7 @@ static int build_curseg(struct f2fs_sb_info *sbi)
1703 struct curseg_info *array; 1697 struct curseg_info *array;
1704 int i; 1698 int i;
1705 1699
1706 array = kzalloc(sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL); 1700 array = kcalloc(NR_CURSEG_TYPE, sizeof(*array), GFP_KERNEL);
1707 if (!array) 1701 if (!array)
1708 return -ENOMEM; 1702 return -ENOMEM;
1709 1703
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 7091204680f4..55973f7b0330 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -347,8 +347,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
347 if (test_and_clear_bit(segno, free_i->free_segmap)) { 347 if (test_and_clear_bit(segno, free_i->free_segmap)) {
348 free_i->free_segments++; 348 free_i->free_segments++;
349 349
350 next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), 350 next = find_next_bit(free_i->free_segmap,
351 start_segno); 351 start_segno + sbi->segs_per_sec, start_segno);
352 if (next >= start_segno + sbi->segs_per_sec) { 352 if (next >= start_segno + sbi->segs_per_sec) {
353 if (test_and_clear_bit(secno, free_i->free_secmap)) 353 if (test_and_clear_bit(secno, free_i->free_secmap))
354 free_i->free_sections++; 354 free_i->free_sections++;
@@ -486,6 +486,10 @@ static inline bool need_inplace_update(struct inode *inode)
486 if (S_ISDIR(inode->i_mode)) 486 if (S_ISDIR(inode->i_mode))
487 return false; 487 return false;
488 488
489 /* this is only set during fdatasync */
490 if (is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU))
491 return true;
492
489 switch (SM_I(sbi)->ipu_policy) { 493 switch (SM_I(sbi)->ipu_policy) {
490 case F2FS_IPU_FORCE: 494 case F2FS_IPU_FORCE:
491 return true; 495 return true;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 8f96d9372ade..657582fc7601 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -52,6 +52,7 @@ enum {
52 Opt_inline_xattr, 52 Opt_inline_xattr,
53 Opt_inline_data, 53 Opt_inline_data,
54 Opt_flush_merge, 54 Opt_flush_merge,
55 Opt_nobarrier,
55 Opt_err, 56 Opt_err,
56}; 57};
57 58
@@ -69,6 +70,7 @@ static match_table_t f2fs_tokens = {
69 {Opt_inline_xattr, "inline_xattr"}, 70 {Opt_inline_xattr, "inline_xattr"},
70 {Opt_inline_data, "inline_data"}, 71 {Opt_inline_data, "inline_data"},
71 {Opt_flush_merge, "flush_merge"}, 72 {Opt_flush_merge, "flush_merge"},
73 {Opt_nobarrier, "nobarrier"},
72 {Opt_err, NULL}, 74 {Opt_err, NULL},
73}; 75};
74 76
@@ -339,6 +341,9 @@ static int parse_options(struct super_block *sb, char *options)
339 case Opt_flush_merge: 341 case Opt_flush_merge:
340 set_opt(sbi, FLUSH_MERGE); 342 set_opt(sbi, FLUSH_MERGE);
341 break; 343 break;
344 case Opt_nobarrier:
345 set_opt(sbi, NOBARRIER);
346 break;
342 default: 347 default:
343 f2fs_msg(sb, KERN_ERR, 348 f2fs_msg(sb, KERN_ERR,
344 "Unrecognized mount option \"%s\" or missing value", 349 "Unrecognized mount option \"%s\" or missing value",
@@ -544,6 +549,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
544 seq_puts(seq, ",inline_data"); 549 seq_puts(seq, ",inline_data");
545 if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) 550 if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE))
546 seq_puts(seq, ",flush_merge"); 551 seq_puts(seq, ",flush_merge");
552 if (test_opt(sbi, NOBARRIER))
553 seq_puts(seq, ",nobarrier");
547 seq_printf(seq, ",active_logs=%u", sbi->active_logs); 554 seq_printf(seq, ",active_logs=%u", sbi->active_logs);
548 555
549 return 0; 556 return 0;
@@ -615,7 +622,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
615 * Previous and new state of filesystem is RO, 622 * Previous and new state of filesystem is RO,
616 * so skip checking GC and FLUSH_MERGE conditions. 623 * so skip checking GC and FLUSH_MERGE conditions.
617 */ 624 */
618 if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) 625 if (f2fs_readonly(sb) && (*flags & MS_RDONLY))
619 goto skip; 626 goto skip;
620 627
621 /* 628 /*
@@ -642,8 +649,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
642 */ 649 */
643 if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { 650 if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
644 destroy_flush_cmd_control(sbi); 651 destroy_flush_cmd_control(sbi);
645 } else if (test_opt(sbi, FLUSH_MERGE) && 652 } else if (test_opt(sbi, FLUSH_MERGE) && !SM_I(sbi)->cmd_control_info) {
646 !sbi->sm_info->cmd_control_info) {
647 err = create_flush_cmd_control(sbi); 653 err = create_flush_cmd_control(sbi);
648 if (err) 654 if (err)
649 goto restore_gc; 655 goto restore_gc;
@@ -947,7 +953,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
947 mutex_init(&sbi->gc_mutex); 953 mutex_init(&sbi->gc_mutex);
948 mutex_init(&sbi->writepages); 954 mutex_init(&sbi->writepages);
949 mutex_init(&sbi->cp_mutex); 955 mutex_init(&sbi->cp_mutex);
950 mutex_init(&sbi->node_write); 956 init_rwsem(&sbi->node_write);
951 sbi->por_doing = false; 957 sbi->por_doing = false;
952 spin_lock_init(&sbi->stat_lock); 958 spin_lock_init(&sbi->stat_lock);
953 959
@@ -997,7 +1003,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
997 INIT_LIST_HEAD(&sbi->dir_inode_list); 1003 INIT_LIST_HEAD(&sbi->dir_inode_list);
998 spin_lock_init(&sbi->dir_inode_lock); 1004 spin_lock_init(&sbi->dir_inode_lock);
999 1005
1000 init_orphan_info(sbi); 1006 init_ino_entry_info(sbi);
1001 1007
1002 /* setup f2fs internal modules */ 1008 /* setup f2fs internal modules */
1003 err = build_segment_manager(sbi); 1009 err = build_segment_manager(sbi);
@@ -1034,8 +1040,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
1034 goto free_node_inode; 1040 goto free_node_inode;
1035 } 1041 }
1036 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 1042 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
1043 iput(root);
1037 err = -EINVAL; 1044 err = -EINVAL;
1038 goto free_root_inode; 1045 goto free_node_inode;
1039 } 1046 }
1040 1047
1041 sb->s_root = d_make_root(root); /* allocate root dentry */ 1048 sb->s_root = d_make_root(root); /* allocate root dentry */
@@ -1082,7 +1089,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
1082 * If filesystem is not mounted as read-only then 1089 * If filesystem is not mounted as read-only then
1083 * do start the gc_thread. 1090 * do start the gc_thread.
1084 */ 1091 */
1085 if (!(sb->s_flags & MS_RDONLY)) { 1092 if (!f2fs_readonly(sb)) {
1086 /* After POR, we can run background GC thread.*/ 1093 /* After POR, we can run background GC thread.*/
1087 err = start_gc_thread(sbi); 1094 err = start_gc_thread(sbi);
1088 if (err) 1095 if (err)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 72c82f69b01b..22d1c3df61ac 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -21,6 +21,7 @@
21#include <linux/rcupdate.h> 21#include <linux/rcupdate.h>
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <linux/user_namespace.h> 23#include <linux/user_namespace.h>
24#include <linux/shmem_fs.h>
24 25
25#include <asm/poll.h> 26#include <asm/poll.h>
26#include <asm/siginfo.h> 27#include <asm/siginfo.h>
@@ -336,6 +337,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
336 case F_GETPIPE_SZ: 337 case F_GETPIPE_SZ:
337 err = pipe_fcntl(filp, cmd, arg); 338 err = pipe_fcntl(filp, cmd, arg);
338 break; 339 break;
340 case F_ADD_SEALS:
341 case F_GET_SEALS:
342 err = shmem_fcntl(filp, cmd, arg);
343 break;
339 default: 344 default:
340 break; 345 break;
341 } 346 }
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
new file mode 100644
index 000000000000..9368236ca100
--- /dev/null
+++ b/fs/fs_pin.c
@@ -0,0 +1,78 @@
1#include <linux/fs.h>
2#include <linux/slab.h>
3#include <linux/fs_pin.h>
4#include "internal.h"
5#include "mount.h"
6
7static void pin_free_rcu(struct rcu_head *head)
8{
9 kfree(container_of(head, struct fs_pin, rcu));
10}
11
12static DEFINE_SPINLOCK(pin_lock);
13
14void pin_put(struct fs_pin *p)
15{
16 if (atomic_long_dec_and_test(&p->count))
17 call_rcu(&p->rcu, pin_free_rcu);
18}
19
20void pin_remove(struct fs_pin *pin)
21{
22 spin_lock(&pin_lock);
23 hlist_del(&pin->m_list);
24 hlist_del(&pin->s_list);
25 spin_unlock(&pin_lock);
26}
27
28void pin_insert(struct fs_pin *pin, struct vfsmount *m)
29{
30 spin_lock(&pin_lock);
31 hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins);
32 hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
33 spin_unlock(&pin_lock);
34}
35
36void mnt_pin_kill(struct mount *m)
37{
38 while (1) {
39 struct hlist_node *p;
40 struct fs_pin *pin;
41 rcu_read_lock();
42 p = ACCESS_ONCE(m->mnt_pins.first);
43 if (!p) {
44 rcu_read_unlock();
45 break;
46 }
47 pin = hlist_entry(p, struct fs_pin, m_list);
48 if (!atomic_long_inc_not_zero(&pin->count)) {
49 rcu_read_unlock();
50 cpu_relax();
51 continue;
52 }
53 rcu_read_unlock();
54 pin->kill(pin);
55 }
56}
57
58void sb_pin_kill(struct super_block *sb)
59{
60 while (1) {
61 struct hlist_node *p;
62 struct fs_pin *pin;
63 rcu_read_lock();
64 p = ACCESS_ONCE(sb->s_pins.first);
65 if (!p) {
66 rcu_read_unlock();
67 break;
68 }
69 pin = hlist_entry(p, struct fs_pin, s_list);
70 if (!atomic_long_inc_not_zero(&pin->count)) {
71 rcu_read_unlock();
72 cpu_relax();
73 continue;
74 }
75 rcu_read_unlock();
76 pin->kill(pin);
77 }
78}
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index a31b83c5cbd9..b39d487ccfb0 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -67,7 +67,7 @@ static int fscache_max_active_sysctl(struct ctl_table *table, int write,
67 return ret; 67 return ret;
68} 68}
69 69
70struct ctl_table fscache_sysctls[] = { 70static struct ctl_table fscache_sysctls[] = {
71 { 71 {
72 .procname = "object_max_active", 72 .procname = "object_max_active",
73 .data = &fscache_object_max_active, 73 .data = &fscache_object_max_active,
@@ -87,7 +87,7 @@ struct ctl_table fscache_sysctls[] = {
87 {} 87 {}
88}; 88};
89 89
90struct ctl_table fscache_sysctls_root[] = { 90static struct ctl_table fscache_sysctls_root[] = {
91 { 91 {
92 .procname = "fscache", 92 .procname = "fscache",
93 .mode = 0555, 93 .mode = 0555,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0c6048247a34..de1d84af9f7c 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -845,12 +845,6 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
845 return err; 845 return err;
846} 846}
847 847
848static int fuse_rename(struct inode *olddir, struct dentry *oldent,
849 struct inode *newdir, struct dentry *newent)
850{
851 return fuse_rename2(olddir, oldent, newdir, newent, 0);
852}
853
854static int fuse_link(struct dentry *entry, struct inode *newdir, 848static int fuse_link(struct dentry *entry, struct inode *newdir,
855 struct dentry *newent) 849 struct dentry *newent)
856{ 850{
@@ -2024,7 +2018,6 @@ static const struct inode_operations fuse_dir_inode_operations = {
2024 .symlink = fuse_symlink, 2018 .symlink = fuse_symlink,
2025 .unlink = fuse_unlink, 2019 .unlink = fuse_unlink,
2026 .rmdir = fuse_rmdir, 2020 .rmdir = fuse_rmdir,
2027 .rename = fuse_rename,
2028 .rename2 = fuse_rename2, 2021 .rename2 = fuse_rename2,
2029 .link = fuse_link, 2022 .link = fuse_link,
2030 .setattr = fuse_setattr, 2023 .setattr = fuse_setattr,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 40ac2628ddcf..912061ac4baf 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1303,10 +1303,10 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
1303 while (nbytes < *nbytesp && req->num_pages < req->max_pages) { 1303 while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
1304 unsigned npages; 1304 unsigned npages;
1305 size_t start; 1305 size_t start;
1306 unsigned n = req->max_pages - req->num_pages;
1307 ssize_t ret = iov_iter_get_pages(ii, 1306 ssize_t ret = iov_iter_get_pages(ii,
1308 &req->pages[req->num_pages], 1307 &req->pages[req->num_pages],
1309 n * PAGE_SIZE, &start); 1308 req->max_pages - req->num_pages,
1309 &start);
1310 if (ret < 0) 1310 if (ret < 0)
1311 return ret; 1311 return ret;
1312 1312
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 9c88da0e855a..4fcd40d6f308 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -89,6 +89,7 @@ extern int do_mknod(const char *file, int mode, unsigned int major,
89extern int link_file(const char *from, const char *to); 89extern int link_file(const char *from, const char *to);
90extern int hostfs_do_readlink(char *file, char *buf, int size); 90extern int hostfs_do_readlink(char *file, char *buf, int size);
91extern int rename_file(char *from, char *to); 91extern int rename_file(char *from, char *to);
92extern int rename2_file(char *from, char *to, unsigned int flags);
92extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, 93extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
93 long long *bfree_out, long long *bavail_out, 94 long long *bfree_out, long long *bavail_out,
94 long long *files_out, long long *ffree_out, 95 long long *files_out, long long *ffree_out,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index bb529f3b7f2b..fd62cae0fdcb 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -741,21 +741,31 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
741 return err; 741 return err;
742} 742}
743 743
744static int hostfs_rename(struct inode *from_ino, struct dentry *from, 744static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
745 struct inode *to_ino, struct dentry *to) 745 struct inode *new_dir, struct dentry *new_dentry,
746 unsigned int flags)
746{ 747{
747 char *from_name, *to_name; 748 char *old_name, *new_name;
748 int err; 749 int err;
749 750
750 if ((from_name = dentry_name(from)) == NULL) 751 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
752 return -EINVAL;
753
754 old_name = dentry_name(old_dentry);
755 if (old_name == NULL)
751 return -ENOMEM; 756 return -ENOMEM;
752 if ((to_name = dentry_name(to)) == NULL) { 757 new_name = dentry_name(new_dentry);
753 __putname(from_name); 758 if (new_name == NULL) {
759 __putname(old_name);
754 return -ENOMEM; 760 return -ENOMEM;
755 } 761 }
756 err = rename_file(from_name, to_name); 762 if (!flags)
757 __putname(from_name); 763 err = rename_file(old_name, new_name);
758 __putname(to_name); 764 else
765 err = rename2_file(old_name, new_name, flags);
766
767 __putname(old_name);
768 __putname(new_name);
759 return err; 769 return err;
760} 770}
761 771
@@ -867,7 +877,7 @@ static const struct inode_operations hostfs_dir_iops = {
867 .mkdir = hostfs_mkdir, 877 .mkdir = hostfs_mkdir,
868 .rmdir = hostfs_rmdir, 878 .rmdir = hostfs_rmdir,
869 .mknod = hostfs_mknod, 879 .mknod = hostfs_mknod,
870 .rename = hostfs_rename, 880 .rename2 = hostfs_rename2,
871 .permission = hostfs_permission, 881 .permission = hostfs_permission,
872 .setattr = hostfs_setattr, 882 .setattr = hostfs_setattr,
873}; 883};
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 67838f3aa20a..9765dab95cbd 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -14,6 +14,7 @@
14#include <sys/time.h> 14#include <sys/time.h>
15#include <sys/types.h> 15#include <sys/types.h>
16#include <sys/vfs.h> 16#include <sys/vfs.h>
17#include <sys/syscall.h>
17#include "hostfs.h" 18#include "hostfs.h"
18#include <utime.h> 19#include <utime.h>
19 20
@@ -360,6 +361,33 @@ int rename_file(char *from, char *to)
360 return 0; 361 return 0;
361} 362}
362 363
364int rename2_file(char *from, char *to, unsigned int flags)
365{
366 int err;
367
368#ifndef SYS_renameat2
369# ifdef __x86_64__
370# define SYS_renameat2 316
371# endif
372# ifdef __i386__
373# define SYS_renameat2 353
374# endif
375#endif
376
377#ifdef SYS_renameat2
378 err = syscall(SYS_renameat2, AT_FDCWD, from, AT_FDCWD, to, flags);
379 if (err < 0) {
380 if (errno != ENOSYS)
381 return -errno;
382 else
383 return -EINVAL;
384 }
385 return 0;
386#else
387 return -EINVAL;
388#endif
389}
390
363int do_statfs(char *root, long *bsize_out, long long *blocks_out, 391int do_statfs(char *root, long *bsize_out, long long *blocks_out,
364 long long *bfree_out, long long *bavail_out, 392 long long *bfree_out, long long *bavail_out,
365 long long *files_out, long long *ffree_out, 393 long long *files_out, long long *ffree_out,
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index f36fc010fccb..2923a7bd82ac 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -545,12 +545,13 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
545 struct dnode *d1; 545 struct dnode *d1;
546 struct quad_buffer_head qbh1; 546 struct quad_buffer_head qbh1;
547 if (hpfs_sb(i->i_sb)->sb_chk) 547 if (hpfs_sb(i->i_sb)->sb_chk)
548 if (up != i->i_ino) { 548 if (up != i->i_ino) {
549 hpfs_error(i->i_sb, 549 hpfs_error(i->i_sb,
550 "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx", 550 "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx",
551 dno, up, (unsigned long)i->i_ino); 551 dno, up,
552 return; 552 (unsigned long)i->i_ino);
553 } 553 return;
554 }
554 if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) { 555 if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) {
555 d1->up = cpu_to_le32(up); 556 d1->up = cpu_to_le32(up);
556 d1->root_dnode = 1; 557 d1->root_dnode = 1;
@@ -1061,8 +1062,8 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno,
1061 hpfs_brelse4(qbh); 1062 hpfs_brelse4(qbh);
1062 if (hpfs_sb(s)->sb_chk) 1063 if (hpfs_sb(s)->sb_chk)
1063 if (hpfs_stop_cycles(s, dno, &c1, &c2, "map_fnode_dirent #1")) { 1064 if (hpfs_stop_cycles(s, dno, &c1, &c2, "map_fnode_dirent #1")) {
1064 kfree(name2); 1065 kfree(name2);
1065 return NULL; 1066 return NULL;
1066 } 1067 }
1067 goto go_down; 1068 goto go_down;
1068 } 1069 }
diff --git a/fs/inode.c b/fs/inode.c
index 5938f3928944..26753ba7b6d6 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -165,6 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
165 mapping->a_ops = &empty_aops; 165 mapping->a_ops = &empty_aops;
166 mapping->host = inode; 166 mapping->host = inode;
167 mapping->flags = 0; 167 mapping->flags = 0;
168 atomic_set(&mapping->i_mmap_writable, 0);
168 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); 169 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
169 mapping->private_data = NULL; 170 mapping->private_data = NULL;
170 mapping->backing_dev_info = &default_backing_dev_info; 171 mapping->backing_dev_info = &default_backing_dev_info;
diff --git a/fs/internal.h b/fs/internal.h
index 465742407466..e325b4f9c799 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -131,7 +131,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
131/* 131/*
132 * read_write.c 132 * read_write.c
133 */ 133 */
134extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
135extern int rw_verify_area(int, struct file *, const loff_t *, size_t); 134extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
136 135
137/* 136/*
@@ -144,3 +143,9 @@ extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
144 * pipe.c 143 * pipe.c
145 */ 144 */
146extern const struct file_operations pipefifo_fops; 145extern const struct file_operations pipefifo_fops;
146
147/*
148 * fs_pin.c
149 */
150extern void sb_pin_kill(struct super_block *sb);
151extern void mnt_pin_kill(struct mount *m);
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 592e5115a561..f311bf084015 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -158,8 +158,8 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
158 "zisofs: zisofs_inflate returned" 158 "zisofs: zisofs_inflate returned"
159 " %d, inode = %lu," 159 " %d, inode = %lu,"
160 " page idx = %d, bh idx = %d," 160 " page idx = %d, bh idx = %d,"
161 " avail_in = %d," 161 " avail_in = %ld,"
162 " avail_out = %d\n", 162 " avail_out = %ld\n",
163 zerr, inode->i_ino, curpage, 163 zerr, inode->i_ino, curpage,
164 curbh, stream.avail_in, 164 curbh, stream.avail_in,
165 stream.avail_out); 165 stream.avail_out);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 009ec0b5993d..2f7a3c090489 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -202,8 +202,7 @@ struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
202 } else { 202 } else {
203 acl = ERR_PTR(rc); 203 acl = ERR_PTR(rc);
204 } 204 }
205 if (value) 205 kfree(value);
206 kfree(value);
207 if (!IS_ERR(acl)) 206 if (!IS_ERR(acl))
208 set_cached_acl(inode, type, acl); 207 set_cached_acl(inode, type, acl);
209 return acl; 208 return acl;
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index 0b9a1e44e833..5698dae5d92d 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -94,11 +94,12 @@ static int jffs2_zlib_compress(unsigned char *data_in,
94 94
95 while (def_strm.total_out < *dstlen - STREAM_END_SPACE && def_strm.total_in < *sourcelen) { 95 while (def_strm.total_out < *dstlen - STREAM_END_SPACE && def_strm.total_in < *sourcelen) {
96 def_strm.avail_out = *dstlen - (def_strm.total_out + STREAM_END_SPACE); 96 def_strm.avail_out = *dstlen - (def_strm.total_out + STREAM_END_SPACE);
97 def_strm.avail_in = min((unsigned)(*sourcelen-def_strm.total_in), def_strm.avail_out); 97 def_strm.avail_in = min_t(unsigned long,
98 jffs2_dbg(1, "calling deflate with avail_in %d, avail_out %d\n", 98 (*sourcelen-def_strm.total_in), def_strm.avail_out);
99 jffs2_dbg(1, "calling deflate with avail_in %ld, avail_out %ld\n",
99 def_strm.avail_in, def_strm.avail_out); 100 def_strm.avail_in, def_strm.avail_out);
100 ret = zlib_deflate(&def_strm, Z_PARTIAL_FLUSH); 101 ret = zlib_deflate(&def_strm, Z_PARTIAL_FLUSH);
101 jffs2_dbg(1, "deflate returned with avail_in %d, avail_out %d, total_in %ld, total_out %ld\n", 102 jffs2_dbg(1, "deflate returned with avail_in %ld, avail_out %ld, total_in %ld, total_out %ld\n",
102 def_strm.avail_in, def_strm.avail_out, 103 def_strm.avail_in, def_strm.avail_out,
103 def_strm.total_in, def_strm.total_out); 104 def_strm.total_in, def_strm.total_out);
104 if (ret != Z_OK) { 105 if (ret != Z_OK) {
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index ad0f2e2a1700..d72817ac51f6 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -756,8 +756,7 @@ void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c)
756 for (i=0; i < XATTRINDEX_HASHSIZE; i++) { 756 for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
757 list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) { 757 list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
758 list_del(&xd->xindex); 758 list_del(&xd->xindex);
759 if (xd->xname) 759 kfree(xd->xname);
760 kfree(xd->xname);
761 jffs2_free_xattr_datum(xd); 760 jffs2_free_xattr_datum(xd);
762 } 761 }
763 } 762 }
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index d895b4b7b661..4429d6d9217f 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -896,7 +896,7 @@ const struct file_operations kernfs_file_fops = {
896 * @ops: kernfs operations for the file 896 * @ops: kernfs operations for the file
897 * @priv: private data for the file 897 * @priv: private data for the file
898 * @ns: optional namespace tag of the file 898 * @ns: optional namespace tag of the file
899 * @static_name: don't copy file name 899 * @name_is_static: don't copy file name
900 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep 900 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
901 * 901 *
902 * Returns the created node on success, ERR_PTR() value on error. 902 * Returns the created node on success, ERR_PTR() value on error.
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 1812f026960c..daa8e7514eae 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -306,11 +306,9 @@ static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
306static void nsm_init_private(struct nsm_handle *nsm) 306static void nsm_init_private(struct nsm_handle *nsm)
307{ 307{
308 u64 *p = (u64 *)&nsm->sm_priv.data; 308 u64 *p = (u64 *)&nsm->sm_priv.data;
309 struct timespec ts;
310 s64 ns; 309 s64 ns;
311 310
312 ktime_get_ts(&ts); 311 ns = ktime_get_ns();
313 ns = timespec_to_ns(&ts);
314 put_unaligned(ns, p); 312 put_unaligned(ns, p);
315 put_unaligned((unsigned long)nsm, p + 1); 313 put_unaligned((unsigned long)nsm, p + 1);
316} 314}
diff --git a/fs/locks.c b/fs/locks.c
index a6f54802d277..cb66fb05ad4a 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -247,6 +247,18 @@ void locks_free_lock(struct file_lock *fl)
247} 247}
248EXPORT_SYMBOL(locks_free_lock); 248EXPORT_SYMBOL(locks_free_lock);
249 249
250static void
251locks_dispose_list(struct list_head *dispose)
252{
253 struct file_lock *fl;
254
255 while (!list_empty(dispose)) {
256 fl = list_first_entry(dispose, struct file_lock, fl_block);
257 list_del_init(&fl->fl_block);
258 locks_free_lock(fl);
259 }
260}
261
250void locks_init_lock(struct file_lock *fl) 262void locks_init_lock(struct file_lock *fl)
251{ 263{
252 memset(fl, 0, sizeof(struct file_lock)); 264 memset(fl, 0, sizeof(struct file_lock));
@@ -285,7 +297,8 @@ EXPORT_SYMBOL(__locks_copy_lock);
285 297
286void locks_copy_lock(struct file_lock *new, struct file_lock *fl) 298void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
287{ 299{
288 locks_release_private(new); 300 /* "new" must be a freshly-initialized lock */
301 WARN_ON_ONCE(new->fl_ops);
289 302
290 __locks_copy_lock(new, fl); 303 __locks_copy_lock(new, fl);
291 new->fl_file = fl->fl_file; 304 new->fl_file = fl->fl_file;
@@ -650,12 +663,16 @@ static void locks_unlink_lock(struct file_lock **thisfl_p)
650 * 663 *
651 * Must be called with i_lock held! 664 * Must be called with i_lock held!
652 */ 665 */
653static void locks_delete_lock(struct file_lock **thisfl_p) 666static void locks_delete_lock(struct file_lock **thisfl_p,
667 struct list_head *dispose)
654{ 668{
655 struct file_lock *fl = *thisfl_p; 669 struct file_lock *fl = *thisfl_p;
656 670
657 locks_unlink_lock(thisfl_p); 671 locks_unlink_lock(thisfl_p);
658 locks_free_lock(fl); 672 if (dispose)
673 list_add(&fl->fl_block, dispose);
674 else
675 locks_free_lock(fl);
659} 676}
660 677
661/* Determine if lock sys_fl blocks lock caller_fl. Common functionality 678/* Determine if lock sys_fl blocks lock caller_fl. Common functionality
@@ -811,6 +828,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
811 struct inode * inode = file_inode(filp); 828 struct inode * inode = file_inode(filp);
812 int error = 0; 829 int error = 0;
813 int found = 0; 830 int found = 0;
831 LIST_HEAD(dispose);
814 832
815 if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) { 833 if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
816 new_fl = locks_alloc_lock(); 834 new_fl = locks_alloc_lock();
@@ -833,7 +851,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
833 if (request->fl_type == fl->fl_type) 851 if (request->fl_type == fl->fl_type)
834 goto out; 852 goto out;
835 found = 1; 853 found = 1;
836 locks_delete_lock(before); 854 locks_delete_lock(before, &dispose);
837 break; 855 break;
838 } 856 }
839 857
@@ -880,6 +898,7 @@ out:
880 spin_unlock(&inode->i_lock); 898 spin_unlock(&inode->i_lock);
881 if (new_fl) 899 if (new_fl)
882 locks_free_lock(new_fl); 900 locks_free_lock(new_fl);
901 locks_dispose_list(&dispose);
883 return error; 902 return error;
884} 903}
885 904
@@ -893,6 +912,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
893 struct file_lock **before; 912 struct file_lock **before;
894 int error; 913 int error;
895 bool added = false; 914 bool added = false;
915 LIST_HEAD(dispose);
896 916
897 /* 917 /*
898 * We may need two file_lock structures for this operation, 918 * We may need two file_lock structures for this operation,
@@ -988,7 +1008,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
988 else 1008 else
989 request->fl_end = fl->fl_end; 1009 request->fl_end = fl->fl_end;
990 if (added) { 1010 if (added) {
991 locks_delete_lock(before); 1011 locks_delete_lock(before, &dispose);
992 continue; 1012 continue;
993 } 1013 }
994 request = fl; 1014 request = fl;
@@ -1018,21 +1038,24 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1018 * one (This may happen several times). 1038 * one (This may happen several times).
1019 */ 1039 */
1020 if (added) { 1040 if (added) {
1021 locks_delete_lock(before); 1041 locks_delete_lock(before, &dispose);
1022 continue; 1042 continue;
1023 } 1043 }
1024 /* Replace the old lock with the new one. 1044 /*
1025 * Wake up anybody waiting for the old one, 1045 * Replace the old lock with new_fl, and
1026 * as the change in lock type might satisfy 1046 * remove the old one. It's safe to do the
1027 * their needs. 1047 * insert here since we know that we won't be
1048 * using new_fl later, and that the lock is
1049 * just replacing an existing lock.
1028 */ 1050 */
1029 locks_wake_up_blocks(fl); 1051 error = -ENOLCK;
1030 fl->fl_start = request->fl_start; 1052 if (!new_fl)
1031 fl->fl_end = request->fl_end; 1053 goto out;
1032 fl->fl_type = request->fl_type; 1054 locks_copy_lock(new_fl, request);
1033 locks_release_private(fl); 1055 request = new_fl;
1034 locks_copy_private(fl, request); 1056 new_fl = NULL;
1035 request = fl; 1057 locks_delete_lock(before, &dispose);
1058 locks_insert_lock(before, request);
1036 added = true; 1059 added = true;
1037 } 1060 }
1038 } 1061 }
@@ -1093,6 +1116,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1093 locks_free_lock(new_fl); 1116 locks_free_lock(new_fl);
1094 if (new_fl2) 1117 if (new_fl2)
1095 locks_free_lock(new_fl2); 1118 locks_free_lock(new_fl2);
1119 locks_dispose_list(&dispose);
1096 return error; 1120 return error;
1097} 1121}
1098 1122
@@ -1268,7 +1292,7 @@ int lease_modify(struct file_lock **before, int arg)
1268 printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); 1292 printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
1269 fl->fl_fasync = NULL; 1293 fl->fl_fasync = NULL;
1270 } 1294 }
1271 locks_delete_lock(before); 1295 locks_delete_lock(before, NULL);
1272 } 1296 }
1273 return 0; 1297 return 0;
1274} 1298}
@@ -1737,13 +1761,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1737 ret = fl; 1761 ret = fl;
1738 spin_lock(&inode->i_lock); 1762 spin_lock(&inode->i_lock);
1739 error = __vfs_setlease(filp, arg, &ret); 1763 error = __vfs_setlease(filp, arg, &ret);
1740 if (error) { 1764 if (error)
1741 spin_unlock(&inode->i_lock); 1765 goto out_unlock;
1742 locks_free_lock(fl); 1766 if (ret == fl)
1743 goto out_free_fasync; 1767 fl = NULL;
1744 }
1745 if (ret != fl)
1746 locks_free_lock(fl);
1747 1768
1748 /* 1769 /*
1749 * fasync_insert_entry() returns the old entry if any. 1770 * fasync_insert_entry() returns the old entry if any.
@@ -1755,9 +1776,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1755 new = NULL; 1776 new = NULL;
1756 1777
1757 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); 1778 error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
1779out_unlock:
1758 spin_unlock(&inode->i_lock); 1780 spin_unlock(&inode->i_lock);
1759 1781 if (fl)
1760out_free_fasync: 1782 locks_free_lock(fl);
1761 if (new) 1783 if (new)
1762 fasync_free(new); 1784 fasync_free(new);
1763 return error; 1785 return error;
@@ -2320,6 +2342,7 @@ void locks_remove_file(struct file *filp)
2320 struct inode * inode = file_inode(filp); 2342 struct inode * inode = file_inode(filp);
2321 struct file_lock *fl; 2343 struct file_lock *fl;
2322 struct file_lock **before; 2344 struct file_lock **before;
2345 LIST_HEAD(dispose);
2323 2346
2324 if (!inode->i_flock) 2347 if (!inode->i_flock)
2325 return; 2348 return;
@@ -2365,12 +2388,13 @@ void locks_remove_file(struct file *filp)
2365 fl->fl_type, fl->fl_flags, 2388 fl->fl_type, fl->fl_flags,
2366 fl->fl_start, fl->fl_end); 2389 fl->fl_start, fl->fl_end);
2367 2390
2368 locks_delete_lock(before); 2391 locks_delete_lock(before, &dispose);
2369 continue; 2392 continue;
2370 } 2393 }
2371 before = &fl->fl_next; 2394 before = &fl->fl_next;
2372 } 2395 }
2373 spin_unlock(&inode->i_lock); 2396 spin_unlock(&inode->i_lock);
2397 locks_dispose_list(&dispose);
2374} 2398}
2375 2399
2376/** 2400/**
@@ -2452,7 +2476,11 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
2452 seq_puts(f, "FLOCK ADVISORY "); 2476 seq_puts(f, "FLOCK ADVISORY ");
2453 } 2477 }
2454 } else if (IS_LEASE(fl)) { 2478 } else if (IS_LEASE(fl)) {
2455 seq_puts(f, "LEASE "); 2479 if (fl->fl_flags & FL_DELEG)
2480 seq_puts(f, "DELEG ");
2481 else
2482 seq_puts(f, "LEASE ");
2483
2456 if (lease_breaking(fl)) 2484 if (lease_breaking(fl))
2457 seq_puts(f, "BREAKING "); 2485 seq_puts(f, "BREAKING ");
2458 else if (fl->fl_file) 2486 else if (fl->fl_file)
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 48140315f627..380d86e1ab45 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1019,11 +1019,11 @@ static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
1019/** 1019/**
1020 * logfs_is_valid_block - check whether this block is still valid 1020 * logfs_is_valid_block - check whether this block is still valid
1021 * 1021 *
1022 * @sb - superblock 1022 * @sb: superblock
1023 * @ofs - block physical offset 1023 * @ofs: block physical offset
1024 * @ino - block inode number 1024 * @ino: block inode number
1025 * @bix - block index 1025 * @bix: block index
1026 * @level - block level 1026 * @gc_level: block level
1027 * 1027 *
1028 * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will 1028 * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
1029 * become invalid once the journal is written. 1029 * become invalid once the journal is written.
@@ -2226,10 +2226,9 @@ void btree_write_block(struct logfs_block *block)
2226 * 2226 *
2227 * @inode: parent inode (ifile or directory) 2227 * @inode: parent inode (ifile or directory)
2228 * @buf: object to write (inode or dentry) 2228 * @buf: object to write (inode or dentry)
2229 * @n: object size 2229 * @count: object size
2230 * @_pos: object number (file position in blocks/objects) 2230 * @bix: block index
2231 * @flags: write flags 2231 * @flags: write flags
2232 * @lock: 0 if write lock is already taken, 1 otherwise
2233 * @shadow_tree: shadow below this inode 2232 * @shadow_tree: shadow below this inode
2234 * 2233 *
2235 * FIXME: All caller of this put a 200-300 byte variable on the stack, 2234 * FIXME: All caller of this put a 200-300 byte variable on the stack,
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 4bc50dac8e97..742942a983be 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -96,7 +96,7 @@ int minix_new_block(struct inode * inode)
96unsigned long minix_count_free_blocks(struct super_block *sb) 96unsigned long minix_count_free_blocks(struct super_block *sb)
97{ 97{
98 struct minix_sb_info *sbi = minix_sb(sb); 98 struct minix_sb_info *sbi = minix_sb(sb);
99 u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1); 99 u32 bits = sbi->s_nzones - sbi->s_firstdatazone + 1;
100 100
101 return (count_free(sbi->s_zmap, sb->s_blocksize, bits) 101 return (count_free(sbi->s_zmap, sb->s_blocksize, bits)
102 << sbi->s_log_zone_size); 102 << sbi->s_log_zone_size);
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index f007a3355570..3f57af196a7d 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -267,12 +267,12 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
267 block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize); 267 block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize);
268 if (sbi->s_imap_blocks < block) { 268 if (sbi->s_imap_blocks < block) {
269 printk("MINIX-fs: file system does not have enough " 269 printk("MINIX-fs: file system does not have enough "
270 "imap blocks allocated. Refusing to mount\n"); 270 "imap blocks allocated. Refusing to mount.\n");
271 goto out_no_bitmap; 271 goto out_no_bitmap;
272 } 272 }
273 273
274 block = minix_blocks_needed( 274 block = minix_blocks_needed(
275 (sbi->s_nzones - (sbi->s_firstdatazone + 1)), 275 (sbi->s_nzones - sbi->s_firstdatazone + 1),
276 s->s_blocksize); 276 s->s_blocksize);
277 if (sbi->s_zmap_blocks < block) { 277 if (sbi->s_zmap_blocks < block) {
278 printk("MINIX-fs: file system does not have enough " 278 printk("MINIX-fs: file system does not have enough "
diff --git a/fs/mount.h b/fs/mount.h
index d55297f2fa05..6740a6215529 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -55,7 +55,7 @@ struct mount {
55 int mnt_id; /* mount identifier */ 55 int mnt_id; /* mount identifier */
56 int mnt_group_id; /* peer group identifier */ 56 int mnt_group_id; /* peer group identifier */
57 int mnt_expiry_mark; /* true if marked for expiry */ 57 int mnt_expiry_mark; /* true if marked for expiry */
58 int mnt_pinned; 58 struct hlist_head mnt_pins;
59 struct path mnt_ex_mountpoint; 59 struct path mnt_ex_mountpoint;
60}; 60};
61 61
diff --git a/fs/namei.c b/fs/namei.c
index 9eb787e5c167..a996bb48dfab 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1091,10 +1091,10 @@ int follow_down_one(struct path *path)
1091} 1091}
1092EXPORT_SYMBOL(follow_down_one); 1092EXPORT_SYMBOL(follow_down_one);
1093 1093
1094static inline bool managed_dentry_might_block(struct dentry *dentry) 1094static inline int managed_dentry_rcu(struct dentry *dentry)
1095{ 1095{
1096 return (dentry->d_flags & DCACHE_MANAGE_TRANSIT && 1096 return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
1097 dentry->d_op->d_manage(dentry, true) < 0); 1097 dentry->d_op->d_manage(dentry, true) : 0;
1098} 1098}
1099 1099
1100/* 1100/*
@@ -1110,11 +1110,18 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1110 * Don't forget we might have a non-mountpoint managed dentry 1110 * Don't forget we might have a non-mountpoint managed dentry
1111 * that wants to block transit. 1111 * that wants to block transit.
1112 */ 1112 */
1113 if (unlikely(managed_dentry_might_block(path->dentry))) 1113 switch (managed_dentry_rcu(path->dentry)) {
1114 case -ECHILD:
1115 default:
1114 return false; 1116 return false;
1117 case -EISDIR:
1118 return true;
1119 case 0:
1120 break;
1121 }
1115 1122
1116 if (!d_mountpoint(path->dentry)) 1123 if (!d_mountpoint(path->dentry))
1117 return true; 1124 return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1118 1125
1119 mounted = __lookup_mnt(path->mnt, path->dentry); 1126 mounted = __lookup_mnt(path->mnt, path->dentry);
1120 if (!mounted) 1127 if (!mounted)
@@ -1130,7 +1137,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1130 */ 1137 */
1131 *inode = path->dentry->d_inode; 1138 *inode = path->dentry->d_inode;
1132 } 1139 }
1133 return read_seqretry(&mount_lock, nd->m_seq); 1140 return read_seqretry(&mount_lock, nd->m_seq) &&
1141 !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1134} 1142}
1135 1143
1136static int follow_dotdot_rcu(struct nameidata *nd) 1144static int follow_dotdot_rcu(struct nameidata *nd)
@@ -1402,11 +1410,8 @@ static int lookup_fast(struct nameidata *nd,
1402 } 1410 }
1403 path->mnt = mnt; 1411 path->mnt = mnt;
1404 path->dentry = dentry; 1412 path->dentry = dentry;
1405 if (unlikely(!__follow_mount_rcu(nd, path, inode))) 1413 if (likely(__follow_mount_rcu(nd, path, inode)))
1406 goto unlazy; 1414 return 0;
1407 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
1408 goto unlazy;
1409 return 0;
1410unlazy: 1415unlazy:
1411 if (unlazy_walk(nd, dentry)) 1416 if (unlazy_walk(nd, dentry))
1412 return -ECHILD; 1417 return -ECHILD;
@@ -4019,7 +4024,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
4019 * The worst of all namespace operations - renaming directory. "Perverted" 4024 * The worst of all namespace operations - renaming directory. "Perverted"
4020 * doesn't even start to describe it. Somebody in UCB had a heck of a trip... 4025 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
4021 * Problems: 4026 * Problems:
4022 * a) we can get into loop creation. Check is done in is_subdir(). 4027 * a) we can get into loop creation.
4023 * b) race potential - two innocent renames can create a loop together. 4028 * b) race potential - two innocent renames can create a loop together.
4024 * That's where 4.4 screws up. Current fix: serialization on 4029 * That's where 4.4 screws up. Current fix: serialization on
4025 * sb->s_vfs_rename_mutex. We might be more accurate, but that's another 4030 * sb->s_vfs_rename_mutex. We might be more accurate, but that's another
@@ -4075,7 +4080,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4075 if (error) 4080 if (error)
4076 return error; 4081 return error;
4077 4082
4078 if (!old_dir->i_op->rename) 4083 if (!old_dir->i_op->rename && !old_dir->i_op->rename2)
4079 return -EPERM; 4084 return -EPERM;
4080 4085
4081 if (flags && !old_dir->i_op->rename2) 4086 if (flags && !old_dir->i_op->rename2)
@@ -4134,10 +4139,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4134 if (error) 4139 if (error)
4135 goto out; 4140 goto out;
4136 } 4141 }
4137 if (!flags) { 4142 if (!old_dir->i_op->rename2) {
4138 error = old_dir->i_op->rename(old_dir, old_dentry, 4143 error = old_dir->i_op->rename(old_dir, old_dentry,
4139 new_dir, new_dentry); 4144 new_dir, new_dentry);
4140 } else { 4145 } else {
4146 WARN_ON(old_dir->i_op->rename != NULL);
4141 error = old_dir->i_op->rename2(old_dir, old_dentry, 4147 error = old_dir->i_op->rename2(old_dir, old_dentry,
4142 new_dir, new_dentry, flags); 4148 new_dir, new_dentry, flags);
4143 } 4149 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 182bc41cd887..a01c7730e9af 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -16,7 +16,6 @@
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/security.h> 17#include <linux/security.h>
18#include <linux/idr.h> 18#include <linux/idr.h>
19#include <linux/acct.h> /* acct_auto_close_mnt */
20#include <linux/init.h> /* init_rootfs */ 19#include <linux/init.h> /* init_rootfs */
21#include <linux/fs_struct.h> /* get_fs_root et.al. */ 20#include <linux/fs_struct.h> /* get_fs_root et.al. */
22#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ 21#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
@@ -779,6 +778,20 @@ static void attach_mnt(struct mount *mnt,
779 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); 778 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
780} 779}
781 780
781static void attach_shadowed(struct mount *mnt,
782 struct mount *parent,
783 struct mount *shadows)
784{
785 if (shadows) {
786 hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
787 list_add(&mnt->mnt_child, &shadows->mnt_child);
788 } else {
789 hlist_add_head_rcu(&mnt->mnt_hash,
790 m_hash(&parent->mnt, mnt->mnt_mountpoint));
791 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
792 }
793}
794
782/* 795/*
783 * vfsmount lock must be held for write 796 * vfsmount lock must be held for write
784 */ 797 */
@@ -797,12 +810,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
797 810
798 list_splice(&head, n->list.prev); 811 list_splice(&head, n->list.prev);
799 812
800 if (shadows) 813 attach_shadowed(mnt, parent, shadows);
801 hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash);
802 else
803 hlist_add_head_rcu(&mnt->mnt_hash,
804 m_hash(&parent->mnt, mnt->mnt_mountpoint));
805 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
806 touch_mnt_namespace(n); 814 touch_mnt_namespace(n);
807} 815}
808 816
@@ -890,8 +898,21 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
890 898
891 mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); 899 mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
892 /* Don't allow unprivileged users to change mount flags */ 900 /* Don't allow unprivileged users to change mount flags */
893 if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) 901 if (flag & CL_UNPRIVILEGED) {
894 mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; 902 mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
903
904 if (mnt->mnt.mnt_flags & MNT_READONLY)
905 mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
906
907 if (mnt->mnt.mnt_flags & MNT_NODEV)
908 mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;
909
910 if (mnt->mnt.mnt_flags & MNT_NOSUID)
911 mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;
912
913 if (mnt->mnt.mnt_flags & MNT_NOEXEC)
914 mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
915 }
895 916
896 /* Don't allow unprivileged users to reveal what is under a mount */ 917 /* Don't allow unprivileged users to reveal what is under a mount */
897 if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) 918 if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
@@ -938,7 +959,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
938 959
939static void mntput_no_expire(struct mount *mnt) 960static void mntput_no_expire(struct mount *mnt)
940{ 961{
941put_again:
942 rcu_read_lock(); 962 rcu_read_lock();
943 mnt_add_count(mnt, -1); 963 mnt_add_count(mnt, -1);
944 if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ 964 if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
@@ -951,14 +971,6 @@ put_again:
951 unlock_mount_hash(); 971 unlock_mount_hash();
952 return; 972 return;
953 } 973 }
954 if (unlikely(mnt->mnt_pinned)) {
955 mnt_add_count(mnt, mnt->mnt_pinned + 1);
956 mnt->mnt_pinned = 0;
957 rcu_read_unlock();
958 unlock_mount_hash();
959 acct_auto_close_mnt(&mnt->mnt);
960 goto put_again;
961 }
962 if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { 974 if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
963 rcu_read_unlock(); 975 rcu_read_unlock();
964 unlock_mount_hash(); 976 unlock_mount_hash();
@@ -981,6 +993,8 @@ put_again:
981 * so mnt_get_writers() below is safe. 993 * so mnt_get_writers() below is safe.
982 */ 994 */
983 WARN_ON(mnt_get_writers(mnt)); 995 WARN_ON(mnt_get_writers(mnt));
996 if (unlikely(mnt->mnt_pins.first))
997 mnt_pin_kill(mnt);
984 fsnotify_vfsmount_delete(&mnt->mnt); 998 fsnotify_vfsmount_delete(&mnt->mnt);
985 dput(mnt->mnt.mnt_root); 999 dput(mnt->mnt.mnt_root);
986 deactivate_super(mnt->mnt.mnt_sb); 1000 deactivate_super(mnt->mnt.mnt_sb);
@@ -1008,25 +1022,15 @@ struct vfsmount *mntget(struct vfsmount *mnt)
1008} 1022}
1009EXPORT_SYMBOL(mntget); 1023EXPORT_SYMBOL(mntget);
1010 1024
1011void mnt_pin(struct vfsmount *mnt) 1025struct vfsmount *mnt_clone_internal(struct path *path)
1012{
1013 lock_mount_hash();
1014 real_mount(mnt)->mnt_pinned++;
1015 unlock_mount_hash();
1016}
1017EXPORT_SYMBOL(mnt_pin);
1018
1019void mnt_unpin(struct vfsmount *m)
1020{ 1026{
1021 struct mount *mnt = real_mount(m); 1027 struct mount *p;
1022 lock_mount_hash(); 1028 p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
1023 if (mnt->mnt_pinned) { 1029 if (IS_ERR(p))
1024 mnt_add_count(mnt, 1); 1030 return ERR_CAST(p);
1025 mnt->mnt_pinned--; 1031 p->mnt.mnt_flags |= MNT_INTERNAL;
1026 } 1032 return &p->mnt;
1027 unlock_mount_hash();
1028} 1033}
1029EXPORT_SYMBOL(mnt_unpin);
1030 1034
1031static inline void mangle(struct seq_file *m, const char *s) 1035static inline void mangle(struct seq_file *m, const char *s)
1032{ 1036{
@@ -1492,6 +1496,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1492 continue; 1496 continue;
1493 1497
1494 for (s = r; s; s = next_mnt(s, r)) { 1498 for (s = r; s; s = next_mnt(s, r)) {
1499 struct mount *t = NULL;
1495 if (!(flag & CL_COPY_UNBINDABLE) && 1500 if (!(flag & CL_COPY_UNBINDABLE) &&
1496 IS_MNT_UNBINDABLE(s)) { 1501 IS_MNT_UNBINDABLE(s)) {
1497 s = skip_mnt_tree(s); 1502 s = skip_mnt_tree(s);
@@ -1513,7 +1518,14 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1513 goto out; 1518 goto out;
1514 lock_mount_hash(); 1519 lock_mount_hash();
1515 list_add_tail(&q->mnt_list, &res->mnt_list); 1520 list_add_tail(&q->mnt_list, &res->mnt_list);
1516 attach_mnt(q, parent, p->mnt_mp); 1521 mnt_set_mountpoint(parent, p->mnt_mp, q);
1522 if (!list_empty(&parent->mnt_mounts)) {
1523 t = list_last_entry(&parent->mnt_mounts,
1524 struct mount, mnt_child);
1525 if (t->mnt_mp != p->mnt_mp)
1526 t = NULL;
1527 }
1528 attach_shadowed(q, parent, t);
1517 unlock_mount_hash(); 1529 unlock_mount_hash();
1518 } 1530 }
1519 } 1531 }
@@ -1896,9 +1908,6 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
1896 if (readonly_request == __mnt_is_readonly(mnt)) 1908 if (readonly_request == __mnt_is_readonly(mnt))
1897 return 0; 1909 return 0;
1898 1910
1899 if (mnt->mnt_flags & MNT_LOCK_READONLY)
1900 return -EPERM;
1901
1902 if (readonly_request) 1911 if (readonly_request)
1903 error = mnt_make_readonly(real_mount(mnt)); 1912 error = mnt_make_readonly(real_mount(mnt));
1904 else 1913 else
@@ -1924,6 +1933,33 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1924 if (path->dentry != path->mnt->mnt_root) 1933 if (path->dentry != path->mnt->mnt_root)
1925 return -EINVAL; 1934 return -EINVAL;
1926 1935
1936 /* Don't allow changing of locked mnt flags.
1937 *
1938 * No locks need to be held here while testing the various
1939 * MNT_LOCK flags because those flags can never be cleared
1940 * once they are set.
1941 */
1942 if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
1943 !(mnt_flags & MNT_READONLY)) {
1944 return -EPERM;
1945 }
1946 if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
1947 !(mnt_flags & MNT_NODEV)) {
1948 return -EPERM;
1949 }
1950 if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
1951 !(mnt_flags & MNT_NOSUID)) {
1952 return -EPERM;
1953 }
1954 if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
1955 !(mnt_flags & MNT_NOEXEC)) {
1956 return -EPERM;
1957 }
1958 if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
1959 ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
1960 return -EPERM;
1961 }
1962
1927 err = security_sb_remount(sb, data); 1963 err = security_sb_remount(sb, data);
1928 if (err) 1964 if (err)
1929 return err; 1965 return err;
@@ -1937,7 +1973,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1937 err = do_remount_sb(sb, flags, data, 0); 1973 err = do_remount_sb(sb, flags, data, 0);
1938 if (!err) { 1974 if (!err) {
1939 lock_mount_hash(); 1975 lock_mount_hash();
1940 mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK; 1976 mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
1941 mnt->mnt.mnt_flags = mnt_flags; 1977 mnt->mnt.mnt_flags = mnt_flags;
1942 touch_mnt_namespace(mnt->mnt_ns); 1978 touch_mnt_namespace(mnt->mnt_ns);
1943 unlock_mount_hash(); 1979 unlock_mount_hash();
@@ -2122,7 +2158,7 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
2122 */ 2158 */
2123 if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { 2159 if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
2124 flags |= MS_NODEV; 2160 flags |= MS_NODEV;
2125 mnt_flags |= MNT_NODEV; 2161 mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
2126 } 2162 }
2127 } 2163 }
2128 2164
@@ -2436,6 +2472,14 @@ long do_mount(const char *dev_name, const char *dir_name,
2436 if (flags & MS_RDONLY) 2472 if (flags & MS_RDONLY)
2437 mnt_flags |= MNT_READONLY; 2473 mnt_flags |= MNT_READONLY;
2438 2474
2475 /* The default atime for remount is preservation */
2476 if ((flags & MS_REMOUNT) &&
2477 ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
2478 MS_STRICTATIME)) == 0)) {
2479 mnt_flags &= ~MNT_ATIME_MASK;
2480 mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
2481 }
2482
2439 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | 2483 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
2440 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | 2484 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
2441 MS_STRICTATIME); 2485 MS_STRICTATIME);
@@ -2972,13 +3016,13 @@ static void *mntns_get(struct task_struct *task)
2972 struct mnt_namespace *ns = NULL; 3016 struct mnt_namespace *ns = NULL;
2973 struct nsproxy *nsproxy; 3017 struct nsproxy *nsproxy;
2974 3018
2975 rcu_read_lock(); 3019 task_lock(task);
2976 nsproxy = task_nsproxy(task); 3020 nsproxy = task->nsproxy;
2977 if (nsproxy) { 3021 if (nsproxy) {
2978 ns = nsproxy->mnt_ns; 3022 ns = nsproxy->mnt_ns;
2979 get_mnt_ns(ns); 3023 get_mnt_ns(ns);
2980 } 3024 }
2981 rcu_read_unlock(); 3025 task_unlock(task);
2982 3026
2983 return ns; 3027 return ns;
2984} 3028}
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 9b431f44fad9..cbb1797149d5 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -210,8 +210,7 @@ static void bl_end_io_read(struct bio *bio, int err)
210 SetPageUptodate(bvec->bv_page); 210 SetPageUptodate(bvec->bv_page);
211 211
212 if (err) { 212 if (err) {
213 struct nfs_pgio_data *rdata = par->data; 213 struct nfs_pgio_header *header = par->data;
214 struct nfs_pgio_header *header = rdata->header;
215 214
216 if (!header->pnfs_error) 215 if (!header->pnfs_error)
217 header->pnfs_error = -EIO; 216 header->pnfs_error = -EIO;
@@ -224,43 +223,44 @@ static void bl_end_io_read(struct bio *bio, int err)
224static void bl_read_cleanup(struct work_struct *work) 223static void bl_read_cleanup(struct work_struct *work)
225{ 224{
226 struct rpc_task *task; 225 struct rpc_task *task;
227 struct nfs_pgio_data *rdata; 226 struct nfs_pgio_header *hdr;
228 dprintk("%s enter\n", __func__); 227 dprintk("%s enter\n", __func__);
229 task = container_of(work, struct rpc_task, u.tk_work); 228 task = container_of(work, struct rpc_task, u.tk_work);
230 rdata = container_of(task, struct nfs_pgio_data, task); 229 hdr = container_of(task, struct nfs_pgio_header, task);
231 pnfs_ld_read_done(rdata); 230 pnfs_ld_read_done(hdr);
232} 231}
233 232
234static void 233static void
235bl_end_par_io_read(void *data, int unused) 234bl_end_par_io_read(void *data, int unused)
236{ 235{
237 struct nfs_pgio_data *rdata = data; 236 struct nfs_pgio_header *hdr = data;
238 237
239 rdata->task.tk_status = rdata->header->pnfs_error; 238 hdr->task.tk_status = hdr->pnfs_error;
240 INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); 239 INIT_WORK(&hdr->task.u.tk_work, bl_read_cleanup);
241 schedule_work(&rdata->task.u.tk_work); 240 schedule_work(&hdr->task.u.tk_work);
242} 241}
243 242
244static enum pnfs_try_status 243static enum pnfs_try_status
245bl_read_pagelist(struct nfs_pgio_data *rdata) 244bl_read_pagelist(struct nfs_pgio_header *hdr)
246{ 245{
247 struct nfs_pgio_header *header = rdata->header; 246 struct nfs_pgio_header *header = hdr;
248 int i, hole; 247 int i, hole;
249 struct bio *bio = NULL; 248 struct bio *bio = NULL;
250 struct pnfs_block_extent *be = NULL, *cow_read = NULL; 249 struct pnfs_block_extent *be = NULL, *cow_read = NULL;
251 sector_t isect, extent_length = 0; 250 sector_t isect, extent_length = 0;
252 struct parallel_io *par; 251 struct parallel_io *par;
253 loff_t f_offset = rdata->args.offset; 252 loff_t f_offset = hdr->args.offset;
254 size_t bytes_left = rdata->args.count; 253 size_t bytes_left = hdr->args.count;
255 unsigned int pg_offset, pg_len; 254 unsigned int pg_offset, pg_len;
256 struct page **pages = rdata->args.pages; 255 struct page **pages = hdr->args.pages;
257 int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; 256 int pg_index = hdr->args.pgbase >> PAGE_CACHE_SHIFT;
258 const bool is_dio = (header->dreq != NULL); 257 const bool is_dio = (header->dreq != NULL);
259 258
260 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, 259 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
261 rdata->pages.npages, f_offset, (unsigned int)rdata->args.count); 260 hdr->page_array.npages, f_offset,
261 (unsigned int)hdr->args.count);
262 262
263 par = alloc_parallel(rdata); 263 par = alloc_parallel(hdr);
264 if (!par) 264 if (!par)
265 goto use_mds; 265 goto use_mds;
266 par->pnfs_callback = bl_end_par_io_read; 266 par->pnfs_callback = bl_end_par_io_read;
@@ -268,7 +268,7 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
268 268
269 isect = (sector_t) (f_offset >> SECTOR_SHIFT); 269 isect = (sector_t) (f_offset >> SECTOR_SHIFT);
270 /* Code assumes extents are page-aligned */ 270 /* Code assumes extents are page-aligned */
271 for (i = pg_index; i < rdata->pages.npages; i++) { 271 for (i = pg_index; i < hdr->page_array.npages; i++) {
272 if (!extent_length) { 272 if (!extent_length) {
273 /* We've used up the previous extent */ 273 /* We've used up the previous extent */
274 bl_put_extent(be); 274 bl_put_extent(be);
@@ -317,7 +317,8 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
317 struct pnfs_block_extent *be_read; 317 struct pnfs_block_extent *be_read;
318 318
319 be_read = (hole && cow_read) ? cow_read : be; 319 be_read = (hole && cow_read) ? cow_read : be;
320 bio = do_add_page_to_bio(bio, rdata->pages.npages - i, 320 bio = do_add_page_to_bio(bio,
321 hdr->page_array.npages - i,
321 READ, 322 READ,
322 isect, pages[i], be_read, 323 isect, pages[i], be_read,
323 bl_end_io_read, par, 324 bl_end_io_read, par,
@@ -332,10 +333,10 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
332 extent_length -= PAGE_CACHE_SECTORS; 333 extent_length -= PAGE_CACHE_SECTORS;
333 } 334 }
334 if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { 335 if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
335 rdata->res.eof = 1; 336 hdr->res.eof = 1;
336 rdata->res.count = header->inode->i_size - rdata->args.offset; 337 hdr->res.count = header->inode->i_size - hdr->args.offset;
337 } else { 338 } else {
338 rdata->res.count = (isect << SECTOR_SHIFT) - rdata->args.offset; 339 hdr->res.count = (isect << SECTOR_SHIFT) - hdr->args.offset;
339 } 340 }
340out: 341out:
341 bl_put_extent(be); 342 bl_put_extent(be);
@@ -390,8 +391,7 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
390 } 391 }
391 392
392 if (unlikely(err)) { 393 if (unlikely(err)) {
393 struct nfs_pgio_data *data = par->data; 394 struct nfs_pgio_header *header = par->data;
394 struct nfs_pgio_header *header = data->header;
395 395
396 if (!header->pnfs_error) 396 if (!header->pnfs_error)
397 header->pnfs_error = -EIO; 397 header->pnfs_error = -EIO;
@@ -405,8 +405,7 @@ static void bl_end_io_write(struct bio *bio, int err)
405{ 405{
406 struct parallel_io *par = bio->bi_private; 406 struct parallel_io *par = bio->bi_private;
407 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 407 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
408 struct nfs_pgio_data *data = par->data; 408 struct nfs_pgio_header *header = par->data;
409 struct nfs_pgio_header *header = data->header;
410 409
411 if (!uptodate) { 410 if (!uptodate) {
412 if (!header->pnfs_error) 411 if (!header->pnfs_error)
@@ -423,32 +422,32 @@ static void bl_end_io_write(struct bio *bio, int err)
423static void bl_write_cleanup(struct work_struct *work) 422static void bl_write_cleanup(struct work_struct *work)
424{ 423{
425 struct rpc_task *task; 424 struct rpc_task *task;
426 struct nfs_pgio_data *wdata; 425 struct nfs_pgio_header *hdr;
427 dprintk("%s enter\n", __func__); 426 dprintk("%s enter\n", __func__);
428 task = container_of(work, struct rpc_task, u.tk_work); 427 task = container_of(work, struct rpc_task, u.tk_work);
429 wdata = container_of(task, struct nfs_pgio_data, task); 428 hdr = container_of(task, struct nfs_pgio_header, task);
430 if (likely(!wdata->header->pnfs_error)) { 429 if (likely(!hdr->pnfs_error)) {
431 /* Marks for LAYOUTCOMMIT */ 430 /* Marks for LAYOUTCOMMIT */
432 mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg), 431 mark_extents_written(BLK_LSEG2EXT(hdr->lseg),
433 wdata->args.offset, wdata->args.count); 432 hdr->args.offset, hdr->args.count);
434 } 433 }
435 pnfs_ld_write_done(wdata); 434 pnfs_ld_write_done(hdr);
436} 435}
437 436
438/* Called when last of bios associated with a bl_write_pagelist call finishes */ 437/* Called when last of bios associated with a bl_write_pagelist call finishes */
439static void bl_end_par_io_write(void *data, int num_se) 438static void bl_end_par_io_write(void *data, int num_se)
440{ 439{
441 struct nfs_pgio_data *wdata = data; 440 struct nfs_pgio_header *hdr = data;
442 441
443 if (unlikely(wdata->header->pnfs_error)) { 442 if (unlikely(hdr->pnfs_error)) {
444 bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval, 443 bl_free_short_extents(&BLK_LSEG2EXT(hdr->lseg)->bl_inval,
445 num_se); 444 num_se);
446 } 445 }
447 446
448 wdata->task.tk_status = wdata->header->pnfs_error; 447 hdr->task.tk_status = hdr->pnfs_error;
449 wdata->verf.committed = NFS_FILE_SYNC; 448 hdr->verf.committed = NFS_FILE_SYNC;
450 INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); 449 INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
451 schedule_work(&wdata->task.u.tk_work); 450 schedule_work(&hdr->task.u.tk_work);
452} 451}
453 452
454/* FIXME STUB - mark intersection of layout and page as bad, so is not 453/* FIXME STUB - mark intersection of layout and page as bad, so is not
@@ -673,18 +672,17 @@ check_page:
673} 672}
674 673
675static enum pnfs_try_status 674static enum pnfs_try_status
676bl_write_pagelist(struct nfs_pgio_data *wdata, int sync) 675bl_write_pagelist(struct nfs_pgio_header *header, int sync)
677{ 676{
678 struct nfs_pgio_header *header = wdata->header;
679 int i, ret, npg_zero, pg_index, last = 0; 677 int i, ret, npg_zero, pg_index, last = 0;
680 struct bio *bio = NULL; 678 struct bio *bio = NULL;
681 struct pnfs_block_extent *be = NULL, *cow_read = NULL; 679 struct pnfs_block_extent *be = NULL, *cow_read = NULL;
682 sector_t isect, last_isect = 0, extent_length = 0; 680 sector_t isect, last_isect = 0, extent_length = 0;
683 struct parallel_io *par = NULL; 681 struct parallel_io *par = NULL;
684 loff_t offset = wdata->args.offset; 682 loff_t offset = header->args.offset;
685 size_t count = wdata->args.count; 683 size_t count = header->args.count;
686 unsigned int pg_offset, pg_len, saved_len; 684 unsigned int pg_offset, pg_len, saved_len;
687 struct page **pages = wdata->args.pages; 685 struct page **pages = header->args.pages;
688 struct page *page; 686 struct page *page;
689 pgoff_t index; 687 pgoff_t index;
690 u64 temp; 688 u64 temp;
@@ -699,11 +697,11 @@ bl_write_pagelist(struct nfs_pgio_data *wdata, int sync)
699 dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n"); 697 dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
700 goto out_mds; 698 goto out_mds;
701 } 699 }
702 /* At this point, wdata->pages is a (sequential) list of nfs_pages. 700 /* At this point, header->page_aray is a (sequential) list of nfs_pages.
703 * We want to write each, and if there is an error set pnfs_error 701 * We want to write each, and if there is an error set pnfs_error
704 * to have it redone using nfs. 702 * to have it redone using nfs.
705 */ 703 */
706 par = alloc_parallel(wdata); 704 par = alloc_parallel(header);
707 if (!par) 705 if (!par)
708 goto out_mds; 706 goto out_mds;
709 par->pnfs_callback = bl_end_par_io_write; 707 par->pnfs_callback = bl_end_par_io_write;
@@ -790,8 +788,8 @@ next_page:
790 bio = bl_submit_bio(WRITE, bio); 788 bio = bl_submit_bio(WRITE, bio);
791 789
792 /* Middle pages */ 790 /* Middle pages */
793 pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; 791 pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
794 for (i = pg_index; i < wdata->pages.npages; i++) { 792 for (i = pg_index; i < header->page_array.npages; i++) {
795 if (!extent_length) { 793 if (!extent_length) {
796 /* We've used up the previous extent */ 794 /* We've used up the previous extent */
797 bl_put_extent(be); 795 bl_put_extent(be);
@@ -862,7 +860,8 @@ next_page:
862 } 860 }
863 861
864 862
865 bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE, 863 bio = do_add_page_to_bio(bio, header->page_array.npages - i,
864 WRITE,
866 isect, pages[i], be, 865 isect, pages[i], be,
867 bl_end_io_write, par, 866 bl_end_io_write, par,
868 pg_offset, pg_len); 867 pg_offset, pg_len);
@@ -890,7 +889,7 @@ next_page:
890 } 889 }
891 890
892write_done: 891write_done:
893 wdata->res.count = wdata->args.count; 892 header->res.count = header->args.count;
894out: 893out:
895 bl_put_extent(be); 894 bl_put_extent(be);
896 bl_put_extent(cow_read); 895 bl_put_extent(cow_read);
@@ -1063,7 +1062,7 @@ nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
1063 return ERR_PTR(-ENOMEM); 1062 return ERR_PTR(-ENOMEM);
1064 } 1063 }
1065 1064
1066 pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS); 1065 pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS);
1067 if (pages == NULL) { 1066 if (pages == NULL) {
1068 kfree(dev); 1067 kfree(dev);
1069 return ERR_PTR(-ENOMEM); 1068 return ERR_PTR(-ENOMEM);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 073b4cf67ed9..54de482143cc 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -428,6 +428,18 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
428 if (p == NULL) 428 if (p == NULL)
429 return 0; 429 return 0;
430 430
431 /*
432 * Did we get the acceptor from userland during the SETCLIENID
433 * negotiation?
434 */
435 if (clp->cl_acceptor)
436 return !strcmp(p, clp->cl_acceptor);
437
438 /*
439 * Otherwise try to verify it using the cl_hostname. Note that this
440 * doesn't work if a non-canonical hostname was used in the devname.
441 */
442
431 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */ 443 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
432 444
433 if (memcmp(p, "nfs@", 4) != 0) 445 if (memcmp(p, "nfs@", 4) != 0)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 1d09289c8f0e..1c5ff6d58385 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -110,8 +110,8 @@ struct nfs_subversion *get_nfs_version(unsigned int version)
110 mutex_unlock(&nfs_version_mutex); 110 mutex_unlock(&nfs_version_mutex);
111 } 111 }
112 112
113 if (!IS_ERR(nfs)) 113 if (!IS_ERR(nfs) && !try_module_get(nfs->owner))
114 try_module_get(nfs->owner); 114 return ERR_PTR(-EAGAIN);
115 return nfs; 115 return nfs;
116} 116}
117 117
@@ -158,7 +158,8 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
158 goto error_0; 158 goto error_0;
159 159
160 clp->cl_nfs_mod = cl_init->nfs_mod; 160 clp->cl_nfs_mod = cl_init->nfs_mod;
161 try_module_get(clp->cl_nfs_mod->owner); 161 if (!try_module_get(clp->cl_nfs_mod->owner))
162 goto error_dealloc;
162 163
163 clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; 164 clp->rpc_ops = clp->cl_nfs_mod->rpc_ops;
164 165
@@ -190,6 +191,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
190 191
191error_cleanup: 192error_cleanup:
192 put_nfs_version(clp->cl_nfs_mod); 193 put_nfs_version(clp->cl_nfs_mod);
194error_dealloc:
193 kfree(clp); 195 kfree(clp);
194error_0: 196error_0:
195 return ERR_PTR(err); 197 return ERR_PTR(err);
@@ -252,6 +254,7 @@ void nfs_free_client(struct nfs_client *clp)
252 put_net(clp->cl_net); 254 put_net(clp->cl_net);
253 put_nfs_version(clp->cl_nfs_mod); 255 put_nfs_version(clp->cl_nfs_mod);
254 kfree(clp->cl_hostname); 256 kfree(clp->cl_hostname);
257 kfree(clp->cl_acceptor);
255 kfree(clp); 258 kfree(clp);
256 259
257 dprintk("<-- nfs_free_client()\n"); 260 dprintk("<-- nfs_free_client()\n");
@@ -482,8 +485,13 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
482 struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id); 485 struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
483 const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops; 486 const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops;
484 487
488 if (cl_init->hostname == NULL) {
489 WARN_ON(1);
490 return NULL;
491 }
492
485 dprintk("--> nfs_get_client(%s,v%u)\n", 493 dprintk("--> nfs_get_client(%s,v%u)\n",
486 cl_init->hostname ?: "", rpc_ops->version); 494 cl_init->hostname, rpc_ops->version);
487 495
488 /* see if the client already exists */ 496 /* see if the client already exists */
489 do { 497 do {
@@ -510,7 +518,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
510 } while (!IS_ERR(new)); 518 } while (!IS_ERR(new));
511 519
512 dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n", 520 dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n",
513 cl_init->hostname ?: "", PTR_ERR(new)); 521 cl_init->hostname, PTR_ERR(new));
514 return new; 522 return new;
515} 523}
516EXPORT_SYMBOL_GPL(nfs_get_client); 524EXPORT_SYMBOL_GPL(nfs_get_client);
@@ -1205,7 +1213,7 @@ static const struct file_operations nfs_server_list_fops = {
1205 .open = nfs_server_list_open, 1213 .open = nfs_server_list_open,
1206 .read = seq_read, 1214 .read = seq_read,
1207 .llseek = seq_lseek, 1215 .llseek = seq_lseek,
1208 .release = seq_release, 1216 .release = seq_release_net,
1209 .owner = THIS_MODULE, 1217 .owner = THIS_MODULE,
1210}; 1218};
1211 1219
@@ -1226,7 +1234,7 @@ static const struct file_operations nfs_volume_list_fops = {
1226 .open = nfs_volume_list_open, 1234 .open = nfs_volume_list_open,
1227 .read = seq_read, 1235 .read = seq_read,
1228 .llseek = seq_lseek, 1236 .llseek = seq_lseek,
1229 .release = seq_release, 1237 .release = seq_release_net,
1230 .owner = THIS_MODULE, 1238 .owner = THIS_MODULE,
1231}; 1239};
1232 1240
@@ -1236,19 +1244,8 @@ static const struct file_operations nfs_volume_list_fops = {
1236 */ 1244 */
1237static int nfs_server_list_open(struct inode *inode, struct file *file) 1245static int nfs_server_list_open(struct inode *inode, struct file *file)
1238{ 1246{
1239 struct seq_file *m; 1247 return seq_open_net(inode, file, &nfs_server_list_ops,
1240 int ret; 1248 sizeof(struct seq_net_private));
1241 struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info;
1242 struct net *net = pid_ns->child_reaper->nsproxy->net_ns;
1243
1244 ret = seq_open(file, &nfs_server_list_ops);
1245 if (ret < 0)
1246 return ret;
1247
1248 m = file->private_data;
1249 m->private = net;
1250
1251 return 0;
1252} 1249}
1253 1250
1254/* 1251/*
@@ -1256,7 +1253,7 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
1256 */ 1253 */
1257static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) 1254static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
1258{ 1255{
1259 struct nfs_net *nn = net_generic(m->private, nfs_net_id); 1256 struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
1260 1257
1261 /* lock the list against modification */ 1258 /* lock the list against modification */
1262 spin_lock(&nn->nfs_client_lock); 1259 spin_lock(&nn->nfs_client_lock);
@@ -1268,7 +1265,7 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
1268 */ 1265 */
1269static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) 1266static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
1270{ 1267{
1271 struct nfs_net *nn = net_generic(p->private, nfs_net_id); 1268 struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
1272 1269
1273 return seq_list_next(v, &nn->nfs_client_list, pos); 1270 return seq_list_next(v, &nn->nfs_client_list, pos);
1274} 1271}
@@ -1278,7 +1275,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
1278 */ 1275 */
1279static void nfs_server_list_stop(struct seq_file *p, void *v) 1276static void nfs_server_list_stop(struct seq_file *p, void *v)
1280{ 1277{
1281 struct nfs_net *nn = net_generic(p->private, nfs_net_id); 1278 struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
1282 1279
1283 spin_unlock(&nn->nfs_client_lock); 1280 spin_unlock(&nn->nfs_client_lock);
1284} 1281}
@@ -1289,7 +1286,7 @@ static void nfs_server_list_stop(struct seq_file *p, void *v)
1289static int nfs_server_list_show(struct seq_file *m, void *v) 1286static int nfs_server_list_show(struct seq_file *m, void *v)
1290{ 1287{
1291 struct nfs_client *clp; 1288 struct nfs_client *clp;
1292 struct nfs_net *nn = net_generic(m->private, nfs_net_id); 1289 struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
1293 1290
1294 /* display header on line 1 */ 1291 /* display header on line 1 */
1295 if (v == &nn->nfs_client_list) { 1292 if (v == &nn->nfs_client_list) {
@@ -1321,19 +1318,8 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
1321 */ 1318 */
1322static int nfs_volume_list_open(struct inode *inode, struct file *file) 1319static int nfs_volume_list_open(struct inode *inode, struct file *file)
1323{ 1320{
1324 struct seq_file *m; 1321 return seq_open_net(inode, file, &nfs_server_list_ops,
1325 int ret; 1322 sizeof(struct seq_net_private));
1326 struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info;
1327 struct net *net = pid_ns->child_reaper->nsproxy->net_ns;
1328
1329 ret = seq_open(file, &nfs_volume_list_ops);
1330 if (ret < 0)
1331 return ret;
1332
1333 m = file->private_data;
1334 m->private = net;
1335
1336 return 0;
1337} 1323}
1338 1324
1339/* 1325/*
@@ -1341,7 +1327,7 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
1341 */ 1327 */
1342static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) 1328static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
1343{ 1329{
1344 struct nfs_net *nn = net_generic(m->private, nfs_net_id); 1330 struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
1345 1331
1346 /* lock the list against modification */ 1332 /* lock the list against modification */
1347 spin_lock(&nn->nfs_client_lock); 1333 spin_lock(&nn->nfs_client_lock);
@@ -1353,7 +1339,7 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
1353 */ 1339 */
1354static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) 1340static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
1355{ 1341{
1356 struct nfs_net *nn = net_generic(p->private, nfs_net_id); 1342 struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
1357 1343
1358 return seq_list_next(v, &nn->nfs_volume_list, pos); 1344 return seq_list_next(v, &nn->nfs_volume_list, pos);
1359} 1345}
@@ -1363,7 +1349,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
1363 */ 1349 */
1364static void nfs_volume_list_stop(struct seq_file *p, void *v) 1350static void nfs_volume_list_stop(struct seq_file *p, void *v)
1365{ 1351{
1366 struct nfs_net *nn = net_generic(p->private, nfs_net_id); 1352 struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
1367 1353
1368 spin_unlock(&nn->nfs_client_lock); 1354 spin_unlock(&nn->nfs_client_lock);
1369} 1355}
@@ -1376,7 +1362,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1376 struct nfs_server *server; 1362 struct nfs_server *server;
1377 struct nfs_client *clp; 1363 struct nfs_client *clp;
1378 char dev[8], fsid[17]; 1364 char dev[8], fsid[17];
1379 struct nfs_net *nn = net_generic(m->private, nfs_net_id); 1365 struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
1380 1366
1381 /* display header on line 1 */ 1367 /* display header on line 1 */
1382 if (v == &nn->nfs_volume_list) { 1368 if (v == &nn->nfs_volume_list) {
@@ -1407,6 +1393,45 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
1407 return 0; 1393 return 0;
1408} 1394}
1409 1395
1396int nfs_fs_proc_net_init(struct net *net)
1397{
1398 struct nfs_net *nn = net_generic(net, nfs_net_id);
1399 struct proc_dir_entry *p;
1400
1401 nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net);
1402 if (!nn->proc_nfsfs)
1403 goto error_0;
1404
1405 /* a file of servers with which we're dealing */
1406 p = proc_create("servers", S_IFREG|S_IRUGO,
1407 nn->proc_nfsfs, &nfs_server_list_fops);
1408 if (!p)
1409 goto error_1;
1410
1411 /* a file of volumes that we have mounted */
1412 p = proc_create("volumes", S_IFREG|S_IRUGO,
1413 nn->proc_nfsfs, &nfs_volume_list_fops);
1414 if (!p)
1415 goto error_2;
1416 return 0;
1417
1418error_2:
1419 remove_proc_entry("servers", nn->proc_nfsfs);
1420error_1:
1421 remove_proc_entry("fs/nfsfs", NULL);
1422error_0:
1423 return -ENOMEM;
1424}
1425
1426void nfs_fs_proc_net_exit(struct net *net)
1427{
1428 struct nfs_net *nn = net_generic(net, nfs_net_id);
1429
1430 remove_proc_entry("volumes", nn->proc_nfsfs);
1431 remove_proc_entry("servers", nn->proc_nfsfs);
1432 remove_proc_entry("fs/nfsfs", NULL);
1433}
1434
1410/* 1435/*
1411 * initialise the /proc/fs/nfsfs/ directory 1436 * initialise the /proc/fs/nfsfs/ directory
1412 */ 1437 */
@@ -1419,14 +1444,12 @@ int __init nfs_fs_proc_init(void)
1419 goto error_0; 1444 goto error_0;
1420 1445
1421 /* a file of servers with which we're dealing */ 1446 /* a file of servers with which we're dealing */
1422 p = proc_create("servers", S_IFREG|S_IRUGO, 1447 p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers");
1423 proc_fs_nfs, &nfs_server_list_fops);
1424 if (!p) 1448 if (!p)
1425 goto error_1; 1449 goto error_1;
1426 1450
1427 /* a file of volumes that we have mounted */ 1451 /* a file of volumes that we have mounted */
1428 p = proc_create("volumes", S_IFREG|S_IRUGO, 1452 p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes");
1429 proc_fs_nfs, &nfs_volume_list_fops);
1430 if (!p) 1453 if (!p)
1431 goto error_2; 1454 goto error_2;
1432 return 0; 1455 return 0;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5d8ccecf5f5c..5853f53db732 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -41,14 +41,8 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
41 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); 41 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
42} 42}
43 43
44/** 44static int
45 * nfs_have_delegation - check if inode has a delegation 45nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
46 * @inode: inode to check
47 * @flags: delegation types to check for
48 *
49 * Returns one if inode has the indicated delegation, otherwise zero.
50 */
51int nfs4_have_delegation(struct inode *inode, fmode_t flags)
52{ 46{
53 struct nfs_delegation *delegation; 47 struct nfs_delegation *delegation;
54 int ret = 0; 48 int ret = 0;
@@ -58,12 +52,34 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags)
58 delegation = rcu_dereference(NFS_I(inode)->delegation); 52 delegation = rcu_dereference(NFS_I(inode)->delegation);
59 if (delegation != NULL && (delegation->type & flags) == flags && 53 if (delegation != NULL && (delegation->type & flags) == flags &&
60 !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { 54 !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
61 nfs_mark_delegation_referenced(delegation); 55 if (mark)
56 nfs_mark_delegation_referenced(delegation);
62 ret = 1; 57 ret = 1;
63 } 58 }
64 rcu_read_unlock(); 59 rcu_read_unlock();
65 return ret; 60 return ret;
66} 61}
62/**
63 * nfs_have_delegation - check if inode has a delegation, mark it
64 * NFS_DELEGATION_REFERENCED if there is one.
65 * @inode: inode to check
66 * @flags: delegation types to check for
67 *
68 * Returns one if inode has the indicated delegation, otherwise zero.
69 */
70int nfs4_have_delegation(struct inode *inode, fmode_t flags)
71{
72 return nfs4_do_check_delegation(inode, flags, true);
73}
74
75/*
76 * nfs4_check_delegation - check if inode has a delegation, do not mark
77 * NFS_DELEGATION_REFERENCED if it has one.
78 */
79int nfs4_check_delegation(struct inode *inode, fmode_t flags)
80{
81 return nfs4_do_check_delegation(inode, flags, false);
82}
67 83
68static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) 84static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
69{ 85{
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 9a79c7a99d6d..5c1cce39297f 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -59,6 +59,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_
59 59
60void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); 60void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
61int nfs4_have_delegation(struct inode *inode, fmode_t flags); 61int nfs4_have_delegation(struct inode *inode, fmode_t flags);
62int nfs4_check_delegation(struct inode *inode, fmode_t flags);
62 63
63#endif 64#endif
64 65
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4a3d4ef76127..36d921f0c602 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -988,9 +988,13 @@ EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
988 * A check for whether or not the parent directory has changed. 988 * A check for whether or not the parent directory has changed.
989 * In the case it has, we assume that the dentries are untrustworthy 989 * In the case it has, we assume that the dentries are untrustworthy
990 * and may need to be looked up again. 990 * and may need to be looked up again.
991 * If rcu_walk prevents us from performing a full check, return 0.
991 */ 992 */
992static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) 993static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
994 int rcu_walk)
993{ 995{
996 int ret;
997
994 if (IS_ROOT(dentry)) 998 if (IS_ROOT(dentry))
995 return 1; 999 return 1;
996 if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) 1000 if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
@@ -998,7 +1002,11 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
998 if (!nfs_verify_change_attribute(dir, dentry->d_time)) 1002 if (!nfs_verify_change_attribute(dir, dentry->d_time))
999 return 0; 1003 return 0;
1000 /* Revalidate nfsi->cache_change_attribute before we declare a match */ 1004 /* Revalidate nfsi->cache_change_attribute before we declare a match */
1001 if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) 1005 if (rcu_walk)
1006 ret = nfs_revalidate_inode_rcu(NFS_SERVER(dir), dir);
1007 else
1008 ret = nfs_revalidate_inode(NFS_SERVER(dir), dir);
1009 if (ret < 0)
1002 return 0; 1010 return 0;
1003 if (!nfs_verify_change_attribute(dir, dentry->d_time)) 1011 if (!nfs_verify_change_attribute(dir, dentry->d_time))
1004 return 0; 1012 return 0;
@@ -1042,6 +1050,8 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
1042out: 1050out:
1043 return (inode->i_nlink == 0) ? -ENOENT : 0; 1051 return (inode->i_nlink == 0) ? -ENOENT : 0;
1044out_force: 1052out_force:
1053 if (flags & LOOKUP_RCU)
1054 return -ECHILD;
1045 ret = __nfs_revalidate_inode(server, inode); 1055 ret = __nfs_revalidate_inode(server, inode);
1046 if (ret != 0) 1056 if (ret != 0)
1047 return ret; 1057 return ret;
@@ -1054,6 +1064,9 @@ out_force:
1054 * 1064 *
1055 * If parent mtime has changed, we revalidate, else we wait for a 1065 * If parent mtime has changed, we revalidate, else we wait for a
1056 * period corresponding to the parent's attribute cache timeout value. 1066 * period corresponding to the parent's attribute cache timeout value.
1067 *
1068 * If LOOKUP_RCU prevents us from performing a full check, return 1
1069 * suggesting a reval is needed.
1057 */ 1070 */
1058static inline 1071static inline
1059int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, 1072int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
@@ -1064,7 +1077,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
1064 return 0; 1077 return 0;
1065 if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) 1078 if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
1066 return 1; 1079 return 1;
1067 return !nfs_check_verifier(dir, dentry); 1080 return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU);
1068} 1081}
1069 1082
1070/* 1083/*
@@ -1088,21 +1101,30 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1088 struct nfs4_label *label = NULL; 1101 struct nfs4_label *label = NULL;
1089 int error; 1102 int error;
1090 1103
1091 if (flags & LOOKUP_RCU) 1104 if (flags & LOOKUP_RCU) {
1092 return -ECHILD; 1105 parent = ACCESS_ONCE(dentry->d_parent);
1093 1106 dir = ACCESS_ONCE(parent->d_inode);
1094 parent = dget_parent(dentry); 1107 if (!dir)
1095 dir = parent->d_inode; 1108 return -ECHILD;
1109 } else {
1110 parent = dget_parent(dentry);
1111 dir = parent->d_inode;
1112 }
1096 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); 1113 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
1097 inode = dentry->d_inode; 1114 inode = dentry->d_inode;
1098 1115
1099 if (!inode) { 1116 if (!inode) {
1100 if (nfs_neg_need_reval(dir, dentry, flags)) 1117 if (nfs_neg_need_reval(dir, dentry, flags)) {
1118 if (flags & LOOKUP_RCU)
1119 return -ECHILD;
1101 goto out_bad; 1120 goto out_bad;
1121 }
1102 goto out_valid_noent; 1122 goto out_valid_noent;
1103 } 1123 }
1104 1124
1105 if (is_bad_inode(inode)) { 1125 if (is_bad_inode(inode)) {
1126 if (flags & LOOKUP_RCU)
1127 return -ECHILD;
1106 dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", 1128 dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
1107 __func__, dentry); 1129 __func__, dentry);
1108 goto out_bad; 1130 goto out_bad;
@@ -1112,12 +1134,20 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1112 goto out_set_verifier; 1134 goto out_set_verifier;
1113 1135
1114 /* Force a full look up iff the parent directory has changed */ 1136 /* Force a full look up iff the parent directory has changed */
1115 if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) { 1137 if (!nfs_is_exclusive_create(dir, flags) &&
1116 if (nfs_lookup_verify_inode(inode, flags)) 1138 nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
1139
1140 if (nfs_lookup_verify_inode(inode, flags)) {
1141 if (flags & LOOKUP_RCU)
1142 return -ECHILD;
1117 goto out_zap_parent; 1143 goto out_zap_parent;
1144 }
1118 goto out_valid; 1145 goto out_valid;
1119 } 1146 }
1120 1147
1148 if (flags & LOOKUP_RCU)
1149 return -ECHILD;
1150
1121 if (NFS_STALE(inode)) 1151 if (NFS_STALE(inode))
1122 goto out_bad; 1152 goto out_bad;
1123 1153
@@ -1153,13 +1183,18 @@ out_set_verifier:
1153 /* Success: notify readdir to use READDIRPLUS */ 1183 /* Success: notify readdir to use READDIRPLUS */
1154 nfs_advise_use_readdirplus(dir); 1184 nfs_advise_use_readdirplus(dir);
1155 out_valid_noent: 1185 out_valid_noent:
1156 dput(parent); 1186 if (flags & LOOKUP_RCU) {
1187 if (parent != ACCESS_ONCE(dentry->d_parent))
1188 return -ECHILD;
1189 } else
1190 dput(parent);
1157 dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", 1191 dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
1158 __func__, dentry); 1192 __func__, dentry);
1159 return 1; 1193 return 1;
1160out_zap_parent: 1194out_zap_parent:
1161 nfs_zap_caches(dir); 1195 nfs_zap_caches(dir);
1162 out_bad: 1196 out_bad:
1197 WARN_ON(flags & LOOKUP_RCU);
1163 nfs_free_fattr(fattr); 1198 nfs_free_fattr(fattr);
1164 nfs_free_fhandle(fhandle); 1199 nfs_free_fhandle(fhandle);
1165 nfs4_label_free(label); 1200 nfs4_label_free(label);
@@ -1185,6 +1220,7 @@ out_zap_parent:
1185 __func__, dentry); 1220 __func__, dentry);
1186 return 0; 1221 return 0;
1187out_error: 1222out_error:
1223 WARN_ON(flags & LOOKUP_RCU);
1188 nfs_free_fattr(fattr); 1224 nfs_free_fattr(fattr);
1189 nfs_free_fhandle(fhandle); 1225 nfs_free_fhandle(fhandle);
1190 nfs4_label_free(label); 1226 nfs4_label_free(label);
@@ -1529,14 +1565,9 @@ EXPORT_SYMBOL_GPL(nfs_atomic_open);
1529 1565
1530static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) 1566static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1531{ 1567{
1532 struct dentry *parent = NULL;
1533 struct inode *inode; 1568 struct inode *inode;
1534 struct inode *dir;
1535 int ret = 0; 1569 int ret = 0;
1536 1570
1537 if (flags & LOOKUP_RCU)
1538 return -ECHILD;
1539
1540 if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) 1571 if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
1541 goto no_open; 1572 goto no_open;
1542 if (d_mountpoint(dentry)) 1573 if (d_mountpoint(dentry))
@@ -1545,34 +1576,47 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1545 goto no_open; 1576 goto no_open;
1546 1577
1547 inode = dentry->d_inode; 1578 inode = dentry->d_inode;
1548 parent = dget_parent(dentry);
1549 dir = parent->d_inode;
1550 1579
1551 /* We can't create new files in nfs_open_revalidate(), so we 1580 /* We can't create new files in nfs_open_revalidate(), so we
1552 * optimize away revalidation of negative dentries. 1581 * optimize away revalidation of negative dentries.
1553 */ 1582 */
1554 if (inode == NULL) { 1583 if (inode == NULL) {
1584 struct dentry *parent;
1585 struct inode *dir;
1586
1587 if (flags & LOOKUP_RCU) {
1588 parent = ACCESS_ONCE(dentry->d_parent);
1589 dir = ACCESS_ONCE(parent->d_inode);
1590 if (!dir)
1591 return -ECHILD;
1592 } else {
1593 parent = dget_parent(dentry);
1594 dir = parent->d_inode;
1595 }
1555 if (!nfs_neg_need_reval(dir, dentry, flags)) 1596 if (!nfs_neg_need_reval(dir, dentry, flags))
1556 ret = 1; 1597 ret = 1;
1598 else if (flags & LOOKUP_RCU)
1599 ret = -ECHILD;
1600 if (!(flags & LOOKUP_RCU))
1601 dput(parent);
1602 else if (parent != ACCESS_ONCE(dentry->d_parent))
1603 return -ECHILD;
1557 goto out; 1604 goto out;
1558 } 1605 }
1559 1606
1560 /* NFS only supports OPEN on regular files */ 1607 /* NFS only supports OPEN on regular files */
1561 if (!S_ISREG(inode->i_mode)) 1608 if (!S_ISREG(inode->i_mode))
1562 goto no_open_dput; 1609 goto no_open;
1563 /* We cannot do exclusive creation on a positive dentry */ 1610 /* We cannot do exclusive creation on a positive dentry */
1564 if (flags & LOOKUP_EXCL) 1611 if (flags & LOOKUP_EXCL)
1565 goto no_open_dput; 1612 goto no_open;
1566 1613
1567 /* Let f_op->open() actually open (and revalidate) the file */ 1614 /* Let f_op->open() actually open (and revalidate) the file */
1568 ret = 1; 1615 ret = 1;
1569 1616
1570out: 1617out:
1571 dput(parent);
1572 return ret; 1618 return ret;
1573 1619
1574no_open_dput:
1575 dput(parent);
1576no_open: 1620no_open:
1577 return nfs_lookup_revalidate(dentry, flags); 1621 return nfs_lookup_revalidate(dentry, flags);
1578} 1622}
@@ -2028,10 +2072,14 @@ static DEFINE_SPINLOCK(nfs_access_lru_lock);
2028static LIST_HEAD(nfs_access_lru_list); 2072static LIST_HEAD(nfs_access_lru_list);
2029static atomic_long_t nfs_access_nr_entries; 2073static atomic_long_t nfs_access_nr_entries;
2030 2074
2075static unsigned long nfs_access_max_cachesize = ULONG_MAX;
2076module_param(nfs_access_max_cachesize, ulong, 0644);
2077MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length");
2078
2031static void nfs_access_free_entry(struct nfs_access_entry *entry) 2079static void nfs_access_free_entry(struct nfs_access_entry *entry)
2032{ 2080{
2033 put_rpccred(entry->cred); 2081 put_rpccred(entry->cred);
2034 kfree(entry); 2082 kfree_rcu(entry, rcu_head);
2035 smp_mb__before_atomic(); 2083 smp_mb__before_atomic();
2036 atomic_long_dec(&nfs_access_nr_entries); 2084 atomic_long_dec(&nfs_access_nr_entries);
2037 smp_mb__after_atomic(); 2085 smp_mb__after_atomic();
@@ -2048,19 +2096,14 @@ static void nfs_access_free_list(struct list_head *head)
2048 } 2096 }
2049} 2097}
2050 2098
2051unsigned long 2099static unsigned long
2052nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc) 2100nfs_do_access_cache_scan(unsigned int nr_to_scan)
2053{ 2101{
2054 LIST_HEAD(head); 2102 LIST_HEAD(head);
2055 struct nfs_inode *nfsi, *next; 2103 struct nfs_inode *nfsi, *next;
2056 struct nfs_access_entry *cache; 2104 struct nfs_access_entry *cache;
2057 int nr_to_scan = sc->nr_to_scan;
2058 gfp_t gfp_mask = sc->gfp_mask;
2059 long freed = 0; 2105 long freed = 0;
2060 2106
2061 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
2062 return SHRINK_STOP;
2063
2064 spin_lock(&nfs_access_lru_lock); 2107 spin_lock(&nfs_access_lru_lock);
2065 list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { 2108 list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
2066 struct inode *inode; 2109 struct inode *inode;
@@ -2094,11 +2137,39 @@ remove_lru_entry:
2094} 2137}
2095 2138
2096unsigned long 2139unsigned long
2140nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
2141{
2142 int nr_to_scan = sc->nr_to_scan;
2143 gfp_t gfp_mask = sc->gfp_mask;
2144
2145 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
2146 return SHRINK_STOP;
2147 return nfs_do_access_cache_scan(nr_to_scan);
2148}
2149
2150
2151unsigned long
2097nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc) 2152nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc)
2098{ 2153{
2099 return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries)); 2154 return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries));
2100} 2155}
2101 2156
2157static void
2158nfs_access_cache_enforce_limit(void)
2159{
2160 long nr_entries = atomic_long_read(&nfs_access_nr_entries);
2161 unsigned long diff;
2162 unsigned int nr_to_scan;
2163
2164 if (nr_entries < 0 || nr_entries <= nfs_access_max_cachesize)
2165 return;
2166 nr_to_scan = 100;
2167 diff = nr_entries - nfs_access_max_cachesize;
2168 if (diff < nr_to_scan)
2169 nr_to_scan = diff;
2170 nfs_do_access_cache_scan(nr_to_scan);
2171}
2172
2102static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) 2173static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
2103{ 2174{
2104 struct rb_root *root_node = &nfsi->access_cache; 2175 struct rb_root *root_node = &nfsi->access_cache;
@@ -2186,6 +2257,38 @@ out_zap:
2186 return -ENOENT; 2257 return -ENOENT;
2187} 2258}
2188 2259
2260static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
2261{
2262 /* Only check the most recently returned cache entry,
2263 * but do it without locking.
2264 */
2265 struct nfs_inode *nfsi = NFS_I(inode);
2266 struct nfs_access_entry *cache;
2267 int err = -ECHILD;
2268 struct list_head *lh;
2269
2270 rcu_read_lock();
2271 if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
2272 goto out;
2273 lh = rcu_dereference(nfsi->access_cache_entry_lru.prev);
2274 cache = list_entry(lh, struct nfs_access_entry, lru);
2275 if (lh == &nfsi->access_cache_entry_lru ||
2276 cred != cache->cred)
2277 cache = NULL;
2278 if (cache == NULL)
2279 goto out;
2280 if (!nfs_have_delegated_attributes(inode) &&
2281 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
2282 goto out;
2283 res->jiffies = cache->jiffies;
2284 res->cred = cache->cred;
2285 res->mask = cache->mask;
2286 err = 0;
2287out:
2288 rcu_read_unlock();
2289 return err;
2290}
2291
2189static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set) 2292static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
2190{ 2293{
2191 struct nfs_inode *nfsi = NFS_I(inode); 2294 struct nfs_inode *nfsi = NFS_I(inode);
@@ -2229,6 +2332,11 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
2229 cache->cred = get_rpccred(set->cred); 2332 cache->cred = get_rpccred(set->cred);
2230 cache->mask = set->mask; 2333 cache->mask = set->mask;
2231 2334
2335 /* The above field assignments must be visible
2336 * before this item appears on the lru. We cannot easily
2337 * use rcu_assign_pointer, so just force the memory barrier.
2338 */
2339 smp_wmb();
2232 nfs_access_add_rbtree(inode, cache); 2340 nfs_access_add_rbtree(inode, cache);
2233 2341
2234 /* Update accounting */ 2342 /* Update accounting */
@@ -2244,6 +2352,7 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
2244 &nfs_access_lru_list); 2352 &nfs_access_lru_list);
2245 spin_unlock(&nfs_access_lru_lock); 2353 spin_unlock(&nfs_access_lru_lock);
2246 } 2354 }
2355 nfs_access_cache_enforce_limit();
2247} 2356}
2248EXPORT_SYMBOL_GPL(nfs_access_add_cache); 2357EXPORT_SYMBOL_GPL(nfs_access_add_cache);
2249 2358
@@ -2267,10 +2376,16 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
2267 2376
2268 trace_nfs_access_enter(inode); 2377 trace_nfs_access_enter(inode);
2269 2378
2270 status = nfs_access_get_cached(inode, cred, &cache); 2379 status = nfs_access_get_cached_rcu(inode, cred, &cache);
2380 if (status != 0)
2381 status = nfs_access_get_cached(inode, cred, &cache);
2271 if (status == 0) 2382 if (status == 0)
2272 goto out_cached; 2383 goto out_cached;
2273 2384
2385 status = -ECHILD;
2386 if (mask & MAY_NOT_BLOCK)
2387 goto out;
2388
2274 /* Be clever: ask server to check for all possible rights */ 2389 /* Be clever: ask server to check for all possible rights */
2275 cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; 2390 cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
2276 cache.cred = cred; 2391 cache.cred = cred;
@@ -2321,9 +2436,6 @@ int nfs_permission(struct inode *inode, int mask)
2321 struct rpc_cred *cred; 2436 struct rpc_cred *cred;
2322 int res = 0; 2437 int res = 0;
2323 2438
2324 if (mask & MAY_NOT_BLOCK)
2325 return -ECHILD;
2326
2327 nfs_inc_stats(inode, NFSIOS_VFSACCESS); 2439 nfs_inc_stats(inode, NFSIOS_VFSACCESS);
2328 2440
2329 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 2441 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
@@ -2350,12 +2462,23 @@ force_lookup:
2350 if (!NFS_PROTO(inode)->access) 2462 if (!NFS_PROTO(inode)->access)
2351 goto out_notsup; 2463 goto out_notsup;
2352 2464
2353 cred = rpc_lookup_cred(); 2465 /* Always try fast lookups first */
2354 if (!IS_ERR(cred)) { 2466 rcu_read_lock();
2355 res = nfs_do_access(inode, cred, mask); 2467 cred = rpc_lookup_cred_nonblock();
2356 put_rpccred(cred); 2468 if (!IS_ERR(cred))
2357 } else 2469 res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
2470 else
2358 res = PTR_ERR(cred); 2471 res = PTR_ERR(cred);
2472 rcu_read_unlock();
2473 if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
2474 /* Fast lookup failed, try the slow way */
2475 cred = rpc_lookup_cred();
2476 if (!IS_ERR(cred)) {
2477 res = nfs_do_access(inode, cred, mask);
2478 put_rpccred(cred);
2479 } else
2480 res = PTR_ERR(cred);
2481 }
2359out: 2482out:
2360 if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) 2483 if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
2361 res = -EACCES; 2484 res = -EACCES;
@@ -2364,6 +2487,9 @@ out:
2364 inode->i_sb->s_id, inode->i_ino, mask, res); 2487 inode->i_sb->s_id, inode->i_ino, mask, res);
2365 return res; 2488 return res;
2366out_notsup: 2489out_notsup:
2490 if (mask & MAY_NOT_BLOCK)
2491 return -ECHILD;
2492
2367 res = nfs_revalidate_inode(NFS_SERVER(inode), inode); 2493 res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
2368 if (res == 0) 2494 if (res == 0)
2369 res = generic_permission(inode, mask); 2495 res = generic_permission(inode, mask);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f11b9eed0de1..65ef6e00deee 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -148,8 +148,8 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
148{ 148{
149 struct nfs_writeverf *verfp; 149 struct nfs_writeverf *verfp;
150 150
151 verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, 151 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
152 hdr->data->ds_idx); 152 hdr->ds_idx);
153 WARN_ON_ONCE(verfp->committed >= 0); 153 WARN_ON_ONCE(verfp->committed >= 0);
154 memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); 154 memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
155 WARN_ON_ONCE(verfp->committed < 0); 155 WARN_ON_ONCE(verfp->committed < 0);
@@ -169,8 +169,8 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
169{ 169{
170 struct nfs_writeverf *verfp; 170 struct nfs_writeverf *verfp;
171 171
172 verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, 172 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
173 hdr->data->ds_idx); 173 hdr->ds_idx);
174 if (verfp->committed < 0) { 174 if (verfp->committed < 0) {
175 nfs_direct_set_hdr_verf(dreq, hdr); 175 nfs_direct_set_hdr_verf(dreq, hdr);
176 return 0; 176 return 0;
@@ -715,7 +715,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
715{ 715{
716 struct nfs_direct_req *dreq = hdr->dreq; 716 struct nfs_direct_req *dreq = hdr->dreq;
717 struct nfs_commit_info cinfo; 717 struct nfs_commit_info cinfo;
718 int bit = -1; 718 bool request_commit = false;
719 struct nfs_page *req = nfs_list_entry(hdr->pages.next); 719 struct nfs_page *req = nfs_list_entry(hdr->pages.next);
720 720
721 if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) 721 if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
@@ -729,27 +729,20 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
729 dreq->flags = 0; 729 dreq->flags = 0;
730 dreq->error = hdr->error; 730 dreq->error = hdr->error;
731 } 731 }
732 if (dreq->error != 0) 732 if (dreq->error == 0) {
733 bit = NFS_IOHDR_ERROR;
734 else {
735 dreq->count += hdr->good_bytes; 733 dreq->count += hdr->good_bytes;
736 if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) { 734 if (nfs_write_need_commit(hdr)) {
737 dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
738 bit = NFS_IOHDR_NEED_RESCHED;
739 } else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
740 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) 735 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
741 bit = NFS_IOHDR_NEED_RESCHED; 736 request_commit = true;
742 else if (dreq->flags == 0) { 737 else if (dreq->flags == 0) {
743 nfs_direct_set_hdr_verf(dreq, hdr); 738 nfs_direct_set_hdr_verf(dreq, hdr);
744 bit = NFS_IOHDR_NEED_COMMIT; 739 request_commit = true;
745 dreq->flags = NFS_ODIRECT_DO_COMMIT; 740 dreq->flags = NFS_ODIRECT_DO_COMMIT;
746 } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { 741 } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
747 if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) { 742 request_commit = true;
743 if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
748 dreq->flags = 744 dreq->flags =
749 NFS_ODIRECT_RESCHED_WRITES; 745 NFS_ODIRECT_RESCHED_WRITES;
750 bit = NFS_IOHDR_NEED_RESCHED;
751 } else
752 bit = NFS_IOHDR_NEED_COMMIT;
753 } 746 }
754 } 747 }
755 } 748 }
@@ -759,9 +752,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
759 752
760 req = nfs_list_entry(hdr->pages.next); 753 req = nfs_list_entry(hdr->pages.next);
761 nfs_list_remove_request(req); 754 nfs_list_remove_request(req);
762 switch (bit) { 755 if (request_commit) {
763 case NFS_IOHDR_NEED_RESCHED:
764 case NFS_IOHDR_NEED_COMMIT:
765 kref_get(&req->wb_kref); 756 kref_get(&req->wb_kref);
766 nfs_mark_request_commit(req, hdr->lseg, &cinfo); 757 nfs_mark_request_commit(req, hdr->lseg, &cinfo);
767 } 758 }
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index d2eba1c13b7e..1359c4a27393 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -84,45 +84,37 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
84 BUG(); 84 BUG();
85} 85}
86 86
87static void filelayout_reset_write(struct nfs_pgio_data *data) 87static void filelayout_reset_write(struct nfs_pgio_header *hdr)
88{ 88{
89 struct nfs_pgio_header *hdr = data->header; 89 struct rpc_task *task = &hdr->task;
90 struct rpc_task *task = &data->task;
91 90
92 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 91 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
93 dprintk("%s Reset task %5u for i/o through MDS " 92 dprintk("%s Reset task %5u for i/o through MDS "
94 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, 93 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
95 data->task.tk_pid, 94 hdr->task.tk_pid,
96 hdr->inode->i_sb->s_id, 95 hdr->inode->i_sb->s_id,
97 (unsigned long long)NFS_FILEID(hdr->inode), 96 (unsigned long long)NFS_FILEID(hdr->inode),
98 data->args.count, 97 hdr->args.count,
99 (unsigned long long)data->args.offset); 98 (unsigned long long)hdr->args.offset);
100 99
101 task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode, 100 task->tk_status = pnfs_write_done_resend_to_mds(hdr);
102 &hdr->pages,
103 hdr->completion_ops,
104 hdr->dreq);
105 } 101 }
106} 102}
107 103
108static void filelayout_reset_read(struct nfs_pgio_data *data) 104static void filelayout_reset_read(struct nfs_pgio_header *hdr)
109{ 105{
110 struct nfs_pgio_header *hdr = data->header; 106 struct rpc_task *task = &hdr->task;
111 struct rpc_task *task = &data->task;
112 107
113 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 108 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
114 dprintk("%s Reset task %5u for i/o through MDS " 109 dprintk("%s Reset task %5u for i/o through MDS "
115 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, 110 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
116 data->task.tk_pid, 111 hdr->task.tk_pid,
117 hdr->inode->i_sb->s_id, 112 hdr->inode->i_sb->s_id,
118 (unsigned long long)NFS_FILEID(hdr->inode), 113 (unsigned long long)NFS_FILEID(hdr->inode),
119 data->args.count, 114 hdr->args.count,
120 (unsigned long long)data->args.offset); 115 (unsigned long long)hdr->args.offset);
121 116
122 task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode, 117 task->tk_status = pnfs_read_done_resend_to_mds(hdr);
123 &hdr->pages,
124 hdr->completion_ops,
125 hdr->dreq);
126 } 118 }
127} 119}
128 120
@@ -243,18 +235,17 @@ wait_on_recovery:
243/* NFS_PROTO call done callback routines */ 235/* NFS_PROTO call done callback routines */
244 236
245static int filelayout_read_done_cb(struct rpc_task *task, 237static int filelayout_read_done_cb(struct rpc_task *task,
246 struct nfs_pgio_data *data) 238 struct nfs_pgio_header *hdr)
247{ 239{
248 struct nfs_pgio_header *hdr = data->header;
249 int err; 240 int err;
250 241
251 trace_nfs4_pnfs_read(data, task->tk_status); 242 trace_nfs4_pnfs_read(hdr, task->tk_status);
252 err = filelayout_async_handle_error(task, data->args.context->state, 243 err = filelayout_async_handle_error(task, hdr->args.context->state,
253 data->ds_clp, hdr->lseg); 244 hdr->ds_clp, hdr->lseg);
254 245
255 switch (err) { 246 switch (err) {
256 case -NFS4ERR_RESET_TO_MDS: 247 case -NFS4ERR_RESET_TO_MDS:
257 filelayout_reset_read(data); 248 filelayout_reset_read(hdr);
258 return task->tk_status; 249 return task->tk_status;
259 case -EAGAIN: 250 case -EAGAIN:
260 rpc_restart_call_prepare(task); 251 rpc_restart_call_prepare(task);
@@ -270,15 +261,14 @@ static int filelayout_read_done_cb(struct rpc_task *task,
270 * rfc5661 is not clear about which credential should be used. 261 * rfc5661 is not clear about which credential should be used.
271 */ 262 */
272static void 263static void
273filelayout_set_layoutcommit(struct nfs_pgio_data *wdata) 264filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
274{ 265{
275 struct nfs_pgio_header *hdr = wdata->header;
276 266
277 if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || 267 if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
278 wdata->res.verf->committed == NFS_FILE_SYNC) 268 hdr->res.verf->committed == NFS_FILE_SYNC)
279 return; 269 return;
280 270
281 pnfs_set_layoutcommit(wdata); 271 pnfs_set_layoutcommit(hdr);
282 dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, 272 dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
283 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); 273 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
284} 274}
@@ -305,83 +295,82 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
305 */ 295 */
306static void filelayout_read_prepare(struct rpc_task *task, void *data) 296static void filelayout_read_prepare(struct rpc_task *task, void *data)
307{ 297{
308 struct nfs_pgio_data *rdata = data; 298 struct nfs_pgio_header *hdr = data;
309 299
310 if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) { 300 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
311 rpc_exit(task, -EIO); 301 rpc_exit(task, -EIO);
312 return; 302 return;
313 } 303 }
314 if (filelayout_reset_to_mds(rdata->header->lseg)) { 304 if (filelayout_reset_to_mds(hdr->lseg)) {
315 dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); 305 dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
316 filelayout_reset_read(rdata); 306 filelayout_reset_read(hdr);
317 rpc_exit(task, 0); 307 rpc_exit(task, 0);
318 return; 308 return;
319 } 309 }
320 rdata->pgio_done_cb = filelayout_read_done_cb; 310 hdr->pgio_done_cb = filelayout_read_done_cb;
321 311
322 if (nfs41_setup_sequence(rdata->ds_clp->cl_session, 312 if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
323 &rdata->args.seq_args, 313 &hdr->args.seq_args,
324 &rdata->res.seq_res, 314 &hdr->res.seq_res,
325 task)) 315 task))
326 return; 316 return;
327 if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, 317 if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
328 rdata->args.lock_context, FMODE_READ) == -EIO) 318 hdr->args.lock_context, FMODE_READ) == -EIO)
329 rpc_exit(task, -EIO); /* lost lock, terminate I/O */ 319 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
330} 320}
331 321
332static void filelayout_read_call_done(struct rpc_task *task, void *data) 322static void filelayout_read_call_done(struct rpc_task *task, void *data)
333{ 323{
334 struct nfs_pgio_data *rdata = data; 324 struct nfs_pgio_header *hdr = data;
335 325
336 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); 326 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
337 327
338 if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) && 328 if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
339 task->tk_status == 0) { 329 task->tk_status == 0) {
340 nfs41_sequence_done(task, &rdata->res.seq_res); 330 nfs41_sequence_done(task, &hdr->res.seq_res);
341 return; 331 return;
342 } 332 }
343 333
344 /* Note this may cause RPC to be resent */ 334 /* Note this may cause RPC to be resent */
345 rdata->header->mds_ops->rpc_call_done(task, data); 335 hdr->mds_ops->rpc_call_done(task, data);
346} 336}
347 337
348static void filelayout_read_count_stats(struct rpc_task *task, void *data) 338static void filelayout_read_count_stats(struct rpc_task *task, void *data)
349{ 339{
350 struct nfs_pgio_data *rdata = data; 340 struct nfs_pgio_header *hdr = data;
351 341
352 rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics); 342 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
353} 343}
354 344
355static void filelayout_read_release(void *data) 345static void filelayout_read_release(void *data)
356{ 346{
357 struct nfs_pgio_data *rdata = data; 347 struct nfs_pgio_header *hdr = data;
358 struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout; 348 struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
359 349
360 filelayout_fenceme(lo->plh_inode, lo); 350 filelayout_fenceme(lo->plh_inode, lo);
361 nfs_put_client(rdata->ds_clp); 351 nfs_put_client(hdr->ds_clp);
362 rdata->header->mds_ops->rpc_release(data); 352 hdr->mds_ops->rpc_release(data);
363} 353}
364 354
365static int filelayout_write_done_cb(struct rpc_task *task, 355static int filelayout_write_done_cb(struct rpc_task *task,
366 struct nfs_pgio_data *data) 356 struct nfs_pgio_header *hdr)
367{ 357{
368 struct nfs_pgio_header *hdr = data->header;
369 int err; 358 int err;
370 359
371 trace_nfs4_pnfs_write(data, task->tk_status); 360 trace_nfs4_pnfs_write(hdr, task->tk_status);
372 err = filelayout_async_handle_error(task, data->args.context->state, 361 err = filelayout_async_handle_error(task, hdr->args.context->state,
373 data->ds_clp, hdr->lseg); 362 hdr->ds_clp, hdr->lseg);
374 363
375 switch (err) { 364 switch (err) {
376 case -NFS4ERR_RESET_TO_MDS: 365 case -NFS4ERR_RESET_TO_MDS:
377 filelayout_reset_write(data); 366 filelayout_reset_write(hdr);
378 return task->tk_status; 367 return task->tk_status;
379 case -EAGAIN: 368 case -EAGAIN:
380 rpc_restart_call_prepare(task); 369 rpc_restart_call_prepare(task);
381 return -EAGAIN; 370 return -EAGAIN;
382 } 371 }
383 372
384 filelayout_set_layoutcommit(data); 373 filelayout_set_layoutcommit(hdr);
385 return 0; 374 return 0;
386} 375}
387 376
@@ -419,57 +408,57 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
419 408
420static void filelayout_write_prepare(struct rpc_task *task, void *data) 409static void filelayout_write_prepare(struct rpc_task *task, void *data)
421{ 410{
422 struct nfs_pgio_data *wdata = data; 411 struct nfs_pgio_header *hdr = data;
423 412
424 if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) { 413 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
425 rpc_exit(task, -EIO); 414 rpc_exit(task, -EIO);
426 return; 415 return;
427 } 416 }
428 if (filelayout_reset_to_mds(wdata->header->lseg)) { 417 if (filelayout_reset_to_mds(hdr->lseg)) {
429 dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); 418 dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
430 filelayout_reset_write(wdata); 419 filelayout_reset_write(hdr);
431 rpc_exit(task, 0); 420 rpc_exit(task, 0);
432 return; 421 return;
433 } 422 }
434 if (nfs41_setup_sequence(wdata->ds_clp->cl_session, 423 if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
435 &wdata->args.seq_args, 424 &hdr->args.seq_args,
436 &wdata->res.seq_res, 425 &hdr->res.seq_res,
437 task)) 426 task))
438 return; 427 return;
439 if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, 428 if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
440 wdata->args.lock_context, FMODE_WRITE) == -EIO) 429 hdr->args.lock_context, FMODE_WRITE) == -EIO)
441 rpc_exit(task, -EIO); /* lost lock, terminate I/O */ 430 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
442} 431}
443 432
444static void filelayout_write_call_done(struct rpc_task *task, void *data) 433static void filelayout_write_call_done(struct rpc_task *task, void *data)
445{ 434{
446 struct nfs_pgio_data *wdata = data; 435 struct nfs_pgio_header *hdr = data;
447 436
448 if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && 437 if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
449 task->tk_status == 0) { 438 task->tk_status == 0) {
450 nfs41_sequence_done(task, &wdata->res.seq_res); 439 nfs41_sequence_done(task, &hdr->res.seq_res);
451 return; 440 return;
452 } 441 }
453 442
454 /* Note this may cause RPC to be resent */ 443 /* Note this may cause RPC to be resent */
455 wdata->header->mds_ops->rpc_call_done(task, data); 444 hdr->mds_ops->rpc_call_done(task, data);
456} 445}
457 446
458static void filelayout_write_count_stats(struct rpc_task *task, void *data) 447static void filelayout_write_count_stats(struct rpc_task *task, void *data)
459{ 448{
460 struct nfs_pgio_data *wdata = data; 449 struct nfs_pgio_header *hdr = data;
461 450
462 rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics); 451 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
463} 452}
464 453
465static void filelayout_write_release(void *data) 454static void filelayout_write_release(void *data)
466{ 455{
467 struct nfs_pgio_data *wdata = data; 456 struct nfs_pgio_header *hdr = data;
468 struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout; 457 struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
469 458
470 filelayout_fenceme(lo->plh_inode, lo); 459 filelayout_fenceme(lo->plh_inode, lo);
471 nfs_put_client(wdata->ds_clp); 460 nfs_put_client(hdr->ds_clp);
472 wdata->header->mds_ops->rpc_release(data); 461 hdr->mds_ops->rpc_release(data);
473} 462}
474 463
475static void filelayout_commit_prepare(struct rpc_task *task, void *data) 464static void filelayout_commit_prepare(struct rpc_task *task, void *data)
@@ -529,19 +518,18 @@ static const struct rpc_call_ops filelayout_commit_call_ops = {
529}; 518};
530 519
531static enum pnfs_try_status 520static enum pnfs_try_status
532filelayout_read_pagelist(struct nfs_pgio_data *data) 521filelayout_read_pagelist(struct nfs_pgio_header *hdr)
533{ 522{
534 struct nfs_pgio_header *hdr = data->header;
535 struct pnfs_layout_segment *lseg = hdr->lseg; 523 struct pnfs_layout_segment *lseg = hdr->lseg;
536 struct nfs4_pnfs_ds *ds; 524 struct nfs4_pnfs_ds *ds;
537 struct rpc_clnt *ds_clnt; 525 struct rpc_clnt *ds_clnt;
538 loff_t offset = data->args.offset; 526 loff_t offset = hdr->args.offset;
539 u32 j, idx; 527 u32 j, idx;
540 struct nfs_fh *fh; 528 struct nfs_fh *fh;
541 529
542 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", 530 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
543 __func__, hdr->inode->i_ino, 531 __func__, hdr->inode->i_ino,
544 data->args.pgbase, (size_t)data->args.count, offset); 532 hdr->args.pgbase, (size_t)hdr->args.count, offset);
545 533
546 /* Retrieve the correct rpc_client for the byte range */ 534 /* Retrieve the correct rpc_client for the byte range */
547 j = nfs4_fl_calc_j_index(lseg, offset); 535 j = nfs4_fl_calc_j_index(lseg, offset);
@@ -559,30 +547,29 @@ filelayout_read_pagelist(struct nfs_pgio_data *data)
559 547
560 /* No multipath support. Use first DS */ 548 /* No multipath support. Use first DS */
561 atomic_inc(&ds->ds_clp->cl_count); 549 atomic_inc(&ds->ds_clp->cl_count);
562 data->ds_clp = ds->ds_clp; 550 hdr->ds_clp = ds->ds_clp;
563 data->ds_idx = idx; 551 hdr->ds_idx = idx;
564 fh = nfs4_fl_select_ds_fh(lseg, j); 552 fh = nfs4_fl_select_ds_fh(lseg, j);
565 if (fh) 553 if (fh)
566 data->args.fh = fh; 554 hdr->args.fh = fh;
567 555
568 data->args.offset = filelayout_get_dserver_offset(lseg, offset); 556 hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
569 data->mds_offset = offset; 557 hdr->mds_offset = offset;
570 558
571 /* Perform an asynchronous read to ds */ 559 /* Perform an asynchronous read to ds */
572 nfs_initiate_pgio(ds_clnt, data, 560 nfs_initiate_pgio(ds_clnt, hdr,
573 &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); 561 &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN);
574 return PNFS_ATTEMPTED; 562 return PNFS_ATTEMPTED;
575} 563}
576 564
577/* Perform async writes. */ 565/* Perform async writes. */
578static enum pnfs_try_status 566static enum pnfs_try_status
579filelayout_write_pagelist(struct nfs_pgio_data *data, int sync) 567filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
580{ 568{
581 struct nfs_pgio_header *hdr = data->header;
582 struct pnfs_layout_segment *lseg = hdr->lseg; 569 struct pnfs_layout_segment *lseg = hdr->lseg;
583 struct nfs4_pnfs_ds *ds; 570 struct nfs4_pnfs_ds *ds;
584 struct rpc_clnt *ds_clnt; 571 struct rpc_clnt *ds_clnt;
585 loff_t offset = data->args.offset; 572 loff_t offset = hdr->args.offset;
586 u32 j, idx; 573 u32 j, idx;
587 struct nfs_fh *fh; 574 struct nfs_fh *fh;
588 575
@@ -598,21 +585,20 @@ filelayout_write_pagelist(struct nfs_pgio_data *data, int sync)
598 return PNFS_NOT_ATTEMPTED; 585 return PNFS_NOT_ATTEMPTED;
599 586
600 dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n", 587 dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n",
601 __func__, hdr->inode->i_ino, sync, (size_t) data->args.count, 588 __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
602 offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); 589 offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
603 590
604 data->pgio_done_cb = filelayout_write_done_cb; 591 hdr->pgio_done_cb = filelayout_write_done_cb;
605 atomic_inc(&ds->ds_clp->cl_count); 592 atomic_inc(&ds->ds_clp->cl_count);
606 data->ds_clp = ds->ds_clp; 593 hdr->ds_clp = ds->ds_clp;
607 data->ds_idx = idx; 594 hdr->ds_idx = idx;
608 fh = nfs4_fl_select_ds_fh(lseg, j); 595 fh = nfs4_fl_select_ds_fh(lseg, j);
609 if (fh) 596 if (fh)
610 data->args.fh = fh; 597 hdr->args.fh = fh;
611 598 hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
612 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
613 599
614 /* Perform an asynchronous write */ 600 /* Perform an asynchronous write */
615 nfs_initiate_pgio(ds_clnt, data, 601 nfs_initiate_pgio(ds_clnt, hdr,
616 &filelayout_write_call_ops, sync, 602 &filelayout_write_call_ops, sync,
617 RPC_TASK_SOFTCONN); 603 RPC_TASK_SOFTCONN);
618 return PNFS_ATTEMPTED; 604 return PNFS_ATTEMPTED;
@@ -1023,6 +1009,7 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
1023 1009
1024/* The generic layer is about to remove the req from the commit list. 1010/* The generic layer is about to remove the req from the commit list.
1025 * If this will make the bucket empty, it will need to put the lseg reference. 1011 * If this will make the bucket empty, it will need to put the lseg reference.
1012 * Note this is must be called holding the inode (/cinfo) lock
1026 */ 1013 */
1027static void 1014static void
1028filelayout_clear_request_commit(struct nfs_page *req, 1015filelayout_clear_request_commit(struct nfs_page *req,
@@ -1030,7 +1017,6 @@ filelayout_clear_request_commit(struct nfs_page *req,
1030{ 1017{
1031 struct pnfs_layout_segment *freeme = NULL; 1018 struct pnfs_layout_segment *freeme = NULL;
1032 1019
1033 spin_lock(cinfo->lock);
1034 if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags)) 1020 if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
1035 goto out; 1021 goto out;
1036 cinfo->ds->nwritten--; 1022 cinfo->ds->nwritten--;
@@ -1045,22 +1031,25 @@ filelayout_clear_request_commit(struct nfs_page *req,
1045 } 1031 }
1046out: 1032out:
1047 nfs_request_remove_commit_list(req, cinfo); 1033 nfs_request_remove_commit_list(req, cinfo);
1048 spin_unlock(cinfo->lock); 1034 pnfs_put_lseg_async(freeme);
1049 pnfs_put_lseg(freeme);
1050} 1035}
1051 1036
1052static struct list_head * 1037static void
1053filelayout_choose_commit_list(struct nfs_page *req, 1038filelayout_mark_request_commit(struct nfs_page *req,
1054 struct pnfs_layout_segment *lseg, 1039 struct pnfs_layout_segment *lseg,
1055 struct nfs_commit_info *cinfo) 1040 struct nfs_commit_info *cinfo)
1041
1056{ 1042{
1057 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 1043 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
1058 u32 i, j; 1044 u32 i, j;
1059 struct list_head *list; 1045 struct list_head *list;
1060 struct pnfs_commit_bucket *buckets; 1046 struct pnfs_commit_bucket *buckets;
1061 1047
1062 if (fl->commit_through_mds) 1048 if (fl->commit_through_mds) {
1063 return &cinfo->mds->list; 1049 list = &cinfo->mds->list;
1050 spin_lock(cinfo->lock);
1051 goto mds_commit;
1052 }
1064 1053
1065 /* Note that we are calling nfs4_fl_calc_j_index on each page 1054 /* Note that we are calling nfs4_fl_calc_j_index on each page
1066 * that ends up being committed to a data server. An attractive 1055 * that ends up being committed to a data server. An attractive
@@ -1084,19 +1073,22 @@ filelayout_choose_commit_list(struct nfs_page *req,
1084 } 1073 }
1085 set_bit(PG_COMMIT_TO_DS, &req->wb_flags); 1074 set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
1086 cinfo->ds->nwritten++; 1075 cinfo->ds->nwritten++;
1087 spin_unlock(cinfo->lock);
1088 return list;
1089}
1090 1076
1091static void 1077mds_commit:
1092filelayout_mark_request_commit(struct nfs_page *req, 1078 /* nfs_request_add_commit_list(). We need to add req to list without
1093 struct pnfs_layout_segment *lseg, 1079 * dropping cinfo lock.
1094 struct nfs_commit_info *cinfo) 1080 */
1095{ 1081 set_bit(PG_CLEAN, &(req)->wb_flags);
1096 struct list_head *list; 1082 nfs_list_add_request(req, list);
1097 1083 cinfo->mds->ncommit++;
1098 list = filelayout_choose_commit_list(req, lseg, cinfo); 1084 spin_unlock(cinfo->lock);
1099 nfs_request_add_commit_list(req, list, cinfo); 1085 if (!cinfo->dreq) {
1086 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1087 inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
1088 BDI_RECLAIMABLE);
1089 __mark_inode_dirty(req->wb_context->dentry->d_inode,
1090 I_DIRTY_DATASYNC);
1091 }
1100} 1092}
1101 1093
1102static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) 1094static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
@@ -1244,15 +1236,63 @@ restart:
1244 spin_unlock(cinfo->lock); 1236 spin_unlock(cinfo->lock);
1245} 1237}
1246 1238
1239/* filelayout_search_commit_reqs - Search lists in @cinfo for the head reqest
1240 * for @page
1241 * @cinfo - commit info for current inode
1242 * @page - page to search for matching head request
1243 *
1244 * Returns a the head request if one is found, otherwise returns NULL.
1245 */
1246static struct nfs_page *
1247filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
1248{
1249 struct nfs_page *freq, *t;
1250 struct pnfs_commit_bucket *b;
1251 int i;
1252
1253 /* Linearly search the commit lists for each bucket until a matching
1254 * request is found */
1255 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
1256 list_for_each_entry_safe(freq, t, &b->written, wb_list) {
1257 if (freq->wb_page == page)
1258 return freq->wb_head;
1259 }
1260 list_for_each_entry_safe(freq, t, &b->committing, wb_list) {
1261 if (freq->wb_page == page)
1262 return freq->wb_head;
1263 }
1264 }
1265
1266 return NULL;
1267}
1268
1269static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
1270{
1271 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
1272 struct pnfs_commit_bucket *bucket = fl_cinfo->buckets;
1273 struct pnfs_layout_segment *freeme;
1274 int i;
1275
1276 for (i = idx; i < fl_cinfo->nbuckets; i++, bucket++) {
1277 if (list_empty(&bucket->committing))
1278 continue;
1279 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
1280 spin_lock(cinfo->lock);
1281 freeme = bucket->clseg;
1282 bucket->clseg = NULL;
1283 spin_unlock(cinfo->lock);
1284 pnfs_put_lseg(freeme);
1285 }
1286}
1287
1247static unsigned int 1288static unsigned int
1248alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) 1289alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
1249{ 1290{
1250 struct pnfs_ds_commit_info *fl_cinfo; 1291 struct pnfs_ds_commit_info *fl_cinfo;
1251 struct pnfs_commit_bucket *bucket; 1292 struct pnfs_commit_bucket *bucket;
1252 struct nfs_commit_data *data; 1293 struct nfs_commit_data *data;
1253 int i, j; 1294 int i;
1254 unsigned int nreq = 0; 1295 unsigned int nreq = 0;
1255 struct pnfs_layout_segment *freeme;
1256 1296
1257 fl_cinfo = cinfo->ds; 1297 fl_cinfo = cinfo->ds;
1258 bucket = fl_cinfo->buckets; 1298 bucket = fl_cinfo->buckets;
@@ -1272,16 +1312,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
1272 } 1312 }
1273 1313
1274 /* Clean up on error */ 1314 /* Clean up on error */
1275 for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) { 1315 filelayout_retry_commit(cinfo, i);
1276 if (list_empty(&bucket->committing))
1277 continue;
1278 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
1279 spin_lock(cinfo->lock);
1280 freeme = bucket->clseg;
1281 bucket->clseg = NULL;
1282 spin_unlock(cinfo->lock);
1283 pnfs_put_lseg(freeme);
1284 }
1285 /* Caller will clean up entries put on list */ 1316 /* Caller will clean up entries put on list */
1286 return nreq; 1317 return nreq;
1287} 1318}
@@ -1301,8 +1332,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
1301 data->lseg = NULL; 1332 data->lseg = NULL;
1302 list_add(&data->pages, &list); 1333 list_add(&data->pages, &list);
1303 nreq++; 1334 nreq++;
1304 } else 1335 } else {
1305 nfs_retry_commit(mds_pages, NULL, cinfo); 1336 nfs_retry_commit(mds_pages, NULL, cinfo);
1337 filelayout_retry_commit(cinfo, 0);
1338 cinfo->completion_ops->error_cleanup(NFS_I(inode));
1339 return -ENOMEM;
1340 }
1306 } 1341 }
1307 1342
1308 nreq += alloc_ds_commits(cinfo, &list); 1343 nreq += alloc_ds_commits(cinfo, &list);
@@ -1380,6 +1415,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
1380 .clear_request_commit = filelayout_clear_request_commit, 1415 .clear_request_commit = filelayout_clear_request_commit,
1381 .scan_commit_lists = filelayout_scan_commit_lists, 1416 .scan_commit_lists = filelayout_scan_commit_lists,
1382 .recover_commit_reqs = filelayout_recover_commit_reqs, 1417 .recover_commit_reqs = filelayout_recover_commit_reqs,
1418 .search_commit_reqs = filelayout_search_commit_reqs,
1383 .commit_pagelist = filelayout_commit_pagelist, 1419 .commit_pagelist = filelayout_commit_pagelist,
1384 .read_pagelist = filelayout_read_pagelist, 1420 .read_pagelist = filelayout_read_pagelist,
1385 .write_pagelist = filelayout_write_pagelist, 1421 .write_pagelist = filelayout_write_pagelist,
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index e2a0361e24c6..8540516f4d71 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -695,7 +695,7 @@ filelayout_get_device_info(struct inode *inode,
695 if (pdev == NULL) 695 if (pdev == NULL)
696 return NULL; 696 return NULL;
697 697
698 pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags); 698 pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
699 if (pages == NULL) { 699 if (pages == NULL) {
700 kfree(pdev); 700 kfree(pdev);
701 return NULL; 701 return NULL;
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b94f80420a58..880618a8b048 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -112,7 +112,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
112 * if the dentry tree reaches them; however if the dentry already 112 * if the dentry tree reaches them; however if the dentry already
113 * exists, we'll pick it up at this point and use it as the root 113 * exists, we'll pick it up at this point and use it as the root
114 */ 114 */
115 ret = d_obtain_alias(inode); 115 ret = d_obtain_root(inode);
116 if (IS_ERR(ret)) { 116 if (IS_ERR(ret)) {
117 dprintk("nfs_get_root: get root dentry failed\n"); 117 dprintk("nfs_get_root: get root dentry failed\n");
118 goto out; 118 goto out;
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 567983d2c0eb..7dd55b745c4d 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -174,7 +174,9 @@ static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
174 174
175static struct key_type key_type_id_resolver = { 175static struct key_type key_type_id_resolver = {
176 .name = "id_resolver", 176 .name = "id_resolver",
177 .instantiate = user_instantiate, 177 .preparse = user_preparse,
178 .free_preparse = user_free_preparse,
179 .instantiate = generic_key_instantiate,
178 .match = user_match, 180 .match = user_match,
179 .revoke = user_revoke, 181 .revoke = user_revoke,
180 .destroy = user_destroy, 182 .destroy = user_destroy,
@@ -282,6 +284,8 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
282 desc, "", 0, idmap); 284 desc, "", 0, idmap);
283 mutex_unlock(&idmap->idmap_mutex); 285 mutex_unlock(&idmap->idmap_mutex);
284 } 286 }
287 if (!IS_ERR(rkey))
288 set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags);
285 289
286 kfree(desc); 290 kfree(desc);
287 return rkey; 291 return rkey;
@@ -394,7 +398,9 @@ static const struct rpc_pipe_ops idmap_upcall_ops = {
394 398
395static struct key_type key_type_id_resolver_legacy = { 399static struct key_type key_type_id_resolver_legacy = {
396 .name = "id_legacy", 400 .name = "id_legacy",
397 .instantiate = user_instantiate, 401 .preparse = user_preparse,
402 .free_preparse = user_free_preparse,
403 .instantiate = generic_key_instantiate,
398 .match = user_match, 404 .match = user_match,
399 .revoke = user_revoke, 405 .revoke = user_revoke,
400 .destroy = user_destroy, 406 .destroy = user_destroy,
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index abd37a380535..577a36f0a510 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1002,6 +1002,15 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
1002} 1002}
1003EXPORT_SYMBOL_GPL(nfs_revalidate_inode); 1003EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
1004 1004
1005int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode)
1006{
1007 if (!(NFS_I(inode)->cache_validity &
1008 (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
1009 && !nfs_attribute_cache_expired(inode))
1010 return NFS_STALE(inode) ? -ESTALE : 0;
1011 return -ECHILD;
1012}
1013
1005static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) 1014static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
1006{ 1015{
1007 struct nfs_inode *nfsi = NFS_I(inode); 1016 struct nfs_inode *nfsi = NFS_I(inode);
@@ -1840,11 +1849,12 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
1840static int nfs_net_init(struct net *net) 1849static int nfs_net_init(struct net *net)
1841{ 1850{
1842 nfs_clients_init(net); 1851 nfs_clients_init(net);
1843 return 0; 1852 return nfs_fs_proc_net_init(net);
1844} 1853}
1845 1854
1846static void nfs_net_exit(struct net *net) 1855static void nfs_net_exit(struct net *net)
1847{ 1856{
1857 nfs_fs_proc_net_exit(net);
1848 nfs_cleanup_cb_ident_idr(net); 1858 nfs_cleanup_cb_ident_idr(net);
1849} 1859}
1850 1860
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 617f36611d4a..9056622d2230 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -195,7 +195,16 @@ extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
195#ifdef CONFIG_PROC_FS 195#ifdef CONFIG_PROC_FS
196extern int __init nfs_fs_proc_init(void); 196extern int __init nfs_fs_proc_init(void);
197extern void nfs_fs_proc_exit(void); 197extern void nfs_fs_proc_exit(void);
198extern int nfs_fs_proc_net_init(struct net *net);
199extern void nfs_fs_proc_net_exit(struct net *net);
198#else 200#else
201static inline int nfs_fs_proc_net_init(struct net *net)
202{
203 return 0;
204}
205static inline void nfs_fs_proc_net_exit(struct net *net)
206{
207}
199static inline int nfs_fs_proc_init(void) 208static inline int nfs_fs_proc_init(void)
200{ 209{
201 return 0; 210 return 0;
@@ -238,11 +247,11 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
238int nfs_iocounter_wait(struct nfs_io_counter *c); 247int nfs_iocounter_wait(struct nfs_io_counter *c);
239 248
240extern const struct nfs_pageio_ops nfs_pgio_rw_ops; 249extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
241struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *); 250struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
242void nfs_rw_header_free(struct nfs_pgio_header *); 251void nfs_pgio_header_free(struct nfs_pgio_header *);
243void nfs_pgio_data_release(struct nfs_pgio_data *); 252void nfs_pgio_data_destroy(struct nfs_pgio_header *);
244int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); 253int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
245int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *, 254int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *,
246 const struct rpc_call_ops *, int, int); 255 const struct rpc_call_ops *, int, int);
247void nfs_free_request(struct nfs_page *req); 256void nfs_free_request(struct nfs_page *req);
248 257
@@ -442,6 +451,7 @@ int nfs_scan_commit(struct inode *inode, struct list_head *dst,
442void nfs_mark_request_commit(struct nfs_page *req, 451void nfs_mark_request_commit(struct nfs_page *req,
443 struct pnfs_layout_segment *lseg, 452 struct pnfs_layout_segment *lseg,
444 struct nfs_commit_info *cinfo); 453 struct nfs_commit_info *cinfo);
454int nfs_write_need_commit(struct nfs_pgio_header *);
445int nfs_generic_commit_list(struct inode *inode, struct list_head *head, 455int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
446 int how, struct nfs_commit_info *cinfo); 456 int how, struct nfs_commit_info *cinfo);
447void nfs_retry_commit(struct list_head *page_list, 457void nfs_retry_commit(struct list_head *page_list,
@@ -482,7 +492,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
482extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); 492extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
483 493
484/* nfs4proc.c */ 494/* nfs4proc.c */
485extern void __nfs4_read_done_cb(struct nfs_pgio_data *); 495extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
486extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, 496extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
487 const struct rpc_timeout *timeparms, 497 const struct rpc_timeout *timeparms,
488 const char *ip_addr); 498 const char *ip_addr);
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index 8ee1fab83268..ef221fb8a183 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -29,6 +29,9 @@ struct nfs_net {
29#endif 29#endif
30 spinlock_t nfs_client_lock; 30 spinlock_t nfs_client_lock;
31 struct timespec boot_time; 31 struct timespec boot_time;
32#ifdef CONFIG_PROC_FS
33 struct proc_dir_entry *proc_nfsfs;
34#endif
32}; 35};
33 36
34extern int nfs_net_id; 37extern int nfs_net_id;
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 8f854dde4150..d0fec260132a 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -256,7 +256,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data,
256 char *p = data + *result; 256 char *p = data + *result;
257 257
258 acl = get_acl(inode, type); 258 acl = get_acl(inode, type);
259 if (!acl) 259 if (IS_ERR_OR_NULL(acl))
260 return 0; 260 return 0;
261 261
262 posix_acl_release(acl); 262 posix_acl_release(acl);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index f0afa291fd58..809670eba52a 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -795,41 +795,44 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
795 return status; 795 return status;
796} 796}
797 797
798static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data) 798static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
799{ 799{
800 struct inode *inode = data->header->inode; 800 struct inode *inode = hdr->inode;
801 801
802 if (nfs3_async_handle_jukebox(task, inode)) 802 if (nfs3_async_handle_jukebox(task, inode))
803 return -EAGAIN; 803 return -EAGAIN;
804 804
805 nfs_invalidate_atime(inode); 805 nfs_invalidate_atime(inode);
806 nfs_refresh_inode(inode, &data->fattr); 806 nfs_refresh_inode(inode, &hdr->fattr);
807 return 0; 807 return 0;
808} 808}
809 809
810static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) 810static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr,
811 struct rpc_message *msg)
811{ 812{
812 msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; 813 msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
813} 814}
814 815
815static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) 816static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task,
817 struct nfs_pgio_header *hdr)
816{ 818{
817 rpc_call_start(task); 819 rpc_call_start(task);
818 return 0; 820 return 0;
819} 821}
820 822
821static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data) 823static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
822{ 824{
823 struct inode *inode = data->header->inode; 825 struct inode *inode = hdr->inode;
824 826
825 if (nfs3_async_handle_jukebox(task, inode)) 827 if (nfs3_async_handle_jukebox(task, inode))
826 return -EAGAIN; 828 return -EAGAIN;
827 if (task->tk_status >= 0) 829 if (task->tk_status >= 0)
828 nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); 830 nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
829 return 0; 831 return 0;
830} 832}
831 833
832static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) 834static void nfs3_proc_write_setup(struct nfs_pgio_header *hdr,
835 struct rpc_message *msg)
833{ 836{
834 msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; 837 msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
835} 838}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ba2affa51941..92193eddb41d 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -54,7 +54,7 @@ struct nfs4_minor_version_ops {
54 const nfs4_stateid *); 54 const nfs4_stateid *);
55 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, 55 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
56 struct nfs_fsinfo *); 56 struct nfs_fsinfo *);
57 int (*free_lock_state)(struct nfs_server *, 57 void (*free_lock_state)(struct nfs_server *,
58 struct nfs4_lock_state *); 58 struct nfs4_lock_state *);
59 const struct rpc_call_ops *call_sync_ops; 59 const struct rpc_call_ops *call_sync_ops;
60 const struct nfs4_state_recovery_ops *reboot_recovery_ops; 60 const struct nfs4_state_recovery_ops *reboot_recovery_ops;
@@ -129,27 +129,17 @@ enum {
129 * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) 129 * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
130 */ 130 */
131 131
132struct nfs4_lock_owner {
133 unsigned int lo_type;
134#define NFS4_ANY_LOCK_TYPE (0U)
135#define NFS4_FLOCK_LOCK_TYPE (1U << 0)
136#define NFS4_POSIX_LOCK_TYPE (1U << 1)
137 union {
138 fl_owner_t posix_owner;
139 pid_t flock_owner;
140 } lo_u;
141};
142
143struct nfs4_lock_state { 132struct nfs4_lock_state {
144 struct list_head ls_locks; /* Other lock stateids */ 133 struct list_head ls_locks; /* Other lock stateids */
145 struct nfs4_state * ls_state; /* Pointer to open state */ 134 struct nfs4_state * ls_state; /* Pointer to open state */
146#define NFS_LOCK_INITIALIZED 0 135#define NFS_LOCK_INITIALIZED 0
147#define NFS_LOCK_LOST 1 136#define NFS_LOCK_LOST 1
148 unsigned long ls_flags; 137 unsigned long ls_flags;
149 struct nfs_seqid_counter ls_seqid; 138 struct nfs_seqid_counter ls_seqid;
150 nfs4_stateid ls_stateid; 139 nfs4_stateid ls_stateid;
151 atomic_t ls_count; 140 atomic_t ls_count;
152 struct nfs4_lock_owner ls_owner; 141 fl_owner_t ls_owner;
142 struct work_struct ls_release;
153}; 143};
154 144
155/* bits for nfs4_state->flags */ 145/* bits for nfs4_state->flags */
@@ -337,11 +327,11 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
337 */ 327 */
338static inline void 328static inline void
339nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, 329nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
340 struct rpc_message *msg, struct nfs_pgio_data *wdata) 330 struct rpc_message *msg, struct nfs_pgio_header *hdr)
341{ 331{
342 if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) && 332 if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) &&
343 !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags)) 333 !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags))
344 wdata->args.stable = NFS_FILE_SYNC; 334 hdr->args.stable = NFS_FILE_SYNC;
345} 335}
346#else /* CONFIG_NFS_v4_1 */ 336#else /* CONFIG_NFS_v4_1 */
347static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) 337static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -369,7 +359,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags,
369 359
370static inline void 360static inline void
371nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, 361nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
372 struct rpc_message *msg, struct nfs_pgio_data *wdata) 362 struct rpc_message *msg, struct nfs_pgio_header *hdr)
373{ 363{
374} 364}
375#endif /* CONFIG_NFS_V4_1 */ 365#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index aa9ef4876046..53e435a95260 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -855,6 +855,11 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
855 }; 855 };
856 struct rpc_timeout ds_timeout; 856 struct rpc_timeout ds_timeout;
857 struct nfs_client *clp; 857 struct nfs_client *clp;
858 char buf[INET6_ADDRSTRLEN + 1];
859
860 if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
861 return ERR_PTR(-EINVAL);
862 cl_init.hostname = buf;
858 863
859 /* 864 /*
860 * Set an authflavor equual to the MDS value. Use the MDS nfs_client 865 * Set an authflavor equual to the MDS value. Use the MDS nfs_client
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4bf3d97cc5a0..75ae8d22f067 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1952,6 +1952,14 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
1952 return status; 1952 return status;
1953} 1953}
1954 1954
1955/*
1956 * Additional permission checks in order to distinguish between an
1957 * open for read, and an open for execute. This works around the
1958 * fact that NFSv4 OPEN treats read and execute permissions as being
1959 * the same.
1960 * Note that in the non-execute case, we want to turn off permission
1961 * checking if we just created a new file (POSIX open() semantics).
1962 */
1955static int nfs4_opendata_access(struct rpc_cred *cred, 1963static int nfs4_opendata_access(struct rpc_cred *cred,
1956 struct nfs4_opendata *opendata, 1964 struct nfs4_opendata *opendata,
1957 struct nfs4_state *state, fmode_t fmode, 1965 struct nfs4_state *state, fmode_t fmode,
@@ -1966,14 +1974,14 @@ static int nfs4_opendata_access(struct rpc_cred *cred,
1966 return 0; 1974 return 0;
1967 1975
1968 mask = 0; 1976 mask = 0;
1969 /* don't check MAY_WRITE - a newly created file may not have 1977 /*
1970 * write mode bits, but POSIX allows the creating process to write. 1978 * Use openflags to check for exec, because fmode won't
1971 * use openflags to check for exec, because fmode won't 1979 * always have FMODE_EXEC set when file open for exec.
1972 * always have FMODE_EXEC set when file open for exec. */ 1980 */
1973 if (openflags & __FMODE_EXEC) { 1981 if (openflags & __FMODE_EXEC) {
1974 /* ONLY check for exec rights */ 1982 /* ONLY check for exec rights */
1975 mask = MAY_EXEC; 1983 mask = MAY_EXEC;
1976 } else if (fmode & FMODE_READ) 1984 } else if ((fmode & FMODE_READ) && !opendata->file_created)
1977 mask = MAY_READ; 1985 mask = MAY_READ;
1978 1986
1979 cache.cred = cred; 1987 cache.cred = cred;
@@ -2216,8 +2224,15 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
2216 seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); 2224 seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
2217 2225
2218 ret = _nfs4_proc_open(opendata); 2226 ret = _nfs4_proc_open(opendata);
2219 if (ret != 0) 2227 if (ret != 0) {
2228 if (ret == -ENOENT) {
2229 d_drop(opendata->dentry);
2230 d_add(opendata->dentry, NULL);
2231 nfs_set_verifier(opendata->dentry,
2232 nfs_save_change_attribute(opendata->dir->d_inode));
2233 }
2220 goto out; 2234 goto out;
2235 }
2221 2236
2222 state = nfs4_opendata_to_nfs4_state(opendata); 2237 state = nfs4_opendata_to_nfs4_state(opendata);
2223 ret = PTR_ERR(state); 2238 ret = PTR_ERR(state);
@@ -2647,6 +2662,48 @@ static const struct rpc_call_ops nfs4_close_ops = {
2647 .rpc_release = nfs4_free_closedata, 2662 .rpc_release = nfs4_free_closedata,
2648}; 2663};
2649 2664
2665static bool nfs4_state_has_opener(struct nfs4_state *state)
2666{
2667 /* first check existing openers */
2668 if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 &&
2669 state->n_rdonly != 0)
2670 return true;
2671
2672 if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 &&
2673 state->n_wronly != 0)
2674 return true;
2675
2676 if (test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 &&
2677 state->n_rdwr != 0)
2678 return true;
2679
2680 return false;
2681}
2682
2683static bool nfs4_roc(struct inode *inode)
2684{
2685 struct nfs_inode *nfsi = NFS_I(inode);
2686 struct nfs_open_context *ctx;
2687 struct nfs4_state *state;
2688
2689 spin_lock(&inode->i_lock);
2690 list_for_each_entry(ctx, &nfsi->open_files, list) {
2691 state = ctx->state;
2692 if (state == NULL)
2693 continue;
2694 if (nfs4_state_has_opener(state)) {
2695 spin_unlock(&inode->i_lock);
2696 return false;
2697 }
2698 }
2699 spin_unlock(&inode->i_lock);
2700
2701 if (nfs4_check_delegation(inode, FMODE_READ))
2702 return false;
2703
2704 return pnfs_roc(inode);
2705}
2706
2650/* 2707/*
2651 * It is possible for data to be read/written from a mem-mapped file 2708 * It is possible for data to be read/written from a mem-mapped file
2652 * after the sys_close call (which hits the vfs layer as a flush). 2709 * after the sys_close call (which hits the vfs layer as a flush).
@@ -2697,7 +2754,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
2697 calldata->res.fattr = &calldata->fattr; 2754 calldata->res.fattr = &calldata->fattr;
2698 calldata->res.seqid = calldata->arg.seqid; 2755 calldata->res.seqid = calldata->arg.seqid;
2699 calldata->res.server = server; 2756 calldata->res.server = server;
2700 calldata->roc = pnfs_roc(state->inode); 2757 calldata->roc = nfs4_roc(state->inode);
2701 nfs_sb_active(calldata->inode->i_sb); 2758 nfs_sb_active(calldata->inode->i_sb);
2702 2759
2703 msg.rpc_argp = &calldata->arg; 2760 msg.rpc_argp = &calldata->arg;
@@ -4033,24 +4090,25 @@ static bool nfs4_error_stateid_expired(int err)
4033 return false; 4090 return false;
4034} 4091}
4035 4092
4036void __nfs4_read_done_cb(struct nfs_pgio_data *data) 4093void __nfs4_read_done_cb(struct nfs_pgio_header *hdr)
4037{ 4094{
4038 nfs_invalidate_atime(data->header->inode); 4095 nfs_invalidate_atime(hdr->inode);
4039} 4096}
4040 4097
4041static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) 4098static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
4042{ 4099{
4043 struct nfs_server *server = NFS_SERVER(data->header->inode); 4100 struct nfs_server *server = NFS_SERVER(hdr->inode);
4044 4101
4045 trace_nfs4_read(data, task->tk_status); 4102 trace_nfs4_read(hdr, task->tk_status);
4046 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { 4103 if (nfs4_async_handle_error(task, server,
4104 hdr->args.context->state) == -EAGAIN) {
4047 rpc_restart_call_prepare(task); 4105 rpc_restart_call_prepare(task);
4048 return -EAGAIN; 4106 return -EAGAIN;
4049 } 4107 }
4050 4108
4051 __nfs4_read_done_cb(data); 4109 __nfs4_read_done_cb(hdr);
4052 if (task->tk_status > 0) 4110 if (task->tk_status > 0)
4053 renew_lease(server, data->timestamp); 4111 renew_lease(server, hdr->timestamp);
4054 return 0; 4112 return 0;
4055} 4113}
4056 4114
@@ -4068,54 +4126,59 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task,
4068 return true; 4126 return true;
4069} 4127}
4070 4128
4071static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data) 4129static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
4072{ 4130{
4073 4131
4074 dprintk("--> %s\n", __func__); 4132 dprintk("--> %s\n", __func__);
4075 4133
4076 if (!nfs4_sequence_done(task, &data->res.seq_res)) 4134 if (!nfs4_sequence_done(task, &hdr->res.seq_res))
4077 return -EAGAIN; 4135 return -EAGAIN;
4078 if (nfs4_read_stateid_changed(task, &data->args)) 4136 if (nfs4_read_stateid_changed(task, &hdr->args))
4079 return -EAGAIN; 4137 return -EAGAIN;
4080 return data->pgio_done_cb ? data->pgio_done_cb(task, data) : 4138 return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
4081 nfs4_read_done_cb(task, data); 4139 nfs4_read_done_cb(task, hdr);
4082} 4140}
4083 4141
4084static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) 4142static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
4143 struct rpc_message *msg)
4085{ 4144{
4086 data->timestamp = jiffies; 4145 hdr->timestamp = jiffies;
4087 data->pgio_done_cb = nfs4_read_done_cb; 4146 hdr->pgio_done_cb = nfs4_read_done_cb;
4088 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; 4147 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
4089 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); 4148 nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0);
4090} 4149}
4091 4150
4092static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) 4151static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task,
4152 struct nfs_pgio_header *hdr)
4093{ 4153{
4094 if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), 4154 if (nfs4_setup_sequence(NFS_SERVER(hdr->inode),
4095 &data->args.seq_args, 4155 &hdr->args.seq_args,
4096 &data->res.seq_res, 4156 &hdr->res.seq_res,
4097 task)) 4157 task))
4098 return 0; 4158 return 0;
4099 if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, 4159 if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
4100 data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO) 4160 hdr->args.lock_context,
4161 hdr->rw_ops->rw_mode) == -EIO)
4101 return -EIO; 4162 return -EIO;
4102 if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) 4163 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags)))
4103 return -EIO; 4164 return -EIO;
4104 return 0; 4165 return 0;
4105} 4166}
4106 4167
4107static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) 4168static int nfs4_write_done_cb(struct rpc_task *task,
4169 struct nfs_pgio_header *hdr)
4108{ 4170{
4109 struct inode *inode = data->header->inode; 4171 struct inode *inode = hdr->inode;
4110 4172
4111 trace_nfs4_write(data, task->tk_status); 4173 trace_nfs4_write(hdr, task->tk_status);
4112 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { 4174 if (nfs4_async_handle_error(task, NFS_SERVER(inode),
4175 hdr->args.context->state) == -EAGAIN) {
4113 rpc_restart_call_prepare(task); 4176 rpc_restart_call_prepare(task);
4114 return -EAGAIN; 4177 return -EAGAIN;
4115 } 4178 }
4116 if (task->tk_status >= 0) { 4179 if (task->tk_status >= 0) {
4117 renew_lease(NFS_SERVER(inode), data->timestamp); 4180 renew_lease(NFS_SERVER(inode), hdr->timestamp);
4118 nfs_post_op_update_inode_force_wcc(inode, &data->fattr); 4181 nfs_post_op_update_inode_force_wcc(inode, &hdr->fattr);
4119 } 4182 }
4120 return 0; 4183 return 0;
4121} 4184}
@@ -4134,23 +4197,21 @@ static bool nfs4_write_stateid_changed(struct rpc_task *task,
4134 return true; 4197 return true;
4135} 4198}
4136 4199
4137static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data) 4200static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
4138{ 4201{
4139 if (!nfs4_sequence_done(task, &data->res.seq_res)) 4202 if (!nfs4_sequence_done(task, &hdr->res.seq_res))
4140 return -EAGAIN; 4203 return -EAGAIN;
4141 if (nfs4_write_stateid_changed(task, &data->args)) 4204 if (nfs4_write_stateid_changed(task, &hdr->args))
4142 return -EAGAIN; 4205 return -EAGAIN;
4143 return data->pgio_done_cb ? data->pgio_done_cb(task, data) : 4206 return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
4144 nfs4_write_done_cb(task, data); 4207 nfs4_write_done_cb(task, hdr);
4145} 4208}
4146 4209
4147static 4210static
4148bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data) 4211bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
4149{ 4212{
4150 const struct nfs_pgio_header *hdr = data->header;
4151
4152 /* Don't request attributes for pNFS or O_DIRECT writes */ 4213 /* Don't request attributes for pNFS or O_DIRECT writes */
4153 if (data->ds_clp != NULL || hdr->dreq != NULL) 4214 if (hdr->ds_clp != NULL || hdr->dreq != NULL)
4154 return false; 4215 return false;
4155 /* Otherwise, request attributes if and only if we don't hold 4216 /* Otherwise, request attributes if and only if we don't hold
4156 * a delegation 4217 * a delegation
@@ -4158,23 +4219,24 @@ bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data)
4158 return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; 4219 return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
4159} 4220}
4160 4221
4161static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) 4222static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
4223 struct rpc_message *msg)
4162{ 4224{
4163 struct nfs_server *server = NFS_SERVER(data->header->inode); 4225 struct nfs_server *server = NFS_SERVER(hdr->inode);
4164 4226
4165 if (!nfs4_write_need_cache_consistency_data(data)) { 4227 if (!nfs4_write_need_cache_consistency_data(hdr)) {
4166 data->args.bitmask = NULL; 4228 hdr->args.bitmask = NULL;
4167 data->res.fattr = NULL; 4229 hdr->res.fattr = NULL;
4168 } else 4230 } else
4169 data->args.bitmask = server->cache_consistency_bitmask; 4231 hdr->args.bitmask = server->cache_consistency_bitmask;
4170 4232
4171 if (!data->pgio_done_cb) 4233 if (!hdr->pgio_done_cb)
4172 data->pgio_done_cb = nfs4_write_done_cb; 4234 hdr->pgio_done_cb = nfs4_write_done_cb;
4173 data->res.server = server; 4235 hdr->res.server = server;
4174 data->timestamp = jiffies; 4236 hdr->timestamp = jiffies;
4175 4237
4176 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; 4238 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
4177 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); 4239 nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1);
4178} 4240}
4179 4241
4180static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) 4242static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
@@ -4881,6 +4943,18 @@ nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len)
4881 return scnprintf(buf, len, "tcp"); 4943 return scnprintf(buf, len, "tcp");
4882} 4944}
4883 4945
4946static void nfs4_setclientid_done(struct rpc_task *task, void *calldata)
4947{
4948 struct nfs4_setclientid *sc = calldata;
4949
4950 if (task->tk_status == 0)
4951 sc->sc_cred = get_rpccred(task->tk_rqstp->rq_cred);
4952}
4953
4954static const struct rpc_call_ops nfs4_setclientid_ops = {
4955 .rpc_call_done = nfs4_setclientid_done,
4956};
4957
4884/** 4958/**
4885 * nfs4_proc_setclientid - Negotiate client ID 4959 * nfs4_proc_setclientid - Negotiate client ID
4886 * @clp: state data structure 4960 * @clp: state data structure
@@ -4907,6 +4981,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
4907 .rpc_resp = res, 4981 .rpc_resp = res,
4908 .rpc_cred = cred, 4982 .rpc_cred = cred,
4909 }; 4983 };
4984 struct rpc_task *task;
4985 struct rpc_task_setup task_setup_data = {
4986 .rpc_client = clp->cl_rpcclient,
4987 .rpc_message = &msg,
4988 .callback_ops = &nfs4_setclientid_ops,
4989 .callback_data = &setclientid,
4990 .flags = RPC_TASK_TIMEOUT,
4991 };
4910 int status; 4992 int status;
4911 4993
4912 /* nfs_client_id4 */ 4994 /* nfs_client_id4 */
@@ -4933,7 +5015,18 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
4933 dprintk("NFS call setclientid auth=%s, '%.*s'\n", 5015 dprintk("NFS call setclientid auth=%s, '%.*s'\n",
4934 clp->cl_rpcclient->cl_auth->au_ops->au_name, 5016 clp->cl_rpcclient->cl_auth->au_ops->au_name,
4935 setclientid.sc_name_len, setclientid.sc_name); 5017 setclientid.sc_name_len, setclientid.sc_name);
4936 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 5018 task = rpc_run_task(&task_setup_data);
5019 if (IS_ERR(task)) {
5020 status = PTR_ERR(task);
5021 goto out;
5022 }
5023 status = task->tk_status;
5024 if (setclientid.sc_cred) {
5025 clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
5026 put_rpccred(setclientid.sc_cred);
5027 }
5028 rpc_put_task(task);
5029out:
4937 trace_nfs4_setclientid(clp, status); 5030 trace_nfs4_setclientid(clp, status);
4938 dprintk("NFS reply setclientid: %d\n", status); 5031 dprintk("NFS reply setclientid: %d\n", status);
4939 return status; 5032 return status;
@@ -4975,6 +5068,9 @@ struct nfs4_delegreturndata {
4975 unsigned long timestamp; 5068 unsigned long timestamp;
4976 struct nfs_fattr fattr; 5069 struct nfs_fattr fattr;
4977 int rpc_status; 5070 int rpc_status;
5071 struct inode *inode;
5072 bool roc;
5073 u32 roc_barrier;
4978}; 5074};
4979 5075
4980static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) 5076static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
@@ -4988,7 +5084,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
4988 switch (task->tk_status) { 5084 switch (task->tk_status) {
4989 case 0: 5085 case 0:
4990 renew_lease(data->res.server, data->timestamp); 5086 renew_lease(data->res.server, data->timestamp);
4991 break;
4992 case -NFS4ERR_ADMIN_REVOKED: 5087 case -NFS4ERR_ADMIN_REVOKED:
4993 case -NFS4ERR_DELEG_REVOKED: 5088 case -NFS4ERR_DELEG_REVOKED:
4994 case -NFS4ERR_BAD_STATEID: 5089 case -NFS4ERR_BAD_STATEID:
@@ -4996,6 +5091,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
4996 case -NFS4ERR_STALE_STATEID: 5091 case -NFS4ERR_STALE_STATEID:
4997 case -NFS4ERR_EXPIRED: 5092 case -NFS4ERR_EXPIRED:
4998 task->tk_status = 0; 5093 task->tk_status = 0;
5094 if (data->roc)
5095 pnfs_roc_set_barrier(data->inode, data->roc_barrier);
4999 break; 5096 break;
5000 default: 5097 default:
5001 if (nfs4_async_handle_error(task, data->res.server, NULL) == 5098 if (nfs4_async_handle_error(task, data->res.server, NULL) ==
@@ -5009,6 +5106,10 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
5009 5106
5010static void nfs4_delegreturn_release(void *calldata) 5107static void nfs4_delegreturn_release(void *calldata)
5011{ 5108{
5109 struct nfs4_delegreturndata *data = calldata;
5110
5111 if (data->roc)
5112 pnfs_roc_release(data->inode);
5012 kfree(calldata); 5113 kfree(calldata);
5013} 5114}
5014 5115
@@ -5018,6 +5119,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
5018 5119
5019 d_data = (struct nfs4_delegreturndata *)data; 5120 d_data = (struct nfs4_delegreturndata *)data;
5020 5121
5122 if (d_data->roc &&
5123 pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task))
5124 return;
5125
5021 nfs4_setup_sequence(d_data->res.server, 5126 nfs4_setup_sequence(d_data->res.server,
5022 &d_data->args.seq_args, 5127 &d_data->args.seq_args,
5023 &d_data->res.seq_res, 5128 &d_data->res.seq_res,
@@ -5061,6 +5166,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
5061 nfs_fattr_init(data->res.fattr); 5166 nfs_fattr_init(data->res.fattr);
5062 data->timestamp = jiffies; 5167 data->timestamp = jiffies;
5063 data->rpc_status = 0; 5168 data->rpc_status = 0;
5169 data->inode = inode;
5170 data->roc = list_empty(&NFS_I(inode)->open_files) ?
5171 pnfs_roc(inode) : false;
5064 5172
5065 task_setup_data.callback_data = data; 5173 task_setup_data.callback_data = data;
5066 msg.rpc_argp = &data->args; 5174 msg.rpc_argp = &data->args;
@@ -5834,8 +5942,10 @@ struct nfs_release_lockowner_data {
5834static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata) 5942static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
5835{ 5943{
5836 struct nfs_release_lockowner_data *data = calldata; 5944 struct nfs_release_lockowner_data *data = calldata;
5837 nfs40_setup_sequence(data->server, 5945 struct nfs_server *server = data->server;
5838 &data->args.seq_args, &data->res.seq_res, task); 5946 nfs40_setup_sequence(server, &data->args.seq_args,
5947 &data->res.seq_res, task);
5948 data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
5839 data->timestamp = jiffies; 5949 data->timestamp = jiffies;
5840} 5950}
5841 5951
@@ -5852,6 +5962,8 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
5852 break; 5962 break;
5853 case -NFS4ERR_STALE_CLIENTID: 5963 case -NFS4ERR_STALE_CLIENTID:
5854 case -NFS4ERR_EXPIRED: 5964 case -NFS4ERR_EXPIRED:
5965 nfs4_schedule_lease_recovery(server->nfs_client);
5966 break;
5855 case -NFS4ERR_LEASE_MOVED: 5967 case -NFS4ERR_LEASE_MOVED:
5856 case -NFS4ERR_DELAY: 5968 case -NFS4ERR_DELAY:
5857 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) 5969 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN)
@@ -5872,7 +5984,8 @@ static const struct rpc_call_ops nfs4_release_lockowner_ops = {
5872 .rpc_release = nfs4_release_lockowner_release, 5984 .rpc_release = nfs4_release_lockowner_release,
5873}; 5985};
5874 5986
5875static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) 5987static void
5988nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
5876{ 5989{
5877 struct nfs_release_lockowner_data *data; 5990 struct nfs_release_lockowner_data *data;
5878 struct rpc_message msg = { 5991 struct rpc_message msg = {
@@ -5880,11 +5993,11 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
5880 }; 5993 };
5881 5994
5882 if (server->nfs_client->cl_mvops->minor_version != 0) 5995 if (server->nfs_client->cl_mvops->minor_version != 0)
5883 return -EINVAL; 5996 return;
5884 5997
5885 data = kmalloc(sizeof(*data), GFP_NOFS); 5998 data = kmalloc(sizeof(*data), GFP_NOFS);
5886 if (!data) 5999 if (!data)
5887 return -ENOMEM; 6000 return;
5888 data->lsp = lsp; 6001 data->lsp = lsp;
5889 data->server = server; 6002 data->server = server;
5890 data->args.lock_owner.clientid = server->nfs_client->cl_clientid; 6003 data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
@@ -5895,7 +6008,6 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
5895 msg.rpc_resp = &data->res; 6008 msg.rpc_resp = &data->res;
5896 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); 6009 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
5897 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data); 6010 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
5898 return 0;
5899} 6011}
5900 6012
5901#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" 6013#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
@@ -8182,7 +8294,8 @@ static int nfs41_free_stateid(struct nfs_server *server,
8182 return ret; 8294 return ret;
8183} 8295}
8184 8296
8185static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) 8297static void
8298nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
8186{ 8299{
8187 struct rpc_task *task; 8300 struct rpc_task *task;
8188 struct rpc_cred *cred = lsp->ls_state->owner->so_cred; 8301 struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
@@ -8190,9 +8303,8 @@ static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_sta
8190 task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); 8303 task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
8191 nfs4_free_lock_state(server, lsp); 8304 nfs4_free_lock_state(server, lsp);
8192 if (IS_ERR(task)) 8305 if (IS_ERR(task))
8193 return PTR_ERR(task); 8306 return;
8194 rpc_put_task(task); 8307 rpc_put_task(task);
8195 return 0;
8196} 8308}
8197 8309
8198static bool nfs41_match_stateid(const nfs4_stateid *s1, 8310static bool nfs41_match_stateid(const nfs4_stateid *s1,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 42f121182167..a043f618cd5a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -787,33 +787,36 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
787 * that is compatible with current->files 787 * that is compatible with current->files
788 */ 788 */
789static struct nfs4_lock_state * 789static struct nfs4_lock_state *
790__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) 790__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
791{ 791{
792 struct nfs4_lock_state *pos; 792 struct nfs4_lock_state *pos;
793 list_for_each_entry(pos, &state->lock_states, ls_locks) { 793 list_for_each_entry(pos, &state->lock_states, ls_locks) {
794 if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type) 794 if (pos->ls_owner != fl_owner)
795 continue; 795 continue;
796 switch (pos->ls_owner.lo_type) {
797 case NFS4_POSIX_LOCK_TYPE:
798 if (pos->ls_owner.lo_u.posix_owner != fl_owner)
799 continue;
800 break;
801 case NFS4_FLOCK_LOCK_TYPE:
802 if (pos->ls_owner.lo_u.flock_owner != fl_pid)
803 continue;
804 }
805 atomic_inc(&pos->ls_count); 796 atomic_inc(&pos->ls_count);
806 return pos; 797 return pos;
807 } 798 }
808 return NULL; 799 return NULL;
809} 800}
810 801
802static void
803free_lock_state_work(struct work_struct *work)
804{
805 struct nfs4_lock_state *lsp = container_of(work,
806 struct nfs4_lock_state, ls_release);
807 struct nfs4_state *state = lsp->ls_state;
808 struct nfs_server *server = state->owner->so_server;
809 struct nfs_client *clp = server->nfs_client;
810
811 clp->cl_mvops->free_lock_state(server, lsp);
812}
813
811/* 814/*
812 * Return a compatible lock_state. If no initialized lock_state structure 815 * Return a compatible lock_state. If no initialized lock_state structure
813 * exists, return an uninitialized one. 816 * exists, return an uninitialized one.
814 * 817 *
815 */ 818 */
816static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) 819static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
817{ 820{
818 struct nfs4_lock_state *lsp; 821 struct nfs4_lock_state *lsp;
819 struct nfs_server *server = state->owner->so_server; 822 struct nfs_server *server = state->owner->so_server;
@@ -824,21 +827,12 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
824 nfs4_init_seqid_counter(&lsp->ls_seqid); 827 nfs4_init_seqid_counter(&lsp->ls_seqid);
825 atomic_set(&lsp->ls_count, 1); 828 atomic_set(&lsp->ls_count, 1);
826 lsp->ls_state = state; 829 lsp->ls_state = state;
827 lsp->ls_owner.lo_type = type; 830 lsp->ls_owner = fl_owner;
828 switch (lsp->ls_owner.lo_type) {
829 case NFS4_FLOCK_LOCK_TYPE:
830 lsp->ls_owner.lo_u.flock_owner = fl_pid;
831 break;
832 case NFS4_POSIX_LOCK_TYPE:
833 lsp->ls_owner.lo_u.posix_owner = fl_owner;
834 break;
835 default:
836 goto out_free;
837 }
838 lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS); 831 lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
839 if (lsp->ls_seqid.owner_id < 0) 832 if (lsp->ls_seqid.owner_id < 0)
840 goto out_free; 833 goto out_free;
841 INIT_LIST_HEAD(&lsp->ls_locks); 834 INIT_LIST_HEAD(&lsp->ls_locks);
835 INIT_WORK(&lsp->ls_release, free_lock_state_work);
842 return lsp; 836 return lsp;
843out_free: 837out_free:
844 kfree(lsp); 838 kfree(lsp);
@@ -857,13 +851,13 @@ void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp
857 * exists, return an uninitialized one. 851 * exists, return an uninitialized one.
858 * 852 *
859 */ 853 */
860static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type) 854static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
861{ 855{
862 struct nfs4_lock_state *lsp, *new = NULL; 856 struct nfs4_lock_state *lsp, *new = NULL;
863 857
864 for(;;) { 858 for(;;) {
865 spin_lock(&state->state_lock); 859 spin_lock(&state->state_lock);
866 lsp = __nfs4_find_lock_state(state, owner, pid, type); 860 lsp = __nfs4_find_lock_state(state, owner);
867 if (lsp != NULL) 861 if (lsp != NULL)
868 break; 862 break;
869 if (new != NULL) { 863 if (new != NULL) {
@@ -874,7 +868,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
874 break; 868 break;
875 } 869 }
876 spin_unlock(&state->state_lock); 870 spin_unlock(&state->state_lock);
877 new = nfs4_alloc_lock_state(state, owner, pid, type); 871 new = nfs4_alloc_lock_state(state, owner);
878 if (new == NULL) 872 if (new == NULL)
879 return NULL; 873 return NULL;
880 } 874 }
@@ -902,13 +896,12 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
902 if (list_empty(&state->lock_states)) 896 if (list_empty(&state->lock_states))
903 clear_bit(LK_STATE_IN_USE, &state->flags); 897 clear_bit(LK_STATE_IN_USE, &state->flags);
904 spin_unlock(&state->state_lock); 898 spin_unlock(&state->state_lock);
905 server = state->owner->so_server; 899 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags))
906 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { 900 queue_work(nfsiod_workqueue, &lsp->ls_release);
907 struct nfs_client *clp = server->nfs_client; 901 else {
908 902 server = state->owner->so_server;
909 clp->cl_mvops->free_lock_state(server, lsp);
910 } else
911 nfs4_free_lock_state(server, lsp); 903 nfs4_free_lock_state(server, lsp);
904 }
912} 905}
913 906
914static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) 907static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
@@ -935,13 +928,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
935 928
936 if (fl->fl_ops != NULL) 929 if (fl->fl_ops != NULL)
937 return 0; 930 return 0;
938 if (fl->fl_flags & FL_POSIX) 931 lsp = nfs4_get_lock_state(state, fl->fl_owner);
939 lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
940 else if (fl->fl_flags & FL_FLOCK)
941 lsp = nfs4_get_lock_state(state, NULL, fl->fl_pid,
942 NFS4_FLOCK_LOCK_TYPE);
943 else
944 return -EINVAL;
945 if (lsp == NULL) 932 if (lsp == NULL)
946 return -ENOMEM; 933 return -ENOMEM;
947 fl->fl_u.nfs4_fl.owner = lsp; 934 fl->fl_u.nfs4_fl.owner = lsp;
@@ -955,7 +942,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
955{ 942{
956 struct nfs4_lock_state *lsp; 943 struct nfs4_lock_state *lsp;
957 fl_owner_t fl_owner; 944 fl_owner_t fl_owner;
958 pid_t fl_pid;
959 int ret = -ENOENT; 945 int ret = -ENOENT;
960 946
961 947
@@ -966,9 +952,8 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
966 goto out; 952 goto out;
967 953
968 fl_owner = lockowner->l_owner; 954 fl_owner = lockowner->l_owner;
969 fl_pid = lockowner->l_pid;
970 spin_lock(&state->state_lock); 955 spin_lock(&state->state_lock);
971 lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); 956 lsp = __nfs4_find_lock_state(state, fl_owner);
972 if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) 957 if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
973 ret = -EIO; 958 ret = -EIO;
974 else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { 959 else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 0a744f3a86f6..1c32adbe728d 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -932,11 +932,11 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);
932 932
933DECLARE_EVENT_CLASS(nfs4_read_event, 933DECLARE_EVENT_CLASS(nfs4_read_event,
934 TP_PROTO( 934 TP_PROTO(
935 const struct nfs_pgio_data *data, 935 const struct nfs_pgio_header *hdr,
936 int error 936 int error
937 ), 937 ),
938 938
939 TP_ARGS(data, error), 939 TP_ARGS(hdr, error),
940 940
941 TP_STRUCT__entry( 941 TP_STRUCT__entry(
942 __field(dev_t, dev) 942 __field(dev_t, dev)
@@ -948,12 +948,12 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
948 ), 948 ),
949 949
950 TP_fast_assign( 950 TP_fast_assign(
951 const struct inode *inode = data->header->inode; 951 const struct inode *inode = hdr->inode;
952 __entry->dev = inode->i_sb->s_dev; 952 __entry->dev = inode->i_sb->s_dev;
953 __entry->fileid = NFS_FILEID(inode); 953 __entry->fileid = NFS_FILEID(inode);
954 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); 954 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
955 __entry->offset = data->args.offset; 955 __entry->offset = hdr->args.offset;
956 __entry->count = data->args.count; 956 __entry->count = hdr->args.count;
957 __entry->error = error; 957 __entry->error = error;
958 ), 958 ),
959 959
@@ -972,10 +972,10 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
972#define DEFINE_NFS4_READ_EVENT(name) \ 972#define DEFINE_NFS4_READ_EVENT(name) \
973 DEFINE_EVENT(nfs4_read_event, name, \ 973 DEFINE_EVENT(nfs4_read_event, name, \
974 TP_PROTO( \ 974 TP_PROTO( \
975 const struct nfs_pgio_data *data, \ 975 const struct nfs_pgio_header *hdr, \
976 int error \ 976 int error \
977 ), \ 977 ), \
978 TP_ARGS(data, error)) 978 TP_ARGS(hdr, error))
979DEFINE_NFS4_READ_EVENT(nfs4_read); 979DEFINE_NFS4_READ_EVENT(nfs4_read);
980#ifdef CONFIG_NFS_V4_1 980#ifdef CONFIG_NFS_V4_1
981DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read); 981DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
@@ -983,11 +983,11 @@ DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
983 983
984DECLARE_EVENT_CLASS(nfs4_write_event, 984DECLARE_EVENT_CLASS(nfs4_write_event,
985 TP_PROTO( 985 TP_PROTO(
986 const struct nfs_pgio_data *data, 986 const struct nfs_pgio_header *hdr,
987 int error 987 int error
988 ), 988 ),
989 989
990 TP_ARGS(data, error), 990 TP_ARGS(hdr, error),
991 991
992 TP_STRUCT__entry( 992 TP_STRUCT__entry(
993 __field(dev_t, dev) 993 __field(dev_t, dev)
@@ -999,12 +999,12 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
999 ), 999 ),
1000 1000
1001 TP_fast_assign( 1001 TP_fast_assign(
1002 const struct inode *inode = data->header->inode; 1002 const struct inode *inode = hdr->inode;
1003 __entry->dev = inode->i_sb->s_dev; 1003 __entry->dev = inode->i_sb->s_dev;
1004 __entry->fileid = NFS_FILEID(inode); 1004 __entry->fileid = NFS_FILEID(inode);
1005 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); 1005 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
1006 __entry->offset = data->args.offset; 1006 __entry->offset = hdr->args.offset;
1007 __entry->count = data->args.count; 1007 __entry->count = hdr->args.count;
1008 __entry->error = error; 1008 __entry->error = error;
1009 ), 1009 ),
1010 1010
@@ -1024,10 +1024,10 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
1024#define DEFINE_NFS4_WRITE_EVENT(name) \ 1024#define DEFINE_NFS4_WRITE_EVENT(name) \
1025 DEFINE_EVENT(nfs4_write_event, name, \ 1025 DEFINE_EVENT(nfs4_write_event, name, \
1026 TP_PROTO( \ 1026 TP_PROTO( \
1027 const struct nfs_pgio_data *data, \ 1027 const struct nfs_pgio_header *hdr, \
1028 int error \ 1028 int error \
1029 ), \ 1029 ), \
1030 TP_ARGS(data, error)) 1030 TP_ARGS(hdr, error))
1031DEFINE_NFS4_WRITE_EVENT(nfs4_write); 1031DEFINE_NFS4_WRITE_EVENT(nfs4_write);
1032#ifdef CONFIG_NFS_V4_1 1032#ifdef CONFIG_NFS_V4_1
1033DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write); 1033DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 939ae606cfa4..e13b59d8d9aa 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -7092,7 +7092,7 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
7092 if (!status) 7092 if (!status)
7093 status = decode_sequence(xdr, &res->seq_res, rqstp); 7093 status = decode_sequence(xdr, &res->seq_res, rqstp);
7094 if (!status) 7094 if (!status)
7095 status = decode_reclaim_complete(xdr, (void *)NULL); 7095 status = decode_reclaim_complete(xdr, NULL);
7096 return status; 7096 return status;
7097} 7097}
7098 7098
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 611320753db2..ae05278b3761 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -439,22 +439,21 @@ static void _read_done(struct ore_io_state *ios, void *private)
439 objlayout_read_done(&objios->oir, status, objios->sync); 439 objlayout_read_done(&objios->oir, status, objios->sync);
440} 440}
441 441
442int objio_read_pagelist(struct nfs_pgio_data *rdata) 442int objio_read_pagelist(struct nfs_pgio_header *hdr)
443{ 443{
444 struct nfs_pgio_header *hdr = rdata->header;
445 struct objio_state *objios; 444 struct objio_state *objios;
446 int ret; 445 int ret;
447 446
448 ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true, 447 ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true,
449 hdr->lseg, rdata->args.pages, rdata->args.pgbase, 448 hdr->lseg, hdr->args.pages, hdr->args.pgbase,
450 rdata->args.offset, rdata->args.count, rdata, 449 hdr->args.offset, hdr->args.count, hdr,
451 GFP_KERNEL, &objios); 450 GFP_KERNEL, &objios);
452 if (unlikely(ret)) 451 if (unlikely(ret))
453 return ret; 452 return ret;
454 453
455 objios->ios->done = _read_done; 454 objios->ios->done = _read_done;
456 dprintk("%s: offset=0x%llx length=0x%x\n", __func__, 455 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
457 rdata->args.offset, rdata->args.count); 456 hdr->args.offset, hdr->args.count);
458 ret = ore_read(objios->ios); 457 ret = ore_read(objios->ios);
459 if (unlikely(ret)) 458 if (unlikely(ret))
460 objio_free_result(&objios->oir); 459 objio_free_result(&objios->oir);
@@ -487,11 +486,11 @@ static void _write_done(struct ore_io_state *ios, void *private)
487static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) 486static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
488{ 487{
489 struct objio_state *objios = priv; 488 struct objio_state *objios = priv;
490 struct nfs_pgio_data *wdata = objios->oir.rpcdata; 489 struct nfs_pgio_header *hdr = objios->oir.rpcdata;
491 struct address_space *mapping = wdata->header->inode->i_mapping; 490 struct address_space *mapping = hdr->inode->i_mapping;
492 pgoff_t index = offset / PAGE_SIZE; 491 pgoff_t index = offset / PAGE_SIZE;
493 struct page *page; 492 struct page *page;
494 loff_t i_size = i_size_read(wdata->header->inode); 493 loff_t i_size = i_size_read(hdr->inode);
495 494
496 if (offset >= i_size) { 495 if (offset >= i_size) {
497 *uptodate = true; 496 *uptodate = true;
@@ -531,15 +530,14 @@ static const struct _ore_r4w_op _r4w_op = {
531 .put_page = &__r4w_put_page, 530 .put_page = &__r4w_put_page,
532}; 531};
533 532
534int objio_write_pagelist(struct nfs_pgio_data *wdata, int how) 533int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
535{ 534{
536 struct nfs_pgio_header *hdr = wdata->header;
537 struct objio_state *objios; 535 struct objio_state *objios;
538 int ret; 536 int ret;
539 537
540 ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false, 538 ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false,
541 hdr->lseg, wdata->args.pages, wdata->args.pgbase, 539 hdr->lseg, hdr->args.pages, hdr->args.pgbase,
542 wdata->args.offset, wdata->args.count, wdata, GFP_NOFS, 540 hdr->args.offset, hdr->args.count, hdr, GFP_NOFS,
543 &objios); 541 &objios);
544 if (unlikely(ret)) 542 if (unlikely(ret))
545 return ret; 543 return ret;
@@ -551,7 +549,7 @@ int objio_write_pagelist(struct nfs_pgio_data *wdata, int how)
551 objios->ios->done = _write_done; 549 objios->ios->done = _write_done;
552 550
553 dprintk("%s: offset=0x%llx length=0x%x\n", __func__, 551 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
554 wdata->args.offset, wdata->args.count); 552 hdr->args.offset, hdr->args.count);
555 ret = ore_write(objios->ios); 553 ret = ore_write(objios->ios);
556 if (unlikely(ret)) { 554 if (unlikely(ret)) {
557 objio_free_result(&objios->oir); 555 objio_free_result(&objios->oir);
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 765d3f54e986..697a16d11fac 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -229,36 +229,36 @@ objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
229static void _rpc_read_complete(struct work_struct *work) 229static void _rpc_read_complete(struct work_struct *work)
230{ 230{
231 struct rpc_task *task; 231 struct rpc_task *task;
232 struct nfs_pgio_data *rdata; 232 struct nfs_pgio_header *hdr;
233 233
234 dprintk("%s enter\n", __func__); 234 dprintk("%s enter\n", __func__);
235 task = container_of(work, struct rpc_task, u.tk_work); 235 task = container_of(work, struct rpc_task, u.tk_work);
236 rdata = container_of(task, struct nfs_pgio_data, task); 236 hdr = container_of(task, struct nfs_pgio_header, task);
237 237
238 pnfs_ld_read_done(rdata); 238 pnfs_ld_read_done(hdr);
239} 239}
240 240
241void 241void
242objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) 242objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
243{ 243{
244 struct nfs_pgio_data *rdata = oir->rpcdata; 244 struct nfs_pgio_header *hdr = oir->rpcdata;
245 245
246 oir->status = rdata->task.tk_status = status; 246 oir->status = hdr->task.tk_status = status;
247 if (status >= 0) 247 if (status >= 0)
248 rdata->res.count = status; 248 hdr->res.count = status;
249 else 249 else
250 rdata->header->pnfs_error = status; 250 hdr->pnfs_error = status;
251 objlayout_iodone(oir); 251 objlayout_iodone(oir);
252 /* must not use oir after this point */ 252 /* must not use oir after this point */
253 253
254 dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, 254 dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
255 status, rdata->res.eof, sync); 255 status, hdr->res.eof, sync);
256 256
257 if (sync) 257 if (sync)
258 pnfs_ld_read_done(rdata); 258 pnfs_ld_read_done(hdr);
259 else { 259 else {
260 INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); 260 INIT_WORK(&hdr->task.u.tk_work, _rpc_read_complete);
261 schedule_work(&rdata->task.u.tk_work); 261 schedule_work(&hdr->task.u.tk_work);
262 } 262 }
263} 263}
264 264
@@ -266,12 +266,11 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
266 * Perform sync or async reads. 266 * Perform sync or async reads.
267 */ 267 */
268enum pnfs_try_status 268enum pnfs_try_status
269objlayout_read_pagelist(struct nfs_pgio_data *rdata) 269objlayout_read_pagelist(struct nfs_pgio_header *hdr)
270{ 270{
271 struct nfs_pgio_header *hdr = rdata->header;
272 struct inode *inode = hdr->inode; 271 struct inode *inode = hdr->inode;
273 loff_t offset = rdata->args.offset; 272 loff_t offset = hdr->args.offset;
274 size_t count = rdata->args.count; 273 size_t count = hdr->args.count;
275 int err; 274 int err;
276 loff_t eof; 275 loff_t eof;
277 276
@@ -279,23 +278,23 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata)
279 if (unlikely(offset + count > eof)) { 278 if (unlikely(offset + count > eof)) {
280 if (offset >= eof) { 279 if (offset >= eof) {
281 err = 0; 280 err = 0;
282 rdata->res.count = 0; 281 hdr->res.count = 0;
283 rdata->res.eof = 1; 282 hdr->res.eof = 1;
284 /*FIXME: do we need to call pnfs_ld_read_done() */ 283 /*FIXME: do we need to call pnfs_ld_read_done() */
285 goto out; 284 goto out;
286 } 285 }
287 count = eof - offset; 286 count = eof - offset;
288 } 287 }
289 288
290 rdata->res.eof = (offset + count) >= eof; 289 hdr->res.eof = (offset + count) >= eof;
291 _fix_verify_io_params(hdr->lseg, &rdata->args.pages, 290 _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
292 &rdata->args.pgbase, 291 &hdr->args.pgbase,
293 rdata->args.offset, rdata->args.count); 292 hdr->args.offset, hdr->args.count);
294 293
295 dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", 294 dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
296 __func__, inode->i_ino, offset, count, rdata->res.eof); 295 __func__, inode->i_ino, offset, count, hdr->res.eof);
297 296
298 err = objio_read_pagelist(rdata); 297 err = objio_read_pagelist(hdr);
299 out: 298 out:
300 if (unlikely(err)) { 299 if (unlikely(err)) {
301 hdr->pnfs_error = err; 300 hdr->pnfs_error = err;
@@ -312,38 +311,38 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata)
312static void _rpc_write_complete(struct work_struct *work) 311static void _rpc_write_complete(struct work_struct *work)
313{ 312{
314 struct rpc_task *task; 313 struct rpc_task *task;
315 struct nfs_pgio_data *wdata; 314 struct nfs_pgio_header *hdr;
316 315
317 dprintk("%s enter\n", __func__); 316 dprintk("%s enter\n", __func__);
318 task = container_of(work, struct rpc_task, u.tk_work); 317 task = container_of(work, struct rpc_task, u.tk_work);
319 wdata = container_of(task, struct nfs_pgio_data, task); 318 hdr = container_of(task, struct nfs_pgio_header, task);
320 319
321 pnfs_ld_write_done(wdata); 320 pnfs_ld_write_done(hdr);
322} 321}
323 322
324void 323void
325objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) 324objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
326{ 325{
327 struct nfs_pgio_data *wdata = oir->rpcdata; 326 struct nfs_pgio_header *hdr = oir->rpcdata;
328 327
329 oir->status = wdata->task.tk_status = status; 328 oir->status = hdr->task.tk_status = status;
330 if (status >= 0) { 329 if (status >= 0) {
331 wdata->res.count = status; 330 hdr->res.count = status;
332 wdata->verf.committed = oir->committed; 331 hdr->verf.committed = oir->committed;
333 } else { 332 } else {
334 wdata->header->pnfs_error = status; 333 hdr->pnfs_error = status;
335 } 334 }
336 objlayout_iodone(oir); 335 objlayout_iodone(oir);
337 /* must not use oir after this point */ 336 /* must not use oir after this point */
338 337
339 dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, 338 dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
340 status, wdata->verf.committed, sync); 339 status, hdr->verf.committed, sync);
341 340
342 if (sync) 341 if (sync)
343 pnfs_ld_write_done(wdata); 342 pnfs_ld_write_done(hdr);
344 else { 343 else {
345 INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); 344 INIT_WORK(&hdr->task.u.tk_work, _rpc_write_complete);
346 schedule_work(&wdata->task.u.tk_work); 345 schedule_work(&hdr->task.u.tk_work);
347 } 346 }
348} 347}
349 348
@@ -351,17 +350,15 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
351 * Perform sync or async writes. 350 * Perform sync or async writes.
352 */ 351 */
353enum pnfs_try_status 352enum pnfs_try_status
354objlayout_write_pagelist(struct nfs_pgio_data *wdata, 353objlayout_write_pagelist(struct nfs_pgio_header *hdr, int how)
355 int how)
356{ 354{
357 struct nfs_pgio_header *hdr = wdata->header;
358 int err; 355 int err;
359 356
360 _fix_verify_io_params(hdr->lseg, &wdata->args.pages, 357 _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
361 &wdata->args.pgbase, 358 &hdr->args.pgbase,
362 wdata->args.offset, wdata->args.count); 359 hdr->args.offset, hdr->args.count);
363 360
364 err = objio_write_pagelist(wdata, how); 361 err = objio_write_pagelist(hdr, how);
365 if (unlikely(err)) { 362 if (unlikely(err)) {
366 hdr->pnfs_error = err; 363 hdr->pnfs_error = err;
367 dprintk("%s: Returned Error %d\n", __func__, err); 364 dprintk("%s: Returned Error %d\n", __func__, err);
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 01e041029a6c..fd13f1d2f136 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -119,8 +119,8 @@ extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
119 */ 119 */
120extern void objio_free_result(struct objlayout_io_res *oir); 120extern void objio_free_result(struct objlayout_io_res *oir);
121 121
122extern int objio_read_pagelist(struct nfs_pgio_data *rdata); 122extern int objio_read_pagelist(struct nfs_pgio_header *rdata);
123extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how); 123extern int objio_write_pagelist(struct nfs_pgio_header *wdata, int how);
124 124
125/* 125/*
126 * callback API 126 * callback API
@@ -168,10 +168,10 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg(
168extern void objlayout_free_lseg(struct pnfs_layout_segment *); 168extern void objlayout_free_lseg(struct pnfs_layout_segment *);
169 169
170extern enum pnfs_try_status objlayout_read_pagelist( 170extern enum pnfs_try_status objlayout_read_pagelist(
171 struct nfs_pgio_data *); 171 struct nfs_pgio_header *);
172 172
173extern enum pnfs_try_status objlayout_write_pagelist( 173extern enum pnfs_try_status objlayout_write_pagelist(
174 struct nfs_pgio_data *, 174 struct nfs_pgio_header *,
175 int how); 175 int how);
176 176
177extern void objlayout_encode_layoutcommit( 177extern void objlayout_encode_layoutcommit(
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 0be5050638f7..ba491926df5f 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -141,16 +141,24 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
141 * @req - request in group that is to be locked 141 * @req - request in group that is to be locked
142 * 142 *
143 * this lock must be held if modifying the page group list 143 * this lock must be held if modifying the page group list
144 *
145 * returns result from wait_on_bit_lock: 0 on success, < 0 on error
144 */ 146 */
145void 147int
146nfs_page_group_lock(struct nfs_page *req) 148nfs_page_group_lock(struct nfs_page *req, bool wait)
147{ 149{
148 struct nfs_page *head = req->wb_head; 150 struct nfs_page *head = req->wb_head;
151 int ret;
149 152
150 WARN_ON_ONCE(head != head->wb_head); 153 WARN_ON_ONCE(head != head->wb_head);
151 154
152 wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, 155 do {
156 ret = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
153 TASK_UNINTERRUPTIBLE); 157 TASK_UNINTERRUPTIBLE);
158 } while (wait && ret != 0);
159
160 WARN_ON_ONCE(ret > 0);
161 return ret;
154} 162}
155 163
156/* 164/*
@@ -211,7 +219,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
211{ 219{
212 bool ret; 220 bool ret;
213 221
214 nfs_page_group_lock(req); 222 nfs_page_group_lock(req, true);
215 ret = nfs_page_group_sync_on_bit_locked(req, bit); 223 ret = nfs_page_group_sync_on_bit_locked(req, bit);
216 nfs_page_group_unlock(req); 224 nfs_page_group_unlock(req);
217 225
@@ -454,123 +462,72 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
454} 462}
455EXPORT_SYMBOL_GPL(nfs_generic_pg_test); 463EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
456 464
457static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr) 465struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops)
458{
459 return container_of(hdr, struct nfs_rw_header, header);
460}
461
462/**
463 * nfs_rw_header_alloc - Allocate a header for a read or write
464 * @ops: Read or write function vector
465 */
466struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops)
467{ 466{
468 struct nfs_rw_header *header = ops->rw_alloc_header(); 467 struct nfs_pgio_header *hdr = ops->rw_alloc_header();
469
470 if (header) {
471 struct nfs_pgio_header *hdr = &header->header;
472 468
469 if (hdr) {
473 INIT_LIST_HEAD(&hdr->pages); 470 INIT_LIST_HEAD(&hdr->pages);
474 spin_lock_init(&hdr->lock); 471 spin_lock_init(&hdr->lock);
475 atomic_set(&hdr->refcnt, 0);
476 hdr->rw_ops = ops; 472 hdr->rw_ops = ops;
477 } 473 }
478 return header; 474 return hdr;
479} 475}
480EXPORT_SYMBOL_GPL(nfs_rw_header_alloc); 476EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc);
481 477
482/* 478/*
483 * nfs_rw_header_free - Free a read or write header 479 * nfs_pgio_header_free - Free a read or write header
484 * @hdr: The header to free 480 * @hdr: The header to free
485 */ 481 */
486void nfs_rw_header_free(struct nfs_pgio_header *hdr) 482void nfs_pgio_header_free(struct nfs_pgio_header *hdr)
487{ 483{
488 hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr)); 484 hdr->rw_ops->rw_free_header(hdr);
489} 485}
490EXPORT_SYMBOL_GPL(nfs_rw_header_free); 486EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
491 487
492/** 488/**
493 * nfs_pgio_data_alloc - Allocate pageio data 489 * nfs_pgio_data_destroy - make @hdr suitable for reuse
494 * @hdr: The header making a request 490 *
495 * @pagecount: Number of pages to create 491 * Frees memory and releases refs from nfs_generic_pgio, so that it may
496 */ 492 * be called again.
497static struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr, 493 *
498 unsigned int pagecount) 494 * @hdr: A header that has had nfs_generic_pgio called
499{
500 struct nfs_pgio_data *data, *prealloc;
501
502 prealloc = &NFS_RW_HEADER(hdr)->rpc_data;
503 if (prealloc->header == NULL)
504 data = prealloc;
505 else
506 data = kzalloc(sizeof(*data), GFP_KERNEL);
507 if (!data)
508 goto out;
509
510 if (nfs_pgarray_set(&data->pages, pagecount)) {
511 data->header = hdr;
512 atomic_inc(&hdr->refcnt);
513 } else {
514 if (data != prealloc)
515 kfree(data);
516 data = NULL;
517 }
518out:
519 return data;
520}
521
522/**
523 * nfs_pgio_data_release - Properly free pageio data
524 * @data: The data to release
525 */ 495 */
526void nfs_pgio_data_release(struct nfs_pgio_data *data) 496void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr)
527{ 497{
528 struct nfs_pgio_header *hdr = data->header; 498 put_nfs_open_context(hdr->args.context);
529 struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr); 499 if (hdr->page_array.pagevec != hdr->page_array.page_array)
530 500 kfree(hdr->page_array.pagevec);
531 put_nfs_open_context(data->args.context);
532 if (data->pages.pagevec != data->pages.page_array)
533 kfree(data->pages.pagevec);
534 if (data == &pageio_header->rpc_data) {
535 data->header = NULL;
536 data = NULL;
537 }
538 if (atomic_dec_and_test(&hdr->refcnt))
539 hdr->completion_ops->completion(hdr);
540 /* Note: we only free the rpc_task after callbacks are done.
541 * See the comment in rpc_free_task() for why
542 */
543 kfree(data);
544} 501}
545EXPORT_SYMBOL_GPL(nfs_pgio_data_release); 502EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy);
546 503
547/** 504/**
548 * nfs_pgio_rpcsetup - Set up arguments for a pageio call 505 * nfs_pgio_rpcsetup - Set up arguments for a pageio call
549 * @data: The pageio data 506 * @hdr: The pageio hdr
550 * @count: Number of bytes to read 507 * @count: Number of bytes to read
551 * @offset: Initial offset 508 * @offset: Initial offset
552 * @how: How to commit data (writes only) 509 * @how: How to commit data (writes only)
553 * @cinfo: Commit information for the call (writes only) 510 * @cinfo: Commit information for the call (writes only)
554 */ 511 */
555static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data, 512static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
556 unsigned int count, unsigned int offset, 513 unsigned int count, unsigned int offset,
557 int how, struct nfs_commit_info *cinfo) 514 int how, struct nfs_commit_info *cinfo)
558{ 515{
559 struct nfs_page *req = data->header->req; 516 struct nfs_page *req = hdr->req;
560 517
561 /* Set up the RPC argument and reply structs 518 /* Set up the RPC argument and reply structs
562 * NB: take care not to mess about with data->commit et al. */ 519 * NB: take care not to mess about with hdr->commit et al. */
563 520
564 data->args.fh = NFS_FH(data->header->inode); 521 hdr->args.fh = NFS_FH(hdr->inode);
565 data->args.offset = req_offset(req) + offset; 522 hdr->args.offset = req_offset(req) + offset;
566 /* pnfs_set_layoutcommit needs this */ 523 /* pnfs_set_layoutcommit needs this */
567 data->mds_offset = data->args.offset; 524 hdr->mds_offset = hdr->args.offset;
568 data->args.pgbase = req->wb_pgbase + offset; 525 hdr->args.pgbase = req->wb_pgbase + offset;
569 data->args.pages = data->pages.pagevec; 526 hdr->args.pages = hdr->page_array.pagevec;
570 data->args.count = count; 527 hdr->args.count = count;
571 data->args.context = get_nfs_open_context(req->wb_context); 528 hdr->args.context = get_nfs_open_context(req->wb_context);
572 data->args.lock_context = req->wb_lock_context; 529 hdr->args.lock_context = req->wb_lock_context;
573 data->args.stable = NFS_UNSTABLE; 530 hdr->args.stable = NFS_UNSTABLE;
574 switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { 531 switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
575 case 0: 532 case 0:
576 break; 533 break;
@@ -578,59 +535,59 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data,
578 if (nfs_reqs_to_commit(cinfo)) 535 if (nfs_reqs_to_commit(cinfo))
579 break; 536 break;
580 default: 537 default:
581 data->args.stable = NFS_FILE_SYNC; 538 hdr->args.stable = NFS_FILE_SYNC;
582 } 539 }
583 540
584 data->res.fattr = &data->fattr; 541 hdr->res.fattr = &hdr->fattr;
585 data->res.count = count; 542 hdr->res.count = count;
586 data->res.eof = 0; 543 hdr->res.eof = 0;
587 data->res.verf = &data->verf; 544 hdr->res.verf = &hdr->verf;
588 nfs_fattr_init(&data->fattr); 545 nfs_fattr_init(&hdr->fattr);
589} 546}
590 547
591/** 548/**
592 * nfs_pgio_prepare - Prepare pageio data to go over the wire 549 * nfs_pgio_prepare - Prepare pageio hdr to go over the wire
593 * @task: The current task 550 * @task: The current task
594 * @calldata: pageio data to prepare 551 * @calldata: pageio header to prepare
595 */ 552 */
596static void nfs_pgio_prepare(struct rpc_task *task, void *calldata) 553static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
597{ 554{
598 struct nfs_pgio_data *data = calldata; 555 struct nfs_pgio_header *hdr = calldata;
599 int err; 556 int err;
600 err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data); 557 err = NFS_PROTO(hdr->inode)->pgio_rpc_prepare(task, hdr);
601 if (err) 558 if (err)
602 rpc_exit(task, err); 559 rpc_exit(task, err);
603} 560}
604 561
605int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data, 562int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
606 const struct rpc_call_ops *call_ops, int how, int flags) 563 const struct rpc_call_ops *call_ops, int how, int flags)
607{ 564{
608 struct rpc_task *task; 565 struct rpc_task *task;
609 struct rpc_message msg = { 566 struct rpc_message msg = {
610 .rpc_argp = &data->args, 567 .rpc_argp = &hdr->args,
611 .rpc_resp = &data->res, 568 .rpc_resp = &hdr->res,
612 .rpc_cred = data->header->cred, 569 .rpc_cred = hdr->cred,
613 }; 570 };
614 struct rpc_task_setup task_setup_data = { 571 struct rpc_task_setup task_setup_data = {
615 .rpc_client = clnt, 572 .rpc_client = clnt,
616 .task = &data->task, 573 .task = &hdr->task,
617 .rpc_message = &msg, 574 .rpc_message = &msg,
618 .callback_ops = call_ops, 575 .callback_ops = call_ops,
619 .callback_data = data, 576 .callback_data = hdr,
620 .workqueue = nfsiod_workqueue, 577 .workqueue = nfsiod_workqueue,
621 .flags = RPC_TASK_ASYNC | flags, 578 .flags = RPC_TASK_ASYNC | flags,
622 }; 579 };
623 int ret = 0; 580 int ret = 0;
624 581
625 data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how); 582 hdr->rw_ops->rw_initiate(hdr, &msg, &task_setup_data, how);
626 583
627 dprintk("NFS: %5u initiated pgio call " 584 dprintk("NFS: %5u initiated pgio call "
628 "(req %s/%llu, %u bytes @ offset %llu)\n", 585 "(req %s/%llu, %u bytes @ offset %llu)\n",
629 data->task.tk_pid, 586 hdr->task.tk_pid,
630 data->header->inode->i_sb->s_id, 587 hdr->inode->i_sb->s_id,
631 (unsigned long long)NFS_FILEID(data->header->inode), 588 (unsigned long long)NFS_FILEID(hdr->inode),
632 data->args.count, 589 hdr->args.count,
633 (unsigned long long)data->args.offset); 590 (unsigned long long)hdr->args.offset);
634 591
635 task = rpc_run_task(&task_setup_data); 592 task = rpc_run_task(&task_setup_data);
636 if (IS_ERR(task)) { 593 if (IS_ERR(task)) {
@@ -657,22 +614,23 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
657 struct nfs_pgio_header *hdr) 614 struct nfs_pgio_header *hdr)
658{ 615{
659 set_bit(NFS_IOHDR_REDO, &hdr->flags); 616 set_bit(NFS_IOHDR_REDO, &hdr->flags);
660 nfs_pgio_data_release(hdr->data); 617 nfs_pgio_data_destroy(hdr);
661 hdr->data = NULL; 618 hdr->completion_ops->completion(hdr);
662 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 619 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
663 return -ENOMEM; 620 return -ENOMEM;
664} 621}
665 622
666/** 623/**
667 * nfs_pgio_release - Release pageio data 624 * nfs_pgio_release - Release pageio data
668 * @calldata: The pageio data to release 625 * @calldata: The pageio header to release
669 */ 626 */
670static void nfs_pgio_release(void *calldata) 627static void nfs_pgio_release(void *calldata)
671{ 628{
672 struct nfs_pgio_data *data = calldata; 629 struct nfs_pgio_header *hdr = calldata;
673 if (data->header->rw_ops->rw_release) 630 if (hdr->rw_ops->rw_release)
674 data->header->rw_ops->rw_release(data); 631 hdr->rw_ops->rw_release(hdr);
675 nfs_pgio_data_release(data); 632 nfs_pgio_data_destroy(hdr);
633 hdr->completion_ops->completion(hdr);
676} 634}
677 635
678/** 636/**
@@ -713,22 +671,22 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init);
713/** 671/**
714 * nfs_pgio_result - Basic pageio error handling 672 * nfs_pgio_result - Basic pageio error handling
715 * @task: The task that ran 673 * @task: The task that ran
716 * @calldata: Pageio data to check 674 * @calldata: Pageio header to check
717 */ 675 */
718static void nfs_pgio_result(struct rpc_task *task, void *calldata) 676static void nfs_pgio_result(struct rpc_task *task, void *calldata)
719{ 677{
720 struct nfs_pgio_data *data = calldata; 678 struct nfs_pgio_header *hdr = calldata;
721 struct inode *inode = data->header->inode; 679 struct inode *inode = hdr->inode;
722 680
723 dprintk("NFS: %s: %5u, (status %d)\n", __func__, 681 dprintk("NFS: %s: %5u, (status %d)\n", __func__,
724 task->tk_pid, task->tk_status); 682 task->tk_pid, task->tk_status);
725 683
726 if (data->header->rw_ops->rw_done(task, data, inode) != 0) 684 if (hdr->rw_ops->rw_done(task, hdr, inode) != 0)
727 return; 685 return;
728 if (task->tk_status < 0) 686 if (task->tk_status < 0)
729 nfs_set_pgio_error(data->header, task->tk_status, data->args.offset); 687 nfs_set_pgio_error(hdr, task->tk_status, hdr->args.offset);
730 else 688 else
731 data->header->rw_ops->rw_result(task, data); 689 hdr->rw_ops->rw_result(task, hdr);
732} 690}
733 691
734/* 692/*
@@ -744,17 +702,16 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
744{ 702{
745 struct nfs_page *req; 703 struct nfs_page *req;
746 struct page **pages; 704 struct page **pages;
747 struct nfs_pgio_data *data;
748 struct list_head *head = &desc->pg_list; 705 struct list_head *head = &desc->pg_list;
749 struct nfs_commit_info cinfo; 706 struct nfs_commit_info cinfo;
707 unsigned int pagecount;
750 708
751 data = nfs_pgio_data_alloc(hdr, nfs_page_array_len(desc->pg_base, 709 pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count);
752 desc->pg_count)); 710 if (!nfs_pgarray_set(&hdr->page_array, pagecount))
753 if (!data)
754 return nfs_pgio_error(desc, hdr); 711 return nfs_pgio_error(desc, hdr);
755 712
756 nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); 713 nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
757 pages = data->pages.pagevec; 714 pages = hdr->page_array.pagevec;
758 while (!list_empty(head)) { 715 while (!list_empty(head)) {
759 req = nfs_list_entry(head->next); 716 req = nfs_list_entry(head->next);
760 nfs_list_remove_request(req); 717 nfs_list_remove_request(req);
@@ -767,8 +724,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
767 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 724 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
768 725
769 /* Set up the argument struct */ 726 /* Set up the argument struct */
770 nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo); 727 nfs_pgio_rpcsetup(hdr, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
771 hdr->data = data;
772 desc->pg_rpc_callops = &nfs_pgio_common_ops; 728 desc->pg_rpc_callops = &nfs_pgio_common_ops;
773 return 0; 729 return 0;
774} 730}
@@ -776,25 +732,20 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
776 732
777static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) 733static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
778{ 734{
779 struct nfs_rw_header *rw_hdr;
780 struct nfs_pgio_header *hdr; 735 struct nfs_pgio_header *hdr;
781 int ret; 736 int ret;
782 737
783 rw_hdr = nfs_rw_header_alloc(desc->pg_rw_ops); 738 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
784 if (!rw_hdr) { 739 if (!hdr) {
785 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 740 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
786 return -ENOMEM; 741 return -ENOMEM;
787 } 742 }
788 hdr = &rw_hdr->header; 743 nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
789 nfs_pgheader_init(desc, hdr, nfs_rw_header_free);
790 atomic_inc(&hdr->refcnt);
791 ret = nfs_generic_pgio(desc, hdr); 744 ret = nfs_generic_pgio(desc, hdr);
792 if (ret == 0) 745 if (ret == 0)
793 ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), 746 ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
794 hdr->data, desc->pg_rpc_callops, 747 hdr, desc->pg_rpc_callops,
795 desc->pg_ioflags, 0); 748 desc->pg_ioflags, 0);
796 if (atomic_dec_and_test(&hdr->refcnt))
797 hdr->completion_ops->completion(hdr);
798 return ret; 749 return ret;
799} 750}
800 751
@@ -907,8 +858,13 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
907 struct nfs_page *subreq; 858 struct nfs_page *subreq;
908 unsigned int bytes_left = 0; 859 unsigned int bytes_left = 0;
909 unsigned int offset, pgbase; 860 unsigned int offset, pgbase;
861 int ret;
910 862
911 nfs_page_group_lock(req); 863 ret = nfs_page_group_lock(req, false);
864 if (ret < 0) {
865 desc->pg_error = ret;
866 return 0;
867 }
912 868
913 subreq = req; 869 subreq = req;
914 bytes_left = subreq->wb_bytes; 870 bytes_left = subreq->wb_bytes;
@@ -930,7 +886,11 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
930 if (desc->pg_recoalesce) 886 if (desc->pg_recoalesce)
931 return 0; 887 return 0;
932 /* retry add_request for this subreq */ 888 /* retry add_request for this subreq */
933 nfs_page_group_lock(req); 889 ret = nfs_page_group_lock(req, false);
890 if (ret < 0) {
891 desc->pg_error = ret;
892 return 0;
893 }
934 continue; 894 continue;
935 } 895 }
936 896
@@ -1005,7 +965,38 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
1005 } while (ret); 965 } while (ret);
1006 return ret; 966 return ret;
1007} 967}
1008EXPORT_SYMBOL_GPL(nfs_pageio_add_request); 968
969/*
970 * nfs_pageio_resend - Transfer requests to new descriptor and resend
971 * @hdr - the pgio header to move request from
972 * @desc - the pageio descriptor to add requests to
973 *
974 * Try to move each request (nfs_page) from @hdr to @desc then attempt
975 * to send them.
976 *
977 * Returns 0 on success and < 0 on error.
978 */
979int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
980 struct nfs_pgio_header *hdr)
981{
982 LIST_HEAD(failed);
983
984 desc->pg_dreq = hdr->dreq;
985 while (!list_empty(&hdr->pages)) {
986 struct nfs_page *req = nfs_list_entry(hdr->pages.next);
987
988 nfs_list_remove_request(req);
989 if (!nfs_pageio_add_request(desc, req))
990 nfs_list_add_request(req, &failed);
991 }
992 nfs_pageio_complete(desc);
993 if (!list_empty(&failed)) {
994 list_move(&failed, &hdr->pages);
995 return -EIO;
996 }
997 return 0;
998}
999EXPORT_SYMBOL_GPL(nfs_pageio_resend);
1009 1000
1010/** 1001/**
1011 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor 1002 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
@@ -1021,7 +1012,6 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
1021 break; 1012 break;
1022 } 1013 }
1023} 1014}
1024EXPORT_SYMBOL_GPL(nfs_pageio_complete);
1025 1015
1026/** 1016/**
1027 * nfs_pageio_cond_complete - Conditional I/O completion 1017 * nfs_pageio_cond_complete - Conditional I/O completion
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a8914b335617..a3851debf8a2 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -361,6 +361,23 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
361} 361}
362EXPORT_SYMBOL_GPL(pnfs_put_lseg); 362EXPORT_SYMBOL_GPL(pnfs_put_lseg);
363 363
364static void pnfs_put_lseg_async_work(struct work_struct *work)
365{
366 struct pnfs_layout_segment *lseg;
367
368 lseg = container_of(work, struct pnfs_layout_segment, pls_work);
369
370 pnfs_put_lseg(lseg);
371}
372
373void
374pnfs_put_lseg_async(struct pnfs_layout_segment *lseg)
375{
376 INIT_WORK(&lseg->pls_work, pnfs_put_lseg_async_work);
377 schedule_work(&lseg->pls_work);
378}
379EXPORT_SYMBOL_GPL(pnfs_put_lseg_async);
380
364static u64 381static u64
365end_offset(u64 start, u64 len) 382end_offset(u64 start, u64 len)
366{ 383{
@@ -1470,41 +1487,19 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1470} 1487}
1471EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); 1488EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
1472 1489
1473int pnfs_write_done_resend_to_mds(struct inode *inode, 1490int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
1474 struct list_head *head,
1475 const struct nfs_pgio_completion_ops *compl_ops,
1476 struct nfs_direct_req *dreq)
1477{ 1491{
1478 struct nfs_pageio_descriptor pgio; 1492 struct nfs_pageio_descriptor pgio;
1479 LIST_HEAD(failed);
1480 1493
1481 /* Resend all requests through the MDS */ 1494 /* Resend all requests through the MDS */
1482 nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops); 1495 nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
1483 pgio.pg_dreq = dreq; 1496 hdr->completion_ops);
1484 while (!list_empty(head)) { 1497 return nfs_pageio_resend(&pgio, hdr);
1485 struct nfs_page *req = nfs_list_entry(head->next);
1486
1487 nfs_list_remove_request(req);
1488 if (!nfs_pageio_add_request(&pgio, req))
1489 nfs_list_add_request(req, &failed);
1490 }
1491 nfs_pageio_complete(&pgio);
1492
1493 if (!list_empty(&failed)) {
1494 /* For some reason our attempt to resend pages. Mark the
1495 * overall send request as having failed, and let
1496 * nfs_writeback_release_full deal with the error.
1497 */
1498 list_move(&failed, head);
1499 return -EIO;
1500 }
1501 return 0;
1502} 1498}
1503EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); 1499EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
1504 1500
1505static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data) 1501static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
1506{ 1502{
1507 struct nfs_pgio_header *hdr = data->header;
1508 1503
1509 dprintk("pnfs write error = %d\n", hdr->pnfs_error); 1504 dprintk("pnfs write error = %d\n", hdr->pnfs_error);
1510 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & 1505 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
@@ -1512,50 +1507,42 @@ static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data)
1512 pnfs_return_layout(hdr->inode); 1507 pnfs_return_layout(hdr->inode);
1513 } 1508 }
1514 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) 1509 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
1515 data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode, 1510 hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
1516 &hdr->pages,
1517 hdr->completion_ops,
1518 hdr->dreq);
1519} 1511}
1520 1512
1521/* 1513/*
1522 * Called by non rpc-based layout drivers 1514 * Called by non rpc-based layout drivers
1523 */ 1515 */
1524void pnfs_ld_write_done(struct nfs_pgio_data *data) 1516void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
1525{ 1517{
1526 struct nfs_pgio_header *hdr = data->header; 1518 trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
1527
1528 trace_nfs4_pnfs_write(data, hdr->pnfs_error);
1529 if (!hdr->pnfs_error) { 1519 if (!hdr->pnfs_error) {
1530 pnfs_set_layoutcommit(data); 1520 pnfs_set_layoutcommit(hdr);
1531 hdr->mds_ops->rpc_call_done(&data->task, data); 1521 hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
1532 } else 1522 } else
1533 pnfs_ld_handle_write_error(data); 1523 pnfs_ld_handle_write_error(hdr);
1534 hdr->mds_ops->rpc_release(data); 1524 hdr->mds_ops->rpc_release(hdr);
1535} 1525}
1536EXPORT_SYMBOL_GPL(pnfs_ld_write_done); 1526EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
1537 1527
1538static void 1528static void
1539pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, 1529pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
1540 struct nfs_pgio_data *data) 1530 struct nfs_pgio_header *hdr)
1541{ 1531{
1542 struct nfs_pgio_header *hdr = data->header;
1543
1544 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1532 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1545 list_splice_tail_init(&hdr->pages, &desc->pg_list); 1533 list_splice_tail_init(&hdr->pages, &desc->pg_list);
1546 nfs_pageio_reset_write_mds(desc); 1534 nfs_pageio_reset_write_mds(desc);
1547 desc->pg_recoalesce = 1; 1535 desc->pg_recoalesce = 1;
1548 } 1536 }
1549 nfs_pgio_data_release(data); 1537 nfs_pgio_data_destroy(hdr);
1550} 1538}
1551 1539
1552static enum pnfs_try_status 1540static enum pnfs_try_status
1553pnfs_try_to_write_data(struct nfs_pgio_data *wdata, 1541pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
1554 const struct rpc_call_ops *call_ops, 1542 const struct rpc_call_ops *call_ops,
1555 struct pnfs_layout_segment *lseg, 1543 struct pnfs_layout_segment *lseg,
1556 int how) 1544 int how)
1557{ 1545{
1558 struct nfs_pgio_header *hdr = wdata->header;
1559 struct inode *inode = hdr->inode; 1546 struct inode *inode = hdr->inode;
1560 enum pnfs_try_status trypnfs; 1547 enum pnfs_try_status trypnfs;
1561 struct nfs_server *nfss = NFS_SERVER(inode); 1548 struct nfs_server *nfss = NFS_SERVER(inode);
@@ -1563,8 +1550,8 @@ pnfs_try_to_write_data(struct nfs_pgio_data *wdata,
1563 hdr->mds_ops = call_ops; 1550 hdr->mds_ops = call_ops;
1564 1551
1565 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, 1552 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
1566 inode->i_ino, wdata->args.count, wdata->args.offset, how); 1553 inode->i_ino, hdr->args.count, hdr->args.offset, how);
1567 trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how); 1554 trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
1568 if (trypnfs != PNFS_NOT_ATTEMPTED) 1555 if (trypnfs != PNFS_NOT_ATTEMPTED)
1569 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); 1556 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
1570 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); 1557 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
@@ -1575,139 +1562,105 @@ static void
1575pnfs_do_write(struct nfs_pageio_descriptor *desc, 1562pnfs_do_write(struct nfs_pageio_descriptor *desc,
1576 struct nfs_pgio_header *hdr, int how) 1563 struct nfs_pgio_header *hdr, int how)
1577{ 1564{
1578 struct nfs_pgio_data *data = hdr->data;
1579 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 1565 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1580 struct pnfs_layout_segment *lseg = desc->pg_lseg; 1566 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1581 enum pnfs_try_status trypnfs; 1567 enum pnfs_try_status trypnfs;
1582 1568
1583 desc->pg_lseg = NULL; 1569 desc->pg_lseg = NULL;
1584 trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); 1570 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
1585 if (trypnfs == PNFS_NOT_ATTEMPTED) 1571 if (trypnfs == PNFS_NOT_ATTEMPTED)
1586 pnfs_write_through_mds(desc, data); 1572 pnfs_write_through_mds(desc, hdr);
1587 pnfs_put_lseg(lseg); 1573 pnfs_put_lseg(lseg);
1588} 1574}
1589 1575
1590static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) 1576static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
1591{ 1577{
1592 pnfs_put_lseg(hdr->lseg); 1578 pnfs_put_lseg(hdr->lseg);
1593 nfs_rw_header_free(hdr); 1579 nfs_pgio_header_free(hdr);
1594} 1580}
1595EXPORT_SYMBOL_GPL(pnfs_writehdr_free); 1581EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
1596 1582
1597int 1583int
1598pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) 1584pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1599{ 1585{
1600 struct nfs_rw_header *whdr;
1601 struct nfs_pgio_header *hdr; 1586 struct nfs_pgio_header *hdr;
1602 int ret; 1587 int ret;
1603 1588
1604 whdr = nfs_rw_header_alloc(desc->pg_rw_ops); 1589 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1605 if (!whdr) { 1590 if (!hdr) {
1606 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 1591 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
1607 pnfs_put_lseg(desc->pg_lseg); 1592 pnfs_put_lseg(desc->pg_lseg);
1608 desc->pg_lseg = NULL; 1593 desc->pg_lseg = NULL;
1609 return -ENOMEM; 1594 return -ENOMEM;
1610 } 1595 }
1611 hdr = &whdr->header;
1612 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); 1596 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
1613 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 1597 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1614 atomic_inc(&hdr->refcnt);
1615 ret = nfs_generic_pgio(desc, hdr); 1598 ret = nfs_generic_pgio(desc, hdr);
1616 if (ret != 0) { 1599 if (ret != 0) {
1617 pnfs_put_lseg(desc->pg_lseg); 1600 pnfs_put_lseg(desc->pg_lseg);
1618 desc->pg_lseg = NULL; 1601 desc->pg_lseg = NULL;
1619 } else 1602 } else
1620 pnfs_do_write(desc, hdr, desc->pg_ioflags); 1603 pnfs_do_write(desc, hdr, desc->pg_ioflags);
1621 if (atomic_dec_and_test(&hdr->refcnt))
1622 hdr->completion_ops->completion(hdr);
1623 return ret; 1604 return ret;
1624} 1605}
1625EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); 1606EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
1626 1607
1627int pnfs_read_done_resend_to_mds(struct inode *inode, 1608int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
1628 struct list_head *head,
1629 const struct nfs_pgio_completion_ops *compl_ops,
1630 struct nfs_direct_req *dreq)
1631{ 1609{
1632 struct nfs_pageio_descriptor pgio; 1610 struct nfs_pageio_descriptor pgio;
1633 LIST_HEAD(failed);
1634 1611
1635 /* Resend all requests through the MDS */ 1612 /* Resend all requests through the MDS */
1636 nfs_pageio_init_read(&pgio, inode, true, compl_ops); 1613 nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
1637 pgio.pg_dreq = dreq; 1614 return nfs_pageio_resend(&pgio, hdr);
1638 while (!list_empty(head)) {
1639 struct nfs_page *req = nfs_list_entry(head->next);
1640
1641 nfs_list_remove_request(req);
1642 if (!nfs_pageio_add_request(&pgio, req))
1643 nfs_list_add_request(req, &failed);
1644 }
1645 nfs_pageio_complete(&pgio);
1646
1647 if (!list_empty(&failed)) {
1648 list_move(&failed, head);
1649 return -EIO;
1650 }
1651 return 0;
1652} 1615}
1653EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds); 1616EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
1654 1617
1655static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data) 1618static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
1656{ 1619{
1657 struct nfs_pgio_header *hdr = data->header;
1658
1659 dprintk("pnfs read error = %d\n", hdr->pnfs_error); 1620 dprintk("pnfs read error = %d\n", hdr->pnfs_error);
1660 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & 1621 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
1661 PNFS_LAYOUTRET_ON_ERROR) { 1622 PNFS_LAYOUTRET_ON_ERROR) {
1662 pnfs_return_layout(hdr->inode); 1623 pnfs_return_layout(hdr->inode);
1663 } 1624 }
1664 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) 1625 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
1665 data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode, 1626 hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
1666 &hdr->pages,
1667 hdr->completion_ops,
1668 hdr->dreq);
1669} 1627}
1670 1628
1671/* 1629/*
1672 * Called by non rpc-based layout drivers 1630 * Called by non rpc-based layout drivers
1673 */ 1631 */
1674void pnfs_ld_read_done(struct nfs_pgio_data *data) 1632void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
1675{ 1633{
1676 struct nfs_pgio_header *hdr = data->header; 1634 trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
1677
1678 trace_nfs4_pnfs_read(data, hdr->pnfs_error);
1679 if (likely(!hdr->pnfs_error)) { 1635 if (likely(!hdr->pnfs_error)) {
1680 __nfs4_read_done_cb(data); 1636 __nfs4_read_done_cb(hdr);
1681 hdr->mds_ops->rpc_call_done(&data->task, data); 1637 hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
1682 } else 1638 } else
1683 pnfs_ld_handle_read_error(data); 1639 pnfs_ld_handle_read_error(hdr);
1684 hdr->mds_ops->rpc_release(data); 1640 hdr->mds_ops->rpc_release(hdr);
1685} 1641}
1686EXPORT_SYMBOL_GPL(pnfs_ld_read_done); 1642EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
1687 1643
1688static void 1644static void
1689pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, 1645pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
1690 struct nfs_pgio_data *data) 1646 struct nfs_pgio_header *hdr)
1691{ 1647{
1692 struct nfs_pgio_header *hdr = data->header;
1693
1694 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1648 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1695 list_splice_tail_init(&hdr->pages, &desc->pg_list); 1649 list_splice_tail_init(&hdr->pages, &desc->pg_list);
1696 nfs_pageio_reset_read_mds(desc); 1650 nfs_pageio_reset_read_mds(desc);
1697 desc->pg_recoalesce = 1; 1651 desc->pg_recoalesce = 1;
1698 } 1652 }
1699 nfs_pgio_data_release(data); 1653 nfs_pgio_data_destroy(hdr);
1700} 1654}
1701 1655
1702/* 1656/*
1703 * Call the appropriate parallel I/O subsystem read function. 1657 * Call the appropriate parallel I/O subsystem read function.
1704 */ 1658 */
1705static enum pnfs_try_status 1659static enum pnfs_try_status
1706pnfs_try_to_read_data(struct nfs_pgio_data *rdata, 1660pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
1707 const struct rpc_call_ops *call_ops, 1661 const struct rpc_call_ops *call_ops,
1708 struct pnfs_layout_segment *lseg) 1662 struct pnfs_layout_segment *lseg)
1709{ 1663{
1710 struct nfs_pgio_header *hdr = rdata->header;
1711 struct inode *inode = hdr->inode; 1664 struct inode *inode = hdr->inode;
1712 struct nfs_server *nfss = NFS_SERVER(inode); 1665 struct nfs_server *nfss = NFS_SERVER(inode);
1713 enum pnfs_try_status trypnfs; 1666 enum pnfs_try_status trypnfs;
@@ -1715,9 +1668,9 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
1715 hdr->mds_ops = call_ops; 1668 hdr->mds_ops = call_ops;
1716 1669
1717 dprintk("%s: Reading ino:%lu %u@%llu\n", 1670 dprintk("%s: Reading ino:%lu %u@%llu\n",
1718 __func__, inode->i_ino, rdata->args.count, rdata->args.offset); 1671 __func__, inode->i_ino, hdr->args.count, hdr->args.offset);
1719 1672
1720 trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata); 1673 trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
1721 if (trypnfs != PNFS_NOT_ATTEMPTED) 1674 if (trypnfs != PNFS_NOT_ATTEMPTED)
1722 nfs_inc_stats(inode, NFSIOS_PNFS_READ); 1675 nfs_inc_stats(inode, NFSIOS_PNFS_READ);
1723 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); 1676 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
@@ -1727,52 +1680,46 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
1727static void 1680static void
1728pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) 1681pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
1729{ 1682{
1730 struct nfs_pgio_data *data = hdr->data;
1731 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 1683 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1732 struct pnfs_layout_segment *lseg = desc->pg_lseg; 1684 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1733 enum pnfs_try_status trypnfs; 1685 enum pnfs_try_status trypnfs;
1734 1686
1735 desc->pg_lseg = NULL; 1687 desc->pg_lseg = NULL;
1736 trypnfs = pnfs_try_to_read_data(data, call_ops, lseg); 1688 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
1737 if (trypnfs == PNFS_NOT_ATTEMPTED) 1689 if (trypnfs == PNFS_NOT_ATTEMPTED)
1738 pnfs_read_through_mds(desc, data); 1690 pnfs_read_through_mds(desc, hdr);
1739 pnfs_put_lseg(lseg); 1691 pnfs_put_lseg(lseg);
1740} 1692}
1741 1693
1742static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) 1694static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
1743{ 1695{
1744 pnfs_put_lseg(hdr->lseg); 1696 pnfs_put_lseg(hdr->lseg);
1745 nfs_rw_header_free(hdr); 1697 nfs_pgio_header_free(hdr);
1746} 1698}
1747EXPORT_SYMBOL_GPL(pnfs_readhdr_free); 1699EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
1748 1700
1749int 1701int
1750pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) 1702pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
1751{ 1703{
1752 struct nfs_rw_header *rhdr;
1753 struct nfs_pgio_header *hdr; 1704 struct nfs_pgio_header *hdr;
1754 int ret; 1705 int ret;
1755 1706
1756 rhdr = nfs_rw_header_alloc(desc->pg_rw_ops); 1707 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1757 if (!rhdr) { 1708 if (!hdr) {
1758 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 1709 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
1759 ret = -ENOMEM; 1710 ret = -ENOMEM;
1760 pnfs_put_lseg(desc->pg_lseg); 1711 pnfs_put_lseg(desc->pg_lseg);
1761 desc->pg_lseg = NULL; 1712 desc->pg_lseg = NULL;
1762 return ret; 1713 return ret;
1763 } 1714 }
1764 hdr = &rhdr->header;
1765 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); 1715 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
1766 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 1716 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1767 atomic_inc(&hdr->refcnt);
1768 ret = nfs_generic_pgio(desc, hdr); 1717 ret = nfs_generic_pgio(desc, hdr);
1769 if (ret != 0) { 1718 if (ret != 0) {
1770 pnfs_put_lseg(desc->pg_lseg); 1719 pnfs_put_lseg(desc->pg_lseg);
1771 desc->pg_lseg = NULL; 1720 desc->pg_lseg = NULL;
1772 } else 1721 } else
1773 pnfs_do_read(desc, hdr); 1722 pnfs_do_read(desc, hdr);
1774 if (atomic_dec_and_test(&hdr->refcnt))
1775 hdr->completion_ops->completion(hdr);
1776 return ret; 1723 return ret;
1777} 1724}
1778EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); 1725EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
@@ -1820,12 +1767,11 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
1820EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); 1767EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
1821 1768
1822void 1769void
1823pnfs_set_layoutcommit(struct nfs_pgio_data *wdata) 1770pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
1824{ 1771{
1825 struct nfs_pgio_header *hdr = wdata->header;
1826 struct inode *inode = hdr->inode; 1772 struct inode *inode = hdr->inode;
1827 struct nfs_inode *nfsi = NFS_I(inode); 1773 struct nfs_inode *nfsi = NFS_I(inode);
1828 loff_t end_pos = wdata->mds_offset + wdata->res.count; 1774 loff_t end_pos = hdr->mds_offset + hdr->res.count;
1829 bool mark_as_dirty = false; 1775 bool mark_as_dirty = false;
1830 1776
1831 spin_lock(&inode->i_lock); 1777 spin_lock(&inode->i_lock);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 4fb309a2b4c4..aca3dff5dae6 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -32,6 +32,7 @@
32 32
33#include <linux/nfs_fs.h> 33#include <linux/nfs_fs.h>
34#include <linux/nfs_page.h> 34#include <linux/nfs_page.h>
35#include <linux/workqueue.h>
35 36
36enum { 37enum {
37 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ 38 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
@@ -46,6 +47,7 @@ struct pnfs_layout_segment {
46 atomic_t pls_refcount; 47 atomic_t pls_refcount;
47 unsigned long pls_flags; 48 unsigned long pls_flags;
48 struct pnfs_layout_hdr *pls_layout; 49 struct pnfs_layout_hdr *pls_layout;
50 struct work_struct pls_work;
49}; 51};
50 52
51enum pnfs_try_status { 53enum pnfs_try_status {
@@ -104,6 +106,8 @@ struct pnfs_layoutdriver_type {
104 int max); 106 int max);
105 void (*recover_commit_reqs) (struct list_head *list, 107 void (*recover_commit_reqs) (struct list_head *list,
106 struct nfs_commit_info *cinfo); 108 struct nfs_commit_info *cinfo);
109 struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
110 struct page *page);
107 int (*commit_pagelist)(struct inode *inode, 111 int (*commit_pagelist)(struct inode *inode,
108 struct list_head *mds_pages, 112 struct list_head *mds_pages,
109 int how, 113 int how,
@@ -113,8 +117,8 @@ struct pnfs_layoutdriver_type {
113 * Return PNFS_ATTEMPTED to indicate the layout code has attempted 117 * Return PNFS_ATTEMPTED to indicate the layout code has attempted
114 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS 118 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
115 */ 119 */
116 enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data); 120 enum pnfs_try_status (*read_pagelist)(struct nfs_pgio_header *);
117 enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how); 121 enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
118 122
119 void (*free_deviceid_node) (struct nfs4_deviceid_node *); 123 void (*free_deviceid_node) (struct nfs4_deviceid_node *);
120 124
@@ -179,6 +183,7 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
179/* pnfs.c */ 183/* pnfs.c */
180void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); 184void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
181void pnfs_put_lseg(struct pnfs_layout_segment *lseg); 185void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
186void pnfs_put_lseg_async(struct pnfs_layout_segment *lseg);
182 187
183void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); 188void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
184void unset_pnfs_layoutdriver(struct nfs_server *); 189void unset_pnfs_layoutdriver(struct nfs_server *);
@@ -213,13 +218,13 @@ bool pnfs_roc(struct inode *ino);
213void pnfs_roc_release(struct inode *ino); 218void pnfs_roc_release(struct inode *ino);
214void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 219void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
215bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); 220bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
216void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata); 221void pnfs_set_layoutcommit(struct nfs_pgio_header *);
217void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); 222void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
218int pnfs_layoutcommit_inode(struct inode *inode, bool sync); 223int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
219int _pnfs_return_layout(struct inode *); 224int _pnfs_return_layout(struct inode *);
220int pnfs_commit_and_return_layout(struct inode *); 225int pnfs_commit_and_return_layout(struct inode *);
221void pnfs_ld_write_done(struct nfs_pgio_data *); 226void pnfs_ld_write_done(struct nfs_pgio_header *);
222void pnfs_ld_read_done(struct nfs_pgio_data *); 227void pnfs_ld_read_done(struct nfs_pgio_header *);
223struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, 228struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
224 struct nfs_open_context *ctx, 229 struct nfs_open_context *ctx,
225 loff_t pos, 230 loff_t pos,
@@ -228,12 +233,8 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
228 gfp_t gfp_flags); 233 gfp_t gfp_flags);
229 234
230void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); 235void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
231int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head, 236int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
232 const struct nfs_pgio_completion_ops *compl_ops, 237int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
233 struct nfs_direct_req *dreq);
234int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head,
235 const struct nfs_pgio_completion_ops *compl_ops,
236 struct nfs_direct_req *dreq);
237struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); 238struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
238 239
239/* nfs4_deviceid_flags */ 240/* nfs4_deviceid_flags */
@@ -345,6 +346,17 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
345 NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); 346 NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
346} 347}
347 348
349static inline struct nfs_page *
350pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
351 struct page *page)
352{
353 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
354
355 if (ld == NULL || ld->search_commit_reqs == NULL)
356 return NULL;
357 return ld->search_commit_reqs(cinfo, page);
358}
359
348/* Should the pNFS client commit and return the layout upon a setattr */ 360/* Should the pNFS client commit and return the layout upon a setattr */
349static inline bool 361static inline bool
350pnfs_ld_layoutret_on_setattr(struct inode *inode) 362pnfs_ld_layoutret_on_setattr(struct inode *inode)
@@ -410,6 +422,10 @@ static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg)
410{ 422{
411} 423}
412 424
425static inline void pnfs_put_lseg_async(struct pnfs_layout_segment *lseg)
426{
427}
428
413static inline int pnfs_return_layout(struct inode *ino) 429static inline int pnfs_return_layout(struct inode *ino)
414{ 430{
415 return 0; 431 return 0;
@@ -496,6 +512,13 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
496{ 512{
497} 513}
498 514
515static inline struct nfs_page *
516pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
517 struct page *page)
518{
519 return NULL;
520}
521
499static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync) 522static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
500{ 523{
501 return 0; 524 return 0;
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c171ce1a8a30..b09cc23d6f43 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -578,46 +578,49 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
578 return 0; 578 return 0;
579} 579}
580 580
581static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data) 581static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
582{ 582{
583 struct inode *inode = data->header->inode; 583 struct inode *inode = hdr->inode;
584 584
585 nfs_invalidate_atime(inode); 585 nfs_invalidate_atime(inode);
586 if (task->tk_status >= 0) { 586 if (task->tk_status >= 0) {
587 nfs_refresh_inode(inode, data->res.fattr); 587 nfs_refresh_inode(inode, hdr->res.fattr);
588 /* Emulate the eof flag, which isn't normally needed in NFSv2 588 /* Emulate the eof flag, which isn't normally needed in NFSv2
589 * as it is guaranteed to always return the file attributes 589 * as it is guaranteed to always return the file attributes
590 */ 590 */
591 if (data->args.offset + data->res.count >= data->res.fattr->size) 591 if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size)
592 data->res.eof = 1; 592 hdr->res.eof = 1;
593 } 593 }
594 return 0; 594 return 0;
595} 595}
596 596
597static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) 597static void nfs_proc_read_setup(struct nfs_pgio_header *hdr,
598 struct rpc_message *msg)
598{ 599{
599 msg->rpc_proc = &nfs_procedures[NFSPROC_READ]; 600 msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
600} 601}
601 602
602static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) 603static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task,
604 struct nfs_pgio_header *hdr)
603{ 605{
604 rpc_call_start(task); 606 rpc_call_start(task);
605 return 0; 607 return 0;
606} 608}
607 609
608static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_data *data) 610static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
609{ 611{
610 struct inode *inode = data->header->inode; 612 struct inode *inode = hdr->inode;
611 613
612 if (task->tk_status >= 0) 614 if (task->tk_status >= 0)
613 nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); 615 nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
614 return 0; 616 return 0;
615} 617}
616 618
617static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) 619static void nfs_proc_write_setup(struct nfs_pgio_header *hdr,
620 struct rpc_message *msg)
618{ 621{
619 /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */ 622 /* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
620 data->args.stable = NFS_FILE_SYNC; 623 hdr->args.stable = NFS_FILE_SYNC;
621 msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE]; 624 msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
622} 625}
623 626
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index e818a475ca64..beff2769c5c5 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -33,12 +33,12 @@ static const struct nfs_rw_ops nfs_rw_read_ops;
33 33
34static struct kmem_cache *nfs_rdata_cachep; 34static struct kmem_cache *nfs_rdata_cachep;
35 35
36static struct nfs_rw_header *nfs_readhdr_alloc(void) 36static struct nfs_pgio_header *nfs_readhdr_alloc(void)
37{ 37{
38 return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); 38 return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
39} 39}
40 40
41static void nfs_readhdr_free(struct nfs_rw_header *rhdr) 41static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
42{ 42{
43 kmem_cache_free(nfs_rdata_cachep, rhdr); 43 kmem_cache_free(nfs_rdata_cachep, rhdr);
44} 44}
@@ -115,12 +115,6 @@ static void nfs_readpage_release(struct nfs_page *req)
115 115
116 unlock_page(req->wb_page); 116 unlock_page(req->wb_page);
117 } 117 }
118
119 dprintk("NFS: read done (%s/%Lu %d@%Ld)\n",
120 req->wb_context->dentry->d_inode->i_sb->s_id,
121 (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
122 req->wb_bytes,
123 (long long)req_offset(req));
124 nfs_release_request(req); 118 nfs_release_request(req);
125} 119}
126 120
@@ -172,14 +166,15 @@ out:
172 hdr->release(hdr); 166 hdr->release(hdr);
173} 167}
174 168
175static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg, 169static void nfs_initiate_read(struct nfs_pgio_header *hdr,
170 struct rpc_message *msg,
176 struct rpc_task_setup *task_setup_data, int how) 171 struct rpc_task_setup *task_setup_data, int how)
177{ 172{
178 struct inode *inode = data->header->inode; 173 struct inode *inode = hdr->inode;
179 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; 174 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
180 175
181 task_setup_data->flags |= swap_flags; 176 task_setup_data->flags |= swap_flags;
182 NFS_PROTO(inode)->read_setup(data, msg); 177 NFS_PROTO(inode)->read_setup(hdr, msg);
183} 178}
184 179
185static void 180static void
@@ -203,14 +198,15 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
203 * This is the callback from RPC telling us whether a reply was 198 * This is the callback from RPC telling us whether a reply was
204 * received or some error occurred (timeout or socket shutdown). 199 * received or some error occurred (timeout or socket shutdown).
205 */ 200 */
206static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data, 201static int nfs_readpage_done(struct rpc_task *task,
202 struct nfs_pgio_header *hdr,
207 struct inode *inode) 203 struct inode *inode)
208{ 204{
209 int status = NFS_PROTO(inode)->read_done(task, data); 205 int status = NFS_PROTO(inode)->read_done(task, hdr);
210 if (status != 0) 206 if (status != 0)
211 return status; 207 return status;
212 208
213 nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, data->res.count); 209 nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, hdr->res.count);
214 210
215 if (task->tk_status == -ESTALE) { 211 if (task->tk_status == -ESTALE) {
216 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); 212 set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
@@ -219,34 +215,34 @@ static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data,
219 return 0; 215 return 0;
220} 216}
221 217
222static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data) 218static void nfs_readpage_retry(struct rpc_task *task,
219 struct nfs_pgio_header *hdr)
223{ 220{
224 struct nfs_pgio_args *argp = &data->args; 221 struct nfs_pgio_args *argp = &hdr->args;
225 struct nfs_pgio_res *resp = &data->res; 222 struct nfs_pgio_res *resp = &hdr->res;
226 223
227 /* This is a short read! */ 224 /* This is a short read! */
228 nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD); 225 nfs_inc_stats(hdr->inode, NFSIOS_SHORTREAD);
229 /* Has the server at least made some progress? */ 226 /* Has the server at least made some progress? */
230 if (resp->count == 0) { 227 if (resp->count == 0) {
231 nfs_set_pgio_error(data->header, -EIO, argp->offset); 228 nfs_set_pgio_error(hdr, -EIO, argp->offset);
232 return; 229 return;
233 } 230 }
234 /* Yes, so retry the read at the end of the data */ 231 /* Yes, so retry the read at the end of the hdr */
235 data->mds_offset += resp->count; 232 hdr->mds_offset += resp->count;
236 argp->offset += resp->count; 233 argp->offset += resp->count;
237 argp->pgbase += resp->count; 234 argp->pgbase += resp->count;
238 argp->count -= resp->count; 235 argp->count -= resp->count;
239 rpc_restart_call_prepare(task); 236 rpc_restart_call_prepare(task);
240} 237}
241 238
242static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data) 239static void nfs_readpage_result(struct rpc_task *task,
240 struct nfs_pgio_header *hdr)
243{ 241{
244 struct nfs_pgio_header *hdr = data->header; 242 if (hdr->res.eof) {
245
246 if (data->res.eof) {
247 loff_t bound; 243 loff_t bound;
248 244
249 bound = data->args.offset + data->res.count; 245 bound = hdr->args.offset + hdr->res.count;
250 spin_lock(&hdr->lock); 246 spin_lock(&hdr->lock);
251 if (bound < hdr->io_start + hdr->good_bytes) { 247 if (bound < hdr->io_start + hdr->good_bytes) {
252 set_bit(NFS_IOHDR_EOF, &hdr->flags); 248 set_bit(NFS_IOHDR_EOF, &hdr->flags);
@@ -254,8 +250,8 @@ static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *dat
254 hdr->good_bytes = bound - hdr->io_start; 250 hdr->good_bytes = bound - hdr->io_start;
255 } 251 }
256 spin_unlock(&hdr->lock); 252 spin_unlock(&hdr->lock);
257 } else if (data->res.count != data->args.count) 253 } else if (hdr->res.count != hdr->args.count)
258 nfs_readpage_retry(task, data); 254 nfs_readpage_retry(task, hdr);
259} 255}
260 256
261/* 257/*
@@ -404,7 +400,7 @@ out:
404int __init nfs_init_readpagecache(void) 400int __init nfs_init_readpagecache(void)
405{ 401{
406 nfs_rdata_cachep = kmem_cache_create("nfs_read_data", 402 nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
407 sizeof(struct nfs_rw_header), 403 sizeof(struct nfs_pgio_header),
408 0, SLAB_HWCACHE_ALIGN, 404 0, SLAB_HWCACHE_ALIGN,
409 NULL); 405 NULL);
410 if (nfs_rdata_cachep == NULL) 406 if (nfs_rdata_cachep == NULL)
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 084af1060d79..e4499d5b51e8 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1027,8 +1027,7 @@ static bool nfs_auth_info_add(struct nfs_auth_info *auth_info,
1027 rpc_authflavor_t flavor) 1027 rpc_authflavor_t flavor)
1028{ 1028{
1029 unsigned int i; 1029 unsigned int i;
1030 unsigned int max_flavor_len = (sizeof(auth_info->flavors) / 1030 unsigned int max_flavor_len = ARRAY_SIZE(auth_info->flavors);
1031 sizeof(auth_info->flavors[0]));
1032 1031
1033 /* make sure this flavor isn't already in the list */ 1032 /* make sure this flavor isn't already in the list */
1034 for (i = 0; i < auth_info->flavor_len; i++) { 1033 for (i = 0; i < auth_info->flavor_len; i++) {
@@ -2180,7 +2179,7 @@ out_no_address:
2180 return -EINVAL; 2179 return -EINVAL;
2181} 2180}
2182 2181
2183#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \ 2182#define NFS_REMOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
2184 | NFS_MOUNT_SECURE \ 2183 | NFS_MOUNT_SECURE \
2185 | NFS_MOUNT_TCP \ 2184 | NFS_MOUNT_TCP \
2186 | NFS_MOUNT_VER3 \ 2185 | NFS_MOUNT_VER3 \
@@ -2188,15 +2187,16 @@ out_no_address:
2188 | NFS_MOUNT_NONLM \ 2187 | NFS_MOUNT_NONLM \
2189 | NFS_MOUNT_BROKEN_SUID \ 2188 | NFS_MOUNT_BROKEN_SUID \
2190 | NFS_MOUNT_STRICTLOCK \ 2189 | NFS_MOUNT_STRICTLOCK \
2191 | NFS_MOUNT_UNSHARED \
2192 | NFS_MOUNT_NORESVPORT \
2193 | NFS_MOUNT_LEGACY_INTERFACE) 2190 | NFS_MOUNT_LEGACY_INTERFACE)
2194 2191
2192#define NFS_MOUNT_CMP_FLAGMASK (NFS_REMOUNT_CMP_FLAGMASK & \
2193 ~(NFS_MOUNT_UNSHARED | NFS_MOUNT_NORESVPORT))
2194
2195static int 2195static int
2196nfs_compare_remount_data(struct nfs_server *nfss, 2196nfs_compare_remount_data(struct nfs_server *nfss,
2197 struct nfs_parsed_mount_data *data) 2197 struct nfs_parsed_mount_data *data)
2198{ 2198{
2199 if ((data->flags ^ nfss->flags) & NFS_MOUNT_CMP_FLAGMASK || 2199 if ((data->flags ^ nfss->flags) & NFS_REMOUNT_CMP_FLAGMASK ||
2200 data->rsize != nfss->rsize || 2200 data->rsize != nfss->rsize ||
2201 data->wsize != nfss->wsize || 2201 data->wsize != nfss->wsize ||
2202 data->version != nfss->nfs_client->rpc_ops->version || 2202 data->version != nfss->nfs_client->rpc_ops->version ||
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 962c9ee758be..e3b5cf28bdc5 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -47,6 +47,8 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
47static const struct nfs_commit_completion_ops nfs_commit_completion_ops; 47static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
48static const struct nfs_rw_ops nfs_rw_write_ops; 48static const struct nfs_rw_ops nfs_rw_write_ops;
49static void nfs_clear_request_commit(struct nfs_page *req); 49static void nfs_clear_request_commit(struct nfs_page *req);
50static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
51 struct inode *inode);
50 52
51static struct kmem_cache *nfs_wdata_cachep; 53static struct kmem_cache *nfs_wdata_cachep;
52static mempool_t *nfs_wdata_mempool; 54static mempool_t *nfs_wdata_mempool;
@@ -71,18 +73,18 @@ void nfs_commit_free(struct nfs_commit_data *p)
71} 73}
72EXPORT_SYMBOL_GPL(nfs_commit_free); 74EXPORT_SYMBOL_GPL(nfs_commit_free);
73 75
74static struct nfs_rw_header *nfs_writehdr_alloc(void) 76static struct nfs_pgio_header *nfs_writehdr_alloc(void)
75{ 77{
76 struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); 78 struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
77 79
78 if (p) 80 if (p)
79 memset(p, 0, sizeof(*p)); 81 memset(p, 0, sizeof(*p));
80 return p; 82 return p;
81} 83}
82 84
83static void nfs_writehdr_free(struct nfs_rw_header *whdr) 85static void nfs_writehdr_free(struct nfs_pgio_header *hdr)
84{ 86{
85 mempool_free(whdr, nfs_wdata_mempool); 87 mempool_free(hdr, nfs_wdata_mempool);
86} 88}
87 89
88static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) 90static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
@@ -93,6 +95,38 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
93} 95}
94 96
95/* 97/*
98 * nfs_page_search_commits_for_head_request_locked
99 *
100 * Search through commit lists on @inode for the head request for @page.
101 * Must be called while holding the inode (which is cinfo) lock.
102 *
103 * Returns the head request if found, or NULL if not found.
104 */
105static struct nfs_page *
106nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
107 struct page *page)
108{
109 struct nfs_page *freq, *t;
110 struct nfs_commit_info cinfo;
111 struct inode *inode = &nfsi->vfs_inode;
112
113 nfs_init_cinfo_from_inode(&cinfo, inode);
114
115 /* search through pnfs commit lists */
116 freq = pnfs_search_commit_reqs(inode, &cinfo, page);
117 if (freq)
118 return freq->wb_head;
119
120 /* Linearly search the commit list for the correct request */
121 list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
122 if (freq->wb_page == page)
123 return freq->wb_head;
124 }
125
126 return NULL;
127}
128
129/*
96 * nfs_page_find_head_request_locked - find head request associated with @page 130 * nfs_page_find_head_request_locked - find head request associated with @page
97 * 131 *
98 * must be called while holding the inode lock. 132 * must be called while holding the inode lock.
@@ -106,21 +140,12 @@ nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)
106 140
107 if (PagePrivate(page)) 141 if (PagePrivate(page))
108 req = (struct nfs_page *)page_private(page); 142 req = (struct nfs_page *)page_private(page);
109 else if (unlikely(PageSwapCache(page))) { 143 else if (unlikely(PageSwapCache(page)))
110 struct nfs_page *freq, *t; 144 req = nfs_page_search_commits_for_head_request_locked(nfsi,
111 145 page);
112 /* Linearly search the commit list for the correct req */
113 list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) {
114 if (freq->wb_page == page) {
115 req = freq->wb_head;
116 break;
117 }
118 }
119 }
120 146
121 if (req) { 147 if (req) {
122 WARN_ON_ONCE(req->wb_head != req); 148 WARN_ON_ONCE(req->wb_head != req);
123
124 kref_get(&req->wb_kref); 149 kref_get(&req->wb_kref);
125 } 150 }
126 151
@@ -216,7 +241,7 @@ static bool nfs_page_group_covers_page(struct nfs_page *req)
216 unsigned int pos = 0; 241 unsigned int pos = 0;
217 unsigned int len = nfs_page_length(req->wb_page); 242 unsigned int len = nfs_page_length(req->wb_page);
218 243
219 nfs_page_group_lock(req); 244 nfs_page_group_lock(req, true);
220 245
221 do { 246 do {
222 tmp = nfs_page_group_search_locked(req->wb_head, pos); 247 tmp = nfs_page_group_search_locked(req->wb_head, pos);
@@ -379,8 +404,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
379 subreq->wb_head = subreq; 404 subreq->wb_head = subreq;
380 subreq->wb_this_page = subreq; 405 subreq->wb_this_page = subreq;
381 406
382 nfs_clear_request_commit(subreq);
383
384 /* subreq is now totally disconnected from page group or any 407 /* subreq is now totally disconnected from page group or any
385 * write / commit lists. last chance to wake any waiters */ 408 * write / commit lists. last chance to wake any waiters */
386 nfs_unlock_request(subreq); 409 nfs_unlock_request(subreq);
@@ -456,7 +479,9 @@ try_again:
456 } 479 }
457 480
458 /* lock each request in the page group */ 481 /* lock each request in the page group */
459 nfs_page_group_lock(head); 482 ret = nfs_page_group_lock(head, false);
483 if (ret < 0)
484 return ERR_PTR(ret);
460 subreq = head; 485 subreq = head;
461 do { 486 do {
462 /* 487 /*
@@ -488,7 +513,7 @@ try_again:
488 * Commit list removal accounting is done after locks are dropped */ 513 * Commit list removal accounting is done after locks are dropped */
489 subreq = head; 514 subreq = head;
490 do { 515 do {
491 nfs_list_remove_request(subreq); 516 nfs_clear_request_commit(subreq);
492 subreq = subreq->wb_this_page; 517 subreq = subreq->wb_this_page;
493 } while (subreq != head); 518 } while (subreq != head);
494 519
@@ -518,15 +543,11 @@ try_again:
518 543
519 nfs_page_group_unlock(head); 544 nfs_page_group_unlock(head);
520 545
521 /* drop lock to clear_request_commit the head req and clean up 546 /* drop lock to clean uprequests on destroy list */
522 * requests on destroy list */
523 spin_unlock(&inode->i_lock); 547 spin_unlock(&inode->i_lock);
524 548
525 nfs_destroy_unlinked_subrequests(destroy_list, head); 549 nfs_destroy_unlinked_subrequests(destroy_list, head);
526 550
527 /* clean up commit list state */
528 nfs_clear_request_commit(head);
529
530 /* still holds ref on head from nfs_page_find_head_request_locked 551 /* still holds ref on head from nfs_page_find_head_request_locked
531 * and still has lock on head from lock loop */ 552 * and still has lock on head from lock loop */
532 return head; 553 return head;
@@ -705,6 +726,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
705 726
706 if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) 727 if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
707 nfs_release_request(req); 728 nfs_release_request(req);
729 else
730 WARN_ON_ONCE(1);
708} 731}
709 732
710static void 733static void
@@ -808,6 +831,7 @@ nfs_clear_page_commit(struct page *page)
808 dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE); 831 dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
809} 832}
810 833
834/* Called holding inode (/cinfo) lock */
811static void 835static void
812nfs_clear_request_commit(struct nfs_page *req) 836nfs_clear_request_commit(struct nfs_page *req)
813{ 837{
@@ -817,20 +841,17 @@ nfs_clear_request_commit(struct nfs_page *req)
817 841
818 nfs_init_cinfo_from_inode(&cinfo, inode); 842 nfs_init_cinfo_from_inode(&cinfo, inode);
819 if (!pnfs_clear_request_commit(req, &cinfo)) { 843 if (!pnfs_clear_request_commit(req, &cinfo)) {
820 spin_lock(cinfo.lock);
821 nfs_request_remove_commit_list(req, &cinfo); 844 nfs_request_remove_commit_list(req, &cinfo);
822 spin_unlock(cinfo.lock);
823 } 845 }
824 nfs_clear_page_commit(req->wb_page); 846 nfs_clear_page_commit(req->wb_page);
825 } 847 }
826} 848}
827 849
828static inline 850int nfs_write_need_commit(struct nfs_pgio_header *hdr)
829int nfs_write_need_commit(struct nfs_pgio_data *data)
830{ 851{
831 if (data->verf.committed == NFS_DATA_SYNC) 852 if (hdr->verf.committed == NFS_DATA_SYNC)
832 return data->header->lseg == NULL; 853 return hdr->lseg == NULL;
833 return data->verf.committed != NFS_FILE_SYNC; 854 return hdr->verf.committed != NFS_FILE_SYNC;
834} 855}
835 856
836#else 857#else
@@ -856,8 +877,7 @@ nfs_clear_request_commit(struct nfs_page *req)
856{ 877{
857} 878}
858 879
859static inline 880int nfs_write_need_commit(struct nfs_pgio_header *hdr)
860int nfs_write_need_commit(struct nfs_pgio_data *data)
861{ 881{
862 return 0; 882 return 0;
863} 883}
@@ -883,11 +903,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
883 nfs_context_set_write_error(req->wb_context, hdr->error); 903 nfs_context_set_write_error(req->wb_context, hdr->error);
884 goto remove_req; 904 goto remove_req;
885 } 905 }
886 if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) { 906 if (nfs_write_need_commit(hdr)) {
887 nfs_mark_request_dirty(req);
888 goto next;
889 }
890 if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
891 memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); 907 memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
892 nfs_mark_request_commit(req, hdr->lseg, &cinfo); 908 nfs_mark_request_commit(req, hdr->lseg, &cinfo);
893 goto next; 909 goto next;
@@ -1038,9 +1054,9 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
1038 else 1054 else
1039 req->wb_bytes = rqend - req->wb_offset; 1055 req->wb_bytes = rqend - req->wb_offset;
1040out_unlock: 1056out_unlock:
1041 spin_unlock(&inode->i_lock);
1042 if (req) 1057 if (req)
1043 nfs_clear_request_commit(req); 1058 nfs_clear_request_commit(req);
1059 spin_unlock(&inode->i_lock);
1044 return req; 1060 return req;
1045out_flushme: 1061out_flushme:
1046 spin_unlock(&inode->i_lock); 1062 spin_unlock(&inode->i_lock);
@@ -1241,17 +1257,18 @@ static int flush_task_priority(int how)
1241 return RPC_PRIORITY_NORMAL; 1257 return RPC_PRIORITY_NORMAL;
1242} 1258}
1243 1259
1244static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg, 1260static void nfs_initiate_write(struct nfs_pgio_header *hdr,
1261 struct rpc_message *msg,
1245 struct rpc_task_setup *task_setup_data, int how) 1262 struct rpc_task_setup *task_setup_data, int how)
1246{ 1263{
1247 struct inode *inode = data->header->inode; 1264 struct inode *inode = hdr->inode;
1248 int priority = flush_task_priority(how); 1265 int priority = flush_task_priority(how);
1249 1266
1250 task_setup_data->priority = priority; 1267 task_setup_data->priority = priority;
1251 NFS_PROTO(inode)->write_setup(data, msg); 1268 NFS_PROTO(inode)->write_setup(hdr, msg);
1252 1269
1253 nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client, 1270 nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
1254 &task_setup_data->rpc_client, msg, data); 1271 &task_setup_data->rpc_client, msg, hdr);
1255} 1272}
1256 1273
1257/* If a nfs_flush_* function fails, it should remove reqs from @head and 1274/* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -1313,21 +1330,9 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
1313 NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); 1330 NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
1314} 1331}
1315 1332
1316static void nfs_writeback_release_common(struct nfs_pgio_data *data) 1333static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
1317{ 1334{
1318 struct nfs_pgio_header *hdr = data->header; 1335 /* do nothing! */
1319 int status = data->task.tk_status;
1320
1321 if ((status >= 0) && nfs_write_need_commit(data)) {
1322 spin_lock(&hdr->lock);
1323 if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags))
1324 ; /* Do nothing */
1325 else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags))
1326 memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf));
1327 else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf)))
1328 set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags);
1329 spin_unlock(&hdr->lock);
1330 }
1331} 1336}
1332 1337
1333/* 1338/*
@@ -1358,7 +1363,8 @@ static int nfs_should_remove_suid(const struct inode *inode)
1358/* 1363/*
1359 * This function is called when the WRITE call is complete. 1364 * This function is called when the WRITE call is complete.
1360 */ 1365 */
1361static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data, 1366static int nfs_writeback_done(struct rpc_task *task,
1367 struct nfs_pgio_header *hdr,
1362 struct inode *inode) 1368 struct inode *inode)
1363{ 1369{
1364 int status; 1370 int status;
@@ -1370,13 +1376,14 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
1370 * another writer had changed the file, but some applications 1376 * another writer had changed the file, but some applications
1371 * depend on tighter cache coherency when writing. 1377 * depend on tighter cache coherency when writing.
1372 */ 1378 */
1373 status = NFS_PROTO(inode)->write_done(task, data); 1379 status = NFS_PROTO(inode)->write_done(task, hdr);
1374 if (status != 0) 1380 if (status != 0)
1375 return status; 1381 return status;
1376 nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count); 1382 nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
1377 1383
1378#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) 1384#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
1379 if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) { 1385 if (hdr->res.verf->committed < hdr->args.stable &&
1386 task->tk_status >= 0) {
1380 /* We tried a write call, but the server did not 1387 /* We tried a write call, but the server did not
1381 * commit data to stable storage even though we 1388 * commit data to stable storage even though we
1382 * requested it. 1389 * requested it.
@@ -1392,7 +1399,7 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
1392 dprintk("NFS: faulty NFS server %s:" 1399 dprintk("NFS: faulty NFS server %s:"
1393 " (committed = %d) != (stable = %d)\n", 1400 " (committed = %d) != (stable = %d)\n",
1394 NFS_SERVER(inode)->nfs_client->cl_hostname, 1401 NFS_SERVER(inode)->nfs_client->cl_hostname,
1395 data->res.verf->committed, data->args.stable); 1402 hdr->res.verf->committed, hdr->args.stable);
1396 complain = jiffies + 300 * HZ; 1403 complain = jiffies + 300 * HZ;
1397 } 1404 }
1398 } 1405 }
@@ -1407,16 +1414,17 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
1407/* 1414/*
1408 * This function is called when the WRITE call is complete. 1415 * This function is called when the WRITE call is complete.
1409 */ 1416 */
1410static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data) 1417static void nfs_writeback_result(struct rpc_task *task,
1418 struct nfs_pgio_header *hdr)
1411{ 1419{
1412 struct nfs_pgio_args *argp = &data->args; 1420 struct nfs_pgio_args *argp = &hdr->args;
1413 struct nfs_pgio_res *resp = &data->res; 1421 struct nfs_pgio_res *resp = &hdr->res;
1414 1422
1415 if (resp->count < argp->count) { 1423 if (resp->count < argp->count) {
1416 static unsigned long complain; 1424 static unsigned long complain;
1417 1425
1418 /* This a short write! */ 1426 /* This a short write! */
1419 nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE); 1427 nfs_inc_stats(hdr->inode, NFSIOS_SHORTWRITE);
1420 1428
1421 /* Has the server at least made some progress? */ 1429 /* Has the server at least made some progress? */
1422 if (resp->count == 0) { 1430 if (resp->count == 0) {
@@ -1426,14 +1434,14 @@ static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *da
1426 argp->count); 1434 argp->count);
1427 complain = jiffies + 300 * HZ; 1435 complain = jiffies + 300 * HZ;
1428 } 1436 }
1429 nfs_set_pgio_error(data->header, -EIO, argp->offset); 1437 nfs_set_pgio_error(hdr, -EIO, argp->offset);
1430 task->tk_status = -EIO; 1438 task->tk_status = -EIO;
1431 return; 1439 return;
1432 } 1440 }
1433 /* Was this an NFSv2 write or an NFSv3 stable write? */ 1441 /* Was this an NFSv2 write or an NFSv3 stable write? */
1434 if (resp->verf->committed != NFS_UNSTABLE) { 1442 if (resp->verf->committed != NFS_UNSTABLE) {
1435 /* Resend from where the server left off */ 1443 /* Resend from where the server left off */
1436 data->mds_offset += resp->count; 1444 hdr->mds_offset += resp->count;
1437 argp->offset += resp->count; 1445 argp->offset += resp->count;
1438 argp->pgbase += resp->count; 1446 argp->pgbase += resp->count;
1439 argp->count -= resp->count; 1447 argp->count -= resp->count;
@@ -1884,7 +1892,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1884int __init nfs_init_writepagecache(void) 1892int __init nfs_init_writepagecache(void)
1885{ 1893{
1886 nfs_wdata_cachep = kmem_cache_create("nfs_write_data", 1894 nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
1887 sizeof(struct nfs_rw_header), 1895 sizeof(struct nfs_pgio_header),
1888 0, SLAB_HWCACHE_ALIGN, 1896 0, SLAB_HWCACHE_ALIGN,
1889 NULL); 1897 NULL);
1890 if (nfs_wdata_cachep == NULL) 1898 if (nfs_wdata_cachep == NULL)
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index ed628f71274c..538f142935ea 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -30,9 +30,6 @@
30 30
31MODULE_LICENSE("GPL"); 31MODULE_LICENSE("GPL");
32 32
33EXPORT_SYMBOL_GPL(nfsacl_encode);
34EXPORT_SYMBOL_GPL(nfsacl_decode);
35
36struct nfsacl_encode_desc { 33struct nfsacl_encode_desc {
37 struct xdr_array2_desc desc; 34 struct xdr_array2_desc desc;
38 unsigned int count; 35 unsigned int count;
@@ -136,6 +133,7 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
136 nfsacl_desc.desc.array_len; 133 nfsacl_desc.desc.array_len;
137 return err; 134 return err;
138} 135}
136EXPORT_SYMBOL_GPL(nfsacl_encode);
139 137
140struct nfsacl_decode_desc { 138struct nfsacl_decode_desc {
141 struct xdr_array2_desc desc; 139 struct xdr_array2_desc desc;
@@ -295,3 +293,4 @@ int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
295 return 8 + nfsacl_desc.desc.elem_size * 293 return 8 + nfsacl_desc.desc.elem_size *
296 nfsacl_desc.desc.array_len; 294 nfsacl_desc.desc.array_len;
297} 295}
296EXPORT_SYMBOL_GPL(nfsacl_decode);
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index a986ceb6fd0d..4cd7c69a6cb9 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -47,7 +47,7 @@ struct svc_rqst;
47#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \ 47#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \
48 / sizeof(struct nfs4_ace)) 48 / sizeof(struct nfs4_ace))
49 49
50struct nfs4_acl *nfs4_acl_new(int); 50int nfs4_acl_bytes(int entries);
51int nfs4_acl_get_whotype(char *, u32); 51int nfs4_acl_get_whotype(char *, u32);
52__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who); 52__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who);
53 53
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 72f44823adbb..9d46a0bdd9f9 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -28,7 +28,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
28 validate_process_creds(); 28 validate_process_creds();
29 29
30 /* discard any old override before preparing the new set */ 30 /* discard any old override before preparing the new set */
31 revert_creds(get_cred(current->real_cred)); 31 revert_creds(get_cred(current_real_cred()));
32 new = prepare_creds(); 32 new = prepare_creds();
33 if (!new) 33 if (!new)
34 return -ENOMEM; 34 return -ENOMEM;
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 13b85f94d9e2..72ffd7cce3c3 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -698,8 +698,8 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
698 698
699 kref_get(&item->ex_client->ref); 699 kref_get(&item->ex_client->ref);
700 new->ex_client = item->ex_client; 700 new->ex_client = item->ex_client;
701 new->ex_path.dentry = dget(item->ex_path.dentry); 701 new->ex_path = item->ex_path;
702 new->ex_path.mnt = mntget(item->ex_path.mnt); 702 path_get(&item->ex_path);
703 new->ex_fslocs.locations = NULL; 703 new->ex_fslocs.locations = NULL;
704 new->ex_fslocs.locations_count = 0; 704 new->ex_fslocs.locations_count = 0;
705 new->ex_fslocs.migrated = 0; 705 new->ex_fslocs.migrated = 0;
@@ -1253,7 +1253,7 @@ static int e_show(struct seq_file *m, void *p)
1253 return 0; 1253 return 0;
1254 } 1254 }
1255 1255
1256 cache_get(&exp->h); 1256 exp_get(exp);
1257 if (cache_check(cd, &exp->h, NULL)) 1257 if (cache_check(cd, &exp->h, NULL))
1258 return 0; 1258 return 0;
1259 exp_put(exp); 1259 exp_put(exp);
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index cfeea85c5bed..04dc8c167b0c 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -101,9 +101,10 @@ static inline void exp_put(struct svc_export *exp)
101 cache_put(&exp->h, exp->cd); 101 cache_put(&exp->h, exp->cd);
102} 102}
103 103
104static inline void exp_get(struct svc_export *exp) 104static inline struct svc_export *exp_get(struct svc_export *exp)
105{ 105{
106 cache_get(&exp->h); 106 cache_get(&exp->h);
107 return exp;
107} 108}
108struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *); 109struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *);
109 110
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index 2ed05c3cd43d..c16bf5af6831 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -17,81 +17,13 @@
17 17
18struct nfsd_fault_inject_op { 18struct nfsd_fault_inject_op {
19 char *file; 19 char *file;
20 u64 (*forget)(struct nfs4_client *, u64); 20 u64 (*get)(void);
21 u64 (*print)(struct nfs4_client *, u64); 21 u64 (*set_val)(u64);
22 u64 (*set_clnt)(struct sockaddr_storage *, size_t);
22}; 23};
23 24
24static struct nfsd_fault_inject_op inject_ops[] = {
25 {
26 .file = "forget_clients",
27 .forget = nfsd_forget_client,
28 .print = nfsd_print_client,
29 },
30 {
31 .file = "forget_locks",
32 .forget = nfsd_forget_client_locks,
33 .print = nfsd_print_client_locks,
34 },
35 {
36 .file = "forget_openowners",
37 .forget = nfsd_forget_client_openowners,
38 .print = nfsd_print_client_openowners,
39 },
40 {
41 .file = "forget_delegations",
42 .forget = nfsd_forget_client_delegations,
43 .print = nfsd_print_client_delegations,
44 },
45 {
46 .file = "recall_delegations",
47 .forget = nfsd_recall_client_delegations,
48 .print = nfsd_print_client_delegations,
49 },
50};
51
52static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op);
53static struct dentry *debug_dir; 25static struct dentry *debug_dir;
54 26
55static void nfsd_inject_set(struct nfsd_fault_inject_op *op, u64 val)
56{
57 u64 count = 0;
58
59 if (val == 0)
60 printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file);
61 else
62 printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val);
63
64 nfs4_lock_state();
65 count = nfsd_for_n_state(val, op->forget);
66 nfs4_unlock_state();
67 printk(KERN_INFO "NFSD: %s: found %llu", op->file, count);
68}
69
70static void nfsd_inject_set_client(struct nfsd_fault_inject_op *op,
71 struct sockaddr_storage *addr,
72 size_t addr_size)
73{
74 char buf[INET6_ADDRSTRLEN];
75 struct nfs4_client *clp;
76 u64 count;
77
78 nfs4_lock_state();
79 clp = nfsd_find_client(addr, addr_size);
80 if (clp) {
81 count = op->forget(clp, 0);
82 rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
83 printk(KERN_INFO "NFSD [%s]: Client %s had %llu state object(s)\n", op->file, buf, count);
84 }
85 nfs4_unlock_state();
86}
87
88static void nfsd_inject_get(struct nfsd_fault_inject_op *op, u64 *val)
89{
90 nfs4_lock_state();
91 *val = nfsd_for_n_state(0, op->print);
92 nfs4_unlock_state();
93}
94
95static ssize_t fault_inject_read(struct file *file, char __user *buf, 27static ssize_t fault_inject_read(struct file *file, char __user *buf,
96 size_t len, loff_t *ppos) 28 size_t len, loff_t *ppos)
97{ 29{
@@ -99,9 +31,10 @@ static ssize_t fault_inject_read(struct file *file, char __user *buf,
99 char read_buf[25]; 31 char read_buf[25];
100 size_t size; 32 size_t size;
101 loff_t pos = *ppos; 33 loff_t pos = *ppos;
34 struct nfsd_fault_inject_op *op = file_inode(file)->i_private;
102 35
103 if (!pos) 36 if (!pos)
104 nfsd_inject_get(file_inode(file)->i_private, &val); 37 val = op->get();
105 size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val); 38 size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val);
106 39
107 return simple_read_from_buffer(buf, len, ppos, read_buf, size); 40 return simple_read_from_buffer(buf, len, ppos, read_buf, size);
@@ -114,18 +47,36 @@ static ssize_t fault_inject_write(struct file *file, const char __user *buf,
114 size_t size = min(sizeof(write_buf) - 1, len); 47 size_t size = min(sizeof(write_buf) - 1, len);
115 struct net *net = current->nsproxy->net_ns; 48 struct net *net = current->nsproxy->net_ns;
116 struct sockaddr_storage sa; 49 struct sockaddr_storage sa;
50 struct nfsd_fault_inject_op *op = file_inode(file)->i_private;
117 u64 val; 51 u64 val;
52 char *nl;
118 53
119 if (copy_from_user(write_buf, buf, size)) 54 if (copy_from_user(write_buf, buf, size))
120 return -EFAULT; 55 return -EFAULT;
121 write_buf[size] = '\0'; 56 write_buf[size] = '\0';
122 57
58 /* Deal with any embedded newlines in the string */
59 nl = strchr(write_buf, '\n');
60 if (nl) {
61 size = nl - write_buf;
62 *nl = '\0';
63 }
64
123 size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa)); 65 size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa));
124 if (size > 0) 66 if (size > 0) {
125 nfsd_inject_set_client(file_inode(file)->i_private, &sa, size); 67 val = op->set_clnt(&sa, size);
126 else { 68 if (val)
69 pr_info("NFSD [%s]: Client %s had %llu state object(s)\n",
70 op->file, write_buf, val);
71 } else {
127 val = simple_strtoll(write_buf, NULL, 0); 72 val = simple_strtoll(write_buf, NULL, 0);
128 nfsd_inject_set(file_inode(file)->i_private, val); 73 if (val == 0)
74 pr_info("NFSD Fault Injection: %s (all)", op->file);
75 else
76 pr_info("NFSD Fault Injection: %s (n = %llu)",
77 op->file, val);
78 val = op->set_val(val);
79 pr_info("NFSD: %s: found %llu", op->file, val);
129 } 80 }
130 return len; /* on success, claim we got the whole input */ 81 return len; /* on success, claim we got the whole input */
131} 82}
@@ -141,6 +92,41 @@ void nfsd_fault_inject_cleanup(void)
141 debugfs_remove_recursive(debug_dir); 92 debugfs_remove_recursive(debug_dir);
142} 93}
143 94
95static struct nfsd_fault_inject_op inject_ops[] = {
96 {
97 .file = "forget_clients",
98 .get = nfsd_inject_print_clients,
99 .set_val = nfsd_inject_forget_clients,
100 .set_clnt = nfsd_inject_forget_client,
101 },
102 {
103 .file = "forget_locks",
104 .get = nfsd_inject_print_locks,
105 .set_val = nfsd_inject_forget_locks,
106 .set_clnt = nfsd_inject_forget_client_locks,
107 },
108 {
109 .file = "forget_openowners",
110 .get = nfsd_inject_print_openowners,
111 .set_val = nfsd_inject_forget_openowners,
112 .set_clnt = nfsd_inject_forget_client_openowners,
113 },
114 {
115 .file = "forget_delegations",
116 .get = nfsd_inject_print_delegations,
117 .set_val = nfsd_inject_forget_delegations,
118 .set_clnt = nfsd_inject_forget_client_delegations,
119 },
120 {
121 .file = "recall_delegations",
122 .get = nfsd_inject_print_delegations,
123 .set_val = nfsd_inject_recall_delegations,
124 .set_clnt = nfsd_inject_recall_client_delegations,
125 },
126};
127
128#define NUM_INJECT_OPS (sizeof(inject_ops)/sizeof(struct nfsd_fault_inject_op))
129
144int nfsd_fault_inject_init(void) 130int nfsd_fault_inject_init(void)
145{ 131{
146 unsigned int i; 132 unsigned int i;
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index d32b3aa6600d..ea6749a32760 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -29,14 +29,19 @@
29#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) 29#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
30#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) 30#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
31 31
32#define LOCKOWNER_INO_HASH_BITS 8
33#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)
34
35#define SESSION_HASH_SIZE 512 32#define SESSION_HASH_SIZE 512
36 33
37struct cld_net; 34struct cld_net;
38struct nfsd4_client_tracking_ops; 35struct nfsd4_client_tracking_ops;
39 36
37/*
38 * Represents a nfsd "container". With respect to nfsv4 state tracking, the
39 * fields of interest are the *_id_hashtbls and the *_name_tree. These track
40 * the nfs4_client objects by either short or long form clientid.
41 *
42 * Each nfsd_net runs a nfs4_laundromat workqueue job when necessary to clean
43 * up expired clients and delegations within the container.
44 */
40struct nfsd_net { 45struct nfsd_net {
41 struct cld_net *cld_net; 46 struct cld_net *cld_net;
42 47
@@ -66,8 +71,6 @@ struct nfsd_net {
66 struct rb_root conf_name_tree; 71 struct rb_root conf_name_tree;
67 struct list_head *unconf_id_hashtbl; 72 struct list_head *unconf_id_hashtbl;
68 struct rb_root unconf_name_tree; 73 struct rb_root unconf_name_tree;
69 struct list_head *ownerstr_hashtbl;
70 struct list_head *lockowner_ino_hashtbl;
71 struct list_head *sessionid_hashtbl; 74 struct list_head *sessionid_hashtbl;
72 /* 75 /*
73 * client_lru holds client queue ordered by nfs4_client.cl_time 76 * client_lru holds client queue ordered by nfs4_client.cl_time
@@ -97,10 +100,16 @@ struct nfsd_net {
97 bool nfsd_net_up; 100 bool nfsd_net_up;
98 bool lockd_up; 101 bool lockd_up;
99 102
103 /* Time of server startup */
104 struct timeval nfssvc_boot;
105
100 /* 106 /*
101 * Time of server startup 107 * Max number of connections this nfsd container will allow. Defaults
108 * to '0' which is means that it bases this on the number of threads.
102 */ 109 */
103 struct timeval nfssvc_boot; 110 unsigned int max_connections;
111
112 u32 clientid_counter;
104 113
105 struct svc_serv *nfsd_serv; 114 struct svc_serv *nfsd_serv;
106}; 115};
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 12b023a7ab7d..ac54ea60b3f6 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -54,14 +54,14 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
54 54
55 if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { 55 if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
56 acl = get_acl(inode, ACL_TYPE_ACCESS); 56 acl = get_acl(inode, ACL_TYPE_ACCESS);
57 if (IS_ERR(acl)) {
58 nfserr = nfserrno(PTR_ERR(acl));
59 goto fail;
60 }
61 if (acl == NULL) { 57 if (acl == NULL) {
62 /* Solaris returns the inode's minimum ACL. */ 58 /* Solaris returns the inode's minimum ACL. */
63 acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); 59 acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
64 } 60 }
61 if (IS_ERR(acl)) {
62 nfserr = nfserrno(PTR_ERR(acl));
63 goto fail;
64 }
65 resp->acl_access = acl; 65 resp->acl_access = acl;
66 } 66 }
67 if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { 67 if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 2a514e21dc74..34cbbab6abd7 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -47,14 +47,14 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
47 47
48 if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { 48 if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
49 acl = get_acl(inode, ACL_TYPE_ACCESS); 49 acl = get_acl(inode, ACL_TYPE_ACCESS);
50 if (IS_ERR(acl)) {
51 nfserr = nfserrno(PTR_ERR(acl));
52 goto fail;
53 }
54 if (acl == NULL) { 50 if (acl == NULL) {
55 /* Solaris returns the inode's minimum ACL. */ 51 /* Solaris returns the inode's minimum ACL. */
56 acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); 52 acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
57 } 53 }
54 if (IS_ERR(acl)) {
55 nfserr = nfserrno(PTR_ERR(acl));
56 goto fail;
57 }
58 resp->acl_access = acl; 58 resp->acl_access = acl;
59 } 59 }
60 if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { 60 if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 401289913130..fa2525b2e9d7 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -157,11 +157,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
157 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) 157 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
158 * + 1 (xdr opaque byte count) = 26 158 * + 1 (xdr opaque byte count) = 26
159 */ 159 */
160 160 resp->count = min(argp->count, max_blocksize);
161 resp->count = argp->count;
162 if (max_blocksize < resp->count)
163 resp->count = max_blocksize;
164
165 svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); 161 svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
166 162
167 fh_copy(&resp->fh, &argp->fh); 163 fh_copy(&resp->fh, &argp->fh);
@@ -286,8 +282,7 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp, struct nfsd3_symlinkargs *argp,
286 fh_copy(&resp->dirfh, &argp->ffh); 282 fh_copy(&resp->dirfh, &argp->ffh);
287 fh_init(&resp->fh, NFS3_FHSIZE); 283 fh_init(&resp->fh, NFS3_FHSIZE);
288 nfserr = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, argp->flen, 284 nfserr = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, argp->flen,
289 argp->tname, argp->tlen, 285 argp->tname, &resp->fh);
290 &resp->fh, &argp->attrs);
291 RETURN_STATUS(nfserr); 286 RETURN_STATUS(nfserr);
292} 287}
293 288
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index e6c01e80325e..39c5eb3ad33a 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -120,10 +120,7 @@ decode_sattr3(__be32 *p, struct iattr *iap)
120 120
121 iap->ia_valid |= ATTR_SIZE; 121 iap->ia_valid |= ATTR_SIZE;
122 p = xdr_decode_hyper(p, &newsize); 122 p = xdr_decode_hyper(p, &newsize);
123 if (newsize <= NFS_OFFSET_MAX) 123 iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX);
124 iap->ia_size = newsize;
125 else
126 iap->ia_size = NFS_OFFSET_MAX;
127 } 124 }
128 if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ 125 if ((tmp = ntohl(*p++)) == 1) { /* set to server time */
129 iap->ia_valid |= ATTR_ATIME; 126 iap->ia_valid |= ATTR_ATIME;
@@ -338,10 +335,8 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
338 return 0; 335 return 0;
339 p = xdr_decode_hyper(p, &args->offset); 336 p = xdr_decode_hyper(p, &args->offset);
340 337
341 len = args->count = ntohl(*p++); 338 args->count = ntohl(*p++);
342 339 len = min(args->count, max_blocksize);
343 if (len > max_blocksize)
344 len = max_blocksize;
345 340
346 /* set up the kvec */ 341 /* set up the kvec */
347 v=0; 342 v=0;
@@ -349,7 +344,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
349 struct page *p = *(rqstp->rq_next_page++); 344 struct page *p = *(rqstp->rq_next_page++);
350 345
351 rqstp->rq_vec[v].iov_base = page_address(p); 346 rqstp->rq_vec[v].iov_base = page_address(p);
352 rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE; 347 rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
353 len -= rqstp->rq_vec[v].iov_len; 348 len -= rqstp->rq_vec[v].iov_len;
354 v++; 349 v++;
355 } 350 }
@@ -484,9 +479,7 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
484 } 479 }
485 /* now copy next page if there is one */ 480 /* now copy next page if there is one */
486 if (len && !avail && rqstp->rq_arg.page_len) { 481 if (len && !avail && rqstp->rq_arg.page_len) {
487 avail = rqstp->rq_arg.page_len; 482 avail = min_t(unsigned int, rqstp->rq_arg.page_len, PAGE_SIZE);
488 if (avail > PAGE_SIZE)
489 avail = PAGE_SIZE;
490 old = page_address(rqstp->rq_arg.pages[0]); 483 old = page_address(rqstp->rq_arg.pages[0]);
491 } 484 }
492 while (len && avail && *old) { 485 while (len && avail && *old) {
@@ -571,10 +564,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
571 args->verf = p; p += 2; 564 args->verf = p; p += 2;
572 args->dircount = ~0; 565 args->dircount = ~0;
573 args->count = ntohl(*p++); 566 args->count = ntohl(*p++);
574 567 args->count = min_t(u32, args->count, PAGE_SIZE);
575 if (args->count > PAGE_SIZE)
576 args->count = PAGE_SIZE;
577
578 args->buffer = page_address(*(rqstp->rq_next_page++)); 568 args->buffer = page_address(*(rqstp->rq_next_page++));
579 569
580 return xdr_argsize_check(rqstp, p); 570 return xdr_argsize_check(rqstp, p);
@@ -595,10 +585,7 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
595 args->dircount = ntohl(*p++); 585 args->dircount = ntohl(*p++);
596 args->count = ntohl(*p++); 586 args->count = ntohl(*p++);
597 587
598 len = (args->count > max_blocksize) ? max_blocksize : 588 len = args->count = min(args->count, max_blocksize);
599 args->count;
600 args->count = len;
601
602 while (len > 0) { 589 while (len > 0) {
603 struct page *p = *(rqstp->rq_next_page++); 590 struct page *p = *(rqstp->rq_next_page++);
604 if (!args->buffer) 591 if (!args->buffer)
@@ -913,8 +900,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
913 */ 900 */
914 901
915 /* truncate filename if too long */ 902 /* truncate filename if too long */
916 if (namlen > NFS3_MAXNAMLEN) 903 namlen = min(namlen, NFS3_MAXNAMLEN);
917 namlen = NFS3_MAXNAMLEN;
918 904
919 slen = XDR_QUADLEN(namlen); 905 slen = XDR_QUADLEN(namlen);
920 elen = slen + NFS3_ENTRY_BAGGAGE 906 elen = slen + NFS3_ENTRY_BAGGAGE
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index d714156a19fd..59fd76651781 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -146,35 +146,43 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
146 int size = 0; 146 int size = 0;
147 147
148 pacl = get_acl(inode, ACL_TYPE_ACCESS); 148 pacl = get_acl(inode, ACL_TYPE_ACCESS);
149 if (!pacl) { 149 if (!pacl)
150 pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); 150 pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
151 if (IS_ERR(pacl)) 151
152 return PTR_ERR(pacl); 152 if (IS_ERR(pacl))
153 } 153 return PTR_ERR(pacl);
154
154 /* allocate for worst case: one (deny, allow) pair each: */ 155 /* allocate for worst case: one (deny, allow) pair each: */
155 size += 2 * pacl->a_count; 156 size += 2 * pacl->a_count;
156 157
157 if (S_ISDIR(inode->i_mode)) { 158 if (S_ISDIR(inode->i_mode)) {
158 flags = NFS4_ACL_DIR; 159 flags = NFS4_ACL_DIR;
159 dpacl = get_acl(inode, ACL_TYPE_DEFAULT); 160 dpacl = get_acl(inode, ACL_TYPE_DEFAULT);
161 if (IS_ERR(dpacl)) {
162 error = PTR_ERR(dpacl);
163 goto rel_pacl;
164 }
165
160 if (dpacl) 166 if (dpacl)
161 size += 2 * dpacl->a_count; 167 size += 2 * dpacl->a_count;
162 } 168 }
163 169
164 *acl = nfs4_acl_new(size); 170 *acl = kmalloc(nfs4_acl_bytes(size), GFP_KERNEL);
165 if (*acl == NULL) { 171 if (*acl == NULL) {
166 error = -ENOMEM; 172 error = -ENOMEM;
167 goto out; 173 goto out;
168 } 174 }
175 (*acl)->naces = 0;
169 176
170 _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT); 177 _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT);
171 178
172 if (dpacl) 179 if (dpacl)
173 _posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT); 180 _posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT);
174 181
175 out: 182out:
176 posix_acl_release(pacl);
177 posix_acl_release(dpacl); 183 posix_acl_release(dpacl);
184rel_pacl:
185 posix_acl_release(pacl);
178 return error; 186 return error;
179} 187}
180 188
@@ -872,16 +880,13 @@ ace2type(struct nfs4_ace *ace)
872 return -1; 880 return -1;
873} 881}
874 882
875struct nfs4_acl * 883/*
876nfs4_acl_new(int n) 884 * return the size of the struct nfs4_acl required to represent an acl
885 * with @entries entries.
886 */
887int nfs4_acl_bytes(int entries)
877{ 888{
878 struct nfs4_acl *acl; 889 return sizeof(struct nfs4_acl) + entries * sizeof(struct nfs4_ace);
879
880 acl = kmalloc(sizeof(*acl) + n*sizeof(struct nfs4_ace), GFP_KERNEL);
881 if (acl == NULL)
882 return NULL;
883 acl->naces = 0;
884 return acl;
885} 890}
886 891
887static struct { 892static struct {
@@ -935,5 +940,5 @@ __be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who)
935 return 0; 940 return 0;
936 } 941 }
937 WARN_ON_ONCE(1); 942 WARN_ON_ONCE(1);
938 return -1; 943 return nfserr_serverfault;
939} 944}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 2c73cae9899d..e0be57b0f79b 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -337,7 +337,7 @@ static void encode_cb_recall4args(struct xdr_stream *xdr,
337 p = xdr_reserve_space(xdr, 4); 337 p = xdr_reserve_space(xdr, 4);
338 *p++ = xdr_zero; /* truncate */ 338 *p++ = xdr_zero; /* truncate */
339 339
340 encode_nfs_fh4(xdr, &dp->dl_fh); 340 encode_nfs_fh4(xdr, &dp->dl_stid.sc_file->fi_fhandle);
341 341
342 hdr->nops++; 342 hdr->nops++;
343} 343}
@@ -678,7 +678,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
678 (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5)) 678 (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5))
679 return -EINVAL; 679 return -EINVAL;
680 args.client_name = clp->cl_cred.cr_principal; 680 args.client_name = clp->cl_cred.cr_principal;
681 args.prognumber = conn->cb_prog, 681 args.prognumber = conn->cb_prog;
682 args.protocol = XPRT_TRANSPORT_TCP; 682 args.protocol = XPRT_TRANSPORT_TCP;
683 args.authflavor = clp->cl_cred.cr_flavor; 683 args.authflavor = clp->cl_cred.cr_flavor;
684 clp->cl_cb_ident = conn->cb_ident; 684 clp->cl_cb_ident = conn->cb_ident;
@@ -689,7 +689,8 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
689 clp->cl_cb_session = ses; 689 clp->cl_cb_session = ses;
690 args.bc_xprt = conn->cb_xprt; 690 args.bc_xprt = conn->cb_xprt;
691 args.prognumber = clp->cl_cb_session->se_cb_prog; 691 args.prognumber = clp->cl_cb_session->se_cb_prog;
692 args.protocol = XPRT_TRANSPORT_BC_TCP; 692 args.protocol = conn->cb_xprt->xpt_class->xcl_ident |
693 XPRT_TRANSPORT_BC;
693 args.authflavor = ses->se_cb_sec.flavor; 694 args.authflavor = ses->se_cb_sec.flavor;
694 } 695 }
695 /* Create RPC client */ 696 /* Create RPC client */
@@ -904,7 +905,7 @@ static void nfsd4_cb_recall_release(void *calldata)
904 spin_lock(&clp->cl_lock); 905 spin_lock(&clp->cl_lock);
905 list_del(&cb->cb_per_client); 906 list_del(&cb->cb_per_client);
906 spin_unlock(&clp->cl_lock); 907 spin_unlock(&clp->cl_lock);
907 nfs4_put_delegation(dp); 908 nfs4_put_stid(&dp->dl_stid);
908 } 909 }
909} 910}
910 911
@@ -933,7 +934,7 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
933 set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags); 934 set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags);
934 /* 935 /*
935 * Note this won't actually result in a null callback; 936 * Note this won't actually result in a null callback;
936 * instead, nfsd4_do_callback_rpc() will detect the killed 937 * instead, nfsd4_run_cb_null() will detect the killed
937 * client, destroy the rpc client, and stop: 938 * client, destroy the rpc client, and stop:
938 */ 939 */
939 do_probe_callback(clp); 940 do_probe_callback(clp);
@@ -1011,9 +1012,9 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
1011 run_nfsd4_cb(cb); 1012 run_nfsd4_cb(cb);
1012} 1013}
1013 1014
1014static void nfsd4_do_callback_rpc(struct work_struct *w) 1015static void
1016nfsd4_run_callback_rpc(struct nfsd4_callback *cb)
1015{ 1017{
1016 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
1017 struct nfs4_client *clp = cb->cb_clp; 1018 struct nfs4_client *clp = cb->cb_clp;
1018 struct rpc_clnt *clnt; 1019 struct rpc_clnt *clnt;
1019 1020
@@ -1031,9 +1032,22 @@ static void nfsd4_do_callback_rpc(struct work_struct *w)
1031 cb->cb_ops, cb); 1032 cb->cb_ops, cb);
1032} 1033}
1033 1034
1034void nfsd4_init_callback(struct nfsd4_callback *cb) 1035void
1036nfsd4_run_cb_null(struct work_struct *w)
1035{ 1037{
1036 INIT_WORK(&cb->cb_work, nfsd4_do_callback_rpc); 1038 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback,
1039 cb_work);
1040 nfsd4_run_callback_rpc(cb);
1041}
1042
1043void
1044nfsd4_run_cb_recall(struct work_struct *w)
1045{
1046 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback,
1047 cb_work);
1048
1049 nfsd4_prepare_cb_recall(cb->cb_op);
1050 nfsd4_run_callback_rpc(cb);
1037} 1051}
1038 1052
1039void nfsd4_cb_recall(struct nfs4_delegation *dp) 1053void nfsd4_cb_recall(struct nfs4_delegation *dp)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 8f029db5d271..5e0dc528a0e8 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -177,7 +177,7 @@ fh_dup2(struct svc_fh *dst, struct svc_fh *src)
177 fh_put(dst); 177 fh_put(dst);
178 dget(src->fh_dentry); 178 dget(src->fh_dentry);
179 if (src->fh_export) 179 if (src->fh_export)
180 cache_get(&src->fh_export->h); 180 exp_get(src->fh_export);
181 *dst = *src; 181 *dst = *src;
182} 182}
183 183
@@ -385,8 +385,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
385 if (nfsd4_has_session(cstate)) 385 if (nfsd4_has_session(cstate))
386 copy_clientid(&open->op_clientid, cstate->session); 386 copy_clientid(&open->op_clientid, cstate->session);
387 387
388 nfs4_lock_state();
389
390 /* check seqid for replay. set nfs4_owner */ 388 /* check seqid for replay. set nfs4_owner */
391 resp = rqstp->rq_resp; 389 resp = rqstp->rq_resp;
392 status = nfsd4_process_open1(&resp->cstate, open, nn); 390 status = nfsd4_process_open1(&resp->cstate, open, nn);
@@ -431,8 +429,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
431 break; 429 break;
432 case NFS4_OPEN_CLAIM_PREVIOUS: 430 case NFS4_OPEN_CLAIM_PREVIOUS:
433 status = nfs4_check_open_reclaim(&open->op_clientid, 431 status = nfs4_check_open_reclaim(&open->op_clientid,
434 cstate->minorversion, 432 cstate, nn);
435 nn);
436 if (status) 433 if (status)
437 goto out; 434 goto out;
438 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; 435 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
@@ -461,19 +458,17 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
461 * set, (2) sets open->op_stateid, (3) sets open->op_delegation. 458 * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
462 */ 459 */
463 status = nfsd4_process_open2(rqstp, resfh, open); 460 status = nfsd4_process_open2(rqstp, resfh, open);
464 WARN_ON(status && open->op_created); 461 WARN(status && open->op_created,
462 "nfsd4_process_open2 failed to open newly-created file! status=%u\n",
463 be32_to_cpu(status));
465out: 464out:
466 if (resfh && resfh != &cstate->current_fh) { 465 if (resfh && resfh != &cstate->current_fh) {
467 fh_dup2(&cstate->current_fh, resfh); 466 fh_dup2(&cstate->current_fh, resfh);
468 fh_put(resfh); 467 fh_put(resfh);
469 kfree(resfh); 468 kfree(resfh);
470 } 469 }
471 nfsd4_cleanup_open_state(open, status); 470 nfsd4_cleanup_open_state(cstate, open, status);
472 if (open->op_openowner && !nfsd4_has_session(cstate))
473 cstate->replay_owner = &open->op_openowner->oo_owner;
474 nfsd4_bump_seqid(cstate, status); 471 nfsd4_bump_seqid(cstate, status);
475 if (!cstate->replay_owner)
476 nfs4_unlock_state();
477 return status; 472 return status;
478} 473}
479 474
@@ -581,8 +576,12 @@ static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
581 __be32 verf[2]; 576 __be32 verf[2];
582 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 577 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
583 578
584 verf[0] = (__be32)nn->nfssvc_boot.tv_sec; 579 /*
585 verf[1] = (__be32)nn->nfssvc_boot.tv_usec; 580 * This is opaque to client, so no need to byte-swap. Use
581 * __force to keep sparse happy
582 */
583 verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
584 verf[1] = (__force __be32)nn->nfssvc_boot.tv_usec;
586 memcpy(verifier->data, verf, sizeof(verifier->data)); 585 memcpy(verifier->data, verf, sizeof(verifier->data));
587} 586}
588 587
@@ -619,8 +618,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
619 case NF4LNK: 618 case NF4LNK:
620 status = nfsd_symlink(rqstp, &cstate->current_fh, 619 status = nfsd_symlink(rqstp, &cstate->current_fh,
621 create->cr_name, create->cr_namelen, 620 create->cr_name, create->cr_namelen,
622 create->cr_linkname, create->cr_linklen, 621 create->cr_data, &resfh);
623 &resfh, &create->cr_iattr);
624 break; 622 break;
625 623
626 case NF4BLK: 624 case NF4BLK:
@@ -909,8 +907,8 @@ nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstat
909 default: 907 default:
910 return nfserr_inval; 908 return nfserr_inval;
911 } 909 }
912 exp_get(cstate->current_fh.fh_export); 910
913 sin->sin_exp = cstate->current_fh.fh_export; 911 sin->sin_exp = exp_get(cstate->current_fh.fh_export);
914 fh_put(&cstate->current_fh); 912 fh_put(&cstate->current_fh);
915 return nfs_ok; 913 return nfs_ok;
916} 914}
@@ -1289,7 +1287,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1289 * Don't use the deferral mechanism for NFSv4; compounds make it 1287 * Don't use the deferral mechanism for NFSv4; compounds make it
1290 * too hard to avoid non-idempotency problems. 1288 * too hard to avoid non-idempotency problems.
1291 */ 1289 */
1292 rqstp->rq_usedeferral = 0; 1290 rqstp->rq_usedeferral = false;
1293 1291
1294 /* 1292 /*
1295 * According to RFC3010, this takes precedence over all other errors. 1293 * According to RFC3010, this takes precedence over all other errors.
@@ -1391,10 +1389,7 @@ encode_op:
1391 args->ops, args->opcnt, resp->opcnt, op->opnum, 1389 args->ops, args->opcnt, resp->opcnt, op->opnum,
1392 be32_to_cpu(status)); 1390 be32_to_cpu(status));
1393 1391
1394 if (cstate->replay_owner) { 1392 nfsd4_cstate_clear_replay(cstate);
1395 nfs4_unlock_state();
1396 cstate->replay_owner = NULL;
1397 }
1398 /* XXX Ugh, we need to get rid of this kind of special case: */ 1393 /* XXX Ugh, we need to get rid of this kind of special case: */
1399 if (op->opnum == OP_READ && op->u.read.rd_filp) 1394 if (op->opnum == OP_READ && op->u.read.rd_filp)
1400 fput(op->u.read.rd_filp); 1395 fput(op->u.read.rd_filp);
@@ -1408,7 +1403,7 @@ encode_op:
1408 BUG_ON(cstate->replay_owner); 1403 BUG_ON(cstate->replay_owner);
1409out: 1404out:
1410 /* Reset deferral mechanism for RPC deferrals */ 1405 /* Reset deferral mechanism for RPC deferrals */
1411 rqstp->rq_usedeferral = 1; 1406 rqstp->rq_usedeferral = true;
1412 dprintk("nfsv4 compound returned %d\n", ntohl(status)); 1407 dprintk("nfsv4 compound returned %d\n", ntohl(status));
1413 return status; 1408 return status;
1414} 1409}
@@ -1520,21 +1515,17 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1520 u32 maxcount = 0, rlen = 0; 1515 u32 maxcount = 0, rlen = 0;
1521 1516
1522 maxcount = svc_max_payload(rqstp); 1517 maxcount = svc_max_payload(rqstp);
1523 rlen = op->u.read.rd_length; 1518 rlen = min(op->u.read.rd_length, maxcount);
1524
1525 if (rlen > maxcount)
1526 rlen = maxcount;
1527 1519
1528 return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32); 1520 return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
1529} 1521}
1530 1522
1531static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1523static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1532{ 1524{
1533 u32 maxcount = svc_max_payload(rqstp); 1525 u32 maxcount = 0, rlen = 0;
1534 u32 rlen = op->u.readdir.rd_maxcount;
1535 1526
1536 if (rlen > maxcount) 1527 maxcount = svc_max_payload(rqstp);
1537 rlen = maxcount; 1528 rlen = min(op->u.readdir.rd_maxcount, maxcount);
1538 1529
1539 return (op_encode_hdr_size + op_encode_verifier_maxsz + 1530 return (op_encode_hdr_size + op_encode_verifier_maxsz +
1540 XDR_QUADLEN(rlen)) * sizeof(__be32); 1531 XDR_QUADLEN(rlen)) * sizeof(__be32);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2204e1fe5725..2e80a59e7e91 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -70,13 +70,11 @@ static u64 current_sessionid = 1;
70#define CURRENT_STATEID(stateid) (!memcmp((stateid), &currentstateid, sizeof(stateid_t))) 70#define CURRENT_STATEID(stateid) (!memcmp((stateid), &currentstateid, sizeof(stateid_t)))
71 71
72/* forward declarations */ 72/* forward declarations */
73static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); 73static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner);
74static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
74 75
75/* Locking: */ 76/* Locking: */
76 77
77/* Currently used for almost all code touching nfsv4 state: */
78static DEFINE_MUTEX(client_mutex);
79
80/* 78/*
81 * Currently used for the del_recall_lru and file hash table. In an 79 * Currently used for the del_recall_lru and file hash table. In an
82 * effort to decrease the scope of the client_mutex, this spinlock may 80 * effort to decrease the scope of the client_mutex, this spinlock may
@@ -84,18 +82,18 @@ static DEFINE_MUTEX(client_mutex);
84 */ 82 */
85static DEFINE_SPINLOCK(state_lock); 83static DEFINE_SPINLOCK(state_lock);
86 84
85/*
86 * A waitqueue for all in-progress 4.0 CLOSE operations that are waiting for
87 * the refcount on the open stateid to drop.
88 */
89static DECLARE_WAIT_QUEUE_HEAD(close_wq);
90
87static struct kmem_cache *openowner_slab; 91static struct kmem_cache *openowner_slab;
88static struct kmem_cache *lockowner_slab; 92static struct kmem_cache *lockowner_slab;
89static struct kmem_cache *file_slab; 93static struct kmem_cache *file_slab;
90static struct kmem_cache *stateid_slab; 94static struct kmem_cache *stateid_slab;
91static struct kmem_cache *deleg_slab; 95static struct kmem_cache *deleg_slab;
92 96
93void
94nfs4_lock_state(void)
95{
96 mutex_lock(&client_mutex);
97}
98
99static void free_session(struct nfsd4_session *); 97static void free_session(struct nfsd4_session *);
100 98
101static bool is_session_dead(struct nfsd4_session *ses) 99static bool is_session_dead(struct nfsd4_session *ses)
@@ -103,12 +101,6 @@ static bool is_session_dead(struct nfsd4_session *ses)
103 return ses->se_flags & NFS4_SESSION_DEAD; 101 return ses->se_flags & NFS4_SESSION_DEAD;
104} 102}
105 103
106void nfsd4_put_session(struct nfsd4_session *ses)
107{
108 if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses))
109 free_session(ses);
110}
111
112static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me) 104static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me)
113{ 105{
114 if (atomic_read(&ses->se_ref) > ref_held_by_me) 106 if (atomic_read(&ses->se_ref) > ref_held_by_me)
@@ -117,46 +109,17 @@ static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_b
117 return nfs_ok; 109 return nfs_ok;
118} 110}
119 111
120static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses)
121{
122 if (is_session_dead(ses))
123 return nfserr_badsession;
124 atomic_inc(&ses->se_ref);
125 return nfs_ok;
126}
127
128void
129nfs4_unlock_state(void)
130{
131 mutex_unlock(&client_mutex);
132}
133
134static bool is_client_expired(struct nfs4_client *clp) 112static bool is_client_expired(struct nfs4_client *clp)
135{ 113{
136 return clp->cl_time == 0; 114 return clp->cl_time == 0;
137} 115}
138 116
139static __be32 mark_client_expired_locked(struct nfs4_client *clp) 117static __be32 get_client_locked(struct nfs4_client *clp)
140{
141 if (atomic_read(&clp->cl_refcount))
142 return nfserr_jukebox;
143 clp->cl_time = 0;
144 return nfs_ok;
145}
146
147static __be32 mark_client_expired(struct nfs4_client *clp)
148{ 118{
149 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 119 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
150 __be32 ret;
151 120
152 spin_lock(&nn->client_lock); 121 lockdep_assert_held(&nn->client_lock);
153 ret = mark_client_expired_locked(clp);
154 spin_unlock(&nn->client_lock);
155 return ret;
156}
157 122
158static __be32 get_client_locked(struct nfs4_client *clp)
159{
160 if (is_client_expired(clp)) 123 if (is_client_expired(clp))
161 return nfserr_expired; 124 return nfserr_expired;
162 atomic_inc(&clp->cl_refcount); 125 atomic_inc(&clp->cl_refcount);
@@ -197,13 +160,17 @@ renew_client(struct nfs4_client *clp)
197 160
198static void put_client_renew_locked(struct nfs4_client *clp) 161static void put_client_renew_locked(struct nfs4_client *clp)
199{ 162{
163 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
164
165 lockdep_assert_held(&nn->client_lock);
166
200 if (!atomic_dec_and_test(&clp->cl_refcount)) 167 if (!atomic_dec_and_test(&clp->cl_refcount))
201 return; 168 return;
202 if (!is_client_expired(clp)) 169 if (!is_client_expired(clp))
203 renew_client_locked(clp); 170 renew_client_locked(clp);
204} 171}
205 172
206void put_client_renew(struct nfs4_client *clp) 173static void put_client_renew(struct nfs4_client *clp)
207{ 174{
208 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 175 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
209 176
@@ -214,6 +181,79 @@ void put_client_renew(struct nfs4_client *clp)
214 spin_unlock(&nn->client_lock); 181 spin_unlock(&nn->client_lock);
215} 182}
216 183
184static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses)
185{
186 __be32 status;
187
188 if (is_session_dead(ses))
189 return nfserr_badsession;
190 status = get_client_locked(ses->se_client);
191 if (status)
192 return status;
193 atomic_inc(&ses->se_ref);
194 return nfs_ok;
195}
196
197static void nfsd4_put_session_locked(struct nfsd4_session *ses)
198{
199 struct nfs4_client *clp = ses->se_client;
200 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
201
202 lockdep_assert_held(&nn->client_lock);
203
204 if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses))
205 free_session(ses);
206 put_client_renew_locked(clp);
207}
208
209static void nfsd4_put_session(struct nfsd4_session *ses)
210{
211 struct nfs4_client *clp = ses->se_client;
212 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
213
214 spin_lock(&nn->client_lock);
215 nfsd4_put_session_locked(ses);
216 spin_unlock(&nn->client_lock);
217}
218
219static int
220same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner)
221{
222 return (sop->so_owner.len == owner->len) &&
223 0 == memcmp(sop->so_owner.data, owner->data, owner->len);
224}
225
226static struct nfs4_openowner *
227find_openstateowner_str_locked(unsigned int hashval, struct nfsd4_open *open,
228 struct nfs4_client *clp)
229{
230 struct nfs4_stateowner *so;
231
232 lockdep_assert_held(&clp->cl_lock);
233
234 list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[hashval],
235 so_strhash) {
236 if (!so->so_is_open_owner)
237 continue;
238 if (same_owner_str(so, &open->op_owner)) {
239 atomic_inc(&so->so_count);
240 return openowner(so);
241 }
242 }
243 return NULL;
244}
245
246static struct nfs4_openowner *
247find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open,
248 struct nfs4_client *clp)
249{
250 struct nfs4_openowner *oo;
251
252 spin_lock(&clp->cl_lock);
253 oo = find_openstateowner_str_locked(hashval, open, clp);
254 spin_unlock(&clp->cl_lock);
255 return oo;
256}
217 257
218static inline u32 258static inline u32
219opaque_hashval(const void *ptr, int nbytes) 259opaque_hashval(const void *ptr, int nbytes)
@@ -236,10 +276,11 @@ static void nfsd4_free_file(struct nfs4_file *f)
236static inline void 276static inline void
237put_nfs4_file(struct nfs4_file *fi) 277put_nfs4_file(struct nfs4_file *fi)
238{ 278{
279 might_lock(&state_lock);
280
239 if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { 281 if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) {
240 hlist_del(&fi->fi_hash); 282 hlist_del(&fi->fi_hash);
241 spin_unlock(&state_lock); 283 spin_unlock(&state_lock);
242 iput(fi->fi_inode);
243 nfsd4_free_file(fi); 284 nfsd4_free_file(fi);
244 } 285 }
245} 286}
@@ -250,7 +291,80 @@ get_nfs4_file(struct nfs4_file *fi)
250 atomic_inc(&fi->fi_ref); 291 atomic_inc(&fi->fi_ref);
251} 292}
252 293
253static int num_delegations; 294static struct file *
295__nfs4_get_fd(struct nfs4_file *f, int oflag)
296{
297 if (f->fi_fds[oflag])
298 return get_file(f->fi_fds[oflag]);
299 return NULL;
300}
301
302static struct file *
303find_writeable_file_locked(struct nfs4_file *f)
304{
305 struct file *ret;
306
307 lockdep_assert_held(&f->fi_lock);
308
309 ret = __nfs4_get_fd(f, O_WRONLY);
310 if (!ret)
311 ret = __nfs4_get_fd(f, O_RDWR);
312 return ret;
313}
314
315static struct file *
316find_writeable_file(struct nfs4_file *f)
317{
318 struct file *ret;
319
320 spin_lock(&f->fi_lock);
321 ret = find_writeable_file_locked(f);
322 spin_unlock(&f->fi_lock);
323
324 return ret;
325}
326
327static struct file *find_readable_file_locked(struct nfs4_file *f)
328{
329 struct file *ret;
330
331 lockdep_assert_held(&f->fi_lock);
332
333 ret = __nfs4_get_fd(f, O_RDONLY);
334 if (!ret)
335 ret = __nfs4_get_fd(f, O_RDWR);
336 return ret;
337}
338
339static struct file *
340find_readable_file(struct nfs4_file *f)
341{
342 struct file *ret;
343
344 spin_lock(&f->fi_lock);
345 ret = find_readable_file_locked(f);
346 spin_unlock(&f->fi_lock);
347
348 return ret;
349}
350
351static struct file *
352find_any_file(struct nfs4_file *f)
353{
354 struct file *ret;
355
356 spin_lock(&f->fi_lock);
357 ret = __nfs4_get_fd(f, O_RDWR);
358 if (!ret) {
359 ret = __nfs4_get_fd(f, O_WRONLY);
360 if (!ret)
361 ret = __nfs4_get_fd(f, O_RDONLY);
362 }
363 spin_unlock(&f->fi_lock);
364 return ret;
365}
366
367static atomic_long_t num_delegations;
254unsigned long max_delegations; 368unsigned long max_delegations;
255 369
256/* 370/*
@@ -262,12 +376,11 @@ unsigned long max_delegations;
262#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) 376#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS)
263#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) 377#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1)
264 378
265static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) 379static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
266{ 380{
267 unsigned int ret; 381 unsigned int ret;
268 382
269 ret = opaque_hashval(ownername->data, ownername->len); 383 ret = opaque_hashval(ownername->data, ownername->len);
270 ret += clientid;
271 return ret & OWNER_HASH_MASK; 384 return ret & OWNER_HASH_MASK;
272} 385}
273 386
@@ -275,75 +388,124 @@ static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
275#define FILE_HASH_BITS 8 388#define FILE_HASH_BITS 8
276#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) 389#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
277 390
278static unsigned int file_hashval(struct inode *ino) 391static unsigned int nfsd_fh_hashval(struct knfsd_fh *fh)
392{
393 return jhash2(fh->fh_base.fh_pad, XDR_QUADLEN(fh->fh_size), 0);
394}
395
396static unsigned int file_hashval(struct knfsd_fh *fh)
397{
398 return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1);
399}
400
401static bool nfsd_fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
279{ 402{
280 /* XXX: why are we hashing on inode pointer, anyway? */ 403 return fh1->fh_size == fh2->fh_size &&
281 return hash_ptr(ino, FILE_HASH_BITS); 404 !memcmp(fh1->fh_base.fh_pad,
405 fh2->fh_base.fh_pad,
406 fh1->fh_size);
282} 407}
283 408
284static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; 409static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
285 410
286static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) 411static void
412__nfs4_file_get_access(struct nfs4_file *fp, u32 access)
287{ 413{
288 WARN_ON_ONCE(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR])); 414 lockdep_assert_held(&fp->fi_lock);
289 atomic_inc(&fp->fi_access[oflag]); 415
416 if (access & NFS4_SHARE_ACCESS_WRITE)
417 atomic_inc(&fp->fi_access[O_WRONLY]);
418 if (access & NFS4_SHARE_ACCESS_READ)
419 atomic_inc(&fp->fi_access[O_RDONLY]);
290} 420}
291 421
292static void nfs4_file_get_access(struct nfs4_file *fp, int oflag) 422static __be32
423nfs4_file_get_access(struct nfs4_file *fp, u32 access)
293{ 424{
294 if (oflag == O_RDWR) { 425 lockdep_assert_held(&fp->fi_lock);
295 __nfs4_file_get_access(fp, O_RDONLY); 426
296 __nfs4_file_get_access(fp, O_WRONLY); 427 /* Does this access mode make sense? */
297 } else 428 if (access & ~NFS4_SHARE_ACCESS_BOTH)
298 __nfs4_file_get_access(fp, oflag); 429 return nfserr_inval;
430
431 /* Does it conflict with a deny mode already set? */
432 if ((access & fp->fi_share_deny) != 0)
433 return nfserr_share_denied;
434
435 __nfs4_file_get_access(fp, access);
436 return nfs_ok;
299} 437}
300 438
301static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag) 439static __be32 nfs4_file_check_deny(struct nfs4_file *fp, u32 deny)
302{ 440{
303 if (fp->fi_fds[oflag]) { 441 /* Common case is that there is no deny mode. */
304 fput(fp->fi_fds[oflag]); 442 if (deny) {
305 fp->fi_fds[oflag] = NULL; 443 /* Does this deny mode make sense? */
444 if (deny & ~NFS4_SHARE_DENY_BOTH)
445 return nfserr_inval;
446
447 if ((deny & NFS4_SHARE_DENY_READ) &&
448 atomic_read(&fp->fi_access[O_RDONLY]))
449 return nfserr_share_denied;
450
451 if ((deny & NFS4_SHARE_DENY_WRITE) &&
452 atomic_read(&fp->fi_access[O_WRONLY]))
453 return nfserr_share_denied;
306 } 454 }
455 return nfs_ok;
307} 456}
308 457
309static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) 458static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
310{ 459{
311 if (atomic_dec_and_test(&fp->fi_access[oflag])) { 460 might_lock(&fp->fi_lock);
312 nfs4_file_put_fd(fp, oflag); 461
462 if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) {
463 struct file *f1 = NULL;
464 struct file *f2 = NULL;
465
466 swap(f1, fp->fi_fds[oflag]);
313 if (atomic_read(&fp->fi_access[1 - oflag]) == 0) 467 if (atomic_read(&fp->fi_access[1 - oflag]) == 0)
314 nfs4_file_put_fd(fp, O_RDWR); 468 swap(f2, fp->fi_fds[O_RDWR]);
469 spin_unlock(&fp->fi_lock);
470 if (f1)
471 fput(f1);
472 if (f2)
473 fput(f2);
315 } 474 }
316} 475}
317 476
318static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) 477static void nfs4_file_put_access(struct nfs4_file *fp, u32 access)
319{ 478{
320 if (oflag == O_RDWR) { 479 WARN_ON_ONCE(access & ~NFS4_SHARE_ACCESS_BOTH);
321 __nfs4_file_put_access(fp, O_RDONLY); 480
481 if (access & NFS4_SHARE_ACCESS_WRITE)
322 __nfs4_file_put_access(fp, O_WRONLY); 482 __nfs4_file_put_access(fp, O_WRONLY);
323 } else 483 if (access & NFS4_SHARE_ACCESS_READ)
324 __nfs4_file_put_access(fp, oflag); 484 __nfs4_file_put_access(fp, O_RDONLY);
325} 485}
326 486
327static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct 487static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
328kmem_cache *slab) 488 struct kmem_cache *slab)
329{ 489{
330 struct idr *stateids = &cl->cl_stateids;
331 struct nfs4_stid *stid; 490 struct nfs4_stid *stid;
332 int new_id; 491 int new_id;
333 492
334 stid = kmem_cache_alloc(slab, GFP_KERNEL); 493 stid = kmem_cache_zalloc(slab, GFP_KERNEL);
335 if (!stid) 494 if (!stid)
336 return NULL; 495 return NULL;
337 496
338 new_id = idr_alloc_cyclic(stateids, stid, 0, 0, GFP_KERNEL); 497 idr_preload(GFP_KERNEL);
498 spin_lock(&cl->cl_lock);
499 new_id = idr_alloc_cyclic(&cl->cl_stateids, stid, 0, 0, GFP_NOWAIT);
500 spin_unlock(&cl->cl_lock);
501 idr_preload_end();
339 if (new_id < 0) 502 if (new_id < 0)
340 goto out_free; 503 goto out_free;
341 stid->sc_client = cl; 504 stid->sc_client = cl;
342 stid->sc_type = 0;
343 stid->sc_stateid.si_opaque.so_id = new_id; 505 stid->sc_stateid.si_opaque.so_id = new_id;
344 stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid; 506 stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
345 /* Will be incremented before return to client: */ 507 /* Will be incremented before return to client: */
346 stid->sc_stateid.si_generation = 0; 508 atomic_set(&stid->sc_count, 1);
347 509
348 /* 510 /*
349 * It shouldn't be a problem to reuse an opaque stateid value. 511 * It shouldn't be a problem to reuse an opaque stateid value.
@@ -360,9 +522,24 @@ out_free:
360 return NULL; 522 return NULL;
361} 523}
362 524
363static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) 525static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
364{ 526{
365 return openlockstateid(nfs4_alloc_stid(clp, stateid_slab)); 527 struct nfs4_stid *stid;
528 struct nfs4_ol_stateid *stp;
529
530 stid = nfs4_alloc_stid(clp, stateid_slab);
531 if (!stid)
532 return NULL;
533
534 stp = openlockstateid(stid);
535 stp->st_stid.sc_free = nfs4_free_ol_stateid;
536 return stp;
537}
538
539static void nfs4_free_deleg(struct nfs4_stid *stid)
540{
541 kmem_cache_free(deleg_slab, stid);
542 atomic_long_dec(&num_delegations);
366} 543}
367 544
368/* 545/*
@@ -379,10 +556,11 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
379 * Each filter is 256 bits. We hash the filehandle to 32bit and use the 556 * Each filter is 256 bits. We hash the filehandle to 32bit and use the
380 * low 3 bytes as hash-table indices. 557 * low 3 bytes as hash-table indices.
381 * 558 *
382 * 'state_lock', which is always held when block_delegations() is called, 559 * 'blocked_delegations_lock', which is always taken in block_delegations(),
383 * is used to manage concurrent access. Testing does not need the lock 560 * is used to manage concurrent access. Testing does not need the lock
384 * except when swapping the two filters. 561 * except when swapping the two filters.
385 */ 562 */
563static DEFINE_SPINLOCK(blocked_delegations_lock);
386static struct bloom_pair { 564static struct bloom_pair {
387 int entries, old_entries; 565 int entries, old_entries;
388 time_t swap_time; 566 time_t swap_time;
@@ -398,7 +576,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
398 if (bd->entries == 0) 576 if (bd->entries == 0)
399 return 0; 577 return 0;
400 if (seconds_since_boot() - bd->swap_time > 30) { 578 if (seconds_since_boot() - bd->swap_time > 30) {
401 spin_lock(&state_lock); 579 spin_lock(&blocked_delegations_lock);
402 if (seconds_since_boot() - bd->swap_time > 30) { 580 if (seconds_since_boot() - bd->swap_time > 30) {
403 bd->entries -= bd->old_entries; 581 bd->entries -= bd->old_entries;
404 bd->old_entries = bd->entries; 582 bd->old_entries = bd->entries;
@@ -407,7 +585,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
407 bd->new = 1-bd->new; 585 bd->new = 1-bd->new;
408 bd->swap_time = seconds_since_boot(); 586 bd->swap_time = seconds_since_boot();
409 } 587 }
410 spin_unlock(&state_lock); 588 spin_unlock(&blocked_delegations_lock);
411 } 589 }
412 hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0); 590 hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
413 if (test_bit(hash&255, bd->set[0]) && 591 if (test_bit(hash&255, bd->set[0]) &&
@@ -430,69 +608,73 @@ static void block_delegations(struct knfsd_fh *fh)
430 608
431 hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0); 609 hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
432 610
611 spin_lock(&blocked_delegations_lock);
433 __set_bit(hash&255, bd->set[bd->new]); 612 __set_bit(hash&255, bd->set[bd->new]);
434 __set_bit((hash>>8)&255, bd->set[bd->new]); 613 __set_bit((hash>>8)&255, bd->set[bd->new]);
435 __set_bit((hash>>16)&255, bd->set[bd->new]); 614 __set_bit((hash>>16)&255, bd->set[bd->new]);
436 if (bd->entries == 0) 615 if (bd->entries == 0)
437 bd->swap_time = seconds_since_boot(); 616 bd->swap_time = seconds_since_boot();
438 bd->entries += 1; 617 bd->entries += 1;
618 spin_unlock(&blocked_delegations_lock);
439} 619}
440 620
441static struct nfs4_delegation * 621static struct nfs4_delegation *
442alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh) 622alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh)
443{ 623{
444 struct nfs4_delegation *dp; 624 struct nfs4_delegation *dp;
625 long n;
445 626
446 dprintk("NFSD alloc_init_deleg\n"); 627 dprintk("NFSD alloc_init_deleg\n");
447 if (num_delegations > max_delegations) 628 n = atomic_long_inc_return(&num_delegations);
448 return NULL; 629 if (n < 0 || n > max_delegations)
630 goto out_dec;
449 if (delegation_blocked(&current_fh->fh_handle)) 631 if (delegation_blocked(&current_fh->fh_handle))
450 return NULL; 632 goto out_dec;
451 dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); 633 dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
452 if (dp == NULL) 634 if (dp == NULL)
453 return dp; 635 goto out_dec;
636
637 dp->dl_stid.sc_free = nfs4_free_deleg;
454 /* 638 /*
455 * delegation seqid's are never incremented. The 4.1 special 639 * delegation seqid's are never incremented. The 4.1 special
456 * meaning of seqid 0 isn't meaningful, really, but let's avoid 640 * meaning of seqid 0 isn't meaningful, really, but let's avoid
457 * 0 anyway just for consistency and use 1: 641 * 0 anyway just for consistency and use 1:
458 */ 642 */
459 dp->dl_stid.sc_stateid.si_generation = 1; 643 dp->dl_stid.sc_stateid.si_generation = 1;
460 num_delegations++;
461 INIT_LIST_HEAD(&dp->dl_perfile); 644 INIT_LIST_HEAD(&dp->dl_perfile);
462 INIT_LIST_HEAD(&dp->dl_perclnt); 645 INIT_LIST_HEAD(&dp->dl_perclnt);
463 INIT_LIST_HEAD(&dp->dl_recall_lru); 646 INIT_LIST_HEAD(&dp->dl_recall_lru);
464 dp->dl_file = NULL;
465 dp->dl_type = NFS4_OPEN_DELEGATE_READ; 647 dp->dl_type = NFS4_OPEN_DELEGATE_READ;
466 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle); 648 INIT_WORK(&dp->dl_recall.cb_work, nfsd4_run_cb_recall);
467 dp->dl_time = 0;
468 atomic_set(&dp->dl_count, 1);
469 nfsd4_init_callback(&dp->dl_recall);
470 return dp; 649 return dp;
650out_dec:
651 atomic_long_dec(&num_delegations);
652 return NULL;
471} 653}
472 654
473static void remove_stid(struct nfs4_stid *s) 655void
656nfs4_put_stid(struct nfs4_stid *s)
474{ 657{
475 struct idr *stateids = &s->sc_client->cl_stateids; 658 struct nfs4_file *fp = s->sc_file;
659 struct nfs4_client *clp = s->sc_client;
476 660
477 idr_remove(stateids, s->sc_stateid.si_opaque.so_id); 661 might_lock(&clp->cl_lock);
478}
479 662
480static void nfs4_free_stid(struct kmem_cache *slab, struct nfs4_stid *s) 663 if (!atomic_dec_and_lock(&s->sc_count, &clp->cl_lock)) {
481{ 664 wake_up_all(&close_wq);
482 kmem_cache_free(slab, s); 665 return;
483}
484
485void
486nfs4_put_delegation(struct nfs4_delegation *dp)
487{
488 if (atomic_dec_and_test(&dp->dl_count)) {
489 nfs4_free_stid(deleg_slab, &dp->dl_stid);
490 num_delegations--;
491 } 666 }
667 idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id);
668 spin_unlock(&clp->cl_lock);
669 s->sc_free(s);
670 if (fp)
671 put_nfs4_file(fp);
492} 672}
493 673
494static void nfs4_put_deleg_lease(struct nfs4_file *fp) 674static void nfs4_put_deleg_lease(struct nfs4_file *fp)
495{ 675{
676 lockdep_assert_held(&state_lock);
677
496 if (!fp->fi_lease) 678 if (!fp->fi_lease)
497 return; 679 return;
498 if (atomic_dec_and_test(&fp->fi_delegees)) { 680 if (atomic_dec_and_test(&fp->fi_delegees)) {
@@ -512,54 +694,54 @@ static void
512hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) 694hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
513{ 695{
514 lockdep_assert_held(&state_lock); 696 lockdep_assert_held(&state_lock);
697 lockdep_assert_held(&fp->fi_lock);
515 698
699 atomic_inc(&dp->dl_stid.sc_count);
516 dp->dl_stid.sc_type = NFS4_DELEG_STID; 700 dp->dl_stid.sc_type = NFS4_DELEG_STID;
517 list_add(&dp->dl_perfile, &fp->fi_delegations); 701 list_add(&dp->dl_perfile, &fp->fi_delegations);
518 list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); 702 list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
519} 703}
520 704
521/* Called under the state lock. */
522static void 705static void
523unhash_delegation(struct nfs4_delegation *dp) 706unhash_delegation_locked(struct nfs4_delegation *dp)
524{ 707{
525 spin_lock(&state_lock); 708 struct nfs4_file *fp = dp->dl_stid.sc_file;
526 list_del_init(&dp->dl_perclnt);
527 list_del_init(&dp->dl_perfile);
528 list_del_init(&dp->dl_recall_lru);
529 spin_unlock(&state_lock);
530 if (dp->dl_file) {
531 nfs4_put_deleg_lease(dp->dl_file);
532 put_nfs4_file(dp->dl_file);
533 dp->dl_file = NULL;
534 }
535}
536
537 709
710 lockdep_assert_held(&state_lock);
538 711
539static void destroy_revoked_delegation(struct nfs4_delegation *dp) 712 dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID;
540{ 713 /* Ensure that deleg break won't try to requeue it */
714 ++dp->dl_time;
715 spin_lock(&fp->fi_lock);
716 list_del_init(&dp->dl_perclnt);
541 list_del_init(&dp->dl_recall_lru); 717 list_del_init(&dp->dl_recall_lru);
542 remove_stid(&dp->dl_stid); 718 list_del_init(&dp->dl_perfile);
543 nfs4_put_delegation(dp); 719 spin_unlock(&fp->fi_lock);
720 if (fp)
721 nfs4_put_deleg_lease(fp);
544} 722}
545 723
546static void destroy_delegation(struct nfs4_delegation *dp) 724static void destroy_delegation(struct nfs4_delegation *dp)
547{ 725{
548 unhash_delegation(dp); 726 spin_lock(&state_lock);
549 remove_stid(&dp->dl_stid); 727 unhash_delegation_locked(dp);
550 nfs4_put_delegation(dp); 728 spin_unlock(&state_lock);
729 nfs4_put_stid(&dp->dl_stid);
551} 730}
552 731
553static void revoke_delegation(struct nfs4_delegation *dp) 732static void revoke_delegation(struct nfs4_delegation *dp)
554{ 733{
555 struct nfs4_client *clp = dp->dl_stid.sc_client; 734 struct nfs4_client *clp = dp->dl_stid.sc_client;
556 735
736 WARN_ON(!list_empty(&dp->dl_recall_lru));
737
557 if (clp->cl_minorversion == 0) 738 if (clp->cl_minorversion == 0)
558 destroy_delegation(dp); 739 nfs4_put_stid(&dp->dl_stid);
559 else { 740 else {
560 unhash_delegation(dp);
561 dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; 741 dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
742 spin_lock(&clp->cl_lock);
562 list_add(&dp->dl_recall_lru, &clp->cl_revoked); 743 list_add(&dp->dl_recall_lru, &clp->cl_revoked);
744 spin_unlock(&clp->cl_lock);
563 } 745 }
564} 746}
565 747
@@ -607,57 +789,62 @@ bmap_to_share_mode(unsigned long bmap) {
607 return access; 789 return access;
608} 790}
609 791
610static bool
611test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) {
612 unsigned int access, deny;
613
614 access = bmap_to_share_mode(stp->st_access_bmap);
615 deny = bmap_to_share_mode(stp->st_deny_bmap);
616 if ((access & open->op_share_deny) || (deny & open->op_share_access))
617 return false;
618 return true;
619}
620
621/* set share access for a given stateid */ 792/* set share access for a given stateid */
622static inline void 793static inline void
623set_access(u32 access, struct nfs4_ol_stateid *stp) 794set_access(u32 access, struct nfs4_ol_stateid *stp)
624{ 795{
625 __set_bit(access, &stp->st_access_bmap); 796 unsigned char mask = 1 << access;
797
798 WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
799 stp->st_access_bmap |= mask;
626} 800}
627 801
628/* clear share access for a given stateid */ 802/* clear share access for a given stateid */
629static inline void 803static inline void
630clear_access(u32 access, struct nfs4_ol_stateid *stp) 804clear_access(u32 access, struct nfs4_ol_stateid *stp)
631{ 805{
632 __clear_bit(access, &stp->st_access_bmap); 806 unsigned char mask = 1 << access;
807
808 WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
809 stp->st_access_bmap &= ~mask;
633} 810}
634 811
635/* test whether a given stateid has access */ 812/* test whether a given stateid has access */
636static inline bool 813static inline bool
637test_access(u32 access, struct nfs4_ol_stateid *stp) 814test_access(u32 access, struct nfs4_ol_stateid *stp)
638{ 815{
639 return test_bit(access, &stp->st_access_bmap); 816 unsigned char mask = 1 << access;
817
818 return (bool)(stp->st_access_bmap & mask);
640} 819}
641 820
642/* set share deny for a given stateid */ 821/* set share deny for a given stateid */
643static inline void 822static inline void
644set_deny(u32 access, struct nfs4_ol_stateid *stp) 823set_deny(u32 deny, struct nfs4_ol_stateid *stp)
645{ 824{
646 __set_bit(access, &stp->st_deny_bmap); 825 unsigned char mask = 1 << deny;
826
827 WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
828 stp->st_deny_bmap |= mask;
647} 829}
648 830
649/* clear share deny for a given stateid */ 831/* clear share deny for a given stateid */
650static inline void 832static inline void
651clear_deny(u32 access, struct nfs4_ol_stateid *stp) 833clear_deny(u32 deny, struct nfs4_ol_stateid *stp)
652{ 834{
653 __clear_bit(access, &stp->st_deny_bmap); 835 unsigned char mask = 1 << deny;
836
837 WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
838 stp->st_deny_bmap &= ~mask;
654} 839}
655 840
656/* test whether a given stateid is denying specific access */ 841/* test whether a given stateid is denying specific access */
657static inline bool 842static inline bool
658test_deny(u32 access, struct nfs4_ol_stateid *stp) 843test_deny(u32 deny, struct nfs4_ol_stateid *stp)
659{ 844{
660 return test_bit(access, &stp->st_deny_bmap); 845 unsigned char mask = 1 << deny;
846
847 return (bool)(stp->st_deny_bmap & mask);
661} 848}
662 849
663static int nfs4_access_to_omode(u32 access) 850static int nfs4_access_to_omode(u32 access)
@@ -674,138 +861,283 @@ static int nfs4_access_to_omode(u32 access)
674 return O_RDONLY; 861 return O_RDONLY;
675} 862}
676 863
864/*
865 * A stateid that had a deny mode associated with it is being released
866 * or downgraded. Recalculate the deny mode on the file.
867 */
868static void
869recalculate_deny_mode(struct nfs4_file *fp)
870{
871 struct nfs4_ol_stateid *stp;
872
873 spin_lock(&fp->fi_lock);
874 fp->fi_share_deny = 0;
875 list_for_each_entry(stp, &fp->fi_stateids, st_perfile)
876 fp->fi_share_deny |= bmap_to_share_mode(stp->st_deny_bmap);
877 spin_unlock(&fp->fi_lock);
878}
879
880static void
881reset_union_bmap_deny(u32 deny, struct nfs4_ol_stateid *stp)
882{
883 int i;
884 bool change = false;
885
886 for (i = 1; i < 4; i++) {
887 if ((i & deny) != i) {
888 change = true;
889 clear_deny(i, stp);
890 }
891 }
892
893 /* Recalculate per-file deny mode if there was a change */
894 if (change)
895 recalculate_deny_mode(stp->st_stid.sc_file);
896}
897
677/* release all access and file references for a given stateid */ 898/* release all access and file references for a given stateid */
678static void 899static void
679release_all_access(struct nfs4_ol_stateid *stp) 900release_all_access(struct nfs4_ol_stateid *stp)
680{ 901{
681 int i; 902 int i;
903 struct nfs4_file *fp = stp->st_stid.sc_file;
904
905 if (fp && stp->st_deny_bmap != 0)
906 recalculate_deny_mode(fp);
682 907
683 for (i = 1; i < 4; i++) { 908 for (i = 1; i < 4; i++) {
684 if (test_access(i, stp)) 909 if (test_access(i, stp))
685 nfs4_file_put_access(stp->st_file, 910 nfs4_file_put_access(stp->st_stid.sc_file, i);
686 nfs4_access_to_omode(i));
687 clear_access(i, stp); 911 clear_access(i, stp);
688 } 912 }
689} 913}
690 914
691static void unhash_generic_stateid(struct nfs4_ol_stateid *stp) 915static void nfs4_put_stateowner(struct nfs4_stateowner *sop)
692{ 916{
917 struct nfs4_client *clp = sop->so_client;
918
919 might_lock(&clp->cl_lock);
920
921 if (!atomic_dec_and_lock(&sop->so_count, &clp->cl_lock))
922 return;
923 sop->so_ops->so_unhash(sop);
924 spin_unlock(&clp->cl_lock);
925 kfree(sop->so_owner.data);
926 sop->so_ops->so_free(sop);
927}
928
929static void unhash_ol_stateid(struct nfs4_ol_stateid *stp)
930{
931 struct nfs4_file *fp = stp->st_stid.sc_file;
932
933 lockdep_assert_held(&stp->st_stateowner->so_client->cl_lock);
934
935 spin_lock(&fp->fi_lock);
693 list_del(&stp->st_perfile); 936 list_del(&stp->st_perfile);
937 spin_unlock(&fp->fi_lock);
694 list_del(&stp->st_perstateowner); 938 list_del(&stp->st_perstateowner);
695} 939}
696 940
697static void close_generic_stateid(struct nfs4_ol_stateid *stp) 941static void nfs4_free_ol_stateid(struct nfs4_stid *stid)
698{ 942{
943 struct nfs4_ol_stateid *stp = openlockstateid(stid);
944
699 release_all_access(stp); 945 release_all_access(stp);
700 put_nfs4_file(stp->st_file); 946 if (stp->st_stateowner)
701 stp->st_file = NULL; 947 nfs4_put_stateowner(stp->st_stateowner);
948 kmem_cache_free(stateid_slab, stid);
702} 949}
703 950
704static void free_generic_stateid(struct nfs4_ol_stateid *stp) 951static void nfs4_free_lock_stateid(struct nfs4_stid *stid)
705{ 952{
706 remove_stid(&stp->st_stid); 953 struct nfs4_ol_stateid *stp = openlockstateid(stid);
707 nfs4_free_stid(stateid_slab, &stp->st_stid); 954 struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
955 struct file *file;
956
957 file = find_any_file(stp->st_stid.sc_file);
958 if (file)
959 filp_close(file, (fl_owner_t)lo);
960 nfs4_free_ol_stateid(stid);
708} 961}
709 962
710static void release_lock_stateid(struct nfs4_ol_stateid *stp) 963/*
964 * Put the persistent reference to an already unhashed generic stateid, while
965 * holding the cl_lock. If it's the last reference, then put it onto the
966 * reaplist for later destruction.
967 */
968static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp,
969 struct list_head *reaplist)
711{ 970{
712 struct file *file; 971 struct nfs4_stid *s = &stp->st_stid;
972 struct nfs4_client *clp = s->sc_client;
973
974 lockdep_assert_held(&clp->cl_lock);
713 975
714 unhash_generic_stateid(stp); 976 WARN_ON_ONCE(!list_empty(&stp->st_locks));
977
978 if (!atomic_dec_and_test(&s->sc_count)) {
979 wake_up_all(&close_wq);
980 return;
981 }
982
983 idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id);
984 list_add(&stp->st_locks, reaplist);
985}
986
987static void unhash_lock_stateid(struct nfs4_ol_stateid *stp)
988{
989 struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner);
990
991 lockdep_assert_held(&oo->oo_owner.so_client->cl_lock);
992
993 list_del_init(&stp->st_locks);
994 unhash_ol_stateid(stp);
715 unhash_stid(&stp->st_stid); 995 unhash_stid(&stp->st_stid);
716 file = find_any_file(stp->st_file);
717 if (file)
718 locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner));
719 close_generic_stateid(stp);
720 free_generic_stateid(stp);
721} 996}
722 997
723static void unhash_lockowner(struct nfs4_lockowner *lo) 998static void release_lock_stateid(struct nfs4_ol_stateid *stp)
724{ 999{
725 struct nfs4_ol_stateid *stp; 1000 struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner);
726 1001
727 list_del(&lo->lo_owner.so_strhash); 1002 spin_lock(&oo->oo_owner.so_client->cl_lock);
728 list_del(&lo->lo_perstateid); 1003 unhash_lock_stateid(stp);
729 list_del(&lo->lo_owner_ino_hash); 1004 spin_unlock(&oo->oo_owner.so_client->cl_lock);
730 while (!list_empty(&lo->lo_owner.so_stateids)) { 1005 nfs4_put_stid(&stp->st_stid);
731 stp = list_first_entry(&lo->lo_owner.so_stateids,
732 struct nfs4_ol_stateid, st_perstateowner);
733 release_lock_stateid(stp);
734 }
735} 1006}
736 1007
737static void nfs4_free_lockowner(struct nfs4_lockowner *lo) 1008static void unhash_lockowner_locked(struct nfs4_lockowner *lo)
738{ 1009{
739 kfree(lo->lo_owner.so_owner.data); 1010 struct nfs4_client *clp = lo->lo_owner.so_client;
740 kmem_cache_free(lockowner_slab, lo); 1011
1012 lockdep_assert_held(&clp->cl_lock);
1013
1014 list_del_init(&lo->lo_owner.so_strhash);
1015}
1016
1017/*
1018 * Free a list of generic stateids that were collected earlier after being
1019 * fully unhashed.
1020 */
1021static void
1022free_ol_stateid_reaplist(struct list_head *reaplist)
1023{
1024 struct nfs4_ol_stateid *stp;
1025 struct nfs4_file *fp;
1026
1027 might_sleep();
1028
1029 while (!list_empty(reaplist)) {
1030 stp = list_first_entry(reaplist, struct nfs4_ol_stateid,
1031 st_locks);
1032 list_del(&stp->st_locks);
1033 fp = stp->st_stid.sc_file;
1034 stp->st_stid.sc_free(&stp->st_stid);
1035 if (fp)
1036 put_nfs4_file(fp);
1037 }
741} 1038}
742 1039
743static void release_lockowner(struct nfs4_lockowner *lo) 1040static void release_lockowner(struct nfs4_lockowner *lo)
744{ 1041{
745 unhash_lockowner(lo); 1042 struct nfs4_client *clp = lo->lo_owner.so_client;
746 nfs4_free_lockowner(lo); 1043 struct nfs4_ol_stateid *stp;
1044 struct list_head reaplist;
1045
1046 INIT_LIST_HEAD(&reaplist);
1047
1048 spin_lock(&clp->cl_lock);
1049 unhash_lockowner_locked(lo);
1050 while (!list_empty(&lo->lo_owner.so_stateids)) {
1051 stp = list_first_entry(&lo->lo_owner.so_stateids,
1052 struct nfs4_ol_stateid, st_perstateowner);
1053 unhash_lock_stateid(stp);
1054 put_ol_stateid_locked(stp, &reaplist);
1055 }
1056 spin_unlock(&clp->cl_lock);
1057 free_ol_stateid_reaplist(&reaplist);
1058 nfs4_put_stateowner(&lo->lo_owner);
747} 1059}
748 1060
749static void 1061static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp,
750release_stateid_lockowners(struct nfs4_ol_stateid *open_stp) 1062 struct list_head *reaplist)
751{ 1063{
752 struct nfs4_lockowner *lo; 1064 struct nfs4_ol_stateid *stp;
753 1065
754 while (!list_empty(&open_stp->st_lockowners)) { 1066 while (!list_empty(&open_stp->st_locks)) {
755 lo = list_entry(open_stp->st_lockowners.next, 1067 stp = list_entry(open_stp->st_locks.next,
756 struct nfs4_lockowner, lo_perstateid); 1068 struct nfs4_ol_stateid, st_locks);
757 release_lockowner(lo); 1069 unhash_lock_stateid(stp);
1070 put_ol_stateid_locked(stp, reaplist);
758 } 1071 }
759} 1072}
760 1073
761static void unhash_open_stateid(struct nfs4_ol_stateid *stp) 1074static void unhash_open_stateid(struct nfs4_ol_stateid *stp,
1075 struct list_head *reaplist)
762{ 1076{
763 unhash_generic_stateid(stp); 1077 lockdep_assert_held(&stp->st_stid.sc_client->cl_lock);
764 release_stateid_lockowners(stp); 1078
765 close_generic_stateid(stp); 1079 unhash_ol_stateid(stp);
1080 release_open_stateid_locks(stp, reaplist);
766} 1081}
767 1082
768static void release_open_stateid(struct nfs4_ol_stateid *stp) 1083static void release_open_stateid(struct nfs4_ol_stateid *stp)
769{ 1084{
770 unhash_open_stateid(stp); 1085 LIST_HEAD(reaplist);
771 free_generic_stateid(stp); 1086
1087 spin_lock(&stp->st_stid.sc_client->cl_lock);
1088 unhash_open_stateid(stp, &reaplist);
1089 put_ol_stateid_locked(stp, &reaplist);
1090 spin_unlock(&stp->st_stid.sc_client->cl_lock);
1091 free_ol_stateid_reaplist(&reaplist);
772} 1092}
773 1093
774static void unhash_openowner(struct nfs4_openowner *oo) 1094static void unhash_openowner_locked(struct nfs4_openowner *oo)
775{ 1095{
776 struct nfs4_ol_stateid *stp; 1096 struct nfs4_client *clp = oo->oo_owner.so_client;
777 1097
778 list_del(&oo->oo_owner.so_strhash); 1098 lockdep_assert_held(&clp->cl_lock);
779 list_del(&oo->oo_perclient); 1099
780 while (!list_empty(&oo->oo_owner.so_stateids)) { 1100 list_del_init(&oo->oo_owner.so_strhash);
781 stp = list_first_entry(&oo->oo_owner.so_stateids, 1101 list_del_init(&oo->oo_perclient);
782 struct nfs4_ol_stateid, st_perstateowner);
783 release_open_stateid(stp);
784 }
785} 1102}
786 1103
787static void release_last_closed_stateid(struct nfs4_openowner *oo) 1104static void release_last_closed_stateid(struct nfs4_openowner *oo)
788{ 1105{
789 struct nfs4_ol_stateid *s = oo->oo_last_closed_stid; 1106 struct nfsd_net *nn = net_generic(oo->oo_owner.so_client->net,
1107 nfsd_net_id);
1108 struct nfs4_ol_stateid *s;
790 1109
1110 spin_lock(&nn->client_lock);
1111 s = oo->oo_last_closed_stid;
791 if (s) { 1112 if (s) {
792 free_generic_stateid(s); 1113 list_del_init(&oo->oo_close_lru);
793 oo->oo_last_closed_stid = NULL; 1114 oo->oo_last_closed_stid = NULL;
794 } 1115 }
795} 1116 spin_unlock(&nn->client_lock);
796 1117 if (s)
797static void nfs4_free_openowner(struct nfs4_openowner *oo) 1118 nfs4_put_stid(&s->st_stid);
798{
799 kfree(oo->oo_owner.so_owner.data);
800 kmem_cache_free(openowner_slab, oo);
801} 1119}
802 1120
803static void release_openowner(struct nfs4_openowner *oo) 1121static void release_openowner(struct nfs4_openowner *oo)
804{ 1122{
805 unhash_openowner(oo); 1123 struct nfs4_ol_stateid *stp;
806 list_del(&oo->oo_close_lru); 1124 struct nfs4_client *clp = oo->oo_owner.so_client;
1125 struct list_head reaplist;
1126
1127 INIT_LIST_HEAD(&reaplist);
1128
1129 spin_lock(&clp->cl_lock);
1130 unhash_openowner_locked(oo);
1131 while (!list_empty(&oo->oo_owner.so_stateids)) {
1132 stp = list_first_entry(&oo->oo_owner.so_stateids,
1133 struct nfs4_ol_stateid, st_perstateowner);
1134 unhash_open_stateid(stp, &reaplist);
1135 put_ol_stateid_locked(stp, &reaplist);
1136 }
1137 spin_unlock(&clp->cl_lock);
1138 free_ol_stateid_reaplist(&reaplist);
807 release_last_closed_stateid(oo); 1139 release_last_closed_stateid(oo);
808 nfs4_free_openowner(oo); 1140 nfs4_put_stateowner(&oo->oo_owner);
809} 1141}
810 1142
811static inline int 1143static inline int
@@ -842,7 +1174,7 @@ void nfsd4_bump_seqid(struct nfsd4_compound_state *cstate, __be32 nfserr)
842 return; 1174 return;
843 1175
844 if (!seqid_mutating_err(ntohl(nfserr))) { 1176 if (!seqid_mutating_err(ntohl(nfserr))) {
845 cstate->replay_owner = NULL; 1177 nfsd4_cstate_clear_replay(cstate);
846 return; 1178 return;
847 } 1179 }
848 if (!so) 1180 if (!so)
@@ -1030,10 +1362,8 @@ static void nfsd4_init_conn(struct svc_rqst *rqstp, struct nfsd4_conn *conn, str
1030 if (ret) 1362 if (ret)
1031 /* oops; xprt is already down: */ 1363 /* oops; xprt is already down: */
1032 nfsd4_conn_lost(&conn->cn_xpt_user); 1364 nfsd4_conn_lost(&conn->cn_xpt_user);
1033 if (conn->cn_flags & NFS4_CDFC4_BACK) { 1365 /* We may have gained or lost a callback channel: */
1034 /* callback channel may be back up */ 1366 nfsd4_probe_callback_sync(ses->se_client);
1035 nfsd4_probe_callback(ses->se_client);
1036 }
1037} 1367}
1038 1368
1039static struct nfsd4_conn *alloc_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_create_session *cses) 1369static struct nfsd4_conn *alloc_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_create_session *cses)
@@ -1073,9 +1403,6 @@ static void __free_session(struct nfsd4_session *ses)
1073 1403
1074static void free_session(struct nfsd4_session *ses) 1404static void free_session(struct nfsd4_session *ses)
1075{ 1405{
1076 struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
1077
1078 lockdep_assert_held(&nn->client_lock);
1079 nfsd4_del_conns(ses); 1406 nfsd4_del_conns(ses);
1080 nfsd4_put_drc_mem(&ses->se_fchannel); 1407 nfsd4_put_drc_mem(&ses->se_fchannel);
1081 __free_session(ses); 1408 __free_session(ses);
@@ -1097,12 +1424,10 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
1097 new->se_cb_sec = cses->cb_sec; 1424 new->se_cb_sec = cses->cb_sec;
1098 atomic_set(&new->se_ref, 0); 1425 atomic_set(&new->se_ref, 0);
1099 idx = hash_sessionid(&new->se_sessionid); 1426 idx = hash_sessionid(&new->se_sessionid);
1100 spin_lock(&nn->client_lock);
1101 list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]); 1427 list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
1102 spin_lock(&clp->cl_lock); 1428 spin_lock(&clp->cl_lock);
1103 list_add(&new->se_perclnt, &clp->cl_sessions); 1429 list_add(&new->se_perclnt, &clp->cl_sessions);
1104 spin_unlock(&clp->cl_lock); 1430 spin_unlock(&clp->cl_lock);
1105 spin_unlock(&nn->client_lock);
1106 1431
1107 if (cses->flags & SESSION4_BACK_CHAN) { 1432 if (cses->flags & SESSION4_BACK_CHAN) {
1108 struct sockaddr *sa = svc_addr(rqstp); 1433 struct sockaddr *sa = svc_addr(rqstp);
@@ -1120,12 +1445,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
1120 1445
1121/* caller must hold client_lock */ 1446/* caller must hold client_lock */
1122static struct nfsd4_session * 1447static struct nfsd4_session *
1123find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net) 1448__find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)
1124{ 1449{
1125 struct nfsd4_session *elem; 1450 struct nfsd4_session *elem;
1126 int idx; 1451 int idx;
1127 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 1452 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
1128 1453
1454 lockdep_assert_held(&nn->client_lock);
1455
1129 dump_sessionid(__func__, sessionid); 1456 dump_sessionid(__func__, sessionid);
1130 idx = hash_sessionid(sessionid); 1457 idx = hash_sessionid(sessionid);
1131 /* Search in the appropriate list */ 1458 /* Search in the appropriate list */
@@ -1140,10 +1467,33 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)
1140 return NULL; 1467 return NULL;
1141} 1468}
1142 1469
1470static struct nfsd4_session *
1471find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net,
1472 __be32 *ret)
1473{
1474 struct nfsd4_session *session;
1475 __be32 status = nfserr_badsession;
1476
1477 session = __find_in_sessionid_hashtbl(sessionid, net);
1478 if (!session)
1479 goto out;
1480 status = nfsd4_get_session_locked(session);
1481 if (status)
1482 session = NULL;
1483out:
1484 *ret = status;
1485 return session;
1486}
1487
1143/* caller must hold client_lock */ 1488/* caller must hold client_lock */
1144static void 1489static void
1145unhash_session(struct nfsd4_session *ses) 1490unhash_session(struct nfsd4_session *ses)
1146{ 1491{
1492 struct nfs4_client *clp = ses->se_client;
1493 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1494
1495 lockdep_assert_held(&nn->client_lock);
1496
1147 list_del(&ses->se_hash); 1497 list_del(&ses->se_hash);
1148 spin_lock(&ses->se_client->cl_lock); 1498 spin_lock(&ses->se_client->cl_lock);
1149 list_del(&ses->se_perclnt); 1499 list_del(&ses->se_perclnt);
@@ -1169,15 +1519,20 @@ STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
1169static struct nfs4_client *alloc_client(struct xdr_netobj name) 1519static struct nfs4_client *alloc_client(struct xdr_netobj name)
1170{ 1520{
1171 struct nfs4_client *clp; 1521 struct nfs4_client *clp;
1522 int i;
1172 1523
1173 clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL); 1524 clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL);
1174 if (clp == NULL) 1525 if (clp == NULL)
1175 return NULL; 1526 return NULL;
1176 clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL); 1527 clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL);
1177 if (clp->cl_name.data == NULL) { 1528 if (clp->cl_name.data == NULL)
1178 kfree(clp); 1529 goto err_no_name;
1179 return NULL; 1530 clp->cl_ownerstr_hashtbl = kmalloc(sizeof(struct list_head) *
1180 } 1531 OWNER_HASH_SIZE, GFP_KERNEL);
1532 if (!clp->cl_ownerstr_hashtbl)
1533 goto err_no_hashtbl;
1534 for (i = 0; i < OWNER_HASH_SIZE; i++)
1535 INIT_LIST_HEAD(&clp->cl_ownerstr_hashtbl[i]);
1181 clp->cl_name.len = name.len; 1536 clp->cl_name.len = name.len;
1182 INIT_LIST_HEAD(&clp->cl_sessions); 1537 INIT_LIST_HEAD(&clp->cl_sessions);
1183 idr_init(&clp->cl_stateids); 1538 idr_init(&clp->cl_stateids);
@@ -1192,14 +1547,16 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
1192 spin_lock_init(&clp->cl_lock); 1547 spin_lock_init(&clp->cl_lock);
1193 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); 1548 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
1194 return clp; 1549 return clp;
1550err_no_hashtbl:
1551 kfree(clp->cl_name.data);
1552err_no_name:
1553 kfree(clp);
1554 return NULL;
1195} 1555}
1196 1556
1197static void 1557static void
1198free_client(struct nfs4_client *clp) 1558free_client(struct nfs4_client *clp)
1199{ 1559{
1200 struct nfsd_net __maybe_unused *nn = net_generic(clp->net, nfsd_net_id);
1201
1202 lockdep_assert_held(&nn->client_lock);
1203 while (!list_empty(&clp->cl_sessions)) { 1560 while (!list_empty(&clp->cl_sessions)) {
1204 struct nfsd4_session *ses; 1561 struct nfsd4_session *ses;
1205 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, 1562 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
@@ -1210,18 +1567,32 @@ free_client(struct nfs4_client *clp)
1210 } 1567 }
1211 rpc_destroy_wait_queue(&clp->cl_cb_waitq); 1568 rpc_destroy_wait_queue(&clp->cl_cb_waitq);
1212 free_svc_cred(&clp->cl_cred); 1569 free_svc_cred(&clp->cl_cred);
1570 kfree(clp->cl_ownerstr_hashtbl);
1213 kfree(clp->cl_name.data); 1571 kfree(clp->cl_name.data);
1214 idr_destroy(&clp->cl_stateids); 1572 idr_destroy(&clp->cl_stateids);
1215 kfree(clp); 1573 kfree(clp);
1216} 1574}
1217 1575
1218/* must be called under the client_lock */ 1576/* must be called under the client_lock */
1219static inline void 1577static void
1220unhash_client_locked(struct nfs4_client *clp) 1578unhash_client_locked(struct nfs4_client *clp)
1221{ 1579{
1580 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1222 struct nfsd4_session *ses; 1581 struct nfsd4_session *ses;
1223 1582
1224 list_del(&clp->cl_lru); 1583 lockdep_assert_held(&nn->client_lock);
1584
1585 /* Mark the client as expired! */
1586 clp->cl_time = 0;
1587 /* Make it invisible */
1588 if (!list_empty(&clp->cl_idhash)) {
1589 list_del_init(&clp->cl_idhash);
1590 if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
1591 rb_erase(&clp->cl_namenode, &nn->conf_name_tree);
1592 else
1593 rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
1594 }
1595 list_del_init(&clp->cl_lru);
1225 spin_lock(&clp->cl_lock); 1596 spin_lock(&clp->cl_lock);
1226 list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) 1597 list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
1227 list_del_init(&ses->se_hash); 1598 list_del_init(&ses->se_hash);
@@ -1229,53 +1600,71 @@ unhash_client_locked(struct nfs4_client *clp)
1229} 1600}
1230 1601
1231static void 1602static void
1232destroy_client(struct nfs4_client *clp) 1603unhash_client(struct nfs4_client *clp)
1604{
1605 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1606
1607 spin_lock(&nn->client_lock);
1608 unhash_client_locked(clp);
1609 spin_unlock(&nn->client_lock);
1610}
1611
1612static __be32 mark_client_expired_locked(struct nfs4_client *clp)
1613{
1614 if (atomic_read(&clp->cl_refcount))
1615 return nfserr_jukebox;
1616 unhash_client_locked(clp);
1617 return nfs_ok;
1618}
1619
1620static void
1621__destroy_client(struct nfs4_client *clp)
1233{ 1622{
1234 struct nfs4_openowner *oo; 1623 struct nfs4_openowner *oo;
1235 struct nfs4_delegation *dp; 1624 struct nfs4_delegation *dp;
1236 struct list_head reaplist; 1625 struct list_head reaplist;
1237 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1238 1626
1239 INIT_LIST_HEAD(&reaplist); 1627 INIT_LIST_HEAD(&reaplist);
1240 spin_lock(&state_lock); 1628 spin_lock(&state_lock);
1241 while (!list_empty(&clp->cl_delegations)) { 1629 while (!list_empty(&clp->cl_delegations)) {
1242 dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); 1630 dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
1243 list_del_init(&dp->dl_perclnt); 1631 unhash_delegation_locked(dp);
1244 list_move(&dp->dl_recall_lru, &reaplist); 1632 list_add(&dp->dl_recall_lru, &reaplist);
1245 } 1633 }
1246 spin_unlock(&state_lock); 1634 spin_unlock(&state_lock);
1247 while (!list_empty(&reaplist)) { 1635 while (!list_empty(&reaplist)) {
1248 dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); 1636 dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
1249 destroy_delegation(dp); 1637 list_del_init(&dp->dl_recall_lru);
1638 nfs4_put_stid(&dp->dl_stid);
1250 } 1639 }
1251 list_splice_init(&clp->cl_revoked, &reaplist); 1640 while (!list_empty(&clp->cl_revoked)) {
1252 while (!list_empty(&reaplist)) {
1253 dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); 1641 dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
1254 destroy_revoked_delegation(dp); 1642 list_del_init(&dp->dl_recall_lru);
1643 nfs4_put_stid(&dp->dl_stid);
1255 } 1644 }
1256 while (!list_empty(&clp->cl_openowners)) { 1645 while (!list_empty(&clp->cl_openowners)) {
1257 oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); 1646 oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient);
1647 atomic_inc(&oo->oo_owner.so_count);
1258 release_openowner(oo); 1648 release_openowner(oo);
1259 } 1649 }
1260 nfsd4_shutdown_callback(clp); 1650 nfsd4_shutdown_callback(clp);
1261 if (clp->cl_cb_conn.cb_xprt) 1651 if (clp->cl_cb_conn.cb_xprt)
1262 svc_xprt_put(clp->cl_cb_conn.cb_xprt); 1652 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
1263 list_del(&clp->cl_idhash);
1264 if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
1265 rb_erase(&clp->cl_namenode, &nn->conf_name_tree);
1266 else
1267 rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
1268 spin_lock(&nn->client_lock);
1269 unhash_client_locked(clp);
1270 WARN_ON_ONCE(atomic_read(&clp->cl_refcount));
1271 free_client(clp); 1653 free_client(clp);
1272 spin_unlock(&nn->client_lock); 1654}
1655
1656static void
1657destroy_client(struct nfs4_client *clp)
1658{
1659 unhash_client(clp);
1660 __destroy_client(clp);
1273} 1661}
1274 1662
1275static void expire_client(struct nfs4_client *clp) 1663static void expire_client(struct nfs4_client *clp)
1276{ 1664{
1665 unhash_client(clp);
1277 nfsd4_client_record_remove(clp); 1666 nfsd4_client_record_remove(clp);
1278 destroy_client(clp); 1667 __destroy_client(clp);
1279} 1668}
1280 1669
1281static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) 1670static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
@@ -1408,25 +1797,28 @@ static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
1408 return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal); 1797 return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
1409} 1798}
1410 1799
1411static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn) 1800static void gen_confirm(struct nfs4_client *clp, struct nfsd_net *nn)
1412{ 1801{
1413 static u32 current_clientid = 1; 1802 __be32 verf[2];
1414 1803
1415 clp->cl_clientid.cl_boot = nn->boot_time; 1804 /*
1416 clp->cl_clientid.cl_id = current_clientid++; 1805 * This is opaque to client, so no need to byte-swap. Use
1806 * __force to keep sparse happy
1807 */
1808 verf[0] = (__force __be32)get_seconds();
1809 verf[1] = (__force __be32)nn->clientid_counter;
1810 memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data));
1417} 1811}
1418 1812
1419static void gen_confirm(struct nfs4_client *clp) 1813static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
1420{ 1814{
1421 __be32 verf[2]; 1815 clp->cl_clientid.cl_boot = nn->boot_time;
1422 static u32 i; 1816 clp->cl_clientid.cl_id = nn->clientid_counter++;
1423 1817 gen_confirm(clp, nn);
1424 verf[0] = (__be32)get_seconds();
1425 verf[1] = (__be32)i++;
1426 memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data));
1427} 1818}
1428 1819
1429static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t) 1820static struct nfs4_stid *
1821find_stateid_locked(struct nfs4_client *cl, stateid_t *t)
1430{ 1822{
1431 struct nfs4_stid *ret; 1823 struct nfs4_stid *ret;
1432 1824
@@ -1436,16 +1828,21 @@ static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t)
1436 return ret; 1828 return ret;
1437} 1829}
1438 1830
1439static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) 1831static struct nfs4_stid *
1832find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
1440{ 1833{
1441 struct nfs4_stid *s; 1834 struct nfs4_stid *s;
1442 1835
1443 s = find_stateid(cl, t); 1836 spin_lock(&cl->cl_lock);
1444 if (!s) 1837 s = find_stateid_locked(cl, t);
1445 return NULL; 1838 if (s != NULL) {
1446 if (typemask & s->sc_type) 1839 if (typemask & s->sc_type)
1447 return s; 1840 atomic_inc(&s->sc_count);
1448 return NULL; 1841 else
1842 s = NULL;
1843 }
1844 spin_unlock(&cl->cl_lock);
1845 return s;
1449} 1846}
1450 1847
1451static struct nfs4_client *create_client(struct xdr_netobj name, 1848static struct nfs4_client *create_client(struct xdr_netobj name,
@@ -1455,7 +1852,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
1455 struct sockaddr *sa = svc_addr(rqstp); 1852 struct sockaddr *sa = svc_addr(rqstp);
1456 int ret; 1853 int ret;
1457 struct net *net = SVC_NET(rqstp); 1854 struct net *net = SVC_NET(rqstp);
1458 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
1459 1855
1460 clp = alloc_client(name); 1856 clp = alloc_client(name);
1461 if (clp == NULL) 1857 if (clp == NULL)
@@ -1463,17 +1859,14 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
1463 1859
1464 ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); 1860 ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
1465 if (ret) { 1861 if (ret) {
1466 spin_lock(&nn->client_lock);
1467 free_client(clp); 1862 free_client(clp);
1468 spin_unlock(&nn->client_lock);
1469 return NULL; 1863 return NULL;
1470 } 1864 }
1471 nfsd4_init_callback(&clp->cl_cb_null); 1865 INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_run_cb_null);
1472 clp->cl_time = get_seconds(); 1866 clp->cl_time = get_seconds();
1473 clear_bit(0, &clp->cl_cb_slot_busy); 1867 clear_bit(0, &clp->cl_cb_slot_busy);
1474 copy_verf(clp, verf); 1868 copy_verf(clp, verf);
1475 rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); 1869 rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
1476 gen_confirm(clp);
1477 clp->cl_cb_session = NULL; 1870 clp->cl_cb_session = NULL;
1478 clp->net = net; 1871 clp->net = net;
1479 return clp; 1872 return clp;
@@ -1525,11 +1918,13 @@ add_to_unconfirmed(struct nfs4_client *clp)
1525 unsigned int idhashval; 1918 unsigned int idhashval;
1526 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 1919 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1527 1920
1921 lockdep_assert_held(&nn->client_lock);
1922
1528 clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); 1923 clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
1529 add_clp_to_name_tree(clp, &nn->unconf_name_tree); 1924 add_clp_to_name_tree(clp, &nn->unconf_name_tree);
1530 idhashval = clientid_hashval(clp->cl_clientid.cl_id); 1925 idhashval = clientid_hashval(clp->cl_clientid.cl_id);
1531 list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]); 1926 list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]);
1532 renew_client(clp); 1927 renew_client_locked(clp);
1533} 1928}
1534 1929
1535static void 1930static void
@@ -1538,12 +1933,14 @@ move_to_confirmed(struct nfs4_client *clp)
1538 unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id); 1933 unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id);
1539 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 1934 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1540 1935
1936 lockdep_assert_held(&nn->client_lock);
1937
1541 dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); 1938 dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
1542 list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]); 1939 list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]);
1543 rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); 1940 rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
1544 add_clp_to_name_tree(clp, &nn->conf_name_tree); 1941 add_clp_to_name_tree(clp, &nn->conf_name_tree);
1545 set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); 1942 set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
1546 renew_client(clp); 1943 renew_client_locked(clp);
1547} 1944}
1548 1945
1549static struct nfs4_client * 1946static struct nfs4_client *
@@ -1556,7 +1953,7 @@ find_client_in_id_table(struct list_head *tbl, clientid_t *clid, bool sessions)
1556 if (same_clid(&clp->cl_clientid, clid)) { 1953 if (same_clid(&clp->cl_clientid, clid)) {
1557 if ((bool)clp->cl_minorversion != sessions) 1954 if ((bool)clp->cl_minorversion != sessions)
1558 return NULL; 1955 return NULL;
1559 renew_client(clp); 1956 renew_client_locked(clp);
1560 return clp; 1957 return clp;
1561 } 1958 }
1562 } 1959 }
@@ -1568,6 +1965,7 @@ find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
1568{ 1965{
1569 struct list_head *tbl = nn->conf_id_hashtbl; 1966 struct list_head *tbl = nn->conf_id_hashtbl;
1570 1967
1968 lockdep_assert_held(&nn->client_lock);
1571 return find_client_in_id_table(tbl, clid, sessions); 1969 return find_client_in_id_table(tbl, clid, sessions);
1572} 1970}
1573 1971
@@ -1576,6 +1974,7 @@ find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
1576{ 1974{
1577 struct list_head *tbl = nn->unconf_id_hashtbl; 1975 struct list_head *tbl = nn->unconf_id_hashtbl;
1578 1976
1977 lockdep_assert_held(&nn->client_lock);
1579 return find_client_in_id_table(tbl, clid, sessions); 1978 return find_client_in_id_table(tbl, clid, sessions);
1580} 1979}
1581 1980
@@ -1587,12 +1986,14 @@ static bool clp_used_exchangeid(struct nfs4_client *clp)
1587static struct nfs4_client * 1986static struct nfs4_client *
1588find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn) 1987find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
1589{ 1988{
1989 lockdep_assert_held(&nn->client_lock);
1590 return find_clp_in_name_tree(name, &nn->conf_name_tree); 1990 return find_clp_in_name_tree(name, &nn->conf_name_tree);
1591} 1991}
1592 1992
1593static struct nfs4_client * 1993static struct nfs4_client *
1594find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn) 1994find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
1595{ 1995{
1996 lockdep_assert_held(&nn->client_lock);
1596 return find_clp_in_name_tree(name, &nn->unconf_name_tree); 1997 return find_clp_in_name_tree(name, &nn->unconf_name_tree);
1597} 1998}
1598 1999
@@ -1642,7 +2043,7 @@ out_err:
1642/* 2043/*
1643 * Cache a reply. nfsd4_check_resp_size() has bounded the cache size. 2044 * Cache a reply. nfsd4_check_resp_size() has bounded the cache size.
1644 */ 2045 */
1645void 2046static void
1646nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) 2047nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
1647{ 2048{
1648 struct xdr_buf *buf = resp->xdr.buf; 2049 struct xdr_buf *buf = resp->xdr.buf;
@@ -1758,7 +2159,8 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1758 struct nfsd4_compound_state *cstate, 2159 struct nfsd4_compound_state *cstate,
1759 struct nfsd4_exchange_id *exid) 2160 struct nfsd4_exchange_id *exid)
1760{ 2161{
1761 struct nfs4_client *unconf, *conf, *new; 2162 struct nfs4_client *conf, *new;
2163 struct nfs4_client *unconf = NULL;
1762 __be32 status; 2164 __be32 status;
1763 char addr_str[INET6_ADDRSTRLEN]; 2165 char addr_str[INET6_ADDRSTRLEN];
1764 nfs4_verifier verf = exid->verifier; 2166 nfs4_verifier verf = exid->verifier;
@@ -1787,8 +2189,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1787 return nfserr_encr_alg_unsupp; 2189 return nfserr_encr_alg_unsupp;
1788 } 2190 }
1789 2191
2192 new = create_client(exid->clname, rqstp, &verf);
2193 if (new == NULL)
2194 return nfserr_jukebox;
2195
1790 /* Cases below refer to rfc 5661 section 18.35.4: */ 2196 /* Cases below refer to rfc 5661 section 18.35.4: */
1791 nfs4_lock_state(); 2197 spin_lock(&nn->client_lock);
1792 conf = find_confirmed_client_by_name(&exid->clname, nn); 2198 conf = find_confirmed_client_by_name(&exid->clname, nn);
1793 if (conf) { 2199 if (conf) {
1794 bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred); 2200 bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred);
@@ -1813,7 +2219,6 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1813 } 2219 }
1814 /* case 6 */ 2220 /* case 6 */
1815 exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; 2221 exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
1816 new = conf;
1817 goto out_copy; 2222 goto out_copy;
1818 } 2223 }
1819 if (!creds_match) { /* case 3 */ 2224 if (!creds_match) { /* case 3 */
@@ -1821,15 +2226,14 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1821 status = nfserr_clid_inuse; 2226 status = nfserr_clid_inuse;
1822 goto out; 2227 goto out;
1823 } 2228 }
1824 expire_client(conf);
1825 goto out_new; 2229 goto out_new;
1826 } 2230 }
1827 if (verfs_match) { /* case 2 */ 2231 if (verfs_match) { /* case 2 */
1828 conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R; 2232 conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
1829 new = conf;
1830 goto out_copy; 2233 goto out_copy;
1831 } 2234 }
1832 /* case 5, client reboot */ 2235 /* case 5, client reboot */
2236 conf = NULL;
1833 goto out_new; 2237 goto out_new;
1834 } 2238 }
1835 2239
@@ -1840,33 +2244,38 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1840 2244
1841 unconf = find_unconfirmed_client_by_name(&exid->clname, nn); 2245 unconf = find_unconfirmed_client_by_name(&exid->clname, nn);
1842 if (unconf) /* case 4, possible retry or client restart */ 2246 if (unconf) /* case 4, possible retry or client restart */
1843 expire_client(unconf); 2247 unhash_client_locked(unconf);
1844 2248
1845 /* case 1 (normal case) */ 2249 /* case 1 (normal case) */
1846out_new: 2250out_new:
1847 new = create_client(exid->clname, rqstp, &verf); 2251 if (conf) {
1848 if (new == NULL) { 2252 status = mark_client_expired_locked(conf);
1849 status = nfserr_jukebox; 2253 if (status)
1850 goto out; 2254 goto out;
1851 } 2255 }
1852 new->cl_minorversion = cstate->minorversion; 2256 new->cl_minorversion = cstate->minorversion;
1853 new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED); 2257 new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED);
1854 2258
1855 gen_clid(new, nn); 2259 gen_clid(new, nn);
1856 add_to_unconfirmed(new); 2260 add_to_unconfirmed(new);
2261 swap(new, conf);
1857out_copy: 2262out_copy:
1858 exid->clientid.cl_boot = new->cl_clientid.cl_boot; 2263 exid->clientid.cl_boot = conf->cl_clientid.cl_boot;
1859 exid->clientid.cl_id = new->cl_clientid.cl_id; 2264 exid->clientid.cl_id = conf->cl_clientid.cl_id;
1860 2265
1861 exid->seqid = new->cl_cs_slot.sl_seqid + 1; 2266 exid->seqid = conf->cl_cs_slot.sl_seqid + 1;
1862 nfsd4_set_ex_flags(new, exid); 2267 nfsd4_set_ex_flags(conf, exid);
1863 2268
1864 dprintk("nfsd4_exchange_id seqid %d flags %x\n", 2269 dprintk("nfsd4_exchange_id seqid %d flags %x\n",
1865 new->cl_cs_slot.sl_seqid, new->cl_exchange_flags); 2270 conf->cl_cs_slot.sl_seqid, conf->cl_exchange_flags);
1866 status = nfs_ok; 2271 status = nfs_ok;
1867 2272
1868out: 2273out:
1869 nfs4_unlock_state(); 2274 spin_unlock(&nn->client_lock);
2275 if (new)
2276 expire_client(new);
2277 if (unconf)
2278 expire_client(unconf);
1870 return status; 2279 return status;
1871} 2280}
1872 2281
@@ -2010,6 +2419,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
2010{ 2419{
2011 struct sockaddr *sa = svc_addr(rqstp); 2420 struct sockaddr *sa = svc_addr(rqstp);
2012 struct nfs4_client *conf, *unconf; 2421 struct nfs4_client *conf, *unconf;
2422 struct nfs4_client *old = NULL;
2013 struct nfsd4_session *new; 2423 struct nfsd4_session *new;
2014 struct nfsd4_conn *conn; 2424 struct nfsd4_conn *conn;
2015 struct nfsd4_clid_slot *cs_slot = NULL; 2425 struct nfsd4_clid_slot *cs_slot = NULL;
@@ -2035,7 +2445,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
2035 if (!conn) 2445 if (!conn)
2036 goto out_free_session; 2446 goto out_free_session;
2037 2447
2038 nfs4_lock_state(); 2448 spin_lock(&nn->client_lock);
2039 unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn); 2449 unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn);
2040 conf = find_confirmed_client(&cr_ses->clientid, true, nn); 2450 conf = find_confirmed_client(&cr_ses->clientid, true, nn);
2041 WARN_ON_ONCE(conf && unconf); 2451 WARN_ON_ONCE(conf && unconf);
@@ -2054,7 +2464,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
2054 goto out_free_conn; 2464 goto out_free_conn;
2055 } 2465 }
2056 } else if (unconf) { 2466 } else if (unconf) {
2057 struct nfs4_client *old;
2058 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || 2467 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
2059 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { 2468 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
2060 status = nfserr_clid_inuse; 2469 status = nfserr_clid_inuse;
@@ -2072,10 +2481,11 @@ nfsd4_create_session(struct svc_rqst *rqstp,
2072 } 2481 }
2073 old = find_confirmed_client_by_name(&unconf->cl_name, nn); 2482 old = find_confirmed_client_by_name(&unconf->cl_name, nn);
2074 if (old) { 2483 if (old) {
2075 status = mark_client_expired(old); 2484 status = mark_client_expired_locked(old);
2076 if (status) 2485 if (status) {
2486 old = NULL;
2077 goto out_free_conn; 2487 goto out_free_conn;
2078 expire_client(old); 2488 }
2079 } 2489 }
2080 move_to_confirmed(unconf); 2490 move_to_confirmed(unconf);
2081 conf = unconf; 2491 conf = unconf;
@@ -2091,20 +2501,27 @@ nfsd4_create_session(struct svc_rqst *rqstp,
2091 cr_ses->flags &= ~SESSION4_RDMA; 2501 cr_ses->flags &= ~SESSION4_RDMA;
2092 2502
2093 init_session(rqstp, new, conf, cr_ses); 2503 init_session(rqstp, new, conf, cr_ses);
2094 nfsd4_init_conn(rqstp, conn, new); 2504 nfsd4_get_session_locked(new);
2095 2505
2096 memcpy(cr_ses->sessionid.data, new->se_sessionid.data, 2506 memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
2097 NFS4_MAX_SESSIONID_LEN); 2507 NFS4_MAX_SESSIONID_LEN);
2098 cs_slot->sl_seqid++; 2508 cs_slot->sl_seqid++;
2099 cr_ses->seqid = cs_slot->sl_seqid; 2509 cr_ses->seqid = cs_slot->sl_seqid;
2100 2510
2101 /* cache solo and embedded create sessions under the state lock */ 2511 /* cache solo and embedded create sessions under the client_lock */
2102 nfsd4_cache_create_session(cr_ses, cs_slot, status); 2512 nfsd4_cache_create_session(cr_ses, cs_slot, status);
2103 nfs4_unlock_state(); 2513 spin_unlock(&nn->client_lock);
2514 /* init connection and backchannel */
2515 nfsd4_init_conn(rqstp, conn, new);
2516 nfsd4_put_session(new);
2517 if (old)
2518 expire_client(old);
2104 return status; 2519 return status;
2105out_free_conn: 2520out_free_conn:
2106 nfs4_unlock_state(); 2521 spin_unlock(&nn->client_lock);
2107 free_conn(conn); 2522 free_conn(conn);
2523 if (old)
2524 expire_client(old);
2108out_free_session: 2525out_free_session:
2109 __free_session(new); 2526 __free_session(new);
2110out_release_drc_mem: 2527out_release_drc_mem:
@@ -2152,17 +2569,16 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
2152 __be32 status; 2569 __be32 status;
2153 struct nfsd4_conn *conn; 2570 struct nfsd4_conn *conn;
2154 struct nfsd4_session *session; 2571 struct nfsd4_session *session;
2155 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); 2572 struct net *net = SVC_NET(rqstp);
2573 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
2156 2574
2157 if (!nfsd4_last_compound_op(rqstp)) 2575 if (!nfsd4_last_compound_op(rqstp))
2158 return nfserr_not_only_op; 2576 return nfserr_not_only_op;
2159 nfs4_lock_state();
2160 spin_lock(&nn->client_lock); 2577 spin_lock(&nn->client_lock);
2161 session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp)); 2578 session = find_in_sessionid_hashtbl(&bcts->sessionid, net, &status);
2162 spin_unlock(&nn->client_lock); 2579 spin_unlock(&nn->client_lock);
2163 status = nfserr_badsession;
2164 if (!session) 2580 if (!session)
2165 goto out; 2581 goto out_no_session;
2166 status = nfserr_wrong_cred; 2582 status = nfserr_wrong_cred;
2167 if (!mach_creds_match(session->se_client, rqstp)) 2583 if (!mach_creds_match(session->se_client, rqstp))
2168 goto out; 2584 goto out;
@@ -2176,7 +2592,8 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
2176 nfsd4_init_conn(rqstp, conn, session); 2592 nfsd4_init_conn(rqstp, conn, session);
2177 status = nfs_ok; 2593 status = nfs_ok;
2178out: 2594out:
2179 nfs4_unlock_state(); 2595 nfsd4_put_session(session);
2596out_no_session:
2180 return status; 2597 return status;
2181} 2598}
2182 2599
@@ -2195,9 +2612,9 @@ nfsd4_destroy_session(struct svc_rqst *r,
2195 struct nfsd4_session *ses; 2612 struct nfsd4_session *ses;
2196 __be32 status; 2613 __be32 status;
2197 int ref_held_by_me = 0; 2614 int ref_held_by_me = 0;
2198 struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id); 2615 struct net *net = SVC_NET(r);
2616 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
2199 2617
2200 nfs4_lock_state();
2201 status = nfserr_not_only_op; 2618 status = nfserr_not_only_op;
2202 if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) { 2619 if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
2203 if (!nfsd4_last_compound_op(r)) 2620 if (!nfsd4_last_compound_op(r))
@@ -2206,14 +2623,12 @@ nfsd4_destroy_session(struct svc_rqst *r,
2206 } 2623 }
2207 dump_sessionid(__func__, &sessionid->sessionid); 2624 dump_sessionid(__func__, &sessionid->sessionid);
2208 spin_lock(&nn->client_lock); 2625 spin_lock(&nn->client_lock);
2209 ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r)); 2626 ses = find_in_sessionid_hashtbl(&sessionid->sessionid, net, &status);
2210 status = nfserr_badsession;
2211 if (!ses) 2627 if (!ses)
2212 goto out_client_lock; 2628 goto out_client_lock;
2213 status = nfserr_wrong_cred; 2629 status = nfserr_wrong_cred;
2214 if (!mach_creds_match(ses->se_client, r)) 2630 if (!mach_creds_match(ses->se_client, r))
2215 goto out_client_lock; 2631 goto out_put_session;
2216 nfsd4_get_session_locked(ses);
2217 status = mark_session_dead_locked(ses, 1 + ref_held_by_me); 2632 status = mark_session_dead_locked(ses, 1 + ref_held_by_me);
2218 if (status) 2633 if (status)
2219 goto out_put_session; 2634 goto out_put_session;
@@ -2225,11 +2640,10 @@ nfsd4_destroy_session(struct svc_rqst *r,
2225 spin_lock(&nn->client_lock); 2640 spin_lock(&nn->client_lock);
2226 status = nfs_ok; 2641 status = nfs_ok;
2227out_put_session: 2642out_put_session:
2228 nfsd4_put_session(ses); 2643 nfsd4_put_session_locked(ses);
2229out_client_lock: 2644out_client_lock:
2230 spin_unlock(&nn->client_lock); 2645 spin_unlock(&nn->client_lock);
2231out: 2646out:
2232 nfs4_unlock_state();
2233 return status; 2647 return status;
2234} 2648}
2235 2649
@@ -2300,7 +2714,8 @@ nfsd4_sequence(struct svc_rqst *rqstp,
2300 struct nfsd4_conn *conn; 2714 struct nfsd4_conn *conn;
2301 __be32 status; 2715 __be32 status;
2302 int buflen; 2716 int buflen;
2303 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); 2717 struct net *net = SVC_NET(rqstp);
2718 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
2304 2719
2305 if (resp->opcnt != 1) 2720 if (resp->opcnt != 1)
2306 return nfserr_sequence_pos; 2721 return nfserr_sequence_pos;
@@ -2314,17 +2729,10 @@ nfsd4_sequence(struct svc_rqst *rqstp,
2314 return nfserr_jukebox; 2729 return nfserr_jukebox;
2315 2730
2316 spin_lock(&nn->client_lock); 2731 spin_lock(&nn->client_lock);
2317 status = nfserr_badsession; 2732 session = find_in_sessionid_hashtbl(&seq->sessionid, net, &status);
2318 session = find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp));
2319 if (!session) 2733 if (!session)
2320 goto out_no_session; 2734 goto out_no_session;
2321 clp = session->se_client; 2735 clp = session->se_client;
2322 status = get_client_locked(clp);
2323 if (status)
2324 goto out_no_session;
2325 status = nfsd4_get_session_locked(session);
2326 if (status)
2327 goto out_put_client;
2328 2736
2329 status = nfserr_too_many_ops; 2737 status = nfserr_too_many_ops;
2330 if (nfsd4_session_too_many_ops(rqstp, session)) 2738 if (nfsd4_session_too_many_ops(rqstp, session))
@@ -2354,6 +2762,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
2354 goto out_put_session; 2762 goto out_put_session;
2355 cstate->slot = slot; 2763 cstate->slot = slot;
2356 cstate->session = session; 2764 cstate->session = session;
2765 cstate->clp = clp;
2357 /* Return the cached reply status and set cstate->status 2766 /* Return the cached reply status and set cstate->status
2358 * for nfsd4_proc_compound processing */ 2767 * for nfsd4_proc_compound processing */
2359 status = nfsd4_replay_cache_entry(resp, seq); 2768 status = nfsd4_replay_cache_entry(resp, seq);
@@ -2388,6 +2797,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
2388 2797
2389 cstate->slot = slot; 2798 cstate->slot = slot;
2390 cstate->session = session; 2799 cstate->session = session;
2800 cstate->clp = clp;
2391 2801
2392out: 2802out:
2393 switch (clp->cl_cb_state) { 2803 switch (clp->cl_cb_state) {
@@ -2408,31 +2818,48 @@ out_no_session:
2408 spin_unlock(&nn->client_lock); 2818 spin_unlock(&nn->client_lock);
2409 return status; 2819 return status;
2410out_put_session: 2820out_put_session:
2411 nfsd4_put_session(session); 2821 nfsd4_put_session_locked(session);
2412out_put_client:
2413 put_client_renew_locked(clp);
2414 goto out_no_session; 2822 goto out_no_session;
2415} 2823}
2416 2824
2825void
2826nfsd4_sequence_done(struct nfsd4_compoundres *resp)
2827{
2828 struct nfsd4_compound_state *cs = &resp->cstate;
2829
2830 if (nfsd4_has_session(cs)) {
2831 if (cs->status != nfserr_replay_cache) {
2832 nfsd4_store_cache_entry(resp);
2833 cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
2834 }
2835 /* Drop session reference that was taken in nfsd4_sequence() */
2836 nfsd4_put_session(cs->session);
2837 } else if (cs->clp)
2838 put_client_renew(cs->clp);
2839}
2840
2417__be32 2841__be32
2418nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc) 2842nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc)
2419{ 2843{
2420 struct nfs4_client *conf, *unconf, *clp; 2844 struct nfs4_client *conf, *unconf;
2845 struct nfs4_client *clp = NULL;
2421 __be32 status = 0; 2846 __be32 status = 0;
2422 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); 2847 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2423 2848
2424 nfs4_lock_state(); 2849 spin_lock(&nn->client_lock);
2425 unconf = find_unconfirmed_client(&dc->clientid, true, nn); 2850 unconf = find_unconfirmed_client(&dc->clientid, true, nn);
2426 conf = find_confirmed_client(&dc->clientid, true, nn); 2851 conf = find_confirmed_client(&dc->clientid, true, nn);
2427 WARN_ON_ONCE(conf && unconf); 2852 WARN_ON_ONCE(conf && unconf);
2428 2853
2429 if (conf) { 2854 if (conf) {
2430 clp = conf;
2431
2432 if (client_has_state(conf)) { 2855 if (client_has_state(conf)) {
2433 status = nfserr_clientid_busy; 2856 status = nfserr_clientid_busy;
2434 goto out; 2857 goto out;
2435 } 2858 }
2859 status = mark_client_expired_locked(conf);
2860 if (status)
2861 goto out;
2862 clp = conf;
2436 } else if (unconf) 2863 } else if (unconf)
2437 clp = unconf; 2864 clp = unconf;
2438 else { 2865 else {
@@ -2440,12 +2867,15 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
2440 goto out; 2867 goto out;
2441 } 2868 }
2442 if (!mach_creds_match(clp, rqstp)) { 2869 if (!mach_creds_match(clp, rqstp)) {
2870 clp = NULL;
2443 status = nfserr_wrong_cred; 2871 status = nfserr_wrong_cred;
2444 goto out; 2872 goto out;
2445 } 2873 }
2446 expire_client(clp); 2874 unhash_client_locked(clp);
2447out: 2875out:
2448 nfs4_unlock_state(); 2876 spin_unlock(&nn->client_lock);
2877 if (clp)
2878 expire_client(clp);
2449 return status; 2879 return status;
2450} 2880}
2451 2881
@@ -2464,7 +2894,6 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
2464 return nfs_ok; 2894 return nfs_ok;
2465 } 2895 }
2466 2896
2467 nfs4_lock_state();
2468 status = nfserr_complete_already; 2897 status = nfserr_complete_already;
2469 if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, 2898 if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE,
2470 &cstate->session->se_client->cl_flags)) 2899 &cstate->session->se_client->cl_flags))
@@ -2484,7 +2913,6 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
2484 status = nfs_ok; 2913 status = nfs_ok;
2485 nfsd4_client_record_create(cstate->session->se_client); 2914 nfsd4_client_record_create(cstate->session->se_client);
2486out: 2915out:
2487 nfs4_unlock_state();
2488 return status; 2916 return status;
2489} 2917}
2490 2918
@@ -2494,12 +2922,16 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2494{ 2922{
2495 struct xdr_netobj clname = setclid->se_name; 2923 struct xdr_netobj clname = setclid->se_name;
2496 nfs4_verifier clverifier = setclid->se_verf; 2924 nfs4_verifier clverifier = setclid->se_verf;
2497 struct nfs4_client *conf, *unconf, *new; 2925 struct nfs4_client *conf, *new;
2926 struct nfs4_client *unconf = NULL;
2498 __be32 status; 2927 __be32 status;
2499 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); 2928 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2500 2929
2930 new = create_client(clname, rqstp, &clverifier);
2931 if (new == NULL)
2932 return nfserr_jukebox;
2501 /* Cases below refer to rfc 3530 section 14.2.33: */ 2933 /* Cases below refer to rfc 3530 section 14.2.33: */
2502 nfs4_lock_state(); 2934 spin_lock(&nn->client_lock);
2503 conf = find_confirmed_client_by_name(&clname, nn); 2935 conf = find_confirmed_client_by_name(&clname, nn);
2504 if (conf) { 2936 if (conf) {
2505 /* case 0: */ 2937 /* case 0: */
@@ -2517,11 +2949,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2517 } 2949 }
2518 unconf = find_unconfirmed_client_by_name(&clname, nn); 2950 unconf = find_unconfirmed_client_by_name(&clname, nn);
2519 if (unconf) 2951 if (unconf)
2520 expire_client(unconf); 2952 unhash_client_locked(unconf);
2521 status = nfserr_jukebox;
2522 new = create_client(clname, rqstp, &clverifier);
2523 if (new == NULL)
2524 goto out;
2525 if (conf && same_verf(&conf->cl_verifier, &clverifier)) 2953 if (conf && same_verf(&conf->cl_verifier, &clverifier))
2526 /* case 1: probable callback update */ 2954 /* case 1: probable callback update */
2527 copy_clid(new, conf); 2955 copy_clid(new, conf);
@@ -2533,9 +2961,14 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2533 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; 2961 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
2534 setclid->se_clientid.cl_id = new->cl_clientid.cl_id; 2962 setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
2535 memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data)); 2963 memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data));
2964 new = NULL;
2536 status = nfs_ok; 2965 status = nfs_ok;
2537out: 2966out:
2538 nfs4_unlock_state(); 2967 spin_unlock(&nn->client_lock);
2968 if (new)
2969 free_client(new);
2970 if (unconf)
2971 expire_client(unconf);
2539 return status; 2972 return status;
2540} 2973}
2541 2974
@@ -2546,6 +2979,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2546 struct nfsd4_setclientid_confirm *setclientid_confirm) 2979 struct nfsd4_setclientid_confirm *setclientid_confirm)
2547{ 2980{
2548 struct nfs4_client *conf, *unconf; 2981 struct nfs4_client *conf, *unconf;
2982 struct nfs4_client *old = NULL;
2549 nfs4_verifier confirm = setclientid_confirm->sc_confirm; 2983 nfs4_verifier confirm = setclientid_confirm->sc_confirm;
2550 clientid_t * clid = &setclientid_confirm->sc_clientid; 2984 clientid_t * clid = &setclientid_confirm->sc_clientid;
2551 __be32 status; 2985 __be32 status;
@@ -2553,8 +2987,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2553 2987
2554 if (STALE_CLIENTID(clid, nn)) 2988 if (STALE_CLIENTID(clid, nn))
2555 return nfserr_stale_clientid; 2989 return nfserr_stale_clientid;
2556 nfs4_lock_state();
2557 2990
2991 spin_lock(&nn->client_lock);
2558 conf = find_confirmed_client(clid, false, nn); 2992 conf = find_confirmed_client(clid, false, nn);
2559 unconf = find_unconfirmed_client(clid, false, nn); 2993 unconf = find_unconfirmed_client(clid, false, nn);
2560 /* 2994 /*
@@ -2578,22 +3012,30 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2578 } 3012 }
2579 status = nfs_ok; 3013 status = nfs_ok;
2580 if (conf) { /* case 1: callback update */ 3014 if (conf) { /* case 1: callback update */
3015 old = unconf;
3016 unhash_client_locked(old);
2581 nfsd4_change_callback(conf, &unconf->cl_cb_conn); 3017 nfsd4_change_callback(conf, &unconf->cl_cb_conn);
2582 nfsd4_probe_callback(conf);
2583 expire_client(unconf);
2584 } else { /* case 3: normal case; new or rebooted client */ 3018 } else { /* case 3: normal case; new or rebooted client */
2585 conf = find_confirmed_client_by_name(&unconf->cl_name, nn); 3019 old = find_confirmed_client_by_name(&unconf->cl_name, nn);
2586 if (conf) { 3020 if (old) {
2587 status = mark_client_expired(conf); 3021 status = mark_client_expired_locked(old);
2588 if (status) 3022 if (status) {
3023 old = NULL;
2589 goto out; 3024 goto out;
2590 expire_client(conf); 3025 }
2591 } 3026 }
2592 move_to_confirmed(unconf); 3027 move_to_confirmed(unconf);
2593 nfsd4_probe_callback(unconf); 3028 conf = unconf;
2594 } 3029 }
3030 get_client_locked(conf);
3031 spin_unlock(&nn->client_lock);
3032 nfsd4_probe_callback(conf);
3033 spin_lock(&nn->client_lock);
3034 put_client_renew_locked(conf);
2595out: 3035out:
2596 nfs4_unlock_state(); 3036 spin_unlock(&nn->client_lock);
3037 if (old)
3038 expire_client(old);
2597 return status; 3039 return status;
2598} 3040}
2599 3041
@@ -2603,21 +3045,23 @@ static struct nfs4_file *nfsd4_alloc_file(void)
2603} 3045}
2604 3046
2605/* OPEN Share state helper functions */ 3047/* OPEN Share state helper functions */
2606static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino) 3048static void nfsd4_init_file(struct nfs4_file *fp, struct knfsd_fh *fh)
2607{ 3049{
2608 unsigned int hashval = file_hashval(ino); 3050 unsigned int hashval = file_hashval(fh);
3051
3052 lockdep_assert_held(&state_lock);
2609 3053
2610 atomic_set(&fp->fi_ref, 1); 3054 atomic_set(&fp->fi_ref, 1);
3055 spin_lock_init(&fp->fi_lock);
2611 INIT_LIST_HEAD(&fp->fi_stateids); 3056 INIT_LIST_HEAD(&fp->fi_stateids);
2612 INIT_LIST_HEAD(&fp->fi_delegations); 3057 INIT_LIST_HEAD(&fp->fi_delegations);
2613 fp->fi_inode = igrab(ino); 3058 fh_copy_shallow(&fp->fi_fhandle, fh);
2614 fp->fi_had_conflict = false; 3059 fp->fi_had_conflict = false;
2615 fp->fi_lease = NULL; 3060 fp->fi_lease = NULL;
3061 fp->fi_share_deny = 0;
2616 memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); 3062 memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
2617 memset(fp->fi_access, 0, sizeof(fp->fi_access)); 3063 memset(fp->fi_access, 0, sizeof(fp->fi_access));
2618 spin_lock(&state_lock);
2619 hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]); 3064 hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]);
2620 spin_unlock(&state_lock);
2621} 3065}
2622 3066
2623void 3067void
@@ -2673,6 +3117,28 @@ static void init_nfs4_replay(struct nfs4_replay *rp)
2673 rp->rp_status = nfserr_serverfault; 3117 rp->rp_status = nfserr_serverfault;
2674 rp->rp_buflen = 0; 3118 rp->rp_buflen = 0;
2675 rp->rp_buf = rp->rp_ibuf; 3119 rp->rp_buf = rp->rp_ibuf;
3120 mutex_init(&rp->rp_mutex);
3121}
3122
3123static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
3124 struct nfs4_stateowner *so)
3125{
3126 if (!nfsd4_has_session(cstate)) {
3127 mutex_lock(&so->so_replay.rp_mutex);
3128 cstate->replay_owner = so;
3129 atomic_inc(&so->so_count);
3130 }
3131}
3132
3133void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
3134{
3135 struct nfs4_stateowner *so = cstate->replay_owner;
3136
3137 if (so != NULL) {
3138 cstate->replay_owner = NULL;
3139 mutex_unlock(&so->so_replay.rp_mutex);
3140 nfs4_put_stateowner(so);
3141 }
2676} 3142}
2677 3143
2678static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp) 3144static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp)
@@ -2693,111 +3159,172 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj
2693 INIT_LIST_HEAD(&sop->so_stateids); 3159 INIT_LIST_HEAD(&sop->so_stateids);
2694 sop->so_client = clp; 3160 sop->so_client = clp;
2695 init_nfs4_replay(&sop->so_replay); 3161 init_nfs4_replay(&sop->so_replay);
3162 atomic_set(&sop->so_count, 1);
2696 return sop; 3163 return sop;
2697} 3164}
2698 3165
2699static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) 3166static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
2700{ 3167{
2701 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 3168 lockdep_assert_held(&clp->cl_lock);
2702 3169
2703 list_add(&oo->oo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]); 3170 list_add(&oo->oo_owner.so_strhash,
3171 &clp->cl_ownerstr_hashtbl[strhashval]);
2704 list_add(&oo->oo_perclient, &clp->cl_openowners); 3172 list_add(&oo->oo_perclient, &clp->cl_openowners);
2705} 3173}
2706 3174
3175static void nfs4_unhash_openowner(struct nfs4_stateowner *so)
3176{
3177 unhash_openowner_locked(openowner(so));
3178}
3179
3180static void nfs4_free_openowner(struct nfs4_stateowner *so)
3181{
3182 struct nfs4_openowner *oo = openowner(so);
3183
3184 kmem_cache_free(openowner_slab, oo);
3185}
3186
3187static const struct nfs4_stateowner_operations openowner_ops = {
3188 .so_unhash = nfs4_unhash_openowner,
3189 .so_free = nfs4_free_openowner,
3190};
3191
2707static struct nfs4_openowner * 3192static struct nfs4_openowner *
2708alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { 3193alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
2709 struct nfs4_openowner *oo; 3194 struct nfsd4_compound_state *cstate)
3195{
3196 struct nfs4_client *clp = cstate->clp;
3197 struct nfs4_openowner *oo, *ret;
2710 3198
2711 oo = alloc_stateowner(openowner_slab, &open->op_owner, clp); 3199 oo = alloc_stateowner(openowner_slab, &open->op_owner, clp);
2712 if (!oo) 3200 if (!oo)
2713 return NULL; 3201 return NULL;
3202 oo->oo_owner.so_ops = &openowner_ops;
2714 oo->oo_owner.so_is_open_owner = 1; 3203 oo->oo_owner.so_is_open_owner = 1;
2715 oo->oo_owner.so_seqid = open->op_seqid; 3204 oo->oo_owner.so_seqid = open->op_seqid;
2716 oo->oo_flags = NFS4_OO_NEW; 3205 oo->oo_flags = 0;
3206 if (nfsd4_has_session(cstate))
3207 oo->oo_flags |= NFS4_OO_CONFIRMED;
2717 oo->oo_time = 0; 3208 oo->oo_time = 0;
2718 oo->oo_last_closed_stid = NULL; 3209 oo->oo_last_closed_stid = NULL;
2719 INIT_LIST_HEAD(&oo->oo_close_lru); 3210 INIT_LIST_HEAD(&oo->oo_close_lru);
2720 hash_openowner(oo, clp, strhashval); 3211 spin_lock(&clp->cl_lock);
3212 ret = find_openstateowner_str_locked(strhashval, open, clp);
3213 if (ret == NULL) {
3214 hash_openowner(oo, clp, strhashval);
3215 ret = oo;
3216 } else
3217 nfs4_free_openowner(&oo->oo_owner);
3218 spin_unlock(&clp->cl_lock);
2721 return oo; 3219 return oo;
2722} 3220}
2723 3221
2724static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { 3222static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
2725 struct nfs4_openowner *oo = open->op_openowner; 3223 struct nfs4_openowner *oo = open->op_openowner;
2726 3224
3225 atomic_inc(&stp->st_stid.sc_count);
2727 stp->st_stid.sc_type = NFS4_OPEN_STID; 3226 stp->st_stid.sc_type = NFS4_OPEN_STID;
2728 INIT_LIST_HEAD(&stp->st_lockowners); 3227 INIT_LIST_HEAD(&stp->st_locks);
2729 list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
2730 list_add(&stp->st_perfile, &fp->fi_stateids);
2731 stp->st_stateowner = &oo->oo_owner; 3228 stp->st_stateowner = &oo->oo_owner;
3229 atomic_inc(&stp->st_stateowner->so_count);
2732 get_nfs4_file(fp); 3230 get_nfs4_file(fp);
2733 stp->st_file = fp; 3231 stp->st_stid.sc_file = fp;
2734 stp->st_access_bmap = 0; 3232 stp->st_access_bmap = 0;
2735 stp->st_deny_bmap = 0; 3233 stp->st_deny_bmap = 0;
2736 set_access(open->op_share_access, stp);
2737 set_deny(open->op_share_deny, stp);
2738 stp->st_openstp = NULL; 3234 stp->st_openstp = NULL;
3235 spin_lock(&oo->oo_owner.so_client->cl_lock);
3236 list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
3237 spin_lock(&fp->fi_lock);
3238 list_add(&stp->st_perfile, &fp->fi_stateids);
3239 spin_unlock(&fp->fi_lock);
3240 spin_unlock(&oo->oo_owner.so_client->cl_lock);
2739} 3241}
2740 3242
3243/*
3244 * In the 4.0 case we need to keep the owners around a little while to handle
3245 * CLOSE replay. We still do need to release any file access that is held by
3246 * them before returning however.
3247 */
2741static void 3248static void
2742move_to_close_lru(struct nfs4_openowner *oo, struct net *net) 3249move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
2743{ 3250{
2744 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 3251 struct nfs4_ol_stateid *last;
3252 struct nfs4_openowner *oo = openowner(s->st_stateowner);
3253 struct nfsd_net *nn = net_generic(s->st_stid.sc_client->net,
3254 nfsd_net_id);
2745 3255
2746 dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); 3256 dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo);
2747 3257
3258 /*
3259 * We know that we hold one reference via nfsd4_close, and another
3260 * "persistent" reference for the client. If the refcount is higher
3261 * than 2, then there are still calls in progress that are using this
3262 * stateid. We can't put the sc_file reference until they are finished.
3263 * Wait for the refcount to drop to 2. Since it has been unhashed,
3264 * there should be no danger of the refcount going back up again at
3265 * this point.
3266 */
3267 wait_event(close_wq, atomic_read(&s->st_stid.sc_count) == 2);
3268
3269 release_all_access(s);
3270 if (s->st_stid.sc_file) {
3271 put_nfs4_file(s->st_stid.sc_file);
3272 s->st_stid.sc_file = NULL;
3273 }
3274
3275 spin_lock(&nn->client_lock);
3276 last = oo->oo_last_closed_stid;
3277 oo->oo_last_closed_stid = s;
2748 list_move_tail(&oo->oo_close_lru, &nn->close_lru); 3278 list_move_tail(&oo->oo_close_lru, &nn->close_lru);
2749 oo->oo_time = get_seconds(); 3279 oo->oo_time = get_seconds();
3280 spin_unlock(&nn->client_lock);
3281 if (last)
3282 nfs4_put_stid(&last->st_stid);
2750} 3283}
2751 3284
2752static int 3285/* search file_hashtbl[] for file */
2753same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, 3286static struct nfs4_file *
2754 clientid_t *clid) 3287find_file_locked(struct knfsd_fh *fh)
2755{ 3288{
2756 return (sop->so_owner.len == owner->len) && 3289 unsigned int hashval = file_hashval(fh);
2757 0 == memcmp(sop->so_owner.data, owner->data, owner->len) && 3290 struct nfs4_file *fp;
2758 (sop->so_client->cl_clientid.cl_id == clid->cl_id);
2759}
2760 3291
2761static struct nfs4_openowner * 3292 lockdep_assert_held(&state_lock);
2762find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open,
2763 bool sessions, struct nfsd_net *nn)
2764{
2765 struct nfs4_stateowner *so;
2766 struct nfs4_openowner *oo;
2767 struct nfs4_client *clp;
2768 3293
2769 list_for_each_entry(so, &nn->ownerstr_hashtbl[hashval], so_strhash) { 3294 hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
2770 if (!so->so_is_open_owner) 3295 if (nfsd_fh_match(&fp->fi_fhandle, fh)) {
2771 continue; 3296 get_nfs4_file(fp);
2772 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { 3297 return fp;
2773 oo = openowner(so);
2774 clp = oo->oo_owner.so_client;
2775 if ((bool)clp->cl_minorversion != sessions)
2776 return NULL;
2777 renew_client(oo->oo_owner.so_client);
2778 return oo;
2779 } 3298 }
2780 } 3299 }
2781 return NULL; 3300 return NULL;
2782} 3301}
2783 3302
2784/* search file_hashtbl[] for file */
2785static struct nfs4_file * 3303static struct nfs4_file *
2786find_file(struct inode *ino) 3304find_file(struct knfsd_fh *fh)
2787{ 3305{
2788 unsigned int hashval = file_hashval(ino);
2789 struct nfs4_file *fp; 3306 struct nfs4_file *fp;
2790 3307
2791 spin_lock(&state_lock); 3308 spin_lock(&state_lock);
2792 hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { 3309 fp = find_file_locked(fh);
2793 if (fp->fi_inode == ino) { 3310 spin_unlock(&state_lock);
2794 get_nfs4_file(fp); 3311 return fp;
2795 spin_unlock(&state_lock); 3312}
2796 return fp; 3313
2797 } 3314static struct nfs4_file *
3315find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh)
3316{
3317 struct nfs4_file *fp;
3318
3319 spin_lock(&state_lock);
3320 fp = find_file_locked(fh);
3321 if (fp == NULL) {
3322 nfsd4_init_file(new, fh);
3323 fp = new;
2798 } 3324 }
2799 spin_unlock(&state_lock); 3325 spin_unlock(&state_lock);
2800 return NULL; 3326
3327 return fp;
2801} 3328}
2802 3329
2803/* 3330/*
@@ -2807,47 +3334,53 @@ find_file(struct inode *ino)
2807static __be32 3334static __be32
2808nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) 3335nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
2809{ 3336{
2810 struct inode *ino = current_fh->fh_dentry->d_inode;
2811 struct nfs4_file *fp; 3337 struct nfs4_file *fp;
2812 struct nfs4_ol_stateid *stp; 3338 __be32 ret = nfs_ok;
2813 __be32 ret;
2814 3339
2815 fp = find_file(ino); 3340 fp = find_file(&current_fh->fh_handle);
2816 if (!fp) 3341 if (!fp)
2817 return nfs_ok; 3342 return ret;
2818 ret = nfserr_locked; 3343 /* Check for conflicting share reservations */
2819 /* Search for conflicting share reservations */ 3344 spin_lock(&fp->fi_lock);
2820 list_for_each_entry(stp, &fp->fi_stateids, st_perfile) { 3345 if (fp->fi_share_deny & deny_type)
2821 if (test_deny(deny_type, stp) || 3346 ret = nfserr_locked;
2822 test_deny(NFS4_SHARE_DENY_BOTH, stp)) 3347 spin_unlock(&fp->fi_lock);
2823 goto out;
2824 }
2825 ret = nfs_ok;
2826out:
2827 put_nfs4_file(fp); 3348 put_nfs4_file(fp);
2828 return ret; 3349 return ret;
2829} 3350}
2830 3351
2831static void nfsd_break_one_deleg(struct nfs4_delegation *dp) 3352void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp)
2832{ 3353{
2833 struct nfs4_client *clp = dp->dl_stid.sc_client; 3354 struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net,
2834 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 3355 nfsd_net_id);
2835 3356
2836 lockdep_assert_held(&state_lock); 3357 block_delegations(&dp->dl_stid.sc_file->fi_fhandle);
2837 /* We're assuming the state code never drops its reference 3358
3359 /*
3360 * We can't do this in nfsd_break_deleg_cb because it is
3361 * already holding inode->i_lock.
3362 *
3363 * If the dl_time != 0, then we know that it has already been
3364 * queued for a lease break. Don't queue it again.
3365 */
3366 spin_lock(&state_lock);
3367 if (dp->dl_time == 0) {
3368 dp->dl_time = get_seconds();
3369 list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
3370 }
3371 spin_unlock(&state_lock);
3372}
3373
3374static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
3375{
3376 /*
3377 * We're assuming the state code never drops its reference
2838 * without first removing the lease. Since we're in this lease 3378 * without first removing the lease. Since we're in this lease
2839 * callback (and since the lease code is serialized by the kernel 3379 * callback (and since the lease code is serialized by the kernel
2840 * lock) we know the server hasn't removed the lease yet, we know 3380 * lock) we know the server hasn't removed the lease yet, we know
2841 * it's safe to take a reference: */ 3381 * it's safe to take a reference.
2842 atomic_inc(&dp->dl_count); 3382 */
2843 3383 atomic_inc(&dp->dl_stid.sc_count);
2844 list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
2845
2846 /* Only place dl_time is set; protected by i_lock: */
2847 dp->dl_time = get_seconds();
2848
2849 block_delegations(&dp->dl_fh);
2850
2851 nfsd4_cb_recall(dp); 3384 nfsd4_cb_recall(dp);
2852} 3385}
2853 3386
@@ -2872,11 +3405,20 @@ static void nfsd_break_deleg_cb(struct file_lock *fl)
2872 */ 3405 */
2873 fl->fl_break_time = 0; 3406 fl->fl_break_time = 0;
2874 3407
2875 spin_lock(&state_lock); 3408 spin_lock(&fp->fi_lock);
2876 fp->fi_had_conflict = true; 3409 fp->fi_had_conflict = true;
2877 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) 3410 /*
2878 nfsd_break_one_deleg(dp); 3411 * If there are no delegations on the list, then we can't count on this
2879 spin_unlock(&state_lock); 3412 * lease ever being cleaned up. Set the fl_break_time to jiffies so that
3413 * time_out_leases will do it ASAP. The fact that fi_had_conflict is now
3414 * true should keep any new delegations from being hashed.
3415 */
3416 if (list_empty(&fp->fi_delegations))
3417 fl->fl_break_time = jiffies;
3418 else
3419 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
3420 nfsd_break_one_deleg(dp);
3421 spin_unlock(&fp->fi_lock);
2880} 3422}
2881 3423
2882static 3424static
@@ -2904,6 +3446,42 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4
2904 return nfserr_bad_seqid; 3446 return nfserr_bad_seqid;
2905} 3447}
2906 3448
3449static __be32 lookup_clientid(clientid_t *clid,
3450 struct nfsd4_compound_state *cstate,
3451 struct nfsd_net *nn)
3452{
3453 struct nfs4_client *found;
3454
3455 if (cstate->clp) {
3456 found = cstate->clp;
3457 if (!same_clid(&found->cl_clientid, clid))
3458 return nfserr_stale_clientid;
3459 return nfs_ok;
3460 }
3461
3462 if (STALE_CLIENTID(clid, nn))
3463 return nfserr_stale_clientid;
3464
3465 /*
3466 * For v4.1+ we get the client in the SEQUENCE op. If we don't have one
3467 * cached already then we know this is for is for v4.0 and "sessions"
3468 * will be false.
3469 */
3470 WARN_ON_ONCE(cstate->session);
3471 spin_lock(&nn->client_lock);
3472 found = find_confirmed_client(clid, false, nn);
3473 if (!found) {
3474 spin_unlock(&nn->client_lock);
3475 return nfserr_expired;
3476 }
3477 atomic_inc(&found->cl_refcount);
3478 spin_unlock(&nn->client_lock);
3479
3480 /* Cache the nfs4_client in cstate! */
3481 cstate->clp = found;
3482 return nfs_ok;
3483}
3484
2907__be32 3485__be32
2908nfsd4_process_open1(struct nfsd4_compound_state *cstate, 3486nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2909 struct nfsd4_open *open, struct nfsd_net *nn) 3487 struct nfsd4_open *open, struct nfsd_net *nn)
@@ -2924,19 +3502,19 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2924 if (open->op_file == NULL) 3502 if (open->op_file == NULL)
2925 return nfserr_jukebox; 3503 return nfserr_jukebox;
2926 3504
2927 strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); 3505 status = lookup_clientid(clientid, cstate, nn);
2928 oo = find_openstateowner_str(strhashval, open, cstate->minorversion, nn); 3506 if (status)
3507 return status;
3508 clp = cstate->clp;
3509
3510 strhashval = ownerstr_hashval(&open->op_owner);
3511 oo = find_openstateowner_str(strhashval, open, clp);
2929 open->op_openowner = oo; 3512 open->op_openowner = oo;
2930 if (!oo) { 3513 if (!oo) {
2931 clp = find_confirmed_client(clientid, cstate->minorversion,
2932 nn);
2933 if (clp == NULL)
2934 return nfserr_expired;
2935 goto new_owner; 3514 goto new_owner;
2936 } 3515 }
2937 if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { 3516 if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
2938 /* Replace unconfirmed owners without checking for replay. */ 3517 /* Replace unconfirmed owners without checking for replay. */
2939 clp = oo->oo_owner.so_client;
2940 release_openowner(oo); 3518 release_openowner(oo);
2941 open->op_openowner = NULL; 3519 open->op_openowner = NULL;
2942 goto new_owner; 3520 goto new_owner;
@@ -2944,15 +3522,14 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2944 status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); 3522 status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
2945 if (status) 3523 if (status)
2946 return status; 3524 return status;
2947 clp = oo->oo_owner.so_client;
2948 goto alloc_stateid; 3525 goto alloc_stateid;
2949new_owner: 3526new_owner:
2950 oo = alloc_init_open_stateowner(strhashval, clp, open); 3527 oo = alloc_init_open_stateowner(strhashval, open, cstate);
2951 if (oo == NULL) 3528 if (oo == NULL)
2952 return nfserr_jukebox; 3529 return nfserr_jukebox;
2953 open->op_openowner = oo; 3530 open->op_openowner = oo;
2954alloc_stateid: 3531alloc_stateid:
2955 open->op_stp = nfs4_alloc_stateid(clp); 3532 open->op_stp = nfs4_alloc_open_stateid(clp);
2956 if (!open->op_stp) 3533 if (!open->op_stp)
2957 return nfserr_jukebox; 3534 return nfserr_jukebox;
2958 return nfs_ok; 3535 return nfs_ok;
@@ -2994,14 +3571,18 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open,
2994{ 3571{
2995 int flags; 3572 int flags;
2996 __be32 status = nfserr_bad_stateid; 3573 __be32 status = nfserr_bad_stateid;
3574 struct nfs4_delegation *deleg;
2997 3575
2998 *dp = find_deleg_stateid(cl, &open->op_delegate_stateid); 3576 deleg = find_deleg_stateid(cl, &open->op_delegate_stateid);
2999 if (*dp == NULL) 3577 if (deleg == NULL)
3000 goto out; 3578 goto out;
3001 flags = share_access_to_flags(open->op_share_access); 3579 flags = share_access_to_flags(open->op_share_access);
3002 status = nfs4_check_delegmode(*dp, flags); 3580 status = nfs4_check_delegmode(deleg, flags);
3003 if (status) 3581 if (status) {
3004 *dp = NULL; 3582 nfs4_put_stid(&deleg->dl_stid);
3583 goto out;
3584 }
3585 *dp = deleg;
3005out: 3586out:
3006 if (!nfsd4_is_deleg_cur(open)) 3587 if (!nfsd4_is_deleg_cur(open))
3007 return nfs_ok; 3588 return nfs_ok;
@@ -3011,24 +3592,25 @@ out:
3011 return nfs_ok; 3592 return nfs_ok;
3012} 3593}
3013 3594
3014static __be32 3595static struct nfs4_ol_stateid *
3015nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_stateid **stpp) 3596nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
3016{ 3597{
3017 struct nfs4_ol_stateid *local; 3598 struct nfs4_ol_stateid *local, *ret = NULL;
3018 struct nfs4_openowner *oo = open->op_openowner; 3599 struct nfs4_openowner *oo = open->op_openowner;
3019 3600
3601 spin_lock(&fp->fi_lock);
3020 list_for_each_entry(local, &fp->fi_stateids, st_perfile) { 3602 list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
3021 /* ignore lock owners */ 3603 /* ignore lock owners */
3022 if (local->st_stateowner->so_is_open_owner == 0) 3604 if (local->st_stateowner->so_is_open_owner == 0)
3023 continue; 3605 continue;
3024 /* remember if we have seen this open owner */ 3606 if (local->st_stateowner == &oo->oo_owner) {
3025 if (local->st_stateowner == &oo->oo_owner) 3607 ret = local;
3026 *stpp = local; 3608 atomic_inc(&ret->st_stid.sc_count);
3027 /* check for conflicting share reservations */ 3609 break;
3028 if (!test_share(local, open)) 3610 }
3029 return nfserr_share_denied;
3030 } 3611 }
3031 return nfs_ok; 3612 spin_unlock(&fp->fi_lock);
3613 return ret;
3032} 3614}
3033 3615
3034static inline int nfs4_access_to_access(u32 nfs4_access) 3616static inline int nfs4_access_to_access(u32 nfs4_access)
@@ -3042,24 +3624,6 @@ static inline int nfs4_access_to_access(u32 nfs4_access)
3042 return flags; 3624 return flags;
3043} 3625}
3044 3626
3045static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
3046 struct svc_fh *cur_fh, struct nfsd4_open *open)
3047{
3048 __be32 status;
3049 int oflag = nfs4_access_to_omode(open->op_share_access);
3050 int access = nfs4_access_to_access(open->op_share_access);
3051
3052 if (!fp->fi_fds[oflag]) {
3053 status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
3054 &fp->fi_fds[oflag]);
3055 if (status)
3056 return status;
3057 }
3058 nfs4_file_get_access(fp, oflag);
3059
3060 return nfs_ok;
3061}
3062
3063static inline __be32 3627static inline __be32
3064nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, 3628nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
3065 struct nfsd4_open *open) 3629 struct nfsd4_open *open)
@@ -3075,34 +3639,99 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
3075 return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0); 3639 return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0);
3076} 3640}
3077 3641
3078static __be32 3642static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
3079nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) 3643 struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp,
3644 struct nfsd4_open *open)
3080{ 3645{
3081 u32 op_share_access = open->op_share_access; 3646 struct file *filp = NULL;
3082 bool new_access;
3083 __be32 status; 3647 __be32 status;
3648 int oflag = nfs4_access_to_omode(open->op_share_access);
3649 int access = nfs4_access_to_access(open->op_share_access);
3650 unsigned char old_access_bmap, old_deny_bmap;
3084 3651
3085 new_access = !test_access(op_share_access, stp); 3652 spin_lock(&fp->fi_lock);
3086 if (new_access) { 3653
3087 status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); 3654 /*
3088 if (status) 3655 * Are we trying to set a deny mode that would conflict with
3089 return status; 3656 * current access?
3657 */
3658 status = nfs4_file_check_deny(fp, open->op_share_deny);
3659 if (status != nfs_ok) {
3660 spin_unlock(&fp->fi_lock);
3661 goto out;
3090 } 3662 }
3091 status = nfsd4_truncate(rqstp, cur_fh, open); 3663
3092 if (status) { 3664 /* set access to the file */
3093 if (new_access) { 3665 status = nfs4_file_get_access(fp, open->op_share_access);
3094 int oflag = nfs4_access_to_omode(op_share_access); 3666 if (status != nfs_ok) {
3095 nfs4_file_put_access(fp, oflag); 3667 spin_unlock(&fp->fi_lock);
3096 } 3668 goto out;
3097 return status;
3098 } 3669 }
3099 /* remember the open */ 3670
3100 set_access(op_share_access, stp); 3671 /* Set access bits in stateid */
3672 old_access_bmap = stp->st_access_bmap;
3673 set_access(open->op_share_access, stp);
3674
3675 /* Set new deny mask */
3676 old_deny_bmap = stp->st_deny_bmap;
3101 set_deny(open->op_share_deny, stp); 3677 set_deny(open->op_share_deny, stp);
3678 fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH);
3102 3679
3103 return nfs_ok; 3680 if (!fp->fi_fds[oflag]) {
3681 spin_unlock(&fp->fi_lock);
3682 status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &filp);
3683 if (status)
3684 goto out_put_access;
3685 spin_lock(&fp->fi_lock);
3686 if (!fp->fi_fds[oflag]) {
3687 fp->fi_fds[oflag] = filp;
3688 filp = NULL;
3689 }
3690 }
3691 spin_unlock(&fp->fi_lock);
3692 if (filp)
3693 fput(filp);
3694
3695 status = nfsd4_truncate(rqstp, cur_fh, open);
3696 if (status)
3697 goto out_put_access;
3698out:
3699 return status;
3700out_put_access:
3701 stp->st_access_bmap = old_access_bmap;
3702 nfs4_file_put_access(fp, open->op_share_access);
3703 reset_union_bmap_deny(bmap_to_share_mode(old_deny_bmap), stp);
3704 goto out;
3104} 3705}
3105 3706
3707static __be32
3708nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open)
3709{
3710 __be32 status;
3711 unsigned char old_deny_bmap;
3712
3713 if (!test_access(open->op_share_access, stp))
3714 return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open);
3715
3716 /* test and set deny mode */
3717 spin_lock(&fp->fi_lock);
3718 status = nfs4_file_check_deny(fp, open->op_share_deny);
3719 if (status == nfs_ok) {
3720 old_deny_bmap = stp->st_deny_bmap;
3721 set_deny(open->op_share_deny, stp);
3722 fp->fi_share_deny |=
3723 (open->op_share_deny & NFS4_SHARE_DENY_BOTH);
3724 }
3725 spin_unlock(&fp->fi_lock);
3726
3727 if (status != nfs_ok)
3728 return status;
3729
3730 status = nfsd4_truncate(rqstp, cur_fh, open);
3731 if (status != nfs_ok)
3732 reset_union_bmap_deny(old_deny_bmap, stp);
3733 return status;
3734}
3106 3735
3107static void 3736static void
3108nfs4_set_claim_prev(struct nfsd4_open *open, bool has_session) 3737nfs4_set_claim_prev(struct nfsd4_open *open, bool has_session)
@@ -3123,7 +3752,7 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
3123 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; 3752 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
3124} 3753}
3125 3754
3126static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag) 3755static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
3127{ 3756{
3128 struct file_lock *fl; 3757 struct file_lock *fl;
3129 3758
@@ -3135,53 +3764,101 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int f
3135 fl->fl_flags = FL_DELEG; 3764 fl->fl_flags = FL_DELEG;
3136 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; 3765 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
3137 fl->fl_end = OFFSET_MAX; 3766 fl->fl_end = OFFSET_MAX;
3138 fl->fl_owner = (fl_owner_t)(dp->dl_file); 3767 fl->fl_owner = (fl_owner_t)fp;
3139 fl->fl_pid = current->tgid; 3768 fl->fl_pid = current->tgid;
3140 return fl; 3769 return fl;
3141} 3770}
3142 3771
3143static int nfs4_setlease(struct nfs4_delegation *dp) 3772static int nfs4_setlease(struct nfs4_delegation *dp)
3144{ 3773{
3145 struct nfs4_file *fp = dp->dl_file; 3774 struct nfs4_file *fp = dp->dl_stid.sc_file;
3146 struct file_lock *fl; 3775 struct file_lock *fl;
3147 int status; 3776 struct file *filp;
3777 int status = 0;
3148 3778
3149 fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ); 3779 fl = nfs4_alloc_init_lease(fp, NFS4_OPEN_DELEGATE_READ);
3150 if (!fl) 3780 if (!fl)
3151 return -ENOMEM; 3781 return -ENOMEM;
3152 fl->fl_file = find_readable_file(fp); 3782 filp = find_readable_file(fp);
3153 status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); 3783 if (!filp) {
3154 if (status) 3784 /* We should always have a readable file here */
3155 goto out_free; 3785 WARN_ON_ONCE(1);
3786 return -EBADF;
3787 }
3788 fl->fl_file = filp;
3789 status = vfs_setlease(filp, fl->fl_type, &fl);
3790 if (status) {
3791 locks_free_lock(fl);
3792 goto out_fput;
3793 }
3794 spin_lock(&state_lock);
3795 spin_lock(&fp->fi_lock);
3796 /* Did the lease get broken before we took the lock? */
3797 status = -EAGAIN;
3798 if (fp->fi_had_conflict)
3799 goto out_unlock;
3800 /* Race breaker */
3801 if (fp->fi_lease) {
3802 status = 0;
3803 atomic_inc(&fp->fi_delegees);
3804 hash_delegation_locked(dp, fp);
3805 goto out_unlock;
3806 }
3156 fp->fi_lease = fl; 3807 fp->fi_lease = fl;
3157 fp->fi_deleg_file = get_file(fl->fl_file); 3808 fp->fi_deleg_file = filp;
3158 atomic_set(&fp->fi_delegees, 1); 3809 atomic_set(&fp->fi_delegees, 1);
3159 spin_lock(&state_lock);
3160 hash_delegation_locked(dp, fp); 3810 hash_delegation_locked(dp, fp);
3811 spin_unlock(&fp->fi_lock);
3161 spin_unlock(&state_lock); 3812 spin_unlock(&state_lock);
3162 return 0; 3813 return 0;
3163out_free: 3814out_unlock:
3164 locks_free_lock(fl); 3815 spin_unlock(&fp->fi_lock);
3816 spin_unlock(&state_lock);
3817out_fput:
3818 fput(filp);
3165 return status; 3819 return status;
3166} 3820}
3167 3821
3168static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp) 3822static struct nfs4_delegation *
3823nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
3824 struct nfs4_file *fp)
3169{ 3825{
3826 int status;
3827 struct nfs4_delegation *dp;
3828
3170 if (fp->fi_had_conflict) 3829 if (fp->fi_had_conflict)
3171 return -EAGAIN; 3830 return ERR_PTR(-EAGAIN);
3831
3832 dp = alloc_init_deleg(clp, fh);
3833 if (!dp)
3834 return ERR_PTR(-ENOMEM);
3835
3172 get_nfs4_file(fp); 3836 get_nfs4_file(fp);
3173 dp->dl_file = fp;
3174 if (!fp->fi_lease)
3175 return nfs4_setlease(dp);
3176 spin_lock(&state_lock); 3837 spin_lock(&state_lock);
3838 spin_lock(&fp->fi_lock);
3839 dp->dl_stid.sc_file = fp;
3840 if (!fp->fi_lease) {
3841 spin_unlock(&fp->fi_lock);
3842 spin_unlock(&state_lock);
3843 status = nfs4_setlease(dp);
3844 goto out;
3845 }
3177 atomic_inc(&fp->fi_delegees); 3846 atomic_inc(&fp->fi_delegees);
3178 if (fp->fi_had_conflict) { 3847 if (fp->fi_had_conflict) {
3179 spin_unlock(&state_lock); 3848 status = -EAGAIN;
3180 return -EAGAIN; 3849 goto out_unlock;
3181 } 3850 }
3182 hash_delegation_locked(dp, fp); 3851 hash_delegation_locked(dp, fp);
3852 status = 0;
3853out_unlock:
3854 spin_unlock(&fp->fi_lock);
3183 spin_unlock(&state_lock); 3855 spin_unlock(&state_lock);
3184 return 0; 3856out:
3857 if (status) {
3858 nfs4_put_stid(&dp->dl_stid);
3859 return ERR_PTR(status);
3860 }
3861 return dp;
3185} 3862}
3186 3863
3187static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) 3864static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
@@ -3212,11 +3889,12 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
3212 * proper support for them. 3889 * proper support for them.
3213 */ 3890 */
3214static void 3891static void
3215nfs4_open_delegation(struct net *net, struct svc_fh *fh, 3892nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
3216 struct nfsd4_open *open, struct nfs4_ol_stateid *stp) 3893 struct nfs4_ol_stateid *stp)
3217{ 3894{
3218 struct nfs4_delegation *dp; 3895 struct nfs4_delegation *dp;
3219 struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); 3896 struct nfs4_openowner *oo = openowner(stp->st_stateowner);
3897 struct nfs4_client *clp = stp->st_stid.sc_client;
3220 int cb_up; 3898 int cb_up;
3221 int status = 0; 3899 int status = 0;
3222 3900
@@ -3235,7 +3913,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
3235 * Let's not give out any delegations till everyone's 3913 * Let's not give out any delegations till everyone's
3236 * had the chance to reclaim theirs.... 3914 * had the chance to reclaim theirs....
3237 */ 3915 */
3238 if (locks_in_grace(net)) 3916 if (locks_in_grace(clp->net))
3239 goto out_no_deleg; 3917 goto out_no_deleg;
3240 if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) 3918 if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
3241 goto out_no_deleg; 3919 goto out_no_deleg;
@@ -3254,21 +3932,17 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
3254 default: 3932 default:
3255 goto out_no_deleg; 3933 goto out_no_deleg;
3256 } 3934 }
3257 dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh); 3935 dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file);
3258 if (dp == NULL) 3936 if (IS_ERR(dp))
3259 goto out_no_deleg; 3937 goto out_no_deleg;
3260 status = nfs4_set_delegation(dp, stp->st_file);
3261 if (status)
3262 goto out_free;
3263 3938
3264 memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); 3939 memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid));
3265 3940
3266 dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", 3941 dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
3267 STATEID_VAL(&dp->dl_stid.sc_stateid)); 3942 STATEID_VAL(&dp->dl_stid.sc_stateid));
3268 open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; 3943 open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
3944 nfs4_put_stid(&dp->dl_stid);
3269 return; 3945 return;
3270out_free:
3271 destroy_delegation(dp);
3272out_no_deleg: 3946out_no_deleg:
3273 open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE; 3947 open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
3274 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && 3948 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
@@ -3301,16 +3975,12 @@ static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open,
3301 */ 3975 */
3302} 3976}
3303 3977
3304/*
3305 * called with nfs4_lock_state() held.
3306 */
3307__be32 3978__be32
3308nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 3979nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
3309{ 3980{
3310 struct nfsd4_compoundres *resp = rqstp->rq_resp; 3981 struct nfsd4_compoundres *resp = rqstp->rq_resp;
3311 struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; 3982 struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
3312 struct nfs4_file *fp = NULL; 3983 struct nfs4_file *fp = NULL;
3313 struct inode *ino = current_fh->fh_dentry->d_inode;
3314 struct nfs4_ol_stateid *stp = NULL; 3984 struct nfs4_ol_stateid *stp = NULL;
3315 struct nfs4_delegation *dp = NULL; 3985 struct nfs4_delegation *dp = NULL;
3316 __be32 status; 3986 __be32 status;
@@ -3320,21 +3990,18 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
3320 * and check for delegations in the process of being recalled. 3990 * and check for delegations in the process of being recalled.
3321 * If not found, create the nfs4_file struct 3991 * If not found, create the nfs4_file struct
3322 */ 3992 */
3323 fp = find_file(ino); 3993 fp = find_or_add_file(open->op_file, &current_fh->fh_handle);
3324 if (fp) { 3994 if (fp != open->op_file) {
3325 if ((status = nfs4_check_open(fp, open, &stp)))
3326 goto out;
3327 status = nfs4_check_deleg(cl, open, &dp); 3995 status = nfs4_check_deleg(cl, open, &dp);
3328 if (status) 3996 if (status)
3329 goto out; 3997 goto out;
3998 stp = nfsd4_find_existing_open(fp, open);
3330 } else { 3999 } else {
4000 open->op_file = NULL;
3331 status = nfserr_bad_stateid; 4001 status = nfserr_bad_stateid;
3332 if (nfsd4_is_deleg_cur(open)) 4002 if (nfsd4_is_deleg_cur(open))
3333 goto out; 4003 goto out;
3334 status = nfserr_jukebox; 4004 status = nfserr_jukebox;
3335 fp = open->op_file;
3336 open->op_file = NULL;
3337 nfsd4_init_file(fp, ino);
3338 } 4005 }
3339 4006
3340 /* 4007 /*
@@ -3347,22 +4014,19 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
3347 if (status) 4014 if (status)
3348 goto out; 4015 goto out;
3349 } else { 4016 } else {
3350 status = nfs4_get_vfs_file(rqstp, fp, current_fh, open);
3351 if (status)
3352 goto out;
3353 status = nfsd4_truncate(rqstp, current_fh, open);
3354 if (status)
3355 goto out;
3356 stp = open->op_stp; 4017 stp = open->op_stp;
3357 open->op_stp = NULL; 4018 open->op_stp = NULL;
3358 init_open_stateid(stp, fp, open); 4019 init_open_stateid(stp, fp, open);
4020 status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
4021 if (status) {
4022 release_open_stateid(stp);
4023 goto out;
4024 }
3359 } 4025 }
3360 update_stateid(&stp->st_stid.sc_stateid); 4026 update_stateid(&stp->st_stid.sc_stateid);
3361 memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 4027 memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
3362 4028
3363 if (nfsd4_has_session(&resp->cstate)) { 4029 if (nfsd4_has_session(&resp->cstate)) {
3364 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
3365
3366 if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { 4030 if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
3367 open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; 4031 open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT;
3368 open->op_why_no_deleg = WND4_NOT_WANTED; 4032 open->op_why_no_deleg = WND4_NOT_WANTED;
@@ -3374,7 +4038,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
3374 * Attempt to hand out a delegation. No error return, because the 4038 * Attempt to hand out a delegation. No error return, because the
3375 * OPEN succeeds even if we fail. 4039 * OPEN succeeds even if we fail.
3376 */ 4040 */
3377 nfs4_open_delegation(SVC_NET(rqstp), current_fh, open, stp); 4041 nfs4_open_delegation(current_fh, open, stp);
3378nodeleg: 4042nodeleg:
3379 status = nfs_ok; 4043 status = nfs_ok;
3380 4044
@@ -3397,41 +4061,27 @@ out:
3397 if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) && 4061 if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) &&
3398 !nfsd4_has_session(&resp->cstate)) 4062 !nfsd4_has_session(&resp->cstate))
3399 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; 4063 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
4064 if (dp)
4065 nfs4_put_stid(&dp->dl_stid);
4066 if (stp)
4067 nfs4_put_stid(&stp->st_stid);
3400 4068
3401 return status; 4069 return status;
3402} 4070}
3403 4071
3404void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status) 4072void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
4073 struct nfsd4_open *open, __be32 status)
3405{ 4074{
3406 if (open->op_openowner) { 4075 if (open->op_openowner) {
3407 struct nfs4_openowner *oo = open->op_openowner; 4076 struct nfs4_stateowner *so = &open->op_openowner->oo_owner;
3408 4077
3409 if (!list_empty(&oo->oo_owner.so_stateids)) 4078 nfsd4_cstate_assign_replay(cstate, so);
3410 list_del_init(&oo->oo_close_lru); 4079 nfs4_put_stateowner(so);
3411 if (oo->oo_flags & NFS4_OO_NEW) {
3412 if (status) {
3413 release_openowner(oo);
3414 open->op_openowner = NULL;
3415 } else
3416 oo->oo_flags &= ~NFS4_OO_NEW;
3417 }
3418 } 4080 }
3419 if (open->op_file) 4081 if (open->op_file)
3420 nfsd4_free_file(open->op_file); 4082 nfsd4_free_file(open->op_file);
3421 if (open->op_stp) 4083 if (open->op_stp)
3422 free_generic_stateid(open->op_stp); 4084 nfs4_put_stid(&open->op_stp->st_stid);
3423}
3424
3425static __be32 lookup_clientid(clientid_t *clid, bool session, struct nfsd_net *nn, struct nfs4_client **clp)
3426{
3427 struct nfs4_client *found;
3428
3429 if (STALE_CLIENTID(clid, nn))
3430 return nfserr_stale_clientid;
3431 found = find_confirmed_client(clid, session, nn);
3432 if (clp)
3433 *clp = found;
3434 return found ? nfs_ok : nfserr_expired;
3435} 4085}
3436 4086
3437__be32 4087__be32
@@ -3442,19 +4092,18 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3442 __be32 status; 4092 __be32 status;
3443 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); 4093 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
3444 4094
3445 nfs4_lock_state();
3446 dprintk("process_renew(%08x/%08x): starting\n", 4095 dprintk("process_renew(%08x/%08x): starting\n",
3447 clid->cl_boot, clid->cl_id); 4096 clid->cl_boot, clid->cl_id);
3448 status = lookup_clientid(clid, cstate->minorversion, nn, &clp); 4097 status = lookup_clientid(clid, cstate, nn);
3449 if (status) 4098 if (status)
3450 goto out; 4099 goto out;
4100 clp = cstate->clp;
3451 status = nfserr_cb_path_down; 4101 status = nfserr_cb_path_down;
3452 if (!list_empty(&clp->cl_delegations) 4102 if (!list_empty(&clp->cl_delegations)
3453 && clp->cl_cb_state != NFSD4_CB_UP) 4103 && clp->cl_cb_state != NFSD4_CB_UP)
3454 goto out; 4104 goto out;
3455 status = nfs_ok; 4105 status = nfs_ok;
3456out: 4106out:
3457 nfs4_unlock_state();
3458 return status; 4107 return status;
3459} 4108}
3460 4109
@@ -3483,12 +4132,11 @@ nfs4_laundromat(struct nfsd_net *nn)
3483 struct nfs4_client *clp; 4132 struct nfs4_client *clp;
3484 struct nfs4_openowner *oo; 4133 struct nfs4_openowner *oo;
3485 struct nfs4_delegation *dp; 4134 struct nfs4_delegation *dp;
4135 struct nfs4_ol_stateid *stp;
3486 struct list_head *pos, *next, reaplist; 4136 struct list_head *pos, *next, reaplist;
3487 time_t cutoff = get_seconds() - nn->nfsd4_lease; 4137 time_t cutoff = get_seconds() - nn->nfsd4_lease;
3488 time_t t, new_timeo = nn->nfsd4_lease; 4138 time_t t, new_timeo = nn->nfsd4_lease;
3489 4139
3490 nfs4_lock_state();
3491
3492 dprintk("NFSD: laundromat service - starting\n"); 4140 dprintk("NFSD: laundromat service - starting\n");
3493 nfsd4_end_grace(nn); 4141 nfsd4_end_grace(nn);
3494 INIT_LIST_HEAD(&reaplist); 4142 INIT_LIST_HEAD(&reaplist);
@@ -3505,13 +4153,14 @@ nfs4_laundromat(struct nfsd_net *nn)
3505 clp->cl_clientid.cl_id); 4153 clp->cl_clientid.cl_id);
3506 continue; 4154 continue;
3507 } 4155 }
3508 list_move(&clp->cl_lru, &reaplist); 4156 list_add(&clp->cl_lru, &reaplist);
3509 } 4157 }
3510 spin_unlock(&nn->client_lock); 4158 spin_unlock(&nn->client_lock);
3511 list_for_each_safe(pos, next, &reaplist) { 4159 list_for_each_safe(pos, next, &reaplist) {
3512 clp = list_entry(pos, struct nfs4_client, cl_lru); 4160 clp = list_entry(pos, struct nfs4_client, cl_lru);
3513 dprintk("NFSD: purging unused client (clientid %08x)\n", 4161 dprintk("NFSD: purging unused client (clientid %08x)\n",
3514 clp->cl_clientid.cl_id); 4162 clp->cl_clientid.cl_id);
4163 list_del_init(&clp->cl_lru);
3515 expire_client(clp); 4164 expire_client(clp);
3516 } 4165 }
3517 spin_lock(&state_lock); 4166 spin_lock(&state_lock);
@@ -3524,24 +4173,37 @@ nfs4_laundromat(struct nfsd_net *nn)
3524 new_timeo = min(new_timeo, t); 4173 new_timeo = min(new_timeo, t);
3525 break; 4174 break;
3526 } 4175 }
3527 list_move(&dp->dl_recall_lru, &reaplist); 4176 unhash_delegation_locked(dp);
4177 list_add(&dp->dl_recall_lru, &reaplist);
3528 } 4178 }
3529 spin_unlock(&state_lock); 4179 spin_unlock(&state_lock);
3530 list_for_each_safe(pos, next, &reaplist) { 4180 while (!list_empty(&reaplist)) {
3531 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 4181 dp = list_first_entry(&reaplist, struct nfs4_delegation,
4182 dl_recall_lru);
4183 list_del_init(&dp->dl_recall_lru);
3532 revoke_delegation(dp); 4184 revoke_delegation(dp);
3533 } 4185 }
3534 list_for_each_safe(pos, next, &nn->close_lru) { 4186
3535 oo = container_of(pos, struct nfs4_openowner, oo_close_lru); 4187 spin_lock(&nn->client_lock);
3536 if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) { 4188 while (!list_empty(&nn->close_lru)) {
4189 oo = list_first_entry(&nn->close_lru, struct nfs4_openowner,
4190 oo_close_lru);
4191 if (time_after((unsigned long)oo->oo_time,
4192 (unsigned long)cutoff)) {
3537 t = oo->oo_time - cutoff; 4193 t = oo->oo_time - cutoff;
3538 new_timeo = min(new_timeo, t); 4194 new_timeo = min(new_timeo, t);
3539 break; 4195 break;
3540 } 4196 }
3541 release_openowner(oo); 4197 list_del_init(&oo->oo_close_lru);
4198 stp = oo->oo_last_closed_stid;
4199 oo->oo_last_closed_stid = NULL;
4200 spin_unlock(&nn->client_lock);
4201 nfs4_put_stid(&stp->st_stid);
4202 spin_lock(&nn->client_lock);
3542 } 4203 }
4204 spin_unlock(&nn->client_lock);
4205
3543 new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); 4206 new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
3544 nfs4_unlock_state();
3545 return new_timeo; 4207 return new_timeo;
3546} 4208}
3547 4209
@@ -3564,7 +4226,7 @@ laundromat_main(struct work_struct *laundry)
3564 4226
3565static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) 4227static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
3566{ 4228{
3567 if (fhp->fh_dentry->d_inode != stp->st_file->fi_inode) 4229 if (!nfsd_fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
3568 return nfserr_bad_stateid; 4230 return nfserr_bad_stateid;
3569 return nfs_ok; 4231 return nfs_ok;
3570} 4232}
@@ -3666,10 +4328,10 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
3666{ 4328{
3667 struct nfs4_stid *s; 4329 struct nfs4_stid *s;
3668 struct nfs4_ol_stateid *ols; 4330 struct nfs4_ol_stateid *ols;
3669 __be32 status; 4331 __be32 status = nfserr_bad_stateid;
3670 4332
3671 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 4333 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
3672 return nfserr_bad_stateid; 4334 return status;
3673 /* Client debugging aid. */ 4335 /* Client debugging aid. */
3674 if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) { 4336 if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) {
3675 char addr_str[INET6_ADDRSTRLEN]; 4337 char addr_str[INET6_ADDRSTRLEN];
@@ -3677,53 +4339,62 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
3677 sizeof(addr_str)); 4339 sizeof(addr_str));
3678 pr_warn_ratelimited("NFSD: client %s testing state ID " 4340 pr_warn_ratelimited("NFSD: client %s testing state ID "
3679 "with incorrect client ID\n", addr_str); 4341 "with incorrect client ID\n", addr_str);
3680 return nfserr_bad_stateid; 4342 return status;
3681 } 4343 }
3682 s = find_stateid(cl, stateid); 4344 spin_lock(&cl->cl_lock);
4345 s = find_stateid_locked(cl, stateid);
3683 if (!s) 4346 if (!s)
3684 return nfserr_bad_stateid; 4347 goto out_unlock;
3685 status = check_stateid_generation(stateid, &s->sc_stateid, 1); 4348 status = check_stateid_generation(stateid, &s->sc_stateid, 1);
3686 if (status) 4349 if (status)
3687 return status; 4350 goto out_unlock;
3688 switch (s->sc_type) { 4351 switch (s->sc_type) {
3689 case NFS4_DELEG_STID: 4352 case NFS4_DELEG_STID:
3690 return nfs_ok; 4353 status = nfs_ok;
4354 break;
3691 case NFS4_REVOKED_DELEG_STID: 4355 case NFS4_REVOKED_DELEG_STID:
3692 return nfserr_deleg_revoked; 4356 status = nfserr_deleg_revoked;
4357 break;
3693 case NFS4_OPEN_STID: 4358 case NFS4_OPEN_STID:
3694 case NFS4_LOCK_STID: 4359 case NFS4_LOCK_STID:
3695 ols = openlockstateid(s); 4360 ols = openlockstateid(s);
3696 if (ols->st_stateowner->so_is_open_owner 4361 if (ols->st_stateowner->so_is_open_owner
3697 && !(openowner(ols->st_stateowner)->oo_flags 4362 && !(openowner(ols->st_stateowner)->oo_flags
3698 & NFS4_OO_CONFIRMED)) 4363 & NFS4_OO_CONFIRMED))
3699 return nfserr_bad_stateid; 4364 status = nfserr_bad_stateid;
3700 return nfs_ok; 4365 else
4366 status = nfs_ok;
4367 break;
3701 default: 4368 default:
3702 printk("unknown stateid type %x\n", s->sc_type); 4369 printk("unknown stateid type %x\n", s->sc_type);
4370 /* Fallthrough */
3703 case NFS4_CLOSED_STID: 4371 case NFS4_CLOSED_STID:
3704 return nfserr_bad_stateid; 4372 case NFS4_CLOSED_DELEG_STID:
4373 status = nfserr_bad_stateid;
3705 } 4374 }
4375out_unlock:
4376 spin_unlock(&cl->cl_lock);
4377 return status;
3706} 4378}
3707 4379
3708static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, 4380static __be32
3709 struct nfs4_stid **s, bool sessions, 4381nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
3710 struct nfsd_net *nn) 4382 stateid_t *stateid, unsigned char typemask,
4383 struct nfs4_stid **s, struct nfsd_net *nn)
3711{ 4384{
3712 struct nfs4_client *cl;
3713 __be32 status; 4385 __be32 status;
3714 4386
3715 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 4387 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
3716 return nfserr_bad_stateid; 4388 return nfserr_bad_stateid;
3717 status = lookup_clientid(&stateid->si_opaque.so_clid, sessions, 4389 status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn);
3718 nn, &cl);
3719 if (status == nfserr_stale_clientid) { 4390 if (status == nfserr_stale_clientid) {
3720 if (sessions) 4391 if (cstate->session)
3721 return nfserr_bad_stateid; 4392 return nfserr_bad_stateid;
3722 return nfserr_stale_stateid; 4393 return nfserr_stale_stateid;
3723 } 4394 }
3724 if (status) 4395 if (status)
3725 return status; 4396 return status;
3726 *s = find_stateid_by_type(cl, stateid, typemask); 4397 *s = find_stateid_by_type(cstate->clp, stateid, typemask);
3727 if (!*s) 4398 if (!*s)
3728 return nfserr_bad_stateid; 4399 return nfserr_bad_stateid;
3729 return nfs_ok; 4400 return nfs_ok;
@@ -3754,12 +4425,11 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
3754 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 4425 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
3755 return check_special_stateids(net, current_fh, stateid, flags); 4426 return check_special_stateids(net, current_fh, stateid, flags);
3756 4427
3757 nfs4_lock_state(); 4428 status = nfsd4_lookup_stateid(cstate, stateid,
3758 4429 NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
3759 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, 4430 &s, nn);
3760 &s, cstate->minorversion, nn);
3761 if (status) 4431 if (status)
3762 goto out; 4432 return status;
3763 status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); 4433 status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
3764 if (status) 4434 if (status)
3765 goto out; 4435 goto out;
@@ -3770,12 +4440,13 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
3770 if (status) 4440 if (status)
3771 goto out; 4441 goto out;
3772 if (filpp) { 4442 if (filpp) {
3773 file = dp->dl_file->fi_deleg_file; 4443 file = dp->dl_stid.sc_file->fi_deleg_file;
3774 if (!file) { 4444 if (!file) {
3775 WARN_ON_ONCE(1); 4445 WARN_ON_ONCE(1);
3776 status = nfserr_serverfault; 4446 status = nfserr_serverfault;
3777 goto out; 4447 goto out;
3778 } 4448 }
4449 get_file(file);
3779 } 4450 }
3780 break; 4451 break;
3781 case NFS4_OPEN_STID: 4452 case NFS4_OPEN_STID:
@@ -3791,10 +4462,12 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
3791 if (status) 4462 if (status)
3792 goto out; 4463 goto out;
3793 if (filpp) { 4464 if (filpp) {
4465 struct nfs4_file *fp = stp->st_stid.sc_file;
4466
3794 if (flags & RD_STATE) 4467 if (flags & RD_STATE)
3795 file = find_readable_file(stp->st_file); 4468 file = find_readable_file(fp);
3796 else 4469 else
3797 file = find_writeable_file(stp->st_file); 4470 file = find_writeable_file(fp);
3798 } 4471 }
3799 break; 4472 break;
3800 default: 4473 default:
@@ -3803,28 +4476,12 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
3803 } 4476 }
3804 status = nfs_ok; 4477 status = nfs_ok;
3805 if (file) 4478 if (file)
3806 *filpp = get_file(file); 4479 *filpp = file;
3807out: 4480out:
3808 nfs4_unlock_state(); 4481 nfs4_put_stid(s);
3809 return status; 4482 return status;
3810} 4483}
3811 4484
3812static __be32
3813nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp)
3814{
3815 struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
3816
3817 if (check_for_locks(stp->st_file, lo))
3818 return nfserr_locks_held;
3819 /*
3820 * Currently there's a 1-1 lock stateid<->lockowner
3821 * correspondance, and we have to delete the lockowner when we
3822 * delete the lock stateid:
3823 */
3824 release_lockowner(lo);
3825 return nfs_ok;
3826}
3827
3828/* 4485/*
3829 * Test if the stateid is valid 4486 * Test if the stateid is valid
3830 */ 4487 */
@@ -3835,11 +4492,9 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3835 struct nfsd4_test_stateid_id *stateid; 4492 struct nfsd4_test_stateid_id *stateid;
3836 struct nfs4_client *cl = cstate->session->se_client; 4493 struct nfs4_client *cl = cstate->session->se_client;
3837 4494
3838 nfs4_lock_state();
3839 list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list) 4495 list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list)
3840 stateid->ts_id_status = 4496 stateid->ts_id_status =
3841 nfsd4_validate_stateid(cl, &stateid->ts_id_stateid); 4497 nfsd4_validate_stateid(cl, &stateid->ts_id_stateid);
3842 nfs4_unlock_state();
3843 4498
3844 return nfs_ok; 4499 return nfs_ok;
3845} 4500}
@@ -3851,37 +4506,50 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3851 stateid_t *stateid = &free_stateid->fr_stateid; 4506 stateid_t *stateid = &free_stateid->fr_stateid;
3852 struct nfs4_stid *s; 4507 struct nfs4_stid *s;
3853 struct nfs4_delegation *dp; 4508 struct nfs4_delegation *dp;
4509 struct nfs4_ol_stateid *stp;
3854 struct nfs4_client *cl = cstate->session->se_client; 4510 struct nfs4_client *cl = cstate->session->se_client;
3855 __be32 ret = nfserr_bad_stateid; 4511 __be32 ret = nfserr_bad_stateid;
3856 4512
3857 nfs4_lock_state(); 4513 spin_lock(&cl->cl_lock);
3858 s = find_stateid(cl, stateid); 4514 s = find_stateid_locked(cl, stateid);
3859 if (!s) 4515 if (!s)
3860 goto out; 4516 goto out_unlock;
3861 switch (s->sc_type) { 4517 switch (s->sc_type) {
3862 case NFS4_DELEG_STID: 4518 case NFS4_DELEG_STID:
3863 ret = nfserr_locks_held; 4519 ret = nfserr_locks_held;
3864 goto out; 4520 break;
3865 case NFS4_OPEN_STID: 4521 case NFS4_OPEN_STID:
3866 case NFS4_LOCK_STID:
3867 ret = check_stateid_generation(stateid, &s->sc_stateid, 1); 4522 ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
3868 if (ret) 4523 if (ret)
3869 goto out; 4524 break;
3870 if (s->sc_type == NFS4_LOCK_STID) 4525 ret = nfserr_locks_held;
3871 ret = nfsd4_free_lock_stateid(openlockstateid(s));
3872 else
3873 ret = nfserr_locks_held;
3874 break; 4526 break;
4527 case NFS4_LOCK_STID:
4528 ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
4529 if (ret)
4530 break;
4531 stp = openlockstateid(s);
4532 ret = nfserr_locks_held;
4533 if (check_for_locks(stp->st_stid.sc_file,
4534 lockowner(stp->st_stateowner)))
4535 break;
4536 unhash_lock_stateid(stp);
4537 spin_unlock(&cl->cl_lock);
4538 nfs4_put_stid(s);
4539 ret = nfs_ok;
4540 goto out;
3875 case NFS4_REVOKED_DELEG_STID: 4541 case NFS4_REVOKED_DELEG_STID:
3876 dp = delegstateid(s); 4542 dp = delegstateid(s);
3877 destroy_revoked_delegation(dp); 4543 list_del_init(&dp->dl_recall_lru);
4544 spin_unlock(&cl->cl_lock);
4545 nfs4_put_stid(s);
3878 ret = nfs_ok; 4546 ret = nfs_ok;
3879 break; 4547 goto out;
3880 default: 4548 /* Default falls through and returns nfserr_bad_stateid */
3881 ret = nfserr_bad_stateid;
3882 } 4549 }
4550out_unlock:
4551 spin_unlock(&cl->cl_lock);
3883out: 4552out:
3884 nfs4_unlock_state();
3885 return ret; 4553 return ret;
3886} 4554}
3887 4555
@@ -3926,20 +4594,24 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3926{ 4594{
3927 __be32 status; 4595 __be32 status;
3928 struct nfs4_stid *s; 4596 struct nfs4_stid *s;
4597 struct nfs4_ol_stateid *stp = NULL;
3929 4598
3930 dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, 4599 dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__,
3931 seqid, STATEID_VAL(stateid)); 4600 seqid, STATEID_VAL(stateid));
3932 4601
3933 *stpp = NULL; 4602 *stpp = NULL;
3934 status = nfsd4_lookup_stateid(stateid, typemask, &s, 4603 status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
3935 cstate->minorversion, nn);
3936 if (status) 4604 if (status)
3937 return status; 4605 return status;
3938 *stpp = openlockstateid(s); 4606 stp = openlockstateid(s);
3939 if (!nfsd4_has_session(cstate)) 4607 nfsd4_cstate_assign_replay(cstate, stp->st_stateowner);
3940 cstate->replay_owner = (*stpp)->st_stateowner;
3941 4608
3942 return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); 4609 status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp);
4610 if (!status)
4611 *stpp = stp;
4612 else
4613 nfs4_put_stid(&stp->st_stid);
4614 return status;
3943} 4615}
3944 4616
3945static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, 4617static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
@@ -3947,14 +4619,18 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs
3947{ 4619{
3948 __be32 status; 4620 __be32 status;
3949 struct nfs4_openowner *oo; 4621 struct nfs4_openowner *oo;
4622 struct nfs4_ol_stateid *stp;
3950 4623
3951 status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, 4624 status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
3952 NFS4_OPEN_STID, stpp, nn); 4625 NFS4_OPEN_STID, &stp, nn);
3953 if (status) 4626 if (status)
3954 return status; 4627 return status;
3955 oo = openowner((*stpp)->st_stateowner); 4628 oo = openowner(stp->st_stateowner);
3956 if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) 4629 if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
4630 nfs4_put_stid(&stp->st_stid);
3957 return nfserr_bad_stateid; 4631 return nfserr_bad_stateid;
4632 }
4633 *stpp = stp;
3958 return nfs_ok; 4634 return nfs_ok;
3959} 4635}
3960 4636
@@ -3974,8 +4650,6 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3974 if (status) 4650 if (status)
3975 return status; 4651 return status;
3976 4652
3977 nfs4_lock_state();
3978
3979 status = nfs4_preprocess_seqid_op(cstate, 4653 status = nfs4_preprocess_seqid_op(cstate,
3980 oc->oc_seqid, &oc->oc_req_stateid, 4654 oc->oc_seqid, &oc->oc_req_stateid,
3981 NFS4_OPEN_STID, &stp, nn); 4655 NFS4_OPEN_STID, &stp, nn);
@@ -3984,7 +4658,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3984 oo = openowner(stp->st_stateowner); 4658 oo = openowner(stp->st_stateowner);
3985 status = nfserr_bad_stateid; 4659 status = nfserr_bad_stateid;
3986 if (oo->oo_flags & NFS4_OO_CONFIRMED) 4660 if (oo->oo_flags & NFS4_OO_CONFIRMED)
3987 goto out; 4661 goto put_stateid;
3988 oo->oo_flags |= NFS4_OO_CONFIRMED; 4662 oo->oo_flags |= NFS4_OO_CONFIRMED;
3989 update_stateid(&stp->st_stid.sc_stateid); 4663 update_stateid(&stp->st_stid.sc_stateid);
3990 memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 4664 memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
@@ -3993,10 +4667,10 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3993 4667
3994 nfsd4_client_record_create(oo->oo_owner.so_client); 4668 nfsd4_client_record_create(oo->oo_owner.so_client);
3995 status = nfs_ok; 4669 status = nfs_ok;
4670put_stateid:
4671 nfs4_put_stid(&stp->st_stid);
3996out: 4672out:
3997 nfsd4_bump_seqid(cstate, status); 4673 nfsd4_bump_seqid(cstate, status);
3998 if (!cstate->replay_owner)
3999 nfs4_unlock_state();
4000 return status; 4674 return status;
4001} 4675}
4002 4676
@@ -4004,7 +4678,7 @@ static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 a
4004{ 4678{
4005 if (!test_access(access, stp)) 4679 if (!test_access(access, stp))
4006 return; 4680 return;
4007 nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access)); 4681 nfs4_file_put_access(stp->st_stid.sc_file, access);
4008 clear_access(access, stp); 4682 clear_access(access, stp);
4009} 4683}
4010 4684
@@ -4026,16 +4700,6 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac
4026 } 4700 }
4027} 4701}
4028 4702
4029static void
4030reset_union_bmap_deny(unsigned long deny, struct nfs4_ol_stateid *stp)
4031{
4032 int i;
4033 for (i = 0; i < 4; i++) {
4034 if ((i & deny) != i)
4035 clear_deny(i, stp);
4036 }
4037}
4038
4039__be32 4703__be32
4040nfsd4_open_downgrade(struct svc_rqst *rqstp, 4704nfsd4_open_downgrade(struct svc_rqst *rqstp,
4041 struct nfsd4_compound_state *cstate, 4705 struct nfsd4_compound_state *cstate,
@@ -4053,21 +4717,20 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
4053 dprintk("NFSD: %s: od_deleg_want=0x%x ignored\n", __func__, 4717 dprintk("NFSD: %s: od_deleg_want=0x%x ignored\n", __func__,
4054 od->od_deleg_want); 4718 od->od_deleg_want);
4055 4719
4056 nfs4_lock_state();
4057 status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, 4720 status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid,
4058 &od->od_stateid, &stp, nn); 4721 &od->od_stateid, &stp, nn);
4059 if (status) 4722 if (status)
4060 goto out; 4723 goto out;
4061 status = nfserr_inval; 4724 status = nfserr_inval;
4062 if (!test_access(od->od_share_access, stp)) { 4725 if (!test_access(od->od_share_access, stp)) {
4063 dprintk("NFSD: access not a subset current bitmap: 0x%lx, input access=%08x\n", 4726 dprintk("NFSD: access not a subset of current bitmap: 0x%hhx, input access=%08x\n",
4064 stp->st_access_bmap, od->od_share_access); 4727 stp->st_access_bmap, od->od_share_access);
4065 goto out; 4728 goto put_stateid;
4066 } 4729 }
4067 if (!test_deny(od->od_share_deny, stp)) { 4730 if (!test_deny(od->od_share_deny, stp)) {
4068 dprintk("NFSD:deny not a subset current bitmap: 0x%lx, input deny=%08x\n", 4731 dprintk("NFSD: deny not a subset of current bitmap: 0x%hhx, input deny=%08x\n",
4069 stp->st_deny_bmap, od->od_share_deny); 4732 stp->st_deny_bmap, od->od_share_deny);
4070 goto out; 4733 goto put_stateid;
4071 } 4734 }
4072 nfs4_stateid_downgrade(stp, od->od_share_access); 4735 nfs4_stateid_downgrade(stp, od->od_share_access);
4073 4736
@@ -4076,17 +4739,31 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
4076 update_stateid(&stp->st_stid.sc_stateid); 4739 update_stateid(&stp->st_stid.sc_stateid);
4077 memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 4740 memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
4078 status = nfs_ok; 4741 status = nfs_ok;
4742put_stateid:
4743 nfs4_put_stid(&stp->st_stid);
4079out: 4744out:
4080 nfsd4_bump_seqid(cstate, status); 4745 nfsd4_bump_seqid(cstate, status);
4081 if (!cstate->replay_owner)
4082 nfs4_unlock_state();
4083 return status; 4746 return status;
4084} 4747}
4085 4748
4086static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) 4749static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
4087{ 4750{
4088 unhash_open_stateid(s); 4751 struct nfs4_client *clp = s->st_stid.sc_client;
4752 LIST_HEAD(reaplist);
4753
4089 s->st_stid.sc_type = NFS4_CLOSED_STID; 4754 s->st_stid.sc_type = NFS4_CLOSED_STID;
4755 spin_lock(&clp->cl_lock);
4756 unhash_open_stateid(s, &reaplist);
4757
4758 if (clp->cl_minorversion) {
4759 put_ol_stateid_locked(s, &reaplist);
4760 spin_unlock(&clp->cl_lock);
4761 free_ol_stateid_reaplist(&reaplist);
4762 } else {
4763 spin_unlock(&clp->cl_lock);
4764 free_ol_stateid_reaplist(&reaplist);
4765 move_to_close_lru(s, clp->net);
4766 }
4090} 4767}
4091 4768
4092/* 4769/*
@@ -4097,7 +4774,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4097 struct nfsd4_close *close) 4774 struct nfsd4_close *close)
4098{ 4775{
4099 __be32 status; 4776 __be32 status;
4100 struct nfs4_openowner *oo;
4101 struct nfs4_ol_stateid *stp; 4777 struct nfs4_ol_stateid *stp;
4102 struct net *net = SVC_NET(rqstp); 4778 struct net *net = SVC_NET(rqstp);
4103 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 4779 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -4105,7 +4781,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4105 dprintk("NFSD: nfsd4_close on file %pd\n", 4781 dprintk("NFSD: nfsd4_close on file %pd\n",
4106 cstate->current_fh.fh_dentry); 4782 cstate->current_fh.fh_dentry);
4107 4783
4108 nfs4_lock_state();
4109 status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, 4784 status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
4110 &close->cl_stateid, 4785 &close->cl_stateid,
4111 NFS4_OPEN_STID|NFS4_CLOSED_STID, 4786 NFS4_OPEN_STID|NFS4_CLOSED_STID,
@@ -4113,31 +4788,14 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4113 nfsd4_bump_seqid(cstate, status); 4788 nfsd4_bump_seqid(cstate, status);
4114 if (status) 4789 if (status)
4115 goto out; 4790 goto out;
4116 oo = openowner(stp->st_stateowner);
4117 update_stateid(&stp->st_stid.sc_stateid); 4791 update_stateid(&stp->st_stid.sc_stateid);
4118 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 4792 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
4119 4793
4120 nfsd4_close_open_stateid(stp); 4794 nfsd4_close_open_stateid(stp);
4121 4795
4122 if (cstate->minorversion) 4796 /* put reference from nfs4_preprocess_seqid_op */
4123 free_generic_stateid(stp); 4797 nfs4_put_stid(&stp->st_stid);
4124 else
4125 oo->oo_last_closed_stid = stp;
4126
4127 if (list_empty(&oo->oo_owner.so_stateids)) {
4128 if (cstate->minorversion)
4129 release_openowner(oo);
4130 else {
4131 /*
4132 * In the 4.0 case we need to keep the owners around a
4133 * little while to handle CLOSE replay.
4134 */
4135 move_to_close_lru(oo, SVC_NET(rqstp));
4136 }
4137 }
4138out: 4798out:
4139 if (!cstate->replay_owner)
4140 nfs4_unlock_state();
4141 return status; 4799 return status;
4142} 4800}
4143 4801
@@ -4154,28 +4812,24 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4154 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) 4812 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
4155 return status; 4813 return status;
4156 4814
4157 nfs4_lock_state(); 4815 status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID, &s, nn);
4158 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s,
4159 cstate->minorversion, nn);
4160 if (status) 4816 if (status)
4161 goto out; 4817 goto out;
4162 dp = delegstateid(s); 4818 dp = delegstateid(s);
4163 status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); 4819 status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate));
4164 if (status) 4820 if (status)
4165 goto out; 4821 goto put_stateid;
4166 4822
4167 destroy_delegation(dp); 4823 destroy_delegation(dp);
4824put_stateid:
4825 nfs4_put_stid(&dp->dl_stid);
4168out: 4826out:
4169 nfs4_unlock_state();
4170
4171 return status; 4827 return status;
4172} 4828}
4173 4829
4174 4830
4175#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) 4831#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start))
4176 4832
4177#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1)
4178
4179static inline u64 4833static inline u64
4180end_offset(u64 start, u64 len) 4834end_offset(u64 start, u64 len)
4181{ 4835{
@@ -4196,13 +4850,6 @@ last_byte_offset(u64 start, u64 len)
4196 return end > start ? end - 1: NFS4_MAX_UINT64; 4850 return end > start ? end - 1: NFS4_MAX_UINT64;
4197} 4851}
4198 4852
4199static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername)
4200{
4201 return (file_hashval(inode) + cl_id
4202 + opaque_hashval(ownername->data, ownername->len))
4203 & LOCKOWNER_INO_HASH_MASK;
4204}
4205
4206/* 4853/*
4207 * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that 4854 * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
4208 * we can't properly handle lock requests that go beyond the (2^63 - 1)-th 4855 * we can't properly handle lock requests that go beyond the (2^63 - 1)-th
@@ -4255,47 +4902,56 @@ nevermind:
4255 deny->ld_type = NFS4_WRITE_LT; 4902 deny->ld_type = NFS4_WRITE_LT;
4256} 4903}
4257 4904
4258static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) 4905static struct nfs4_lockowner *
4906find_lockowner_str_locked(clientid_t *clid, struct xdr_netobj *owner,
4907 struct nfs4_client *clp)
4259{ 4908{
4260 struct nfs4_ol_stateid *lst; 4909 unsigned int strhashval = ownerstr_hashval(owner);
4910 struct nfs4_stateowner *so;
4261 4911
4262 if (!same_owner_str(&lo->lo_owner, owner, clid)) 4912 lockdep_assert_held(&clp->cl_lock);
4263 return false; 4913
4264 if (list_empty(&lo->lo_owner.so_stateids)) { 4914 list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[strhashval],
4265 WARN_ON_ONCE(1); 4915 so_strhash) {
4266 return false; 4916 if (so->so_is_open_owner)
4917 continue;
4918 if (!same_owner_str(so, owner))
4919 continue;
4920 atomic_inc(&so->so_count);
4921 return lockowner(so);
4267 } 4922 }
4268 lst = list_first_entry(&lo->lo_owner.so_stateids, 4923 return NULL;
4269 struct nfs4_ol_stateid, st_perstateowner);
4270 return lst->st_file->fi_inode == inode;
4271} 4924}
4272 4925
4273static struct nfs4_lockowner * 4926static struct nfs4_lockowner *
4274find_lockowner_str(struct inode *inode, clientid_t *clid, 4927find_lockowner_str(clientid_t *clid, struct xdr_netobj *owner,
4275 struct xdr_netobj *owner, struct nfsd_net *nn) 4928 struct nfs4_client *clp)
4276{ 4929{
4277 unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner);
4278 struct nfs4_lockowner *lo; 4930 struct nfs4_lockowner *lo;
4279 4931
4280 list_for_each_entry(lo, &nn->lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) { 4932 spin_lock(&clp->cl_lock);
4281 if (same_lockowner_ino(lo, inode, clid, owner)) 4933 lo = find_lockowner_str_locked(clid, owner, clp);
4282 return lo; 4934 spin_unlock(&clp->cl_lock);
4283 } 4935 return lo;
4284 return NULL;
4285} 4936}
4286 4937
4287static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) 4938static void nfs4_unhash_lockowner(struct nfs4_stateowner *sop)
4288{ 4939{
4289 struct inode *inode = open_stp->st_file->fi_inode; 4940 unhash_lockowner_locked(lockowner(sop));
4290 unsigned int inohash = lockowner_ino_hashval(inode, 4941}
4291 clp->cl_clientid.cl_id, &lo->lo_owner.so_owner); 4942
4292 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 4943static void nfs4_free_lockowner(struct nfs4_stateowner *sop)
4944{
4945 struct nfs4_lockowner *lo = lockowner(sop);
4293 4946
4294 list_add(&lo->lo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]); 4947 kmem_cache_free(lockowner_slab, lo);
4295 list_add(&lo->lo_owner_ino_hash, &nn->lockowner_ino_hashtbl[inohash]);
4296 list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
4297} 4948}
4298 4949
4950static const struct nfs4_stateowner_operations lockowner_ops = {
4951 .so_unhash = nfs4_unhash_lockowner,
4952 .so_free = nfs4_free_lockowner,
4953};
4954
4299/* 4955/*
4300 * Alloc a lock owner structure. 4956 * Alloc a lock owner structure.
4301 * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has 4957 * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has
@@ -4303,42 +4959,107 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s
4303 * 4959 *
4304 * strhashval = ownerstr_hashval 4960 * strhashval = ownerstr_hashval
4305 */ 4961 */
4306
4307static struct nfs4_lockowner * 4962static struct nfs4_lockowner *
4308alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) { 4963alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp,
4309 struct nfs4_lockowner *lo; 4964 struct nfs4_ol_stateid *open_stp,
4965 struct nfsd4_lock *lock)
4966{
4967 struct nfs4_lockowner *lo, *ret;
4310 4968
4311 lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); 4969 lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp);
4312 if (!lo) 4970 if (!lo)
4313 return NULL; 4971 return NULL;
4314 INIT_LIST_HEAD(&lo->lo_owner.so_stateids); 4972 INIT_LIST_HEAD(&lo->lo_owner.so_stateids);
4315 lo->lo_owner.so_is_open_owner = 0; 4973 lo->lo_owner.so_is_open_owner = 0;
4316 /* It is the openowner seqid that will be incremented in encode in the 4974 lo->lo_owner.so_seqid = lock->lk_new_lock_seqid;
4317 * case of new lockowners; so increment the lock seqid manually: */ 4975 lo->lo_owner.so_ops = &lockowner_ops;
4318 lo->lo_owner.so_seqid = lock->lk_new_lock_seqid + 1; 4976 spin_lock(&clp->cl_lock);
4319 hash_lockowner(lo, strhashval, clp, open_stp); 4977 ret = find_lockowner_str_locked(&clp->cl_clientid,
4978 &lock->lk_new_owner, clp);
4979 if (ret == NULL) {
4980 list_add(&lo->lo_owner.so_strhash,
4981 &clp->cl_ownerstr_hashtbl[strhashval]);
4982 ret = lo;
4983 } else
4984 nfs4_free_lockowner(&lo->lo_owner);
4985 spin_unlock(&clp->cl_lock);
4320 return lo; 4986 return lo;
4321} 4987}
4322 4988
4323static struct nfs4_ol_stateid * 4989static void
4324alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp) 4990init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
4991 struct nfs4_file *fp, struct inode *inode,
4992 struct nfs4_ol_stateid *open_stp)
4325{ 4993{
4326 struct nfs4_ol_stateid *stp;
4327 struct nfs4_client *clp = lo->lo_owner.so_client; 4994 struct nfs4_client *clp = lo->lo_owner.so_client;
4328 4995
4329 stp = nfs4_alloc_stateid(clp); 4996 lockdep_assert_held(&clp->cl_lock);
4330 if (stp == NULL) 4997
4331 return NULL; 4998 atomic_inc(&stp->st_stid.sc_count);
4332 stp->st_stid.sc_type = NFS4_LOCK_STID; 4999 stp->st_stid.sc_type = NFS4_LOCK_STID;
4333 list_add(&stp->st_perfile, &fp->fi_stateids);
4334 list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
4335 stp->st_stateowner = &lo->lo_owner; 5000 stp->st_stateowner = &lo->lo_owner;
5001 atomic_inc(&lo->lo_owner.so_count);
4336 get_nfs4_file(fp); 5002 get_nfs4_file(fp);
4337 stp->st_file = fp; 5003 stp->st_stid.sc_file = fp;
5004 stp->st_stid.sc_free = nfs4_free_lock_stateid;
4338 stp->st_access_bmap = 0; 5005 stp->st_access_bmap = 0;
4339 stp->st_deny_bmap = open_stp->st_deny_bmap; 5006 stp->st_deny_bmap = open_stp->st_deny_bmap;
4340 stp->st_openstp = open_stp; 5007 stp->st_openstp = open_stp;
4341 return stp; 5008 list_add(&stp->st_locks, &open_stp->st_locks);
5009 list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
5010 spin_lock(&fp->fi_lock);
5011 list_add(&stp->st_perfile, &fp->fi_stateids);
5012 spin_unlock(&fp->fi_lock);
5013}
5014
5015static struct nfs4_ol_stateid *
5016find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp)
5017{
5018 struct nfs4_ol_stateid *lst;
5019 struct nfs4_client *clp = lo->lo_owner.so_client;
5020
5021 lockdep_assert_held(&clp->cl_lock);
5022
5023 list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) {
5024 if (lst->st_stid.sc_file == fp) {
5025 atomic_inc(&lst->st_stid.sc_count);
5026 return lst;
5027 }
5028 }
5029 return NULL;
5030}
5031
5032static struct nfs4_ol_stateid *
5033find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi,
5034 struct inode *inode, struct nfs4_ol_stateid *ost,
5035 bool *new)
5036{
5037 struct nfs4_stid *ns = NULL;
5038 struct nfs4_ol_stateid *lst;
5039 struct nfs4_openowner *oo = openowner(ost->st_stateowner);
5040 struct nfs4_client *clp = oo->oo_owner.so_client;
5041
5042 spin_lock(&clp->cl_lock);
5043 lst = find_lock_stateid(lo, fi);
5044 if (lst == NULL) {
5045 spin_unlock(&clp->cl_lock);
5046 ns = nfs4_alloc_stid(clp, stateid_slab);
5047 if (ns == NULL)
5048 return NULL;
5049
5050 spin_lock(&clp->cl_lock);
5051 lst = find_lock_stateid(lo, fi);
5052 if (likely(!lst)) {
5053 lst = openlockstateid(ns);
5054 init_lock_stateid(lst, lo, fi, inode, ost);
5055 ns = NULL;
5056 *new = true;
5057 }
5058 }
5059 spin_unlock(&clp->cl_lock);
5060 if (ns)
5061 nfs4_put_stid(ns);
5062 return lst;
4342} 5063}
4343 5064
4344static int 5065static int
@@ -4350,46 +5071,53 @@ check_lock_length(u64 offset, u64 length)
4350 5071
4351static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access) 5072static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
4352{ 5073{
4353 struct nfs4_file *fp = lock_stp->st_file; 5074 struct nfs4_file *fp = lock_stp->st_stid.sc_file;
4354 int oflag = nfs4_access_to_omode(access); 5075
5076 lockdep_assert_held(&fp->fi_lock);
4355 5077
4356 if (test_access(access, lock_stp)) 5078 if (test_access(access, lock_stp))
4357 return; 5079 return;
4358 nfs4_file_get_access(fp, oflag); 5080 __nfs4_file_get_access(fp, access);
4359 set_access(access, lock_stp); 5081 set_access(access, lock_stp);
4360} 5082}
4361 5083
4362static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new) 5084static __be32
5085lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
5086 struct nfs4_ol_stateid *ost,
5087 struct nfsd4_lock *lock,
5088 struct nfs4_ol_stateid **lst, bool *new)
4363{ 5089{
4364 struct nfs4_file *fi = ost->st_file; 5090 __be32 status;
5091 struct nfs4_file *fi = ost->st_stid.sc_file;
4365 struct nfs4_openowner *oo = openowner(ost->st_stateowner); 5092 struct nfs4_openowner *oo = openowner(ost->st_stateowner);
4366 struct nfs4_client *cl = oo->oo_owner.so_client; 5093 struct nfs4_client *cl = oo->oo_owner.so_client;
5094 struct inode *inode = cstate->current_fh.fh_dentry->d_inode;
4367 struct nfs4_lockowner *lo; 5095 struct nfs4_lockowner *lo;
4368 unsigned int strhashval; 5096 unsigned int strhashval;
4369 struct nfsd_net *nn = net_generic(cl->net, nfsd_net_id); 5097
4370 5098 lo = find_lockowner_str(&cl->cl_clientid, &lock->v.new.owner, cl);
4371 lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, 5099 if (!lo) {
4372 &lock->v.new.owner, nn); 5100 strhashval = ownerstr_hashval(&lock->v.new.owner);
4373 if (lo) { 5101 lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock);
4374 if (!cstate->minorversion) 5102 if (lo == NULL)
4375 return nfserr_bad_seqid; 5103 return nfserr_jukebox;
4376 /* XXX: a lockowner always has exactly one stateid: */ 5104 } else {
4377 *lst = list_first_entry(&lo->lo_owner.so_stateids, 5105 /* with an existing lockowner, seqids must be the same */
4378 struct nfs4_ol_stateid, st_perstateowner); 5106 status = nfserr_bad_seqid;
4379 return nfs_ok; 5107 if (!cstate->minorversion &&
5108 lock->lk_new_lock_seqid != lo->lo_owner.so_seqid)
5109 goto out;
4380 } 5110 }
4381 strhashval = ownerstr_hashval(cl->cl_clientid.cl_id, 5111
4382 &lock->v.new.owner); 5112 *lst = find_or_create_lock_stateid(lo, fi, inode, ost, new);
4383 lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock);
4384 if (lo == NULL)
4385 return nfserr_jukebox;
4386 *lst = alloc_init_lock_stateid(lo, fi, ost);
4387 if (*lst == NULL) { 5113 if (*lst == NULL) {
4388 release_lockowner(lo); 5114 status = nfserr_jukebox;
4389 return nfserr_jukebox; 5115 goto out;
4390 } 5116 }
4391 *new = true; 5117 status = nfs_ok;
4392 return nfs_ok; 5118out:
5119 nfs4_put_stateowner(&lo->lo_owner);
5120 return status;
4393} 5121}
4394 5122
4395/* 5123/*
@@ -4401,14 +5129,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4401{ 5129{
4402 struct nfs4_openowner *open_sop = NULL; 5130 struct nfs4_openowner *open_sop = NULL;
4403 struct nfs4_lockowner *lock_sop = NULL; 5131 struct nfs4_lockowner *lock_sop = NULL;
4404 struct nfs4_ol_stateid *lock_stp; 5132 struct nfs4_ol_stateid *lock_stp = NULL;
5133 struct nfs4_ol_stateid *open_stp = NULL;
5134 struct nfs4_file *fp;
4405 struct file *filp = NULL; 5135 struct file *filp = NULL;
4406 struct file_lock *file_lock = NULL; 5136 struct file_lock *file_lock = NULL;
4407 struct file_lock *conflock = NULL; 5137 struct file_lock *conflock = NULL;
4408 __be32 status = 0; 5138 __be32 status = 0;
4409 bool new_state = false;
4410 int lkflg; 5139 int lkflg;
4411 int err; 5140 int err;
5141 bool new = false;
4412 struct net *net = SVC_NET(rqstp); 5142 struct net *net = SVC_NET(rqstp);
4413 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 5143 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
4414 5144
@@ -4425,11 +5155,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4425 return status; 5155 return status;
4426 } 5156 }
4427 5157
4428 nfs4_lock_state();
4429
4430 if (lock->lk_is_new) { 5158 if (lock->lk_is_new) {
4431 struct nfs4_ol_stateid *open_stp = NULL;
4432
4433 if (nfsd4_has_session(cstate)) 5159 if (nfsd4_has_session(cstate))
4434 /* See rfc 5661 18.10.3: given clientid is ignored: */ 5160 /* See rfc 5661 18.10.3: given clientid is ignored: */
4435 memcpy(&lock->v.new.clientid, 5161 memcpy(&lock->v.new.clientid,
@@ -4453,12 +5179,13 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4453 &lock->v.new.clientid)) 5179 &lock->v.new.clientid))
4454 goto out; 5180 goto out;
4455 status = lookup_or_create_lock_state(cstate, open_stp, lock, 5181 status = lookup_or_create_lock_state(cstate, open_stp, lock,
4456 &lock_stp, &new_state); 5182 &lock_stp, &new);
4457 } else 5183 } else {
4458 status = nfs4_preprocess_seqid_op(cstate, 5184 status = nfs4_preprocess_seqid_op(cstate,
4459 lock->lk_old_lock_seqid, 5185 lock->lk_old_lock_seqid,
4460 &lock->lk_old_lock_stateid, 5186 &lock->lk_old_lock_stateid,
4461 NFS4_LOCK_STID, &lock_stp, nn); 5187 NFS4_LOCK_STID, &lock_stp, nn);
5188 }
4462 if (status) 5189 if (status)
4463 goto out; 5190 goto out;
4464 lock_sop = lockowner(lock_stp->st_stateowner); 5191 lock_sop = lockowner(lock_stp->st_stateowner);
@@ -4482,20 +5209,25 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4482 goto out; 5209 goto out;
4483 } 5210 }
4484 5211
5212 fp = lock_stp->st_stid.sc_file;
4485 locks_init_lock(file_lock); 5213 locks_init_lock(file_lock);
4486 switch (lock->lk_type) { 5214 switch (lock->lk_type) {
4487 case NFS4_READ_LT: 5215 case NFS4_READ_LT:
4488 case NFS4_READW_LT: 5216 case NFS4_READW_LT:
4489 filp = find_readable_file(lock_stp->st_file); 5217 spin_lock(&fp->fi_lock);
5218 filp = find_readable_file_locked(fp);
4490 if (filp) 5219 if (filp)
4491 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); 5220 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
5221 spin_unlock(&fp->fi_lock);
4492 file_lock->fl_type = F_RDLCK; 5222 file_lock->fl_type = F_RDLCK;
4493 break; 5223 break;
4494 case NFS4_WRITE_LT: 5224 case NFS4_WRITE_LT:
4495 case NFS4_WRITEW_LT: 5225 case NFS4_WRITEW_LT:
4496 filp = find_writeable_file(lock_stp->st_file); 5226 spin_lock(&fp->fi_lock);
5227 filp = find_writeable_file_locked(fp);
4497 if (filp) 5228 if (filp)
4498 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); 5229 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
5230 spin_unlock(&fp->fi_lock);
4499 file_lock->fl_type = F_WRLCK; 5231 file_lock->fl_type = F_WRLCK;
4500 break; 5232 break;
4501 default: 5233 default:
@@ -4544,11 +5276,27 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4544 break; 5276 break;
4545 } 5277 }
4546out: 5278out:
4547 if (status && new_state) 5279 if (filp)
4548 release_lockowner(lock_sop); 5280 fput(filp);
5281 if (lock_stp) {
5282 /* Bump seqid manually if the 4.0 replay owner is openowner */
5283 if (cstate->replay_owner &&
5284 cstate->replay_owner != &lock_sop->lo_owner &&
5285 seqid_mutating_err(ntohl(status)))
5286 lock_sop->lo_owner.so_seqid++;
5287
5288 /*
5289 * If this is a new, never-before-used stateid, and we are
5290 * returning an error, then just go ahead and release it.
5291 */
5292 if (status && new)
5293 release_lock_stateid(lock_stp);
5294
5295 nfs4_put_stid(&lock_stp->st_stid);
5296 }
5297 if (open_stp)
5298 nfs4_put_stid(&open_stp->st_stid);
4549 nfsd4_bump_seqid(cstate, status); 5299 nfsd4_bump_seqid(cstate, status);
4550 if (!cstate->replay_owner)
4551 nfs4_unlock_state();
4552 if (file_lock) 5300 if (file_lock)
4553 locks_free_lock(file_lock); 5301 locks_free_lock(file_lock);
4554 if (conflock) 5302 if (conflock)
@@ -4580,9 +5328,8 @@ __be32
4580nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 5328nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4581 struct nfsd4_lockt *lockt) 5329 struct nfsd4_lockt *lockt)
4582{ 5330{
4583 struct inode *inode;
4584 struct file_lock *file_lock = NULL; 5331 struct file_lock *file_lock = NULL;
4585 struct nfs4_lockowner *lo; 5332 struct nfs4_lockowner *lo = NULL;
4586 __be32 status; 5333 __be32 status;
4587 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); 5334 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
4588 5335
@@ -4592,10 +5339,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4592 if (check_lock_length(lockt->lt_offset, lockt->lt_length)) 5339 if (check_lock_length(lockt->lt_offset, lockt->lt_length))
4593 return nfserr_inval; 5340 return nfserr_inval;
4594 5341
4595 nfs4_lock_state();
4596
4597 if (!nfsd4_has_session(cstate)) { 5342 if (!nfsd4_has_session(cstate)) {
4598 status = lookup_clientid(&lockt->lt_clientid, false, nn, NULL); 5343 status = lookup_clientid(&lockt->lt_clientid, cstate, nn);
4599 if (status) 5344 if (status)
4600 goto out; 5345 goto out;
4601 } 5346 }
@@ -4603,7 +5348,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4603 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) 5348 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
4604 goto out; 5349 goto out;
4605 5350
4606 inode = cstate->current_fh.fh_dentry->d_inode;
4607 file_lock = locks_alloc_lock(); 5351 file_lock = locks_alloc_lock();
4608 if (!file_lock) { 5352 if (!file_lock) {
4609 dprintk("NFSD: %s: unable to allocate lock!\n", __func__); 5353 dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
@@ -4626,7 +5370,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4626 goto out; 5370 goto out;
4627 } 5371 }
4628 5372
4629 lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner, nn); 5373 lo = find_lockowner_str(&lockt->lt_clientid, &lockt->lt_owner,
5374 cstate->clp);
4630 if (lo) 5375 if (lo)
4631 file_lock->fl_owner = (fl_owner_t)lo; 5376 file_lock->fl_owner = (fl_owner_t)lo;
4632 file_lock->fl_pid = current->tgid; 5377 file_lock->fl_pid = current->tgid;
@@ -4646,7 +5391,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4646 nfs4_set_lock_denied(file_lock, &lockt->lt_denied); 5391 nfs4_set_lock_denied(file_lock, &lockt->lt_denied);
4647 } 5392 }
4648out: 5393out:
4649 nfs4_unlock_state(); 5394 if (lo)
5395 nfs4_put_stateowner(&lo->lo_owner);
4650 if (file_lock) 5396 if (file_lock)
4651 locks_free_lock(file_lock); 5397 locks_free_lock(file_lock);
4652 return status; 5398 return status;
@@ -4670,23 +5416,21 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4670 if (check_lock_length(locku->lu_offset, locku->lu_length)) 5416 if (check_lock_length(locku->lu_offset, locku->lu_length))
4671 return nfserr_inval; 5417 return nfserr_inval;
4672 5418
4673 nfs4_lock_state();
4674
4675 status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, 5419 status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
4676 &locku->lu_stateid, NFS4_LOCK_STID, 5420 &locku->lu_stateid, NFS4_LOCK_STID,
4677 &stp, nn); 5421 &stp, nn);
4678 if (status) 5422 if (status)
4679 goto out; 5423 goto out;
4680 filp = find_any_file(stp->st_file); 5424 filp = find_any_file(stp->st_stid.sc_file);
4681 if (!filp) { 5425 if (!filp) {
4682 status = nfserr_lock_range; 5426 status = nfserr_lock_range;
4683 goto out; 5427 goto put_stateid;
4684 } 5428 }
4685 file_lock = locks_alloc_lock(); 5429 file_lock = locks_alloc_lock();
4686 if (!file_lock) { 5430 if (!file_lock) {
4687 dprintk("NFSD: %s: unable to allocate lock!\n", __func__); 5431 dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
4688 status = nfserr_jukebox; 5432 status = nfserr_jukebox;
4689 goto out; 5433 goto fput;
4690 } 5434 }
4691 locks_init_lock(file_lock); 5435 locks_init_lock(file_lock);
4692 file_lock->fl_type = F_UNLCK; 5436 file_lock->fl_type = F_UNLCK;
@@ -4708,41 +5452,51 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4708 } 5452 }
4709 update_stateid(&stp->st_stid.sc_stateid); 5453 update_stateid(&stp->st_stid.sc_stateid);
4710 memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 5454 memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
4711 5455fput:
5456 fput(filp);
5457put_stateid:
5458 nfs4_put_stid(&stp->st_stid);
4712out: 5459out:
4713 nfsd4_bump_seqid(cstate, status); 5460 nfsd4_bump_seqid(cstate, status);
4714 if (!cstate->replay_owner)
4715 nfs4_unlock_state();
4716 if (file_lock) 5461 if (file_lock)
4717 locks_free_lock(file_lock); 5462 locks_free_lock(file_lock);
4718 return status; 5463 return status;
4719 5464
4720out_nfserr: 5465out_nfserr:
4721 status = nfserrno(err); 5466 status = nfserrno(err);
4722 goto out; 5467 goto fput;
4723} 5468}
4724 5469
4725/* 5470/*
4726 * returns 5471 * returns
4727 * 1: locks held by lockowner 5472 * true: locks held by lockowner
4728 * 0: no locks held by lockowner 5473 * false: no locks held by lockowner
4729 */ 5474 */
4730static int 5475static bool
4731check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner) 5476check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
4732{ 5477{
4733 struct file_lock **flpp; 5478 struct file_lock **flpp;
4734 struct inode *inode = filp->fi_inode; 5479 int status = false;
4735 int status = 0; 5480 struct file *filp = find_any_file(fp);
5481 struct inode *inode;
5482
5483 if (!filp) {
5484 /* Any valid lock stateid should have some sort of access */
5485 WARN_ON_ONCE(1);
5486 return status;
5487 }
5488
5489 inode = file_inode(filp);
4736 5490
4737 spin_lock(&inode->i_lock); 5491 spin_lock(&inode->i_lock);
4738 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { 5492 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
4739 if ((*flpp)->fl_owner == (fl_owner_t)lowner) { 5493 if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
4740 status = 1; 5494 status = true;
4741 goto out; 5495 break;
4742 } 5496 }
4743 } 5497 }
4744out:
4745 spin_unlock(&inode->i_lock); 5498 spin_unlock(&inode->i_lock);
5499 fput(filp);
4746 return status; 5500 return status;
4747} 5501}
4748 5502
@@ -4753,53 +5507,46 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
4753{ 5507{
4754 clientid_t *clid = &rlockowner->rl_clientid; 5508 clientid_t *clid = &rlockowner->rl_clientid;
4755 struct nfs4_stateowner *sop; 5509 struct nfs4_stateowner *sop;
4756 struct nfs4_lockowner *lo; 5510 struct nfs4_lockowner *lo = NULL;
4757 struct nfs4_ol_stateid *stp; 5511 struct nfs4_ol_stateid *stp;
4758 struct xdr_netobj *owner = &rlockowner->rl_owner; 5512 struct xdr_netobj *owner = &rlockowner->rl_owner;
4759 struct list_head matches; 5513 unsigned int hashval = ownerstr_hashval(owner);
4760 unsigned int hashval = ownerstr_hashval(clid->cl_id, owner);
4761 __be32 status; 5514 __be32 status;
4762 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); 5515 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
5516 struct nfs4_client *clp;
4763 5517
4764 dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", 5518 dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
4765 clid->cl_boot, clid->cl_id); 5519 clid->cl_boot, clid->cl_id);
4766 5520
4767 nfs4_lock_state(); 5521 status = lookup_clientid(clid, cstate, nn);
4768
4769 status = lookup_clientid(clid, cstate->minorversion, nn, NULL);
4770 if (status) 5522 if (status)
4771 goto out; 5523 return status;
4772 5524
4773 status = nfserr_locks_held; 5525 clp = cstate->clp;
4774 INIT_LIST_HEAD(&matches); 5526 /* Find the matching lock stateowner */
5527 spin_lock(&clp->cl_lock);
5528 list_for_each_entry(sop, &clp->cl_ownerstr_hashtbl[hashval],
5529 so_strhash) {
4775 5530
4776 list_for_each_entry(sop, &nn->ownerstr_hashtbl[hashval], so_strhash) { 5531 if (sop->so_is_open_owner || !same_owner_str(sop, owner))
4777 if (sop->so_is_open_owner)
4778 continue; 5532 continue;
4779 if (!same_owner_str(sop, owner, clid)) 5533
4780 continue; 5534 /* see if there are still any locks associated with it */
4781 list_for_each_entry(stp, &sop->so_stateids, 5535 lo = lockowner(sop);
4782 st_perstateowner) { 5536 list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) {
4783 lo = lockowner(sop); 5537 if (check_for_locks(stp->st_stid.sc_file, lo)) {
4784 if (check_for_locks(stp->st_file, lo)) 5538 status = nfserr_locks_held;
4785 goto out; 5539 spin_unlock(&clp->cl_lock);
4786 list_add(&lo->lo_list, &matches); 5540 return status;
5541 }
4787 } 5542 }
5543
5544 atomic_inc(&sop->so_count);
5545 break;
4788 } 5546 }
4789 /* Clients probably won't expect us to return with some (but not all) 5547 spin_unlock(&clp->cl_lock);
4790 * of the lockowner state released; so don't release any until all 5548 if (lo)
4791 * have been checked. */
4792 status = nfs_ok;
4793 while (!list_empty(&matches)) {
4794 lo = list_entry(matches.next, struct nfs4_lockowner,
4795 lo_list);
4796 /* unhash_stateowner deletes so_perclient only
4797 * for openowners. */
4798 list_del(&lo->lo_list);
4799 release_lockowner(lo); 5549 release_lockowner(lo);
4800 }
4801out:
4802 nfs4_unlock_state();
4803 return status; 5550 return status;
4804} 5551}
4805 5552
@@ -4887,34 +5634,123 @@ nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn)
4887* Called from OPEN. Look for clientid in reclaim list. 5634* Called from OPEN. Look for clientid in reclaim list.
4888*/ 5635*/
4889__be32 5636__be32
4890nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn) 5637nfs4_check_open_reclaim(clientid_t *clid,
5638 struct nfsd4_compound_state *cstate,
5639 struct nfsd_net *nn)
4891{ 5640{
4892 struct nfs4_client *clp; 5641 __be32 status;
4893 5642
4894 /* find clientid in conf_id_hashtbl */ 5643 /* find clientid in conf_id_hashtbl */
4895 clp = find_confirmed_client(clid, sessions, nn); 5644 status = lookup_clientid(clid, cstate, nn);
4896 if (clp == NULL) 5645 if (status)
4897 return nfserr_reclaim_bad; 5646 return nfserr_reclaim_bad;
4898 5647
4899 return nfsd4_client_record_check(clp) ? nfserr_reclaim_bad : nfs_ok; 5648 if (nfsd4_client_record_check(cstate->clp))
5649 return nfserr_reclaim_bad;
5650
5651 return nfs_ok;
4900} 5652}
4901 5653
4902#ifdef CONFIG_NFSD_FAULT_INJECTION 5654#ifdef CONFIG_NFSD_FAULT_INJECTION
5655static inline void
5656put_client(struct nfs4_client *clp)
5657{
5658 atomic_dec(&clp->cl_refcount);
5659}
4903 5660
4904u64 nfsd_forget_client(struct nfs4_client *clp, u64 max) 5661static struct nfs4_client *
5662nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size)
4905{ 5663{
4906 if (mark_client_expired(clp)) 5664 struct nfs4_client *clp;
4907 return 0; 5665 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
4908 expire_client(clp); 5666 nfsd_net_id);
4909 return 1; 5667
5668 if (!nfsd_netns_ready(nn))
5669 return NULL;
5670
5671 list_for_each_entry(clp, &nn->client_lru, cl_lru) {
5672 if (memcmp(&clp->cl_addr, addr, addr_size) == 0)
5673 return clp;
5674 }
5675 return NULL;
4910} 5676}
4911 5677
4912u64 nfsd_print_client(struct nfs4_client *clp, u64 num) 5678u64
5679nfsd_inject_print_clients(void)
4913{ 5680{
5681 struct nfs4_client *clp;
5682 u64 count = 0;
5683 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
5684 nfsd_net_id);
4914 char buf[INET6_ADDRSTRLEN]; 5685 char buf[INET6_ADDRSTRLEN];
4915 rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf)); 5686
4916 printk(KERN_INFO "NFS Client: %s\n", buf); 5687 if (!nfsd_netns_ready(nn))
4917 return 1; 5688 return 0;
5689
5690 spin_lock(&nn->client_lock);
5691 list_for_each_entry(clp, &nn->client_lru, cl_lru) {
5692 rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
5693 pr_info("NFS Client: %s\n", buf);
5694 ++count;
5695 }
5696 spin_unlock(&nn->client_lock);
5697
5698 return count;
5699}
5700
5701u64
5702nfsd_inject_forget_client(struct sockaddr_storage *addr, size_t addr_size)
5703{
5704 u64 count = 0;
5705 struct nfs4_client *clp;
5706 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
5707 nfsd_net_id);
5708
5709 if (!nfsd_netns_ready(nn))
5710 return count;
5711
5712 spin_lock(&nn->client_lock);
5713 clp = nfsd_find_client(addr, addr_size);
5714 if (clp) {
5715 if (mark_client_expired_locked(clp) == nfs_ok)
5716 ++count;
5717 else
5718 clp = NULL;
5719 }
5720 spin_unlock(&nn->client_lock);
5721
5722 if (clp)
5723 expire_client(clp);
5724
5725 return count;
5726}
5727
5728u64
5729nfsd_inject_forget_clients(u64 max)
5730{
5731 u64 count = 0;
5732 struct nfs4_client *clp, *next;
5733 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
5734 nfsd_net_id);
5735 LIST_HEAD(reaplist);
5736
5737 if (!nfsd_netns_ready(nn))
5738 return count;
5739
5740 spin_lock(&nn->client_lock);
5741 list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) {
5742 if (mark_client_expired_locked(clp) == nfs_ok) {
5743 list_add(&clp->cl_lru, &reaplist);
5744 if (max != 0 && ++count >= max)
5745 break;
5746 }
5747 }
5748 spin_unlock(&nn->client_lock);
5749
5750 list_for_each_entry_safe(clp, next, &reaplist, cl_lru)
5751 expire_client(clp);
5752
5753 return count;
4918} 5754}
4919 5755
4920static void nfsd_print_count(struct nfs4_client *clp, unsigned int count, 5756static void nfsd_print_count(struct nfs4_client *clp, unsigned int count,
@@ -4925,158 +5761,484 @@ static void nfsd_print_count(struct nfs4_client *clp, unsigned int count,
4925 printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type); 5761 printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type);
4926} 5762}
4927 5763
4928static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_lockowner *)) 5764static void
5765nfsd_inject_add_lock_to_list(struct nfs4_ol_stateid *lst,
5766 struct list_head *collect)
5767{
5768 struct nfs4_client *clp = lst->st_stid.sc_client;
5769 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
5770 nfsd_net_id);
5771
5772 if (!collect)
5773 return;
5774
5775 lockdep_assert_held(&nn->client_lock);
5776 atomic_inc(&clp->cl_refcount);
5777 list_add(&lst->st_locks, collect);
5778}
5779
5780static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max,
5781 struct list_head *collect,
5782 void (*func)(struct nfs4_ol_stateid *))
4929{ 5783{
4930 struct nfs4_openowner *oop; 5784 struct nfs4_openowner *oop;
4931 struct nfs4_lockowner *lop, *lo_next;
4932 struct nfs4_ol_stateid *stp, *st_next; 5785 struct nfs4_ol_stateid *stp, *st_next;
5786 struct nfs4_ol_stateid *lst, *lst_next;
4933 u64 count = 0; 5787 u64 count = 0;
4934 5788
5789 spin_lock(&clp->cl_lock);
4935 list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) { 5790 list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) {
4936 list_for_each_entry_safe(stp, st_next, &oop->oo_owner.so_stateids, st_perstateowner) { 5791 list_for_each_entry_safe(stp, st_next,
4937 list_for_each_entry_safe(lop, lo_next, &stp->st_lockowners, lo_perstateid) { 5792 &oop->oo_owner.so_stateids, st_perstateowner) {
4938 if (func) 5793 list_for_each_entry_safe(lst, lst_next,
4939 func(lop); 5794 &stp->st_locks, st_locks) {
4940 if (++count == max) 5795 if (func) {
4941 return count; 5796 func(lst);
5797 nfsd_inject_add_lock_to_list(lst,
5798 collect);
5799 }
5800 ++count;
5801 /*
5802 * Despite the fact that these functions deal
5803 * with 64-bit integers for "count", we must
5804 * ensure that it doesn't blow up the
5805 * clp->cl_refcount. Throw a warning if we
5806 * start to approach INT_MAX here.
5807 */
5808 WARN_ON_ONCE(count == (INT_MAX / 2));
5809 if (count == max)
5810 goto out;
4942 } 5811 }
4943 } 5812 }
4944 } 5813 }
5814out:
5815 spin_unlock(&clp->cl_lock);
4945 5816
4946 return count; 5817 return count;
4947} 5818}
4948 5819
4949u64 nfsd_forget_client_locks(struct nfs4_client *clp, u64 max) 5820static u64
5821nfsd_collect_client_locks(struct nfs4_client *clp, struct list_head *collect,
5822 u64 max)
4950{ 5823{
4951 return nfsd_foreach_client_lock(clp, max, release_lockowner); 5824 return nfsd_foreach_client_lock(clp, max, collect, unhash_lock_stateid);
4952} 5825}
4953 5826
4954u64 nfsd_print_client_locks(struct nfs4_client *clp, u64 max) 5827static u64
5828nfsd_print_client_locks(struct nfs4_client *clp)
4955{ 5829{
4956 u64 count = nfsd_foreach_client_lock(clp, max, NULL); 5830 u64 count = nfsd_foreach_client_lock(clp, 0, NULL, NULL);
4957 nfsd_print_count(clp, count, "locked files"); 5831 nfsd_print_count(clp, count, "locked files");
4958 return count; 5832 return count;
4959} 5833}
4960 5834
4961static u64 nfsd_foreach_client_open(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_openowner *)) 5835u64
5836nfsd_inject_print_locks(void)
5837{
5838 struct nfs4_client *clp;
5839 u64 count = 0;
5840 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
5841 nfsd_net_id);
5842
5843 if (!nfsd_netns_ready(nn))
5844 return 0;
5845
5846 spin_lock(&nn->client_lock);
5847 list_for_each_entry(clp, &nn->client_lru, cl_lru)
5848 count += nfsd_print_client_locks(clp);
5849 spin_unlock(&nn->client_lock);
5850
5851 return count;
5852}
5853
5854static void
5855nfsd_reap_locks(struct list_head *reaplist)
5856{
5857 struct nfs4_client *clp;
5858 struct nfs4_ol_stateid *stp, *next;
5859
5860 list_for_each_entry_safe(stp, next, reaplist, st_locks) {
5861 list_del_init(&stp->st_locks);
5862 clp = stp->st_stid.sc_client;
5863 nfs4_put_stid(&stp->st_stid);
5864 put_client(clp);
5865 }
5866}
5867
5868u64
5869nfsd_inject_forget_client_locks(struct sockaddr_storage *addr, size_t addr_size)
5870{
5871 unsigned int count = 0;
5872 struct nfs4_client *clp;
5873 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
5874 nfsd_net_id);
5875 LIST_HEAD(reaplist);
5876
5877 if (!nfsd_netns_ready(nn))
5878 return count;
5879
5880 spin_lock(&nn->client_lock);
5881 clp = nfsd_find_client(addr, addr_size);
5882 if (clp)
5883 count = nfsd_collect_client_locks(clp, &reaplist, 0);
5884 spin_unlock(&nn->client_lock);
5885 nfsd_reap_locks(&reaplist);
5886 return count;
5887}
5888
5889u64
5890nfsd_inject_forget_locks(u64 max)
5891{
5892 u64 count = 0;
5893 struct nfs4_client *clp;
5894 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
5895 nfsd_net_id);
5896 LIST_HEAD(reaplist);
5897
5898 if (!nfsd_netns_ready(nn))
5899 return count;
5900
5901 spin_lock(&nn->client_lock);
5902 list_for_each_entry(clp, &nn->client_lru, cl_lru) {
5903 count += nfsd_collect_client_locks(clp, &reaplist, max - count);
5904 if (max != 0 && count >= max)
5905 break;
5906 }
5907 spin_unlock(&nn->client_lock);
5908 nfsd_reap_locks(&reaplist);
5909 return count;
5910}
5911
5912static u64
5913nfsd_foreach_client_openowner(struct nfs4_client *clp, u64 max,
5914 struct list_head *collect,
5915 void (*func)(struct nfs4_openowner *))
4962{ 5916{
4963 struct nfs4_openowner *oop, *next; 5917 struct nfs4_openowner *oop, *next;
5918 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
5919 nfsd_net_id);
4964 u64 count = 0; 5920 u64 count = 0;
4965 5921
5922 lockdep_assert_held(&nn->client_lock);
5923
5924 spin_lock(&clp->cl_lock);
4966 list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) { 5925 list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) {
4967 if (func) 5926 if (func) {
4968 func(oop); 5927 func(oop);
4969 if (++count == max) 5928 if (collect) {
5929 atomic_inc(&clp->cl_refcount);
5930 list_add(&oop->oo_perclient, collect);
5931 }
5932 }
5933 ++count;
5934 /*
5935 * Despite the fact that these functions deal with
5936 * 64-bit integers for "count", we must ensure that
5937 * it doesn't blow up the clp->cl_refcount. Throw a
5938 * warning if we start to approach INT_MAX here.
5939 */
5940 WARN_ON_ONCE(count == (INT_MAX / 2));
5941 if (count == max)
4970 break; 5942 break;
4971 } 5943 }
5944 spin_unlock(&clp->cl_lock);
4972 5945
4973 return count; 5946 return count;
4974} 5947}
4975 5948
4976u64 nfsd_forget_client_openowners(struct nfs4_client *clp, u64 max) 5949static u64
5950nfsd_print_client_openowners(struct nfs4_client *clp)
4977{ 5951{
4978 return nfsd_foreach_client_open(clp, max, release_openowner); 5952 u64 count = nfsd_foreach_client_openowner(clp, 0, NULL, NULL);
5953
5954 nfsd_print_count(clp, count, "openowners");
5955 return count;
4979} 5956}
4980 5957
4981u64 nfsd_print_client_openowners(struct nfs4_client *clp, u64 max) 5958static u64
5959nfsd_collect_client_openowners(struct nfs4_client *clp,
5960 struct list_head *collect, u64 max)
4982{ 5961{
4983 u64 count = nfsd_foreach_client_open(clp, max, NULL); 5962 return nfsd_foreach_client_openowner(clp, max, collect,
4984 nfsd_print_count(clp, count, "open files"); 5963 unhash_openowner_locked);
4985 return count;
4986} 5964}
4987 5965
4988static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max, 5966u64
4989 struct list_head *victims) 5967nfsd_inject_print_openowners(void)
4990{ 5968{
4991 struct nfs4_delegation *dp, *next; 5969 struct nfs4_client *clp;
4992 u64 count = 0; 5970 u64 count = 0;
5971 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
5972 nfsd_net_id);
5973
5974 if (!nfsd_netns_ready(nn))
5975 return 0;
5976
5977 spin_lock(&nn->client_lock);
5978 list_for_each_entry(clp, &nn->client_lru, cl_lru)
5979 count += nfsd_print_client_openowners(clp);
5980 spin_unlock(&nn->client_lock);
4993 5981
4994 lockdep_assert_held(&state_lock);
4995 list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) {
4996 if (victims)
4997 list_move(&dp->dl_recall_lru, victims);
4998 if (++count == max)
4999 break;
5000 }
5001 return count; 5982 return count;
5002} 5983}
5003 5984
5004u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max) 5985static void
5986nfsd_reap_openowners(struct list_head *reaplist)
5005{ 5987{
5006 struct nfs4_delegation *dp, *next; 5988 struct nfs4_client *clp;
5007 LIST_HEAD(victims); 5989 struct nfs4_openowner *oop, *next;
5008 u64 count;
5009 5990
5010 spin_lock(&state_lock); 5991 list_for_each_entry_safe(oop, next, reaplist, oo_perclient) {
5011 count = nfsd_find_all_delegations(clp, max, &victims); 5992 list_del_init(&oop->oo_perclient);
5012 spin_unlock(&state_lock); 5993 clp = oop->oo_owner.so_client;
5994 release_openowner(oop);
5995 put_client(clp);
5996 }
5997}
5013 5998
5014 list_for_each_entry_safe(dp, next, &victims, dl_recall_lru) 5999u64
5015 revoke_delegation(dp); 6000nfsd_inject_forget_client_openowners(struct sockaddr_storage *addr,
6001 size_t addr_size)
6002{
6003 unsigned int count = 0;
6004 struct nfs4_client *clp;
6005 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
6006 nfsd_net_id);
6007 LIST_HEAD(reaplist);
5016 6008
6009 if (!nfsd_netns_ready(nn))
6010 return count;
6011
6012 spin_lock(&nn->client_lock);
6013 clp = nfsd_find_client(addr, addr_size);
6014 if (clp)
6015 count = nfsd_collect_client_openowners(clp, &reaplist, 0);
6016 spin_unlock(&nn->client_lock);
6017 nfsd_reap_openowners(&reaplist);
5017 return count; 6018 return count;
5018} 6019}
5019 6020
5020u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max) 6021u64
6022nfsd_inject_forget_openowners(u64 max)
5021{ 6023{
5022 struct nfs4_delegation *dp, *next; 6024 u64 count = 0;
5023 LIST_HEAD(victims); 6025 struct nfs4_client *clp;
5024 u64 count; 6026 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
6027 nfsd_net_id);
6028 LIST_HEAD(reaplist);
5025 6029
5026 spin_lock(&state_lock); 6030 if (!nfsd_netns_ready(nn))
5027 count = nfsd_find_all_delegations(clp, max, &victims); 6031 return count;
5028 list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
5029 nfsd_break_one_deleg(dp);
5030 spin_unlock(&state_lock);
5031 6032
6033 spin_lock(&nn->client_lock);
6034 list_for_each_entry(clp, &nn->client_lru, cl_lru) {
6035 count += nfsd_collect_client_openowners(clp, &reaplist,
6036 max - count);
6037 if (max != 0 && count >= max)
6038 break;
6039 }
6040 spin_unlock(&nn->client_lock);
6041 nfsd_reap_openowners(&reaplist);
5032 return count; 6042 return count;
5033} 6043}
5034 6044
5035u64 nfsd_print_client_delegations(struct nfs4_client *clp, u64 max) 6045static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max,
6046 struct list_head *victims)
5036{ 6047{
6048 struct nfs4_delegation *dp, *next;
6049 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
6050 nfsd_net_id);
5037 u64 count = 0; 6051 u64 count = 0;
5038 6052
6053 lockdep_assert_held(&nn->client_lock);
6054
5039 spin_lock(&state_lock); 6055 spin_lock(&state_lock);
5040 count = nfsd_find_all_delegations(clp, max, NULL); 6056 list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) {
6057 if (victims) {
6058 /*
6059 * It's not safe to mess with delegations that have a
6060 * non-zero dl_time. They might have already been broken
6061 * and could be processed by the laundromat outside of
6062 * the state_lock. Just leave them be.
6063 */
6064 if (dp->dl_time != 0)
6065 continue;
6066
6067 atomic_inc(&clp->cl_refcount);
6068 unhash_delegation_locked(dp);
6069 list_add(&dp->dl_recall_lru, victims);
6070 }
6071 ++count;
6072 /*
6073 * Despite the fact that these functions deal with
6074 * 64-bit integers for "count", we must ensure that
6075 * it doesn't blow up the clp->cl_refcount. Throw a
6076 * warning if we start to approach INT_MAX here.
6077 */
6078 WARN_ON_ONCE(count == (INT_MAX / 2));
6079 if (count == max)
6080 break;
6081 }
5041 spin_unlock(&state_lock); 6082 spin_unlock(&state_lock);
6083 return count;
6084}
6085
6086static u64
6087nfsd_print_client_delegations(struct nfs4_client *clp)
6088{
6089 u64 count = nfsd_find_all_delegations(clp, 0, NULL);
5042 6090
5043 nfsd_print_count(clp, count, "delegations"); 6091 nfsd_print_count(clp, count, "delegations");
5044 return count; 6092 return count;
5045} 6093}
5046 6094
5047u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64)) 6095u64
6096nfsd_inject_print_delegations(void)
5048{ 6097{
5049 struct nfs4_client *clp, *next; 6098 struct nfs4_client *clp;
5050 u64 count = 0; 6099 u64 count = 0;
5051 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id); 6100 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
6101 nfsd_net_id);
5052 6102
5053 if (!nfsd_netns_ready(nn)) 6103 if (!nfsd_netns_ready(nn))
5054 return 0; 6104 return 0;
5055 6105
5056 list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { 6106 spin_lock(&nn->client_lock);
5057 count += func(clp, max - count); 6107 list_for_each_entry(clp, &nn->client_lru, cl_lru)
5058 if ((max != 0) && (count >= max)) 6108 count += nfsd_print_client_delegations(clp);
5059 break; 6109 spin_unlock(&nn->client_lock);
6110
6111 return count;
6112}
6113
6114static void
6115nfsd_forget_delegations(struct list_head *reaplist)
6116{
6117 struct nfs4_client *clp;
6118 struct nfs4_delegation *dp, *next;
6119
6120 list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) {
6121 list_del_init(&dp->dl_recall_lru);
6122 clp = dp->dl_stid.sc_client;
6123 revoke_delegation(dp);
6124 put_client(clp);
5060 } 6125 }
6126}
5061 6127
6128u64
6129nfsd_inject_forget_client_delegations(struct sockaddr_storage *addr,
6130 size_t addr_size)
6131{
6132 u64 count = 0;
6133 struct nfs4_client *clp;
6134 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
6135 nfsd_net_id);
6136 LIST_HEAD(reaplist);
6137
6138 if (!nfsd_netns_ready(nn))
6139 return count;
6140
6141 spin_lock(&nn->client_lock);
6142 clp = nfsd_find_client(addr, addr_size);
6143 if (clp)
6144 count = nfsd_find_all_delegations(clp, 0, &reaplist);
6145 spin_unlock(&nn->client_lock);
6146
6147 nfsd_forget_delegations(&reaplist);
5062 return count; 6148 return count;
5063} 6149}
5064 6150
5065struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size) 6151u64
6152nfsd_inject_forget_delegations(u64 max)
5066{ 6153{
6154 u64 count = 0;
5067 struct nfs4_client *clp; 6155 struct nfs4_client *clp;
5068 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id); 6156 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
6157 nfsd_net_id);
6158 LIST_HEAD(reaplist);
5069 6159
5070 if (!nfsd_netns_ready(nn)) 6160 if (!nfsd_netns_ready(nn))
5071 return NULL; 6161 return count;
5072 6162
6163 spin_lock(&nn->client_lock);
5073 list_for_each_entry(clp, &nn->client_lru, cl_lru) { 6164 list_for_each_entry(clp, &nn->client_lru, cl_lru) {
5074 if (memcmp(&clp->cl_addr, addr, addr_size) == 0) 6165 count += nfsd_find_all_delegations(clp, max - count, &reaplist);
5075 return clp; 6166 if (max != 0 && count >= max)
6167 break;
5076 } 6168 }
5077 return NULL; 6169 spin_unlock(&nn->client_lock);
6170 nfsd_forget_delegations(&reaplist);
6171 return count;
5078} 6172}
5079 6173
6174static void
6175nfsd_recall_delegations(struct list_head *reaplist)
6176{
6177 struct nfs4_client *clp;
6178 struct nfs4_delegation *dp, *next;
6179
6180 list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) {
6181 list_del_init(&dp->dl_recall_lru);
6182 clp = dp->dl_stid.sc_client;
6183 /*
6184 * We skipped all entries that had a zero dl_time before,
6185 * so we can now reset the dl_time back to 0. If a delegation
6186 * break comes in now, then it won't make any difference since
6187 * we're recalling it either way.
6188 */
6189 spin_lock(&state_lock);
6190 dp->dl_time = 0;
6191 spin_unlock(&state_lock);
6192 nfsd_break_one_deleg(dp);
6193 put_client(clp);
6194 }
6195}
6196
6197u64
6198nfsd_inject_recall_client_delegations(struct sockaddr_storage *addr,
6199 size_t addr_size)
6200{
6201 u64 count = 0;
6202 struct nfs4_client *clp;
6203 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
6204 nfsd_net_id);
6205 LIST_HEAD(reaplist);
6206
6207 if (!nfsd_netns_ready(nn))
6208 return count;
6209
6210 spin_lock(&nn->client_lock);
6211 clp = nfsd_find_client(addr, addr_size);
6212 if (clp)
6213 count = nfsd_find_all_delegations(clp, 0, &reaplist);
6214 spin_unlock(&nn->client_lock);
6215
6216 nfsd_recall_delegations(&reaplist);
6217 return count;
6218}
6219
6220u64
6221nfsd_inject_recall_delegations(u64 max)
6222{
6223 u64 count = 0;
6224 struct nfs4_client *clp, *next;
6225 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
6226 nfsd_net_id);
6227 LIST_HEAD(reaplist);
6228
6229 if (!nfsd_netns_ready(nn))
6230 return count;
6231
6232 spin_lock(&nn->client_lock);
6233 list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) {
6234 count += nfsd_find_all_delegations(clp, max - count, &reaplist);
6235 if (max != 0 && ++count >= max)
6236 break;
6237 }
6238 spin_unlock(&nn->client_lock);
6239 nfsd_recall_delegations(&reaplist);
6240 return count;
6241}
5080#endif /* CONFIG_NFSD_FAULT_INJECTION */ 6242#endif /* CONFIG_NFSD_FAULT_INJECTION */
5081 6243
5082/* 6244/*
@@ -5113,14 +6275,6 @@ static int nfs4_state_create_net(struct net *net)
5113 CLIENT_HASH_SIZE, GFP_KERNEL); 6275 CLIENT_HASH_SIZE, GFP_KERNEL);
5114 if (!nn->unconf_id_hashtbl) 6276 if (!nn->unconf_id_hashtbl)
5115 goto err_unconf_id; 6277 goto err_unconf_id;
5116 nn->ownerstr_hashtbl = kmalloc(sizeof(struct list_head) *
5117 OWNER_HASH_SIZE, GFP_KERNEL);
5118 if (!nn->ownerstr_hashtbl)
5119 goto err_ownerstr;
5120 nn->lockowner_ino_hashtbl = kmalloc(sizeof(struct list_head) *
5121 LOCKOWNER_INO_HASH_SIZE, GFP_KERNEL);
5122 if (!nn->lockowner_ino_hashtbl)
5123 goto err_lockowner_ino;
5124 nn->sessionid_hashtbl = kmalloc(sizeof(struct list_head) * 6278 nn->sessionid_hashtbl = kmalloc(sizeof(struct list_head) *
5125 SESSION_HASH_SIZE, GFP_KERNEL); 6279 SESSION_HASH_SIZE, GFP_KERNEL);
5126 if (!nn->sessionid_hashtbl) 6280 if (!nn->sessionid_hashtbl)
@@ -5130,10 +6284,6 @@ static int nfs4_state_create_net(struct net *net)
5130 INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]); 6284 INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]);
5131 INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]); 6285 INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]);
5132 } 6286 }
5133 for (i = 0; i < OWNER_HASH_SIZE; i++)
5134 INIT_LIST_HEAD(&nn->ownerstr_hashtbl[i]);
5135 for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++)
5136 INIT_LIST_HEAD(&nn->lockowner_ino_hashtbl[i]);
5137 for (i = 0; i < SESSION_HASH_SIZE; i++) 6287 for (i = 0; i < SESSION_HASH_SIZE; i++)
5138 INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); 6288 INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]);
5139 nn->conf_name_tree = RB_ROOT; 6289 nn->conf_name_tree = RB_ROOT;
@@ -5149,10 +6299,6 @@ static int nfs4_state_create_net(struct net *net)
5149 return 0; 6299 return 0;
5150 6300
5151err_sessionid: 6301err_sessionid:
5152 kfree(nn->lockowner_ino_hashtbl);
5153err_lockowner_ino:
5154 kfree(nn->ownerstr_hashtbl);
5155err_ownerstr:
5156 kfree(nn->unconf_id_hashtbl); 6302 kfree(nn->unconf_id_hashtbl);
5157err_unconf_id: 6303err_unconf_id:
5158 kfree(nn->conf_id_hashtbl); 6304 kfree(nn->conf_id_hashtbl);
@@ -5182,8 +6328,6 @@ nfs4_state_destroy_net(struct net *net)
5182 } 6328 }
5183 6329
5184 kfree(nn->sessionid_hashtbl); 6330 kfree(nn->sessionid_hashtbl);
5185 kfree(nn->lockowner_ino_hashtbl);
5186 kfree(nn->ownerstr_hashtbl);
5187 kfree(nn->unconf_id_hashtbl); 6331 kfree(nn->unconf_id_hashtbl);
5188 kfree(nn->conf_id_hashtbl); 6332 kfree(nn->conf_id_hashtbl);
5189 put_net(net); 6333 put_net(net);
@@ -5247,22 +6391,22 @@ nfs4_state_shutdown_net(struct net *net)
5247 cancel_delayed_work_sync(&nn->laundromat_work); 6391 cancel_delayed_work_sync(&nn->laundromat_work);
5248 locks_end_grace(&nn->nfsd4_manager); 6392 locks_end_grace(&nn->nfsd4_manager);
5249 6393
5250 nfs4_lock_state();
5251 INIT_LIST_HEAD(&reaplist); 6394 INIT_LIST_HEAD(&reaplist);
5252 spin_lock(&state_lock); 6395 spin_lock(&state_lock);
5253 list_for_each_safe(pos, next, &nn->del_recall_lru) { 6396 list_for_each_safe(pos, next, &nn->del_recall_lru) {
5254 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 6397 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
5255 list_move(&dp->dl_recall_lru, &reaplist); 6398 unhash_delegation_locked(dp);
6399 list_add(&dp->dl_recall_lru, &reaplist);
5256 } 6400 }
5257 spin_unlock(&state_lock); 6401 spin_unlock(&state_lock);
5258 list_for_each_safe(pos, next, &reaplist) { 6402 list_for_each_safe(pos, next, &reaplist) {
5259 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 6403 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
5260 destroy_delegation(dp); 6404 list_del_init(&dp->dl_recall_lru);
6405 nfs4_put_stid(&dp->dl_stid);
5261 } 6406 }
5262 6407
5263 nfsd4_client_tracking_exit(net); 6408 nfsd4_client_tracking_exit(net);
5264 nfs4_state_destroy_net(net); 6409 nfs4_state_destroy_net(net);
5265 nfs4_unlock_state();
5266} 6410}
5267 6411
5268void 6412void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 944275c8f56d..f9821ce6658a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -181,28 +181,43 @@ static int zero_clientid(clientid_t *clid)
181} 181}
182 182
183/** 183/**
184 * defer_free - mark an allocation as deferred freed 184 * svcxdr_tmpalloc - allocate memory to be freed after compound processing
185 * @argp: NFSv4 compound argument structure to be freed with 185 * @argp: NFSv4 compound argument structure
186 * @release: release callback to free @p, typically kfree() 186 * @p: pointer to be freed (with kfree())
187 * @p: pointer to be freed
188 * 187 *
189 * Marks @p to be freed when processing the compound operation 188 * Marks @p to be freed when processing the compound operation
190 * described in @argp finishes. 189 * described in @argp finishes.
191 */ 190 */
192static int 191static void *
193defer_free(struct nfsd4_compoundargs *argp, 192svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len)
194 void (*release)(const void *), void *p)
195{ 193{
196 struct tmpbuf *tb; 194 struct svcxdr_tmpbuf *tb;
197 195
198 tb = kmalloc(sizeof(*tb), GFP_KERNEL); 196 tb = kmalloc(sizeof(*tb) + len, GFP_KERNEL);
199 if (!tb) 197 if (!tb)
200 return -ENOMEM; 198 return NULL;
201 tb->buf = p;
202 tb->release = release;
203 tb->next = argp->to_free; 199 tb->next = argp->to_free;
204 argp->to_free = tb; 200 argp->to_free = tb;
205 return 0; 201 return tb->buf;
202}
203
204/*
205 * For xdr strings that need to be passed to other kernel api's
206 * as null-terminated strings.
207 *
208 * Note null-terminating in place usually isn't safe since the
209 * buffer might end on a page boundary.
210 */
211static char *
212svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len)
213{
214 char *p = svcxdr_tmpalloc(argp, len + 1);
215
216 if (!p)
217 return NULL;
218 memcpy(p, buf, len);
219 p[len] = '\0';
220 return p;
206} 221}
207 222
208/** 223/**
@@ -217,19 +232,13 @@ defer_free(struct nfsd4_compoundargs *argp,
217 */ 232 */
218static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) 233static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
219{ 234{
220 if (p == argp->tmp) { 235 void *ret;
221 p = kmemdup(argp->tmp, nbytes, GFP_KERNEL); 236
222 if (!p) 237 ret = svcxdr_tmpalloc(argp, nbytes);
223 return NULL; 238 if (!ret)
224 } else {
225 BUG_ON(p != argp->tmpp);
226 argp->tmpp = NULL;
227 }
228 if (defer_free(argp, kfree, p)) {
229 kfree(p);
230 return NULL; 239 return NULL;
231 } else 240 memcpy(ret, p, nbytes);
232 return (char *)p; 241 return ret;
233} 242}
234 243
235static __be32 244static __be32
@@ -292,12 +301,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
292 if (nace > NFS4_ACL_MAX) 301 if (nace > NFS4_ACL_MAX)
293 return nfserr_fbig; 302 return nfserr_fbig;
294 303
295 *acl = nfs4_acl_new(nace); 304 *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace));
296 if (*acl == NULL) 305 if (*acl == NULL)
297 return nfserr_jukebox; 306 return nfserr_jukebox;
298 307
299 defer_free(argp, kfree, *acl);
300
301 (*acl)->naces = nace; 308 (*acl)->naces = nace;
302 for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) { 309 for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) {
303 READ_BUF(16); len += 16; 310 READ_BUF(16); len += 16;
@@ -418,12 +425,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
418 return nfserr_badlabel; 425 return nfserr_badlabel;
419 len += (XDR_QUADLEN(dummy32) << 2); 426 len += (XDR_QUADLEN(dummy32) << 2);
420 READMEM(buf, dummy32); 427 READMEM(buf, dummy32);
421 label->data = kzalloc(dummy32 + 1, GFP_KERNEL); 428 label->len = dummy32;
429 label->data = svcxdr_dupstr(argp, buf, dummy32);
422 if (!label->data) 430 if (!label->data)
423 return nfserr_jukebox; 431 return nfserr_jukebox;
424 label->len = dummy32;
425 defer_free(argp, kfree, label->data);
426 memcpy(label->data, buf, dummy32);
427 } 432 }
428#endif 433#endif
429 434
@@ -598,20 +603,11 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
598 switch (create->cr_type) { 603 switch (create->cr_type) {
599 case NF4LNK: 604 case NF4LNK:
600 READ_BUF(4); 605 READ_BUF(4);
601 create->cr_linklen = be32_to_cpup(p++); 606 create->cr_datalen = be32_to_cpup(p++);
602 READ_BUF(create->cr_linklen); 607 READ_BUF(create->cr_datalen);
603 /* 608 create->cr_data = svcxdr_dupstr(argp, p, create->cr_datalen);
604 * The VFS will want a null-terminated string, and 609 if (!create->cr_data)
605 * null-terminating in place isn't safe since this might
606 * end on a page boundary:
607 */
608 create->cr_linkname =
609 kmalloc(create->cr_linklen + 1, GFP_KERNEL);
610 if (!create->cr_linkname)
611 return nfserr_jukebox; 610 return nfserr_jukebox;
612 memcpy(create->cr_linkname, p, create->cr_linklen);
613 create->cr_linkname[create->cr_linklen] = '\0';
614 defer_free(argp, kfree, create->cr_linkname);
615 break; 611 break;
616 case NF4BLK: 612 case NF4BLK:
617 case NF4CHR: 613 case NF4CHR:
@@ -1481,13 +1477,12 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta
1481 INIT_LIST_HEAD(&test_stateid->ts_stateid_list); 1477 INIT_LIST_HEAD(&test_stateid->ts_stateid_list);
1482 1478
1483 for (i = 0; i < test_stateid->ts_num_ids; i++) { 1479 for (i = 0; i < test_stateid->ts_num_ids; i++) {
1484 stateid = kmalloc(sizeof(struct nfsd4_test_stateid_id), GFP_KERNEL); 1480 stateid = svcxdr_tmpalloc(argp, sizeof(*stateid));
1485 if (!stateid) { 1481 if (!stateid) {
1486 status = nfserrno(-ENOMEM); 1482 status = nfserrno(-ENOMEM);
1487 goto out; 1483 goto out;
1488 } 1484 }
1489 1485
1490 defer_free(argp, kfree, stateid);
1491 INIT_LIST_HEAD(&stateid->ts_id_list); 1486 INIT_LIST_HEAD(&stateid->ts_id_list);
1492 list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list); 1487 list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list);
1493 1488
@@ -1640,7 +1635,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1640 goto xdr_error; 1635 goto xdr_error;
1641 1636
1642 if (argp->opcnt > ARRAY_SIZE(argp->iops)) { 1637 if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
1643 argp->ops = kmalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL); 1638 argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
1644 if (!argp->ops) { 1639 if (!argp->ops) {
1645 argp->ops = argp->iops; 1640 argp->ops = argp->iops;
1646 dprintk("nfsd: couldn't allocate room for COMPOUND\n"); 1641 dprintk("nfsd: couldn't allocate room for COMPOUND\n");
@@ -3077,11 +3072,8 @@ static __be32 nfsd4_encode_splice_read(
3077 __be32 nfserr; 3072 __be32 nfserr;
3078 __be32 *p = xdr->p - 2; 3073 __be32 *p = xdr->p - 2;
3079 3074
3080 /* 3075 /* Make sure there will be room for padding if needed */
3081 * Don't inline pages unless we know there's room for eof, 3076 if (xdr->end - xdr->p < 1)
3082 * count, and possible padding:
3083 */
3084 if (xdr->end - xdr->p < 3)
3085 return nfserr_resource; 3077 return nfserr_resource;
3086 3078
3087 nfserr = nfsd_splice_read(read->rd_rqstp, file, 3079 nfserr = nfsd_splice_read(read->rd_rqstp, file,
@@ -3147,9 +3139,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
3147 len = maxcount; 3139 len = maxcount;
3148 v = 0; 3140 v = 0;
3149 3141
3150 thislen = (void *)xdr->end - (void *)xdr->p; 3142 thislen = min_t(long, len, ((void *)xdr->end - (void *)xdr->p));
3151 if (len < thislen)
3152 thislen = len;
3153 p = xdr_reserve_space(xdr, (thislen+3)&~3); 3143 p = xdr_reserve_space(xdr, (thislen+3)&~3);
3154 WARN_ON_ONCE(!p); 3144 WARN_ON_ONCE(!p);
3155 resp->rqstp->rq_vec[v].iov_base = p; 3145 resp->rqstp->rq_vec[v].iov_base = p;
@@ -3216,10 +3206,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
3216 xdr_commit_encode(xdr); 3206 xdr_commit_encode(xdr);
3217 3207
3218 maxcount = svc_max_payload(resp->rqstp); 3208 maxcount = svc_max_payload(resp->rqstp);
3219 if (maxcount > xdr->buf->buflen - xdr->buf->len) 3209 maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
3220 maxcount = xdr->buf->buflen - xdr->buf->len; 3210 maxcount = min_t(unsigned long, maxcount, read->rd_length);
3221 if (maxcount > read->rd_length)
3222 maxcount = read->rd_length;
3223 3211
3224 if (!read->rd_filp) { 3212 if (!read->rd_filp) {
3225 err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp, 3213 err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp,
@@ -3937,8 +3925,6 @@ status:
3937 * 3925 *
3938 * XDR note: do not encode rp->rp_buflen: the buffer contains the 3926 * XDR note: do not encode rp->rp_buflen: the buffer contains the
3939 * previously sent already encoded operation. 3927 * previously sent already encoded operation.
3940 *
3941 * called with nfs4_lock_state() held
3942 */ 3928 */
3943void 3929void
3944nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op) 3930nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op)
@@ -3977,9 +3963,8 @@ int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp)
3977 kfree(args->tmpp); 3963 kfree(args->tmpp);
3978 args->tmpp = NULL; 3964 args->tmpp = NULL;
3979 while (args->to_free) { 3965 while (args->to_free) {
3980 struct tmpbuf *tb = args->to_free; 3966 struct svcxdr_tmpbuf *tb = args->to_free;
3981 args->to_free = tb->next; 3967 args->to_free = tb->next;
3982 tb->release(tb->buf);
3983 kfree(tb); 3968 kfree(tb);
3984 } 3969 }
3985 return 1; 3970 return 1;
@@ -4012,7 +3997,6 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
4012 /* 3997 /*
4013 * All that remains is to write the tag and operation count... 3998 * All that remains is to write the tag and operation count...
4014 */ 3999 */
4015 struct nfsd4_compound_state *cs = &resp->cstate;
4016 struct xdr_buf *buf = resp->xdr.buf; 4000 struct xdr_buf *buf = resp->xdr.buf;
4017 4001
4018 WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len + 4002 WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len +
@@ -4026,19 +4010,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
4026 p += XDR_QUADLEN(resp->taglen); 4010 p += XDR_QUADLEN(resp->taglen);
4027 *p++ = htonl(resp->opcnt); 4011 *p++ = htonl(resp->opcnt);
4028 4012
4029 if (nfsd4_has_session(cs)) { 4013 nfsd4_sequence_done(resp);
4030 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
4031 struct nfs4_client *clp = cs->session->se_client;
4032 if (cs->status != nfserr_replay_cache) {
4033 nfsd4_store_cache_entry(resp);
4034 cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
4035 }
4036 /* Renew the clientid on success and on replay */
4037 spin_lock(&nn->client_lock);
4038 nfsd4_put_session(cs->session);
4039 spin_unlock(&nn->client_lock);
4040 put_client_renew(clp);
4041 }
4042 return 1; 4014 return 1;
4043} 4015}
4044 4016
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 6040da8830ff..ff9567633245 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -221,7 +221,12 @@ static void
221hash_refile(struct svc_cacherep *rp) 221hash_refile(struct svc_cacherep *rp)
222{ 222{
223 hlist_del_init(&rp->c_hash); 223 hlist_del_init(&rp->c_hash);
224 hlist_add_head(&rp->c_hash, cache_hash + hash_32(rp->c_xid, maskbits)); 224 /*
225 * No point in byte swapping c_xid since we're just using it to pick
226 * a hash bucket.
227 */
228 hlist_add_head(&rp->c_hash, cache_hash +
229 hash_32((__force u32)rp->c_xid, maskbits));
225} 230}
226 231
227/* 232/*
@@ -356,7 +361,11 @@ nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
356 struct hlist_head *rh; 361 struct hlist_head *rh;
357 unsigned int entries = 0; 362 unsigned int entries = 0;
358 363
359 rh = &cache_hash[hash_32(rqstp->rq_xid, maskbits)]; 364 /*
365 * No point in byte swapping rq_xid since we're just using it to pick
366 * a hash bucket.
367 */
368 rh = &cache_hash[hash_32((__force u32)rqstp->rq_xid, maskbits)];
360 hlist_for_each_entry(rp, rh, c_hash) { 369 hlist_for_each_entry(rp, rh, c_hash) {
361 ++entries; 370 ++entries;
362 if (nfsd_cache_match(rqstp, csum, rp)) { 371 if (nfsd_cache_match(rqstp, csum, rp)) {
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 51844048937f..4e042105fb6e 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -39,6 +39,7 @@ enum {
39 NFSD_Versions, 39 NFSD_Versions,
40 NFSD_Ports, 40 NFSD_Ports,
41 NFSD_MaxBlkSize, 41 NFSD_MaxBlkSize,
42 NFSD_MaxConnections,
42 NFSD_SupportedEnctypes, 43 NFSD_SupportedEnctypes,
43 /* 44 /*
44 * The below MUST come last. Otherwise we leave a hole in nfsd_files[] 45 * The below MUST come last. Otherwise we leave a hole in nfsd_files[]
@@ -62,6 +63,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size);
62static ssize_t write_versions(struct file *file, char *buf, size_t size); 63static ssize_t write_versions(struct file *file, char *buf, size_t size);
63static ssize_t write_ports(struct file *file, char *buf, size_t size); 64static ssize_t write_ports(struct file *file, char *buf, size_t size);
64static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); 65static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
66static ssize_t write_maxconn(struct file *file, char *buf, size_t size);
65#ifdef CONFIG_NFSD_V4 67#ifdef CONFIG_NFSD_V4
66static ssize_t write_leasetime(struct file *file, char *buf, size_t size); 68static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
67static ssize_t write_gracetime(struct file *file, char *buf, size_t size); 69static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
@@ -77,6 +79,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
77 [NFSD_Versions] = write_versions, 79 [NFSD_Versions] = write_versions,
78 [NFSD_Ports] = write_ports, 80 [NFSD_Ports] = write_ports,
79 [NFSD_MaxBlkSize] = write_maxblksize, 81 [NFSD_MaxBlkSize] = write_maxblksize,
82 [NFSD_MaxConnections] = write_maxconn,
80#ifdef CONFIG_NFSD_V4 83#ifdef CONFIG_NFSD_V4
81 [NFSD_Leasetime] = write_leasetime, 84 [NFSD_Leasetime] = write_leasetime,
82 [NFSD_Gracetime] = write_gracetime, 85 [NFSD_Gracetime] = write_gracetime,
@@ -369,8 +372,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
369 372
370 if (maxsize < NFS_FHSIZE) 373 if (maxsize < NFS_FHSIZE)
371 return -EINVAL; 374 return -EINVAL;
372 if (maxsize > NFS3_FHSIZE) 375 maxsize = min(maxsize, NFS3_FHSIZE);
373 maxsize = NFS3_FHSIZE;
374 376
375 if (qword_get(&mesg, mesg, size)>0) 377 if (qword_get(&mesg, mesg, size)>0)
376 return -EINVAL; 378 return -EINVAL;
@@ -871,10 +873,8 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
871 /* force bsize into allowed range and 873 /* force bsize into allowed range and
872 * required alignment. 874 * required alignment.
873 */ 875 */
874 if (bsize < 1024) 876 bsize = max_t(int, bsize, 1024);
875 bsize = 1024; 877 bsize = min_t(int, bsize, NFSSVC_MAXBLKSIZE);
876 if (bsize > NFSSVC_MAXBLKSIZE)
877 bsize = NFSSVC_MAXBLKSIZE;
878 bsize &= ~(1024-1); 878 bsize &= ~(1024-1);
879 mutex_lock(&nfsd_mutex); 879 mutex_lock(&nfsd_mutex);
880 if (nn->nfsd_serv) { 880 if (nn->nfsd_serv) {
@@ -889,6 +889,44 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
889 nfsd_max_blksize); 889 nfsd_max_blksize);
890} 890}
891 891
892/**
893 * write_maxconn - Set or report the current max number of connections
894 *
895 * Input:
896 * buf: ignored
897 * size: zero
898 * OR
899 *
900 * Input:
901 * buf: C string containing an unsigned
902 * integer value representing the new
903 * number of max connections
904 * size: non-zero length of C string in @buf
905 * Output:
906 * On success: passed-in buffer filled with '\n'-terminated C string
907 * containing numeric value of max_connections setting
908 * for this net namespace;
909 * return code is the size in bytes of the string
910 * On error: return code is zero or a negative errno value
911 */
912static ssize_t write_maxconn(struct file *file, char *buf, size_t size)
913{
914 char *mesg = buf;
915 struct net *net = file->f_dentry->d_sb->s_fs_info;
916 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
917 unsigned int maxconn = nn->max_connections;
918
919 if (size > 0) {
920 int rv = get_uint(&mesg, &maxconn);
921
922 if (rv)
923 return rv;
924 nn->max_connections = maxconn;
925 }
926
927 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxconn);
928}
929
892#ifdef CONFIG_NFSD_V4 930#ifdef CONFIG_NFSD_V4
893static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, 931static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
894 time_t *time, struct nfsd_net *nn) 932 time_t *time, struct nfsd_net *nn)
@@ -1064,6 +1102,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1064 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, 1102 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
1065 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, 1103 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
1066 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1104 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
1105 [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO},
1067#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) 1106#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
1068 [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO}, 1107 [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
1069#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ 1108#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index ec8393418154..e883a5868be6 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -162,7 +162,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
162 /* deprecated, convert to type 3 */ 162 /* deprecated, convert to type 3 */
163 len = key_len(FSID_ENCODE_DEV)/4; 163 len = key_len(FSID_ENCODE_DEV)/4;
164 fh->fh_fsid_type = FSID_ENCODE_DEV; 164 fh->fh_fsid_type = FSID_ENCODE_DEV;
165 fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl(fh->fh_fsid[0]), ntohl(fh->fh_fsid[1]))); 165 /*
166 * struct knfsd_fh uses host-endian fields, which are
167 * sometimes used to hold net-endian values. This
168 * confuses sparse, so we must use __force here to
169 * keep it from complaining.
170 */
171 fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
172 ntohl((__force __be32)fh->fh_fsid[1])));
166 fh->fh_fsid[1] = fh->fh_fsid[2]; 173 fh->fh_fsid[1] = fh->fh_fsid[2];
167 } 174 }
168 data_left -= len; 175 data_left -= len;
@@ -539,8 +546,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
539 dentry); 546 dentry);
540 547
541 fhp->fh_dentry = dget(dentry); /* our internal copy */ 548 fhp->fh_dentry = dget(dentry); /* our internal copy */
542 fhp->fh_export = exp; 549 fhp->fh_export = exp_get(exp);
543 cache_get(&exp->h);
544 550
545 if (fhp->fh_handle.fh_version == 0xca) { 551 if (fhp->fh_handle.fh_version == 0xca) {
546 /* old style filehandle please */ 552 /* old style filehandle please */
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 2e89e70ac15c..08236d70c667 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -73,8 +73,15 @@ enum fsid_source {
73extern enum fsid_source fsid_source(struct svc_fh *fhp); 73extern enum fsid_source fsid_source(struct svc_fh *fhp);
74 74
75 75
76/* This might look a little large to "inline" but in all calls except 76/*
77 * This might look a little large to "inline" but in all calls except
 77 * one, 'vers' is constant so most of the function disappears. 78 * one, 'vers' is constant so most of the function disappears.
79 *
80 * In some cases the values are considered to be host endian and in
81 * others, net endian. fsidv is always considered to be u32 as the
82 * callers don't know which it will be. So we must use __force to keep
83 * sparse from complaining. Since these values are opaque to the
84 * client, that shouldn't be a problem.
78 */ 85 */
79static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino, 86static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino,
80 u32 fsid, unsigned char *uuid) 87 u32 fsid, unsigned char *uuid)
@@ -82,7 +89,7 @@ static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino,
82 u32 *up; 89 u32 *up;
83 switch(vers) { 90 switch(vers) {
84 case FSID_DEV: 91 case FSID_DEV:
85 fsidv[0] = htonl((MAJOR(dev)<<16) | 92 fsidv[0] = (__force __u32)htonl((MAJOR(dev)<<16) |
86 MINOR(dev)); 93 MINOR(dev));
87 fsidv[1] = ino_t_to_u32(ino); 94 fsidv[1] = ino_t_to_u32(ino);
88 break; 95 break;
@@ -90,8 +97,8 @@ static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino,
90 fsidv[0] = fsid; 97 fsidv[0] = fsid;
91 break; 98 break;
92 case FSID_MAJOR_MINOR: 99 case FSID_MAJOR_MINOR:
93 fsidv[0] = htonl(MAJOR(dev)); 100 fsidv[0] = (__force __u32)htonl(MAJOR(dev));
94 fsidv[1] = htonl(MINOR(dev)); 101 fsidv[1] = (__force __u32)htonl(MINOR(dev));
95 fsidv[2] = ino_t_to_u32(ino); 102 fsidv[2] = ino_t_to_u32(ino);
96 break; 103 break;
97 104
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 54c6b3d3cc79..b8680738f588 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -403,12 +403,13 @@ nfsd_proc_symlink(struct svc_rqst *rqstp, struct nfsd_symlinkargs *argp,
403 403
404 fh_init(&newfh, NFS_FHSIZE); 404 fh_init(&newfh, NFS_FHSIZE);
405 /* 405 /*
406 * Create the link, look up new file and set attrs. 406 * Crazy hack: the request fits in a page, and already-decoded
407 * attributes follow argp->tname, so it's safe to just write a
408 * null to ensure it's null-terminated:
407 */ 409 */
410 argp->tname[argp->tlen] = '\0';
408 nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, 411 nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen,
409 argp->tname, argp->tlen, 412 argp->tname, &newfh);
410 &newfh, &argp->attrs);
411
412 413
413 fh_put(&argp->ffh); 414 fh_put(&argp->ffh);
414 fh_put(&newfh); 415 fh_put(&newfh);
@@ -716,6 +717,7 @@ nfserrno (int errno)
716 { nfserr_noent, -ENOENT }, 717 { nfserr_noent, -ENOENT },
717 { nfserr_io, -EIO }, 718 { nfserr_io, -EIO },
718 { nfserr_nxio, -ENXIO }, 719 { nfserr_nxio, -ENXIO },
720 { nfserr_fbig, -E2BIG },
719 { nfserr_acces, -EACCES }, 721 { nfserr_acces, -EACCES },
720 { nfserr_exist, -EEXIST }, 722 { nfserr_exist, -EEXIST },
721 { nfserr_xdev, -EXDEV }, 723 { nfserr_xdev, -EXDEV },
@@ -743,6 +745,7 @@ nfserrno (int errno)
743 { nfserr_notsupp, -EOPNOTSUPP }, 745 { nfserr_notsupp, -EOPNOTSUPP },
744 { nfserr_toosmall, -ETOOSMALL }, 746 { nfserr_toosmall, -ETOOSMALL },
745 { nfserr_serverfault, -ESERVERFAULT }, 747 { nfserr_serverfault, -ESERVERFAULT },
748 { nfserr_serverfault, -ENFILE },
746 }; 749 };
747 int i; 750 int i;
748 751
@@ -750,7 +753,7 @@ nfserrno (int errno)
750 if (nfs_errtbl[i].syserr == errno) 753 if (nfs_errtbl[i].syserr == errno)
751 return nfs_errtbl[i].nfserr; 754 return nfs_errtbl[i].nfserr;
752 } 755 }
753 printk (KERN_INFO "nfsd: non-standard errno: %d\n", errno); 756 WARN(1, "nfsd: non-standard errno: %d\n", errno);
754 return nfserr_io; 757 return nfserr_io;
755} 758}
756 759
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 1879e43f2868..752d56bbe0ba 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -221,7 +221,8 @@ static int nfsd_startup_generic(int nrservs)
221 */ 221 */
222 ret = nfsd_racache_init(2*nrservs); 222 ret = nfsd_racache_init(2*nrservs);
223 if (ret) 223 if (ret)
224 return ret; 224 goto dec_users;
225
225 ret = nfs4_state_start(); 226 ret = nfs4_state_start();
226 if (ret) 227 if (ret)
227 goto out_racache; 228 goto out_racache;
@@ -229,6 +230,8 @@ static int nfsd_startup_generic(int nrservs)
229 230
230out_racache: 231out_racache:
231 nfsd_racache_shutdown(); 232 nfsd_racache_shutdown();
233dec_users:
234 nfsd_users--;
232 return ret; 235 return ret;
233} 236}
234 237
@@ -405,6 +408,7 @@ int nfsd_create_serv(struct net *net)
405 if (nn->nfsd_serv == NULL) 408 if (nn->nfsd_serv == NULL)
406 return -ENOMEM; 409 return -ENOMEM;
407 410
411 nn->nfsd_serv->sv_maxconn = nn->max_connections;
408 error = svc_bind(nn->nfsd_serv, net); 412 error = svc_bind(nn->nfsd_serv, net);
409 if (error < 0) { 413 if (error < 0) {
410 svc_destroy(nn->nfsd_serv); 414 svc_destroy(nn->nfsd_serv);
@@ -469,8 +473,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
469 /* enforce a global maximum number of threads */ 473 /* enforce a global maximum number of threads */
470 tot = 0; 474 tot = 0;
471 for (i = 0; i < n; i++) { 475 for (i = 0; i < n; i++) {
472 if (nthreads[i] > NFSD_MAXSERVS) 476 nthreads[i] = min(nthreads[i], NFSD_MAXSERVS);
473 nthreads[i] = NFSD_MAXSERVS;
474 tot += nthreads[i]; 477 tot += nthreads[i];
475 } 478 }
476 if (tot > NFSD_MAXSERVS) { 479 if (tot > NFSD_MAXSERVS) {
@@ -519,11 +522,11 @@ nfsd_svc(int nrservs, struct net *net)
519 522
520 mutex_lock(&nfsd_mutex); 523 mutex_lock(&nfsd_mutex);
521 dprintk("nfsd: creating service\n"); 524 dprintk("nfsd: creating service\n");
522 if (nrservs <= 0) 525
523 nrservs = 0; 526 nrservs = max(nrservs, 0);
524 if (nrservs > NFSD_MAXSERVS) 527 nrservs = min(nrservs, NFSD_MAXSERVS);
525 nrservs = NFSD_MAXSERVS;
526 error = 0; 528 error = 0;
529
527 if (nrservs == 0 && nn->nfsd_serv == NULL) 530 if (nrservs == 0 && nn->nfsd_serv == NULL)
528 goto out; 531 goto out;
529 532
@@ -564,6 +567,7 @@ nfsd(void *vrqstp)
564 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; 567 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
565 struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list); 568 struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list);
566 struct net *net = perm_sock->xpt_net; 569 struct net *net = perm_sock->xpt_net;
570 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
567 int err; 571 int err;
568 572
569 /* Lock module and set up kernel thread */ 573 /* Lock module and set up kernel thread */
@@ -597,6 +601,9 @@ nfsd(void *vrqstp)
597 * The main request loop 601 * The main request loop
598 */ 602 */
599 for (;;) { 603 for (;;) {
604 /* Update sv_maxconn if it has changed */
605 rqstp->rq_server->sv_maxconn = nn->max_connections;
606
600 /* 607 /*
601 * Find a socket with data available and call its 608 * Find a socket with data available and call its
602 * recvfrom routine. 609 * recvfrom routine.
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 1ac306b769df..412d7061f9e5 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -257,8 +257,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
257 len = args->count = ntohl(*p++); 257 len = args->count = ntohl(*p++);
258 p++; /* totalcount - unused */ 258 p++; /* totalcount - unused */
259 259
260 if (len > NFSSVC_MAXBLKSIZE_V2) 260 len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2);
261 len = NFSSVC_MAXBLKSIZE_V2;
262 261
263 /* set up somewhere to store response. 262 /* set up somewhere to store response.
264 * We take pages, put them on reslist and include in iovec 263 * We take pages, put them on reslist and include in iovec
@@ -268,7 +267,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
268 struct page *p = *(rqstp->rq_next_page++); 267 struct page *p = *(rqstp->rq_next_page++);
269 268
270 rqstp->rq_vec[v].iov_base = page_address(p); 269 rqstp->rq_vec[v].iov_base = page_address(p);
271 rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE; 270 rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
272 len -= rqstp->rq_vec[v].iov_len; 271 len -= rqstp->rq_vec[v].iov_len;
273 v++; 272 v++;
274 } 273 }
@@ -400,9 +399,7 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
400 return 0; 399 return 0;
401 args->cookie = ntohl(*p++); 400 args->cookie = ntohl(*p++);
402 args->count = ntohl(*p++); 401 args->count = ntohl(*p++);
403 if (args->count > PAGE_SIZE) 402 args->count = min_t(u32, args->count, PAGE_SIZE);
404 args->count = PAGE_SIZE;
405
406 args->buffer = page_address(*(rqstp->rq_next_page++)); 403 args->buffer = page_address(*(rqstp->rq_next_page++));
407 404
408 return xdr_argsize_check(rqstp, p); 405 return xdr_argsize_check(rqstp, p);
@@ -516,10 +513,11 @@ nfssvc_encode_entry(void *ccdv, const char *name,
516 } 513 }
517 if (cd->offset) 514 if (cd->offset)
518 *cd->offset = htonl(offset); 515 *cd->offset = htonl(offset);
519 if (namlen > NFS2_MAXNAMLEN)
520 namlen = NFS2_MAXNAMLEN;/* truncate filename */
521 516
517 /* truncate filename */
518 namlen = min(namlen, NFS2_MAXNAMLEN);
522 slen = XDR_QUADLEN(namlen); 519 slen = XDR_QUADLEN(namlen);
520
523 if ((buflen = cd->buflen - slen - 4) < 0) { 521 if ((buflen = cd->buflen - slen - 4) < 0) {
524 cd->common.err = nfserr_toosmall; 522 cd->common.err = nfserr_toosmall;
525 return -EINVAL; 523 return -EINVAL;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 374c66283ac5..4a89e00d7461 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -72,7 +72,13 @@ struct nfsd4_callback {
72 bool cb_done; 72 bool cb_done;
73}; 73};
74 74
75/*
76 * A core object that represents a "common" stateid. These are generally
77 * embedded within the different (more specific) stateid objects and contain
78 * fields that are of general use to any stateid.
79 */
75struct nfs4_stid { 80struct nfs4_stid {
81 atomic_t sc_count;
76#define NFS4_OPEN_STID 1 82#define NFS4_OPEN_STID 1
77#define NFS4_LOCK_STID 2 83#define NFS4_LOCK_STID 2
78#define NFS4_DELEG_STID 4 84#define NFS4_DELEG_STID 4
@@ -80,22 +86,43 @@ struct nfs4_stid {
80#define NFS4_CLOSED_STID 8 86#define NFS4_CLOSED_STID 8
81/* For a deleg stateid kept around only to process free_stateid's: */ 87/* For a deleg stateid kept around only to process free_stateid's: */
82#define NFS4_REVOKED_DELEG_STID 16 88#define NFS4_REVOKED_DELEG_STID 16
89#define NFS4_CLOSED_DELEG_STID 32
83 unsigned char sc_type; 90 unsigned char sc_type;
84 stateid_t sc_stateid; 91 stateid_t sc_stateid;
85 struct nfs4_client *sc_client; 92 struct nfs4_client *sc_client;
93 struct nfs4_file *sc_file;
94 void (*sc_free)(struct nfs4_stid *);
86}; 95};
87 96
97/*
98 * Represents a delegation stateid. The nfs4_client holds references to these
99 * and they are put when it is being destroyed or when the delegation is
100 * returned by the client:
101 *
102 * o 1 reference as long as a delegation is still in force (taken when it's
103 * alloc'd, put when it's returned or revoked)
104 *
105 * o 1 reference as long as a recall rpc is in progress (taken when the lease
106 * is broken, put when the rpc exits)
107 *
108 * o 1 more ephemeral reference for each nfsd thread currently doing something
109 * with that delegation without holding the cl_lock
110 *
111 * If the server attempts to recall a delegation and the client doesn't do so
112 * before a timeout, the server may also revoke the delegation. In that case,
113 * the object will either be destroyed (v4.0) or moved to a per-client list of
114 * revoked delegations (v4.1+).
115 *
116 * This object is a superset of the nfs4_stid.
117 */
88struct nfs4_delegation { 118struct nfs4_delegation {
89 struct nfs4_stid dl_stid; /* must be first field */ 119 struct nfs4_stid dl_stid; /* must be first field */
90 struct list_head dl_perfile; 120 struct list_head dl_perfile;
91 struct list_head dl_perclnt; 121 struct list_head dl_perclnt;
92 struct list_head dl_recall_lru; /* delegation recalled */ 122 struct list_head dl_recall_lru; /* delegation recalled */
93 atomic_t dl_count; /* ref count */
94 struct nfs4_file *dl_file;
95 u32 dl_type; 123 u32 dl_type;
96 time_t dl_time; 124 time_t dl_time;
97/* For recall: */ 125/* For recall: */
98 struct knfsd_fh dl_fh;
99 int dl_retries; 126 int dl_retries;
100 struct nfsd4_callback dl_recall; 127 struct nfsd4_callback dl_recall;
101}; 128};
@@ -194,6 +221,11 @@ struct nfsd4_conn {
194 unsigned char cn_flags; 221 unsigned char cn_flags;
195}; 222};
196 223
224/*
225 * Representation of a v4.1+ session. These are refcounted in a similar fashion
226 * to the nfs4_client. References are only taken when the server is actively
227 * working on the object (primarily during the processing of compounds).
228 */
197struct nfsd4_session { 229struct nfsd4_session {
198 atomic_t se_ref; 230 atomic_t se_ref;
199 struct list_head se_hash; /* hash by sessionid */ 231 struct list_head se_hash; /* hash by sessionid */
@@ -212,8 +244,6 @@ struct nfsd4_session {
212 struct nfsd4_slot *se_slots[]; /* forward channel slots */ 244 struct nfsd4_slot *se_slots[]; /* forward channel slots */
213}; 245};
214 246
215extern void nfsd4_put_session(struct nfsd4_session *ses);
216
217/* formatted contents of nfs4_sessionid */ 247/* formatted contents of nfs4_sessionid */
218struct nfsd4_sessionid { 248struct nfsd4_sessionid {
219 clientid_t clientid; 249 clientid_t clientid;
@@ -225,17 +255,35 @@ struct nfsd4_sessionid {
225 255
226/* 256/*
227 * struct nfs4_client - one per client. Clientids live here. 257 * struct nfs4_client - one per client. Clientids live here.
228 * o Each nfs4_client is hashed by clientid.
229 * 258 *
230 * o Each nfs4_clients is also hashed by name 259 * The initial object created by an NFS client using SETCLIENTID (for NFSv4.0)
231 * (the opaque quantity initially sent by the client to identify itself). 260 * or EXCHANGE_ID (for NFSv4.1+). These objects are refcounted and timestamped.
261 * Each nfsd_net_ns object contains a set of these and they are tracked via
262 * short and long form clientid. They are hashed and searched for under the
263 * per-nfsd_net client_lock spinlock.
264 *
265 * References to it are only held during the processing of compounds, and in
266 * certain other operations. In their "resting state" they have a refcount of
267 * 0. If they are not renewed within a lease period, they become eligible for
268 * destruction by the laundromat.
269 *
270 * These objects can also be destroyed prematurely by the fault injection code,
271 * or if the client sends certain forms of SETCLIENTID or EXCHANGE_ID updates.
272 * Care is taken *not* to do this however when the objects have an elevated
273 * refcount.
274 *
275 * o Each nfs4_client is hashed by clientid
276 *
277 * o Each nfs4_clients is also hashed by name (the opaque quantity initially
278 * sent by the client to identify itself).
232 * 279 *
233 * o cl_perclient list is used to ensure no dangling stateowner references 280 * o cl_perclient list is used to ensure no dangling stateowner references
234 * when we expire the nfs4_client 281 * when we expire the nfs4_client
235 */ 282 */
236struct nfs4_client { 283struct nfs4_client {
237 struct list_head cl_idhash; /* hash by cl_clientid.id */ 284 struct list_head cl_idhash; /* hash by cl_clientid.id */
238 struct rb_node cl_namenode; /* link into by-name trees */ 285 struct rb_node cl_namenode; /* link into by-name trees */
286 struct list_head *cl_ownerstr_hashtbl;
239 struct list_head cl_openowners; 287 struct list_head cl_openowners;
240 struct idr cl_stateids; /* stateid lookup */ 288 struct idr cl_stateids; /* stateid lookup */
241 struct list_head cl_delegations; 289 struct list_head cl_delegations;
@@ -329,21 +377,43 @@ struct nfs4_replay {
329 unsigned int rp_buflen; 377 unsigned int rp_buflen;
330 char *rp_buf; 378 char *rp_buf;
331 struct knfsd_fh rp_openfh; 379 struct knfsd_fh rp_openfh;
380 struct mutex rp_mutex;
332 char rp_ibuf[NFSD4_REPLAY_ISIZE]; 381 char rp_ibuf[NFSD4_REPLAY_ISIZE];
333}; 382};
334 383
384struct nfs4_stateowner;
385
386struct nfs4_stateowner_operations {
387 void (*so_unhash)(struct nfs4_stateowner *);
388 void (*so_free)(struct nfs4_stateowner *);
389};
390
391/*
392 * A core object that represents either an open or lock owner. The object and
393 * lock owner objects have one of these embedded within them. Refcounts and
394 * other fields common to both owner types are contained within these
395 * structures.
396 */
335struct nfs4_stateowner { 397struct nfs4_stateowner {
336 struct list_head so_strhash; /* hash by op_name */ 398 struct list_head so_strhash;
337 struct list_head so_stateids; 399 struct list_head so_stateids;
338 struct nfs4_client * so_client; 400 struct nfs4_client *so_client;
339 /* after increment in ENCODE_SEQID_OP_TAIL, represents the next 401 const struct nfs4_stateowner_operations *so_ops;
402 /* after increment in nfsd4_bump_seqid, represents the next
340 * sequence id expected from the client: */ 403 * sequence id expected from the client: */
341 u32 so_seqid; 404 atomic_t so_count;
342 struct xdr_netobj so_owner; /* open owner name */ 405 u32 so_seqid;
343 struct nfs4_replay so_replay; 406 struct xdr_netobj so_owner; /* open owner name */
344 bool so_is_open_owner; 407 struct nfs4_replay so_replay;
408 bool so_is_open_owner;
345}; 409};
346 410
411/*
412 * When a file is opened, the client provides an open state owner opaque string
413 * that indicates the "owner" of that open. These objects are refcounted.
414 * References to it are held by each open state associated with it. This object
415 * is a superset of the nfs4_stateowner struct.
416 */
347struct nfs4_openowner { 417struct nfs4_openowner {
348 struct nfs4_stateowner oo_owner; /* must be first field */ 418 struct nfs4_stateowner oo_owner; /* must be first field */
349 struct list_head oo_perclient; 419 struct list_head oo_perclient;
@@ -358,15 +428,17 @@ struct nfs4_openowner {
358 struct nfs4_ol_stateid *oo_last_closed_stid; 428 struct nfs4_ol_stateid *oo_last_closed_stid;
359 time_t oo_time; /* time of placement on so_close_lru */ 429 time_t oo_time; /* time of placement on so_close_lru */
360#define NFS4_OO_CONFIRMED 1 430#define NFS4_OO_CONFIRMED 1
361#define NFS4_OO_NEW 4
362 unsigned char oo_flags; 431 unsigned char oo_flags;
363}; 432};
364 433
434/*
435 * Represents a generic "lockowner". Similar to an openowner. References to it
436 * are held by the lock stateids that are created on its behalf. This object is
437 * a superset of the nfs4_stateowner struct (or would be if it needed any extra
438 * fields).
439 */
365struct nfs4_lockowner { 440struct nfs4_lockowner {
366 struct nfs4_stateowner lo_owner; /* must be first element */ 441 struct nfs4_stateowner lo_owner; /* must be first element */
367 struct list_head lo_owner_ino_hash; /* hash by owner,file */
368 struct list_head lo_perstateid;
369 struct list_head lo_list; /* for temporary uses */
370}; 442};
371 443
372static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so) 444static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so)
@@ -379,9 +451,17 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
379 return container_of(so, struct nfs4_lockowner, lo_owner); 451 return container_of(so, struct nfs4_lockowner, lo_owner);
380} 452}
381 453
382/* nfs4_file: a file opened by some number of (open) nfs4_stateowners. */ 454/*
455 * nfs4_file: a file opened by some number of (open) nfs4_stateowners.
456 *
457 * These objects are global. nfsd only keeps one instance of a nfs4_file per
458 * inode (though it may keep multiple file descriptors open per inode). These
459 * are tracked in the file_hashtbl which is protected by the state_lock
460 * spinlock.
461 */
383struct nfs4_file { 462struct nfs4_file {
384 atomic_t fi_ref; 463 atomic_t fi_ref;
464 spinlock_t fi_lock;
385 struct hlist_node fi_hash; /* hash by "struct inode *" */ 465 struct hlist_node fi_hash; /* hash by "struct inode *" */
386 struct list_head fi_stateids; 466 struct list_head fi_stateids;
387 struct list_head fi_delegations; 467 struct list_head fi_delegations;
@@ -395,49 +475,36 @@ struct nfs4_file {
395 * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set. 475 * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set.
396 */ 476 */
397 atomic_t fi_access[2]; 477 atomic_t fi_access[2];
478 u32 fi_share_deny;
398 struct file *fi_deleg_file; 479 struct file *fi_deleg_file;
399 struct file_lock *fi_lease; 480 struct file_lock *fi_lease;
400 atomic_t fi_delegees; 481 atomic_t fi_delegees;
401 struct inode *fi_inode; 482 struct knfsd_fh fi_fhandle;
402 bool fi_had_conflict; 483 bool fi_had_conflict;
403}; 484};
404 485
405/* XXX: for first cut may fall back on returning file that doesn't work 486/*
406 * at all? */ 487 * A generic struct representing either a open or lock stateid. The nfs4_client
407static inline struct file *find_writeable_file(struct nfs4_file *f) 488 * holds a reference to each of these objects, and they in turn hold a
408{ 489 * reference to their respective stateowners. The client's reference is
409 if (f->fi_fds[O_WRONLY]) 490 * released in response to a close or unlock (depending on whether it's an open
410 return f->fi_fds[O_WRONLY]; 491 * or lock stateid) or when the client is being destroyed.
411 return f->fi_fds[O_RDWR]; 492 *
412} 493 * In the case of v4.0 open stateids, these objects are preserved for a little
413 494 * while after close in order to handle CLOSE replays. Those are eventually
414static inline struct file *find_readable_file(struct nfs4_file *f) 495 * reclaimed via a LRU scheme by the laundromat.
415{ 496 *
416 if (f->fi_fds[O_RDONLY]) 497 * This object is a superset of the nfs4_stid. "ol" stands for "Open or Lock".
417 return f->fi_fds[O_RDONLY]; 498 * Better suggestions welcome.
418 return f->fi_fds[O_RDWR]; 499 */
419}
420
421static inline struct file *find_any_file(struct nfs4_file *f)
422{
423 if (f->fi_fds[O_RDWR])
424 return f->fi_fds[O_RDWR];
425 else if (f->fi_fds[O_WRONLY])
426 return f->fi_fds[O_WRONLY];
427 else
428 return f->fi_fds[O_RDONLY];
429}
430
431/* "ol" stands for "Open or Lock". Better suggestions welcome. */
432struct nfs4_ol_stateid { 500struct nfs4_ol_stateid {
433 struct nfs4_stid st_stid; /* must be first field */ 501 struct nfs4_stid st_stid; /* must be first field */
434 struct list_head st_perfile; 502 struct list_head st_perfile;
435 struct list_head st_perstateowner; 503 struct list_head st_perstateowner;
436 struct list_head st_lockowners; 504 struct list_head st_locks;
437 struct nfs4_stateowner * st_stateowner; 505 struct nfs4_stateowner * st_stateowner;
438 struct nfs4_file * st_file; 506 unsigned char st_access_bmap;
439 unsigned long st_access_bmap; 507 unsigned char st_deny_bmap;
440 unsigned long st_deny_bmap;
441 struct nfs4_ol_stateid * st_openstp; 508 struct nfs4_ol_stateid * st_openstp;
442}; 509};
443 510
@@ -456,15 +523,16 @@ struct nfsd_net;
456extern __be32 nfs4_preprocess_stateid_op(struct net *net, 523extern __be32 nfs4_preprocess_stateid_op(struct net *net,
457 struct nfsd4_compound_state *cstate, 524 struct nfsd4_compound_state *cstate,
458 stateid_t *stateid, int flags, struct file **filp); 525 stateid_t *stateid, int flags, struct file **filp);
459extern void nfs4_lock_state(void); 526void nfs4_put_stid(struct nfs4_stid *s);
460extern void nfs4_unlock_state(void);
461void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); 527void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
462extern void nfs4_release_reclaim(struct nfsd_net *); 528extern void nfs4_release_reclaim(struct nfsd_net *);
463extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, 529extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
464 struct nfsd_net *nn); 530 struct nfsd_net *nn);
465extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn); 531extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
532 struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
466extern int set_callback_cred(void); 533extern int set_callback_cred(void);
467extern void nfsd4_init_callback(struct nfsd4_callback *); 534void nfsd4_run_cb_null(struct work_struct *w);
535void nfsd4_run_cb_recall(struct work_struct *w);
468extern void nfsd4_probe_callback(struct nfs4_client *clp); 536extern void nfsd4_probe_callback(struct nfs4_client *clp);
469extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); 537extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
470extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); 538extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
@@ -472,11 +540,10 @@ extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
472extern int nfsd4_create_callback_queue(void); 540extern int nfsd4_create_callback_queue(void);
473extern void nfsd4_destroy_callback_queue(void); 541extern void nfsd4_destroy_callback_queue(void);
474extern void nfsd4_shutdown_callback(struct nfs4_client *); 542extern void nfsd4_shutdown_callback(struct nfs4_client *);
475extern void nfs4_put_delegation(struct nfs4_delegation *dp); 543extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp);
476extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name, 544extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
477 struct nfsd_net *nn); 545 struct nfsd_net *nn);
478extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); 546extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
479extern void put_client_renew(struct nfs4_client *clp);
480 547
481/* nfs4recover operations */ 548/* nfs4recover operations */
482extern int nfsd4_client_tracking_init(struct net *net); 549extern int nfsd4_client_tracking_init(struct net *net);
@@ -490,19 +557,24 @@ extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time);
490#ifdef CONFIG_NFSD_FAULT_INJECTION 557#ifdef CONFIG_NFSD_FAULT_INJECTION
491int nfsd_fault_inject_init(void); 558int nfsd_fault_inject_init(void);
492void nfsd_fault_inject_cleanup(void); 559void nfsd_fault_inject_cleanup(void);
493u64 nfsd_for_n_state(u64, u64 (*)(struct nfs4_client *, u64)); 560
494struct nfs4_client *nfsd_find_client(struct sockaddr_storage *, size_t); 561u64 nfsd_inject_print_clients(void);
495 562u64 nfsd_inject_forget_client(struct sockaddr_storage *, size_t);
496u64 nfsd_forget_client(struct nfs4_client *, u64); 563u64 nfsd_inject_forget_clients(u64);
497u64 nfsd_forget_client_locks(struct nfs4_client*, u64); 564
498u64 nfsd_forget_client_openowners(struct nfs4_client *, u64); 565u64 nfsd_inject_print_locks(void);
499u64 nfsd_forget_client_delegations(struct nfs4_client *, u64); 566u64 nfsd_inject_forget_client_locks(struct sockaddr_storage *, size_t);
500u64 nfsd_recall_client_delegations(struct nfs4_client *, u64); 567u64 nfsd_inject_forget_locks(u64);
501 568
502u64 nfsd_print_client(struct nfs4_client *, u64); 569u64 nfsd_inject_print_openowners(void);
503u64 nfsd_print_client_locks(struct nfs4_client *, u64); 570u64 nfsd_inject_forget_client_openowners(struct sockaddr_storage *, size_t);
504u64 nfsd_print_client_openowners(struct nfs4_client *, u64); 571u64 nfsd_inject_forget_openowners(u64);
505u64 nfsd_print_client_delegations(struct nfs4_client *, u64); 572
573u64 nfsd_inject_print_delegations(void);
574u64 nfsd_inject_forget_client_delegations(struct sockaddr_storage *, size_t);
575u64 nfsd_inject_forget_delegations(u64);
576u64 nfsd_inject_recall_client_delegations(struct sockaddr_storage *, size_t);
577u64 nfsd_inject_recall_delegations(u64);
506#else /* CONFIG_NFSD_FAULT_INJECTION */ 578#else /* CONFIG_NFSD_FAULT_INJECTION */
507static inline int nfsd_fault_inject_init(void) { return 0; } 579static inline int nfsd_fault_inject_init(void) { return 0; }
508static inline void nfsd_fault_inject_cleanup(void) {} 580static inline void nfsd_fault_inject_cleanup(void) {}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 140c496f612c..f501a9b5c9df 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -189,8 +189,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
189 dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); 189 dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
190 190
191 dparent = fhp->fh_dentry; 191 dparent = fhp->fh_dentry;
192 exp = fhp->fh_export; 192 exp = exp_get(fhp->fh_export);
193 exp_get(exp);
194 193
195 /* Lookup the name, but don't follow links */ 194 /* Lookup the name, but don't follow links */
196 if (isdotent(name, len)) { 195 if (isdotent(name, len)) {
@@ -464,7 +463,7 @@ out_put_write_access:
464 if (size_change) 463 if (size_change)
465 put_write_access(inode); 464 put_write_access(inode);
466 if (!err) 465 if (!err)
467 commit_metadata(fhp); 466 err = nfserrno(commit_metadata(fhp));
468out: 467out:
469 return err; 468 return err;
470} 469}
@@ -820,7 +819,8 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
820 return __splice_from_pipe(pipe, sd, nfsd_splice_actor); 819 return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
821} 820}
822 821
823__be32 nfsd_finish_read(struct file *file, unsigned long *count, int host_err) 822static __be32
823nfsd_finish_read(struct file *file, unsigned long *count, int host_err)
824{ 824{
825 if (host_err >= 0) { 825 if (host_err >= 0) {
826 nfsdstats.io_read += host_err; 826 nfsdstats.io_read += host_err;
@@ -831,7 +831,7 @@ __be32 nfsd_finish_read(struct file *file, unsigned long *count, int host_err)
831 return nfserrno(host_err); 831 return nfserrno(host_err);
832} 832}
833 833
834int nfsd_splice_read(struct svc_rqst *rqstp, 834__be32 nfsd_splice_read(struct svc_rqst *rqstp,
835 struct file *file, loff_t offset, unsigned long *count) 835 struct file *file, loff_t offset, unsigned long *count)
836{ 836{
837 struct splice_desc sd = { 837 struct splice_desc sd = {
@@ -847,7 +847,7 @@ int nfsd_splice_read(struct svc_rqst *rqstp,
847 return nfsd_finish_read(file, count, host_err); 847 return nfsd_finish_read(file, count, host_err);
848} 848}
849 849
850int nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen, 850__be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,
851 unsigned long *count) 851 unsigned long *count)
852{ 852{
853 mm_segment_t oldfs; 853 mm_segment_t oldfs;
@@ -1121,7 +1121,8 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
1121 iap->ia_valid &= ~(ATTR_UID|ATTR_GID); 1121 iap->ia_valid &= ~(ATTR_UID|ATTR_GID);
1122 if (iap->ia_valid) 1122 if (iap->ia_valid)
1123 return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); 1123 return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
1124 return 0; 1124 /* Callers expect file metadata to be committed here */
1125 return nfserrno(commit_metadata(resfhp));
1125} 1126}
1126 1127
1127/* HPUX client sometimes creates a file in mode 000, and sets size to 0. 1128/* HPUX client sometimes creates a file in mode 000, and sets size to 0.
@@ -1253,9 +1254,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1253 err = nfsd_create_setattr(rqstp, resfhp, iap); 1254 err = nfsd_create_setattr(rqstp, resfhp, iap);
1254 1255
1255 /* 1256 /*
1256 * nfsd_setattr already committed the child. Transactional filesystems 1257 * nfsd_create_setattr already committed the child. Transactional
1257 * had a chance to commit changes for both parent and child 1258 * filesystems had a chance to commit changes for both parent and
1258 * simultaneously making the following commit_metadata a noop. 1259 * child * simultaneously making the following commit_metadata a
1260 * noop.
1259 */ 1261 */
1260 err2 = nfserrno(commit_metadata(fhp)); 1262 err2 = nfserrno(commit_metadata(fhp));
1261 if (err2) 1263 if (err2)
@@ -1426,7 +1428,8 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1426 err = nfsd_create_setattr(rqstp, resfhp, iap); 1428 err = nfsd_create_setattr(rqstp, resfhp, iap);
1427 1429
1428 /* 1430 /*
1429 * nfsd_setattr already committed the child (and possibly also the parent). 1431 * nfsd_create_setattr already committed the child
1432 * (and possibly also the parent).
1430 */ 1433 */
1431 if (!err) 1434 if (!err)
1432 err = nfserrno(commit_metadata(fhp)); 1435 err = nfserrno(commit_metadata(fhp));
@@ -1504,16 +1507,15 @@ out_nfserr:
1504__be32 1507__be32
1505nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, 1508nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1506 char *fname, int flen, 1509 char *fname, int flen,
1507 char *path, int plen, 1510 char *path,
1508 struct svc_fh *resfhp, 1511 struct svc_fh *resfhp)
1509 struct iattr *iap)
1510{ 1512{
1511 struct dentry *dentry, *dnew; 1513 struct dentry *dentry, *dnew;
1512 __be32 err, cerr; 1514 __be32 err, cerr;
1513 int host_err; 1515 int host_err;
1514 1516
1515 err = nfserr_noent; 1517 err = nfserr_noent;
1516 if (!flen || !plen) 1518 if (!flen || path[0] == '\0')
1517 goto out; 1519 goto out;
1518 err = nfserr_exist; 1520 err = nfserr_exist;
1519 if (isdotent(fname, flen)) 1521 if (isdotent(fname, flen))
@@ -1534,18 +1536,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1534 if (IS_ERR(dnew)) 1536 if (IS_ERR(dnew))
1535 goto out_nfserr; 1537 goto out_nfserr;
1536 1538
1537 if (unlikely(path[plen] != 0)) { 1539 host_err = vfs_symlink(dentry->d_inode, dnew, path);
1538 char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
1539 if (path_alloced == NULL)
1540 host_err = -ENOMEM;
1541 else {
1542 strncpy(path_alloced, path, plen);
1543 path_alloced[plen] = 0;
1544 host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced);
1545 kfree(path_alloced);
1546 }
1547 } else
1548 host_err = vfs_symlink(dentry->d_inode, dnew, path);
1549 err = nfserrno(host_err); 1540 err = nfserrno(host_err);
1550 if (!err) 1541 if (!err)
1551 err = nfserrno(commit_metadata(fhp)); 1542 err = nfserrno(commit_metadata(fhp));
@@ -2093,8 +2084,7 @@ nfsd_racache_init(int cache_size)
2093 if (raparm_hash[0].pb_head) 2084 if (raparm_hash[0].pb_head)
2094 return 0; 2085 return 0;
2095 nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); 2086 nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
2096 if (nperbucket < 2) 2087 nperbucket = max(2, nperbucket);
2097 nperbucket = 2;
2098 cache_size = nperbucket * RAPARM_HASH_SIZE; 2088 cache_size = nperbucket * RAPARM_HASH_SIZE;
2099 2089
2100 dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); 2090 dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 91b6ae3f658b..c2ff3f14e5f6 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -74,9 +74,9 @@ struct raparms;
74__be32 nfsd_get_tmp_read_open(struct svc_rqst *, struct svc_fh *, 74__be32 nfsd_get_tmp_read_open(struct svc_rqst *, struct svc_fh *,
75 struct file **, struct raparms **); 75 struct file **, struct raparms **);
76void nfsd_put_tmp_read_open(struct file *, struct raparms *); 76void nfsd_put_tmp_read_open(struct file *, struct raparms *);
77int nfsd_splice_read(struct svc_rqst *, 77__be32 nfsd_splice_read(struct svc_rqst *,
78 struct file *, loff_t, unsigned long *); 78 struct file *, loff_t, unsigned long *);
79int nfsd_readv(struct file *, loff_t, struct kvec *, int, 79__be32 nfsd_readv(struct file *, loff_t, struct kvec *, int,
80 unsigned long *); 80 unsigned long *);
81__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, 81__be32 nfsd_read(struct svc_rqst *, struct svc_fh *,
82 loff_t, struct kvec *, int, unsigned long *); 82 loff_t, struct kvec *, int, unsigned long *);
@@ -85,8 +85,8 @@ __be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
85__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, 85__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
86 char *, int *); 86 char *, int *);
87__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, 87__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
88 char *name, int len, char *path, int plen, 88 char *name, int len, char *path,
89 struct svc_fh *res, struct iattr *); 89 struct svc_fh *res);
90__be32 nfsd_link(struct svc_rqst *, struct svc_fh *, 90__be32 nfsd_link(struct svc_rqst *, struct svc_fh *,
91 char *, int, struct svc_fh *); 91 char *, int, struct svc_fh *);
92__be32 nfsd_rename(struct svc_rqst *, 92__be32 nfsd_rename(struct svc_rqst *,
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 18cbb6d9c8a9..465e7799742a 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -55,6 +55,7 @@ struct nfsd4_compound_state {
55 struct svc_fh current_fh; 55 struct svc_fh current_fh;
56 struct svc_fh save_fh; 56 struct svc_fh save_fh;
57 struct nfs4_stateowner *replay_owner; 57 struct nfs4_stateowner *replay_owner;
58 struct nfs4_client *clp;
58 /* For sessions DRC */ 59 /* For sessions DRC */
59 struct nfsd4_session *session; 60 struct nfsd4_session *session;
60 struct nfsd4_slot *slot; 61 struct nfsd4_slot *slot;
@@ -107,8 +108,8 @@ struct nfsd4_create {
107 u32 cr_type; /* request */ 108 u32 cr_type; /* request */
108 union { /* request */ 109 union { /* request */
109 struct { 110 struct {
110 u32 namelen; 111 u32 datalen;
111 char *name; 112 char *data;
112 } link; /* NF4LNK */ 113 } link; /* NF4LNK */
113 struct { 114 struct {
114 u32 specdata1; 115 u32 specdata1;
@@ -121,8 +122,8 @@ struct nfsd4_create {
121 struct nfs4_acl *cr_acl; 122 struct nfs4_acl *cr_acl;
122 struct xdr_netobj cr_label; 123 struct xdr_netobj cr_label;
123}; 124};
124#define cr_linklen u.link.namelen 125#define cr_datalen u.link.datalen
125#define cr_linkname u.link.name 126#define cr_data u.link.data
126#define cr_specdata1 u.dev.specdata1 127#define cr_specdata1 u.dev.specdata1
127#define cr_specdata2 u.dev.specdata2 128#define cr_specdata2 u.dev.specdata2
128 129
@@ -478,6 +479,14 @@ struct nfsd4_op {
478 479
479bool nfsd4_cache_this_op(struct nfsd4_op *); 480bool nfsd4_cache_this_op(struct nfsd4_op *);
480 481
482/*
483 * Memory needed just for the duration of processing one compound:
484 */
485struct svcxdr_tmpbuf {
486 struct svcxdr_tmpbuf *next;
487 char buf[];
488};
489
481struct nfsd4_compoundargs { 490struct nfsd4_compoundargs {
482 /* scratch variables for XDR decode */ 491 /* scratch variables for XDR decode */
483 __be32 * p; 492 __be32 * p;
@@ -486,11 +495,7 @@ struct nfsd4_compoundargs {
486 int pagelen; 495 int pagelen;
487 __be32 tmp[8]; 496 __be32 tmp[8];
488 __be32 * tmpp; 497 __be32 * tmpp;
489 struct tmpbuf { 498 struct svcxdr_tmpbuf *to_free;
490 struct tmpbuf *next;
491 void (*release)(const void *);
492 void *buf;
493 } *to_free;
494 499
495 struct svc_rqst *rqstp; 500 struct svc_rqst *rqstp;
496 501
@@ -574,7 +579,6 @@ extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
574extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, 579extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
575 struct nfsd4_compound_state *, 580 struct nfsd4_compound_state *,
576 struct nfsd4_setclientid_confirm *setclientid_confirm); 581 struct nfsd4_setclientid_confirm *setclientid_confirm);
577extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
578extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, 582extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
579 struct nfsd4_compound_state *, struct nfsd4_exchange_id *); 583 struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
580extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *); 584extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *);
@@ -585,6 +589,7 @@ extern __be32 nfsd4_create_session(struct svc_rqst *,
585extern __be32 nfsd4_sequence(struct svc_rqst *, 589extern __be32 nfsd4_sequence(struct svc_rqst *,
586 struct nfsd4_compound_state *, 590 struct nfsd4_compound_state *,
587 struct nfsd4_sequence *); 591 struct nfsd4_sequence *);
592extern void nfsd4_sequence_done(struct nfsd4_compoundres *resp);
588extern __be32 nfsd4_destroy_session(struct svc_rqst *, 593extern __be32 nfsd4_destroy_session(struct svc_rqst *,
589 struct nfsd4_compound_state *, 594 struct nfsd4_compound_state *,
590 struct nfsd4_destroy_session *); 595 struct nfsd4_destroy_session *);
@@ -594,7 +599,9 @@ extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
594 struct nfsd4_open *open, struct nfsd_net *nn); 599 struct nfsd4_open *open, struct nfsd_net *nn);
595extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, 600extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
596 struct svc_fh *current_fh, struct nfsd4_open *open); 601 struct svc_fh *current_fh, struct nfsd4_open *open);
597extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status); 602extern void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate);
603extern void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
604 struct nfsd4_open *open, __be32 status);
598extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, 605extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp,
599 struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); 606 struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc);
600extern __be32 nfsd4_close(struct svc_rqst *rqstp, 607extern __be32 nfsd4_close(struct svc_rqst *rqstp,
@@ -625,6 +632,7 @@ extern __be32 nfsd4_test_stateid(struct svc_rqst *rqstp,
625extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, 632extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp,
626 struct nfsd4_compound_state *, struct nfsd4_free_stateid *free_stateid); 633 struct nfsd4_compound_state *, struct nfsd4_free_stateid *free_stateid);
627extern void nfsd4_bump_seqid(struct nfsd4_compound_state *, __be32 nfserr); 634extern void nfsd4_bump_seqid(struct nfsd4_compound_state *, __be32 nfserr);
635
628#endif 636#endif
629 637
630/* 638/*
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
index 85c98737a146..fc603e0431bb 100644
--- a/fs/nilfs2/Makefile
+++ b/fs/nilfs2/Makefile
@@ -2,4 +2,4 @@ obj-$(CONFIG_NILFS2_FS) += nilfs2.o
2nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \ 2nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
3 btnode.o bmap.o btree.o direct.o dat.o recovery.o \ 3 btnode.o bmap.o btree.o direct.o dat.o recovery.o \
4 the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \ 4 the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
5 ifile.o alloc.o gcinode.o ioctl.o 5 ifile.o alloc.o gcinode.o ioctl.o sysfs.o
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 9bc72dec3fa6..0696161bf59d 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -320,6 +320,14 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
320int nilfs_init_gcinode(struct inode *inode); 320int nilfs_init_gcinode(struct inode *inode);
321void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs); 321void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs);
322 322
323/* sysfs.c */
324int __init nilfs_sysfs_init(void);
325void nilfs_sysfs_exit(void);
326int nilfs_sysfs_create_device_group(struct super_block *);
327void nilfs_sysfs_delete_device_group(struct the_nilfs *);
328int nilfs_sysfs_create_snapshot_group(struct nilfs_root *);
329void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *);
330
323/* 331/*
324 * Inodes and files operations 332 * Inodes and files operations
325 */ 333 */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 8c532b2ca3ab..228f5bdf0772 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -942,7 +942,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
942 iput(inode); 942 iput(inode);
943 } 943 }
944 } else { 944 } else {
945 dentry = d_obtain_alias(inode); 945 dentry = d_obtain_root(inode);
946 if (IS_ERR(dentry)) { 946 if (IS_ERR(dentry)) {
947 ret = PTR_ERR(dentry); 947 ret = PTR_ERR(dentry);
948 goto failed_dentry; 948 goto failed_dentry;
@@ -1452,13 +1452,19 @@ static int __init init_nilfs_fs(void)
1452 if (err) 1452 if (err)
1453 goto fail; 1453 goto fail;
1454 1454
1455 err = register_filesystem(&nilfs_fs_type); 1455 err = nilfs_sysfs_init();
1456 if (err) 1456 if (err)
1457 goto free_cachep; 1457 goto free_cachep;
1458 1458
1459 err = register_filesystem(&nilfs_fs_type);
1460 if (err)
1461 goto deinit_sysfs_entry;
1462
1459 printk(KERN_INFO "NILFS version 2 loaded\n"); 1463 printk(KERN_INFO "NILFS version 2 loaded\n");
1460 return 0; 1464 return 0;
1461 1465
1466deinit_sysfs_entry:
1467 nilfs_sysfs_exit();
1462free_cachep: 1468free_cachep:
1463 nilfs_destroy_cachep(); 1469 nilfs_destroy_cachep();
1464fail: 1470fail:
@@ -1468,6 +1474,7 @@ fail:
1468static void __exit exit_nilfs_fs(void) 1474static void __exit exit_nilfs_fs(void)
1469{ 1475{
1470 nilfs_destroy_cachep(); 1476 nilfs_destroy_cachep();
1477 nilfs_sysfs_exit();
1471 unregister_filesystem(&nilfs_fs_type); 1478 unregister_filesystem(&nilfs_fs_type);
1472} 1479}
1473 1480
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
new file mode 100644
index 000000000000..bbb0dcc35905
--- /dev/null
+++ b/fs/nilfs2/sysfs.c
@@ -0,0 +1,1137 @@
1/*
2 * sysfs.c - sysfs support implementation.
3 *
4 * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation.
5 * Copyright (C) 2014 HGST, Inc., a Western Digital Company.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * Written by Vyacheslav Dubeyko <Vyacheslav.Dubeyko@hgst.com>
18 */
19
20#include <linux/kobject.h>
21
22#include "nilfs.h"
23#include "mdt.h"
24#include "sufile.h"
25#include "cpfile.h"
26#include "sysfs.h"
27
28/* /sys/fs/<nilfs>/ */
29static struct kset *nilfs_kset;
30
31#define NILFS_SHOW_TIME(time_t_val, buf) ({ \
32 struct tm res; \
33 int count = 0; \
34 time_to_tm(time_t_val, 0, &res); \
35 res.tm_year += 1900; \
36 res.tm_mon += 1; \
37 count = scnprintf(buf, PAGE_SIZE, \
38 "%ld-%.2d-%.2d %.2d:%.2d:%.2d\n", \
39 res.tm_year, res.tm_mon, res.tm_mday, \
40 res.tm_hour, res.tm_min, res.tm_sec);\
41 count; \
42})
43
44#define NILFS_DEV_INT_GROUP_OPS(name, parent_name) \
45static ssize_t nilfs_##name##_attr_show(struct kobject *kobj, \
46 struct attribute *attr, char *buf) \
47{ \
48 struct the_nilfs *nilfs = container_of(kobj->parent, \
49 struct the_nilfs, \
50 ns_##parent_name##_kobj); \
51 struct nilfs_##name##_attr *a = container_of(attr, \
52 struct nilfs_##name##_attr, \
53 attr); \
54 return a->show ? a->show(a, nilfs, buf) : 0; \
55} \
56static ssize_t nilfs_##name##_attr_store(struct kobject *kobj, \
57 struct attribute *attr, \
58 const char *buf, size_t len) \
59{ \
60 struct the_nilfs *nilfs = container_of(kobj->parent, \
61 struct the_nilfs, \
62 ns_##parent_name##_kobj); \
63 struct nilfs_##name##_attr *a = container_of(attr, \
64 struct nilfs_##name##_attr, \
65 attr); \
66 return a->store ? a->store(a, nilfs, buf, len) : 0; \
67} \
68static const struct sysfs_ops nilfs_##name##_attr_ops = { \
69 .show = nilfs_##name##_attr_show, \
70 .store = nilfs_##name##_attr_store, \
71};
72
73#define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \
74static void nilfs_##name##_attr_release(struct kobject *kobj) \
75{ \
76 struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \
77 struct the_nilfs *nilfs = container_of(kobj->parent, \
78 struct the_nilfs, \
79 ns_##parent_name##_kobj); \
80 subgroups = nilfs->ns_##parent_name##_subgroups; \
81 complete(&subgroups->sg_##name##_kobj_unregister); \
82} \
83static struct kobj_type nilfs_##name##_ktype = { \
84 .default_attrs = nilfs_##name##_attrs, \
85 .sysfs_ops = &nilfs_##name##_attr_ops, \
86 .release = nilfs_##name##_attr_release, \
87};
88
89#define NILFS_DEV_INT_GROUP_FNS(name, parent_name) \
90static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \
91{ \
92 struct kobject *parent; \
93 struct kobject *kobj; \
94 struct completion *kobj_unregister; \
95 struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \
96 int err; \
97 subgroups = nilfs->ns_##parent_name##_subgroups; \
98 kobj = &subgroups->sg_##name##_kobj; \
99 kobj_unregister = &subgroups->sg_##name##_kobj_unregister; \
100 parent = &nilfs->ns_##parent_name##_kobj; \
101 kobj->kset = nilfs_kset; \
102 init_completion(kobj_unregister); \
103 err = kobject_init_and_add(kobj, &nilfs_##name##_ktype, parent, \
104 #name); \
105 if (err) \
106 return err; \
107 return 0; \
108} \
109static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \
110{ \
111 kobject_del(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \
112}
113
114/************************************************************************
115 * NILFS snapshot attrs *
116 ************************************************************************/
117
118static ssize_t
119nilfs_snapshot_inodes_count_show(struct nilfs_snapshot_attr *attr,
120 struct nilfs_root *root, char *buf)
121{
122 return snprintf(buf, PAGE_SIZE, "%llu\n",
123 (unsigned long long)atomic64_read(&root->inodes_count));
124}
125
126static ssize_t
127nilfs_snapshot_blocks_count_show(struct nilfs_snapshot_attr *attr,
128 struct nilfs_root *root, char *buf)
129{
130 return snprintf(buf, PAGE_SIZE, "%llu\n",
131 (unsigned long long)atomic64_read(&root->blocks_count));
132}
133
134static const char snapshot_readme_str[] =
135 "The group contains details about mounted snapshot.\n\n"
136 "(1) inodes_count\n\tshow number of inodes for snapshot.\n\n"
137 "(2) blocks_count\n\tshow number of blocks for snapshot.\n\n";
138
139static ssize_t
140nilfs_snapshot_README_show(struct nilfs_snapshot_attr *attr,
141 struct nilfs_root *root, char *buf)
142{
143 return snprintf(buf, PAGE_SIZE, snapshot_readme_str);
144}
145
146NILFS_SNAPSHOT_RO_ATTR(inodes_count);
147NILFS_SNAPSHOT_RO_ATTR(blocks_count);
148NILFS_SNAPSHOT_RO_ATTR(README);
149
150static struct attribute *nilfs_snapshot_attrs[] = {
151 NILFS_SNAPSHOT_ATTR_LIST(inodes_count),
152 NILFS_SNAPSHOT_ATTR_LIST(blocks_count),
153 NILFS_SNAPSHOT_ATTR_LIST(README),
154 NULL,
155};
156
157static ssize_t nilfs_snapshot_attr_show(struct kobject *kobj,
158 struct attribute *attr, char *buf)
159{
160 struct nilfs_root *root =
161 container_of(kobj, struct nilfs_root, snapshot_kobj);
162 struct nilfs_snapshot_attr *a =
163 container_of(attr, struct nilfs_snapshot_attr, attr);
164
165 return a->show ? a->show(a, root, buf) : 0;
166}
167
168static ssize_t nilfs_snapshot_attr_store(struct kobject *kobj,
169 struct attribute *attr,
170 const char *buf, size_t len)
171{
172 struct nilfs_root *root =
173 container_of(kobj, struct nilfs_root, snapshot_kobj);
174 struct nilfs_snapshot_attr *a =
175 container_of(attr, struct nilfs_snapshot_attr, attr);
176
177 return a->store ? a->store(a, root, buf, len) : 0;
178}
179
180static void nilfs_snapshot_attr_release(struct kobject *kobj)
181{
182 struct nilfs_root *root = container_of(kobj, struct nilfs_root,
183 snapshot_kobj);
184 complete(&root->snapshot_kobj_unregister);
185}
186
187static const struct sysfs_ops nilfs_snapshot_attr_ops = {
188 .show = nilfs_snapshot_attr_show,
189 .store = nilfs_snapshot_attr_store,
190};
191
192static struct kobj_type nilfs_snapshot_ktype = {
193 .default_attrs = nilfs_snapshot_attrs,
194 .sysfs_ops = &nilfs_snapshot_attr_ops,
195 .release = nilfs_snapshot_attr_release,
196};
197
198int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root)
199{
200 struct the_nilfs *nilfs;
201 struct kobject *parent;
202 int err;
203
204 nilfs = root->nilfs;
205 parent = &nilfs->ns_dev_subgroups->sg_mounted_snapshots_kobj;
206 root->snapshot_kobj.kset = nilfs_kset;
207 init_completion(&root->snapshot_kobj_unregister);
208
209 if (root->cno == NILFS_CPTREE_CURRENT_CNO) {
210 err = kobject_init_and_add(&root->snapshot_kobj,
211 &nilfs_snapshot_ktype,
212 &nilfs->ns_dev_kobj,
213 "current_checkpoint");
214 } else {
215 err = kobject_init_and_add(&root->snapshot_kobj,
216 &nilfs_snapshot_ktype,
217 parent,
218 "%llu", root->cno);
219 }
220
221 if (err)
222 return err;
223
224 return 0;
225}
226
227void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root)
228{
229 kobject_del(&root->snapshot_kobj);
230}
231
232/************************************************************************
233 * NILFS mounted snapshots attrs *
234 ************************************************************************/
235
236static const char mounted_snapshots_readme_str[] =
237 "The mounted_snapshots group contains group for\n"
238 "every mounted snapshot.\n";
239
240static ssize_t
241nilfs_mounted_snapshots_README_show(struct nilfs_mounted_snapshots_attr *attr,
242 struct the_nilfs *nilfs, char *buf)
243{
244 return snprintf(buf, PAGE_SIZE, mounted_snapshots_readme_str);
245}
246
247NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(README);
248
249static struct attribute *nilfs_mounted_snapshots_attrs[] = {
250 NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(README),
251 NULL,
252};
253
254NILFS_DEV_INT_GROUP_OPS(mounted_snapshots, dev);
255NILFS_DEV_INT_GROUP_TYPE(mounted_snapshots, dev);
256NILFS_DEV_INT_GROUP_FNS(mounted_snapshots, dev);
257
258/************************************************************************
259 * NILFS checkpoints attrs *
260 ************************************************************************/
261
262static ssize_t
263nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr,
264 struct the_nilfs *nilfs,
265 char *buf)
266{
267 __u64 ncheckpoints;
268 struct nilfs_cpstat cpstat;
269 int err;
270
271 down_read(&nilfs->ns_segctor_sem);
272 err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
273 up_read(&nilfs->ns_segctor_sem);
274 if (err < 0) {
275 printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n",
276 err);
277 return err;
278 }
279
280 ncheckpoints = cpstat.cs_ncps;
281
282 return snprintf(buf, PAGE_SIZE, "%llu\n", ncheckpoints);
283}
284
285static ssize_t
286nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr,
287 struct the_nilfs *nilfs,
288 char *buf)
289{
290 __u64 nsnapshots;
291 struct nilfs_cpstat cpstat;
292 int err;
293
294 down_read(&nilfs->ns_segctor_sem);
295 err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
296 up_read(&nilfs->ns_segctor_sem);
297 if (err < 0) {
298 printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n",
299 err);
300 return err;
301 }
302
303 nsnapshots = cpstat.cs_nsss;
304
305 return snprintf(buf, PAGE_SIZE, "%llu\n", nsnapshots);
306}
307
308static ssize_t
309nilfs_checkpoints_last_seg_checkpoint_show(struct nilfs_checkpoints_attr *attr,
310 struct the_nilfs *nilfs,
311 char *buf)
312{
313 __u64 last_cno;
314
315 spin_lock(&nilfs->ns_last_segment_lock);
316 last_cno = nilfs->ns_last_cno;
317 spin_unlock(&nilfs->ns_last_segment_lock);
318
319 return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno);
320}
321
322static ssize_t
323nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr,
324 struct the_nilfs *nilfs,
325 char *buf)
326{
327 __u64 cno;
328
329 down_read(&nilfs->ns_sem);
330 cno = nilfs->ns_cno;
331 up_read(&nilfs->ns_sem);
332
333 return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
334}
335
/* Contents of /sys/fs/nilfs2/<device>/checkpoints/README. */
static const char checkpoints_readme_str[] =
	"The checkpoints group contains attributes that describe\n"
	"details about volume's checkpoints.\n\n"
	"(1) checkpoints_number\n\tshow number of checkpoints on volume.\n\n"
	"(2) snapshots_number\n\tshow number of snapshots on volume.\n\n"
	"(3) last_seg_checkpoint\n"
	"\tshow checkpoint number of the latest segment.\n\n"
	"(4) next_checkpoint\n\tshow next checkpoint number.\n\n";
344
345static ssize_t
346nilfs_checkpoints_README_show(struct nilfs_checkpoints_attr *attr,
347 struct the_nilfs *nilfs, char *buf)
348{
349 return snprintf(buf, PAGE_SIZE, checkpoints_readme_str);
350}
351
/* Read-only attribute objects of the checkpoints group. */
NILFS_CHECKPOINTS_RO_ATTR(checkpoints_number);
NILFS_CHECKPOINTS_RO_ATTR(snapshots_number);
NILFS_CHECKPOINTS_RO_ATTR(last_seg_checkpoint);
NILFS_CHECKPOINTS_RO_ATTR(next_checkpoint);
NILFS_CHECKPOINTS_RO_ATTR(README);

/* Attribute table backing /sys/fs/nilfs2/<device>/checkpoints. */
static struct attribute *nilfs_checkpoints_attrs[] = {
	NILFS_CHECKPOINTS_ATTR_LIST(checkpoints_number),
	NILFS_CHECKPOINTS_ATTR_LIST(snapshots_number),
	NILFS_CHECKPOINTS_ATTR_LIST(last_seg_checkpoint),
	NILFS_CHECKPOINTS_ATTR_LIST(next_checkpoint),
	NILFS_CHECKPOINTS_ATTR_LIST(README),
	NULL,
};

/* Generate sysfs_ops, kobj_type and create/delete helpers for the group. */
NILFS_DEV_INT_GROUP_OPS(checkpoints, dev);
NILFS_DEV_INT_GROUP_TYPE(checkpoints, dev);
NILFS_DEV_INT_GROUP_FNS(checkpoints, dev);
370
371/************************************************************************
372 * NILFS segments attrs *
373 ************************************************************************/
374
375static ssize_t
376nilfs_segments_segments_number_show(struct nilfs_segments_attr *attr,
377 struct the_nilfs *nilfs,
378 char *buf)
379{
380 return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_nsegments);
381}
382
383static ssize_t
384nilfs_segments_blocks_per_segment_show(struct nilfs_segments_attr *attr,
385 struct the_nilfs *nilfs,
386 char *buf)
387{
388 return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_blocks_per_segment);
389}
390
391static ssize_t
392nilfs_segments_clean_segments_show(struct nilfs_segments_attr *attr,
393 struct the_nilfs *nilfs,
394 char *buf)
395{
396 unsigned long ncleansegs;
397
398 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
399 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
400 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
401
402 return snprintf(buf, PAGE_SIZE, "%lu\n", ncleansegs);
403}
404
405static ssize_t
406nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr,
407 struct the_nilfs *nilfs,
408 char *buf)
409{
410 struct nilfs_sustat sustat;
411 int err;
412
413 down_read(&nilfs->ns_segctor_sem);
414 err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
415 up_read(&nilfs->ns_segctor_sem);
416 if (err < 0) {
417 printk(KERN_ERR "NILFS: unable to get segment stat: err=%d\n",
418 err);
419 return err;
420 }
421
422 return snprintf(buf, PAGE_SIZE, "%llu\n", sustat.ss_ndirtysegs);
423}
424
/* Contents of /sys/fs/nilfs2/<device>/segments/README. */
static const char segments_readme_str[] =
	"The segments group contains attributes that describe\n"
	"details about volume's segments.\n\n"
	"(1) segments_number\n\tshow number of segments on volume.\n\n"
	"(2) blocks_per_segment\n\tshow number of blocks in segment.\n\n"
	"(3) clean_segments\n\tshow count of clean segments.\n\n"
	"(4) dirty_segments\n\tshow count of dirty segments.\n\n";
432
433static ssize_t
434nilfs_segments_README_show(struct nilfs_segments_attr *attr,
435 struct the_nilfs *nilfs,
436 char *buf)
437{
438 return snprintf(buf, PAGE_SIZE, segments_readme_str);
439}
440
/* Read-only attribute objects of the segments group. */
NILFS_SEGMENTS_RO_ATTR(segments_number);
NILFS_SEGMENTS_RO_ATTR(blocks_per_segment);
NILFS_SEGMENTS_RO_ATTR(clean_segments);
NILFS_SEGMENTS_RO_ATTR(dirty_segments);
NILFS_SEGMENTS_RO_ATTR(README);

/* Attribute table backing /sys/fs/nilfs2/<device>/segments. */
static struct attribute *nilfs_segments_attrs[] = {
	NILFS_SEGMENTS_ATTR_LIST(segments_number),
	NILFS_SEGMENTS_ATTR_LIST(blocks_per_segment),
	NILFS_SEGMENTS_ATTR_LIST(clean_segments),
	NILFS_SEGMENTS_ATTR_LIST(dirty_segments),
	NILFS_SEGMENTS_ATTR_LIST(README),
	NULL,
};

/* Generate sysfs_ops, kobj_type and create/delete helpers for the group. */
NILFS_DEV_INT_GROUP_OPS(segments, dev);
NILFS_DEV_INT_GROUP_TYPE(segments, dev);
NILFS_DEV_INT_GROUP_FNS(segments, dev);
459
460/************************************************************************
461 * NILFS segctor attrs *
462 ************************************************************************/
463
464static ssize_t
465nilfs_segctor_last_pseg_block_show(struct nilfs_segctor_attr *attr,
466 struct the_nilfs *nilfs,
467 char *buf)
468{
469 sector_t last_pseg;
470
471 spin_lock(&nilfs->ns_last_segment_lock);
472 last_pseg = nilfs->ns_last_pseg;
473 spin_unlock(&nilfs->ns_last_segment_lock);
474
475 return snprintf(buf, PAGE_SIZE, "%llu\n",
476 (unsigned long long)last_pseg);
477}
478
479static ssize_t
480nilfs_segctor_last_seg_sequence_show(struct nilfs_segctor_attr *attr,
481 struct the_nilfs *nilfs,
482 char *buf)
483{
484 u64 last_seq;
485
486 spin_lock(&nilfs->ns_last_segment_lock);
487 last_seq = nilfs->ns_last_seq;
488 spin_unlock(&nilfs->ns_last_segment_lock);
489
490 return snprintf(buf, PAGE_SIZE, "%llu\n", last_seq);
491}
492
493static ssize_t
494nilfs_segctor_last_seg_checkpoint_show(struct nilfs_segctor_attr *attr,
495 struct the_nilfs *nilfs,
496 char *buf)
497{
498 __u64 last_cno;
499
500 spin_lock(&nilfs->ns_last_segment_lock);
501 last_cno = nilfs->ns_last_cno;
502 spin_unlock(&nilfs->ns_last_segment_lock);
503
504 return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno);
505}
506
507static ssize_t
508nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr,
509 struct the_nilfs *nilfs,
510 char *buf)
511{
512 u64 seg_seq;
513
514 down_read(&nilfs->ns_sem);
515 seg_seq = nilfs->ns_seg_seq;
516 up_read(&nilfs->ns_sem);
517
518 return snprintf(buf, PAGE_SIZE, "%llu\n", seg_seq);
519}
520
521static ssize_t
522nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr,
523 struct the_nilfs *nilfs,
524 char *buf)
525{
526 __u64 segnum;
527
528 down_read(&nilfs->ns_sem);
529 segnum = nilfs->ns_segnum;
530 up_read(&nilfs->ns_sem);
531
532 return snprintf(buf, PAGE_SIZE, "%llu\n", segnum);
533}
534
535static ssize_t
536nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr,
537 struct the_nilfs *nilfs,
538 char *buf)
539{
540 __u64 nextnum;
541
542 down_read(&nilfs->ns_sem);
543 nextnum = nilfs->ns_nextnum;
544 up_read(&nilfs->ns_sem);
545
546 return snprintf(buf, PAGE_SIZE, "%llu\n", nextnum);
547}
548
549static ssize_t
550nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr,
551 struct the_nilfs *nilfs,
552 char *buf)
553{
554 unsigned long pseg_offset;
555
556 down_read(&nilfs->ns_sem);
557 pseg_offset = nilfs->ns_pseg_offset;
558 up_read(&nilfs->ns_sem);
559
560 return snprintf(buf, PAGE_SIZE, "%lu\n", pseg_offset);
561}
562
563static ssize_t
564nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr,
565 struct the_nilfs *nilfs,
566 char *buf)
567{
568 __u64 cno;
569
570 down_read(&nilfs->ns_sem);
571 cno = nilfs->ns_cno;
572 up_read(&nilfs->ns_sem);
573
574 return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
575}
576
577static ssize_t
578nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr,
579 struct the_nilfs *nilfs,
580 char *buf)
581{
582 time_t ctime;
583
584 down_read(&nilfs->ns_sem);
585 ctime = nilfs->ns_ctime;
586 up_read(&nilfs->ns_sem);
587
588 return NILFS_SHOW_TIME(ctime, buf);
589}
590
591static ssize_t
592nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr,
593 struct the_nilfs *nilfs,
594 char *buf)
595{
596 time_t ctime;
597
598 down_read(&nilfs->ns_sem);
599 ctime = nilfs->ns_ctime;
600 up_read(&nilfs->ns_sem);
601
602 return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime);
603}
604
605static ssize_t
606nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr,
607 struct the_nilfs *nilfs,
608 char *buf)
609{
610 time_t nongc_ctime;
611
612 down_read(&nilfs->ns_sem);
613 nongc_ctime = nilfs->ns_nongc_ctime;
614 up_read(&nilfs->ns_sem);
615
616 return NILFS_SHOW_TIME(nongc_ctime, buf);
617}
618
619static ssize_t
620nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr,
621 struct the_nilfs *nilfs,
622 char *buf)
623{
624 time_t nongc_ctime;
625
626 down_read(&nilfs->ns_sem);
627 nongc_ctime = nilfs->ns_nongc_ctime;
628 up_read(&nilfs->ns_sem);
629
630 return snprintf(buf, PAGE_SIZE, "%llu\n",
631 (unsigned long long)nongc_ctime);
632}
633
634static ssize_t
635nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr,
636 struct the_nilfs *nilfs,
637 char *buf)
638{
639 u32 ndirtyblks;
640
641 down_read(&nilfs->ns_sem);
642 ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks);
643 up_read(&nilfs->ns_sem);
644
645 return snprintf(buf, PAGE_SIZE, "%u\n", ndirtyblks);
646}
647
/* Contents of /sys/fs/nilfs2/<device>/segctor/README. */
static const char segctor_readme_str[] =
	"The segctor group contains attributes that describe\n"
	"segctor thread activity details.\n\n"
	"(1) last_pseg_block\n"
	"\tshow start block number of the latest segment.\n\n"
	"(2) last_seg_sequence\n"
	"\tshow sequence value of the latest segment.\n\n"
	"(3) last_seg_checkpoint\n"
	"\tshow checkpoint number of the latest segment.\n\n"
	"(4) current_seg_sequence\n\tshow segment sequence counter.\n\n"
	"(5) current_last_full_seg\n"
	"\tshow index number of the latest full segment.\n\n"
	"(6) next_full_seg\n"
	"\tshow index number of the full segment index to be used next.\n\n"
	"(7) next_pseg_offset\n"
	"\tshow offset of next partial segment in the current full segment.\n\n"
	"(8) next_checkpoint\n\tshow next checkpoint number.\n\n"
	"(9) last_seg_write_time\n"
	"\tshow write time of the last segment in human-readable format.\n\n"
	"(10) last_seg_write_time_secs\n"
	"\tshow write time of the last segment in seconds.\n\n"
	"(11) last_nongc_write_time\n"
	"\tshow write time of the last segment not for cleaner operation "
	"in human-readable format.\n\n"
	"(12) last_nongc_write_time_secs\n"
	"\tshow write time of the last segment not for cleaner operation "
	"in seconds.\n\n"
	"(13) dirty_data_blocks_count\n"
	"\tshow number of dirty data blocks.\n\n";
677
678static ssize_t
679nilfs_segctor_README_show(struct nilfs_segctor_attr *attr,
680 struct the_nilfs *nilfs, char *buf)
681{
682 return snprintf(buf, PAGE_SIZE, segctor_readme_str);
683}
684
/* Read-only attribute objects of the segctor group. */
NILFS_SEGCTOR_RO_ATTR(last_pseg_block);
NILFS_SEGCTOR_RO_ATTR(last_seg_sequence);
NILFS_SEGCTOR_RO_ATTR(last_seg_checkpoint);
NILFS_SEGCTOR_RO_ATTR(current_seg_sequence);
NILFS_SEGCTOR_RO_ATTR(current_last_full_seg);
NILFS_SEGCTOR_RO_ATTR(next_full_seg);
NILFS_SEGCTOR_RO_ATTR(next_pseg_offset);
NILFS_SEGCTOR_RO_ATTR(next_checkpoint);
NILFS_SEGCTOR_RO_ATTR(last_seg_write_time);
NILFS_SEGCTOR_RO_ATTR(last_seg_write_time_secs);
NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time);
NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time_secs);
NILFS_SEGCTOR_RO_ATTR(dirty_data_blocks_count);
NILFS_SEGCTOR_RO_ATTR(README);

/* Attribute table backing /sys/fs/nilfs2/<device>/segctor. */
static struct attribute *nilfs_segctor_attrs[] = {
	NILFS_SEGCTOR_ATTR_LIST(last_pseg_block),
	NILFS_SEGCTOR_ATTR_LIST(last_seg_sequence),
	NILFS_SEGCTOR_ATTR_LIST(last_seg_checkpoint),
	NILFS_SEGCTOR_ATTR_LIST(current_seg_sequence),
	NILFS_SEGCTOR_ATTR_LIST(current_last_full_seg),
	NILFS_SEGCTOR_ATTR_LIST(next_full_seg),
	NILFS_SEGCTOR_ATTR_LIST(next_pseg_offset),
	NILFS_SEGCTOR_ATTR_LIST(next_checkpoint),
	NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time),
	NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time_secs),
	NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time),
	NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time_secs),
	NILFS_SEGCTOR_ATTR_LIST(dirty_data_blocks_count),
	NILFS_SEGCTOR_ATTR_LIST(README),
	NULL,
};

/* Generate sysfs_ops, kobj_type and create/delete helpers for the group. */
NILFS_DEV_INT_GROUP_OPS(segctor, dev);
NILFS_DEV_INT_GROUP_TYPE(segctor, dev);
NILFS_DEV_INT_GROUP_FNS(segctor, dev);
721
722/************************************************************************
723 * NILFS superblock attrs *
724 ************************************************************************/
725
726static ssize_t
727nilfs_superblock_sb_write_time_show(struct nilfs_superblock_attr *attr,
728 struct the_nilfs *nilfs,
729 char *buf)
730{
731 time_t sbwtime;
732
733 down_read(&nilfs->ns_sem);
734 sbwtime = nilfs->ns_sbwtime;
735 up_read(&nilfs->ns_sem);
736
737 return NILFS_SHOW_TIME(sbwtime, buf);
738}
739
740static ssize_t
741nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr,
742 struct the_nilfs *nilfs,
743 char *buf)
744{
745 time_t sbwtime;
746
747 down_read(&nilfs->ns_sem);
748 sbwtime = nilfs->ns_sbwtime;
749 up_read(&nilfs->ns_sem);
750
751 return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)sbwtime);
752}
753
754static ssize_t
755nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr,
756 struct the_nilfs *nilfs,
757 char *buf)
758{
759 unsigned sbwcount;
760
761 down_read(&nilfs->ns_sem);
762 sbwcount = nilfs->ns_sbwcount;
763 up_read(&nilfs->ns_sem);
764
765 return snprintf(buf, PAGE_SIZE, "%u\n", sbwcount);
766}
767
768static ssize_t
769nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr,
770 struct the_nilfs *nilfs,
771 char *buf)
772{
773 unsigned sb_update_freq;
774
775 down_read(&nilfs->ns_sem);
776 sb_update_freq = nilfs->ns_sb_update_freq;
777 up_read(&nilfs->ns_sem);
778
779 return snprintf(buf, PAGE_SIZE, "%u\n", sb_update_freq);
780}
781
782static ssize_t
783nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr,
784 struct the_nilfs *nilfs,
785 const char *buf, size_t count)
786{
787 unsigned val;
788 int err;
789
790 err = kstrtouint(skip_spaces(buf), 0, &val);
791 if (err) {
792 printk(KERN_ERR "NILFS: unable to convert string: err=%d\n",
793 err);
794 return err;
795 }
796
797 if (val < NILFS_SB_FREQ) {
798 val = NILFS_SB_FREQ;
799 printk(KERN_WARNING "NILFS: superblock update frequency cannot be lesser than 10 seconds\n");
800 }
801
802 down_write(&nilfs->ns_sem);
803 nilfs->ns_sb_update_freq = val;
804 up_write(&nilfs->ns_sem);
805
806 return count;
807}
808
/* Contents of /sys/fs/nilfs2/<device>/superblock/README. */
static const char sb_readme_str[] =
	"The superblock group contains attributes that describe\n"
	"superblock's details.\n\n"
	"(1) sb_write_time\n\tshow previous write time of super block "
	"in human-readable format.\n\n"
	"(2) sb_write_time_secs\n\tshow previous write time of super block "
	"in seconds.\n\n"
	"(3) sb_write_count\n\tshow write count of super block.\n\n"
	"(4) sb_update_frequency\n"
	"\tshow/set interval of periodical update of superblock (in seconds).\n\n"
	"\tYou can set preferable frequency of superblock update by command:\n\n"
	"\t'echo <val> > /sys/fs/<nilfs>/<dev>/superblock/sb_update_frequency'\n";
821
822static ssize_t
823nilfs_superblock_README_show(struct nilfs_superblock_attr *attr,
824 struct the_nilfs *nilfs, char *buf)
825{
826 return snprintf(buf, PAGE_SIZE, sb_readme_str);
827}
828
/* Attribute objects of the superblock group (sb_update_frequency is RW). */
NILFS_SUPERBLOCK_RO_ATTR(sb_write_time);
NILFS_SUPERBLOCK_RO_ATTR(sb_write_time_secs);
NILFS_SUPERBLOCK_RO_ATTR(sb_write_count);
NILFS_SUPERBLOCK_RW_ATTR(sb_update_frequency);
NILFS_SUPERBLOCK_RO_ATTR(README);

/* Attribute table backing /sys/fs/nilfs2/<device>/superblock. */
static struct attribute *nilfs_superblock_attrs[] = {
	NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time),
	NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time_secs),
	NILFS_SUPERBLOCK_ATTR_LIST(sb_write_count),
	NILFS_SUPERBLOCK_ATTR_LIST(sb_update_frequency),
	NILFS_SUPERBLOCK_ATTR_LIST(README),
	NULL,
};

/* Generate sysfs_ops, kobj_type and create/delete helpers for the group. */
NILFS_DEV_INT_GROUP_OPS(superblock, dev);
NILFS_DEV_INT_GROUP_TYPE(superblock, dev);
NILFS_DEV_INT_GROUP_FNS(superblock, dev);
847
848/************************************************************************
849 * NILFS device attrs *
850 ************************************************************************/
851
852static
853ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr,
854 struct the_nilfs *nilfs,
855 char *buf)
856{
857 struct nilfs_super_block **sbp = nilfs->ns_sbp;
858 u32 major = le32_to_cpu(sbp[0]->s_rev_level);
859 u16 minor = le16_to_cpu(sbp[0]->s_minor_rev_level);
860
861 return snprintf(buf, PAGE_SIZE, "%d.%d\n", major, minor);
862}
863
864static
865ssize_t nilfs_dev_blocksize_show(struct nilfs_dev_attr *attr,
866 struct the_nilfs *nilfs,
867 char *buf)
868{
869 return snprintf(buf, PAGE_SIZE, "%u\n", nilfs->ns_blocksize);
870}
871
872static
873ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr,
874 struct the_nilfs *nilfs,
875 char *buf)
876{
877 struct nilfs_super_block **sbp = nilfs->ns_sbp;
878 u64 dev_size = le64_to_cpu(sbp[0]->s_dev_size);
879
880 return snprintf(buf, PAGE_SIZE, "%llu\n", dev_size);
881}
882
883static
884ssize_t nilfs_dev_free_blocks_show(struct nilfs_dev_attr *attr,
885 struct the_nilfs *nilfs,
886 char *buf)
887{
888 sector_t free_blocks = 0;
889
890 nilfs_count_free_blocks(nilfs, &free_blocks);
891 return snprintf(buf, PAGE_SIZE, "%llu\n",
892 (unsigned long long)free_blocks);
893}
894
895static
896ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr,
897 struct the_nilfs *nilfs,
898 char *buf)
899{
900 struct nilfs_super_block **sbp = nilfs->ns_sbp;
901
902 return snprintf(buf, PAGE_SIZE, "%pUb\n", sbp[0]->s_uuid);
903}
904
905static
906ssize_t nilfs_dev_volume_name_show(struct nilfs_dev_attr *attr,
907 struct the_nilfs *nilfs,
908 char *buf)
909{
910 struct nilfs_super_block **sbp = nilfs->ns_sbp;
911
912 return scnprintf(buf, sizeof(sbp[0]->s_volume_name), "%s\n",
913 sbp[0]->s_volume_name);
914}
915
/* Contents of /sys/fs/nilfs2/<device>/README. */
static const char dev_readme_str[] =
	"The <device> group contains attributes that describe file system\n"
	"partition's details.\n\n"
	"(1) revision\n\tshow NILFS file system revision.\n\n"
	"(2) blocksize\n\tshow volume block size in bytes.\n\n"
	"(3) device_size\n\tshow volume size in bytes.\n\n"
	"(4) free_blocks\n\tshow count of free blocks on volume.\n\n"
	"(5) uuid\n\tshow volume's UUID.\n\n"
	"(6) volume_name\n\tshow volume's name.\n\n";
925
926static ssize_t nilfs_dev_README_show(struct nilfs_dev_attr *attr,
927 struct the_nilfs *nilfs,
928 char *buf)
929{
930 return snprintf(buf, PAGE_SIZE, dev_readme_str);
931}
932
/* Read-only attribute objects of the per-device group. */
NILFS_DEV_RO_ATTR(revision);
NILFS_DEV_RO_ATTR(blocksize);
NILFS_DEV_RO_ATTR(device_size);
NILFS_DEV_RO_ATTR(free_blocks);
NILFS_DEV_RO_ATTR(uuid);
NILFS_DEV_RO_ATTR(volume_name);
NILFS_DEV_RO_ATTR(README);

/* Attribute table backing /sys/fs/nilfs2/<device>. */
static struct attribute *nilfs_dev_attrs[] = {
	NILFS_DEV_ATTR_LIST(revision),
	NILFS_DEV_ATTR_LIST(blocksize),
	NILFS_DEV_ATTR_LIST(device_size),
	NILFS_DEV_ATTR_LIST(free_blocks),
	NILFS_DEV_ATTR_LIST(uuid),
	NILFS_DEV_ATTR_LIST(volume_name),
	NILFS_DEV_ATTR_LIST(README),
	NULL,
};
951
952static ssize_t nilfs_dev_attr_show(struct kobject *kobj,
953 struct attribute *attr, char *buf)
954{
955 struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs,
956 ns_dev_kobj);
957 struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr,
958 attr);
959
960 return a->show ? a->show(a, nilfs, buf) : 0;
961}
962
963static ssize_t nilfs_dev_attr_store(struct kobject *kobj,
964 struct attribute *attr,
965 const char *buf, size_t len)
966{
967 struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs,
968 ns_dev_kobj);
969 struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr,
970 attr);
971
972 return a->store ? a->store(a, nilfs, buf, len) : 0;
973}
974
/*
 * Release callback of the per-device kobject; signals waiters that the
 * kobject has been fully released.
 */
static void nilfs_dev_attr_release(struct kobject *kobj)
{
	struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs,
						ns_dev_kobj);
	complete(&nilfs->ns_dev_kobj_unregister);
}

/* sysfs read/write dispatch table for the per-device kobject. */
static const struct sysfs_ops nilfs_dev_attr_ops = {
	.show = nilfs_dev_attr_show,
	.store = nilfs_dev_attr_store,
};

/* kobj_type of /sys/fs/nilfs2/<device>. */
static struct kobj_type nilfs_dev_ktype = {
	.default_attrs = nilfs_dev_attrs,
	.sysfs_ops = &nilfs_dev_attr_ops,
	.release = nilfs_dev_attr_release,
};
992
993int nilfs_sysfs_create_device_group(struct super_block *sb)
994{
995 struct the_nilfs *nilfs = sb->s_fs_info;
996 size_t devgrp_size = sizeof(struct nilfs_sysfs_dev_subgroups);
997 int err;
998
999 nilfs->ns_dev_subgroups = kzalloc(devgrp_size, GFP_KERNEL);
1000 if (unlikely(!nilfs->ns_dev_subgroups)) {
1001 err = -ENOMEM;
1002 printk(KERN_ERR "NILFS: unable to allocate memory for device group\n");
1003 goto failed_create_device_group;
1004 }
1005
1006 nilfs->ns_dev_kobj.kset = nilfs_kset;
1007 init_completion(&nilfs->ns_dev_kobj_unregister);
1008 err = kobject_init_and_add(&nilfs->ns_dev_kobj, &nilfs_dev_ktype, NULL,
1009 "%s", sb->s_id);
1010 if (err)
1011 goto free_dev_subgroups;
1012
1013 err = nilfs_sysfs_create_mounted_snapshots_group(nilfs);
1014 if (err)
1015 goto cleanup_dev_kobject;
1016
1017 err = nilfs_sysfs_create_checkpoints_group(nilfs);
1018 if (err)
1019 goto delete_mounted_snapshots_group;
1020
1021 err = nilfs_sysfs_create_segments_group(nilfs);
1022 if (err)
1023 goto delete_checkpoints_group;
1024
1025 err = nilfs_sysfs_create_superblock_group(nilfs);
1026 if (err)
1027 goto delete_segments_group;
1028
1029 err = nilfs_sysfs_create_segctor_group(nilfs);
1030 if (err)
1031 goto delete_superblock_group;
1032
1033 return 0;
1034
1035delete_superblock_group:
1036 nilfs_sysfs_delete_superblock_group(nilfs);
1037
1038delete_segments_group:
1039 nilfs_sysfs_delete_segments_group(nilfs);
1040
1041delete_checkpoints_group:
1042 nilfs_sysfs_delete_checkpoints_group(nilfs);
1043
1044delete_mounted_snapshots_group:
1045 nilfs_sysfs_delete_mounted_snapshots_group(nilfs);
1046
1047cleanup_dev_kobject:
1048 kobject_del(&nilfs->ns_dev_kobj);
1049
1050free_dev_subgroups:
1051 kfree(nilfs->ns_dev_subgroups);
1052
1053failed_create_device_group:
1054 return err;
1055}
1056
1057void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs)
1058{
1059 nilfs_sysfs_delete_mounted_snapshots_group(nilfs);
1060 nilfs_sysfs_delete_checkpoints_group(nilfs);
1061 nilfs_sysfs_delete_segments_group(nilfs);
1062 nilfs_sysfs_delete_superblock_group(nilfs);
1063 nilfs_sysfs_delete_segctor_group(nilfs);
1064 kobject_del(&nilfs->ns_dev_kobj);
1065 kfree(nilfs->ns_dev_subgroups);
1066}
1067
1068/************************************************************************
1069 * NILFS feature attrs *
1070 ************************************************************************/
1071
1072static ssize_t nilfs_feature_revision_show(struct kobject *kobj,
1073 struct attribute *attr, char *buf)
1074{
1075 return snprintf(buf, PAGE_SIZE, "%d.%d\n",
1076 NILFS_CURRENT_REV, NILFS_MINOR_REV);
1077}
1078
/* Contents of /sys/fs/nilfs2/features/README. */
static const char features_readme_str[] =
	"The features group contains attributes that describe NILFS file\n"
	"system driver features.\n\n"
	"(1) revision\n\tshow current revision of NILFS file system driver.\n";
1083
1084static ssize_t nilfs_feature_README_show(struct kobject *kobj,
1085 struct attribute *attr,
1086 char *buf)
1087{
1088 return snprintf(buf, PAGE_SIZE, features_readme_str);
1089}
1090
/* Read-only attribute objects of the features group. */
NILFS_FEATURE_RO_ATTR(revision);
NILFS_FEATURE_RO_ATTR(README);

/* Attribute table backing /sys/fs/nilfs2/features. */
static struct attribute *nilfs_feature_attrs[] = {
	NILFS_FEATURE_ATTR_LIST(revision),
	NILFS_FEATURE_ATTR_LIST(README),
	NULL,
};

/* "features" group registered directly on the nilfs2 kset kobject. */
static const struct attribute_group nilfs_feature_attr_group = {
	.name = "features",
	.attrs = nilfs_feature_attrs,
};
1104
1105int __init nilfs_sysfs_init(void)
1106{
1107 int err;
1108
1109 nilfs_kset = kset_create_and_add(NILFS_ROOT_GROUP_NAME, NULL, fs_kobj);
1110 if (!nilfs_kset) {
1111 err = -ENOMEM;
1112 printk(KERN_ERR "NILFS: unable to create sysfs entry: err %d\n",
1113 err);
1114 goto failed_sysfs_init;
1115 }
1116
1117 err = sysfs_create_group(&nilfs_kset->kobj, &nilfs_feature_attr_group);
1118 if (unlikely(err)) {
1119 printk(KERN_ERR "NILFS: unable to create feature group: err %d\n",
1120 err);
1121 goto cleanup_sysfs_init;
1122 }
1123
1124 return 0;
1125
1126cleanup_sysfs_init:
1127 kset_unregister(nilfs_kset);
1128
1129failed_sysfs_init:
1130 return err;
1131}
1132
/*
 * nilfs_sysfs_exit - remove the "features" group and unregister the
 * /sys/fs/nilfs2 kset on module unload.
 */
void nilfs_sysfs_exit(void)
{
	sysfs_remove_group(&nilfs_kset->kobj, &nilfs_feature_attr_group);
	kset_unregister(nilfs_kset);
}
diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h
new file mode 100644
index 000000000000..677e3a1a8370
--- /dev/null
+++ b/fs/nilfs2/sysfs.h
@@ -0,0 +1,176 @@
1/*
2 * sysfs.h - sysfs support declarations.
3 *
4 * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation.
5 * Copyright (C) 2014 HGST, Inc., a Western Digital Company.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * Written by Vyacheslav Dubeyko <Vyacheslav.Dubeyko@hgst.com>
18 */
19
20#ifndef _NILFS_SYSFS_H
21#define _NILFS_SYSFS_H
22
23#include <linux/sysfs.h>
24
25#define NILFS_ROOT_GROUP_NAME "nilfs2"
26
27/*
28 * struct nilfs_sysfs_dev_subgroups - device subgroup kernel objects
29 * @sg_superblock_kobj: /sys/fs/<nilfs>/<device>/superblock
30 * @sg_superblock_kobj_unregister: completion state
31 * @sg_segctor_kobj: /sys/fs/<nilfs>/<device>/segctor
32 * @sg_segctor_kobj_unregister: completion state
33 * @sg_mounted_snapshots_kobj: /sys/fs/<nilfs>/<device>/mounted_snapshots
34 * @sg_mounted_snapshots_kobj_unregister: completion state
35 * @sg_checkpoints_kobj: /sys/fs/<nilfs>/<device>/checkpoints
36 * @sg_checkpoints_kobj_unregister: completion state
37 * @sg_segments_kobj: /sys/fs/<nilfs>/<device>/segments
38 * @sg_segments_kobj_unregister: completion state
39 */
40struct nilfs_sysfs_dev_subgroups {
41 /* /sys/fs/<nilfs>/<device>/superblock */
42 struct kobject sg_superblock_kobj;
43 struct completion sg_superblock_kobj_unregister;
44
45 /* /sys/fs/<nilfs>/<device>/segctor */
46 struct kobject sg_segctor_kobj;
47 struct completion sg_segctor_kobj_unregister;
48
49 /* /sys/fs/<nilfs>/<device>/mounted_snapshots */
50 struct kobject sg_mounted_snapshots_kobj;
51 struct completion sg_mounted_snapshots_kobj_unregister;
52
53 /* /sys/fs/<nilfs>/<device>/checkpoints */
54 struct kobject sg_checkpoints_kobj;
55 struct completion sg_checkpoints_kobj_unregister;
56
57 /* /sys/fs/<nilfs>/<device>/segments */
58 struct kobject sg_segments_kobj;
59 struct completion sg_segments_kobj_unregister;
60};
61
62#define NILFS_COMMON_ATTR_STRUCT(name) \
63struct nilfs_##name##_attr { \
64 struct attribute attr; \
65 ssize_t (*show)(struct kobject *, struct attribute *, \
66 char *); \
67 ssize_t (*store)(struct kobject *, struct attribute *, \
68 const char *, size_t); \
69};
70
71NILFS_COMMON_ATTR_STRUCT(feature);
72
73#define NILFS_DEV_ATTR_STRUCT(name) \
74struct nilfs_##name##_attr { \
75 struct attribute attr; \
76 ssize_t (*show)(struct nilfs_##name##_attr *, struct the_nilfs *, \
77 char *); \
78 ssize_t (*store)(struct nilfs_##name##_attr *, struct the_nilfs *, \
79 const char *, size_t); \
80};
81
82NILFS_DEV_ATTR_STRUCT(dev);
83NILFS_DEV_ATTR_STRUCT(segments);
84NILFS_DEV_ATTR_STRUCT(mounted_snapshots);
85NILFS_DEV_ATTR_STRUCT(checkpoints);
86NILFS_DEV_ATTR_STRUCT(superblock);
87NILFS_DEV_ATTR_STRUCT(segctor);
88
89#define NILFS_CP_ATTR_STRUCT(name) \
90struct nilfs_##name##_attr { \
91 struct attribute attr; \
92 ssize_t (*show)(struct nilfs_##name##_attr *, struct nilfs_root *, \
93 char *); \
94 ssize_t (*store)(struct nilfs_##name##_attr *, struct nilfs_root *, \
95 const char *, size_t); \
96};
97
98NILFS_CP_ATTR_STRUCT(snapshot);
99
100#define NILFS_ATTR(type, name, mode, show, store) \
101 static struct nilfs_##type##_attr nilfs_##type##_attr_##name = \
102 __ATTR(name, mode, show, store)
103
104#define NILFS_INFO_ATTR(type, name) \
105 NILFS_ATTR(type, name, 0444, NULL, NULL)
106#define NILFS_RO_ATTR(type, name) \
107 NILFS_ATTR(type, name, 0444, nilfs_##type##_##name##_show, NULL)
108#define NILFS_RW_ATTR(type, name) \
109 NILFS_ATTR(type, name, 0644, \
110 nilfs_##type##_##name##_show, \
111 nilfs_##type##_##name##_store)
112
113#define NILFS_FEATURE_INFO_ATTR(name) \
114 NILFS_INFO_ATTR(feature, name)
115#define NILFS_FEATURE_RO_ATTR(name) \
116 NILFS_RO_ATTR(feature, name)
117#define NILFS_FEATURE_RW_ATTR(name) \
118 NILFS_RW_ATTR(feature, name)
119
120#define NILFS_DEV_INFO_ATTR(name) \
121 NILFS_INFO_ATTR(dev, name)
122#define NILFS_DEV_RO_ATTR(name) \
123 NILFS_RO_ATTR(dev, name)
124#define NILFS_DEV_RW_ATTR(name) \
125 NILFS_RW_ATTR(dev, name)
126
127#define NILFS_SEGMENTS_RO_ATTR(name) \
128 NILFS_RO_ATTR(segments, name)
129#define NILFS_SEGMENTS_RW_ATTR(name) \
130 NILFS_RW_ATTR(segs_info, name)
131
132#define NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(name) \
133 NILFS_RO_ATTR(mounted_snapshots, name)
134
135#define NILFS_CHECKPOINTS_RO_ATTR(name) \
136 NILFS_RO_ATTR(checkpoints, name)
137#define NILFS_CHECKPOINTS_RW_ATTR(name) \
138 NILFS_RW_ATTR(checkpoints, name)
139
140#define NILFS_SNAPSHOT_INFO_ATTR(name) \
141 NILFS_INFO_ATTR(snapshot, name)
142#define NILFS_SNAPSHOT_RO_ATTR(name) \
143 NILFS_RO_ATTR(snapshot, name)
144#define NILFS_SNAPSHOT_RW_ATTR(name) \
145 NILFS_RW_ATTR(snapshot, name)
146
147#define NILFS_SUPERBLOCK_RO_ATTR(name) \
148 NILFS_RO_ATTR(superblock, name)
149#define NILFS_SUPERBLOCK_RW_ATTR(name) \
150 NILFS_RW_ATTR(superblock, name)
151
152#define NILFS_SEGCTOR_INFO_ATTR(name) \
153 NILFS_INFO_ATTR(segctor, name)
154#define NILFS_SEGCTOR_RO_ATTR(name) \
155 NILFS_RO_ATTR(segctor, name)
156#define NILFS_SEGCTOR_RW_ATTR(name) \
157 NILFS_RW_ATTR(segctor, name)
158
159#define NILFS_FEATURE_ATTR_LIST(name) \
160 (&nilfs_feature_attr_##name.attr)
161#define NILFS_DEV_ATTR_LIST(name) \
162 (&nilfs_dev_attr_##name.attr)
163#define NILFS_SEGMENTS_ATTR_LIST(name) \
164 (&nilfs_segments_attr_##name.attr)
165#define NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(name) \
166 (&nilfs_mounted_snapshots_attr_##name.attr)
167#define NILFS_CHECKPOINTS_ATTR_LIST(name) \
168 (&nilfs_checkpoints_attr_##name.attr)
169#define NILFS_SNAPSHOT_ATTR_LIST(name) \
170 (&nilfs_snapshot_attr_##name.attr)
171#define NILFS_SUPERBLOCK_ATTR_LIST(name) \
172 (&nilfs_superblock_attr_##name.attr)
173#define NILFS_SEGCTOR_ATTR_LIST(name) \
174 (&nilfs_segctor_attr_##name.attr)
175
176#endif /* _NILFS_SYSFS_H */
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 8ba8229ba076..9da25fe9ea61 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -85,6 +85,7 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
85 nilfs->ns_cptree = RB_ROOT; 85 nilfs->ns_cptree = RB_ROOT;
86 spin_lock_init(&nilfs->ns_cptree_lock); 86 spin_lock_init(&nilfs->ns_cptree_lock);
87 init_rwsem(&nilfs->ns_segctor_sem); 87 init_rwsem(&nilfs->ns_segctor_sem);
88 nilfs->ns_sb_update_freq = NILFS_SB_FREQ;
88 89
89 return nilfs; 90 return nilfs;
90} 91}
@@ -97,6 +98,7 @@ void destroy_nilfs(struct the_nilfs *nilfs)
97{ 98{
98 might_sleep(); 99 might_sleep();
99 if (nilfs_init(nilfs)) { 100 if (nilfs_init(nilfs)) {
101 nilfs_sysfs_delete_device_group(nilfs);
100 brelse(nilfs->ns_sbh[0]); 102 brelse(nilfs->ns_sbh[0]);
101 brelse(nilfs->ns_sbh[1]); 103 brelse(nilfs->ns_sbh[1]);
102 } 104 }
@@ -640,6 +642,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
640 if (err) 642 if (err)
641 goto failed_sbh; 643 goto failed_sbh;
642 644
645 err = nilfs_sysfs_create_device_group(sb);
646 if (err)
647 goto failed_sbh;
648
643 set_nilfs_init(nilfs); 649 set_nilfs_init(nilfs);
644 err = 0; 650 err = 0;
645 out: 651 out:
@@ -740,12 +746,13 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
740{ 746{
741 struct rb_node **p, *parent; 747 struct rb_node **p, *parent;
742 struct nilfs_root *root, *new; 748 struct nilfs_root *root, *new;
749 int err;
743 750
744 root = nilfs_lookup_root(nilfs, cno); 751 root = nilfs_lookup_root(nilfs, cno);
745 if (root) 752 if (root)
746 return root; 753 return root;
747 754
748 new = kmalloc(sizeof(*root), GFP_KERNEL); 755 new = kzalloc(sizeof(*root), GFP_KERNEL);
749 if (!new) 756 if (!new)
750 return NULL; 757 return NULL;
751 758
@@ -782,6 +789,12 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
782 789
783 spin_unlock(&nilfs->ns_cptree_lock); 790 spin_unlock(&nilfs->ns_cptree_lock);
784 791
792 err = nilfs_sysfs_create_snapshot_group(new);
793 if (err) {
794 kfree(new);
795 new = NULL;
796 }
797
785 return new; 798 return new;
786} 799}
787 800
@@ -790,6 +803,8 @@ void nilfs_put_root(struct nilfs_root *root)
790 if (atomic_dec_and_test(&root->count)) { 803 if (atomic_dec_and_test(&root->count)) {
791 struct the_nilfs *nilfs = root->nilfs; 804 struct the_nilfs *nilfs = root->nilfs;
792 805
806 nilfs_sysfs_delete_snapshot_group(root);
807
793 spin_lock(&nilfs->ns_cptree_lock); 808 spin_lock(&nilfs->ns_cptree_lock);
794 rb_erase(&root->rb_node, &nilfs->ns_cptree); 809 rb_erase(&root->rb_node, &nilfs->ns_cptree);
795 spin_unlock(&nilfs->ns_cptree_lock); 810 spin_unlock(&nilfs->ns_cptree_lock);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index de8cc53b4a5c..d01ead1bea9a 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -33,6 +33,7 @@
33#include <linux/slab.h> 33#include <linux/slab.h>
34 34
35struct nilfs_sc_info; 35struct nilfs_sc_info;
36struct nilfs_sysfs_dev_subgroups;
36 37
37/* the_nilfs struct */ 38/* the_nilfs struct */
38enum { 39enum {
@@ -54,6 +55,7 @@ enum {
54 * @ns_sbwcount: write count of super block 55 * @ns_sbwcount: write count of super block
55 * @ns_sbsize: size of valid data in super block 56 * @ns_sbsize: size of valid data in super block
56 * @ns_mount_state: file system state 57 * @ns_mount_state: file system state
58 * @ns_sb_update_freq: interval of periodical update of superblocks (in seconds)
57 * @ns_seg_seq: segment sequence counter 59 * @ns_seg_seq: segment sequence counter
58 * @ns_segnum: index number of the latest full segment. 60 * @ns_segnum: index number of the latest full segment.
59 * @ns_nextnum: index number of the full segment index to be used next 61 * @ns_nextnum: index number of the full segment index to be used next
@@ -95,6 +97,9 @@ enum {
95 * @ns_inode_size: size of on-disk inode 97 * @ns_inode_size: size of on-disk inode
96 * @ns_first_ino: first not-special inode number 98 * @ns_first_ino: first not-special inode number
97 * @ns_crc_seed: seed value of CRC32 calculation 99 * @ns_crc_seed: seed value of CRC32 calculation
100 * @ns_dev_kobj: /sys/fs/<nilfs>/<device>
101 * @ns_dev_kobj_unregister: completion state
102 * @ns_dev_subgroups: <device> subgroups pointer
98 */ 103 */
99struct the_nilfs { 104struct the_nilfs {
100 unsigned long ns_flags; 105 unsigned long ns_flags;
@@ -114,6 +119,7 @@ struct the_nilfs {
114 unsigned ns_sbwcount; 119 unsigned ns_sbwcount;
115 unsigned ns_sbsize; 120 unsigned ns_sbsize;
116 unsigned ns_mount_state; 121 unsigned ns_mount_state;
122 unsigned ns_sb_update_freq;
117 123
118 /* 124 /*
119 * Following fields are dedicated to a writable FS-instance. 125 * Following fields are dedicated to a writable FS-instance.
@@ -188,6 +194,11 @@ struct the_nilfs {
188 int ns_inode_size; 194 int ns_inode_size;
189 int ns_first_ino; 195 int ns_first_ino;
190 u32 ns_crc_seed; 196 u32 ns_crc_seed;
197
198 /* /sys/fs/<nilfs>/<device> */
199 struct kobject ns_dev_kobj;
200 struct completion ns_dev_kobj_unregister;
201 struct nilfs_sysfs_dev_subgroups *ns_dev_subgroups;
191}; 202};
192 203
193#define THE_NILFS_FNS(bit, name) \ 204#define THE_NILFS_FNS(bit, name) \
@@ -232,6 +243,8 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty)
232 * @ifile: inode file 243 * @ifile: inode file
233 * @inodes_count: number of inodes 244 * @inodes_count: number of inodes
234 * @blocks_count: number of blocks 245 * @blocks_count: number of blocks
246 * @snapshot_kobj: /sys/fs/<nilfs>/<device>/mounted_snapshots/<snapshot>
247 * @snapshot_kobj_unregister: completion state for kernel object
235 */ 248 */
236struct nilfs_root { 249struct nilfs_root {
237 __u64 cno; 250 __u64 cno;
@@ -243,6 +256,10 @@ struct nilfs_root {
243 256
244 atomic64_t inodes_count; 257 atomic64_t inodes_count;
245 atomic64_t blocks_count; 258 atomic64_t blocks_count;
259
260 /* /sys/fs/<nilfs>/<device>/mounted_snapshots/<snapshot> */
261 struct kobject snapshot_kobj;
262 struct completion snapshot_kobj_unregister;
246}; 263};
247 264
248/* Special checkpoint number */ 265/* Special checkpoint number */
@@ -254,7 +271,8 @@ struct nilfs_root {
254static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) 271static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
255{ 272{
256 u64 t = get_seconds(); 273 u64 t = get_seconds();
257 return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + NILFS_SB_FREQ; 274 return t < nilfs->ns_sbwtime ||
275 t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq;
258} 276}
259 277
260static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs) 278static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index ee9cb3795c2b..30d3addfad75 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -70,8 +70,15 @@ static int fanotify_get_response(struct fsnotify_group *group,
70 wait_event(group->fanotify_data.access_waitq, event->response || 70 wait_event(group->fanotify_data.access_waitq, event->response ||
71 atomic_read(&group->fanotify_data.bypass_perm)); 71 atomic_read(&group->fanotify_data.bypass_perm));
72 72
73 if (!event->response) /* bypass_perm set */ 73 if (!event->response) { /* bypass_perm set */
74 /*
75 * Event was canceled because group is being destroyed. Remove
76 * it from group's event list because we are responsible for
77 * freeing the permission event.
78 */
79 fsnotify_remove_event(group, &event->fae.fse);
74 return 0; 80 return 0;
81 }
75 82
76 /* userspace responded, convert to something usable */ 83 /* userspace responded, convert to something usable */
77 switch (event->response) { 84 switch (event->response) {
@@ -210,7 +217,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
210 return -ENOMEM; 217 return -ENOMEM;
211 218
212 fsn_event = &event->fse; 219 fsn_event = &event->fse;
213 ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); 220 ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
214 if (ret) { 221 if (ret) {
215 /* Permission events shouldn't be merged */ 222 /* Permission events shouldn't be merged */
216 BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS); 223 BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 3fdc8a3e1134..b13992a41bd9 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -66,7 +66,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
66 66
67 /* held the notification_mutex the whole time, so this is the 67 /* held the notification_mutex the whole time, so this is the
68 * same event we peeked above */ 68 * same event we peeked above */
69 return fsnotify_remove_notify_event(group); 69 return fsnotify_remove_first_event(group);
70} 70}
71 71
72static int create_fd(struct fsnotify_group *group, 72static int create_fd(struct fsnotify_group *group,
@@ -359,6 +359,11 @@ static int fanotify_release(struct inode *ignored, struct file *file)
359#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 359#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
360 struct fanotify_perm_event_info *event, *next; 360 struct fanotify_perm_event_info *event, *next;
361 361
362 /*
363 * There may be still new events arriving in the notification queue
364 * but since userspace cannot use fanotify fd anymore, no event can
365 * enter or leave access_list by now.
366 */
362 spin_lock(&group->fanotify_data.access_lock); 367 spin_lock(&group->fanotify_data.access_lock);
363 368
364 atomic_inc(&group->fanotify_data.bypass_perm); 369 atomic_inc(&group->fanotify_data.bypass_perm);
@@ -373,6 +378,13 @@ static int fanotify_release(struct inode *ignored, struct file *file)
373 } 378 }
374 spin_unlock(&group->fanotify_data.access_lock); 379 spin_unlock(&group->fanotify_data.access_lock);
375 380
381 /*
382 * Since bypass_perm is set, newly queued events will not wait for
383 * access response. Wake up the already sleeping ones now.
384 * synchronize_srcu() in fsnotify_destroy_group() will wait for all
385 * processes sleeping in fanotify_handle_event() waiting for access
386 * response and thus also for all permission events to be freed.
387 */
376 wake_up(&group->fanotify_data.access_waitq); 388 wake_up(&group->fanotify_data.access_waitq);
377#endif 389#endif
378 390
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 74825be65b7b..9ce062218de9 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -232,7 +232,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
232 232
233 BUG_ON(last == NULL); 233 BUG_ON(last == NULL);
234 /* mark should be the last entry. last is the current last entry */ 234 /* mark should be the last entry. last is the current last entry */
235 hlist_add_after_rcu(&last->i.i_list, &mark->i.i_list); 235 hlist_add_behind_rcu(&mark->i.i_list, &last->i.i_list);
236out: 236out:
237 fsnotify_recalc_inode_mask_locked(inode); 237 fsnotify_recalc_inode_mask_locked(inode);
238 spin_unlock(&inode->i_lock); 238 spin_unlock(&inode->i_lock);
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 43ab1e1a07a2..0f88bc0b4e6c 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -108,7 +108,7 @@ int inotify_handle_event(struct fsnotify_group *group,
108 if (len) 108 if (len)
109 strcpy(event->name, file_name); 109 strcpy(event->name, file_name);
110 110
111 ret = fsnotify_add_notify_event(group, fsn_event, inotify_merge); 111 ret = fsnotify_add_event(group, fsn_event, inotify_merge);
112 if (ret) { 112 if (ret) {
113 /* Our event wasn't used in the end. Free it. */ 113 /* Our event wasn't used in the end. Free it. */
114 fsnotify_destroy_event(group, fsn_event); 114 fsnotify_destroy_event(group, fsn_event);
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index cc423a30a0c8..daf76652fe58 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -149,7 +149,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
149 if (fsnotify_notify_queue_is_empty(group)) 149 if (fsnotify_notify_queue_is_empty(group))
150 return NULL; 150 return NULL;
151 151
152 event = fsnotify_peek_notify_event(group); 152 event = fsnotify_peek_first_event(group);
153 153
154 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 154 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
155 155
@@ -159,7 +159,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
159 159
160 /* held the notification_mutex the whole time, so this is the 160 /* held the notification_mutex the whole time, so this is the
161 * same event we peeked above */ 161 * same event we peeked above */
162 fsnotify_remove_notify_event(group); 162 fsnotify_remove_first_event(group);
163 163
164 return event; 164 return event;
165} 165}
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 1e58402171a5..a95d8e037aeb 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -73,7 +73,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
73 /* Overflow events are per-group and we don't want to free them */ 73 /* Overflow events are per-group and we don't want to free them */
74 if (!event || event->mask == FS_Q_OVERFLOW) 74 if (!event || event->mask == FS_Q_OVERFLOW)
75 return; 75 return;
76 76 /* If the event is still queued, we have a problem... */
77 WARN_ON(!list_empty(&event->list));
77 group->ops->free_event(event); 78 group->ops->free_event(event);
78} 79}
79 80
@@ -83,10 +84,10 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
83 * added to the queue, 1 if the event was merged with some other queued event, 84 * added to the queue, 1 if the event was merged with some other queued event,
84 * 2 if the queue of events has overflown. 85 * 2 if the queue of events has overflown.
85 */ 86 */
86int fsnotify_add_notify_event(struct fsnotify_group *group, 87int fsnotify_add_event(struct fsnotify_group *group,
87 struct fsnotify_event *event, 88 struct fsnotify_event *event,
88 int (*merge)(struct list_head *, 89 int (*merge)(struct list_head *,
89 struct fsnotify_event *)) 90 struct fsnotify_event *))
90{ 91{
91 int ret = 0; 92 int ret = 0;
92 struct list_head *list = &group->notification_list; 93 struct list_head *list = &group->notification_list;
@@ -125,10 +126,25 @@ queue:
125} 126}
126 127
127/* 128/*
129 * Remove @event from group's notification queue. It is the responsibility of
130 * the caller to destroy the event.
131 */
132void fsnotify_remove_event(struct fsnotify_group *group,
133 struct fsnotify_event *event)
134{
135 mutex_lock(&group->notification_mutex);
136 if (!list_empty(&event->list)) {
137 list_del_init(&event->list);
138 group->q_len--;
139 }
140 mutex_unlock(&group->notification_mutex);
141}
142
143/*
128 * Remove and return the first event from the notification list. It is the 144 * Remove and return the first event from the notification list. It is the
129 * responsibility of the caller to destroy the obtained event 145 * responsibility of the caller to destroy the obtained event
130 */ 146 */
131struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) 147struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
132{ 148{
133 struct fsnotify_event *event; 149 struct fsnotify_event *event;
134 150
@@ -140,7 +156,7 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
140 struct fsnotify_event, list); 156 struct fsnotify_event, list);
141 /* 157 /*
142 * We need to init list head for the case of overflow event so that 158 * We need to init list head for the case of overflow event so that
143 * check in fsnotify_add_notify_events() works 159 * check in fsnotify_add_event() works
144 */ 160 */
145 list_del_init(&event->list); 161 list_del_init(&event->list);
146 group->q_len--; 162 group->q_len--;
@@ -149,9 +165,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
149} 165}
150 166
151/* 167/*
152 * This will not remove the event, that must be done with fsnotify_remove_notify_event() 168 * This will not remove the event, that must be done with
169 * fsnotify_remove_first_event()
153 */ 170 */
154struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) 171struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
155{ 172{
156 BUG_ON(!mutex_is_locked(&group->notification_mutex)); 173 BUG_ON(!mutex_is_locked(&group->notification_mutex));
157 174
@@ -169,7 +186,7 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
169 186
170 mutex_lock(&group->notification_mutex); 187 mutex_lock(&group->notification_mutex);
171 while (!fsnotify_notify_queue_is_empty(group)) { 188 while (!fsnotify_notify_queue_is_empty(group)) {
172 event = fsnotify_remove_notify_event(group); 189 event = fsnotify_remove_first_event(group);
173 fsnotify_destroy_event(group, event); 190 fsnotify_destroy_event(group, event);
174 } 191 }
175 mutex_unlock(&group->notification_mutex); 192 mutex_unlock(&group->notification_mutex);
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 68ca5a8704b5..ac851e8376b1 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -191,7 +191,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
191 191
192 BUG_ON(last == NULL); 192 BUG_ON(last == NULL);
193 /* mark should be the last entry. last is the current last entry */ 193 /* mark should be the last entry. last is the current last entry */
194 hlist_add_after_rcu(&last->m.m_list, &mark->m.m_list); 194 hlist_add_behind_rcu(&mark->m.m_list, &last->m.m_list);
195out: 195out:
196 fsnotify_recalc_vfsmount_mask_locked(mnt); 196 fsnotify_recalc_vfsmount_mask_locked(mnt);
197 spin_unlock(&mnt->mnt_root->d_lock); 197 spin_unlock(&mnt->mnt_root->d_lock);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 5c9e2c81cb11..f5ec1ce7a532 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -74,8 +74,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
74 * ntfs_attr_extend_initialized - extend the initialized size of an attribute 74 * ntfs_attr_extend_initialized - extend the initialized size of an attribute
75 * @ni: ntfs inode of the attribute to extend 75 * @ni: ntfs inode of the attribute to extend
76 * @new_init_size: requested new initialized size in bytes 76 * @new_init_size: requested new initialized size in bytes
77 * @cached_page: store any allocated but unused page here
78 * @lru_pvec: lru-buffering pagevec of the caller
79 * 77 *
80 * Extend the initialized size of an attribute described by the ntfs inode @ni 78 * Extend the initialized size of an attribute described by the ntfs inode @ni
81 * to @new_init_size bytes. This involves zeroing any non-sparse space between 79 * to @new_init_size bytes. This involves zeroing any non-sparse space between
@@ -395,7 +393,6 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
395 * @nr_pages: number of page cache pages to obtain 393 * @nr_pages: number of page cache pages to obtain
396 * @pages: array of pages in which to return the obtained page cache pages 394 * @pages: array of pages in which to return the obtained page cache pages
397 * @cached_page: allocated but as yet unused page 395 * @cached_page: allocated but as yet unused page
398 * @lru_pvec: lru-buffering pagevec of caller
399 * 396 *
400 * Obtain @nr_pages locked page cache pages from the mapping @mapping and 397 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
401 * starting at index @index. 398 * starting at index @index.
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9d8fcf2f3b94..a93bf9892256 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4961,6 +4961,15 @@ leftright:
4961 4961
4962 el = path_leaf_el(path); 4962 el = path_leaf_el(path);
4963 split_index = ocfs2_search_extent_list(el, cpos); 4963 split_index = ocfs2_search_extent_list(el, cpos);
4964 if (split_index == -1) {
4965 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
4966 "Owner %llu has an extent at cpos %u "
4967 "which can no longer be found.\n",
4968 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4969 cpos);
4970 ret = -EROFS;
4971 goto out;
4972 }
4964 goto leftright; 4973 goto leftright;
4965 } 4974 }
4966out: 4975out:
@@ -5135,7 +5144,7 @@ int ocfs2_change_extent_flag(handle_t *handle,
5135 el = path_leaf_el(left_path); 5144 el = path_leaf_el(left_path);
5136 5145
5137 index = ocfs2_search_extent_list(el, cpos); 5146 index = ocfs2_search_extent_list(el, cpos);
5138 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5147 if (index == -1) {
5139 ocfs2_error(sb, 5148 ocfs2_error(sb,
5140 "Owner %llu has an extent at cpos %u which can no " 5149 "Owner %llu has an extent at cpos %u which can no "
5141 "longer be found.\n", 5150 "longer be found.\n",
@@ -5491,7 +5500,7 @@ int ocfs2_remove_extent(handle_t *handle,
5491 5500
5492 el = path_leaf_el(path); 5501 el = path_leaf_el(path);
5493 index = ocfs2_search_extent_list(el, cpos); 5502 index = ocfs2_search_extent_list(el, cpos);
5494 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5503 if (index == -1) {
5495 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 5504 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5496 "Owner %llu has an extent at cpos %u which can no " 5505 "Owner %llu has an extent at cpos %u which can no "
5497 "longer be found.\n", 5506 "longer be found.\n",
@@ -5557,7 +5566,7 @@ int ocfs2_remove_extent(handle_t *handle,
5557 5566
5558 el = path_leaf_el(path); 5567 el = path_leaf_el(path);
5559 index = ocfs2_search_extent_list(el, cpos); 5568 index = ocfs2_search_extent_list(el, cpos);
5560 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5569 if (index == -1) {
5561 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 5570 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5562 "Owner %llu: split at cpos %u lost record.", 5571 "Owner %llu: split at cpos %u lost record.",
5563 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 5572 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 39efc5057a36..3fcf205ee900 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1923,12 +1923,11 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
1923 goto bail; 1923 goto bail;
1924 } 1924 }
1925 1925
1926 if (total_backoff > 1926 if (total_backoff > DLM_JOIN_TIMEOUT_MSECS) {
1927 msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) {
1928 status = -ERESTARTSYS; 1927 status = -ERESTARTSYS;
1929 mlog(ML_NOTICE, "Timed out joining dlm domain " 1928 mlog(ML_NOTICE, "Timed out joining dlm domain "
1930 "%s after %u msecs\n", dlm->name, 1929 "%s after %u msecs\n", dlm->name,
1931 jiffies_to_msecs(total_backoff)); 1930 total_backoff);
1932 goto bail; 1931 goto bail;
1933 } 1932 }
1934 1933
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 82abf0cc9a12..3ec906ef5d9a 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2405,6 +2405,10 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2405 if (res->state & DLM_LOCK_RES_MIGRATING) 2405 if (res->state & DLM_LOCK_RES_MIGRATING)
2406 return 0; 2406 return 0;
2407 2407
 2408 /* delay migration when the lockres is in RECOVERING state */
2409 if (res->state & DLM_LOCK_RES_RECOVERING)
2410 return 0;
2411
2408 if (res->owner != dlm->node_num) 2412 if (res->owner != dlm->node_num)
2409 return 0; 2413 return 0;
2410 2414
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 599eb4c4c8be..6219aaadeb08 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -98,7 +98,7 @@ static int __ocfs2_move_extent(handle_t *handle,
98 el = path_leaf_el(path); 98 el = path_leaf_el(path);
99 99
100 index = ocfs2_search_extent_list(el, cpos); 100 index = ocfs2_search_extent_list(el, cpos);
101 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 101 if (index == -1) {
102 ocfs2_error(inode->i_sb, 102 ocfs2_error(inode->i_sb,
103 "Inode %llu has an extent at cpos %u which can no " 103 "Inode %llu has an extent at cpos %u which can no "
104 "longer be found.\n", 104 "longer be found.\n",
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 636aab69ead5..d81f6e2a97f5 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3109,7 +3109,7 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
3109 el = path_leaf_el(path); 3109 el = path_leaf_el(path);
3110 3110
3111 index = ocfs2_search_extent_list(el, cpos); 3111 index = ocfs2_search_extent_list(el, cpos);
3112 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 3112 if (index == -1) {
3113 ocfs2_error(sb, 3113 ocfs2_error(sb,
3114 "Inode %llu has an extent at cpos %u which can no " 3114 "Inode %llu has an extent at cpos %u which can no "
3115 "longer be found.\n", 3115 "longer be found.\n",
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 1424c151cccc..a88b2a4fcc85 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -382,7 +382,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
382 382
383 trace_ocfs2_map_slot_buffers(bytes, si->si_blocks); 383 trace_ocfs2_map_slot_buffers(bytes, si->si_blocks);
384 384
385 si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, 385 si->si_bh = kcalloc(si->si_blocks, sizeof(struct buffer_head *),
386 GFP_KERNEL); 386 GFP_KERNEL);
387 if (!si->si_bh) { 387 if (!si->si_bh) {
388 status = -ENOMEM; 388 status = -ENOMEM;
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index ec58c7659183..ba8819702c56 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -321,7 +321,7 @@ static int omfs_get_imap(struct super_block *sb)
321 goto out; 321 goto out;
322 322
323 sbi->s_imap_size = array_size; 323 sbi->s_imap_size = array_size;
324 sbi->s_imap = kzalloc(array_size * sizeof(unsigned long *), GFP_KERNEL); 324 sbi->s_imap = kcalloc(array_size, sizeof(unsigned long *), GFP_KERNEL);
325 if (!sbi->s_imap) 325 if (!sbi->s_imap)
326 goto nomem; 326 goto nomem;
327 327
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 239493ec718e..7151ea428041 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -23,6 +23,7 @@ proc-y += version.o
23proc-y += softirqs.o 23proc-y += softirqs.o
24proc-y += namespaces.o 24proc-y += namespaces.o
25proc-y += self.o 25proc-y += self.o
26proc-y += thread_self.o
26proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o 27proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
27proc-$(CONFIG_NET) += proc_net.o 28proc-$(CONFIG_NET) += proc_net.o
28proc-$(CONFIG_PROC_KCORE) += kcore.o 29proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 64db2bceac59..cd3653e4f35c 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -297,15 +297,11 @@ static void render_cap_t(struct seq_file *m, const char *header,
297 seq_puts(m, header); 297 seq_puts(m, header);
298 CAP_FOR_EACH_U32(__capi) { 298 CAP_FOR_EACH_U32(__capi) {
299 seq_printf(m, "%08x", 299 seq_printf(m, "%08x",
300 a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); 300 a->cap[CAP_LAST_U32 - __capi]);
301 } 301 }
302 seq_putc(m, '\n'); 302 seq_putc(m, '\n');
303} 303}
304 304
305/* Remove non-existent capabilities */
306#define NORM_CAPS(v) (v.cap[CAP_TO_INDEX(CAP_LAST_CAP)] &= \
307 CAP_TO_MASK(CAP_LAST_CAP + 1) - 1)
308
309static inline void task_cap(struct seq_file *m, struct task_struct *p) 305static inline void task_cap(struct seq_file *m, struct task_struct *p)
310{ 306{
311 const struct cred *cred; 307 const struct cred *cred;
@@ -319,11 +315,6 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
319 cap_bset = cred->cap_bset; 315 cap_bset = cred->cap_bset;
320 rcu_read_unlock(); 316 rcu_read_unlock();
321 317
322 NORM_CAPS(cap_inheritable);
323 NORM_CAPS(cap_permitted);
324 NORM_CAPS(cap_effective);
325 NORM_CAPS(cap_bset);
326
327 render_cap_t(m, "CapInh:\t", &cap_inheritable); 318 render_cap_t(m, "CapInh:\t", &cap_inheritable);
328 render_cap_t(m, "CapPrm:\t", &cap_permitted); 319 render_cap_t(m, "CapPrm:\t", &cap_permitted);
329 render_cap_t(m, "CapEff:\t", &cap_effective); 320 render_cap_t(m, "CapEff:\t", &cap_effective);
@@ -473,13 +464,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
473 priority = task_prio(task); 464 priority = task_prio(task);
474 nice = task_nice(task); 465 nice = task_nice(task);
475 466
476 /* Temporary variable needed for gcc-2.96 */
477 /* convert timespec -> nsec*/
478 start_time =
479 (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
480 + task->real_start_time.tv_nsec;
481 /* convert nsec -> ticks */ 467 /* convert nsec -> ticks */
482 start_time = nsec_to_clock_t(start_time); 468 start_time = nsec_to_clock_t(task->real_start_time);
483 469
484 seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); 470 seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
485 seq_put_decimal_ll(m, ' ', ppid); 471 seq_put_decimal_ll(m, ' ', ppid);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 2d696b0c93bf..baf852b648ad 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -105,7 +105,7 @@
105 */ 105 */
106 106
107struct pid_entry { 107struct pid_entry {
108 char *name; 108 const char *name;
109 int len; 109 int len;
110 umode_t mode; 110 umode_t mode;
111 const struct inode_operations *iop; 111 const struct inode_operations *iop;
@@ -130,10 +130,6 @@ struct pid_entry {
130 { .proc_get_link = get_link } ) 130 { .proc_get_link = get_link } )
131#define REG(NAME, MODE, fops) \ 131#define REG(NAME, MODE, fops) \
132 NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) 132 NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
133#define INF(NAME, MODE, read) \
134 NOD(NAME, (S_IFREG|(MODE)), \
135 NULL, &proc_info_file_operations, \
136 { .proc_read = read } )
137#define ONE(NAME, MODE, show) \ 133#define ONE(NAME, MODE, show) \
138 NOD(NAME, (S_IFREG|(MODE)), \ 134 NOD(NAME, (S_IFREG|(MODE)), \
139 NULL, &proc_single_file_operations, \ 135 NULL, &proc_single_file_operations, \
@@ -200,27 +196,32 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
200 return result; 196 return result;
201} 197}
202 198
203static int proc_pid_cmdline(struct task_struct *task, char *buffer) 199static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns,
200 struct pid *pid, struct task_struct *task)
204{ 201{
205 return get_cmdline(task, buffer, PAGE_SIZE); 202 /*
203 * Rely on struct seq_operations::show() being called once
204 * per internal buffer allocation. See single_open(), traverse().
205 */
206 BUG_ON(m->size < PAGE_SIZE);
207 m->count += get_cmdline(task, m->buf, PAGE_SIZE);
208 return 0;
206} 209}
207 210
208static int proc_pid_auxv(struct task_struct *task, char *buffer) 211static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns,
212 struct pid *pid, struct task_struct *task)
209{ 213{
210 struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ); 214 struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ);
211 int res = PTR_ERR(mm);
212 if (mm && !IS_ERR(mm)) { 215 if (mm && !IS_ERR(mm)) {
213 unsigned int nwords = 0; 216 unsigned int nwords = 0;
214 do { 217 do {
215 nwords += 2; 218 nwords += 2;
216 } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ 219 } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
217 res = nwords * sizeof(mm->saved_auxv[0]); 220 seq_write(m, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0]));
218 if (res > PAGE_SIZE)
219 res = PAGE_SIZE;
220 memcpy(buffer, mm->saved_auxv, res);
221 mmput(mm); 221 mmput(mm);
222 } 222 return 0;
223 return res; 223 } else
224 return PTR_ERR(mm);
224} 225}
225 226
226 227
@@ -229,7 +230,8 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer)
229 * Provides a wchan file via kallsyms in a proper one-value-per-file format. 230 * Provides a wchan file via kallsyms in a proper one-value-per-file format.
230 * Returns the resolved symbol. If that fails, simply return the address. 231 * Returns the resolved symbol. If that fails, simply return the address.
231 */ 232 */
232static int proc_pid_wchan(struct task_struct *task, char *buffer) 233static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
234 struct pid *pid, struct task_struct *task)
233{ 235{
234 unsigned long wchan; 236 unsigned long wchan;
235 char symname[KSYM_NAME_LEN]; 237 char symname[KSYM_NAME_LEN];
@@ -240,9 +242,9 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
240 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 242 if (!ptrace_may_access(task, PTRACE_MODE_READ))
241 return 0; 243 return 0;
242 else 244 else
243 return sprintf(buffer, "%lu", wchan); 245 return seq_printf(m, "%lu", wchan);
244 else 246 else
245 return sprintf(buffer, "%s", symname); 247 return seq_printf(m, "%s", symname);
246} 248}
247#endif /* CONFIG_KALLSYMS */ 249#endif /* CONFIG_KALLSYMS */
248 250
@@ -304,9 +306,10 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
304/* 306/*
305 * Provides /proc/PID/schedstat 307 * Provides /proc/PID/schedstat
306 */ 308 */
307static int proc_pid_schedstat(struct task_struct *task, char *buffer) 309static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
310 struct pid *pid, struct task_struct *task)
308{ 311{
309 return sprintf(buffer, "%llu %llu %lu\n", 312 return seq_printf(m, "%llu %llu %lu\n",
310 (unsigned long long)task->se.sum_exec_runtime, 313 (unsigned long long)task->se.sum_exec_runtime,
311 (unsigned long long)task->sched_info.run_delay, 314 (unsigned long long)task->sched_info.run_delay,
312 task->sched_info.pcount); 315 task->sched_info.pcount);
@@ -404,7 +407,8 @@ static const struct file_operations proc_cpuset_operations = {
404}; 407};
405#endif 408#endif
406 409
407static int proc_oom_score(struct task_struct *task, char *buffer) 410static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
411 struct pid *pid, struct task_struct *task)
408{ 412{
409 unsigned long totalpages = totalram_pages + total_swap_pages; 413 unsigned long totalpages = totalram_pages + total_swap_pages;
410 unsigned long points = 0; 414 unsigned long points = 0;
@@ -414,12 +418,12 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
414 points = oom_badness(task, NULL, NULL, totalpages) * 418 points = oom_badness(task, NULL, NULL, totalpages) *
415 1000 / totalpages; 419 1000 / totalpages;
416 read_unlock(&tasklist_lock); 420 read_unlock(&tasklist_lock);
417 return sprintf(buffer, "%lu\n", points); 421 return seq_printf(m, "%lu\n", points);
418} 422}
419 423
420struct limit_names { 424struct limit_names {
421 char *name; 425 const char *name;
422 char *unit; 426 const char *unit;
423}; 427};
424 428
425static const struct limit_names lnames[RLIM_NLIMITS] = { 429static const struct limit_names lnames[RLIM_NLIMITS] = {
@@ -442,12 +446,11 @@ static const struct limit_names lnames[RLIM_NLIMITS] = {
442}; 446};
443 447
444/* Display limits for a process */ 448/* Display limits for a process */
445static int proc_pid_limits(struct task_struct *task, char *buffer) 449static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
450 struct pid *pid, struct task_struct *task)
446{ 451{
447 unsigned int i; 452 unsigned int i;
448 int count = 0;
449 unsigned long flags; 453 unsigned long flags;
450 char *bufptr = buffer;
451 454
452 struct rlimit rlim[RLIM_NLIMITS]; 455 struct rlimit rlim[RLIM_NLIMITS];
453 456
@@ -459,35 +462,34 @@ static int proc_pid_limits(struct task_struct *task, char *buffer)
459 /* 462 /*
460 * print the file header 463 * print the file header
461 */ 464 */
462 count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n", 465 seq_printf(m, "%-25s %-20s %-20s %-10s\n",
463 "Limit", "Soft Limit", "Hard Limit", "Units"); 466 "Limit", "Soft Limit", "Hard Limit", "Units");
464 467
465 for (i = 0; i < RLIM_NLIMITS; i++) { 468 for (i = 0; i < RLIM_NLIMITS; i++) {
466 if (rlim[i].rlim_cur == RLIM_INFINITY) 469 if (rlim[i].rlim_cur == RLIM_INFINITY)
467 count += sprintf(&bufptr[count], "%-25s %-20s ", 470 seq_printf(m, "%-25s %-20s ",
468 lnames[i].name, "unlimited"); 471 lnames[i].name, "unlimited");
469 else 472 else
470 count += sprintf(&bufptr[count], "%-25s %-20lu ", 473 seq_printf(m, "%-25s %-20lu ",
471 lnames[i].name, rlim[i].rlim_cur); 474 lnames[i].name, rlim[i].rlim_cur);
472 475
473 if (rlim[i].rlim_max == RLIM_INFINITY) 476 if (rlim[i].rlim_max == RLIM_INFINITY)
474 count += sprintf(&bufptr[count], "%-20s ", "unlimited"); 477 seq_printf(m, "%-20s ", "unlimited");
475 else 478 else
476 count += sprintf(&bufptr[count], "%-20lu ", 479 seq_printf(m, "%-20lu ", rlim[i].rlim_max);
477 rlim[i].rlim_max);
478 480
479 if (lnames[i].unit) 481 if (lnames[i].unit)
480 count += sprintf(&bufptr[count], "%-10s\n", 482 seq_printf(m, "%-10s\n", lnames[i].unit);
481 lnames[i].unit);
482 else 483 else
483 count += sprintf(&bufptr[count], "\n"); 484 seq_putc(m, '\n');
484 } 485 }
485 486
486 return count; 487 return 0;
487} 488}
488 489
489#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 490#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
490static int proc_pid_syscall(struct task_struct *task, char *buffer) 491static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
492 struct pid *pid, struct task_struct *task)
491{ 493{
492 long nr; 494 long nr;
493 unsigned long args[6], sp, pc; 495 unsigned long args[6], sp, pc;
@@ -496,11 +498,11 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
496 return res; 498 return res;
497 499
498 if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) 500 if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
499 res = sprintf(buffer, "running\n"); 501 seq_puts(m, "running\n");
500 else if (nr < 0) 502 else if (nr < 0)
501 res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); 503 seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
502 else 504 else
503 res = sprintf(buffer, 505 seq_printf(m,
504 "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", 506 "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
505 nr, 507 nr,
506 args[0], args[1], args[2], args[3], args[4], args[5], 508 args[0], args[1], args[2], args[3], args[4], args[5],
@@ -598,43 +600,6 @@ static const struct inode_operations proc_def_inode_operations = {
598 .setattr = proc_setattr, 600 .setattr = proc_setattr,
599}; 601};
600 602
601#define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */
602
603static ssize_t proc_info_read(struct file * file, char __user * buf,
604 size_t count, loff_t *ppos)
605{
606 struct inode * inode = file_inode(file);
607 unsigned long page;
608 ssize_t length;
609 struct task_struct *task = get_proc_task(inode);
610
611 length = -ESRCH;
612 if (!task)
613 goto out_no_task;
614
615 if (count > PROC_BLOCK_SIZE)
616 count = PROC_BLOCK_SIZE;
617
618 length = -ENOMEM;
619 if (!(page = __get_free_page(GFP_TEMPORARY)))
620 goto out;
621
622 length = PROC_I(inode)->op.proc_read(task, (char*)page);
623
624 if (length >= 0)
625 length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
626 free_page(page);
627out:
628 put_task_struct(task);
629out_no_task:
630 return length;
631}
632
633static const struct file_operations proc_info_file_operations = {
634 .read = proc_info_read,
635 .llseek = generic_file_llseek,
636};
637
638static int proc_single_show(struct seq_file *m, void *v) 603static int proc_single_show(struct seq_file *m, void *v)
639{ 604{
640 struct inode *inode = m->private; 605 struct inode *inode = m->private;
@@ -2056,7 +2021,7 @@ static int show_timer(struct seq_file *m, void *v)
2056 struct k_itimer *timer; 2021 struct k_itimer *timer;
2057 struct timers_private *tp = m->private; 2022 struct timers_private *tp = m->private;
2058 int notify; 2023 int notify;
2059 static char *nstr[] = { 2024 static const char * const nstr[] = {
2060 [SIGEV_SIGNAL] = "signal", 2025 [SIGEV_SIGNAL] = "signal",
2061 [SIGEV_NONE] = "none", 2026 [SIGEV_NONE] = "none",
2062 [SIGEV_THREAD] = "thread", 2027 [SIGEV_THREAD] = "thread",
@@ -2392,7 +2357,7 @@ static const struct file_operations proc_coredump_filter_operations = {
2392#endif 2357#endif
2393 2358
2394#ifdef CONFIG_TASK_IO_ACCOUNTING 2359#ifdef CONFIG_TASK_IO_ACCOUNTING
2395static int do_io_accounting(struct task_struct *task, char *buffer, int whole) 2360static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
2396{ 2361{
2397 struct task_io_accounting acct = task->ioac; 2362 struct task_io_accounting acct = task->ioac;
2398 unsigned long flags; 2363 unsigned long flags;
@@ -2416,7 +2381,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
2416 2381
2417 unlock_task_sighand(task, &flags); 2382 unlock_task_sighand(task, &flags);
2418 } 2383 }
2419 result = sprintf(buffer, 2384 result = seq_printf(m,
2420 "rchar: %llu\n" 2385 "rchar: %llu\n"
2421 "wchar: %llu\n" 2386 "wchar: %llu\n"
2422 "syscr: %llu\n" 2387 "syscr: %llu\n"
@@ -2436,20 +2401,22 @@ out_unlock:
2436 return result; 2401 return result;
2437} 2402}
2438 2403
2439static int proc_tid_io_accounting(struct task_struct *task, char *buffer) 2404static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
2405 struct pid *pid, struct task_struct *task)
2440{ 2406{
2441 return do_io_accounting(task, buffer, 0); 2407 return do_io_accounting(task, m, 0);
2442} 2408}
2443 2409
2444static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) 2410static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
2411 struct pid *pid, struct task_struct *task)
2445{ 2412{
2446 return do_io_accounting(task, buffer, 1); 2413 return do_io_accounting(task, m, 1);
2447} 2414}
2448#endif /* CONFIG_TASK_IO_ACCOUNTING */ 2415#endif /* CONFIG_TASK_IO_ACCOUNTING */
2449 2416
2450#ifdef CONFIG_USER_NS 2417#ifdef CONFIG_USER_NS
2451static int proc_id_map_open(struct inode *inode, struct file *file, 2418static int proc_id_map_open(struct inode *inode, struct file *file,
2452 struct seq_operations *seq_ops) 2419 const struct seq_operations *seq_ops)
2453{ 2420{
2454 struct user_namespace *ns = NULL; 2421 struct user_namespace *ns = NULL;
2455 struct task_struct *task; 2422 struct task_struct *task;
@@ -2557,10 +2524,10 @@ static const struct pid_entry tgid_base_stuff[] = {
2557 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), 2524 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
2558#endif 2525#endif
2559 REG("environ", S_IRUSR, proc_environ_operations), 2526 REG("environ", S_IRUSR, proc_environ_operations),
2560 INF("auxv", S_IRUSR, proc_pid_auxv), 2527 ONE("auxv", S_IRUSR, proc_pid_auxv),
2561 ONE("status", S_IRUGO, proc_pid_status), 2528 ONE("status", S_IRUGO, proc_pid_status),
2562 ONE("personality", S_IRUSR, proc_pid_personality), 2529 ONE("personality", S_IRUSR, proc_pid_personality),
2563 INF("limits", S_IRUGO, proc_pid_limits), 2530 ONE("limits", S_IRUGO, proc_pid_limits),
2564#ifdef CONFIG_SCHED_DEBUG 2531#ifdef CONFIG_SCHED_DEBUG
2565 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2532 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2566#endif 2533#endif
@@ -2569,9 +2536,9 @@ static const struct pid_entry tgid_base_stuff[] = {
2569#endif 2536#endif
2570 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2537 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2571#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2538#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2572 INF("syscall", S_IRUSR, proc_pid_syscall), 2539 ONE("syscall", S_IRUSR, proc_pid_syscall),
2573#endif 2540#endif
2574 INF("cmdline", S_IRUGO, proc_pid_cmdline), 2541 ONE("cmdline", S_IRUGO, proc_pid_cmdline),
2575 ONE("stat", S_IRUGO, proc_tgid_stat), 2542 ONE("stat", S_IRUGO, proc_tgid_stat),
2576 ONE("statm", S_IRUGO, proc_pid_statm), 2543 ONE("statm", S_IRUGO, proc_pid_statm),
2577 REG("maps", S_IRUGO, proc_pid_maps_operations), 2544 REG("maps", S_IRUGO, proc_pid_maps_operations),
@@ -2594,13 +2561,13 @@ static const struct pid_entry tgid_base_stuff[] = {
2594 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 2561 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
2595#endif 2562#endif
2596#ifdef CONFIG_KALLSYMS 2563#ifdef CONFIG_KALLSYMS
2597 INF("wchan", S_IRUGO, proc_pid_wchan), 2564 ONE("wchan", S_IRUGO, proc_pid_wchan),
2598#endif 2565#endif
2599#ifdef CONFIG_STACKTRACE 2566#ifdef CONFIG_STACKTRACE
2600 ONE("stack", S_IRUSR, proc_pid_stack), 2567 ONE("stack", S_IRUSR, proc_pid_stack),
2601#endif 2568#endif
2602#ifdef CONFIG_SCHEDSTATS 2569#ifdef CONFIG_SCHEDSTATS
2603 INF("schedstat", S_IRUGO, proc_pid_schedstat), 2570 ONE("schedstat", S_IRUGO, proc_pid_schedstat),
2604#endif 2571#endif
2605#ifdef CONFIG_LATENCYTOP 2572#ifdef CONFIG_LATENCYTOP
2606 REG("latency", S_IRUGO, proc_lstats_operations), 2573 REG("latency", S_IRUGO, proc_lstats_operations),
@@ -2611,7 +2578,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2611#ifdef CONFIG_CGROUPS 2578#ifdef CONFIG_CGROUPS
2612 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2579 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2613#endif 2580#endif
2614 INF("oom_score", S_IRUGO, proc_oom_score), 2581 ONE("oom_score", S_IRUGO, proc_oom_score),
2615 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), 2582 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
2616 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 2583 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2617#ifdef CONFIG_AUDITSYSCALL 2584#ifdef CONFIG_AUDITSYSCALL
@@ -2625,10 +2592,10 @@ static const struct pid_entry tgid_base_stuff[] = {
2625 REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), 2592 REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
2626#endif 2593#endif
2627#ifdef CONFIG_TASK_IO_ACCOUNTING 2594#ifdef CONFIG_TASK_IO_ACCOUNTING
2628 INF("io", S_IRUSR, proc_tgid_io_accounting), 2595 ONE("io", S_IRUSR, proc_tgid_io_accounting),
2629#endif 2596#endif
2630#ifdef CONFIG_HARDWALL 2597#ifdef CONFIG_HARDWALL
2631 INF("hardwall", S_IRUGO, proc_pid_hardwall), 2598 ONE("hardwall", S_IRUGO, proc_pid_hardwall),
2632#endif 2599#endif
2633#ifdef CONFIG_USER_NS 2600#ifdef CONFIG_USER_NS
2634 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), 2601 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
@@ -2780,12 +2747,12 @@ out:
2780 2747
2781struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 2748struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
2782{ 2749{
2783 int result = 0; 2750 int result = -ENOENT;
2784 struct task_struct *task; 2751 struct task_struct *task;
2785 unsigned tgid; 2752 unsigned tgid;
2786 struct pid_namespace *ns; 2753 struct pid_namespace *ns;
2787 2754
2788 tgid = name_to_int(dentry); 2755 tgid = name_to_int(&dentry->d_name);
2789 if (tgid == ~0U) 2756 if (tgid == ~0U)
2790 goto out; 2757 goto out;
2791 2758
@@ -2847,7 +2814,7 @@ retry:
2847 return iter; 2814 return iter;
2848} 2815}
2849 2816
2850#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1) 2817#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)
2851 2818
2852/* for the /proc/ directory itself, after non-process stuff has been done */ 2819/* for the /proc/ directory itself, after non-process stuff has been done */
2853int proc_pid_readdir(struct file *file, struct dir_context *ctx) 2820int proc_pid_readdir(struct file *file, struct dir_context *ctx)
@@ -2859,14 +2826,19 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
2859 if (pos >= PID_MAX_LIMIT + TGID_OFFSET) 2826 if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
2860 return 0; 2827 return 0;
2861 2828
2862 if (pos == TGID_OFFSET - 1) { 2829 if (pos == TGID_OFFSET - 2) {
2863 struct inode *inode = ns->proc_self->d_inode; 2830 struct inode *inode = ns->proc_self->d_inode;
2864 if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) 2831 if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
2865 return 0; 2832 return 0;
2866 iter.tgid = 0; 2833 ctx->pos = pos = pos + 1;
2867 } else {
2868 iter.tgid = pos - TGID_OFFSET;
2869 } 2834 }
2835 if (pos == TGID_OFFSET - 1) {
2836 struct inode *inode = ns->proc_thread_self->d_inode;
2837 if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
2838 return 0;
2839 ctx->pos = pos = pos + 1;
2840 }
2841 iter.tgid = pos - TGID_OFFSET;
2870 iter.task = NULL; 2842 iter.task = NULL;
2871 for (iter = next_tgid(ns, iter); 2843 for (iter = next_tgid(ns, iter);
2872 iter.task; 2844 iter.task;
@@ -2895,19 +2867,22 @@ static const struct pid_entry tid_base_stuff[] = {
2895 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 2867 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2896 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 2868 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2897 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), 2869 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
2870#ifdef CONFIG_NET
2871 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
2872#endif
2898 REG("environ", S_IRUSR, proc_environ_operations), 2873 REG("environ", S_IRUSR, proc_environ_operations),
2899 INF("auxv", S_IRUSR, proc_pid_auxv), 2874 ONE("auxv", S_IRUSR, proc_pid_auxv),
2900 ONE("status", S_IRUGO, proc_pid_status), 2875 ONE("status", S_IRUGO, proc_pid_status),
2901 ONE("personality", S_IRUSR, proc_pid_personality), 2876 ONE("personality", S_IRUSR, proc_pid_personality),
2902 INF("limits", S_IRUGO, proc_pid_limits), 2877 ONE("limits", S_IRUGO, proc_pid_limits),
2903#ifdef CONFIG_SCHED_DEBUG 2878#ifdef CONFIG_SCHED_DEBUG
2904 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2879 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2905#endif 2880#endif
2906 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2881 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2907#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2882#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2908 INF("syscall", S_IRUSR, proc_pid_syscall), 2883 ONE("syscall", S_IRUSR, proc_pid_syscall),
2909#endif 2884#endif
2910 INF("cmdline", S_IRUGO, proc_pid_cmdline), 2885 ONE("cmdline", S_IRUGO, proc_pid_cmdline),
2911 ONE("stat", S_IRUGO, proc_tid_stat), 2886 ONE("stat", S_IRUGO, proc_tid_stat),
2912 ONE("statm", S_IRUGO, proc_pid_statm), 2887 ONE("statm", S_IRUGO, proc_pid_statm),
2913 REG("maps", S_IRUGO, proc_tid_maps_operations), 2888 REG("maps", S_IRUGO, proc_tid_maps_operations),
@@ -2932,13 +2907,13 @@ static const struct pid_entry tid_base_stuff[] = {
2932 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 2907 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
2933#endif 2908#endif
2934#ifdef CONFIG_KALLSYMS 2909#ifdef CONFIG_KALLSYMS
2935 INF("wchan", S_IRUGO, proc_pid_wchan), 2910 ONE("wchan", S_IRUGO, proc_pid_wchan),
2936#endif 2911#endif
2937#ifdef CONFIG_STACKTRACE 2912#ifdef CONFIG_STACKTRACE
2938 ONE("stack", S_IRUSR, proc_pid_stack), 2913 ONE("stack", S_IRUSR, proc_pid_stack),
2939#endif 2914#endif
2940#ifdef CONFIG_SCHEDSTATS 2915#ifdef CONFIG_SCHEDSTATS
2941 INF("schedstat", S_IRUGO, proc_pid_schedstat), 2916 ONE("schedstat", S_IRUGO, proc_pid_schedstat),
2942#endif 2917#endif
2943#ifdef CONFIG_LATENCYTOP 2918#ifdef CONFIG_LATENCYTOP
2944 REG("latency", S_IRUGO, proc_lstats_operations), 2919 REG("latency", S_IRUGO, proc_lstats_operations),
@@ -2949,7 +2924,7 @@ static const struct pid_entry tid_base_stuff[] = {
2949#ifdef CONFIG_CGROUPS 2924#ifdef CONFIG_CGROUPS
2950 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2925 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2951#endif 2926#endif
2952 INF("oom_score", S_IRUGO, proc_oom_score), 2927 ONE("oom_score", S_IRUGO, proc_oom_score),
2953 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), 2928 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
2954 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 2929 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2955#ifdef CONFIG_AUDITSYSCALL 2930#ifdef CONFIG_AUDITSYSCALL
@@ -2960,10 +2935,10 @@ static const struct pid_entry tid_base_stuff[] = {
2960 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), 2935 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
2961#endif 2936#endif
2962#ifdef CONFIG_TASK_IO_ACCOUNTING 2937#ifdef CONFIG_TASK_IO_ACCOUNTING
2963 INF("io", S_IRUSR, proc_tid_io_accounting), 2938 ONE("io", S_IRUSR, proc_tid_io_accounting),
2964#endif 2939#endif
2965#ifdef CONFIG_HARDWALL 2940#ifdef CONFIG_HARDWALL
2966 INF("hardwall", S_IRUGO, proc_pid_hardwall), 2941 ONE("hardwall", S_IRUGO, proc_pid_hardwall),
2967#endif 2942#endif
2968#ifdef CONFIG_USER_NS 2943#ifdef CONFIG_USER_NS
2969 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), 2944 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
@@ -3033,7 +3008,7 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
3033 if (!leader) 3008 if (!leader)
3034 goto out_no_task; 3009 goto out_no_task;
3035 3010
3036 tid = name_to_int(dentry); 3011 tid = name_to_int(&dentry->d_name);
3037 if (tid == ~0U) 3012 if (tid == ~0U)
3038 goto out; 3013 goto out;
3039 3014
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 0788d093f5d8..955bb55fab8c 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -206,7 +206,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
206{ 206{
207 struct task_struct *task = get_proc_task(dir); 207 struct task_struct *task = get_proc_task(dir);
208 int result = -ENOENT; 208 int result = -ENOENT;
209 unsigned fd = name_to_int(dentry); 209 unsigned fd = name_to_int(&dentry->d_name);
210 210
211 if (!task) 211 if (!task)
212 goto out_no_task; 212 goto out_no_task;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index b7f268eb5f45..317b72641ebf 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -27,7 +27,7 @@
27 27
28#include "internal.h" 28#include "internal.h"
29 29
30DEFINE_SPINLOCK(proc_subdir_lock); 30static DEFINE_SPINLOCK(proc_subdir_lock);
31 31
32static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) 32static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
33{ 33{
@@ -330,28 +330,28 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
330 nlink_t nlink) 330 nlink_t nlink)
331{ 331{
332 struct proc_dir_entry *ent = NULL; 332 struct proc_dir_entry *ent = NULL;
333 const char *fn = name; 333 const char *fn;
334 unsigned int len; 334 struct qstr qstr;
335
336 /* make sure name is valid */
337 if (!name || !strlen(name))
338 goto out;
339 335
340 if (xlate_proc_name(name, parent, &fn) != 0) 336 if (xlate_proc_name(name, parent, &fn) != 0)
341 goto out; 337 goto out;
338 qstr.name = fn;
339 qstr.len = strlen(fn);
340 if (qstr.len == 0 || qstr.len >= 256) {
341 WARN(1, "name len %u\n", qstr.len);
342 return NULL;
343 }
344 if (*parent == &proc_root && name_to_int(&qstr) != ~0U) {
345 WARN(1, "create '/proc/%s' by hand\n", qstr.name);
346 return NULL;
347 }
342 348
343 /* At this point there must not be any '/' characters beyond *fn */ 349 ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL);
344 if (strchr(fn, '/'))
345 goto out;
346
347 len = strlen(fn);
348
349 ent = kzalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
350 if (!ent) 350 if (!ent)
351 goto out; 351 goto out;
352 352
353 memcpy(ent->name, fn, len + 1); 353 memcpy(ent->name, fn, qstr.len + 1);
354 ent->namelen = len; 354 ent->namelen = qstr.len;
355 ent->mode = mode; 355 ent->mode = mode;
356 ent->nlink = nlink; 356 ent->nlink = nlink;
357 atomic_set(&ent->count, 1); 357 atomic_set(&ent->count, 1);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 0adbc02d60e3..333080d7a671 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -442,6 +442,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
442int proc_fill_super(struct super_block *s) 442int proc_fill_super(struct super_block *s)
443{ 443{
444 struct inode *root_inode; 444 struct inode *root_inode;
445 int ret;
445 446
446 s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; 447 s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
447 s->s_blocksize = 1024; 448 s->s_blocksize = 1024;
@@ -463,5 +464,9 @@ int proc_fill_super(struct super_block *s)
463 return -ENOMEM; 464 return -ENOMEM;
464 } 465 }
465 466
466 return proc_setup_self(s); 467 ret = proc_setup_self(s);
468 if (ret) {
469 return ret;
470 }
471 return proc_setup_thread_self(s);
467} 472}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3ab6d14e71c5..7da13e49128a 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -52,7 +52,6 @@ struct proc_dir_entry {
52 52
53union proc_op { 53union proc_op {
54 int (*proc_get_link)(struct dentry *, struct path *); 54 int (*proc_get_link)(struct dentry *, struct path *);
55 int (*proc_read)(struct task_struct *task, char *page);
56 int (*proc_show)(struct seq_file *m, 55 int (*proc_show)(struct seq_file *m,
57 struct pid_namespace *ns, struct pid *pid, 56 struct pid_namespace *ns, struct pid *pid,
58 struct task_struct *task); 57 struct task_struct *task);
@@ -112,10 +111,10 @@ static inline int task_dumpable(struct task_struct *task)
112 return 0; 111 return 0;
113} 112}
114 113
115static inline unsigned name_to_int(struct dentry *dentry) 114static inline unsigned name_to_int(const struct qstr *qstr)
116{ 115{
117 const char *name = dentry->d_name.name; 116 const char *name = qstr->name;
118 int len = dentry->d_name.len; 117 int len = qstr->len;
119 unsigned n = 0; 118 unsigned n = 0;
120 119
121 if (len > 1 && *name == '0') 120 if (len > 1 && *name == '0')
@@ -178,8 +177,6 @@ extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, i
178/* 177/*
179 * generic.c 178 * generic.c
180 */ 179 */
181extern spinlock_t proc_subdir_lock;
182
183extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); 180extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
184extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, 181extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *,
185 struct dentry *); 182 struct dentry *);
@@ -234,6 +231,12 @@ static inline int proc_net_init(void) { return 0; }
234extern int proc_setup_self(struct super_block *); 231extern int proc_setup_self(struct super_block *);
235 232
236/* 233/*
234 * proc_thread_self.c
235 */
236extern int proc_setup_thread_self(struct super_block *);
237extern void proc_thread_self_init(void);
238
239/*
237 * proc_sysctl.c 240 * proc_sysctl.c
238 */ 241 */
239#ifdef CONFIG_PROC_SYSCTL 242#ifdef CONFIG_PROC_SYSCTL
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 39e6ef32f0bd..6df8d0722c97 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -172,7 +172,7 @@ get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
172 172
173 start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK; 173 start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK;
174 end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1; 174 end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1;
175 end = ALIGN(end, PAGE_SIZE); 175 end = PAGE_ALIGN(end);
176 /* overlap check (because we have to align page */ 176 /* overlap check (because we have to align page */
177 list_for_each_entry(tmp, head, list) { 177 list_for_each_entry(tmp, head, list) {
178 if (tmp->type != KCORE_VMEMMAP) 178 if (tmp->type != KCORE_VMEMMAP)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 7445af0b1aa3..aa1eee06420f 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -168,7 +168,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
168 K(global_page_state(NR_WRITEBACK)), 168 K(global_page_state(NR_WRITEBACK)),
169 K(global_page_state(NR_ANON_PAGES)), 169 K(global_page_state(NR_ANON_PAGES)),
170 K(global_page_state(NR_FILE_MAPPED)), 170 K(global_page_state(NR_FILE_MAPPED)),
171 K(global_page_state(NR_SHMEM)), 171 K(i.sharedram),
172 K(global_page_state(NR_SLAB_RECLAIMABLE) + 172 K(global_page_state(NR_SLAB_RECLAIMABLE) +
173 global_page_state(NR_SLAB_UNRECLAIMABLE)), 173 global_page_state(NR_SLAB_UNRECLAIMABLE)),
174 K(global_page_state(NR_SLAB_RECLAIMABLE)), 174 K(global_page_state(NR_SLAB_RECLAIMABLE)),
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 4677bb7dc7c2..a63af3e0a612 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -113,9 +113,11 @@ static struct net *get_proc_task_net(struct inode *dir)
113 rcu_read_lock(); 113 rcu_read_lock();
114 task = pid_task(proc_pid(dir), PIDTYPE_PID); 114 task = pid_task(proc_pid(dir), PIDTYPE_PID);
115 if (task != NULL) { 115 if (task != NULL) {
116 ns = task_nsproxy(task); 116 task_lock(task);
117 ns = task->nsproxy;
117 if (ns != NULL) 118 if (ns != NULL)
118 net = get_net(ns->net_ns); 119 net = get_net(ns->net_ns);
120 task_unlock(task);
119 } 121 }
120 rcu_read_unlock(); 122 rcu_read_unlock();
121 123
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 71290463a1d3..f92d5dd578a4 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -632,7 +632,7 @@ out:
632 return ret; 632 return ret;
633} 633}
634 634
635static int scan(struct ctl_table_header *head, ctl_table *table, 635static int scan(struct ctl_table_header *head, struct ctl_table *table,
636 unsigned long *pos, struct file *file, 636 unsigned long *pos, struct file *file,
637 struct dir_context *ctx) 637 struct dir_context *ctx)
638{ 638{
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index cb761f010300..15f327bed8c6 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -18,7 +18,7 @@
18/* 18/*
19 * The /proc/tty directory inodes... 19 * The /proc/tty directory inodes...
20 */ 20 */
21static struct proc_dir_entry *proc_tty_ldisc, *proc_tty_driver; 21static struct proc_dir_entry *proc_tty_driver;
22 22
23/* 23/*
24 * This is the handler for /proc/tty/drivers 24 * This is the handler for /proc/tty/drivers
@@ -176,7 +176,7 @@ void __init proc_tty_init(void)
176{ 176{
177 if (!proc_mkdir("tty", NULL)) 177 if (!proc_mkdir("tty", NULL))
178 return; 178 return;
179 proc_tty_ldisc = proc_mkdir("tty/ldisc", NULL); 179 proc_mkdir("tty/ldisc", NULL); /* Preserved: it's userspace visible */
180 /* 180 /*
181 * /proc/tty/driver/serial reveals the exact character counts for 181 * /proc/tty/driver/serial reveals the exact character counts for
182 * serial links which is just too easy to abuse for inferring 182 * serial links which is just too easy to abuse for inferring
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 5dbadecb234d..094e44d4a6be 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -149,6 +149,8 @@ static void proc_kill_sb(struct super_block *sb)
149 ns = (struct pid_namespace *)sb->s_fs_info; 149 ns = (struct pid_namespace *)sb->s_fs_info;
150 if (ns->proc_self) 150 if (ns->proc_self)
151 dput(ns->proc_self); 151 dput(ns->proc_self);
152 if (ns->proc_thread_self)
153 dput(ns->proc_thread_self);
152 kill_anon_super(sb); 154 kill_anon_super(sb);
153 put_pid_ns(ns); 155 put_pid_ns(ns);
154} 156}
@@ -170,6 +172,7 @@ void __init proc_root_init(void)
170 return; 172 return;
171 173
172 proc_self_init(); 174 proc_self_init();
175 proc_thread_self_init();
173 proc_symlink("mounts", NULL, "self/mounts"); 176 proc_symlink("mounts", NULL, "self/mounts");
174 177
175 proc_net_init(); 178 proc_net_init();
@@ -199,10 +202,10 @@ static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
199 202
200static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) 203static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
201{ 204{
202 if (!proc_lookup(dir, dentry, flags)) 205 if (!proc_pid_lookup(dir, dentry, flags))
203 return NULL; 206 return NULL;
204 207
205 return proc_pid_lookup(dir, dentry, flags); 208 return proc_lookup(dir, dentry, flags);
206} 209}
207 210
208static int proc_root_readdir(struct file *file, struct dir_context *ctx) 211static int proc_root_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index cfa63ee92c96..dfc791c42d64 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -925,15 +925,30 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
925 struct mm_walk *walk) 925 struct mm_walk *walk)
926{ 926{
927 struct pagemapread *pm = walk->private; 927 struct pagemapread *pm = walk->private;
928 unsigned long addr; 928 unsigned long addr = start;
929 int err = 0; 929 int err = 0;
930 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
931 930
932 for (addr = start; addr < end; addr += PAGE_SIZE) { 931 while (addr < end) {
933 err = add_to_pagemap(addr, &pme, pm); 932 struct vm_area_struct *vma = find_vma(walk->mm, addr);
934 if (err) 933 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
935 break; 934 unsigned long vm_end;
935
936 if (!vma) {
937 vm_end = end;
938 } else {
939 vm_end = min(end, vma->vm_end);
940 if (vma->vm_flags & VM_SOFTDIRTY)
941 pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
942 }
943
944 for (; addr < vm_end; addr += PAGE_SIZE) {
945 err = add_to_pagemap(addr, &pme, pm);
946 if (err)
947 goto out;
948 }
936 } 949 }
950
951out:
937 return err; 952 return err;
938} 953}
939 954
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
new file mode 100644
index 000000000000..59075b509df3
--- /dev/null
+++ b/fs/proc/thread_self.c
@@ -0,0 +1,85 @@
1#include <linux/sched.h>
2#include <linux/namei.h>
3#include <linux/slab.h>
4#include <linux/pid_namespace.h>
5#include "internal.h"
6
7/*
8 * /proc/thread_self:
9 */
10static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer,
11 int buflen)
12{
13 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
14 pid_t tgid = task_tgid_nr_ns(current, ns);
15 pid_t pid = task_pid_nr_ns(current, ns);
16 char tmp[PROC_NUMBUF + 6 + PROC_NUMBUF];
17 if (!pid)
18 return -ENOENT;
19 sprintf(tmp, "%d/task/%d", tgid, pid);
20 return readlink_copy(buffer, buflen, tmp);
21}
22
23static void *proc_thread_self_follow_link(struct dentry *dentry, struct nameidata *nd)
24{
25 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
26 pid_t tgid = task_tgid_nr_ns(current, ns);
27 pid_t pid = task_pid_nr_ns(current, ns);
28 char *name = ERR_PTR(-ENOENT);
29 if (pid) {
30 name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
31 if (!name)
32 name = ERR_PTR(-ENOMEM);
33 else
34 sprintf(name, "%d/task/%d", tgid, pid);
35 }
36 nd_set_link(nd, name);
37 return NULL;
38}
39
40static const struct inode_operations proc_thread_self_inode_operations = {
41 .readlink = proc_thread_self_readlink,
42 .follow_link = proc_thread_self_follow_link,
43 .put_link = kfree_put_link,
44};
45
46static unsigned thread_self_inum;
47
48int proc_setup_thread_self(struct super_block *s)
49{
50 struct inode *root_inode = s->s_root->d_inode;
51 struct pid_namespace *ns = s->s_fs_info;
52 struct dentry *thread_self;
53
54 mutex_lock(&root_inode->i_mutex);
55 thread_self = d_alloc_name(s->s_root, "thread-self");
56 if (thread_self) {
57 struct inode *inode = new_inode_pseudo(s);
58 if (inode) {
59 inode->i_ino = thread_self_inum;
60 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
61 inode->i_mode = S_IFLNK | S_IRWXUGO;
62 inode->i_uid = GLOBAL_ROOT_UID;
63 inode->i_gid = GLOBAL_ROOT_GID;
64 inode->i_op = &proc_thread_self_inode_operations;
65 d_add(thread_self, inode);
66 } else {
67 dput(thread_self);
68 thread_self = ERR_PTR(-ENOMEM);
69 }
70 } else {
71 thread_self = ERR_PTR(-ENOMEM);
72 }
73 mutex_unlock(&root_inode->i_mutex);
74 if (IS_ERR(thread_self)) {
75 pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
76 return PTR_ERR(thread_self);
77 }
78 ns->proc_thread_self = thread_self;
79 return 0;
80}
81
82void __init proc_thread_self_init(void)
83{
84 proc_alloc_inum(&thread_self_inum);
85}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 382aa890e228..a90d6d354199 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -328,6 +328,82 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz)
328 * virtually contiguous user-space in ELF layout. 328 * virtually contiguous user-space in ELF layout.
329 */ 329 */
330#ifdef CONFIG_MMU 330#ifdef CONFIG_MMU
331/*
332 * remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages
333 * reported as not being ram with the zero page.
334 *
335 * @vma: vm_area_struct describing requested mapping
336 * @from: start remapping from
337 * @pfn: page frame number to start remapping to
338 * @size: remapping size
339 * @prot: protection bits
340 *
341 * Returns zero on success, -EAGAIN on failure.
342 */
343static int remap_oldmem_pfn_checked(struct vm_area_struct *vma,
344 unsigned long from, unsigned long pfn,
345 unsigned long size, pgprot_t prot)
346{
347 unsigned long map_size;
348 unsigned long pos_start, pos_end, pos;
349 unsigned long zeropage_pfn = my_zero_pfn(0);
350 size_t len = 0;
351
352 pos_start = pfn;
353 pos_end = pfn + (size >> PAGE_SHIFT);
354
355 for (pos = pos_start; pos < pos_end; ++pos) {
356 if (!pfn_is_ram(pos)) {
357 /*
358 * We hit a page which is not ram. Remap the continuous
359 * region between pos_start and pos-1 and replace
360 * the non-ram page at pos with the zero page.
361 */
362 if (pos > pos_start) {
363 /* Remap continuous region */
364 map_size = (pos - pos_start) << PAGE_SHIFT;
365 if (remap_oldmem_pfn_range(vma, from + len,
366 pos_start, map_size,
367 prot))
368 goto fail;
369 len += map_size;
370 }
371 /* Remap the zero page */
372 if (remap_oldmem_pfn_range(vma, from + len,
373 zeropage_pfn,
374 PAGE_SIZE, prot))
375 goto fail;
376 len += PAGE_SIZE;
377 pos_start = pos + 1;
378 }
379 }
380 if (pos > pos_start) {
381 /* Remap the rest */
382 map_size = (pos - pos_start) << PAGE_SHIFT;
383 if (remap_oldmem_pfn_range(vma, from + len, pos_start,
384 map_size, prot))
385 goto fail;
386 }
387 return 0;
388fail:
389 do_munmap(vma->vm_mm, from, len);
390 return -EAGAIN;
391}
392
393static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma,
394 unsigned long from, unsigned long pfn,
395 unsigned long size, pgprot_t prot)
396{
397 /*
398 * Check if oldmem_pfn_is_ram was registered to avoid
399 * looping over all pages without a reason.
400 */
401 if (oldmem_pfn_is_ram)
402 return remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
403 else
404 return remap_oldmem_pfn_range(vma, from, pfn, size, prot);
405}
406
331static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) 407static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
332{ 408{
333 size_t size = vma->vm_end - vma->vm_start; 409 size_t size = vma->vm_end - vma->vm_start;
@@ -387,9 +463,9 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
387 463
388 tsz = min_t(size_t, m->offset + m->size - start, size); 464 tsz = min_t(size_t, m->offset + m->size - start, size);
389 paddr = m->paddr + start - m->offset; 465 paddr = m->paddr + start - m->offset;
390 if (remap_oldmem_pfn_range(vma, vma->vm_start + len, 466 if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len,
391 paddr >> PAGE_SHIFT, tsz, 467 paddr >> PAGE_SHIFT, tsz,
392 vma->vm_page_prot)) 468 vma->vm_page_prot))
393 goto fail; 469 goto fail;
394 size -= tsz; 470 size -= tsz;
395 start += tsz; 471 start += tsz;
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 1a81373947f3..73ca1740d839 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -232,17 +232,15 @@ static int mounts_open_common(struct inode *inode, struct file *file,
232 if (!task) 232 if (!task)
233 goto err; 233 goto err;
234 234
235 rcu_read_lock(); 235 task_lock(task);
236 nsp = task_nsproxy(task); 236 nsp = task->nsproxy;
237 if (!nsp || !nsp->mnt_ns) { 237 if (!nsp || !nsp->mnt_ns) {
238 rcu_read_unlock(); 238 task_unlock(task);
239 put_task_struct(task); 239 put_task_struct(task);
240 goto err; 240 goto err;
241 } 241 }
242 ns = nsp->mnt_ns; 242 ns = nsp->mnt_ns;
243 get_mnt_ns(ns); 243 get_mnt_ns(ns);
244 rcu_read_unlock();
245 task_lock(task);
246 if (!task->fs) { 244 if (!task->fs) {
247 task_unlock(task); 245 task_unlock(task);
248 put_task_struct(task); 246 put_task_struct(task);
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 34a1e5aa848c..9d7b9a83699e 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -394,7 +394,7 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size)
394 394
395 prot = pgprot_noncached(PAGE_KERNEL); 395 prot = pgprot_noncached(PAGE_KERNEL);
396 396
397 pages = kmalloc(sizeof(struct page *) * page_count, GFP_KERNEL); 397 pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
398 if (!pages) { 398 if (!pages) {
399 pr_err("%s: Failed to allocate array for %u pages\n", 399 pr_err("%s: Failed to allocate array for %u pages\n",
400 __func__, page_count); 400 __func__, page_count);
diff --git a/fs/qnx6/Makefile b/fs/qnx6/Makefile
index 9dd06199afc9..5e6bae6fae50 100644
--- a/fs/qnx6/Makefile
+++ b/fs/qnx6/Makefile
@@ -5,3 +5,4 @@
5obj-$(CONFIG_QNX6FS_FS) += qnx6.o 5obj-$(CONFIG_QNX6FS_FS) += qnx6.o
6 6
7qnx6-objs := inode.o dir.o namei.o super_mmi.o 7qnx6-objs := inode.o dir.o namei.o super_mmi.o
8ccflags-$(CONFIG_QNX6FS_DEBUG) += -DDEBUG
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index 15b7d92ed60d..8d64bb5366bf 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -77,21 +77,20 @@ static int qnx6_dir_longfilename(struct inode *inode,
77 if (de->de_size != 0xff) { 77 if (de->de_size != 0xff) {
78 /* error - long filename entries always have size 0xff 78 /* error - long filename entries always have size 0xff
79 in direntry */ 79 in direntry */
80 printk(KERN_ERR "qnx6: invalid direntry size (%i).\n", 80 pr_err("invalid direntry size (%i).\n", de->de_size);
81 de->de_size);
82 return 0; 81 return 0;
83 } 82 }
84 lf = qnx6_longname(s, de, &page); 83 lf = qnx6_longname(s, de, &page);
85 if (IS_ERR(lf)) { 84 if (IS_ERR(lf)) {
86 printk(KERN_ERR "qnx6:Error reading longname\n"); 85 pr_err("Error reading longname\n");
87 return 0; 86 return 0;
88 } 87 }
89 88
90 lf_size = fs16_to_cpu(sbi, lf->lf_size); 89 lf_size = fs16_to_cpu(sbi, lf->lf_size);
91 90
92 if (lf_size > QNX6_LONG_NAME_MAX) { 91 if (lf_size > QNX6_LONG_NAME_MAX) {
93 QNX6DEBUG((KERN_INFO "file %s\n", lf->lf_fname)); 92 pr_debug("file %s\n", lf->lf_fname);
94 printk(KERN_ERR "qnx6:Filename too long (%i)\n", lf_size); 93 pr_err("Filename too long (%i)\n", lf_size);
95 qnx6_put_page(page); 94 qnx6_put_page(page);
96 return 0; 95 return 0;
97 } 96 }
@@ -100,10 +99,10 @@ static int qnx6_dir_longfilename(struct inode *inode,
100 mmi 3g filesystem does not have that checksum */ 99 mmi 3g filesystem does not have that checksum */
101 if (!test_opt(s, MMI_FS) && fs32_to_cpu(sbi, de->de_checksum) != 100 if (!test_opt(s, MMI_FS) && fs32_to_cpu(sbi, de->de_checksum) !=
102 qnx6_lfile_checksum(lf->lf_fname, lf_size)) 101 qnx6_lfile_checksum(lf->lf_fname, lf_size))
103 printk(KERN_INFO "qnx6: long filename checksum error.\n"); 102 pr_info("long filename checksum error.\n");
104 103
105 QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n", 104 pr_debug("qnx6_readdir:%.*s inode:%u\n",
106 lf_size, lf->lf_fname, de_inode)); 105 lf_size, lf->lf_fname, de_inode);
107 if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) { 106 if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) {
108 qnx6_put_page(page); 107 qnx6_put_page(page);
109 return 0; 108 return 0;
@@ -136,7 +135,7 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx)
136 int i = start; 135 int i = start;
137 136
138 if (IS_ERR(page)) { 137 if (IS_ERR(page)) {
139 printk(KERN_ERR "qnx6_readdir: read failed\n"); 138 pr_err("%s(): read failed\n", __func__);
140 ctx->pos = (n + 1) << PAGE_CACHE_SHIFT; 139 ctx->pos = (n + 1) << PAGE_CACHE_SHIFT;
141 return PTR_ERR(page); 140 return PTR_ERR(page);
142 } 141 }
@@ -159,9 +158,9 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx)
159 break; 158 break;
160 } 159 }
161 } else { 160 } else {
162 QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s" 161 pr_debug("%s():%.*s inode:%u\n",
163 " inode:%u\n", size, de->de_fname, 162 __func__, size, de->de_fname,
164 no_inode)); 163 no_inode);
165 if (!dir_emit(ctx, de->de_fname, size, 164 if (!dir_emit(ctx, de->de_fname, size,
166 no_inode, DT_UNKNOWN)) { 165 no_inode, DT_UNKNOWN)) {
167 done = true; 166 done = true;
@@ -259,8 +258,7 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name,
259 if (ino) 258 if (ino)
260 goto found; 259 goto found;
261 } else 260 } else
262 printk(KERN_ERR "qnx6: undefined " 261 pr_err("undefined filename size in inode.\n");
263 "filename size in inode.\n");
264 } 262 }
265 qnx6_put_page(page); 263 qnx6_put_page(page);
266 } 264 }
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 65cdaab3ed49..44e73923670d 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -73,8 +73,8 @@ static int qnx6_get_block(struct inode *inode, sector_t iblock,
73{ 73{
74 unsigned phys; 74 unsigned phys;
75 75
76 QNX6DEBUG((KERN_INFO "qnx6: qnx6_get_block inode=[%ld] iblock=[%ld]\n", 76 pr_debug("qnx6_get_block inode=[%ld] iblock=[%ld]\n",
77 inode->i_ino, (unsigned long)iblock)); 77 inode->i_ino, (unsigned long)iblock);
78 78
79 phys = qnx6_block_map(inode, iblock); 79 phys = qnx6_block_map(inode, iblock);
80 if (phys) { 80 if (phys) {
@@ -87,7 +87,7 @@ static int qnx6_get_block(struct inode *inode, sector_t iblock,
87static int qnx6_check_blockptr(__fs32 ptr) 87static int qnx6_check_blockptr(__fs32 ptr)
88{ 88{
89 if (ptr == ~(__fs32)0) { 89 if (ptr == ~(__fs32)0) {
90 printk(KERN_ERR "qnx6: hit unused blockpointer.\n"); 90 pr_err("hit unused blockpointer.\n");
91 return 0; 91 return 0;
92 } 92 }
93 return 1; 93 return 1;
@@ -127,8 +127,7 @@ static unsigned qnx6_block_map(struct inode *inode, unsigned no)
127 levelptr = no >> bitdelta; 127 levelptr = no >> bitdelta;
128 128
129 if (levelptr > QNX6_NO_DIRECT_POINTERS - 1) { 129 if (levelptr > QNX6_NO_DIRECT_POINTERS - 1) {
130 printk(KERN_ERR "qnx6:Requested file block number (%u) too big.", 130 pr_err("Requested file block number (%u) too big.", no);
131 no);
132 return 0; 131 return 0;
133 } 132 }
134 133
@@ -137,8 +136,7 @@ static unsigned qnx6_block_map(struct inode *inode, unsigned no)
137 for (i = 0; i < depth; i++) { 136 for (i = 0; i < depth; i++) {
138 bh = sb_bread(s, block); 137 bh = sb_bread(s, block);
139 if (!bh) { 138 if (!bh) {
140 printk(KERN_ERR "qnx6:Error reading block (%u)\n", 139 pr_err("Error reading block (%u)\n", block);
141 block);
142 return 0; 140 return 0;
143 } 141 }
144 bitdelta -= ptrbits; 142 bitdelta -= ptrbits;
@@ -207,26 +205,16 @@ void qnx6_superblock_debug(struct qnx6_super_block *sb, struct super_block *s)
207{ 205{
208 struct qnx6_sb_info *sbi = QNX6_SB(s); 206 struct qnx6_sb_info *sbi = QNX6_SB(s);
209 207
210 QNX6DEBUG((KERN_INFO "magic: %08x\n", 208 pr_debug("magic: %08x\n", fs32_to_cpu(sbi, sb->sb_magic));
211 fs32_to_cpu(sbi, sb->sb_magic))); 209 pr_debug("checksum: %08x\n", fs32_to_cpu(sbi, sb->sb_checksum));
212 QNX6DEBUG((KERN_INFO "checksum: %08x\n", 210 pr_debug("serial: %llx\n", fs64_to_cpu(sbi, sb->sb_serial));
213 fs32_to_cpu(sbi, sb->sb_checksum))); 211 pr_debug("flags: %08x\n", fs32_to_cpu(sbi, sb->sb_flags));
214 QNX6DEBUG((KERN_INFO "serial: %llx\n", 212 pr_debug("blocksize: %08x\n", fs32_to_cpu(sbi, sb->sb_blocksize));
215 fs64_to_cpu(sbi, sb->sb_serial))); 213 pr_debug("num_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_num_inodes));
216 QNX6DEBUG((KERN_INFO "flags: %08x\n", 214 pr_debug("free_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_free_inodes));
217 fs32_to_cpu(sbi, sb->sb_flags))); 215 pr_debug("num_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_num_blocks));
218 QNX6DEBUG((KERN_INFO "blocksize: %08x\n", 216 pr_debug("free_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_free_blocks));
219 fs32_to_cpu(sbi, sb->sb_blocksize))); 217 pr_debug("inode_levels: %02x\n", sb->Inode.levels);
220 QNX6DEBUG((KERN_INFO "num_inodes: %08x\n",
221 fs32_to_cpu(sbi, sb->sb_num_inodes)));
222 QNX6DEBUG((KERN_INFO "free_inodes: %08x\n",
223 fs32_to_cpu(sbi, sb->sb_free_inodes)));
224 QNX6DEBUG((KERN_INFO "num_blocks: %08x\n",
225 fs32_to_cpu(sbi, sb->sb_num_blocks)));
226 QNX6DEBUG((KERN_INFO "free_blocks: %08x\n",
227 fs32_to_cpu(sbi, sb->sb_free_blocks)));
228 QNX6DEBUG((KERN_INFO "inode_levels: %02x\n",
229 sb->Inode.levels));
230} 218}
231#endif 219#endif
232 220
@@ -277,7 +265,7 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s,
277 start with the first superblock */ 265 start with the first superblock */
278 bh = sb_bread(s, offset); 266 bh = sb_bread(s, offset);
279 if (!bh) { 267 if (!bh) {
280 printk(KERN_ERR "qnx6: unable to read the first superblock\n"); 268 pr_err("unable to read the first superblock\n");
281 return NULL; 269 return NULL;
282 } 270 }
283 sb = (struct qnx6_super_block *)bh->b_data; 271 sb = (struct qnx6_super_block *)bh->b_data;
@@ -285,20 +273,16 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s,
285 sbi->s_bytesex = BYTESEX_BE; 273 sbi->s_bytesex = BYTESEX_BE;
286 if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) { 274 if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) {
287 /* we got a big endian fs */ 275 /* we got a big endian fs */
288 QNX6DEBUG((KERN_INFO "qnx6: fs got different" 276 pr_debug("fs got different endianness.\n");
289 " endianness.\n"));
290 return bh; 277 return bh;
291 } else 278 } else
292 sbi->s_bytesex = BYTESEX_LE; 279 sbi->s_bytesex = BYTESEX_LE;
293 if (!silent) { 280 if (!silent) {
294 if (offset == 0) { 281 if (offset == 0) {
295 printk(KERN_ERR "qnx6: wrong signature (magic)" 282 pr_err("wrong signature (magic) in superblock #1.\n");
296 " in superblock #1.\n");
297 } else { 283 } else {
298 printk(KERN_INFO "qnx6: wrong signature (magic)" 284 pr_info("wrong signature (magic) at position (0x%lx) - will try alternative position (0x0000).\n",
299 " at position (0x%lx) - will try" 285 offset * s->s_blocksize);
300 " alternative position (0x0000).\n",
301 offset * s->s_blocksize);
302 } 286 }
303 } 287 }
304 brelse(bh); 288 brelse(bh);
@@ -329,13 +313,13 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent)
329 313
330 /* Superblock always is 512 Byte long */ 314 /* Superblock always is 512 Byte long */
331 if (!sb_set_blocksize(s, QNX6_SUPERBLOCK_SIZE)) { 315 if (!sb_set_blocksize(s, QNX6_SUPERBLOCK_SIZE)) {
332 printk(KERN_ERR "qnx6: unable to set blocksize\n"); 316 pr_err("unable to set blocksize\n");
333 goto outnobh; 317 goto outnobh;
334 } 318 }
335 319
336 /* parse the mount-options */ 320 /* parse the mount-options */
337 if (!qnx6_parse_options((char *) data, s)) { 321 if (!qnx6_parse_options((char *) data, s)) {
338 printk(KERN_ERR "qnx6: invalid mount options.\n"); 322 pr_err("invalid mount options.\n");
339 goto outnobh; 323 goto outnobh;
340 } 324 }
341 if (test_opt(s, MMI_FS)) { 325 if (test_opt(s, MMI_FS)) {
@@ -355,7 +339,7 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent)
355 /* try again without bootblock offset */ 339 /* try again without bootblock offset */
356 bh1 = qnx6_check_first_superblock(s, 0, silent); 340 bh1 = qnx6_check_first_superblock(s, 0, silent);
357 if (!bh1) { 341 if (!bh1) {
358 printk(KERN_ERR "qnx6: unable to read the first superblock\n"); 342 pr_err("unable to read the first superblock\n");
359 goto outnobh; 343 goto outnobh;
360 } 344 }
361 /* seems that no bootblock at partition start */ 345 /* seems that no bootblock at partition start */
@@ -370,13 +354,13 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent)
370 /* checksum check - start at byte 8 and end at byte 512 */ 354 /* checksum check - start at byte 8 and end at byte 512 */
371 if (fs32_to_cpu(sbi, sb1->sb_checksum) != 355 if (fs32_to_cpu(sbi, sb1->sb_checksum) !=
372 crc32_be(0, (char *)(bh1->b_data + 8), 504)) { 356 crc32_be(0, (char *)(bh1->b_data + 8), 504)) {
373 printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); 357 pr_err("superblock #1 checksum error\n");
374 goto out; 358 goto out;
375 } 359 }
376 360
377 /* set new blocksize */ 361 /* set new blocksize */
378 if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { 362 if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) {
379 printk(KERN_ERR "qnx6: unable to set blocksize\n"); 363 pr_err("unable to set blocksize\n");
380 goto out; 364 goto out;
381 } 365 }
382 /* blocksize invalidates bh - pull it back in */ 366 /* blocksize invalidates bh - pull it back in */
@@ -398,21 +382,20 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent)
398 /* next the second superblock */ 382 /* next the second superblock */
399 bh2 = sb_bread(s, offset); 383 bh2 = sb_bread(s, offset);
400 if (!bh2) { 384 if (!bh2) {
401 printk(KERN_ERR "qnx6: unable to read the second superblock\n"); 385 pr_err("unable to read the second superblock\n");
402 goto out; 386 goto out;
403 } 387 }
404 sb2 = (struct qnx6_super_block *)bh2->b_data; 388 sb2 = (struct qnx6_super_block *)bh2->b_data;
405 if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { 389 if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) {
406 if (!silent) 390 if (!silent)
407 printk(KERN_ERR "qnx6: wrong signature (magic)" 391 pr_err("wrong signature (magic) in superblock #2.\n");
408 " in superblock #2.\n");
409 goto out; 392 goto out;
410 } 393 }
411 394
412 /* checksum check - start at byte 8 and end at byte 512 */ 395 /* checksum check - start at byte 8 and end at byte 512 */
413 if (fs32_to_cpu(sbi, sb2->sb_checksum) != 396 if (fs32_to_cpu(sbi, sb2->sb_checksum) !=
414 crc32_be(0, (char *)(bh2->b_data + 8), 504)) { 397 crc32_be(0, (char *)(bh2->b_data + 8), 504)) {
415 printk(KERN_ERR "qnx6: superblock #2 checksum error\n"); 398 pr_err("superblock #2 checksum error\n");
416 goto out; 399 goto out;
417 } 400 }
418 401
@@ -422,25 +405,24 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent)
422 sbi->sb_buf = bh1; 405 sbi->sb_buf = bh1;
423 sbi->sb = (struct qnx6_super_block *)bh1->b_data; 406 sbi->sb = (struct qnx6_super_block *)bh1->b_data;
424 brelse(bh2); 407 brelse(bh2);
425 printk(KERN_INFO "qnx6: superblock #1 active\n"); 408 pr_info("superblock #1 active\n");
426 } else { 409 } else {
427 /* superblock #2 active */ 410 /* superblock #2 active */
428 sbi->sb_buf = bh2; 411 sbi->sb_buf = bh2;
429 sbi->sb = (struct qnx6_super_block *)bh2->b_data; 412 sbi->sb = (struct qnx6_super_block *)bh2->b_data;
430 brelse(bh1); 413 brelse(bh1);
431 printk(KERN_INFO "qnx6: superblock #2 active\n"); 414 pr_info("superblock #2 active\n");
432 } 415 }
433mmi_success: 416mmi_success:
434 /* sanity check - limit maximum indirect pointer levels */ 417 /* sanity check - limit maximum indirect pointer levels */
435 if (sb1->Inode.levels > QNX6_PTR_MAX_LEVELS) { 418 if (sb1->Inode.levels > QNX6_PTR_MAX_LEVELS) {
436 printk(KERN_ERR "qnx6: too many inode levels (max %i, sb %i)\n", 419 pr_err("too many inode levels (max %i, sb %i)\n",
437 QNX6_PTR_MAX_LEVELS, sb1->Inode.levels); 420 QNX6_PTR_MAX_LEVELS, sb1->Inode.levels);
438 goto out; 421 goto out;
439 } 422 }
440 if (sb1->Longfile.levels > QNX6_PTR_MAX_LEVELS) { 423 if (sb1->Longfile.levels > QNX6_PTR_MAX_LEVELS) {
441 printk(KERN_ERR "qnx6: too many longfilename levels" 424 pr_err("too many longfilename levels (max %i, sb %i)\n",
442 " (max %i, sb %i)\n", 425 QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels);
443 QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels);
444 goto out; 426 goto out;
445 } 427 }
446 s->s_op = &qnx6_sops; 428 s->s_op = &qnx6_sops;
@@ -460,7 +442,7 @@ mmi_success:
460 /* prefetch root inode */ 442 /* prefetch root inode */
461 root = qnx6_iget(s, QNX6_ROOT_INO); 443 root = qnx6_iget(s, QNX6_ROOT_INO);
462 if (IS_ERR(root)) { 444 if (IS_ERR(root)) {
463 printk(KERN_ERR "qnx6: get inode failed\n"); 445 pr_err("get inode failed\n");
464 ret = PTR_ERR(root); 446 ret = PTR_ERR(root);
465 goto out2; 447 goto out2;
466 } 448 }
@@ -474,7 +456,7 @@ mmi_success:
474 errmsg = qnx6_checkroot(s); 456 errmsg = qnx6_checkroot(s);
475 if (errmsg != NULL) { 457 if (errmsg != NULL) {
476 if (!silent) 458 if (!silent)
477 printk(KERN_ERR "qnx6: %s\n", errmsg); 459 pr_err("%s\n", errmsg);
478 goto out3; 460 goto out3;
479 } 461 }
480 return 0; 462 return 0;
@@ -555,8 +537,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
555 inode->i_mode = 0; 537 inode->i_mode = 0;
556 538
557 if (ino == 0) { 539 if (ino == 0) {
558 printk(KERN_ERR "qnx6: bad inode number on dev %s: %u is " 540 pr_err("bad inode number on dev %s: %u is out of range\n",
559 "out of range\n",
560 sb->s_id, ino); 541 sb->s_id, ino);
561 iget_failed(inode); 542 iget_failed(inode);
562 return ERR_PTR(-EIO); 543 return ERR_PTR(-EIO);
@@ -566,8 +547,8 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
566 mapping = sbi->inodes->i_mapping; 547 mapping = sbi->inodes->i_mapping;
567 page = read_mapping_page(mapping, n, NULL); 548 page = read_mapping_page(mapping, n, NULL);
568 if (IS_ERR(page)) { 549 if (IS_ERR(page)) {
569 printk(KERN_ERR "qnx6: major problem: unable to read inode from " 550 pr_err("major problem: unable to read inode from dev %s\n",
570 "dev %s\n", sb->s_id); 551 sb->s_id);
571 iget_failed(inode); 552 iget_failed(inode);
572 return ERR_CAST(page); 553 return ERR_CAST(page);
573 } 554 }
@@ -689,7 +670,7 @@ static int __init init_qnx6_fs(void)
689 return err; 670 return err;
690 } 671 }
691 672
692 printk(KERN_INFO "QNX6 filesystem 1.0.0 registered.\n"); 673 pr_info("QNX6 filesystem 1.0.0 registered.\n");
693 return 0; 674 return 0;
694} 675}
695 676
diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c
index 0561326a94f5..6c1a323137dd 100644
--- a/fs/qnx6/namei.c
+++ b/fs/qnx6/namei.c
@@ -29,12 +29,12 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry,
29 foundinode = qnx6_iget(dir->i_sb, ino); 29 foundinode = qnx6_iget(dir->i_sb, ino);
30 qnx6_put_page(page); 30 qnx6_put_page(page);
31 if (IS_ERR(foundinode)) { 31 if (IS_ERR(foundinode)) {
32 QNX6DEBUG((KERN_ERR "qnx6: lookup->iget -> " 32 pr_debug("lookup->iget -> error %ld\n",
33 " error %ld\n", PTR_ERR(foundinode))); 33 PTR_ERR(foundinode));
34 return ERR_CAST(foundinode); 34 return ERR_CAST(foundinode);
35 } 35 }
36 } else { 36 } else {
37 QNX6DEBUG((KERN_INFO "qnx6_lookup: not found %s\n", name)); 37 pr_debug("%s(): not found %s\n", __func__, name);
38 return NULL; 38 return NULL;
39 } 39 }
40 d_add(dentry, foundinode); 40 d_add(dentry, foundinode);
diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h
index b00fcc960d37..d3fb2b698800 100644
--- a/fs/qnx6/qnx6.h
+++ b/fs/qnx6/qnx6.h
@@ -10,6 +10,12 @@
10 * 10 *
11 */ 11 */
12 12
13#ifdef pr_fmt
14#undef pr_fmt
15#endif
16
17#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
13#include <linux/fs.h> 19#include <linux/fs.h>
14#include <linux/pagemap.h> 20#include <linux/pagemap.h>
15 21
@@ -19,12 +25,6 @@ typedef __u64 __bitwise __fs64;
19 25
20#include <linux/qnx6_fs.h> 26#include <linux/qnx6_fs.h>
21 27
22#ifdef CONFIG_QNX6FS_DEBUG
23#define QNX6DEBUG(X) printk X
24#else
25#define QNX6DEBUG(X) (void) 0
26#endif
27
28struct qnx6_sb_info { 28struct qnx6_sb_info {
29 struct buffer_head *sb_buf; /* superblock buffer */ 29 struct buffer_head *sb_buf; /* superblock buffer */
30 struct qnx6_super_block *sb; /* our superblock */ 30 struct qnx6_super_block *sb; /* our superblock */
diff --git a/fs/qnx6/super_mmi.c b/fs/qnx6/super_mmi.c
index 29c32cba62d6..62aaf3e3126a 100644
--- a/fs/qnx6/super_mmi.c
+++ b/fs/qnx6/super_mmi.c
@@ -44,15 +44,14 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
44 start with the first superblock */ 44 start with the first superblock */
45 bh1 = sb_bread(s, 0); 45 bh1 = sb_bread(s, 0);
46 if (!bh1) { 46 if (!bh1) {
47 printk(KERN_ERR "qnx6: Unable to read first mmi superblock\n"); 47 pr_err("Unable to read first mmi superblock\n");
48 return NULL; 48 return NULL;
49 } 49 }
50 sb1 = (struct qnx6_mmi_super_block *)bh1->b_data; 50 sb1 = (struct qnx6_mmi_super_block *)bh1->b_data;
51 sbi = QNX6_SB(s); 51 sbi = QNX6_SB(s);
52 if (fs32_to_cpu(sbi, sb1->sb_magic) != QNX6_SUPER_MAGIC) { 52 if (fs32_to_cpu(sbi, sb1->sb_magic) != QNX6_SUPER_MAGIC) {
53 if (!silent) { 53 if (!silent) {
54 printk(KERN_ERR "qnx6: wrong signature (magic) in" 54 pr_err("wrong signature (magic) in superblock #1.\n");
55 " superblock #1.\n");
56 goto out; 55 goto out;
57 } 56 }
58 } 57 }
@@ -60,7 +59,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
60 /* checksum check - start at byte 8 and end at byte 512 */ 59 /* checksum check - start at byte 8 and end at byte 512 */
61 if (fs32_to_cpu(sbi, sb1->sb_checksum) != 60 if (fs32_to_cpu(sbi, sb1->sb_checksum) !=
62 crc32_be(0, (char *)(bh1->b_data + 8), 504)) { 61 crc32_be(0, (char *)(bh1->b_data + 8), 504)) {
63 printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); 62 pr_err("superblock #1 checksum error\n");
64 goto out; 63 goto out;
65 } 64 }
66 65
@@ -70,7 +69,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
70 69
71 /* set new blocksize */ 70 /* set new blocksize */
72 if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { 71 if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) {
73 printk(KERN_ERR "qnx6: unable to set blocksize\n"); 72 pr_err("unable to set blocksize\n");
74 goto out; 73 goto out;
75 } 74 }
76 /* blocksize invalidates bh - pull it back in */ 75 /* blocksize invalidates bh - pull it back in */
@@ -83,27 +82,26 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
83 /* read second superblock */ 82 /* read second superblock */
84 bh2 = sb_bread(s, offset); 83 bh2 = sb_bread(s, offset);
85 if (!bh2) { 84 if (!bh2) {
86 printk(KERN_ERR "qnx6: unable to read the second superblock\n"); 85 pr_err("unable to read the second superblock\n");
87 goto out; 86 goto out;
88 } 87 }
89 sb2 = (struct qnx6_mmi_super_block *)bh2->b_data; 88 sb2 = (struct qnx6_mmi_super_block *)bh2->b_data;
90 if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { 89 if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) {
91 if (!silent) 90 if (!silent)
92 printk(KERN_ERR "qnx6: wrong signature (magic) in" 91 pr_err("wrong signature (magic) in superblock #2.\n");
93 " superblock #2.\n");
94 goto out; 92 goto out;
95 } 93 }
96 94
97 /* checksum check - start at byte 8 and end at byte 512 */ 95 /* checksum check - start at byte 8 and end at byte 512 */
98 if (fs32_to_cpu(sbi, sb2->sb_checksum) 96 if (fs32_to_cpu(sbi, sb2->sb_checksum)
99 != crc32_be(0, (char *)(bh2->b_data + 8), 504)) { 97 != crc32_be(0, (char *)(bh2->b_data + 8), 504)) {
100 printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); 98 pr_err("superblock #1 checksum error\n");
101 goto out; 99 goto out;
102 } 100 }
103 101
104 qsb = kmalloc(sizeof(*qsb), GFP_KERNEL); 102 qsb = kmalloc(sizeof(*qsb), GFP_KERNEL);
105 if (!qsb) { 103 if (!qsb) {
106 printk(KERN_ERR "qnx6: unable to allocate memory.\n"); 104 pr_err("unable to allocate memory.\n");
107 goto out; 105 goto out;
108 } 106 }
109 107
@@ -119,7 +117,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
119 sbi->sb_buf = bh1; 117 sbi->sb_buf = bh1;
120 sbi->sb = (struct qnx6_super_block *)bh1->b_data; 118 sbi->sb = (struct qnx6_super_block *)bh1->b_data;
121 brelse(bh2); 119 brelse(bh2);
122 printk(KERN_INFO "qnx6: superblock #1 active\n"); 120 pr_info("superblock #1 active\n");
123 } else { 121 } else {
124 /* superblock #2 active */ 122 /* superblock #2 active */
125 qnx6_mmi_copy_sb(qsb, sb2); 123 qnx6_mmi_copy_sb(qsb, sb2);
@@ -131,7 +129,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
131 sbi->sb_buf = bh2; 129 sbi->sb_buf = bh2;
132 sbi->sb = (struct qnx6_super_block *)bh2->b_data; 130 sbi->sb = (struct qnx6_super_block *)bh2->b_data;
133 brelse(bh1); 131 brelse(bh1);
134 printk(KERN_INFO "qnx6: superblock #2 active\n"); 132 pr_info("superblock #2 active\n");
135 } 133 }
136 kfree(qsb); 134 kfree(qsb);
137 135
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 7f30bdc57d13..f2d0eee9d1f1 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -96,13 +96,16 @@
96 * Note that some things (eg. sb pointer, type, id) doesn't change during 96 * Note that some things (eg. sb pointer, type, id) doesn't change during
97 * the life of the dquot structure and so needn't to be protected by a lock 97 * the life of the dquot structure and so needn't to be protected by a lock
98 * 98 *
99 * Any operation working on dquots via inode pointers must hold dqptr_sem. If 99 * Operation accessing dquots via inode pointers are protected by dquot_srcu.
100 * operation is just reading pointers from inode (or not using them at all) the 100 * Operation of reading pointer needs srcu_read_lock(&dquot_srcu), and
101 * read lock is enough. If pointers are altered function must hold write lock. 101 * synchronize_srcu(&dquot_srcu) is called after clearing pointers from
102 * inode and before dropping dquot references to avoid use of dquots after
103 * they are freed. dq_data_lock is used to serialize the pointer setting and
104 * clearing operations.
102 * Special care needs to be taken about S_NOQUOTA inode flag (marking that 105 * Special care needs to be taken about S_NOQUOTA inode flag (marking that
103 * inode is a quota file). Functions adding pointers from inode to dquots have 106 * inode is a quota file). Functions adding pointers from inode to dquots have
104 * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they 107 * to check this flag under dq_data_lock and then (if S_NOQUOTA is not set) they
105 * have to do all pointer modifications before dropping dqptr_sem. This makes 108 * have to do all pointer modifications before dropping dq_data_lock. This makes
106 * sure they cannot race with quotaon which first sets S_NOQUOTA flag and 109 * sure they cannot race with quotaon which first sets S_NOQUOTA flag and
107 * then drops all pointers to dquots from an inode. 110 * then drops all pointers to dquots from an inode.
108 * 111 *
@@ -116,21 +119,15 @@
116 * spinlock to internal buffers before writing. 119 * spinlock to internal buffers before writing.
117 * 120 *
118 * Lock ordering (including related VFS locks) is the following: 121 * Lock ordering (including related VFS locks) is the following:
119 * dqonoff_mutex > i_mutex > journal_lock > dqptr_sem > dquot->dq_lock > 122 * dqonoff_mutex > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex
120 * dqio_mutex
121 * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc. 123 * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc.
122 * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem >
123 * dqptr_sem. But filesystem has to count with the fact that functions such as
124 * dquot_alloc_space() acquire dqptr_sem and they usually have to be called
125 * from inside a transaction to keep filesystem consistency after a crash. Also
126 * filesystems usually want to do some IO on dquot from ->mark_dirty which is
127 * called with dqptr_sem held.
128 */ 124 */
129 125
130static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); 126static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock);
131static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock); 127static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
132__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); 128__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
133EXPORT_SYMBOL(dq_data_lock); 129EXPORT_SYMBOL(dq_data_lock);
130DEFINE_STATIC_SRCU(dquot_srcu);
134 131
135void __quota_error(struct super_block *sb, const char *func, 132void __quota_error(struct super_block *sb, const char *func,
136 const char *fmt, ...) 133 const char *fmt, ...)
@@ -733,7 +730,6 @@ static struct shrinker dqcache_shrinker = {
733 730
734/* 731/*
735 * Put reference to dquot 732 * Put reference to dquot
736 * NOTE: If you change this function please check whether dqput_blocks() works right...
737 */ 733 */
738void dqput(struct dquot *dquot) 734void dqput(struct dquot *dquot)
739{ 735{
@@ -963,46 +959,33 @@ static void add_dquot_ref(struct super_block *sb, int type)
963} 959}
964 960
965/* 961/*
966 * Return 0 if dqput() won't block.
967 * (note that 1 doesn't necessarily mean blocking)
968 */
969static inline int dqput_blocks(struct dquot *dquot)
970{
971 if (atomic_read(&dquot->dq_count) <= 1)
972 return 1;
973 return 0;
974}
975
976/*
977 * Remove references to dquots from inode and add dquot to list for freeing 962 * Remove references to dquots from inode and add dquot to list for freeing
978 * if we have the last reference to dquot 963 * if we have the last reference to dquot
979 * We can't race with anybody because we hold dqptr_sem for writing...
980 */ 964 */
981static int remove_inode_dquot_ref(struct inode *inode, int type, 965static void remove_inode_dquot_ref(struct inode *inode, int type,
982 struct list_head *tofree_head) 966 struct list_head *tofree_head)
983{ 967{
984 struct dquot *dquot = inode->i_dquot[type]; 968 struct dquot *dquot = inode->i_dquot[type];
985 969
986 inode->i_dquot[type] = NULL; 970 inode->i_dquot[type] = NULL;
987 if (dquot) { 971 if (!dquot)
988 if (dqput_blocks(dquot)) { 972 return;
989#ifdef CONFIG_QUOTA_DEBUG 973
990 if (atomic_read(&dquot->dq_count) != 1) 974 if (list_empty(&dquot->dq_free)) {
991 quota_error(inode->i_sb, "Adding dquot with " 975 /*
992 "dq_count %d to dispose list", 976 * The inode still has reference to dquot so it can't be in the
993 atomic_read(&dquot->dq_count)); 977 * free list
994#endif 978 */
995 spin_lock(&dq_list_lock); 979 spin_lock(&dq_list_lock);
996 /* As dquot must have currently users it can't be on 980 list_add(&dquot->dq_free, tofree_head);
997 * the free list... */ 981 spin_unlock(&dq_list_lock);
998 list_add(&dquot->dq_free, tofree_head); 982 } else {
999 spin_unlock(&dq_list_lock); 983 /*
1000 return 1; 984 * Dquot is already in a list to put so we won't drop the last
1001 } 985 * reference here.
1002 else 986 */
1003 dqput(dquot); /* We have guaranteed we won't block */ 987 dqput(dquot);
1004 } 988 }
1005 return 0;
1006} 989}
1007 990
1008/* 991/*
@@ -1037,13 +1020,15 @@ static void remove_dquot_ref(struct super_block *sb, int type,
1037 * We have to scan also I_NEW inodes because they can already 1020 * We have to scan also I_NEW inodes because they can already
1038 * have quota pointer initialized. Luckily, we need to touch 1021 * have quota pointer initialized. Luckily, we need to touch
1039 * only quota pointers and these have separate locking 1022 * only quota pointers and these have separate locking
1040 * (dqptr_sem). 1023 * (dq_data_lock).
1041 */ 1024 */
1025 spin_lock(&dq_data_lock);
1042 if (!IS_NOQUOTA(inode)) { 1026 if (!IS_NOQUOTA(inode)) {
1043 if (unlikely(inode_get_rsv_space(inode) > 0)) 1027 if (unlikely(inode_get_rsv_space(inode) > 0))
1044 reserved = 1; 1028 reserved = 1;
1045 remove_inode_dquot_ref(inode, type, tofree_head); 1029 remove_inode_dquot_ref(inode, type, tofree_head);
1046 } 1030 }
1031 spin_unlock(&dq_data_lock);
1047 } 1032 }
1048 spin_unlock(&inode_sb_list_lock); 1033 spin_unlock(&inode_sb_list_lock);
1049#ifdef CONFIG_QUOTA_DEBUG 1034#ifdef CONFIG_QUOTA_DEBUG
@@ -1061,9 +1046,8 @@ static void drop_dquot_ref(struct super_block *sb, int type)
1061 LIST_HEAD(tofree_head); 1046 LIST_HEAD(tofree_head);
1062 1047
1063 if (sb->dq_op) { 1048 if (sb->dq_op) {
1064 down_write(&sb_dqopt(sb)->dqptr_sem);
1065 remove_dquot_ref(sb, type, &tofree_head); 1049 remove_dquot_ref(sb, type, &tofree_head);
1066 up_write(&sb_dqopt(sb)->dqptr_sem); 1050 synchronize_srcu(&dquot_srcu);
1067 put_dquot_list(&tofree_head); 1051 put_dquot_list(&tofree_head);
1068 } 1052 }
1069} 1053}
@@ -1394,21 +1378,16 @@ static int dquot_active(const struct inode *inode)
1394/* 1378/*
1395 * Initialize quota pointers in inode 1379 * Initialize quota pointers in inode
1396 * 1380 *
1397 * We do things in a bit complicated way but by that we avoid calling
1398 * dqget() and thus filesystem callbacks under dqptr_sem.
1399 *
1400 * It is better to call this function outside of any transaction as it 1381 * It is better to call this function outside of any transaction as it
1401 * might need a lot of space in journal for dquot structure allocation. 1382 * might need a lot of space in journal for dquot structure allocation.
1402 */ 1383 */
1403static void __dquot_initialize(struct inode *inode, int type) 1384static void __dquot_initialize(struct inode *inode, int type)
1404{ 1385{
1405 int cnt; 1386 int cnt, init_needed = 0;
1406 struct dquot *got[MAXQUOTAS]; 1387 struct dquot *got[MAXQUOTAS];
1407 struct super_block *sb = inode->i_sb; 1388 struct super_block *sb = inode->i_sb;
1408 qsize_t rsv; 1389 qsize_t rsv;
1409 1390
1410 /* First test before acquiring mutex - solves deadlocks when we
1411 * re-enter the quota code and are already holding the mutex */
1412 if (!dquot_active(inode)) 1391 if (!dquot_active(inode))
1413 return; 1392 return;
1414 1393
@@ -1418,6 +1397,15 @@ static void __dquot_initialize(struct inode *inode, int type)
1418 got[cnt] = NULL; 1397 got[cnt] = NULL;
1419 if (type != -1 && cnt != type) 1398 if (type != -1 && cnt != type)
1420 continue; 1399 continue;
1400 /*
1401 * The i_dquot should have been initialized in most cases,
1402 * we check it without locking here to avoid unnecessary
1403 * dqget()/dqput() calls.
1404 */
1405 if (inode->i_dquot[cnt])
1406 continue;
1407 init_needed = 1;
1408
1421 switch (cnt) { 1409 switch (cnt) {
1422 case USRQUOTA: 1410 case USRQUOTA:
1423 qid = make_kqid_uid(inode->i_uid); 1411 qid = make_kqid_uid(inode->i_uid);
@@ -1429,7 +1417,11 @@ static void __dquot_initialize(struct inode *inode, int type)
1429 got[cnt] = dqget(sb, qid); 1417 got[cnt] = dqget(sb, qid);
1430 } 1418 }
1431 1419
1432 down_write(&sb_dqopt(sb)->dqptr_sem); 1420 /* All required i_dquot has been initialized */
1421 if (!init_needed)
1422 return;
1423
1424 spin_lock(&dq_data_lock);
1433 if (IS_NOQUOTA(inode)) 1425 if (IS_NOQUOTA(inode))
1434 goto out_err; 1426 goto out_err;
1435 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1427 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1449,15 +1441,12 @@ static void __dquot_initialize(struct inode *inode, int type)
1449 * did a write before quota was turned on 1441 * did a write before quota was turned on
1450 */ 1442 */
1451 rsv = inode_get_rsv_space(inode); 1443 rsv = inode_get_rsv_space(inode);
1452 if (unlikely(rsv)) { 1444 if (unlikely(rsv))
1453 spin_lock(&dq_data_lock);
1454 dquot_resv_space(inode->i_dquot[cnt], rsv); 1445 dquot_resv_space(inode->i_dquot[cnt], rsv);
1455 spin_unlock(&dq_data_lock);
1456 }
1457 } 1446 }
1458 } 1447 }
1459out_err: 1448out_err:
1460 up_write(&sb_dqopt(sb)->dqptr_sem); 1449 spin_unlock(&dq_data_lock);
1461 /* Drop unused references */ 1450 /* Drop unused references */
1462 dqput_all(got); 1451 dqput_all(got);
1463} 1452}
@@ -1469,19 +1458,24 @@ void dquot_initialize(struct inode *inode)
1469EXPORT_SYMBOL(dquot_initialize); 1458EXPORT_SYMBOL(dquot_initialize);
1470 1459
1471/* 1460/*
1472 * Release all quotas referenced by inode 1461 * Release all quotas referenced by inode.
1462 *
1463 * This function only be called on inode free or converting
1464 * a file to quota file, no other users for the i_dquot in
1465 * both cases, so we needn't call synchronize_srcu() after
1466 * clearing i_dquot.
1473 */ 1467 */
1474static void __dquot_drop(struct inode *inode) 1468static void __dquot_drop(struct inode *inode)
1475{ 1469{
1476 int cnt; 1470 int cnt;
1477 struct dquot *put[MAXQUOTAS]; 1471 struct dquot *put[MAXQUOTAS];
1478 1472
1479 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1473 spin_lock(&dq_data_lock);
1480 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1474 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1481 put[cnt] = inode->i_dquot[cnt]; 1475 put[cnt] = inode->i_dquot[cnt];
1482 inode->i_dquot[cnt] = NULL; 1476 inode->i_dquot[cnt] = NULL;
1483 } 1477 }
1484 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1478 spin_unlock(&dq_data_lock);
1485 dqput_all(put); 1479 dqput_all(put);
1486} 1480}
1487 1481
@@ -1599,15 +1593,11 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
1599 */ 1593 */
1600int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) 1594int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1601{ 1595{
1602 int cnt, ret = 0; 1596 int cnt, ret = 0, index;
1603 struct dquot_warn warn[MAXQUOTAS]; 1597 struct dquot_warn warn[MAXQUOTAS];
1604 struct dquot **dquots = inode->i_dquot; 1598 struct dquot **dquots = inode->i_dquot;
1605 int reserve = flags & DQUOT_SPACE_RESERVE; 1599 int reserve = flags & DQUOT_SPACE_RESERVE;
1606 1600
1607 /*
1608 * First test before acquiring mutex - solves deadlocks when we
1609 * re-enter the quota code and are already holding the mutex
1610 */
1611 if (!dquot_active(inode)) { 1601 if (!dquot_active(inode)) {
1612 inode_incr_space(inode, number, reserve); 1602 inode_incr_space(inode, number, reserve);
1613 goto out; 1603 goto out;
@@ -1616,7 +1606,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1616 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1606 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1617 warn[cnt].w_type = QUOTA_NL_NOWARN; 1607 warn[cnt].w_type = QUOTA_NL_NOWARN;
1618 1608
1619 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1609 index = srcu_read_lock(&dquot_srcu);
1620 spin_lock(&dq_data_lock); 1610 spin_lock(&dq_data_lock);
1621 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1611 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1622 if (!dquots[cnt]) 1612 if (!dquots[cnt])
@@ -1643,7 +1633,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1643 goto out_flush_warn; 1633 goto out_flush_warn;
1644 mark_all_dquot_dirty(dquots); 1634 mark_all_dquot_dirty(dquots);
1645out_flush_warn: 1635out_flush_warn:
1646 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1636 srcu_read_unlock(&dquot_srcu, index);
1647 flush_warnings(warn); 1637 flush_warnings(warn);
1648out: 1638out:
1649 return ret; 1639 return ret;
@@ -1655,17 +1645,16 @@ EXPORT_SYMBOL(__dquot_alloc_space);
1655 */ 1645 */
1656int dquot_alloc_inode(const struct inode *inode) 1646int dquot_alloc_inode(const struct inode *inode)
1657{ 1647{
1658 int cnt, ret = 0; 1648 int cnt, ret = 0, index;
1659 struct dquot_warn warn[MAXQUOTAS]; 1649 struct dquot_warn warn[MAXQUOTAS];
1660 struct dquot * const *dquots = inode->i_dquot; 1650 struct dquot * const *dquots = inode->i_dquot;
1661 1651
1662 /* First test before acquiring mutex - solves deadlocks when we
1663 * re-enter the quota code and are already holding the mutex */
1664 if (!dquot_active(inode)) 1652 if (!dquot_active(inode))
1665 return 0; 1653 return 0;
1666 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1654 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1667 warn[cnt].w_type = QUOTA_NL_NOWARN; 1655 warn[cnt].w_type = QUOTA_NL_NOWARN;
1668 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1656
1657 index = srcu_read_lock(&dquot_srcu);
1669 spin_lock(&dq_data_lock); 1658 spin_lock(&dq_data_lock);
1670 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1659 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1671 if (!dquots[cnt]) 1660 if (!dquots[cnt])
@@ -1685,7 +1674,7 @@ warn_put_all:
1685 spin_unlock(&dq_data_lock); 1674 spin_unlock(&dq_data_lock);
1686 if (ret == 0) 1675 if (ret == 0)
1687 mark_all_dquot_dirty(dquots); 1676 mark_all_dquot_dirty(dquots);
1688 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1677 srcu_read_unlock(&dquot_srcu, index);
1689 flush_warnings(warn); 1678 flush_warnings(warn);
1690 return ret; 1679 return ret;
1691} 1680}
@@ -1696,14 +1685,14 @@ EXPORT_SYMBOL(dquot_alloc_inode);
1696 */ 1685 */
1697int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) 1686int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
1698{ 1687{
1699 int cnt; 1688 int cnt, index;
1700 1689
1701 if (!dquot_active(inode)) { 1690 if (!dquot_active(inode)) {
1702 inode_claim_rsv_space(inode, number); 1691 inode_claim_rsv_space(inode, number);
1703 return 0; 1692 return 0;
1704 } 1693 }
1705 1694
1706 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1695 index = srcu_read_lock(&dquot_srcu);
1707 spin_lock(&dq_data_lock); 1696 spin_lock(&dq_data_lock);
1708 /* Claim reserved quotas to allocated quotas */ 1697 /* Claim reserved quotas to allocated quotas */
1709 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1698 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1715,7 +1704,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
1715 inode_claim_rsv_space(inode, number); 1704 inode_claim_rsv_space(inode, number);
1716 spin_unlock(&dq_data_lock); 1705 spin_unlock(&dq_data_lock);
1717 mark_all_dquot_dirty(inode->i_dquot); 1706 mark_all_dquot_dirty(inode->i_dquot);
1718 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1707 srcu_read_unlock(&dquot_srcu, index);
1719 return 0; 1708 return 0;
1720} 1709}
1721EXPORT_SYMBOL(dquot_claim_space_nodirty); 1710EXPORT_SYMBOL(dquot_claim_space_nodirty);
@@ -1725,14 +1714,14 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
1725 */ 1714 */
1726void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) 1715void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
1727{ 1716{
1728 int cnt; 1717 int cnt, index;
1729 1718
1730 if (!dquot_active(inode)) { 1719 if (!dquot_active(inode)) {
1731 inode_reclaim_rsv_space(inode, number); 1720 inode_reclaim_rsv_space(inode, number);
1732 return; 1721 return;
1733 } 1722 }
1734 1723
1735 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1724 index = srcu_read_lock(&dquot_srcu);
1736 spin_lock(&dq_data_lock); 1725 spin_lock(&dq_data_lock);
1737 /* Claim reserved quotas to allocated quotas */ 1726 /* Claim reserved quotas to allocated quotas */
1738 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1727 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1744,7 +1733,7 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
1744 inode_reclaim_rsv_space(inode, number); 1733 inode_reclaim_rsv_space(inode, number);
1745 spin_unlock(&dq_data_lock); 1734 spin_unlock(&dq_data_lock);
1746 mark_all_dquot_dirty(inode->i_dquot); 1735 mark_all_dquot_dirty(inode->i_dquot);
1747 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1736 srcu_read_unlock(&dquot_srcu, index);
1748 return; 1737 return;
1749} 1738}
1750EXPORT_SYMBOL(dquot_reclaim_space_nodirty); 1739EXPORT_SYMBOL(dquot_reclaim_space_nodirty);
@@ -1757,16 +1746,14 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
1757 unsigned int cnt; 1746 unsigned int cnt;
1758 struct dquot_warn warn[MAXQUOTAS]; 1747 struct dquot_warn warn[MAXQUOTAS];
1759 struct dquot **dquots = inode->i_dquot; 1748 struct dquot **dquots = inode->i_dquot;
1760 int reserve = flags & DQUOT_SPACE_RESERVE; 1749 int reserve = flags & DQUOT_SPACE_RESERVE, index;
1761 1750
1762 /* First test before acquiring mutex - solves deadlocks when we
1763 * re-enter the quota code and are already holding the mutex */
1764 if (!dquot_active(inode)) { 1751 if (!dquot_active(inode)) {
1765 inode_decr_space(inode, number, reserve); 1752 inode_decr_space(inode, number, reserve);
1766 return; 1753 return;
1767 } 1754 }
1768 1755
1769 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1756 index = srcu_read_lock(&dquot_srcu);
1770 spin_lock(&dq_data_lock); 1757 spin_lock(&dq_data_lock);
1771 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1758 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1772 int wtype; 1759 int wtype;
@@ -1789,7 +1776,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
1789 goto out_unlock; 1776 goto out_unlock;
1790 mark_all_dquot_dirty(dquots); 1777 mark_all_dquot_dirty(dquots);
1791out_unlock: 1778out_unlock:
1792 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1779 srcu_read_unlock(&dquot_srcu, index);
1793 flush_warnings(warn); 1780 flush_warnings(warn);
1794} 1781}
1795EXPORT_SYMBOL(__dquot_free_space); 1782EXPORT_SYMBOL(__dquot_free_space);
@@ -1802,13 +1789,12 @@ void dquot_free_inode(const struct inode *inode)
1802 unsigned int cnt; 1789 unsigned int cnt;
1803 struct dquot_warn warn[MAXQUOTAS]; 1790 struct dquot_warn warn[MAXQUOTAS];
1804 struct dquot * const *dquots = inode->i_dquot; 1791 struct dquot * const *dquots = inode->i_dquot;
1792 int index;
1805 1793
1806 /* First test before acquiring mutex - solves deadlocks when we
1807 * re-enter the quota code and are already holding the mutex */
1808 if (!dquot_active(inode)) 1794 if (!dquot_active(inode))
1809 return; 1795 return;
1810 1796
1811 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1797 index = srcu_read_lock(&dquot_srcu);
1812 spin_lock(&dq_data_lock); 1798 spin_lock(&dq_data_lock);
1813 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1799 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1814 int wtype; 1800 int wtype;
@@ -1823,7 +1809,7 @@ void dquot_free_inode(const struct inode *inode)
1823 } 1809 }
1824 spin_unlock(&dq_data_lock); 1810 spin_unlock(&dq_data_lock);
1825 mark_all_dquot_dirty(dquots); 1811 mark_all_dquot_dirty(dquots);
1826 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1812 srcu_read_unlock(&dquot_srcu, index);
1827 flush_warnings(warn); 1813 flush_warnings(warn);
1828} 1814}
1829EXPORT_SYMBOL(dquot_free_inode); 1815EXPORT_SYMBOL(dquot_free_inode);
@@ -1837,6 +1823,8 @@ EXPORT_SYMBOL(dquot_free_inode);
1837 * This operation can block, but only after everything is updated 1823 * This operation can block, but only after everything is updated
1838 * A transaction must be started when entering this function. 1824 * A transaction must be started when entering this function.
1839 * 1825 *
1826 * We are holding reference on transfer_from & transfer_to, no need to
1827 * protect them by srcu_read_lock().
1840 */ 1828 */
1841int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) 1829int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1842{ 1830{
@@ -1849,8 +1837,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1849 struct dquot_warn warn_from_inodes[MAXQUOTAS]; 1837 struct dquot_warn warn_from_inodes[MAXQUOTAS];
1850 struct dquot_warn warn_from_space[MAXQUOTAS]; 1838 struct dquot_warn warn_from_space[MAXQUOTAS];
1851 1839
1852 /* First test before acquiring mutex - solves deadlocks when we
1853 * re-enter the quota code and are already holding the mutex */
1854 if (IS_NOQUOTA(inode)) 1840 if (IS_NOQUOTA(inode))
1855 return 0; 1841 return 0;
1856 /* Initialize the arrays */ 1842 /* Initialize the arrays */
@@ -1859,12 +1845,12 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1859 warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN; 1845 warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN;
1860 warn_from_space[cnt].w_type = QUOTA_NL_NOWARN; 1846 warn_from_space[cnt].w_type = QUOTA_NL_NOWARN;
1861 } 1847 }
1862 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1848
1849 spin_lock(&dq_data_lock);
1863 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ 1850 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1864 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1851 spin_unlock(&dq_data_lock);
1865 return 0; 1852 return 0;
1866 } 1853 }
1867 spin_lock(&dq_data_lock);
1868 cur_space = inode_get_bytes(inode); 1854 cur_space = inode_get_bytes(inode);
1869 rsv_space = inode_get_rsv_space(inode); 1855 rsv_space = inode_get_rsv_space(inode);
1870 space = cur_space + rsv_space; 1856 space = cur_space + rsv_space;
@@ -1918,7 +1904,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1918 inode->i_dquot[cnt] = transfer_to[cnt]; 1904 inode->i_dquot[cnt] = transfer_to[cnt];
1919 } 1905 }
1920 spin_unlock(&dq_data_lock); 1906 spin_unlock(&dq_data_lock);
1921 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1922 1907
1923 mark_all_dquot_dirty(transfer_from); 1908 mark_all_dquot_dirty(transfer_from);
1924 mark_all_dquot_dirty(transfer_to); 1909 mark_all_dquot_dirty(transfer_to);
@@ -1932,7 +1917,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1932 return 0; 1917 return 0;
1933over_quota: 1918over_quota:
1934 spin_unlock(&dq_data_lock); 1919 spin_unlock(&dq_data_lock);
1935 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1936 flush_warnings(warn_to); 1920 flush_warnings(warn_to);
1937 return ret; 1921 return ret;
1938} 1922}
diff --git a/fs/quota/kqid.c b/fs/quota/kqid.c
index 2f97b0e2c501..ebc5e6285800 100644
--- a/fs/quota/kqid.c
+++ b/fs/quota/kqid.c
@@ -55,7 +55,7 @@ EXPORT_SYMBOL(qid_lt);
55/** 55/**
56 * from_kqid - Create a qid from a kqid user-namespace pair. 56 * from_kqid - Create a qid from a kqid user-namespace pair.
57 * @targ: The user namespace we want a qid in. 57 * @targ: The user namespace we want a qid in.
58 * @kuid: The kernel internal quota identifier to start with. 58 * @kqid: The kernel internal quota identifier to start with.
59 * 59 *
60 * Map @kqid into the user-namespace specified by @targ and 60 * Map @kqid into the user-namespace specified by @targ and
61 * return the resulting qid. 61 * return the resulting qid.
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 72d29177998e..bb2869f5dfd8 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -32,8 +32,7 @@ static struct genl_family quota_genl_family = {
32 32
33/** 33/**
34 * quota_send_warning - Send warning to userspace about exceeded quota 34 * quota_send_warning - Send warning to userspace about exceeded quota
35 * @type: The quota type: USRQQUOTA, GRPQUOTA,... 35 * @qid: The kernel internal quota identifier.
36 * @id: The user or group id of the quota that was exceeded
37 * @dev: The device on which the fs is mounted (sb->s_dev) 36 * @dev: The device on which the fs is mounted (sb->s_dev)
38 * @warntype: The type of the warning: QUOTA_NL_... 37 * @warntype: The type of the warning: QUOTA_NL_...
39 * 38 *
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ff3f0b3cfdb3..75621649dbd7 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -79,13 +79,13 @@ static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
79{ 79{
80 __u32 fmt; 80 __u32 fmt;
81 81
82 down_read(&sb_dqopt(sb)->dqptr_sem); 82 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
83 if (!sb_has_quota_active(sb, type)) { 83 if (!sb_has_quota_active(sb, type)) {
84 up_read(&sb_dqopt(sb)->dqptr_sem); 84 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
85 return -ESRCH; 85 return -ESRCH;
86 } 86 }
87 fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; 87 fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;
88 up_read(&sb_dqopt(sb)->dqptr_sem); 88 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
89 if (copy_to_user(addr, &fmt, sizeof(fmt))) 89 if (copy_to_user(addr, &fmt, sizeof(fmt)))
90 return -EFAULT; 90 return -EFAULT;
91 return 0; 91 return 0;
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index dda012ad4208..bbafbde3471a 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -222,7 +222,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
222 222
223 /* gang-find the pages */ 223 /* gang-find the pages */
224 ret = -ENOMEM; 224 ret = -ENOMEM;
225 pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL); 225 pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL);
226 if (!pages) 226 if (!pages)
227 goto out_free; 227 goto out_free;
228 228
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index d9f5a60dd59b..0a7dc941aaf4 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -9,7 +9,7 @@
9#include <linux/stat.h> 9#include <linux/stat.h>
10#include <linux/buffer_head.h> 10#include <linux/buffer_head.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <asm/uaccess.h> 12#include <linux/uaccess.h>
13 13
14extern const struct reiserfs_key MIN_KEY; 14extern const struct reiserfs_key MIN_KEY;
15 15
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 54fdf196bfb2..9c02d96d3a42 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -10,7 +10,7 @@
10 * and using buffers obtained after all above. 10 * and using buffers obtained after all above.
11 */ 11 */
12 12
13#include <asm/uaccess.h> 13#include <linux/uaccess.h>
14#include <linux/time.h> 14#include <linux/time.h>
15#include "reiserfs.h" 15#include "reiserfs.h"
16#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
@@ -286,12 +286,14 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
286 return 0; 286 return 0;
287} 287}
288 288
289static void balance_leaf_insert_left(struct tree_balance *tb, 289static unsigned int balance_leaf_insert_left(struct tree_balance *tb,
290 struct item_head *ih, const char *body) 290 struct item_head *const ih,
291 const char * const body)
291{ 292{
292 int ret; 293 int ret;
293 struct buffer_info bi; 294 struct buffer_info bi;
294 int n = B_NR_ITEMS(tb->L[0]); 295 int n = B_NR_ITEMS(tb->L[0]);
296 unsigned body_shift_bytes = 0;
295 297
296 if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { 298 if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
297 /* part of new item falls into L[0] */ 299 /* part of new item falls into L[0] */
@@ -329,7 +331,7 @@ static void balance_leaf_insert_left(struct tree_balance *tb,
329 331
330 put_ih_item_len(ih, new_item_len); 332 put_ih_item_len(ih, new_item_len);
331 if (tb->lbytes > tb->zeroes_num) { 333 if (tb->lbytes > tb->zeroes_num) {
332 body += (tb->lbytes - tb->zeroes_num); 334 body_shift_bytes = tb->lbytes - tb->zeroes_num;
333 tb->zeroes_num = 0; 335 tb->zeroes_num = 0;
334 } else 336 } else
335 tb->zeroes_num -= tb->lbytes; 337 tb->zeroes_num -= tb->lbytes;
@@ -349,11 +351,12 @@ static void balance_leaf_insert_left(struct tree_balance *tb,
349 tb->insert_size[0] = 0; 351 tb->insert_size[0] = 0;
350 tb->zeroes_num = 0; 352 tb->zeroes_num = 0;
351 } 353 }
354 return body_shift_bytes;
352} 355}
353 356
354static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb, 357static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
355 struct item_head *ih, 358 struct item_head * const ih,
356 const char *body) 359 const char * const body)
357{ 360{
358 int n = B_NR_ITEMS(tb->L[0]); 361 int n = B_NR_ITEMS(tb->L[0]);
359 struct buffer_info bi; 362 struct buffer_info bi;
@@ -413,17 +416,18 @@ static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
413 tb->pos_in_item -= tb->lbytes; 416 tb->pos_in_item -= tb->lbytes;
414} 417}
415 418
416static void balance_leaf_paste_left_shift(struct tree_balance *tb, 419static unsigned int balance_leaf_paste_left_shift(struct tree_balance *tb,
417 struct item_head *ih, 420 struct item_head * const ih,
418 const char *body) 421 const char * const body)
419{ 422{
420 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 423 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
421 int n = B_NR_ITEMS(tb->L[0]); 424 int n = B_NR_ITEMS(tb->L[0]);
422 struct buffer_info bi; 425 struct buffer_info bi;
426 int body_shift_bytes = 0;
423 427
424 if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) { 428 if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
425 balance_leaf_paste_left_shift_dirent(tb, ih, body); 429 balance_leaf_paste_left_shift_dirent(tb, ih, body);
426 return; 430 return 0;
427 } 431 }
428 432
429 RFALSE(tb->lbytes <= 0, 433 RFALSE(tb->lbytes <= 0,
@@ -497,7 +501,7 @@ static void balance_leaf_paste_left_shift(struct tree_balance *tb,
497 * insert_size[0] 501 * insert_size[0]
498 */ 502 */
499 if (l_n > tb->zeroes_num) { 503 if (l_n > tb->zeroes_num) {
500 body += (l_n - tb->zeroes_num); 504 body_shift_bytes = l_n - tb->zeroes_num;
501 tb->zeroes_num = 0; 505 tb->zeroes_num = 0;
502 } else 506 } else
503 tb->zeroes_num -= l_n; 507 tb->zeroes_num -= l_n;
@@ -526,13 +530,14 @@ static void balance_leaf_paste_left_shift(struct tree_balance *tb,
526 */ 530 */
527 leaf_shift_left(tb, tb->lnum[0], tb->lbytes); 531 leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
528 } 532 }
533 return body_shift_bytes;
529} 534}
530 535
531 536
532/* appended item will be in L[0] in whole */ 537/* appended item will be in L[0] in whole */
533static void balance_leaf_paste_left_whole(struct tree_balance *tb, 538static void balance_leaf_paste_left_whole(struct tree_balance *tb,
534 struct item_head *ih, 539 struct item_head * const ih,
535 const char *body) 540 const char * const body)
536{ 541{
537 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 542 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
538 int n = B_NR_ITEMS(tb->L[0]); 543 int n = B_NR_ITEMS(tb->L[0]);
@@ -584,39 +589,44 @@ static void balance_leaf_paste_left_whole(struct tree_balance *tb,
584 tb->zeroes_num = 0; 589 tb->zeroes_num = 0;
585} 590}
586 591
587static void balance_leaf_paste_left(struct tree_balance *tb, 592static unsigned int balance_leaf_paste_left(struct tree_balance *tb,
588 struct item_head *ih, const char *body) 593 struct item_head * const ih,
594 const char * const body)
589{ 595{
590 /* we must shift the part of the appended item */ 596 /* we must shift the part of the appended item */
591 if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) 597 if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1)
592 balance_leaf_paste_left_shift(tb, ih, body); 598 return balance_leaf_paste_left_shift(tb, ih, body);
593 else 599 else
594 balance_leaf_paste_left_whole(tb, ih, body); 600 balance_leaf_paste_left_whole(tb, ih, body);
601 return 0;
595} 602}
596 603
597/* Shift lnum[0] items from S[0] to the left neighbor L[0] */ 604/* Shift lnum[0] items from S[0] to the left neighbor L[0] */
598static void balance_leaf_left(struct tree_balance *tb, struct item_head *ih, 605static unsigned int balance_leaf_left(struct tree_balance *tb,
599 const char *body, int flag) 606 struct item_head * const ih,
607 const char * const body, int flag)
600{ 608{
601 if (tb->lnum[0] <= 0) 609 if (tb->lnum[0] <= 0)
602 return; 610 return 0;
603 611
604 /* new item or it part falls to L[0], shift it too */ 612 /* new item or it part falls to L[0], shift it too */
605 if (tb->item_pos < tb->lnum[0]) { 613 if (tb->item_pos < tb->lnum[0]) {
606 BUG_ON(flag != M_INSERT && flag != M_PASTE); 614 BUG_ON(flag != M_INSERT && flag != M_PASTE);
607 615
608 if (flag == M_INSERT) 616 if (flag == M_INSERT)
609 balance_leaf_insert_left(tb, ih, body); 617 return balance_leaf_insert_left(tb, ih, body);
610 else /* M_PASTE */ 618 else /* M_PASTE */
611 balance_leaf_paste_left(tb, ih, body); 619 return balance_leaf_paste_left(tb, ih, body);
612 } else 620 } else
613 /* new item doesn't fall into L[0] */ 621 /* new item doesn't fall into L[0] */
614 leaf_shift_left(tb, tb->lnum[0], tb->lbytes); 622 leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
623 return 0;
615} 624}
616 625
617 626
618static void balance_leaf_insert_right(struct tree_balance *tb, 627static void balance_leaf_insert_right(struct tree_balance *tb,
619 struct item_head *ih, const char *body) 628 struct item_head * const ih,
629 const char * const body)
620{ 630{
621 631
622 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 632 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
@@ -704,7 +714,8 @@ static void balance_leaf_insert_right(struct tree_balance *tb,
704 714
705 715
706static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb, 716static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
707 struct item_head *ih, const char *body) 717 struct item_head * const ih,
718 const char * const body)
708{ 719{
709 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 720 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
710 struct buffer_info bi; 721 struct buffer_info bi;
@@ -754,7 +765,8 @@ static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
754} 765}
755 766
756static void balance_leaf_paste_right_shift(struct tree_balance *tb, 767static void balance_leaf_paste_right_shift(struct tree_balance *tb,
757 struct item_head *ih, const char *body) 768 struct item_head * const ih,
769 const char * const body)
758{ 770{
759 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 771 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
760 int n_shift, n_rem, r_zeroes_number, version; 772 int n_shift, n_rem, r_zeroes_number, version;
@@ -831,7 +843,8 @@ static void balance_leaf_paste_right_shift(struct tree_balance *tb,
831} 843}
832 844
833static void balance_leaf_paste_right_whole(struct tree_balance *tb, 845static void balance_leaf_paste_right_whole(struct tree_balance *tb,
834 struct item_head *ih, const char *body) 846 struct item_head * const ih,
847 const char * const body)
835{ 848{
836 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 849 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
837 int n = B_NR_ITEMS(tbS0); 850 int n = B_NR_ITEMS(tbS0);
@@ -874,7 +887,8 @@ static void balance_leaf_paste_right_whole(struct tree_balance *tb,
874} 887}
875 888
876static void balance_leaf_paste_right(struct tree_balance *tb, 889static void balance_leaf_paste_right(struct tree_balance *tb,
877 struct item_head *ih, const char *body) 890 struct item_head * const ih,
891 const char * const body)
878{ 892{
879 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 893 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
880 int n = B_NR_ITEMS(tbS0); 894 int n = B_NR_ITEMS(tbS0);
@@ -896,8 +910,9 @@ static void balance_leaf_paste_right(struct tree_balance *tb,
896} 910}
897 911
898/* shift rnum[0] items from S[0] to the right neighbor R[0] */ 912/* shift rnum[0] items from S[0] to the right neighbor R[0] */
899static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih, 913static void balance_leaf_right(struct tree_balance *tb,
900 const char *body, int flag) 914 struct item_head * const ih,
915 const char * const body, int flag)
901{ 916{
902 if (tb->rnum[0] <= 0) 917 if (tb->rnum[0] <= 0)
903 return; 918 return;
@@ -911,8 +926,8 @@ static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih,
911} 926}
912 927
913static void balance_leaf_new_nodes_insert(struct tree_balance *tb, 928static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
914 struct item_head *ih, 929 struct item_head * const ih,
915 const char *body, 930 const char * const body,
916 struct item_head *insert_key, 931 struct item_head *insert_key,
917 struct buffer_head **insert_ptr, 932 struct buffer_head **insert_ptr,
918 int i) 933 int i)
@@ -1003,8 +1018,8 @@ static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
1003 1018
1004/* we append to directory item */ 1019/* we append to directory item */
1005static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb, 1020static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
1006 struct item_head *ih, 1021 struct item_head * const ih,
1007 const char *body, 1022 const char * const body,
1008 struct item_head *insert_key, 1023 struct item_head *insert_key,
1009 struct buffer_head **insert_ptr, 1024 struct buffer_head **insert_ptr,
1010 int i) 1025 int i)
@@ -1058,8 +1073,8 @@ static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
1058} 1073}
1059 1074
1060static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb, 1075static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
1061 struct item_head *ih, 1076 struct item_head * const ih,
1062 const char *body, 1077 const char * const body,
1063 struct item_head *insert_key, 1078 struct item_head *insert_key,
1064 struct buffer_head **insert_ptr, 1079 struct buffer_head **insert_ptr,
1065 int i) 1080 int i)
@@ -1131,8 +1146,8 @@ static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
1131} 1146}
1132 1147
1133static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb, 1148static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
1134 struct item_head *ih, 1149 struct item_head * const ih,
1135 const char *body, 1150 const char * const body,
1136 struct item_head *insert_key, 1151 struct item_head *insert_key,
1137 struct buffer_head **insert_ptr, 1152 struct buffer_head **insert_ptr,
1138 int i) 1153 int i)
@@ -1184,8 +1199,8 @@ static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
1184 1199
1185} 1200}
1186static void balance_leaf_new_nodes_paste(struct tree_balance *tb, 1201static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
1187 struct item_head *ih, 1202 struct item_head * const ih,
1188 const char *body, 1203 const char * const body,
1189 struct item_head *insert_key, 1204 struct item_head *insert_key,
1190 struct buffer_head **insert_ptr, 1205 struct buffer_head **insert_ptr,
1191 int i) 1206 int i)
@@ -1214,8 +1229,8 @@ static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
1214 1229
1215/* Fill new nodes that appear in place of S[0] */ 1230/* Fill new nodes that appear in place of S[0] */
1216static void balance_leaf_new_nodes(struct tree_balance *tb, 1231static void balance_leaf_new_nodes(struct tree_balance *tb,
1217 struct item_head *ih, 1232 struct item_head * const ih,
1218 const char *body, 1233 const char * const body,
1219 struct item_head *insert_key, 1234 struct item_head *insert_key,
1220 struct buffer_head **insert_ptr, 1235 struct buffer_head **insert_ptr,
1221 int flag) 1236 int flag)
@@ -1254,8 +1269,8 @@ static void balance_leaf_new_nodes(struct tree_balance *tb,
1254} 1269}
1255 1270
1256static void balance_leaf_finish_node_insert(struct tree_balance *tb, 1271static void balance_leaf_finish_node_insert(struct tree_balance *tb,
1257 struct item_head *ih, 1272 struct item_head * const ih,
1258 const char *body) 1273 const char * const body)
1259{ 1274{
1260 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 1275 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
1261 struct buffer_info bi; 1276 struct buffer_info bi;
@@ -1271,8 +1286,8 @@ static void balance_leaf_finish_node_insert(struct tree_balance *tb,
1271} 1286}
1272 1287
1273static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb, 1288static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
1274 struct item_head *ih, 1289 struct item_head * const ih,
1275 const char *body) 1290 const char * const body)
1276{ 1291{
1277 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 1292 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
1278 struct item_head *pasted = item_head(tbS0, tb->item_pos); 1293 struct item_head *pasted = item_head(tbS0, tb->item_pos);
@@ -1305,8 +1320,8 @@ static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
1305} 1320}
1306 1321
1307static void balance_leaf_finish_node_paste(struct tree_balance *tb, 1322static void balance_leaf_finish_node_paste(struct tree_balance *tb,
1308 struct item_head *ih, 1323 struct item_head * const ih,
1309 const char *body) 1324 const char * const body)
1310{ 1325{
1311 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 1326 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
1312 struct buffer_info bi; 1327 struct buffer_info bi;
@@ -1349,8 +1364,8 @@ static void balance_leaf_finish_node_paste(struct tree_balance *tb,
1349 * of the affected item which remains in S 1364 * of the affected item which remains in S
1350 */ 1365 */
1351static void balance_leaf_finish_node(struct tree_balance *tb, 1366static void balance_leaf_finish_node(struct tree_balance *tb,
1352 struct item_head *ih, 1367 struct item_head * const ih,
1353 const char *body, int flag) 1368 const char * const body, int flag)
1354{ 1369{
1355 /* if we must insert or append into buffer S[0] */ 1370 /* if we must insert or append into buffer S[0] */
1356 if (0 <= tb->item_pos && tb->item_pos < tb->s0num) { 1371 if (0 <= tb->item_pos && tb->item_pos < tb->s0num) {
@@ -1402,7 +1417,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,
1402 && is_indirect_le_ih(item_head(tbS0, tb->item_pos))) 1417 && is_indirect_le_ih(item_head(tbS0, tb->item_pos)))
1403 tb->pos_in_item *= UNFM_P_SIZE; 1418 tb->pos_in_item *= UNFM_P_SIZE;
1404 1419
1405 balance_leaf_left(tb, ih, body, flag); 1420 body += balance_leaf_left(tb, ih, body, flag);
1406 1421
1407 /* tb->lnum[0] > 0 */ 1422 /* tb->lnum[0] > 0 */
1408 /* Calculate new item position */ 1423 /* Calculate new item position */
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index db9e80ba53a0..751dd3f4346b 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -6,7 +6,7 @@
6#include "reiserfs.h" 6#include "reiserfs.h"
7#include "acl.h" 7#include "acl.h"
8#include "xattr.h" 8#include "xattr.h"
9#include <asm/uaccess.h> 9#include <linux/uaccess.h>
10#include <linux/pagemap.h> 10#include <linux/pagemap.h>
11#include <linux/swap.h> 11#include <linux/swap.h>
12#include <linux/writeback.h> 12#include <linux/writeback.h>
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
index 73231b1ebdbe..b751eea32e20 100644
--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -2,7 +2,7 @@
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README 2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */ 3 */
4 4
5#include <asm/uaccess.h> 5#include <linux/uaccess.h>
6#include <linux/string.h> 6#include <linux/string.h>
7#include <linux/time.h> 7#include <linux/time.h>
8#include "reiserfs.h" 8#include "reiserfs.h"
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 63b2b0ec49e6..a7eec9888f10 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -11,7 +11,7 @@
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/highmem.h> 12#include <linux/highmem.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <asm/uaccess.h> 14#include <linux/uaccess.h>
15#include <asm/unaligned.h> 15#include <asm/unaligned.h>
16#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
17#include <linux/mpage.h> 17#include <linux/mpage.h>
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 501ed6811a2b..6ec8a30a0911 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -7,7 +7,7 @@
7#include <linux/mount.h> 7#include <linux/mount.h>
8#include "reiserfs.h" 8#include "reiserfs.h"
9#include <linux/time.h> 9#include <linux/time.h>
10#include <asm/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/compat.h> 12#include <linux/compat.h>
13 13
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index cfaee912ee09..aca73dd73906 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -54,7 +54,7 @@ static void sd_print_item(struct item_head *ih, char *item)
54 } else { 54 } else {
55 struct stat_data *sd = (struct stat_data *)item; 55 struct stat_data *sd = (struct stat_data *)item;
56 56
57 printk("\t0%-6o | %6Lu | %2u | %d | %s\n", sd_v2_mode(sd), 57 printk("\t0%-6o | %6llu | %2u | %d | %s\n", sd_v2_mode(sd),
58 (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd), 58 (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd),
59 sd_v2_rdev(sd), print_time(sd_v2_mtime(sd))); 59 sd_v2_rdev(sd), print_time(sd_v2_mtime(sd)));
60 } 60 }
@@ -408,7 +408,7 @@ static void direntry_print_item(struct item_head *ih, char *item)
408 namebuf[namelen + 2] = 0; 408 namebuf[namelen + 2] = 0;
409 } 409 }
410 410
411 printk("%d: %-15s%-15d%-15d%-15Ld%-15Ld(%s)\n", 411 printk("%d: %-15s%-15d%-15d%-15lld%-15lld(%s)\n",
412 i, namebuf, 412 i, namebuf,
413 deh_dir_id(deh), deh_objectid(deh), 413 deh_dir_id(deh), deh_objectid(deh),
414 GET_HASH_VALUE(deh_offset(deh)), 414 GET_HASH_VALUE(deh_offset(deh)),
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index e8870de4627e..a88b1b3e7db3 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1947,8 +1947,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
1947 } 1947 }
1948 } 1948 }
1949 1949
1950 /* wait for all commits to finish */
1951 cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
1952 1950
1953 /* 1951 /*
1954 * We must release the write lock here because 1952 * We must release the write lock here because
@@ -1956,8 +1954,14 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
1956 */ 1954 */
1957 reiserfs_write_unlock(sb); 1955 reiserfs_write_unlock(sb);
1958 1956
1957 /*
1958 * Cancel flushing of old commits. Note that neither of these works
1959 * will be requeued because superblock is being shutdown and doesn't
1960 * have MS_ACTIVE set.
1961 */
1959 cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work); 1962 cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work);
1960 flush_workqueue(REISERFS_SB(sb)->commit_wq); 1963 /* wait for all commits to finish */
1964 cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);
1961 1965
1962 free_journal_ram(sb); 1966 free_journal_ram(sb);
1963 1967
@@ -4292,9 +4296,15 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
4292 if (flush) { 4296 if (flush) {
4293 flush_commit_list(sb, jl, 1); 4297 flush_commit_list(sb, jl, 1);
4294 flush_journal_list(sb, jl, 1); 4298 flush_journal_list(sb, jl, 1);
4295 } else if (!(jl->j_state & LIST_COMMIT_PENDING)) 4299 } else if (!(jl->j_state & LIST_COMMIT_PENDING)) {
4296 queue_delayed_work(REISERFS_SB(sb)->commit_wq, 4300 /*
4297 &journal->j_work, HZ / 10); 4301 * Avoid queueing work when sb is being shut down. Transaction
4302 * will be flushed on journal shutdown.
4303 */
4304 if (sb->s_flags & MS_ACTIVE)
4305 queue_delayed_work(REISERFS_SB(sb)->commit_wq,
4306 &journal->j_work, HZ / 10);
4307 }
4298 4308
4299 /* 4309 /*
4300 * if the next transaction has any chance of wrapping, flush 4310 * if the next transaction has any chance of wrapping, flush
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index d6744c8b24e1..249594a821e0 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -2,7 +2,7 @@
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README 2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */ 3 */
4 4
5#include <asm/uaccess.h> 5#include <linux/uaccess.h>
6#include <linux/string.h> 6#include <linux/string.h>
7#include <linux/time.h> 7#include <linux/time.h>
8#include "reiserfs.h" 8#include "reiserfs.h"
@@ -899,8 +899,9 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
899 899
900/* insert item into the leaf node in position before */ 900/* insert item into the leaf node in position before */
901void leaf_insert_into_buf(struct buffer_info *bi, int before, 901void leaf_insert_into_buf(struct buffer_info *bi, int before,
902 struct item_head *inserted_item_ih, 902 struct item_head * const inserted_item_ih,
903 const char *inserted_item_body, int zeros_number) 903 const char * const inserted_item_body,
904 int zeros_number)
904{ 905{
905 struct buffer_head *bh = bi->bi_bh; 906 struct buffer_head *bh = bi->bi_bh;
906 int nr, free_space; 907 int nr, free_space;
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index c9b47e91baf8..ae1dc841db3a 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -17,7 +17,7 @@ static char off_buf[80];
17static char *reiserfs_cpu_offset(struct cpu_key *key) 17static char *reiserfs_cpu_offset(struct cpu_key *key)
18{ 18{
19 if (cpu_key_k_type(key) == TYPE_DIRENTRY) 19 if (cpu_key_k_type(key) == TYPE_DIRENTRY)
20 sprintf(off_buf, "%Lu(%Lu)", 20 sprintf(off_buf, "%llu(%llu)",
21 (unsigned long long) 21 (unsigned long long)
22 GET_HASH_VALUE(cpu_key_k_offset(key)), 22 GET_HASH_VALUE(cpu_key_k_offset(key)),
23 (unsigned long long) 23 (unsigned long long)
@@ -34,7 +34,7 @@ static char *le_offset(struct reiserfs_key *key)
34 34
35 version = le_key_version(key); 35 version = le_key_version(key);
36 if (le_key_k_type(version, key) == TYPE_DIRENTRY) 36 if (le_key_k_type(version, key) == TYPE_DIRENTRY)
37 sprintf(off_buf, "%Lu(%Lu)", 37 sprintf(off_buf, "%llu(%llu)",
38 (unsigned long long) 38 (unsigned long long)
39 GET_HASH_VALUE(le_key_k_offset(version, key)), 39 GET_HASH_VALUE(le_key_k_offset(version, key)),
40 (unsigned long long) 40 (unsigned long long)
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 02b0b7d0f7d5..621b9f381fe1 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -11,7 +11,7 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/seq_file.h> 13#include <linux/seq_file.h>
14#include <asm/uaccess.h> 14#include <linux/uaccess.h>
15#include "reiserfs.h" 15#include "reiserfs.h"
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index bf53888c7f59..735c2c2b4536 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -3216,11 +3216,12 @@ int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes);
3216void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first, 3216void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first,
3217 int del_num, int del_bytes); 3217 int del_num, int del_bytes);
3218void leaf_insert_into_buf(struct buffer_info *bi, int before, 3218void leaf_insert_into_buf(struct buffer_info *bi, int before,
3219 struct item_head *inserted_item_ih, 3219 struct item_head * const inserted_item_ih,
3220 const char *inserted_item_body, int zeros_number); 3220 const char * const inserted_item_body,
3221void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
3222 int pos_in_item, int paste_size, const char *body,
3223 int zeros_number); 3221 int zeros_number);
3222void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
3223 int pos_in_item, int paste_size,
3224 const char * const body, int zeros_number);
3224void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, 3225void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
3225 int pos_in_item, int cut_size); 3226 int pos_in_item, int cut_size);
3226void leaf_paste_entries(struct buffer_info *bi, int item_num, int before, 3227void leaf_paste_entries(struct buffer_info *bi, int item_num, int before,
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index dd44468edc2b..24cbe013240f 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -2006,7 +2006,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
2006 &s_search_path) == POSITION_FOUND); 2006 &s_search_path) == POSITION_FOUND);
2007 2007
2008 RFALSE(file_size > ROUND_UP(new_file_size), 2008 RFALSE(file_size > ROUND_UP(new_file_size),
2009 "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d", 2009 "PAP-5680: truncate did not finish: new_file_size %lld, current %lld, oid %d",
2010 new_file_size, file_size, s_item_key.on_disk_key.k_objectid); 2010 new_file_size, file_size, s_item_key.on_disk_key.k_objectid);
2011 2011
2012update_and_out: 2012update_and_out:
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index a392cef6acc6..d46e88a33b02 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -15,7 +15,7 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/vmalloc.h> 16#include <linux/vmalloc.h>
17#include <linux/time.h> 17#include <linux/time.h>
18#include <asm/uaccess.h> 18#include <linux/uaccess.h>
19#include "reiserfs.h" 19#include "reiserfs.h"
20#include "acl.h" 20#include "acl.h"
21#include "xattr.h" 21#include "xattr.h"
@@ -100,7 +100,11 @@ void reiserfs_schedule_old_flush(struct super_block *s)
100 struct reiserfs_sb_info *sbi = REISERFS_SB(s); 100 struct reiserfs_sb_info *sbi = REISERFS_SB(s);
101 unsigned long delay; 101 unsigned long delay;
102 102
103 if (s->s_flags & MS_RDONLY) 103 /*
104 * Avoid scheduling flush when sb is being shut down. It can race
105 * with journal shutdown and free still queued delayed work.
106 */
107 if (s->s_flags & MS_RDONLY || !(s->s_flags & MS_ACTIVE))
104 return; 108 return;
105 109
106 spin_lock(&sbi->old_work_lock); 110 spin_lock(&sbi->old_work_lock);
@@ -331,7 +335,7 @@ static int finish_unfinished(struct super_block *s)
331 * not completed truncate found. New size was 335 * not completed truncate found. New size was
332 * committed together with "save" link 336 * committed together with "save" link
333 */ 337 */
334 reiserfs_info(s, "Truncating %k to %Ld ..", 338 reiserfs_info(s, "Truncating %k to %lld ..",
335 INODE_PKEY(inode), inode->i_size); 339 INODE_PKEY(inode), inode->i_size);
336 340
337 /* don't update modification time */ 341 /* don't update modification time */
@@ -1577,7 +1581,7 @@ static int read_super_block(struct super_block *s, int offset)
1577 rs = (struct reiserfs_super_block *)bh->b_data; 1581 rs = (struct reiserfs_super_block *)bh->b_data;
1578 if (sb_blocksize(rs) != s->s_blocksize) { 1582 if (sb_blocksize(rs) != s->s_blocksize) {
1579 reiserfs_warning(s, "sh-2011", "can't find a reiserfs " 1583 reiserfs_warning(s, "sh-2011", "can't find a reiserfs "
1580 "filesystem on (dev %s, block %Lu, size %lu)", 1584 "filesystem on (dev %s, block %llu, size %lu)",
1581 s->s_id, 1585 s->s_id,
1582 (unsigned long long)bh->b_blocknr, 1586 (unsigned long long)bh->b_blocknr,
1583 s->s_blocksize); 1587 s->s_blocksize);
@@ -2441,8 +2445,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
2441 struct buffer_head tmp_bh, *bh; 2445 struct buffer_head tmp_bh, *bh;
2442 2446
2443 if (!current->journal_info) { 2447 if (!current->journal_info) {
2444 printk(KERN_WARNING "reiserfs: Quota write (off=%Lu, len=%Lu)" 2448 printk(KERN_WARNING "reiserfs: Quota write (off=%llu, len=%llu) cancelled because transaction is not started.\n",
2445 " cancelled because transaction is not started.\n",
2446 (unsigned long long)off, (unsigned long long)len); 2449 (unsigned long long)off, (unsigned long long)len);
2447 return -EIO; 2450 return -EIO;
2448 } 2451 }
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index ca416d099e7d..7c36898af402 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -45,7 +45,7 @@
45#include <linux/xattr.h> 45#include <linux/xattr.h>
46#include "xattr.h" 46#include "xattr.h"
47#include "acl.h" 47#include "acl.h"
48#include <asm/uaccess.h> 48#include <linux/uaccess.h>
49#include <net/checksum.h> 49#include <net/checksum.h>
50#include <linux/stat.h> 50#include <linux/stat.h>
51#include <linux/quotaops.h> 51#include <linux/quotaops.h>
@@ -84,6 +84,7 @@ static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
84static int xattr_unlink(struct inode *dir, struct dentry *dentry) 84static int xattr_unlink(struct inode *dir, struct dentry *dentry)
85{ 85{
86 int error; 86 int error;
87
87 BUG_ON(!mutex_is_locked(&dir->i_mutex)); 88 BUG_ON(!mutex_is_locked(&dir->i_mutex));
88 89
89 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); 90 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
@@ -98,6 +99,7 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
98static int xattr_rmdir(struct inode *dir, struct dentry *dentry) 99static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
99{ 100{
100 int error; 101 int error;
102
101 BUG_ON(!mutex_is_locked(&dir->i_mutex)); 103 BUG_ON(!mutex_is_locked(&dir->i_mutex));
102 104
103 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); 105 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
@@ -117,6 +119,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
117{ 119{
118 struct dentry *privroot = REISERFS_SB(sb)->priv_root; 120 struct dentry *privroot = REISERFS_SB(sb)->priv_root;
119 struct dentry *xaroot; 121 struct dentry *xaroot;
122
120 if (!privroot->d_inode) 123 if (!privroot->d_inode)
121 return ERR_PTR(-ENODATA); 124 return ERR_PTR(-ENODATA);
122 125
@@ -127,6 +130,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
127 xaroot = ERR_PTR(-ENODATA); 130 xaroot = ERR_PTR(-ENODATA);
128 else if (!xaroot->d_inode) { 131 else if (!xaroot->d_inode) {
129 int err = -ENODATA; 132 int err = -ENODATA;
133
130 if (xattr_may_create(flags)) 134 if (xattr_may_create(flags))
131 err = xattr_mkdir(privroot->d_inode, xaroot, 0700); 135 err = xattr_mkdir(privroot->d_inode, xaroot, 0700);
132 if (err) { 136 if (err) {
@@ -157,6 +161,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
157 xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); 161 xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
158 if (!IS_ERR(xadir) && !xadir->d_inode) { 162 if (!IS_ERR(xadir) && !xadir->d_inode) {
159 int err = -ENODATA; 163 int err = -ENODATA;
164
160 if (xattr_may_create(flags)) 165 if (xattr_may_create(flags))
161 err = xattr_mkdir(xaroot->d_inode, xadir, 0700); 166 err = xattr_mkdir(xaroot->d_inode, xadir, 0700);
162 if (err) { 167 if (err) {
@@ -188,6 +193,7 @@ fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset,
188{ 193{
189 struct reiserfs_dentry_buf *dbuf = buf; 194 struct reiserfs_dentry_buf *dbuf = buf;
190 struct dentry *dentry; 195 struct dentry *dentry;
196
191 WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex)); 197 WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex));
192 198
193 if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) 199 if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
@@ -218,6 +224,7 @@ static void
218cleanup_dentry_buf(struct reiserfs_dentry_buf *buf) 224cleanup_dentry_buf(struct reiserfs_dentry_buf *buf)
219{ 225{
220 int i; 226 int i;
227
221 for (i = 0; i < buf->count; i++) 228 for (i = 0; i < buf->count; i++)
222 if (buf->dentries[i]) 229 if (buf->dentries[i])
223 dput(buf->dentries[i]); 230 dput(buf->dentries[i]);
@@ -283,11 +290,13 @@ static int reiserfs_for_each_xattr(struct inode *inode,
283 int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 290 int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 +
284 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); 291 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
285 struct reiserfs_transaction_handle th; 292 struct reiserfs_transaction_handle th;
293
286 reiserfs_write_lock(inode->i_sb); 294 reiserfs_write_lock(inode->i_sb);
287 err = journal_begin(&th, inode->i_sb, blocks); 295 err = journal_begin(&th, inode->i_sb, blocks);
288 reiserfs_write_unlock(inode->i_sb); 296 reiserfs_write_unlock(inode->i_sb);
289 if (!err) { 297 if (!err) {
290 int jerror; 298 int jerror;
299
291 mutex_lock_nested(&dir->d_parent->d_inode->i_mutex, 300 mutex_lock_nested(&dir->d_parent->d_inode->i_mutex,
292 I_MUTEX_XATTR); 301 I_MUTEX_XATTR);
293 err = action(dir, data); 302 err = action(dir, data);
@@ -340,6 +349,7 @@ static int chown_one_xattr(struct dentry *dentry, void *data)
340int reiserfs_delete_xattrs(struct inode *inode) 349int reiserfs_delete_xattrs(struct inode *inode)
341{ 350{
342 int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL); 351 int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL);
352
343 if (err) 353 if (err)
344 reiserfs_warning(inode->i_sb, "jdm-20004", 354 reiserfs_warning(inode->i_sb, "jdm-20004",
345 "Couldn't delete all xattrs (%d)\n", err); 355 "Couldn't delete all xattrs (%d)\n", err);
@@ -350,6 +360,7 @@ int reiserfs_delete_xattrs(struct inode *inode)
350int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) 360int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
351{ 361{
352 int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs); 362 int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs);
363
353 if (err) 364 if (err)
354 reiserfs_warning(inode->i_sb, "jdm-20007", 365 reiserfs_warning(inode->i_sb, "jdm-20007",
355 "Couldn't chown all xattrs (%d)\n", err); 366 "Couldn't chown all xattrs (%d)\n", err);
@@ -439,6 +450,7 @@ int reiserfs_commit_write(struct file *f, struct page *page,
439static void update_ctime(struct inode *inode) 450static void update_ctime(struct inode *inode)
440{ 451{
441 struct timespec now = current_fs_time(inode->i_sb); 452 struct timespec now = current_fs_time(inode->i_sb);
453
442 if (inode_unhashed(inode) || !inode->i_nlink || 454 if (inode_unhashed(inode) || !inode->i_nlink ||
443 timespec_equal(&inode->i_ctime, &now)) 455 timespec_equal(&inode->i_ctime, &now))
444 return; 456 return;
@@ -514,6 +526,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
514 size_t chunk; 526 size_t chunk;
515 size_t skip = 0; 527 size_t skip = 0;
516 size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1)); 528 size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1));
529
517 if (buffer_size - buffer_pos > PAGE_CACHE_SIZE) 530 if (buffer_size - buffer_pos > PAGE_CACHE_SIZE)
518 chunk = PAGE_CACHE_SIZE; 531 chunk = PAGE_CACHE_SIZE;
519 else 532 else
@@ -530,6 +543,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
530 543
531 if (file_pos == 0) { 544 if (file_pos == 0) {
532 struct reiserfs_xattr_header *rxh; 545 struct reiserfs_xattr_header *rxh;
546
533 skip = file_pos = sizeof(struct reiserfs_xattr_header); 547 skip = file_pos = sizeof(struct reiserfs_xattr_header);
534 if (chunk + skip > PAGE_CACHE_SIZE) 548 if (chunk + skip > PAGE_CACHE_SIZE)
535 chunk = PAGE_CACHE_SIZE - skip; 549 chunk = PAGE_CACHE_SIZE - skip;
@@ -659,6 +673,7 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
659 size_t chunk; 673 size_t chunk;
660 char *data; 674 char *data;
661 size_t skip = 0; 675 size_t skip = 0;
676
662 if (isize - file_pos > PAGE_CACHE_SIZE) 677 if (isize - file_pos > PAGE_CACHE_SIZE)
663 chunk = PAGE_CACHE_SIZE; 678 chunk = PAGE_CACHE_SIZE;
664 else 679 else
@@ -792,6 +807,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
792int reiserfs_removexattr(struct dentry *dentry, const char *name) 807int reiserfs_removexattr(struct dentry *dentry, const char *name)
793{ 808{
794 const struct xattr_handler *handler; 809 const struct xattr_handler *handler;
810
795 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 811 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
796 812
797 if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) 813 if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
@@ -813,9 +829,11 @@ static int listxattr_filler(void *buf, const char *name, int namelen,
813{ 829{
814 struct listxattr_buf *b = (struct listxattr_buf *)buf; 830 struct listxattr_buf *b = (struct listxattr_buf *)buf;
815 size_t size; 831 size_t size;
832
816 if (name[0] != '.' || 833 if (name[0] != '.' ||
817 (namelen != 1 && (name[1] != '.' || namelen != 2))) { 834 (namelen != 1 && (name[1] != '.' || namelen != 2))) {
818 const struct xattr_handler *handler; 835 const struct xattr_handler *handler;
836
819 handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, 837 handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
820 name); 838 name);
821 if (!handler) /* Unsupported xattr name */ 839 if (!handler) /* Unsupported xattr name */
@@ -885,6 +903,7 @@ static int create_privroot(struct dentry *dentry)
885{ 903{
886 int err; 904 int err;
887 struct inode *inode = dentry->d_parent->d_inode; 905 struct inode *inode = dentry->d_parent->d_inode;
906
888 WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); 907 WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
889 908
890 err = xattr_mkdir(inode, dentry, 0700); 909 err = xattr_mkdir(inode, dentry, 0700);
@@ -1015,6 +1034,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
1015 mutex_lock(&privroot->d_inode->i_mutex); 1034 mutex_lock(&privroot->d_inode->i_mutex);
1016 if (!REISERFS_SB(s)->xattr_root) { 1035 if (!REISERFS_SB(s)->xattr_root) {
1017 struct dentry *dentry; 1036 struct dentry *dentry;
1037
1018 dentry = lookup_one_len(XAROOT_NAME, privroot, 1038 dentry = lookup_one_len(XAROOT_NAME, privroot,
1019 strlen(XAROOT_NAME)); 1039 strlen(XAROOT_NAME));
1020 if (!IS_ERR(dentry)) 1040 if (!IS_ERR(dentry))
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 44503e293790..4b34b9dc03dd 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -9,7 +9,7 @@
9#include <linux/posix_acl_xattr.h> 9#include <linux/posix_acl_xattr.h>
10#include "xattr.h" 10#include "xattr.h"
11#include "acl.h" 11#include "acl.h"
12#include <asm/uaccess.h> 12#include <linux/uaccess.h>
13 13
14static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, 14static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th,
15 struct inode *inode, int type, 15 struct inode *inode, int type,
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 800a3cef6f62..e7f8939a4cb5 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -6,7 +6,7 @@
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include "xattr.h" 7#include "xattr.h"
8#include <linux/security.h> 8#include <linux/security.h>
9#include <asm/uaccess.h> 9#include <linux/uaccess.h>
10 10
11static int 11static int
12security_get(struct dentry *dentry, const char *name, void *buffer, size_t size, 12security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index a0035719f66b..5eeb0c48ba46 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -5,7 +5,7 @@
5#include <linux/pagemap.h> 5#include <linux/pagemap.h>
6#include <linux/xattr.h> 6#include <linux/xattr.h>
7#include "xattr.h" 7#include "xattr.h"
8#include <asm/uaccess.h> 8#include <linux/uaccess.h>
9 9
10static int 10static int
11trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size, 11trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 8667491ae7c3..e50eab046471 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -4,7 +4,7 @@
4#include <linux/pagemap.h> 4#include <linux/pagemap.h>
5#include <linux/xattr.h> 5#include <linux/xattr.h>
6#include "xattr.h" 6#include "xattr.h"
7#include <asm/uaccess.h> 7#include <linux/uaccess.h>
8 8
9static int 9static int
10user_get(struct dentry *dentry, const char *name, void *buffer, size_t size, 10user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index ef90e8bca95a..e98dd88197d5 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -56,6 +56,8 @@
56 * 2 of the Licence, or (at your option) any later version. 56 * 2 of the Licence, or (at your option) any later version.
57 */ 57 */
58 58
59#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
60
59#include <linux/module.h> 61#include <linux/module.h>
60#include <linux/string.h> 62#include <linux/string.h>
61#include <linux/fs.h> 63#include <linux/fs.h>
@@ -380,7 +382,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
380eio: 382eio:
381 ret = -EIO; 383 ret = -EIO;
382error: 384error:
383 printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos); 385 pr_err("read error for inode 0x%lx\n", pos);
384 return ERR_PTR(ret); 386 return ERR_PTR(ret);
385} 387}
386 388
@@ -390,6 +392,7 @@ error:
390static struct inode *romfs_alloc_inode(struct super_block *sb) 392static struct inode *romfs_alloc_inode(struct super_block *sb)
391{ 393{
392 struct romfs_inode_info *inode; 394 struct romfs_inode_info *inode;
395
393 inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); 396 inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
394 return inode ? &inode->vfs_inode : NULL; 397 return inode ? &inode->vfs_inode : NULL;
395} 398}
@@ -400,6 +403,7 @@ static struct inode *romfs_alloc_inode(struct super_block *sb)
400static void romfs_i_callback(struct rcu_head *head) 403static void romfs_i_callback(struct rcu_head *head)
401{ 404{
402 struct inode *inode = container_of(head, struct inode, i_rcu); 405 struct inode *inode = container_of(head, struct inode, i_rcu);
406
403 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); 407 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
404} 408}
405 409
@@ -507,15 +511,13 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
507 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 || 511 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 ||
508 img_size < ROMFH_SIZE) { 512 img_size < ROMFH_SIZE) {
509 if (!silent) 513 if (!silent)
510 printk(KERN_WARNING "VFS:" 514 pr_warn("VFS: Can't find a romfs filesystem on dev %s.\n",
511 " Can't find a romfs filesystem on dev %s.\n",
512 sb->s_id); 515 sb->s_id);
513 goto error_rsb_inval; 516 goto error_rsb_inval;
514 } 517 }
515 518
516 if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) { 519 if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) {
517 printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n", 520 pr_err("bad initial checksum on dev %s.\n", sb->s_id);
518 sb->s_id);
519 goto error_rsb_inval; 521 goto error_rsb_inval;
520 } 522 }
521 523
@@ -523,8 +525,8 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
523 525
524 len = strnlen(rsb->name, ROMFS_MAXFN); 526 len = strnlen(rsb->name, ROMFS_MAXFN);
525 if (!silent) 527 if (!silent)
526 printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n", 528 pr_notice("Mounting image '%*.*s' through %s\n",
527 (unsigned) len, (unsigned) len, rsb->name, storage); 529 (unsigned) len, (unsigned) len, rsb->name, storage);
528 530
529 kfree(rsb); 531 kfree(rsb);
530 rsb = NULL; 532 rsb = NULL;
@@ -614,7 +616,7 @@ static int __init init_romfs_fs(void)
614{ 616{
615 int ret; 617 int ret;
616 618
617 printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n"); 619 pr_info("ROMFS MTD (C) 2007 Red Hat, Inc.\n");
618 620
619 romfs_inode_cachep = 621 romfs_inode_cachep =
620 kmem_cache_create("romfs_i", 622 kmem_cache_create("romfs_i",
@@ -623,13 +625,12 @@ static int __init init_romfs_fs(void)
623 romfs_i_init_once); 625 romfs_i_init_once);
624 626
625 if (!romfs_inode_cachep) { 627 if (!romfs_inode_cachep) {
626 printk(KERN_ERR 628 pr_err("Failed to initialise inode cache\n");
627 "ROMFS error: Failed to initialise inode cache\n");
628 return -ENOMEM; 629 return -ENOMEM;
629 } 630 }
630 ret = register_filesystem(&romfs_fs_type); 631 ret = register_filesystem(&romfs_fs_type);
631 if (ret) { 632 if (ret) {
632 printk(KERN_ERR "ROMFS error: Failed to register filesystem\n"); 633 pr_err("Failed to register filesystem\n");
633 goto error_register; 634 goto error_register;
634 } 635 }
635 return 0; 636 return 0;
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 62a0de6632e1..43e7a7eddac0 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -44,7 +44,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
44 44
45 pages = end_index - start_index + 1; 45 pages = end_index - start_index + 1;
46 46
47 page = kmalloc(sizeof(void *) * pages, GFP_KERNEL); 47 page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL);
48 if (page == NULL) 48 if (page == NULL)
49 return res; 49 return res;
50 50
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 031c8d67fd51..5056babe00df 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -27,6 +27,8 @@
27 * the filesystem. 27 * the filesystem.
28 */ 28 */
29 29
30#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
31
30#include <linux/fs.h> 32#include <linux/fs.h>
31#include <linux/vfs.h> 33#include <linux/vfs.h>
32#include <linux/slab.h> 34#include <linux/slab.h>
@@ -448,8 +450,7 @@ static int __init init_squashfs_fs(void)
448 return err; 450 return err;
449 } 451 }
450 452
451 printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) " 453 pr_info("version 4.0 (2009/01/31) Phillip Lougher\n");
452 "Phillip Lougher\n");
453 454
454 return 0; 455 return 0;
455} 456}
diff --git a/fs/super.c b/fs/super.c
index d20d5b11dedf..b9a214d2fe98 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -22,7 +22,6 @@
22 22
23#include <linux/export.h> 23#include <linux/export.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/acct.h>
26#include <linux/blkdev.h> 25#include <linux/blkdev.h>
27#include <linux/mount.h> 26#include <linux/mount.h>
28#include <linux/security.h> 27#include <linux/security.h>
@@ -218,7 +217,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
218 lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); 217 lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
219 mutex_init(&s->s_dquot.dqio_mutex); 218 mutex_init(&s->s_dquot.dqio_mutex);
220 mutex_init(&s->s_dquot.dqonoff_mutex); 219 mutex_init(&s->s_dquot.dqonoff_mutex);
221 init_rwsem(&s->s_dquot.dqptr_sem);
222 s->s_maxbytes = MAX_NON_LFS; 220 s->s_maxbytes = MAX_NON_LFS;
223 s->s_op = &default_op; 221 s->s_op = &default_op;
224 s->s_time_gran = 1000000000; 222 s->s_time_gran = 1000000000;
@@ -702,12 +700,22 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
702 return -EACCES; 700 return -EACCES;
703#endif 701#endif
704 702
705 if (flags & MS_RDONLY)
706 acct_auto_close(sb);
707 shrink_dcache_sb(sb);
708
709 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); 703 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
710 704
705 if (remount_ro) {
706 if (sb->s_pins.first) {
707 up_write(&sb->s_umount);
708 sb_pin_kill(sb);
709 down_write(&sb->s_umount);
710 if (!sb->s_root)
711 return 0;
712 if (sb->s_writers.frozen != SB_UNFROZEN)
713 return -EBUSY;
714 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
715 }
716 }
717 shrink_dcache_sb(sb);
718
711 /* If we are remounting RDONLY and current sb is read/write, 719 /* If we are remounting RDONLY and current sb is read/write,
712 make sure there are no rw files opened */ 720 make sure there are no rw files opened */
713 if (remount_ro) { 721 if (remount_ro) {
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 0013142c0475..80c350216ea8 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -35,8 +35,9 @@ struct timerfd_ctx {
35 ktime_t moffs; 35 ktime_t moffs;
36 wait_queue_head_t wqh; 36 wait_queue_head_t wqh;
37 u64 ticks; 37 u64 ticks;
38 int expired;
39 int clockid; 38 int clockid;
39 short unsigned expired;
40 short unsigned settime_flags; /* to show in fdinfo */
40 struct rcu_head rcu; 41 struct rcu_head rcu;
41 struct list_head clist; 42 struct list_head clist;
42 bool might_cancel; 43 bool might_cancel;
@@ -92,7 +93,7 @@ static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm,
92 */ 93 */
93void timerfd_clock_was_set(void) 94void timerfd_clock_was_set(void)
94{ 95{
95 ktime_t moffs = ktime_get_monotonic_offset(); 96 ktime_t moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 });
96 struct timerfd_ctx *ctx; 97 struct timerfd_ctx *ctx;
97 unsigned long flags; 98 unsigned long flags;
98 99
@@ -125,7 +126,7 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx)
125{ 126{
126 if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX) 127 if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX)
127 return false; 128 return false;
128 ctx->moffs = ktime_get_monotonic_offset(); 129 ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 });
129 return true; 130 return true;
130} 131}
131 132
@@ -196,6 +197,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
196 if (timerfd_canceled(ctx)) 197 if (timerfd_canceled(ctx))
197 return -ECANCELED; 198 return -ECANCELED;
198 } 199 }
200
201 ctx->settime_flags = flags & TFD_SETTIME_FLAGS;
199 return 0; 202 return 0;
200} 203}
201 204
@@ -284,11 +287,77 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
284 return res; 287 return res;
285} 288}
286 289
290#ifdef CONFIG_PROC_FS
291static int timerfd_show(struct seq_file *m, struct file *file)
292{
293 struct timerfd_ctx *ctx = file->private_data;
294 struct itimerspec t;
295
296 spin_lock_irq(&ctx->wqh.lock);
297 t.it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
298 t.it_interval = ktime_to_timespec(ctx->tintv);
299 spin_unlock_irq(&ctx->wqh.lock);
300
301 return seq_printf(m,
302 "clockid: %d\n"
303 "ticks: %llu\n"
304 "settime flags: 0%o\n"
305 "it_value: (%llu, %llu)\n"
306 "it_interval: (%llu, %llu)\n",
307 ctx->clockid, (unsigned long long)ctx->ticks,
308 ctx->settime_flags,
309 (unsigned long long)t.it_value.tv_sec,
310 (unsigned long long)t.it_value.tv_nsec,
311 (unsigned long long)t.it_interval.tv_sec,
312 (unsigned long long)t.it_interval.tv_nsec);
313}
314#else
315#define timerfd_show NULL
316#endif
317
318#ifdef CONFIG_CHECKPOINT_RESTORE
319static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
320{
321 struct timerfd_ctx *ctx = file->private_data;
322 int ret = 0;
323
324 switch (cmd) {
325 case TFD_IOC_SET_TICKS: {
326 u64 ticks;
327
328 if (copy_from_user(&ticks, (u64 __user *)arg, sizeof(ticks)))
329 return -EFAULT;
330 if (!ticks)
331 return -EINVAL;
332
333 spin_lock_irq(&ctx->wqh.lock);
334 if (!timerfd_canceled(ctx)) {
335 ctx->ticks = ticks;
336 if (ticks)
337 wake_up_locked(&ctx->wqh);
338 } else
339 ret = -ECANCELED;
340 spin_unlock_irq(&ctx->wqh.lock);
341 break;
342 }
343 default:
344 ret = -ENOTTY;
345 break;
346 }
347
348 return ret;
349}
350#else
351#define timerfd_ioctl NULL
352#endif
353
287static const struct file_operations timerfd_fops = { 354static const struct file_operations timerfd_fops = {
288 .release = timerfd_release, 355 .release = timerfd_release,
289 .poll = timerfd_poll, 356 .poll = timerfd_poll,
290 .read = timerfd_read, 357 .read = timerfd_read,
291 .llseek = noop_llseek, 358 .llseek = noop_llseek,
359 .show_fdinfo = timerfd_show,
360 .unlocked_ioctl = timerfd_ioctl,
292}; 361};
293 362
294static int timerfd_fget(int fd, struct fd *p) 363static int timerfd_fget(int fd, struct fd *p)
@@ -336,7 +405,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
336 else 405 else
337 hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS); 406 hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS);
338 407
339 ctx->moffs = ktime_get_monotonic_offset(); 408 ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 });
340 409
341 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, 410 ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
342 O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); 411 O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index ff8229340cd5..aa13ad053b14 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -174,7 +174,6 @@ static int do_commit(struct ubifs_info *c)
174 if (err) 174 if (err)
175 goto out; 175 goto out;
176 176
177 mutex_lock(&c->mst_mutex);
178 c->mst_node->cmt_no = cpu_to_le64(c->cmt_no); 177 c->mst_node->cmt_no = cpu_to_le64(c->cmt_no);
179 c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum); 178 c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum);
180 c->mst_node->root_lnum = cpu_to_le32(zroot.lnum); 179 c->mst_node->root_lnum = cpu_to_le32(zroot.lnum);
@@ -204,7 +203,6 @@ static int do_commit(struct ubifs_info *c)
204 else 203 else
205 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS); 204 c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS);
206 err = ubifs_write_master(c); 205 err = ubifs_write_master(c);
207 mutex_unlock(&c->mst_mutex);
208 if (err) 206 if (err)
209 goto out; 207 goto out;
210 208
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 2290d5866725..fb08b0c514b6 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -431,7 +431,7 @@ void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last)
431 431
432/** 432/**
433 * wbuf_timer_callback - write-buffer timer callback function. 433 * wbuf_timer_callback - write-buffer timer callback function.
434 * @data: timer data (write-buffer descriptor) 434 * @timer: timer data (write-buffer descriptor)
435 * 435 *
436 * This function is called when the write-buffer timer expires. 436 * This function is called when the write-buffer timer expires.
437 */ 437 */
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index a902c5919e42..a47ddfc9be6b 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -240,6 +240,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
240 240
241 if (c->lhead_offs > c->leb_size - c->ref_node_alsz) { 241 if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
242 c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); 242 c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
243 ubifs_assert(c->lhead_lnum != c->ltail_lnum);
243 c->lhead_offs = 0; 244 c->lhead_offs = 0;
244 } 245 }
245 246
@@ -404,15 +405,14 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
404 /* Switch to the next log LEB */ 405 /* Switch to the next log LEB */
405 if (c->lhead_offs) { 406 if (c->lhead_offs) {
406 c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); 407 c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
408 ubifs_assert(c->lhead_lnum != c->ltail_lnum);
407 c->lhead_offs = 0; 409 c->lhead_offs = 0;
408 } 410 }
409 411
410 if (c->lhead_offs == 0) { 412 /* Must ensure next LEB has been unmapped */
411 /* Must ensure next LEB has been unmapped */ 413 err = ubifs_leb_unmap(c, c->lhead_lnum);
412 err = ubifs_leb_unmap(c, c->lhead_lnum); 414 if (err)
413 if (err) 415 goto out;
414 goto out;
415 }
416 416
417 len = ALIGN(len, c->min_io_size); 417 len = ALIGN(len, c->min_io_size);
418 dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len); 418 dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len);
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index d46b19ec1815..421bd0a80424 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1464,7 +1464,6 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
1464 return ERR_CAST(nnode); 1464 return ERR_CAST(nnode);
1465 } 1465 }
1466 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); 1466 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
1467 shft -= UBIFS_LPT_FANOUT_SHIFT;
1468 pnode = ubifs_get_pnode(c, nnode, iip); 1467 pnode = ubifs_get_pnode(c, nnode, iip);
1469 if (IS_ERR(pnode)) 1468 if (IS_ERR(pnode))
1470 return ERR_CAST(pnode); 1469 return ERR_CAST(pnode);
@@ -1604,7 +1603,6 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
1604 return ERR_CAST(nnode); 1603 return ERR_CAST(nnode);
1605 } 1604 }
1606 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); 1605 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
1607 shft -= UBIFS_LPT_FANOUT_SHIFT;
1608 pnode = ubifs_get_pnode(c, nnode, iip); 1606 pnode = ubifs_get_pnode(c, nnode, iip);
1609 if (IS_ERR(pnode)) 1607 if (IS_ERR(pnode))
1610 return ERR_CAST(pnode); 1608 return ERR_CAST(pnode);
@@ -1964,7 +1962,6 @@ again:
1964 } 1962 }
1965 } 1963 }
1966 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); 1964 iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
1967 shft -= UBIFS_LPT_FANOUT_SHIFT;
1968 pnode = scan_get_pnode(c, path + h, nnode, iip); 1965 pnode = scan_get_pnode(c, path + h, nnode, iip);
1969 if (IS_ERR(pnode)) { 1966 if (IS_ERR(pnode)) {
1970 err = PTR_ERR(pnode); 1967 err = PTR_ERR(pnode);
@@ -2198,6 +2195,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
2198 lprops->dirty); 2195 lprops->dirty);
2199 return -EINVAL; 2196 return -EINVAL;
2200 } 2197 }
2198 break;
2201 case LPROPS_FREEABLE: 2199 case LPROPS_FREEABLE:
2202 case LPROPS_FRDI_IDX: 2200 case LPROPS_FRDI_IDX:
2203 if (lprops->free + lprops->dirty != c->leb_size) { 2201 if (lprops->free + lprops->dirty != c->leb_size) {
@@ -2206,6 +2204,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
2206 lprops->dirty); 2204 lprops->dirty);
2207 return -EINVAL; 2205 return -EINVAL;
2208 } 2206 }
2207 break;
2209 } 2208 }
2210 } 2209 }
2211 return 0; 2210 return 0;
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 45d4e96a6bac..d9c02928e992 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -304,7 +304,6 @@ static int layout_cnodes(struct ubifs_info *c)
304 ubifs_assert(lnum >= c->lpt_first && 304 ubifs_assert(lnum >= c->lpt_first &&
305 lnum <= c->lpt_last); 305 lnum <= c->lpt_last);
306 } 306 }
307 done_ltab = 1;
308 c->ltab_lnum = lnum; 307 c->ltab_lnum = lnum;
309 c->ltab_offs = offs; 308 c->ltab_offs = offs;
310 offs += c->ltab_sz; 309 offs += c->ltab_sz;
@@ -514,7 +513,6 @@ static int write_cnodes(struct ubifs_info *c)
514 if (err) 513 if (err)
515 return err; 514 return err;
516 } 515 }
517 done_ltab = 1;
518 ubifs_pack_ltab(c, buf + offs, c->ltab_cmt); 516 ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
519 offs += c->ltab_sz; 517 offs += c->ltab_sz;
520 dbg_chk_lpt_sz(c, 1, c->ltab_sz); 518 dbg_chk_lpt_sz(c, 1, c->ltab_sz);
@@ -1941,6 +1939,11 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1941 pr_err("LEB %d:%d, nnode, ", 1939 pr_err("LEB %d:%d, nnode, ",
1942 lnum, offs); 1940 lnum, offs);
1943 err = ubifs_unpack_nnode(c, p, &nnode); 1941 err = ubifs_unpack_nnode(c, p, &nnode);
1942 if (err) {
1943 pr_err("failed to unpack_node, error %d\n",
1944 err);
1945 break;
1946 }
1944 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 1947 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1945 pr_cont("%d:%d", nnode.nbranch[i].lnum, 1948 pr_cont("%d:%d", nnode.nbranch[i].lnum,
1946 nnode.nbranch[i].offs); 1949 nnode.nbranch[i].offs);
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index ab83ace9910a..1a4bb9e8b3b8 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -352,10 +352,9 @@ int ubifs_read_master(struct ubifs_info *c)
352 * ubifs_write_master - write master node. 352 * ubifs_write_master - write master node.
353 * @c: UBIFS file-system description object 353 * @c: UBIFS file-system description object
354 * 354 *
355 * This function writes the master node. The caller has to take the 355 * This function writes the master node. Returns zero in case of success and a
356 * @c->mst_mutex lock before calling this function. Returns zero in case of 356 * negative error code in case of failure. The master node is written twice to
357 * success and a negative error code in case of failure. The master node is 357 * enable recovery.
358 * written twice to enable recovery.
359 */ 358 */
360int ubifs_write_master(struct ubifs_info *c) 359int ubifs_write_master(struct ubifs_info *c)
361{ 360{
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index f1c3e5a1b315..4409f486ecef 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -346,7 +346,6 @@ static int write_orph_nodes(struct ubifs_info *c, int atomic)
346 int lnum; 346 int lnum;
347 347
348 /* Unmap any unused LEBs after consolidation */ 348 /* Unmap any unused LEBs after consolidation */
349 lnum = c->ohead_lnum + 1;
350 for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) { 349 for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) {
351 err = ubifs_leb_unmap(c, lnum); 350 err = ubifs_leb_unmap(c, lnum);
352 if (err) 351 if (err)
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index c14adb2f420c..c640938f62f0 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -596,7 +596,6 @@ static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs)
596 * drop_last_node - drop the last node. 596 * drop_last_node - drop the last node.
597 * @sleb: scanned LEB information 597 * @sleb: scanned LEB information
598 * @offs: offset of dropped nodes is returned here 598 * @offs: offset of dropped nodes is returned here
599 * @grouped: non-zero if whole group of nodes have to be dropped
600 * 599 *
601 * This is a helper function for 'ubifs_recover_leb()' which drops the last 600 * This is a helper function for 'ubifs_recover_leb()' which drops the last
602 * node of the scanned LEB. 601 * node of the scanned LEB.
@@ -629,8 +628,8 @@ static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs)
629 * 628 *
630 * This function does a scan of a LEB, but caters for errors that might have 629 * This function does a scan of a LEB, but caters for errors that might have
631 * been caused by the unclean unmount from which we are attempting to recover. 630 * been caused by the unclean unmount from which we are attempting to recover.
632 * Returns %0 in case of success, %-EUCLEAN if an unrecoverable corruption is 631 * Returns the scanned information on success and a negative error code on
633 * found, and a negative error code in case of failure. 632 * failure.
634 */ 633 */
635struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, 634struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
636 int offs, void *sbuf, int jhead) 635 int offs, void *sbuf, int jhead)
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 4c37607a958e..79c6dbbc0e04 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -332,6 +332,8 @@ static int create_default_filesystem(struct ubifs_info *c)
332 cs->ch.node_type = UBIFS_CS_NODE; 332 cs->ch.node_type = UBIFS_CS_NODE;
333 err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0); 333 err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0);
334 kfree(cs); 334 kfree(cs);
335 if (err)
336 return err;
335 337
336 ubifs_msg("default file-system created"); 338 ubifs_msg("default file-system created");
337 return 0; 339 return 0;
@@ -447,7 +449,7 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
447 goto failed; 449 goto failed;
448 } 450 }
449 451
450 if (c->default_compr < 0 || c->default_compr >= UBIFS_COMPR_TYPES_CNT) { 452 if (c->default_compr >= UBIFS_COMPR_TYPES_CNT) {
451 err = 13; 453 err = 13;
452 goto failed; 454 goto failed;
453 } 455 }
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 58aa05df2bb6..89adbc4d08ac 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -131,7 +131,8 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
131 * @offs: offset to start at (usually zero) 131 * @offs: offset to start at (usually zero)
132 * @sbuf: scan buffer (must be c->leb_size) 132 * @sbuf: scan buffer (must be c->leb_size)
133 * 133 *
134 * This function returns %0 on success and a negative error code on failure. 134 * This function returns the scanned information on success and a negative error
135 * code on failure.
135 */ 136 */
136struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, 137struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
137 int offs, void *sbuf) 138 int offs, void *sbuf)
@@ -157,9 +158,10 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
157 return ERR_PTR(err); 158 return ERR_PTR(err);
158 } 159 }
159 160
160 if (err == -EBADMSG) 161 /*
161 sleb->ecc = 1; 162 * Note, we ignore integrity errors (EBASMSG) because all the nodes are
162 163 * protected by CRC checksums.
164 */
163 return sleb; 165 return sleb;
164} 166}
165 167
@@ -169,8 +171,6 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
169 * @sleb: scanning information 171 * @sleb: scanning information
170 * @lnum: logical eraseblock number 172 * @lnum: logical eraseblock number
171 * @offs: offset to start at (usually zero) 173 * @offs: offset to start at (usually zero)
172 *
173 * This function returns %0 on success and a negative error code on failure.
174 */ 174 */
175void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, 175void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
176 int lnum, int offs) 176 int lnum, int offs)
@@ -257,7 +257,7 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
257 * @quiet: print no messages 257 * @quiet: print no messages
258 * 258 *
259 * This function scans LEB number @lnum and returns complete information about 259 * This function scans LEB number @lnum and returns complete information about
260 * its contents. Returns the scaned information in case of success and, 260 * its contents. Returns the scanned information in case of success and,
261 * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case 261 * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case
262 * of failure. 262 * of failure.
263 * 263 *
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 3904c8574ef9..106bf20629ce 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -75,7 +75,7 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode)
75 return 1; 75 return 1;
76 } 76 }
77 77
78 if (ui->compr_type < 0 || ui->compr_type >= UBIFS_COMPR_TYPES_CNT) { 78 if (ui->compr_type >= UBIFS_COMPR_TYPES_CNT) {
79 ubifs_err("unknown compression type %d", ui->compr_type); 79 ubifs_err("unknown compression type %d", ui->compr_type);
80 return 2; 80 return 2;
81 } 81 }
@@ -424,19 +424,19 @@ static int ubifs_show_options(struct seq_file *s, struct dentry *root)
424 struct ubifs_info *c = root->d_sb->s_fs_info; 424 struct ubifs_info *c = root->d_sb->s_fs_info;
425 425
426 if (c->mount_opts.unmount_mode == 2) 426 if (c->mount_opts.unmount_mode == 2)
427 seq_printf(s, ",fast_unmount"); 427 seq_puts(s, ",fast_unmount");
428 else if (c->mount_opts.unmount_mode == 1) 428 else if (c->mount_opts.unmount_mode == 1)
429 seq_printf(s, ",norm_unmount"); 429 seq_puts(s, ",norm_unmount");
430 430
431 if (c->mount_opts.bulk_read == 2) 431 if (c->mount_opts.bulk_read == 2)
432 seq_printf(s, ",bulk_read"); 432 seq_puts(s, ",bulk_read");
433 else if (c->mount_opts.bulk_read == 1) 433 else if (c->mount_opts.bulk_read == 1)
434 seq_printf(s, ",no_bulk_read"); 434 seq_puts(s, ",no_bulk_read");
435 435
436 if (c->mount_opts.chk_data_crc == 2) 436 if (c->mount_opts.chk_data_crc == 2)
437 seq_printf(s, ",chk_data_crc"); 437 seq_puts(s, ",chk_data_crc");
438 else if (c->mount_opts.chk_data_crc == 1) 438 else if (c->mount_opts.chk_data_crc == 1)
439 seq_printf(s, ",no_chk_data_crc"); 439 seq_puts(s, ",no_chk_data_crc");
440 440
441 if (c->mount_opts.override_compr) { 441 if (c->mount_opts.override_compr) {
442 seq_printf(s, ",compr=%s", 442 seq_printf(s, ",compr=%s",
@@ -796,8 +796,8 @@ static int alloc_wbufs(struct ubifs_info *c)
796{ 796{
797 int i, err; 797 int i, err;
798 798
799 c->jheads = kzalloc(c->jhead_cnt * sizeof(struct ubifs_jhead), 799 c->jheads = kcalloc(c->jhead_cnt, sizeof(struct ubifs_jhead),
800 GFP_KERNEL); 800 GFP_KERNEL);
801 if (!c->jheads) 801 if (!c->jheads)
802 return -ENOMEM; 802 return -ENOMEM;
803 803
@@ -1963,7 +1963,6 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
1963 mutex_init(&c->lp_mutex); 1963 mutex_init(&c->lp_mutex);
1964 mutex_init(&c->tnc_mutex); 1964 mutex_init(&c->tnc_mutex);
1965 mutex_init(&c->log_mutex); 1965 mutex_init(&c->log_mutex);
1966 mutex_init(&c->mst_mutex);
1967 mutex_init(&c->umount_mutex); 1966 mutex_init(&c->umount_mutex);
1968 mutex_init(&c->bu_mutex); 1967 mutex_init(&c->bu_mutex);
1969 mutex_init(&c->write_reserve_mutex); 1968 mutex_init(&c->write_reserve_mutex);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 8a40cf9c02d7..6793db0754f6 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -3294,7 +3294,6 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
3294 goto out_unlock; 3294 goto out_unlock;
3295 3295
3296 if (err) { 3296 if (err) {
3297 err = -EINVAL;
3298 key = &from_key; 3297 key = &from_key;
3299 goto out_dump; 3298 goto out_dump;
3300 } 3299 }
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 3600994f8411..7a205e046776 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -389,7 +389,6 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
389 ubifs_dump_lprops(c); 389 ubifs_dump_lprops(c);
390 } 390 }
391 /* Try to commit anyway */ 391 /* Try to commit anyway */
392 err = 0;
393 break; 392 break;
394 } 393 }
395 p++; 394 p++;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index c1f71fe17cc0..c4fe900c67ab 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -314,7 +314,6 @@ struct ubifs_scan_node {
314 * @nodes_cnt: number of nodes scanned 314 * @nodes_cnt: number of nodes scanned
315 * @nodes: list of struct ubifs_scan_node 315 * @nodes: list of struct ubifs_scan_node
316 * @endpt: end point (and therefore the start of empty space) 316 * @endpt: end point (and therefore the start of empty space)
317 * @ecc: read returned -EBADMSG
318 * @buf: buffer containing entire LEB scanned 317 * @buf: buffer containing entire LEB scanned
319 */ 318 */
320struct ubifs_scan_leb { 319struct ubifs_scan_leb {
@@ -322,7 +321,6 @@ struct ubifs_scan_leb {
322 int nodes_cnt; 321 int nodes_cnt;
323 struct list_head nodes; 322 struct list_head nodes;
324 int endpt; 323 int endpt;
325 int ecc;
326 void *buf; 324 void *buf;
327}; 325};
328 326
@@ -1051,7 +1049,6 @@ struct ubifs_debug_info;
1051 * 1049 *
1052 * @mst_node: master node 1050 * @mst_node: master node
1053 * @mst_offs: offset of valid master node 1051 * @mst_offs: offset of valid master node
1054 * @mst_mutex: protects the master node area, @mst_node, and @mst_offs
1055 * 1052 *
1056 * @max_bu_buf_len: maximum bulk-read buffer length 1053 * @max_bu_buf_len: maximum bulk-read buffer length
1057 * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu 1054 * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu
@@ -1292,7 +1289,6 @@ struct ubifs_info {
1292 1289
1293 struct ubifs_mst_node *mst_node; 1290 struct ubifs_mst_node *mst_node;
1294 int mst_offs; 1291 int mst_offs;
1295 struct mutex mst_mutex;
1296 1292
1297 int max_bu_buf_len; 1293 int max_bu_buf_len;
1298 struct mutex bu_mutex; 1294 struct mutex bu_mutex;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index d80738fdf424..86c6743ec1fe 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -27,7 +27,7 @@
27 27
28#include "udfdecl.h" 28#include "udfdecl.h"
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <asm/uaccess.h> 30#include <linux/uaccess.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
32#include <linux/string.h> /* memset */ 32#include <linux/string.h> /* memset */
33#include <linux/capability.h> 33#include <linux/capability.h>
@@ -100,24 +100,6 @@ static int udf_adinicb_write_begin(struct file *file,
100 return 0; 100 return 0;
101} 101}
102 102
103static int udf_adinicb_write_end(struct file *file,
104 struct address_space *mapping,
105 loff_t pos, unsigned len, unsigned copied,
106 struct page *page, void *fsdata)
107{
108 struct inode *inode = mapping->host;
109 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
110 char *kaddr;
111 struct udf_inode_info *iinfo = UDF_I(inode);
112
113 kaddr = kmap_atomic(page);
114 memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset,
115 kaddr + offset, copied);
116 kunmap_atomic(kaddr);
117
118 return simple_write_end(file, mapping, pos, len, copied, page, fsdata);
119}
120
121static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb, 103static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb,
122 struct iov_iter *iter, 104 struct iov_iter *iter,
123 loff_t offset) 105 loff_t offset)
@@ -130,7 +112,7 @@ const struct address_space_operations udf_adinicb_aops = {
130 .readpage = udf_adinicb_readpage, 112 .readpage = udf_adinicb_readpage,
131 .writepage = udf_adinicb_writepage, 113 .writepage = udf_adinicb_writepage,
132 .write_begin = udf_adinicb_write_begin, 114 .write_begin = udf_adinicb_write_begin,
133 .write_end = udf_adinicb_write_end, 115 .write_end = simple_write_end,
134 .direct_IO = udf_adinicb_direct_IO, 116 .direct_IO = udf_adinicb_direct_IO,
135}; 117};
136 118
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 6583fe9b0645..6ad5a453af97 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -21,7 +21,7 @@
21 21
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/cdrom.h> 23#include <linux/cdrom.h>
24#include <asm/uaccess.h> 24#include <linux/uaccess.h>
25 25
26#include "udf_sb.h" 26#include "udf_sb.h"
27 27
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 3286db047a40..813da94d447b 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -63,7 +63,7 @@
63#include "udf_i.h" 63#include "udf_i.h"
64 64
65#include <linux/init.h> 65#include <linux/init.h>
66#include <asm/uaccess.h> 66#include <linux/uaccess.h>
67 67
68#define VDS_POS_PRIMARY_VOL_DESC 0 68#define VDS_POS_PRIMARY_VOL_DESC 0
69#define VDS_POS_UNALLOC_SPACE_DESC 1 69#define VDS_POS_UNALLOC_SPACE_DESC 1
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index d7c6dbe4194b..6fb7945c1e6e 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -20,7 +20,7 @@
20 */ 20 */
21 21
22#include "udfdecl.h" 22#include "udfdecl.h"
23#include <asm/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/errno.h> 24#include <linux/errno.h>
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/time.h> 26#include <linux/time.h>
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 44b815e57f94..afd470e588ff 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -412,7 +412,6 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
412 int extIndex = 0, newExtIndex = 0, hasExt = 0; 412 int extIndex = 0, newExtIndex = 0, hasExt = 0;
413 unsigned short valueCRC; 413 unsigned short valueCRC;
414 uint8_t curr; 414 uint8_t curr;
415 const uint8_t hexChar[] = "0123456789ABCDEF";
416 415
417 if (udfName[0] == '.' && 416 if (udfName[0] == '.' &&
418 (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) { 417 (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
@@ -477,10 +476,10 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
477 newIndex = 250; 476 newIndex = 250;
478 newName[newIndex++] = CRC_MARK; 477 newName[newIndex++] = CRC_MARK;
479 valueCRC = crc_itu_t(0, fidName, fidNameLen); 478 valueCRC = crc_itu_t(0, fidName, fidNameLen);
480 newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12]; 479 newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
481 newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8]; 480 newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8);
482 newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4]; 481 newName[newIndex++] = hex_asc_upper_hi(valueCRC);
483 newName[newIndex++] = hexChar[(valueCRC & 0x000f)]; 482 newName[newIndex++] = hex_asc_upper_lo(valueCRC);
484 483
485 if (hasExt) { 484 if (hasExt) {
486 newName[newIndex++] = EXT_MARK; 485 newName[newIndex++] = EXT_MARK;
diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile
index dd39980437fc..4d0e02b022b3 100644
--- a/fs/ufs/Makefile
+++ b/fs/ufs/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_UFS_FS) += ufs.o
6 6
7ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \ 7ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \
8 namei.o super.o symlink.o truncate.o util.o 8 namei.o super.o symlink.o truncate.o util.o
9ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 61e8a9b021dd..7c580c97990e 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -158,16 +158,16 @@ out:
158 158
159/** 159/**
160 * ufs_inode_getfrag() - allocate new fragment(s) 160 * ufs_inode_getfrag() - allocate new fragment(s)
161 * @inode - pointer to inode 161 * @inode: pointer to inode
162 * @fragment - number of `fragment' which hold pointer 162 * @fragment: number of `fragment' which hold pointer
163 * to new allocated fragment(s) 163 * to new allocated fragment(s)
164 * @new_fragment - number of new allocated fragment(s) 164 * @new_fragment: number of new allocated fragment(s)
165 * @required - how many fragment(s) we require 165 * @required: how many fragment(s) we require
166 * @err - we set it if something wrong 166 * @err: we set it if something wrong
167 * @phys - pointer to where we save physical number of new allocated fragments, 167 * @phys: pointer to where we save physical number of new allocated fragments,
168 * NULL if we allocate not data(indirect blocks for example). 168 * NULL if we allocate not data(indirect blocks for example).
169 * @new - we set it if we allocate new block 169 * @new: we set it if we allocate new block
170 * @locked_page - for ufs_new_fragments() 170 * @locked_page: for ufs_new_fragments()
171 */ 171 */
172static struct buffer_head * 172static struct buffer_head *
173ufs_inode_getfrag(struct inode *inode, u64 fragment, 173ufs_inode_getfrag(struct inode *inode, u64 fragment,
@@ -315,16 +315,16 @@ repeat2:
315 315
316/** 316/**
317 * ufs_inode_getblock() - allocate new block 317 * ufs_inode_getblock() - allocate new block
318 * @inode - pointer to inode 318 * @inode: pointer to inode
319 * @bh - pointer to block which hold "pointer" to new allocated block 319 * @bh: pointer to block which hold "pointer" to new allocated block
320 * @fragment - number of `fragment' which hold pointer 320 * @fragment: number of `fragment' which hold pointer
321 * to new allocated block 321 * to new allocated block
322 * @new_fragment - number of new allocated fragment 322 * @new_fragment: number of new allocated fragment
323 * (block will hold this fragment and also uspi->s_fpb-1) 323 * (block will hold this fragment and also uspi->s_fpb-1)
324 * @err - see ufs_inode_getfrag() 324 * @err: see ufs_inode_getfrag()
325 * @phys - see ufs_inode_getfrag() 325 * @phys: see ufs_inode_getfrag()
326 * @new - see ufs_inode_getfrag() 326 * @new: see ufs_inode_getfrag()
327 * @locked_page - see ufs_inode_getfrag() 327 * @locked_page: see ufs_inode_getfrag()
328 */ 328 */
329static struct buffer_head * 329static struct buffer_head *
330ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, 330ufs_inode_getblock(struct inode *inode, struct buffer_head *bh,
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index b879f1ba3439..da73801301d5 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -65,7 +65,6 @@
65 * Evgeniy Dushistov <dushistov@mail.ru>, 2007 65 * Evgeniy Dushistov <dushistov@mail.ru>, 2007
66 */ 66 */
67 67
68
69#include <linux/exportfs.h> 68#include <linux/exportfs.h>
70#include <linux/module.h> 69#include <linux/module.h>
71#include <linux/bitops.h> 70#include <linux/bitops.h>
@@ -172,73 +171,73 @@ static void ufs_print_super_stuff(struct super_block *sb,
172{ 171{
173 u32 magic = fs32_to_cpu(sb, usb3->fs_magic); 172 u32 magic = fs32_to_cpu(sb, usb3->fs_magic);
174 173
175 printk("ufs_print_super_stuff\n"); 174 pr_debug("ufs_print_super_stuff\n");
176 printk(" magic: 0x%x\n", magic); 175 pr_debug(" magic: 0x%x\n", magic);
177 if (fs32_to_cpu(sb, usb3->fs_magic) == UFS2_MAGIC) { 176 if (fs32_to_cpu(sb, usb3->fs_magic) == UFS2_MAGIC) {
178 printk(" fs_size: %llu\n", (unsigned long long) 177 pr_debug(" fs_size: %llu\n", (unsigned long long)
179 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size)); 178 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size));
180 printk(" fs_dsize: %llu\n", (unsigned long long) 179 pr_debug(" fs_dsize: %llu\n", (unsigned long long)
181 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize)); 180 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize));
182 printk(" bsize: %u\n", 181 pr_debug(" bsize: %u\n",
183 fs32_to_cpu(sb, usb1->fs_bsize)); 182 fs32_to_cpu(sb, usb1->fs_bsize));
184 printk(" fsize: %u\n", 183 pr_debug(" fsize: %u\n",
185 fs32_to_cpu(sb, usb1->fs_fsize)); 184 fs32_to_cpu(sb, usb1->fs_fsize));
186 printk(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname); 185 pr_debug(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname);
187 printk(" fs_sblockloc: %llu\n", (unsigned long long) 186 pr_debug(" fs_sblockloc: %llu\n", (unsigned long long)
188 fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc)); 187 fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc));
189 printk(" cs_ndir(No of dirs): %llu\n", (unsigned long long) 188 pr_debug(" cs_ndir(No of dirs): %llu\n", (unsigned long long)
190 fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir)); 189 fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir));
191 printk(" cs_nbfree(No of free blocks): %llu\n", 190 pr_debug(" cs_nbfree(No of free blocks): %llu\n",
192 (unsigned long long) 191 (unsigned long long)
193 fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)); 192 fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree));
194 printk(KERN_INFO" cs_nifree(Num of free inodes): %llu\n", 193 pr_info(" cs_nifree(Num of free inodes): %llu\n",
195 (unsigned long long) 194 (unsigned long long)
196 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree)); 195 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree));
197 printk(KERN_INFO" cs_nffree(Num of free frags): %llu\n", 196 pr_info(" cs_nffree(Num of free frags): %llu\n",
198 (unsigned long long) 197 (unsigned long long)
199 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree)); 198 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree));
200 printk(KERN_INFO" fs_maxsymlinklen: %u\n", 199 pr_info(" fs_maxsymlinklen: %u\n",
201 fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen)); 200 fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen));
202 } else { 201 } else {
203 printk(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); 202 pr_debug(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno));
204 printk(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); 203 pr_debug(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno));
205 printk(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno)); 204 pr_debug(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno));
206 printk(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno)); 205 pr_debug(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno));
207 printk(" cgoffset: %u\n", 206 pr_debug(" cgoffset: %u\n",
208 fs32_to_cpu(sb, usb1->fs_cgoffset)); 207 fs32_to_cpu(sb, usb1->fs_cgoffset));
209 printk(" ~cgmask: 0x%x\n", 208 pr_debug(" ~cgmask: 0x%x\n",
210 ~fs32_to_cpu(sb, usb1->fs_cgmask)); 209 ~fs32_to_cpu(sb, usb1->fs_cgmask));
211 printk(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size)); 210 pr_debug(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size));
212 printk(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize)); 211 pr_debug(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize));
213 printk(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg)); 212 pr_debug(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg));
214 printk(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); 213 pr_debug(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize));
215 printk(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); 214 pr_debug(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize));
216 printk(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag)); 215 pr_debug(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag));
217 printk(" fragshift: %u\n", 216 pr_debug(" fragshift: %u\n",
218 fs32_to_cpu(sb, usb1->fs_fragshift)); 217 fs32_to_cpu(sb, usb1->fs_fragshift));
219 printk(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask)); 218 pr_debug(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask));
220 printk(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift)); 219 pr_debug(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift));
221 printk(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize)); 220 pr_debug(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize));
222 printk(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc)); 221 pr_debug(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc));
223 printk(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg)); 222 pr_debug(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg));
224 printk(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg)); 223 pr_debug(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg));
225 printk(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg)); 224 pr_debug(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg));
226 printk(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr)); 225 pr_debug(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr));
227 printk(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize)); 226 pr_debug(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize));
228 printk(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize)); 227 pr_debug(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize));
229 printk(" fstodb: %u\n", 228 pr_debug(" fstodb: %u\n",
230 fs32_to_cpu(sb, usb1->fs_fsbtodb)); 229 fs32_to_cpu(sb, usb1->fs_fsbtodb));
231 printk(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos)); 230 pr_debug(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos));
232 printk(" ndir %u\n", 231 pr_debug(" ndir %u\n",
233 fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir)); 232 fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir));
234 printk(" nifree %u\n", 233 pr_debug(" nifree %u\n",
235 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree)); 234 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree));
236 printk(" nbfree %u\n", 235 pr_debug(" nbfree %u\n",
237 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)); 236 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree));
238 printk(" nffree %u\n", 237 pr_debug(" nffree %u\n",
239 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree)); 238 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree));
240 } 239 }
241 printk("\n"); 240 pr_debug("\n");
242} 241}
243 242
244/* 243/*
@@ -247,38 +246,38 @@ static void ufs_print_super_stuff(struct super_block *sb,
247static void ufs_print_cylinder_stuff(struct super_block *sb, 246static void ufs_print_cylinder_stuff(struct super_block *sb,
248 struct ufs_cylinder_group *cg) 247 struct ufs_cylinder_group *cg)
249{ 248{
250 printk("\nufs_print_cylinder_stuff\n"); 249 pr_debug("\nufs_print_cylinder_stuff\n");
251 printk("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group)); 250 pr_debug("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group));
252 printk(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic)); 251 pr_debug(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic));
253 printk(" time: %u\n", fs32_to_cpu(sb, cg->cg_time)); 252 pr_debug(" time: %u\n", fs32_to_cpu(sb, cg->cg_time));
254 printk(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx)); 253 pr_debug(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx));
255 printk(" ncyl: %u\n", fs16_to_cpu(sb, cg->cg_ncyl)); 254 pr_debug(" ncyl: %u\n", fs16_to_cpu(sb, cg->cg_ncyl));
256 printk(" niblk: %u\n", fs16_to_cpu(sb, cg->cg_niblk)); 255 pr_debug(" niblk: %u\n", fs16_to_cpu(sb, cg->cg_niblk));
257 printk(" ndblk: %u\n", fs32_to_cpu(sb, cg->cg_ndblk)); 256 pr_debug(" ndblk: %u\n", fs32_to_cpu(sb, cg->cg_ndblk));
258 printk(" cs_ndir: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_ndir)); 257 pr_debug(" cs_ndir: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_ndir));
259 printk(" cs_nbfree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree)); 258 pr_debug(" cs_nbfree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree));
260 printk(" cs_nifree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree)); 259 pr_debug(" cs_nifree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree));
261 printk(" cs_nffree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree)); 260 pr_debug(" cs_nffree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree));
262 printk(" rotor: %u\n", fs32_to_cpu(sb, cg->cg_rotor)); 261 pr_debug(" rotor: %u\n", fs32_to_cpu(sb, cg->cg_rotor));
263 printk(" frotor: %u\n", fs32_to_cpu(sb, cg->cg_frotor)); 262 pr_debug(" frotor: %u\n", fs32_to_cpu(sb, cg->cg_frotor));
264 printk(" irotor: %u\n", fs32_to_cpu(sb, cg->cg_irotor)); 263 pr_debug(" irotor: %u\n", fs32_to_cpu(sb, cg->cg_irotor));
265 printk(" frsum: %u, %u, %u, %u, %u, %u, %u, %u\n", 264 pr_debug(" frsum: %u, %u, %u, %u, %u, %u, %u, %u\n",
266 fs32_to_cpu(sb, cg->cg_frsum[0]), fs32_to_cpu(sb, cg->cg_frsum[1]), 265 fs32_to_cpu(sb, cg->cg_frsum[0]), fs32_to_cpu(sb, cg->cg_frsum[1]),
267 fs32_to_cpu(sb, cg->cg_frsum[2]), fs32_to_cpu(sb, cg->cg_frsum[3]), 266 fs32_to_cpu(sb, cg->cg_frsum[2]), fs32_to_cpu(sb, cg->cg_frsum[3]),
268 fs32_to_cpu(sb, cg->cg_frsum[4]), fs32_to_cpu(sb, cg->cg_frsum[5]), 267 fs32_to_cpu(sb, cg->cg_frsum[4]), fs32_to_cpu(sb, cg->cg_frsum[5]),
269 fs32_to_cpu(sb, cg->cg_frsum[6]), fs32_to_cpu(sb, cg->cg_frsum[7])); 268 fs32_to_cpu(sb, cg->cg_frsum[6]), fs32_to_cpu(sb, cg->cg_frsum[7]));
270 printk(" btotoff: %u\n", fs32_to_cpu(sb, cg->cg_btotoff)); 269 pr_debug(" btotoff: %u\n", fs32_to_cpu(sb, cg->cg_btotoff));
271 printk(" boff: %u\n", fs32_to_cpu(sb, cg->cg_boff)); 270 pr_debug(" boff: %u\n", fs32_to_cpu(sb, cg->cg_boff));
272 printk(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff)); 271 pr_debug(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff));
273 printk(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff)); 272 pr_debug(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff));
274 printk(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff)); 273 pr_debug(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff));
275 printk(" clustersumoff %u\n", 274 pr_debug(" clustersumoff %u\n",
276 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff)); 275 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff));
277 printk(" clusteroff %u\n", 276 pr_debug(" clusteroff %u\n",
278 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff)); 277 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff));
279 printk(" nclusterblks %u\n", 278 pr_debug(" nclusterblks %u\n",
280 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks)); 279 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks));
281 printk("\n"); 280 pr_debug("\n");
282} 281}
283#else 282#else
284# define ufs_print_super_stuff(sb, usb1, usb2, usb3) /**/ 283# define ufs_print_super_stuff(sb, usb1, usb2, usb3) /**/
@@ -287,13 +286,12 @@ static void ufs_print_cylinder_stuff(struct super_block *sb,
287 286
288static const struct super_operations ufs_super_ops; 287static const struct super_operations ufs_super_ops;
289 288
290static char error_buf[1024];
291
292void ufs_error (struct super_block * sb, const char * function, 289void ufs_error (struct super_block * sb, const char * function,
293 const char * fmt, ...) 290 const char * fmt, ...)
294{ 291{
295 struct ufs_sb_private_info * uspi; 292 struct ufs_sb_private_info * uspi;
296 struct ufs_super_block_first * usb1; 293 struct ufs_super_block_first * usb1;
294 struct va_format vaf;
297 va_list args; 295 va_list args;
298 296
299 uspi = UFS_SB(sb)->s_uspi; 297 uspi = UFS_SB(sb)->s_uspi;
@@ -305,20 +303,21 @@ void ufs_error (struct super_block * sb, const char * function,
305 ufs_mark_sb_dirty(sb); 303 ufs_mark_sb_dirty(sb);
306 sb->s_flags |= MS_RDONLY; 304 sb->s_flags |= MS_RDONLY;
307 } 305 }
308 va_start (args, fmt); 306 va_start(args, fmt);
309 vsnprintf (error_buf, sizeof(error_buf), fmt, args); 307 vaf.fmt = fmt;
310 va_end (args); 308 vaf.va = &args;
311 switch (UFS_SB(sb)->s_mount_opt & UFS_MOUNT_ONERROR) { 309 switch (UFS_SB(sb)->s_mount_opt & UFS_MOUNT_ONERROR) {
312 case UFS_MOUNT_ONERROR_PANIC: 310 case UFS_MOUNT_ONERROR_PANIC:
313 panic ("UFS-fs panic (device %s): %s: %s\n", 311 panic("panic (device %s): %s: %pV\n",
314 sb->s_id, function, error_buf); 312 sb->s_id, function, &vaf);
315 313
316 case UFS_MOUNT_ONERROR_LOCK: 314 case UFS_MOUNT_ONERROR_LOCK:
317 case UFS_MOUNT_ONERROR_UMOUNT: 315 case UFS_MOUNT_ONERROR_UMOUNT:
318 case UFS_MOUNT_ONERROR_REPAIR: 316 case UFS_MOUNT_ONERROR_REPAIR:
319 printk (KERN_CRIT "UFS-fs error (device %s): %s: %s\n", 317 pr_crit("error (device %s): %s: %pV\n",
320 sb->s_id, function, error_buf); 318 sb->s_id, function, &vaf);
321 } 319 }
320 va_end(args);
322} 321}
323 322
324void ufs_panic (struct super_block * sb, const char * function, 323void ufs_panic (struct super_block * sb, const char * function,
@@ -326,6 +325,7 @@ void ufs_panic (struct super_block * sb, const char * function,
326{ 325{
327 struct ufs_sb_private_info * uspi; 326 struct ufs_sb_private_info * uspi;
328 struct ufs_super_block_first * usb1; 327 struct ufs_super_block_first * usb1;
328 struct va_format vaf;
329 va_list args; 329 va_list args;
330 330
331 uspi = UFS_SB(sb)->s_uspi; 331 uspi = UFS_SB(sb)->s_uspi;
@@ -336,24 +336,27 @@ void ufs_panic (struct super_block * sb, const char * function,
336 ubh_mark_buffer_dirty(USPI_UBH(uspi)); 336 ubh_mark_buffer_dirty(USPI_UBH(uspi));
337 ufs_mark_sb_dirty(sb); 337 ufs_mark_sb_dirty(sb);
338 } 338 }
339 va_start (args, fmt); 339 va_start(args, fmt);
340 vsnprintf (error_buf, sizeof(error_buf), fmt, args); 340 vaf.fmt = fmt;
341 va_end (args); 341 vaf.va = &args;
342 sb->s_flags |= MS_RDONLY; 342 sb->s_flags |= MS_RDONLY;
343 printk (KERN_CRIT "UFS-fs panic (device %s): %s: %s\n", 343 pr_crit("panic (device %s): %s: %pV\n",
344 sb->s_id, function, error_buf); 344 sb->s_id, function, &vaf);
345 va_end(args);
345} 346}
346 347
347void ufs_warning (struct super_block * sb, const char * function, 348void ufs_warning (struct super_block * sb, const char * function,
348 const char * fmt, ...) 349 const char * fmt, ...)
349{ 350{
351 struct va_format vaf;
350 va_list args; 352 va_list args;
351 353
352 va_start (args, fmt); 354 va_start(args, fmt);
353 vsnprintf (error_buf, sizeof(error_buf), fmt, args); 355 vaf.fmt = fmt;
354 va_end (args); 356 vaf.va = &args;
355 printk (KERN_WARNING "UFS-fs warning (device %s): %s: %s\n", 357 pr_warn("(device %s): %s: %pV\n",
356 sb->s_id, function, error_buf); 358 sb->s_id, function, &vaf);
359 va_end(args);
357} 360}
358 361
359enum { 362enum {
@@ -464,14 +467,12 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
464 ufs_set_opt (*mount_options, ONERROR_UMOUNT); 467 ufs_set_opt (*mount_options, ONERROR_UMOUNT);
465 break; 468 break;
466 case Opt_onerror_repair: 469 case Opt_onerror_repair:
467 printk("UFS-fs: Unable to do repair on error, " 470 pr_err("Unable to do repair on error, will lock lock instead\n");
468 "will lock lock instead\n");
469 ufs_clear_opt (*mount_options, ONERROR); 471 ufs_clear_opt (*mount_options, ONERROR);
470 ufs_set_opt (*mount_options, ONERROR_REPAIR); 472 ufs_set_opt (*mount_options, ONERROR_REPAIR);
471 break; 473 break;
472 default: 474 default:
473 printk("UFS-fs: Invalid option: \"%s\" " 475 pr_err("Invalid option: \"%s\" or missing value\n", p);
474 "or missing value\n", p);
475 return 0; 476 return 0;
476 } 477 }
477 } 478 }
@@ -788,8 +789,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
788 789
789#ifndef CONFIG_UFS_FS_WRITE 790#ifndef CONFIG_UFS_FS_WRITE
790 if (!(sb->s_flags & MS_RDONLY)) { 791 if (!(sb->s_flags & MS_RDONLY)) {
791 printk("ufs was compiled with read-only support, " 792 pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n");
792 "can't be mounted as read-write\n");
793 return -EROFS; 793 return -EROFS;
794 } 794 }
795#endif 795#endif
@@ -812,12 +812,12 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
812 sbi->s_mount_opt = 0; 812 sbi->s_mount_opt = 0;
813 ufs_set_opt (sbi->s_mount_opt, ONERROR_LOCK); 813 ufs_set_opt (sbi->s_mount_opt, ONERROR_LOCK);
814 if (!ufs_parse_options ((char *) data, &sbi->s_mount_opt)) { 814 if (!ufs_parse_options ((char *) data, &sbi->s_mount_opt)) {
815 printk("wrong mount options\n"); 815 pr_err("wrong mount options\n");
816 goto failed; 816 goto failed;
817 } 817 }
818 if (!(sbi->s_mount_opt & UFS_MOUNT_UFSTYPE)) { 818 if (!(sbi->s_mount_opt & UFS_MOUNT_UFSTYPE)) {
819 if (!silent) 819 if (!silent)
820 printk("You didn't specify the type of your ufs filesystem\n\n" 820 pr_err("You didn't specify the type of your ufs filesystem\n\n"
821 "mount -t ufs -o ufstype=" 821 "mount -t ufs -o ufstype="
822 "sun|sunx86|44bsd|ufs2|5xbsd|old|hp|nextstep|nextstep-cd|openstep ...\n\n" 822 "sun|sunx86|44bsd|ufs2|5xbsd|old|hp|nextstep|nextstep-cd|openstep ...\n\n"
823 ">>>WARNING<<< Wrong ufstype may corrupt your filesystem, " 823 ">>>WARNING<<< Wrong ufstype may corrupt your filesystem, "
@@ -868,7 +868,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
868 break; 868 break;
869 869
870 case UFS_MOUNT_UFSTYPE_SUNOS: 870 case UFS_MOUNT_UFSTYPE_SUNOS:
871 UFSD(("ufstype=sunos\n")) 871 UFSD("ufstype=sunos\n");
872 uspi->s_fsize = block_size = 1024; 872 uspi->s_fsize = block_size = 1024;
873 uspi->s_fmask = ~(1024 - 1); 873 uspi->s_fmask = ~(1024 - 1);
874 uspi->s_fshift = 10; 874 uspi->s_fshift = 10;
@@ -900,7 +900,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
900 flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; 900 flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
901 if (!(sb->s_flags & MS_RDONLY)) { 901 if (!(sb->s_flags & MS_RDONLY)) {
902 if (!silent) 902 if (!silent)
903 printk(KERN_INFO "ufstype=old is supported read-only\n"); 903 pr_info("ufstype=old is supported read-only\n");
904 sb->s_flags |= MS_RDONLY; 904 sb->s_flags |= MS_RDONLY;
905 } 905 }
906 break; 906 break;
@@ -916,7 +916,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
916 flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; 916 flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
917 if (!(sb->s_flags & MS_RDONLY)) { 917 if (!(sb->s_flags & MS_RDONLY)) {
918 if (!silent) 918 if (!silent)
919 printk(KERN_INFO "ufstype=nextstep is supported read-only\n"); 919 pr_info("ufstype=nextstep is supported read-only\n");
920 sb->s_flags |= MS_RDONLY; 920 sb->s_flags |= MS_RDONLY;
921 } 921 }
922 break; 922 break;
@@ -932,7 +932,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
932 flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; 932 flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
933 if (!(sb->s_flags & MS_RDONLY)) { 933 if (!(sb->s_flags & MS_RDONLY)) {
934 if (!silent) 934 if (!silent)
935 printk(KERN_INFO "ufstype=nextstep-cd is supported read-only\n"); 935 pr_info("ufstype=nextstep-cd is supported read-only\n");
936 sb->s_flags |= MS_RDONLY; 936 sb->s_flags |= MS_RDONLY;
937 } 937 }
938 break; 938 break;
@@ -948,7 +948,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
948 flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; 948 flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD;
949 if (!(sb->s_flags & MS_RDONLY)) { 949 if (!(sb->s_flags & MS_RDONLY)) {
950 if (!silent) 950 if (!silent)
951 printk(KERN_INFO "ufstype=openstep is supported read-only\n"); 951 pr_info("ufstype=openstep is supported read-only\n");
952 sb->s_flags |= MS_RDONLY; 952 sb->s_flags |= MS_RDONLY;
953 } 953 }
954 break; 954 break;
@@ -963,19 +963,19 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
963 flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; 963 flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
964 if (!(sb->s_flags & MS_RDONLY)) { 964 if (!(sb->s_flags & MS_RDONLY)) {
965 if (!silent) 965 if (!silent)
966 printk(KERN_INFO "ufstype=hp is supported read-only\n"); 966 pr_info("ufstype=hp is supported read-only\n");
967 sb->s_flags |= MS_RDONLY; 967 sb->s_flags |= MS_RDONLY;
968 } 968 }
969 break; 969 break;
970 default: 970 default:
971 if (!silent) 971 if (!silent)
972 printk("unknown ufstype\n"); 972 pr_err("unknown ufstype\n");
973 goto failed; 973 goto failed;
974 } 974 }
975 975
976again: 976again:
977 if (!sb_set_blocksize(sb, block_size)) { 977 if (!sb_set_blocksize(sb, block_size)) {
978 printk(KERN_ERR "UFS: failed to set blocksize\n"); 978 pr_err("failed to set blocksize\n");
979 goto failed; 979 goto failed;
980 } 980 }
981 981
@@ -1034,7 +1034,7 @@ again:
1034 goto again; 1034 goto again;
1035 } 1035 }
1036 if (!silent) 1036 if (!silent)
1037 printk("ufs_read_super: bad magic number\n"); 1037 pr_err("%s(): bad magic number\n", __func__);
1038 goto failed; 1038 goto failed;
1039 1039
1040magic_found: 1040magic_found:
@@ -1048,33 +1048,33 @@ magic_found:
1048 uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); 1048 uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift);
1049 1049
1050 if (!is_power_of_2(uspi->s_fsize)) { 1050 if (!is_power_of_2(uspi->s_fsize)) {
1051 printk(KERN_ERR "ufs_read_super: fragment size %u is not a power of 2\n", 1051 pr_err("%s(): fragment size %u is not a power of 2\n",
1052 uspi->s_fsize); 1052 __func__, uspi->s_fsize);
1053 goto failed; 1053 goto failed;
1054 } 1054 }
1055 if (uspi->s_fsize < 512) { 1055 if (uspi->s_fsize < 512) {
1056 printk(KERN_ERR "ufs_read_super: fragment size %u is too small\n", 1056 pr_err("%s(): fragment size %u is too small\n",
1057 uspi->s_fsize); 1057 __func__, uspi->s_fsize);
1058 goto failed; 1058 goto failed;
1059 } 1059 }
1060 if (uspi->s_fsize > 4096) { 1060 if (uspi->s_fsize > 4096) {
1061 printk(KERN_ERR "ufs_read_super: fragment size %u is too large\n", 1061 pr_err("%s(): fragment size %u is too large\n",
1062 uspi->s_fsize); 1062 __func__, uspi->s_fsize);
1063 goto failed; 1063 goto failed;
1064 } 1064 }
1065 if (!is_power_of_2(uspi->s_bsize)) { 1065 if (!is_power_of_2(uspi->s_bsize)) {
1066 printk(KERN_ERR "ufs_read_super: block size %u is not a power of 2\n", 1066 pr_err("%s(): block size %u is not a power of 2\n",
1067 uspi->s_bsize); 1067 __func__, uspi->s_bsize);
1068 goto failed; 1068 goto failed;
1069 } 1069 }
1070 if (uspi->s_bsize < 4096) { 1070 if (uspi->s_bsize < 4096) {
1071 printk(KERN_ERR "ufs_read_super: block size %u is too small\n", 1071 pr_err("%s(): block size %u is too small\n",
1072 uspi->s_bsize); 1072 __func__, uspi->s_bsize);
1073 goto failed; 1073 goto failed;
1074 } 1074 }
1075 if (uspi->s_bsize / uspi->s_fsize > 8) { 1075 if (uspi->s_bsize / uspi->s_fsize > 8) {
1076 printk(KERN_ERR "ufs_read_super: too many fragments per block (%u)\n", 1076 pr_err("%s(): too many fragments per block (%u)\n",
1077 uspi->s_bsize / uspi->s_fsize); 1077 __func__, uspi->s_bsize / uspi->s_fsize);
1078 goto failed; 1078 goto failed;
1079 } 1079 }
1080 if (uspi->s_fsize != block_size || uspi->s_sbsize != super_block_size) { 1080 if (uspi->s_fsize != block_size || uspi->s_sbsize != super_block_size) {
@@ -1113,20 +1113,21 @@ magic_found:
1113 UFSD("fs is DEC OSF/1\n"); 1113 UFSD("fs is DEC OSF/1\n");
1114 break; 1114 break;
1115 case UFS_FSACTIVE: 1115 case UFS_FSACTIVE:
1116 printk("ufs_read_super: fs is active\n"); 1116 pr_err("%s(): fs is active\n", __func__);
1117 sb->s_flags |= MS_RDONLY; 1117 sb->s_flags |= MS_RDONLY;
1118 break; 1118 break;
1119 case UFS_FSBAD: 1119 case UFS_FSBAD:
1120 printk("ufs_read_super: fs is bad\n"); 1120 pr_err("%s(): fs is bad\n", __func__);
1121 sb->s_flags |= MS_RDONLY; 1121 sb->s_flags |= MS_RDONLY;
1122 break; 1122 break;
1123 default: 1123 default:
1124 printk("ufs_read_super: can't grok fs_clean 0x%x\n", usb1->fs_clean); 1124 pr_err("%s(): can't grok fs_clean 0x%x\n",
1125 __func__, usb1->fs_clean);
1125 sb->s_flags |= MS_RDONLY; 1126 sb->s_flags |= MS_RDONLY;
1126 break; 1127 break;
1127 } 1128 }
1128 } else { 1129 } else {
1129 printk("ufs_read_super: fs needs fsck\n"); 1130 pr_err("%s(): fs needs fsck\n", __func__);
1130 sb->s_flags |= MS_RDONLY; 1131 sb->s_flags |= MS_RDONLY;
1131 } 1132 }
1132 1133
@@ -1299,7 +1300,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1299 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { 1300 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
1300 new_mount_opt |= ufstype; 1301 new_mount_opt |= ufstype;
1301 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { 1302 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
1302 printk("ufstype can't be changed during remount\n"); 1303 pr_err("ufstype can't be changed during remount\n");
1303 unlock_ufs(sb); 1304 unlock_ufs(sb);
1304 return -EINVAL; 1305 return -EINVAL;
1305 } 1306 }
@@ -1328,8 +1329,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1328 * fs was mounted as ro, remounting rw 1329 * fs was mounted as ro, remounting rw
1329 */ 1330 */
1330#ifndef CONFIG_UFS_FS_WRITE 1331#ifndef CONFIG_UFS_FS_WRITE
1331 printk("ufs was compiled with read-only support, " 1332 pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n");
1332 "can't be mounted as read-write\n");
1333 unlock_ufs(sb); 1333 unlock_ufs(sb);
1334 return -EINVAL; 1334 return -EINVAL;
1335#else 1335#else
@@ -1338,12 +1338,12 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1338 ufstype != UFS_MOUNT_UFSTYPE_44BSD && 1338 ufstype != UFS_MOUNT_UFSTYPE_44BSD &&
1339 ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && 1339 ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
1340 ufstype != UFS_MOUNT_UFSTYPE_UFS2) { 1340 ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
1341 printk("this ufstype is read-only supported\n"); 1341 pr_err("this ufstype is read-only supported\n");
1342 unlock_ufs(sb); 1342 unlock_ufs(sb);
1343 return -EINVAL; 1343 return -EINVAL;
1344 } 1344 }
1345 if (!ufs_read_cylinder_structures(sb)) { 1345 if (!ufs_read_cylinder_structures(sb)) {
1346 printk("failed during remounting\n"); 1346 pr_err("failed during remounting\n");
1347 unlock_ufs(sb); 1347 unlock_ufs(sb);
1348 return -EPERM; 1348 return -EPERM;
1349 } 1349 }
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 343e6fc571e5..2a07396d5f9e 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -1,6 +1,12 @@
1#ifndef _UFS_UFS_H 1#ifndef _UFS_UFS_H
2#define _UFS_UFS_H 1 2#define _UFS_UFS_H 1
3 3
4#ifdef pr_fmt
5#undef pr_fmt
6#endif
7
8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
4#define UFS_MAX_GROUP_LOADED 8 10#define UFS_MAX_GROUP_LOADED 8
5#define UFS_CGNO_EMPTY ((unsigned)-1) 11#define UFS_CGNO_EMPTY ((unsigned)-1)
6 12
@@ -71,9 +77,9 @@ struct ufs_inode_info {
71 */ 77 */
72#ifdef CONFIG_UFS_DEBUG 78#ifdef CONFIG_UFS_DEBUG
73# define UFSD(f, a...) { \ 79# define UFSD(f, a...) { \
74 printk ("UFSD (%s, %d): %s:", \ 80 pr_debug("UFSD (%s, %d): %s:", \
75 __FILE__, __LINE__, __func__); \ 81 __FILE__, __LINE__, __func__); \
76 printk (f, ## a); \ 82 pr_debug(f, ## a); \
77 } 83 }
78#else 84#else
79# define UFSD(f, a...) /**/ 85# define UFSD(f, a...) /**/
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 399e8cec6e60..5d47b4df61ea 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,6 +1,7 @@
1config XFS_FS 1config XFS_FS
2 tristate "XFS filesystem support" 2 tristate "XFS filesystem support"
3 depends on BLOCK 3 depends on BLOCK
4 depends on (64BIT || LBDAF)
4 select EXPORTFS 5 select EXPORTFS
5 select LIBCRC32C 6 select LIBCRC32C
6 help 7 help
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c21f43506661..d61799949580 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -17,6 +17,7 @@
17# 17#
18 18
19ccflags-y += -I$(src) # needed for trace events 19ccflags-y += -I$(src) # needed for trace events
20ccflags-y += -I$(src)/libxfs
20 21
21ccflags-$(CONFIG_XFS_DEBUG) += -g 22ccflags-$(CONFIG_XFS_DEBUG) += -g
22 23
@@ -25,6 +26,39 @@ obj-$(CONFIG_XFS_FS) += xfs.o
25# this one should be compiled first, as the tracing macros can easily blow up 26# this one should be compiled first, as the tracing macros can easily blow up
26xfs-y += xfs_trace.o 27xfs-y += xfs_trace.o
27 28
29# build the libxfs code first
30xfs-y += $(addprefix libxfs/, \
31 xfs_alloc.o \
32 xfs_alloc_btree.o \
33 xfs_attr.o \
34 xfs_attr_leaf.o \
35 xfs_attr_remote.o \
36 xfs_bmap.o \
37 xfs_bmap_btree.o \
38 xfs_btree.o \
39 xfs_da_btree.o \
40 xfs_da_format.o \
41 xfs_dir2.o \
42 xfs_dir2_block.o \
43 xfs_dir2_data.o \
44 xfs_dir2_leaf.o \
45 xfs_dir2_node.o \
46 xfs_dir2_sf.o \
47 xfs_dquot_buf.o \
48 xfs_ialloc.o \
49 xfs_ialloc_btree.o \
50 xfs_inode_fork.o \
51 xfs_inode_buf.o \
52 xfs_log_rlimit.o \
53 xfs_sb.o \
54 xfs_symlink_remote.o \
55 xfs_trans_resv.o \
56 )
57# xfs_rtbitmap is shared with libxfs
58xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \
59 xfs_rtbitmap.o \
60 )
61
28# highlevel code 62# highlevel code
29xfs-y += xfs_aops.o \ 63xfs-y += xfs_aops.o \
30 xfs_attr_inactive.o \ 64 xfs_attr_inactive.o \
@@ -45,53 +79,27 @@ xfs-y += xfs_aops.o \
45 xfs_ioctl.o \ 79 xfs_ioctl.o \
46 xfs_iomap.o \ 80 xfs_iomap.o \
47 xfs_iops.o \ 81 xfs_iops.o \
82 xfs_inode.o \
48 xfs_itable.o \ 83 xfs_itable.o \
49 xfs_message.o \ 84 xfs_message.o \
50 xfs_mount.o \ 85 xfs_mount.o \
51 xfs_mru_cache.o \ 86 xfs_mru_cache.o \
52 xfs_super.o \ 87 xfs_super.o \
53 xfs_symlink.o \ 88 xfs_symlink.o \
89 xfs_sysfs.o \
54 xfs_trans.o \ 90 xfs_trans.o \
55 xfs_xattr.o \ 91 xfs_xattr.o \
56 kmem.o \ 92 kmem.o \
57 uuid.o 93 uuid.o
58 94
59# code shared with libxfs
60xfs-y += xfs_alloc.o \
61 xfs_alloc_btree.o \
62 xfs_attr.o \
63 xfs_attr_leaf.o \
64 xfs_attr_remote.o \
65 xfs_bmap.o \
66 xfs_bmap_btree.o \
67 xfs_btree.o \
68 xfs_da_btree.o \
69 xfs_da_format.o \
70 xfs_dir2.o \
71 xfs_dir2_block.o \
72 xfs_dir2_data.o \
73 xfs_dir2_leaf.o \
74 xfs_dir2_node.o \
75 xfs_dir2_sf.o \
76 xfs_dquot_buf.o \
77 xfs_ialloc.o \
78 xfs_ialloc_btree.o \
79 xfs_icreate_item.o \
80 xfs_inode.o \
81 xfs_inode_fork.o \
82 xfs_inode_buf.o \
83 xfs_log_recover.o \
84 xfs_log_rlimit.o \
85 xfs_sb.o \
86 xfs_symlink_remote.o \
87 xfs_trans_resv.o
88
89# low-level transaction/log code 95# low-level transaction/log code
90xfs-y += xfs_log.o \ 96xfs-y += xfs_log.o \
91 xfs_log_cil.o \ 97 xfs_log_cil.o \
92 xfs_buf_item.o \ 98 xfs_buf_item.o \
93 xfs_extfree_item.o \ 99 xfs_extfree_item.o \
100 xfs_icreate_item.o \
94 xfs_inode_item.o \ 101 xfs_inode_item.o \
102 xfs_log_recover.o \
95 xfs_trans_ail.o \ 103 xfs_trans_ail.o \
96 xfs_trans_buf.o \ 104 xfs_trans_buf.o \
97 xfs_trans_extfree.o \ 105 xfs_trans_extfree.o \
@@ -107,8 +115,7 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
107 xfs_quotaops.o 115 xfs_quotaops.o
108 116
109# xfs_rtbitmap is shared with libxfs 117# xfs_rtbitmap is shared with libxfs
110xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \ 118xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
111 xfs_rtbitmap.o
112 119
113xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o 120xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
114xfs-$(CONFIG_PROC_FS) += xfs_stats.o 121xfs-$(CONFIG_PROC_FS) += xfs_stats.o
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 6e247a99f5db..6e247a99f5db 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index d43813267a80..4bffffe038a1 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -483,9 +483,9 @@ xfs_agfl_read_verify(
483 return; 483 return;
484 484
485 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) 485 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
486 xfs_buf_ioerror(bp, EFSBADCRC); 486 xfs_buf_ioerror(bp, -EFSBADCRC);
487 else if (!xfs_agfl_verify(bp)) 487 else if (!xfs_agfl_verify(bp))
488 xfs_buf_ioerror(bp, EFSCORRUPTED); 488 xfs_buf_ioerror(bp, -EFSCORRUPTED);
489 489
490 if (bp->b_error) 490 if (bp->b_error)
491 xfs_verifier_error(bp); 491 xfs_verifier_error(bp);
@@ -503,7 +503,7 @@ xfs_agfl_write_verify(
503 return; 503 return;
504 504
505 if (!xfs_agfl_verify(bp)) { 505 if (!xfs_agfl_verify(bp)) {
506 xfs_buf_ioerror(bp, EFSCORRUPTED); 506 xfs_buf_ioerror(bp, -EFSCORRUPTED);
507 xfs_verifier_error(bp); 507 xfs_verifier_error(bp);
508 return; 508 return;
509 } 509 }
@@ -559,7 +559,7 @@ xfs_alloc_update_counters(
559 xfs_trans_agblocks_delta(tp, len); 559 xfs_trans_agblocks_delta(tp, len);
560 if (unlikely(be32_to_cpu(agf->agf_freeblks) > 560 if (unlikely(be32_to_cpu(agf->agf_freeblks) >
561 be32_to_cpu(agf->agf_length))) 561 be32_to_cpu(agf->agf_length)))
562 return EFSCORRUPTED; 562 return -EFSCORRUPTED;
563 563
564 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); 564 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
565 return 0; 565 return 0;
@@ -2234,11 +2234,11 @@ xfs_agf_read_verify(
2234 2234
2235 if (xfs_sb_version_hascrc(&mp->m_sb) && 2235 if (xfs_sb_version_hascrc(&mp->m_sb) &&
2236 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) 2236 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
2237 xfs_buf_ioerror(bp, EFSBADCRC); 2237 xfs_buf_ioerror(bp, -EFSBADCRC);
2238 else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp, 2238 else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
2239 XFS_ERRTAG_ALLOC_READ_AGF, 2239 XFS_ERRTAG_ALLOC_READ_AGF,
2240 XFS_RANDOM_ALLOC_READ_AGF)) 2240 XFS_RANDOM_ALLOC_READ_AGF))
2241 xfs_buf_ioerror(bp, EFSCORRUPTED); 2241 xfs_buf_ioerror(bp, -EFSCORRUPTED);
2242 2242
2243 if (bp->b_error) 2243 if (bp->b_error)
2244 xfs_verifier_error(bp); 2244 xfs_verifier_error(bp);
@@ -2252,7 +2252,7 @@ xfs_agf_write_verify(
2252 struct xfs_buf_log_item *bip = bp->b_fspriv; 2252 struct xfs_buf_log_item *bip = bp->b_fspriv;
2253 2253
2254 if (!xfs_agf_verify(mp, bp)) { 2254 if (!xfs_agf_verify(mp, bp)) {
2255 xfs_buf_ioerror(bp, EFSCORRUPTED); 2255 xfs_buf_ioerror(bp, -EFSCORRUPTED);
2256 xfs_verifier_error(bp); 2256 xfs_verifier_error(bp);
2257 return; 2257 return;
2258 } 2258 }
@@ -2601,11 +2601,11 @@ xfs_free_extent(
2601 */ 2601 */
2602 args.agno = XFS_FSB_TO_AGNO(args.mp, bno); 2602 args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
2603 if (args.agno >= args.mp->m_sb.sb_agcount) 2603 if (args.agno >= args.mp->m_sb.sb_agcount)
2604 return EFSCORRUPTED; 2604 return -EFSCORRUPTED;
2605 2605
2606 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); 2606 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
2607 if (args.agbno >= args.mp->m_sb.sb_agblocks) 2607 if (args.agbno >= args.mp->m_sb.sb_agblocks)
2608 return EFSCORRUPTED; 2608 return -EFSCORRUPTED;
2609 2609
2610 args.pag = xfs_perag_get(args.mp, args.agno); 2610 args.pag = xfs_perag_get(args.mp, args.agno);
2611 ASSERT(args.pag); 2611 ASSERT(args.pag);
@@ -2617,7 +2617,7 @@ xfs_free_extent(
2617 /* validate the extent size is legal now we have the agf locked */ 2617 /* validate the extent size is legal now we have the agf locked */
2618 if (args.agbno + len > 2618 if (args.agbno + len >
2619 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { 2619 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
2620 error = EFSCORRUPTED; 2620 error = -EFSCORRUPTED;
2621 goto error0; 2621 goto error0;
2622 } 2622 }
2623 2623
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index feacb061bab7..feacb061bab7 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 8358f1ded94d..e0e83e24d3ef 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -355,9 +355,9 @@ xfs_allocbt_read_verify(
355 struct xfs_buf *bp) 355 struct xfs_buf *bp)
356{ 356{
357 if (!xfs_btree_sblock_verify_crc(bp)) 357 if (!xfs_btree_sblock_verify_crc(bp))
358 xfs_buf_ioerror(bp, EFSBADCRC); 358 xfs_buf_ioerror(bp, -EFSBADCRC);
359 else if (!xfs_allocbt_verify(bp)) 359 else if (!xfs_allocbt_verify(bp))
360 xfs_buf_ioerror(bp, EFSCORRUPTED); 360 xfs_buf_ioerror(bp, -EFSCORRUPTED);
361 361
362 if (bp->b_error) { 362 if (bp->b_error) {
363 trace_xfs_btree_corrupt(bp, _RET_IP_); 363 trace_xfs_btree_corrupt(bp, _RET_IP_);
@@ -371,7 +371,7 @@ xfs_allocbt_write_verify(
371{ 371{
372 if (!xfs_allocbt_verify(bp)) { 372 if (!xfs_allocbt_verify(bp)) {
373 trace_xfs_btree_corrupt(bp, _RET_IP_); 373 trace_xfs_btree_corrupt(bp, _RET_IP_);
374 xfs_buf_ioerror(bp, EFSCORRUPTED); 374 xfs_buf_ioerror(bp, -EFSCORRUPTED);
375 xfs_verifier_error(bp); 375 xfs_verifier_error(bp);
376 return; 376 return;
377 } 377 }
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index 45e189e7e81c..45e189e7e81c 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index bfe36fc2cdc2..353fb425faef 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -85,7 +85,7 @@ xfs_attr_args_init(
85{ 85{
86 86
87 if (!name) 87 if (!name)
88 return EINVAL; 88 return -EINVAL;
89 89
90 memset(args, 0, sizeof(*args)); 90 memset(args, 0, sizeof(*args));
91 args->geo = dp->i_mount->m_attr_geo; 91 args->geo = dp->i_mount->m_attr_geo;
@@ -95,7 +95,7 @@ xfs_attr_args_init(
95 args->name = name; 95 args->name = name;
96 args->namelen = strlen((const char *)name); 96 args->namelen = strlen((const char *)name);
97 if (args->namelen >= MAXNAMELEN) 97 if (args->namelen >= MAXNAMELEN)
98 return EFAULT; /* match IRIX behaviour */ 98 return -EFAULT; /* match IRIX behaviour */
99 99
100 args->hashval = xfs_da_hashname(args->name, args->namelen); 100 args->hashval = xfs_da_hashname(args->name, args->namelen);
101 return 0; 101 return 0;
@@ -131,10 +131,10 @@ xfs_attr_get(
131 XFS_STATS_INC(xs_attr_get); 131 XFS_STATS_INC(xs_attr_get);
132 132
133 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 133 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
134 return EIO; 134 return -EIO;
135 135
136 if (!xfs_inode_hasattr(ip)) 136 if (!xfs_inode_hasattr(ip))
137 return ENOATTR; 137 return -ENOATTR;
138 138
139 error = xfs_attr_args_init(&args, ip, name, flags); 139 error = xfs_attr_args_init(&args, ip, name, flags);
140 if (error) 140 if (error)
@@ -145,7 +145,7 @@ xfs_attr_get(
145 145
146 lock_mode = xfs_ilock_attr_map_shared(ip); 146 lock_mode = xfs_ilock_attr_map_shared(ip);
147 if (!xfs_inode_hasattr(ip)) 147 if (!xfs_inode_hasattr(ip))
148 error = ENOATTR; 148 error = -ENOATTR;
149 else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) 149 else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
150 error = xfs_attr_shortform_getvalue(&args); 150 error = xfs_attr_shortform_getvalue(&args);
151 else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) 151 else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
@@ -155,7 +155,7 @@ xfs_attr_get(
155 xfs_iunlock(ip, lock_mode); 155 xfs_iunlock(ip, lock_mode);
156 156
157 *valuelenp = args.valuelen; 157 *valuelenp = args.valuelen;
158 return error == EEXIST ? 0 : error; 158 return error == -EEXIST ? 0 : error;
159} 159}
160 160
161/* 161/*
@@ -213,7 +213,7 @@ xfs_attr_set(
213 XFS_STATS_INC(xs_attr_set); 213 XFS_STATS_INC(xs_attr_set);
214 214
215 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 215 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
216 return EIO; 216 return -EIO;
217 217
218 error = xfs_attr_args_init(&args, dp, name, flags); 218 error = xfs_attr_args_init(&args, dp, name, flags);
219 if (error) 219 if (error)
@@ -304,7 +304,7 @@ xfs_attr_set(
304 * the inode. 304 * the inode.
305 */ 305 */
306 error = xfs_attr_shortform_addname(&args); 306 error = xfs_attr_shortform_addname(&args);
307 if (error != ENOSPC) { 307 if (error != -ENOSPC) {
308 /* 308 /*
309 * Commit the shortform mods, and we're done. 309 * Commit the shortform mods, and we're done.
310 * NOTE: this is also the error path (EEXIST, etc). 310 * NOTE: this is also the error path (EEXIST, etc).
@@ -419,10 +419,10 @@ xfs_attr_remove(
419 XFS_STATS_INC(xs_attr_remove); 419 XFS_STATS_INC(xs_attr_remove);
420 420
421 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 421 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
422 return EIO; 422 return -EIO;
423 423
424 if (!xfs_inode_hasattr(dp)) 424 if (!xfs_inode_hasattr(dp))
425 return ENOATTR; 425 return -ENOATTR;
426 426
427 error = xfs_attr_args_init(&args, dp, name, flags); 427 error = xfs_attr_args_init(&args, dp, name, flags);
428 if (error) 428 if (error)
@@ -477,7 +477,7 @@ xfs_attr_remove(
477 xfs_trans_ijoin(args.trans, dp, 0); 477 xfs_trans_ijoin(args.trans, dp, 0);
478 478
479 if (!xfs_inode_hasattr(dp)) { 479 if (!xfs_inode_hasattr(dp)) {
480 error = XFS_ERROR(ENOATTR); 480 error = -ENOATTR;
481 } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { 481 } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
482 ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); 482 ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
483 error = xfs_attr_shortform_remove(&args); 483 error = xfs_attr_shortform_remove(&args);
@@ -534,28 +534,28 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
534 trace_xfs_attr_sf_addname(args); 534 trace_xfs_attr_sf_addname(args);
535 535
536 retval = xfs_attr_shortform_lookup(args); 536 retval = xfs_attr_shortform_lookup(args);
537 if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { 537 if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
538 return(retval); 538 return retval;
539 } else if (retval == EEXIST) { 539 } else if (retval == -EEXIST) {
540 if (args->flags & ATTR_CREATE) 540 if (args->flags & ATTR_CREATE)
541 return(retval); 541 return retval;
542 retval = xfs_attr_shortform_remove(args); 542 retval = xfs_attr_shortform_remove(args);
543 ASSERT(retval == 0); 543 ASSERT(retval == 0);
544 } 544 }
545 545
546 if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || 546 if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
547 args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX) 547 args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX)
548 return(XFS_ERROR(ENOSPC)); 548 return -ENOSPC;
549 549
550 newsize = XFS_ATTR_SF_TOTSIZE(args->dp); 550 newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
551 newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); 551 newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
552 552
553 forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize); 553 forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize);
554 if (!forkoff) 554 if (!forkoff)
555 return(XFS_ERROR(ENOSPC)); 555 return -ENOSPC;
556 556
557 xfs_attr_shortform_add(args, forkoff); 557 xfs_attr_shortform_add(args, forkoff);
558 return(0); 558 return 0;
559} 559}
560 560
561 561
@@ -592,10 +592,10 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
592 * the given flags produce an error or call for an atomic rename. 592 * the given flags produce an error or call for an atomic rename.
593 */ 593 */
594 retval = xfs_attr3_leaf_lookup_int(bp, args); 594 retval = xfs_attr3_leaf_lookup_int(bp, args);
595 if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { 595 if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
596 xfs_trans_brelse(args->trans, bp); 596 xfs_trans_brelse(args->trans, bp);
597 return retval; 597 return retval;
598 } else if (retval == EEXIST) { 598 } else if (retval == -EEXIST) {
599 if (args->flags & ATTR_CREATE) { /* pure create op */ 599 if (args->flags & ATTR_CREATE) { /* pure create op */
600 xfs_trans_brelse(args->trans, bp); 600 xfs_trans_brelse(args->trans, bp);
601 return retval; 601 return retval;
@@ -626,7 +626,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
626 * if required. 626 * if required.
627 */ 627 */
628 retval = xfs_attr3_leaf_add(bp, args); 628 retval = xfs_attr3_leaf_add(bp, args);
629 if (retval == ENOSPC) { 629 if (retval == -ENOSPC) {
630 /* 630 /*
631 * Promote the attribute list to the Btree format, then 631 * Promote the attribute list to the Btree format, then
632 * Commit that transaction so that the node_addname() call 632 * Commit that transaction so that the node_addname() call
@@ -642,7 +642,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
642 ASSERT(committed); 642 ASSERT(committed);
643 args->trans = NULL; 643 args->trans = NULL;
644 xfs_bmap_cancel(args->flist); 644 xfs_bmap_cancel(args->flist);
645 return(error); 645 return error;
646 } 646 }
647 647
648 /* 648 /*
@@ -658,13 +658,13 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
658 */ 658 */
659 error = xfs_trans_roll(&args->trans, dp); 659 error = xfs_trans_roll(&args->trans, dp);
660 if (error) 660 if (error)
661 return (error); 661 return error;
662 662
663 /* 663 /*
664 * Fob the whole rest of the problem off on the Btree code. 664 * Fob the whole rest of the problem off on the Btree code.
665 */ 665 */
666 error = xfs_attr_node_addname(args); 666 error = xfs_attr_node_addname(args);
667 return(error); 667 return error;
668 } 668 }
669 669
670 /* 670 /*
@@ -673,7 +673,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
673 */ 673 */
674 error = xfs_trans_roll(&args->trans, dp); 674 error = xfs_trans_roll(&args->trans, dp);
675 if (error) 675 if (error)
676 return (error); 676 return error;
677 677
678 /* 678 /*
679 * If there was an out-of-line value, allocate the blocks we 679 * If there was an out-of-line value, allocate the blocks we
@@ -684,7 +684,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
684 if (args->rmtblkno > 0) { 684 if (args->rmtblkno > 0) {
685 error = xfs_attr_rmtval_set(args); 685 error = xfs_attr_rmtval_set(args);
686 if (error) 686 if (error)
687 return(error); 687 return error;
688 } 688 }
689 689
690 /* 690 /*
@@ -700,7 +700,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
700 */ 700 */
701 error = xfs_attr3_leaf_flipflags(args); 701 error = xfs_attr3_leaf_flipflags(args);
702 if (error) 702 if (error)
703 return(error); 703 return error;
704 704
705 /* 705 /*
706 * Dismantle the "old" attribute/value pair by removing 706 * Dismantle the "old" attribute/value pair by removing
@@ -714,7 +714,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
714 if (args->rmtblkno) { 714 if (args->rmtblkno) {
715 error = xfs_attr_rmtval_remove(args); 715 error = xfs_attr_rmtval_remove(args);
716 if (error) 716 if (error)
717 return(error); 717 return error;
718 } 718 }
719 719
720 /* 720 /*
@@ -744,7 +744,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
744 ASSERT(committed); 744 ASSERT(committed);
745 args->trans = NULL; 745 args->trans = NULL;
746 xfs_bmap_cancel(args->flist); 746 xfs_bmap_cancel(args->flist);
747 return(error); 747 return error;
748 } 748 }
749 749
750 /* 750 /*
@@ -795,7 +795,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
795 return error; 795 return error;
796 796
797 error = xfs_attr3_leaf_lookup_int(bp, args); 797 error = xfs_attr3_leaf_lookup_int(bp, args);
798 if (error == ENOATTR) { 798 if (error == -ENOATTR) {
799 xfs_trans_brelse(args->trans, bp); 799 xfs_trans_brelse(args->trans, bp);
800 return error; 800 return error;
801 } 801 }
@@ -850,7 +850,7 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
850 return error; 850 return error;
851 851
852 error = xfs_attr3_leaf_lookup_int(bp, args); 852 error = xfs_attr3_leaf_lookup_int(bp, args);
853 if (error != EEXIST) { 853 if (error != -EEXIST) {
854 xfs_trans_brelse(args->trans, bp); 854 xfs_trans_brelse(args->trans, bp);
855 return error; 855 return error;
856 } 856 }
@@ -906,9 +906,9 @@ restart:
906 goto out; 906 goto out;
907 blk = &state->path.blk[ state->path.active-1 ]; 907 blk = &state->path.blk[ state->path.active-1 ];
908 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); 908 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
909 if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { 909 if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
910 goto out; 910 goto out;
911 } else if (retval == EEXIST) { 911 } else if (retval == -EEXIST) {
912 if (args->flags & ATTR_CREATE) 912 if (args->flags & ATTR_CREATE)
913 goto out; 913 goto out;
914 914
@@ -933,7 +933,7 @@ restart:
933 } 933 }
934 934
935 retval = xfs_attr3_leaf_add(blk->bp, state->args); 935 retval = xfs_attr3_leaf_add(blk->bp, state->args);
936 if (retval == ENOSPC) { 936 if (retval == -ENOSPC) {
937 if (state->path.active == 1) { 937 if (state->path.active == 1) {
938 /* 938 /*
939 * Its really a single leaf node, but it had 939 * Its really a single leaf node, but it had
@@ -1031,7 +1031,7 @@ restart:
1031 if (args->rmtblkno > 0) { 1031 if (args->rmtblkno > 0) {
1032 error = xfs_attr_rmtval_set(args); 1032 error = xfs_attr_rmtval_set(args);
1033 if (error) 1033 if (error)
1034 return(error); 1034 return error;
1035 } 1035 }
1036 1036
1037 /* 1037 /*
@@ -1061,7 +1061,7 @@ restart:
1061 if (args->rmtblkno) { 1061 if (args->rmtblkno) {
1062 error = xfs_attr_rmtval_remove(args); 1062 error = xfs_attr_rmtval_remove(args);
1063 if (error) 1063 if (error)
1064 return(error); 1064 return error;
1065 } 1065 }
1066 1066
1067 /* 1067 /*
@@ -1134,8 +1134,8 @@ out:
1134 if (state) 1134 if (state)
1135 xfs_da_state_free(state); 1135 xfs_da_state_free(state);
1136 if (error) 1136 if (error)
1137 return(error); 1137 return error;
1138 return(retval); 1138 return retval;
1139} 1139}
1140 1140
1141/* 1141/*
@@ -1168,7 +1168,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1168 * Search to see if name exists, and get back a pointer to it. 1168 * Search to see if name exists, and get back a pointer to it.
1169 */ 1169 */
1170 error = xfs_da3_node_lookup_int(state, &retval); 1170 error = xfs_da3_node_lookup_int(state, &retval);
1171 if (error || (retval != EEXIST)) { 1171 if (error || (retval != -EEXIST)) {
1172 if (error == 0) 1172 if (error == 0)
1173 error = retval; 1173 error = retval;
1174 goto out; 1174 goto out;
@@ -1297,7 +1297,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1297 1297
1298out: 1298out:
1299 xfs_da_state_free(state); 1299 xfs_da_state_free(state);
1300 return(error); 1300 return error;
1301} 1301}
1302 1302
1303/* 1303/*
@@ -1345,7 +1345,7 @@ xfs_attr_fillstate(xfs_da_state_t *state)
1345 } 1345 }
1346 } 1346 }
1347 1347
1348 return(0); 1348 return 0;
1349} 1349}
1350 1350
1351/* 1351/*
@@ -1376,7 +1376,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1376 blk->blkno, blk->disk_blkno, 1376 blk->blkno, blk->disk_blkno,
1377 &blk->bp, XFS_ATTR_FORK); 1377 &blk->bp, XFS_ATTR_FORK);
1378 if (error) 1378 if (error)
1379 return(error); 1379 return error;
1380 } else { 1380 } else {
1381 blk->bp = NULL; 1381 blk->bp = NULL;
1382 } 1382 }
@@ -1395,13 +1395,13 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1395 blk->blkno, blk->disk_blkno, 1395 blk->blkno, blk->disk_blkno,
1396 &blk->bp, XFS_ATTR_FORK); 1396 &blk->bp, XFS_ATTR_FORK);
1397 if (error) 1397 if (error)
1398 return(error); 1398 return error;
1399 } else { 1399 } else {
1400 blk->bp = NULL; 1400 blk->bp = NULL;
1401 } 1401 }
1402 } 1402 }
1403 1403
1404 return(0); 1404 return 0;
1405} 1405}
1406 1406
1407/* 1407/*
@@ -1431,7 +1431,7 @@ xfs_attr_node_get(xfs_da_args_t *args)
1431 error = xfs_da3_node_lookup_int(state, &retval); 1431 error = xfs_da3_node_lookup_int(state, &retval);
1432 if (error) { 1432 if (error) {
1433 retval = error; 1433 retval = error;
1434 } else if (retval == EEXIST) { 1434 } else if (retval == -EEXIST) {
1435 blk = &state->path.blk[ state->path.active-1 ]; 1435 blk = &state->path.blk[ state->path.active-1 ];
1436 ASSERT(blk->bp != NULL); 1436 ASSERT(blk->bp != NULL);
1437 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); 1437 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
@@ -1455,5 +1455,5 @@ xfs_attr_node_get(xfs_da_args_t *args)
1455 } 1455 }
1456 1456
1457 xfs_da_state_free(state); 1457 xfs_da_state_free(state);
1458 return(retval); 1458 return retval;
1459} 1459}
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 28712d29e43c..b1f73dbbf3d8 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -214,7 +214,7 @@ xfs_attr3_leaf_write_verify(
214 struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr; 214 struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
215 215
216 if (!xfs_attr3_leaf_verify(bp)) { 216 if (!xfs_attr3_leaf_verify(bp)) {
217 xfs_buf_ioerror(bp, EFSCORRUPTED); 217 xfs_buf_ioerror(bp, -EFSCORRUPTED);
218 xfs_verifier_error(bp); 218 xfs_verifier_error(bp);
219 return; 219 return;
220 } 220 }
@@ -242,9 +242,9 @@ xfs_attr3_leaf_read_verify(
242 242
243 if (xfs_sb_version_hascrc(&mp->m_sb) && 243 if (xfs_sb_version_hascrc(&mp->m_sb) &&
244 !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF)) 244 !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
245 xfs_buf_ioerror(bp, EFSBADCRC); 245 xfs_buf_ioerror(bp, -EFSBADCRC);
246 else if (!xfs_attr3_leaf_verify(bp)) 246 else if (!xfs_attr3_leaf_verify(bp))
247 xfs_buf_ioerror(bp, EFSCORRUPTED); 247 xfs_buf_ioerror(bp, -EFSCORRUPTED);
248 248
249 if (bp->b_error) 249 if (bp->b_error)
250 xfs_verifier_error(bp); 250 xfs_verifier_error(bp);
@@ -547,7 +547,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
547 break; 547 break;
548 } 548 }
549 if (i == end) 549 if (i == end)
550 return(XFS_ERROR(ENOATTR)); 550 return -ENOATTR;
551 551
552 /* 552 /*
553 * Fix up the attribute fork data, covering the hole 553 * Fix up the attribute fork data, covering the hole
@@ -582,7 +582,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
582 582
583 xfs_sbversion_add_attr2(mp, args->trans); 583 xfs_sbversion_add_attr2(mp, args->trans);
584 584
585 return(0); 585 return 0;
586} 586}
587 587
588/* 588/*
@@ -611,9 +611,9 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
611 continue; 611 continue;
612 if (!xfs_attr_namesp_match(args->flags, sfe->flags)) 612 if (!xfs_attr_namesp_match(args->flags, sfe->flags))
613 continue; 613 continue;
614 return(XFS_ERROR(EEXIST)); 614 return -EEXIST;
615 } 615 }
616 return(XFS_ERROR(ENOATTR)); 616 return -ENOATTR;
617} 617}
618 618
619/* 619/*
@@ -640,18 +640,18 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
640 continue; 640 continue;
641 if (args->flags & ATTR_KERNOVAL) { 641 if (args->flags & ATTR_KERNOVAL) {
642 args->valuelen = sfe->valuelen; 642 args->valuelen = sfe->valuelen;
643 return(XFS_ERROR(EEXIST)); 643 return -EEXIST;
644 } 644 }
645 if (args->valuelen < sfe->valuelen) { 645 if (args->valuelen < sfe->valuelen) {
646 args->valuelen = sfe->valuelen; 646 args->valuelen = sfe->valuelen;
647 return(XFS_ERROR(ERANGE)); 647 return -ERANGE;
648 } 648 }
649 args->valuelen = sfe->valuelen; 649 args->valuelen = sfe->valuelen;
650 memcpy(args->value, &sfe->nameval[args->namelen], 650 memcpy(args->value, &sfe->nameval[args->namelen],
651 args->valuelen); 651 args->valuelen);
652 return(XFS_ERROR(EEXIST)); 652 return -EEXIST;
653 } 653 }
654 return(XFS_ERROR(ENOATTR)); 654 return -ENOATTR;
655} 655}
656 656
657/* 657/*
@@ -691,7 +691,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
691 * If we hit an IO error middle of the transaction inside 691 * If we hit an IO error middle of the transaction inside
692 * grow_inode(), we may have inconsistent data. Bail out. 692 * grow_inode(), we may have inconsistent data. Bail out.
693 */ 693 */
694 if (error == EIO) 694 if (error == -EIO)
695 goto out; 695 goto out;
696 xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ 696 xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
697 memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ 697 memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
@@ -730,9 +730,9 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
730 sfe->namelen); 730 sfe->namelen);
731 nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); 731 nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
732 error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ 732 error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
733 ASSERT(error == ENOATTR); 733 ASSERT(error == -ENOATTR);
734 error = xfs_attr3_leaf_add(bp, &nargs); 734 error = xfs_attr3_leaf_add(bp, &nargs);
735 ASSERT(error != ENOSPC); 735 ASSERT(error != -ENOSPC);
736 if (error) 736 if (error)
737 goto out; 737 goto out;
738 sfe = XFS_ATTR_SF_NEXTENTRY(sfe); 738 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
@@ -741,7 +741,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
741 741
742out: 742out:
743 kmem_free(tmpbuffer); 743 kmem_free(tmpbuffer);
744 return(error); 744 return error;
745} 745}
746 746
747/* 747/*
@@ -769,12 +769,12 @@ xfs_attr_shortform_allfit(
769 if (entry->flags & XFS_ATTR_INCOMPLETE) 769 if (entry->flags & XFS_ATTR_INCOMPLETE)
770 continue; /* don't copy partial entries */ 770 continue; /* don't copy partial entries */
771 if (!(entry->flags & XFS_ATTR_LOCAL)) 771 if (!(entry->flags & XFS_ATTR_LOCAL))
772 return(0); 772 return 0;
773 name_loc = xfs_attr3_leaf_name_local(leaf, i); 773 name_loc = xfs_attr3_leaf_name_local(leaf, i);
774 if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX) 774 if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
775 return(0); 775 return 0;
776 if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) 776 if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
777 return(0); 777 return 0;
778 bytes += sizeof(struct xfs_attr_sf_entry) - 1 778 bytes += sizeof(struct xfs_attr_sf_entry) - 1
779 + name_loc->namelen 779 + name_loc->namelen
780 + be16_to_cpu(name_loc->valuelen); 780 + be16_to_cpu(name_loc->valuelen);
@@ -809,7 +809,7 @@ xfs_attr3_leaf_to_shortform(
809 809
810 tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); 810 tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
811 if (!tmpbuffer) 811 if (!tmpbuffer)
812 return ENOMEM; 812 return -ENOMEM;
813 813
814 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); 814 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
815 815
@@ -1017,10 +1017,10 @@ xfs_attr3_leaf_split(
1017 ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC); 1017 ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
1018 error = xfs_da_grow_inode(state->args, &blkno); 1018 error = xfs_da_grow_inode(state->args, &blkno);
1019 if (error) 1019 if (error)
1020 return(error); 1020 return error;
1021 error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp); 1021 error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
1022 if (error) 1022 if (error)
1023 return(error); 1023 return error;
1024 newblk->blkno = blkno; 1024 newblk->blkno = blkno;
1025 newblk->magic = XFS_ATTR_LEAF_MAGIC; 1025 newblk->magic = XFS_ATTR_LEAF_MAGIC;
1026 1026
@@ -1031,7 +1031,7 @@ xfs_attr3_leaf_split(
1031 xfs_attr3_leaf_rebalance(state, oldblk, newblk); 1031 xfs_attr3_leaf_rebalance(state, oldblk, newblk);
1032 error = xfs_da3_blk_link(state, oldblk, newblk); 1032 error = xfs_da3_blk_link(state, oldblk, newblk);
1033 if (error) 1033 if (error)
1034 return(error); 1034 return error;
1035 1035
1036 /* 1036 /*
1037 * Save info on "old" attribute for "atomic rename" ops, leaf_add() 1037 * Save info on "old" attribute for "atomic rename" ops, leaf_add()
@@ -1053,7 +1053,7 @@ xfs_attr3_leaf_split(
1053 */ 1053 */
1054 oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL); 1054 oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
1055 newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL); 1055 newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
1056 return(error); 1056 return error;
1057} 1057}
1058 1058
1059/* 1059/*
@@ -1108,7 +1108,7 @@ xfs_attr3_leaf_add(
1108 * no good and we should just give up. 1108 * no good and we should just give up.
1109 */ 1109 */
1110 if (!ichdr.holes && sum < entsize) 1110 if (!ichdr.holes && sum < entsize)
1111 return XFS_ERROR(ENOSPC); 1111 return -ENOSPC;
1112 1112
1113 /* 1113 /*
1114 * Compact the entries to coalesce free space. 1114 * Compact the entries to coalesce free space.
@@ -1121,7 +1121,7 @@ xfs_attr3_leaf_add(
1121 * free region, in freemap[0]. If it is not big enough, give up. 1121 * free region, in freemap[0]. If it is not big enough, give up.
1122 */ 1122 */
1123 if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) { 1123 if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
1124 tmp = ENOSPC; 1124 tmp = -ENOSPC;
1125 goto out_log_hdr; 1125 goto out_log_hdr;
1126 } 1126 }
1127 1127
@@ -1692,7 +1692,7 @@ xfs_attr3_leaf_toosmall(
1692 ichdr.usedbytes; 1692 ichdr.usedbytes;
1693 if (bytes > (state->args->geo->blksize >> 1)) { 1693 if (bytes > (state->args->geo->blksize >> 1)) {
1694 *action = 0; /* blk over 50%, don't try to join */ 1694 *action = 0; /* blk over 50%, don't try to join */
1695 return(0); 1695 return 0;
1696 } 1696 }
1697 1697
1698 /* 1698 /*
@@ -1711,7 +1711,7 @@ xfs_attr3_leaf_toosmall(
1711 error = xfs_da3_path_shift(state, &state->altpath, forward, 1711 error = xfs_da3_path_shift(state, &state->altpath, forward,
1712 0, &retval); 1712 0, &retval);
1713 if (error) 1713 if (error)
1714 return(error); 1714 return error;
1715 if (retval) { 1715 if (retval) {
1716 *action = 0; 1716 *action = 0;
1717 } else { 1717 } else {
@@ -1740,7 +1740,7 @@ xfs_attr3_leaf_toosmall(
1740 error = xfs_attr3_leaf_read(state->args->trans, state->args->dp, 1740 error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
1741 blkno, -1, &bp); 1741 blkno, -1, &bp);
1742 if (error) 1742 if (error)
1743 return(error); 1743 return error;
1744 1744
1745 xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr); 1745 xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
1746 1746
@@ -1757,7 +1757,7 @@ xfs_attr3_leaf_toosmall(
1757 } 1757 }
1758 if (i >= 2) { 1758 if (i >= 2) {
1759 *action = 0; 1759 *action = 0;
1760 return(0); 1760 return 0;
1761 } 1761 }
1762 1762
1763 /* 1763 /*
@@ -1773,13 +1773,13 @@ xfs_attr3_leaf_toosmall(
1773 0, &retval); 1773 0, &retval);
1774 } 1774 }
1775 if (error) 1775 if (error)
1776 return(error); 1776 return error;
1777 if (retval) { 1777 if (retval) {
1778 *action = 0; 1778 *action = 0;
1779 } else { 1779 } else {
1780 *action = 1; 1780 *action = 1;
1781 } 1781 }
1782 return(0); 1782 return 0;
1783} 1783}
1784 1784
1785/* 1785/*
@@ -2123,7 +2123,7 @@ xfs_attr3_leaf_lookup_int(
2123 } 2123 }
2124 if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) { 2124 if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
2125 args->index = probe; 2125 args->index = probe;
2126 return XFS_ERROR(ENOATTR); 2126 return -ENOATTR;
2127 } 2127 }
2128 2128
2129 /* 2129 /*
@@ -2152,7 +2152,7 @@ xfs_attr3_leaf_lookup_int(
2152 if (!xfs_attr_namesp_match(args->flags, entry->flags)) 2152 if (!xfs_attr_namesp_match(args->flags, entry->flags))
2153 continue; 2153 continue;
2154 args->index = probe; 2154 args->index = probe;
2155 return XFS_ERROR(EEXIST); 2155 return -EEXIST;
2156 } else { 2156 } else {
2157 name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); 2157 name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
2158 if (name_rmt->namelen != args->namelen) 2158 if (name_rmt->namelen != args->namelen)
@@ -2168,11 +2168,11 @@ xfs_attr3_leaf_lookup_int(
2168 args->rmtblkcnt = xfs_attr3_rmt_blocks( 2168 args->rmtblkcnt = xfs_attr3_rmt_blocks(
2169 args->dp->i_mount, 2169 args->dp->i_mount,
2170 args->rmtvaluelen); 2170 args->rmtvaluelen);
2171 return XFS_ERROR(EEXIST); 2171 return -EEXIST;
2172 } 2172 }
2173 } 2173 }
2174 args->index = probe; 2174 args->index = probe;
2175 return XFS_ERROR(ENOATTR); 2175 return -ENOATTR;
2176} 2176}
2177 2177
2178/* 2178/*
@@ -2208,7 +2208,7 @@ xfs_attr3_leaf_getvalue(
2208 } 2208 }
2209 if (args->valuelen < valuelen) { 2209 if (args->valuelen < valuelen) {
2210 args->valuelen = valuelen; 2210 args->valuelen = valuelen;
2211 return XFS_ERROR(ERANGE); 2211 return -ERANGE;
2212 } 2212 }
2213 args->valuelen = valuelen; 2213 args->valuelen = valuelen;
2214 memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); 2214 memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
@@ -2226,7 +2226,7 @@ xfs_attr3_leaf_getvalue(
2226 } 2226 }
2227 if (args->valuelen < args->rmtvaluelen) { 2227 if (args->valuelen < args->rmtvaluelen) {
2228 args->valuelen = args->rmtvaluelen; 2228 args->valuelen = args->rmtvaluelen;
2229 return XFS_ERROR(ERANGE); 2229 return -ERANGE;
2230 } 2230 }
2231 args->valuelen = args->rmtvaluelen; 2231 args->valuelen = args->rmtvaluelen;
2232 } 2232 }
@@ -2481,7 +2481,7 @@ xfs_attr3_leaf_clearflag(
2481 */ 2481 */
2482 error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); 2482 error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
2483 if (error) 2483 if (error)
2484 return(error); 2484 return error;
2485 2485
2486 leaf = bp->b_addr; 2486 leaf = bp->b_addr;
2487 entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; 2487 entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
@@ -2548,7 +2548,7 @@ xfs_attr3_leaf_setflag(
2548 */ 2548 */
2549 error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); 2549 error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
2550 if (error) 2550 if (error)
2551 return(error); 2551 return error;
2552 2552
2553 leaf = bp->b_addr; 2553 leaf = bp->b_addr;
2554#ifdef DEBUG 2554#ifdef DEBUG
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index e2929da7c3ba..e2929da7c3ba 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index b5adfecbb8ee..7510ab8058a4 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -138,11 +138,11 @@ xfs_attr3_rmt_read_verify(
138 138
139 while (len > 0) { 139 while (len > 0) {
140 if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) { 140 if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) {
141 xfs_buf_ioerror(bp, EFSBADCRC); 141 xfs_buf_ioerror(bp, -EFSBADCRC);
142 break; 142 break;
143 } 143 }
144 if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { 144 if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
145 xfs_buf_ioerror(bp, EFSCORRUPTED); 145 xfs_buf_ioerror(bp, -EFSCORRUPTED);
146 break; 146 break;
147 } 147 }
148 len -= blksize; 148 len -= blksize;
@@ -178,7 +178,7 @@ xfs_attr3_rmt_write_verify(
178 178
179 while (len > 0) { 179 while (len > 0) {
180 if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { 180 if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
181 xfs_buf_ioerror(bp, EFSCORRUPTED); 181 xfs_buf_ioerror(bp, -EFSCORRUPTED);
182 xfs_verifier_error(bp); 182 xfs_verifier_error(bp);
183 return; 183 return;
184 } 184 }
@@ -257,7 +257,7 @@ xfs_attr_rmtval_copyout(
257 xfs_alert(mp, 257 xfs_alert(mp,
258"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", 258"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
259 bno, *offset, byte_cnt, ino); 259 bno, *offset, byte_cnt, ino);
260 return EFSCORRUPTED; 260 return -EFSCORRUPTED;
261 } 261 }
262 hdr_size = sizeof(struct xfs_attr3_rmt_hdr); 262 hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
263 } 263 }
@@ -452,7 +452,7 @@ xfs_attr_rmtval_set(
452 ASSERT(committed); 452 ASSERT(committed);
453 args->trans = NULL; 453 args->trans = NULL;
454 xfs_bmap_cancel(args->flist); 454 xfs_bmap_cancel(args->flist);
455 return(error); 455 return error;
456 } 456 }
457 457
458 /* 458 /*
@@ -473,7 +473,7 @@ xfs_attr_rmtval_set(
473 */ 473 */
474 error = xfs_trans_roll(&args->trans, dp); 474 error = xfs_trans_roll(&args->trans, dp);
475 if (error) 475 if (error)
476 return (error); 476 return error;
477 } 477 }
478 478
479 /* 479 /*
@@ -498,7 +498,7 @@ xfs_attr_rmtval_set(
498 blkcnt, &map, &nmap, 498 blkcnt, &map, &nmap,
499 XFS_BMAPI_ATTRFORK); 499 XFS_BMAPI_ATTRFORK);
500 if (error) 500 if (error)
501 return(error); 501 return error;
502 ASSERT(nmap == 1); 502 ASSERT(nmap == 1);
503 ASSERT((map.br_startblock != DELAYSTARTBLOCK) && 503 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
504 (map.br_startblock != HOLESTARTBLOCK)); 504 (map.br_startblock != HOLESTARTBLOCK));
@@ -508,7 +508,7 @@ xfs_attr_rmtval_set(
508 508
509 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0); 509 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
510 if (!bp) 510 if (!bp)
511 return ENOMEM; 511 return -ENOMEM;
512 bp->b_ops = &xfs_attr3_rmt_buf_ops; 512 bp->b_ops = &xfs_attr3_rmt_buf_ops;
513 513
514 xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, 514 xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
@@ -563,7 +563,7 @@ xfs_attr_rmtval_remove(
563 error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, 563 error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
564 blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); 564 blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
565 if (error) 565 if (error)
566 return(error); 566 return error;
567 ASSERT(nmap == 1); 567 ASSERT(nmap == 1);
568 ASSERT((map.br_startblock != DELAYSTARTBLOCK) && 568 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
569 (map.br_startblock != HOLESTARTBLOCK)); 569 (map.br_startblock != HOLESTARTBLOCK));
@@ -622,7 +622,7 @@ xfs_attr_rmtval_remove(
622 */ 622 */
623 error = xfs_trans_roll(&args->trans, args->dp); 623 error = xfs_trans_roll(&args->trans, args->dp);
624 if (error) 624 if (error)
625 return (error); 625 return error;
626 } 626 }
627 return(0); 627 return 0;
628} 628}
diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
index 5a9acfa156d7..5a9acfa156d7 100644
--- a/fs/xfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index 919756e3ba53..919756e3ba53 100644
--- a/fs/xfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h
index e1649c0d3e02..e1649c0d3e02 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/libxfs/xfs_bit.h
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 75c3fe5f3d9d..de2d26d32844 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -392,7 +392,7 @@ xfs_bmap_check_leaf_extents(
392 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); 392 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
393 bno = be64_to_cpu(*pp); 393 bno = be64_to_cpu(*pp);
394 394
395 ASSERT(bno != NULLDFSBNO); 395 ASSERT(bno != NULLFSBLOCK);
396 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); 396 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
397 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); 397 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
398 398
@@ -1033,7 +1033,7 @@ xfs_bmap_add_attrfork_btree(
1033 goto error0; 1033 goto error0;
1034 if (stat == 0) { 1034 if (stat == 0) {
1035 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1035 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1036 return XFS_ERROR(ENOSPC); 1036 return -ENOSPC;
1037 } 1037 }
1038 *firstblock = cur->bc_private.b.firstblock; 1038 *firstblock = cur->bc_private.b.firstblock;
1039 cur->bc_private.b.allocated = 0; 1039 cur->bc_private.b.allocated = 0;
@@ -1115,7 +1115,7 @@ xfs_bmap_add_attrfork_local(
1115 1115
1116 /* should only be called for types that support local format data */ 1116 /* should only be called for types that support local format data */
1117 ASSERT(0); 1117 ASSERT(0);
1118 return EFSCORRUPTED; 1118 return -EFSCORRUPTED;
1119} 1119}
1120 1120
1121/* 1121/*
@@ -1192,7 +1192,7 @@ xfs_bmap_add_attrfork(
1192 break; 1192 break;
1193 default: 1193 default:
1194 ASSERT(0); 1194 ASSERT(0);
1195 error = XFS_ERROR(EINVAL); 1195 error = -EINVAL;
1196 goto trans_cancel; 1196 goto trans_cancel;
1197 } 1197 }
1198 1198
@@ -1299,7 +1299,7 @@ xfs_bmap_read_extents(
1299 ASSERT(level > 0); 1299 ASSERT(level > 0);
1300 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); 1300 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
1301 bno = be64_to_cpu(*pp); 1301 bno = be64_to_cpu(*pp);
1302 ASSERT(bno != NULLDFSBNO); 1302 ASSERT(bno != NULLFSBLOCK);
1303 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); 1303 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
1304 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); 1304 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
1305 /* 1305 /*
@@ -1399,7 +1399,7 @@ xfs_bmap_read_extents(
1399 return 0; 1399 return 0;
1400error0: 1400error0:
1401 xfs_trans_brelse(tp, bp); 1401 xfs_trans_brelse(tp, bp);
1402 return XFS_ERROR(EFSCORRUPTED); 1402 return -EFSCORRUPTED;
1403} 1403}
1404 1404
1405 1405
@@ -1429,11 +1429,7 @@ xfs_bmap_search_multi_extents(
1429 gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; 1429 gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
1430 gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; 1430 gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
1431 gotp->br_state = XFS_EXT_INVALID; 1431 gotp->br_state = XFS_EXT_INVALID;
1432#if XFS_BIG_BLKNOS
1433 gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; 1432 gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
1434#else
1435 gotp->br_startblock = 0xffffa5a5;
1436#endif
1437 prevp->br_startoff = NULLFILEOFF; 1433 prevp->br_startoff = NULLFILEOFF;
1438 1434
1439 ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); 1435 ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
@@ -1576,7 +1572,7 @@ xfs_bmap_last_before(
1576 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && 1572 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
1577 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && 1573 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
1578 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) 1574 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
1579 return XFS_ERROR(EIO); 1575 return -EIO;
1580 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { 1576 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
1581 *last_block = 0; 1577 *last_block = 0;
1582 return 0; 1578 return 0;
@@ -1690,7 +1686,7 @@ xfs_bmap_last_offset(
1690 1686
1691 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && 1687 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
1692 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) 1688 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
1693 return XFS_ERROR(EIO); 1689 return -EIO;
1694 1690
1695 error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); 1691 error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
1696 if (error || is_empty) 1692 if (error || is_empty)
@@ -3323,7 +3319,7 @@ xfs_bmap_extsize_align(
3323 if (orig_off < align_off || 3319 if (orig_off < align_off ||
3324 orig_end > align_off + align_alen || 3320 orig_end > align_off + align_alen ||
3325 align_alen - temp < orig_alen) 3321 align_alen - temp < orig_alen)
3326 return XFS_ERROR(EINVAL); 3322 return -EINVAL;
3327 /* 3323 /*
3328 * Try to fix it by moving the start up. 3324 * Try to fix it by moving the start up.
3329 */ 3325 */
@@ -3348,7 +3344,7 @@ xfs_bmap_extsize_align(
3348 * Result doesn't cover the request, fail it. 3344 * Result doesn't cover the request, fail it.
3349 */ 3345 */
3350 if (orig_off < align_off || orig_end > align_off + align_alen) 3346 if (orig_off < align_off || orig_end > align_off + align_alen)
3351 return XFS_ERROR(EINVAL); 3347 return -EINVAL;
3352 } else { 3348 } else {
3353 ASSERT(orig_off >= align_off); 3349 ASSERT(orig_off >= align_off);
3354 ASSERT(orig_end <= align_off + align_alen); 3350 ASSERT(orig_end <= align_off + align_alen);
@@ -4051,11 +4047,11 @@ xfs_bmapi_read(
4051 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), 4047 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
4052 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { 4048 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4053 XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp); 4049 XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
4054 return XFS_ERROR(EFSCORRUPTED); 4050 return -EFSCORRUPTED;
4055 } 4051 }
4056 4052
4057 if (XFS_FORCED_SHUTDOWN(mp)) 4053 if (XFS_FORCED_SHUTDOWN(mp))
4058 return XFS_ERROR(EIO); 4054 return -EIO;
4059 4055
4060 XFS_STATS_INC(xs_blk_mapr); 4056 XFS_STATS_INC(xs_blk_mapr);
4061 4057
@@ -4246,11 +4242,11 @@ xfs_bmapi_delay(
4246 XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), 4242 XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
4247 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { 4243 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4248 XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp); 4244 XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
4249 return XFS_ERROR(EFSCORRUPTED); 4245 return -EFSCORRUPTED;
4250 } 4246 }
4251 4247
4252 if (XFS_FORCED_SHUTDOWN(mp)) 4248 if (XFS_FORCED_SHUTDOWN(mp))
4253 return XFS_ERROR(EIO); 4249 return -EIO;
4254 4250
4255 XFS_STATS_INC(xs_blk_mapw); 4251 XFS_STATS_INC(xs_blk_mapw);
4256 4252
@@ -4469,7 +4465,7 @@ xfs_bmapi_convert_unwritten(
4469 * so generate another request. 4465 * so generate another request.
4470 */ 4466 */
4471 if (mval->br_blockcount < len) 4467 if (mval->br_blockcount < len)
4472 return EAGAIN; 4468 return -EAGAIN;
4473 return 0; 4469 return 0;
4474} 4470}
4475 4471
@@ -4540,11 +4536,11 @@ xfs_bmapi_write(
4540 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), 4536 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
4541 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { 4537 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4542 XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); 4538 XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
4543 return XFS_ERROR(EFSCORRUPTED); 4539 return -EFSCORRUPTED;
4544 } 4540 }
4545 4541
4546 if (XFS_FORCED_SHUTDOWN(mp)) 4542 if (XFS_FORCED_SHUTDOWN(mp))
4547 return XFS_ERROR(EIO); 4543 return -EIO;
4548 4544
4549 ifp = XFS_IFORK_PTR(ip, whichfork); 4545 ifp = XFS_IFORK_PTR(ip, whichfork);
4550 4546
@@ -4620,7 +4616,7 @@ xfs_bmapi_write(
4620 4616
4621 /* Execute unwritten extent conversion if necessary */ 4617 /* Execute unwritten extent conversion if necessary */
4622 error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags); 4618 error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
4623 if (error == EAGAIN) 4619 if (error == -EAGAIN)
4624 continue; 4620 continue;
4625 if (error) 4621 if (error)
4626 goto error0; 4622 goto error0;
@@ -4922,7 +4918,7 @@ xfs_bmap_del_extent(
4922 goto done; 4918 goto done;
4923 cur->bc_rec.b = new; 4919 cur->bc_rec.b = new;
4924 error = xfs_btree_insert(cur, &i); 4920 error = xfs_btree_insert(cur, &i);
4925 if (error && error != ENOSPC) 4921 if (error && error != -ENOSPC)
4926 goto done; 4922 goto done;
4927 /* 4923 /*
4928 * If get no-space back from btree insert, 4924 * If get no-space back from btree insert,
@@ -4930,7 +4926,7 @@ xfs_bmap_del_extent(
4930 * block reservation. 4926 * block reservation.
4931 * Fix up our state and return the error. 4927 * Fix up our state and return the error.
4932 */ 4928 */
4933 if (error == ENOSPC) { 4929 if (error == -ENOSPC) {
4934 /* 4930 /*
4935 * Reset the cursor, don't trust 4931 * Reset the cursor, don't trust
4936 * it after any insert operation. 4932 * it after any insert operation.
@@ -4958,7 +4954,7 @@ xfs_bmap_del_extent(
4958 xfs_bmbt_set_blockcount(ep, 4954 xfs_bmbt_set_blockcount(ep,
4959 got.br_blockcount); 4955 got.br_blockcount);
4960 flags = 0; 4956 flags = 0;
4961 error = XFS_ERROR(ENOSPC); 4957 error = -ENOSPC;
4962 goto done; 4958 goto done;
4963 } 4959 }
4964 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 4960 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
@@ -5076,11 +5072,11 @@ xfs_bunmapi(
5076 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { 5072 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
5077 XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW, 5073 XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW,
5078 ip->i_mount); 5074 ip->i_mount);
5079 return XFS_ERROR(EFSCORRUPTED); 5075 return -EFSCORRUPTED;
5080 } 5076 }
5081 mp = ip->i_mount; 5077 mp = ip->i_mount;
5082 if (XFS_FORCED_SHUTDOWN(mp)) 5078 if (XFS_FORCED_SHUTDOWN(mp))
5083 return XFS_ERROR(EIO); 5079 return -EIO;
5084 5080
5085 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 5081 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
5086 ASSERT(len > 0); 5082 ASSERT(len > 0);
@@ -5325,7 +5321,7 @@ xfs_bunmapi(
5325 del.br_startoff > got.br_startoff && 5321 del.br_startoff > got.br_startoff &&
5326 del.br_startoff + del.br_blockcount < 5322 del.br_startoff + del.br_blockcount <
5327 got.br_startoff + got.br_blockcount) { 5323 got.br_startoff + got.br_blockcount) {
5328 error = XFS_ERROR(ENOSPC); 5324 error = -ENOSPC;
5329 goto error0; 5325 goto error0;
5330 } 5326 }
5331 error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, 5327 error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
@@ -5449,11 +5445,11 @@ xfs_bmap_shift_extents(
5449 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { 5445 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
5450 XFS_ERROR_REPORT("xfs_bmap_shift_extents", 5446 XFS_ERROR_REPORT("xfs_bmap_shift_extents",
5451 XFS_ERRLEVEL_LOW, mp); 5447 XFS_ERRLEVEL_LOW, mp);
5452 return XFS_ERROR(EFSCORRUPTED); 5448 return -EFSCORRUPTED;
5453 } 5449 }
5454 5450
5455 if (XFS_FORCED_SHUTDOWN(mp)) 5451 if (XFS_FORCED_SHUTDOWN(mp))
5456 return XFS_ERROR(EIO); 5452 return -EIO;
5457 5453
5458 ASSERT(current_ext != NULL); 5454 ASSERT(current_ext != NULL);
5459 5455
@@ -5516,14 +5512,14 @@ xfs_bmap_shift_extents(
5516 *current_ext - 1), &left); 5512 *current_ext - 1), &left);
5517 5513
5518 if (startoff < left.br_startoff + left.br_blockcount) 5514 if (startoff < left.br_startoff + left.br_blockcount)
5519 error = XFS_ERROR(EINVAL); 5515 error = -EINVAL;
5520 } else if (offset_shift_fsb > got.br_startoff) { 5516 } else if (offset_shift_fsb > got.br_startoff) {
5521 /* 5517 /*
5522 * When first extent is shifted, offset_shift_fsb 5518 * When first extent is shifted, offset_shift_fsb
5523 * should be less than the stating offset of 5519 * should be less than the stating offset of
5524 * the first extent. 5520 * the first extent.
5525 */ 5521 */
5526 error = XFS_ERROR(EINVAL); 5522 error = -EINVAL;
5527 } 5523 }
5528 5524
5529 if (error) 5525 if (error)
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index b879ca56a64c..b879ca56a64c 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 948836c4fd90..fba753308f31 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -111,23 +111,8 @@ __xfs_bmbt_get_all(
111 ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN)); 111 ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
112 s->br_startoff = ((xfs_fileoff_t)l0 & 112 s->br_startoff = ((xfs_fileoff_t)l0 &
113 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; 113 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
114#if XFS_BIG_BLKNOS
115 s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) | 114 s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
116 (((xfs_fsblock_t)l1) >> 21); 115 (((xfs_fsblock_t)l1) >> 21);
117#else
118#ifdef DEBUG
119 {
120 xfs_dfsbno_t b;
121
122 b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
123 (((xfs_dfsbno_t)l1) >> 21);
124 ASSERT((b >> 32) == 0 || isnulldstartblock(b));
125 s->br_startblock = (xfs_fsblock_t)b;
126 }
127#else /* !DEBUG */
128 s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
129#endif /* DEBUG */
130#endif /* XFS_BIG_BLKNOS */
131 s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21)); 116 s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
132 /* This is xfs_extent_state() in-line */ 117 /* This is xfs_extent_state() in-line */
133 if (ext_flag) { 118 if (ext_flag) {
@@ -163,21 +148,8 @@ xfs_fsblock_t
163xfs_bmbt_get_startblock( 148xfs_bmbt_get_startblock(
164 xfs_bmbt_rec_host_t *r) 149 xfs_bmbt_rec_host_t *r)
165{ 150{
166#if XFS_BIG_BLKNOS
167 return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) | 151 return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
168 (((xfs_fsblock_t)r->l1) >> 21); 152 (((xfs_fsblock_t)r->l1) >> 21);
169#else
170#ifdef DEBUG
171 xfs_dfsbno_t b;
172
173 b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
174 (((xfs_dfsbno_t)r->l1) >> 21);
175 ASSERT((b >> 32) == 0 || isnulldstartblock(b));
176 return (xfs_fsblock_t)b;
177#else /* !DEBUG */
178 return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
179#endif /* DEBUG */
180#endif /* XFS_BIG_BLKNOS */
181} 153}
182 154
183/* 155/*
@@ -241,7 +213,6 @@ xfs_bmbt_set_allf(
241 ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); 213 ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
242 ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); 214 ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
243 215
244#if XFS_BIG_BLKNOS
245 ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); 216 ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
246 217
247 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | 218 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
@@ -250,23 +221,6 @@ xfs_bmbt_set_allf(
250 r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | 221 r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
251 ((xfs_bmbt_rec_base_t)blockcount & 222 ((xfs_bmbt_rec_base_t)blockcount &
252 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); 223 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
253#else /* !XFS_BIG_BLKNOS */
254 if (isnullstartblock(startblock)) {
255 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
256 ((xfs_bmbt_rec_base_t)startoff << 9) |
257 (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
258 r->l1 = xfs_mask64hi(11) |
259 ((xfs_bmbt_rec_base_t)startblock << 21) |
260 ((xfs_bmbt_rec_base_t)blockcount &
261 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
262 } else {
263 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
264 ((xfs_bmbt_rec_base_t)startoff << 9);
265 r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
266 ((xfs_bmbt_rec_base_t)blockcount &
267 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
268 }
269#endif /* XFS_BIG_BLKNOS */
270} 224}
271 225
272/* 226/*
@@ -298,8 +252,6 @@ xfs_bmbt_disk_set_allf(
298 ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); 252 ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
299 ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); 253 ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
300 ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); 254 ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
301
302#if XFS_BIG_BLKNOS
303 ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); 255 ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
304 256
305 r->l0 = cpu_to_be64( 257 r->l0 = cpu_to_be64(
@@ -310,26 +262,6 @@ xfs_bmbt_disk_set_allf(
310 ((xfs_bmbt_rec_base_t)startblock << 21) | 262 ((xfs_bmbt_rec_base_t)startblock << 21) |
311 ((xfs_bmbt_rec_base_t)blockcount & 263 ((xfs_bmbt_rec_base_t)blockcount &
312 (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); 264 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
313#else /* !XFS_BIG_BLKNOS */
314 if (isnullstartblock(startblock)) {
315 r->l0 = cpu_to_be64(
316 ((xfs_bmbt_rec_base_t)extent_flag << 63) |
317 ((xfs_bmbt_rec_base_t)startoff << 9) |
318 (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
319 r->l1 = cpu_to_be64(xfs_mask64hi(11) |
320 ((xfs_bmbt_rec_base_t)startblock << 21) |
321 ((xfs_bmbt_rec_base_t)blockcount &
322 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
323 } else {
324 r->l0 = cpu_to_be64(
325 ((xfs_bmbt_rec_base_t)extent_flag << 63) |
326 ((xfs_bmbt_rec_base_t)startoff << 9));
327 r->l1 = cpu_to_be64(
328 ((xfs_bmbt_rec_base_t)startblock << 21) |
329 ((xfs_bmbt_rec_base_t)blockcount &
330 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
331 }
332#endif /* XFS_BIG_BLKNOS */
333} 265}
334 266
335/* 267/*
@@ -365,24 +297,11 @@ xfs_bmbt_set_startblock(
365 xfs_bmbt_rec_host_t *r, 297 xfs_bmbt_rec_host_t *r,
366 xfs_fsblock_t v) 298 xfs_fsblock_t v)
367{ 299{
368#if XFS_BIG_BLKNOS
369 ASSERT((v & xfs_mask64hi(12)) == 0); 300 ASSERT((v & xfs_mask64hi(12)) == 0);
370 r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) | 301 r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
371 (xfs_bmbt_rec_base_t)(v >> 43); 302 (xfs_bmbt_rec_base_t)(v >> 43);
372 r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) | 303 r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
373 (xfs_bmbt_rec_base_t)(v << 21); 304 (xfs_bmbt_rec_base_t)(v << 21);
374#else /* !XFS_BIG_BLKNOS */
375 if (isnullstartblock(v)) {
376 r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
377 r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
378 ((xfs_bmbt_rec_base_t)v << 21) |
379 (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
380 } else {
381 r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9);
382 r->l1 = ((xfs_bmbt_rec_base_t)v << 21) |
383 (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
384 }
385#endif /* XFS_BIG_BLKNOS */
386} 305}
387 306
388/* 307/*
@@ -438,8 +357,8 @@ xfs_bmbt_to_bmdr(
438 cpu_to_be64(XFS_BUF_DADDR_NULL)); 357 cpu_to_be64(XFS_BUF_DADDR_NULL));
439 } else 358 } else
440 ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC)); 359 ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
441 ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO)); 360 ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
442 ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO)); 361 ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
443 ASSERT(rblock->bb_level != 0); 362 ASSERT(rblock->bb_level != 0);
444 dblock->bb_level = rblock->bb_level; 363 dblock->bb_level = rblock->bb_level;
445 dblock->bb_numrecs = rblock->bb_numrecs; 364 dblock->bb_numrecs = rblock->bb_numrecs;
@@ -554,7 +473,7 @@ xfs_bmbt_alloc_block(
554 args.minlen = args.maxlen = args.prod = 1; 473 args.minlen = args.maxlen = args.prod = 1;
555 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; 474 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
556 if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) { 475 if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
557 error = XFS_ERROR(ENOSPC); 476 error = -ENOSPC;
558 goto error0; 477 goto error0;
559 } 478 }
560 error = xfs_alloc_vextent(&args); 479 error = xfs_alloc_vextent(&args);
@@ -763,11 +682,11 @@ xfs_bmbt_verify(
763 682
764 /* sibling pointer verification */ 683 /* sibling pointer verification */
765 if (!block->bb_u.l.bb_leftsib || 684 if (!block->bb_u.l.bb_leftsib ||
766 (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) && 685 (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
767 !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))) 686 !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))))
768 return false; 687 return false;
769 if (!block->bb_u.l.bb_rightsib || 688 if (!block->bb_u.l.bb_rightsib ||
770 (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) && 689 (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
771 !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))) 690 !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))))
772 return false; 691 return false;
773 692
@@ -779,9 +698,9 @@ xfs_bmbt_read_verify(
779 struct xfs_buf *bp) 698 struct xfs_buf *bp)
780{ 699{
781 if (!xfs_btree_lblock_verify_crc(bp)) 700 if (!xfs_btree_lblock_verify_crc(bp))
782 xfs_buf_ioerror(bp, EFSBADCRC); 701 xfs_buf_ioerror(bp, -EFSBADCRC);
783 else if (!xfs_bmbt_verify(bp)) 702 else if (!xfs_bmbt_verify(bp))
784 xfs_buf_ioerror(bp, EFSCORRUPTED); 703 xfs_buf_ioerror(bp, -EFSCORRUPTED);
785 704
786 if (bp->b_error) { 705 if (bp->b_error) {
787 trace_xfs_btree_corrupt(bp, _RET_IP_); 706 trace_xfs_btree_corrupt(bp, _RET_IP_);
@@ -795,7 +714,7 @@ xfs_bmbt_write_verify(
795{ 714{
796 if (!xfs_bmbt_verify(bp)) { 715 if (!xfs_bmbt_verify(bp)) {
797 trace_xfs_btree_corrupt(bp, _RET_IP_); 716 trace_xfs_btree_corrupt(bp, _RET_IP_);
798 xfs_buf_ioerror(bp, EFSCORRUPTED); 717 xfs_buf_ioerror(bp, -EFSCORRUPTED);
799 xfs_verifier_error(bp); 718 xfs_verifier_error(bp);
800 return; 719 return;
801 } 720 }
@@ -959,7 +878,7 @@ xfs_bmbt_change_owner(
959 878
960 cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); 879 cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
961 if (!cur) 880 if (!cur)
962 return ENOMEM; 881 return -ENOMEM;
963 882
964 error = xfs_btree_change_owner(cur, new_owner, buffer_list); 883 error = xfs_btree_change_owner(cur, new_owner, buffer_list);
965 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); 884 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 819a8a4dee95..819a8a4dee95 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index cf893bc1e373..8fe6a93ff473 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -78,11 +78,11 @@ xfs_btree_check_lblock(
78 be16_to_cpu(block->bb_numrecs) <= 78 be16_to_cpu(block->bb_numrecs) <=
79 cur->bc_ops->get_maxrecs(cur, level) && 79 cur->bc_ops->get_maxrecs(cur, level) &&
80 block->bb_u.l.bb_leftsib && 80 block->bb_u.l.bb_leftsib &&
81 (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || 81 (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) ||
82 XFS_FSB_SANITY_CHECK(mp, 82 XFS_FSB_SANITY_CHECK(mp,
83 be64_to_cpu(block->bb_u.l.bb_leftsib))) && 83 be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
84 block->bb_u.l.bb_rightsib && 84 block->bb_u.l.bb_rightsib &&
85 (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || 85 (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) ||
86 XFS_FSB_SANITY_CHECK(mp, 86 XFS_FSB_SANITY_CHECK(mp,
87 be64_to_cpu(block->bb_u.l.bb_rightsib))); 87 be64_to_cpu(block->bb_u.l.bb_rightsib)));
88 88
@@ -92,7 +92,7 @@ xfs_btree_check_lblock(
92 if (bp) 92 if (bp)
93 trace_xfs_btree_corrupt(bp, _RET_IP_); 93 trace_xfs_btree_corrupt(bp, _RET_IP_);
94 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); 94 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
95 return XFS_ERROR(EFSCORRUPTED); 95 return -EFSCORRUPTED;
96 } 96 }
97 return 0; 97 return 0;
98} 98}
@@ -140,7 +140,7 @@ xfs_btree_check_sblock(
140 if (bp) 140 if (bp)
141 trace_xfs_btree_corrupt(bp, _RET_IP_); 141 trace_xfs_btree_corrupt(bp, _RET_IP_);
142 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); 142 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
143 return XFS_ERROR(EFSCORRUPTED); 143 return -EFSCORRUPTED;
144 } 144 }
145 return 0; 145 return 0;
146} 146}
@@ -167,12 +167,12 @@ xfs_btree_check_block(
167int /* error (0 or EFSCORRUPTED) */ 167int /* error (0 or EFSCORRUPTED) */
168xfs_btree_check_lptr( 168xfs_btree_check_lptr(
169 struct xfs_btree_cur *cur, /* btree cursor */ 169 struct xfs_btree_cur *cur, /* btree cursor */
170 xfs_dfsbno_t bno, /* btree block disk address */ 170 xfs_fsblock_t bno, /* btree block disk address */
171 int level) /* btree block level */ 171 int level) /* btree block level */
172{ 172{
173 XFS_WANT_CORRUPTED_RETURN( 173 XFS_WANT_CORRUPTED_RETURN(
174 level > 0 && 174 level > 0 &&
175 bno != NULLDFSBNO && 175 bno != NULLFSBLOCK &&
176 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); 176 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
177 return 0; 177 return 0;
178} 178}
@@ -595,7 +595,7 @@ xfs_btree_islastblock(
595 block = xfs_btree_get_block(cur, level, &bp); 595 block = xfs_btree_get_block(cur, level, &bp);
596 xfs_btree_check_block(cur, block, level, bp); 596 xfs_btree_check_block(cur, block, level, bp);
597 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 597 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
598 return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO); 598 return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
599 else 599 else
600 return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); 600 return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
601} 601}
@@ -771,16 +771,16 @@ xfs_btree_readahead_lblock(
771 struct xfs_btree_block *block) 771 struct xfs_btree_block *block)
772{ 772{
773 int rval = 0; 773 int rval = 0;
774 xfs_dfsbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); 774 xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
775 xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); 775 xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
776 776
777 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { 777 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
778 xfs_btree_reada_bufl(cur->bc_mp, left, 1, 778 xfs_btree_reada_bufl(cur->bc_mp, left, 1,
779 cur->bc_ops->buf_ops); 779 cur->bc_ops->buf_ops);
780 rval++; 780 rval++;
781 } 781 }
782 782
783 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { 783 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
784 xfs_btree_reada_bufl(cur->bc_mp, right, 1, 784 xfs_btree_reada_bufl(cur->bc_mp, right, 1,
785 cur->bc_ops->buf_ops); 785 cur->bc_ops->buf_ops);
786 rval++; 786 rval++;
@@ -852,7 +852,7 @@ xfs_btree_ptr_to_daddr(
852 union xfs_btree_ptr *ptr) 852 union xfs_btree_ptr *ptr)
853{ 853{
854 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 854 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
855 ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO)); 855 ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK));
856 856
857 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); 857 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
858 } else { 858 } else {
@@ -900,9 +900,9 @@ xfs_btree_setbuf(
900 900
901 b = XFS_BUF_TO_BLOCK(bp); 901 b = XFS_BUF_TO_BLOCK(bp);
902 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 902 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
903 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO)) 903 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
904 cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; 904 cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
905 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO)) 905 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
906 cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; 906 cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
907 } else { 907 } else {
908 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK)) 908 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
@@ -918,7 +918,7 @@ xfs_btree_ptr_is_null(
918 union xfs_btree_ptr *ptr) 918 union xfs_btree_ptr *ptr)
919{ 919{
920 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 920 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
921 return ptr->l == cpu_to_be64(NULLDFSBNO); 921 return ptr->l == cpu_to_be64(NULLFSBLOCK);
922 else 922 else
923 return ptr->s == cpu_to_be32(NULLAGBLOCK); 923 return ptr->s == cpu_to_be32(NULLAGBLOCK);
924} 924}
@@ -929,7 +929,7 @@ xfs_btree_set_ptr_null(
929 union xfs_btree_ptr *ptr) 929 union xfs_btree_ptr *ptr)
930{ 930{
931 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 931 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
932 ptr->l = cpu_to_be64(NULLDFSBNO); 932 ptr->l = cpu_to_be64(NULLFSBLOCK);
933 else 933 else
934 ptr->s = cpu_to_be32(NULLAGBLOCK); 934 ptr->s = cpu_to_be32(NULLAGBLOCK);
935} 935}
@@ -997,8 +997,8 @@ xfs_btree_init_block_int(
997 buf->bb_numrecs = cpu_to_be16(numrecs); 997 buf->bb_numrecs = cpu_to_be16(numrecs);
998 998
999 if (flags & XFS_BTREE_LONG_PTRS) { 999 if (flags & XFS_BTREE_LONG_PTRS) {
1000 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); 1000 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1001 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); 1001 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1002 if (flags & XFS_BTREE_CRC_BLOCKS) { 1002 if (flags & XFS_BTREE_CRC_BLOCKS) {
1003 buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); 1003 buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1004 buf->bb_u.l.bb_owner = cpu_to_be64(owner); 1004 buf->bb_u.l.bb_owner = cpu_to_be64(owner);
@@ -1140,7 +1140,7 @@ xfs_btree_get_buf_block(
1140 mp->m_bsize, flags); 1140 mp->m_bsize, flags);
1141 1141
1142 if (!*bpp) 1142 if (!*bpp)
1143 return ENOMEM; 1143 return -ENOMEM;
1144 1144
1145 (*bpp)->b_ops = cur->bc_ops->buf_ops; 1145 (*bpp)->b_ops = cur->bc_ops->buf_ops;
1146 *block = XFS_BUF_TO_BLOCK(*bpp); 1146 *block = XFS_BUF_TO_BLOCK(*bpp);
@@ -1498,7 +1498,7 @@ xfs_btree_increment(
1498 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) 1498 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1499 goto out0; 1499 goto out0;
1500 ASSERT(0); 1500 ASSERT(0);
1501 error = EFSCORRUPTED; 1501 error = -EFSCORRUPTED;
1502 goto error0; 1502 goto error0;
1503 } 1503 }
1504 ASSERT(lev < cur->bc_nlevels); 1504 ASSERT(lev < cur->bc_nlevels);
@@ -1597,7 +1597,7 @@ xfs_btree_decrement(
1597 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) 1597 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1598 goto out0; 1598 goto out0;
1599 ASSERT(0); 1599 ASSERT(0);
1600 error = EFSCORRUPTED; 1600 error = -EFSCORRUPTED;
1601 goto error0; 1601 goto error0;
1602 } 1602 }
1603 ASSERT(lev < cur->bc_nlevels); 1603 ASSERT(lev < cur->bc_nlevels);
@@ -4018,7 +4018,7 @@ xfs_btree_block_change_owner(
4018 /* now read rh sibling block for next iteration */ 4018 /* now read rh sibling block for next iteration */
4019 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 4019 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4020 if (xfs_btree_ptr_is_null(cur, &rptr)) 4020 if (xfs_btree_ptr_is_null(cur, &rptr))
4021 return ENOENT; 4021 return -ENOENT;
4022 4022
4023 return xfs_btree_lookup_get_block(cur, level, &rptr, &block); 4023 return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4024} 4024}
@@ -4061,7 +4061,7 @@ xfs_btree_change_owner(
4061 buffer_list); 4061 buffer_list);
4062 } while (!error); 4062 } while (!error);
4063 4063
4064 if (error != ENOENT) 4064 if (error != -ENOENT)
4065 return error; 4065 return error;
4066 } 4066 }
4067 4067
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index a04b69422f67..8f18bab73ea5 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -258,7 +258,7 @@ xfs_btree_check_block(
258int /* error (0 or EFSCORRUPTED) */ 258int /* error (0 or EFSCORRUPTED) */
259xfs_btree_check_lptr( 259xfs_btree_check_lptr(
260 struct xfs_btree_cur *cur, /* btree cursor */ 260 struct xfs_btree_cur *cur, /* btree cursor */
261 xfs_dfsbno_t ptr, /* btree block disk address */ 261 xfs_fsblock_t ptr, /* btree block disk address */
262 int level); /* btree block level */ 262 int level); /* btree block level */
263 263
264/* 264/*
diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
index fad1676ad8cd..fad1676ad8cd 100644
--- a/fs/xfs/xfs_cksum.h
+++ b/fs/xfs/libxfs/xfs_cksum.h
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index a514ab616650..2c42ae28d027 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -185,7 +185,7 @@ xfs_da3_node_write_verify(
185 struct xfs_da3_node_hdr *hdr3 = bp->b_addr; 185 struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
186 186
187 if (!xfs_da3_node_verify(bp)) { 187 if (!xfs_da3_node_verify(bp)) {
188 xfs_buf_ioerror(bp, EFSCORRUPTED); 188 xfs_buf_ioerror(bp, -EFSCORRUPTED);
189 xfs_verifier_error(bp); 189 xfs_verifier_error(bp);
190 return; 190 return;
191 } 191 }
@@ -214,13 +214,13 @@ xfs_da3_node_read_verify(
214 switch (be16_to_cpu(info->magic)) { 214 switch (be16_to_cpu(info->magic)) {
215 case XFS_DA3_NODE_MAGIC: 215 case XFS_DA3_NODE_MAGIC:
216 if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) { 216 if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
217 xfs_buf_ioerror(bp, EFSBADCRC); 217 xfs_buf_ioerror(bp, -EFSBADCRC);
218 break; 218 break;
219 } 219 }
220 /* fall through */ 220 /* fall through */
221 case XFS_DA_NODE_MAGIC: 221 case XFS_DA_NODE_MAGIC:
222 if (!xfs_da3_node_verify(bp)) { 222 if (!xfs_da3_node_verify(bp)) {
223 xfs_buf_ioerror(bp, EFSCORRUPTED); 223 xfs_buf_ioerror(bp, -EFSCORRUPTED);
224 break; 224 break;
225 } 225 }
226 return; 226 return;
@@ -315,7 +315,7 @@ xfs_da3_node_create(
315 315
316 error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork); 316 error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
317 if (error) 317 if (error)
318 return(error); 318 return error;
319 bp->b_ops = &xfs_da3_node_buf_ops; 319 bp->b_ops = &xfs_da3_node_buf_ops;
320 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); 320 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
321 node = bp->b_addr; 321 node = bp->b_addr;
@@ -337,7 +337,7 @@ xfs_da3_node_create(
337 XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); 337 XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
338 338
339 *bpp = bp; 339 *bpp = bp;
340 return(0); 340 return 0;
341} 341}
342 342
343/* 343/*
@@ -385,8 +385,8 @@ xfs_da3_split(
385 switch (oldblk->magic) { 385 switch (oldblk->magic) {
386 case XFS_ATTR_LEAF_MAGIC: 386 case XFS_ATTR_LEAF_MAGIC:
387 error = xfs_attr3_leaf_split(state, oldblk, newblk); 387 error = xfs_attr3_leaf_split(state, oldblk, newblk);
388 if ((error != 0) && (error != ENOSPC)) { 388 if ((error != 0) && (error != -ENOSPC)) {
389 return(error); /* GROT: attr is inconsistent */ 389 return error; /* GROT: attr is inconsistent */
390 } 390 }
391 if (!error) { 391 if (!error) {
392 addblk = newblk; 392 addblk = newblk;
@@ -408,7 +408,7 @@ xfs_da3_split(
408 &state->extrablk); 408 &state->extrablk);
409 } 409 }
410 if (error) 410 if (error)
411 return(error); /* GROT: attr inconsistent */ 411 return error; /* GROT: attr inconsistent */
412 addblk = newblk; 412 addblk = newblk;
413 break; 413 break;
414 case XFS_DIR2_LEAFN_MAGIC: 414 case XFS_DIR2_LEAFN_MAGIC:
@@ -422,7 +422,7 @@ xfs_da3_split(
422 max - i, &action); 422 max - i, &action);
423 addblk->bp = NULL; 423 addblk->bp = NULL;
424 if (error) 424 if (error)
425 return(error); /* GROT: dir is inconsistent */ 425 return error; /* GROT: dir is inconsistent */
426 /* 426 /*
427 * Record the newly split block for the next time thru? 427 * Record the newly split block for the next time thru?
428 */ 428 */
@@ -439,7 +439,7 @@ xfs_da3_split(
439 xfs_da3_fixhashpath(state, &state->path); 439 xfs_da3_fixhashpath(state, &state->path);
440 } 440 }
441 if (!addblk) 441 if (!addblk)
442 return(0); 442 return 0;
443 443
444 /* 444 /*
445 * Split the root node. 445 * Split the root node.
@@ -449,7 +449,7 @@ xfs_da3_split(
449 error = xfs_da3_root_split(state, oldblk, addblk); 449 error = xfs_da3_root_split(state, oldblk, addblk);
450 if (error) { 450 if (error) {
451 addblk->bp = NULL; 451 addblk->bp = NULL;
452 return(error); /* GROT: dir is inconsistent */ 452 return error; /* GROT: dir is inconsistent */
453 } 453 }
454 454
455 /* 455 /*
@@ -492,7 +492,7 @@ xfs_da3_split(
492 sizeof(node->hdr.info))); 492 sizeof(node->hdr.info)));
493 } 493 }
494 addblk->bp = NULL; 494 addblk->bp = NULL;
495 return(0); 495 return 0;
496} 496}
497 497
498/* 498/*
@@ -670,18 +670,18 @@ xfs_da3_node_split(
670 */ 670 */
671 error = xfs_da_grow_inode(state->args, &blkno); 671 error = xfs_da_grow_inode(state->args, &blkno);
672 if (error) 672 if (error)
673 return(error); /* GROT: dir is inconsistent */ 673 return error; /* GROT: dir is inconsistent */
674 674
675 error = xfs_da3_node_create(state->args, blkno, treelevel, 675 error = xfs_da3_node_create(state->args, blkno, treelevel,
676 &newblk->bp, state->args->whichfork); 676 &newblk->bp, state->args->whichfork);
677 if (error) 677 if (error)
678 return(error); /* GROT: dir is inconsistent */ 678 return error; /* GROT: dir is inconsistent */
679 newblk->blkno = blkno; 679 newblk->blkno = blkno;
680 newblk->magic = XFS_DA_NODE_MAGIC; 680 newblk->magic = XFS_DA_NODE_MAGIC;
681 xfs_da3_node_rebalance(state, oldblk, newblk); 681 xfs_da3_node_rebalance(state, oldblk, newblk);
682 error = xfs_da3_blk_link(state, oldblk, newblk); 682 error = xfs_da3_blk_link(state, oldblk, newblk);
683 if (error) 683 if (error)
684 return(error); 684 return error;
685 *result = 1; 685 *result = 1;
686 } else { 686 } else {
687 *result = 0; 687 *result = 0;
@@ -721,7 +721,7 @@ xfs_da3_node_split(
721 } 721 }
722 } 722 }
723 723
724 return(0); 724 return 0;
725} 725}
726 726
727/* 727/*
@@ -963,9 +963,9 @@ xfs_da3_join(
963 case XFS_ATTR_LEAF_MAGIC: 963 case XFS_ATTR_LEAF_MAGIC:
964 error = xfs_attr3_leaf_toosmall(state, &action); 964 error = xfs_attr3_leaf_toosmall(state, &action);
965 if (error) 965 if (error)
966 return(error); 966 return error;
967 if (action == 0) 967 if (action == 0)
968 return(0); 968 return 0;
969 xfs_attr3_leaf_unbalance(state, drop_blk, save_blk); 969 xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
970 break; 970 break;
971 case XFS_DIR2_LEAFN_MAGIC: 971 case XFS_DIR2_LEAFN_MAGIC:
@@ -985,7 +985,7 @@ xfs_da3_join(
985 xfs_da3_fixhashpath(state, &state->path); 985 xfs_da3_fixhashpath(state, &state->path);
986 error = xfs_da3_node_toosmall(state, &action); 986 error = xfs_da3_node_toosmall(state, &action);
987 if (error) 987 if (error)
988 return(error); 988 return error;
989 if (action == 0) 989 if (action == 0)
990 return 0; 990 return 0;
991 xfs_da3_node_unbalance(state, drop_blk, save_blk); 991 xfs_da3_node_unbalance(state, drop_blk, save_blk);
@@ -995,12 +995,12 @@ xfs_da3_join(
995 error = xfs_da3_blk_unlink(state, drop_blk, save_blk); 995 error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
996 xfs_da_state_kill_altpath(state); 996 xfs_da_state_kill_altpath(state);
997 if (error) 997 if (error)
998 return(error); 998 return error;
999 error = xfs_da_shrink_inode(state->args, drop_blk->blkno, 999 error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
1000 drop_blk->bp); 1000 drop_blk->bp);
1001 drop_blk->bp = NULL; 1001 drop_blk->bp = NULL;
1002 if (error) 1002 if (error)
1003 return(error); 1003 return error;
1004 } 1004 }
1005 /* 1005 /*
1006 * We joined all the way to the top. If it turns out that 1006 * We joined all the way to the top. If it turns out that
@@ -1010,7 +1010,7 @@ xfs_da3_join(
1010 xfs_da3_node_remove(state, drop_blk); 1010 xfs_da3_node_remove(state, drop_blk);
1011 xfs_da3_fixhashpath(state, &state->path); 1011 xfs_da3_fixhashpath(state, &state->path);
1012 error = xfs_da3_root_join(state, &state->path.blk[0]); 1012 error = xfs_da3_root_join(state, &state->path.blk[0]);
1013 return(error); 1013 return error;
1014} 1014}
1015 1015
1016#ifdef DEBUG 1016#ifdef DEBUG
@@ -1099,7 +1099,7 @@ xfs_da3_root_join(
1099 xfs_trans_log_buf(args->trans, root_blk->bp, 0, 1099 xfs_trans_log_buf(args->trans, root_blk->bp, 0,
1100 args->geo->blksize - 1); 1100 args->geo->blksize - 1);
1101 error = xfs_da_shrink_inode(args, child, bp); 1101 error = xfs_da_shrink_inode(args, child, bp);
1102 return(error); 1102 return error;
1103} 1103}
1104 1104
1105/* 1105/*
@@ -1142,7 +1142,7 @@ xfs_da3_node_toosmall(
1142 dp->d_ops->node_hdr_from_disk(&nodehdr, node); 1142 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1143 if (nodehdr.count > (state->args->geo->node_ents >> 1)) { 1143 if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
1144 *action = 0; /* blk over 50%, don't try to join */ 1144 *action = 0; /* blk over 50%, don't try to join */
1145 return(0); /* blk over 50%, don't try to join */ 1145 return 0; /* blk over 50%, don't try to join */
1146 } 1146 }
1147 1147
1148 /* 1148 /*
@@ -1161,13 +1161,13 @@ xfs_da3_node_toosmall(
1161 error = xfs_da3_path_shift(state, &state->altpath, forward, 1161 error = xfs_da3_path_shift(state, &state->altpath, forward,
1162 0, &retval); 1162 0, &retval);
1163 if (error) 1163 if (error)
1164 return(error); 1164 return error;
1165 if (retval) { 1165 if (retval) {
1166 *action = 0; 1166 *action = 0;
1167 } else { 1167 } else {
1168 *action = 2; 1168 *action = 2;
1169 } 1169 }
1170 return(0); 1170 return 0;
1171 } 1171 }
1172 1172
1173 /* 1173 /*
@@ -1194,7 +1194,7 @@ xfs_da3_node_toosmall(
1194 error = xfs_da3_node_read(state->args->trans, dp, 1194 error = xfs_da3_node_read(state->args->trans, dp,
1195 blkno, -1, &bp, state->args->whichfork); 1195 blkno, -1, &bp, state->args->whichfork);
1196 if (error) 1196 if (error)
1197 return(error); 1197 return error;
1198 1198
1199 node = bp->b_addr; 1199 node = bp->b_addr;
1200 dp->d_ops->node_hdr_from_disk(&thdr, node); 1200 dp->d_ops->node_hdr_from_disk(&thdr, node);
@@ -1486,7 +1486,7 @@ xfs_da3_node_lookup_int(
1486 if (error) { 1486 if (error) {
1487 blk->blkno = 0; 1487 blk->blkno = 0;
1488 state->path.active--; 1488 state->path.active--;
1489 return(error); 1489 return error;
1490 } 1490 }
1491 curr = blk->bp->b_addr; 1491 curr = blk->bp->b_addr;
1492 blk->magic = be16_to_cpu(curr->magic); 1492 blk->magic = be16_to_cpu(curr->magic);
@@ -1579,25 +1579,25 @@ xfs_da3_node_lookup_int(
1579 args->blkno = blk->blkno; 1579 args->blkno = blk->blkno;
1580 } else { 1580 } else {
1581 ASSERT(0); 1581 ASSERT(0);
1582 return XFS_ERROR(EFSCORRUPTED); 1582 return -EFSCORRUPTED;
1583 } 1583 }
1584 if (((retval == ENOENT) || (retval == ENOATTR)) && 1584 if (((retval == -ENOENT) || (retval == -ENOATTR)) &&
1585 (blk->hashval == args->hashval)) { 1585 (blk->hashval == args->hashval)) {
1586 error = xfs_da3_path_shift(state, &state->path, 1, 1, 1586 error = xfs_da3_path_shift(state, &state->path, 1, 1,
1587 &retval); 1587 &retval);
1588 if (error) 1588 if (error)
1589 return(error); 1589 return error;
1590 if (retval == 0) { 1590 if (retval == 0) {
1591 continue; 1591 continue;
1592 } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { 1592 } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1593 /* path_shift() gives ENOENT */ 1593 /* path_shift() gives ENOENT */
1594 retval = XFS_ERROR(ENOATTR); 1594 retval = -ENOATTR;
1595 } 1595 }
1596 } 1596 }
1597 break; 1597 break;
1598 } 1598 }
1599 *result = retval; 1599 *result = retval;
1600 return(0); 1600 return 0;
1601} 1601}
1602 1602
1603/*======================================================================== 1603/*========================================================================
@@ -1692,7 +1692,7 @@ xfs_da3_blk_link(
1692 be32_to_cpu(old_info->back), 1692 be32_to_cpu(old_info->back),
1693 -1, &bp, args->whichfork); 1693 -1, &bp, args->whichfork);
1694 if (error) 1694 if (error)
1695 return(error); 1695 return error;
1696 ASSERT(bp != NULL); 1696 ASSERT(bp != NULL);
1697 tmp_info = bp->b_addr; 1697 tmp_info = bp->b_addr;
1698 ASSERT(tmp_info->magic == old_info->magic); 1698 ASSERT(tmp_info->magic == old_info->magic);
@@ -1713,7 +1713,7 @@ xfs_da3_blk_link(
1713 be32_to_cpu(old_info->forw), 1713 be32_to_cpu(old_info->forw),
1714 -1, &bp, args->whichfork); 1714 -1, &bp, args->whichfork);
1715 if (error) 1715 if (error)
1716 return(error); 1716 return error;
1717 ASSERT(bp != NULL); 1717 ASSERT(bp != NULL);
1718 tmp_info = bp->b_addr; 1718 tmp_info = bp->b_addr;
1719 ASSERT(tmp_info->magic == old_info->magic); 1719 ASSERT(tmp_info->magic == old_info->magic);
@@ -1726,7 +1726,7 @@ xfs_da3_blk_link(
1726 1726
1727 xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1); 1727 xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
1728 xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1); 1728 xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
1729 return(0); 1729 return 0;
1730} 1730}
1731 1731
1732/* 1732/*
@@ -1772,7 +1772,7 @@ xfs_da3_blk_unlink(
1772 be32_to_cpu(drop_info->back), 1772 be32_to_cpu(drop_info->back),
1773 -1, &bp, args->whichfork); 1773 -1, &bp, args->whichfork);
1774 if (error) 1774 if (error)
1775 return(error); 1775 return error;
1776 ASSERT(bp != NULL); 1776 ASSERT(bp != NULL);
1777 tmp_info = bp->b_addr; 1777 tmp_info = bp->b_addr;
1778 ASSERT(tmp_info->magic == save_info->magic); 1778 ASSERT(tmp_info->magic == save_info->magic);
@@ -1789,7 +1789,7 @@ xfs_da3_blk_unlink(
1789 be32_to_cpu(drop_info->forw), 1789 be32_to_cpu(drop_info->forw),
1790 -1, &bp, args->whichfork); 1790 -1, &bp, args->whichfork);
1791 if (error) 1791 if (error)
1792 return(error); 1792 return error;
1793 ASSERT(bp != NULL); 1793 ASSERT(bp != NULL);
1794 tmp_info = bp->b_addr; 1794 tmp_info = bp->b_addr;
1795 ASSERT(tmp_info->magic == save_info->magic); 1795 ASSERT(tmp_info->magic == save_info->magic);
@@ -1801,7 +1801,7 @@ xfs_da3_blk_unlink(
1801 } 1801 }
1802 1802
1803 xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1); 1803 xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
1804 return(0); 1804 return 0;
1805} 1805}
1806 1806
1807/* 1807/*
@@ -1859,9 +1859,9 @@ xfs_da3_path_shift(
1859 } 1859 }
1860 } 1860 }
1861 if (level < 0) { 1861 if (level < 0) {
1862 *result = XFS_ERROR(ENOENT); /* we're out of our tree */ 1862 *result = -ENOENT; /* we're out of our tree */
1863 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); 1863 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
1864 return(0); 1864 return 0;
1865 } 1865 }
1866 1866
1867 /* 1867 /*
@@ -1883,7 +1883,7 @@ xfs_da3_path_shift(
1883 error = xfs_da3_node_read(args->trans, dp, blkno, -1, 1883 error = xfs_da3_node_read(args->trans, dp, blkno, -1,
1884 &blk->bp, args->whichfork); 1884 &blk->bp, args->whichfork);
1885 if (error) 1885 if (error)
1886 return(error); 1886 return error;
1887 info = blk->bp->b_addr; 1887 info = blk->bp->b_addr;
1888 ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || 1888 ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
1889 info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || 1889 info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
@@ -2004,7 +2004,7 @@ xfs_da_grow_inode_int(
2004 struct xfs_trans *tp = args->trans; 2004 struct xfs_trans *tp = args->trans;
2005 struct xfs_inode *dp = args->dp; 2005 struct xfs_inode *dp = args->dp;
2006 int w = args->whichfork; 2006 int w = args->whichfork;
2007 xfs_drfsbno_t nblks = dp->i_d.di_nblocks; 2007 xfs_rfsblock_t nblks = dp->i_d.di_nblocks;
2008 struct xfs_bmbt_irec map, *mapp; 2008 struct xfs_bmbt_irec map, *mapp;
2009 int nmap, error, got, i, mapi; 2009 int nmap, error, got, i, mapi;
2010 2010
@@ -2068,7 +2068,7 @@ xfs_da_grow_inode_int(
2068 if (got != count || mapp[0].br_startoff != *bno || 2068 if (got != count || mapp[0].br_startoff != *bno ||
2069 mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != 2069 mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
2070 *bno + count) { 2070 *bno + count) {
2071 error = XFS_ERROR(ENOSPC); 2071 error = -ENOSPC;
2072 goto out_free_map; 2072 goto out_free_map;
2073 } 2073 }
2074 2074
@@ -2158,7 +2158,7 @@ xfs_da3_swap_lastblock(
2158 if (unlikely(lastoff == 0)) { 2158 if (unlikely(lastoff == 0)) {
2159 XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW, 2159 XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
2160 mp); 2160 mp);
2161 return XFS_ERROR(EFSCORRUPTED); 2161 return -EFSCORRUPTED;
2162 } 2162 }
2163 /* 2163 /*
2164 * Read the last block in the btree space. 2164 * Read the last block in the btree space.
@@ -2209,7 +2209,7 @@ xfs_da3_swap_lastblock(
2209 sib_info->magic != dead_info->magic)) { 2209 sib_info->magic != dead_info->magic)) {
2210 XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)", 2210 XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
2211 XFS_ERRLEVEL_LOW, mp); 2211 XFS_ERRLEVEL_LOW, mp);
2212 error = XFS_ERROR(EFSCORRUPTED); 2212 error = -EFSCORRUPTED;
2213 goto done; 2213 goto done;
2214 } 2214 }
2215 sib_info->forw = cpu_to_be32(dead_blkno); 2215 sib_info->forw = cpu_to_be32(dead_blkno);
@@ -2231,7 +2231,7 @@ xfs_da3_swap_lastblock(
2231 sib_info->magic != dead_info->magic)) { 2231 sib_info->magic != dead_info->magic)) {
2232 XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)", 2232 XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
2233 XFS_ERRLEVEL_LOW, mp); 2233 XFS_ERRLEVEL_LOW, mp);
2234 error = XFS_ERROR(EFSCORRUPTED); 2234 error = -EFSCORRUPTED;
2235 goto done; 2235 goto done;
2236 } 2236 }
2237 sib_info->back = cpu_to_be32(dead_blkno); 2237 sib_info->back = cpu_to_be32(dead_blkno);
@@ -2254,7 +2254,7 @@ xfs_da3_swap_lastblock(
2254 if (level >= 0 && level != par_hdr.level + 1) { 2254 if (level >= 0 && level != par_hdr.level + 1) {
2255 XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)", 2255 XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
2256 XFS_ERRLEVEL_LOW, mp); 2256 XFS_ERRLEVEL_LOW, mp);
2257 error = XFS_ERROR(EFSCORRUPTED); 2257 error = -EFSCORRUPTED;
2258 goto done; 2258 goto done;
2259 } 2259 }
2260 level = par_hdr.level; 2260 level = par_hdr.level;
@@ -2267,7 +2267,7 @@ xfs_da3_swap_lastblock(
2267 if (entno == par_hdr.count) { 2267 if (entno == par_hdr.count) {
2268 XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)", 2268 XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
2269 XFS_ERRLEVEL_LOW, mp); 2269 XFS_ERRLEVEL_LOW, mp);
2270 error = XFS_ERROR(EFSCORRUPTED); 2270 error = -EFSCORRUPTED;
2271 goto done; 2271 goto done;
2272 } 2272 }
2273 par_blkno = be32_to_cpu(btree[entno].before); 2273 par_blkno = be32_to_cpu(btree[entno].before);
@@ -2294,7 +2294,7 @@ xfs_da3_swap_lastblock(
2294 if (unlikely(par_blkno == 0)) { 2294 if (unlikely(par_blkno == 0)) {
2295 XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)", 2295 XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
2296 XFS_ERRLEVEL_LOW, mp); 2296 XFS_ERRLEVEL_LOW, mp);
2297 error = XFS_ERROR(EFSCORRUPTED); 2297 error = -EFSCORRUPTED;
2298 goto done; 2298 goto done;
2299 } 2299 }
2300 error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w); 2300 error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
@@ -2305,7 +2305,7 @@ xfs_da3_swap_lastblock(
2305 if (par_hdr.level != level) { 2305 if (par_hdr.level != level) {
2306 XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)", 2306 XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
2307 XFS_ERRLEVEL_LOW, mp); 2307 XFS_ERRLEVEL_LOW, mp);
2308 error = XFS_ERROR(EFSCORRUPTED); 2308 error = -EFSCORRUPTED;
2309 goto done; 2309 goto done;
2310 } 2310 }
2311 btree = dp->d_ops->node_tree_p(par_node); 2311 btree = dp->d_ops->node_tree_p(par_node);
@@ -2359,7 +2359,7 @@ xfs_da_shrink_inode(
2359 error = xfs_bunmapi(tp, dp, dead_blkno, count, 2359 error = xfs_bunmapi(tp, dp, dead_blkno, count,
2360 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, 2360 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
2361 0, args->firstblock, args->flist, &done); 2361 0, args->firstblock, args->flist, &done);
2362 if (error == ENOSPC) { 2362 if (error == -ENOSPC) {
2363 if (w != XFS_DATA_FORK) 2363 if (w != XFS_DATA_FORK)
2364 break; 2364 break;
2365 error = xfs_da3_swap_lastblock(args, &dead_blkno, 2365 error = xfs_da3_swap_lastblock(args, &dead_blkno,
@@ -2427,7 +2427,7 @@ xfs_buf_map_from_irec(
2427 map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), 2427 map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
2428 KM_SLEEP | KM_NOFS); 2428 KM_SLEEP | KM_NOFS);
2429 if (!map) 2429 if (!map)
2430 return ENOMEM; 2430 return -ENOMEM;
2431 *mapp = map; 2431 *mapp = map;
2432 } 2432 }
2433 2433
@@ -2500,8 +2500,8 @@ xfs_dabuf_map(
2500 } 2500 }
2501 2501
2502 if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) { 2502 if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
2503 error = mappedbno == -2 ? -1 : XFS_ERROR(EFSCORRUPTED); 2503 error = mappedbno == -2 ? -1 : -EFSCORRUPTED;
2504 if (unlikely(error == EFSCORRUPTED)) { 2504 if (unlikely(error == -EFSCORRUPTED)) {
2505 if (xfs_error_level >= XFS_ERRLEVEL_LOW) { 2505 if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
2506 int i; 2506 int i;
2507 xfs_alert(mp, "%s: bno %lld dir: inode %lld", 2507 xfs_alert(mp, "%s: bno %lld dir: inode %lld",
@@ -2561,7 +2561,7 @@ xfs_da_get_buf(
2561 2561
2562 bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp, 2562 bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
2563 mapp, nmap, 0); 2563 mapp, nmap, 0);
2564 error = bp ? bp->b_error : XFS_ERROR(EIO); 2564 error = bp ? bp->b_error : -EIO;
2565 if (error) { 2565 if (error) {
2566 xfs_trans_brelse(trans, bp); 2566 xfs_trans_brelse(trans, bp);
2567 goto out_free; 2567 goto out_free;
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 6e153e399a77..6e153e399a77 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index c9aee52a37e2..c9aee52a37e2 100644
--- a/fs/xfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 0a49b0286372..0a49b0286372 100644
--- a/fs/xfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/libxfs/xfs_dinode.h
index 623bbe8fd921..623bbe8fd921 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/libxfs/xfs_dinode.h
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 79670cda48ae..6cef22152fd6 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -108,7 +108,7 @@ xfs_da_mount(
108 if (!mp->m_dir_geo || !mp->m_attr_geo) { 108 if (!mp->m_dir_geo || !mp->m_attr_geo) {
109 kmem_free(mp->m_dir_geo); 109 kmem_free(mp->m_dir_geo);
110 kmem_free(mp->m_attr_geo); 110 kmem_free(mp->m_attr_geo);
111 return ENOMEM; 111 return -ENOMEM;
112 } 112 }
113 113
114 /* set up directory geometry */ 114 /* set up directory geometry */
@@ -202,7 +202,7 @@ xfs_dir_ino_validate(
202 xfs_warn(mp, "Invalid inode number 0x%Lx", 202 xfs_warn(mp, "Invalid inode number 0x%Lx",
203 (unsigned long long) ino); 203 (unsigned long long) ino);
204 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); 204 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
205 return XFS_ERROR(EFSCORRUPTED); 205 return -EFSCORRUPTED;
206 } 206 }
207 return 0; 207 return 0;
208} 208}
@@ -226,7 +226,7 @@ xfs_dir_init(
226 226
227 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 227 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
228 if (!args) 228 if (!args)
229 return ENOMEM; 229 return -ENOMEM;
230 230
231 args->geo = dp->i_mount->m_dir_geo; 231 args->geo = dp->i_mount->m_dir_geo;
232 args->dp = dp; 232 args->dp = dp;
@@ -261,7 +261,7 @@ xfs_dir_createname(
261 261
262 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 262 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
263 if (!args) 263 if (!args)
264 return ENOMEM; 264 return -ENOMEM;
265 265
266 args->geo = dp->i_mount->m_dir_geo; 266 args->geo = dp->i_mount->m_dir_geo;
267 args->name = name->name; 267 args->name = name->name;
@@ -314,18 +314,18 @@ xfs_dir_cilookup_result(
314 int len) 314 int len)
315{ 315{
316 if (args->cmpresult == XFS_CMP_DIFFERENT) 316 if (args->cmpresult == XFS_CMP_DIFFERENT)
317 return ENOENT; 317 return -ENOENT;
318 if (args->cmpresult != XFS_CMP_CASE || 318 if (args->cmpresult != XFS_CMP_CASE ||
319 !(args->op_flags & XFS_DA_OP_CILOOKUP)) 319 !(args->op_flags & XFS_DA_OP_CILOOKUP))
320 return EEXIST; 320 return -EEXIST;
321 321
322 args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL); 322 args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
323 if (!args->value) 323 if (!args->value)
324 return ENOMEM; 324 return -ENOMEM;
325 325
326 memcpy(args->value, name, len); 326 memcpy(args->value, name, len);
327 args->valuelen = len; 327 args->valuelen = len;
328 return EEXIST; 328 return -EEXIST;
329} 329}
330 330
331/* 331/*
@@ -392,7 +392,7 @@ xfs_dir_lookup(
392 rval = xfs_dir2_node_lookup(args); 392 rval = xfs_dir2_node_lookup(args);
393 393
394out_check_rval: 394out_check_rval:
395 if (rval == EEXIST) 395 if (rval == -EEXIST)
396 rval = 0; 396 rval = 0;
397 if (!rval) { 397 if (!rval) {
398 *inum = args->inumber; 398 *inum = args->inumber;
@@ -428,7 +428,7 @@ xfs_dir_removename(
428 428
429 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 429 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
430 if (!args) 430 if (!args)
431 return ENOMEM; 431 return -ENOMEM;
432 432
433 args->geo = dp->i_mount->m_dir_geo; 433 args->geo = dp->i_mount->m_dir_geo;
434 args->name = name->name; 434 args->name = name->name;
@@ -493,7 +493,7 @@ xfs_dir_replace(
493 493
494 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 494 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
495 if (!args) 495 if (!args)
496 return ENOMEM; 496 return -ENOMEM;
497 497
498 args->geo = dp->i_mount->m_dir_geo; 498 args->geo = dp->i_mount->m_dir_geo;
499 args->name = name->name; 499 args->name = name->name;
@@ -555,7 +555,7 @@ xfs_dir_canenter(
555 555
556 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 556 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
557 if (!args) 557 if (!args)
558 return ENOMEM; 558 return -ENOMEM;
559 559
560 args->geo = dp->i_mount->m_dir_geo; 560 args->geo = dp->i_mount->m_dir_geo;
561 args->name = name->name; 561 args->name = name->name;
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index c8e86b0b5e99..c8e86b0b5e99 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index c7cd3154026a..9628ceccfa02 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -91,9 +91,9 @@ xfs_dir3_block_read_verify(
91 91
92 if (xfs_sb_version_hascrc(&mp->m_sb) && 92 if (xfs_sb_version_hascrc(&mp->m_sb) &&
93 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) 93 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
94 xfs_buf_ioerror(bp, EFSBADCRC); 94 xfs_buf_ioerror(bp, -EFSBADCRC);
95 else if (!xfs_dir3_block_verify(bp)) 95 else if (!xfs_dir3_block_verify(bp))
96 xfs_buf_ioerror(bp, EFSCORRUPTED); 96 xfs_buf_ioerror(bp, -EFSCORRUPTED);
97 97
98 if (bp->b_error) 98 if (bp->b_error)
99 xfs_verifier_error(bp); 99 xfs_verifier_error(bp);
@@ -108,7 +108,7 @@ xfs_dir3_block_write_verify(
108 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 108 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
109 109
110 if (!xfs_dir3_block_verify(bp)) { 110 if (!xfs_dir3_block_verify(bp)) {
111 xfs_buf_ioerror(bp, EFSCORRUPTED); 111 xfs_buf_ioerror(bp, -EFSCORRUPTED);
112 xfs_verifier_error(bp); 112 xfs_verifier_error(bp);
113 return; 113 return;
114 } 114 }
@@ -392,7 +392,7 @@ xfs_dir2_block_addname(
392 if (args->op_flags & XFS_DA_OP_JUSTCHECK) { 392 if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
393 xfs_trans_brelse(tp, bp); 393 xfs_trans_brelse(tp, bp);
394 if (!dup) 394 if (!dup)
395 return XFS_ERROR(ENOSPC); 395 return -ENOSPC;
396 return 0; 396 return 0;
397 } 397 }
398 398
@@ -402,7 +402,7 @@ xfs_dir2_block_addname(
402 if (!dup) { 402 if (!dup) {
403 /* Don't have a space reservation: return no-space. */ 403 /* Don't have a space reservation: return no-space. */
404 if (args->total == 0) 404 if (args->total == 0)
405 return XFS_ERROR(ENOSPC); 405 return -ENOSPC;
406 /* 406 /*
407 * Convert to the next larger format. 407 * Convert to the next larger format.
408 * Then add the new entry in that format. 408 * Then add the new entry in that format.
@@ -647,7 +647,7 @@ xfs_dir2_block_lookup(
647 args->filetype = dp->d_ops->data_get_ftype(dep); 647 args->filetype = dp->d_ops->data_get_ftype(dep);
648 error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); 648 error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
649 xfs_trans_brelse(args->trans, bp); 649 xfs_trans_brelse(args->trans, bp);
650 return XFS_ERROR(error); 650 return error;
651} 651}
652 652
653/* 653/*
@@ -703,7 +703,7 @@ xfs_dir2_block_lookup_int(
703 if (low > high) { 703 if (low > high) {
704 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); 704 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
705 xfs_trans_brelse(tp, bp); 705 xfs_trans_brelse(tp, bp);
706 return XFS_ERROR(ENOENT); 706 return -ENOENT;
707 } 707 }
708 } 708 }
709 /* 709 /*
@@ -751,7 +751,7 @@ xfs_dir2_block_lookup_int(
751 * No match, release the buffer and return ENOENT. 751 * No match, release the buffer and return ENOENT.
752 */ 752 */
753 xfs_trans_brelse(tp, bp); 753 xfs_trans_brelse(tp, bp);
754 return XFS_ERROR(ENOENT); 754 return -ENOENT;
755} 755}
756 756
757/* 757/*
@@ -1091,7 +1091,7 @@ xfs_dir2_sf_to_block(
1091 */ 1091 */
1092 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { 1092 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
1093 ASSERT(XFS_FORCED_SHUTDOWN(mp)); 1093 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1094 return XFS_ERROR(EIO); 1094 return -EIO;
1095 } 1095 }
1096 1096
1097 oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data; 1097 oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 8c2f6422648e..fdd803fecb8e 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -100,7 +100,7 @@ __xfs_dir3_data_check(
100 break; 100 break;
101 default: 101 default:
102 XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp); 102 XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
103 return EFSCORRUPTED; 103 return -EFSCORRUPTED;
104 } 104 }
105 105
106 /* 106 /*
@@ -256,7 +256,7 @@ xfs_dir3_data_reada_verify(
256 xfs_dir3_data_verify(bp); 256 xfs_dir3_data_verify(bp);
257 return; 257 return;
258 default: 258 default:
259 xfs_buf_ioerror(bp, EFSCORRUPTED); 259 xfs_buf_ioerror(bp, -EFSCORRUPTED);
260 xfs_verifier_error(bp); 260 xfs_verifier_error(bp);
261 break; 261 break;
262 } 262 }
@@ -270,9 +270,9 @@ xfs_dir3_data_read_verify(
270 270
271 if (xfs_sb_version_hascrc(&mp->m_sb) && 271 if (xfs_sb_version_hascrc(&mp->m_sb) &&
272 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) 272 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
273 xfs_buf_ioerror(bp, EFSBADCRC); 273 xfs_buf_ioerror(bp, -EFSBADCRC);
274 else if (!xfs_dir3_data_verify(bp)) 274 else if (!xfs_dir3_data_verify(bp))
275 xfs_buf_ioerror(bp, EFSCORRUPTED); 275 xfs_buf_ioerror(bp, -EFSCORRUPTED);
276 276
277 if (bp->b_error) 277 if (bp->b_error)
278 xfs_verifier_error(bp); 278 xfs_verifier_error(bp);
@@ -287,7 +287,7 @@ xfs_dir3_data_write_verify(
287 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 287 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
288 288
289 if (!xfs_dir3_data_verify(bp)) { 289 if (!xfs_dir3_data_verify(bp)) {
290 xfs_buf_ioerror(bp, EFSCORRUPTED); 290 xfs_buf_ioerror(bp, -EFSCORRUPTED);
291 xfs_verifier_error(bp); 291 xfs_verifier_error(bp);
292 return; 292 return;
293 } 293 }
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index fb0aad4440c1..a19174eb3cb2 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -183,9 +183,9 @@ __read_verify(
183 183
184 if (xfs_sb_version_hascrc(&mp->m_sb) && 184 if (xfs_sb_version_hascrc(&mp->m_sb) &&
185 !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) 185 !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
186 xfs_buf_ioerror(bp, EFSBADCRC); 186 xfs_buf_ioerror(bp, -EFSBADCRC);
187 else if (!xfs_dir3_leaf_verify(bp, magic)) 187 else if (!xfs_dir3_leaf_verify(bp, magic))
188 xfs_buf_ioerror(bp, EFSCORRUPTED); 188 xfs_buf_ioerror(bp, -EFSCORRUPTED);
189 189
190 if (bp->b_error) 190 if (bp->b_error)
191 xfs_verifier_error(bp); 191 xfs_verifier_error(bp);
@@ -201,7 +201,7 @@ __write_verify(
201 struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; 201 struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
202 202
203 if (!xfs_dir3_leaf_verify(bp, magic)) { 203 if (!xfs_dir3_leaf_verify(bp, magic)) {
204 xfs_buf_ioerror(bp, EFSCORRUPTED); 204 xfs_buf_ioerror(bp, -EFSCORRUPTED);
205 xfs_verifier_error(bp); 205 xfs_verifier_error(bp);
206 return; 206 return;
207 } 207 }
@@ -731,7 +731,7 @@ xfs_dir2_leaf_addname(
731 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || 731 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
732 args->total == 0) { 732 args->total == 0) {
733 xfs_trans_brelse(tp, lbp); 733 xfs_trans_brelse(tp, lbp);
734 return XFS_ERROR(ENOSPC); 734 return -ENOSPC;
735 } 735 }
736 /* 736 /*
737 * Convert to node form. 737 * Convert to node form.
@@ -755,7 +755,7 @@ xfs_dir2_leaf_addname(
755 */ 755 */
756 if (args->op_flags & XFS_DA_OP_JUSTCHECK) { 756 if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
757 xfs_trans_brelse(tp, lbp); 757 xfs_trans_brelse(tp, lbp);
758 return use_block == -1 ? XFS_ERROR(ENOSPC) : 0; 758 return use_block == -1 ? -ENOSPC : 0;
759 } 759 }
760 /* 760 /*
761 * If no allocations are allowed, return now before we've 761 * If no allocations are allowed, return now before we've
@@ -763,7 +763,7 @@ xfs_dir2_leaf_addname(
763 */ 763 */
764 if (args->total == 0 && use_block == -1) { 764 if (args->total == 0 && use_block == -1) {
765 xfs_trans_brelse(tp, lbp); 765 xfs_trans_brelse(tp, lbp);
766 return XFS_ERROR(ENOSPC); 766 return -ENOSPC;
767 } 767 }
768 /* 768 /*
769 * Need to compact the leaf entries, removing stale ones. 769 * Need to compact the leaf entries, removing stale ones.
@@ -1198,7 +1198,7 @@ xfs_dir2_leaf_lookup(
1198 error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); 1198 error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
1199 xfs_trans_brelse(tp, dbp); 1199 xfs_trans_brelse(tp, dbp);
1200 xfs_trans_brelse(tp, lbp); 1200 xfs_trans_brelse(tp, lbp);
1201 return XFS_ERROR(error); 1201 return error;
1202} 1202}
1203 1203
1204/* 1204/*
@@ -1327,13 +1327,13 @@ xfs_dir2_leaf_lookup_int(
1327 return 0; 1327 return 0;
1328 } 1328 }
1329 /* 1329 /*
1330 * No match found, return ENOENT. 1330 * No match found, return -ENOENT.
1331 */ 1331 */
1332 ASSERT(cidb == -1); 1332 ASSERT(cidb == -1);
1333 if (dbp) 1333 if (dbp)
1334 xfs_trans_brelse(tp, dbp); 1334 xfs_trans_brelse(tp, dbp);
1335 xfs_trans_brelse(tp, lbp); 1335 xfs_trans_brelse(tp, lbp);
1336 return XFS_ERROR(ENOENT); 1336 return -ENOENT;
1337} 1337}
1338 1338
1339/* 1339/*
@@ -1440,7 +1440,7 @@ xfs_dir2_leaf_removename(
1440 * Just go on, returning success, leaving the 1440 * Just go on, returning success, leaving the
1441 * empty block in place. 1441 * empty block in place.
1442 */ 1442 */
1443 if (error == ENOSPC && args->total == 0) 1443 if (error == -ENOSPC && args->total == 0)
1444 error = 0; 1444 error = 0;
1445 xfs_dir3_leaf_check(dp, lbp); 1445 xfs_dir3_leaf_check(dp, lbp);
1446 return error; 1446 return error;
@@ -1641,7 +1641,7 @@ xfs_dir2_leaf_trim_data(
1641 * Get rid of the data block. 1641 * Get rid of the data block.
1642 */ 1642 */
1643 if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { 1643 if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
1644 ASSERT(error != ENOSPC); 1644 ASSERT(error != -ENOSPC);
1645 xfs_trans_brelse(tp, dbp); 1645 xfs_trans_brelse(tp, dbp);
1646 return error; 1646 return error;
1647 } 1647 }
@@ -1815,7 +1815,7 @@ xfs_dir2_node_to_leaf(
1815 * punching out the middle of an extent, and this is an 1815 * punching out the middle of an extent, and this is an
1816 * isolated block. 1816 * isolated block.
1817 */ 1817 */
1818 ASSERT(error != ENOSPC); 1818 ASSERT(error != -ENOSPC);
1819 return error; 1819 return error;
1820 } 1820 }
1821 fbp = NULL; 1821 fbp = NULL;
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index da43d304fca2..2ae6ac2c11ae 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -117,9 +117,9 @@ xfs_dir3_free_read_verify(
117 117
118 if (xfs_sb_version_hascrc(&mp->m_sb) && 118 if (xfs_sb_version_hascrc(&mp->m_sb) &&
119 !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF)) 119 !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
120 xfs_buf_ioerror(bp, EFSBADCRC); 120 xfs_buf_ioerror(bp, -EFSBADCRC);
121 else if (!xfs_dir3_free_verify(bp)) 121 else if (!xfs_dir3_free_verify(bp))
122 xfs_buf_ioerror(bp, EFSCORRUPTED); 122 xfs_buf_ioerror(bp, -EFSCORRUPTED);
123 123
124 if (bp->b_error) 124 if (bp->b_error)
125 xfs_verifier_error(bp); 125 xfs_verifier_error(bp);
@@ -134,7 +134,7 @@ xfs_dir3_free_write_verify(
134 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 134 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
135 135
136 if (!xfs_dir3_free_verify(bp)) { 136 if (!xfs_dir3_free_verify(bp)) {
137 xfs_buf_ioerror(bp, EFSCORRUPTED); 137 xfs_buf_ioerror(bp, -EFSCORRUPTED);
138 xfs_verifier_error(bp); 138 xfs_verifier_error(bp);
139 return; 139 return;
140 } 140 }
@@ -406,7 +406,7 @@ xfs_dir2_leafn_add(
406 * into other peoples memory 406 * into other peoples memory
407 */ 407 */
408 if (index < 0) 408 if (index < 0)
409 return XFS_ERROR(EFSCORRUPTED); 409 return -EFSCORRUPTED;
410 410
411 /* 411 /*
412 * If there are already the maximum number of leaf entries in 412 * If there are already the maximum number of leaf entries in
@@ -417,7 +417,7 @@ xfs_dir2_leafn_add(
417 417
418 if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) { 418 if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) {
419 if (!leafhdr.stale) 419 if (!leafhdr.stale)
420 return XFS_ERROR(ENOSPC); 420 return -ENOSPC;
421 compact = leafhdr.stale > 1; 421 compact = leafhdr.stale > 1;
422 } else 422 } else
423 compact = 0; 423 compact = 0;
@@ -629,7 +629,7 @@ xfs_dir2_leafn_lookup_for_addname(
629 XFS_ERRLEVEL_LOW, mp); 629 XFS_ERRLEVEL_LOW, mp);
630 if (curfdb != newfdb) 630 if (curfdb != newfdb)
631 xfs_trans_brelse(tp, curbp); 631 xfs_trans_brelse(tp, curbp);
632 return XFS_ERROR(EFSCORRUPTED); 632 return -EFSCORRUPTED;
633 } 633 }
634 curfdb = newfdb; 634 curfdb = newfdb;
635 if (be16_to_cpu(bests[fi]) >= length) 635 if (be16_to_cpu(bests[fi]) >= length)
@@ -660,7 +660,7 @@ out:
660 * Return the index, that will be the insertion point. 660 * Return the index, that will be the insertion point.
661 */ 661 */
662 *indexp = index; 662 *indexp = index;
663 return XFS_ERROR(ENOENT); 663 return -ENOENT;
664} 664}
665 665
666/* 666/*
@@ -789,7 +789,7 @@ xfs_dir2_leafn_lookup_for_entry(
789 curbp->b_ops = &xfs_dir3_data_buf_ops; 789 curbp->b_ops = &xfs_dir3_data_buf_ops;
790 xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF); 790 xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
791 if (cmp == XFS_CMP_EXACT) 791 if (cmp == XFS_CMP_EXACT)
792 return XFS_ERROR(EEXIST); 792 return -EEXIST;
793 } 793 }
794 } 794 }
795 ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT)); 795 ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
@@ -812,7 +812,7 @@ xfs_dir2_leafn_lookup_for_entry(
812 state->extravalid = 0; 812 state->extravalid = 0;
813 } 813 }
814 *indexp = index; 814 *indexp = index;
815 return XFS_ERROR(ENOENT); 815 return -ENOENT;
816} 816}
817 817
818/* 818/*
@@ -1133,7 +1133,7 @@ xfs_dir3_data_block_free(
1133 if (error == 0) { 1133 if (error == 0) {
1134 fbp = NULL; 1134 fbp = NULL;
1135 logfree = 0; 1135 logfree = 0;
1136 } else if (error != ENOSPC || args->total != 0) 1136 } else if (error != -ENOSPC || args->total != 0)
1137 return error; 1137 return error;
1138 /* 1138 /*
1139 * It's possible to get ENOSPC if there is no 1139 * It's possible to get ENOSPC if there is no
@@ -1287,7 +1287,7 @@ xfs_dir2_leafn_remove(
1287 * In this case just drop the buffer and some one else 1287 * In this case just drop the buffer and some one else
1288 * will eventually get rid of the empty block. 1288 * will eventually get rid of the empty block.
1289 */ 1289 */
1290 else if (!(error == ENOSPC && args->total == 0)) 1290 else if (!(error == -ENOSPC && args->total == 0))
1291 return error; 1291 return error;
1292 } 1292 }
1293 /* 1293 /*
@@ -1599,7 +1599,7 @@ xfs_dir2_node_addname(
1599 error = xfs_da3_node_lookup_int(state, &rval); 1599 error = xfs_da3_node_lookup_int(state, &rval);
1600 if (error) 1600 if (error)
1601 rval = error; 1601 rval = error;
1602 if (rval != ENOENT) { 1602 if (rval != -ENOENT) {
1603 goto done; 1603 goto done;
1604 } 1604 }
1605 /* 1605 /*
@@ -1628,7 +1628,7 @@ xfs_dir2_node_addname(
1628 * It didn't work, we need to split the leaf block. 1628 * It didn't work, we need to split the leaf block.
1629 */ 1629 */
1630 if (args->total == 0) { 1630 if (args->total == 0) {
1631 ASSERT(rval == ENOSPC); 1631 ASSERT(rval == -ENOSPC);
1632 goto done; 1632 goto done;
1633 } 1633 }
1634 /* 1634 /*
@@ -1815,7 +1815,7 @@ xfs_dir2_node_addname_int(
1815 * Not allowed to allocate, return failure. 1815 * Not allowed to allocate, return failure.
1816 */ 1816 */
1817 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) 1817 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
1818 return XFS_ERROR(ENOSPC); 1818 return -ENOSPC;
1819 1819
1820 /* 1820 /*
1821 * Allocate and initialize the new data block. 1821 * Allocate and initialize the new data block.
@@ -1876,7 +1876,7 @@ xfs_dir2_node_addname_int(
1876 } 1876 }
1877 XFS_ERROR_REPORT("xfs_dir2_node_addname_int", 1877 XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
1878 XFS_ERRLEVEL_LOW, mp); 1878 XFS_ERRLEVEL_LOW, mp);
1879 return XFS_ERROR(EFSCORRUPTED); 1879 return -EFSCORRUPTED;
1880 } 1880 }
1881 1881
1882 /* 1882 /*
@@ -2042,8 +2042,8 @@ xfs_dir2_node_lookup(
2042 error = xfs_da3_node_lookup_int(state, &rval); 2042 error = xfs_da3_node_lookup_int(state, &rval);
2043 if (error) 2043 if (error)
2044 rval = error; 2044 rval = error;
2045 else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) { 2045 else if (rval == -ENOENT && args->cmpresult == XFS_CMP_CASE) {
2046 /* If a CI match, dup the actual name and return EEXIST */ 2046 /* If a CI match, dup the actual name and return -EEXIST */
2047 xfs_dir2_data_entry_t *dep; 2047 xfs_dir2_data_entry_t *dep;
2048 2048
2049 dep = (xfs_dir2_data_entry_t *) 2049 dep = (xfs_dir2_data_entry_t *)
@@ -2096,7 +2096,7 @@ xfs_dir2_node_removename(
2096 goto out_free; 2096 goto out_free;
2097 2097
2098 /* Didn't find it, upper layer screwed up. */ 2098 /* Didn't find it, upper layer screwed up. */
2099 if (rval != EEXIST) { 2099 if (rval != -EEXIST) {
2100 error = rval; 2100 error = rval;
2101 goto out_free; 2101 goto out_free;
2102 } 2102 }
@@ -2169,7 +2169,7 @@ xfs_dir2_node_replace(
2169 * It should be found, since the vnodeops layer has looked it up 2169 * It should be found, since the vnodeops layer has looked it up
2170 * and locked it. But paranoia is good. 2170 * and locked it. But paranoia is good.
2171 */ 2171 */
2172 if (rval == EEXIST) { 2172 if (rval == -EEXIST) {
2173 struct xfs_dir2_leaf_entry *ents; 2173 struct xfs_dir2_leaf_entry *ents;
2174 /* 2174 /*
2175 * Find the leaf entry. 2175 * Find the leaf entry.
@@ -2272,7 +2272,7 @@ xfs_dir2_node_trim_free(
2272 * space reservation, when breaking up an extent into two 2272 * space reservation, when breaking up an extent into two
2273 * pieces. This is the last block of an extent. 2273 * pieces. This is the last block of an extent.
2274 */ 2274 */
2275 ASSERT(error != ENOSPC); 2275 ASSERT(error != -ENOSPC);
2276 xfs_trans_brelse(tp, bp); 2276 xfs_trans_brelse(tp, bp);
2277 return error; 2277 return error;
2278 } 2278 }
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 27ce0794d196..27ce0794d196 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 53c3be619db5..5079e051ef08 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -51,10 +51,9 @@ static void xfs_dir2_sf_check(xfs_da_args_t *args);
51#else 51#else
52#define xfs_dir2_sf_check(args) 52#define xfs_dir2_sf_check(args)
53#endif /* DEBUG */ 53#endif /* DEBUG */
54#if XFS_BIG_INUMS 54
55static void xfs_dir2_sf_toino4(xfs_da_args_t *args); 55static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
56static void xfs_dir2_sf_toino8(xfs_da_args_t *args); 56static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
57#endif /* XFS_BIG_INUMS */
58 57
59/* 58/*
60 * Given a block directory (dp/block), calculate its size as a shortform (sf) 59 * Given a block directory (dp/block), calculate its size as a shortform (sf)
@@ -117,10 +116,10 @@ xfs_dir2_block_sfsize(
117 isdotdot = 116 isdotdot =
118 dep->namelen == 2 && 117 dep->namelen == 2 &&
119 dep->name[0] == '.' && dep->name[1] == '.'; 118 dep->name[0] == '.' && dep->name[1] == '.';
120#if XFS_BIG_INUMS 119
121 if (!isdot) 120 if (!isdot)
122 i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM; 121 i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
123#endif 122
124 /* take into account the file type field */ 123 /* take into account the file type field */
125 if (!isdot && !isdotdot) { 124 if (!isdot && !isdotdot) {
126 count++; 125 count++;
@@ -251,7 +250,7 @@ xfs_dir2_block_to_sf(
251 logflags = XFS_ILOG_CORE; 250 logflags = XFS_ILOG_CORE;
252 error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp); 251 error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp);
253 if (error) { 252 if (error) {
254 ASSERT(error != ENOSPC); 253 ASSERT(error != -ENOSPC);
255 goto out; 254 goto out;
256 } 255 }
257 256
@@ -299,7 +298,7 @@ xfs_dir2_sf_addname(
299 298
300 trace_xfs_dir2_sf_addname(args); 299 trace_xfs_dir2_sf_addname(args);
301 300
302 ASSERT(xfs_dir2_sf_lookup(args) == ENOENT); 301 ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT);
303 dp = args->dp; 302 dp = args->dp;
304 ASSERT(dp->i_df.if_flags & XFS_IFINLINE); 303 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
305 /* 304 /*
@@ -307,7 +306,7 @@ xfs_dir2_sf_addname(
307 */ 306 */
308 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { 307 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
309 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); 308 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
310 return XFS_ERROR(EIO); 309 return -EIO;
311 } 310 }
312 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); 311 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
313 ASSERT(dp->i_df.if_u1.if_data != NULL); 312 ASSERT(dp->i_df.if_u1.if_data != NULL);
@@ -318,7 +317,7 @@ xfs_dir2_sf_addname(
318 */ 317 */
319 incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen); 318 incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen);
320 objchange = 0; 319 objchange = 0;
321#if XFS_BIG_INUMS 320
322 /* 321 /*
323 * Do we have to change to 8 byte inodes? 322 * Do we have to change to 8 byte inodes?
324 */ 323 */
@@ -332,7 +331,7 @@ xfs_dir2_sf_addname(
332 (uint)sizeof(xfs_dir2_ino4_t)); 331 (uint)sizeof(xfs_dir2_ino4_t));
333 objchange = 1; 332 objchange = 1;
334 } 333 }
335#endif 334
336 new_isize = (int)dp->i_d.di_size + incr_isize; 335 new_isize = (int)dp->i_d.di_size + incr_isize;
337 /* 336 /*
338 * Won't fit as shortform any more (due to size), 337 * Won't fit as shortform any more (due to size),
@@ -345,7 +344,7 @@ xfs_dir2_sf_addname(
345 * Just checking or no space reservation, it doesn't fit. 344 * Just checking or no space reservation, it doesn't fit.
346 */ 345 */
347 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) 346 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
348 return XFS_ERROR(ENOSPC); 347 return -ENOSPC;
349 /* 348 /*
350 * Convert to block form then add the name. 349 * Convert to block form then add the name.
351 */ 350 */
@@ -370,10 +369,8 @@ xfs_dir2_sf_addname(
370 */ 369 */
371 else { 370 else {
372 ASSERT(pick == 2); 371 ASSERT(pick == 2);
373#if XFS_BIG_INUMS
374 if (objchange) 372 if (objchange)
375 xfs_dir2_sf_toino8(args); 373 xfs_dir2_sf_toino8(args);
376#endif
377 xfs_dir2_sf_addname_hard(args, objchange, new_isize); 374 xfs_dir2_sf_addname_hard(args, objchange, new_isize);
378 } 375 }
379 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); 376 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
@@ -425,10 +422,8 @@ xfs_dir2_sf_addname_easy(
425 * Update the header and inode. 422 * Update the header and inode.
426 */ 423 */
427 sfp->count++; 424 sfp->count++;
428#if XFS_BIG_INUMS
429 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) 425 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
430 sfp->i8count++; 426 sfp->i8count++;
431#endif
432 dp->i_d.di_size = new_isize; 427 dp->i_d.di_size = new_isize;
433 xfs_dir2_sf_check(args); 428 xfs_dir2_sf_check(args);
434} 429}
@@ -516,10 +511,8 @@ xfs_dir2_sf_addname_hard(
516 dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); 511 dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
517 dp->d_ops->sf_put_ftype(sfep, args->filetype); 512 dp->d_ops->sf_put_ftype(sfep, args->filetype);
518 sfp->count++; 513 sfp->count++;
519#if XFS_BIG_INUMS
520 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) 514 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
521 sfp->i8count++; 515 sfp->i8count++;
522#endif
523 /* 516 /*
524 * If there's more left to copy, do that. 517 * If there's more left to copy, do that.
525 */ 518 */
@@ -593,13 +586,8 @@ xfs_dir2_sf_addname_pick(
593 /* 586 /*
594 * If changing the inode number size, do it the hard way. 587 * If changing the inode number size, do it the hard way.
595 */ 588 */
596#if XFS_BIG_INUMS 589 if (objchange)
597 if (objchange) {
598 return 2; 590 return 2;
599 }
600#else
601 ASSERT(objchange == 0);
602#endif
603 /* 591 /*
604 * If it won't fit at the end then do it the hard way (use the hole). 592 * If it won't fit at the end then do it the hard way (use the hole).
605 */ 593 */
@@ -650,7 +638,6 @@ xfs_dir2_sf_check(
650 ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX); 638 ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
651 } 639 }
652 ASSERT(i8count == sfp->i8count); 640 ASSERT(i8count == sfp->i8count);
653 ASSERT(XFS_BIG_INUMS || i8count == 0);
654 ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); 641 ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
655 ASSERT(offset + 642 ASSERT(offset +
656 (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + 643 (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
@@ -738,7 +725,7 @@ xfs_dir2_sf_lookup(
738 */ 725 */
739 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { 726 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
740 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); 727 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
741 return XFS_ERROR(EIO); 728 return -EIO;
742 } 729 }
743 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); 730 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
744 ASSERT(dp->i_df.if_u1.if_data != NULL); 731 ASSERT(dp->i_df.if_u1.if_data != NULL);
@@ -751,7 +738,7 @@ xfs_dir2_sf_lookup(
751 args->inumber = dp->i_ino; 738 args->inumber = dp->i_ino;
752 args->cmpresult = XFS_CMP_EXACT; 739 args->cmpresult = XFS_CMP_EXACT;
753 args->filetype = XFS_DIR3_FT_DIR; 740 args->filetype = XFS_DIR3_FT_DIR;
754 return XFS_ERROR(EEXIST); 741 return -EEXIST;
755 } 742 }
756 /* 743 /*
757 * Special case for .. 744 * Special case for ..
@@ -761,7 +748,7 @@ xfs_dir2_sf_lookup(
761 args->inumber = dp->d_ops->sf_get_parent_ino(sfp); 748 args->inumber = dp->d_ops->sf_get_parent_ino(sfp);
762 args->cmpresult = XFS_CMP_EXACT; 749 args->cmpresult = XFS_CMP_EXACT;
763 args->filetype = XFS_DIR3_FT_DIR; 750 args->filetype = XFS_DIR3_FT_DIR;
764 return XFS_ERROR(EEXIST); 751 return -EEXIST;
765 } 752 }
766 /* 753 /*
767 * Loop over all the entries trying to match ours. 754 * Loop over all the entries trying to match ours.
@@ -781,20 +768,20 @@ xfs_dir2_sf_lookup(
781 args->inumber = dp->d_ops->sf_get_ino(sfp, sfep); 768 args->inumber = dp->d_ops->sf_get_ino(sfp, sfep);
782 args->filetype = dp->d_ops->sf_get_ftype(sfep); 769 args->filetype = dp->d_ops->sf_get_ftype(sfep);
783 if (cmp == XFS_CMP_EXACT) 770 if (cmp == XFS_CMP_EXACT)
784 return XFS_ERROR(EEXIST); 771 return -EEXIST;
785 ci_sfep = sfep; 772 ci_sfep = sfep;
786 } 773 }
787 } 774 }
788 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); 775 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
789 /* 776 /*
790 * Here, we can only be doing a lookup (not a rename or replace). 777 * Here, we can only be doing a lookup (not a rename or replace).
791 * If a case-insensitive match was not found, return ENOENT. 778 * If a case-insensitive match was not found, return -ENOENT.
792 */ 779 */
793 if (!ci_sfep) 780 if (!ci_sfep)
794 return XFS_ERROR(ENOENT); 781 return -ENOENT;
795 /* otherwise process the CI match as required by the caller */ 782 /* otherwise process the CI match as required by the caller */
796 error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); 783 error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
797 return XFS_ERROR(error); 784 return error;
798} 785}
799 786
800/* 787/*
@@ -824,7 +811,7 @@ xfs_dir2_sf_removename(
824 */ 811 */
825 if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) { 812 if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
826 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); 813 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
827 return XFS_ERROR(EIO); 814 return -EIO;
828 } 815 }
829 ASSERT(dp->i_df.if_bytes == oldsize); 816 ASSERT(dp->i_df.if_bytes == oldsize);
830 ASSERT(dp->i_df.if_u1.if_data != NULL); 817 ASSERT(dp->i_df.if_u1.if_data != NULL);
@@ -847,7 +834,7 @@ xfs_dir2_sf_removename(
847 * Didn't find it. 834 * Didn't find it.
848 */ 835 */
849 if (i == sfp->count) 836 if (i == sfp->count)
850 return XFS_ERROR(ENOENT); 837 return -ENOENT;
851 /* 838 /*
852 * Calculate sizes. 839 * Calculate sizes.
853 */ 840 */
@@ -870,7 +857,6 @@ xfs_dir2_sf_removename(
870 */ 857 */
871 xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); 858 xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
872 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 859 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
873#if XFS_BIG_INUMS
874 /* 860 /*
875 * Are we changing inode number size? 861 * Are we changing inode number size?
876 */ 862 */
@@ -880,7 +866,6 @@ xfs_dir2_sf_removename(
880 else 866 else
881 sfp->i8count--; 867 sfp->i8count--;
882 } 868 }
883#endif
884 xfs_dir2_sf_check(args); 869 xfs_dir2_sf_check(args);
885 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); 870 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
886 return 0; 871 return 0;
@@ -895,12 +880,8 @@ xfs_dir2_sf_replace(
895{ 880{
896 xfs_inode_t *dp; /* incore directory inode */ 881 xfs_inode_t *dp; /* incore directory inode */
897 int i; /* entry index */ 882 int i; /* entry index */
898#if XFS_BIG_INUMS || defined(DEBUG)
899 xfs_ino_t ino=0; /* entry old inode number */ 883 xfs_ino_t ino=0; /* entry old inode number */
900#endif
901#if XFS_BIG_INUMS
902 int i8elevated; /* sf_toino8 set i8count=1 */ 884 int i8elevated; /* sf_toino8 set i8count=1 */
903#endif
904 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ 885 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
905 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ 886 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
906 887
@@ -914,13 +895,13 @@ xfs_dir2_sf_replace(
914 */ 895 */
915 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { 896 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
916 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); 897 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
917 return XFS_ERROR(EIO); 898 return -EIO;
918 } 899 }
919 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); 900 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
920 ASSERT(dp->i_df.if_u1.if_data != NULL); 901 ASSERT(dp->i_df.if_u1.if_data != NULL);
921 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 902 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
922 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); 903 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
923#if XFS_BIG_INUMS 904
924 /* 905 /*
925 * New inode number is large, and need to convert to 8-byte inodes. 906 * New inode number is large, and need to convert to 8-byte inodes.
926 */ 907 */
@@ -951,17 +932,15 @@ xfs_dir2_sf_replace(
951 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 932 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
952 } else 933 } else
953 i8elevated = 0; 934 i8elevated = 0;
954#endif 935
955 ASSERT(args->namelen != 1 || args->name[0] != '.'); 936 ASSERT(args->namelen != 1 || args->name[0] != '.');
956 /* 937 /*
957 * Replace ..'s entry. 938 * Replace ..'s entry.
958 */ 939 */
959 if (args->namelen == 2 && 940 if (args->namelen == 2 &&
960 args->name[0] == '.' && args->name[1] == '.') { 941 args->name[0] == '.' && args->name[1] == '.') {
961#if XFS_BIG_INUMS || defined(DEBUG)
962 ino = dp->d_ops->sf_get_parent_ino(sfp); 942 ino = dp->d_ops->sf_get_parent_ino(sfp);
963 ASSERT(args->inumber != ino); 943 ASSERT(args->inumber != ino);
964#endif
965 dp->d_ops->sf_put_parent_ino(sfp, args->inumber); 944 dp->d_ops->sf_put_parent_ino(sfp, args->inumber);
966 } 945 }
967 /* 946 /*
@@ -972,10 +951,8 @@ xfs_dir2_sf_replace(
972 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { 951 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
973 if (xfs_da_compname(args, sfep->name, sfep->namelen) == 952 if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
974 XFS_CMP_EXACT) { 953 XFS_CMP_EXACT) {
975#if XFS_BIG_INUMS || defined(DEBUG)
976 ino = dp->d_ops->sf_get_ino(sfp, sfep); 954 ino = dp->d_ops->sf_get_ino(sfp, sfep);
977 ASSERT(args->inumber != ino); 955 ASSERT(args->inumber != ino);
978#endif
979 dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); 956 dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
980 dp->d_ops->sf_put_ftype(sfep, args->filetype); 957 dp->d_ops->sf_put_ftype(sfep, args->filetype);
981 break; 958 break;
@@ -986,14 +963,11 @@ xfs_dir2_sf_replace(
986 */ 963 */
987 if (i == sfp->count) { 964 if (i == sfp->count) {
988 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); 965 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
989#if XFS_BIG_INUMS
990 if (i8elevated) 966 if (i8elevated)
991 xfs_dir2_sf_toino4(args); 967 xfs_dir2_sf_toino4(args);
992#endif 968 return -ENOENT;
993 return XFS_ERROR(ENOENT);
994 } 969 }
995 } 970 }
996#if XFS_BIG_INUMS
997 /* 971 /*
998 * See if the old number was large, the new number is small. 972 * See if the old number was large, the new number is small.
999 */ 973 */
@@ -1020,13 +994,11 @@ xfs_dir2_sf_replace(
1020 if (!i8elevated) 994 if (!i8elevated)
1021 sfp->i8count++; 995 sfp->i8count++;
1022 } 996 }
1023#endif
1024 xfs_dir2_sf_check(args); 997 xfs_dir2_sf_check(args);
1025 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA); 998 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
1026 return 0; 999 return 0;
1027} 1000}
1028 1001
1029#if XFS_BIG_INUMS
1030/* 1002/*
1031 * Convert from 8-byte inode numbers to 4-byte inode numbers. 1003 * Convert from 8-byte inode numbers to 4-byte inode numbers.
1032 * The last 8-byte inode number is gone, but the count is still 1. 1004 * The last 8-byte inode number is gone, but the count is still 1.
@@ -1181,4 +1153,3 @@ xfs_dir2_sf_toino8(
1181 dp->i_d.di_size = newsize; 1153 dp->i_d.di_size = newsize;
1182 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); 1154 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
1183} 1155}
1184#endif /* XFS_BIG_INUMS */
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index c2ac0c611ad8..bb969337efc8 100644
--- a/fs/xfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -257,9 +257,9 @@ xfs_dquot_buf_read_verify(
257 struct xfs_mount *mp = bp->b_target->bt_mount; 257 struct xfs_mount *mp = bp->b_target->bt_mount;
258 258
259 if (!xfs_dquot_buf_verify_crc(mp, bp)) 259 if (!xfs_dquot_buf_verify_crc(mp, bp))
260 xfs_buf_ioerror(bp, EFSBADCRC); 260 xfs_buf_ioerror(bp, -EFSBADCRC);
261 else if (!xfs_dquot_buf_verify(mp, bp)) 261 else if (!xfs_dquot_buf_verify(mp, bp))
262 xfs_buf_ioerror(bp, EFSCORRUPTED); 262 xfs_buf_ioerror(bp, -EFSCORRUPTED);
263 263
264 if (bp->b_error) 264 if (bp->b_error)
265 xfs_verifier_error(bp); 265 xfs_verifier_error(bp);
@@ -277,7 +277,7 @@ xfs_dquot_buf_write_verify(
277 struct xfs_mount *mp = bp->b_target->bt_mount; 277 struct xfs_mount *mp = bp->b_target->bt_mount;
278 278
279 if (!xfs_dquot_buf_verify(mp, bp)) { 279 if (!xfs_dquot_buf_verify(mp, bp)) {
280 xfs_buf_ioerror(bp, EFSCORRUPTED); 280 xfs_buf_ioerror(bp, -EFSCORRUPTED);
281 xfs_verifier_error(bp); 281 xfs_verifier_error(bp);
282 return; 282 return;
283 } 283 }
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 34d85aca3058..7e42bba9a420 100644
--- a/fs/xfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -68,11 +68,7 @@ struct xfs_ifork;
68#define XFS_RTLOBIT(w) xfs_lowbit32(w) 68#define XFS_RTLOBIT(w) xfs_lowbit32(w)
69#define XFS_RTHIBIT(w) xfs_highbit32(w) 69#define XFS_RTHIBIT(w) xfs_highbit32(w)
70 70
71#if XFS_BIG_BLKNOS
72#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) 71#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
73#else
74#define XFS_RTBLOCKLOG(b) xfs_highbit32(b)
75#endif
76 72
77/* 73/*
78 * Dquot and dquot block format definitions 74 * Dquot and dquot block format definitions
@@ -304,23 +300,15 @@ typedef struct xfs_bmbt_rec_host {
304 * Values and macros for delayed-allocation startblock fields. 300 * Values and macros for delayed-allocation startblock fields.
305 */ 301 */
306#define STARTBLOCKVALBITS 17 302#define STARTBLOCKVALBITS 17
307#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20) 303#define STARTBLOCKMASKBITS (15 + 20)
308#define DSTARTBLOCKMASKBITS (15 + 20)
309#define STARTBLOCKMASK \ 304#define STARTBLOCKMASK \
310 (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) 305 (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
311#define DSTARTBLOCKMASK \
312 (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
313 306
314static inline int isnullstartblock(xfs_fsblock_t x) 307static inline int isnullstartblock(xfs_fsblock_t x)
315{ 308{
316 return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; 309 return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
317} 310}
318 311
319static inline int isnulldstartblock(xfs_dfsbno_t x)
320{
321 return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
322}
323
324static inline xfs_fsblock_t nullstartblock(int k) 312static inline xfs_fsblock_t nullstartblock(int k)
325{ 313{
326 ASSERT(k < (1 << STARTBLOCKVALBITS)); 314 ASSERT(k < (1 << STARTBLOCKVALBITS));
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 5960e5593fe0..b62771f1f4b5 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -292,7 +292,7 @@ xfs_ialloc_inode_init(
292 mp->m_bsize * blks_per_cluster, 292 mp->m_bsize * blks_per_cluster,
293 XBF_UNMAPPED); 293 XBF_UNMAPPED);
294 if (!fbuf) 294 if (!fbuf)
295 return ENOMEM; 295 return -ENOMEM;
296 296
297 /* Initialize the inode buffers and log them appropriately. */ 297 /* Initialize the inode buffers and log them appropriately. */
298 fbuf->b_ops = &xfs_inode_buf_ops; 298 fbuf->b_ops = &xfs_inode_buf_ops;
@@ -380,7 +380,7 @@ xfs_ialloc_ag_alloc(
380 newlen = args.mp->m_ialloc_inos; 380 newlen = args.mp->m_ialloc_inos;
381 if (args.mp->m_maxicount && 381 if (args.mp->m_maxicount &&
382 args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) 382 args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
383 return XFS_ERROR(ENOSPC); 383 return -ENOSPC;
384 args.minlen = args.maxlen = args.mp->m_ialloc_blks; 384 args.minlen = args.maxlen = args.mp->m_ialloc_blks;
385 /* 385 /*
386 * First try to allocate inodes contiguous with the last-allocated 386 * First try to allocate inodes contiguous with the last-allocated
@@ -1385,7 +1385,7 @@ xfs_dialloc(
1385 if (error) { 1385 if (error) {
1386 xfs_trans_brelse(tp, agbp); 1386 xfs_trans_brelse(tp, agbp);
1387 1387
1388 if (error != ENOSPC) 1388 if (error != -ENOSPC)
1389 goto out_error; 1389 goto out_error;
1390 1390
1391 xfs_perag_put(pag); 1391 xfs_perag_put(pag);
@@ -1416,7 +1416,7 @@ nextag:
1416 agno = 0; 1416 agno = 0;
1417 if (agno == start_agno) { 1417 if (agno == start_agno) {
1418 *inop = NULLFSINO; 1418 *inop = NULLFSINO;
1419 return noroom ? ENOSPC : 0; 1419 return noroom ? -ENOSPC : 0;
1420 } 1420 }
1421 } 1421 }
1422 1422
@@ -1425,7 +1425,7 @@ out_alloc:
1425 return xfs_dialloc_ag(tp, agbp, parent, inop); 1425 return xfs_dialloc_ag(tp, agbp, parent, inop);
1426out_error: 1426out_error:
1427 xfs_perag_put(pag); 1427 xfs_perag_put(pag);
1428 return XFS_ERROR(error); 1428 return error;
1429} 1429}
1430 1430
1431STATIC int 1431STATIC int
@@ -1682,7 +1682,7 @@ xfs_difree(
1682 xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", 1682 xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
1683 __func__, agno, mp->m_sb.sb_agcount); 1683 __func__, agno, mp->m_sb.sb_agcount);
1684 ASSERT(0); 1684 ASSERT(0);
1685 return XFS_ERROR(EINVAL); 1685 return -EINVAL;
1686 } 1686 }
1687 agino = XFS_INO_TO_AGINO(mp, inode); 1687 agino = XFS_INO_TO_AGINO(mp, inode);
1688 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { 1688 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
@@ -1690,14 +1690,14 @@ xfs_difree(
1690 __func__, (unsigned long long)inode, 1690 __func__, (unsigned long long)inode,
1691 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); 1691 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
1692 ASSERT(0); 1692 ASSERT(0);
1693 return XFS_ERROR(EINVAL); 1693 return -EINVAL;
1694 } 1694 }
1695 agbno = XFS_AGINO_TO_AGBNO(mp, agino); 1695 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
1696 if (agbno >= mp->m_sb.sb_agblocks) { 1696 if (agbno >= mp->m_sb.sb_agblocks) {
1697 xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", 1697 xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
1698 __func__, agbno, mp->m_sb.sb_agblocks); 1698 __func__, agbno, mp->m_sb.sb_agblocks);
1699 ASSERT(0); 1699 ASSERT(0);
1700 return XFS_ERROR(EINVAL); 1700 return -EINVAL;
1701 } 1701 }
1702 /* 1702 /*
1703 * Get the allocation group header. 1703 * Get the allocation group header.
@@ -1769,7 +1769,7 @@ xfs_imap_lookup(
1769 if (i) 1769 if (i)
1770 error = xfs_inobt_get_rec(cur, &rec, &i); 1770 error = xfs_inobt_get_rec(cur, &rec, &i);
1771 if (!error && i == 0) 1771 if (!error && i == 0)
1772 error = EINVAL; 1772 error = -EINVAL;
1773 } 1773 }
1774 1774
1775 xfs_trans_brelse(tp, agbp); 1775 xfs_trans_brelse(tp, agbp);
@@ -1780,12 +1780,12 @@ xfs_imap_lookup(
1780 /* check that the returned record contains the required inode */ 1780 /* check that the returned record contains the required inode */
1781 if (rec.ir_startino > agino || 1781 if (rec.ir_startino > agino ||
1782 rec.ir_startino + mp->m_ialloc_inos <= agino) 1782 rec.ir_startino + mp->m_ialloc_inos <= agino)
1783 return EINVAL; 1783 return -EINVAL;
1784 1784
1785 /* for untrusted inodes check it is allocated first */ 1785 /* for untrusted inodes check it is allocated first */
1786 if ((flags & XFS_IGET_UNTRUSTED) && 1786 if ((flags & XFS_IGET_UNTRUSTED) &&
1787 (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) 1787 (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
1788 return EINVAL; 1788 return -EINVAL;
1789 1789
1790 *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino); 1790 *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
1791 *offset_agbno = agbno - *chunk_agbno; 1791 *offset_agbno = agbno - *chunk_agbno;
@@ -1829,7 +1829,7 @@ xfs_imap(
1829 * as they can be invalid without implying corruption. 1829 * as they can be invalid without implying corruption.
1830 */ 1830 */
1831 if (flags & XFS_IGET_UNTRUSTED) 1831 if (flags & XFS_IGET_UNTRUSTED)
1832 return XFS_ERROR(EINVAL); 1832 return -EINVAL;
1833 if (agno >= mp->m_sb.sb_agcount) { 1833 if (agno >= mp->m_sb.sb_agcount) {
1834 xfs_alert(mp, 1834 xfs_alert(mp,
1835 "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)", 1835 "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
@@ -1849,7 +1849,7 @@ xfs_imap(
1849 } 1849 }
1850 xfs_stack_trace(); 1850 xfs_stack_trace();
1851#endif /* DEBUG */ 1851#endif /* DEBUG */
1852 return XFS_ERROR(EINVAL); 1852 return -EINVAL;
1853 } 1853 }
1854 1854
1855 blks_per_cluster = xfs_icluster_size_fsb(mp); 1855 blks_per_cluster = xfs_icluster_size_fsb(mp);
@@ -1922,7 +1922,7 @@ out_map:
1922 __func__, (unsigned long long) imap->im_blkno, 1922 __func__, (unsigned long long) imap->im_blkno,
1923 (unsigned long long) imap->im_len, 1923 (unsigned long long) imap->im_len,
1924 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); 1924 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1925 return XFS_ERROR(EINVAL); 1925 return -EINVAL;
1926 } 1926 }
1927 return 0; 1927 return 0;
1928} 1928}
@@ -2072,11 +2072,11 @@ xfs_agi_read_verify(
2072 2072
2073 if (xfs_sb_version_hascrc(&mp->m_sb) && 2073 if (xfs_sb_version_hascrc(&mp->m_sb) &&
2074 !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) 2074 !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
2075 xfs_buf_ioerror(bp, EFSBADCRC); 2075 xfs_buf_ioerror(bp, -EFSBADCRC);
2076 else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp, 2076 else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
2077 XFS_ERRTAG_IALLOC_READ_AGI, 2077 XFS_ERRTAG_IALLOC_READ_AGI,
2078 XFS_RANDOM_IALLOC_READ_AGI)) 2078 XFS_RANDOM_IALLOC_READ_AGI))
2079 xfs_buf_ioerror(bp, EFSCORRUPTED); 2079 xfs_buf_ioerror(bp, -EFSCORRUPTED);
2080 2080
2081 if (bp->b_error) 2081 if (bp->b_error)
2082 xfs_verifier_error(bp); 2082 xfs_verifier_error(bp);
@@ -2090,7 +2090,7 @@ xfs_agi_write_verify(
2090 struct xfs_buf_log_item *bip = bp->b_fspriv; 2090 struct xfs_buf_log_item *bip = bp->b_fspriv;
2091 2091
2092 if (!xfs_agi_verify(bp)) { 2092 if (!xfs_agi_verify(bp)) {
2093 xfs_buf_ioerror(bp, EFSCORRUPTED); 2093 xfs_buf_ioerror(bp, -EFSCORRUPTED);
2094 xfs_verifier_error(bp); 2094 xfs_verifier_error(bp);
2095 return; 2095 return;
2096 } 2096 }
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 95ad1c002d60..95ad1c002d60 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 726f83a681a5..c9b06f30fe86 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -272,9 +272,9 @@ xfs_inobt_read_verify(
272 struct xfs_buf *bp) 272 struct xfs_buf *bp)
273{ 273{
274 if (!xfs_btree_sblock_verify_crc(bp)) 274 if (!xfs_btree_sblock_verify_crc(bp))
275 xfs_buf_ioerror(bp, EFSBADCRC); 275 xfs_buf_ioerror(bp, -EFSBADCRC);
276 else if (!xfs_inobt_verify(bp)) 276 else if (!xfs_inobt_verify(bp))
277 xfs_buf_ioerror(bp, EFSCORRUPTED); 277 xfs_buf_ioerror(bp, -EFSCORRUPTED);
278 278
279 if (bp->b_error) { 279 if (bp->b_error) {
280 trace_xfs_btree_corrupt(bp, _RET_IP_); 280 trace_xfs_btree_corrupt(bp, _RET_IP_);
@@ -288,7 +288,7 @@ xfs_inobt_write_verify(
288{ 288{
289 if (!xfs_inobt_verify(bp)) { 289 if (!xfs_inobt_verify(bp)) {
290 trace_xfs_btree_corrupt(bp, _RET_IP_); 290 trace_xfs_btree_corrupt(bp, _RET_IP_);
291 xfs_buf_ioerror(bp, EFSCORRUPTED); 291 xfs_buf_ioerror(bp, -EFSCORRUPTED);
292 xfs_verifier_error(bp); 292 xfs_verifier_error(bp);
293 return; 293 return;
294 } 294 }
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index d7ebea72c2d0..d7ebea72c2d0 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index cb35ae41d4a1..f18fd2da49f7 100644
--- a/fs/xfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -101,7 +101,7 @@ xfs_inode_buf_verify(
101 return; 101 return;
102 } 102 }
103 103
104 xfs_buf_ioerror(bp, EFSCORRUPTED); 104 xfs_buf_ioerror(bp, -EFSCORRUPTED);
105 xfs_verifier_error(bp); 105 xfs_verifier_error(bp);
106#ifdef DEBUG 106#ifdef DEBUG
107 xfs_alert(mp, 107 xfs_alert(mp,
@@ -174,14 +174,14 @@ xfs_imap_to_bp(
174 (int)imap->im_len, buf_flags, &bp, 174 (int)imap->im_len, buf_flags, &bp,
175 &xfs_inode_buf_ops); 175 &xfs_inode_buf_ops);
176 if (error) { 176 if (error) {
177 if (error == EAGAIN) { 177 if (error == -EAGAIN) {
178 ASSERT(buf_flags & XBF_TRYLOCK); 178 ASSERT(buf_flags & XBF_TRYLOCK);
179 return error; 179 return error;
180 } 180 }
181 181
182 if (error == EFSCORRUPTED && 182 if (error == -EFSCORRUPTED &&
183 (iget_flags & XFS_IGET_UNTRUSTED)) 183 (iget_flags & XFS_IGET_UNTRUSTED))
184 return XFS_ERROR(EINVAL); 184 return -EINVAL;
185 185
186 xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", 186 xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
187 __func__, error); 187 __func__, error);
@@ -390,7 +390,7 @@ xfs_iread(
390 __func__, ip->i_ino); 390 __func__, ip->i_ino);
391 391
392 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); 392 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
393 error = XFS_ERROR(EFSCORRUPTED); 393 error = -EFSCORRUPTED;
394 goto out_brelse; 394 goto out_brelse;
395 } 395 }
396 396
diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 9308c47f2a52..9308c47f2a52 100644
--- a/fs/xfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index b031e8d0d928..6a00f7fed69d 100644
--- a/fs/xfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -102,7 +102,7 @@ xfs_iformat_fork(
102 be64_to_cpu(dip->di_nblocks)); 102 be64_to_cpu(dip->di_nblocks));
103 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, 103 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
104 ip->i_mount, dip); 104 ip->i_mount, dip);
105 return XFS_ERROR(EFSCORRUPTED); 105 return -EFSCORRUPTED;
106 } 106 }
107 107
108 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { 108 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
@@ -111,7 +111,7 @@ xfs_iformat_fork(
111 dip->di_forkoff); 111 dip->di_forkoff);
112 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, 112 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
113 ip->i_mount, dip); 113 ip->i_mount, dip);
114 return XFS_ERROR(EFSCORRUPTED); 114 return -EFSCORRUPTED;
115 } 115 }
116 116
117 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && 117 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
@@ -121,7 +121,7 @@ xfs_iformat_fork(
121 ip->i_ino); 121 ip->i_ino);
122 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", 122 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
123 XFS_ERRLEVEL_LOW, ip->i_mount, dip); 123 XFS_ERRLEVEL_LOW, ip->i_mount, dip);
124 return XFS_ERROR(EFSCORRUPTED); 124 return -EFSCORRUPTED;
125 } 125 }
126 126
127 switch (ip->i_d.di_mode & S_IFMT) { 127 switch (ip->i_d.di_mode & S_IFMT) {
@@ -132,7 +132,7 @@ xfs_iformat_fork(
132 if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) { 132 if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
133 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, 133 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
134 ip->i_mount, dip); 134 ip->i_mount, dip);
135 return XFS_ERROR(EFSCORRUPTED); 135 return -EFSCORRUPTED;
136 } 136 }
137 ip->i_d.di_size = 0; 137 ip->i_d.di_size = 0;
138 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); 138 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
@@ -153,7 +153,7 @@ xfs_iformat_fork(
153 XFS_CORRUPTION_ERROR("xfs_iformat(4)", 153 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
154 XFS_ERRLEVEL_LOW, 154 XFS_ERRLEVEL_LOW,
155 ip->i_mount, dip); 155 ip->i_mount, dip);
156 return XFS_ERROR(EFSCORRUPTED); 156 return -EFSCORRUPTED;
157 } 157 }
158 158
159 di_size = be64_to_cpu(dip->di_size); 159 di_size = be64_to_cpu(dip->di_size);
@@ -166,7 +166,7 @@ xfs_iformat_fork(
166 XFS_CORRUPTION_ERROR("xfs_iformat(5)", 166 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
167 XFS_ERRLEVEL_LOW, 167 XFS_ERRLEVEL_LOW,
168 ip->i_mount, dip); 168 ip->i_mount, dip);
169 return XFS_ERROR(EFSCORRUPTED); 169 return -EFSCORRUPTED;
170 } 170 }
171 171
172 size = (int)di_size; 172 size = (int)di_size;
@@ -181,13 +181,13 @@ xfs_iformat_fork(
181 default: 181 default:
182 XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, 182 XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
183 ip->i_mount); 183 ip->i_mount);
184 return XFS_ERROR(EFSCORRUPTED); 184 return -EFSCORRUPTED;
185 } 185 }
186 break; 186 break;
187 187
188 default: 188 default:
189 XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); 189 XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
190 return XFS_ERROR(EFSCORRUPTED); 190 return -EFSCORRUPTED;
191 } 191 }
192 if (error) { 192 if (error) {
193 return error; 193 return error;
@@ -211,7 +211,7 @@ xfs_iformat_fork(
211 XFS_CORRUPTION_ERROR("xfs_iformat(8)", 211 XFS_CORRUPTION_ERROR("xfs_iformat(8)",
212 XFS_ERRLEVEL_LOW, 212 XFS_ERRLEVEL_LOW,
213 ip->i_mount, dip); 213 ip->i_mount, dip);
214 return XFS_ERROR(EFSCORRUPTED); 214 return -EFSCORRUPTED;
215 } 215 }
216 216
217 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); 217 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
@@ -223,7 +223,7 @@ xfs_iformat_fork(
223 error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); 223 error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
224 break; 224 break;
225 default: 225 default:
226 error = XFS_ERROR(EFSCORRUPTED); 226 error = -EFSCORRUPTED;
227 break; 227 break;
228 } 228 }
229 if (error) { 229 if (error) {
@@ -266,7 +266,7 @@ xfs_iformat_local(
266 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); 266 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
267 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, 267 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
268 ip->i_mount, dip); 268 ip->i_mount, dip);
269 return XFS_ERROR(EFSCORRUPTED); 269 return -EFSCORRUPTED;
270 } 270 }
271 ifp = XFS_IFORK_PTR(ip, whichfork); 271 ifp = XFS_IFORK_PTR(ip, whichfork);
272 real_size = 0; 272 real_size = 0;
@@ -322,7 +322,7 @@ xfs_iformat_extents(
322 (unsigned long long) ip->i_ino, nex); 322 (unsigned long long) ip->i_ino, nex);
323 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, 323 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
324 ip->i_mount, dip); 324 ip->i_mount, dip);
325 return XFS_ERROR(EFSCORRUPTED); 325 return -EFSCORRUPTED;
326 } 326 }
327 327
328 ifp->if_real_bytes = 0; 328 ifp->if_real_bytes = 0;
@@ -350,7 +350,7 @@ xfs_iformat_extents(
350 XFS_ERROR_REPORT("xfs_iformat_extents(2)", 350 XFS_ERROR_REPORT("xfs_iformat_extents(2)",
351 XFS_ERRLEVEL_LOW, 351 XFS_ERRLEVEL_LOW,
352 ip->i_mount); 352 ip->i_mount);
353 return XFS_ERROR(EFSCORRUPTED); 353 return -EFSCORRUPTED;
354 } 354 }
355 } 355 }
356 ifp->if_flags |= XFS_IFEXTENTS; 356 ifp->if_flags |= XFS_IFEXTENTS;
@@ -399,7 +399,7 @@ xfs_iformat_btree(
399 (unsigned long long) ip->i_ino); 399 (unsigned long long) ip->i_ino);
400 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, 400 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
401 mp, dip); 401 mp, dip);
402 return XFS_ERROR(EFSCORRUPTED); 402 return -EFSCORRUPTED;
403 } 403 }
404 404
405 ifp->if_broot_bytes = size; 405 ifp->if_broot_bytes = size;
@@ -436,7 +436,7 @@ xfs_iread_extents(
436 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { 436 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
437 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, 437 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
438 ip->i_mount); 438 ip->i_mount);
439 return XFS_ERROR(EFSCORRUPTED); 439 return -EFSCORRUPTED;
440 } 440 }
441 nextents = XFS_IFORK_NEXTENTS(ip, whichfork); 441 nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
442 ifp = XFS_IFORK_PTR(ip, whichfork); 442 ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -528,7 +528,7 @@ xfs_iroot_realloc(
528 ifp->if_broot_bytes = (int)new_size; 528 ifp->if_broot_bytes = (int)new_size;
529 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= 529 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
530 XFS_IFORK_SIZE(ip, whichfork)); 530 XFS_IFORK_SIZE(ip, whichfork));
531 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); 531 memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
532 return; 532 return;
533 } 533 }
534 534
@@ -575,7 +575,7 @@ xfs_iroot_realloc(
575 ifp->if_broot_bytes); 575 ifp->if_broot_bytes);
576 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, 576 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
577 (int)new_size); 577 (int)new_size);
578 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); 578 memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
579 } 579 }
580 kmem_free(ifp->if_broot); 580 kmem_free(ifp->if_broot);
581 ifp->if_broot = new_broot; 581 ifp->if_broot = new_broot;
@@ -1692,7 +1692,7 @@ xfs_iext_idx_to_irec(
1692 } 1692 }
1693 *idxp = page_idx; 1693 *idxp = page_idx;
1694 *erp_idxp = erp_idx; 1694 *erp_idxp = erp_idx;
1695 return(erp); 1695 return erp;
1696} 1696}
1697 1697
1698/* 1698/*
diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 7d3b1ed6dcbe..7d3b1ed6dcbe 100644
--- a/fs/xfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/libxfs/xfs_inum.h
index 90efdaf1706f..4ff2278e147a 100644
--- a/fs/xfs/xfs_inum.h
+++ b/fs/xfs/libxfs/xfs_inum.h
@@ -54,11 +54,7 @@ struct xfs_mount;
54#define XFS_OFFBNO_TO_AGINO(mp,b,o) \ 54#define XFS_OFFBNO_TO_AGINO(mp,b,o) \
55 ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o))) 55 ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
56 56
57#if XFS_BIG_INUMS
58#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) 57#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
59#else
60#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL))
61#endif
62#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL)) 58#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL))
63 59
64#endif /* __XFS_INUM_H__ */ 60#endif /* __XFS_INUM_H__ */
diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index f0969c77bdbe..aff12f2d4428 100644
--- a/fs/xfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -380,7 +380,7 @@ typedef struct xfs_icdinode {
380 xfs_ictimestamp_t di_mtime; /* time last modified */ 380 xfs_ictimestamp_t di_mtime; /* time last modified */
381 xfs_ictimestamp_t di_ctime; /* time created/inode modified */ 381 xfs_ictimestamp_t di_ctime; /* time created/inode modified */
382 xfs_fsize_t di_size; /* number of bytes in file */ 382 xfs_fsize_t di_size; /* number of bytes in file */
383 xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */ 383 xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
384 xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ 384 xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
385 xfs_extnum_t di_nextents; /* number of extents in data fork */ 385 xfs_extnum_t di_nextents; /* number of extents in data fork */
386 xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ 386 xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
@@ -516,7 +516,7 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
516 * EFI/EFD log format definitions 516 * EFI/EFD log format definitions
517 */ 517 */
518typedef struct xfs_extent { 518typedef struct xfs_extent {
519 xfs_dfsbno_t ext_start; 519 xfs_fsblock_t ext_start;
520 xfs_extlen_t ext_len; 520 xfs_extlen_t ext_len;
521} xfs_extent_t; 521} xfs_extent_t;
522 522
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 1c55ccbb379d..1c55ccbb379d 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index ee7e0e80246b..ee7e0e80246b 100644
--- a/fs/xfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 137e20937077..1b0a08379759 100644
--- a/fs/xfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -98,8 +98,6 @@ typedef __uint16_t xfs_qwarncnt_t;
98#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ 98#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
99 XFS_GQUOTA_ACTIVE | \ 99 XFS_GQUOTA_ACTIVE | \
100 XFS_PQUOTA_ACTIVE)) 100 XFS_PQUOTA_ACTIVE))
101#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
102 XFS_PQUOTA_ACTIVE))
103#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) 101#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
104#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) 102#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
105#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) 103#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
diff --git a/fs/xfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index f4dd697cac08..f4dd697cac08 100644
--- a/fs/xfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 7703fa6770ff..ad525a5623a4 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -186,13 +186,13 @@ xfs_mount_validate_sb(
186 */ 186 */
187 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 187 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
188 xfs_warn(mp, "bad magic number"); 188 xfs_warn(mp, "bad magic number");
189 return XFS_ERROR(EWRONGFS); 189 return -EWRONGFS;
190 } 190 }
191 191
192 192
193 if (!xfs_sb_good_version(sbp)) { 193 if (!xfs_sb_good_version(sbp)) {
194 xfs_warn(mp, "bad version"); 194 xfs_warn(mp, "bad version");
195 return XFS_ERROR(EWRONGFS); 195 return -EWRONGFS;
196 } 196 }
197 197
198 /* 198 /*
@@ -220,7 +220,7 @@ xfs_mount_validate_sb(
220 xfs_warn(mp, 220 xfs_warn(mp,
221"Attempted to mount read-only compatible filesystem read-write.\n" 221"Attempted to mount read-only compatible filesystem read-write.\n"
222"Filesystem can only be safely mounted read only."); 222"Filesystem can only be safely mounted read only.");
223 return XFS_ERROR(EINVAL); 223 return -EINVAL;
224 } 224 }
225 } 225 }
226 if (xfs_sb_has_incompat_feature(sbp, 226 if (xfs_sb_has_incompat_feature(sbp,
@@ -230,7 +230,7 @@ xfs_mount_validate_sb(
230"Filesystem can not be safely mounted by this kernel.", 230"Filesystem can not be safely mounted by this kernel.",
231 (sbp->sb_features_incompat & 231 (sbp->sb_features_incompat &
232 XFS_SB_FEAT_INCOMPAT_UNKNOWN)); 232 XFS_SB_FEAT_INCOMPAT_UNKNOWN));
233 return XFS_ERROR(EINVAL); 233 return -EINVAL;
234 } 234 }
235 } 235 }
236 236
@@ -238,13 +238,13 @@ xfs_mount_validate_sb(
238 if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) { 238 if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
239 xfs_notice(mp, 239 xfs_notice(mp,
240 "Version 5 of Super block has XFS_OQUOTA bits."); 240 "Version 5 of Super block has XFS_OQUOTA bits.");
241 return XFS_ERROR(EFSCORRUPTED); 241 return -EFSCORRUPTED;
242 } 242 }
243 } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | 243 } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
244 XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { 244 XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
245 xfs_notice(mp, 245 xfs_notice(mp,
246"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits."); 246"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.");
247 return XFS_ERROR(EFSCORRUPTED); 247 return -EFSCORRUPTED;
248 } 248 }
249 249
250 if (unlikely( 250 if (unlikely(
@@ -252,7 +252,7 @@ xfs_mount_validate_sb(
252 xfs_warn(mp, 252 xfs_warn(mp,
253 "filesystem is marked as having an external log; " 253 "filesystem is marked as having an external log; "
254 "specify logdev on the mount command line."); 254 "specify logdev on the mount command line.");
255 return XFS_ERROR(EINVAL); 255 return -EINVAL;
256 } 256 }
257 257
258 if (unlikely( 258 if (unlikely(
@@ -260,7 +260,7 @@ xfs_mount_validate_sb(
260 xfs_warn(mp, 260 xfs_warn(mp,
261 "filesystem is marked as having an internal log; " 261 "filesystem is marked as having an internal log; "
262 "do not specify logdev on the mount command line."); 262 "do not specify logdev on the mount command line.");
263 return XFS_ERROR(EINVAL); 263 return -EINVAL;
264 } 264 }
265 265
266 /* 266 /*
@@ -294,7 +294,7 @@ xfs_mount_validate_sb(
294 sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) || 294 sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) ||
295 sbp->sb_shared_vn != 0)) { 295 sbp->sb_shared_vn != 0)) {
296 xfs_notice(mp, "SB sanity check failed"); 296 xfs_notice(mp, "SB sanity check failed");
297 return XFS_ERROR(EFSCORRUPTED); 297 return -EFSCORRUPTED;
298 } 298 }
299 299
300 /* 300 /*
@@ -305,7 +305,7 @@ xfs_mount_validate_sb(
305 "File system with blocksize %d bytes. " 305 "File system with blocksize %d bytes. "
306 "Only pagesize (%ld) or less will currently work.", 306 "Only pagesize (%ld) or less will currently work.",
307 sbp->sb_blocksize, PAGE_SIZE); 307 sbp->sb_blocksize, PAGE_SIZE);
308 return XFS_ERROR(ENOSYS); 308 return -ENOSYS;
309 } 309 }
310 310
311 /* 311 /*
@@ -320,19 +320,19 @@ xfs_mount_validate_sb(
320 default: 320 default:
321 xfs_warn(mp, "inode size of %d bytes not supported", 321 xfs_warn(mp, "inode size of %d bytes not supported",
322 sbp->sb_inodesize); 322 sbp->sb_inodesize);
323 return XFS_ERROR(ENOSYS); 323 return -ENOSYS;
324 } 324 }
325 325
326 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || 326 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
327 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 327 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
328 xfs_warn(mp, 328 xfs_warn(mp,
329 "file system too large to be mounted on this system."); 329 "file system too large to be mounted on this system.");
330 return XFS_ERROR(EFBIG); 330 return -EFBIG;
331 } 331 }
332 332
333 if (check_inprogress && sbp->sb_inprogress) { 333 if (check_inprogress && sbp->sb_inprogress) {
334 xfs_warn(mp, "Offline file system operation in progress!"); 334 xfs_warn(mp, "Offline file system operation in progress!");
335 return XFS_ERROR(EFSCORRUPTED); 335 return -EFSCORRUPTED;
336 } 336 }
337 return 0; 337 return 0;
338} 338}
@@ -386,10 +386,11 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp)
386 } 386 }
387} 387}
388 388
389void 389static void
390xfs_sb_from_disk( 390__xfs_sb_from_disk(
391 struct xfs_sb *to, 391 struct xfs_sb *to,
392 xfs_dsb_t *from) 392 xfs_dsb_t *from,
393 bool convert_xquota)
393{ 394{
394 to->sb_magicnum = be32_to_cpu(from->sb_magicnum); 395 to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
395 to->sb_blocksize = be32_to_cpu(from->sb_blocksize); 396 to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
@@ -445,6 +446,17 @@ xfs_sb_from_disk(
445 to->sb_pad = 0; 446 to->sb_pad = 0;
446 to->sb_pquotino = be64_to_cpu(from->sb_pquotino); 447 to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
447 to->sb_lsn = be64_to_cpu(from->sb_lsn); 448 to->sb_lsn = be64_to_cpu(from->sb_lsn);
449 /* Convert on-disk flags to in-memory flags? */
450 if (convert_xquota)
451 xfs_sb_quota_from_disk(to);
452}
453
454void
455xfs_sb_from_disk(
456 struct xfs_sb *to,
457 xfs_dsb_t *from)
458{
459 __xfs_sb_from_disk(to, from, true);
448} 460}
449 461
450static inline void 462static inline void
@@ -577,7 +589,11 @@ xfs_sb_verify(
577 struct xfs_mount *mp = bp->b_target->bt_mount; 589 struct xfs_mount *mp = bp->b_target->bt_mount;
578 struct xfs_sb sb; 590 struct xfs_sb sb;
579 591
580 xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); 592 /*
593 * Use call variant which doesn't convert quota flags from disk
594 * format, because xfs_mount_validate_sb checks the on-disk flags.
595 */
596 __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false);
581 597
582 /* 598 /*
583 * Only check the in progress field for the primary superblock as 599 * Only check the in progress field for the primary superblock as
@@ -620,7 +636,7 @@ xfs_sb_read_verify(
620 /* Only fail bad secondaries on a known V5 filesystem */ 636 /* Only fail bad secondaries on a known V5 filesystem */
621 if (bp->b_bn == XFS_SB_DADDR || 637 if (bp->b_bn == XFS_SB_DADDR ||
622 xfs_sb_version_hascrc(&mp->m_sb)) { 638 xfs_sb_version_hascrc(&mp->m_sb)) {
623 error = EFSBADCRC; 639 error = -EFSBADCRC;
624 goto out_error; 640 goto out_error;
625 } 641 }
626 } 642 }
@@ -630,7 +646,7 @@ xfs_sb_read_verify(
630out_error: 646out_error:
631 if (error) { 647 if (error) {
632 xfs_buf_ioerror(bp, error); 648 xfs_buf_ioerror(bp, error);
633 if (error == EFSCORRUPTED || error == EFSBADCRC) 649 if (error == -EFSCORRUPTED || error == -EFSBADCRC)
634 xfs_verifier_error(bp); 650 xfs_verifier_error(bp);
635 } 651 }
636} 652}
@@ -653,7 +669,7 @@ xfs_sb_quiet_read_verify(
653 return; 669 return;
654 } 670 }
655 /* quietly fail */ 671 /* quietly fail */
656 xfs_buf_ioerror(bp, EWRONGFS); 672 xfs_buf_ioerror(bp, -EWRONGFS);
657} 673}
658 674
659static void 675static void
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index c43c2d609a24..2e739708afd3 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -87,11 +87,11 @@ struct xfs_trans;
87typedef struct xfs_sb { 87typedef struct xfs_sb {
88 __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ 88 __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */
89 __uint32_t sb_blocksize; /* logical block size, bytes */ 89 __uint32_t sb_blocksize; /* logical block size, bytes */
90 xfs_drfsbno_t sb_dblocks; /* number of data blocks */ 90 xfs_rfsblock_t sb_dblocks; /* number of data blocks */
91 xfs_drfsbno_t sb_rblocks; /* number of realtime blocks */ 91 xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
92 xfs_drtbno_t sb_rextents; /* number of realtime extents */ 92 xfs_rtblock_t sb_rextents; /* number of realtime extents */
93 uuid_t sb_uuid; /* file system unique id */ 93 uuid_t sb_uuid; /* file system unique id */
94 xfs_dfsbno_t sb_logstart; /* starting block of log if internal */ 94 xfs_fsblock_t sb_logstart; /* starting block of log if internal */
95 xfs_ino_t sb_rootino; /* root inode number */ 95 xfs_ino_t sb_rootino; /* root inode number */
96 xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */ 96 xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */
97 xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */ 97 xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */
diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 82404da2ca67..82404da2ca67 100644
--- a/fs/xfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 23c2f2577c8d..5782f037eab4 100644
--- a/fs/xfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -133,9 +133,9 @@ xfs_symlink_read_verify(
133 return; 133 return;
134 134
135 if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF)) 135 if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
136 xfs_buf_ioerror(bp, EFSBADCRC); 136 xfs_buf_ioerror(bp, -EFSBADCRC);
137 else if (!xfs_symlink_verify(bp)) 137 else if (!xfs_symlink_verify(bp))
138 xfs_buf_ioerror(bp, EFSCORRUPTED); 138 xfs_buf_ioerror(bp, -EFSCORRUPTED);
139 139
140 if (bp->b_error) 140 if (bp->b_error)
141 xfs_verifier_error(bp); 141 xfs_verifier_error(bp);
@@ -153,7 +153,7 @@ xfs_symlink_write_verify(
153 return; 153 return;
154 154
155 if (!xfs_symlink_verify(bp)) { 155 if (!xfs_symlink_verify(bp)) {
156 xfs_buf_ioerror(bp, EFSCORRUPTED); 156 xfs_buf_ioerror(bp, -EFSCORRUPTED);
157 xfs_verifier_error(bp); 157 xfs_verifier_error(bp);
158 return; 158 return;
159 } 159 }
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index f2bda7c76b8a..f2bda7c76b8a 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 1097d14cd583..1097d14cd583 100644
--- a/fs/xfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index bf9c4579334d..bf9c4579334d 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 6888ad886ff6..a65fa5dde6e9 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -152,7 +152,7 @@ xfs_get_acl(struct inode *inode, int type)
152 if (!xfs_acl) 152 if (!xfs_acl)
153 return ERR_PTR(-ENOMEM); 153 return ERR_PTR(-ENOMEM);
154 154
155 error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl, 155 error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
156 &len, ATTR_ROOT); 156 &len, ATTR_ROOT);
157 if (error) { 157 if (error) {
158 /* 158 /*
@@ -210,7 +210,7 @@ __xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
210 len -= sizeof(struct xfs_acl_entry) * 210 len -= sizeof(struct xfs_acl_entry) *
211 (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); 211 (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
212 212
213 error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, 213 error = xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
214 len, ATTR_ROOT); 214 len, ATTR_ROOT);
215 215
216 kmem_free(xfs_acl); 216 kmem_free(xfs_acl);
@@ -218,7 +218,7 @@ __xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
218 /* 218 /*
219 * A NULL ACL argument means we want to remove the ACL. 219 * A NULL ACL argument means we want to remove the ACL.
220 */ 220 */
221 error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT); 221 error = xfs_attr_remove(ip, ea_name, ATTR_ROOT);
222 222
223 /* 223 /*
224 * If the attribute didn't exist to start with that's fine. 224 * If the attribute didn't exist to start with that's fine.
@@ -244,7 +244,7 @@ xfs_set_mode(struct inode *inode, umode_t mode)
244 iattr.ia_mode = mode; 244 iattr.ia_mode = mode;
245 iattr.ia_ctime = current_fs_time(inode->i_sb); 245 iattr.ia_ctime = current_fs_time(inode->i_sb);
246 246
247 error = -xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL); 247 error = xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
248 } 248 }
249 249
250 return error; 250 return error;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index faaf716e2080..11e9b4caa54f 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -240,7 +240,7 @@ xfs_end_io(
240 240
241done: 241done:
242 if (error) 242 if (error)
243 ioend->io_error = -error; 243 ioend->io_error = error;
244 xfs_destroy_ioend(ioend); 244 xfs_destroy_ioend(ioend);
245} 245}
246 246
@@ -308,14 +308,14 @@ xfs_map_blocks(
308 int nimaps = 1; 308 int nimaps = 1;
309 309
310 if (XFS_FORCED_SHUTDOWN(mp)) 310 if (XFS_FORCED_SHUTDOWN(mp))
311 return -XFS_ERROR(EIO); 311 return -EIO;
312 312
313 if (type == XFS_IO_UNWRITTEN) 313 if (type == XFS_IO_UNWRITTEN)
314 bmapi_flags |= XFS_BMAPI_IGSTATE; 314 bmapi_flags |= XFS_BMAPI_IGSTATE;
315 315
316 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 316 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
317 if (nonblocking) 317 if (nonblocking)
318 return -XFS_ERROR(EAGAIN); 318 return -EAGAIN;
319 xfs_ilock(ip, XFS_ILOCK_SHARED); 319 xfs_ilock(ip, XFS_ILOCK_SHARED);
320 } 320 }
321 321
@@ -332,14 +332,14 @@ xfs_map_blocks(
332 xfs_iunlock(ip, XFS_ILOCK_SHARED); 332 xfs_iunlock(ip, XFS_ILOCK_SHARED);
333 333
334 if (error) 334 if (error)
335 return -XFS_ERROR(error); 335 return error;
336 336
337 if (type == XFS_IO_DELALLOC && 337 if (type == XFS_IO_DELALLOC &&
338 (!nimaps || isnullstartblock(imap->br_startblock))) { 338 (!nimaps || isnullstartblock(imap->br_startblock))) {
339 error = xfs_iomap_write_allocate(ip, offset, imap); 339 error = xfs_iomap_write_allocate(ip, offset, imap);
340 if (!error) 340 if (!error)
341 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); 341 trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
342 return -XFS_ERROR(error); 342 return error;
343 } 343 }
344 344
345#ifdef DEBUG 345#ifdef DEBUG
@@ -502,7 +502,7 @@ xfs_submit_ioend(
502 * time. 502 * time.
503 */ 503 */
504 if (fail) { 504 if (fail) {
505 ioend->io_error = -fail; 505 ioend->io_error = fail;
506 xfs_finish_ioend(ioend); 506 xfs_finish_ioend(ioend);
507 continue; 507 continue;
508 } 508 }
@@ -1253,7 +1253,7 @@ __xfs_get_blocks(
1253 int new = 0; 1253 int new = 0;
1254 1254
1255 if (XFS_FORCED_SHUTDOWN(mp)) 1255 if (XFS_FORCED_SHUTDOWN(mp))
1256 return -XFS_ERROR(EIO); 1256 return -EIO;
1257 1257
1258 offset = (xfs_off_t)iblock << inode->i_blkbits; 1258 offset = (xfs_off_t)iblock << inode->i_blkbits;
1259 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1259 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1302,7 +1302,7 @@ __xfs_get_blocks(
1302 error = xfs_iomap_write_direct(ip, offset, size, 1302 error = xfs_iomap_write_direct(ip, offset, size,
1303 &imap, nimaps); 1303 &imap, nimaps);
1304 if (error) 1304 if (error)
1305 return -error; 1305 return error;
1306 new = 1; 1306 new = 1;
1307 } else { 1307 } else {
1308 /* 1308 /*
@@ -1415,7 +1415,7 @@ __xfs_get_blocks(
1415 1415
1416out_unlock: 1416out_unlock:
1417 xfs_iunlock(ip, lockmode); 1417 xfs_iunlock(ip, lockmode);
1418 return -error; 1418 return error;
1419} 1419}
1420 1420
1421int 1421int
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 09480c57f069..aa2a8b1838a2 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -76,7 +76,7 @@ xfs_attr3_leaf_freextent(
76 error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, 76 error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
77 &map, &nmap, XFS_BMAPI_ATTRFORK); 77 &map, &nmap, XFS_BMAPI_ATTRFORK);
78 if (error) { 78 if (error) {
79 return(error); 79 return error;
80 } 80 }
81 ASSERT(nmap == 1); 81 ASSERT(nmap == 1);
82 ASSERT(map.br_startblock != DELAYSTARTBLOCK); 82 ASSERT(map.br_startblock != DELAYSTARTBLOCK);
@@ -95,21 +95,21 @@ xfs_attr3_leaf_freextent(
95 dp->i_mount->m_ddev_targp, 95 dp->i_mount->m_ddev_targp,
96 dblkno, dblkcnt, 0); 96 dblkno, dblkcnt, 0);
97 if (!bp) 97 if (!bp)
98 return ENOMEM; 98 return -ENOMEM;
99 xfs_trans_binval(*trans, bp); 99 xfs_trans_binval(*trans, bp);
100 /* 100 /*
101 * Roll to next transaction. 101 * Roll to next transaction.
102 */ 102 */
103 error = xfs_trans_roll(trans, dp); 103 error = xfs_trans_roll(trans, dp);
104 if (error) 104 if (error)
105 return (error); 105 return error;
106 } 106 }
107 107
108 tblkno += map.br_blockcount; 108 tblkno += map.br_blockcount;
109 tblkcnt -= map.br_blockcount; 109 tblkcnt -= map.br_blockcount;
110 } 110 }
111 111
112 return(0); 112 return 0;
113} 113}
114 114
115/* 115/*
@@ -227,7 +227,7 @@ xfs_attr3_node_inactive(
227 */ 227 */
228 if (level > XFS_DA_NODE_MAXDEPTH) { 228 if (level > XFS_DA_NODE_MAXDEPTH) {
229 xfs_trans_brelse(*trans, bp); /* no locks for later trans */ 229 xfs_trans_brelse(*trans, bp); /* no locks for later trans */
230 return XFS_ERROR(EIO); 230 return -EIO;
231 } 231 }
232 232
233 node = bp->b_addr; 233 node = bp->b_addr;
@@ -256,7 +256,7 @@ xfs_attr3_node_inactive(
256 error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp, 256 error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
257 XFS_ATTR_FORK); 257 XFS_ATTR_FORK);
258 if (error) 258 if (error)
259 return(error); 259 return error;
260 if (child_bp) { 260 if (child_bp) {
261 /* save for re-read later */ 261 /* save for re-read later */
262 child_blkno = XFS_BUF_ADDR(child_bp); 262 child_blkno = XFS_BUF_ADDR(child_bp);
@@ -277,7 +277,7 @@ xfs_attr3_node_inactive(
277 child_bp); 277 child_bp);
278 break; 278 break;
279 default: 279 default:
280 error = XFS_ERROR(EIO); 280 error = -EIO;
281 xfs_trans_brelse(*trans, child_bp); 281 xfs_trans_brelse(*trans, child_bp);
282 break; 282 break;
283 } 283 }
@@ -360,7 +360,7 @@ xfs_attr3_root_inactive(
360 error = xfs_attr3_leaf_inactive(trans, dp, bp); 360 error = xfs_attr3_leaf_inactive(trans, dp, bp);
361 break; 361 break;
362 default: 362 default:
363 error = XFS_ERROR(EIO); 363 error = -EIO;
364 xfs_trans_brelse(*trans, bp); 364 xfs_trans_brelse(*trans, bp);
365 break; 365 break;
366 } 366 }
@@ -414,7 +414,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
414 error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0); 414 error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
415 if (error) { 415 if (error) {
416 xfs_trans_cancel(trans, 0); 416 xfs_trans_cancel(trans, 0);
417 return(error); 417 return error;
418 } 418 }
419 xfs_ilock(dp, XFS_ILOCK_EXCL); 419 xfs_ilock(dp, XFS_ILOCK_EXCL);
420 420
@@ -443,10 +443,10 @@ xfs_attr_inactive(xfs_inode_t *dp)
443 error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); 443 error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
444 xfs_iunlock(dp, XFS_ILOCK_EXCL); 444 xfs_iunlock(dp, XFS_ILOCK_EXCL);
445 445
446 return(error); 446 return error;
447 447
448out: 448out:
449 xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); 449 xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
450 xfs_iunlock(dp, XFS_ILOCK_EXCL); 450 xfs_iunlock(dp, XFS_ILOCK_EXCL);
451 return(error); 451 return error;
452} 452}
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 90e2eeb21207..62db83ab6cbc 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -50,11 +50,11 @@ xfs_attr_shortform_compare(const void *a, const void *b)
50 sa = (xfs_attr_sf_sort_t *)a; 50 sa = (xfs_attr_sf_sort_t *)a;
51 sb = (xfs_attr_sf_sort_t *)b; 51 sb = (xfs_attr_sf_sort_t *)b;
52 if (sa->hash < sb->hash) { 52 if (sa->hash < sb->hash) {
53 return(-1); 53 return -1;
54 } else if (sa->hash > sb->hash) { 54 } else if (sa->hash > sb->hash) {
55 return(1); 55 return 1;
56 } else { 56 } else {
57 return(sa->entno - sb->entno); 57 return sa->entno - sb->entno;
58 } 58 }
59} 59}
60 60
@@ -86,7 +86,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
86 sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; 86 sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
87 ASSERT(sf != NULL); 87 ASSERT(sf != NULL);
88 if (!sf->hdr.count) 88 if (!sf->hdr.count)
89 return(0); 89 return 0;
90 cursor = context->cursor; 90 cursor = context->cursor;
91 ASSERT(cursor != NULL); 91 ASSERT(cursor != NULL);
92 92
@@ -124,7 +124,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
124 sfe = XFS_ATTR_SF_NEXTENTRY(sfe); 124 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
125 } 125 }
126 trace_xfs_attr_list_sf_all(context); 126 trace_xfs_attr_list_sf_all(context);
127 return(0); 127 return 0;
128 } 128 }
129 129
130 /* do no more for a search callback */ 130 /* do no more for a search callback */
@@ -150,7 +150,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
150 XFS_ERRLEVEL_LOW, 150 XFS_ERRLEVEL_LOW,
151 context->dp->i_mount, sfe); 151 context->dp->i_mount, sfe);
152 kmem_free(sbuf); 152 kmem_free(sbuf);
153 return XFS_ERROR(EFSCORRUPTED); 153 return -EFSCORRUPTED;
154 } 154 }
155 155
156 sbp->entno = i; 156 sbp->entno = i;
@@ -188,7 +188,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
188 } 188 }
189 if (i == nsbuf) { 189 if (i == nsbuf) {
190 kmem_free(sbuf); 190 kmem_free(sbuf);
191 return(0); 191 return 0;
192 } 192 }
193 193
194 /* 194 /*
@@ -213,7 +213,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
213 } 213 }
214 214
215 kmem_free(sbuf); 215 kmem_free(sbuf);
216 return(0); 216 return 0;
217} 217}
218 218
219STATIC int 219STATIC int
@@ -243,8 +243,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
243 if (cursor->blkno > 0) { 243 if (cursor->blkno > 0) {
244 error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1, 244 error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1,
245 &bp, XFS_ATTR_FORK); 245 &bp, XFS_ATTR_FORK);
246 if ((error != 0) && (error != EFSCORRUPTED)) 246 if ((error != 0) && (error != -EFSCORRUPTED))
247 return(error); 247 return error;
248 if (bp) { 248 if (bp) {
249 struct xfs_attr_leaf_entry *entries; 249 struct xfs_attr_leaf_entry *entries;
250 250
@@ -295,7 +295,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
295 cursor->blkno, -1, &bp, 295 cursor->blkno, -1, &bp,
296 XFS_ATTR_FORK); 296 XFS_ATTR_FORK);
297 if (error) 297 if (error)
298 return(error); 298 return error;
299 node = bp->b_addr; 299 node = bp->b_addr;
300 magic = be16_to_cpu(node->hdr.info.magic); 300 magic = be16_to_cpu(node->hdr.info.magic);
301 if (magic == XFS_ATTR_LEAF_MAGIC || 301 if (magic == XFS_ATTR_LEAF_MAGIC ||
@@ -308,7 +308,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
308 context->dp->i_mount, 308 context->dp->i_mount,
309 node); 309 node);
310 xfs_trans_brelse(NULL, bp); 310 xfs_trans_brelse(NULL, bp);
311 return XFS_ERROR(EFSCORRUPTED); 311 return -EFSCORRUPTED;
312 } 312 }
313 313
314 dp->d_ops->node_hdr_from_disk(&nodehdr, node); 314 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
@@ -496,11 +496,11 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
496 context->cursor->blkno = 0; 496 context->cursor->blkno = 0;
497 error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp); 497 error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
498 if (error) 498 if (error)
499 return XFS_ERROR(error); 499 return error;
500 500
501 error = xfs_attr3_leaf_list_int(bp, context); 501 error = xfs_attr3_leaf_list_int(bp, context);
502 xfs_trans_brelse(NULL, bp); 502 xfs_trans_brelse(NULL, bp);
503 return XFS_ERROR(error); 503 return error;
504} 504}
505 505
506int 506int
@@ -514,7 +514,7 @@ xfs_attr_list_int(
514 XFS_STATS_INC(xs_attr_list); 514 XFS_STATS_INC(xs_attr_list);
515 515
516 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 516 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
517 return EIO; 517 return -EIO;
518 518
519 /* 519 /*
520 * Decide on what work routines to call based on the inode size. 520 * Decide on what work routines to call based on the inode size.
@@ -616,16 +616,16 @@ xfs_attr_list(
616 * Validate the cursor. 616 * Validate the cursor.
617 */ 617 */
618 if (cursor->pad1 || cursor->pad2) 618 if (cursor->pad1 || cursor->pad2)
619 return(XFS_ERROR(EINVAL)); 619 return -EINVAL;
620 if ((cursor->initted == 0) && 620 if ((cursor->initted == 0) &&
621 (cursor->hashval || cursor->blkno || cursor->offset)) 621 (cursor->hashval || cursor->blkno || cursor->offset))
622 return XFS_ERROR(EINVAL); 622 return -EINVAL;
623 623
624 /* 624 /*
625 * Check for a properly aligned buffer. 625 * Check for a properly aligned buffer.
626 */ 626 */
627 if (((long)buffer) & (sizeof(int)-1)) 627 if (((long)buffer) & (sizeof(int)-1))
628 return XFS_ERROR(EFAULT); 628 return -EFAULT;
629 if (flags & ATTR_KERNOVAL) 629 if (flags & ATTR_KERNOVAL)
630 bufsize = 0; 630 bufsize = 0;
631 631
@@ -648,6 +648,6 @@ xfs_attr_list(
648 alist->al_offset[0] = context.bufsize; 648 alist->al_offset[0] = context.bufsize;
649 649
650 error = xfs_attr_list_int(&context); 650 error = xfs_attr_list_int(&context);
651 ASSERT(error >= 0); 651 ASSERT(error <= 0);
652 return error; 652 return error;
653} 653}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 64731ef3324d..2f1e30d39a35 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -133,7 +133,7 @@ xfs_bmap_finish(
133 mp = ntp->t_mountp; 133 mp = ntp->t_mountp;
134 if (!XFS_FORCED_SHUTDOWN(mp)) 134 if (!XFS_FORCED_SHUTDOWN(mp))
135 xfs_force_shutdown(mp, 135 xfs_force_shutdown(mp,
136 (error == EFSCORRUPTED) ? 136 (error == -EFSCORRUPTED) ?
137 SHUTDOWN_CORRUPT_INCORE : 137 SHUTDOWN_CORRUPT_INCORE :
138 SHUTDOWN_META_IO_ERROR); 138 SHUTDOWN_META_IO_ERROR);
139 return error; 139 return error;
@@ -365,7 +365,7 @@ xfs_bmap_count_tree(
365 xfs_trans_brelse(tp, bp); 365 xfs_trans_brelse(tp, bp);
366 XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", 366 XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
367 XFS_ERRLEVEL_LOW, mp); 367 XFS_ERRLEVEL_LOW, mp);
368 return XFS_ERROR(EFSCORRUPTED); 368 return -EFSCORRUPTED;
369 } 369 }
370 xfs_trans_brelse(tp, bp); 370 xfs_trans_brelse(tp, bp);
371 } else { 371 } else {
@@ -425,14 +425,14 @@ xfs_bmap_count_blocks(
425 ASSERT(level > 0); 425 ASSERT(level > 0);
426 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); 426 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
427 bno = be64_to_cpu(*pp); 427 bno = be64_to_cpu(*pp);
428 ASSERT(bno != NULLDFSBNO); 428 ASSERT(bno != NULLFSBLOCK);
429 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); 429 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
430 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); 430 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
431 431
432 if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { 432 if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
433 XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, 433 XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
434 mp); 434 mp);
435 return XFS_ERROR(EFSCORRUPTED); 435 return -EFSCORRUPTED;
436 } 436 }
437 437
438 return 0; 438 return 0;
@@ -524,13 +524,13 @@ xfs_getbmap(
524 if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && 524 if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
525 ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && 525 ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
526 ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) 526 ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
527 return XFS_ERROR(EINVAL); 527 return -EINVAL;
528 } else if (unlikely( 528 } else if (unlikely(
529 ip->i_d.di_aformat != 0 && 529 ip->i_d.di_aformat != 0 &&
530 ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) { 530 ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
531 XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW, 531 XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
532 ip->i_mount); 532 ip->i_mount);
533 return XFS_ERROR(EFSCORRUPTED); 533 return -EFSCORRUPTED;
534 } 534 }
535 535
536 prealloced = 0; 536 prealloced = 0;
@@ -539,7 +539,7 @@ xfs_getbmap(
539 if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && 539 if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
540 ip->i_d.di_format != XFS_DINODE_FMT_BTREE && 540 ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
541 ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) 541 ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
542 return XFS_ERROR(EINVAL); 542 return -EINVAL;
543 543
544 if (xfs_get_extsz_hint(ip) || 544 if (xfs_get_extsz_hint(ip) ||
545 ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ 545 ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
@@ -559,26 +559,26 @@ xfs_getbmap(
559 bmv->bmv_entries = 0; 559 bmv->bmv_entries = 0;
560 return 0; 560 return 0;
561 } else if (bmv->bmv_length < 0) { 561 } else if (bmv->bmv_length < 0) {
562 return XFS_ERROR(EINVAL); 562 return -EINVAL;
563 } 563 }
564 564
565 nex = bmv->bmv_count - 1; 565 nex = bmv->bmv_count - 1;
566 if (nex <= 0) 566 if (nex <= 0)
567 return XFS_ERROR(EINVAL); 567 return -EINVAL;
568 bmvend = bmv->bmv_offset + bmv->bmv_length; 568 bmvend = bmv->bmv_offset + bmv->bmv_length;
569 569
570 570
571 if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) 571 if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
572 return XFS_ERROR(ENOMEM); 572 return -ENOMEM;
573 out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0); 573 out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
574 if (!out) 574 if (!out)
575 return XFS_ERROR(ENOMEM); 575 return -ENOMEM;
576 576
577 xfs_ilock(ip, XFS_IOLOCK_SHARED); 577 xfs_ilock(ip, XFS_IOLOCK_SHARED);
578 if (whichfork == XFS_DATA_FORK) { 578 if (whichfork == XFS_DATA_FORK) {
579 if (!(iflags & BMV_IF_DELALLOC) && 579 if (!(iflags & BMV_IF_DELALLOC) &&
580 (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) { 580 (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
581 error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); 581 error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
582 if (error) 582 if (error)
583 goto out_unlock_iolock; 583 goto out_unlock_iolock;
584 584
@@ -611,7 +611,7 @@ xfs_getbmap(
611 /* 611 /*
612 * Allocate enough space to handle "subnex" maps at a time. 612 * Allocate enough space to handle "subnex" maps at a time.
613 */ 613 */
614 error = ENOMEM; 614 error = -ENOMEM;
615 subnex = 16; 615 subnex = 16;
616 map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); 616 map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
617 if (!map) 617 if (!map)
@@ -809,7 +809,7 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
809 * have speculative prealloc/delalloc blocks to remove. 809 * have speculative prealloc/delalloc blocks to remove.
810 */ 810 */
811 if (VFS_I(ip)->i_size == 0 && 811 if (VFS_I(ip)->i_size == 0 &&
812 VN_CACHED(VFS_I(ip)) == 0 && 812 VFS_I(ip)->i_mapping->nrpages == 0 &&
813 ip->i_delayed_blks == 0) 813 ip->i_delayed_blks == 0)
814 return false; 814 return false;
815 815
@@ -882,7 +882,7 @@ xfs_free_eofblocks(
882 if (need_iolock) { 882 if (need_iolock) {
883 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 883 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
884 xfs_trans_cancel(tp, 0); 884 xfs_trans_cancel(tp, 0);
885 return EAGAIN; 885 return -EAGAIN;
886 } 886 }
887 } 887 }
888 888
@@ -955,14 +955,14 @@ xfs_alloc_file_space(
955 trace_xfs_alloc_file_space(ip); 955 trace_xfs_alloc_file_space(ip);
956 956
957 if (XFS_FORCED_SHUTDOWN(mp)) 957 if (XFS_FORCED_SHUTDOWN(mp))
958 return XFS_ERROR(EIO); 958 return -EIO;
959 959
960 error = xfs_qm_dqattach(ip, 0); 960 error = xfs_qm_dqattach(ip, 0);
961 if (error) 961 if (error)
962 return error; 962 return error;
963 963
964 if (len <= 0) 964 if (len <= 0)
965 return XFS_ERROR(EINVAL); 965 return -EINVAL;
966 966
967 rt = XFS_IS_REALTIME_INODE(ip); 967 rt = XFS_IS_REALTIME_INODE(ip);
968 extsz = xfs_get_extsz_hint(ip); 968 extsz = xfs_get_extsz_hint(ip);
@@ -1028,7 +1028,7 @@ xfs_alloc_file_space(
1028 /* 1028 /*
1029 * Free the transaction structure. 1029 * Free the transaction structure.
1030 */ 1030 */
1031 ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); 1031 ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1032 xfs_trans_cancel(tp, 0); 1032 xfs_trans_cancel(tp, 0);
1033 break; 1033 break;
1034 } 1034 }
@@ -1065,7 +1065,7 @@ xfs_alloc_file_space(
1065 allocated_fsb = imapp->br_blockcount; 1065 allocated_fsb = imapp->br_blockcount;
1066 1066
1067 if (nimaps == 0) { 1067 if (nimaps == 0) {
1068 error = XFS_ERROR(ENOSPC); 1068 error = -ENOSPC;
1069 break; 1069 break;
1070 } 1070 }
1071 1071
@@ -1126,7 +1126,7 @@ xfs_zero_remaining_bytes(
1126 mp->m_rtdev_targp : mp->m_ddev_targp, 1126 mp->m_rtdev_targp : mp->m_ddev_targp,
1127 BTOBB(mp->m_sb.sb_blocksize), 0); 1127 BTOBB(mp->m_sb.sb_blocksize), 0);
1128 if (!bp) 1128 if (!bp)
1129 return XFS_ERROR(ENOMEM); 1129 return -ENOMEM;
1130 1130
1131 xfs_buf_unlock(bp); 1131 xfs_buf_unlock(bp);
1132 1132
@@ -1158,7 +1158,7 @@ xfs_zero_remaining_bytes(
1158 XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); 1158 XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
1159 1159
1160 if (XFS_FORCED_SHUTDOWN(mp)) { 1160 if (XFS_FORCED_SHUTDOWN(mp)) {
1161 error = XFS_ERROR(EIO); 1161 error = -EIO;
1162 break; 1162 break;
1163 } 1163 }
1164 xfs_buf_iorequest(bp); 1164 xfs_buf_iorequest(bp);
@@ -1176,7 +1176,7 @@ xfs_zero_remaining_bytes(
1176 XFS_BUF_WRITE(bp); 1176 XFS_BUF_WRITE(bp);
1177 1177
1178 if (XFS_FORCED_SHUTDOWN(mp)) { 1178 if (XFS_FORCED_SHUTDOWN(mp)) {
1179 error = XFS_ERROR(EIO); 1179 error = -EIO;
1180 break; 1180 break;
1181 } 1181 }
1182 xfs_buf_iorequest(bp); 1182 xfs_buf_iorequest(bp);
@@ -1234,7 +1234,7 @@ xfs_free_file_space(
1234 1234
1235 rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); 1235 rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
1236 ioffset = offset & ~(rounding - 1); 1236 ioffset = offset & ~(rounding - 1);
1237 error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 1237 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1238 ioffset, -1); 1238 ioffset, -1);
1239 if (error) 1239 if (error)
1240 goto out; 1240 goto out;
@@ -1315,7 +1315,7 @@ xfs_free_file_space(
1315 /* 1315 /*
1316 * Free the transaction structure. 1316 * Free the transaction structure.
1317 */ 1317 */
1318 ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); 1318 ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1319 xfs_trans_cancel(tp, 0); 1319 xfs_trans_cancel(tp, 0);
1320 break; 1320 break;
1321 } 1321 }
@@ -1557,14 +1557,14 @@ xfs_swap_extents_check_format(
1557 /* Should never get a local format */ 1557 /* Should never get a local format */
1558 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || 1558 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
1559 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) 1559 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
1560 return EINVAL; 1560 return -EINVAL;
1561 1561
1562 /* 1562 /*
1563 * if the target inode has less extents that then temporary inode then 1563 * if the target inode has less extents that then temporary inode then
1564 * why did userspace call us? 1564 * why did userspace call us?
1565 */ 1565 */
1566 if (ip->i_d.di_nextents < tip->i_d.di_nextents) 1566 if (ip->i_d.di_nextents < tip->i_d.di_nextents)
1567 return EINVAL; 1567 return -EINVAL;
1568 1568
1569 /* 1569 /*
1570 * if the target inode is in extent form and the temp inode is in btree 1570 * if the target inode is in extent form and the temp inode is in btree
@@ -1573,19 +1573,19 @@ xfs_swap_extents_check_format(
1573 */ 1573 */
1574 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 1574 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1575 tip->i_d.di_format == XFS_DINODE_FMT_BTREE) 1575 tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
1576 return EINVAL; 1576 return -EINVAL;
1577 1577
1578 /* Check temp in extent form to max in target */ 1578 /* Check temp in extent form to max in target */
1579 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 1579 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1580 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > 1580 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
1581 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) 1581 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1582 return EINVAL; 1582 return -EINVAL;
1583 1583
1584 /* Check target in extent form to max in temp */ 1584 /* Check target in extent form to max in temp */
1585 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 1585 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1586 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > 1586 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
1587 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) 1587 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1588 return EINVAL; 1588 return -EINVAL;
1589 1589
1590 /* 1590 /*
1591 * If we are in a btree format, check that the temp root block will fit 1591 * If we are in a btree format, check that the temp root block will fit
@@ -1599,26 +1599,50 @@ xfs_swap_extents_check_format(
1599 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { 1599 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1600 if (XFS_IFORK_BOFF(ip) && 1600 if (XFS_IFORK_BOFF(ip) &&
1601 XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip)) 1601 XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
1602 return EINVAL; 1602 return -EINVAL;
1603 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= 1603 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
1604 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) 1604 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1605 return EINVAL; 1605 return -EINVAL;
1606 } 1606 }
1607 1607
1608 /* Reciprocal target->temp btree format checks */ 1608 /* Reciprocal target->temp btree format checks */
1609 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { 1609 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1610 if (XFS_IFORK_BOFF(tip) && 1610 if (XFS_IFORK_BOFF(tip) &&
1611 XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) 1611 XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
1612 return EINVAL; 1612 return -EINVAL;
1613 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= 1613 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
1614 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) 1614 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1615 return EINVAL; 1615 return -EINVAL;
1616 } 1616 }
1617 1617
1618 return 0; 1618 return 0;
1619} 1619}
1620 1620
1621int 1621int
1622xfs_swap_extent_flush(
1623 struct xfs_inode *ip)
1624{
1625 int error;
1626
1627 error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
1628 if (error)
1629 return error;
1630 truncate_pagecache_range(VFS_I(ip), 0, -1);
1631
1632 /* Verify O_DIRECT for ftmp */
1633 if (VFS_I(ip)->i_mapping->nrpages)
1634 return -EINVAL;
1635
1636 /*
1637 * Don't try to swap extents on mmap()d files because we can't lock
1638 * out races against page faults safely.
1639 */
1640 if (mapping_mapped(VFS_I(ip)->i_mapping))
1641 return -EBUSY;
1642 return 0;
1643}
1644
1645int
1622xfs_swap_extents( 1646xfs_swap_extents(
1623 xfs_inode_t *ip, /* target inode */ 1647 xfs_inode_t *ip, /* target inode */
1624 xfs_inode_t *tip, /* tmp inode */ 1648 xfs_inode_t *tip, /* tmp inode */
@@ -1633,51 +1657,57 @@ xfs_swap_extents(
1633 int aforkblks = 0; 1657 int aforkblks = 0;
1634 int taforkblks = 0; 1658 int taforkblks = 0;
1635 __uint64_t tmp; 1659 __uint64_t tmp;
1660 int lock_flags;
1636 1661
1637 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); 1662 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
1638 if (!tempifp) { 1663 if (!tempifp) {
1639 error = XFS_ERROR(ENOMEM); 1664 error = -ENOMEM;
1640 goto out; 1665 goto out;
1641 } 1666 }
1642 1667
1643 /* 1668 /*
1644 * we have to do two separate lock calls here to keep lockdep 1669 * Lock up the inodes against other IO and truncate to begin with.
1645 * happy. If we try to get all the locks in one call, lock will 1670 * Then we can ensure the inodes are flushed and have no page cache
1646 * report false positives when we drop the ILOCK and regain them 1671 * safely. Once we have done this we can take the ilocks and do the rest
1647 * below. 1672 * of the checks.
1648 */ 1673 */
1674 lock_flags = XFS_IOLOCK_EXCL;
1649 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); 1675 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
1650 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
1651 1676
1652 /* Verify that both files have the same format */ 1677 /* Verify that both files have the same format */
1653 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { 1678 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
1654 error = XFS_ERROR(EINVAL); 1679 error = -EINVAL;
1655 goto out_unlock; 1680 goto out_unlock;
1656 } 1681 }
1657 1682
1658 /* Verify both files are either real-time or non-realtime */ 1683 /* Verify both files are either real-time or non-realtime */
1659 if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { 1684 if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
1660 error = XFS_ERROR(EINVAL); 1685 error = -EINVAL;
1661 goto out_unlock; 1686 goto out_unlock;
1662 } 1687 }
1663 1688
1664 error = -filemap_write_and_wait(VFS_I(tip)->i_mapping); 1689 error = xfs_swap_extent_flush(ip);
1690 if (error)
1691 goto out_unlock;
1692 error = xfs_swap_extent_flush(tip);
1665 if (error) 1693 if (error)
1666 goto out_unlock; 1694 goto out_unlock;
1667 truncate_pagecache_range(VFS_I(tip), 0, -1);
1668 1695
1669 /* Verify O_DIRECT for ftmp */ 1696 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
1670 if (VN_CACHED(VFS_I(tip)) != 0) { 1697 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
1671 error = XFS_ERROR(EINVAL); 1698 if (error) {
1699 xfs_trans_cancel(tp, 0);
1672 goto out_unlock; 1700 goto out_unlock;
1673 } 1701 }
1702 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
1703 lock_flags |= XFS_ILOCK_EXCL;
1674 1704
1675 /* Verify all data are being swapped */ 1705 /* Verify all data are being swapped */
1676 if (sxp->sx_offset != 0 || 1706 if (sxp->sx_offset != 0 ||
1677 sxp->sx_length != ip->i_d.di_size || 1707 sxp->sx_length != ip->i_d.di_size ||
1678 sxp->sx_length != tip->i_d.di_size) { 1708 sxp->sx_length != tip->i_d.di_size) {
1679 error = XFS_ERROR(EFAULT); 1709 error = -EFAULT;
1680 goto out_unlock; 1710 goto out_trans_cancel;
1681 } 1711 }
1682 1712
1683 trace_xfs_swap_extent_before(ip, 0); 1713 trace_xfs_swap_extent_before(ip, 0);
@@ -1689,7 +1719,7 @@ xfs_swap_extents(
1689 xfs_notice(mp, 1719 xfs_notice(mp,
1690 "%s: inode 0x%llx format is incompatible for exchanging.", 1720 "%s: inode 0x%llx format is incompatible for exchanging.",
1691 __func__, ip->i_ino); 1721 __func__, ip->i_ino);
1692 goto out_unlock; 1722 goto out_trans_cancel;
1693 } 1723 }
1694 1724
1695 /* 1725 /*
@@ -1703,43 +1733,9 @@ xfs_swap_extents(
1703 (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || 1733 (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
1704 (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || 1734 (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
1705 (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { 1735 (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
1706 error = XFS_ERROR(EBUSY); 1736 error = -EBUSY;
1707 goto out_unlock; 1737 goto out_trans_cancel;
1708 }
1709
1710 /* We need to fail if the file is memory mapped. Once we have tossed
1711 * all existing pages, the page fault will have no option
1712 * but to go to the filesystem for pages. By making the page fault call
1713 * vop_read (or write in the case of autogrow) they block on the iolock
1714 * until we have switched the extents.
1715 */
1716 if (VN_MAPPED(VFS_I(ip))) {
1717 error = XFS_ERROR(EBUSY);
1718 goto out_unlock;
1719 }
1720
1721 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1722 xfs_iunlock(tip, XFS_ILOCK_EXCL);
1723
1724 /*
1725 * There is a race condition here since we gave up the
1726 * ilock. However, the data fork will not change since
1727 * we have the iolock (locked for truncation too) so we
1728 * are safe. We don't really care if non-io related
1729 * fields change.
1730 */
1731 truncate_pagecache_range(VFS_I(ip), 0, -1);
1732
1733 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
1734 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
1735 if (error) {
1736 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1737 xfs_iunlock(tip, XFS_IOLOCK_EXCL);
1738 xfs_trans_cancel(tp, 0);
1739 goto out;
1740 } 1738 }
1741 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
1742
1743 /* 1739 /*
1744 * Count the number of extended attribute blocks 1740 * Count the number of extended attribute blocks
1745 */ 1741 */
@@ -1757,8 +1753,8 @@ xfs_swap_extents(
1757 goto out_trans_cancel; 1753 goto out_trans_cancel;
1758 } 1754 }
1759 1755
1760 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1756 xfs_trans_ijoin(tp, ip, lock_flags);
1761 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1757 xfs_trans_ijoin(tp, tip, lock_flags);
1762 1758
1763 /* 1759 /*
1764 * Before we've swapped the forks, lets set the owners of the forks 1760 * Before we've swapped the forks, lets set the owners of the forks
@@ -1887,8 +1883,8 @@ out:
1887 return error; 1883 return error;
1888 1884
1889out_unlock: 1885out_unlock:
1890 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1886 xfs_iunlock(ip, lock_flags);
1891 xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1887 xfs_iunlock(tip, lock_flags);
1892 goto out; 1888 goto out;
1893 1889
1894out_trans_cancel: 1890out_trans_cancel:
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 7a34a1ae6552..cd7b8ca9b064 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -130,7 +130,7 @@ xfs_buf_get_maps(
130 bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map), 130 bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
131 KM_NOFS); 131 KM_NOFS);
132 if (!bp->b_maps) 132 if (!bp->b_maps)
133 return ENOMEM; 133 return -ENOMEM;
134 return 0; 134 return 0;
135} 135}
136 136
@@ -344,7 +344,7 @@ retry:
344 if (unlikely(page == NULL)) { 344 if (unlikely(page == NULL)) {
345 if (flags & XBF_READ_AHEAD) { 345 if (flags & XBF_READ_AHEAD) {
346 bp->b_page_count = i; 346 bp->b_page_count = i;
347 error = ENOMEM; 347 error = -ENOMEM;
348 goto out_free_pages; 348 goto out_free_pages;
349 } 349 }
350 350
@@ -465,7 +465,7 @@ _xfs_buf_find(
465 eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); 465 eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
466 if (blkno >= eofs) { 466 if (blkno >= eofs) {
467 /* 467 /*
468 * XXX (dgc): we should really be returning EFSCORRUPTED here, 468 * XXX (dgc): we should really be returning -EFSCORRUPTED here,
469 * but none of the higher level infrastructure supports 469 * but none of the higher level infrastructure supports
470 * returning a specific error on buffer lookup failures. 470 * returning a specific error on buffer lookup failures.
471 */ 471 */
@@ -1052,8 +1052,8 @@ xfs_buf_ioerror(
1052 xfs_buf_t *bp, 1052 xfs_buf_t *bp,
1053 int error) 1053 int error)
1054{ 1054{
1055 ASSERT(error >= 0 && error <= 0xffff); 1055 ASSERT(error <= 0 && error >= -1000);
1056 bp->b_error = (unsigned short)error; 1056 bp->b_error = error;
1057 trace_xfs_buf_ioerror(bp, error, _RET_IP_); 1057 trace_xfs_buf_ioerror(bp, error, _RET_IP_);
1058} 1058}
1059 1059
@@ -1064,7 +1064,7 @@ xfs_buf_ioerror_alert(
1064{ 1064{
1065 xfs_alert(bp->b_target->bt_mount, 1065 xfs_alert(bp->b_target->bt_mount,
1066"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d", 1066"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
1067 (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length); 1067 (__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
1068} 1068}
1069 1069
1070/* 1070/*
@@ -1083,7 +1083,7 @@ xfs_bioerror(
1083 /* 1083 /*
1084 * No need to wait until the buffer is unpinned, we aren't flushing it. 1084 * No need to wait until the buffer is unpinned, we aren't flushing it.
1085 */ 1085 */
1086 xfs_buf_ioerror(bp, EIO); 1086 xfs_buf_ioerror(bp, -EIO);
1087 1087
1088 /* 1088 /*
1089 * We're calling xfs_buf_ioend, so delete XBF_DONE flag. 1089 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
@@ -1094,7 +1094,7 @@ xfs_bioerror(
1094 1094
1095 xfs_buf_ioend(bp, 0); 1095 xfs_buf_ioend(bp, 0);
1096 1096
1097 return EIO; 1097 return -EIO;
1098} 1098}
1099 1099
1100/* 1100/*
@@ -1127,13 +1127,13 @@ xfs_bioerror_relse(
1127 * There's no reason to mark error for 1127 * There's no reason to mark error for
1128 * ASYNC buffers. 1128 * ASYNC buffers.
1129 */ 1129 */
1130 xfs_buf_ioerror(bp, EIO); 1130 xfs_buf_ioerror(bp, -EIO);
1131 complete(&bp->b_iowait); 1131 complete(&bp->b_iowait);
1132 } else { 1132 } else {
1133 xfs_buf_relse(bp); 1133 xfs_buf_relse(bp);
1134 } 1134 }
1135 1135
1136 return EIO; 1136 return -EIO;
1137} 1137}
1138 1138
1139STATIC int 1139STATIC int
@@ -1199,7 +1199,7 @@ xfs_buf_bio_end_io(
1199 * buffers that require multiple bios to complete. 1199 * buffers that require multiple bios to complete.
1200 */ 1200 */
1201 if (!bp->b_error) 1201 if (!bp->b_error)
1202 xfs_buf_ioerror(bp, -error); 1202 xfs_buf_ioerror(bp, error);
1203 1203
1204 if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1204 if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1205 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1205 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
@@ -1286,7 +1286,7 @@ next_chunk:
1286 * because the caller (xfs_buf_iorequest) holds a count itself. 1286 * because the caller (xfs_buf_iorequest) holds a count itself.
1287 */ 1287 */
1288 atomic_dec(&bp->b_io_remaining); 1288 atomic_dec(&bp->b_io_remaining);
1289 xfs_buf_ioerror(bp, EIO); 1289 xfs_buf_ioerror(bp, -EIO);
1290 bio_put(bio); 1290 bio_put(bio);
1291 } 1291 }
1292 1292
@@ -1330,6 +1330,20 @@ _xfs_buf_ioapply(
1330 SHUTDOWN_CORRUPT_INCORE); 1330 SHUTDOWN_CORRUPT_INCORE);
1331 return; 1331 return;
1332 } 1332 }
1333 } else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
1334 struct xfs_mount *mp = bp->b_target->bt_mount;
1335
1336 /*
1337 * non-crc filesystems don't attach verifiers during
1338 * log recovery, so don't warn for such filesystems.
1339 */
1340 if (xfs_sb_version_hascrc(&mp->m_sb)) {
1341 xfs_warn(mp,
1342 "%s: no ops on block 0x%llx/0x%x",
1343 __func__, bp->b_bn, bp->b_length);
1344 xfs_hex_dump(bp->b_addr, 64);
1345 dump_stack();
1346 }
1333 } 1347 }
1334 } else if (bp->b_flags & XBF_READ_AHEAD) { 1348 } else if (bp->b_flags & XBF_READ_AHEAD) {
1335 rw = READA; 1349 rw = READA;
@@ -1628,7 +1642,7 @@ xfs_setsize_buftarg(
1628 xfs_warn(btp->bt_mount, 1642 xfs_warn(btp->bt_mount,
1629 "Cannot set_blocksize to %u on device %s", 1643 "Cannot set_blocksize to %u on device %s",
1630 sectorsize, name); 1644 sectorsize, name);
1631 return EINVAL; 1645 return -EINVAL;
1632 } 1646 }
1633 1647
1634 /* Set up device logical sector size mask */ 1648 /* Set up device logical sector size mask */
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 3a7a5523d3dc..c753183900b3 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -178,7 +178,7 @@ typedef struct xfs_buf {
178 atomic_t b_io_remaining; /* #outstanding I/O requests */ 178 atomic_t b_io_remaining; /* #outstanding I/O requests */
179 unsigned int b_page_count; /* size of page array */ 179 unsigned int b_page_count; /* size of page array */
180 unsigned int b_offset; /* page offset in first page */ 180 unsigned int b_offset; /* page offset in first page */
181 unsigned short b_error; /* error code on I/O */ 181 int b_error; /* error code on I/O */
182 const struct xfs_buf_ops *b_ops; 182 const struct xfs_buf_ops *b_ops;
183 183
184#ifdef XFS_BUF_LOCK_TRACKING 184#ifdef XFS_BUF_LOCK_TRACKING
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 4654338b03fc..76007deed31f 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -488,7 +488,7 @@ xfs_buf_item_unpin(
488 xfs_buf_lock(bp); 488 xfs_buf_lock(bp);
489 xfs_buf_hold(bp); 489 xfs_buf_hold(bp);
490 bp->b_flags |= XBF_ASYNC; 490 bp->b_flags |= XBF_ASYNC;
491 xfs_buf_ioerror(bp, EIO); 491 xfs_buf_ioerror(bp, -EIO);
492 XFS_BUF_UNDONE(bp); 492 XFS_BUF_UNDONE(bp);
493 xfs_buf_stale(bp); 493 xfs_buf_stale(bp);
494 xfs_buf_ioend(bp, 0); 494 xfs_buf_ioend(bp, 0);
@@ -725,7 +725,7 @@ xfs_buf_item_get_format(
725 bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), 725 bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
726 KM_SLEEP); 726 KM_SLEEP);
727 if (!bip->bli_formats) 727 if (!bip->bli_formats)
728 return ENOMEM; 728 return -ENOMEM;
729 return 0; 729 return 0;
730} 730}
731 731
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 48e99afb9cb0..f1b69edcdf31 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -95,7 +95,7 @@ xfs_dir2_sf_getdents(
95 */ 95 */
96 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { 96 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
97 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); 97 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
98 return XFS_ERROR(EIO); 98 return -EIO;
99 } 99 }
100 100
101 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); 101 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
@@ -677,7 +677,7 @@ xfs_readdir(
677 trace_xfs_readdir(dp); 677 trace_xfs_readdir(dp);
678 678
679 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 679 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
680 return XFS_ERROR(EIO); 680 return -EIO;
681 681
682 ASSERT(S_ISDIR(dp->i_d.di_mode)); 682 ASSERT(S_ISDIR(dp->i_d.di_mode));
683 XFS_STATS_INC(xs_dir_getdents); 683 XFS_STATS_INC(xs_dir_getdents);
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 4f11ef011139..13d08a1b390e 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -124,7 +124,7 @@ xfs_trim_extents(
124 } 124 }
125 125
126 trace_xfs_discard_extent(mp, agno, fbno, flen); 126 trace_xfs_discard_extent(mp, agno, fbno, flen);
127 error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0); 127 error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
128 if (error) 128 if (error)
129 goto out_del_cursor; 129 goto out_del_cursor;
130 *blocks_trimmed += flen; 130 *blocks_trimmed += flen;
@@ -166,11 +166,11 @@ xfs_ioc_trim(
166 int error, last_error = 0; 166 int error, last_error = 0;
167 167
168 if (!capable(CAP_SYS_ADMIN)) 168 if (!capable(CAP_SYS_ADMIN))
169 return -XFS_ERROR(EPERM); 169 return -EPERM;
170 if (!blk_queue_discard(q)) 170 if (!blk_queue_discard(q))
171 return -XFS_ERROR(EOPNOTSUPP); 171 return -EOPNOTSUPP;
172 if (copy_from_user(&range, urange, sizeof(range))) 172 if (copy_from_user(&range, urange, sizeof(range)))
173 return -XFS_ERROR(EFAULT); 173 return -EFAULT;
174 174
175 /* 175 /*
176 * Truncating down the len isn't actually quite correct, but using 176 * Truncating down the len isn't actually quite correct, but using
@@ -182,7 +182,7 @@ xfs_ioc_trim(
182 if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || 182 if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
183 range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) || 183 range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) ||
184 range.len < mp->m_sb.sb_blocksize) 184 range.len < mp->m_sb.sb_blocksize)
185 return -XFS_ERROR(EINVAL); 185 return -EINVAL;
186 186
187 start = BTOBB(range.start); 187 start = BTOBB(range.start);
188 end = start + BTOBBT(range.len) - 1; 188 end = start + BTOBBT(range.len) - 1;
@@ -195,7 +195,7 @@ xfs_ioc_trim(
195 end_agno = xfs_daddr_to_agno(mp, end); 195 end_agno = xfs_daddr_to_agno(mp, end);
196 196
197 for (agno = start_agno; agno <= end_agno; agno++) { 197 for (agno = start_agno; agno <= end_agno; agno++) {
198 error = -xfs_trim_extents(mp, agno, start, end, minlen, 198 error = xfs_trim_extents(mp, agno, start, end, minlen,
199 &blocks_trimmed); 199 &blocks_trimmed);
200 if (error) 200 if (error)
201 last_error = error; 201 last_error = error;
@@ -206,7 +206,7 @@ xfs_ioc_trim(
206 206
207 range.len = XFS_FSB_TO_B(mp, blocks_trimmed); 207 range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
208 if (copy_to_user(urange, &range, sizeof(range))) 208 if (copy_to_user(urange, &range, sizeof(range)))
209 return -XFS_ERROR(EFAULT); 209 return -EFAULT;
210 return 0; 210 return 0;
211} 211}
212 212
@@ -222,11 +222,11 @@ xfs_discard_extents(
222 trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, 222 trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
223 busyp->length); 223 busyp->length);
224 224
225 error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, 225 error = blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
226 XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), 226 XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
227 XFS_FSB_TO_BB(mp, busyp->length), 227 XFS_FSB_TO_BB(mp, busyp->length),
228 GFP_NOFS, 0); 228 GFP_NOFS, 0);
229 if (error && error != EOPNOTSUPP) { 229 if (error && error != -EOPNOTSUPP) {
230 xfs_info(mp, 230 xfs_info(mp,
231 "discard failed for extent [0x%llu,%u], error %d", 231 "discard failed for extent [0x%llu,%u], error %d",
232 (unsigned long long)busyp->bno, 232 (unsigned long long)busyp->bno,
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 3ee0cd43edc0..63c2de49f61d 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -327,7 +327,7 @@ xfs_qm_dqalloc(
327 */ 327 */
328 if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { 328 if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
329 xfs_iunlock(quotip, XFS_ILOCK_EXCL); 329 xfs_iunlock(quotip, XFS_ILOCK_EXCL);
330 return (ESRCH); 330 return -ESRCH;
331 } 331 }
332 332
333 xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); 333 xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
@@ -354,7 +354,7 @@ xfs_qm_dqalloc(
354 mp->m_quotainfo->qi_dqchunklen, 354 mp->m_quotainfo->qi_dqchunklen,
355 0); 355 0);
356 if (!bp) { 356 if (!bp) {
357 error = ENOMEM; 357 error = -ENOMEM;
358 goto error1; 358 goto error1;
359 } 359 }
360 bp->b_ops = &xfs_dquot_buf_ops; 360 bp->b_ops = &xfs_dquot_buf_ops;
@@ -400,7 +400,7 @@ xfs_qm_dqalloc(
400 error0: 400 error0:
401 xfs_iunlock(quotip, XFS_ILOCK_EXCL); 401 xfs_iunlock(quotip, XFS_ILOCK_EXCL);
402 402
403 return (error); 403 return error;
404} 404}
405 405
406STATIC int 406STATIC int
@@ -426,7 +426,7 @@ xfs_qm_dqrepair(
426 426
427 if (error) { 427 if (error) {
428 ASSERT(*bpp == NULL); 428 ASSERT(*bpp == NULL);
429 return XFS_ERROR(error); 429 return error;
430 } 430 }
431 (*bpp)->b_ops = &xfs_dquot_buf_ops; 431 (*bpp)->b_ops = &xfs_dquot_buf_ops;
432 432
@@ -442,7 +442,7 @@ xfs_qm_dqrepair(
442 if (error) { 442 if (error) {
443 /* repair failed, we're screwed */ 443 /* repair failed, we're screwed */
444 xfs_trans_brelse(tp, *bpp); 444 xfs_trans_brelse(tp, *bpp);
445 return XFS_ERROR(EIO); 445 return -EIO;
446 } 446 }
447 } 447 }
448 448
@@ -480,7 +480,7 @@ xfs_qm_dqtobp(
480 * didn't have the quota inode lock. 480 * didn't have the quota inode lock.
481 */ 481 */
482 xfs_iunlock(quotip, lock_mode); 482 xfs_iunlock(quotip, lock_mode);
483 return ESRCH; 483 return -ESRCH;
484 } 484 }
485 485
486 /* 486 /*
@@ -508,7 +508,7 @@ xfs_qm_dqtobp(
508 * We don't allocate unless we're asked to 508 * We don't allocate unless we're asked to
509 */ 509 */
510 if (!(flags & XFS_QMOPT_DQALLOC)) 510 if (!(flags & XFS_QMOPT_DQALLOC))
511 return ENOENT; 511 return -ENOENT;
512 512
513 ASSERT(tp); 513 ASSERT(tp);
514 error = xfs_qm_dqalloc(tpp, mp, dqp, quotip, 514 error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
@@ -530,7 +530,7 @@ xfs_qm_dqtobp(
530 mp->m_quotainfo->qi_dqchunklen, 530 mp->m_quotainfo->qi_dqchunklen,
531 0, &bp, &xfs_dquot_buf_ops); 531 0, &bp, &xfs_dquot_buf_ops);
532 532
533 if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { 533 if (error == -EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
534 xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * 534 xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
535 mp->m_quotainfo->qi_dqperchunk; 535 mp->m_quotainfo->qi_dqperchunk;
536 ASSERT(bp == NULL); 536 ASSERT(bp == NULL);
@@ -539,7 +539,7 @@ xfs_qm_dqtobp(
539 539
540 if (error) { 540 if (error) {
541 ASSERT(bp == NULL); 541 ASSERT(bp == NULL);
542 return XFS_ERROR(error); 542 return error;
543 } 543 }
544 } 544 }
545 545
@@ -547,7 +547,7 @@ xfs_qm_dqtobp(
547 *O_bpp = bp; 547 *O_bpp = bp;
548 *O_ddpp = bp->b_addr + dqp->q_bufoffset; 548 *O_ddpp = bp->b_addr + dqp->q_bufoffset;
549 549
550 return (0); 550 return 0;
551} 551}
552 552
553 553
@@ -715,7 +715,7 @@ xfs_qm_dqget(
715 if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) || 715 if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
716 (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) || 716 (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) ||
717 (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) { 717 (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
718 return (ESRCH); 718 return -ESRCH;
719 } 719 }
720 720
721#ifdef DEBUG 721#ifdef DEBUG
@@ -723,7 +723,7 @@ xfs_qm_dqget(
723 if ((xfs_dqerror_target == mp->m_ddev_targp) && 723 if ((xfs_dqerror_target == mp->m_ddev_targp) &&
724 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { 724 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
725 xfs_debug(mp, "Returning error in dqget"); 725 xfs_debug(mp, "Returning error in dqget");
726 return (EIO); 726 return -EIO;
727 } 727 }
728 } 728 }
729 729
@@ -796,14 +796,14 @@ restart:
796 } else { 796 } else {
797 /* inode stays locked on return */ 797 /* inode stays locked on return */
798 xfs_qm_dqdestroy(dqp); 798 xfs_qm_dqdestroy(dqp);
799 return XFS_ERROR(ESRCH); 799 return -ESRCH;
800 } 800 }
801 } 801 }
802 802
803 mutex_lock(&qi->qi_tree_lock); 803 mutex_lock(&qi->qi_tree_lock);
804 error = -radix_tree_insert(tree, id, dqp); 804 error = radix_tree_insert(tree, id, dqp);
805 if (unlikely(error)) { 805 if (unlikely(error)) {
806 WARN_ON(error != EEXIST); 806 WARN_ON(error != -EEXIST);
807 807
808 /* 808 /*
809 * Duplicate found. Just throw away the new dquot and start 809 * Duplicate found. Just throw away the new dquot and start
@@ -829,7 +829,7 @@ restart:
829 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); 829 ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
830 trace_xfs_dqget_miss(dqp); 830 trace_xfs_dqget_miss(dqp);
831 *O_dqpp = dqp; 831 *O_dqpp = dqp;
832 return (0); 832 return 0;
833} 833}
834 834
835/* 835/*
@@ -966,7 +966,7 @@ xfs_qm_dqflush(
966 SHUTDOWN_CORRUPT_INCORE); 966 SHUTDOWN_CORRUPT_INCORE);
967 else 967 else
968 spin_unlock(&mp->m_ail->xa_lock); 968 spin_unlock(&mp->m_ail->xa_lock);
969 error = XFS_ERROR(EIO); 969 error = -EIO;
970 goto out_unlock; 970 goto out_unlock;
971 } 971 }
972 972
@@ -974,7 +974,8 @@ xfs_qm_dqflush(
974 * Get the buffer containing the on-disk dquot 974 * Get the buffer containing the on-disk dquot
975 */ 975 */
976 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, 976 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
977 mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); 977 mp->m_quotainfo->qi_dqchunklen, 0, &bp,
978 &xfs_dquot_buf_ops);
978 if (error) 979 if (error)
979 goto out_unlock; 980 goto out_unlock;
980 981
@@ -992,7 +993,7 @@ xfs_qm_dqflush(
992 xfs_buf_relse(bp); 993 xfs_buf_relse(bp);
993 xfs_dqfunlock(dqp); 994 xfs_dqfunlock(dqp);
994 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 995 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
995 return XFS_ERROR(EIO); 996 return -EIO;
996 } 997 }
997 998
998 /* This is the only portion of data that needs to persist */ 999 /* This is the only portion of data that needs to persist */
@@ -1045,7 +1046,7 @@ xfs_qm_dqflush(
1045 1046
1046out_unlock: 1047out_unlock:
1047 xfs_dqfunlock(dqp); 1048 xfs_dqfunlock(dqp);
1048 return XFS_ERROR(EIO); 1049 return -EIO;
1049} 1050}
1050 1051
1051/* 1052/*
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 68a68f704837..c24c67e22a2a 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -139,6 +139,21 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
139 } 139 }
140} 140}
141 141
142/*
143 * Check whether a dquot is under low free space conditions. We assume the quota
144 * is enabled and enforced.
145 */
146static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp)
147{
148 int64_t freesp;
149
150 freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_res_bcount;
151 if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT])
152 return true;
153
154 return false;
155}
156
142#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) 157#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock)))
143#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) 158#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
144#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) 159#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index edac5b057d28..b92fd7bc49e3 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -27,29 +27,6 @@
27 27
28#ifdef DEBUG 28#ifdef DEBUG
29 29
30int xfs_etrap[XFS_ERROR_NTRAP] = {
31 0,
32};
33
34int
35xfs_error_trap(int e)
36{
37 int i;
38
39 if (!e)
40 return 0;
41 for (i = 0; i < XFS_ERROR_NTRAP; i++) {
42 if (xfs_etrap[i] == 0)
43 break;
44 if (e != xfs_etrap[i])
45 continue;
46 xfs_notice(NULL, "%s: error %d", __func__, e);
47 BUG();
48 break;
49 }
50 return e;
51}
52
53int xfs_etest[XFS_NUM_INJECT_ERROR]; 30int xfs_etest[XFS_NUM_INJECT_ERROR];
54int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; 31int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
55char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; 32char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
@@ -190,7 +167,7 @@ xfs_verifier_error(
190 struct xfs_mount *mp = bp->b_target->bt_mount; 167 struct xfs_mount *mp = bp->b_target->bt_mount;
191 168
192 xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx", 169 xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
193 bp->b_error == EFSBADCRC ? "CRC error" : "corruption", 170 bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
194 __return_address, bp->b_bn); 171 __return_address, bp->b_bn);
195 172
196 xfs_alert(mp, "Unmount and run xfs_repair"); 173 xfs_alert(mp, "Unmount and run xfs_repair");
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index c1c57d4a4b5d..279a76e52791 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -18,15 +18,6 @@
18#ifndef __XFS_ERROR_H__ 18#ifndef __XFS_ERROR_H__
19#define __XFS_ERROR_H__ 19#define __XFS_ERROR_H__
20 20
21#ifdef DEBUG
22#define XFS_ERROR_NTRAP 10
23extern int xfs_etrap[XFS_ERROR_NTRAP];
24extern int xfs_error_trap(int);
25#define XFS_ERROR(e) xfs_error_trap(e)
26#else
27#define XFS_ERROR(e) (e)
28#endif
29
30struct xfs_mount; 21struct xfs_mount;
31 22
32extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, 23extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
@@ -56,7 +47,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
56 if (unlikely(!fs_is_ok)) { \ 47 if (unlikely(!fs_is_ok)) { \
57 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \ 48 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
58 XFS_ERRLEVEL_LOW, NULL); \ 49 XFS_ERRLEVEL_LOW, NULL); \
59 error = XFS_ERROR(EFSCORRUPTED); \ 50 error = -EFSCORRUPTED; \
60 goto l; \ 51 goto l; \
61 } \ 52 } \
62 } 53 }
@@ -68,7 +59,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
68 if (unlikely(!fs_is_ok)) { \ 59 if (unlikely(!fs_is_ok)) { \
69 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \ 60 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \
70 XFS_ERRLEVEL_LOW, NULL); \ 61 XFS_ERRLEVEL_LOW, NULL); \
71 return XFS_ERROR(EFSCORRUPTED); \ 62 return -EFSCORRUPTED; \
72 } \ 63 } \
73 } 64 }
74 65
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 753e467aa1a5..5a6bd5d8779a 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -147,9 +147,9 @@ xfs_nfs_get_inode(
147 * We don't use ESTALE directly down the chain to not 147 * We don't use ESTALE directly down the chain to not
148 * confuse applications using bulkstat that expect EINVAL. 148 * confuse applications using bulkstat that expect EINVAL.
149 */ 149 */
150 if (error == EINVAL || error == ENOENT) 150 if (error == -EINVAL || error == -ENOENT)
151 error = ESTALE; 151 error = -ESTALE;
152 return ERR_PTR(-error); 152 return ERR_PTR(error);
153 } 153 }
154 154
155 if (ip->i_d.di_gen != generation) { 155 if (ip->i_d.di_gen != generation) {
@@ -217,7 +217,7 @@ xfs_fs_get_parent(
217 217
218 error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL); 218 error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL);
219 if (unlikely(error)) 219 if (unlikely(error))
220 return ERR_PTR(-error); 220 return ERR_PTR(error);
221 221
222 return d_obtain_alias(VFS_I(cip)); 222 return d_obtain_alias(VFS_I(cip));
223} 223}
@@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata(
237 237
238 if (!lsn) 238 if (!lsn)
239 return 0; 239 return 0;
240 return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); 240 return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
241} 241}
242 242
243const struct export_operations xfs_export_operations = { 243const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index fb7a4c1ce1c5..c4327419dc5c 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -298,7 +298,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
298 } 298 }
299 return 0; 299 return 0;
300 } 300 }
301 return EFSCORRUPTED; 301 return -EFSCORRUPTED;
302} 302}
303 303
304/* 304/*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1f66779d7a46..076b1708d134 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -38,6 +38,7 @@
38#include "xfs_trace.h" 38#include "xfs_trace.h"
39#include "xfs_log.h" 39#include "xfs_log.h"
40#include "xfs_dinode.h" 40#include "xfs_dinode.h"
41#include "xfs_icache.h"
41 42
42#include <linux/aio.h> 43#include <linux/aio.h>
43#include <linux/dcache.h> 44#include <linux/dcache.h>
@@ -155,7 +156,7 @@ xfs_dir_fsync(
155 156
156 if (!lsn) 157 if (!lsn)
157 return 0; 158 return 0;
158 return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); 159 return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
159} 160}
160 161
161STATIC int 162STATIC int
@@ -179,7 +180,7 @@ xfs_file_fsync(
179 return error; 180 return error;
180 181
181 if (XFS_FORCED_SHUTDOWN(mp)) 182 if (XFS_FORCED_SHUTDOWN(mp))
182 return -XFS_ERROR(EIO); 183 return -EIO;
183 184
184 xfs_iflags_clear(ip, XFS_ITRUNCATED); 185 xfs_iflags_clear(ip, XFS_ITRUNCATED);
185 186
@@ -225,7 +226,7 @@ xfs_file_fsync(
225 !log_flushed) 226 !log_flushed)
226 xfs_blkdev_issue_flush(mp->m_ddev_targp); 227 xfs_blkdev_issue_flush(mp->m_ddev_targp);
227 228
228 return -error; 229 return error;
229} 230}
230 231
231STATIC ssize_t 232STATIC ssize_t
@@ -246,11 +247,11 @@ xfs_file_read_iter(
246 XFS_STATS_INC(xs_read_calls); 247 XFS_STATS_INC(xs_read_calls);
247 248
248 if (unlikely(file->f_flags & O_DIRECT)) 249 if (unlikely(file->f_flags & O_DIRECT))
249 ioflags |= IO_ISDIRECT; 250 ioflags |= XFS_IO_ISDIRECT;
250 if (file->f_mode & FMODE_NOCMTIME) 251 if (file->f_mode & FMODE_NOCMTIME)
251 ioflags |= IO_INVIS; 252 ioflags |= XFS_IO_INVIS;
252 253
253 if (unlikely(ioflags & IO_ISDIRECT)) { 254 if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
254 xfs_buftarg_t *target = 255 xfs_buftarg_t *target =
255 XFS_IS_REALTIME_INODE(ip) ? 256 XFS_IS_REALTIME_INODE(ip) ?
256 mp->m_rtdev_targp : mp->m_ddev_targp; 257 mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -258,7 +259,7 @@ xfs_file_read_iter(
258 if ((pos | size) & target->bt_logical_sectormask) { 259 if ((pos | size) & target->bt_logical_sectormask) {
259 if (pos == i_size_read(inode)) 260 if (pos == i_size_read(inode))
260 return 0; 261 return 0;
261 return -XFS_ERROR(EINVAL); 262 return -EINVAL;
262 } 263 }
263 } 264 }
264 265
@@ -283,7 +284,7 @@ xfs_file_read_iter(
283 * proceeed concurrently without serialisation. 284 * proceeed concurrently without serialisation.
284 */ 285 */
285 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); 286 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
286 if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) { 287 if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
287 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); 288 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
288 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); 289 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
289 290
@@ -325,7 +326,7 @@ xfs_file_splice_read(
325 XFS_STATS_INC(xs_read_calls); 326 XFS_STATS_INC(xs_read_calls);
326 327
327 if (infilp->f_mode & FMODE_NOCMTIME) 328 if (infilp->f_mode & FMODE_NOCMTIME)
328 ioflags |= IO_INVIS; 329 ioflags |= XFS_IO_INVIS;
329 330
330 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 331 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
331 return -EIO; 332 return -EIO;
@@ -524,7 +525,7 @@ restart:
524 xfs_rw_ilock(ip, *iolock); 525 xfs_rw_ilock(ip, *iolock);
525 goto restart; 526 goto restart;
526 } 527 }
527 error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); 528 error = xfs_zero_eof(ip, *pos, i_size_read(inode));
528 if (error) 529 if (error)
529 return error; 530 return error;
530 } 531 }
@@ -594,7 +595,7 @@ xfs_file_dio_aio_write(
594 595
595 /* DIO must be aligned to device logical sector size */ 596 /* DIO must be aligned to device logical sector size */
596 if ((pos | count) & target->bt_logical_sectormask) 597 if ((pos | count) & target->bt_logical_sectormask)
597 return -XFS_ERROR(EINVAL); 598 return -EINVAL;
598 599
599 /* "unaligned" here means not aligned to a filesystem block */ 600 /* "unaligned" here means not aligned to a filesystem block */
600 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) 601 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
@@ -689,14 +690,28 @@ write_retry:
689 ret = generic_perform_write(file, from, pos); 690 ret = generic_perform_write(file, from, pos);
690 if (likely(ret >= 0)) 691 if (likely(ret >= 0))
691 iocb->ki_pos = pos + ret; 692 iocb->ki_pos = pos + ret;
693
692 /* 694 /*
693 * If we just got an ENOSPC, try to write back all dirty inodes to 695 * If we hit a space limit, try to free up some lingering preallocated
694 * convert delalloc space to free up some of the excess reserved 696 * space before returning an error. In the case of ENOSPC, first try to
695 * metadata space. 697 * write back all dirty inodes to free up some of the excess reserved
698 * metadata space. This reduces the chances that the eofblocks scan
699 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
700 * also behaves as a filter to prevent too many eofblocks scans from
701 * running at the same time.
696 */ 702 */
697 if (ret == -ENOSPC && !enospc) { 703 if (ret == -EDQUOT && !enospc) {
704 enospc = xfs_inode_free_quota_eofblocks(ip);
705 if (enospc)
706 goto write_retry;
707 } else if (ret == -ENOSPC && !enospc) {
708 struct xfs_eofblocks eofb = {0};
709
698 enospc = 1; 710 enospc = 1;
699 xfs_flush_inodes(ip->i_mount); 711 xfs_flush_inodes(ip->i_mount);
712 eofb.eof_scan_owner = ip->i_ino; /* for locking */
713 eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
714 xfs_icache_free_eofblocks(ip->i_mount, &eofb);
700 goto write_retry; 715 goto write_retry;
701 } 716 }
702 717
@@ -772,7 +787,7 @@ xfs_file_fallocate(
772 unsigned blksize_mask = (1 << inode->i_blkbits) - 1; 787 unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
773 788
774 if (offset & blksize_mask || len & blksize_mask) { 789 if (offset & blksize_mask || len & blksize_mask) {
775 error = EINVAL; 790 error = -EINVAL;
776 goto out_unlock; 791 goto out_unlock;
777 } 792 }
778 793
@@ -781,7 +796,7 @@ xfs_file_fallocate(
781 * in which case it is effectively a truncate operation 796 * in which case it is effectively a truncate operation
782 */ 797 */
783 if (offset + len >= i_size_read(inode)) { 798 if (offset + len >= i_size_read(inode)) {
784 error = EINVAL; 799 error = -EINVAL;
785 goto out_unlock; 800 goto out_unlock;
786 } 801 }
787 802
@@ -794,7 +809,7 @@ xfs_file_fallocate(
794 if (!(mode & FALLOC_FL_KEEP_SIZE) && 809 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
795 offset + len > i_size_read(inode)) { 810 offset + len > i_size_read(inode)) {
796 new_size = offset + len; 811 new_size = offset + len;
797 error = -inode_newsize_ok(inode, new_size); 812 error = inode_newsize_ok(inode, new_size);
798 if (error) 813 if (error)
799 goto out_unlock; 814 goto out_unlock;
800 } 815 }
@@ -844,7 +859,7 @@ xfs_file_fallocate(
844 859
845out_unlock: 860out_unlock:
846 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 861 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
847 return -error; 862 return error;
848} 863}
849 864
850 865
@@ -889,7 +904,7 @@ xfs_file_release(
889 struct inode *inode, 904 struct inode *inode,
890 struct file *filp) 905 struct file *filp)
891{ 906{
892 return -xfs_release(XFS_I(inode)); 907 return xfs_release(XFS_I(inode));
893} 908}
894 909
895STATIC int 910STATIC int
@@ -918,7 +933,7 @@ xfs_file_readdir(
918 933
919 error = xfs_readdir(ip, ctx, bufsize); 934 error = xfs_readdir(ip, ctx, bufsize);
920 if (error) 935 if (error)
921 return -error; 936 return error;
922 return 0; 937 return 0;
923} 938}
924 939
@@ -1184,7 +1199,7 @@ xfs_seek_data(
1184 1199
1185 isize = i_size_read(inode); 1200 isize = i_size_read(inode);
1186 if (start >= isize) { 1201 if (start >= isize) {
1187 error = ENXIO; 1202 error = -ENXIO;
1188 goto out_unlock; 1203 goto out_unlock;
1189 } 1204 }
1190 1205
@@ -1206,7 +1221,7 @@ xfs_seek_data(
1206 1221
1207 /* No extents at given offset, must be beyond EOF */ 1222 /* No extents at given offset, must be beyond EOF */
1208 if (nmap == 0) { 1223 if (nmap == 0) {
1209 error = ENXIO; 1224 error = -ENXIO;
1210 goto out_unlock; 1225 goto out_unlock;
1211 } 1226 }
1212 1227
@@ -1237,7 +1252,7 @@ xfs_seek_data(
1237 * we are reading after EOF if nothing in map[1]. 1252 * we are reading after EOF if nothing in map[1].
1238 */ 1253 */
1239 if (nmap == 1) { 1254 if (nmap == 1) {
1240 error = ENXIO; 1255 error = -ENXIO;
1241 goto out_unlock; 1256 goto out_unlock;
1242 } 1257 }
1243 1258
@@ -1250,7 +1265,7 @@ xfs_seek_data(
1250 fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; 1265 fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
1251 start = XFS_FSB_TO_B(mp, fsbno); 1266 start = XFS_FSB_TO_B(mp, fsbno);
1252 if (start >= isize) { 1267 if (start >= isize) {
1253 error = ENXIO; 1268 error = -ENXIO;
1254 goto out_unlock; 1269 goto out_unlock;
1255 } 1270 }
1256 } 1271 }
@@ -1262,7 +1277,7 @@ out_unlock:
1262 xfs_iunlock(ip, lock); 1277 xfs_iunlock(ip, lock);
1263 1278
1264 if (error) 1279 if (error)
1265 return -error; 1280 return error;
1266 return offset; 1281 return offset;
1267} 1282}
1268 1283
@@ -1282,13 +1297,13 @@ xfs_seek_hole(
1282 int error; 1297 int error;
1283 1298
1284 if (XFS_FORCED_SHUTDOWN(mp)) 1299 if (XFS_FORCED_SHUTDOWN(mp))
1285 return -XFS_ERROR(EIO); 1300 return -EIO;
1286 1301
1287 lock = xfs_ilock_data_map_shared(ip); 1302 lock = xfs_ilock_data_map_shared(ip);
1288 1303
1289 isize = i_size_read(inode); 1304 isize = i_size_read(inode);
1290 if (start >= isize) { 1305 if (start >= isize) {
1291 error = ENXIO; 1306 error = -ENXIO;
1292 goto out_unlock; 1307 goto out_unlock;
1293 } 1308 }
1294 1309
@@ -1307,7 +1322,7 @@ xfs_seek_hole(
1307 1322
1308 /* No extents at given offset, must be beyond EOF */ 1323 /* No extents at given offset, must be beyond EOF */
1309 if (nmap == 0) { 1324 if (nmap == 0) {
1310 error = ENXIO; 1325 error = -ENXIO;
1311 goto out_unlock; 1326 goto out_unlock;
1312 } 1327 }
1313 1328
@@ -1370,7 +1385,7 @@ out_unlock:
1370 xfs_iunlock(ip, lock); 1385 xfs_iunlock(ip, lock);
1371 1386
1372 if (error) 1387 if (error)
1373 return -error; 1388 return error;
1374 return offset; 1389 return offset;
1375} 1390}
1376 1391
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 8ec81bed7992..e92730c1d3ca 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -258,7 +258,7 @@ next_ag:
258 if (*agp == NULLAGNUMBER) 258 if (*agp == NULLAGNUMBER)
259 return 0; 259 return 0;
260 260
261 err = ENOMEM; 261 err = -ENOMEM;
262 item = kmem_alloc(sizeof(*item), KM_MAYFAIL); 262 item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
263 if (!item) 263 if (!item)
264 goto out_put_ag; 264 goto out_put_ag;
@@ -268,7 +268,7 @@ next_ag:
268 268
269 err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru); 269 err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru);
270 if (err) { 270 if (err) {
271 if (err == EEXIST) 271 if (err == -EEXIST)
272 err = 0; 272 err = 0;
273 goto out_free_item; 273 goto out_free_item;
274 } 274 }
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index d34703dbcb42..18dc721ca19f 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -255,8 +255,8 @@ typedef struct xfs_fsop_resblks {
255 ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES) 255 ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES)
256 256
257/* Used for sanity checks on superblock */ 257/* Used for sanity checks on superblock */
258#define XFS_MAX_DBLOCKS(s) ((xfs_drfsbno_t)(s)->sb_agcount * (s)->sb_agblocks) 258#define XFS_MAX_DBLOCKS(s) ((xfs_rfsblock_t)(s)->sb_agcount * (s)->sb_agblocks)
259#define XFS_MIN_DBLOCKS(s) ((xfs_drfsbno_t)((s)->sb_agcount - 1) * \ 259#define XFS_MIN_DBLOCKS(s) ((xfs_rfsblock_t)((s)->sb_agcount - 1) * \
260 (s)->sb_agblocks + XFS_MIN_AG_BLOCKS) 260 (s)->sb_agblocks + XFS_MIN_AG_BLOCKS)
261 261
262/* 262/*
@@ -375,6 +375,9 @@ struct xfs_fs_eofblocks {
375#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */ 375#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */
376#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */ 376#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */
377#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */ 377#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */
378#define XFS_EOF_FLAGS_UNION (1 << 5) /* union filter algorithm;
379 * kernel only, not included in
380 * valid mask */
378#define XFS_EOF_FLAGS_VALID \ 381#define XFS_EOF_FLAGS_VALID \
379 (XFS_EOF_FLAGS_SYNC | \ 382 (XFS_EOF_FLAGS_SYNC | \
380 XFS_EOF_FLAGS_UID | \ 383 XFS_EOF_FLAGS_UID | \
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index d2295561570a..f91de1ef05e1 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -168,7 +168,7 @@ xfs_growfs_data_private(
168 nb = in->newblocks; 168 nb = in->newblocks;
169 pct = in->imaxpct; 169 pct = in->imaxpct;
170 if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100) 170 if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
171 return XFS_ERROR(EINVAL); 171 return -EINVAL;
172 if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) 172 if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
173 return error; 173 return error;
174 dpct = pct - mp->m_sb.sb_imax_pct; 174 dpct = pct - mp->m_sb.sb_imax_pct;
@@ -176,7 +176,7 @@ xfs_growfs_data_private(
176 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), 176 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
177 XFS_FSS_TO_BB(mp, 1), 0, NULL); 177 XFS_FSS_TO_BB(mp, 1), 0, NULL);
178 if (!bp) 178 if (!bp)
179 return EIO; 179 return -EIO;
180 if (bp->b_error) { 180 if (bp->b_error) {
181 error = bp->b_error; 181 error = bp->b_error;
182 xfs_buf_relse(bp); 182 xfs_buf_relse(bp);
@@ -191,7 +191,7 @@ xfs_growfs_data_private(
191 nagcount--; 191 nagcount--;
192 nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks; 192 nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
193 if (nb < mp->m_sb.sb_dblocks) 193 if (nb < mp->m_sb.sb_dblocks)
194 return XFS_ERROR(EINVAL); 194 return -EINVAL;
195 } 195 }
196 new = nb - mp->m_sb.sb_dblocks; 196 new = nb - mp->m_sb.sb_dblocks;
197 oagcount = mp->m_sb.sb_agcount; 197 oagcount = mp->m_sb.sb_agcount;
@@ -229,7 +229,7 @@ xfs_growfs_data_private(
229 XFS_FSS_TO_BB(mp, 1), 0, 229 XFS_FSS_TO_BB(mp, 1), 0,
230 &xfs_agf_buf_ops); 230 &xfs_agf_buf_ops);
231 if (!bp) { 231 if (!bp) {
232 error = ENOMEM; 232 error = -ENOMEM;
233 goto error0; 233 goto error0;
234 } 234 }
235 235
@@ -270,7 +270,7 @@ xfs_growfs_data_private(
270 XFS_FSS_TO_BB(mp, 1), 0, 270 XFS_FSS_TO_BB(mp, 1), 0,
271 &xfs_agfl_buf_ops); 271 &xfs_agfl_buf_ops);
272 if (!bp) { 272 if (!bp) {
273 error = ENOMEM; 273 error = -ENOMEM;
274 goto error0; 274 goto error0;
275 } 275 }
276 276
@@ -298,7 +298,7 @@ xfs_growfs_data_private(
298 XFS_FSS_TO_BB(mp, 1), 0, 298 XFS_FSS_TO_BB(mp, 1), 0,
299 &xfs_agi_buf_ops); 299 &xfs_agi_buf_ops);
300 if (!bp) { 300 if (!bp) {
301 error = ENOMEM; 301 error = -ENOMEM;
302 goto error0; 302 goto error0;
303 } 303 }
304 304
@@ -336,7 +336,7 @@ xfs_growfs_data_private(
336 &xfs_allocbt_buf_ops); 336 &xfs_allocbt_buf_ops);
337 337
338 if (!bp) { 338 if (!bp) {
339 error = ENOMEM; 339 error = -ENOMEM;
340 goto error0; 340 goto error0;
341 } 341 }
342 342
@@ -365,7 +365,7 @@ xfs_growfs_data_private(
365 BTOBB(mp->m_sb.sb_blocksize), 0, 365 BTOBB(mp->m_sb.sb_blocksize), 0,
366 &xfs_allocbt_buf_ops); 366 &xfs_allocbt_buf_ops);
367 if (!bp) { 367 if (!bp) {
368 error = ENOMEM; 368 error = -ENOMEM;
369 goto error0; 369 goto error0;
370 } 370 }
371 371
@@ -395,7 +395,7 @@ xfs_growfs_data_private(
395 BTOBB(mp->m_sb.sb_blocksize), 0, 395 BTOBB(mp->m_sb.sb_blocksize), 0,
396 &xfs_inobt_buf_ops); 396 &xfs_inobt_buf_ops);
397 if (!bp) { 397 if (!bp) {
398 error = ENOMEM; 398 error = -ENOMEM;
399 goto error0; 399 goto error0;
400 } 400 }
401 401
@@ -420,7 +420,7 @@ xfs_growfs_data_private(
420 BTOBB(mp->m_sb.sb_blocksize), 0, 420 BTOBB(mp->m_sb.sb_blocksize), 0,
421 &xfs_inobt_buf_ops); 421 &xfs_inobt_buf_ops);
422 if (!bp) { 422 if (!bp) {
423 error = ENOMEM; 423 error = -ENOMEM;
424 goto error0; 424 goto error0;
425 } 425 }
426 426
@@ -531,7 +531,7 @@ xfs_growfs_data_private(
531 bp->b_ops = &xfs_sb_buf_ops; 531 bp->b_ops = &xfs_sb_buf_ops;
532 xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); 532 xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
533 } else 533 } else
534 error = ENOMEM; 534 error = -ENOMEM;
535 } 535 }
536 536
537 /* 537 /*
@@ -576,17 +576,17 @@ xfs_growfs_log_private(
576 576
577 nb = in->newblocks; 577 nb = in->newblocks;
578 if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES)) 578 if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
579 return XFS_ERROR(EINVAL); 579 return -EINVAL;
580 if (nb == mp->m_sb.sb_logblocks && 580 if (nb == mp->m_sb.sb_logblocks &&
581 in->isint == (mp->m_sb.sb_logstart != 0)) 581 in->isint == (mp->m_sb.sb_logstart != 0))
582 return XFS_ERROR(EINVAL); 582 return -EINVAL;
583 /* 583 /*
584 * Moving the log is hard, need new interfaces to sync 584 * Moving the log is hard, need new interfaces to sync
585 * the log first, hold off all activity while moving it. 585 * the log first, hold off all activity while moving it.
586 * Can have shorter or longer log in the same space, 586 * Can have shorter or longer log in the same space,
587 * or transform internal to external log or vice versa. 587 * or transform internal to external log or vice versa.
588 */ 588 */
589 return XFS_ERROR(ENOSYS); 589 return -ENOSYS;
590} 590}
591 591
592/* 592/*
@@ -604,9 +604,9 @@ xfs_growfs_data(
604 int error; 604 int error;
605 605
606 if (!capable(CAP_SYS_ADMIN)) 606 if (!capable(CAP_SYS_ADMIN))
607 return XFS_ERROR(EPERM); 607 return -EPERM;
608 if (!mutex_trylock(&mp->m_growlock)) 608 if (!mutex_trylock(&mp->m_growlock))
609 return XFS_ERROR(EWOULDBLOCK); 609 return -EWOULDBLOCK;
610 error = xfs_growfs_data_private(mp, in); 610 error = xfs_growfs_data_private(mp, in);
611 mutex_unlock(&mp->m_growlock); 611 mutex_unlock(&mp->m_growlock);
612 return error; 612 return error;
@@ -620,9 +620,9 @@ xfs_growfs_log(
620 int error; 620 int error;
621 621
622 if (!capable(CAP_SYS_ADMIN)) 622 if (!capable(CAP_SYS_ADMIN))
623 return XFS_ERROR(EPERM); 623 return -EPERM;
624 if (!mutex_trylock(&mp->m_growlock)) 624 if (!mutex_trylock(&mp->m_growlock))
625 return XFS_ERROR(EWOULDBLOCK); 625 return -EWOULDBLOCK;
626 error = xfs_growfs_log_private(mp, in); 626 error = xfs_growfs_log_private(mp, in);
627 mutex_unlock(&mp->m_growlock); 627 mutex_unlock(&mp->m_growlock);
628 return error; 628 return error;
@@ -674,7 +674,7 @@ xfs_reserve_blocks(
674 /* If inval is null, report current values and return */ 674 /* If inval is null, report current values and return */
675 if (inval == (__uint64_t *)NULL) { 675 if (inval == (__uint64_t *)NULL) {
676 if (!outval) 676 if (!outval)
677 return EINVAL; 677 return -EINVAL;
678 outval->resblks = mp->m_resblks; 678 outval->resblks = mp->m_resblks;
679 outval->resblks_avail = mp->m_resblks_avail; 679 outval->resblks_avail = mp->m_resblks_avail;
680 return 0; 680 return 0;
@@ -757,7 +757,7 @@ out:
757 int error; 757 int error;
758 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 758 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
759 fdblks_delta, 0); 759 fdblks_delta, 0);
760 if (error == ENOSPC) 760 if (error == -ENOSPC)
761 goto retry; 761 goto retry;
762 } 762 }
763 return 0; 763 return 0;
@@ -818,7 +818,7 @@ xfs_fs_goingdown(
818 SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR); 818 SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
819 break; 819 break;
820 default: 820 default:
821 return XFS_ERROR(EINVAL); 821 return -EINVAL;
822 } 822 }
823 823
824 return 0; 824 return 0;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index c48df5f25b9f..981b2cf51985 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -33,6 +33,9 @@
33#include "xfs_trace.h" 33#include "xfs_trace.h"
34#include "xfs_icache.h" 34#include "xfs_icache.h"
35#include "xfs_bmap_util.h" 35#include "xfs_bmap_util.h"
36#include "xfs_quota.h"
37#include "xfs_dquot_item.h"
38#include "xfs_dquot.h"
36 39
37#include <linux/kthread.h> 40#include <linux/kthread.h>
38#include <linux/freezer.h> 41#include <linux/freezer.h>
@@ -158,7 +161,7 @@ xfs_iget_cache_hit(
158 if (ip->i_ino != ino) { 161 if (ip->i_ino != ino) {
159 trace_xfs_iget_skip(ip); 162 trace_xfs_iget_skip(ip);
160 XFS_STATS_INC(xs_ig_frecycle); 163 XFS_STATS_INC(xs_ig_frecycle);
161 error = EAGAIN; 164 error = -EAGAIN;
162 goto out_error; 165 goto out_error;
163 } 166 }
164 167
@@ -176,7 +179,7 @@ xfs_iget_cache_hit(
176 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { 179 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
177 trace_xfs_iget_skip(ip); 180 trace_xfs_iget_skip(ip);
178 XFS_STATS_INC(xs_ig_frecycle); 181 XFS_STATS_INC(xs_ig_frecycle);
179 error = EAGAIN; 182 error = -EAGAIN;
180 goto out_error; 183 goto out_error;
181 } 184 }
182 185
@@ -184,7 +187,7 @@ xfs_iget_cache_hit(
184 * If lookup is racing with unlink return an error immediately. 187 * If lookup is racing with unlink return an error immediately.
185 */ 188 */
186 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { 189 if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
187 error = ENOENT; 190 error = -ENOENT;
188 goto out_error; 191 goto out_error;
189 } 192 }
190 193
@@ -206,7 +209,7 @@ xfs_iget_cache_hit(
206 spin_unlock(&ip->i_flags_lock); 209 spin_unlock(&ip->i_flags_lock);
207 rcu_read_unlock(); 210 rcu_read_unlock();
208 211
209 error = -inode_init_always(mp->m_super, inode); 212 error = inode_init_always(mp->m_super, inode);
210 if (error) { 213 if (error) {
211 /* 214 /*
212 * Re-initializing the inode failed, and we are in deep 215 * Re-initializing the inode failed, and we are in deep
@@ -243,7 +246,7 @@ xfs_iget_cache_hit(
243 /* If the VFS inode is being torn down, pause and try again. */ 246 /* If the VFS inode is being torn down, pause and try again. */
244 if (!igrab(inode)) { 247 if (!igrab(inode)) {
245 trace_xfs_iget_skip(ip); 248 trace_xfs_iget_skip(ip);
246 error = EAGAIN; 249 error = -EAGAIN;
247 goto out_error; 250 goto out_error;
248 } 251 }
249 252
@@ -285,7 +288,7 @@ xfs_iget_cache_miss(
285 288
286 ip = xfs_inode_alloc(mp, ino); 289 ip = xfs_inode_alloc(mp, ino);
287 if (!ip) 290 if (!ip)
288 return ENOMEM; 291 return -ENOMEM;
289 292
290 error = xfs_iread(mp, tp, ip, flags); 293 error = xfs_iread(mp, tp, ip, flags);
291 if (error) 294 if (error)
@@ -294,7 +297,7 @@ xfs_iget_cache_miss(
294 trace_xfs_iget_miss(ip); 297 trace_xfs_iget_miss(ip);
295 298
296 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { 299 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
297 error = ENOENT; 300 error = -ENOENT;
298 goto out_destroy; 301 goto out_destroy;
299 } 302 }
300 303
@@ -305,7 +308,7 @@ xfs_iget_cache_miss(
305 * recurse into the file system. 308 * recurse into the file system.
306 */ 309 */
307 if (radix_tree_preload(GFP_NOFS)) { 310 if (radix_tree_preload(GFP_NOFS)) {
308 error = EAGAIN; 311 error = -EAGAIN;
309 goto out_destroy; 312 goto out_destroy;
310 } 313 }
311 314
@@ -341,7 +344,7 @@ xfs_iget_cache_miss(
341 if (unlikely(error)) { 344 if (unlikely(error)) {
342 WARN_ON(error != -EEXIST); 345 WARN_ON(error != -EEXIST);
343 XFS_STATS_INC(xs_ig_dup); 346 XFS_STATS_INC(xs_ig_dup);
344 error = EAGAIN; 347 error = -EAGAIN;
345 goto out_preload_end; 348 goto out_preload_end;
346 } 349 }
347 spin_unlock(&pag->pag_ici_lock); 350 spin_unlock(&pag->pag_ici_lock);
@@ -408,7 +411,7 @@ xfs_iget(
408 411
409 /* reject inode numbers outside existing AGs */ 412 /* reject inode numbers outside existing AGs */
410 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) 413 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
411 return EINVAL; 414 return -EINVAL;
412 415
413 /* get the perag structure and ensure that it's inode capable */ 416 /* get the perag structure and ensure that it's inode capable */
414 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); 417 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
@@ -445,7 +448,7 @@ again:
445 return 0; 448 return 0;
446 449
447out_error_or_again: 450out_error_or_again:
448 if (error == EAGAIN) { 451 if (error == -EAGAIN) {
449 delay(1); 452 delay(1);
450 goto again; 453 goto again;
451 } 454 }
@@ -489,18 +492,18 @@ xfs_inode_ag_walk_grab(
489 492
490 /* nothing to sync during shutdown */ 493 /* nothing to sync during shutdown */
491 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 494 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
492 return EFSCORRUPTED; 495 return -EFSCORRUPTED;
493 496
494 /* If we can't grab the inode, it must on it's way to reclaim. */ 497 /* If we can't grab the inode, it must on it's way to reclaim. */
495 if (!igrab(inode)) 498 if (!igrab(inode))
496 return ENOENT; 499 return -ENOENT;
497 500
498 /* inode is valid */ 501 /* inode is valid */
499 return 0; 502 return 0;
500 503
501out_unlock_noent: 504out_unlock_noent:
502 spin_unlock(&ip->i_flags_lock); 505 spin_unlock(&ip->i_flags_lock);
503 return ENOENT; 506 return -ENOENT;
504} 507}
505 508
506STATIC int 509STATIC int
@@ -583,16 +586,16 @@ restart:
583 continue; 586 continue;
584 error = execute(batch[i], flags, args); 587 error = execute(batch[i], flags, args);
585 IRELE(batch[i]); 588 IRELE(batch[i]);
586 if (error == EAGAIN) { 589 if (error == -EAGAIN) {
587 skipped++; 590 skipped++;
588 continue; 591 continue;
589 } 592 }
590 if (error && last_error != EFSCORRUPTED) 593 if (error && last_error != -EFSCORRUPTED)
591 last_error = error; 594 last_error = error;
592 } 595 }
593 596
594 /* bail out if the filesystem is corrupted. */ 597 /* bail out if the filesystem is corrupted. */
595 if (error == EFSCORRUPTED) 598 if (error == -EFSCORRUPTED)
596 break; 599 break;
597 600
598 cond_resched(); 601 cond_resched();
@@ -652,11 +655,11 @@ xfs_inode_ag_iterator(
652 xfs_perag_put(pag); 655 xfs_perag_put(pag);
653 if (error) { 656 if (error) {
654 last_error = error; 657 last_error = error;
655 if (error == EFSCORRUPTED) 658 if (error == -EFSCORRUPTED)
656 break; 659 break;
657 } 660 }
658 } 661 }
659 return XFS_ERROR(last_error); 662 return last_error;
660} 663}
661 664
662int 665int
@@ -680,11 +683,11 @@ xfs_inode_ag_iterator_tag(
680 xfs_perag_put(pag); 683 xfs_perag_put(pag);
681 if (error) { 684 if (error) {
682 last_error = error; 685 last_error = error;
683 if (error == EFSCORRUPTED) 686 if (error == -EFSCORRUPTED)
684 break; 687 break;
685 } 688 }
686 } 689 }
687 return XFS_ERROR(last_error); 690 return last_error;
688} 691}
689 692
690/* 693/*
@@ -944,7 +947,7 @@ restart:
944 * see the stale flag set on the inode. 947 * see the stale flag set on the inode.
945 */ 948 */
946 error = xfs_iflush(ip, &bp); 949 error = xfs_iflush(ip, &bp);
947 if (error == EAGAIN) { 950 if (error == -EAGAIN) {
948 xfs_iunlock(ip, XFS_ILOCK_EXCL); 951 xfs_iunlock(ip, XFS_ILOCK_EXCL);
949 /* backoff longer than in xfs_ifree_cluster */ 952 /* backoff longer than in xfs_ifree_cluster */
950 delay(2); 953 delay(2);
@@ -997,7 +1000,7 @@ out:
997 xfs_iflags_clear(ip, XFS_IRECLAIM); 1000 xfs_iflags_clear(ip, XFS_IRECLAIM);
998 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1001 xfs_iunlock(ip, XFS_ILOCK_EXCL);
999 /* 1002 /*
1000 * We could return EAGAIN here to make reclaim rescan the inode tree in 1003 * We could return -EAGAIN here to make reclaim rescan the inode tree in
1001 * a short while. However, this just burns CPU time scanning the tree 1004 * a short while. However, this just burns CPU time scanning the tree
1002 * waiting for IO to complete and the reclaim work never goes back to 1005 * waiting for IO to complete and the reclaim work never goes back to
1003 * the idle state. Instead, return 0 to let the next scheduled 1006 * the idle state. Instead, return 0 to let the next scheduled
@@ -1100,7 +1103,7 @@ restart:
1100 if (!batch[i]) 1103 if (!batch[i])
1101 continue; 1104 continue;
1102 error = xfs_reclaim_inode(batch[i], pag, flags); 1105 error = xfs_reclaim_inode(batch[i], pag, flags);
1103 if (error && last_error != EFSCORRUPTED) 1106 if (error && last_error != -EFSCORRUPTED)
1104 last_error = error; 1107 last_error = error;
1105 } 1108 }
1106 1109
@@ -1129,7 +1132,7 @@ restart:
1129 trylock = 0; 1132 trylock = 0;
1130 goto restart; 1133 goto restart;
1131 } 1134 }
1132 return XFS_ERROR(last_error); 1135 return last_error;
1133} 1136}
1134 1137
1135int 1138int
@@ -1203,6 +1206,30 @@ xfs_inode_match_id(
1203 return 1; 1206 return 1;
1204} 1207}
1205 1208
1209/*
1210 * A union-based inode filtering algorithm. Process the inode if any of the
1211 * criteria match. This is for global/internal scans only.
1212 */
1213STATIC int
1214xfs_inode_match_id_union(
1215 struct xfs_inode *ip,
1216 struct xfs_eofblocks *eofb)
1217{
1218 if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
1219 uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
1220 return 1;
1221
1222 if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
1223 gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
1224 return 1;
1225
1226 if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
1227 xfs_get_projid(ip) == eofb->eof_prid)
1228 return 1;
1229
1230 return 0;
1231}
1232
1206STATIC int 1233STATIC int
1207xfs_inode_free_eofblocks( 1234xfs_inode_free_eofblocks(
1208 struct xfs_inode *ip, 1235 struct xfs_inode *ip,
@@ -1211,6 +1238,10 @@ xfs_inode_free_eofblocks(
1211{ 1238{
1212 int ret; 1239 int ret;
1213 struct xfs_eofblocks *eofb = args; 1240 struct xfs_eofblocks *eofb = args;
1241 bool need_iolock = true;
1242 int match;
1243
1244 ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
1214 1245
1215 if (!xfs_can_free_eofblocks(ip, false)) { 1246 if (!xfs_can_free_eofblocks(ip, false)) {
1216 /* inode could be preallocated or append-only */ 1247 /* inode could be preallocated or append-only */
@@ -1228,19 +1259,31 @@ xfs_inode_free_eofblocks(
1228 return 0; 1259 return 0;
1229 1260
1230 if (eofb) { 1261 if (eofb) {
1231 if (!xfs_inode_match_id(ip, eofb)) 1262 if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
1263 match = xfs_inode_match_id_union(ip, eofb);
1264 else
1265 match = xfs_inode_match_id(ip, eofb);
1266 if (!match)
1232 return 0; 1267 return 0;
1233 1268
1234 /* skip the inode if the file size is too small */ 1269 /* skip the inode if the file size is too small */
1235 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && 1270 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
1236 XFS_ISIZE(ip) < eofb->eof_min_file_size) 1271 XFS_ISIZE(ip) < eofb->eof_min_file_size)
1237 return 0; 1272 return 0;
1273
1274 /*
1275 * A scan owner implies we already hold the iolock. Skip it in
1276 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
1277 * the possibility of EAGAIN being returned.
1278 */
1279 if (eofb->eof_scan_owner == ip->i_ino)
1280 need_iolock = false;
1238 } 1281 }
1239 1282
1240 ret = xfs_free_eofblocks(ip->i_mount, ip, true); 1283 ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock);
1241 1284
1242 /* don't revisit the inode if we're not waiting */ 1285 /* don't revisit the inode if we're not waiting */
1243 if (ret == EAGAIN && !(flags & SYNC_WAIT)) 1286 if (ret == -EAGAIN && !(flags & SYNC_WAIT))
1244 ret = 0; 1287 ret = 0;
1245 1288
1246 return ret; 1289 return ret;
@@ -1260,6 +1303,55 @@ xfs_icache_free_eofblocks(
1260 eofb, XFS_ICI_EOFBLOCKS_TAG); 1303 eofb, XFS_ICI_EOFBLOCKS_TAG);
1261} 1304}
1262 1305
1306/*
1307 * Run eofblocks scans on the quotas applicable to the inode. For inodes with
1308 * multiple quotas, we don't know exactly which quota caused an allocation
1309 * failure. We make a best effort by including each quota under low free space
1310 * conditions (less than 1% free space) in the scan.
1311 */
1312int
1313xfs_inode_free_quota_eofblocks(
1314 struct xfs_inode *ip)
1315{
1316 int scan = 0;
1317 struct xfs_eofblocks eofb = {0};
1318 struct xfs_dquot *dq;
1319
1320 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1321
1322 /*
1323 * Set the scan owner to avoid a potential livelock. Otherwise, the scan
1324 * can repeatedly trylock on the inode we're currently processing. We
1325 * run a sync scan to increase effectiveness and use the union filter to
1326 * cover all applicable quotas in a single scan.
1327 */
1328 eofb.eof_scan_owner = ip->i_ino;
1329 eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
1330
1331 if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
1332 dq = xfs_inode_dquot(ip, XFS_DQ_USER);
1333 if (dq && xfs_dquot_lowsp(dq)) {
1334 eofb.eof_uid = VFS_I(ip)->i_uid;
1335 eofb.eof_flags |= XFS_EOF_FLAGS_UID;
1336 scan = 1;
1337 }
1338 }
1339
1340 if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
1341 dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
1342 if (dq && xfs_dquot_lowsp(dq)) {
1343 eofb.eof_gid = VFS_I(ip)->i_gid;
1344 eofb.eof_flags |= XFS_EOF_FLAGS_GID;
1345 scan = 1;
1346 }
1347 }
1348
1349 if (scan)
1350 xfs_icache_free_eofblocks(ip->i_mount, &eofb);
1351
1352 return scan;
1353}
1354
1263void 1355void
1264xfs_inode_set_eofblocks_tag( 1356xfs_inode_set_eofblocks_tag(
1265 xfs_inode_t *ip) 1357 xfs_inode_t *ip)
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 9cf017b899be..46748b86b12f 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -27,6 +27,7 @@ struct xfs_eofblocks {
27 kgid_t eof_gid; 27 kgid_t eof_gid;
28 prid_t eof_prid; 28 prid_t eof_prid;
29 __u64 eof_min_file_size; 29 __u64 eof_min_file_size;
30 xfs_ino_t eof_scan_owner;
30}; 31};
31 32
32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ 33#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
@@ -57,6 +58,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
57void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); 58void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
58void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); 59void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
59int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); 60int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
61int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
60void xfs_eofblocks_worker(struct work_struct *); 62void xfs_eofblocks_worker(struct work_struct *);
61 63
62int xfs_inode_ag_iterator(struct xfs_mount *mp, 64int xfs_inode_ag_iterator(struct xfs_mount *mp,
@@ -72,31 +74,32 @@ xfs_fs_eofblocks_from_user(
72 struct xfs_eofblocks *dst) 74 struct xfs_eofblocks *dst)
73{ 75{
74 if (src->eof_version != XFS_EOFBLOCKS_VERSION) 76 if (src->eof_version != XFS_EOFBLOCKS_VERSION)
75 return EINVAL; 77 return -EINVAL;
76 78
77 if (src->eof_flags & ~XFS_EOF_FLAGS_VALID) 79 if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
78 return EINVAL; 80 return -EINVAL;
79 81
80 if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) || 82 if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
81 memchr_inv(src->pad64, 0, sizeof(src->pad64))) 83 memchr_inv(src->pad64, 0, sizeof(src->pad64)))
82 return EINVAL; 84 return -EINVAL;
83 85
84 dst->eof_flags = src->eof_flags; 86 dst->eof_flags = src->eof_flags;
85 dst->eof_prid = src->eof_prid; 87 dst->eof_prid = src->eof_prid;
86 dst->eof_min_file_size = src->eof_min_file_size; 88 dst->eof_min_file_size = src->eof_min_file_size;
89 dst->eof_scan_owner = NULLFSINO;
87 90
88 dst->eof_uid = INVALID_UID; 91 dst->eof_uid = INVALID_UID;
89 if (src->eof_flags & XFS_EOF_FLAGS_UID) { 92 if (src->eof_flags & XFS_EOF_FLAGS_UID) {
90 dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid); 93 dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
91 if (!uid_valid(dst->eof_uid)) 94 if (!uid_valid(dst->eof_uid))
92 return EINVAL; 95 return -EINVAL;
93 } 96 }
94 97
95 dst->eof_gid = INVALID_GID; 98 dst->eof_gid = INVALID_GID;
96 if (src->eof_flags & XFS_EOF_FLAGS_GID) { 99 if (src->eof_flags & XFS_EOF_FLAGS_GID) {
97 dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid); 100 dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
98 if (!gid_valid(dst->eof_gid)) 101 if (!gid_valid(dst->eof_gid))
99 return EINVAL; 102 return -EINVAL;
100 } 103 }
101 return 0; 104 return 0;
102} 105}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a6115fe1ac94..fea3c92fb3f0 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -583,7 +583,7 @@ xfs_lookup(
583 trace_xfs_lookup(dp, name); 583 trace_xfs_lookup(dp, name);
584 584
585 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 585 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
586 return XFS_ERROR(EIO); 586 return -EIO;
587 587
588 lock_mode = xfs_ilock_data_map_shared(dp); 588 lock_mode = xfs_ilock_data_map_shared(dp);
589 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); 589 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
@@ -893,7 +893,7 @@ xfs_dir_ialloc(
893 } 893 }
894 if (!ialloc_context && !ip) { 894 if (!ialloc_context && !ip) {
895 *ipp = NULL; 895 *ipp = NULL;
896 return XFS_ERROR(ENOSPC); 896 return -ENOSPC;
897 } 897 }
898 898
899 /* 899 /*
@@ -1088,7 +1088,7 @@ xfs_create(
1088 trace_xfs_create(dp, name); 1088 trace_xfs_create(dp, name);
1089 1089
1090 if (XFS_FORCED_SHUTDOWN(mp)) 1090 if (XFS_FORCED_SHUTDOWN(mp))
1091 return XFS_ERROR(EIO); 1091 return -EIO;
1092 1092
1093 prid = xfs_get_initial_prid(dp); 1093 prid = xfs_get_initial_prid(dp);
1094 1094
@@ -1125,12 +1125,12 @@ xfs_create(
1125 */ 1125 */
1126 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; 1126 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
1127 error = xfs_trans_reserve(tp, &tres, resblks, 0); 1127 error = xfs_trans_reserve(tp, &tres, resblks, 0);
1128 if (error == ENOSPC) { 1128 if (error == -ENOSPC) {
1129 /* flush outstanding delalloc blocks and retry */ 1129 /* flush outstanding delalloc blocks and retry */
1130 xfs_flush_inodes(mp); 1130 xfs_flush_inodes(mp);
1131 error = xfs_trans_reserve(tp, &tres, resblks, 0); 1131 error = xfs_trans_reserve(tp, &tres, resblks, 0);
1132 } 1132 }
1133 if (error == ENOSPC) { 1133 if (error == -ENOSPC) {
1134 /* No space at all so try a "no-allocation" reservation */ 1134 /* No space at all so try a "no-allocation" reservation */
1135 resblks = 0; 1135 resblks = 0;
1136 error = xfs_trans_reserve(tp, &tres, 0, 0); 1136 error = xfs_trans_reserve(tp, &tres, 0, 0);
@@ -1165,7 +1165,7 @@ xfs_create(
1165 error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, 1165 error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
1166 prid, resblks > 0, &ip, &committed); 1166 prid, resblks > 0, &ip, &committed);
1167 if (error) { 1167 if (error) {
1168 if (error == ENOSPC) 1168 if (error == -ENOSPC)
1169 goto out_trans_cancel; 1169 goto out_trans_cancel;
1170 goto out_trans_abort; 1170 goto out_trans_abort;
1171 } 1171 }
@@ -1184,7 +1184,7 @@ xfs_create(
1184 &first_block, &free_list, resblks ? 1184 &first_block, &free_list, resblks ?
1185 resblks - XFS_IALLOC_SPACE_RES(mp) : 0); 1185 resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1186 if (error) { 1186 if (error) {
1187 ASSERT(error != ENOSPC); 1187 ASSERT(error != -ENOSPC);
1188 goto out_trans_abort; 1188 goto out_trans_abort;
1189 } 1189 }
1190 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 1190 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -1274,7 +1274,7 @@ xfs_create_tmpfile(
1274 uint resblks; 1274 uint resblks;
1275 1275
1276 if (XFS_FORCED_SHUTDOWN(mp)) 1276 if (XFS_FORCED_SHUTDOWN(mp))
1277 return XFS_ERROR(EIO); 1277 return -EIO;
1278 1278
1279 prid = xfs_get_initial_prid(dp); 1279 prid = xfs_get_initial_prid(dp);
1280 1280
@@ -1293,7 +1293,7 @@ xfs_create_tmpfile(
1293 1293
1294 tres = &M_RES(mp)->tr_create_tmpfile; 1294 tres = &M_RES(mp)->tr_create_tmpfile;
1295 error = xfs_trans_reserve(tp, tres, resblks, 0); 1295 error = xfs_trans_reserve(tp, tres, resblks, 0);
1296 if (error == ENOSPC) { 1296 if (error == -ENOSPC) {
1297 /* No space at all so try a "no-allocation" reservation */ 1297 /* No space at all so try a "no-allocation" reservation */
1298 resblks = 0; 1298 resblks = 0;
1299 error = xfs_trans_reserve(tp, tres, 0, 0); 1299 error = xfs_trans_reserve(tp, tres, 0, 0);
@@ -1311,7 +1311,7 @@ xfs_create_tmpfile(
1311 error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, 1311 error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
1312 prid, resblks > 0, &ip, NULL); 1312 prid, resblks > 0, &ip, NULL);
1313 if (error) { 1313 if (error) {
1314 if (error == ENOSPC) 1314 if (error == -ENOSPC)
1315 goto out_trans_cancel; 1315 goto out_trans_cancel;
1316 goto out_trans_abort; 1316 goto out_trans_abort;
1317 } 1317 }
@@ -1382,7 +1382,7 @@ xfs_link(
1382 ASSERT(!S_ISDIR(sip->i_d.di_mode)); 1382 ASSERT(!S_ISDIR(sip->i_d.di_mode));
1383 1383
1384 if (XFS_FORCED_SHUTDOWN(mp)) 1384 if (XFS_FORCED_SHUTDOWN(mp))
1385 return XFS_ERROR(EIO); 1385 return -EIO;
1386 1386
1387 error = xfs_qm_dqattach(sip, 0); 1387 error = xfs_qm_dqattach(sip, 0);
1388 if (error) 1388 if (error)
@@ -1396,7 +1396,7 @@ xfs_link(
1396 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 1396 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1397 resblks = XFS_LINK_SPACE_RES(mp, target_name->len); 1397 resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1398 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); 1398 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
1399 if (error == ENOSPC) { 1399 if (error == -ENOSPC) {
1400 resblks = 0; 1400 resblks = 0;
1401 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); 1401 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
1402 } 1402 }
@@ -1417,7 +1417,7 @@ xfs_link(
1417 */ 1417 */
1418 if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 1418 if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1419 (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { 1419 (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
1420 error = XFS_ERROR(EXDEV); 1420 error = -EXDEV;
1421 goto error_return; 1421 goto error_return;
1422 } 1422 }
1423 1423
@@ -1635,8 +1635,8 @@ xfs_release(
1635 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); 1635 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1636 if (truncated) { 1636 if (truncated) {
1637 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); 1637 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
1638 if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) { 1638 if (ip->i_delayed_blks > 0) {
1639 error = -filemap_flush(VFS_I(ip)->i_mapping); 1639 error = filemap_flush(VFS_I(ip)->i_mapping);
1640 if (error) 1640 if (error)
1641 return error; 1641 return error;
1642 } 1642 }
@@ -1673,7 +1673,7 @@ xfs_release(
1673 return 0; 1673 return 0;
1674 1674
1675 error = xfs_free_eofblocks(mp, ip, true); 1675 error = xfs_free_eofblocks(mp, ip, true);
1676 if (error && error != EAGAIN) 1676 if (error && error != -EAGAIN)
1677 return error; 1677 return error;
1678 1678
1679 /* delalloc blocks after truncation means it really is dirty */ 1679 /* delalloc blocks after truncation means it really is dirty */
@@ -1772,7 +1772,7 @@ xfs_inactive_ifree(
1772 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 1772 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree,
1773 XFS_IFREE_SPACE_RES(mp), 0); 1773 XFS_IFREE_SPACE_RES(mp), 0);
1774 if (error) { 1774 if (error) {
1775 if (error == ENOSPC) { 1775 if (error == -ENOSPC) {
1776 xfs_warn_ratelimited(mp, 1776 xfs_warn_ratelimited(mp,
1777 "Failed to remove inode(s) from unlinked list. " 1777 "Failed to remove inode(s) from unlinked list. "
1778 "Please free space, unmount and run xfs_repair."); 1778 "Please free space, unmount and run xfs_repair.");
@@ -2219,7 +2219,7 @@ xfs_ifree_cluster(
2219 XBF_UNMAPPED); 2219 XBF_UNMAPPED);
2220 2220
2221 if (!bp) 2221 if (!bp)
2222 return ENOMEM; 2222 return -ENOMEM;
2223 2223
2224 /* 2224 /*
2225 * This buffer may not have been correctly initialised as we 2225 * This buffer may not have been correctly initialised as we
@@ -2491,7 +2491,7 @@ xfs_remove(
2491 trace_xfs_remove(dp, name); 2491 trace_xfs_remove(dp, name);
2492 2492
2493 if (XFS_FORCED_SHUTDOWN(mp)) 2493 if (XFS_FORCED_SHUTDOWN(mp))
2494 return XFS_ERROR(EIO); 2494 return -EIO;
2495 2495
2496 error = xfs_qm_dqattach(dp, 0); 2496 error = xfs_qm_dqattach(dp, 0);
2497 if (error) 2497 if (error)
@@ -2521,12 +2521,12 @@ xfs_remove(
2521 */ 2521 */
2522 resblks = XFS_REMOVE_SPACE_RES(mp); 2522 resblks = XFS_REMOVE_SPACE_RES(mp);
2523 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0); 2523 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
2524 if (error == ENOSPC) { 2524 if (error == -ENOSPC) {
2525 resblks = 0; 2525 resblks = 0;
2526 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0); 2526 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
2527 } 2527 }
2528 if (error) { 2528 if (error) {
2529 ASSERT(error != ENOSPC); 2529 ASSERT(error != -ENOSPC);
2530 cancel_flags = 0; 2530 cancel_flags = 0;
2531 goto out_trans_cancel; 2531 goto out_trans_cancel;
2532 } 2532 }
@@ -2543,11 +2543,11 @@ xfs_remove(
2543 if (is_dir) { 2543 if (is_dir) {
2544 ASSERT(ip->i_d.di_nlink >= 2); 2544 ASSERT(ip->i_d.di_nlink >= 2);
2545 if (ip->i_d.di_nlink != 2) { 2545 if (ip->i_d.di_nlink != 2) {
2546 error = XFS_ERROR(ENOTEMPTY); 2546 error = -ENOTEMPTY;
2547 goto out_trans_cancel; 2547 goto out_trans_cancel;
2548 } 2548 }
2549 if (!xfs_dir_isempty(ip)) { 2549 if (!xfs_dir_isempty(ip)) {
2550 error = XFS_ERROR(ENOTEMPTY); 2550 error = -ENOTEMPTY;
2551 goto out_trans_cancel; 2551 goto out_trans_cancel;
2552 } 2552 }
2553 2553
@@ -2582,7 +2582,7 @@ xfs_remove(
2582 error = xfs_dir_removename(tp, dp, name, ip->i_ino, 2582 error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2583 &first_block, &free_list, resblks); 2583 &first_block, &free_list, resblks);
2584 if (error) { 2584 if (error) {
2585 ASSERT(error != ENOENT); 2585 ASSERT(error != -ENOENT);
2586 goto out_bmap_cancel; 2586 goto out_bmap_cancel;
2587 } 2587 }
2588 2588
@@ -2702,7 +2702,7 @@ xfs_rename(
2702 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 2702 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2703 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); 2703 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
2704 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); 2704 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
2705 if (error == ENOSPC) { 2705 if (error == -ENOSPC) {
2706 spaceres = 0; 2706 spaceres = 0;
2707 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); 2707 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
2708 } 2708 }
@@ -2747,7 +2747,7 @@ xfs_rename(
2747 */ 2747 */
2748 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 2748 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2749 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { 2749 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
2750 error = XFS_ERROR(EXDEV); 2750 error = -EXDEV;
2751 goto error_return; 2751 goto error_return;
2752 } 2752 }
2753 2753
@@ -2770,7 +2770,7 @@ xfs_rename(
2770 error = xfs_dir_createname(tp, target_dp, target_name, 2770 error = xfs_dir_createname(tp, target_dp, target_name,
2771 src_ip->i_ino, &first_block, 2771 src_ip->i_ino, &first_block,
2772 &free_list, spaceres); 2772 &free_list, spaceres);
2773 if (error == ENOSPC) 2773 if (error == -ENOSPC)
2774 goto error_return; 2774 goto error_return;
2775 if (error) 2775 if (error)
2776 goto abort_return; 2776 goto abort_return;
@@ -2795,7 +2795,7 @@ xfs_rename(
2795 */ 2795 */
2796 if (!(xfs_dir_isempty(target_ip)) || 2796 if (!(xfs_dir_isempty(target_ip)) ||
2797 (target_ip->i_d.di_nlink > 2)) { 2797 (target_ip->i_d.di_nlink > 2)) {
2798 error = XFS_ERROR(EEXIST); 2798 error = -EEXIST;
2799 goto error_return; 2799 goto error_return;
2800 } 2800 }
2801 } 2801 }
@@ -2847,7 +2847,7 @@ xfs_rename(
2847 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, 2847 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
2848 target_dp->i_ino, 2848 target_dp->i_ino,
2849 &first_block, &free_list, spaceres); 2849 &first_block, &free_list, spaceres);
2850 ASSERT(error != EEXIST); 2850 ASSERT(error != -EEXIST);
2851 if (error) 2851 if (error)
2852 goto abort_return; 2852 goto abort_return;
2853 } 2853 }
@@ -3055,7 +3055,7 @@ cluster_corrupt_out:
3055 if (bp->b_iodone) { 3055 if (bp->b_iodone) {
3056 XFS_BUF_UNDONE(bp); 3056 XFS_BUF_UNDONE(bp);
3057 xfs_buf_stale(bp); 3057 xfs_buf_stale(bp);
3058 xfs_buf_ioerror(bp, EIO); 3058 xfs_buf_ioerror(bp, -EIO);
3059 xfs_buf_ioend(bp, 0); 3059 xfs_buf_ioend(bp, 0);
3060 } else { 3060 } else {
3061 xfs_buf_stale(bp); 3061 xfs_buf_stale(bp);
@@ -3069,7 +3069,7 @@ cluster_corrupt_out:
3069 xfs_iflush_abort(iq, false); 3069 xfs_iflush_abort(iq, false);
3070 kmem_free(ilist); 3070 kmem_free(ilist);
3071 xfs_perag_put(pag); 3071 xfs_perag_put(pag);
3072 return XFS_ERROR(EFSCORRUPTED); 3072 return -EFSCORRUPTED;
3073} 3073}
3074 3074
3075/* 3075/*
@@ -3124,7 +3124,7 @@ xfs_iflush(
3124 * as we wait for an empty AIL as part of the unmount process. 3124 * as we wait for an empty AIL as part of the unmount process.
3125 */ 3125 */
3126 if (XFS_FORCED_SHUTDOWN(mp)) { 3126 if (XFS_FORCED_SHUTDOWN(mp)) {
3127 error = XFS_ERROR(EIO); 3127 error = -EIO;
3128 goto abort_out; 3128 goto abort_out;
3129 } 3129 }
3130 3130
@@ -3167,7 +3167,7 @@ corrupt_out:
3167 xfs_buf_relse(bp); 3167 xfs_buf_relse(bp);
3168 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 3168 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3169cluster_corrupt_out: 3169cluster_corrupt_out:
3170 error = XFS_ERROR(EFSCORRUPTED); 3170 error = -EFSCORRUPTED;
3171abort_out: 3171abort_out:
3172 /* 3172 /*
3173 * Unlocks the flush lock 3173 * Unlocks the flush lock
@@ -3331,5 +3331,5 @@ xfs_iflush_int(
3331 return 0; 3331 return 0;
3332 3332
3333corrupt_out: 3333corrupt_out:
3334 return XFS_ERROR(EFSCORRUPTED); 3334 return -EFSCORRUPTED;
3335} 3335}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f72bffa67266..c10e3fadd9af 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -398,4 +398,14 @@ do { \
398 398
399extern struct kmem_zone *xfs_inode_zone; 399extern struct kmem_zone *xfs_inode_zone;
400 400
401/*
402 * Flags for read/write calls
403 */
404#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */
405#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */
406
407#define XFS_IO_FLAGS \
408 { XFS_IO_ISDIRECT, "DIRECT" }, \
409 { XFS_IO_INVIS, "INVIS"}
410
401#endif /* __XFS_INODE_H__ */ 411#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index a640137b3573..de5a7be36e60 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -788,5 +788,5 @@ xfs_inode_item_format_convert(
788 in_f->ilf_boffset = in_f64->ilf_boffset; 788 in_f->ilf_boffset = in_f64->ilf_boffset;
789 return 0; 789 return 0;
790 } 790 }
791 return EFSCORRUPTED; 791 return -EFSCORRUPTED;
792} 792}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 8bc1bbce7451..3799695b9249 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -207,7 +207,7 @@ xfs_open_by_handle(
207 struct path path; 207 struct path path;
208 208
209 if (!capable(CAP_SYS_ADMIN)) 209 if (!capable(CAP_SYS_ADMIN))
210 return -XFS_ERROR(EPERM); 210 return -EPERM;
211 211
212 dentry = xfs_handlereq_to_dentry(parfilp, hreq); 212 dentry = xfs_handlereq_to_dentry(parfilp, hreq);
213 if (IS_ERR(dentry)) 213 if (IS_ERR(dentry))
@@ -216,7 +216,7 @@ xfs_open_by_handle(
216 216
217 /* Restrict xfs_open_by_handle to directories & regular files. */ 217 /* Restrict xfs_open_by_handle to directories & regular files. */
218 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { 218 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
219 error = -XFS_ERROR(EPERM); 219 error = -EPERM;
220 goto out_dput; 220 goto out_dput;
221 } 221 }
222 222
@@ -228,18 +228,18 @@ xfs_open_by_handle(
228 fmode = OPEN_FMODE(permflag); 228 fmode = OPEN_FMODE(permflag);
229 if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && 229 if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
230 (fmode & FMODE_WRITE) && IS_APPEND(inode)) { 230 (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
231 error = -XFS_ERROR(EPERM); 231 error = -EPERM;
232 goto out_dput; 232 goto out_dput;
233 } 233 }
234 234
235 if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) { 235 if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
236 error = -XFS_ERROR(EACCES); 236 error = -EACCES;
237 goto out_dput; 237 goto out_dput;
238 } 238 }
239 239
240 /* Can't write directories. */ 240 /* Can't write directories. */
241 if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) { 241 if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
242 error = -XFS_ERROR(EISDIR); 242 error = -EISDIR;
243 goto out_dput; 243 goto out_dput;
244 } 244 }
245 245
@@ -282,7 +282,7 @@ xfs_readlink_by_handle(
282 int error; 282 int error;
283 283
284 if (!capable(CAP_SYS_ADMIN)) 284 if (!capable(CAP_SYS_ADMIN))
285 return -XFS_ERROR(EPERM); 285 return -EPERM;
286 286
287 dentry = xfs_handlereq_to_dentry(parfilp, hreq); 287 dentry = xfs_handlereq_to_dentry(parfilp, hreq);
288 if (IS_ERR(dentry)) 288 if (IS_ERR(dentry))
@@ -290,22 +290,22 @@ xfs_readlink_by_handle(
290 290
291 /* Restrict this handle operation to symlinks only. */ 291 /* Restrict this handle operation to symlinks only. */
292 if (!S_ISLNK(dentry->d_inode->i_mode)) { 292 if (!S_ISLNK(dentry->d_inode->i_mode)) {
293 error = -XFS_ERROR(EINVAL); 293 error = -EINVAL;
294 goto out_dput; 294 goto out_dput;
295 } 295 }
296 296
297 if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) { 297 if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
298 error = -XFS_ERROR(EFAULT); 298 error = -EFAULT;
299 goto out_dput; 299 goto out_dput;
300 } 300 }
301 301
302 link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); 302 link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
303 if (!link) { 303 if (!link) {
304 error = -XFS_ERROR(ENOMEM); 304 error = -ENOMEM;
305 goto out_dput; 305 goto out_dput;
306 } 306 }
307 307
308 error = -xfs_readlink(XFS_I(dentry->d_inode), link); 308 error = xfs_readlink(XFS_I(dentry->d_inode), link);
309 if (error) 309 if (error)
310 goto out_kfree; 310 goto out_kfree;
311 error = readlink_copy(hreq->ohandle, olen, link); 311 error = readlink_copy(hreq->ohandle, olen, link);
@@ -330,10 +330,10 @@ xfs_set_dmattrs(
330 int error; 330 int error;
331 331
332 if (!capable(CAP_SYS_ADMIN)) 332 if (!capable(CAP_SYS_ADMIN))
333 return XFS_ERROR(EPERM); 333 return -EPERM;
334 334
335 if (XFS_FORCED_SHUTDOWN(mp)) 335 if (XFS_FORCED_SHUTDOWN(mp))
336 return XFS_ERROR(EIO); 336 return -EIO;
337 337
338 tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); 338 tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
339 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); 339 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
@@ -364,9 +364,9 @@ xfs_fssetdm_by_handle(
364 struct dentry *dentry; 364 struct dentry *dentry;
365 365
366 if (!capable(CAP_MKNOD)) 366 if (!capable(CAP_MKNOD))
367 return -XFS_ERROR(EPERM); 367 return -EPERM;
368 if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) 368 if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
369 return -XFS_ERROR(EFAULT); 369 return -EFAULT;
370 370
371 error = mnt_want_write_file(parfilp); 371 error = mnt_want_write_file(parfilp);
372 if (error) 372 if (error)
@@ -379,16 +379,16 @@ xfs_fssetdm_by_handle(
379 } 379 }
380 380
381 if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { 381 if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
382 error = -XFS_ERROR(EPERM); 382 error = -EPERM;
383 goto out; 383 goto out;
384 } 384 }
385 385
386 if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) { 386 if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) {
387 error = -XFS_ERROR(EFAULT); 387 error = -EFAULT;
388 goto out; 388 goto out;
389 } 389 }
390 390
391 error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, 391 error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
392 fsd.fsd_dmstate); 392 fsd.fsd_dmstate);
393 393
394 out: 394 out:
@@ -409,18 +409,18 @@ xfs_attrlist_by_handle(
409 char *kbuf; 409 char *kbuf;
410 410
411 if (!capable(CAP_SYS_ADMIN)) 411 if (!capable(CAP_SYS_ADMIN))
412 return -XFS_ERROR(EPERM); 412 return -EPERM;
413 if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) 413 if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
414 return -XFS_ERROR(EFAULT); 414 return -EFAULT;
415 if (al_hreq.buflen < sizeof(struct attrlist) || 415 if (al_hreq.buflen < sizeof(struct attrlist) ||
416 al_hreq.buflen > XATTR_LIST_MAX) 416 al_hreq.buflen > XATTR_LIST_MAX)
417 return -XFS_ERROR(EINVAL); 417 return -EINVAL;
418 418
419 /* 419 /*
420 * Reject flags, only allow namespaces. 420 * Reject flags, only allow namespaces.
421 */ 421 */
422 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) 422 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
423 return -XFS_ERROR(EINVAL); 423 return -EINVAL;
424 424
425 dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); 425 dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
426 if (IS_ERR(dentry)) 426 if (IS_ERR(dentry))
@@ -431,7 +431,7 @@ xfs_attrlist_by_handle(
431 goto out_dput; 431 goto out_dput;
432 432
433 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; 433 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
434 error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, 434 error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
435 al_hreq.flags, cursor); 435 al_hreq.flags, cursor);
436 if (error) 436 if (error)
437 goto out_kfree; 437 goto out_kfree;
@@ -455,20 +455,20 @@ xfs_attrmulti_attr_get(
455 __uint32_t flags) 455 __uint32_t flags)
456{ 456{
457 unsigned char *kbuf; 457 unsigned char *kbuf;
458 int error = EFAULT; 458 int error = -EFAULT;
459 459
460 if (*len > XATTR_SIZE_MAX) 460 if (*len > XATTR_SIZE_MAX)
461 return EINVAL; 461 return -EINVAL;
462 kbuf = kmem_zalloc_large(*len, KM_SLEEP); 462 kbuf = kmem_zalloc_large(*len, KM_SLEEP);
463 if (!kbuf) 463 if (!kbuf)
464 return ENOMEM; 464 return -ENOMEM;
465 465
466 error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); 466 error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
467 if (error) 467 if (error)
468 goto out_kfree; 468 goto out_kfree;
469 469
470 if (copy_to_user(ubuf, kbuf, *len)) 470 if (copy_to_user(ubuf, kbuf, *len))
471 error = EFAULT; 471 error = -EFAULT;
472 472
473out_kfree: 473out_kfree:
474 kmem_free(kbuf); 474 kmem_free(kbuf);
@@ -484,20 +484,17 @@ xfs_attrmulti_attr_set(
484 __uint32_t flags) 484 __uint32_t flags)
485{ 485{
486 unsigned char *kbuf; 486 unsigned char *kbuf;
487 int error = EFAULT;
488 487
489 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 488 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
490 return EPERM; 489 return -EPERM;
491 if (len > XATTR_SIZE_MAX) 490 if (len > XATTR_SIZE_MAX)
492 return EINVAL; 491 return -EINVAL;
493 492
494 kbuf = memdup_user(ubuf, len); 493 kbuf = memdup_user(ubuf, len);
495 if (IS_ERR(kbuf)) 494 if (IS_ERR(kbuf))
496 return PTR_ERR(kbuf); 495 return PTR_ERR(kbuf);
497 496
498 error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); 497 return xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
499
500 return error;
501} 498}
502 499
503int 500int
@@ -507,7 +504,7 @@ xfs_attrmulti_attr_remove(
507 __uint32_t flags) 504 __uint32_t flags)
508{ 505{
509 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 506 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
510 return EPERM; 507 return -EPERM;
511 return xfs_attr_remove(XFS_I(inode), name, flags); 508 return xfs_attr_remove(XFS_I(inode), name, flags);
512} 509}
513 510
@@ -524,9 +521,9 @@ xfs_attrmulti_by_handle(
524 unsigned char *attr_name; 521 unsigned char *attr_name;
525 522
526 if (!capable(CAP_SYS_ADMIN)) 523 if (!capable(CAP_SYS_ADMIN))
527 return -XFS_ERROR(EPERM); 524 return -EPERM;
528 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) 525 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
529 return -XFS_ERROR(EFAULT); 526 return -EFAULT;
530 527
531 /* overflow check */ 528 /* overflow check */
532 if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t)) 529 if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
@@ -536,18 +533,18 @@ xfs_attrmulti_by_handle(
536 if (IS_ERR(dentry)) 533 if (IS_ERR(dentry))
537 return PTR_ERR(dentry); 534 return PTR_ERR(dentry);
538 535
539 error = E2BIG; 536 error = -E2BIG;
540 size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); 537 size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
541 if (!size || size > 16 * PAGE_SIZE) 538 if (!size || size > 16 * PAGE_SIZE)
542 goto out_dput; 539 goto out_dput;
543 540
544 ops = memdup_user(am_hreq.ops, size); 541 ops = memdup_user(am_hreq.ops, size);
545 if (IS_ERR(ops)) { 542 if (IS_ERR(ops)) {
546 error = -PTR_ERR(ops); 543 error = PTR_ERR(ops);
547 goto out_dput; 544 goto out_dput;
548 } 545 }
549 546
550 error = ENOMEM; 547 error = -ENOMEM;
551 attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); 548 attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
552 if (!attr_name) 549 if (!attr_name)
553 goto out_kfree_ops; 550 goto out_kfree_ops;
@@ -557,7 +554,7 @@ xfs_attrmulti_by_handle(
557 ops[i].am_error = strncpy_from_user((char *)attr_name, 554 ops[i].am_error = strncpy_from_user((char *)attr_name,
558 ops[i].am_attrname, MAXNAMELEN); 555 ops[i].am_attrname, MAXNAMELEN);
559 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) 556 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
560 error = ERANGE; 557 error = -ERANGE;
561 if (ops[i].am_error < 0) 558 if (ops[i].am_error < 0)
562 break; 559 break;
563 560
@@ -588,19 +585,19 @@ xfs_attrmulti_by_handle(
588 mnt_drop_write_file(parfilp); 585 mnt_drop_write_file(parfilp);
589 break; 586 break;
590 default: 587 default:
591 ops[i].am_error = EINVAL; 588 ops[i].am_error = -EINVAL;
592 } 589 }
593 } 590 }
594 591
595 if (copy_to_user(am_hreq.ops, ops, size)) 592 if (copy_to_user(am_hreq.ops, ops, size))
596 error = XFS_ERROR(EFAULT); 593 error = -EFAULT;
597 594
598 kfree(attr_name); 595 kfree(attr_name);
599 out_kfree_ops: 596 out_kfree_ops:
600 kfree(ops); 597 kfree(ops);
601 out_dput: 598 out_dput:
602 dput(dentry); 599 dput(dentry);
603 return -error; 600 return error;
604} 601}
605 602
606int 603int
@@ -625,16 +622,16 @@ xfs_ioc_space(
625 */ 622 */
626 if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) && 623 if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
627 !capable(CAP_SYS_ADMIN)) 624 !capable(CAP_SYS_ADMIN))
628 return -XFS_ERROR(EPERM); 625 return -EPERM;
629 626
630 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) 627 if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
631 return -XFS_ERROR(EPERM); 628 return -EPERM;
632 629
633 if (!(filp->f_mode & FMODE_WRITE)) 630 if (!(filp->f_mode & FMODE_WRITE))
634 return -XFS_ERROR(EBADF); 631 return -EBADF;
635 632
636 if (!S_ISREG(inode->i_mode)) 633 if (!S_ISREG(inode->i_mode))
637 return -XFS_ERROR(EINVAL); 634 return -EINVAL;
638 635
639 error = mnt_want_write_file(filp); 636 error = mnt_want_write_file(filp);
640 if (error) 637 if (error)
@@ -652,7 +649,7 @@ xfs_ioc_space(
652 bf->l_start += XFS_ISIZE(ip); 649 bf->l_start += XFS_ISIZE(ip);
653 break; 650 break;
654 default: 651 default:
655 error = XFS_ERROR(EINVAL); 652 error = -EINVAL;
656 goto out_unlock; 653 goto out_unlock;
657 } 654 }
658 655
@@ -669,7 +666,7 @@ xfs_ioc_space(
669 case XFS_IOC_UNRESVSP: 666 case XFS_IOC_UNRESVSP:
670 case XFS_IOC_UNRESVSP64: 667 case XFS_IOC_UNRESVSP64:
671 if (bf->l_len <= 0) { 668 if (bf->l_len <= 0) {
672 error = XFS_ERROR(EINVAL); 669 error = -EINVAL;
673 goto out_unlock; 670 goto out_unlock;
674 } 671 }
675 break; 672 break;
@@ -682,7 +679,7 @@ xfs_ioc_space(
682 bf->l_start > mp->m_super->s_maxbytes || 679 bf->l_start > mp->m_super->s_maxbytes ||
683 bf->l_start + bf->l_len < 0 || 680 bf->l_start + bf->l_len < 0 ||
684 bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) { 681 bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) {
685 error = XFS_ERROR(EINVAL); 682 error = -EINVAL;
686 goto out_unlock; 683 goto out_unlock;
687 } 684 }
688 685
@@ -723,7 +720,7 @@ xfs_ioc_space(
723 break; 720 break;
724 default: 721 default:
725 ASSERT(0); 722 ASSERT(0);
726 error = XFS_ERROR(EINVAL); 723 error = -EINVAL;
727 } 724 }
728 725
729 if (error) 726 if (error)
@@ -739,7 +736,7 @@ xfs_ioc_space(
739 xfs_ilock(ip, XFS_ILOCK_EXCL); 736 xfs_ilock(ip, XFS_ILOCK_EXCL);
740 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 737 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
741 738
742 if (!(ioflags & IO_INVIS)) { 739 if (!(ioflags & XFS_IO_INVIS)) {
743 ip->i_d.di_mode &= ~S_ISUID; 740 ip->i_d.di_mode &= ~S_ISUID;
744 if (ip->i_d.di_mode & S_IXGRP) 741 if (ip->i_d.di_mode & S_IXGRP)
745 ip->i_d.di_mode &= ~S_ISGID; 742 ip->i_d.di_mode &= ~S_ISGID;
@@ -759,7 +756,7 @@ xfs_ioc_space(
759out_unlock: 756out_unlock:
760 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 757 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
761 mnt_drop_write_file(filp); 758 mnt_drop_write_file(filp);
762 return -error; 759 return error;
763} 760}
764 761
765STATIC int 762STATIC int
@@ -781,41 +778,41 @@ xfs_ioc_bulkstat(
781 return -EPERM; 778 return -EPERM;
782 779
783 if (XFS_FORCED_SHUTDOWN(mp)) 780 if (XFS_FORCED_SHUTDOWN(mp))
784 return -XFS_ERROR(EIO); 781 return -EIO;
785 782
786 if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t))) 783 if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t)))
787 return -XFS_ERROR(EFAULT); 784 return -EFAULT;
788 785
789 if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) 786 if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
790 return -XFS_ERROR(EFAULT); 787 return -EFAULT;
791 788
792 if ((count = bulkreq.icount) <= 0) 789 if ((count = bulkreq.icount) <= 0)
793 return -XFS_ERROR(EINVAL); 790 return -EINVAL;
794 791
795 if (bulkreq.ubuffer == NULL) 792 if (bulkreq.ubuffer == NULL)
796 return -XFS_ERROR(EINVAL); 793 return -EINVAL;
797 794
798 if (cmd == XFS_IOC_FSINUMBERS) 795 if (cmd == XFS_IOC_FSINUMBERS)
799 error = xfs_inumbers(mp, &inlast, &count, 796 error = xfs_inumbers(mp, &inlast, &count,
800 bulkreq.ubuffer, xfs_inumbers_fmt); 797 bulkreq.ubuffer, xfs_inumbers_fmt);
801 else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) 798 else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
802 error = xfs_bulkstat_single(mp, &inlast, 799 error = xfs_bulkstat_one(mp, inlast, bulkreq.ubuffer,
803 bulkreq.ubuffer, &done); 800 sizeof(xfs_bstat_t), NULL, &done);
804 else /* XFS_IOC_FSBULKSTAT */ 801 else /* XFS_IOC_FSBULKSTAT */
805 error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one, 802 error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one,
806 sizeof(xfs_bstat_t), bulkreq.ubuffer, 803 sizeof(xfs_bstat_t), bulkreq.ubuffer,
807 &done); 804 &done);
808 805
809 if (error) 806 if (error)
810 return -error; 807 return error;
811 808
812 if (bulkreq.ocount != NULL) { 809 if (bulkreq.ocount != NULL) {
813 if (copy_to_user(bulkreq.lastip, &inlast, 810 if (copy_to_user(bulkreq.lastip, &inlast,
814 sizeof(xfs_ino_t))) 811 sizeof(xfs_ino_t)))
815 return -XFS_ERROR(EFAULT); 812 return -EFAULT;
816 813
817 if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) 814 if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
818 return -XFS_ERROR(EFAULT); 815 return -EFAULT;
819 } 816 }
820 817
821 return 0; 818 return 0;
@@ -831,7 +828,7 @@ xfs_ioc_fsgeometry_v1(
831 828
832 error = xfs_fs_geometry(mp, &fsgeo, 3); 829 error = xfs_fs_geometry(mp, &fsgeo, 3);
833 if (error) 830 if (error)
834 return -error; 831 return error;
835 832
836 /* 833 /*
837 * Caller should have passed an argument of type 834 * Caller should have passed an argument of type
@@ -839,7 +836,7 @@ xfs_ioc_fsgeometry_v1(
839 * xfs_fsop_geom_t that xfs_fs_geometry() fills in. 836 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
840 */ 837 */
841 if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t))) 838 if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
842 return -XFS_ERROR(EFAULT); 839 return -EFAULT;
843 return 0; 840 return 0;
844} 841}
845 842
@@ -853,10 +850,10 @@ xfs_ioc_fsgeometry(
853 850
854 error = xfs_fs_geometry(mp, &fsgeo, 4); 851 error = xfs_fs_geometry(mp, &fsgeo, 4);
855 if (error) 852 if (error)
856 return -error; 853 return error;
857 854
858 if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) 855 if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
859 return -XFS_ERROR(EFAULT); 856 return -EFAULT;
860 return 0; 857 return 0;
861} 858}
862 859
@@ -1041,16 +1038,16 @@ xfs_ioctl_setattr(
1041 trace_xfs_ioctl_setattr(ip); 1038 trace_xfs_ioctl_setattr(ip);
1042 1039
1043 if (mp->m_flags & XFS_MOUNT_RDONLY) 1040 if (mp->m_flags & XFS_MOUNT_RDONLY)
1044 return XFS_ERROR(EROFS); 1041 return -EROFS;
1045 if (XFS_FORCED_SHUTDOWN(mp)) 1042 if (XFS_FORCED_SHUTDOWN(mp))
1046 return XFS_ERROR(EIO); 1043 return -EIO;
1047 1044
1048 /* 1045 /*
1049 * Disallow 32bit project ids when projid32bit feature is not enabled. 1046 * Disallow 32bit project ids when projid32bit feature is not enabled.
1050 */ 1047 */
1051 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) && 1048 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
1052 !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) 1049 !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
1053 return XFS_ERROR(EINVAL); 1050 return -EINVAL;
1054 1051
1055 /* 1052 /*
1056 * If disk quotas is on, we make sure that the dquots do exist on disk, 1053 * If disk quotas is on, we make sure that the dquots do exist on disk,
@@ -1088,7 +1085,7 @@ xfs_ioctl_setattr(
1088 * CAP_FSETID capability is applicable. 1085 * CAP_FSETID capability is applicable.
1089 */ 1086 */
1090 if (!inode_owner_or_capable(VFS_I(ip))) { 1087 if (!inode_owner_or_capable(VFS_I(ip))) {
1091 code = XFS_ERROR(EPERM); 1088 code = -EPERM;
1092 goto error_return; 1089 goto error_return;
1093 } 1090 }
1094 1091
@@ -1099,7 +1096,7 @@ xfs_ioctl_setattr(
1099 */ 1096 */
1100 if (mask & FSX_PROJID) { 1097 if (mask & FSX_PROJID) {
1101 if (current_user_ns() != &init_user_ns) { 1098 if (current_user_ns() != &init_user_ns) {
1102 code = XFS_ERROR(EINVAL); 1099 code = -EINVAL;
1103 goto error_return; 1100 goto error_return;
1104 } 1101 }
1105 1102
@@ -1122,7 +1119,7 @@ xfs_ioctl_setattr(
1122 if (ip->i_d.di_nextents && 1119 if (ip->i_d.di_nextents &&
1123 ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != 1120 ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
1124 fa->fsx_extsize)) { 1121 fa->fsx_extsize)) {
1125 code = XFS_ERROR(EINVAL); /* EFBIG? */ 1122 code = -EINVAL; /* EFBIG? */
1126 goto error_return; 1123 goto error_return;
1127 } 1124 }
1128 1125
@@ -1141,7 +1138,7 @@ xfs_ioctl_setattr(
1141 1138
1142 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); 1139 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
1143 if (extsize_fsb > MAXEXTLEN) { 1140 if (extsize_fsb > MAXEXTLEN) {
1144 code = XFS_ERROR(EINVAL); 1141 code = -EINVAL;
1145 goto error_return; 1142 goto error_return;
1146 } 1143 }
1147 1144
@@ -1153,13 +1150,13 @@ xfs_ioctl_setattr(
1153 } else { 1150 } else {
1154 size = mp->m_sb.sb_blocksize; 1151 size = mp->m_sb.sb_blocksize;
1155 if (extsize_fsb > mp->m_sb.sb_agblocks / 2) { 1152 if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
1156 code = XFS_ERROR(EINVAL); 1153 code = -EINVAL;
1157 goto error_return; 1154 goto error_return;
1158 } 1155 }
1159 } 1156 }
1160 1157
1161 if (fa->fsx_extsize % size) { 1158 if (fa->fsx_extsize % size) {
1162 code = XFS_ERROR(EINVAL); 1159 code = -EINVAL;
1163 goto error_return; 1160 goto error_return;
1164 } 1161 }
1165 } 1162 }
@@ -1173,7 +1170,7 @@ xfs_ioctl_setattr(
1173 if ((ip->i_d.di_nextents || ip->i_delayed_blks) && 1170 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
1174 (XFS_IS_REALTIME_INODE(ip)) != 1171 (XFS_IS_REALTIME_INODE(ip)) !=
1175 (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { 1172 (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
1176 code = XFS_ERROR(EINVAL); /* EFBIG? */ 1173 code = -EINVAL; /* EFBIG? */
1177 goto error_return; 1174 goto error_return;
1178 } 1175 }
1179 1176
@@ -1184,7 +1181,7 @@ xfs_ioctl_setattr(
1184 if ((mp->m_sb.sb_rblocks == 0) || 1181 if ((mp->m_sb.sb_rblocks == 0) ||
1185 (mp->m_sb.sb_rextsize == 0) || 1182 (mp->m_sb.sb_rextsize == 0) ||
1186 (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) { 1183 (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
1187 code = XFS_ERROR(EINVAL); 1184 code = -EINVAL;
1188 goto error_return; 1185 goto error_return;
1189 } 1186 }
1190 } 1187 }
@@ -1198,7 +1195,7 @@ xfs_ioctl_setattr(
1198 (fa->fsx_xflags & 1195 (fa->fsx_xflags &
1199 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && 1196 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
1200 !capable(CAP_LINUX_IMMUTABLE)) { 1197 !capable(CAP_LINUX_IMMUTABLE)) {
1201 code = XFS_ERROR(EPERM); 1198 code = -EPERM;
1202 goto error_return; 1199 goto error_return;
1203 } 1200 }
1204 } 1201 }
@@ -1301,7 +1298,7 @@ xfs_ioc_fssetxattr(
1301 return error; 1298 return error;
1302 error = xfs_ioctl_setattr(ip, &fa, mask); 1299 error = xfs_ioctl_setattr(ip, &fa, mask);
1303 mnt_drop_write_file(filp); 1300 mnt_drop_write_file(filp);
1304 return -error; 1301 return error;
1305} 1302}
1306 1303
1307STATIC int 1304STATIC int
@@ -1346,7 +1343,7 @@ xfs_ioc_setxflags(
1346 return error; 1343 return error;
1347 error = xfs_ioctl_setattr(ip, &fa, mask); 1344 error = xfs_ioctl_setattr(ip, &fa, mask);
1348 mnt_drop_write_file(filp); 1345 mnt_drop_write_file(filp);
1349 return -error; 1346 return error;
1350} 1347}
1351 1348
1352STATIC int 1349STATIC int
@@ -1356,7 +1353,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
1356 1353
1357 /* copy only getbmap portion (not getbmapx) */ 1354 /* copy only getbmap portion (not getbmapx) */
1358 if (copy_to_user(base, bmv, sizeof(struct getbmap))) 1355 if (copy_to_user(base, bmv, sizeof(struct getbmap)))
1359 return XFS_ERROR(EFAULT); 1356 return -EFAULT;
1360 1357
1361 *ap += sizeof(struct getbmap); 1358 *ap += sizeof(struct getbmap);
1362 return 0; 1359 return 0;
@@ -1373,23 +1370,23 @@ xfs_ioc_getbmap(
1373 int error; 1370 int error;
1374 1371
1375 if (copy_from_user(&bmx, arg, sizeof(struct getbmapx))) 1372 if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
1376 return -XFS_ERROR(EFAULT); 1373 return -EFAULT;
1377 1374
1378 if (bmx.bmv_count < 2) 1375 if (bmx.bmv_count < 2)
1379 return -XFS_ERROR(EINVAL); 1376 return -EINVAL;
1380 1377
1381 bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); 1378 bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
1382 if (ioflags & IO_INVIS) 1379 if (ioflags & XFS_IO_INVIS)
1383 bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; 1380 bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
1384 1381
1385 error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, 1382 error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
1386 (struct getbmap *)arg+1); 1383 (struct getbmap *)arg+1);
1387 if (error) 1384 if (error)
1388 return -error; 1385 return error;
1389 1386
1390 /* copy back header - only size of getbmap */ 1387 /* copy back header - only size of getbmap */
1391 if (copy_to_user(arg, &bmx, sizeof(struct getbmap))) 1388 if (copy_to_user(arg, &bmx, sizeof(struct getbmap)))
1392 return -XFS_ERROR(EFAULT); 1389 return -EFAULT;
1393 return 0; 1390 return 0;
1394} 1391}
1395 1392
@@ -1399,7 +1396,7 @@ xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
1399 struct getbmapx __user *base = *ap; 1396 struct getbmapx __user *base = *ap;
1400 1397
1401 if (copy_to_user(base, bmv, sizeof(struct getbmapx))) 1398 if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
1402 return XFS_ERROR(EFAULT); 1399 return -EFAULT;
1403 1400
1404 *ap += sizeof(struct getbmapx); 1401 *ap += sizeof(struct getbmapx);
1405 return 0; 1402 return 0;
@@ -1414,22 +1411,22 @@ xfs_ioc_getbmapx(
1414 int error; 1411 int error;
1415 1412
1416 if (copy_from_user(&bmx, arg, sizeof(bmx))) 1413 if (copy_from_user(&bmx, arg, sizeof(bmx)))
1417 return -XFS_ERROR(EFAULT); 1414 return -EFAULT;
1418 1415
1419 if (bmx.bmv_count < 2) 1416 if (bmx.bmv_count < 2)
1420 return -XFS_ERROR(EINVAL); 1417 return -EINVAL;
1421 1418
1422 if (bmx.bmv_iflags & (~BMV_IF_VALID)) 1419 if (bmx.bmv_iflags & (~BMV_IF_VALID))
1423 return -XFS_ERROR(EINVAL); 1420 return -EINVAL;
1424 1421
1425 error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format, 1422 error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
1426 (struct getbmapx *)arg+1); 1423 (struct getbmapx *)arg+1);
1427 if (error) 1424 if (error)
1428 return -error; 1425 return error;
1429 1426
1430 /* copy back header */ 1427 /* copy back header */
1431 if (copy_to_user(arg, &bmx, sizeof(struct getbmapx))) 1428 if (copy_to_user(arg, &bmx, sizeof(struct getbmapx)))
1432 return -XFS_ERROR(EFAULT); 1429 return -EFAULT;
1433 1430
1434 return 0; 1431 return 0;
1435} 1432}
@@ -1445,33 +1442,33 @@ xfs_ioc_swapext(
1445 /* Pull information for the target fd */ 1442 /* Pull information for the target fd */
1446 f = fdget((int)sxp->sx_fdtarget); 1443 f = fdget((int)sxp->sx_fdtarget);
1447 if (!f.file) { 1444 if (!f.file) {
1448 error = XFS_ERROR(EINVAL); 1445 error = -EINVAL;
1449 goto out; 1446 goto out;
1450 } 1447 }
1451 1448
1452 if (!(f.file->f_mode & FMODE_WRITE) || 1449 if (!(f.file->f_mode & FMODE_WRITE) ||
1453 !(f.file->f_mode & FMODE_READ) || 1450 !(f.file->f_mode & FMODE_READ) ||
1454 (f.file->f_flags & O_APPEND)) { 1451 (f.file->f_flags & O_APPEND)) {
1455 error = XFS_ERROR(EBADF); 1452 error = -EBADF;
1456 goto out_put_file; 1453 goto out_put_file;
1457 } 1454 }
1458 1455
1459 tmp = fdget((int)sxp->sx_fdtmp); 1456 tmp = fdget((int)sxp->sx_fdtmp);
1460 if (!tmp.file) { 1457 if (!tmp.file) {
1461 error = XFS_ERROR(EINVAL); 1458 error = -EINVAL;
1462 goto out_put_file; 1459 goto out_put_file;
1463 } 1460 }
1464 1461
1465 if (!(tmp.file->f_mode & FMODE_WRITE) || 1462 if (!(tmp.file->f_mode & FMODE_WRITE) ||
1466 !(tmp.file->f_mode & FMODE_READ) || 1463 !(tmp.file->f_mode & FMODE_READ) ||
1467 (tmp.file->f_flags & O_APPEND)) { 1464 (tmp.file->f_flags & O_APPEND)) {
1468 error = XFS_ERROR(EBADF); 1465 error = -EBADF;
1469 goto out_put_tmp_file; 1466 goto out_put_tmp_file;
1470 } 1467 }
1471 1468
1472 if (IS_SWAPFILE(file_inode(f.file)) || 1469 if (IS_SWAPFILE(file_inode(f.file)) ||
1473 IS_SWAPFILE(file_inode(tmp.file))) { 1470 IS_SWAPFILE(file_inode(tmp.file))) {
1474 error = XFS_ERROR(EINVAL); 1471 error = -EINVAL;
1475 goto out_put_tmp_file; 1472 goto out_put_tmp_file;
1476 } 1473 }
1477 1474
@@ -1479,17 +1476,17 @@ xfs_ioc_swapext(
1479 tip = XFS_I(file_inode(tmp.file)); 1476 tip = XFS_I(file_inode(tmp.file));
1480 1477
1481 if (ip->i_mount != tip->i_mount) { 1478 if (ip->i_mount != tip->i_mount) {
1482 error = XFS_ERROR(EINVAL); 1479 error = -EINVAL;
1483 goto out_put_tmp_file; 1480 goto out_put_tmp_file;
1484 } 1481 }
1485 1482
1486 if (ip->i_ino == tip->i_ino) { 1483 if (ip->i_ino == tip->i_ino) {
1487 error = XFS_ERROR(EINVAL); 1484 error = -EINVAL;
1488 goto out_put_tmp_file; 1485 goto out_put_tmp_file;
1489 } 1486 }
1490 1487
1491 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 1488 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1492 error = XFS_ERROR(EIO); 1489 error = -EIO;
1493 goto out_put_tmp_file; 1490 goto out_put_tmp_file;
1494 } 1491 }
1495 1492
@@ -1523,7 +1520,7 @@ xfs_file_ioctl(
1523 int error; 1520 int error;
1524 1521
1525 if (filp->f_mode & FMODE_NOCMTIME) 1522 if (filp->f_mode & FMODE_NOCMTIME)
1526 ioflags |= IO_INVIS; 1523 ioflags |= XFS_IO_INVIS;
1527 1524
1528 trace_xfs_file_ioctl(ip); 1525 trace_xfs_file_ioctl(ip);
1529 1526
@@ -1542,7 +1539,7 @@ xfs_file_ioctl(
1542 xfs_flock64_t bf; 1539 xfs_flock64_t bf;
1543 1540
1544 if (copy_from_user(&bf, arg, sizeof(bf))) 1541 if (copy_from_user(&bf, arg, sizeof(bf)))
1545 return -XFS_ERROR(EFAULT); 1542 return -EFAULT;
1546 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); 1543 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
1547 } 1544 }
1548 case XFS_IOC_DIOINFO: { 1545 case XFS_IOC_DIOINFO: {
@@ -1555,7 +1552,7 @@ xfs_file_ioctl(
1555 da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); 1552 da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
1556 1553
1557 if (copy_to_user(arg, &da, sizeof(da))) 1554 if (copy_to_user(arg, &da, sizeof(da)))
1558 return -XFS_ERROR(EFAULT); 1555 return -EFAULT;
1559 return 0; 1556 return 0;
1560 } 1557 }
1561 1558
@@ -1588,7 +1585,7 @@ xfs_file_ioctl(
1588 struct fsdmidata dmi; 1585 struct fsdmidata dmi;
1589 1586
1590 if (copy_from_user(&dmi, arg, sizeof(dmi))) 1587 if (copy_from_user(&dmi, arg, sizeof(dmi)))
1591 return -XFS_ERROR(EFAULT); 1588 return -EFAULT;
1592 1589
1593 error = mnt_want_write_file(filp); 1590 error = mnt_want_write_file(filp);
1594 if (error) 1591 if (error)
@@ -1597,7 +1594,7 @@ xfs_file_ioctl(
1597 error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, 1594 error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
1598 dmi.fsd_dmstate); 1595 dmi.fsd_dmstate);
1599 mnt_drop_write_file(filp); 1596 mnt_drop_write_file(filp);
1600 return -error; 1597 return error;
1601 } 1598 }
1602 1599
1603 case XFS_IOC_GETBMAP: 1600 case XFS_IOC_GETBMAP:
@@ -1613,14 +1610,14 @@ xfs_file_ioctl(
1613 xfs_fsop_handlereq_t hreq; 1610 xfs_fsop_handlereq_t hreq;
1614 1611
1615 if (copy_from_user(&hreq, arg, sizeof(hreq))) 1612 if (copy_from_user(&hreq, arg, sizeof(hreq)))
1616 return -XFS_ERROR(EFAULT); 1613 return -EFAULT;
1617 return xfs_find_handle(cmd, &hreq); 1614 return xfs_find_handle(cmd, &hreq);
1618 } 1615 }
1619 case XFS_IOC_OPEN_BY_HANDLE: { 1616 case XFS_IOC_OPEN_BY_HANDLE: {
1620 xfs_fsop_handlereq_t hreq; 1617 xfs_fsop_handlereq_t hreq;
1621 1618
1622 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) 1619 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
1623 return -XFS_ERROR(EFAULT); 1620 return -EFAULT;
1624 return xfs_open_by_handle(filp, &hreq); 1621 return xfs_open_by_handle(filp, &hreq);
1625 } 1622 }
1626 case XFS_IOC_FSSETDM_BY_HANDLE: 1623 case XFS_IOC_FSSETDM_BY_HANDLE:
@@ -1630,7 +1627,7 @@ xfs_file_ioctl(
1630 xfs_fsop_handlereq_t hreq; 1627 xfs_fsop_handlereq_t hreq;
1631 1628
1632 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) 1629 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
1633 return -XFS_ERROR(EFAULT); 1630 return -EFAULT;
1634 return xfs_readlink_by_handle(filp, &hreq); 1631 return xfs_readlink_by_handle(filp, &hreq);
1635 } 1632 }
1636 case XFS_IOC_ATTRLIST_BY_HANDLE: 1633 case XFS_IOC_ATTRLIST_BY_HANDLE:
@@ -1643,13 +1640,13 @@ xfs_file_ioctl(
1643 struct xfs_swapext sxp; 1640 struct xfs_swapext sxp;
1644 1641
1645 if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t))) 1642 if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
1646 return -XFS_ERROR(EFAULT); 1643 return -EFAULT;
1647 error = mnt_want_write_file(filp); 1644 error = mnt_want_write_file(filp);
1648 if (error) 1645 if (error)
1649 return error; 1646 return error;
1650 error = xfs_ioc_swapext(&sxp); 1647 error = xfs_ioc_swapext(&sxp);
1651 mnt_drop_write_file(filp); 1648 mnt_drop_write_file(filp);
1652 return -error; 1649 return error;
1653 } 1650 }
1654 1651
1655 case XFS_IOC_FSCOUNTS: { 1652 case XFS_IOC_FSCOUNTS: {
@@ -1657,10 +1654,10 @@ xfs_file_ioctl(
1657 1654
1658 error = xfs_fs_counts(mp, &out); 1655 error = xfs_fs_counts(mp, &out);
1659 if (error) 1656 if (error)
1660 return -error; 1657 return error;
1661 1658
1662 if (copy_to_user(arg, &out, sizeof(out))) 1659 if (copy_to_user(arg, &out, sizeof(out)))
1663 return -XFS_ERROR(EFAULT); 1660 return -EFAULT;
1664 return 0; 1661 return 0;
1665 } 1662 }
1666 1663
@@ -1672,10 +1669,10 @@ xfs_file_ioctl(
1672 return -EPERM; 1669 return -EPERM;
1673 1670
1674 if (mp->m_flags & XFS_MOUNT_RDONLY) 1671 if (mp->m_flags & XFS_MOUNT_RDONLY)
1675 return -XFS_ERROR(EROFS); 1672 return -EROFS;
1676 1673
1677 if (copy_from_user(&inout, arg, sizeof(inout))) 1674 if (copy_from_user(&inout, arg, sizeof(inout)))
1678 return -XFS_ERROR(EFAULT); 1675 return -EFAULT;
1679 1676
1680 error = mnt_want_write_file(filp); 1677 error = mnt_want_write_file(filp);
1681 if (error) 1678 if (error)
@@ -1686,10 +1683,10 @@ xfs_file_ioctl(
1686 error = xfs_reserve_blocks(mp, &in, &inout); 1683 error = xfs_reserve_blocks(mp, &in, &inout);
1687 mnt_drop_write_file(filp); 1684 mnt_drop_write_file(filp);
1688 if (error) 1685 if (error)
1689 return -error; 1686 return error;
1690 1687
1691 if (copy_to_user(arg, &inout, sizeof(inout))) 1688 if (copy_to_user(arg, &inout, sizeof(inout)))
1692 return -XFS_ERROR(EFAULT); 1689 return -EFAULT;
1693 return 0; 1690 return 0;
1694 } 1691 }
1695 1692
@@ -1701,10 +1698,10 @@ xfs_file_ioctl(
1701 1698
1702 error = xfs_reserve_blocks(mp, NULL, &out); 1699 error = xfs_reserve_blocks(mp, NULL, &out);
1703 if (error) 1700 if (error)
1704 return -error; 1701 return error;
1705 1702
1706 if (copy_to_user(arg, &out, sizeof(out))) 1703 if (copy_to_user(arg, &out, sizeof(out)))
1707 return -XFS_ERROR(EFAULT); 1704 return -EFAULT;
1708 1705
1709 return 0; 1706 return 0;
1710 } 1707 }
@@ -1713,42 +1710,42 @@ xfs_file_ioctl(
1713 xfs_growfs_data_t in; 1710 xfs_growfs_data_t in;
1714 1711
1715 if (copy_from_user(&in, arg, sizeof(in))) 1712 if (copy_from_user(&in, arg, sizeof(in)))
1716 return -XFS_ERROR(EFAULT); 1713 return -EFAULT;
1717 1714
1718 error = mnt_want_write_file(filp); 1715 error = mnt_want_write_file(filp);
1719 if (error) 1716 if (error)
1720 return error; 1717 return error;
1721 error = xfs_growfs_data(mp, &in); 1718 error = xfs_growfs_data(mp, &in);
1722 mnt_drop_write_file(filp); 1719 mnt_drop_write_file(filp);
1723 return -error; 1720 return error;
1724 } 1721 }
1725 1722
1726 case XFS_IOC_FSGROWFSLOG: { 1723 case XFS_IOC_FSGROWFSLOG: {
1727 xfs_growfs_log_t in; 1724 xfs_growfs_log_t in;
1728 1725
1729 if (copy_from_user(&in, arg, sizeof(in))) 1726 if (copy_from_user(&in, arg, sizeof(in)))
1730 return -XFS_ERROR(EFAULT); 1727 return -EFAULT;
1731 1728
1732 error = mnt_want_write_file(filp); 1729 error = mnt_want_write_file(filp);
1733 if (error) 1730 if (error)
1734 return error; 1731 return error;
1735 error = xfs_growfs_log(mp, &in); 1732 error = xfs_growfs_log(mp, &in);
1736 mnt_drop_write_file(filp); 1733 mnt_drop_write_file(filp);
1737 return -error; 1734 return error;
1738 } 1735 }
1739 1736
1740 case XFS_IOC_FSGROWFSRT: { 1737 case XFS_IOC_FSGROWFSRT: {
1741 xfs_growfs_rt_t in; 1738 xfs_growfs_rt_t in;
1742 1739
1743 if (copy_from_user(&in, arg, sizeof(in))) 1740 if (copy_from_user(&in, arg, sizeof(in)))
1744 return -XFS_ERROR(EFAULT); 1741 return -EFAULT;
1745 1742
1746 error = mnt_want_write_file(filp); 1743 error = mnt_want_write_file(filp);
1747 if (error) 1744 if (error)
1748 return error; 1745 return error;
1749 error = xfs_growfs_rt(mp, &in); 1746 error = xfs_growfs_rt(mp, &in);
1750 mnt_drop_write_file(filp); 1747 mnt_drop_write_file(filp);
1751 return -error; 1748 return error;
1752 } 1749 }
1753 1750
1754 case XFS_IOC_GOINGDOWN: { 1751 case XFS_IOC_GOINGDOWN: {
@@ -1758,10 +1755,9 @@ xfs_file_ioctl(
1758 return -EPERM; 1755 return -EPERM;
1759 1756
1760 if (get_user(in, (__uint32_t __user *)arg)) 1757 if (get_user(in, (__uint32_t __user *)arg))
1761 return -XFS_ERROR(EFAULT); 1758 return -EFAULT;
1762 1759
1763 error = xfs_fs_goingdown(mp, in); 1760 return xfs_fs_goingdown(mp, in);
1764 return -error;
1765 } 1761 }
1766 1762
1767 case XFS_IOC_ERROR_INJECTION: { 1763 case XFS_IOC_ERROR_INJECTION: {
@@ -1771,18 +1767,16 @@ xfs_file_ioctl(
1771 return -EPERM; 1767 return -EPERM;
1772 1768
1773 if (copy_from_user(&in, arg, sizeof(in))) 1769 if (copy_from_user(&in, arg, sizeof(in)))
1774 return -XFS_ERROR(EFAULT); 1770 return -EFAULT;
1775 1771
1776 error = xfs_errortag_add(in.errtag, mp); 1772 return xfs_errortag_add(in.errtag, mp);
1777 return -error;
1778 } 1773 }
1779 1774
1780 case XFS_IOC_ERROR_CLEARALL: 1775 case XFS_IOC_ERROR_CLEARALL:
1781 if (!capable(CAP_SYS_ADMIN)) 1776 if (!capable(CAP_SYS_ADMIN))
1782 return -EPERM; 1777 return -EPERM;
1783 1778
1784 error = xfs_errortag_clearall(mp, 1); 1779 return xfs_errortag_clearall(mp, 1);
1785 return -error;
1786 1780
1787 case XFS_IOC_FREE_EOFBLOCKS: { 1781 case XFS_IOC_FREE_EOFBLOCKS: {
1788 struct xfs_fs_eofblocks eofb; 1782 struct xfs_fs_eofblocks eofb;
@@ -1792,16 +1786,16 @@ xfs_file_ioctl(
1792 return -EPERM; 1786 return -EPERM;
1793 1787
1794 if (mp->m_flags & XFS_MOUNT_RDONLY) 1788 if (mp->m_flags & XFS_MOUNT_RDONLY)
1795 return -XFS_ERROR(EROFS); 1789 return -EROFS;
1796 1790
1797 if (copy_from_user(&eofb, arg, sizeof(eofb))) 1791 if (copy_from_user(&eofb, arg, sizeof(eofb)))
1798 return -XFS_ERROR(EFAULT); 1792 return -EFAULT;
1799 1793
1800 error = xfs_fs_eofblocks_from_user(&eofb, &keofb); 1794 error = xfs_fs_eofblocks_from_user(&eofb, &keofb);
1801 if (error) 1795 if (error)
1802 return -error; 1796 return error;
1803 1797
1804 return -xfs_icache_free_eofblocks(mp, &keofb); 1798 return xfs_icache_free_eofblocks(mp, &keofb);
1805 } 1799 }
1806 1800
1807 default: 1801 default:
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 944d5baa710a..a554646ff141 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -28,7 +28,6 @@
28#include "xfs_sb.h" 28#include "xfs_sb.h"
29#include "xfs_ag.h" 29#include "xfs_ag.h"
30#include "xfs_mount.h" 30#include "xfs_mount.h"
31#include "xfs_vnode.h"
32#include "xfs_inode.h" 31#include "xfs_inode.h"
33#include "xfs_itable.h" 32#include "xfs_itable.h"
34#include "xfs_error.h" 33#include "xfs_error.h"
@@ -56,7 +55,7 @@ xfs_compat_flock64_copyin(
56 get_user(bf->l_sysid, &arg32->l_sysid) || 55 get_user(bf->l_sysid, &arg32->l_sysid) ||
57 get_user(bf->l_pid, &arg32->l_pid) || 56 get_user(bf->l_pid, &arg32->l_pid) ||
58 copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32))) 57 copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32)))
59 return -XFS_ERROR(EFAULT); 58 return -EFAULT;
60 return 0; 59 return 0;
61} 60}
62 61
@@ -70,10 +69,10 @@ xfs_compat_ioc_fsgeometry_v1(
70 69
71 error = xfs_fs_geometry(mp, &fsgeo, 3); 70 error = xfs_fs_geometry(mp, &fsgeo, 3);
72 if (error) 71 if (error)
73 return -error; 72 return error;
74 /* The 32-bit variant simply has some padding at the end */ 73 /* The 32-bit variant simply has some padding at the end */
75 if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1))) 74 if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
76 return -XFS_ERROR(EFAULT); 75 return -EFAULT;
77 return 0; 76 return 0;
78} 77}
79 78
@@ -84,7 +83,7 @@ xfs_compat_growfs_data_copyin(
84{ 83{
85 if (get_user(in->newblocks, &arg32->newblocks) || 84 if (get_user(in->newblocks, &arg32->newblocks) ||
86 get_user(in->imaxpct, &arg32->imaxpct)) 85 get_user(in->imaxpct, &arg32->imaxpct))
87 return -XFS_ERROR(EFAULT); 86 return -EFAULT;
88 return 0; 87 return 0;
89} 88}
90 89
@@ -95,14 +94,14 @@ xfs_compat_growfs_rt_copyin(
95{ 94{
96 if (get_user(in->newblocks, &arg32->newblocks) || 95 if (get_user(in->newblocks, &arg32->newblocks) ||
97 get_user(in->extsize, &arg32->extsize)) 96 get_user(in->extsize, &arg32->extsize))
98 return -XFS_ERROR(EFAULT); 97 return -EFAULT;
99 return 0; 98 return 0;
100} 99}
101 100
102STATIC int 101STATIC int
103xfs_inumbers_fmt_compat( 102xfs_inumbers_fmt_compat(
104 void __user *ubuffer, 103 void __user *ubuffer,
105 const xfs_inogrp_t *buffer, 104 const struct xfs_inogrp *buffer,
106 long count, 105 long count,
107 long *written) 106 long *written)
108{ 107{
@@ -113,7 +112,7 @@ xfs_inumbers_fmt_compat(
113 if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) || 112 if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) ||
114 put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) || 113 put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
115 put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask)) 114 put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask))
116 return -XFS_ERROR(EFAULT); 115 return -EFAULT;
117 } 116 }
118 *written = count * sizeof(*p32); 117 *written = count * sizeof(*p32);
119 return 0; 118 return 0;
@@ -132,7 +131,7 @@ xfs_ioctl32_bstime_copyin(
132 131
133 if (get_user(sec32, &bstime32->tv_sec) || 132 if (get_user(sec32, &bstime32->tv_sec) ||
134 get_user(bstime->tv_nsec, &bstime32->tv_nsec)) 133 get_user(bstime->tv_nsec, &bstime32->tv_nsec))
135 return -XFS_ERROR(EFAULT); 134 return -EFAULT;
136 bstime->tv_sec = sec32; 135 bstime->tv_sec = sec32;
137 return 0; 136 return 0;
138} 137}
@@ -164,7 +163,7 @@ xfs_ioctl32_bstat_copyin(
164 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || 163 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
165 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || 164 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) ||
166 get_user(bstat->bs_aextents, &bstat32->bs_aextents)) 165 get_user(bstat->bs_aextents, &bstat32->bs_aextents))
167 return -XFS_ERROR(EFAULT); 166 return -EFAULT;
168 return 0; 167 return 0;
169} 168}
170 169
@@ -180,7 +179,7 @@ xfs_bstime_store_compat(
180 sec32 = p->tv_sec; 179 sec32 = p->tv_sec;
181 if (put_user(sec32, &p32->tv_sec) || 180 if (put_user(sec32, &p32->tv_sec) ||
182 put_user(p->tv_nsec, &p32->tv_nsec)) 181 put_user(p->tv_nsec, &p32->tv_nsec))
183 return -XFS_ERROR(EFAULT); 182 return -EFAULT;
184 return 0; 183 return 0;
185} 184}
186 185
@@ -195,7 +194,7 @@ xfs_bulkstat_one_fmt_compat(
195 compat_xfs_bstat_t __user *p32 = ubuffer; 194 compat_xfs_bstat_t __user *p32 = ubuffer;
196 195
197 if (ubsize < sizeof(*p32)) 196 if (ubsize < sizeof(*p32))
198 return XFS_ERROR(ENOMEM); 197 return -ENOMEM;
199 198
200 if (put_user(buffer->bs_ino, &p32->bs_ino) || 199 if (put_user(buffer->bs_ino, &p32->bs_ino) ||
201 put_user(buffer->bs_mode, &p32->bs_mode) || 200 put_user(buffer->bs_mode, &p32->bs_mode) ||
@@ -218,7 +217,7 @@ xfs_bulkstat_one_fmt_compat(
218 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || 217 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
219 put_user(buffer->bs_dmstate, &p32->bs_dmstate) || 218 put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
220 put_user(buffer->bs_aextents, &p32->bs_aextents)) 219 put_user(buffer->bs_aextents, &p32->bs_aextents))
221 return XFS_ERROR(EFAULT); 220 return -EFAULT;
222 if (ubused) 221 if (ubused)
223 *ubused = sizeof(*p32); 222 *ubused = sizeof(*p32);
224 return 0; 223 return 0;
@@ -256,30 +255,30 @@ xfs_compat_ioc_bulkstat(
256 /* should be called again (unused here, but used in dmapi) */ 255 /* should be called again (unused here, but used in dmapi) */
257 256
258 if (!capable(CAP_SYS_ADMIN)) 257 if (!capable(CAP_SYS_ADMIN))
259 return -XFS_ERROR(EPERM); 258 return -EPERM;
260 259
261 if (XFS_FORCED_SHUTDOWN(mp)) 260 if (XFS_FORCED_SHUTDOWN(mp))
262 return -XFS_ERROR(EIO); 261 return -EIO;
263 262
264 if (get_user(addr, &p32->lastip)) 263 if (get_user(addr, &p32->lastip))
265 return -XFS_ERROR(EFAULT); 264 return -EFAULT;
266 bulkreq.lastip = compat_ptr(addr); 265 bulkreq.lastip = compat_ptr(addr);
267 if (get_user(bulkreq.icount, &p32->icount) || 266 if (get_user(bulkreq.icount, &p32->icount) ||
268 get_user(addr, &p32->ubuffer)) 267 get_user(addr, &p32->ubuffer))
269 return -XFS_ERROR(EFAULT); 268 return -EFAULT;
270 bulkreq.ubuffer = compat_ptr(addr); 269 bulkreq.ubuffer = compat_ptr(addr);
271 if (get_user(addr, &p32->ocount)) 270 if (get_user(addr, &p32->ocount))
272 return -XFS_ERROR(EFAULT); 271 return -EFAULT;
273 bulkreq.ocount = compat_ptr(addr); 272 bulkreq.ocount = compat_ptr(addr);
274 273
275 if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) 274 if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
276 return -XFS_ERROR(EFAULT); 275 return -EFAULT;
277 276
278 if ((count = bulkreq.icount) <= 0) 277 if ((count = bulkreq.icount) <= 0)
279 return -XFS_ERROR(EINVAL); 278 return -EINVAL;
280 279
281 if (bulkreq.ubuffer == NULL) 280 if (bulkreq.ubuffer == NULL)
282 return -XFS_ERROR(EINVAL); 281 return -EINVAL;
283 282
284 if (cmd == XFS_IOC_FSINUMBERS_32) { 283 if (cmd == XFS_IOC_FSINUMBERS_32) {
285 error = xfs_inumbers(mp, &inlast, &count, 284 error = xfs_inumbers(mp, &inlast, &count,
@@ -294,17 +293,17 @@ xfs_compat_ioc_bulkstat(
294 xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t), 293 xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
295 bulkreq.ubuffer, &done); 294 bulkreq.ubuffer, &done);
296 } else 295 } else
297 error = XFS_ERROR(EINVAL); 296 error = -EINVAL;
298 if (error) 297 if (error)
299 return -error; 298 return error;
300 299
301 if (bulkreq.ocount != NULL) { 300 if (bulkreq.ocount != NULL) {
302 if (copy_to_user(bulkreq.lastip, &inlast, 301 if (copy_to_user(bulkreq.lastip, &inlast,
303 sizeof(xfs_ino_t))) 302 sizeof(xfs_ino_t)))
304 return -XFS_ERROR(EFAULT); 303 return -EFAULT;
305 304
306 if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) 305 if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
307 return -XFS_ERROR(EFAULT); 306 return -EFAULT;
308 } 307 }
309 308
310 return 0; 309 return 0;
@@ -318,7 +317,7 @@ xfs_compat_handlereq_copyin(
318 compat_xfs_fsop_handlereq_t hreq32; 317 compat_xfs_fsop_handlereq_t hreq32;
319 318
320 if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t))) 319 if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t)))
321 return -XFS_ERROR(EFAULT); 320 return -EFAULT;
322 321
323 hreq->fd = hreq32.fd; 322 hreq->fd = hreq32.fd;
324 hreq->path = compat_ptr(hreq32.path); 323 hreq->path = compat_ptr(hreq32.path);
@@ -352,19 +351,19 @@ xfs_compat_attrlist_by_handle(
352 char *kbuf; 351 char *kbuf;
353 352
354 if (!capable(CAP_SYS_ADMIN)) 353 if (!capable(CAP_SYS_ADMIN))
355 return -XFS_ERROR(EPERM); 354 return -EPERM;
356 if (copy_from_user(&al_hreq, arg, 355 if (copy_from_user(&al_hreq, arg,
357 sizeof(compat_xfs_fsop_attrlist_handlereq_t))) 356 sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
358 return -XFS_ERROR(EFAULT); 357 return -EFAULT;
359 if (al_hreq.buflen < sizeof(struct attrlist) || 358 if (al_hreq.buflen < sizeof(struct attrlist) ||
360 al_hreq.buflen > XATTR_LIST_MAX) 359 al_hreq.buflen > XATTR_LIST_MAX)
361 return -XFS_ERROR(EINVAL); 360 return -EINVAL;
362 361
363 /* 362 /*
364 * Reject flags, only allow namespaces. 363 * Reject flags, only allow namespaces.
365 */ 364 */
366 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) 365 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
367 return -XFS_ERROR(EINVAL); 366 return -EINVAL;
368 367
369 dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq); 368 dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
370 if (IS_ERR(dentry)) 369 if (IS_ERR(dentry))
@@ -376,7 +375,7 @@ xfs_compat_attrlist_by_handle(
376 goto out_dput; 375 goto out_dput;
377 376
378 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; 377 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
379 error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, 378 error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
380 al_hreq.flags, cursor); 379 al_hreq.flags, cursor);
381 if (error) 380 if (error)
382 goto out_kfree; 381 goto out_kfree;
@@ -404,10 +403,10 @@ xfs_compat_attrmulti_by_handle(
404 unsigned char *attr_name; 403 unsigned char *attr_name;
405 404
406 if (!capable(CAP_SYS_ADMIN)) 405 if (!capable(CAP_SYS_ADMIN))
407 return -XFS_ERROR(EPERM); 406 return -EPERM;
408 if (copy_from_user(&am_hreq, arg, 407 if (copy_from_user(&am_hreq, arg,
409 sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) 408 sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
410 return -XFS_ERROR(EFAULT); 409 return -EFAULT;
411 410
412 /* overflow check */ 411 /* overflow check */
413 if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t)) 412 if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
@@ -417,7 +416,7 @@ xfs_compat_attrmulti_by_handle(
417 if (IS_ERR(dentry)) 416 if (IS_ERR(dentry))
418 return PTR_ERR(dentry); 417 return PTR_ERR(dentry);
419 418
420 error = E2BIG; 419 error = -E2BIG;
421 size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t); 420 size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
422 if (!size || size > 16 * PAGE_SIZE) 421 if (!size || size > 16 * PAGE_SIZE)
423 goto out_dput; 422 goto out_dput;
@@ -428,7 +427,7 @@ xfs_compat_attrmulti_by_handle(
428 goto out_dput; 427 goto out_dput;
429 } 428 }
430 429
431 error = ENOMEM; 430 error = -ENOMEM;
432 attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); 431 attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
433 if (!attr_name) 432 if (!attr_name)
434 goto out_kfree_ops; 433 goto out_kfree_ops;
@@ -439,7 +438,7 @@ xfs_compat_attrmulti_by_handle(
439 compat_ptr(ops[i].am_attrname), 438 compat_ptr(ops[i].am_attrname),
440 MAXNAMELEN); 439 MAXNAMELEN);
441 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) 440 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
442 error = ERANGE; 441 error = -ERANGE;
443 if (ops[i].am_error < 0) 442 if (ops[i].am_error < 0)
444 break; 443 break;
445 444
@@ -470,19 +469,19 @@ xfs_compat_attrmulti_by_handle(
470 mnt_drop_write_file(parfilp); 469 mnt_drop_write_file(parfilp);
471 break; 470 break;
472 default: 471 default:
473 ops[i].am_error = EINVAL; 472 ops[i].am_error = -EINVAL;
474 } 473 }
475 } 474 }
476 475
477 if (copy_to_user(compat_ptr(am_hreq.ops), ops, size)) 476 if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
478 error = XFS_ERROR(EFAULT); 477 error = -EFAULT;
479 478
480 kfree(attr_name); 479 kfree(attr_name);
481 out_kfree_ops: 480 out_kfree_ops:
482 kfree(ops); 481 kfree(ops);
483 out_dput: 482 out_dput:
484 dput(dentry); 483 dput(dentry);
485 return -error; 484 return error;
486} 485}
487 486
488STATIC int 487STATIC int
@@ -496,26 +495,26 @@ xfs_compat_fssetdm_by_handle(
496 struct dentry *dentry; 495 struct dentry *dentry;
497 496
498 if (!capable(CAP_MKNOD)) 497 if (!capable(CAP_MKNOD))
499 return -XFS_ERROR(EPERM); 498 return -EPERM;
500 if (copy_from_user(&dmhreq, arg, 499 if (copy_from_user(&dmhreq, arg,
501 sizeof(compat_xfs_fsop_setdm_handlereq_t))) 500 sizeof(compat_xfs_fsop_setdm_handlereq_t)))
502 return -XFS_ERROR(EFAULT); 501 return -EFAULT;
503 502
504 dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq); 503 dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq);
505 if (IS_ERR(dentry)) 504 if (IS_ERR(dentry))
506 return PTR_ERR(dentry); 505 return PTR_ERR(dentry);
507 506
508 if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { 507 if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
509 error = -XFS_ERROR(EPERM); 508 error = -EPERM;
510 goto out; 509 goto out;
511 } 510 }
512 511
513 if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) { 512 if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) {
514 error = -XFS_ERROR(EFAULT); 513 error = -EFAULT;
515 goto out; 514 goto out;
516 } 515 }
517 516
518 error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, 517 error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
519 fsd.fsd_dmstate); 518 fsd.fsd_dmstate);
520 519
521out: 520out:
@@ -537,7 +536,7 @@ xfs_file_compat_ioctl(
537 int error; 536 int error;
538 537
539 if (filp->f_mode & FMODE_NOCMTIME) 538 if (filp->f_mode & FMODE_NOCMTIME)
540 ioflags |= IO_INVIS; 539 ioflags |= XFS_IO_INVIS;
541 540
542 trace_xfs_file_compat_ioctl(ip); 541 trace_xfs_file_compat_ioctl(ip);
543 542
@@ -588,7 +587,7 @@ xfs_file_compat_ioctl(
588 struct xfs_flock64 bf; 587 struct xfs_flock64 bf;
589 588
590 if (xfs_compat_flock64_copyin(&bf, arg)) 589 if (xfs_compat_flock64_copyin(&bf, arg))
591 return -XFS_ERROR(EFAULT); 590 return -EFAULT;
592 cmd = _NATIVE_IOC(cmd, struct xfs_flock64); 591 cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
593 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); 592 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
594 } 593 }
@@ -598,25 +597,25 @@ xfs_file_compat_ioctl(
598 struct xfs_growfs_data in; 597 struct xfs_growfs_data in;
599 598
600 if (xfs_compat_growfs_data_copyin(&in, arg)) 599 if (xfs_compat_growfs_data_copyin(&in, arg))
601 return -XFS_ERROR(EFAULT); 600 return -EFAULT;
602 error = mnt_want_write_file(filp); 601 error = mnt_want_write_file(filp);
603 if (error) 602 if (error)
604 return error; 603 return error;
605 error = xfs_growfs_data(mp, &in); 604 error = xfs_growfs_data(mp, &in);
606 mnt_drop_write_file(filp); 605 mnt_drop_write_file(filp);
607 return -error; 606 return error;
608 } 607 }
609 case XFS_IOC_FSGROWFSRT_32: { 608 case XFS_IOC_FSGROWFSRT_32: {
610 struct xfs_growfs_rt in; 609 struct xfs_growfs_rt in;
611 610
612 if (xfs_compat_growfs_rt_copyin(&in, arg)) 611 if (xfs_compat_growfs_rt_copyin(&in, arg))
613 return -XFS_ERROR(EFAULT); 612 return -EFAULT;
614 error = mnt_want_write_file(filp); 613 error = mnt_want_write_file(filp);
615 if (error) 614 if (error)
616 return error; 615 return error;
617 error = xfs_growfs_rt(mp, &in); 616 error = xfs_growfs_rt(mp, &in);
618 mnt_drop_write_file(filp); 617 mnt_drop_write_file(filp);
619 return -error; 618 return error;
620 } 619 }
621#endif 620#endif
622 /* long changes size, but xfs only copiese out 32 bits */ 621 /* long changes size, but xfs only copiese out 32 bits */
@@ -633,13 +632,13 @@ xfs_file_compat_ioctl(
633 if (copy_from_user(&sxp, sxu, 632 if (copy_from_user(&sxp, sxu,
634 offsetof(struct xfs_swapext, sx_stat)) || 633 offsetof(struct xfs_swapext, sx_stat)) ||
635 xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat)) 634 xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
636 return -XFS_ERROR(EFAULT); 635 return -EFAULT;
637 error = mnt_want_write_file(filp); 636 error = mnt_want_write_file(filp);
638 if (error) 637 if (error)
639 return error; 638 return error;
640 error = xfs_ioc_swapext(&sxp); 639 error = xfs_ioc_swapext(&sxp);
641 mnt_drop_write_file(filp); 640 mnt_drop_write_file(filp);
642 return -error; 641 return error;
643 } 642 }
644 case XFS_IOC_FSBULKSTAT_32: 643 case XFS_IOC_FSBULKSTAT_32:
645 case XFS_IOC_FSBULKSTAT_SINGLE_32: 644 case XFS_IOC_FSBULKSTAT_SINGLE_32:
@@ -651,7 +650,7 @@ xfs_file_compat_ioctl(
651 struct xfs_fsop_handlereq hreq; 650 struct xfs_fsop_handlereq hreq;
652 651
653 if (xfs_compat_handlereq_copyin(&hreq, arg)) 652 if (xfs_compat_handlereq_copyin(&hreq, arg))
654 return -XFS_ERROR(EFAULT); 653 return -EFAULT;
655 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq); 654 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq);
656 return xfs_find_handle(cmd, &hreq); 655 return xfs_find_handle(cmd, &hreq);
657 } 656 }
@@ -659,14 +658,14 @@ xfs_file_compat_ioctl(
659 struct xfs_fsop_handlereq hreq; 658 struct xfs_fsop_handlereq hreq;
660 659
661 if (xfs_compat_handlereq_copyin(&hreq, arg)) 660 if (xfs_compat_handlereq_copyin(&hreq, arg))
662 return -XFS_ERROR(EFAULT); 661 return -EFAULT;
663 return xfs_open_by_handle(filp, &hreq); 662 return xfs_open_by_handle(filp, &hreq);
664 } 663 }
665 case XFS_IOC_READLINK_BY_HANDLE_32: { 664 case XFS_IOC_READLINK_BY_HANDLE_32: {
666 struct xfs_fsop_handlereq hreq; 665 struct xfs_fsop_handlereq hreq;
667 666
668 if (xfs_compat_handlereq_copyin(&hreq, arg)) 667 if (xfs_compat_handlereq_copyin(&hreq, arg))
669 return -XFS_ERROR(EFAULT); 668 return -EFAULT;
670 return xfs_readlink_by_handle(filp, &hreq); 669 return xfs_readlink_by_handle(filp, &hreq);
671 } 670 }
672 case XFS_IOC_ATTRLIST_BY_HANDLE_32: 671 case XFS_IOC_ATTRLIST_BY_HANDLE_32:
@@ -676,6 +675,6 @@ xfs_file_compat_ioctl(
676 case XFS_IOC_FSSETDM_BY_HANDLE_32: 675 case XFS_IOC_FSSETDM_BY_HANDLE_32:
677 return xfs_compat_fssetdm_by_handle(filp, arg); 676 return xfs_compat_fssetdm_by_handle(filp, arg);
678 default: 677 default:
679 return -XFS_ERROR(ENOIOCTLCMD); 678 return -ENOIOCTLCMD;
680 } 679 }
681} 680}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 6d3ec2b6ee29..e9c47b6f5e5a 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -110,7 +110,7 @@ xfs_alert_fsblock_zero(
110 (unsigned long long)imap->br_startoff, 110 (unsigned long long)imap->br_startoff,
111 (unsigned long long)imap->br_blockcount, 111 (unsigned long long)imap->br_blockcount,
112 imap->br_state); 112 imap->br_state);
113 return EFSCORRUPTED; 113 return -EFSCORRUPTED;
114} 114}
115 115
116int 116int
@@ -138,7 +138,7 @@ xfs_iomap_write_direct(
138 138
139 error = xfs_qm_dqattach(ip, 0); 139 error = xfs_qm_dqattach(ip, 0);
140 if (error) 140 if (error)
141 return XFS_ERROR(error); 141 return error;
142 142
143 rt = XFS_IS_REALTIME_INODE(ip); 143 rt = XFS_IS_REALTIME_INODE(ip);
144 extsz = xfs_get_extsz_hint(ip); 144 extsz = xfs_get_extsz_hint(ip);
@@ -148,7 +148,7 @@ xfs_iomap_write_direct(
148 if ((offset + count) > XFS_ISIZE(ip)) { 148 if ((offset + count) > XFS_ISIZE(ip)) {
149 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); 149 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
150 if (error) 150 if (error)
151 return XFS_ERROR(error); 151 return error;
152 } else { 152 } else {
153 if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) 153 if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
154 last_fsb = MIN(last_fsb, (xfs_fileoff_t) 154 last_fsb = MIN(last_fsb, (xfs_fileoff_t)
@@ -188,7 +188,7 @@ xfs_iomap_write_direct(
188 */ 188 */
189 if (error) { 189 if (error) {
190 xfs_trans_cancel(tp, 0); 190 xfs_trans_cancel(tp, 0);
191 return XFS_ERROR(error); 191 return error;
192 } 192 }
193 193
194 xfs_ilock(ip, XFS_ILOCK_EXCL); 194 xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -225,7 +225,7 @@ xfs_iomap_write_direct(
225 * Copy any maps to caller's array and return any error. 225 * Copy any maps to caller's array and return any error.
226 */ 226 */
227 if (nimaps == 0) { 227 if (nimaps == 0) {
228 error = XFS_ERROR(ENOSPC); 228 error = -ENOSPC;
229 goto out_unlock; 229 goto out_unlock;
230 } 230 }
231 231
@@ -397,7 +397,8 @@ xfs_quota_calc_throttle(
397 struct xfs_inode *ip, 397 struct xfs_inode *ip,
398 int type, 398 int type,
399 xfs_fsblock_t *qblocks, 399 xfs_fsblock_t *qblocks,
400 int *qshift) 400 int *qshift,
401 int64_t *qfreesp)
401{ 402{
402 int64_t freesp; 403 int64_t freesp;
403 int shift = 0; 404 int shift = 0;
@@ -406,6 +407,7 @@ xfs_quota_calc_throttle(
406 /* over hi wmark, squash the prealloc completely */ 407 /* over hi wmark, squash the prealloc completely */
407 if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { 408 if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
408 *qblocks = 0; 409 *qblocks = 0;
410 *qfreesp = 0;
409 return; 411 return;
410 } 412 }
411 413
@@ -418,6 +420,9 @@ xfs_quota_calc_throttle(
418 shift += 2; 420 shift += 2;
419 } 421 }
420 422
423 if (freesp < *qfreesp)
424 *qfreesp = freesp;
425
421 /* only overwrite the throttle values if we are more aggressive */ 426 /* only overwrite the throttle values if we are more aggressive */
422 if ((freesp >> shift) < (*qblocks >> *qshift)) { 427 if ((freesp >> shift) < (*qblocks >> *qshift)) {
423 *qblocks = freesp; 428 *qblocks = freesp;
@@ -476,15 +481,18 @@ xfs_iomap_prealloc_size(
476 } 481 }
477 482
478 /* 483 /*
479 * Check each quota to cap the prealloc size and provide a shift 484 * Check each quota to cap the prealloc size, provide a shift value to
480 * value to throttle with. 485 * throttle with and adjust amount of available space.
481 */ 486 */
482 if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks)) 487 if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks))
483 xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift); 488 xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift,
489 &freesp);
484 if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks)) 490 if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks))
485 xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift); 491 xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift,
492 &freesp);
486 if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks)) 493 if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks))
487 xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift); 494 xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift,
495 &freesp);
488 496
489 /* 497 /*
490 * The final prealloc size is set to the minimum of free space available 498 * The final prealloc size is set to the minimum of free space available
@@ -552,7 +560,7 @@ xfs_iomap_write_delay(
552 */ 560 */
553 error = xfs_qm_dqattach_locked(ip, 0); 561 error = xfs_qm_dqattach_locked(ip, 0);
554 if (error) 562 if (error)
555 return XFS_ERROR(error); 563 return error;
556 564
557 extsz = xfs_get_extsz_hint(ip); 565 extsz = xfs_get_extsz_hint(ip);
558 offset_fsb = XFS_B_TO_FSBT(mp, offset); 566 offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -596,11 +604,11 @@ retry:
596 imap, &nimaps, XFS_BMAPI_ENTIRE); 604 imap, &nimaps, XFS_BMAPI_ENTIRE);
597 switch (error) { 605 switch (error) {
598 case 0: 606 case 0:
599 case ENOSPC: 607 case -ENOSPC:
600 case EDQUOT: 608 case -EDQUOT:
601 break; 609 break;
602 default: 610 default:
603 return XFS_ERROR(error); 611 return error;
604 } 612 }
605 613
606 /* 614 /*
@@ -614,7 +622,7 @@ retry:
614 error = 0; 622 error = 0;
615 goto retry; 623 goto retry;
616 } 624 }
617 return XFS_ERROR(error ? error : ENOSPC); 625 return error ? error : -ENOSPC;
618 } 626 }
619 627
620 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) 628 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
@@ -663,7 +671,7 @@ xfs_iomap_write_allocate(
663 */ 671 */
664 error = xfs_qm_dqattach(ip, 0); 672 error = xfs_qm_dqattach(ip, 0);
665 if (error) 673 if (error)
666 return XFS_ERROR(error); 674 return error;
667 675
668 offset_fsb = XFS_B_TO_FSBT(mp, offset); 676 offset_fsb = XFS_B_TO_FSBT(mp, offset);
669 count_fsb = imap->br_blockcount; 677 count_fsb = imap->br_blockcount;
@@ -690,7 +698,7 @@ xfs_iomap_write_allocate(
690 nres, 0); 698 nres, 0);
691 if (error) { 699 if (error) {
692 xfs_trans_cancel(tp, 0); 700 xfs_trans_cancel(tp, 0);
693 return XFS_ERROR(error); 701 return error;
694 } 702 }
695 xfs_ilock(ip, XFS_ILOCK_EXCL); 703 xfs_ilock(ip, XFS_ILOCK_EXCL);
696 xfs_trans_ijoin(tp, ip, 0); 704 xfs_trans_ijoin(tp, ip, 0);
@@ -739,7 +747,7 @@ xfs_iomap_write_allocate(
739 if ((map_start_fsb + count_fsb) > last_block) { 747 if ((map_start_fsb + count_fsb) > last_block) {
740 count_fsb = last_block - map_start_fsb; 748 count_fsb = last_block - map_start_fsb;
741 if (count_fsb == 0) { 749 if (count_fsb == 0) {
742 error = EAGAIN; 750 error = -EAGAIN;
743 goto trans_cancel; 751 goto trans_cancel;
744 } 752 }
745 } 753 }
@@ -793,7 +801,7 @@ trans_cancel:
793 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 801 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
794error0: 802error0:
795 xfs_iunlock(ip, XFS_ILOCK_EXCL); 803 xfs_iunlock(ip, XFS_ILOCK_EXCL);
796 return XFS_ERROR(error); 804 return error;
797} 805}
798 806
799int 807int
@@ -853,7 +861,7 @@ xfs_iomap_write_unwritten(
853 resblks, 0); 861 resblks, 0);
854 if (error) { 862 if (error) {
855 xfs_trans_cancel(tp, 0); 863 xfs_trans_cancel(tp, 0);
856 return XFS_ERROR(error); 864 return error;
857 } 865 }
858 866
859 xfs_ilock(ip, XFS_ILOCK_EXCL); 867 xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -892,7 +900,7 @@ xfs_iomap_write_unwritten(
892 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 900 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
893 xfs_iunlock(ip, XFS_ILOCK_EXCL); 901 xfs_iunlock(ip, XFS_ILOCK_EXCL);
894 if (error) 902 if (error)
895 return XFS_ERROR(error); 903 return error;
896 904
897 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) 905 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
898 return xfs_alert_fsblock_zero(ip, &imap); 906 return xfs_alert_fsblock_zero(ip, &imap);
@@ -915,5 +923,5 @@ error_on_bmapi_transaction:
915 xfs_bmap_cancel(&free_list); 923 xfs_bmap_cancel(&free_list);
916 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); 924 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
917 xfs_iunlock(ip, XFS_ILOCK_EXCL); 925 xfs_iunlock(ip, XFS_ILOCK_EXCL);
918 return XFS_ERROR(error); 926 return error;
919} 927}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 205613a06068..72129493e9d3 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -72,7 +72,7 @@ xfs_initxattrs(
72 int error = 0; 72 int error = 0;
73 73
74 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 74 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
75 error = -xfs_attr_set(ip, xattr->name, xattr->value, 75 error = xfs_attr_set(ip, xattr->name, xattr->value,
76 xattr->value_len, ATTR_SECURE); 76 xattr->value_len, ATTR_SECURE);
77 if (error < 0) 77 if (error < 0)
78 break; 78 break;
@@ -93,7 +93,7 @@ xfs_init_security(
93 struct inode *dir, 93 struct inode *dir,
94 const struct qstr *qstr) 94 const struct qstr *qstr)
95{ 95{
96 return -security_inode_init_security(inode, dir, qstr, 96 return security_inode_init_security(inode, dir, qstr,
97 &xfs_initxattrs, NULL); 97 &xfs_initxattrs, NULL);
98} 98}
99 99
@@ -173,12 +173,12 @@ xfs_generic_create(
173 173
174#ifdef CONFIG_XFS_POSIX_ACL 174#ifdef CONFIG_XFS_POSIX_ACL
175 if (default_acl) { 175 if (default_acl) {
176 error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); 176 error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
177 if (error) 177 if (error)
178 goto out_cleanup_inode; 178 goto out_cleanup_inode;
179 } 179 }
180 if (acl) { 180 if (acl) {
181 error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); 181 error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
182 if (error) 182 if (error)
183 goto out_cleanup_inode; 183 goto out_cleanup_inode;
184 } 184 }
@@ -194,7 +194,7 @@ xfs_generic_create(
194 posix_acl_release(default_acl); 194 posix_acl_release(default_acl);
195 if (acl) 195 if (acl)
196 posix_acl_release(acl); 196 posix_acl_release(acl);
197 return -error; 197 return error;
198 198
199 out_cleanup_inode: 199 out_cleanup_inode:
200 if (!tmpfile) 200 if (!tmpfile)
@@ -248,8 +248,8 @@ xfs_vn_lookup(
248 xfs_dentry_to_name(&name, dentry, 0); 248 xfs_dentry_to_name(&name, dentry, 0);
249 error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); 249 error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
250 if (unlikely(error)) { 250 if (unlikely(error)) {
251 if (unlikely(error != ENOENT)) 251 if (unlikely(error != -ENOENT))
252 return ERR_PTR(-error); 252 return ERR_PTR(error);
253 d_add(dentry, NULL); 253 d_add(dentry, NULL);
254 return NULL; 254 return NULL;
255 } 255 }
@@ -275,8 +275,8 @@ xfs_vn_ci_lookup(
275 xfs_dentry_to_name(&xname, dentry, 0); 275 xfs_dentry_to_name(&xname, dentry, 0);
276 error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); 276 error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
277 if (unlikely(error)) { 277 if (unlikely(error)) {
278 if (unlikely(error != ENOENT)) 278 if (unlikely(error != -ENOENT))
279 return ERR_PTR(-error); 279 return ERR_PTR(error);
280 /* 280 /*
281 * call d_add(dentry, NULL) here when d_drop_negative_children 281 * call d_add(dentry, NULL) here when d_drop_negative_children
282 * is called in xfs_vn_mknod (ie. allow negative dentries 282 * is called in xfs_vn_mknod (ie. allow negative dentries
@@ -311,7 +311,7 @@ xfs_vn_link(
311 311
312 error = xfs_link(XFS_I(dir), XFS_I(inode), &name); 312 error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
313 if (unlikely(error)) 313 if (unlikely(error))
314 return -error; 314 return error;
315 315
316 ihold(inode); 316 ihold(inode);
317 d_instantiate(dentry, inode); 317 d_instantiate(dentry, inode);
@@ -328,7 +328,7 @@ xfs_vn_unlink(
328 328
329 xfs_dentry_to_name(&name, dentry, 0); 329 xfs_dentry_to_name(&name, dentry, 0);
330 330
331 error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode)); 331 error = xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
332 if (error) 332 if (error)
333 return error; 333 return error;
334 334
@@ -375,7 +375,7 @@ xfs_vn_symlink(
375 xfs_cleanup_inode(dir, inode, dentry); 375 xfs_cleanup_inode(dir, inode, dentry);
376 iput(inode); 376 iput(inode);
377 out: 377 out:
378 return -error; 378 return error;
379} 379}
380 380
381STATIC int 381STATIC int
@@ -392,8 +392,8 @@ xfs_vn_rename(
392 xfs_dentry_to_name(&oname, odentry, 0); 392 xfs_dentry_to_name(&oname, odentry, 0);
393 xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode); 393 xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode);
394 394
395 return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), 395 return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
396 XFS_I(ndir), &nname, new_inode ? 396 XFS_I(ndir), &nname, new_inode ?
397 XFS_I(new_inode) : NULL); 397 XFS_I(new_inode) : NULL);
398} 398}
399 399
@@ -414,7 +414,7 @@ xfs_vn_follow_link(
414 if (!link) 414 if (!link)
415 goto out_err; 415 goto out_err;
416 416
417 error = -xfs_readlink(XFS_I(dentry->d_inode), link); 417 error = xfs_readlink(XFS_I(dentry->d_inode), link);
418 if (unlikely(error)) 418 if (unlikely(error))
419 goto out_kfree; 419 goto out_kfree;
420 420
@@ -441,7 +441,7 @@ xfs_vn_getattr(
441 trace_xfs_getattr(ip); 441 trace_xfs_getattr(ip);
442 442
443 if (XFS_FORCED_SHUTDOWN(mp)) 443 if (XFS_FORCED_SHUTDOWN(mp))
444 return -XFS_ERROR(EIO); 444 return -EIO;
445 445
446 stat->size = XFS_ISIZE(ip); 446 stat->size = XFS_ISIZE(ip);
447 stat->dev = inode->i_sb->s_dev; 447 stat->dev = inode->i_sb->s_dev;
@@ -546,14 +546,14 @@ xfs_setattr_nonsize(
546 /* If acls are being inherited, we already have this checked */ 546 /* If acls are being inherited, we already have this checked */
547 if (!(flags & XFS_ATTR_NOACL)) { 547 if (!(flags & XFS_ATTR_NOACL)) {
548 if (mp->m_flags & XFS_MOUNT_RDONLY) 548 if (mp->m_flags & XFS_MOUNT_RDONLY)
549 return XFS_ERROR(EROFS); 549 return -EROFS;
550 550
551 if (XFS_FORCED_SHUTDOWN(mp)) 551 if (XFS_FORCED_SHUTDOWN(mp))
552 return XFS_ERROR(EIO); 552 return -EIO;
553 553
554 error = -inode_change_ok(inode, iattr); 554 error = inode_change_ok(inode, iattr);
555 if (error) 555 if (error)
556 return XFS_ERROR(error); 556 return error;
557 } 557 }
558 558
559 ASSERT((mask & ATTR_SIZE) == 0); 559 ASSERT((mask & ATTR_SIZE) == 0);
@@ -703,7 +703,7 @@ xfs_setattr_nonsize(
703 xfs_qm_dqrele(gdqp); 703 xfs_qm_dqrele(gdqp);
704 704
705 if (error) 705 if (error)
706 return XFS_ERROR(error); 706 return error;
707 707
708 /* 708 /*
709 * XXX(hch): Updating the ACL entries is not atomic vs the i_mode 709 * XXX(hch): Updating the ACL entries is not atomic vs the i_mode
@@ -713,9 +713,9 @@ xfs_setattr_nonsize(
713 * Posix ACL code seems to care about this issue either. 713 * Posix ACL code seems to care about this issue either.
714 */ 714 */
715 if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { 715 if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
716 error = -posix_acl_chmod(inode, inode->i_mode); 716 error = posix_acl_chmod(inode, inode->i_mode);
717 if (error) 717 if (error)
718 return XFS_ERROR(error); 718 return error;
719 } 719 }
720 720
721 return 0; 721 return 0;
@@ -748,14 +748,14 @@ xfs_setattr_size(
748 trace_xfs_setattr(ip); 748 trace_xfs_setattr(ip);
749 749
750 if (mp->m_flags & XFS_MOUNT_RDONLY) 750 if (mp->m_flags & XFS_MOUNT_RDONLY)
751 return XFS_ERROR(EROFS); 751 return -EROFS;
752 752
753 if (XFS_FORCED_SHUTDOWN(mp)) 753 if (XFS_FORCED_SHUTDOWN(mp))
754 return XFS_ERROR(EIO); 754 return -EIO;
755 755
756 error = -inode_change_ok(inode, iattr); 756 error = inode_change_ok(inode, iattr);
757 if (error) 757 if (error)
758 return XFS_ERROR(error); 758 return error;
759 759
760 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 760 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
761 ASSERT(S_ISREG(ip->i_d.di_mode)); 761 ASSERT(S_ISREG(ip->i_d.di_mode));
@@ -818,7 +818,7 @@ xfs_setattr_size(
818 * care about here. 818 * care about here.
819 */ 819 */
820 if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { 820 if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
821 error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 821 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
822 ip->i_d.di_size, newsize); 822 ip->i_d.di_size, newsize);
823 if (error) 823 if (error)
824 return error; 824 return error;
@@ -844,7 +844,7 @@ xfs_setattr_size(
844 * much we can do about this, except to hope that the caller sees ENOMEM 844 * much we can do about this, except to hope that the caller sees ENOMEM
845 * and retries the truncate operation. 845 * and retries the truncate operation.
846 */ 846 */
847 error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); 847 error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
848 if (error) 848 if (error)
849 return error; 849 return error;
850 truncate_setsize(inode, newsize); 850 truncate_setsize(inode, newsize);
@@ -950,7 +950,7 @@ xfs_vn_setattr(
950 error = xfs_setattr_nonsize(ip, iattr, 0); 950 error = xfs_setattr_nonsize(ip, iattr, 0);
951 } 951 }
952 952
953 return -error; 953 return error;
954} 954}
955 955
956STATIC int 956STATIC int
@@ -970,7 +970,7 @@ xfs_vn_update_time(
970 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); 970 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
971 if (error) { 971 if (error) {
972 xfs_trans_cancel(tp, 0); 972 xfs_trans_cancel(tp, 0);
973 return -error; 973 return error;
974 } 974 }
975 975
976 xfs_ilock(ip, XFS_ILOCK_EXCL); 976 xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -991,7 +991,7 @@ xfs_vn_update_time(
991 } 991 }
992 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 992 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
993 xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); 993 xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
994 return -xfs_trans_commit(tp, 0); 994 return xfs_trans_commit(tp, 0);
995} 995}
996 996
997#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 997#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -1036,7 +1036,7 @@ xfs_fiemap_format(
1036 *full = 1; /* user array now full */ 1036 *full = 1; /* user array now full */
1037 } 1037 }
1038 1038
1039 return -error; 1039 return error;
1040} 1040}
1041 1041
1042STATIC int 1042STATIC int
@@ -1055,12 +1055,12 @@ xfs_vn_fiemap(
1055 return error; 1055 return error;
1056 1056
1057 /* Set up bmap header for xfs internal routine */ 1057 /* Set up bmap header for xfs internal routine */
1058 bm.bmv_offset = BTOBB(start); 1058 bm.bmv_offset = BTOBBT(start);
1059 /* Special case for whole file */ 1059 /* Special case for whole file */
1060 if (length == FIEMAP_MAX_OFFSET) 1060 if (length == FIEMAP_MAX_OFFSET)
1061 bm.bmv_length = -1LL; 1061 bm.bmv_length = -1LL;
1062 else 1062 else
1063 bm.bmv_length = BTOBB(length); 1063 bm.bmv_length = BTOBB(start + length) - bm.bmv_offset;
1064 1064
1065 /* We add one because in getbmap world count includes the header */ 1065 /* We add one because in getbmap world count includes the header */
1066 bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM : 1066 bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
@@ -1075,7 +1075,7 @@ xfs_vn_fiemap(
1075 1075
1076 error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo); 1076 error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
1077 if (error) 1077 if (error)
1078 return -error; 1078 return error;
1079 1079
1080 return 0; 1080 return 0;
1081} 1081}
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index cb64f222d607..f71be9c68017 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -67,19 +67,17 @@ xfs_bulkstat_one_int(
67 *stat = BULKSTAT_RV_NOTHING; 67 *stat = BULKSTAT_RV_NOTHING;
68 68
69 if (!buffer || xfs_internal_inum(mp, ino)) 69 if (!buffer || xfs_internal_inum(mp, ino))
70 return XFS_ERROR(EINVAL); 70 return -EINVAL;
71 71
72 buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); 72 buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
73 if (!buf) 73 if (!buf)
74 return XFS_ERROR(ENOMEM); 74 return -ENOMEM;
75 75
76 error = xfs_iget(mp, NULL, ino, 76 error = xfs_iget(mp, NULL, ino,
77 (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED), 77 (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED),
78 XFS_ILOCK_SHARED, &ip); 78 XFS_ILOCK_SHARED, &ip);
79 if (error) { 79 if (error)
80 *stat = BULKSTAT_RV_NOTHING;
81 goto out_free; 80 goto out_free;
82 }
83 81
84 ASSERT(ip != NULL); 82 ASSERT(ip != NULL);
85 ASSERT(ip->i_imap.im_blkno != 0); 83 ASSERT(ip->i_imap.im_blkno != 0);
@@ -136,7 +134,6 @@ xfs_bulkstat_one_int(
136 IRELE(ip); 134 IRELE(ip);
137 135
138 error = formatter(buffer, ubsize, ubused, buf); 136 error = formatter(buffer, ubsize, ubused, buf);
139
140 if (!error) 137 if (!error)
141 *stat = BULKSTAT_RV_DIDONE; 138 *stat = BULKSTAT_RV_DIDONE;
142 139
@@ -154,9 +151,9 @@ xfs_bulkstat_one_fmt(
154 const xfs_bstat_t *buffer) 151 const xfs_bstat_t *buffer)
155{ 152{
156 if (ubsize < sizeof(*buffer)) 153 if (ubsize < sizeof(*buffer))
157 return XFS_ERROR(ENOMEM); 154 return -ENOMEM;
158 if (copy_to_user(ubuffer, buffer, sizeof(*buffer))) 155 if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
159 return XFS_ERROR(EFAULT); 156 return -EFAULT;
160 if (ubused) 157 if (ubused)
161 *ubused = sizeof(*buffer); 158 *ubused = sizeof(*buffer);
162 return 0; 159 return 0;
@@ -175,9 +172,170 @@ xfs_bulkstat_one(
175 xfs_bulkstat_one_fmt, ubused, stat); 172 xfs_bulkstat_one_fmt, ubused, stat);
176} 173}
177 174
175/*
176 * Loop over all clusters in a chunk for a given incore inode allocation btree
177 * record. Do a readahead if there are any allocated inodes in that cluster.
178 */
179STATIC void
180xfs_bulkstat_ichunk_ra(
181 struct xfs_mount *mp,
182 xfs_agnumber_t agno,
183 struct xfs_inobt_rec_incore *irec)
184{
185 xfs_agblock_t agbno;
186 struct blk_plug plug;
187 int blks_per_cluster;
188 int inodes_per_cluster;
189 int i; /* inode chunk index */
190
191 agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);
192 blks_per_cluster = xfs_icluster_size_fsb(mp);
193 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
194
195 blk_start_plug(&plug);
196 for (i = 0; i < XFS_INODES_PER_CHUNK;
197 i += inodes_per_cluster, agbno += blks_per_cluster) {
198 if (xfs_inobt_maskn(i, inodes_per_cluster) & ~irec->ir_free) {
199 xfs_btree_reada_bufs(mp, agno, agbno, blks_per_cluster,
200 &xfs_inode_buf_ops);
201 }
202 }
203 blk_finish_plug(&plug);
204}
205
206/*
207 * Lookup the inode chunk that the given inode lives in and then get the record
208 * if we found the chunk. If the inode was not the last in the chunk and there
209 * are some left allocated, update the data for the pointed-to record as well as
210 * return the count of grabbed inodes.
211 */
212STATIC int
213xfs_bulkstat_grab_ichunk(
214 struct xfs_btree_cur *cur, /* btree cursor */
215 xfs_agino_t agino, /* starting inode of chunk */
216 int *icount,/* return # of inodes grabbed */
217 struct xfs_inobt_rec_incore *irec) /* btree record */
218{
219 int idx; /* index into inode chunk */
220 int stat;
221 int error = 0;
222
223 /* Lookup the inode chunk that this inode lives in */
224 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &stat);
225 if (error)
226 return error;
227 if (!stat) {
228 *icount = 0;
229 return error;
230 }
231
232 /* Get the record, should always work */
233 error = xfs_inobt_get_rec(cur, irec, &stat);
234 if (error)
235 return error;
236 XFS_WANT_CORRUPTED_RETURN(stat == 1);
237
238 /* Check if the record contains the inode in request */
239 if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
240 return -EINVAL;
241
242 idx = agino - irec->ir_startino + 1;
243 if (idx < XFS_INODES_PER_CHUNK &&
244 (xfs_inobt_maskn(idx, XFS_INODES_PER_CHUNK - idx) & ~irec->ir_free)) {
245 int i;
246
247 /* We got a right chunk with some left inodes allocated at it.
248 * Grab the chunk record. Mark all the uninteresting inodes
249 * free -- because they're before our start point.
250 */
251 for (i = 0; i < idx; i++) {
252 if (XFS_INOBT_MASK(i) & ~irec->ir_free)
253 irec->ir_freecount++;
254 }
255
256 irec->ir_free |= xfs_inobt_maskn(0, idx);
257 *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount;
258 }
259
260 return 0;
261}
262
178#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) 263#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size)
179 264
180/* 265/*
266 * Process inodes in chunk with a pointer to a formatter function
267 * that will iget the inode and fill in the appropriate structure.
268 */
269int
270xfs_bulkstat_ag_ichunk(
271 struct xfs_mount *mp,
272 xfs_agnumber_t agno,
273 struct xfs_inobt_rec_incore *irbp,
274 bulkstat_one_pf formatter,
275 size_t statstruct_size,
276 struct xfs_bulkstat_agichunk *acp)
277{
278 xfs_ino_t lastino = acp->ac_lastino;
279 char __user **ubufp = acp->ac_ubuffer;
280 int ubleft = acp->ac_ubleft;
281 int ubelem = acp->ac_ubelem;
282 int chunkidx, clustidx;
283 int error = 0;
284 xfs_agino_t agino;
285
286 for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
287 XFS_BULKSTAT_UBLEFT(ubleft) &&
288 irbp->ir_freecount < XFS_INODES_PER_CHUNK;
289 chunkidx++, clustidx++, agino++) {
290 int fmterror; /* bulkstat formatter result */
291 int ubused;
292 xfs_ino_t ino = XFS_AGINO_TO_INO(mp, agno, agino);
293
294 ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
295
296 /* Skip if this inode is free */
297 if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) {
298 lastino = ino;
299 continue;
300 }
301
302 /*
303 * Count used inodes as free so we can tell when the
304 * chunk is used up.
305 */
306 irbp->ir_freecount++;
307
308 /* Get the inode and fill in a single buffer */
309 ubused = statstruct_size;
310 error = formatter(mp, ino, *ubufp, ubleft, &ubused, &fmterror);
311 if (fmterror == BULKSTAT_RV_NOTHING) {
312 if (error && error != -ENOENT && error != -EINVAL) {
313 ubleft = 0;
314 break;
315 }
316 lastino = ino;
317 continue;
318 }
319 if (fmterror == BULKSTAT_RV_GIVEUP) {
320 ubleft = 0;
321 ASSERT(error);
322 break;
323 }
324 if (*ubufp)
325 *ubufp += ubused;
326 ubleft -= ubused;
327 ubelem++;
328 lastino = ino;
329 }
330
331 acp->ac_lastino = lastino;
332 acp->ac_ubleft = ubleft;
333 acp->ac_ubelem = ubelem;
334
335 return error;
336}
337
338/*
181 * Return stat information in bulk (by-inode) for the filesystem. 339 * Return stat information in bulk (by-inode) for the filesystem.
182 */ 340 */
183int /* error status */ 341int /* error status */
@@ -190,13 +348,10 @@ xfs_bulkstat(
190 char __user *ubuffer, /* buffer with inode stats */ 348 char __user *ubuffer, /* buffer with inode stats */
191 int *done) /* 1 if there are more stats to get */ 349 int *done) /* 1 if there are more stats to get */
192{ 350{
193 xfs_agblock_t agbno=0;/* allocation group block number */
194 xfs_buf_t *agbp; /* agi header buffer */ 351 xfs_buf_t *agbp; /* agi header buffer */
195 xfs_agi_t *agi; /* agi header data */ 352 xfs_agi_t *agi; /* agi header data */
196 xfs_agino_t agino; /* inode # in allocation group */ 353 xfs_agino_t agino; /* inode # in allocation group */
197 xfs_agnumber_t agno; /* allocation group number */ 354 xfs_agnumber_t agno; /* allocation group number */
198 int chunkidx; /* current index into inode chunk */
199 int clustidx; /* current index into inode cluster */
200 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ 355 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
201 int end_of_ag; /* set if we've seen the ag end */ 356 int end_of_ag; /* set if we've seen the ag end */
202 int error; /* error code */ 357 int error; /* error code */
@@ -209,8 +364,6 @@ xfs_bulkstat(
209 xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ 364 xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */
210 xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ 365 xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */
211 xfs_ino_t lastino; /* last inode number returned */ 366 xfs_ino_t lastino; /* last inode number returned */
212 int blks_per_cluster; /* # of blocks per cluster */
213 int inodes_per_cluster;/* # of inodes per cluster */
214 int nirbuf; /* size of irbuf */ 367 int nirbuf; /* size of irbuf */
215 int rval; /* return value error code */ 368 int rval; /* return value error code */
216 int tmp; /* result value from btree calls */ 369 int tmp; /* result value from btree calls */
@@ -218,7 +371,6 @@ xfs_bulkstat(
218 int ubleft; /* bytes left in user's buffer */ 371 int ubleft; /* bytes left in user's buffer */
219 char __user *ubufp; /* pointer into user's buffer */ 372 char __user *ubufp; /* pointer into user's buffer */
220 int ubelem; /* spaces used in user's buffer */ 373 int ubelem; /* spaces used in user's buffer */
221 int ubused; /* bytes used by formatter */
222 374
223 /* 375 /*
224 * Get the last inode value, see if there's nothing to do. 376 * Get the last inode value, see if there's nothing to do.
@@ -233,20 +385,16 @@ xfs_bulkstat(
233 *ubcountp = 0; 385 *ubcountp = 0;
234 return 0; 386 return 0;
235 } 387 }
236 if (!ubcountp || *ubcountp <= 0) { 388
237 return EINVAL;
238 }
239 ubcount = *ubcountp; /* statstruct's */ 389 ubcount = *ubcountp; /* statstruct's */
240 ubleft = ubcount * statstruct_size; /* bytes */ 390 ubleft = ubcount * statstruct_size; /* bytes */
241 *ubcountp = ubelem = 0; 391 *ubcountp = ubelem = 0;
242 *done = 0; 392 *done = 0;
243 fmterror = 0; 393 fmterror = 0;
244 ubufp = ubuffer; 394 ubufp = ubuffer;
245 blks_per_cluster = xfs_icluster_size_fsb(mp);
246 inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
247 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); 395 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
248 if (!irbuf) 396 if (!irbuf)
249 return ENOMEM; 397 return -ENOMEM;
250 398
251 nirbuf = irbsize / sizeof(*irbuf); 399 nirbuf = irbsize / sizeof(*irbuf);
252 400
@@ -258,14 +406,8 @@ xfs_bulkstat(
258 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { 406 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
259 cond_resched(); 407 cond_resched();
260 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 408 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
261 if (error) { 409 if (error)
262 /* 410 break;
263 * Skip this allocation group and go to the next one.
264 */
265 agno++;
266 agino = 0;
267 continue;
268 }
269 agi = XFS_BUF_TO_AGI(agbp); 411 agi = XFS_BUF_TO_AGI(agbp);
270 /* 412 /*
271 * Allocate and initialize a btree cursor for ialloc btree. 413 * Allocate and initialize a btree cursor for ialloc btree.
@@ -275,96 +417,39 @@ xfs_bulkstat(
275 irbp = irbuf; 417 irbp = irbuf;
276 irbufend = irbuf + nirbuf; 418 irbufend = irbuf + nirbuf;
277 end_of_ag = 0; 419 end_of_ag = 0;
278 /* 420 icount = 0;
279 * If we're returning in the middle of an allocation group,
280 * we need to get the remainder of the chunk we're in.
281 */
282 if (agino > 0) { 421 if (agino > 0) {
283 xfs_inobt_rec_incore_t r;
284
285 /* 422 /*
286 * Lookup the inode chunk that this inode lives in. 423 * In the middle of an allocation group, we need to get
424 * the remainder of the chunk we're in.
287 */ 425 */
288 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, 426 struct xfs_inobt_rec_incore r;
289 &tmp); 427
290 if (!error && /* no I/O error */ 428 error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r);
291 tmp && /* lookup succeeded */ 429 if (error)
292 /* got the record, should always work */ 430 break;
293 !(error = xfs_inobt_get_rec(cur, &r, &i)) && 431 if (icount) {
294 i == 1 &&
295 /* this is the right chunk */
296 agino < r.ir_startino + XFS_INODES_PER_CHUNK &&
297 /* lastino was not last in chunk */
298 (chunkidx = agino - r.ir_startino + 1) <
299 XFS_INODES_PER_CHUNK &&
300 /* there are some left allocated */
301 xfs_inobt_maskn(chunkidx,
302 XFS_INODES_PER_CHUNK - chunkidx) &
303 ~r.ir_free) {
304 /*
305 * Grab the chunk record. Mark all the
306 * uninteresting inodes (because they're
307 * before our start point) free.
308 */
309 for (i = 0; i < chunkidx; i++) {
310 if (XFS_INOBT_MASK(i) & ~r.ir_free)
311 r.ir_freecount++;
312 }
313 r.ir_free |= xfs_inobt_maskn(0, chunkidx);
314 irbp->ir_startino = r.ir_startino; 432 irbp->ir_startino = r.ir_startino;
315 irbp->ir_freecount = r.ir_freecount; 433 irbp->ir_freecount = r.ir_freecount;
316 irbp->ir_free = r.ir_free; 434 irbp->ir_free = r.ir_free;
317 irbp++; 435 irbp++;
318 agino = r.ir_startino + XFS_INODES_PER_CHUNK; 436 agino = r.ir_startino + XFS_INODES_PER_CHUNK;
319 icount = XFS_INODES_PER_CHUNK - r.ir_freecount;
320 } else {
321 /*
322 * If any of those tests failed, bump the
323 * inode number (just in case).
324 */
325 agino++;
326 icount = 0;
327 } 437 }
328 /* 438 /* Increment to the next record */
329 * In any case, increment to the next record. 439 error = xfs_btree_increment(cur, 0, &tmp);
330 */
331 if (!error)
332 error = xfs_btree_increment(cur, 0, &tmp);
333 } else { 440 } else {
334 /* 441 /* Start of ag. Lookup the first inode chunk */
335 * Start of ag. Lookup the first inode chunk.
336 */
337 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp); 442 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
338 icount = 0;
339 } 443 }
444 if (error)
445 break;
446
340 /* 447 /*
341 * Loop through inode btree records in this ag, 448 * Loop through inode btree records in this ag,
342 * until we run out of inodes or space in the buffer. 449 * until we run out of inodes or space in the buffer.
343 */ 450 */
344 while (irbp < irbufend && icount < ubcount) { 451 while (irbp < irbufend && icount < ubcount) {
345 xfs_inobt_rec_incore_t r; 452 struct xfs_inobt_rec_incore r;
346
347 /*
348 * Loop as long as we're unable to read the
349 * inode btree.
350 */
351 while (error) {
352 agino += XFS_INODES_PER_CHUNK;
353 if (XFS_AGINO_TO_AGBNO(mp, agino) >=
354 be32_to_cpu(agi->agi_length))
355 break;
356 error = xfs_inobt_lookup(cur, agino,
357 XFS_LOOKUP_GE, &tmp);
358 cond_resched();
359 }
360 /*
361 * If ran off the end of the ag either with an error,
362 * or the normal way, set end and stop collecting.
363 */
364 if (error) {
365 end_of_ag = 1;
366 break;
367 }
368 453
369 error = xfs_inobt_get_rec(cur, &r, &i); 454 error = xfs_inobt_get_rec(cur, &r, &i);
370 if (error || i == 0) { 455 if (error || i == 0) {
@@ -377,25 +462,7 @@ xfs_bulkstat(
377 * Also start read-ahead now for this chunk. 462 * Also start read-ahead now for this chunk.
378 */ 463 */
379 if (r.ir_freecount < XFS_INODES_PER_CHUNK) { 464 if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
380 struct blk_plug plug; 465 xfs_bulkstat_ichunk_ra(mp, agno, &r);
381 /*
382 * Loop over all clusters in the next chunk.
383 * Do a readahead if there are any allocated
384 * inodes in that cluster.
385 */
386 blk_start_plug(&plug);
387 agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
388 for (chunkidx = 0;
389 chunkidx < XFS_INODES_PER_CHUNK;
390 chunkidx += inodes_per_cluster,
391 agbno += blks_per_cluster) {
392 if (xfs_inobt_maskn(chunkidx,
393 inodes_per_cluster) & ~r.ir_free)
394 xfs_btree_reada_bufs(mp, agno,
395 agbno, blks_per_cluster,
396 &xfs_inode_buf_ops);
397 }
398 blk_finish_plug(&plug);
399 irbp->ir_startino = r.ir_startino; 466 irbp->ir_startino = r.ir_startino;
400 irbp->ir_freecount = r.ir_freecount; 467 irbp->ir_freecount = r.ir_freecount;
401 irbp->ir_free = r.ir_free; 468 irbp->ir_free = r.ir_free;
@@ -422,57 +489,20 @@ xfs_bulkstat(
422 irbufend = irbp; 489 irbufend = irbp;
423 for (irbp = irbuf; 490 for (irbp = irbuf;
424 irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) { 491 irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) {
425 /* 492 struct xfs_bulkstat_agichunk ac;
426 * Now process this chunk of inodes. 493
427 */ 494 ac.ac_lastino = lastino;
428 for (agino = irbp->ir_startino, chunkidx = clustidx = 0; 495 ac.ac_ubuffer = &ubuffer;
429 XFS_BULKSTAT_UBLEFT(ubleft) && 496 ac.ac_ubleft = ubleft;
430 irbp->ir_freecount < XFS_INODES_PER_CHUNK; 497 ac.ac_ubelem = ubelem;
431 chunkidx++, clustidx++, agino++) { 498 error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
432 ASSERT(chunkidx < XFS_INODES_PER_CHUNK); 499 formatter, statstruct_size, &ac);
433 500 if (error)
434 ino = XFS_AGINO_TO_INO(mp, agno, agino); 501 rval = error;
435 /* 502
436 * Skip if this inode is free. 503 lastino = ac.ac_lastino;
437 */ 504 ubleft = ac.ac_ubleft;
438 if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) { 505 ubelem = ac.ac_ubelem;
439 lastino = ino;
440 continue;
441 }
442 /*
443 * Count used inodes as free so we can tell
444 * when the chunk is used up.
445 */
446 irbp->ir_freecount++;
447
448 /*
449 * Get the inode and fill in a single buffer.
450 */
451 ubused = statstruct_size;
452 error = formatter(mp, ino, ubufp, ubleft,
453 &ubused, &fmterror);
454 if (fmterror == BULKSTAT_RV_NOTHING) {
455 if (error && error != ENOENT &&
456 error != EINVAL) {
457 ubleft = 0;
458 rval = error;
459 break;
460 }
461 lastino = ino;
462 continue;
463 }
464 if (fmterror == BULKSTAT_RV_GIVEUP) {
465 ubleft = 0;
466 ASSERT(error);
467 rval = error;
468 break;
469 }
470 if (ubufp)
471 ubufp += ubused;
472 ubleft -= ubused;
473 ubelem++;
474 lastino = ino;
475 }
476 506
477 cond_resched(); 507 cond_resched();
478 } 508 }
@@ -512,58 +542,10 @@ xfs_bulkstat(
512 return rval; 542 return rval;
513} 543}
514 544
515/*
516 * Return stat information in bulk (by-inode) for the filesystem.
517 * Special case for non-sequential one inode bulkstat.
518 */
519int /* error status */
520xfs_bulkstat_single(
521 xfs_mount_t *mp, /* mount point for filesystem */
522 xfs_ino_t *lastinop, /* inode to return */
523 char __user *buffer, /* buffer with inode stats */
524 int *done) /* 1 if there are more stats to get */
525{
526 int count; /* count value for bulkstat call */
527 int error; /* return value */
528 xfs_ino_t ino; /* filesystem inode number */
529 int res; /* result from bs1 */
530
531 /*
532 * note that requesting valid inode numbers which are not allocated
533 * to inodes will most likely cause xfs_imap_to_bp to generate warning
534 * messages about bad magic numbers. This is ok. The fact that
535 * the inode isn't actually an inode is handled by the
536 * error check below. Done this way to make the usual case faster
537 * at the expense of the error case.
538 */
539
540 ino = *lastinop;
541 error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t),
542 NULL, &res);
543 if (error) {
544 /*
545 * Special case way failed, do it the "long" way
546 * to see if that works.
547 */
548 (*lastinop)--;
549 count = 1;
550 if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one,
551 sizeof(xfs_bstat_t), buffer, done))
552 return error;
553 if (count == 0 || (xfs_ino_t)*lastinop != ino)
554 return error == EFSCORRUPTED ?
555 XFS_ERROR(EINVAL) : error;
556 else
557 return 0;
558 }
559 *done = 0;
560 return 0;
561}
562
563int 545int
564xfs_inumbers_fmt( 546xfs_inumbers_fmt(
565 void __user *ubuffer, /* buffer to write to */ 547 void __user *ubuffer, /* buffer to write to */
566 const xfs_inogrp_t *buffer, /* buffer to read from */ 548 const struct xfs_inogrp *buffer, /* buffer to read from */
567 long count, /* # of elements to read */ 549 long count, /* # of elements to read */
568 long *written) /* # of bytes written */ 550 long *written) /* # of bytes written */
569{ 551{
@@ -578,127 +560,104 @@ xfs_inumbers_fmt(
578 */ 560 */
579int /* error status */ 561int /* error status */
580xfs_inumbers( 562xfs_inumbers(
581 xfs_mount_t *mp, /* mount point for filesystem */ 563 struct xfs_mount *mp,/* mount point for filesystem */
582 xfs_ino_t *lastino, /* last inode returned */ 564 xfs_ino_t *lastino,/* last inode returned */
583 int *count, /* size of buffer/count returned */ 565 int *count,/* size of buffer/count returned */
584 void __user *ubuffer,/* buffer with inode descriptions */ 566 void __user *ubuffer,/* buffer with inode descriptions */
585 inumbers_fmt_pf formatter) 567 inumbers_fmt_pf formatter)
586{ 568{
587 xfs_buf_t *agbp; 569 xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, *lastino);
588 xfs_agino_t agino; 570 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, *lastino);
589 xfs_agnumber_t agno; 571 struct xfs_btree_cur *cur = NULL;
590 int bcount; 572 struct xfs_buf *agbp = NULL;
591 xfs_inogrp_t *buffer; 573 struct xfs_inogrp *buffer;
592 int bufidx; 574 int bcount;
593 xfs_btree_cur_t *cur; 575 int left = *count;
594 int error; 576 int bufidx = 0;
595 xfs_inobt_rec_incore_t r; 577 int error = 0;
596 int i; 578
597 xfs_ino_t ino;
598 int left;
599 int tmp;
600
601 ino = (xfs_ino_t)*lastino;
602 agno = XFS_INO_TO_AGNO(mp, ino);
603 agino = XFS_INO_TO_AGINO(mp, ino);
604 left = *count;
605 *count = 0; 579 *count = 0;
580 if (agno >= mp->m_sb.sb_agcount ||
581 *lastino != XFS_AGINO_TO_INO(mp, agno, agino))
582 return error;
583
606 bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer))); 584 bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer)));
607 buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP); 585 buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP);
608 error = bufidx = 0; 586 do {
609 cur = NULL; 587 struct xfs_inobt_rec_incore r;
610 agbp = NULL; 588 int stat;
611 while (left > 0 && agno < mp->m_sb.sb_agcount) { 589
612 if (agbp == NULL) { 590 if (!agbp) {
613 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 591 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
614 if (error) { 592 if (error)
615 /* 593 break;
616 * If we can't read the AGI of this ag, 594
617 * then just skip to the next one.
618 */
619 ASSERT(cur == NULL);
620 agbp = NULL;
621 agno++;
622 agino = 0;
623 continue;
624 }
625 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, 595 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
626 XFS_BTNUM_INO); 596 XFS_BTNUM_INO);
627 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, 597 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
628 &tmp); 598 &stat);
629 if (error) { 599 if (error)
630 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 600 break;
631 cur = NULL; 601 if (!stat)
632 xfs_buf_relse(agbp); 602 goto next_ag;
633 agbp = NULL;
634 /*
635 * Move up the last inode in the current
636 * chunk. The lookup_ge will always get
637 * us the first inode in the next chunk.
638 */
639 agino += XFS_INODES_PER_CHUNK - 1;
640 continue;
641 }
642 }
643 error = xfs_inobt_get_rec(cur, &r, &i);
644 if (error || i == 0) {
645 xfs_buf_relse(agbp);
646 agbp = NULL;
647 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
648 cur = NULL;
649 agno++;
650 agino = 0;
651 continue;
652 } 603 }
604
605 error = xfs_inobt_get_rec(cur, &r, &stat);
606 if (error)
607 break;
608 if (!stat)
609 goto next_ag;
610
653 agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; 611 agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
654 buffer[bufidx].xi_startino = 612 buffer[bufidx].xi_startino =
655 XFS_AGINO_TO_INO(mp, agno, r.ir_startino); 613 XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
656 buffer[bufidx].xi_alloccount = 614 buffer[bufidx].xi_alloccount =
657 XFS_INODES_PER_CHUNK - r.ir_freecount; 615 XFS_INODES_PER_CHUNK - r.ir_freecount;
658 buffer[bufidx].xi_allocmask = ~r.ir_free; 616 buffer[bufidx].xi_allocmask = ~r.ir_free;
659 bufidx++; 617 if (++bufidx == bcount) {
660 left--; 618 long written;
661 if (bufidx == bcount) { 619
662 long written; 620 error = formatter(ubuffer, buffer, bufidx, &written);
663 if (formatter(ubuffer, buffer, bufidx, &written)) { 621 if (error)
664 error = XFS_ERROR(EFAULT);
665 break; 622 break;
666 }
667 ubuffer += written; 623 ubuffer += written;
668 *count += bufidx; 624 *count += bufidx;
669 bufidx = 0; 625 bufidx = 0;
670 } 626 }
671 if (left) { 627 if (!--left)
672 error = xfs_btree_increment(cur, 0, &tmp); 628 break;
673 if (error) { 629
674 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 630 error = xfs_btree_increment(cur, 0, &stat);
675 cur = NULL; 631 if (error)
676 xfs_buf_relse(agbp); 632 break;
677 agbp = NULL; 633 if (stat)
678 /* 634 continue;
679 * The agino value has already been bumped. 635
680 * Just try to skip up to it. 636next_ag:
681 */ 637 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
682 agino += XFS_INODES_PER_CHUNK; 638 cur = NULL;
683 continue; 639 xfs_buf_relse(agbp);
684 } 640 agbp = NULL;
685 } 641 agino = 0;
686 } 642 } while (++agno < mp->m_sb.sb_agcount);
643
687 if (!error) { 644 if (!error) {
688 if (bufidx) { 645 if (bufidx) {
689 long written; 646 long written;
690 if (formatter(ubuffer, buffer, bufidx, &written)) 647
691 error = XFS_ERROR(EFAULT); 648 error = formatter(ubuffer, buffer, bufidx, &written);
692 else 649 if (!error)
693 *count += bufidx; 650 *count += bufidx;
694 } 651 }
695 *lastino = XFS_AGINO_TO_INO(mp, agno, agino); 652 *lastino = XFS_AGINO_TO_INO(mp, agno, agino);
696 } 653 }
654
697 kmem_free(buffer); 655 kmem_free(buffer);
698 if (cur) 656 if (cur)
699 xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR : 657 xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR :
700 XFS_BTREE_NOERROR)); 658 XFS_BTREE_NOERROR));
701 if (agbp) 659 if (agbp)
702 xfs_buf_relse(agbp); 660 xfs_buf_relse(agbp);
661
703 return error; 662 return error;
704} 663}
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 97295d91d170..aaed08022eb9 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -30,6 +30,22 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
30 int *ubused, 30 int *ubused,
31 int *stat); 31 int *stat);
32 32
33struct xfs_bulkstat_agichunk {
34 xfs_ino_t ac_lastino; /* last inode returned */
35 char __user **ac_ubuffer;/* pointer into user's buffer */
36 int ac_ubleft; /* bytes left in user's buffer */
37 int ac_ubelem; /* spaces used in user's buffer */
38};
39
40int
41xfs_bulkstat_ag_ichunk(
42 struct xfs_mount *mp,
43 xfs_agnumber_t agno,
44 struct xfs_inobt_rec_incore *irbp,
45 bulkstat_one_pf formatter,
46 size_t statstruct_size,
47 struct xfs_bulkstat_agichunk *acp);
48
33/* 49/*
34 * Values for stat return value. 50 * Values for stat return value.
35 */ 51 */
@@ -50,13 +66,6 @@ xfs_bulkstat(
50 char __user *ubuffer,/* buffer with inode stats */ 66 char __user *ubuffer,/* buffer with inode stats */
51 int *done); /* 1 if there are more stats to get */ 67 int *done); /* 1 if there are more stats to get */
52 68
53int
54xfs_bulkstat_single(
55 xfs_mount_t *mp,
56 xfs_ino_t *lastinop,
57 char __user *buffer,
58 int *done);
59
60typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */ 69typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */
61 void __user *ubuffer, /* buffer to write to */ 70 void __user *ubuffer, /* buffer to write to */
62 int ubsize, /* remaining user buffer sz */ 71 int ubsize, /* remaining user buffer sz */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 825249d2dfc1..d10dc8f397c9 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -21,18 +21,6 @@
21#include <linux/types.h> 21#include <linux/types.h>
22 22
23/* 23/*
24 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
25 * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
26 */
27#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
28# define XFS_BIG_BLKNOS 1
29# define XFS_BIG_INUMS 1
30#else
31# define XFS_BIG_BLKNOS 0
32# define XFS_BIG_INUMS 0
33#endif
34
35/*
36 * Kernel specific type declarations for XFS 24 * Kernel specific type declarations for XFS
37 */ 25 */
38typedef signed char __int8_t; 26typedef signed char __int8_t;
@@ -113,7 +101,7 @@ typedef __uint64_t __psunsigned_t;
113#include <asm/byteorder.h> 101#include <asm/byteorder.h>
114#include <asm/unaligned.h> 102#include <asm/unaligned.h>
115 103
116#include "xfs_vnode.h" 104#include "xfs_fs.h"
117#include "xfs_stats.h" 105#include "xfs_stats.h"
118#include "xfs_sysctl.h" 106#include "xfs_sysctl.h"
119#include "xfs_iops.h" 107#include "xfs_iops.h"
@@ -191,6 +179,17 @@ typedef __uint64_t __psunsigned_t;
191#define MAX(a,b) (max(a,b)) 179#define MAX(a,b) (max(a,b))
192#define howmany(x, y) (((x)+((y)-1))/(y)) 180#define howmany(x, y) (((x)+((y)-1))/(y))
193 181
182/*
183 * XFS wrapper structure for sysfs support. It depends on external data
184 * structures and is embedded in various internal data structures to implement
 185 * the XFS sysfs object hierarchy. Define it here for broad access throughout
186 * the codebase.
187 */
188struct xfs_kobj {
189 struct kobject kobject;
190 struct completion complete;
191};
192
194/* Kernel uid/gid conversion. These are used to convert to/from the on disk 193/* Kernel uid/gid conversion. These are used to convert to/from the on disk
195 * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally. 194 * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
196 * The conversion here is type only, the value will remain the same since we 195 * The conversion here is type only, the value will remain the same since we
@@ -331,7 +330,7 @@ static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
331{ 330{
332 x += y - 1; 331 x += y - 1;
333 do_div(x, y); 332 do_div(x, y);
334 return(x * y); 333 return x * y;
335} 334}
336 335
337static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) 336static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 292308dede6d..ca4fd5bd8522 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -34,6 +34,7 @@
34#include "xfs_trace.h" 34#include "xfs_trace.h"
35#include "xfs_fsops.h" 35#include "xfs_fsops.h"
36#include "xfs_cksum.h" 36#include "xfs_cksum.h"
37#include "xfs_sysfs.h"
37 38
38kmem_zone_t *xfs_log_ticket_zone; 39kmem_zone_t *xfs_log_ticket_zone;
39 40
@@ -283,7 +284,7 @@ xlog_grant_head_wait(
283 return 0; 284 return 0;
284shutdown: 285shutdown:
285 list_del_init(&tic->t_queue); 286 list_del_init(&tic->t_queue);
286 return XFS_ERROR(EIO); 287 return -EIO;
287} 288}
288 289
289/* 290/*
@@ -377,7 +378,7 @@ xfs_log_regrant(
377 int error = 0; 378 int error = 0;
378 379
379 if (XLOG_FORCED_SHUTDOWN(log)) 380 if (XLOG_FORCED_SHUTDOWN(log))
380 return XFS_ERROR(EIO); 381 return -EIO;
381 382
382 XFS_STATS_INC(xs_try_logspace); 383 XFS_STATS_INC(xs_try_logspace);
383 384
@@ -446,7 +447,7 @@ xfs_log_reserve(
446 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); 447 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
447 448
448 if (XLOG_FORCED_SHUTDOWN(log)) 449 if (XLOG_FORCED_SHUTDOWN(log))
449 return XFS_ERROR(EIO); 450 return -EIO;
450 451
451 XFS_STATS_INC(xs_try_logspace); 452 XFS_STATS_INC(xs_try_logspace);
452 453
@@ -454,7 +455,7 @@ xfs_log_reserve(
454 tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 455 tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
455 KM_SLEEP | KM_MAYFAIL); 456 KM_SLEEP | KM_MAYFAIL);
456 if (!tic) 457 if (!tic)
457 return XFS_ERROR(ENOMEM); 458 return -ENOMEM;
458 459
459 tic->t_trans_type = t_type; 460 tic->t_trans_type = t_type;
460 *ticp = tic; 461 *ticp = tic;
@@ -590,7 +591,7 @@ xfs_log_release_iclog(
590{ 591{
591 if (xlog_state_release_iclog(mp->m_log, iclog)) { 592 if (xlog_state_release_iclog(mp->m_log, iclog)) {
592 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 593 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
593 return EIO; 594 return -EIO;
594 } 595 }
595 596
596 return 0; 597 return 0;
@@ -628,7 +629,7 @@ xfs_log_mount(
628 629
629 mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); 630 mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
630 if (IS_ERR(mp->m_log)) { 631 if (IS_ERR(mp->m_log)) {
631 error = -PTR_ERR(mp->m_log); 632 error = PTR_ERR(mp->m_log);
632 goto out; 633 goto out;
633 } 634 }
634 635
@@ -652,18 +653,18 @@ xfs_log_mount(
652 xfs_warn(mp, 653 xfs_warn(mp,
653 "Log size %d blocks too small, minimum size is %d blocks", 654 "Log size %d blocks too small, minimum size is %d blocks",
654 mp->m_sb.sb_logblocks, min_logfsbs); 655 mp->m_sb.sb_logblocks, min_logfsbs);
655 error = EINVAL; 656 error = -EINVAL;
656 } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { 657 } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) {
657 xfs_warn(mp, 658 xfs_warn(mp,
658 "Log size %d blocks too large, maximum size is %lld blocks", 659 "Log size %d blocks too large, maximum size is %lld blocks",
659 mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); 660 mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS);
660 error = EINVAL; 661 error = -EINVAL;
661 } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { 662 } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) {
662 xfs_warn(mp, 663 xfs_warn(mp,
663 "log size %lld bytes too large, maximum size is %lld bytes", 664 "log size %lld bytes too large, maximum size is %lld bytes",
664 XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), 665 XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
665 XFS_MAX_LOG_BYTES); 666 XFS_MAX_LOG_BYTES);
666 error = EINVAL; 667 error = -EINVAL;
667 } 668 }
668 if (error) { 669 if (error) {
669 if (xfs_sb_version_hascrc(&mp->m_sb)) { 670 if (xfs_sb_version_hascrc(&mp->m_sb)) {
@@ -707,6 +708,11 @@ xfs_log_mount(
707 } 708 }
708 } 709 }
709 710
711 error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj,
712 "log");
713 if (error)
714 goto out_destroy_ail;
715
710 /* Normal transactions can now occur */ 716 /* Normal transactions can now occur */
711 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 717 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
712 718
@@ -947,6 +953,9 @@ xfs_log_unmount(
947 xfs_log_quiesce(mp); 953 xfs_log_quiesce(mp);
948 954
949 xfs_trans_ail_destroy(mp); 955 xfs_trans_ail_destroy(mp);
956
957 xfs_sysfs_del(&mp->m_log->l_kobj);
958
950 xlog_dealloc_log(mp->m_log); 959 xlog_dealloc_log(mp->m_log);
951} 960}
952 961
@@ -1313,7 +1322,7 @@ xlog_alloc_log(
1313 xlog_in_core_t *iclog, *prev_iclog=NULL; 1322 xlog_in_core_t *iclog, *prev_iclog=NULL;
1314 xfs_buf_t *bp; 1323 xfs_buf_t *bp;
1315 int i; 1324 int i;
1316 int error = ENOMEM; 1325 int error = -ENOMEM;
1317 uint log2_size = 0; 1326 uint log2_size = 0;
1318 1327
1319 log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL); 1328 log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL);
@@ -1340,7 +1349,7 @@ xlog_alloc_log(
1340 xlog_grant_head_init(&log->l_reserve_head); 1349 xlog_grant_head_init(&log->l_reserve_head);
1341 xlog_grant_head_init(&log->l_write_head); 1350 xlog_grant_head_init(&log->l_write_head);
1342 1351
1343 error = EFSCORRUPTED; 1352 error = -EFSCORRUPTED;
1344 if (xfs_sb_version_hassector(&mp->m_sb)) { 1353 if (xfs_sb_version_hassector(&mp->m_sb)) {
1345 log2_size = mp->m_sb.sb_logsectlog; 1354 log2_size = mp->m_sb.sb_logsectlog;
1346 if (log2_size < BBSHIFT) { 1355 if (log2_size < BBSHIFT) {
@@ -1369,8 +1378,14 @@ xlog_alloc_log(
1369 1378
1370 xlog_get_iclog_buffer_size(mp, log); 1379 xlog_get_iclog_buffer_size(mp, log);
1371 1380
1372 error = ENOMEM; 1381 /*
1373 bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); 1382 * Use a NULL block for the extra log buffer used during splits so that
1383 * it will trigger errors if we ever try to do IO on it without first
1384 * having set it up properly.
1385 */
1386 error = -ENOMEM;
1387 bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
1388 BTOBB(log->l_iclog_size), 0);
1374 if (!bp) 1389 if (!bp)
1375 goto out_free_log; 1390 goto out_free_log;
1376 1391
@@ -1463,7 +1478,7 @@ out_free_iclog:
1463out_free_log: 1478out_free_log:
1464 kmem_free(log); 1479 kmem_free(log);
1465out: 1480out:
1466 return ERR_PTR(-error); 1481 return ERR_PTR(error);
1467} /* xlog_alloc_log */ 1482} /* xlog_alloc_log */
1468 1483
1469 1484
@@ -1661,7 +1676,7 @@ xlog_bdstrat(
1661 1676
1662 xfs_buf_lock(bp); 1677 xfs_buf_lock(bp);
1663 if (iclog->ic_state & XLOG_STATE_IOERROR) { 1678 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1664 xfs_buf_ioerror(bp, EIO); 1679 xfs_buf_ioerror(bp, -EIO);
1665 xfs_buf_stale(bp); 1680 xfs_buf_stale(bp);
1666 xfs_buf_ioend(bp, 0); 1681 xfs_buf_ioend(bp, 0);
1667 /* 1682 /*
@@ -2360,7 +2375,7 @@ xlog_write(
2360 2375
2361 ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); 2376 ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
2362 if (!ophdr) 2377 if (!ophdr)
2363 return XFS_ERROR(EIO); 2378 return -EIO;
2364 2379
2365 xlog_write_adv_cnt(&ptr, &len, &log_offset, 2380 xlog_write_adv_cnt(&ptr, &len, &log_offset,
2366 sizeof(struct xlog_op_header)); 2381 sizeof(struct xlog_op_header));
@@ -2859,7 +2874,7 @@ restart:
2859 spin_lock(&log->l_icloglock); 2874 spin_lock(&log->l_icloglock);
2860 if (XLOG_FORCED_SHUTDOWN(log)) { 2875 if (XLOG_FORCED_SHUTDOWN(log)) {
2861 spin_unlock(&log->l_icloglock); 2876 spin_unlock(&log->l_icloglock);
2862 return XFS_ERROR(EIO); 2877 return -EIO;
2863 } 2878 }
2864 2879
2865 iclog = log->l_iclog; 2880 iclog = log->l_iclog;
@@ -3047,7 +3062,7 @@ xlog_state_release_iclog(
3047 int sync = 0; /* do we sync? */ 3062 int sync = 0; /* do we sync? */
3048 3063
3049 if (iclog->ic_state & XLOG_STATE_IOERROR) 3064 if (iclog->ic_state & XLOG_STATE_IOERROR)
3050 return XFS_ERROR(EIO); 3065 return -EIO;
3051 3066
3052 ASSERT(atomic_read(&iclog->ic_refcnt) > 0); 3067 ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
3053 if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) 3068 if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
@@ -3055,7 +3070,7 @@ xlog_state_release_iclog(
3055 3070
3056 if (iclog->ic_state & XLOG_STATE_IOERROR) { 3071 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3057 spin_unlock(&log->l_icloglock); 3072 spin_unlock(&log->l_icloglock);
3058 return XFS_ERROR(EIO); 3073 return -EIO;
3059 } 3074 }
3060 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || 3075 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
3061 iclog->ic_state == XLOG_STATE_WANT_SYNC); 3076 iclog->ic_state == XLOG_STATE_WANT_SYNC);
@@ -3172,7 +3187,7 @@ _xfs_log_force(
3172 iclog = log->l_iclog; 3187 iclog = log->l_iclog;
3173 if (iclog->ic_state & XLOG_STATE_IOERROR) { 3188 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3174 spin_unlock(&log->l_icloglock); 3189 spin_unlock(&log->l_icloglock);
3175 return XFS_ERROR(EIO); 3190 return -EIO;
3176 } 3191 }
3177 3192
3178 /* If the head iclog is not active nor dirty, we just attach 3193 /* If the head iclog is not active nor dirty, we just attach
@@ -3210,7 +3225,7 @@ _xfs_log_force(
3210 spin_unlock(&log->l_icloglock); 3225 spin_unlock(&log->l_icloglock);
3211 3226
3212 if (xlog_state_release_iclog(log, iclog)) 3227 if (xlog_state_release_iclog(log, iclog))
3213 return XFS_ERROR(EIO); 3228 return -EIO;
3214 3229
3215 if (log_flushed) 3230 if (log_flushed)
3216 *log_flushed = 1; 3231 *log_flushed = 1;
@@ -3246,7 +3261,7 @@ maybe_sleep:
3246 */ 3261 */
3247 if (iclog->ic_state & XLOG_STATE_IOERROR) { 3262 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3248 spin_unlock(&log->l_icloglock); 3263 spin_unlock(&log->l_icloglock);
3249 return XFS_ERROR(EIO); 3264 return -EIO;
3250 } 3265 }
3251 XFS_STATS_INC(xs_log_force_sleep); 3266 XFS_STATS_INC(xs_log_force_sleep);
3252 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); 3267 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
@@ -3256,7 +3271,7 @@ maybe_sleep:
3256 * and the memory read should be atomic. 3271 * and the memory read should be atomic.
3257 */ 3272 */
3258 if (iclog->ic_state & XLOG_STATE_IOERROR) 3273 if (iclog->ic_state & XLOG_STATE_IOERROR)
3259 return XFS_ERROR(EIO); 3274 return -EIO;
3260 if (log_flushed) 3275 if (log_flushed)
3261 *log_flushed = 1; 3276 *log_flushed = 1;
3262 } else { 3277 } else {
@@ -3324,7 +3339,7 @@ try_again:
3324 iclog = log->l_iclog; 3339 iclog = log->l_iclog;
3325 if (iclog->ic_state & XLOG_STATE_IOERROR) { 3340 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3326 spin_unlock(&log->l_icloglock); 3341 spin_unlock(&log->l_icloglock);
3327 return XFS_ERROR(EIO); 3342 return -EIO;
3328 } 3343 }
3329 3344
3330 do { 3345 do {
@@ -3375,7 +3390,7 @@ try_again:
3375 xlog_state_switch_iclogs(log, iclog, 0); 3390 xlog_state_switch_iclogs(log, iclog, 0);
3376 spin_unlock(&log->l_icloglock); 3391 spin_unlock(&log->l_icloglock);
3377 if (xlog_state_release_iclog(log, iclog)) 3392 if (xlog_state_release_iclog(log, iclog))
3378 return XFS_ERROR(EIO); 3393 return -EIO;
3379 if (log_flushed) 3394 if (log_flushed)
3380 *log_flushed = 1; 3395 *log_flushed = 1;
3381 spin_lock(&log->l_icloglock); 3396 spin_lock(&log->l_icloglock);
@@ -3390,7 +3405,7 @@ try_again:
3390 */ 3405 */
3391 if (iclog->ic_state & XLOG_STATE_IOERROR) { 3406 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3392 spin_unlock(&log->l_icloglock); 3407 spin_unlock(&log->l_icloglock);
3393 return XFS_ERROR(EIO); 3408 return -EIO;
3394 } 3409 }
3395 XFS_STATS_INC(xs_log_force_sleep); 3410 XFS_STATS_INC(xs_log_force_sleep);
3396 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); 3411 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
@@ -3400,7 +3415,7 @@ try_again:
3400 * and the memory read should be atomic. 3415 * and the memory read should be atomic.
3401 */ 3416 */
3402 if (iclog->ic_state & XLOG_STATE_IOERROR) 3417 if (iclog->ic_state & XLOG_STATE_IOERROR)
3403 return XFS_ERROR(EIO); 3418 return -EIO;
3404 3419
3405 if (log_flushed) 3420 if (log_flushed)
3406 *log_flushed = 1; 3421 *log_flushed = 1;
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index b3425b34e3d5..f6b79e5325dd 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -78,8 +78,6 @@ xlog_cil_init_post_recovery(
78{ 78{
79 log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); 79 log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
80 log->l_cilp->xc_ctx->sequence = 1; 80 log->l_cilp->xc_ctx->sequence = 1;
81 log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
82 log->l_curr_block);
83} 81}
84 82
85/* 83/*
@@ -634,7 +632,7 @@ out_abort_free_ticket:
634 xfs_log_ticket_put(tic); 632 xfs_log_ticket_put(tic);
635out_abort: 633out_abort:
636 xlog_cil_committed(ctx, XFS_LI_ABORTED); 634 xlog_cil_committed(ctx, XFS_LI_ABORTED);
637 return XFS_ERROR(EIO); 635 return -EIO;
638} 636}
639 637
640static void 638static void
@@ -928,12 +926,12 @@ xlog_cil_init(
928 926
929 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); 927 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
930 if (!cil) 928 if (!cil)
931 return ENOMEM; 929 return -ENOMEM;
932 930
933 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); 931 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
934 if (!ctx) { 932 if (!ctx) {
935 kmem_free(cil); 933 kmem_free(cil);
936 return ENOMEM; 934 return -ENOMEM;
937 } 935 }
938 936
939 INIT_WORK(&cil->xc_push_work, xlog_cil_push_work); 937 INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 9bc403a9e54f..db7cbdeb2b42 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -405,6 +405,8 @@ struct xlog {
405 struct xlog_grant_head l_reserve_head; 405 struct xlog_grant_head l_reserve_head;
406 struct xlog_grant_head l_write_head; 406 struct xlog_grant_head l_write_head;
407 407
408 struct xfs_kobj l_kobj;
409
408 /* The following field are used for debugging; need to hold icloglock */ 410 /* The following field are used for debugging; need to hold icloglock */
409#ifdef DEBUG 411#ifdef DEBUG
410 char *l_iclog_bak[XLOG_MAX_ICLOGS]; 412 char *l_iclog_bak[XLOG_MAX_ICLOGS];
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 981af0f6504b..1fd5787add99 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -179,7 +179,7 @@ xlog_bread_noalign(
179 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", 179 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
180 nbblks); 180 nbblks);
181 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 181 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
182 return EFSCORRUPTED; 182 return -EFSCORRUPTED;
183 } 183 }
184 184
185 blk_no = round_down(blk_no, log->l_sectBBsize); 185 blk_no = round_down(blk_no, log->l_sectBBsize);
@@ -194,7 +194,7 @@ xlog_bread_noalign(
194 bp->b_error = 0; 194 bp->b_error = 0;
195 195
196 if (XFS_FORCED_SHUTDOWN(log->l_mp)) 196 if (XFS_FORCED_SHUTDOWN(log->l_mp))
197 return XFS_ERROR(EIO); 197 return -EIO;
198 198
199 xfs_buf_iorequest(bp); 199 xfs_buf_iorequest(bp);
200 error = xfs_buf_iowait(bp); 200 error = xfs_buf_iowait(bp);
@@ -268,7 +268,7 @@ xlog_bwrite(
268 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", 268 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
269 nbblks); 269 nbblks);
270 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 270 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
271 return EFSCORRUPTED; 271 return -EFSCORRUPTED;
272 } 272 }
273 273
274 blk_no = round_down(blk_no, log->l_sectBBsize); 274 blk_no = round_down(blk_no, log->l_sectBBsize);
@@ -330,14 +330,14 @@ xlog_header_check_recover(
330 xlog_header_check_dump(mp, head); 330 xlog_header_check_dump(mp, head);
331 XFS_ERROR_REPORT("xlog_header_check_recover(1)", 331 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
332 XFS_ERRLEVEL_HIGH, mp); 332 XFS_ERRLEVEL_HIGH, mp);
333 return XFS_ERROR(EFSCORRUPTED); 333 return -EFSCORRUPTED;
334 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 334 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
335 xfs_warn(mp, 335 xfs_warn(mp,
336 "dirty log entry has mismatched uuid - can't recover"); 336 "dirty log entry has mismatched uuid - can't recover");
337 xlog_header_check_dump(mp, head); 337 xlog_header_check_dump(mp, head);
338 XFS_ERROR_REPORT("xlog_header_check_recover(2)", 338 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
339 XFS_ERRLEVEL_HIGH, mp); 339 XFS_ERRLEVEL_HIGH, mp);
340 return XFS_ERROR(EFSCORRUPTED); 340 return -EFSCORRUPTED;
341 } 341 }
342 return 0; 342 return 0;
343} 343}
@@ -364,7 +364,7 @@ xlog_header_check_mount(
364 xlog_header_check_dump(mp, head); 364 xlog_header_check_dump(mp, head);
365 XFS_ERROR_REPORT("xlog_header_check_mount", 365 XFS_ERROR_REPORT("xlog_header_check_mount",
366 XFS_ERRLEVEL_HIGH, mp); 366 XFS_ERRLEVEL_HIGH, mp);
367 return XFS_ERROR(EFSCORRUPTED); 367 return -EFSCORRUPTED;
368 } 368 }
369 return 0; 369 return 0;
370} 370}
@@ -462,7 +462,7 @@ xlog_find_verify_cycle(
462 while (!(bp = xlog_get_bp(log, bufblks))) { 462 while (!(bp = xlog_get_bp(log, bufblks))) {
463 bufblks >>= 1; 463 bufblks >>= 1;
464 if (bufblks < log->l_sectBBsize) 464 if (bufblks < log->l_sectBBsize)
465 return ENOMEM; 465 return -ENOMEM;
466 } 466 }
467 467
468 for (i = start_blk; i < start_blk + nbblks; i += bufblks) { 468 for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
@@ -524,7 +524,7 @@ xlog_find_verify_log_record(
524 524
525 if (!(bp = xlog_get_bp(log, num_blks))) { 525 if (!(bp = xlog_get_bp(log, num_blks))) {
526 if (!(bp = xlog_get_bp(log, 1))) 526 if (!(bp = xlog_get_bp(log, 1)))
527 return ENOMEM; 527 return -ENOMEM;
528 smallmem = 1; 528 smallmem = 1;
529 } else { 529 } else {
530 error = xlog_bread(log, start_blk, num_blks, bp, &offset); 530 error = xlog_bread(log, start_blk, num_blks, bp, &offset);
@@ -539,7 +539,7 @@ xlog_find_verify_log_record(
539 xfs_warn(log->l_mp, 539 xfs_warn(log->l_mp,
540 "Log inconsistent (didn't find previous header)"); 540 "Log inconsistent (didn't find previous header)");
541 ASSERT(0); 541 ASSERT(0);
542 error = XFS_ERROR(EIO); 542 error = -EIO;
543 goto out; 543 goto out;
544 } 544 }
545 545
@@ -564,7 +564,7 @@ xlog_find_verify_log_record(
564 * will be called again for the end of the physical log. 564 * will be called again for the end of the physical log.
565 */ 565 */
566 if (i == -1) { 566 if (i == -1) {
567 error = -1; 567 error = 1;
568 goto out; 568 goto out;
569 } 569 }
570 570
@@ -628,7 +628,12 @@ xlog_find_head(
628 int error, log_bbnum = log->l_logBBsize; 628 int error, log_bbnum = log->l_logBBsize;
629 629
630 /* Is the end of the log device zeroed? */ 630 /* Is the end of the log device zeroed? */
631 if ((error = xlog_find_zeroed(log, &first_blk)) == -1) { 631 error = xlog_find_zeroed(log, &first_blk);
632 if (error < 0) {
633 xfs_warn(log->l_mp, "empty log check failed");
634 return error;
635 }
636 if (error == 1) {
632 *return_head_blk = first_blk; 637 *return_head_blk = first_blk;
633 638
634 /* Is the whole lot zeroed? */ 639 /* Is the whole lot zeroed? */
@@ -641,15 +646,12 @@ xlog_find_head(
641 } 646 }
642 647
643 return 0; 648 return 0;
644 } else if (error) {
645 xfs_warn(log->l_mp, "empty log check failed");
646 return error;
647 } 649 }
648 650
649 first_blk = 0; /* get cycle # of 1st block */ 651 first_blk = 0; /* get cycle # of 1st block */
650 bp = xlog_get_bp(log, 1); 652 bp = xlog_get_bp(log, 1);
651 if (!bp) 653 if (!bp)
652 return ENOMEM; 654 return -ENOMEM;
653 655
654 error = xlog_bread(log, 0, 1, bp, &offset); 656 error = xlog_bread(log, 0, 1, bp, &offset);
655 if (error) 657 if (error)
@@ -818,29 +820,29 @@ validate_head:
818 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */ 820 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
819 821
820 /* start ptr at last block ptr before head_blk */ 822 /* start ptr at last block ptr before head_blk */
821 if ((error = xlog_find_verify_log_record(log, start_blk, 823 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
822 &head_blk, 0)) == -1) { 824 if (error == 1)
823 error = XFS_ERROR(EIO); 825 error = -EIO;
824 goto bp_err; 826 if (error)
825 } else if (error)
826 goto bp_err; 827 goto bp_err;
827 } else { 828 } else {
828 start_blk = 0; 829 start_blk = 0;
829 ASSERT(head_blk <= INT_MAX); 830 ASSERT(head_blk <= INT_MAX);
830 if ((error = xlog_find_verify_log_record(log, start_blk, 831 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
831 &head_blk, 0)) == -1) { 832 if (error < 0)
833 goto bp_err;
834 if (error == 1) {
832 /* We hit the beginning of the log during our search */ 835 /* We hit the beginning of the log during our search */
833 start_blk = log_bbnum - (num_scan_bblks - head_blk); 836 start_blk = log_bbnum - (num_scan_bblks - head_blk);
834 new_blk = log_bbnum; 837 new_blk = log_bbnum;
835 ASSERT(start_blk <= INT_MAX && 838 ASSERT(start_blk <= INT_MAX &&
836 (xfs_daddr_t) log_bbnum-start_blk >= 0); 839 (xfs_daddr_t) log_bbnum-start_blk >= 0);
837 ASSERT(head_blk <= INT_MAX); 840 ASSERT(head_blk <= INT_MAX);
838 if ((error = xlog_find_verify_log_record(log, 841 error = xlog_find_verify_log_record(log, start_blk,
839 start_blk, &new_blk, 842 &new_blk, (int)head_blk);
840 (int)head_blk)) == -1) { 843 if (error == 1)
841 error = XFS_ERROR(EIO); 844 error = -EIO;
842 goto bp_err; 845 if (error)
843 } else if (error)
844 goto bp_err; 846 goto bp_err;
845 if (new_blk != log_bbnum) 847 if (new_blk != log_bbnum)
846 head_blk = new_blk; 848 head_blk = new_blk;
@@ -911,7 +913,7 @@ xlog_find_tail(
911 913
912 bp = xlog_get_bp(log, 1); 914 bp = xlog_get_bp(log, 1);
913 if (!bp) 915 if (!bp)
914 return ENOMEM; 916 return -ENOMEM;
915 if (*head_blk == 0) { /* special case */ 917 if (*head_blk == 0) { /* special case */
916 error = xlog_bread(log, 0, 1, bp, &offset); 918 error = xlog_bread(log, 0, 1, bp, &offset);
917 if (error) 919 if (error)
@@ -961,7 +963,7 @@ xlog_find_tail(
961 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); 963 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
962 xlog_put_bp(bp); 964 xlog_put_bp(bp);
963 ASSERT(0); 965 ASSERT(0);
964 return XFS_ERROR(EIO); 966 return -EIO;
965 } 967 }
966 968
967 /* find blk_no of tail of log */ 969 /* find blk_no of tail of log */
@@ -1092,8 +1094,8 @@ done:
1092 * 1094 *
1093 * Return: 1095 * Return:
1094 * 0 => the log is completely written to 1096 * 0 => the log is completely written to
1095 * -1 => use *blk_no as the first block of the log 1097 * 1 => use *blk_no as the first block of the log
1096 * >0 => error has occurred 1098 * <0 => error has occurred
1097 */ 1099 */
1098STATIC int 1100STATIC int
1099xlog_find_zeroed( 1101xlog_find_zeroed(
@@ -1112,7 +1114,7 @@ xlog_find_zeroed(
1112 /* check totally zeroed log */ 1114 /* check totally zeroed log */
1113 bp = xlog_get_bp(log, 1); 1115 bp = xlog_get_bp(log, 1);
1114 if (!bp) 1116 if (!bp)
1115 return ENOMEM; 1117 return -ENOMEM;
1116 error = xlog_bread(log, 0, 1, bp, &offset); 1118 error = xlog_bread(log, 0, 1, bp, &offset);
1117 if (error) 1119 if (error)
1118 goto bp_err; 1120 goto bp_err;
@@ -1121,7 +1123,7 @@ xlog_find_zeroed(
1121 if (first_cycle == 0) { /* completely zeroed log */ 1123 if (first_cycle == 0) { /* completely zeroed log */
1122 *blk_no = 0; 1124 *blk_no = 0;
1123 xlog_put_bp(bp); 1125 xlog_put_bp(bp);
1124 return -1; 1126 return 1;
1125 } 1127 }
1126 1128
1127 /* check partially zeroed log */ 1129 /* check partially zeroed log */
@@ -1141,7 +1143,7 @@ xlog_find_zeroed(
1141 */ 1143 */
1142 xfs_warn(log->l_mp, 1144 xfs_warn(log->l_mp,
1143 "Log inconsistent or not a log (last==0, first!=1)"); 1145 "Log inconsistent or not a log (last==0, first!=1)");
1144 error = XFS_ERROR(EINVAL); 1146 error = -EINVAL;
1145 goto bp_err; 1147 goto bp_err;
1146 } 1148 }
1147 1149
@@ -1179,19 +1181,18 @@ xlog_find_zeroed(
1179 * Potentially backup over partial log record write. We don't need 1181 * Potentially backup over partial log record write. We don't need
1180 * to search the end of the log because we know it is zero. 1182 * to search the end of the log because we know it is zero.
1181 */ 1183 */
1182 if ((error = xlog_find_verify_log_record(log, start_blk, 1184 error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
1183 &last_blk, 0)) == -1) { 1185 if (error == 1)
1184 error = XFS_ERROR(EIO); 1186 error = -EIO;
1185 goto bp_err; 1187 if (error)
1186 } else if (error) 1188 goto bp_err;
1187 goto bp_err;
1188 1189
1189 *blk_no = last_blk; 1190 *blk_no = last_blk;
1190bp_err: 1191bp_err:
1191 xlog_put_bp(bp); 1192 xlog_put_bp(bp);
1192 if (error) 1193 if (error)
1193 return error; 1194 return error;
1194 return -1; 1195 return 1;
1195} 1196}
1196 1197
1197/* 1198/*
@@ -1251,7 +1252,7 @@ xlog_write_log_records(
1251 while (!(bp = xlog_get_bp(log, bufblks))) { 1252 while (!(bp = xlog_get_bp(log, bufblks))) {
1252 bufblks >>= 1; 1253 bufblks >>= 1;
1253 if (bufblks < sectbb) 1254 if (bufblks < sectbb)
1254 return ENOMEM; 1255 return -ENOMEM;
1255 } 1256 }
1256 1257
1257 /* We may need to do a read at the start to fill in part of 1258 /* We may need to do a read at the start to fill in part of
@@ -1354,7 +1355,7 @@ xlog_clear_stale_blocks(
1354 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) { 1355 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1355 XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)", 1356 XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1356 XFS_ERRLEVEL_LOW, log->l_mp); 1357 XFS_ERRLEVEL_LOW, log->l_mp);
1357 return XFS_ERROR(EFSCORRUPTED); 1358 return -EFSCORRUPTED;
1358 } 1359 }
1359 tail_distance = tail_block + (log->l_logBBsize - head_block); 1360 tail_distance = tail_block + (log->l_logBBsize - head_block);
1360 } else { 1361 } else {
@@ -1366,7 +1367,7 @@ xlog_clear_stale_blocks(
1366 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){ 1367 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1367 XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)", 1368 XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1368 XFS_ERRLEVEL_LOW, log->l_mp); 1369 XFS_ERRLEVEL_LOW, log->l_mp);
1369 return XFS_ERROR(EFSCORRUPTED); 1370 return -EFSCORRUPTED;
1370 } 1371 }
1371 tail_distance = tail_block - head_block; 1372 tail_distance = tail_block - head_block;
1372 } 1373 }
@@ -1551,7 +1552,7 @@ xlog_recover_add_to_trans(
1551 xfs_warn(log->l_mp, "%s: bad header magic number", 1552 xfs_warn(log->l_mp, "%s: bad header magic number",
1552 __func__); 1553 __func__);
1553 ASSERT(0); 1554 ASSERT(0);
1554 return XFS_ERROR(EIO); 1555 return -EIO;
1555 } 1556 }
1556 if (len == sizeof(xfs_trans_header_t)) 1557 if (len == sizeof(xfs_trans_header_t))
1557 xlog_recover_add_item(&trans->r_itemq); 1558 xlog_recover_add_item(&trans->r_itemq);
@@ -1581,7 +1582,7 @@ xlog_recover_add_to_trans(
1581 in_f->ilf_size); 1582 in_f->ilf_size);
1582 ASSERT(0); 1583 ASSERT(0);
1583 kmem_free(ptr); 1584 kmem_free(ptr);
1584 return XFS_ERROR(EIO); 1585 return -EIO;
1585 } 1586 }
1586 1587
1587 item->ri_total = in_f->ilf_size; 1588 item->ri_total = in_f->ilf_size;
@@ -1702,7 +1703,7 @@ xlog_recover_reorder_trans(
1702 */ 1703 */
1703 if (!list_empty(&sort_list)) 1704 if (!list_empty(&sort_list))
1704 list_splice_init(&sort_list, &trans->r_itemq); 1705 list_splice_init(&sort_list, &trans->r_itemq);
1705 error = XFS_ERROR(EIO); 1706 error = -EIO;
1706 goto out; 1707 goto out;
1707 } 1708 }
1708 } 1709 }
@@ -1943,7 +1944,7 @@ xlog_recover_do_inode_buffer(
1943 item, bp); 1944 item, bp);
1944 XFS_ERROR_REPORT("xlog_recover_do_inode_buf", 1945 XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1945 XFS_ERRLEVEL_LOW, mp); 1946 XFS_ERRLEVEL_LOW, mp);
1946 return XFS_ERROR(EFSCORRUPTED); 1947 return -EFSCORRUPTED;
1947 } 1948 }
1948 1949
1949 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, 1950 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
@@ -2125,6 +2126,17 @@ xlog_recover_validate_buf_type(
2125 __uint16_t magic16; 2126 __uint16_t magic16;
2126 __uint16_t magicda; 2127 __uint16_t magicda;
2127 2128
2129 /*
2130 * We can only do post recovery validation on items on CRC enabled
2131 * fielsystems as we need to know when the buffer was written to be able
2132 * to determine if we should have replayed the item. If we replay old
2133 * metadata over a newer buffer, then it will enter a temporarily
2134 * inconsistent state resulting in verification failures. Hence for now
2135 * just avoid the verification stage for non-crc filesystems
2136 */
2137 if (!xfs_sb_version_hascrc(&mp->m_sb))
2138 return;
2139
2128 magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); 2140 magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
2129 magic16 = be16_to_cpu(*(__be16*)bp->b_addr); 2141 magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
2130 magicda = be16_to_cpu(info->magic); 2142 magicda = be16_to_cpu(info->magic);
@@ -2162,8 +2174,6 @@ xlog_recover_validate_buf_type(
2162 bp->b_ops = &xfs_agf_buf_ops; 2174 bp->b_ops = &xfs_agf_buf_ops;
2163 break; 2175 break;
2164 case XFS_BLFT_AGFL_BUF: 2176 case XFS_BLFT_AGFL_BUF:
2165 if (!xfs_sb_version_hascrc(&mp->m_sb))
2166 break;
2167 if (magic32 != XFS_AGFL_MAGIC) { 2177 if (magic32 != XFS_AGFL_MAGIC) {
2168 xfs_warn(mp, "Bad AGFL block magic!"); 2178 xfs_warn(mp, "Bad AGFL block magic!");
2169 ASSERT(0); 2179 ASSERT(0);
@@ -2196,10 +2206,6 @@ xlog_recover_validate_buf_type(
2196#endif 2206#endif
2197 break; 2207 break;
2198 case XFS_BLFT_DINO_BUF: 2208 case XFS_BLFT_DINO_BUF:
2199 /*
2200 * we get here with inode allocation buffers, not buffers that
2201 * track unlinked list changes.
2202 */
2203 if (magic16 != XFS_DINODE_MAGIC) { 2209 if (magic16 != XFS_DINODE_MAGIC) {
2204 xfs_warn(mp, "Bad INODE block magic!"); 2210 xfs_warn(mp, "Bad INODE block magic!");
2205 ASSERT(0); 2211 ASSERT(0);
@@ -2279,8 +2285,6 @@ xlog_recover_validate_buf_type(
2279 bp->b_ops = &xfs_attr3_leaf_buf_ops; 2285 bp->b_ops = &xfs_attr3_leaf_buf_ops;
2280 break; 2286 break;
2281 case XFS_BLFT_ATTR_RMT_BUF: 2287 case XFS_BLFT_ATTR_RMT_BUF:
2282 if (!xfs_sb_version_hascrc(&mp->m_sb))
2283 break;
2284 if (magic32 != XFS_ATTR3_RMT_MAGIC) { 2288 if (magic32 != XFS_ATTR3_RMT_MAGIC) {
2285 xfs_warn(mp, "Bad attr remote magic!"); 2289 xfs_warn(mp, "Bad attr remote magic!");
2286 ASSERT(0); 2290 ASSERT(0);
@@ -2387,16 +2391,7 @@ xlog_recover_do_reg_buffer(
2387 /* Shouldn't be any more regions */ 2391 /* Shouldn't be any more regions */
2388 ASSERT(i == item->ri_total); 2392 ASSERT(i == item->ri_total);
2389 2393
2390 /* 2394 xlog_recover_validate_buf_type(mp, bp, buf_f);
2391 * We can only do post recovery validation on items on CRC enabled
2392 * fielsystems as we need to know when the buffer was written to be able
2393 * to determine if we should have replayed the item. If we replay old
2394 * metadata over a newer buffer, then it will enter a temporarily
2395 * inconsistent state resulting in verification failures. Hence for now
2396 * just avoid the verification stage for non-crc filesystems
2397 */
2398 if (xfs_sb_version_hascrc(&mp->m_sb))
2399 xlog_recover_validate_buf_type(mp, bp, buf_f);
2400} 2395}
2401 2396
2402/* 2397/*
@@ -2404,8 +2399,11 @@ xlog_recover_do_reg_buffer(
2404 * Simple algorithm: if we have found a QUOTAOFF log item of the same type 2399 * Simple algorithm: if we have found a QUOTAOFF log item of the same type
2405 * (ie. USR or GRP), then just toss this buffer away; don't recover it. 2400 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2406 * Else, treat it as a regular buffer and do recovery. 2401 * Else, treat it as a regular buffer and do recovery.
2402 *
2403 * Return false if the buffer was tossed and true if we recovered the buffer to
2404 * indicate to the caller if the buffer needs writing.
2407 */ 2405 */
2408STATIC void 2406STATIC bool
2409xlog_recover_do_dquot_buffer( 2407xlog_recover_do_dquot_buffer(
2410 struct xfs_mount *mp, 2408 struct xfs_mount *mp,
2411 struct xlog *log, 2409 struct xlog *log,
@@ -2420,9 +2418,8 @@ xlog_recover_do_dquot_buffer(
2420 /* 2418 /*
2421 * Filesystems are required to send in quota flags at mount time. 2419 * Filesystems are required to send in quota flags at mount time.
2422 */ 2420 */
2423 if (mp->m_qflags == 0) { 2421 if (!mp->m_qflags)
2424 return; 2422 return false;
2425 }
2426 2423
2427 type = 0; 2424 type = 0;
2428 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) 2425 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
@@ -2435,9 +2432,10 @@ xlog_recover_do_dquot_buffer(
2435 * This type of quotas was turned off, so ignore this buffer 2432 * This type of quotas was turned off, so ignore this buffer
2436 */ 2433 */
2437 if (log->l_quotaoffs_flag & type) 2434 if (log->l_quotaoffs_flag & type)
2438 return; 2435 return false;
2439 2436
2440 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2437 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2438 return true;
2441} 2439}
2442 2440
2443/* 2441/*
@@ -2496,7 +2494,7 @@ xlog_recover_buffer_pass2(
2496 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, 2494 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2497 buf_flags, NULL); 2495 buf_flags, NULL);
2498 if (!bp) 2496 if (!bp)
2499 return XFS_ERROR(ENOMEM); 2497 return -ENOMEM;
2500 error = bp->b_error; 2498 error = bp->b_error;
2501 if (error) { 2499 if (error) {
2502 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); 2500 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
@@ -2504,23 +2502,44 @@ xlog_recover_buffer_pass2(
2504 } 2502 }
2505 2503
2506 /* 2504 /*
2507 * recover the buffer only if we get an LSN from it and it's less than 2505 * Recover the buffer only if we get an LSN from it and it's less than
2508 * the lsn of the transaction we are replaying. 2506 * the lsn of the transaction we are replaying.
2507 *
2508 * Note that we have to be extremely careful of readahead here.
2509 * Readahead does not attach verfiers to the buffers so if we don't
2510 * actually do any replay after readahead because of the LSN we found
2511 * in the buffer if more recent than that current transaction then we
2512 * need to attach the verifier directly. Failure to do so can lead to
2513 * future recovery actions (e.g. EFI and unlinked list recovery) can
2514 * operate on the buffers and they won't get the verifier attached. This
2515 * can lead to blocks on disk having the correct content but a stale
2516 * CRC.
2517 *
2518 * It is safe to assume these clean buffers are currently up to date.
2519 * If the buffer is dirtied by a later transaction being replayed, then
2520 * the verifier will be reset to match whatever recover turns that
2521 * buffer into.
2509 */ 2522 */
2510 lsn = xlog_recover_get_buf_lsn(mp, bp); 2523 lsn = xlog_recover_get_buf_lsn(mp, bp);
2511 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) 2524 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2525 xlog_recover_validate_buf_type(mp, bp, buf_f);
2512 goto out_release; 2526 goto out_release;
2527 }
2513 2528
2514 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { 2529 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2515 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2530 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2531 if (error)
2532 goto out_release;
2516 } else if (buf_f->blf_flags & 2533 } else if (buf_f->blf_flags &
2517 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 2534 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2518 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2535 bool dirty;
2536
2537 dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2538 if (!dirty)
2539 goto out_release;
2519 } else { 2540 } else {
2520 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2541 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2521 } 2542 }
2522 if (error)
2523 goto out_release;
2524 2543
2525 /* 2544 /*
2526 * Perform delayed write on the buffer. Asynchronous writes will be 2545 * Perform delayed write on the buffer. Asynchronous writes will be
@@ -2598,7 +2617,7 @@ xfs_recover_inode_owner_change(
2598 2617
2599 ip = xfs_inode_alloc(mp, in_f->ilf_ino); 2618 ip = xfs_inode_alloc(mp, in_f->ilf_ino);
2600 if (!ip) 2619 if (!ip)
2601 return ENOMEM; 2620 return -ENOMEM;
2602 2621
2603 /* instantiate the inode */ 2622 /* instantiate the inode */
2604 xfs_dinode_from_disk(&ip->i_d, dip); 2623 xfs_dinode_from_disk(&ip->i_d, dip);
@@ -2676,7 +2695,7 @@ xlog_recover_inode_pass2(
2676 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, 2695 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
2677 &xfs_inode_buf_ops); 2696 &xfs_inode_buf_ops);
2678 if (!bp) { 2697 if (!bp) {
2679 error = ENOMEM; 2698 error = -ENOMEM;
2680 goto error; 2699 goto error;
2681 } 2700 }
2682 error = bp->b_error; 2701 error = bp->b_error;
@@ -2697,7 +2716,7 @@ xlog_recover_inode_pass2(
2697 __func__, dip, bp, in_f->ilf_ino); 2716 __func__, dip, bp, in_f->ilf_ino);
2698 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", 2717 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2699 XFS_ERRLEVEL_LOW, mp); 2718 XFS_ERRLEVEL_LOW, mp);
2700 error = EFSCORRUPTED; 2719 error = -EFSCORRUPTED;
2701 goto out_release; 2720 goto out_release;
2702 } 2721 }
2703 dicp = item->ri_buf[1].i_addr; 2722 dicp = item->ri_buf[1].i_addr;
@@ -2707,7 +2726,7 @@ xlog_recover_inode_pass2(
2707 __func__, item, in_f->ilf_ino); 2726 __func__, item, in_f->ilf_ino);
2708 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", 2727 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2709 XFS_ERRLEVEL_LOW, mp); 2728 XFS_ERRLEVEL_LOW, mp);
2710 error = EFSCORRUPTED; 2729 error = -EFSCORRUPTED;
2711 goto out_release; 2730 goto out_release;
2712 } 2731 }
2713 2732
@@ -2764,7 +2783,7 @@ xlog_recover_inode_pass2(
2764 "%s: Bad regular inode log record, rec ptr 0x%p, " 2783 "%s: Bad regular inode log record, rec ptr 0x%p, "
2765 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2784 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2766 __func__, item, dip, bp, in_f->ilf_ino); 2785 __func__, item, dip, bp, in_f->ilf_ino);
2767 error = EFSCORRUPTED; 2786 error = -EFSCORRUPTED;
2768 goto out_release; 2787 goto out_release;
2769 } 2788 }
2770 } else if (unlikely(S_ISDIR(dicp->di_mode))) { 2789 } else if (unlikely(S_ISDIR(dicp->di_mode))) {
@@ -2777,7 +2796,7 @@ xlog_recover_inode_pass2(
2777 "%s: Bad dir inode log record, rec ptr 0x%p, " 2796 "%s: Bad dir inode log record, rec ptr 0x%p, "
2778 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2797 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2779 __func__, item, dip, bp, in_f->ilf_ino); 2798 __func__, item, dip, bp, in_f->ilf_ino);
2780 error = EFSCORRUPTED; 2799 error = -EFSCORRUPTED;
2781 goto out_release; 2800 goto out_release;
2782 } 2801 }
2783 } 2802 }
@@ -2790,7 +2809,7 @@ xlog_recover_inode_pass2(
2790 __func__, item, dip, bp, in_f->ilf_ino, 2809 __func__, item, dip, bp, in_f->ilf_ino,
2791 dicp->di_nextents + dicp->di_anextents, 2810 dicp->di_nextents + dicp->di_anextents,
2792 dicp->di_nblocks); 2811 dicp->di_nblocks);
2793 error = EFSCORRUPTED; 2812 error = -EFSCORRUPTED;
2794 goto out_release; 2813 goto out_release;
2795 } 2814 }
2796 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2815 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
@@ -2800,7 +2819,7 @@ xlog_recover_inode_pass2(
2800 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " 2819 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2801 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, 2820 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2802 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); 2821 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2803 error = EFSCORRUPTED; 2822 error = -EFSCORRUPTED;
2804 goto out_release; 2823 goto out_release;
2805 } 2824 }
2806 isize = xfs_icdinode_size(dicp->di_version); 2825 isize = xfs_icdinode_size(dicp->di_version);
@@ -2810,7 +2829,7 @@ xlog_recover_inode_pass2(
2810 xfs_alert(mp, 2829 xfs_alert(mp,
2811 "%s: Bad inode log record length %d, rec ptr 0x%p", 2830 "%s: Bad inode log record length %d, rec ptr 0x%p",
2812 __func__, item->ri_buf[1].i_len, item); 2831 __func__, item->ri_buf[1].i_len, item);
2813 error = EFSCORRUPTED; 2832 error = -EFSCORRUPTED;
2814 goto out_release; 2833 goto out_release;
2815 } 2834 }
2816 2835
@@ -2898,7 +2917,7 @@ xlog_recover_inode_pass2(
2898 default: 2917 default:
2899 xfs_warn(log->l_mp, "%s: Invalid flag", __func__); 2918 xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2900 ASSERT(0); 2919 ASSERT(0);
2901 error = EIO; 2920 error = -EIO;
2902 goto out_release; 2921 goto out_release;
2903 } 2922 }
2904 } 2923 }
@@ -2919,7 +2938,7 @@ out_release:
2919error: 2938error:
2920 if (need_free) 2939 if (need_free)
2921 kmem_free(in_f); 2940 kmem_free(in_f);
2922 return XFS_ERROR(error); 2941 return error;
2923} 2942}
2924 2943
2925/* 2944/*
@@ -2946,7 +2965,7 @@ xlog_recover_quotaoff_pass1(
2946 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) 2965 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2947 log->l_quotaoffs_flag |= XFS_DQ_GROUP; 2966 log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2948 2967
2949 return (0); 2968 return 0;
2950} 2969}
2951 2970
2952/* 2971/*
@@ -2971,17 +2990,17 @@ xlog_recover_dquot_pass2(
2971 * Filesystems are required to send in quota flags at mount time. 2990 * Filesystems are required to send in quota flags at mount time.
2972 */ 2991 */
2973 if (mp->m_qflags == 0) 2992 if (mp->m_qflags == 0)
2974 return (0); 2993 return 0;
2975 2994
2976 recddq = item->ri_buf[1].i_addr; 2995 recddq = item->ri_buf[1].i_addr;
2977 if (recddq == NULL) { 2996 if (recddq == NULL) {
2978 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); 2997 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
2979 return XFS_ERROR(EIO); 2998 return -EIO;
2980 } 2999 }
2981 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { 3000 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2982 xfs_alert(log->l_mp, "dquot too small (%d) in %s.", 3001 xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
2983 item->ri_buf[1].i_len, __func__); 3002 item->ri_buf[1].i_len, __func__);
2984 return XFS_ERROR(EIO); 3003 return -EIO;
2985 } 3004 }
2986 3005
2987 /* 3006 /*
@@ -2990,7 +3009,7 @@ xlog_recover_dquot_pass2(
2990 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); 3009 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
2991 ASSERT(type); 3010 ASSERT(type);
2992 if (log->l_quotaoffs_flag & type) 3011 if (log->l_quotaoffs_flag & type)
2993 return (0); 3012 return 0;
2994 3013
2995 /* 3014 /*
2996 * At this point we know that quota was _not_ turned off. 3015 * At this point we know that quota was _not_ turned off.
@@ -3007,12 +3026,19 @@ xlog_recover_dquot_pass2(
3007 error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 3026 error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
3008 "xlog_recover_dquot_pass2 (log copy)"); 3027 "xlog_recover_dquot_pass2 (log copy)");
3009 if (error) 3028 if (error)
3010 return XFS_ERROR(EIO); 3029 return -EIO;
3011 ASSERT(dq_f->qlf_len == 1); 3030 ASSERT(dq_f->qlf_len == 1);
3012 3031
3032 /*
3033 * At this point we are assuming that the dquots have been allocated
3034 * and hence the buffer has valid dquots stamped in it. It should,
3035 * therefore, pass verifier validation. If the dquot is bad, then the
3036 * we'll return an error here, so we don't need to specifically check
3037 * the dquot in the buffer after the verifier has run.
3038 */
3013 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, 3039 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
3014 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, 3040 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
3015 NULL); 3041 &xfs_dquot_buf_ops);
3016 if (error) 3042 if (error)
3017 return error; 3043 return error;
3018 3044
@@ -3020,18 +3046,6 @@ xlog_recover_dquot_pass2(
3020 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); 3046 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
3021 3047
3022 /* 3048 /*
3023 * At least the magic num portion should be on disk because this
3024 * was among a chunk of dquots created earlier, and we did some
3025 * minimal initialization then.
3026 */
3027 error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
3028 "xlog_recover_dquot_pass2");
3029 if (error) {
3030 xfs_buf_relse(bp);
3031 return XFS_ERROR(EIO);
3032 }
3033
3034 /*
3035 * If the dquot has an LSN in it, recover the dquot only if it's less 3049 * If the dquot has an LSN in it, recover the dquot only if it's less
3036 * than the lsn of the transaction we are replaying. 3050 * than the lsn of the transaction we are replaying.
3037 */ 3051 */
@@ -3178,38 +3192,38 @@ xlog_recover_do_icreate_pass2(
3178 icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; 3192 icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
3179 if (icl->icl_type != XFS_LI_ICREATE) { 3193 if (icl->icl_type != XFS_LI_ICREATE) {
3180 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); 3194 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
3181 return EINVAL; 3195 return -EINVAL;
3182 } 3196 }
3183 3197
3184 if (icl->icl_size != 1) { 3198 if (icl->icl_size != 1) {
3185 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); 3199 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
3186 return EINVAL; 3200 return -EINVAL;
3187 } 3201 }
3188 3202
3189 agno = be32_to_cpu(icl->icl_ag); 3203 agno = be32_to_cpu(icl->icl_ag);
3190 if (agno >= mp->m_sb.sb_agcount) { 3204 if (agno >= mp->m_sb.sb_agcount) {
3191 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); 3205 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
3192 return EINVAL; 3206 return -EINVAL;
3193 } 3207 }
3194 agbno = be32_to_cpu(icl->icl_agbno); 3208 agbno = be32_to_cpu(icl->icl_agbno);
3195 if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { 3209 if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
3196 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); 3210 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
3197 return EINVAL; 3211 return -EINVAL;
3198 } 3212 }
3199 isize = be32_to_cpu(icl->icl_isize); 3213 isize = be32_to_cpu(icl->icl_isize);
3200 if (isize != mp->m_sb.sb_inodesize) { 3214 if (isize != mp->m_sb.sb_inodesize) {
3201 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); 3215 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
3202 return EINVAL; 3216 return -EINVAL;
3203 } 3217 }
3204 count = be32_to_cpu(icl->icl_count); 3218 count = be32_to_cpu(icl->icl_count);
3205 if (!count) { 3219 if (!count) {
3206 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); 3220 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
3207 return EINVAL; 3221 return -EINVAL;
3208 } 3222 }
3209 length = be32_to_cpu(icl->icl_length); 3223 length = be32_to_cpu(icl->icl_length);
3210 if (!length || length >= mp->m_sb.sb_agblocks) { 3224 if (!length || length >= mp->m_sb.sb_agblocks) {
3211 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); 3225 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
3212 return EINVAL; 3226 return -EINVAL;
3213 } 3227 }
3214 3228
3215 /* existing allocation is fixed value */ 3229 /* existing allocation is fixed value */
@@ -3218,7 +3232,7 @@ xlog_recover_do_icreate_pass2(
3218 if (count != mp->m_ialloc_inos || 3232 if (count != mp->m_ialloc_inos ||
3219 length != mp->m_ialloc_blks) { 3233 length != mp->m_ialloc_blks) {
3220 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); 3234 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
3221 return EINVAL; 3235 return -EINVAL;
3222 } 3236 }
3223 3237
3224 /* 3238 /*
@@ -3389,7 +3403,7 @@ xlog_recover_commit_pass1(
3389 xfs_warn(log->l_mp, "%s: invalid item type (%d)", 3403 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
3390 __func__, ITEM_TYPE(item)); 3404 __func__, ITEM_TYPE(item));
3391 ASSERT(0); 3405 ASSERT(0);
3392 return XFS_ERROR(EIO); 3406 return -EIO;
3393 } 3407 }
3394} 3408}
3395 3409
@@ -3425,7 +3439,7 @@ xlog_recover_commit_pass2(
3425 xfs_warn(log->l_mp, "%s: invalid item type (%d)", 3439 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
3426 __func__, ITEM_TYPE(item)); 3440 __func__, ITEM_TYPE(item));
3427 ASSERT(0); 3441 ASSERT(0);
3428 return XFS_ERROR(EIO); 3442 return -EIO;
3429 } 3443 }
3430} 3444}
3431 3445
@@ -3560,7 +3574,7 @@ xlog_recover_process_data(
3560 3574
3561 /* check the log format matches our own - else we can't recover */ 3575 /* check the log format matches our own - else we can't recover */
3562 if (xlog_header_check_recover(log->l_mp, rhead)) 3576 if (xlog_header_check_recover(log->l_mp, rhead))
3563 return (XFS_ERROR(EIO)); 3577 return -EIO;
3564 3578
3565 while ((dp < lp) && num_logops) { 3579 while ((dp < lp) && num_logops) {
3566 ASSERT(dp + sizeof(xlog_op_header_t) <= lp); 3580 ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
@@ -3571,7 +3585,7 @@ xlog_recover_process_data(
3571 xfs_warn(log->l_mp, "%s: bad clientid 0x%x", 3585 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
3572 __func__, ohead->oh_clientid); 3586 __func__, ohead->oh_clientid);
3573 ASSERT(0); 3587 ASSERT(0);
3574 return (XFS_ERROR(EIO)); 3588 return -EIO;
3575 } 3589 }
3576 tid = be32_to_cpu(ohead->oh_tid); 3590 tid = be32_to_cpu(ohead->oh_tid);
3577 hash = XLOG_RHASH(tid); 3591 hash = XLOG_RHASH(tid);
@@ -3585,7 +3599,7 @@ xlog_recover_process_data(
3585 xfs_warn(log->l_mp, "%s: bad length 0x%x", 3599 xfs_warn(log->l_mp, "%s: bad length 0x%x",
3586 __func__, be32_to_cpu(ohead->oh_len)); 3600 __func__, be32_to_cpu(ohead->oh_len));
3587 WARN_ON(1); 3601 WARN_ON(1);
3588 return (XFS_ERROR(EIO)); 3602 return -EIO;
3589 } 3603 }
3590 flags = ohead->oh_flags & ~XLOG_END_TRANS; 3604 flags = ohead->oh_flags & ~XLOG_END_TRANS;
3591 if (flags & XLOG_WAS_CONT_TRANS) 3605 if (flags & XLOG_WAS_CONT_TRANS)
@@ -3607,7 +3621,7 @@ xlog_recover_process_data(
3607 xfs_warn(log->l_mp, "%s: bad transaction", 3621 xfs_warn(log->l_mp, "%s: bad transaction",
3608 __func__); 3622 __func__);
3609 ASSERT(0); 3623 ASSERT(0);
3610 error = XFS_ERROR(EIO); 3624 error = -EIO;
3611 break; 3625 break;
3612 case 0: 3626 case 0:
3613 case XLOG_CONTINUE_TRANS: 3627 case XLOG_CONTINUE_TRANS:
@@ -3618,7 +3632,7 @@ xlog_recover_process_data(
3618 xfs_warn(log->l_mp, "%s: bad flag 0x%x", 3632 xfs_warn(log->l_mp, "%s: bad flag 0x%x",
3619 __func__, flags); 3633 __func__, flags);
3620 ASSERT(0); 3634 ASSERT(0);
3621 error = XFS_ERROR(EIO); 3635 error = -EIO;
3622 break; 3636 break;
3623 } 3637 }
3624 if (error) { 3638 if (error) {
@@ -3669,7 +3683,7 @@ xlog_recover_process_efi(
3669 */ 3683 */
3670 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); 3684 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3671 xfs_efi_release(efip, efip->efi_format.efi_nextents); 3685 xfs_efi_release(efip, efip->efi_format.efi_nextents);
3672 return XFS_ERROR(EIO); 3686 return -EIO;
3673 } 3687 }
3674 } 3688 }
3675 3689
@@ -3969,7 +3983,7 @@ xlog_unpack_data_crc(
3969 * CRC protection by punting an error back up the stack. 3983 * CRC protection by punting an error back up the stack.
3970 */ 3984 */
3971 if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) 3985 if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
3972 return EFSCORRUPTED; 3986 return -EFSCORRUPTED;
3973 } 3987 }
3974 3988
3975 return 0; 3989 return 0;
@@ -4018,14 +4032,14 @@ xlog_valid_rec_header(
4018 if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) { 4032 if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
4019 XFS_ERROR_REPORT("xlog_valid_rec_header(1)", 4033 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
4020 XFS_ERRLEVEL_LOW, log->l_mp); 4034 XFS_ERRLEVEL_LOW, log->l_mp);
4021 return XFS_ERROR(EFSCORRUPTED); 4035 return -EFSCORRUPTED;
4022 } 4036 }
4023 if (unlikely( 4037 if (unlikely(
4024 (!rhead->h_version || 4038 (!rhead->h_version ||
4025 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { 4039 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
4026 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", 4040 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
4027 __func__, be32_to_cpu(rhead->h_version)); 4041 __func__, be32_to_cpu(rhead->h_version));
4028 return XFS_ERROR(EIO); 4042 return -EIO;
4029 } 4043 }
4030 4044
4031 /* LR body must have data or it wouldn't have been written */ 4045 /* LR body must have data or it wouldn't have been written */
@@ -4033,12 +4047,12 @@ xlog_valid_rec_header(
4033 if (unlikely( hlen <= 0 || hlen > INT_MAX )) { 4047 if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
4034 XFS_ERROR_REPORT("xlog_valid_rec_header(2)", 4048 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
4035 XFS_ERRLEVEL_LOW, log->l_mp); 4049 XFS_ERRLEVEL_LOW, log->l_mp);
4036 return XFS_ERROR(EFSCORRUPTED); 4050 return -EFSCORRUPTED;
4037 } 4051 }
4038 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) { 4052 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
4039 XFS_ERROR_REPORT("xlog_valid_rec_header(3)", 4053 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
4040 XFS_ERRLEVEL_LOW, log->l_mp); 4054 XFS_ERRLEVEL_LOW, log->l_mp);
4041 return XFS_ERROR(EFSCORRUPTED); 4055 return -EFSCORRUPTED;
4042 } 4056 }
4043 return 0; 4057 return 0;
4044} 4058}
@@ -4081,7 +4095,7 @@ xlog_do_recovery_pass(
4081 */ 4095 */
4082 hbp = xlog_get_bp(log, 1); 4096 hbp = xlog_get_bp(log, 1);
4083 if (!hbp) 4097 if (!hbp)
4084 return ENOMEM; 4098 return -ENOMEM;
4085 4099
4086 error = xlog_bread(log, tail_blk, 1, hbp, &offset); 4100 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
4087 if (error) 4101 if (error)
@@ -4110,11 +4124,11 @@ xlog_do_recovery_pass(
4110 } 4124 }
4111 4125
4112 if (!hbp) 4126 if (!hbp)
4113 return ENOMEM; 4127 return -ENOMEM;
4114 dbp = xlog_get_bp(log, BTOBB(h_size)); 4128 dbp = xlog_get_bp(log, BTOBB(h_size));
4115 if (!dbp) { 4129 if (!dbp) {
4116 xlog_put_bp(hbp); 4130 xlog_put_bp(hbp);
4117 return ENOMEM; 4131 return -ENOMEM;
4118 } 4132 }
4119 4133
4120 memset(rhash, 0, sizeof(rhash)); 4134 memset(rhash, 0, sizeof(rhash));
@@ -4388,7 +4402,7 @@ xlog_do_recover(
4388 * If IO errors happened during recovery, bail out. 4402 * If IO errors happened during recovery, bail out.
4389 */ 4403 */
4390 if (XFS_FORCED_SHUTDOWN(log->l_mp)) { 4404 if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
4391 return (EIO); 4405 return -EIO;
4392 } 4406 }
4393 4407
4394 /* 4408 /*
@@ -4415,7 +4429,7 @@ xlog_do_recover(
4415 4429
4416 if (XFS_FORCED_SHUTDOWN(log->l_mp)) { 4430 if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
4417 xfs_buf_relse(bp); 4431 xfs_buf_relse(bp);
4418 return XFS_ERROR(EIO); 4432 return -EIO;
4419 } 4433 }
4420 4434
4421 xfs_buf_iorequest(bp); 4435 xfs_buf_iorequest(bp);
@@ -4492,7 +4506,7 @@ xlog_recover(
4492"Please recover the log on a kernel that supports the unknown features.", 4506"Please recover the log on a kernel that supports the unknown features.",
4493 (log->l_mp->m_sb.sb_features_log_incompat & 4507 (log->l_mp->m_sb.sb_features_log_incompat &
4494 XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); 4508 XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
4495 return EINVAL; 4509 return -EINVAL;
4496 } 4510 }
4497 4511
4498 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", 4512 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 3507cd0ec400..fbf0384a466f 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -42,6 +42,7 @@
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_icache.h" 43#include "xfs_icache.h"
44#include "xfs_dinode.h" 44#include "xfs_dinode.h"
45#include "xfs_sysfs.h"
45 46
46 47
47#ifdef HAVE_PERCPU_SB 48#ifdef HAVE_PERCPU_SB
@@ -60,6 +61,8 @@ static DEFINE_MUTEX(xfs_uuid_table_mutex);
60static int xfs_uuid_table_size; 61static int xfs_uuid_table_size;
61static uuid_t *xfs_uuid_table; 62static uuid_t *xfs_uuid_table;
62 63
64extern struct kset *xfs_kset;
65
63/* 66/*
64 * See if the UUID is unique among mounted XFS filesystems. 67 * See if the UUID is unique among mounted XFS filesystems.
65 * Mount fails if UUID is nil or a FS with the same UUID is already mounted. 68 * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
@@ -76,7 +79,7 @@ xfs_uuid_mount(
76 79
77 if (uuid_is_nil(uuid)) { 80 if (uuid_is_nil(uuid)) {
78 xfs_warn(mp, "Filesystem has nil UUID - can't mount"); 81 xfs_warn(mp, "Filesystem has nil UUID - can't mount");
79 return XFS_ERROR(EINVAL); 82 return -EINVAL;
80 } 83 }
81 84
82 mutex_lock(&xfs_uuid_table_mutex); 85 mutex_lock(&xfs_uuid_table_mutex);
@@ -104,7 +107,7 @@ xfs_uuid_mount(
104 out_duplicate: 107 out_duplicate:
105 mutex_unlock(&xfs_uuid_table_mutex); 108 mutex_unlock(&xfs_uuid_table_mutex);
106 xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid); 109 xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
107 return XFS_ERROR(EINVAL); 110 return -EINVAL;
108} 111}
109 112
110STATIC void 113STATIC void
@@ -173,13 +176,9 @@ xfs_sb_validate_fsb_count(
173 ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); 176 ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
174 ASSERT(sbp->sb_blocklog >= BBSHIFT); 177 ASSERT(sbp->sb_blocklog >= BBSHIFT);
175 178
176#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ 179 /* Limited by ULONG_MAX of page cache index */
177 if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) 180 if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
178 return EFBIG; 181 return -EFBIG;
179#else /* Limited by UINT_MAX of sectors */
180 if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
181 return EFBIG;
182#endif
183 return 0; 182 return 0;
184} 183}
185 184
@@ -250,9 +249,9 @@ xfs_initialize_perag(
250 mp->m_flags &= ~XFS_MOUNT_32BITINODES; 249 mp->m_flags &= ~XFS_MOUNT_32BITINODES;
251 250
252 if (mp->m_flags & XFS_MOUNT_32BITINODES) 251 if (mp->m_flags & XFS_MOUNT_32BITINODES)
253 index = xfs_set_inode32(mp); 252 index = xfs_set_inode32(mp, agcount);
254 else 253 else
255 index = xfs_set_inode64(mp); 254 index = xfs_set_inode64(mp, agcount);
256 255
257 if (maxagi) 256 if (maxagi)
258 *maxagi = index; 257 *maxagi = index;
@@ -308,15 +307,15 @@ reread:
308 if (!bp) { 307 if (!bp) {
309 if (loud) 308 if (loud)
310 xfs_warn(mp, "SB buffer read failed"); 309 xfs_warn(mp, "SB buffer read failed");
311 return EIO; 310 return -EIO;
312 } 311 }
313 if (bp->b_error) { 312 if (bp->b_error) {
314 error = bp->b_error; 313 error = bp->b_error;
315 if (loud) 314 if (loud)
316 xfs_warn(mp, "SB validate failed with error %d.", error); 315 xfs_warn(mp, "SB validate failed with error %d.", error);
317 /* bad CRC means corrupted metadata */ 316 /* bad CRC means corrupted metadata */
318 if (error == EFSBADCRC) 317 if (error == -EFSBADCRC)
319 error = EFSCORRUPTED; 318 error = -EFSCORRUPTED;
320 goto release_buf; 319 goto release_buf;
321 } 320 }
322 321
@@ -324,7 +323,6 @@ reread:
324 * Initialize the mount structure from the superblock. 323 * Initialize the mount structure from the superblock.
325 */ 324 */
326 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); 325 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
327 xfs_sb_quota_from_disk(sbp);
328 326
329 /* 327 /*
330 * If we haven't validated the superblock, do so now before we try 328 * If we haven't validated the superblock, do so now before we try
@@ -333,7 +331,7 @@ reread:
333 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 331 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
334 if (loud) 332 if (loud)
335 xfs_warn(mp, "Invalid superblock magic number"); 333 xfs_warn(mp, "Invalid superblock magic number");
336 error = EINVAL; 334 error = -EINVAL;
337 goto release_buf; 335 goto release_buf;
338 } 336 }
339 337
@@ -344,7 +342,7 @@ reread:
344 if (loud) 342 if (loud)
345 xfs_warn(mp, "device supports %u byte sectors (not %u)", 343 xfs_warn(mp, "device supports %u byte sectors (not %u)",
346 sector_size, sbp->sb_sectsize); 344 sector_size, sbp->sb_sectsize);
347 error = ENOSYS; 345 error = -ENOSYS;
348 goto release_buf; 346 goto release_buf;
349 } 347 }
350 348
@@ -392,7 +390,7 @@ xfs_update_alignment(xfs_mount_t *mp)
392 xfs_warn(mp, 390 xfs_warn(mp,
393 "alignment check failed: sunit/swidth vs. blocksize(%d)", 391 "alignment check failed: sunit/swidth vs. blocksize(%d)",
394 sbp->sb_blocksize); 392 sbp->sb_blocksize);
395 return XFS_ERROR(EINVAL); 393 return -EINVAL;
396 } else { 394 } else {
397 /* 395 /*
398 * Convert the stripe unit and width to FSBs. 396 * Convert the stripe unit and width to FSBs.
@@ -402,14 +400,14 @@ xfs_update_alignment(xfs_mount_t *mp)
402 xfs_warn(mp, 400 xfs_warn(mp,
403 "alignment check failed: sunit/swidth vs. agsize(%d)", 401 "alignment check failed: sunit/swidth vs. agsize(%d)",
404 sbp->sb_agblocks); 402 sbp->sb_agblocks);
405 return XFS_ERROR(EINVAL); 403 return -EINVAL;
406 } else if (mp->m_dalign) { 404 } else if (mp->m_dalign) {
407 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); 405 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
408 } else { 406 } else {
409 xfs_warn(mp, 407 xfs_warn(mp,
410 "alignment check failed: sunit(%d) less than bsize(%d)", 408 "alignment check failed: sunit(%d) less than bsize(%d)",
411 mp->m_dalign, sbp->sb_blocksize); 409 mp->m_dalign, sbp->sb_blocksize);
412 return XFS_ERROR(EINVAL); 410 return -EINVAL;
413 } 411 }
414 } 412 }
415 413
@@ -429,7 +427,7 @@ xfs_update_alignment(xfs_mount_t *mp)
429 } else { 427 } else {
430 xfs_warn(mp, 428 xfs_warn(mp,
431 "cannot change alignment: superblock does not support data alignment"); 429 "cannot change alignment: superblock does not support data alignment");
432 return XFS_ERROR(EINVAL); 430 return -EINVAL;
433 } 431 }
434 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && 432 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
435 xfs_sb_version_hasdalign(&mp->m_sb)) { 433 xfs_sb_version_hasdalign(&mp->m_sb)) {
@@ -556,14 +554,14 @@ xfs_check_sizes(xfs_mount_t *mp)
556 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 554 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
557 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 555 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
558 xfs_warn(mp, "filesystem size mismatch detected"); 556 xfs_warn(mp, "filesystem size mismatch detected");
559 return XFS_ERROR(EFBIG); 557 return -EFBIG;
560 } 558 }
561 bp = xfs_buf_read_uncached(mp->m_ddev_targp, 559 bp = xfs_buf_read_uncached(mp->m_ddev_targp,
562 d - XFS_FSS_TO_BB(mp, 1), 560 d - XFS_FSS_TO_BB(mp, 1),
563 XFS_FSS_TO_BB(mp, 1), 0, NULL); 561 XFS_FSS_TO_BB(mp, 1), 0, NULL);
564 if (!bp) { 562 if (!bp) {
565 xfs_warn(mp, "last sector read failed"); 563 xfs_warn(mp, "last sector read failed");
566 return EIO; 564 return -EIO;
567 } 565 }
568 xfs_buf_relse(bp); 566 xfs_buf_relse(bp);
569 567
@@ -571,14 +569,14 @@ xfs_check_sizes(xfs_mount_t *mp)
571 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 569 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
572 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 570 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
573 xfs_warn(mp, "log size mismatch detected"); 571 xfs_warn(mp, "log size mismatch detected");
574 return XFS_ERROR(EFBIG); 572 return -EFBIG;
575 } 573 }
576 bp = xfs_buf_read_uncached(mp->m_logdev_targp, 574 bp = xfs_buf_read_uncached(mp->m_logdev_targp,
577 d - XFS_FSB_TO_BB(mp, 1), 575 d - XFS_FSB_TO_BB(mp, 1),
578 XFS_FSB_TO_BB(mp, 1), 0, NULL); 576 XFS_FSB_TO_BB(mp, 1), 0, NULL);
579 if (!bp) { 577 if (!bp) {
580 xfs_warn(mp, "log device read failed"); 578 xfs_warn(mp, "log device read failed");
581 return EIO; 579 return -EIO;
582 } 580 }
583 xfs_buf_relse(bp); 581 xfs_buf_relse(bp);
584 } 582 }
@@ -731,10 +729,15 @@ xfs_mountfs(
731 729
732 xfs_set_maxicount(mp); 730 xfs_set_maxicount(mp);
733 731
734 error = xfs_uuid_mount(mp); 732 mp->m_kobj.kobject.kset = xfs_kset;
733 error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
735 if (error) 734 if (error)
736 goto out; 735 goto out;
737 736
737 error = xfs_uuid_mount(mp);
738 if (error)
739 goto out_remove_sysfs;
740
738 /* 741 /*
739 * Set the minimum read and write sizes 742 * Set the minimum read and write sizes
740 */ 743 */
@@ -816,7 +819,7 @@ xfs_mountfs(
816 if (!sbp->sb_logblocks) { 819 if (!sbp->sb_logblocks) {
817 xfs_warn(mp, "no log defined"); 820 xfs_warn(mp, "no log defined");
818 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); 821 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
819 error = XFS_ERROR(EFSCORRUPTED); 822 error = -EFSCORRUPTED;
820 goto out_free_perag; 823 goto out_free_perag;
821 } 824 }
822 825
@@ -855,7 +858,7 @@ xfs_mountfs(
855 !mp->m_sb.sb_inprogress) { 858 !mp->m_sb.sb_inprogress) {
856 error = xfs_initialize_perag_data(mp, sbp->sb_agcount); 859 error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
857 if (error) 860 if (error)
858 goto out_fail_wait; 861 goto out_log_dealloc;
859 } 862 }
860 863
861 /* 864 /*
@@ -876,7 +879,7 @@ xfs_mountfs(
876 xfs_iunlock(rip, XFS_ILOCK_EXCL); 879 xfs_iunlock(rip, XFS_ILOCK_EXCL);
877 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, 880 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
878 mp); 881 mp);
879 error = XFS_ERROR(EFSCORRUPTED); 882 error = -EFSCORRUPTED;
880 goto out_rele_rip; 883 goto out_rele_rip;
881 } 884 }
882 mp->m_rootip = rip; /* save it */ 885 mp->m_rootip = rip; /* save it */
@@ -927,7 +930,7 @@ xfs_mountfs(
927 xfs_notice(mp, "resetting quota flags"); 930 xfs_notice(mp, "resetting quota flags");
928 error = xfs_mount_reset_sbqflags(mp); 931 error = xfs_mount_reset_sbqflags(mp);
929 if (error) 932 if (error)
930 return error; 933 goto out_rtunmount;
931 } 934 }
932 } 935 }
933 936
@@ -989,6 +992,8 @@ xfs_mountfs(
989 xfs_da_unmount(mp); 992 xfs_da_unmount(mp);
990 out_remove_uuid: 993 out_remove_uuid:
991 xfs_uuid_unmount(mp); 994 xfs_uuid_unmount(mp);
995 out_remove_sysfs:
996 xfs_sysfs_del(&mp->m_kobj);
992 out: 997 out:
993 return error; 998 return error;
994} 999}
@@ -1071,6 +1076,8 @@ xfs_unmountfs(
1071 xfs_errortag_clearall(mp, 0); 1076 xfs_errortag_clearall(mp, 0);
1072#endif 1077#endif
1073 xfs_free_perag(mp); 1078 xfs_free_perag(mp);
1079
1080 xfs_sysfs_del(&mp->m_kobj);
1074} 1081}
1075 1082
1076int 1083int
@@ -1152,7 +1159,7 @@ xfs_mod_incore_sb_unlocked(
1152 lcounter += delta; 1159 lcounter += delta;
1153 if (lcounter < 0) { 1160 if (lcounter < 0) {
1154 ASSERT(0); 1161 ASSERT(0);
1155 return XFS_ERROR(EINVAL); 1162 return -EINVAL;
1156 } 1163 }
1157 mp->m_sb.sb_icount = lcounter; 1164 mp->m_sb.sb_icount = lcounter;
1158 return 0; 1165 return 0;
@@ -1161,7 +1168,7 @@ xfs_mod_incore_sb_unlocked(
1161 lcounter += delta; 1168 lcounter += delta;
1162 if (lcounter < 0) { 1169 if (lcounter < 0) {
1163 ASSERT(0); 1170 ASSERT(0);
1164 return XFS_ERROR(EINVAL); 1171 return -EINVAL;
1165 } 1172 }
1166 mp->m_sb.sb_ifree = lcounter; 1173 mp->m_sb.sb_ifree = lcounter;
1167 return 0; 1174 return 0;
@@ -1191,7 +1198,7 @@ xfs_mod_incore_sb_unlocked(
1191 * blocks if were allowed to. 1198 * blocks if were allowed to.
1192 */ 1199 */
1193 if (!rsvd) 1200 if (!rsvd)
1194 return XFS_ERROR(ENOSPC); 1201 return -ENOSPC;
1195 1202
1196 lcounter = (long long)mp->m_resblks_avail + delta; 1203 lcounter = (long long)mp->m_resblks_avail + delta;
1197 if (lcounter >= 0) { 1204 if (lcounter >= 0) {
@@ -1202,7 +1209,7 @@ xfs_mod_incore_sb_unlocked(
1202 "Filesystem \"%s\": reserve blocks depleted! " 1209 "Filesystem \"%s\": reserve blocks depleted! "
1203 "Consider increasing reserve pool size.", 1210 "Consider increasing reserve pool size.",
1204 mp->m_fsname); 1211 mp->m_fsname);
1205 return XFS_ERROR(ENOSPC); 1212 return -ENOSPC;
1206 } 1213 }
1207 1214
1208 mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); 1215 mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
@@ -1211,7 +1218,7 @@ xfs_mod_incore_sb_unlocked(
1211 lcounter = (long long)mp->m_sb.sb_frextents; 1218 lcounter = (long long)mp->m_sb.sb_frextents;
1212 lcounter += delta; 1219 lcounter += delta;
1213 if (lcounter < 0) { 1220 if (lcounter < 0) {
1214 return XFS_ERROR(ENOSPC); 1221 return -ENOSPC;
1215 } 1222 }
1216 mp->m_sb.sb_frextents = lcounter; 1223 mp->m_sb.sb_frextents = lcounter;
1217 return 0; 1224 return 0;
@@ -1220,7 +1227,7 @@ xfs_mod_incore_sb_unlocked(
1220 lcounter += delta; 1227 lcounter += delta;
1221 if (lcounter < 0) { 1228 if (lcounter < 0) {
1222 ASSERT(0); 1229 ASSERT(0);
1223 return XFS_ERROR(EINVAL); 1230 return -EINVAL;
1224 } 1231 }
1225 mp->m_sb.sb_dblocks = lcounter; 1232 mp->m_sb.sb_dblocks = lcounter;
1226 return 0; 1233 return 0;
@@ -1229,7 +1236,7 @@ xfs_mod_incore_sb_unlocked(
1229 scounter += delta; 1236 scounter += delta;
1230 if (scounter < 0) { 1237 if (scounter < 0) {
1231 ASSERT(0); 1238 ASSERT(0);
1232 return XFS_ERROR(EINVAL); 1239 return -EINVAL;
1233 } 1240 }
1234 mp->m_sb.sb_agcount = scounter; 1241 mp->m_sb.sb_agcount = scounter;
1235 return 0; 1242 return 0;
@@ -1238,7 +1245,7 @@ xfs_mod_incore_sb_unlocked(
1238 scounter += delta; 1245 scounter += delta;
1239 if (scounter < 0) { 1246 if (scounter < 0) {
1240 ASSERT(0); 1247 ASSERT(0);
1241 return XFS_ERROR(EINVAL); 1248 return -EINVAL;
1242 } 1249 }
1243 mp->m_sb.sb_imax_pct = scounter; 1250 mp->m_sb.sb_imax_pct = scounter;
1244 return 0; 1251 return 0;
@@ -1247,7 +1254,7 @@ xfs_mod_incore_sb_unlocked(
1247 scounter += delta; 1254 scounter += delta;
1248 if (scounter < 0) { 1255 if (scounter < 0) {
1249 ASSERT(0); 1256 ASSERT(0);
1250 return XFS_ERROR(EINVAL); 1257 return -EINVAL;
1251 } 1258 }
1252 mp->m_sb.sb_rextsize = scounter; 1259 mp->m_sb.sb_rextsize = scounter;
1253 return 0; 1260 return 0;
@@ -1256,7 +1263,7 @@ xfs_mod_incore_sb_unlocked(
1256 scounter += delta; 1263 scounter += delta;
1257 if (scounter < 0) { 1264 if (scounter < 0) {
1258 ASSERT(0); 1265 ASSERT(0);
1259 return XFS_ERROR(EINVAL); 1266 return -EINVAL;
1260 } 1267 }
1261 mp->m_sb.sb_rbmblocks = scounter; 1268 mp->m_sb.sb_rbmblocks = scounter;
1262 return 0; 1269 return 0;
@@ -1265,7 +1272,7 @@ xfs_mod_incore_sb_unlocked(
1265 lcounter += delta; 1272 lcounter += delta;
1266 if (lcounter < 0) { 1273 if (lcounter < 0) {
1267 ASSERT(0); 1274 ASSERT(0);
1268 return XFS_ERROR(EINVAL); 1275 return -EINVAL;
1269 } 1276 }
1270 mp->m_sb.sb_rblocks = lcounter; 1277 mp->m_sb.sb_rblocks = lcounter;
1271 return 0; 1278 return 0;
@@ -1274,7 +1281,7 @@ xfs_mod_incore_sb_unlocked(
1274 lcounter += delta; 1281 lcounter += delta;
1275 if (lcounter < 0) { 1282 if (lcounter < 0) {
1276 ASSERT(0); 1283 ASSERT(0);
1277 return XFS_ERROR(EINVAL); 1284 return -EINVAL;
1278 } 1285 }
1279 mp->m_sb.sb_rextents = lcounter; 1286 mp->m_sb.sb_rextents = lcounter;
1280 return 0; 1287 return 0;
@@ -1283,13 +1290,13 @@ xfs_mod_incore_sb_unlocked(
1283 scounter += delta; 1290 scounter += delta;
1284 if (scounter < 0) { 1291 if (scounter < 0) {
1285 ASSERT(0); 1292 ASSERT(0);
1286 return XFS_ERROR(EINVAL); 1293 return -EINVAL;
1287 } 1294 }
1288 mp->m_sb.sb_rextslog = scounter; 1295 mp->m_sb.sb_rextslog = scounter;
1289 return 0; 1296 return 0;
1290 default: 1297 default:
1291 ASSERT(0); 1298 ASSERT(0);
1292 return XFS_ERROR(EINVAL); 1299 return -EINVAL;
1293 } 1300 }
1294} 1301}
1295 1302
@@ -1452,7 +1459,7 @@ xfs_dev_is_read_only(
1452 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { 1459 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
1453 xfs_notice(mp, "%s required on read-only device.", message); 1460 xfs_notice(mp, "%s required on read-only device.", message);
1454 xfs_notice(mp, "write access unavailable, cannot proceed."); 1461 xfs_notice(mp, "write access unavailable, cannot proceed.");
1455 return EROFS; 1462 return -EROFS;
1456 } 1463 }
1457 return 0; 1464 return 0;
1458} 1465}
@@ -1995,7 +2002,7 @@ slow_path:
1995 * (e.g. lots of space just got freed). After that 2002 * (e.g. lots of space just got freed). After that
1996 * we are done. 2003 * we are done.
1997 */ 2004 */
1998 if (ret != ENOSPC) 2005 if (ret != -ENOSPC)
1999 xfs_icsb_balance_counter(mp, field, 0); 2006 xfs_icsb_balance_counter(mp, field, 0);
2000 xfs_icsb_unlock(mp); 2007 xfs_icsb_unlock(mp);
2001 return ret; 2008 return ret;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7295a0b7c343..b0447c86e7e2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -166,6 +166,7 @@ typedef struct xfs_mount {
166 on the next remount,rw */ 166 on the next remount,rw */
167 int64_t m_low_space[XFS_LOWSP_MAX]; 167 int64_t m_low_space[XFS_LOWSP_MAX];
168 /* low free space thresholds */ 168 /* low free space thresholds */
169 struct xfs_kobj m_kobj;
169 170
170 struct workqueue_struct *m_data_workqueue; 171 struct workqueue_struct *m_data_workqueue;
171 struct workqueue_struct *m_unwritten_workqueue; 172 struct workqueue_struct *m_unwritten_workqueue;
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index f99b4933dc22..1eb6f3df698c 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -337,20 +337,20 @@ xfs_mru_cache_create(
337 *mrup = NULL; 337 *mrup = NULL;
338 338
339 if (!mrup || !grp_count || !lifetime_ms || !free_func) 339 if (!mrup || !grp_count || !lifetime_ms || !free_func)
340 return EINVAL; 340 return -EINVAL;
341 341
342 if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) 342 if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
343 return EINVAL; 343 return -EINVAL;
344 344
345 if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) 345 if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP)))
346 return ENOMEM; 346 return -ENOMEM;
347 347
348 /* An extra list is needed to avoid reaping up to a grp_time early. */ 348 /* An extra list is needed to avoid reaping up to a grp_time early. */
349 mru->grp_count = grp_count + 1; 349 mru->grp_count = grp_count + 1;
350 mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); 350 mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
351 351
352 if (!mru->lists) { 352 if (!mru->lists) {
353 err = ENOMEM; 353 err = -ENOMEM;
354 goto exit; 354 goto exit;
355 } 355 }
356 356
@@ -434,16 +434,16 @@ xfs_mru_cache_insert(
434 434
435 ASSERT(mru && mru->lists); 435 ASSERT(mru && mru->lists);
436 if (!mru || !mru->lists) 436 if (!mru || !mru->lists)
437 return EINVAL; 437 return -EINVAL;
438 438
439 if (radix_tree_preload(GFP_KERNEL)) 439 if (radix_tree_preload(GFP_KERNEL))
440 return ENOMEM; 440 return -ENOMEM;
441 441
442 INIT_LIST_HEAD(&elem->list_node); 442 INIT_LIST_HEAD(&elem->list_node);
443 elem->key = key; 443 elem->key = key;
444 444
445 spin_lock(&mru->lock); 445 spin_lock(&mru->lock);
446 error = -radix_tree_insert(&mru->store, key, elem); 446 error = radix_tree_insert(&mru->store, key, elem);
447 radix_tree_preload_end(); 447 radix_tree_preload_end();
448 if (!error) 448 if (!error)
449 _xfs_mru_cache_list_insert(mru, elem); 449 _xfs_mru_cache_list_insert(mru, elem);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 6d26759c779a..10232102b4a6 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -98,18 +98,18 @@ restart:
98 next_index = be32_to_cpu(dqp->q_core.d_id) + 1; 98 next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
99 99
100 error = execute(batch[i], data); 100 error = execute(batch[i], data);
101 if (error == EAGAIN) { 101 if (error == -EAGAIN) {
102 skipped++; 102 skipped++;
103 continue; 103 continue;
104 } 104 }
105 if (error && last_error != EFSCORRUPTED) 105 if (error && last_error != -EFSCORRUPTED)
106 last_error = error; 106 last_error = error;
107 } 107 }
108 108
109 mutex_unlock(&qi->qi_tree_lock); 109 mutex_unlock(&qi->qi_tree_lock);
110 110
111 /* bail out if the filesystem is corrupted. */ 111 /* bail out if the filesystem is corrupted. */
112 if (last_error == EFSCORRUPTED) { 112 if (last_error == -EFSCORRUPTED) {
113 skipped = 0; 113 skipped = 0;
114 break; 114 break;
115 } 115 }
@@ -138,7 +138,7 @@ xfs_qm_dqpurge(
138 xfs_dqlock(dqp); 138 xfs_dqlock(dqp);
139 if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { 139 if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
140 xfs_dqunlock(dqp); 140 xfs_dqunlock(dqp);
141 return EAGAIN; 141 return -EAGAIN;
142 } 142 }
143 143
144 dqp->dq_flags |= XFS_DQ_FREEING; 144 dqp->dq_flags |= XFS_DQ_FREEING;
@@ -221,100 +221,6 @@ xfs_qm_unmount(
221 } 221 }
222} 222}
223 223
224
225/*
226 * This is called from xfs_mountfs to start quotas and initialize all
227 * necessary data structures like quotainfo. This is also responsible for
228 * running a quotacheck as necessary. We are guaranteed that the superblock
229 * is consistently read in at this point.
230 *
231 * If we fail here, the mount will continue with quota turned off. We don't
232 * need to inidicate success or failure at all.
233 */
234void
235xfs_qm_mount_quotas(
236 xfs_mount_t *mp)
237{
238 int error = 0;
239 uint sbf;
240
241 /*
242 * If quotas on realtime volumes is not supported, we disable
243 * quotas immediately.
244 */
245 if (mp->m_sb.sb_rextents) {
246 xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
247 mp->m_qflags = 0;
248 goto write_changes;
249 }
250
251 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
252
253 /*
254 * Allocate the quotainfo structure inside the mount struct, and
255 * create quotainode(s), and change/rev superblock if necessary.
256 */
257 error = xfs_qm_init_quotainfo(mp);
258 if (error) {
259 /*
260 * We must turn off quotas.
261 */
262 ASSERT(mp->m_quotainfo == NULL);
263 mp->m_qflags = 0;
264 goto write_changes;
265 }
266 /*
267 * If any of the quotas are not consistent, do a quotacheck.
268 */
269 if (XFS_QM_NEED_QUOTACHECK(mp)) {
270 error = xfs_qm_quotacheck(mp);
271 if (error) {
272 /* Quotacheck failed and disabled quotas. */
273 return;
274 }
275 }
276 /*
277 * If one type of quotas is off, then it will lose its
278 * quotachecked status, since we won't be doing accounting for
279 * that type anymore.
280 */
281 if (!XFS_IS_UQUOTA_ON(mp))
282 mp->m_qflags &= ~XFS_UQUOTA_CHKD;
283 if (!XFS_IS_GQUOTA_ON(mp))
284 mp->m_qflags &= ~XFS_GQUOTA_CHKD;
285 if (!XFS_IS_PQUOTA_ON(mp))
286 mp->m_qflags &= ~XFS_PQUOTA_CHKD;
287
288 write_changes:
289 /*
290 * We actually don't have to acquire the m_sb_lock at all.
291 * This can only be called from mount, and that's single threaded. XXX
292 */
293 spin_lock(&mp->m_sb_lock);
294 sbf = mp->m_sb.sb_qflags;
295 mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
296 spin_unlock(&mp->m_sb_lock);
297
298 if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
299 if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
300 /*
301 * We could only have been turning quotas off.
302 * We aren't in very good shape actually because
303 * the incore structures are convinced that quotas are
304 * off, but the on disk superblock doesn't know that !
305 */
306 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
307 xfs_alert(mp, "%s: Superblock update failed!",
308 __func__);
309 }
310 }
311
312 if (error) {
313 xfs_warn(mp, "Failed to initialize disk quotas.");
314 return;
315 }
316}
317
318/* 224/*
319 * Called from the vfsops layer. 225 * Called from the vfsops layer.
320 */ 226 */
@@ -671,7 +577,7 @@ xfs_qm_init_quotainfo(
671 577
672 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); 578 qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
673 579
674 error = -list_lru_init(&qinf->qi_lru); 580 error = list_lru_init(&qinf->qi_lru);
675 if (error) 581 if (error)
676 goto out_free_qinf; 582 goto out_free_qinf;
677 583
@@ -995,7 +901,7 @@ xfs_qm_dqiter_bufs(
995 * will leave a trace in the log indicating corruption has 901 * will leave a trace in the log indicating corruption has
996 * been detected. 902 * been detected.
997 */ 903 */
998 if (error == EFSCORRUPTED) { 904 if (error == -EFSCORRUPTED) {
999 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, 905 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
1000 XFS_FSB_TO_DADDR(mp, bno), 906 XFS_FSB_TO_DADDR(mp, bno),
1001 mp->m_quotainfo->qi_dqchunklen, 0, &bp, 907 mp->m_quotainfo->qi_dqchunklen, 0, &bp,
@@ -1005,6 +911,12 @@ xfs_qm_dqiter_bufs(
1005 if (error) 911 if (error)
1006 break; 912 break;
1007 913
914 /*
915 * A corrupt buffer might not have a verifier attached, so
916 * make sure we have the correct one attached before writeback
917 * occurs.
918 */
919 bp->b_ops = &xfs_dquot_buf_ops;
1008 xfs_qm_reset_dqcounts(mp, bp, firstid, type); 920 xfs_qm_reset_dqcounts(mp, bp, firstid, type);
1009 xfs_buf_delwri_queue(bp, buffer_list); 921 xfs_buf_delwri_queue(bp, buffer_list);
1010 xfs_buf_relse(bp); 922 xfs_buf_relse(bp);
@@ -1090,7 +1002,7 @@ xfs_qm_dqiterate(
1090 xfs_buf_readahead(mp->m_ddev_targp, 1002 xfs_buf_readahead(mp->m_ddev_targp,
1091 XFS_FSB_TO_DADDR(mp, rablkno), 1003 XFS_FSB_TO_DADDR(mp, rablkno),
1092 mp->m_quotainfo->qi_dqchunklen, 1004 mp->m_quotainfo->qi_dqchunklen,
1093 NULL); 1005 &xfs_dquot_buf_ops);
1094 rablkno++; 1006 rablkno++;
1095 } 1007 }
1096 } 1008 }
@@ -1138,8 +1050,8 @@ xfs_qm_quotacheck_dqadjust(
1138 /* 1050 /*
1139 * Shouldn't be able to turn off quotas here. 1051 * Shouldn't be able to turn off quotas here.
1140 */ 1052 */
1141 ASSERT(error != ESRCH); 1053 ASSERT(error != -ESRCH);
1142 ASSERT(error != ENOENT); 1054 ASSERT(error != -ENOENT);
1143 return error; 1055 return error;
1144 } 1056 }
1145 1057
@@ -1226,7 +1138,7 @@ xfs_qm_dqusage_adjust(
1226 */ 1138 */
1227 if (xfs_is_quota_inode(&mp->m_sb, ino)) { 1139 if (xfs_is_quota_inode(&mp->m_sb, ino)) {
1228 *res = BULKSTAT_RV_NOTHING; 1140 *res = BULKSTAT_RV_NOTHING;
1229 return XFS_ERROR(EINVAL); 1141 return -EINVAL;
1230 } 1142 }
1231 1143
1232 /* 1144 /*
@@ -1330,7 +1242,7 @@ out_unlock:
1330 * Walk thru all the filesystem inodes and construct a consistent view 1242 * Walk thru all the filesystem inodes and construct a consistent view
1331 * of the disk quota world. If the quotacheck fails, disable quotas. 1243 * of the disk quota world. If the quotacheck fails, disable quotas.
1332 */ 1244 */
1333int 1245STATIC int
1334xfs_qm_quotacheck( 1246xfs_qm_quotacheck(
1335 xfs_mount_t *mp) 1247 xfs_mount_t *mp)
1336{ 1248{
@@ -1463,7 +1375,100 @@ xfs_qm_quotacheck(
1463 } 1375 }
1464 } else 1376 } else
1465 xfs_notice(mp, "Quotacheck: Done."); 1377 xfs_notice(mp, "Quotacheck: Done.");
1466 return (error); 1378 return error;
1379}
1380
1381/*
1382 * This is called from xfs_mountfs to start quotas and initialize all
1383 * necessary data structures like quotainfo. This is also responsible for
1384 * running a quotacheck as necessary. We are guaranteed that the superblock
1385 * is consistently read in at this point.
1386 *
1387 * If we fail here, the mount will continue with quota turned off. We don't
1388 * need to inidicate success or failure at all.
1389 */
1390void
1391xfs_qm_mount_quotas(
1392 struct xfs_mount *mp)
1393{
1394 int error = 0;
1395 uint sbf;
1396
1397 /*
1398 * If quotas on realtime volumes is not supported, we disable
1399 * quotas immediately.
1400 */
1401 if (mp->m_sb.sb_rextents) {
1402 xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
1403 mp->m_qflags = 0;
1404 goto write_changes;
1405 }
1406
1407 ASSERT(XFS_IS_QUOTA_RUNNING(mp));
1408
1409 /*
1410 * Allocate the quotainfo structure inside the mount struct, and
1411 * create quotainode(s), and change/rev superblock if necessary.
1412 */
1413 error = xfs_qm_init_quotainfo(mp);
1414 if (error) {
1415 /*
1416 * We must turn off quotas.
1417 */
1418 ASSERT(mp->m_quotainfo == NULL);
1419 mp->m_qflags = 0;
1420 goto write_changes;
1421 }
1422 /*
1423 * If any of the quotas are not consistent, do a quotacheck.
1424 */
1425 if (XFS_QM_NEED_QUOTACHECK(mp)) {
1426 error = xfs_qm_quotacheck(mp);
1427 if (error) {
1428 /* Quotacheck failed and disabled quotas. */
1429 return;
1430 }
1431 }
1432 /*
1433 * If one type of quotas is off, then it will lose its
1434 * quotachecked status, since we won't be doing accounting for
1435 * that type anymore.
1436 */
1437 if (!XFS_IS_UQUOTA_ON(mp))
1438 mp->m_qflags &= ~XFS_UQUOTA_CHKD;
1439 if (!XFS_IS_GQUOTA_ON(mp))
1440 mp->m_qflags &= ~XFS_GQUOTA_CHKD;
1441 if (!XFS_IS_PQUOTA_ON(mp))
1442 mp->m_qflags &= ~XFS_PQUOTA_CHKD;
1443
1444 write_changes:
1445 /*
1446 * We actually don't have to acquire the m_sb_lock at all.
1447 * This can only be called from mount, and that's single threaded. XXX
1448 */
1449 spin_lock(&mp->m_sb_lock);
1450 sbf = mp->m_sb.sb_qflags;
1451 mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
1452 spin_unlock(&mp->m_sb_lock);
1453
1454 if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
1455 if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
1456 /*
1457 * We could only have been turning quotas off.
1458 * We aren't in very good shape actually because
1459 * the incore structures are convinced that quotas are
1460 * off, but the on disk superblock doesn't know that !
1461 */
1462 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
1463 xfs_alert(mp, "%s: Superblock update failed!",
1464 __func__);
1465 }
1466 }
1467
1468 if (error) {
1469 xfs_warn(mp, "Failed to initialize disk quotas.");
1470 return;
1471 }
1467} 1472}
1468 1473
1469/* 1474/*
@@ -1493,7 +1498,7 @@ xfs_qm_init_quotainos(
1493 error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 1498 error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
1494 0, 0, &uip); 1499 0, 0, &uip);
1495 if (error) 1500 if (error)
1496 return XFS_ERROR(error); 1501 return error;
1497 } 1502 }
1498 if (XFS_IS_GQUOTA_ON(mp) && 1503 if (XFS_IS_GQUOTA_ON(mp) &&
1499 mp->m_sb.sb_gquotino != NULLFSINO) { 1504 mp->m_sb.sb_gquotino != NULLFSINO) {
@@ -1563,7 +1568,7 @@ error_rele:
1563 IRELE(gip); 1568 IRELE(gip);
1564 if (pip) 1569 if (pip)
1565 IRELE(pip); 1570 IRELE(pip);
1566 return XFS_ERROR(error); 1571 return error;
1567} 1572}
1568 1573
1569STATIC void 1574STATIC void
@@ -1679,7 +1684,7 @@ xfs_qm_vop_dqalloc(
1679 XFS_QMOPT_DOWARN, 1684 XFS_QMOPT_DOWARN,
1680 &uq); 1685 &uq);
1681 if (error) { 1686 if (error) {
1682 ASSERT(error != ENOENT); 1687 ASSERT(error != -ENOENT);
1683 return error; 1688 return error;
1684 } 1689 }
1685 /* 1690 /*
@@ -1706,7 +1711,7 @@ xfs_qm_vop_dqalloc(
1706 XFS_QMOPT_DOWARN, 1711 XFS_QMOPT_DOWARN,
1707 &gq); 1712 &gq);
1708 if (error) { 1713 if (error) {
1709 ASSERT(error != ENOENT); 1714 ASSERT(error != -ENOENT);
1710 goto error_rele; 1715 goto error_rele;
1711 } 1716 }
1712 xfs_dqunlock(gq); 1717 xfs_dqunlock(gq);
@@ -1726,7 +1731,7 @@ xfs_qm_vop_dqalloc(
1726 XFS_QMOPT_DOWARN, 1731 XFS_QMOPT_DOWARN,
1727 &pq); 1732 &pq);
1728 if (error) { 1733 if (error) {
1729 ASSERT(error != ENOENT); 1734 ASSERT(error != -ENOENT);
1730 goto error_rele; 1735 goto error_rele;
1731 } 1736 }
1732 xfs_dqunlock(pq); 1737 xfs_dqunlock(pq);
@@ -1895,7 +1900,7 @@ xfs_qm_vop_chown_reserve(
1895 -((xfs_qcnt_t)delblks), 0, blkflags); 1900 -((xfs_qcnt_t)delblks), 0, blkflags);
1896 } 1901 }
1897 1902
1898 return (0); 1903 return 0;
1899} 1904}
1900 1905
1901int 1906int
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 797fd4636273..3a07a937e232 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -157,7 +157,6 @@ struct xfs_dquot_acct {
157#define XFS_QM_RTBWARNLIMIT 5 157#define XFS_QM_RTBWARNLIMIT 5
158 158
159extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); 159extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
160extern int xfs_qm_quotacheck(struct xfs_mount *);
161extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t); 160extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
162 161
163/* dquot stuff */ 162/* dquot stuff */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index e9be63abd8d2..2c61e61b0205 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -117,7 +117,7 @@ xfs_qm_newmount(
117 (uquotaondisk ? " usrquota" : ""), 117 (uquotaondisk ? " usrquota" : ""),
118 (gquotaondisk ? " grpquota" : ""), 118 (gquotaondisk ? " grpquota" : ""),
119 (pquotaondisk ? " prjquota" : "")); 119 (pquotaondisk ? " prjquota" : ""));
120 return XFS_ERROR(EPERM); 120 return -EPERM;
121 } 121 }
122 122
123 if (XFS_IS_QUOTA_ON(mp) || quotaondisk) { 123 if (XFS_IS_QUOTA_ON(mp) || quotaondisk) {
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index bbc813caba4c..80f2d77d929a 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -64,10 +64,10 @@ xfs_qm_scall_quotaoff(
64 /* 64 /*
65 * No file system can have quotas enabled on disk but not in core. 65 * No file system can have quotas enabled on disk but not in core.
66 * Note that quota utilities (like quotaoff) _expect_ 66 * Note that quota utilities (like quotaoff) _expect_
67 * errno == EEXIST here. 67 * errno == -EEXIST here.
68 */ 68 */
69 if ((mp->m_qflags & flags) == 0) 69 if ((mp->m_qflags & flags) == 0)
70 return XFS_ERROR(EEXIST); 70 return -EEXIST;
71 error = 0; 71 error = 0;
72 72
73 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); 73 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
@@ -94,7 +94,7 @@ xfs_qm_scall_quotaoff(
94 94
95 /* XXX what to do if error ? Revert back to old vals incore ? */ 95 /* XXX what to do if error ? Revert back to old vals incore ? */
96 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); 96 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
97 return (error); 97 return error;
98 } 98 }
99 99
100 dqtype = 0; 100 dqtype = 0;
@@ -198,7 +198,7 @@ xfs_qm_scall_quotaoff(
198 if (mp->m_qflags == 0) { 198 if (mp->m_qflags == 0) {
199 mutex_unlock(&q->qi_quotaofflock); 199 mutex_unlock(&q->qi_quotaofflock);
200 xfs_qm_destroy_quotainfo(mp); 200 xfs_qm_destroy_quotainfo(mp);
201 return (0); 201 return 0;
202 } 202 }
203 203
204 /* 204 /*
@@ -278,13 +278,13 @@ xfs_qm_scall_trunc_qfiles(
278 xfs_mount_t *mp, 278 xfs_mount_t *mp,
279 uint flags) 279 uint flags)
280{ 280{
281 int error = EINVAL; 281 int error = -EINVAL;
282 282
283 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 || 283 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 ||
284 (flags & ~XFS_DQ_ALLTYPES)) { 284 (flags & ~XFS_DQ_ALLTYPES)) {
285 xfs_debug(mp, "%s: flags=%x m_qflags=%x", 285 xfs_debug(mp, "%s: flags=%x m_qflags=%x",
286 __func__, flags, mp->m_qflags); 286 __func__, flags, mp->m_qflags);
287 return XFS_ERROR(EINVAL); 287 return -EINVAL;
288 } 288 }
289 289
290 if (flags & XFS_DQ_USER) { 290 if (flags & XFS_DQ_USER) {
@@ -328,7 +328,7 @@ xfs_qm_scall_quotaon(
328 if (flags == 0) { 328 if (flags == 0) {
329 xfs_debug(mp, "%s: zero flags, m_qflags=%x", 329 xfs_debug(mp, "%s: zero flags, m_qflags=%x",
330 __func__, mp->m_qflags); 330 __func__, mp->m_qflags);
331 return XFS_ERROR(EINVAL); 331 return -EINVAL;
332 } 332 }
333 333
334 /* No fs can turn on quotas with a delayed effect */ 334 /* No fs can turn on quotas with a delayed effect */
@@ -351,13 +351,13 @@ xfs_qm_scall_quotaon(
351 xfs_debug(mp, 351 xfs_debug(mp,
352 "%s: Can't enforce without acct, flags=%x sbflags=%x", 352 "%s: Can't enforce without acct, flags=%x sbflags=%x",
353 __func__, flags, mp->m_sb.sb_qflags); 353 __func__, flags, mp->m_sb.sb_qflags);
354 return XFS_ERROR(EINVAL); 354 return -EINVAL;
355 } 355 }
356 /* 356 /*
357 * If everything's up to-date incore, then don't waste time. 357 * If everything's up to-date incore, then don't waste time.
358 */ 358 */
359 if ((mp->m_qflags & flags) == flags) 359 if ((mp->m_qflags & flags) == flags)
360 return XFS_ERROR(EEXIST); 360 return -EEXIST;
361 361
362 /* 362 /*
363 * Change sb_qflags on disk but not incore mp->qflags 363 * Change sb_qflags on disk but not incore mp->qflags
@@ -372,11 +372,11 @@ xfs_qm_scall_quotaon(
372 * There's nothing to change if it's the same. 372 * There's nothing to change if it's the same.
373 */ 373 */
374 if ((qf & flags) == flags && sbflags == 0) 374 if ((qf & flags) == flags && sbflags == 0)
375 return XFS_ERROR(EEXIST); 375 return -EEXIST;
376 sbflags |= XFS_SB_QFLAGS; 376 sbflags |= XFS_SB_QFLAGS;
377 377
378 if ((error = xfs_qm_write_sb_changes(mp, sbflags))) 378 if ((error = xfs_qm_write_sb_changes(mp, sbflags)))
379 return (error); 379 return error;
380 /* 380 /*
381 * If we aren't trying to switch on quota enforcement, we are done. 381 * If we aren't trying to switch on quota enforcement, we are done.
382 */ 382 */
@@ -387,10 +387,10 @@ xfs_qm_scall_quotaon(
387 ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) != 387 ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
388 (mp->m_qflags & XFS_GQUOTA_ACCT)) || 388 (mp->m_qflags & XFS_GQUOTA_ACCT)) ||
389 (flags & XFS_ALL_QUOTA_ENFD) == 0) 389 (flags & XFS_ALL_QUOTA_ENFD) == 0)
390 return (0); 390 return 0;
391 391
392 if (! XFS_IS_QUOTA_RUNNING(mp)) 392 if (! XFS_IS_QUOTA_RUNNING(mp))
393 return XFS_ERROR(ESRCH); 393 return -ESRCH;
394 394
395 /* 395 /*
396 * Switch on quota enforcement in core. 396 * Switch on quota enforcement in core.
@@ -399,7 +399,7 @@ xfs_qm_scall_quotaon(
399 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); 399 mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
400 mutex_unlock(&mp->m_quotainfo->qi_quotaofflock); 400 mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
401 401
402 return (0); 402 return 0;
403} 403}
404 404
405 405
@@ -426,7 +426,7 @@ xfs_qm_scall_getqstat(
426 if (!xfs_sb_version_hasquota(&mp->m_sb)) { 426 if (!xfs_sb_version_hasquota(&mp->m_sb)) {
427 out->qs_uquota.qfs_ino = NULLFSINO; 427 out->qs_uquota.qfs_ino = NULLFSINO;
428 out->qs_gquota.qfs_ino = NULLFSINO; 428 out->qs_gquota.qfs_ino = NULLFSINO;
429 return (0); 429 return 0;
430 } 430 }
431 431
432 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & 432 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
@@ -514,7 +514,7 @@ xfs_qm_scall_getqstatv(
514 out->qs_uquota.qfs_ino = NULLFSINO; 514 out->qs_uquota.qfs_ino = NULLFSINO;
515 out->qs_gquota.qfs_ino = NULLFSINO; 515 out->qs_gquota.qfs_ino = NULLFSINO;
516 out->qs_pquota.qfs_ino = NULLFSINO; 516 out->qs_pquota.qfs_ino = NULLFSINO;
517 return (0); 517 return 0;
518 } 518 }
519 519
520 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & 520 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
@@ -595,7 +595,7 @@ xfs_qm_scall_setqlim(
595 xfs_qcnt_t hard, soft; 595 xfs_qcnt_t hard, soft;
596 596
597 if (newlim->d_fieldmask & ~XFS_DQ_MASK) 597 if (newlim->d_fieldmask & ~XFS_DQ_MASK)
598 return EINVAL; 598 return -EINVAL;
599 if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) 599 if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
600 return 0; 600 return 0;
601 601
@@ -615,7 +615,7 @@ xfs_qm_scall_setqlim(
615 */ 615 */
616 error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp); 616 error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp);
617 if (error) { 617 if (error) {
618 ASSERT(error != ENOENT); 618 ASSERT(error != -ENOENT);
619 goto out_unlock; 619 goto out_unlock;
620 } 620 }
621 xfs_dqunlock(dqp); 621 xfs_dqunlock(dqp);
@@ -758,7 +758,7 @@ xfs_qm_log_quotaoff_end(
758 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0); 758 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
759 if (error) { 759 if (error) {
760 xfs_trans_cancel(tp, 0); 760 xfs_trans_cancel(tp, 0);
761 return (error); 761 return error;
762 } 762 }
763 763
764 qoffi = xfs_trans_get_qoff_item(tp, startqoff, 764 qoffi = xfs_trans_get_qoff_item(tp, startqoff,
@@ -772,7 +772,7 @@ xfs_qm_log_quotaoff_end(
772 */ 772 */
773 xfs_trans_set_sync(tp); 773 xfs_trans_set_sync(tp);
774 error = xfs_trans_commit(tp, 0); 774 error = xfs_trans_commit(tp, 0);
775 return (error); 775 return error;
776} 776}
777 777
778 778
@@ -822,7 +822,7 @@ error0:
822 spin_unlock(&mp->m_sb_lock); 822 spin_unlock(&mp->m_sb_lock);
823 } 823 }
824 *qoffstartp = qoffi; 824 *qoffstartp = qoffi;
825 return (error); 825 return error;
826} 826}
827 827
828 828
@@ -850,7 +850,7 @@ xfs_qm_scall_getquota(
850 * our utility programs are concerned. 850 * our utility programs are concerned.
851 */ 851 */
852 if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { 852 if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
853 error = XFS_ERROR(ENOENT); 853 error = -ENOENT;
854 goto out_put; 854 goto out_put;
855 } 855 }
856 856
@@ -953,7 +953,7 @@ xfs_qm_export_flags(
953 uflags |= FS_QUOTA_GDQ_ENFD; 953 uflags |= FS_QUOTA_GDQ_ENFD;
954 if (flags & XFS_PQUOTA_ENFD) 954 if (flags & XFS_PQUOTA_ENFD)
955 uflags |= FS_QUOTA_PDQ_ENFD; 955 uflags |= FS_QUOTA_PDQ_ENFD;
956 return (uflags); 956 return uflags;
957} 957}
958 958
959 959
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 2ad1b9822e92..b238027df987 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -51,7 +51,7 @@ xfs_fs_get_xstate(
51 51
52 if (!XFS_IS_QUOTA_RUNNING(mp)) 52 if (!XFS_IS_QUOTA_RUNNING(mp))
53 return -ENOSYS; 53 return -ENOSYS;
54 return -xfs_qm_scall_getqstat(mp, fqs); 54 return xfs_qm_scall_getqstat(mp, fqs);
55} 55}
56 56
57STATIC int 57STATIC int
@@ -63,7 +63,7 @@ xfs_fs_get_xstatev(
63 63
64 if (!XFS_IS_QUOTA_RUNNING(mp)) 64 if (!XFS_IS_QUOTA_RUNNING(mp))
65 return -ENOSYS; 65 return -ENOSYS;
66 return -xfs_qm_scall_getqstatv(mp, fqs); 66 return xfs_qm_scall_getqstatv(mp, fqs);
67} 67}
68 68
69STATIC int 69STATIC int
@@ -95,11 +95,11 @@ xfs_fs_set_xstate(
95 95
96 switch (op) { 96 switch (op) {
97 case Q_XQUOTAON: 97 case Q_XQUOTAON:
98 return -xfs_qm_scall_quotaon(mp, flags); 98 return xfs_qm_scall_quotaon(mp, flags);
99 case Q_XQUOTAOFF: 99 case Q_XQUOTAOFF:
100 if (!XFS_IS_QUOTA_ON(mp)) 100 if (!XFS_IS_QUOTA_ON(mp))
101 return -EINVAL; 101 return -EINVAL;
102 return -xfs_qm_scall_quotaoff(mp, flags); 102 return xfs_qm_scall_quotaoff(mp, flags);
103 } 103 }
104 104
105 return -EINVAL; 105 return -EINVAL;
@@ -112,7 +112,7 @@ xfs_fs_rm_xquota(
112{ 112{
113 struct xfs_mount *mp = XFS_M(sb); 113 struct xfs_mount *mp = XFS_M(sb);
114 unsigned int flags = 0; 114 unsigned int flags = 0;
115 115
116 if (sb->s_flags & MS_RDONLY) 116 if (sb->s_flags & MS_RDONLY)
117 return -EROFS; 117 return -EROFS;
118 118
@@ -123,11 +123,11 @@ xfs_fs_rm_xquota(
123 flags |= XFS_DQ_USER; 123 flags |= XFS_DQ_USER;
124 if (uflags & FS_GROUP_QUOTA) 124 if (uflags & FS_GROUP_QUOTA)
125 flags |= XFS_DQ_GROUP; 125 flags |= XFS_DQ_GROUP;
126 if (uflags & FS_USER_QUOTA) 126 if (uflags & FS_PROJ_QUOTA)
127 flags |= XFS_DQ_PROJ; 127 flags |= XFS_DQ_PROJ;
128 128
129 return -xfs_qm_scall_trunc_qfiles(mp, flags); 129 return xfs_qm_scall_trunc_qfiles(mp, flags);
130} 130}
131 131
132STATIC int 132STATIC int
133xfs_fs_get_dqblk( 133xfs_fs_get_dqblk(
@@ -142,7 +142,7 @@ xfs_fs_get_dqblk(
142 if (!XFS_IS_QUOTA_ON(mp)) 142 if (!XFS_IS_QUOTA_ON(mp))
143 return -ESRCH; 143 return -ESRCH;
144 144
145 return -xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), 145 return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
146 xfs_quota_type(qid.type), fdq); 146 xfs_quota_type(qid.type), fdq);
147} 147}
148 148
@@ -161,7 +161,7 @@ xfs_fs_set_dqblk(
161 if (!XFS_IS_QUOTA_ON(mp)) 161 if (!XFS_IS_QUOTA_ON(mp))
162 return -ESRCH; 162 return -ESRCH;
163 163
164 return -xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid), 164 return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
165 xfs_quota_type(qid.type), fdq); 165 xfs_quota_type(qid.type), fdq);
166} 166}
167 167
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ec5ca65c6211..909e143b87ae 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -863,7 +863,7 @@ xfs_growfs_rt_alloc(
863 XFS_BMAPI_METADATA, &firstblock, 863 XFS_BMAPI_METADATA, &firstblock,
864 resblks, &map, &nmap, &flist); 864 resblks, &map, &nmap, &flist);
865 if (!error && nmap < 1) 865 if (!error && nmap < 1)
866 error = XFS_ERROR(ENOSPC); 866 error = -ENOSPC;
867 if (error) 867 if (error)
868 goto error_cancel; 868 goto error_cancel;
869 /* 869 /*
@@ -903,7 +903,7 @@ xfs_growfs_rt_alloc(
903 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 903 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
904 mp->m_bsize, 0); 904 mp->m_bsize, 0);
905 if (bp == NULL) { 905 if (bp == NULL) {
906 error = XFS_ERROR(EIO); 906 error = -EIO;
907error_cancel: 907error_cancel:
908 xfs_trans_cancel(tp, cancelflags); 908 xfs_trans_cancel(tp, cancelflags);
909 goto error; 909 goto error;
@@ -944,9 +944,9 @@ xfs_growfs_rt(
944 xfs_buf_t *bp; /* temporary buffer */ 944 xfs_buf_t *bp; /* temporary buffer */
945 int error; /* error return value */ 945 int error; /* error return value */
946 xfs_mount_t *nmp; /* new (fake) mount structure */ 946 xfs_mount_t *nmp; /* new (fake) mount structure */
947 xfs_drfsbno_t nrblocks; /* new number of realtime blocks */ 947 xfs_rfsblock_t nrblocks; /* new number of realtime blocks */
948 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ 948 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
949 xfs_drtbno_t nrextents; /* new number of realtime extents */ 949 xfs_rtblock_t nrextents; /* new number of realtime extents */
950 uint8_t nrextslog; /* new log2 of sb_rextents */ 950 uint8_t nrextslog; /* new log2 of sb_rextents */
951 xfs_extlen_t nrsumblocks; /* new number of summary blocks */ 951 xfs_extlen_t nrsumblocks; /* new number of summary blocks */
952 uint nrsumlevels; /* new rt summary levels */ 952 uint nrsumlevels; /* new rt summary levels */
@@ -962,11 +962,11 @@ xfs_growfs_rt(
962 * Initial error checking. 962 * Initial error checking.
963 */ 963 */
964 if (!capable(CAP_SYS_ADMIN)) 964 if (!capable(CAP_SYS_ADMIN))
965 return XFS_ERROR(EPERM); 965 return -EPERM;
966 if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL || 966 if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
967 (nrblocks = in->newblocks) <= sbp->sb_rblocks || 967 (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
968 (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) 968 (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
969 return XFS_ERROR(EINVAL); 969 return -EINVAL;
970 if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks))) 970 if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks)))
971 return error; 971 return error;
972 /* 972 /*
@@ -976,7 +976,7 @@ xfs_growfs_rt(
976 XFS_FSB_TO_BB(mp, nrblocks - 1), 976 XFS_FSB_TO_BB(mp, nrblocks - 1),
977 XFS_FSB_TO_BB(mp, 1), 0, NULL); 977 XFS_FSB_TO_BB(mp, 1), 0, NULL);
978 if (!bp) 978 if (!bp)
979 return EIO; 979 return -EIO;
980 if (bp->b_error) { 980 if (bp->b_error) {
981 error = bp->b_error; 981 error = bp->b_error;
982 xfs_buf_relse(bp); 982 xfs_buf_relse(bp);
@@ -1001,7 +1001,7 @@ xfs_growfs_rt(
1001 * since we'll log basically the whole summary file at once. 1001 * since we'll log basically the whole summary file at once.
1002 */ 1002 */
1003 if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) 1003 if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1))
1004 return XFS_ERROR(EINVAL); 1004 return -EINVAL;
1005 /* 1005 /*
1006 * Get the old block counts for bitmap and summary inodes. 1006 * Get the old block counts for bitmap and summary inodes.
1007 * These can't change since other growfs callers are locked out. 1007 * These can't change since other growfs callers are locked out.
@@ -1208,7 +1208,7 @@ xfs_rtallocate_extent(
1208 len, &sumbp, &sb, prod, &r); 1208 len, &sumbp, &sb, prod, &r);
1209 break; 1209 break;
1210 default: 1210 default:
1211 error = EIO; 1211 error = -EIO;
1212 ASSERT(0); 1212 ASSERT(0);
1213 } 1213 }
1214 if (error) 1214 if (error)
@@ -1247,7 +1247,7 @@ xfs_rtmount_init(
1247 if (mp->m_rtdev_targp == NULL) { 1247 if (mp->m_rtdev_targp == NULL) {
1248 xfs_warn(mp, 1248 xfs_warn(mp,
1249 "Filesystem has a realtime volume, use rtdev=device option"); 1249 "Filesystem has a realtime volume, use rtdev=device option");
1250 return XFS_ERROR(ENODEV); 1250 return -ENODEV;
1251 } 1251 }
1252 mp->m_rsumlevels = sbp->sb_rextslog + 1; 1252 mp->m_rsumlevels = sbp->sb_rextslog + 1;
1253 mp->m_rsumsize = 1253 mp->m_rsumsize =
@@ -1263,7 +1263,7 @@ xfs_rtmount_init(
1263 xfs_warn(mp, "realtime mount -- %llu != %llu", 1263 xfs_warn(mp, "realtime mount -- %llu != %llu",
1264 (unsigned long long) XFS_BB_TO_FSB(mp, d), 1264 (unsigned long long) XFS_BB_TO_FSB(mp, d),
1265 (unsigned long long) mp->m_sb.sb_rblocks); 1265 (unsigned long long) mp->m_sb.sb_rblocks);
1266 return XFS_ERROR(EFBIG); 1266 return -EFBIG;
1267 } 1267 }
1268 bp = xfs_buf_read_uncached(mp->m_rtdev_targp, 1268 bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
1269 d - XFS_FSB_TO_BB(mp, 1), 1269 d - XFS_FSB_TO_BB(mp, 1),
@@ -1272,7 +1272,7 @@ xfs_rtmount_init(
1272 xfs_warn(mp, "realtime device size check failed"); 1272 xfs_warn(mp, "realtime device size check failed");
1273 if (bp) 1273 if (bp)
1274 xfs_buf_relse(bp); 1274 xfs_buf_relse(bp);
1275 return EIO; 1275 return -EIO;
1276 } 1276 }
1277 xfs_buf_relse(bp); 1277 xfs_buf_relse(bp);
1278 return 0; 1278 return 0;
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 752b63d10300..c642795324af 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -132,7 +132,7 @@ xfs_rtmount_init(
132 return 0; 132 return 0;
133 133
134 xfs_warn(mp, "Not built with CONFIG_XFS_RT"); 134 xfs_warn(mp, "Not built with CONFIG_XFS_RT");
135 return ENOSYS; 135 return -ENOSYS;
136} 136}
137# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 137# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
138# define xfs_rtunmount_inodes(m) 138# define xfs_rtunmount_inodes(m)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8f0333b3f7a0..b194652033cd 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -61,6 +61,7 @@
61static const struct super_operations xfs_super_operations; 61static const struct super_operations xfs_super_operations;
62static kmem_zone_t *xfs_ioend_zone; 62static kmem_zone_t *xfs_ioend_zone;
63mempool_t *xfs_ioend_pool; 63mempool_t *xfs_ioend_pool;
64struct kset *xfs_kset;
64 65
65#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ 66#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */
66#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ 67#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */
@@ -185,7 +186,7 @@ xfs_parseargs(
185 */ 186 */
186 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL); 187 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
187 if (!mp->m_fsname) 188 if (!mp->m_fsname)
188 return ENOMEM; 189 return -ENOMEM;
189 mp->m_fsname_len = strlen(mp->m_fsname) + 1; 190 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
190 191
191 /* 192 /*
@@ -204,9 +205,6 @@ xfs_parseargs(
204 */ 205 */
205 mp->m_flags |= XFS_MOUNT_BARRIER; 206 mp->m_flags |= XFS_MOUNT_BARRIER;
206 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; 207 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
207#if !XFS_BIG_INUMS
208 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
209#endif
210 208
211 /* 209 /*
212 * These can be overridden by the mount option parsing. 210 * These can be overridden by the mount option parsing.
@@ -227,57 +225,57 @@ xfs_parseargs(
227 if (!value || !*value) { 225 if (!value || !*value) {
228 xfs_warn(mp, "%s option requires an argument", 226 xfs_warn(mp, "%s option requires an argument",
229 this_char); 227 this_char);
230 return EINVAL; 228 return -EINVAL;
231 } 229 }
232 if (kstrtoint(value, 10, &mp->m_logbufs)) 230 if (kstrtoint(value, 10, &mp->m_logbufs))
233 return EINVAL; 231 return -EINVAL;
234 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { 232 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
235 if (!value || !*value) { 233 if (!value || !*value) {
236 xfs_warn(mp, "%s option requires an argument", 234 xfs_warn(mp, "%s option requires an argument",
237 this_char); 235 this_char);
238 return EINVAL; 236 return -EINVAL;
239 } 237 }
240 if (suffix_kstrtoint(value, 10, &mp->m_logbsize)) 238 if (suffix_kstrtoint(value, 10, &mp->m_logbsize))
241 return EINVAL; 239 return -EINVAL;
242 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { 240 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
243 if (!value || !*value) { 241 if (!value || !*value) {
244 xfs_warn(mp, "%s option requires an argument", 242 xfs_warn(mp, "%s option requires an argument",
245 this_char); 243 this_char);
246 return EINVAL; 244 return -EINVAL;
247 } 245 }
248 mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); 246 mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
249 if (!mp->m_logname) 247 if (!mp->m_logname)
250 return ENOMEM; 248 return -ENOMEM;
251 } else if (!strcmp(this_char, MNTOPT_MTPT)) { 249 } else if (!strcmp(this_char, MNTOPT_MTPT)) {
252 xfs_warn(mp, "%s option not allowed on this system", 250 xfs_warn(mp, "%s option not allowed on this system",
253 this_char); 251 this_char);
254 return EINVAL; 252 return -EINVAL;
255 } else if (!strcmp(this_char, MNTOPT_RTDEV)) { 253 } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
256 if (!value || !*value) { 254 if (!value || !*value) {
257 xfs_warn(mp, "%s option requires an argument", 255 xfs_warn(mp, "%s option requires an argument",
258 this_char); 256 this_char);
259 return EINVAL; 257 return -EINVAL;
260 } 258 }
261 mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); 259 mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
262 if (!mp->m_rtname) 260 if (!mp->m_rtname)
263 return ENOMEM; 261 return -ENOMEM;
264 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { 262 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
265 if (!value || !*value) { 263 if (!value || !*value) {
266 xfs_warn(mp, "%s option requires an argument", 264 xfs_warn(mp, "%s option requires an argument",
267 this_char); 265 this_char);
268 return EINVAL; 266 return -EINVAL;
269 } 267 }
270 if (kstrtoint(value, 10, &iosize)) 268 if (kstrtoint(value, 10, &iosize))
271 return EINVAL; 269 return -EINVAL;
272 iosizelog = ffs(iosize) - 1; 270 iosizelog = ffs(iosize) - 1;
273 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { 271 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
274 if (!value || !*value) { 272 if (!value || !*value) {
275 xfs_warn(mp, "%s option requires an argument", 273 xfs_warn(mp, "%s option requires an argument",
276 this_char); 274 this_char);
277 return EINVAL; 275 return -EINVAL;
278 } 276 }
279 if (suffix_kstrtoint(value, 10, &iosize)) 277 if (suffix_kstrtoint(value, 10, &iosize))
280 return EINVAL; 278 return -EINVAL;
281 iosizelog = ffs(iosize) - 1; 279 iosizelog = ffs(iosize) - 1;
282 } else if (!strcmp(this_char, MNTOPT_GRPID) || 280 } else if (!strcmp(this_char, MNTOPT_GRPID) ||
283 !strcmp(this_char, MNTOPT_BSDGROUPS)) { 281 !strcmp(this_char, MNTOPT_BSDGROUPS)) {
@@ -297,27 +295,22 @@ xfs_parseargs(
297 if (!value || !*value) { 295 if (!value || !*value) {
298 xfs_warn(mp, "%s option requires an argument", 296 xfs_warn(mp, "%s option requires an argument",
299 this_char); 297 this_char);
300 return EINVAL; 298 return -EINVAL;
301 } 299 }
302 if (kstrtoint(value, 10, &dsunit)) 300 if (kstrtoint(value, 10, &dsunit))
303 return EINVAL; 301 return -EINVAL;
304 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { 302 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
305 if (!value || !*value) { 303 if (!value || !*value) {
306 xfs_warn(mp, "%s option requires an argument", 304 xfs_warn(mp, "%s option requires an argument",
307 this_char); 305 this_char);
308 return EINVAL; 306 return -EINVAL;
309 } 307 }
310 if (kstrtoint(value, 10, &dswidth)) 308 if (kstrtoint(value, 10, &dswidth))
311 return EINVAL; 309 return -EINVAL;
312 } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { 310 } else if (!strcmp(this_char, MNTOPT_32BITINODE)) {
313 mp->m_flags |= XFS_MOUNT_SMALL_INUMS; 311 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
314 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { 312 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
315 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; 313 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
316#if !XFS_BIG_INUMS
317 xfs_warn(mp, "%s option not allowed on this system",
318 this_char);
319 return EINVAL;
320#endif
321 } else if (!strcmp(this_char, MNTOPT_NOUUID)) { 314 } else if (!strcmp(this_char, MNTOPT_NOUUID)) {
322 mp->m_flags |= XFS_MOUNT_NOUUID; 315 mp->m_flags |= XFS_MOUNT_NOUUID;
323 } else if (!strcmp(this_char, MNTOPT_BARRIER)) { 316 } else if (!strcmp(this_char, MNTOPT_BARRIER)) {
@@ -390,7 +383,7 @@ xfs_parseargs(
390 "irixsgid is now a sysctl(2) variable, option is deprecated."); 383 "irixsgid is now a sysctl(2) variable, option is deprecated.");
391 } else { 384 } else {
392 xfs_warn(mp, "unknown mount option [%s].", this_char); 385 xfs_warn(mp, "unknown mount option [%s].", this_char);
393 return EINVAL; 386 return -EINVAL;
394 } 387 }
395 } 388 }
396 389
@@ -400,32 +393,32 @@ xfs_parseargs(
400 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && 393 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
401 !(mp->m_flags & XFS_MOUNT_RDONLY)) { 394 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
402 xfs_warn(mp, "no-recovery mounts must be read-only."); 395 xfs_warn(mp, "no-recovery mounts must be read-only.");
403 return EINVAL; 396 return -EINVAL;
404 } 397 }
405 398
406 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { 399 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
407 xfs_warn(mp, 400 xfs_warn(mp,
408 "sunit and swidth options incompatible with the noalign option"); 401 "sunit and swidth options incompatible with the noalign option");
409 return EINVAL; 402 return -EINVAL;
410 } 403 }
411 404
412#ifndef CONFIG_XFS_QUOTA 405#ifndef CONFIG_XFS_QUOTA
413 if (XFS_IS_QUOTA_RUNNING(mp)) { 406 if (XFS_IS_QUOTA_RUNNING(mp)) {
414 xfs_warn(mp, "quota support not available in this kernel."); 407 xfs_warn(mp, "quota support not available in this kernel.");
415 return EINVAL; 408 return -EINVAL;
416 } 409 }
417#endif 410#endif
418 411
419 if ((dsunit && !dswidth) || (!dsunit && dswidth)) { 412 if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
420 xfs_warn(mp, "sunit and swidth must be specified together"); 413 xfs_warn(mp, "sunit and swidth must be specified together");
421 return EINVAL; 414 return -EINVAL;
422 } 415 }
423 416
424 if (dsunit && (dswidth % dsunit != 0)) { 417 if (dsunit && (dswidth % dsunit != 0)) {
425 xfs_warn(mp, 418 xfs_warn(mp,
426 "stripe width (%d) must be a multiple of the stripe unit (%d)", 419 "stripe width (%d) must be a multiple of the stripe unit (%d)",
427 dswidth, dsunit); 420 dswidth, dsunit);
428 return EINVAL; 421 return -EINVAL;
429 } 422 }
430 423
431done: 424done:
@@ -446,7 +439,7 @@ done:
446 mp->m_logbufs > XLOG_MAX_ICLOGS)) { 439 mp->m_logbufs > XLOG_MAX_ICLOGS)) {
447 xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]", 440 xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
448 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); 441 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
449 return XFS_ERROR(EINVAL); 442 return -EINVAL;
450 } 443 }
451 if (mp->m_logbsize != -1 && 444 if (mp->m_logbsize != -1 &&
452 mp->m_logbsize != 0 && 445 mp->m_logbsize != 0 &&
@@ -456,7 +449,7 @@ done:
456 xfs_warn(mp, 449 xfs_warn(mp,
457 "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", 450 "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
458 mp->m_logbsize); 451 mp->m_logbsize);
459 return XFS_ERROR(EINVAL); 452 return -EINVAL;
460 } 453 }
461 454
462 if (iosizelog) { 455 if (iosizelog) {
@@ -465,7 +458,7 @@ done:
465 xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", 458 xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
466 iosizelog, XFS_MIN_IO_LOG, 459 iosizelog, XFS_MIN_IO_LOG,
467 XFS_MAX_IO_LOG); 460 XFS_MAX_IO_LOG);
468 return XFS_ERROR(EINVAL); 461 return -EINVAL;
469 } 462 }
470 463
471 mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; 464 mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
@@ -597,15 +590,20 @@ xfs_max_file_offset(
597 return (((__uint64_t)pagefactor) << bitshift) - 1; 590 return (((__uint64_t)pagefactor) << bitshift) - 1;
598} 591}
599 592
593/*
594 * xfs_set_inode32() and xfs_set_inode64() are passed an agcount
595 * because in the growfs case, mp->m_sb.sb_agcount is not updated
596 * yet to the potentially higher ag count.
597 */
600xfs_agnumber_t 598xfs_agnumber_t
601xfs_set_inode32(struct xfs_mount *mp) 599xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount)
602{ 600{
603 xfs_agnumber_t index = 0; 601 xfs_agnumber_t index = 0;
604 xfs_agnumber_t maxagi = 0; 602 xfs_agnumber_t maxagi = 0;
605 xfs_sb_t *sbp = &mp->m_sb; 603 xfs_sb_t *sbp = &mp->m_sb;
606 xfs_agnumber_t max_metadata; 604 xfs_agnumber_t max_metadata;
607 xfs_agino_t agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks -1, 0); 605 xfs_agino_t agino;
608 xfs_ino_t ino = XFS_AGINO_TO_INO(mp, sbp->sb_agcount -1, agino); 606 xfs_ino_t ino;
609 xfs_perag_t *pag; 607 xfs_perag_t *pag;
610 608
611 /* Calculate how much should be reserved for inodes to meet 609 /* Calculate how much should be reserved for inodes to meet
@@ -620,10 +618,12 @@ xfs_set_inode32(struct xfs_mount *mp)
620 do_div(icount, sbp->sb_agblocks); 618 do_div(icount, sbp->sb_agblocks);
621 max_metadata = icount; 619 max_metadata = icount;
622 } else { 620 } else {
623 max_metadata = sbp->sb_agcount; 621 max_metadata = agcount;
624 } 622 }
625 623
626 for (index = 0; index < sbp->sb_agcount; index++) { 624 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
625
626 for (index = 0; index < agcount; index++) {
627 ino = XFS_AGINO_TO_INO(mp, index, agino); 627 ino = XFS_AGINO_TO_INO(mp, index, agino);
628 628
629 if (ino > XFS_MAXINUMBER_32) { 629 if (ino > XFS_MAXINUMBER_32) {
@@ -648,11 +648,11 @@ xfs_set_inode32(struct xfs_mount *mp)
648} 648}
649 649
650xfs_agnumber_t 650xfs_agnumber_t
651xfs_set_inode64(struct xfs_mount *mp) 651xfs_set_inode64(struct xfs_mount *mp, xfs_agnumber_t agcount)
652{ 652{
653 xfs_agnumber_t index = 0; 653 xfs_agnumber_t index = 0;
654 654
655 for (index = 0; index < mp->m_sb.sb_agcount; index++) { 655 for (index = 0; index < agcount; index++) {
656 struct xfs_perag *pag; 656 struct xfs_perag *pag;
657 657
658 pag = xfs_perag_get(mp, index); 658 pag = xfs_perag_get(mp, index);
@@ -686,7 +686,7 @@ xfs_blkdev_get(
686 xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error); 686 xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
687 } 687 }
688 688
689 return -error; 689 return error;
690} 690}
691 691
692STATIC void 692STATIC void
@@ -756,7 +756,7 @@ xfs_open_devices(
756 if (rtdev == ddev || rtdev == logdev) { 756 if (rtdev == ddev || rtdev == logdev) {
757 xfs_warn(mp, 757 xfs_warn(mp,
758 "Cannot mount filesystem with identical rtdev and ddev/logdev."); 758 "Cannot mount filesystem with identical rtdev and ddev/logdev.");
759 error = EINVAL; 759 error = -EINVAL;
760 goto out_close_rtdev; 760 goto out_close_rtdev;
761 } 761 }
762 } 762 }
@@ -764,7 +764,7 @@ xfs_open_devices(
764 /* 764 /*
765 * Setup xfs_mount buffer target pointers 765 * Setup xfs_mount buffer target pointers
766 */ 766 */
767 error = ENOMEM; 767 error = -ENOMEM;
768 mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); 768 mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
769 if (!mp->m_ddev_targp) 769 if (!mp->m_ddev_targp)
770 goto out_close_rtdev; 770 goto out_close_rtdev;
@@ -1188,6 +1188,7 @@ xfs_fs_remount(
1188 char *options) 1188 char *options)
1189{ 1189{
1190 struct xfs_mount *mp = XFS_M(sb); 1190 struct xfs_mount *mp = XFS_M(sb);
1191 xfs_sb_t *sbp = &mp->m_sb;
1191 substring_t args[MAX_OPT_ARGS]; 1192 substring_t args[MAX_OPT_ARGS];
1192 char *p; 1193 char *p;
1193 int error; 1194 int error;
@@ -1208,10 +1209,10 @@ xfs_fs_remount(
1208 mp->m_flags &= ~XFS_MOUNT_BARRIER; 1209 mp->m_flags &= ~XFS_MOUNT_BARRIER;
1209 break; 1210 break;
1210 case Opt_inode64: 1211 case Opt_inode64:
1211 mp->m_maxagi = xfs_set_inode64(mp); 1212 mp->m_maxagi = xfs_set_inode64(mp, sbp->sb_agcount);
1212 break; 1213 break;
1213 case Opt_inode32: 1214 case Opt_inode32:
1214 mp->m_maxagi = xfs_set_inode32(mp); 1215 mp->m_maxagi = xfs_set_inode32(mp, sbp->sb_agcount);
1215 break; 1216 break;
1216 default: 1217 default:
1217 /* 1218 /*
@@ -1295,7 +1296,7 @@ xfs_fs_freeze(
1295 1296
1296 xfs_save_resvblks(mp); 1297 xfs_save_resvblks(mp);
1297 xfs_quiesce_attr(mp); 1298 xfs_quiesce_attr(mp);
1298 return -xfs_fs_log_dummy(mp); 1299 return xfs_fs_log_dummy(mp);
1299} 1300}
1300 1301
1301STATIC int 1302STATIC int
@@ -1314,7 +1315,7 @@ xfs_fs_show_options(
1314 struct seq_file *m, 1315 struct seq_file *m,
1315 struct dentry *root) 1316 struct dentry *root)
1316{ 1317{
1317 return -xfs_showargs(XFS_M(root->d_sb), m); 1318 return xfs_showargs(XFS_M(root->d_sb), m);
1318} 1319}
1319 1320
1320/* 1321/*
@@ -1336,14 +1337,14 @@ xfs_finish_flags(
1336 mp->m_logbsize < mp->m_sb.sb_logsunit) { 1337 mp->m_logbsize < mp->m_sb.sb_logsunit) {
1337 xfs_warn(mp, 1338 xfs_warn(mp,
1338 "logbuf size must be greater than or equal to log stripe size"); 1339 "logbuf size must be greater than or equal to log stripe size");
1339 return XFS_ERROR(EINVAL); 1340 return -EINVAL;
1340 } 1341 }
1341 } else { 1342 } else {
1342 /* Fail a mount if the logbuf is larger than 32K */ 1343 /* Fail a mount if the logbuf is larger than 32K */
1343 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { 1344 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
1344 xfs_warn(mp, 1345 xfs_warn(mp,
1345 "logbuf size for version 1 logs must be 16K or 32K"); 1346 "logbuf size for version 1 logs must be 16K or 32K");
1346 return XFS_ERROR(EINVAL); 1347 return -EINVAL;
1347 } 1348 }
1348 } 1349 }
1349 1350
@@ -1355,7 +1356,7 @@ xfs_finish_flags(
1355 xfs_warn(mp, 1356 xfs_warn(mp,
1356"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.", 1357"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.",
1357 MNTOPT_NOATTR2, MNTOPT_ATTR2); 1358 MNTOPT_NOATTR2, MNTOPT_ATTR2);
1358 return XFS_ERROR(EINVAL); 1359 return -EINVAL;
1359 } 1360 }
1360 1361
1361 /* 1362 /*
@@ -1372,7 +1373,7 @@ xfs_finish_flags(
1372 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { 1373 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
1373 xfs_warn(mp, 1374 xfs_warn(mp,
1374 "cannot mount a read-only filesystem as read-write"); 1375 "cannot mount a read-only filesystem as read-write");
1375 return XFS_ERROR(EROFS); 1376 return -EROFS;
1376 } 1377 }
1377 1378
1378 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && 1379 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
@@ -1380,7 +1381,7 @@ xfs_finish_flags(
1380 !xfs_sb_version_has_pquotino(&mp->m_sb)) { 1381 !xfs_sb_version_has_pquotino(&mp->m_sb)) {
1381 xfs_warn(mp, 1382 xfs_warn(mp,
1382 "Super block does not support project and group quota together"); 1383 "Super block does not support project and group quota together");
1383 return XFS_ERROR(EINVAL); 1384 return -EINVAL;
1384 } 1385 }
1385 1386
1386 return 0; 1387 return 0;
@@ -1394,7 +1395,7 @@ xfs_fs_fill_super(
1394{ 1395{
1395 struct inode *root; 1396 struct inode *root;
1396 struct xfs_mount *mp = NULL; 1397 struct xfs_mount *mp = NULL;
1397 int flags = 0, error = ENOMEM; 1398 int flags = 0, error = -ENOMEM;
1398 1399
1399 mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); 1400 mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
1400 if (!mp) 1401 if (!mp)
@@ -1428,11 +1429,11 @@ xfs_fs_fill_super(
1428 if (error) 1429 if (error)
1429 goto out_free_fsname; 1430 goto out_free_fsname;
1430 1431
1431 error = -xfs_init_mount_workqueues(mp); 1432 error = xfs_init_mount_workqueues(mp);
1432 if (error) 1433 if (error)
1433 goto out_close_devices; 1434 goto out_close_devices;
1434 1435
1435 error = -xfs_icsb_init_counters(mp); 1436 error = xfs_icsb_init_counters(mp);
1436 if (error) 1437 if (error)
1437 goto out_destroy_workqueues; 1438 goto out_destroy_workqueues;
1438 1439
@@ -1474,12 +1475,12 @@ xfs_fs_fill_super(
1474 1475
1475 root = igrab(VFS_I(mp->m_rootip)); 1476 root = igrab(VFS_I(mp->m_rootip));
1476 if (!root) { 1477 if (!root) {
1477 error = ENOENT; 1478 error = -ENOENT;
1478 goto out_unmount; 1479 goto out_unmount;
1479 } 1480 }
1480 sb->s_root = d_make_root(root); 1481 sb->s_root = d_make_root(root);
1481 if (!sb->s_root) { 1482 if (!sb->s_root) {
1482 error = ENOMEM; 1483 error = -ENOMEM;
1483 goto out_unmount; 1484 goto out_unmount;
1484 } 1485 }
1485 1486
@@ -1499,7 +1500,7 @@ out_destroy_workqueues:
1499 xfs_free_fsname(mp); 1500 xfs_free_fsname(mp);
1500 kfree(mp); 1501 kfree(mp);
1501 out: 1502 out:
1502 return -error; 1503 return error;
1503 1504
1504 out_unmount: 1505 out_unmount:
1505 xfs_filestream_unmount(mp); 1506 xfs_filestream_unmount(mp);
@@ -1761,9 +1762,15 @@ init_xfs_fs(void)
1761 if (error) 1762 if (error)
1762 goto out_cleanup_procfs; 1763 goto out_cleanup_procfs;
1763 1764
1765 xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
1766 if (!xfs_kset) {
1767 error = -ENOMEM;
1768 goto out_sysctl_unregister;;
1769 }
1770
1764 error = xfs_qm_init(); 1771 error = xfs_qm_init();
1765 if (error) 1772 if (error)
1766 goto out_sysctl_unregister; 1773 goto out_kset_unregister;
1767 1774
1768 error = register_filesystem(&xfs_fs_type); 1775 error = register_filesystem(&xfs_fs_type);
1769 if (error) 1776 if (error)
@@ -1772,6 +1779,8 @@ init_xfs_fs(void)
1772 1779
1773 out_qm_exit: 1780 out_qm_exit:
1774 xfs_qm_exit(); 1781 xfs_qm_exit();
1782 out_kset_unregister:
1783 kset_unregister(xfs_kset);
1775 out_sysctl_unregister: 1784 out_sysctl_unregister:
1776 xfs_sysctl_unregister(); 1785 xfs_sysctl_unregister();
1777 out_cleanup_procfs: 1786 out_cleanup_procfs:
@@ -1793,6 +1802,7 @@ exit_xfs_fs(void)
1793{ 1802{
1794 xfs_qm_exit(); 1803 xfs_qm_exit();
1795 unregister_filesystem(&xfs_fs_type); 1804 unregister_filesystem(&xfs_fs_type);
1805 kset_unregister(xfs_kset);
1796 xfs_sysctl_unregister(); 1806 xfs_sysctl_unregister();
1797 xfs_cleanup_procfs(); 1807 xfs_cleanup_procfs();
1798 xfs_buf_terminate(); 1808 xfs_buf_terminate();
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index bbe3d15a7904..2b830c2f322e 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -44,16 +44,6 @@ extern void xfs_qm_exit(void);
44# define XFS_REALTIME_STRING 44# define XFS_REALTIME_STRING
45#endif 45#endif
46 46
47#if XFS_BIG_BLKNOS
48# if XFS_BIG_INUMS
49# define XFS_BIGFS_STRING "large block/inode numbers, "
50# else
51# define XFS_BIGFS_STRING "large block numbers, "
52# endif
53#else
54# define XFS_BIGFS_STRING
55#endif
56
57#ifdef DEBUG 47#ifdef DEBUG
58# define XFS_DBG_STRING "debug" 48# define XFS_DBG_STRING "debug"
59#else 49#else
@@ -64,7 +54,6 @@ extern void xfs_qm_exit(void);
64#define XFS_BUILD_OPTIONS XFS_ACL_STRING \ 54#define XFS_BUILD_OPTIONS XFS_ACL_STRING \
65 XFS_SECURITY_STRING \ 55 XFS_SECURITY_STRING \
66 XFS_REALTIME_STRING \ 56 XFS_REALTIME_STRING \
67 XFS_BIGFS_STRING \
68 XFS_DBG_STRING /* DBG must be last */ 57 XFS_DBG_STRING /* DBG must be last */
69 58
70struct xfs_inode; 59struct xfs_inode;
@@ -76,8 +65,8 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
76 65
77extern void xfs_flush_inodes(struct xfs_mount *mp); 66extern void xfs_flush_inodes(struct xfs_mount *mp);
78extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 67extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
79extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *); 68extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *, xfs_agnumber_t agcount);
80extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *); 69extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *, xfs_agnumber_t agcount);
81 70
82extern const struct export_operations xfs_export_operations; 71extern const struct export_operations xfs_export_operations;
83extern const struct xattr_handler *xfs_xattr_handlers[]; 72extern const struct xattr_handler *xfs_xattr_handlers[];
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index d69363c833e1..6a944a2cd36f 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -76,15 +76,15 @@ xfs_readlink_bmap(
76 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, 76 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
77 &xfs_symlink_buf_ops); 77 &xfs_symlink_buf_ops);
78 if (!bp) 78 if (!bp)
79 return XFS_ERROR(ENOMEM); 79 return -ENOMEM;
80 error = bp->b_error; 80 error = bp->b_error;
81 if (error) { 81 if (error) {
82 xfs_buf_ioerror_alert(bp, __func__); 82 xfs_buf_ioerror_alert(bp, __func__);
83 xfs_buf_relse(bp); 83 xfs_buf_relse(bp);
84 84
85 /* bad CRC means corrupted metadata */ 85 /* bad CRC means corrupted metadata */
86 if (error == EFSBADCRC) 86 if (error == -EFSBADCRC)
87 error = EFSCORRUPTED; 87 error = -EFSCORRUPTED;
88 goto out; 88 goto out;
89 } 89 }
90 byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); 90 byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
@@ -95,7 +95,7 @@ xfs_readlink_bmap(
95 if (xfs_sb_version_hascrc(&mp->m_sb)) { 95 if (xfs_sb_version_hascrc(&mp->m_sb)) {
96 if (!xfs_symlink_hdr_ok(ip->i_ino, offset, 96 if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
97 byte_cnt, bp)) { 97 byte_cnt, bp)) {
98 error = EFSCORRUPTED; 98 error = -EFSCORRUPTED;
99 xfs_alert(mp, 99 xfs_alert(mp,
100"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", 100"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)",
101 offset, byte_cnt, ip->i_ino); 101 offset, byte_cnt, ip->i_ino);
@@ -135,7 +135,7 @@ xfs_readlink(
135 trace_xfs_readlink(ip); 135 trace_xfs_readlink(ip);
136 136
137 if (XFS_FORCED_SHUTDOWN(mp)) 137 if (XFS_FORCED_SHUTDOWN(mp))
138 return XFS_ERROR(EIO); 138 return -EIO;
139 139
140 xfs_ilock(ip, XFS_ILOCK_SHARED); 140 xfs_ilock(ip, XFS_ILOCK_SHARED);
141 141
@@ -148,7 +148,7 @@ xfs_readlink(
148 __func__, (unsigned long long) ip->i_ino, 148 __func__, (unsigned long long) ip->i_ino,
149 (long long) pathlen); 149 (long long) pathlen);
150 ASSERT(0); 150 ASSERT(0);
151 error = XFS_ERROR(EFSCORRUPTED); 151 error = -EFSCORRUPTED;
152 goto out; 152 goto out;
153 } 153 }
154 154
@@ -203,14 +203,14 @@ xfs_symlink(
203 trace_xfs_symlink(dp, link_name); 203 trace_xfs_symlink(dp, link_name);
204 204
205 if (XFS_FORCED_SHUTDOWN(mp)) 205 if (XFS_FORCED_SHUTDOWN(mp))
206 return XFS_ERROR(EIO); 206 return -EIO;
207 207
208 /* 208 /*
209 * Check component lengths of the target path name. 209 * Check component lengths of the target path name.
210 */ 210 */
211 pathlen = strlen(target_path); 211 pathlen = strlen(target_path);
212 if (pathlen >= MAXPATHLEN) /* total string too long */ 212 if (pathlen >= MAXPATHLEN) /* total string too long */
213 return XFS_ERROR(ENAMETOOLONG); 213 return -ENAMETOOLONG;
214 214
215 udqp = gdqp = NULL; 215 udqp = gdqp = NULL;
216 prid = xfs_get_initial_prid(dp); 216 prid = xfs_get_initial_prid(dp);
@@ -238,7 +238,7 @@ xfs_symlink(
238 fs_blocks = xfs_symlink_blocks(mp, pathlen); 238 fs_blocks = xfs_symlink_blocks(mp, pathlen);
239 resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); 239 resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
240 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0); 240 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0);
241 if (error == ENOSPC && fs_blocks == 0) { 241 if (error == -ENOSPC && fs_blocks == 0) {
242 resblks = 0; 242 resblks = 0;
243 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); 243 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
244 } 244 }
@@ -254,7 +254,7 @@ xfs_symlink(
254 * Check whether the directory allows new symlinks or not. 254 * Check whether the directory allows new symlinks or not.
255 */ 255 */
256 if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { 256 if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
257 error = XFS_ERROR(EPERM); 257 error = -EPERM;
258 goto error_return; 258 goto error_return;
259 } 259 }
260 260
@@ -284,7 +284,7 @@ xfs_symlink(
284 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, 284 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
285 prid, resblks > 0, &ip, NULL); 285 prid, resblks > 0, &ip, NULL);
286 if (error) { 286 if (error) {
287 if (error == ENOSPC) 287 if (error == -ENOSPC)
288 goto error_return; 288 goto error_return;
289 goto error1; 289 goto error1;
290 } 290 }
@@ -348,7 +348,7 @@ xfs_symlink(
348 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 348 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
349 BTOBB(byte_cnt), 0); 349 BTOBB(byte_cnt), 0);
350 if (!bp) { 350 if (!bp) {
351 error = ENOMEM; 351 error = -ENOMEM;
352 goto error2; 352 goto error2;
353 } 353 }
354 bp->b_ops = &xfs_symlink_buf_ops; 354 bp->b_ops = &xfs_symlink_buf_ops;
@@ -489,7 +489,7 @@ xfs_inactive_symlink_rmt(
489 XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), 489 XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
490 XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); 490 XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
491 if (!bp) { 491 if (!bp) {
492 error = ENOMEM; 492 error = -ENOMEM;
493 goto error_bmap_cancel; 493 goto error_bmap_cancel;
494 } 494 }
495 xfs_trans_binval(tp, bp); 495 xfs_trans_binval(tp, bp);
@@ -562,7 +562,7 @@ xfs_inactive_symlink(
562 trace_xfs_inactive_symlink(ip); 562 trace_xfs_inactive_symlink(ip);
563 563
564 if (XFS_FORCED_SHUTDOWN(mp)) 564 if (XFS_FORCED_SHUTDOWN(mp))
565 return XFS_ERROR(EIO); 565 return -EIO;
566 566
567 xfs_ilock(ip, XFS_ILOCK_EXCL); 567 xfs_ilock(ip, XFS_ILOCK_EXCL);
568 568
@@ -580,7 +580,7 @@ xfs_inactive_symlink(
580 __func__, (unsigned long long)ip->i_ino, pathlen); 580 __func__, (unsigned long long)ip->i_ino, pathlen);
581 xfs_iunlock(ip, XFS_ILOCK_EXCL); 581 xfs_iunlock(ip, XFS_ILOCK_EXCL);
582 ASSERT(0); 582 ASSERT(0);
583 return XFS_ERROR(EFSCORRUPTED); 583 return -EFSCORRUPTED;
584 } 584 }
585 585
586 if (ip->i_df.if_flags & XFS_IFINLINE) { 586 if (ip->i_df.if_flags & XFS_IFINLINE) {
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
new file mode 100644
index 000000000000..9835139ce1ec
--- /dev/null
+++ b/fs/xfs/xfs_sysfs.c
@@ -0,0 +1,165 @@
1/*
2 * Copyright (c) 2014 Red Hat, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19#include "xfs.h"
20#include "xfs_sysfs.h"
21#include "xfs_log_format.h"
22#include "xfs_log.h"
23#include "xfs_log_priv.h"
24
25struct xfs_sysfs_attr {
26 struct attribute attr;
27 ssize_t (*show)(char *buf, void *data);
28 ssize_t (*store)(const char *buf, size_t count, void *data);
29};
30
31static inline struct xfs_sysfs_attr *
32to_attr(struct attribute *attr)
33{
34 return container_of(attr, struct xfs_sysfs_attr, attr);
35}
36
37#define XFS_SYSFS_ATTR_RW(name) \
38 static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name)
39#define XFS_SYSFS_ATTR_RO(name) \
40 static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name)
41
42#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
43
44/*
45 * xfs_mount kobject. This currently has no attributes and thus no need for show
46 * and store helpers. The mp kobject serves as the per-mount parent object that
47 * is identified by the fsname under sysfs.
48 */
49
50struct kobj_type xfs_mp_ktype = {
51 .release = xfs_sysfs_release,
52};
53
54/* xlog */
55
56STATIC ssize_t
57log_head_lsn_show(
58 char *buf,
59 void *data)
60{
61 struct xlog *log = data;
62 int cycle;
63 int block;
64
65 spin_lock(&log->l_icloglock);
66 cycle = log->l_curr_cycle;
67 block = log->l_curr_block;
68 spin_unlock(&log->l_icloglock);
69
70 return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
71}
72XFS_SYSFS_ATTR_RO(log_head_lsn);
73
74STATIC ssize_t
75log_tail_lsn_show(
76 char *buf,
77 void *data)
78{
79 struct xlog *log = data;
80 int cycle;
81 int block;
82
83 xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
84 return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
85}
86XFS_SYSFS_ATTR_RO(log_tail_lsn);
87
88STATIC ssize_t
89reserve_grant_head_show(
90 char *buf,
91 void *data)
92{
93 struct xlog *log = data;
94 int cycle;
95 int bytes;
96
97 xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
98 return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
99}
100XFS_SYSFS_ATTR_RO(reserve_grant_head);
101
102STATIC ssize_t
103write_grant_head_show(
104 char *buf,
105 void *data)
106{
107 struct xlog *log = data;
108 int cycle;
109 int bytes;
110
111 xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
112 return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
113}
114XFS_SYSFS_ATTR_RO(write_grant_head);
115
116static struct attribute *xfs_log_attrs[] = {
117 ATTR_LIST(log_head_lsn),
118 ATTR_LIST(log_tail_lsn),
119 ATTR_LIST(reserve_grant_head),
120 ATTR_LIST(write_grant_head),
121 NULL,
122};
123
124static inline struct xlog *
125to_xlog(struct kobject *kobject)
126{
127 struct xfs_kobj *kobj = to_kobj(kobject);
128 return container_of(kobj, struct xlog, l_kobj);
129}
130
131STATIC ssize_t
132xfs_log_show(
133 struct kobject *kobject,
134 struct attribute *attr,
135 char *buf)
136{
137 struct xlog *log = to_xlog(kobject);
138 struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
139
140 return xfs_attr->show ? xfs_attr->show(buf, log) : 0;
141}
142
143STATIC ssize_t
144xfs_log_store(
145 struct kobject *kobject,
146 struct attribute *attr,
147 const char *buf,
148 size_t count)
149{
150 struct xlog *log = to_xlog(kobject);
151 struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
152
153 return xfs_attr->store ? xfs_attr->store(buf, count, log) : 0;
154}
155
156static struct sysfs_ops xfs_log_ops = {
157 .show = xfs_log_show,
158 .store = xfs_log_store,
159};
160
161struct kobj_type xfs_log_ktype = {
162 .release = xfs_sysfs_release,
163 .sysfs_ops = &xfs_log_ops,
164 .default_attrs = xfs_log_attrs,
165};
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
new file mode 100644
index 000000000000..54a2091183c0
--- /dev/null
+++ b/fs/xfs/xfs_sysfs.h
@@ -0,0 +1,59 @@
1/*
2 * Copyright (c) 2014 Red Hat, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19#ifndef __XFS_SYSFS_H__
20#define __XFS_SYSFS_H__
21
22extern struct kobj_type xfs_mp_ktype; /* xfs_mount */
23extern struct kobj_type xfs_log_ktype; /* xlog */
24
25static inline struct xfs_kobj *
26to_kobj(struct kobject *kobject)
27{
28 return container_of(kobject, struct xfs_kobj, kobject);
29}
30
31static inline void
32xfs_sysfs_release(struct kobject *kobject)
33{
34 struct xfs_kobj *kobj = to_kobj(kobject);
35 complete(&kobj->complete);
36}
37
38static inline int
39xfs_sysfs_init(
40 struct xfs_kobj *kobj,
41 struct kobj_type *ktype,
42 struct xfs_kobj *parent_kobj,
43 const char *name)
44{
45 init_completion(&kobj->complete);
46 return kobject_init_and_add(&kobj->kobject, ktype,
47 &parent_kobj->kobject, "%s", name);
48}
49
50static inline void
51xfs_sysfs_del(
52 struct xfs_kobj *kobj)
53{
54 kobject_del(&kobj->kobject);
55 kobject_put(&kobj->kobject);
56 wait_for_completion(&kobj->complete);
57}
58
59#endif /* __XFS_SYSFS_H__ */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d03932564ccb..30e8e3410955 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -190,7 +190,7 @@ xfs_trans_reserve(
190 -((int64_t)blocks), rsvd); 190 -((int64_t)blocks), rsvd);
191 if (error != 0) { 191 if (error != 0) {
192 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 192 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
193 return (XFS_ERROR(ENOSPC)); 193 return -ENOSPC;
194 } 194 }
195 tp->t_blk_res += blocks; 195 tp->t_blk_res += blocks;
196 } 196 }
@@ -241,7 +241,7 @@ xfs_trans_reserve(
241 error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS, 241 error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS,
242 -((int64_t)rtextents), rsvd); 242 -((int64_t)rtextents), rsvd);
243 if (error) { 243 if (error) {
244 error = XFS_ERROR(ENOSPC); 244 error = -ENOSPC;
245 goto undo_log; 245 goto undo_log;
246 } 246 }
247 tp->t_rtx_res += rtextents; 247 tp->t_rtx_res += rtextents;
@@ -874,7 +874,7 @@ xfs_trans_commit(
874 goto out_unreserve; 874 goto out_unreserve;
875 875
876 if (XFS_FORCED_SHUTDOWN(mp)) { 876 if (XFS_FORCED_SHUTDOWN(mp)) {
877 error = XFS_ERROR(EIO); 877 error = -EIO;
878 goto out_unreserve; 878 goto out_unreserve;
879 } 879 }
880 880
@@ -917,7 +917,7 @@ out_unreserve:
917 if (tp->t_ticket) { 917 if (tp->t_ticket) {
918 commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags); 918 commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
919 if (commit_lsn == -1 && !error) 919 if (commit_lsn == -1 && !error)
920 error = XFS_ERROR(EIO); 920 error = -EIO;
921 } 921 }
922 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 922 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
923 xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); 923 xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
@@ -1024,7 +1024,7 @@ xfs_trans_roll(
1024 */ 1024 */
1025 error = xfs_trans_commit(trans, 0); 1025 error = xfs_trans_commit(trans, 0);
1026 if (error) 1026 if (error)
1027 return (error); 1027 return error;
1028 1028
1029 trans = *tpp; 1029 trans = *tpp;
1030 1030
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index cb0f3a84cc68..859482f53b5a 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -762,7 +762,7 @@ xfs_trans_ail_init(
762 762
763 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); 763 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
764 if (!ailp) 764 if (!ailp)
765 return ENOMEM; 765 return -ENOMEM;
766 766
767 ailp->xa_mount = mp; 767 ailp->xa_mount = mp;
768 INIT_LIST_HEAD(&ailp->xa_ail); 768 INIT_LIST_HEAD(&ailp->xa_ail);
@@ -781,7 +781,7 @@ xfs_trans_ail_init(
781 781
782out_free_ailp: 782out_free_ailp:
783 kmem_free(ailp); 783 kmem_free(ailp);
784 return ENOMEM; 784 return -ENOMEM;
785} 785}
786 786
787void 787void
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index b8eef0549f3f..96c898e7ac9a 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -166,7 +166,7 @@ xfs_trans_get_buf_map(
166 ASSERT(atomic_read(&bip->bli_refcount) > 0); 166 ASSERT(atomic_read(&bip->bli_refcount) > 0);
167 bip->bli_recur++; 167 bip->bli_recur++;
168 trace_xfs_trans_get_buf_recur(bip); 168 trace_xfs_trans_get_buf_recur(bip);
169 return (bp); 169 return bp;
170 } 170 }
171 171
172 bp = xfs_buf_get_map(target, map, nmaps, flags); 172 bp = xfs_buf_get_map(target, map, nmaps, flags);
@@ -178,7 +178,7 @@ xfs_trans_get_buf_map(
178 178
179 _xfs_trans_bjoin(tp, bp, 1); 179 _xfs_trans_bjoin(tp, bp, 1);
180 trace_xfs_trans_get_buf(bp->b_fspriv); 180 trace_xfs_trans_get_buf(bp->b_fspriv);
181 return (bp); 181 return bp;
182} 182}
183 183
184/* 184/*
@@ -201,9 +201,8 @@ xfs_trans_getsb(xfs_trans_t *tp,
201 * Default to just trying to lock the superblock buffer 201 * Default to just trying to lock the superblock buffer
202 * if tp is NULL. 202 * if tp is NULL.
203 */ 203 */
204 if (tp == NULL) { 204 if (tp == NULL)
205 return (xfs_getsb(mp, flags)); 205 return xfs_getsb(mp, flags);
206 }
207 206
208 /* 207 /*
209 * If the superblock buffer already has this transaction 208 * If the superblock buffer already has this transaction
@@ -218,7 +217,7 @@ xfs_trans_getsb(xfs_trans_t *tp,
218 ASSERT(atomic_read(&bip->bli_refcount) > 0); 217 ASSERT(atomic_read(&bip->bli_refcount) > 0);
219 bip->bli_recur++; 218 bip->bli_recur++;
220 trace_xfs_trans_getsb_recur(bip); 219 trace_xfs_trans_getsb_recur(bip);
221 return (bp); 220 return bp;
222 } 221 }
223 222
224 bp = xfs_getsb(mp, flags); 223 bp = xfs_getsb(mp, flags);
@@ -227,7 +226,7 @@ xfs_trans_getsb(xfs_trans_t *tp,
227 226
228 _xfs_trans_bjoin(tp, bp, 1); 227 _xfs_trans_bjoin(tp, bp, 1);
229 trace_xfs_trans_getsb(bp->b_fspriv); 228 trace_xfs_trans_getsb(bp->b_fspriv);
230 return (bp); 229 return bp;
231} 230}
232 231
233#ifdef DEBUG 232#ifdef DEBUG
@@ -267,7 +266,7 @@ xfs_trans_read_buf_map(
267 bp = xfs_buf_read_map(target, map, nmaps, flags, ops); 266 bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
268 if (!bp) 267 if (!bp)
269 return (flags & XBF_TRYLOCK) ? 268 return (flags & XBF_TRYLOCK) ?
270 EAGAIN : XFS_ERROR(ENOMEM); 269 -EAGAIN : -ENOMEM;
271 270
272 if (bp->b_error) { 271 if (bp->b_error) {
273 error = bp->b_error; 272 error = bp->b_error;
@@ -277,8 +276,8 @@ xfs_trans_read_buf_map(
277 xfs_buf_relse(bp); 276 xfs_buf_relse(bp);
278 277
279 /* bad CRC means corrupted metadata */ 278 /* bad CRC means corrupted metadata */
280 if (error == EFSBADCRC) 279 if (error == -EFSBADCRC)
281 error = EFSCORRUPTED; 280 error = -EFSCORRUPTED;
282 return error; 281 return error;
283 } 282 }
284#ifdef DEBUG 283#ifdef DEBUG
@@ -287,7 +286,7 @@ xfs_trans_read_buf_map(
287 if (((xfs_req_num++) % xfs_error_mod) == 0) { 286 if (((xfs_req_num++) % xfs_error_mod) == 0) {
288 xfs_buf_relse(bp); 287 xfs_buf_relse(bp);
289 xfs_debug(mp, "Returning error!"); 288 xfs_debug(mp, "Returning error!");
290 return XFS_ERROR(EIO); 289 return -EIO;
291 } 290 }
292 } 291 }
293 } 292 }
@@ -343,8 +342,8 @@ xfs_trans_read_buf_map(
343 xfs_force_shutdown(tp->t_mountp, 342 xfs_force_shutdown(tp->t_mountp,
344 SHUTDOWN_META_IO_ERROR); 343 SHUTDOWN_META_IO_ERROR);
345 /* bad CRC means corrupted metadata */ 344 /* bad CRC means corrupted metadata */
346 if (error == EFSBADCRC) 345 if (error == -EFSBADCRC)
347 error = EFSCORRUPTED; 346 error = -EFSCORRUPTED;
348 return error; 347 return error;
349 } 348 }
350 } 349 }
@@ -355,7 +354,7 @@ xfs_trans_read_buf_map(
355 if (XFS_FORCED_SHUTDOWN(mp)) { 354 if (XFS_FORCED_SHUTDOWN(mp)) {
356 trace_xfs_trans_read_buf_shut(bp, _RET_IP_); 355 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
357 *bpp = NULL; 356 *bpp = NULL;
358 return XFS_ERROR(EIO); 357 return -EIO;
359 } 358 }
360 359
361 360
@@ -372,7 +371,7 @@ xfs_trans_read_buf_map(
372 if (bp == NULL) { 371 if (bp == NULL) {
373 *bpp = NULL; 372 *bpp = NULL;
374 return (flags & XBF_TRYLOCK) ? 373 return (flags & XBF_TRYLOCK) ?
375 0 : XFS_ERROR(ENOMEM); 374 0 : -ENOMEM;
376 } 375 }
377 if (bp->b_error) { 376 if (bp->b_error) {
378 error = bp->b_error; 377 error = bp->b_error;
@@ -384,8 +383,8 @@ xfs_trans_read_buf_map(
384 xfs_buf_relse(bp); 383 xfs_buf_relse(bp);
385 384
386 /* bad CRC means corrupted metadata */ 385 /* bad CRC means corrupted metadata */
387 if (error == EFSBADCRC) 386 if (error == -EFSBADCRC)
388 error = EFSCORRUPTED; 387 error = -EFSCORRUPTED;
389 return error; 388 return error;
390 } 389 }
391#ifdef DEBUG 390#ifdef DEBUG
@@ -396,7 +395,7 @@ xfs_trans_read_buf_map(
396 SHUTDOWN_META_IO_ERROR); 395 SHUTDOWN_META_IO_ERROR);
397 xfs_buf_relse(bp); 396 xfs_buf_relse(bp);
398 xfs_debug(mp, "Returning trans error!"); 397 xfs_debug(mp, "Returning trans error!");
399 return XFS_ERROR(EIO); 398 return -EIO;
400 } 399 }
401 } 400 }
402 } 401 }
@@ -414,7 +413,7 @@ shutdown_abort:
414 trace_xfs_trans_read_buf_shut(bp, _RET_IP_); 413 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
415 xfs_buf_relse(bp); 414 xfs_buf_relse(bp);
416 *bpp = NULL; 415 *bpp = NULL;
417 return XFS_ERROR(EIO); 416 return -EIO;
418} 417}
419 418
420/* 419/*
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 41172861e857..846e061c2e98 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -722,8 +722,8 @@ xfs_trans_dqresv(
722error_return: 722error_return:
723 xfs_dqunlock(dqp); 723 xfs_dqunlock(dqp);
724 if (flags & XFS_QMOPT_ENOSPC) 724 if (flags & XFS_QMOPT_ENOSPC)
725 return ENOSPC; 725 return -ENOSPC;
726 return EDQUOT; 726 return -EDQUOT;
727} 727}
728 728
729 729
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 65c6e6650b1a..b79dc66b2ecd 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -38,43 +38,18 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */
38typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ 38typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
39typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ 39typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
40 40
41/*
42 * These types are 64 bits on disk but are either 32 or 64 bits in memory.
43 * Disk based types:
44 */
45typedef __uint64_t xfs_dfsbno_t; /* blockno in filesystem (agno|agbno) */
46typedef __uint64_t xfs_drfsbno_t; /* blockno in filesystem (raw) */
47typedef __uint64_t xfs_drtbno_t; /* extent (block) in realtime area */
48typedef __uint64_t xfs_dfiloff_t; /* block number in a file */
49typedef __uint64_t xfs_dfilblks_t; /* number of blocks in a file */
50
51/*
52 * Memory based types are conditional.
53 */
54#if XFS_BIG_BLKNOS
55typedef __uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */ 41typedef __uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */
56typedef __uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ 42typedef __uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
57typedef __uint64_t xfs_rtblock_t; /* extent (block) in realtime area */ 43typedef __uint64_t xfs_rtblock_t; /* extent (block) in realtime area */
58typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
59#else
60typedef __uint32_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */
61typedef __uint32_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
62typedef __uint32_t xfs_rtblock_t; /* extent (block) in realtime area */
63typedef __int32_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
64#endif
65typedef __uint64_t xfs_fileoff_t; /* block number in a file */ 44typedef __uint64_t xfs_fileoff_t; /* block number in a file */
66typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
67typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ 45typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */
68 46
47typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
48typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
69 49
70/* 50/*
71 * Null values for the types. 51 * Null values for the types.
72 */ 52 */
73#define NULLDFSBNO ((xfs_dfsbno_t)-1)
74#define NULLDRFSBNO ((xfs_drfsbno_t)-1)
75#define NULLDRTBNO ((xfs_drtbno_t)-1)
76#define NULLDFILOFF ((xfs_dfiloff_t)-1)
77
78#define NULLFSBLOCK ((xfs_fsblock_t)-1) 53#define NULLFSBLOCK ((xfs_fsblock_t)-1)
79#define NULLRFSBLOCK ((xfs_rfsblock_t)-1) 54#define NULLRFSBLOCK ((xfs_rfsblock_t)-1)
80#define NULLRTBLOCK ((xfs_rtblock_t)-1) 55#define NULLRTBLOCK ((xfs_rtblock_t)-1)
diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h
deleted file mode 100644
index e8a77383c0d5..000000000000
--- a/fs/xfs/xfs_vnode.h
+++ /dev/null
@@ -1,46 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_VNODE_H__
19#define __XFS_VNODE_H__
20
21#include "xfs_fs.h"
22
23struct file;
24struct xfs_inode;
25struct attrlist_cursor_kern;
26
27/*
28 * Flags for read/write calls - same values as IRIX
29 */
30#define IO_ISDIRECT 0x00004 /* bypass page cache */
31#define IO_INVIS 0x00020 /* don't update inode timestamps */
32
33#define XFS_IO_FLAGS \
34 { IO_ISDIRECT, "DIRECT" }, \
35 { IO_INVIS, "INVIS"}
36
37/*
38 * Some useful predicates.
39 */
40#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping)
41#define VN_CACHED(vp) (vp->i_mapping->nrpages)
42#define VN_DIRTY(vp) mapping_tagged(vp->i_mapping, \
43 PAGECACHE_TAG_DIRTY)
44
45
46#endif /* __XFS_VNODE_H__ */
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 78ed92a46fdd..93455b998041 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -49,7 +49,7 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
49 value = NULL; 49 value = NULL;
50 } 50 }
51 51
52 error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); 52 error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
53 if (error) 53 if (error)
54 return error; 54 return error;
55 return asize; 55 return asize;
@@ -71,8 +71,8 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
71 xflags |= ATTR_REPLACE; 71 xflags |= ATTR_REPLACE;
72 72
73 if (!value) 73 if (!value)
74 return -xfs_attr_remove(ip, (unsigned char *)name, xflags); 74 return xfs_attr_remove(ip, (unsigned char *)name, xflags);
75 return -xfs_attr_set(ip, (unsigned char *)name, 75 return xfs_attr_set(ip, (unsigned char *)name,
76 (void *)value, size, xflags); 76 (void *)value, size, xflags);
77} 77}
78 78