Merge branch 'cleanups'

Merge cleanups requested by Linus.

* cleanups: (3 commits)
  pnfs: Refactor the *_layout_mark_request_commit to use pnfs_layout_mark_request_commit
  nfs: Can call nfs_clear_page_commit() instead
  nfs: Provide and use helper functions for marking a page as unstable
author: Trond Myklebust <trond.myklebust@primarydata.com> 2015-02-18 10:28:37 -0500
committer: Trond Myklebust <trond.myklebust@primarydata.com> 2015-02-18 10:28:37 -0500
commit: 65d2918e716afb89359cfa59734d76c1ff8700cb (patch)
tree: 4685404f96642243d62c3a1a823340913d087090 /fs
parent: bf40e5561fd288a505d5d8d8bf45eef96fe7253d (diff)
parent: 338d00cfef07d74a072f96821c64b20f98517d72 (diff)
214 files changed, 4768 insertions, 2538 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6894b085f0ee..620d93489539 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -335,7 +335,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
        }
        init_rwsem(&v9ses->rename_sem);
-        rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
+        rc = bdi_setup_and_register(&v9ses->bdi, "9p");
        if (rc) {
                kfree(v9ses->aname);
                kfree(v9ses->uname);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 5594505e6e73..b40133796b87 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -831,7 +831,6 @@ static const struct vm_operations_struct v9fs_file_vm_ops = {
        .fault = filemap_fault,
        .map_pages = filemap_map_pages,
        .page_mkwrite = v9fs_vm_page_mkwrite,
-        .remap_pages = generic_file_remap_pages,
 };
 static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
@@ -839,7 +838,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
        .fault = filemap_fault,
        .map_pages = filemap_map_pages,
        .page_mkwrite = v9fs_vm_page_mkwrite,
-        .remap_pages = generic_file_remap_pages,
 };
diff --git a/fs/Kconfig b/fs/Kconfig
index 664991afe0c0..a6bb530b1ec5 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -165,6 +165,7 @@ config HUGETLB_PAGE
        def_bool HUGETLBFS
 source "fs/configfs/Kconfig"
+source "fs/efivarfs/Kconfig"
 endmenu
@@ -209,7 +210,6 @@ source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
 source "fs/f2fs/Kconfig"
-source "fs/efivarfs/Kconfig"
 endif # MISC_FILESYSTEMS
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 06e14bfb3496..dbc732e9a5c0 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -306,8 +306,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg,
                        _debug("- range %u-%u%s",
                               offset, to, msg->msg_flags ? " [more]" : "");
-                        iov_iter_init(&msg->msg_iter, WRITE,
+                        iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC,
-                                      (struct iovec *) iov, 1, to - offset);
+                                      iov, 1, to - offset);
                        /* have to change the state *before* sending the last
                         * packet as RxRPC might give us the reply before it
@@ -384,7 +384,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
        msg.msg_name            = NULL;
        msg.msg_namelen         = 0;
-        iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)iov, 1,
+        iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1,
                      call->request_size);
        msg.msg_control         = NULL;
        msg.msg_controllen      = 0;
@@ -770,7 +770,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
 void afs_send_empty_reply(struct afs_call *call)
 {
        struct msghdr msg;
-        struct iovec iov[1];
+        struct kvec iov[1];
        _enter("");
@@ -778,7 +778,7 @@ void afs_send_empty_reply(struct afs_call *call)
        iov[0].iov_len          = 0;
        msg.msg_name            = NULL;
        msg.msg_namelen         = 0;
-        iov_iter_init(&msg.msg_iter, WRITE, iov, 0, 0); /* WTF? */
+        iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 0, 0);     /* WTF? */
        msg.msg_control         = NULL;
        msg.msg_controllen      = 0;
        msg.msg_flags           = 0;
@@ -805,7 +805,7 @@ void afs_send_empty_reply(struct afs_call *call)
 void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 {
        struct msghdr msg;
-        struct iovec iov[1];
+        struct kvec iov[1];
        int n;
        _enter("");
@@ -814,7 +814,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
        iov[0].iov_len          = len;
        msg.msg_name            = NULL;
        msg.msg_namelen         = 0;
-        iov_iter_init(&msg.msg_iter, WRITE, iov, 1, len);
+        iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, len);
        msg.msg_control         = NULL;
        msg.msg_controllen      = 0;
        msg.msg_flags           = 0;
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 2b607257820c..d142a2449e65 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,7 +106,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
        volume->cell            = params->cell;
        volume->vid             = vlocation->vldb.vid[params->type];
-        ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY);
+        ret = bdi_setup_and_register(&volume->bdi, "afs");
        if (ret)
                goto error_bdi;
diff --git a/fs/aio.c b/fs/aio.c
index 1b7893ecc296..118a2e0088d8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -165,15 +165,6 @@ static struct vfsmount *aio_mnt;
 static const struct file_operations aio_ring_fops;
 static const struct address_space_operations aio_ctx_aops;
-/* Backing dev info for aio fs.
- * -no dirty page accounting or writeback happens
- */
-static struct backing_dev_info aio_fs_backing_dev_info = {
-        .name           = "aiofs",
-        .state          = 0,
-        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,
-};
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
        struct qstr this = QSTR_INIT("[aio]", 5);
@@ -185,7 +176,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
        inode->i_mapping->a_ops = &aio_ctx_aops;
        inode->i_mapping->private_data = ctx;
-        inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info;
        inode->i_size = PAGE_SIZE * nr_pages;
        path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
@@ -230,9 +220,6 @@ static int __init aio_setup(void)
        if (IS_ERR(aio_mnt))
                panic("Failed to create aio fs mount.");
-        if (bdi_init(&aio_fs_backing_dev_info))
-                panic("Failed to init aio fs backing dev info.");
        kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
        kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
@@ -1140,6 +1127,13 @@ static long aio_read_events_ring(struct kioctx *ctx,
        long ret = 0;
        int copy_ret;
+        /*
+         * The mutex can block and wake us up and that will cause
+         * wait_event_interruptible_hrtimeout() to schedule without sleeping
+         * and repeat. This should be rare enough that it doesn't cause
+         * performance issues. See the comment in read_events() for more detail.
+         */
+        sched_annotate_sleep();
        mutex_lock(&ctx->ring_lock);
        /* Access to ->ring_pages here is protected by ctx->ring_lock. */
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b48c41bf0f86..a9f92794d7a0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -49,23 +49,15 @@ inline struct block_device *I_BDEV(struct inode *inode)
 }
 EXPORT_SYMBOL(I_BDEV);
-/*
+static void bdev_write_inode(struct inode *inode)
- * Move the inode from its current bdi to a new bdi.  Make sure the inode
- * is clean before moving so that it doesn't linger on the old bdi.
- */
-static void bdev_inode_switch_bdi(struct inode *inode,
-                        struct backing_dev_info *dst)
 {
-        while (true) {
+        spin_lock(&inode->i_lock);
-                spin_lock(&inode->i_lock);
+        while (inode->i_state & I_DIRTY) {
-                if (!(inode->i_state & I_DIRTY)) {
-                        inode->i_data.backing_dev_info = dst;
-                        spin_unlock(&inode->i_lock);
-                        return;
-                }
                spin_unlock(&inode->i_lock);
                WARN_ON_ONCE(write_inode_now(inode, true));
+                spin_lock(&inode->i_lock);
        }
+        spin_unlock(&inode->i_lock);
 }
 /* Kill _all_ buffers and pagecache , dirty or not.. */
@@ -584,7 +576,6 @@ struct block_device *bdget(dev_t dev)
                inode->i_bdev = bdev;
                inode->i_data.a_ops = &def_blk_aops;
                mapping_set_gfp_mask(&inode->i_data, GFP_USER);
-                inode->i_data.backing_dev_info = &default_backing_dev_info;
                spin_lock(&bdev_lock);
                list_add(&bdev->bd_list, &all_bdevs);
                spin_unlock(&bdev_lock);
@@ -1145,8 +1136,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                bdev->bd_queue = disk->queue;
                bdev->bd_contains = bdev;
                if (!partno) {
-                        struct backing_dev_info *bdi;
                        ret = -ENXIO;
                        bdev->bd_part = disk_get_part(disk, partno);
                        if (!bdev->bd_part)
@@ -1172,11 +1161,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                                }
                        }
-                        if (!ret) {
+                        if (!ret)
                                bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
-                                bdi = blk_get_backing_dev_info(bdev);
-                                bdev_inode_switch_bdi(bdev->bd_inode, bdi);
-                        }
                        /*
                         * If the device is invalidated, rescan partition
@@ -1203,8 +1189,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                        if (ret)
                                goto out_clear;
                        bdev->bd_contains = whole;
-                        bdev_inode_switch_bdi(bdev->bd_inode,
-                                whole->bd_inode->i_data.backing_dev_info);
                        bdev->bd_part = disk_get_part(disk, partno);
                        if (!(disk->flags & GENHD_FL_UP) ||
                            !bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1244,7 +1228,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
        bdev->bd_disk = NULL;
        bdev->bd_part = NULL;
        bdev->bd_queue = NULL;
-        bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
        if (bdev != bdev->bd_contains)
                __blkdev_put(bdev->bd_contains, mode, 1);
        bdev->bd_contains = NULL;
@@ -1464,11 +1447,11 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
                WARN_ON_ONCE(bdev->bd_holders);
                sync_blockdev(bdev);
                kill_bdev(bdev);
-                /* ->release can cause the old bdi to disappear,
+                /*
-                 * so must switch it out first
+                 * ->release can cause the queue to disappear, so flush all
+                 * dirty data before.
                 */
-                bdev_inode_switch_bdi(bdev->bd_inode,
+                bdev_write_inode(bdev->bd_inode);
-                                        &default_backing_dev_info);
        }
        if (bdev->bd_contains == bdev) {
                if (disk->fops->release)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index a66768ebc8d1..80e9c18ea64f 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -8,6 +8,7 @@ config BTRFS_FS
        select LZO_DECOMPRESS
        select RAID6_PQ
        select XOR_BLOCKS
+        select SRCU
        help
          Btrfs is a general purpose copy-on-write filesystem with extents,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7e607416755a..0b180708bf79 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1171,6 +1171,7 @@ struct btrfs_space_info {
        struct percpu_counter total_bytes_pinned;
        struct list_head list;
+        /* Protected by the spinlock 'lock'. */
        struct list_head ro_bgs;
        struct rw_semaphore groups_sem;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8c63419a7f70..1afb18226da8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1715,12 +1715,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
        int err;
-        bdi->capabilities = BDI_CAP_MAP_COPY;
+        err = bdi_setup_and_register(bdi, "btrfs");
-        err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
        if (err)
                return err;
-        bdi->ra_pages   = default_backing_dev_info.ra_pages;
+        bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
        bdi->congested_fn       = btrfs_congested_fn;
        bdi->congested_data     = info;
        return 0;
@@ -2319,7 +2318,6 @@ int open_ctree(struct super_block *sb,
         */
        fs_info->btree_inode->i_size = OFFSET_MAX;
        fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
-        fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
        RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
        extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 15116585e714..a684086c3c81 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9422,7 +9422,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
         * are still on the list after taking the semaphore
         */
        list_del_init(&block_group->list);
-        list_del_init(&block_group->ro_list);
        if (list_empty(&block_group->space_info->block_groups[index])) {
                kobj = block_group->space_info->block_group_kobjs[index];
                block_group->space_info->block_group_kobjs[index] = NULL;
@@ -9464,6 +9463,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        btrfs_remove_free_space_cache(block_group);
        spin_lock(&block_group->space_info->lock);
+        list_del_init(&block_group->ro_list);
        block_group->space_info->total_bytes -= block_group->key.offset;
        block_group->space_info->bytes_readonly -= block_group->key.offset;
        block_group->space_info->disk_total -= block_group->key.offset * factor;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4ebabd237153..c73df6a7c9b6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1407,8 +1407,8 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
        while (index <= end_index) {
                page = find_get_page(inode->i_mapping, index);
                BUG_ON(!page); /* Pages should be in the extent_io_tree */
-                account_page_redirty(page);
                __set_page_dirty_nobuffers(page);
+                account_page_redirty(page);
                page_cache_release(page);
                index++;
        }
@@ -2190,7 +2190,7 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
                next = next_state(state);
-                failrec = (struct io_failure_record *)state->private;
+                failrec = (struct io_failure_record *)(unsigned long)state->private;
                free_extent_state(state);
                kfree(failrec);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4090259569b..b78bbbac900d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1746,7 +1746,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
        mutex_lock(&inode->i_mutex);
-        current->backing_dev_info = inode->i_mapping->backing_dev_info;
+        current->backing_dev_info = inode_to_bdi(inode);
        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err) {
                mutex_unlock(&inode->i_mutex);
@@ -2081,7 +2081,6 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = btrfs_page_mkwrite,
-        .remap_pages    = generic_file_remap_pages,
 };
 static int btrfs_file_mmap(struct file  *filp, struct vm_area_struct *vma)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8bf326affb94..54bcf639d1cf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3608,7 +3608,6 @@ cache_acl:
        switch (inode->i_mode & S_IFMT) {
        case S_IFREG:
                inode->i_mapping->a_ops = &btrfs_aops;
-                inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
                inode->i_fop = &btrfs_file_operations;
                inode->i_op = &btrfs_file_inode_operations;
@@ -3623,7 +3622,6 @@ cache_acl:
        case S_IFLNK:
                inode->i_op = &btrfs_symlink_inode_operations;
                inode->i_mapping->a_ops = &btrfs_symlink_aops;
-                inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
                break;
        default:
                inode->i_op = &btrfs_special_inode_operations;
@@ -6088,7 +6086,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        inode->i_fop = &btrfs_file_operations;
        inode->i_op = &btrfs_file_inode_operations;
        inode->i_mapping->a_ops = &btrfs_aops;
-        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
        if (err)
@@ -9203,7 +9200,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        inode->i_fop = &btrfs_file_operations;
        inode->i_op = &btrfs_file_inode_operations;
        inode->i_mapping->a_ops = &btrfs_aops;
-        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
@@ -9247,7 +9243,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        inode->i_op = &btrfs_symlink_inode_operations;
        inode->i_mapping->a_ops = &btrfs_symlink_aops;
-        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
        inode_set_bytes(inode, name_len);
        btrfs_i_size_write(inode, name_len);
        err = btrfs_update_inode(trans, root, inode);
@@ -9459,7 +9454,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
        inode->i_op = &btrfs_file_inode_operations;
        inode->i_mapping->a_ops = &btrfs_aops;
-        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
        ret = btrfs_init_inode_security(trans, inode, dir, NULL);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 9e1569ffbf6e..e427cb7ee12c 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3053,7 +3053,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
        ppath = btrfs_alloc_path();
        if (!ppath) {
-                btrfs_free_path(ppath);
+                btrfs_free_path(path);
                return -ENOMEM;
        }
@@ -3065,6 +3065,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
        path->search_commit_root = 1;
        path->skip_locking = 1;
+        ppath->search_commit_root = 1;
+        ppath->skip_locking = 1;
        /*
         * trigger the readahead for extent tree csum tree and wait for
         * completion. During readahead, the scrub is officially paused
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 60f7cbe815e9..6f49b2872a64 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1000,10 +1000,20 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
                         */
                        if (fs_info->pending_changes == 0)
                                return 0;
+                        /*
+                         * A non-blocking test if the fs is frozen. We must not
+                         * start a new transaction here otherwise a deadlock
+                         * happens. The pending operations are delayed to the
+                         * next commit after thawing.
+                         */
+                        if (__sb_start_write(sb, SB_FREEZE_WRITE, false))
+                                __sb_end_write(sb, SB_FREEZE_WRITE);
+                        else
+                                return 0;
                        trans = btrfs_start_transaction(root, 0);
-                } else {
-                        return PTR_ERR(trans);
                }
+                if (IS_ERR(trans))
+                        return PTR_ERR(trans);
        }
        return btrfs_commit_transaction(trans, root);
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a605d4e2f2bc..e88b59d13439 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2118,7 +2118,7 @@ void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
        unsigned long prev;
        unsigned long bit;
-        prev = cmpxchg(&fs_info->pending_changes, 0, 0);
+        prev = xchg(&fs_info->pending_changes, 0);
        if (!prev)
                return;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9a02da16f2be..1a9585d4380a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2591,6 +2591,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        }
        if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
+                blk_finish_plug(&plug);
                mutex_unlock(&log_root_tree->log_mutex);
                ret = root_log_ctx.log_ret;
                goto out;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index c81c0e004588..24be059fd1f8 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1569,7 +1569,6 @@ out:
 static struct vm_operations_struct ceph_vmops = {
        .fault          = ceph_filemap_fault,
        .page_mkwrite   = ceph_page_mkwrite,
-        .remap_pages    = generic_file_remap_pages,
 };
 int ceph_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ce74b394b49d..905986dd4c3c 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -945,7 +945,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
        mutex_lock(&inode->i_mutex);
        /* We can write back this queue in page reclaim */
-        current->backing_dev_info = file->f_mapping->backing_dev_info;
+        current->backing_dev_info = inode_to_bdi(inode);
        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index f61a74115beb..6b5173605154 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -783,8 +783,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
        }
        inode->i_mapping->a_ops = &ceph_aops;
-        inode->i_mapping->backing_dev_info =
-                &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
        switch (inode->i_mode & S_IFMT) {
        case S_IFIFO:
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index c35c5c614e38..06ea5cd05cd9 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -239,23 +239,21 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
        return err;
 }
-/**
+/*
- * Must be called with lock_flocks() already held. Fills in the passed
+ * Fills in the passed counter variables, so you can prepare pagelist metadata
- * counter variables, so you can prepare pagelist metadata before calling
+ * before calling ceph_encode_locks.
- * ceph_encode_locks.
 */
 void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
 {
-        struct file_lock *lock;
+        struct file_lock_context *ctx;
        *fcntl_count = 0;
        *flock_count = 0;
-        for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
+        ctx = inode->i_flctx;
-                if (lock->fl_flags & FL_POSIX)
+        if (ctx) {
-                        ++(*fcntl_count);
+                *fcntl_count = ctx->flc_posix_cnt;
-                else if (lock->fl_flags & FL_FLOCK)
+                *flock_count = ctx->flc_flock_cnt;
-                        ++(*flock_count);
        }
        dout("counted %d flock locks and %d fcntl locks",
             *flock_count, *fcntl_count);
@@ -271,6 +269,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
                                int num_fcntl_locks, int num_flock_locks)
 {
        struct file_lock *lock;
+        struct file_lock_context *ctx = inode->i_flctx;
        int err = 0;
        int seen_fcntl = 0;
        int seen_flock = 0;
@@ -279,33 +278,34 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
        dout("encoding %d flock and %d fcntl locks", num_flock_locks,
             num_fcntl_locks);
-        for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
+        if (!ctx)
-                if (lock->fl_flags & FL_POSIX) {
+                return 0;
-                        ++seen_fcntl;
-                        if (seen_fcntl > num_fcntl_locks) {
+        spin_lock(&ctx->flc_lock);
-                                err = -ENOSPC;
+        list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
-                                goto fail;
+                ++seen_fcntl;
-                        }
+                if (seen_fcntl > num_fcntl_locks) {
-                        err = lock_to_ceph_filelock(lock, &flocks[l]);
+                        err = -ENOSPC;
-                        if (err)
+                        goto fail;
-                                goto fail;
-                        ++l;
                }
+                err = lock_to_ceph_filelock(lock, &flocks[l]);
+                if (err)
+                        goto fail;
+                ++l;
        }
-        for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
+        list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
-                if (lock->fl_flags & FL_FLOCK) {
+                ++seen_flock;
-                        ++seen_flock;
+                if (seen_flock > num_flock_locks) {
-                        if (seen_flock > num_flock_locks) {
+                        err = -ENOSPC;
-                                err = -ENOSPC;
+                        goto fail;
-                                goto fail;
-                        }
-                        err = lock_to_ceph_filelock(lock, &flocks[l]);
-                        if (err)
-                                goto fail;
-                        ++l;
                }
+                err = lock_to_ceph_filelock(lock, &flocks[l]);
+                if (err)
+                        goto fail;
+                ++l;
        }
 fail:
+        spin_unlock(&ctx->flc_lock);
        return err;
 }
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d2171f4a6980..5f62fb7a5d0a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2700,20 +2700,16 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
                struct ceph_filelock *flocks;
 encode_again:
-                spin_lock(&inode->i_lock);
                ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
-                spin_unlock(&inode->i_lock);
                flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
                                 sizeof(struct ceph_filelock), GFP_NOFS);
                if (!flocks) {
                        err = -ENOMEM;
                        goto out_free;
                }
-                spin_lock(&inode->i_lock);
                err = ceph_encode_locks_to_buffer(inode, flocks,
                                                  num_fcntl_locks,
                                                  num_flock_locks);
-                spin_unlock(&inode->i_lock);
                if (err) {
                        kfree(flocks);
                        if (err == -ENOSPC)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 50f06cddc94b..5ae62587a71d 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -40,17 +40,6 @@ static void ceph_put_super(struct super_block *s)
        dout("put_super\n");
        ceph_mdsc_close_sessions(fsc->mdsc);
-        /*
-         * ensure we release the bdi before put_anon_super releases
-         * the device name.
-         */
-        if (s->s_bdi == &fsc->backing_dev_info) {
-                bdi_unregister(&fsc->backing_dev_info);
-                s->s_bdi = NULL;
-        }
-        return;
 }
 static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -910,7 +899,7 @@ static int ceph_register_bdi(struct super_block *sb,
                        >> PAGE_SHIFT;
        else
                fsc->backing_dev_info.ra_pages =
-                        default_backing_dev_info.ra_pages;
+                        VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
        err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
                           atomic_long_inc_return(&bdi_seq));
@@ -1002,11 +991,16 @@ out_final:
 static void ceph_kill_sb(struct super_block *s)
 {
        struct ceph_fs_client *fsc = ceph_sb_to_client(s);
+        dev_t dev = s->s_dev;
        dout("kill_sb %p\n", s);
        ceph_mdsc_pre_umount(fsc->mdsc);
-        kill_anon_super(s);    /* will call put_super after sb is r/o */
+        generic_shutdown_super(s);
        ceph_mdsc_destroy(fsc);
        destroy_fs_client(fsc);
+        free_anon_bdev(dev);
 }
 static struct file_system_type ceph_fs_type = {
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 67b2007f10fe..ea06a3d0364c 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -24,27 +24,6 @@
 #include "internal.h"
-/*
- * capabilities for /dev/mem, /dev/kmem and similar directly mappable character
- * devices
- * - permits shared-mmap for read, write and/or exec
- * - does not permit private mmap in NOMMU mode (can't do COW)
- * - no readahead or I/O queue unplugging required
- */
-struct backing_dev_info directly_mappable_cdev_bdi = {
-        .name = "char",
-        .capabilities   = (
-#ifdef CONFIG_MMU
-                /* permit private copies of the data to be taken */
-                BDI_CAP_MAP_COPY |
-#endif
-                /* permit direct mmap, for read, write or exec */
-                BDI_CAP_MAP_DIRECT |
-                BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP |
-                /* no writeback happens */
-                BDI_CAP_NO_ACCT_AND_WRITEBACK),
-};
 static struct kobj_map *cdev_map;
 static DEFINE_MUTEX(chrdevs_lock);
@@ -575,8 +554,6 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data)
 void __init chrdev_init(void)
 {
        cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
-        if (bdi_init(&directly_mappable_cdev_bdi))
-                panic("Failed to init directly mappable cdev bdi");
 }
@@ -590,4 +567,3 @@ EXPORT_SYMBOL(cdev_del);
 EXPORT_SYMBOL(cdev_add);
 EXPORT_SYMBOL(__register_chrdev);
 EXPORT_SYMBOL(__unregister_chrdev);
-EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 9c56ef776407..7febcf2475c5 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -606,9 +606,11 @@ cifs_security_flags_handle_must_flags(unsigned int *flags)
                *flags = CIFSSEC_MUST_NTLMV2;
        else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM)
                *flags = CIFSSEC_MUST_NTLM;
-        else if ((*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN)
+        else if (CIFSSEC_MUST_LANMAN &&
+                 (*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN)
                *flags = CIFSSEC_MUST_LANMAN;
-        else if ((*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT)
+        else if (CIFSSEC_MUST_PLNTXT &&
+                 (*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT)
                *flags = CIFSSEC_MUST_PLNTXT;
        *flags |= signflags;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2a772da16b83..d3aa999ab785 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3446,7 +3446,7 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
        int referral_walks_count = 0;
 #endif
-        rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
+        rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs");
        if (rc)
                return rc;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 96b7e9b7706d..8fe1f7a21b3e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -366,6 +366,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
        struct cifsLockInfo *li, *tmp;
        struct cifs_fid fid;
        struct cifs_pending_open open;
+        bool oplock_break_cancelled;
        spin_lock(&cifs_file_list_lock);
        if (--cifs_file->count > 0) {
@@ -397,7 +398,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
        }
        spin_unlock(&cifs_file_list_lock);
-        cancel_work_sync(&cifs_file->oplock_break);
+        oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break);
        if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
                struct TCP_Server_Info *server = tcon->ses->server;
@@ -409,6 +410,9 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
                _free_xid(xid);
        }
+        if (oplock_break_cancelled)
+                cifs_done_oplock_break(cifsi);
        cifs_del_pending_open(&open);
        /*
@@ -1109,11 +1113,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
        return rc;
 }
-/* copied from fs/locks.c with a name change */
-#define cifs_for_each_lock(inode, lockp) \
-        for (lockp = &inode->i_flock; *lockp != NULL; \
-             lockp = &(*lockp)->fl_next)
 struct lock_to_push {
        struct list_head llist;
        __u64 offset;
@@ -1128,8 +1127,9 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 {
        struct inode *inode = cfile->dentry->d_inode;
        struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
-        struct file_lock *flock, **before;
+        struct file_lock *flock;
-        unsigned int count = 0, i = 0;
+        struct file_lock_context *flctx = inode->i_flctx;
+        unsigned int i;
        int rc = 0, xid, type;
        struct list_head locks_to_send, *el;
        struct lock_to_push *lck, *tmp;
@@ -1137,21 +1137,17 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
        xid = get_xid();
-        spin_lock(&inode->i_lock);
+        if (!flctx)
-        cifs_for_each_lock(inode, before) {
+                goto out;
-                if ((*before)->fl_flags & FL_POSIX)
-                        count++;
-        }
-        spin_unlock(&inode->i_lock);
        INIT_LIST_HEAD(&locks_to_send);
        /*
-         * Allocating count locks is enough because no FL_POSIX locks can be
+         * Allocating flc_posix_cnt locks is enough because no FL_POSIX locks
-         * added to the list while we are holding cinode->lock_sem that
+         * can be added to the list while we are holding cinode->lock_sem that
         * protects locking operations of this inode.
         */
-        for (; i < count; i++) {
+        for (i = 0; i < flctx->flc_posix_cnt; i++) {
                lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL);
                if (!lck) {
                        rc = -ENOMEM;
@@ -1161,11 +1157,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
        }
        el = locks_to_send.next;
-        spin_lock(&inode->i_lock);
+        spin_lock(&flctx->flc_lock);
-        cifs_for_each_lock(inode, before) {
+        list_for_each_entry(flock, &flctx->flc_posix, fl_list) {
-                flock = *before;
-                if ((flock->fl_flags & FL_POSIX) == 0)
-                        continue;
                if (el == &locks_to_send) {
                        /*
                         * The list ended. We don't have enough allocated
@@ -1185,9 +1178,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
                lck->length = length;
                lck->type = type;
                lck->offset = flock->fl_start;
-                el = el->next;
        }
-        spin_unlock(&inode->i_lock);
+        spin_unlock(&flctx->flc_lock);
        list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
                int stored_rc;
@@ -3244,7 +3236,6 @@ static struct vm_operations_struct cifs_file_vm_ops = {
        .fault = filemap_fault,
        .map_pages = filemap_map_pages,
        .page_mkwrite = cifs_page_mkwrite,
-        .remap_pages = generic_file_remap_pages,
 };
 int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 0c3ce464cae4..2d4f37235ed0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -937,8 +937,6 @@ retry_iget5_locked:
                        inode->i_flags |= S_NOATIME | S_NOCMTIME;
                if (inode->i_state & I_NEW) {
                        inode->i_ino = hash;
-                        if (S_ISREG(inode->i_mode))
-                                inode->i_data.backing_dev_info = sb->s_bdi;
 #ifdef CONFIG_CIFS_FSCACHE
                        /* initialize per-inode cache cookie pointer */
                        CIFS_I(inode)->fscache = NULL;
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 45cb59bcc791..8b7898b7670f 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -86,21 +86,16 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
        }
        src_inode = file_inode(src_file.file);
+        rc = -EINVAL;
+        if (S_ISDIR(src_inode->i_mode))
+                goto out_fput;
        /*
         * Note: cifs case is easier than btrfs since server responsible for
         * checks for proper open modes and file type and if it wants
         * server could even support copy of range where source = target
         */
+        lock_two_nondirectories(target_inode, src_inode);
-        /* so we do not deadlock racing two ioctls on same files */
-        if (target_inode < src_inode) {
-                mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_PARENT);
-                mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
-        } else {
-                mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
-                mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_CHILD);
-        }
        /* determine range to clone */
        rc = -EINVAL;
@@ -124,13 +119,7 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
 out_unlock:
        /* although unlocking in the reverse order from locking is not
           strictly necessary here it is a little cleaner to be consistent */
-        if (target_inode < src_inode) {
+        unlock_two_nondirectories(src_inode, target_inode);
-                mutex_unlock(&src_inode->i_mutex);
-                mutex_unlock(&target_inode->i_mutex);
-        } else {
-                mutex_unlock(&target_inode->i_mutex);
-                mutex_unlock(&src_inode->i_mutex);
-        }
 out_fput:
        fdput(src_file);
 out_drop_write:
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 6c1566366a66..a4232ec4f2ba 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -221,7 +221,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16,
        }
        rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16));
-        memset(wpwd, 0, 129 * sizeof(__le16));
+        memzero_explicit(wpwd, sizeof(wpwd));
        return rc;
 }
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index b945410bfcd5..82ec68b59208 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -183,7 +183,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
                goto unlock_out;
        }
-        error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
+        error = bdi_setup_and_register(&vc->bdi, "coda");
        if (error)
                goto unlock_out;
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index bd4a3c167091..a315677e44d3 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -70,8 +70,6 @@ extern int configfs_is_root(struct config_item *item);
 extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *);
 extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *));
-extern int configfs_inode_init(void);
-extern void configfs_inode_exit(void);
 extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
 extern int configfs_make_dirent(struct configfs_dirent *,
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 5946ad98053f..65af86147154 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -50,12 +50,6 @@ static const struct address_space_operations configfs_aops = {
        .write_end      = simple_write_end,
 };
-static struct backing_dev_info configfs_backing_dev_info = {
-        .name           = "configfs",
-        .ra_pages       = 0,    /* No readahead */
-        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
 static const struct inode_operations configfs_inode_operations ={
        .setattr        = configfs_setattr,
 };
@@ -137,7 +131,6 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd,
        if (inode) {
                inode->i_ino = get_next_ino();
                inode->i_mapping->a_ops = &configfs_aops;
-                inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
                inode->i_op = &configfs_inode_operations;
                if (sd->s_iattr) {
@@ -283,13 +276,3 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
        }
        mutex_unlock(&dir->d_inode->i_mutex);
 }
-int __init configfs_inode_init(void)
-{
-        return bdi_init(&configfs_backing_dev_info);
-}
-void configfs_inode_exit(void)
-{
-        bdi_destroy(&configfs_backing_dev_info);
-}
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index f6c285833390..da94e41bdbf6 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -145,19 +145,13 @@ static int __init configfs_init(void)
        if (!config_kobj)
                goto out2;
-        err = configfs_inode_init();
-        if (err)
-                goto out3;
        err = register_filesystem(&configfs_fs_type);
        if (err)
-                goto out4;
+                goto out3;
        return 0;
-out4:
-        pr_err("Unable to register filesystem!\n");
-        configfs_inode_exit();
 out3:
+        pr_err("Unable to register filesystem!\n");
        kobject_put(config_kobj);
 out2:
        kmem_cache_destroy(configfs_dir_cachep);
@@ -172,7 +166,6 @@ static void __exit configfs_exit(void)
        kobject_put(config_kobj);
        kmem_cache_destroy(configfs_dir_cachep);
        configfs_dir_cachep = NULL;
-        configfs_inode_exit();
 }
 MODULE_AUTHOR("Oracle");
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index e7cfbaf8d0e2..1e6e227134d7 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -56,13 +56,8 @@ static int send_data(struct sk_buff *skb)
 {
        struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
        void *data = genlmsg_data(genlhdr);
-        int rv;
-        rv = genlmsg_end(skb, data);
+        genlmsg_end(skb, data);
-        if (rv < 0) {
-                nlmsg_free(skb);
-                return rv;
-        }
        return genlmsg_unicast(&init_net, skb, listener_nlportid);
 }
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 1686dc2da9fd..34b36a504059 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -67,7 +67,6 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque)
        inode->i_ino = lower_inode->i_ino;
        inode->i_version++;
        inode->i_mapping->a_ops = &ecryptfs_aops;
-        inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
        if (S_ISLNK(inode->i_mode))
                inode->i_op = &ecryptfs_symlink_iops;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index d9eb84bda559..1895d60f4122 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -520,7 +520,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
                goto out;
        }
-        rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
+        rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs");
        if (rc)
                goto out1;
diff --git a/fs/efivarfs/Kconfig b/fs/efivarfs/Kconfig
index 367bbb10c543..c2499ef174a2 100644
--- a/fs/efivarfs/Kconfig
+++ b/fs/efivarfs/Kconfig
@@ -1,6 +1,7 @@
 config EFIVAR_FS
        tristate "EFI Variable filesystem"
        depends on EFI
+        default m
        help
          efivarfs is a replacement filesystem for the old EFI
          variable support via sysfs, as it doesn't suffer from the
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 6dad1176ec52..ddbce42548c9 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -140,7 +140,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
        name[len] = '-';
-        efi_guid_unparse(&entry->var.VendorGuid, name + len + 1);
+        efi_guid_to_str(&entry->var.VendorGuid, name + len + 1);
        name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f1d3d4eb8c4f..6fc91df99ff8 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1214,7 +1214,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
                memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
        }
-        inode->i_mapping->backing_dev_info = sb->s_bdi;
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &exofs_file_inode_operations;
                inode->i_fop = &exofs_file_operations;
@@ -1314,7 +1313,6 @@ struct inode *exofs_new_inode(struct inode *dir, umode_t mode)
        set_obj_2bcreated(oi);
-        inode->i_mapping->backing_dev_info = sb->s_bdi;
        inode_init_owner(inode, dir, mode);
        inode->i_ino = sbi->s_nextid++;
        inode->i_blkbits = EXOFS_BLKSHIFT;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 95965503afcb..fcc2e565f540 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -836,7 +836,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
                goto free_sbi;
        }
-        ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
+        ret = bdi_setup_and_register(&sbi->bdi, "exofs");
        if (ret) {
                EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
                dput(sb->s_root);
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 7d66fb0e4cca..6c14bb8322fa 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -170,7 +170,7 @@ static void ext2_preread_inode(struct inode *inode)
        struct ext2_group_desc * gdp;
        struct backing_dev_info *bdi;
-        bdi = inode->i_mapping->backing_dev_info;
+        bdi = inode_to_bdi(inode);
        if (bdi_read_congested(bdi))
                return;
        if (bdi_write_congested(bdi))
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9b4e7d750d4f..d4dbf3c259b3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -466,6 +466,8 @@ static void ext3_put_super (struct super_block * sb)
        }
        sb->s_fs_info = NULL;
        kfree(sbi->s_blockgroup_lock);
+        mutex_destroy(&sbi->s_orphan_lock);
+        mutex_destroy(&sbi->s_resize_lock);
        kfree(sbi);
 }
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8131be8c0af3..7cb592386121 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -195,7 +195,6 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = ext4_page_mkwrite,
-        .remap_pages    = generic_file_remap_pages,
 };
 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 74c5f53595fb..64c39c7c594f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -334,7 +334,7 @@ static void save_error_info(struct super_block *sb, const char *func,
 static int block_device_ejected(struct super_block *sb)
 {
        struct inode *bd_inode = sb->s_bdev->bd_inode;
-        struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info;
+        struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
        return bdi->dev == NULL;
 }
@@ -1046,10 +1046,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
 static int ext4_write_info(struct super_block *sb, int type);
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
                         struct path *path);
-static int ext4_quota_on_sysfile(struct super_block *sb, int type,
-                                 int format_id);
 static int ext4_quota_off(struct super_block *sb, int type);
-static int ext4_quota_off_sysfile(struct super_block *sb, int type);
 static int ext4_quota_on_mount(struct super_block *sb, int type);
 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
                               size_t len, loff_t off);
@@ -1084,16 +1081,6 @@ static const struct quotactl_ops ext4_qctl_operations = {
        .get_dqblk      = dquot_get_dqblk,
        .set_dqblk      = dquot_set_dqblk
 };
-static const struct quotactl_ops ext4_qctl_sysfile_operations = {
-        .quota_on_meta  = ext4_quota_on_sysfile,
-        .quota_off      = ext4_quota_off_sysfile,
-        .quota_sync     = dquot_quota_sync,
-        .get_info       = dquot_get_dqinfo,
-        .set_info       = dquot_set_dqinfo,
-        .get_dqblk      = dquot_get_dqblk,
-        .set_dqblk      = dquot_set_dqblk
-};
 #endif
 static const struct super_operations ext4_sops = {
@@ -3935,7 +3922,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 #ifdef CONFIG_QUOTA
        sb->dq_op = &ext4_quota_operations;
        if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
-                sb->s_qcop = &ext4_qctl_sysfile_operations;
+                sb->s_qcop = &dquot_quotactl_sysfile_ops;
        else
                sb->s_qcop = &ext4_qctl_operations;
        sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
@@ -5288,21 +5275,6 @@ static int ext4_enable_quotas(struct super_block *sb)
        return 0;
 }
-/*
- * quota_on function that is used when QUOTA feature is set.
- */
-static int ext4_quota_on_sysfile(struct super_block *sb, int type,
-                                 int format_id)
-{
-        if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
-                return -EINVAL;
-        /*
-         * USAGE was enabled at mount time. Only need to enable LIMITS now.
-         */
-        return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED);
-}
 static int ext4_quota_off(struct super_block *sb, int type)
 {
        struct inode *inode = sb_dqopt(sb)->files[type];
@@ -5329,18 +5301,6 @@ out:
        return dquot_quota_off(sb, type);
 }
-/*
- * quota_off function that is used when QUOTA feature is set.
- */
-static int ext4_quota_off_sysfile(struct super_block *sb, int type)
-{
-        if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
-                return -EINVAL;
-        /* Disable only the limits. */
-        return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
-}
 /* Read data from quotafile - avoid pagecache and such because we cannot afford
 * acquiring the locks... As quota files are never truncated and quota code
 * itself serializes the operations (and no one else should touch the files)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 3c27e0ecb3bc..5674ba13102b 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -92,7 +92,6 @@ static const struct vm_operations_struct f2fs_file_vm_ops = {
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = f2fs_vm_page_mkwrite,
-        .remap_pages    = generic_file_remap_pages,
 };
 static int get_parent_ino(struct inode *inode, nid_t *pino)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2d609a5fbfea..c399152de397 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -66,15 +66,21 @@ int writeback_in_progress(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL(writeback_in_progress);
-static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
+struct backing_dev_info *inode_to_bdi(struct inode *inode)
 {
-        struct super_block *sb = inode->i_sb;
+        struct super_block *sb;
-        if (sb_is_blkdev_sb(sb))
+        if (!inode)
-                return inode->i_mapping->backing_dev_info;
+                return &noop_backing_dev_info;
+        sb = inode->i_sb;
+#ifdef CONFIG_BLOCK
+        if (sb_is_blkdev_sb(sb))
+                return blk_get_backing_dev_info(I_BDEV(inode));
+#endif
        return sb->s_bdi;
 }
+EXPORT_SYMBOL_GPL(inode_to_bdi);
 static inline struct inode *wb_inode(struct list_head *head)
 {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 760b2c552197..c01ec3bdcfd8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1159,7 +1159,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        mutex_lock(&inode->i_mutex);
        /* We can write back this queue in page reclaim */
-        current->backing_dev_info = mapping->backing_dev_info;
+        current->backing_dev_info = inode_to_bdi(inode);
        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err)
@@ -1464,7 +1464,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 {
        struct inode *inode = req->inode;
        struct fuse_inode *fi = get_fuse_inode(inode);
-        struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+        struct backing_dev_info *bdi = inode_to_bdi(inode);
        int i;
        list_del(&req->writepages_entry);
@@ -1658,7 +1658,7 @@ static int fuse_writepage_locked(struct page *page)
        req->end = fuse_writepage_end;
        req->inode = inode;
-        inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
+        inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
        inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
        spin_lock(&fc->lock);
@@ -1768,7 +1768,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
        if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
                                        old_req->state == FUSE_REQ_PENDING)) {
-                struct backing_dev_info *bdi = page->mapping->backing_dev_info;
+                struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
                copy_highpage(old_req->pages[0], page);
                spin_unlock(&fc->lock);
@@ -1872,7 +1872,7 @@ static int fuse_writepages_fill(struct page *page,
        req->page_descs[req->num_pages].offset = 0;
        req->page_descs[req->num_pages].length = PAGE_SIZE;
-        inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK);
+        inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
        inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
        err = 0;
@@ -2062,7 +2062,6 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = fuse_page_mkwrite,
-        .remap_pages    = generic_file_remap_pages,
 };
 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f38256e4476e..e8799c11424b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -308,7 +308,6 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
                if (!fc->writeback_cache || !S_ISREG(attr->mode))
                        inode->i_flags |= S_NOCMTIME;
                inode->i_generation = generation;
-                inode->i_data.backing_dev_info = &fc->bdi;
                fuse_init_inode(inode, attr);
                unlock_new_inode(inode);
        } else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3088e2a38e30..7b3143064af1 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -73,7 +73,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
        BUG_ON(name == NULL);
-        if (acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
+        if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
                return -E2BIG;
        if (type == ACL_TYPE_ACCESS) {
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 805b37fed638..4ad4f94edebe 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -289,7 +289,7 @@ continue_unlock:
                if (!clear_page_dirty_for_io(page))
                        goto continue_unlock;
-                trace_wbc_writepage(wbc, mapping->backing_dev_info);
+                trace_wbc_writepage(wbc, inode_to_bdi(inode));
                ret = __gfs2_jdata_writepage(page, wbc);
                if (unlikely(ret)) {
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c5a34f09e228..6371192961e2 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1896,7 +1896,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
        ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN);
        if (ht == NULL)
-                ht = vzalloc(size);
+                ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO,
+                               PAGE_KERNEL);
        if (!ht)
                return -ENOMEM;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6e600abf694a..ec9c2d33477a 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -498,7 +498,6 @@ static const struct vm_operations_struct gfs2_vm_ops = {
        .fault = filemap_fault,
        .map_pages = filemap_map_pages,
        .page_mkwrite = gfs2_page_mkwrite,
-        .remap_pages = generic_file_remap_pages,
 };
 /**
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a23524aa3eac..f42dffba056a 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -173,19 +173,14 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
        spin_unlock(&lru_lock);
 }
-static void __gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
+static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
 {
+        spin_lock(&lru_lock);
        if (!list_empty(&gl->gl_lru)) {
                list_del_init(&gl->gl_lru);
                atomic_dec(&lru_count);
                clear_bit(GLF_LRU, &gl->gl_flags);
        }
-}
-static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
-{
-        spin_lock(&lru_lock);
-        __gfs2_glock_remove_from_lru(gl);
        spin_unlock(&lru_lock);
 }
@@ -205,9 +200,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
        lockref_mark_dead(&gl->gl_lockref);
-        spin_lock(&lru_lock);
+        gfs2_glock_remove_from_lru(gl);
-        __gfs2_glock_remove_from_lru(gl);
-        spin_unlock(&lru_lock);
        spin_unlock(&gl->gl_lockref.lock);
        spin_lock_bucket(gl->gl_hash);
        hlist_bl_del_rcu(&gl->gl_list);
@@ -775,7 +768,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
                mapping->flags = 0;
                mapping_set_gfp_mask(mapping, GFP_NOFS);
                mapping->private_data = NULL;
-                mapping->backing_dev_info = s->s_bdi;
                mapping->writeback_index = 0;
        }
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9054002ebe70..73c72253faac 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -543,10 +543,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
        }
        error = gfs2_dir_add(&dip->i_inode, name, ip, da);
-        if (error)
-                goto fail_end_trans;
-fail_end_trans:
        gfs2_trans_end(sdp);
 fail_ipreserv:
        gfs2_inplace_release(dip);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 8633ad328ee2..efc8e254787c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -112,7 +112,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
        mapping->flags = 0;
        mapping_set_gfp_mask(mapping, GFP_NOFS);
        mapping->private_data = NULL;
-        mapping->backing_dev_info = sb->s_bdi;
        mapping->writeback_index = 0;
        spin_lock_init(&sdp->sd_log_lock);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c8b148bbdc8b..3e193cb36996 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -667,7 +667,7 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
 static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
                             s64 change, struct gfs2_quota_data *qd,
-                             struct fs_disk_quota *fdq)
+                             struct qc_dqblk *fdq)
 {
        struct inode *inode = &ip->i_inode;
        struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -697,16 +697,16 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
        be64_add_cpu(&q.qu_value, change);
        qd->qd_qb.qb_value = q.qu_value;
        if (fdq) {
-                if (fdq->d_fieldmask & FS_DQ_BSOFT) {
+                if (fdq->d_fieldmask & QC_SPC_SOFT) {
-                        q.qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift);
+                        q.qu_warn = cpu_to_be64(fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift);
                        qd->qd_qb.qb_warn = q.qu_warn;
                }
-                if (fdq->d_fieldmask & FS_DQ_BHARD) {
+                if (fdq->d_fieldmask & QC_SPC_HARD) {
-                        q.qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
+                        q.qu_limit = cpu_to_be64(fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift);
                        qd->qd_qb.qb_limit = q.qu_limit;
                }
-                if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
+                if (fdq->d_fieldmask & QC_SPACE) {
-                        q.qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
+                        q.qu_value = cpu_to_be64(fdq->d_space >> sdp->sd_sb.sb_bsize_shift);
                        qd->qd_qb.qb_value = q.qu_value;
                }
        }
@@ -1497,7 +1497,7 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
 }
 static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
-                          struct fs_disk_quota *fdq)
+                          struct qc_dqblk *fdq)
 {
        struct gfs2_sbd *sdp = sb->s_fs_info;
        struct gfs2_quota_lvb *qlvb;
@@ -1505,7 +1505,7 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
        struct gfs2_holder q_gh;
        int error;
-        memset(fdq, 0, sizeof(struct fs_disk_quota));
+        memset(fdq, 0, sizeof(*fdq));
        if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
                return -ESRCH; /* Crazy XFS error code */
@@ -1522,12 +1522,9 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
                goto out;
        qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
-        fdq->d_version = FS_DQUOT_VERSION;
+        fdq->d_spc_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_sb.sb_bsize_shift;
-        fdq->d_flags = (qid.type == USRQUOTA) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
+        fdq->d_spc_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_sb.sb_bsize_shift;
-        fdq->d_id = from_kqid_munged(current_user_ns(), qid);
+        fdq->d_space = be64_to_cpu(qlvb->qb_value) << sdp->sd_sb.sb_bsize_shift;
-        fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift;
-        fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift;
-        fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift;
        gfs2_glock_dq_uninit(&q_gh);
 out:
@@ -1536,10 +1533,10 @@ out:
 }
 /* GFS2 only supports a subset of the XFS fields */
-#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
+#define GFS2_FIELDMASK (QC_SPC_SOFT|QC_SPC_HARD|QC_SPACE)
 static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
-                          struct fs_disk_quota *fdq)
+                          struct qc_dqblk *fdq)
 {
        struct gfs2_sbd *sdp = sb->s_fs_info;
        struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
@@ -1583,17 +1580,17 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
                goto out_i;
        /* If nothing has changed, this is a no-op */
-        if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
+        if ((fdq->d_fieldmask & QC_SPC_SOFT) &&
-            ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
+            ((fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
-                fdq->d_fieldmask ^= FS_DQ_BSOFT;
+                fdq->d_fieldmask ^= QC_SPC_SOFT;
-        if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
+        if ((fdq->d_fieldmask & QC_SPC_HARD) &&
-            ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
+            ((fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
-                fdq->d_fieldmask ^= FS_DQ_BHARD;
+                fdq->d_fieldmask ^= QC_SPC_HARD;
-        if ((fdq->d_fieldmask & FS_DQ_BCOUNT) &&
+        if ((fdq->d_fieldmask & QC_SPACE) &&
-            ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
+            ((fdq->d_space >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
-                fdq->d_fieldmask ^= FS_DQ_BCOUNT;
+                fdq->d_fieldmask ^= QC_SPACE;
        if (fdq->d_fieldmask == 0)
                goto out_i;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 573bd3b758fa..1b645773c98e 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -439,7 +439,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
        ls->ls_recover_jid_done = jid;
        ls->ls_recover_jid_status = message;
-        sprintf(env_jid, "JID=%d", jid);
+        sprintf(env_jid, "JID=%u", jid);
        sprintf(env_status, "RECOVERY=%s",
                message == LM_RD_SUCCESS ? "Done" : "Failed");
        kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 5b327f837de7..1666382b198d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -743,7 +743,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
-        struct backing_dev_info *bdi = metamapping->backing_dev_info;
+        struct backing_dev_info *bdi = inode_to_bdi(metamapping->host);
        int ret = 0;
        if (wbc->sync_mode == WB_SYNC_ALL)
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 3ab566ba5696..ae8e8811f0e8 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -96,7 +96,7 @@ static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
        struct super_block *sb = sdp->sd_vfs;
        int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1;
-        return snprintf(buf, PAGE_SIZE, "%u\n", frozen);
+        return snprintf(buf, PAGE_SIZE, "%d\n", frozen);
 }
 static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5eba47f593f8..c274aca8e8dc 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -62,12 +62,6 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
        return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
 }
-static struct backing_dev_info hugetlbfs_backing_dev_info = {
-        .name           = "hugetlbfs",
-        .ra_pages       = 0,    /* No readahead */
-        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
 int sysctl_hugetlb_shm_group;
 enum {
@@ -498,7 +492,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
                lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
                                &hugetlbfs_i_mmap_rwsem_key);
                inode->i_mapping->a_ops = &hugetlbfs_aops;
-                inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                inode->i_mapping->private_data = resv_map;
                info = HUGETLBFS_I(inode);
@@ -1032,10 +1025,6 @@ static int __init init_hugetlbfs_fs(void)
                return -ENOTSUPP;
        }
-        error = bdi_init(&hugetlbfs_backing_dev_info);
-        if (error)
-                return error;
        error = -ENOMEM;
        hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
                                        sizeof(struct hugetlbfs_inode_info),
@@ -1071,7 +1060,6 @@ static int __init init_hugetlbfs_fs(void)
 out:
        kmem_cache_destroy(hugetlbfs_inode_cachep);
 out2:
-        bdi_destroy(&hugetlbfs_backing_dev_info);
        return error;
 }
@@ -1091,7 +1079,6 @@ static void __exit exit_hugetlbfs_fs(void)
        for_each_hstate(h)
                kern_unmount(hugetlbfs_vfsmount[i++]);
        unregister_filesystem(&hugetlbfs_fs_type);
-        bdi_destroy(&hugetlbfs_backing_dev_info);
 }
 module_init(init_hugetlbfs_fs)
diff --git a/fs/inode.c b/fs/inode.c
index aa149e7262ac..b7871577571d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -170,20 +170,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
        atomic_set(&mapping->i_mmap_writable, 0);
        mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
        mapping->private_data = NULL;
-        mapping->backing_dev_info = &default_backing_dev_info;
        mapping->writeback_index = 0;
-        /*
-         * If the block_device provides a backing_dev_info for client
-         * inodes then use that.  Otherwise the inode share the bdev's
-         * backing_dev_info.
-         */
-        if (sb->s_bdev) {
-                struct backing_dev_info *bdi;
-                bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
-                mapping->backing_dev_info = bdi;
-        }
        inode->i_private = NULL;
        inode->i_mapping = mapping;
        INIT_HLIST_HEAD(&inode->i_dentry);      /* buggered by rcu freeing */
@@ -194,7 +181,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 #ifdef CONFIG_FSNOTIFY
        inode->i_fsnotify_mask = 0;
 #endif
+        inode->i_flctx = NULL;
        this_cpu_inc(nr_inodes);
        return 0;
@@ -237,6 +224,7 @@ void __destroy_inode(struct inode *inode)
        BUG_ON(inode_has_buffers(inode));
        security_inode_free(inode);
        fsnotify_inode_delete(inode);
+        locks_free_lock_context(inode->i_flctx);
        if (!inode->i_nlink) {
                WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
                atomic_long_dec(&inode->i_sb->s_remove_count);
@@ -355,7 +343,6 @@ void address_space_init_once(struct address_space *mapping)
        INIT_LIST_HEAD(&mapping->private_list);
        spin_lock_init(&mapping->private_lock);
        mapping->i_mmap = RB_ROOT;
-        INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
 }
 EXPORT_SYMBOL(address_space_init_once);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 214c3c11fbc2..5d01d2638ca5 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -379,6 +379,11 @@ int __generic_block_fiemap(struct inode *inode,
                                past_eof = true;
                }
                cond_resched();
+                if (fatal_signal_pending(current)) {
+                        ret = -EINTR;
+                        break;
+                }
        } while (1);
        /* If ret is 1 then we just hit the end of the extent array */
diff --git a/fs/isofs/util.c b/fs/isofs/util.c
index 01e1ee7a998b..005a15cfd30a 100644
--- a/fs/isofs/util.c
+++ b/fs/isofs/util.c
@@ -2,6 +2,7 @@
 *  linux/fs/isofs/util.c
 */
+#include <linux/time.h>
 #include "isofs.h"
 /* 
@@ -17,9 +18,9 @@
 int iso_date(char * p, int flag)
 {
        int year, month, day, hour, minute, second, tz;
-        int crtime, days, i;
+        int crtime;
-        year = p[0] - 70;
+        year = p[0];
        month = p[1];
        day = p[2];
        hour = p[3];
@@ -31,18 +32,7 @@ int iso_date(char * p, int flag)
        if (year < 0) {
                crtime = 0;
        } else {
-                int monlen[12] = {31,28,31,30,31,30,31,31,30,31,30,31};
+                crtime = mktime64(year+1900, month, day, hour, minute, second);
-                days = year * 365;
-                if (year > 2)
-                        days += (year+1) / 4;
-                for (i = 1; i < month; i++)
-                        days += monlen[i-1];
-                if (((year+2) % 4) == 0 && month > 2)
-                        days++;
-                days += day - 1;
-                crtime = ((((days * 24) + hour) * 60 + minute) * 60)
-                        + second;
                /* sign extend */
                if (tz & 0x80)
diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h
deleted file mode 100644
index fa92f7f1d0d0..000000000000
--- a/fs/jfs/endian24.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- *   Copyright (C) International Business Machines Corp., 2001
- *
- *   This program is free software;  you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or
- *   (at your option) any later version.
- *
- *   This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-#ifndef _H_ENDIAN24
-#define _H_ENDIAN24
-/*
- *      endian24.h:
- *
- * Endian conversion for 24-byte data
- *
- */
-#define __swab24(x) \
-({ \
-        __u32 __x = (x); \
-        ((__u32)( \
-                ((__x & (__u32)0x000000ffUL) << 16) | \
-                 (__x & (__u32)0x0000ff00UL)        | \
-                ((__x & (__u32)0x00ff0000UL) >> 16) )); \
-})
-#if (defined(__KERNEL__) && defined(__LITTLE_ENDIAN)) || (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN))
-        #define __cpu_to_le24(x) ((__u32)(x))
-        #define __le24_to_cpu(x) ((__u32)(x))
-#else
-        #define __cpu_to_le24(x) __swab24(x)
-        #define __le24_to_cpu(x) __swab24(x)
-#endif
-#ifdef __KERNEL__
-        #define cpu_to_le24 __cpu_to_le24
-        #define le24_to_cpu __le24_to_cpu
-#endif
-#endif                          /* !_H_ENDIAN24 */
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 984c2bbf4f61..d88576e23fe4 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -1040,8 +1040,8 @@ static int dtSplitUp(tid_t tid,
                pxdlist.maxnpxd = 1;
                pxdlist.npxd = 0;
                pxd = &pxdlist.pxd[0];
-                PXDaddress(pxd, nxaddr)
+                PXDaddress(pxd, nxaddr);
-                    PXDlength(pxd, xlen + n);
+                PXDlength(pxd, xlen + n);
                split->pxdlist = &pxdlist;
                if ((rc = dtExtendPage(tid, ip, split, btstack))) {
                        nxaddr = addressPXD(pxd);
diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h
index 43ea3713c083..8f602dcb51fa 100644
--- a/fs/jfs/jfs_types.h
+++ b/fs/jfs/jfs_types.h
@@ -30,8 +30,6 @@
 #include <linux/types.h>
 #include <linux/nls.h>
-#include "endian24.h"
 /*
 * transaction and lock id's
 *
@@ -59,26 +57,42 @@ struct timestruc_t {
 /*
 *      physical xd (pxd)
+ *
+ *      The leftmost 24 bits of len_addr are the extent length.
+ *      The rightmost 8 bits of len_addr are the most significant bits of
+ *      the extent address
 */
 typedef struct {
-        unsigned len:24;
+        __le32 len_addr;
-        unsigned addr1:8;
        __le32 addr2;
 } pxd_t;
 /* xd_t field construction */
-#define PXDlength(pxd, length32)        ((pxd)->len = __cpu_to_le24(length32))
+static inline void PXDlength(pxd_t *pxd, __u32 len)
-#define PXDaddress(pxd, address64)\
+{
-{\
+        pxd->len_addr = (pxd->len_addr & cpu_to_le32(~0xffffff)) |
-        (pxd)->addr1 = ((s64)address64) >> 32;\
+                        cpu_to_le32(len & 0xffffff);
-        (pxd)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
+}
+static inline void PXDaddress(pxd_t *pxd, __u64 addr)
+{
+        pxd->len_addr = (pxd->len_addr & cpu_to_le32(0xffffff)) |
+                        cpu_to_le32((addr >> 32)<<24);
+        pxd->addr2 = cpu_to_le32(addr & 0xffffffff);
 }
 /* xd_t field extraction */
-#define lengthPXD(pxd)  __le24_to_cpu((pxd)->len)
+static inline __u32 lengthPXD(pxd_t *pxd)
-#define addressPXD(pxd)\
+{
-        ( ((s64)((pxd)->addr1)) << 32 | __le32_to_cpu((pxd)->addr2))
+        return le32_to_cpu((pxd)->len_addr) & 0xffffff;
+}
+static inline __u64 addressPXD(pxd_t *pxd)
+{
+        __u64 n = le32_to_cpu(pxd->len_addr) & ~0xffffff;
+        return (n << 8) + le32_to_cpu(pxd->addr2);
+}
 #define MAXTREEHEIGHT 8
 /* pxd list */
@@ -93,12 +107,10 @@ struct pxdlist {
 *      data extent descriptor (dxd)
 */
 typedef struct {
-        unsigned flag:8;        /* 1: flags */
+        __u8 flag;      /* 1: flags */
-        unsigned rsrvd:24;
+        __u8 rsrvd[3];
        __le32 size;            /* 4: size in byte */
-        unsigned len:24;        /* 3: length in unit of fsblksize */
+        pxd_t loc;              /* 8: address and length in unit of fsblksize */
-        unsigned addr1:8;       /* 1: address in unit of fsblksize */
-        __le32 addr2;           /* 4: address in unit of fsblksize */
 } dxd_t;                        /* - 16 - */
 /* dxd_t flags */
@@ -109,12 +121,11 @@ typedef struct {
 #define DXD_CORRUPT     0x08    /* Inconsistency detected */
 /* dxd_t field construction
- *      Conveniently, the PXD macros work for DXD
 */
-#define DXDlength       PXDlength
+#define DXDlength(dxd, len)     PXDlength(&(dxd)->loc, len)
-#define DXDaddress      PXDaddress
+#define DXDaddress(dxd, addr)   PXDaddress(&(dxd)->loc, addr)
-#define lengthDXD       lengthPXD
+#define lengthDXD(dxd)  lengthPXD(&(dxd)->loc)
-#define addressDXD      addressPXD
+#define addressDXD(dxd) addressPXD(&(dxd)->loc)
 #define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32))
 #define sizeDXD(dxd)    le32_to_cpu((dxd)->size)
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 08c0c749b986..1e0987986d5f 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -29,13 +29,11 @@
 *      extent allocation descriptor (xad)
 */
 typedef struct xad {
-        unsigned flag:8;        /* 1: flag */
+        __u8 flag;      /* 1: flag */
-        unsigned rsvrd:16;      /* 2: reserved */
+        __u8 rsvrd[2];  /* 2: reserved */
-        unsigned off1:8;        /* 1: offset in unit of fsblksize */
+        __u8 off1;      /* 1: offset in unit of fsblksize */
-        __le32 off2;            /* 4: offset in unit of fsblksize */
+        __le32 off2;    /* 4: offset in unit of fsblksize */
-        unsigned len:24;        /* 3: length in unit of fsblksize */
+        pxd_t loc;      /* 8: length and address in unit of fsblksize */
-        unsigned addr1:8;       /* 1: address in unit of fsblksize */
-        __le32 addr2;           /* 4: address in unit of fsblksize */
 } xad_t;                        /* (16) */
 #define MAXXLEN         ((1 << 24) - 1)
@@ -49,19 +47,14 @@ typedef struct xad {
        (xad)->off1 = ((u64)offset64) >> 32;\
        (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\
 }
-#define XADaddress(xad, address64)\
+#define XADaddress(xad, address64) PXDaddress(&(xad)->loc, address64)
-{\
+#define XADlength(xad, length32) PXDlength(&(xad)->loc, length32)
-        (xad)->addr1 = ((u64)address64) >> 32;\
-        (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
-}
-#define XADlength(xad, length32)        (xad)->len = __cpu_to_le24(length32)
 /* xad_t field extraction */
 #define offsetXAD(xad)\
        ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2))
-#define addressXAD(xad)\
+#define addressXAD(xad) addressPXD(&(xad)->loc)
-        ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2))
+#define lengthXAD(xad) lengthPXD(&(xad)->loc)
-#define lengthXAD(xad)  __le24_to_cpu((xad)->len)
 /* xad list */
 struct xadlist {
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 16c3a9556634..5d30c56ae075 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -619,8 +619,7 @@ out_mount_failed:
        iput(sbi->direct_inode);
        sbi->direct_inode = NULL;
 out_unload:
-        if (sbi->nls_tab)
+        unload_nls(sbi->nls_tab);
-                unload_nls(sbi->nls_tab);
 out_kfree:
        kfree(sbi);
        return ret;
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 985217626e66..9000874a945b 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -24,12 +24,6 @@ static const struct address_space_operations kernfs_aops = {
        .write_end      = simple_write_end,
 };
-static struct backing_dev_info kernfs_bdi = {
-        .name           = "kernfs",
-        .ra_pages       = 0,    /* No readahead */
-        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
 static const struct inode_operations kernfs_iops = {
        .permission     = kernfs_iop_permission,
        .setattr        = kernfs_iop_setattr,
@@ -40,12 +34,6 @@ static const struct inode_operations kernfs_iops = {
        .listxattr      = kernfs_iop_listxattr,
 };
-void __init kernfs_inode_init(void)
-{
-        if (bdi_init(&kernfs_bdi))
-                panic("failed to init kernfs_bdi");
-}
 static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
 {
        static DEFINE_MUTEX(iattr_mutex);
@@ -298,7 +286,6 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
        kernfs_get(kn);
        inode->i_private = kn;
        inode->i_mapping->a_ops = &kernfs_aops;
-        inode->i_mapping->backing_dev_info = &kernfs_bdi;
        inode->i_op = &kernfs_iops;
        set_default_inode_attr(inode, kn->mode);
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index dc84a3ef9ca2..af9fa7499919 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -88,7 +88,6 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
 ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
                            size_t size);
 ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
-void kernfs_inode_init(void);
 /*
 * dir.c
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f973ae9b05f1..8eaf417187f1 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -246,5 +246,4 @@ void __init kernfs_init(void)
        kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
                                              sizeof(struct kernfs_node),
                                              0, SLAB_PANIC, NULL);
-        kernfs_inode_init();
 }
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 56598742dde4..5581e020644b 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -57,8 +57,8 @@ static DEFINE_SPINLOCK(nlm_blocked_lock);
 static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
 {
        /*
-         * We can get away with a static buffer because we're only
+         * We can get away with a static buffer because this is only called
-         * called with BKL held.
+         * from lockd, which is single-threaded.
         */
        static char buf[2*NLM_MAXCOOKIELEN+1];
        unsigned int i, len = sizeof(buf);
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d12ff4e2dbe7..665ef5a05183 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -164,12 +164,15 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
 {
        struct inode     *inode = nlmsvc_file_inode(file);
        struct file_lock *fl;
+        struct file_lock_context *flctx = inode->i_flctx;
        struct nlm_host  *lockhost;
+        if (!flctx || list_empty_careful(&flctx->flc_posix))
+                return 0;
 again:
        file->f_locks = 0;
-        spin_lock(&inode->i_lock);
+        spin_lock(&flctx->flc_lock);
-        for (fl = inode->i_flock; fl; fl = fl->fl_next) {
+        list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
                if (fl->fl_lmops != &nlmsvc_lock_operations)
                        continue;
@@ -180,7 +183,7 @@ again:
                if (match(lockhost, host)) {
                        struct file_lock lock = *fl;
-                        spin_unlock(&inode->i_lock);
+                        spin_unlock(&flctx->flc_lock);
                        lock.fl_type  = F_UNLCK;
                        lock.fl_start = 0;
                        lock.fl_end   = OFFSET_MAX;
@@ -192,7 +195,7 @@ again:
                        goto again;
                }
        }
-        spin_unlock(&inode->i_lock);
+        spin_unlock(&flctx->flc_lock);
        return 0;
 }
@@ -223,18 +226,21 @@ nlm_file_inuse(struct nlm_file *file)
 {
        struct inode     *inode = nlmsvc_file_inode(file);
        struct file_lock *fl;
+        struct file_lock_context *flctx = inode->i_flctx;
        if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
                return 1;
-        spin_lock(&inode->i_lock);
+        if (flctx && !list_empty_careful(&flctx->flc_posix)) {
-        for (fl = inode->i_flock; fl; fl = fl->fl_next) {
+                spin_lock(&flctx->flc_lock);
-                if (fl->fl_lmops == &nlmsvc_lock_operations) {
+                list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
-                        spin_unlock(&inode->i_lock);
+                        if (fl->fl_lmops == &nlmsvc_lock_operations) {
-                        return 1;
+                                spin_unlock(&flctx->flc_lock);
+                                return 1;
+                        }
                }
+                spin_unlock(&flctx->flc_lock);
        }
-        spin_unlock(&inode->i_lock);
        file->f_locks = 0;
        return 0;
 }
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 9340e7e10ef6..5b651daad518 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -95,14 +95,6 @@ nlm_decode_fh(__be32 *p, struct nfs_fh *f)
        return p + XDR_QUADLEN(NFS2_FHSIZE);
 }
-static inline __be32 *
-nlm_encode_fh(__be32 *p, struct nfs_fh *f)
-{
-        *p++ = htonl(NFS2_FHSIZE);
-        memcpy(p, f->data, NFS2_FHSIZE);
-        return p + XDR_QUADLEN(NFS2_FHSIZE);
-}
 /*
 * Encode and decode owner handle
 */
diff --git a/fs/locks.c b/fs/locks.c
index 59e2f905e4ff..4753218f308e 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -137,7 +137,7 @@
 #define IS_POSIX(fl)    (fl->fl_flags & FL_POSIX)
 #define IS_FLOCK(fl)    (fl->fl_flags & FL_FLOCK)
-#define IS_LEASE(fl)    (fl->fl_flags & (FL_LEASE|FL_DELEG))
+#define IS_LEASE(fl)    (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
 #define IS_OFDLCK(fl)   (fl->fl_flags & FL_OFDLCK)
 static bool lease_breaking(struct file_lock *fl)
@@ -157,14 +157,11 @@ static int target_leasetype(struct file_lock *fl)
 int leases_enable = 1;
 int lease_break_time = 45;
-#define for_each_lock(inode, lockp) \
-        for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
 /*
 * The global file_lock_list is only used for displaying /proc/locks, so we
 * keep a list on each CPU, with each list protected by its own spinlock via
 * the file_lock_lglock. Note that alterations to the list also require that
- * the relevant i_lock is held.
+ * the relevant flc_lock is held.
 */
 DEFINE_STATIC_LGLOCK(file_lock_lglock);
 static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
@@ -192,21 +189,68 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
 * contrast to those that are acting as records of acquired locks).
 *
 * Note that when we acquire this lock in order to change the above fields,
- * we often hold the i_lock as well. In certain cases, when reading the fields
+ * we often hold the flc_lock as well. In certain cases, when reading the fields
 * protected by this lock, we can skip acquiring it iff we already hold the
- * i_lock.
+ * flc_lock.
 *
 * In particular, adding an entry to the fl_block list requires that you hold
- * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting
+ * both the flc_lock and the blocked_lock_lock (acquired in that order).
- * an entry from the list however only requires the file_lock_lock.
+ * Deleting an entry from the list however only requires the blocked_lock_lock.
 */
 static DEFINE_SPINLOCK(blocked_lock_lock);
+static struct kmem_cache *flctx_cache __read_mostly;
 static struct kmem_cache *filelock_cache __read_mostly;
+static struct file_lock_context *
+locks_get_lock_context(struct inode *inode)
+{
+        struct file_lock_context *new;
+        if (likely(inode->i_flctx))
+                goto out;
+        new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
+        if (!new)
+                goto out;
+        spin_lock_init(&new->flc_lock);
+        INIT_LIST_HEAD(&new->flc_flock);
+        INIT_LIST_HEAD(&new->flc_posix);
+        INIT_LIST_HEAD(&new->flc_lease);
+        /*
+         * Assign the pointer if it's not already assigned. If it is, then
+         * free the context we just allocated.
+         */
+        spin_lock(&inode->i_lock);
+        if (likely(!inode->i_flctx)) {
+                inode->i_flctx = new;
+                new = NULL;
+        }
+        spin_unlock(&inode->i_lock);
+        if (new)
+                kmem_cache_free(flctx_cache, new);
+out:
+        return inode->i_flctx;
+}
+void
+locks_free_lock_context(struct file_lock_context *ctx)
+{
+        if (ctx) {
+                WARN_ON_ONCE(!list_empty(&ctx->flc_flock));
+                WARN_ON_ONCE(!list_empty(&ctx->flc_posix));
+                WARN_ON_ONCE(!list_empty(&ctx->flc_lease));
+                kmem_cache_free(flctx_cache, ctx);
+        }
+}
 static void locks_init_lock_heads(struct file_lock *fl)
 {
        INIT_HLIST_NODE(&fl->fl_link);
+        INIT_LIST_HEAD(&fl->fl_list);
        INIT_LIST_HEAD(&fl->fl_block);
        init_waitqueue_head(&fl->fl_wait);
 }
@@ -243,6 +287,7 @@ EXPORT_SYMBOL_GPL(locks_release_private);
 void locks_free_lock(struct file_lock *fl)
 {
        BUG_ON(waitqueue_active(&fl->fl_wait));
+        BUG_ON(!list_empty(&fl->fl_list));
        BUG_ON(!list_empty(&fl->fl_block));
        BUG_ON(!hlist_unhashed(&fl->fl_link));
@@ -257,8 +302,8 @@ locks_dispose_list(struct list_head *dispose)
        struct file_lock *fl;
        while (!list_empty(dispose)) {
-                fl = list_first_entry(dispose, struct file_lock, fl_block);
+                fl = list_first_entry(dispose, struct file_lock, fl_list);
-                list_del_init(&fl->fl_block);
+                list_del_init(&fl->fl_list);
                locks_free_lock(fl);
        }
 }
@@ -513,7 +558,7 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
        return fl1->fl_owner == fl2->fl_owner;
 }
-/* Must be called with the i_lock held! */
+/* Must be called with the flc_lock held! */
 static void locks_insert_global_locks(struct file_lock *fl)
 {
        lg_local_lock(&file_lock_lglock);
@@ -522,12 +567,12 @@ static void locks_insert_global_locks(struct file_lock *fl)
        lg_local_unlock(&file_lock_lglock);
 }
-/* Must be called with the i_lock held! */
+/* Must be called with the flc_lock held! */
 static void locks_delete_global_locks(struct file_lock *fl)
 {
        /*
         * Avoid taking lock if already unhashed. This is safe since this check
-         * is done while holding the i_lock, and new insertions into the list
+         * is done while holding the flc_lock, and new insertions into the list
         * also require that it be held.
         */
        if (hlist_unhashed(&fl->fl_link))
@@ -579,10 +624,10 @@ static void locks_delete_block(struct file_lock *waiter)
 * the order they blocked. The documentation doesn't require this but
 * it seems like the reasonable thing to do.
 *
- * Must be called with both the i_lock and blocked_lock_lock held. The fl_block
+ * Must be called with both the flc_lock and blocked_lock_lock held. The
- * list itself is protected by the blocked_lock_lock, but by ensuring that the
+ * fl_block list itself is protected by the blocked_lock_lock, but by ensuring
- * i_lock is also held on insertions we can avoid taking the blocked_lock_lock
+ * that the flc_lock is also held on insertions we can avoid taking the
- * in some cases when we see that the fl_block list is empty.
+ * blocked_lock_lock in some cases when we see that the fl_block list is empty.
 */
 static void __locks_insert_block(struct file_lock *blocker,
                                        struct file_lock *waiter)
@@ -594,7 +639,7 @@ static void __locks_insert_block(struct file_lock *blocker,
                locks_insert_global_blocked(waiter);
 }
-/* Must be called with i_lock held. */
+/* Must be called with flc_lock held. */
 static void locks_insert_block(struct file_lock *blocker,
                                        struct file_lock *waiter)
 {
@@ -606,15 +651,15 @@ static void locks_insert_block(struct file_lock *blocker,
 /*
 * Wake up processes blocked waiting for blocker.
 *
- * Must be called with the inode->i_lock held!
+ * Must be called with the inode->i_flctx->flc_lock held!
 */
 static void locks_wake_up_blocks(struct file_lock *blocker)
 {
        /*
         * Avoid taking global lock if list is empty. This is safe since new
-         * blocked requests are only added to the list under the i_lock, and
+         * blocked requests are only added to the list under the flc_lock, and
-         * the i_lock is always held here. Note that removal from the fl_block
+         * the flc_lock is always held here. Note that removal from the fl_block
-         * list does not require the i_lock, so we must recheck list_empty()
+         * list does not require the flc_lock, so we must recheck list_empty()
         * after acquiring the blocked_lock_lock.
         */
        if (list_empty(&blocker->fl_block))
@@ -635,63 +680,36 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
        spin_unlock(&blocked_lock_lock);
 }
-/* Insert file lock fl into an inode's lock list at the position indicated
+static void
- * by pos. At the same time add the lock to the global file lock list.
+locks_insert_lock_ctx(struct file_lock *fl, int *counter,
- *
+                      struct list_head *before)
- * Must be called with the i_lock held!
- */
-static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
 {
        fl->fl_nspid = get_pid(task_tgid(current));
+        list_add_tail(&fl->fl_list, before);
-        /* insert into file's list */
+        ++*counter;
-        fl->fl_next = *pos;
-        *pos = fl;
        locks_insert_global_locks(fl);
 }
-/**
+static void
- * locks_delete_lock - Delete a lock and then free it.
+locks_unlink_lock_ctx(struct file_lock *fl, int *counter)
- * @thisfl_p: pointer that points to the fl_next field of the previous
- *            inode->i_flock list entry
- *
- * Unlink a lock from all lists and free the namespace reference, but don't
- * free it yet. Wake up processes that are blocked waiting for this lock and
- * notify the FS that the lock has been cleared.
- *
- * Must be called with the i_lock held!
- */
-static void locks_unlink_lock(struct file_lock **thisfl_p)
 {
-        struct file_lock *fl = *thisfl_p;
        locks_delete_global_locks(fl);
+        list_del_init(&fl->fl_list);
-        *thisfl_p = fl->fl_next;
+        --*counter;
-        fl->fl_next = NULL;
        if (fl->fl_nspid) {
                put_pid(fl->fl_nspid);
                fl->fl_nspid = NULL;
        }
        locks_wake_up_blocks(fl);
 }
-/*
+static void
- * Unlink a lock from all lists and free it.
+locks_delete_lock_ctx(struct file_lock *fl, int *counter,
- *
+                      struct list_head *dispose)
- * Must be called with i_lock held!
- */
-static void locks_delete_lock(struct file_lock **thisfl_p,
-                              struct list_head *dispose)
 {
-        struct file_lock *fl = *thisfl_p;
+        locks_unlink_lock_ctx(fl, counter);
-        locks_unlink_lock(thisfl_p);
        if (dispose)
-                list_add(&fl->fl_block, dispose);
+                list_add(&fl->fl_list, dispose);
        else
                locks_free_lock(fl);
 }
@@ -746,22 +764,27 @@ void
 posix_test_lock(struct file *filp, struct file_lock *fl)
 {
        struct file_lock *cfl;
+        struct file_lock_context *ctx;
        struct inode *inode = file_inode(filp);
-        spin_lock(&inode->i_lock);
+        ctx = inode->i_flctx;
-        for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) {
+        if (!ctx || list_empty_careful(&ctx->flc_posix)) {
-                if (!IS_POSIX(cfl))
-                        continue;
-                if (posix_locks_conflict(fl, cfl))
-                        break;
-        }
-        if (cfl) {
-                locks_copy_conflock(fl, cfl);
-                if (cfl->fl_nspid)
-                        fl->fl_pid = pid_vnr(cfl->fl_nspid);
-        } else
                fl->fl_type = F_UNLCK;
-        spin_unlock(&inode->i_lock);
+                return;
+        }
+        spin_lock(&ctx->flc_lock);
+        list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
+                if (posix_locks_conflict(fl, cfl)) {
+                        locks_copy_conflock(fl, cfl);
+                        if (cfl->fl_nspid)
+                                fl->fl_pid = pid_vnr(cfl->fl_nspid);
+                        goto out;
+                }
+        }
+        fl->fl_type = F_UNLCK;
+out:
+        spin_unlock(&ctx->flc_lock);
        return;
 }
 EXPORT_SYMBOL(posix_test_lock);
@@ -845,34 +868,34 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
 static int flock_lock_file(struct file *filp, struct file_lock *request)
 {
        struct file_lock *new_fl = NULL;
-        struct file_lock **before;
+        struct file_lock *fl;
-        struct inode * inode = file_inode(filp);
+        struct file_lock_context *ctx;
+        struct inode *inode = file_inode(filp);
        int error = 0;
-        int found = 0;
+        bool found = false;
        LIST_HEAD(dispose);
+        ctx = locks_get_lock_context(inode);
+        if (!ctx)
+                return -ENOMEM;
        if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
                new_fl = locks_alloc_lock();
                if (!new_fl)
                        return -ENOMEM;
        }
-        spin_lock(&inode->i_lock);
+        spin_lock(&ctx->flc_lock);
        if (request->fl_flags & FL_ACCESS)
                goto find_conflict;
-        for_each_lock(inode, before) {
+        list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
-                struct file_lock *fl = *before;
-                if (IS_POSIX(fl))
-                        break;
-                if (IS_LEASE(fl))
-                        continue;
                if (filp != fl->fl_file)
                        continue;
                if (request->fl_type == fl->fl_type)
                        goto out;
-                found = 1;
+                found = true;
-                locks_delete_lock(before, &dispose);
+                locks_delete_lock_ctx(fl, &ctx->flc_flock_cnt, &dispose);
                break;
        }
@@ -887,18 +910,13 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
         * give it the opportunity to lock the file.
         */
        if (found) {
-                spin_unlock(&inode->i_lock);
+                spin_unlock(&ctx->flc_lock);
                cond_resched();
-                spin_lock(&inode->i_lock);
+                spin_lock(&ctx->flc_lock);
        }
 find_conflict:
-        for_each_lock(inode, before) {
+        list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
-                struct file_lock *fl = *before;
-                if (IS_POSIX(fl))
-                        break;
-                if (IS_LEASE(fl))
-                        continue;
                if (!flock_locks_conflict(request, fl))
                        continue;
                error = -EAGAIN;
@@ -911,12 +929,12 @@ find_conflict:
        if (request->fl_flags & FL_ACCESS)
                goto out;
        locks_copy_lock(new_fl, request);
-        locks_insert_lock(before, new_fl);
+        locks_insert_lock_ctx(new_fl, &ctx->flc_flock_cnt, &ctx->flc_flock);
        new_fl = NULL;
        error = 0;
 out:
-        spin_unlock(&inode->i_lock);
+        spin_unlock(&ctx->flc_lock);
        if (new_fl)
                locks_free_lock(new_fl);
        locks_dispose_list(&dispose);
@@ -925,16 +943,20 @@ out:
 static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
 {
-        struct file_lock *fl;
+        struct file_lock *fl, *tmp;
        struct file_lock *new_fl = NULL;
        struct file_lock *new_fl2 = NULL;
        struct file_lock *left = NULL;
        struct file_lock *right = NULL;
-        struct file_lock **before;
+        struct file_lock_context *ctx;
        int error;
        bool added = false;
        LIST_HEAD(dispose);
+        ctx = locks_get_lock_context(inode);
+        if (!ctx)
+                return -ENOMEM;
        /*
         * We may need two file_lock structures for this operation,
         * so we get them in advance to avoid races.
@@ -948,15 +970,14 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
                new_fl2 = locks_alloc_lock();
        }
-        spin_lock(&inode->i_lock);
+        spin_lock(&ctx->flc_lock);
        /*
         * New lock request. Walk all POSIX locks and look for conflicts. If
         * there are any, either return error or put the request on the
         * blocker's list of waiters and the global blocked_hash.
         */
        if (request->fl_type != F_UNLCK) {
-                for_each_lock(inode, before) {
+                list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
-                        fl = *before;
                        if (!IS_POSIX(fl))
                                continue;
                        if (!posix_locks_conflict(request, fl))
@@ -986,29 +1007,25 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
        if (request->fl_flags & FL_ACCESS)
                goto out;
-        /*
+        /* Find the first old lock with the same owner as the new lock */
-         * Find the first old lock with the same owner as the new lock.
+        list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
-         */
+                if (posix_same_owner(request, fl))
-        
+                        break;
-        before = &inode->i_flock;
-        /* First skip locks owned by other processes.  */
-        while ((fl = *before) && (!IS_POSIX(fl) ||
-                                  !posix_same_owner(request, fl))) {
-                before = &fl->fl_next;
        }
        /* Process locks with this owner. */
-        while ((fl = *before) && posix_same_owner(request, fl)) {
+        list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) {
-                /* Detect adjacent or overlapping regions (if same lock type)
+                if (!posix_same_owner(request, fl))
-                 */
+                        break;
+                /* Detect adjacent or overlapping regions (if same lock type) */
                if (request->fl_type == fl->fl_type) {
                        /* In all comparisons of start vs end, use
                         * "start - 1" rather than "end + 1". If end
                         * is OFFSET_MAX, end + 1 will become negative.
                         */
                        if (fl->fl_end < request->fl_start - 1)
-                                goto next_lock;
+                                continue;
                        /* If the next lock in the list has entirely bigger
                         * addresses than the new one, insert the lock here.
                         */
@@ -1029,18 +1046,18 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
                        else
                                request->fl_end = fl->fl_end;
                        if (added) {
-                                locks_delete_lock(before, &dispose);
+                                locks_delete_lock_ctx(fl, &ctx->flc_posix_cnt,
+                                                        &dispose);
                                continue;
                        }
                        request = fl;
                        added = true;
-                }
+                } else {
-                else {
                        /* Processing for different lock types is a bit
                         * more complex.
                         */
                        if (fl->fl_end < request->fl_start)
-                                goto next_lock;
+                                continue;
                        if (fl->fl_start > request->fl_end)
                                break;
                        if (request->fl_type == F_UNLCK)
@@ -1059,7 +1076,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
                                 * one (This may happen several times).
                                 */
                                if (added) {
-                                        locks_delete_lock(before, &dispose);
+                                        locks_delete_lock_ctx(fl,
+                                                &ctx->flc_posix_cnt, &dispose);
                                        continue;
                                }
                                /*
@@ -1075,15 +1093,13 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
                                locks_copy_lock(new_fl, request);
                                request = new_fl;
                                new_fl = NULL;
-                                locks_delete_lock(before, &dispose);
+                                locks_insert_lock_ctx(request,
-                                locks_insert_lock(before, request);
+                                        &ctx->flc_posix_cnt, &fl->fl_list);
+                                locks_delete_lock_ctx(fl,
+                                        &ctx->flc_posix_cnt, &dispose);
                                added = true;
                        }
                }
-                /* Go on to next lock.
-                 */
-        next_lock:
-                before = &fl->fl_next;
        }
        /*
@@ -1108,7 +1124,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
                        goto out;
                }
                locks_copy_lock(new_fl, request);
-                locks_insert_lock(before, new_fl);
+                locks_insert_lock_ctx(new_fl, &ctx->flc_posix_cnt,
+                                        &fl->fl_list);
                new_fl = NULL;
        }
        if (right) {
@@ -1119,7 +1136,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
                        left = new_fl2;
                        new_fl2 = NULL;
                        locks_copy_lock(left, right);
-                        locks_insert_lock(before, left);
+                        locks_insert_lock_ctx(left, &ctx->flc_posix_cnt,
+                                                &fl->fl_list);
                }
                right->fl_start = request->fl_end + 1;
                locks_wake_up_blocks(right);
@@ -1129,7 +1147,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
                locks_wake_up_blocks(left);
        }
 out:
-        spin_unlock(&inode->i_lock);
+        spin_unlock(&ctx->flc_lock);
        /*
         * Free any unused locks.
         */
@@ -1199,22 +1217,29 @@ EXPORT_SYMBOL(posix_lock_file_wait);
 */
 int locks_mandatory_locked(struct file *file)
 {
+        int ret;
        struct inode *inode = file_inode(file);
+        struct file_lock_context *ctx;
        struct file_lock *fl;
+        ctx = inode->i_flctx;
+        if (!ctx || list_empty_careful(&ctx->flc_posix))
+                return 0;
        /*
         * Search the lock list for this inode for any POSIX locks.
         */
-        spin_lock(&inode->i_lock);
+        spin_lock(&ctx->flc_lock);
-        for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
+        ret = 0;
-                if (!IS_POSIX(fl))
+        list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
-                        continue;
                if (fl->fl_owner != current->files &&
-                    fl->fl_owner != file)
+                    fl->fl_owner != file) {
+                        ret = -EAGAIN;
                        break;
+                }
        }
-        spin_unlock(&inode->i_lock);
+        spin_unlock(&ctx->flc_lock);
-        return fl ? -EAGAIN : 0;
+        return ret;
 }
 /**
@@ -1294,9 +1319,9 @@ static void lease_clear_pending(struct file_lock *fl, int arg)
 }
 /* We already had a lease on this file; just change its type */
-int lease_modify(struct file_lock **before, int arg, struct list_head *dispose)
+int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose)
 {
-        struct file_lock *fl = *before;
+        struct file_lock_context *flctx;
        int error = assign_type(fl, arg);
        if (error)
@@ -1306,6 +1331,7 @@ int lease_modify(struct file_lock **before, int arg, struct list_head *dispose)
        if (arg == F_UNLCK) {
                struct file *filp = fl->fl_file;
+                flctx = file_inode(filp)->i_flctx;
                f_delown(filp);
                filp->f_owner.signum = 0;
                fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync);
@@ -1313,7 +1339,7 @@ int lease_modify(struct file_lock **before, int arg, struct list_head *dispose)
                        printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
                        fl->fl_fasync = NULL;
                }
-                locks_delete_lock(before, dispose);
+                locks_delete_lock_ctx(fl, &flctx->flc_lease_cnt, dispose);
        }
        return 0;
 }
@@ -1329,25 +1355,24 @@ static bool past_time(unsigned long then)
 static void time_out_leases(struct inode *inode, struct list_head *dispose)
 {
-        struct file_lock **before;
+        struct file_lock_context *ctx = inode->i_flctx;
-        struct file_lock *fl;
+        struct file_lock *fl, *tmp;
-        lockdep_assert_held(&inode->i_lock);
+        lockdep_assert_held(&ctx->flc_lock);
-        before = &inode->i_flock;
+        list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
-        while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) {
                trace_time_out_leases(inode, fl);
                if (past_time(fl->fl_downgrade_time))
-                        lease_modify(before, F_RDLCK, dispose);
+                        lease_modify(fl, F_RDLCK, dispose);
                if (past_time(fl->fl_break_time))
-                        lease_modify(before, F_UNLCK, dispose);
+                        lease_modify(fl, F_UNLCK, dispose);
-                if (fl == *before)      /* lease_modify may have freed fl */
-                        before = &fl->fl_next;
        }
 }
 static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
 {
+        if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT))
+                return false;
        if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE))
                return false;
        return locks_conflict(breaker, lease);
@@ -1356,11 +1381,12 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
 static bool
 any_leases_conflict(struct inode *inode, struct file_lock *breaker)
 {
+        struct file_lock_context *ctx = inode->i_flctx;
        struct file_lock *fl;
-        lockdep_assert_held(&inode->i_lock);
+        lockdep_assert_held(&ctx->flc_lock);
-        for (fl = inode->i_flock ; fl && IS_LEASE(fl); fl = fl->fl_next) {
+        list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
                if (leases_conflict(fl, breaker))
                        return true;
        }
@@ -1384,7 +1410,8 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 {
        int error = 0;
        struct file_lock *new_fl;
-        struct file_lock *fl, **before;
+        struct file_lock_context *ctx = inode->i_flctx;
+        struct file_lock *fl;
        unsigned long break_time;
        int want_write = (mode & O_ACCMODE) != O_RDONLY;
        LIST_HEAD(dispose);
@@ -1394,7 +1421,15 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
                return PTR_ERR(new_fl);
        new_fl->fl_flags = type;
-        spin_lock(&inode->i_lock);
+        /* typically we will check that ctx is non-NULL before calling */
+        if (!ctx) {
+                WARN_ON_ONCE(1);
+                /* new_fl was allocated above; free it so this exit doesn't leak */
+                locks_free_lock(new_fl);
+                return error;
+        }
+        spin_lock(&ctx->flc_lock);
        time_out_leases(inode, &dispose);
@@ -1408,9 +1441,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
                        break_time++;   /* so that 0 means no break time */
        }
-        for (before = &inode->i_flock;
+        list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
-                        ((fl = *before) != NULL) && IS_LEASE(fl);
-                        before = &fl->fl_next) {
                if (!leases_conflict(fl, new_fl))
                        continue;
                if (want_write) {
@@ -1419,17 +1450,17 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
                        fl->fl_flags |= FL_UNLOCK_PENDING;
                        fl->fl_break_time = break_time;
                } else {
-                        if (lease_breaking(inode->i_flock))
+                        if (lease_breaking(fl))
                                continue;
                        fl->fl_flags |= FL_DOWNGRADE_PENDING;
                        fl->fl_downgrade_time = break_time;
                }
                if (fl->fl_lmops->lm_break(fl))
-                        locks_delete_lock(before, &dispose);
+                        locks_delete_lock_ctx(fl, &ctx->flc_lease_cnt,
+                                                &dispose);
        }
-        fl = inode->i_flock;
+        if (list_empty(&ctx->flc_lease))
-        if (!fl || !IS_LEASE(fl))
                goto out;
        if (mode & O_NONBLOCK) {
@@ -1439,18 +1470,19 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
        }
 restart:
-        break_time = inode->i_flock->fl_break_time;
+        fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list);
+        break_time = fl->fl_break_time;
        if (break_time != 0)
                break_time -= jiffies;
        if (break_time == 0)
                break_time++;
-        locks_insert_block(inode->i_flock, new_fl);
+        locks_insert_block(fl, new_fl);
        trace_break_lease_block(inode, new_fl);
-        spin_unlock(&inode->i_lock);
+        spin_unlock(&ctx->flc_lock);
        locks_dispose_list(&dispose);
        error = wait_event_interruptible_timeout(new_fl->fl_wait,
                                                !new_fl->fl_next, break_time);
-        spin_lock(&inode->i_lock);
+        spin_lock(&ctx->flc_lock);
        trace_break_lease_unblock(inode, new_fl);
        locks_delete_block(new_fl);
        if (error >= 0) {
@@ -1462,12 +1494,10 @@ restart:
                        time_out_leases(inode, &dispose);
                if (any_leases_conflict(inode, new_fl))
                        goto restart;
                error = 0;
        }
 out:
-        spin_unlock(&inode->i_lock);
+        spin_unlock(&ctx->flc_lock);
        locks_dispose_list(&dispose);
        locks_free_lock(new_fl);
        return error;
@@ -1487,14 +1517,18 @@ EXPORT_SYMBOL(__break_lease);
 void lease_get_mtime(struct inode *inode, struct timespec *time)
 {
        bool has_lease = false;
-        struct file_lock *flock;
+        struct file_lock_context *ctx = inode->i_flctx;
+        struct file_lock *fl;
-        if (inode->i_flock) {
+        if (ctx && !list_empty_careful(&ctx->flc_lease)) {
-                spin_lock(&inode->i_lock);
+                spin_lock(&ctx->flc_lock);
-                flock = inode->i_flock;
+                if (!list_empty(&ctx->flc_lease)) {
-                if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK))
+                        fl = list_first_entry(&ctx->flc_lease,
-                        has_lease = true;
+                                                struct file_lock, fl_list);
-                spin_unlock(&inode->i_lock);
+                        if (fl->fl_type == F_WRLCK)
+                                has_lease = true;
+                }
+                spin_unlock(&ctx->flc_lock);
        }
        if (has_lease)
@@ -1532,20 +1566,22 @@ int fcntl_getlease(struct file *filp)
 {
        struct file_lock *fl;
        struct inode *inode = file_inode(filp);
+        struct file_lock_context *ctx = inode->i_flctx;
        int type = F_UNLCK;
        LIST_HEAD(dispose);
-        spin_lock(&inode->i_lock);
+        if (ctx && !list_empty_careful(&ctx->flc_lease)) {
-        time_out_leases(file_inode(filp), &dispose);
+                spin_lock(&ctx->flc_lock);
-        for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl);
+                time_out_leases(file_inode(filp), &dispose);
-                        fl = fl->fl_next) {
+                list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
-                if (fl->fl_file == filp) {
+                        if (fl->fl_file != filp)
+                                continue;
                        type = target_leasetype(fl);
                        break;
                }
+                spin_unlock(&ctx->flc_lock);
+                locks_dispose_list(&dispose);
        }
-        spin_unlock(&inode->i_lock);
-        locks_dispose_list(&dispose);
        return type;
 }
@@ -1560,11 +1596,14 @@ int fcntl_getlease(struct file *filp)
 * conflict with the lease we're trying to set.
 */
 static int
-check_conflicting_open(const struct dentry *dentry, const long arg)
+check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
 {
        int ret = 0;
        struct inode *inode = dentry->d_inode;
+        if (flags & FL_LAYOUT)
+                return 0;
        if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
                return -EAGAIN;
@@ -1578,9 +1617,10 @@ check_conflicting_open(const struct dentry *dentry, const long arg)
 static int
 generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
 {
-        struct file_lock *fl, **before, **my_before = NULL, *lease;
+        struct file_lock *fl, *my_fl = NULL, *lease;
        struct dentry *dentry = filp->f_path.dentry;
        struct inode *inode = dentry->d_inode;
+        struct file_lock_context *ctx;
        bool is_deleg = (*flp)->fl_flags & FL_DELEG;
        int error;
        LIST_HEAD(dispose);
@@ -1588,6 +1628,10 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
        lease = *flp;
        trace_generic_add_lease(inode, lease);
+        ctx = locks_get_lock_context(inode);
+        if (!ctx)
+                return -ENOMEM;
        /*
         * In the delegation case we need mutual exclusion with
         * a number of operations that take the i_mutex.  We trylock
@@ -1606,9 +1650,9 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
                return -EINVAL;
        }
-        spin_lock(&inode->i_lock);
+        spin_lock(&ctx->flc_lock);
        time_out_leases(inode, &dispose);
-        error = check_conflicting_open(dentry, arg);
+        error = check_conflicting_open(dentry, arg, lease->fl_flags);
        if (error)
                goto out;
@@ -1621,13 +1665,13 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
         * except for this filp.
         */
        error = -EAGAIN;
-        for (before = &inode->i_flock;
+        list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
-                        ((fl = *before) != NULL) && IS_LEASE(fl);
+                if (fl->fl_file == filp &&
-                        before = &fl->fl_next) {
+                    fl->fl_owner == lease->fl_owner) {
-                if (fl->fl_file == filp) {
+                        my_fl = fl;
-                        my_before = before;
                        continue;
                }
                /*
                 * No exclusive leases if someone else has a lease on
                 * this file:
@@ -1642,9 +1686,10 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
                        goto out;
        }
-        if (my_before != NULL) {
+        if (my_fl != NULL) {
-                lease = *my_before;
+                /* operate on the lease already on the list, so that lm_setup
+                 * at out_setup is applied to it and not to the caller's copy */
+                lease = my_fl;
-                error = lease->fl_lmops->lm_change(my_before, arg, &dispose);
+                error = lease->fl_lmops->lm_change(lease, arg, &dispose);
                if (error)
                        goto out;
                goto out_setup;
@@ -1654,7 +1697,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
        if (!leases_enable)
                goto out;
-        locks_insert_lock(before, lease);
+        locks_insert_lock_ctx(lease, &ctx->flc_lease_cnt, &ctx->flc_lease);
        /*
         * The check in break_lease() is lockless. It's possible for another
         * open to race in after we did the earlier check for a conflicting
@@ -1665,46 +1708,51 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
         * precedes these checks.
         */
        smp_mb();
-        error = check_conflicting_open(dentry, arg);
+        error = check_conflicting_open(dentry, arg, lease->fl_flags);
-        if (error)
+        if (error) {
-                goto out_unlink;
+                locks_unlink_lock_ctx(lease, &ctx->flc_lease_cnt);
+                goto out;
+        }
 out_setup:
        if (lease->fl_lmops->lm_setup)
                lease->fl_lmops->lm_setup(lease, priv);
 out:
-        spin_unlock(&inode->i_lock);
+        spin_unlock(&ctx->flc_lock);
        locks_dispose_list(&dispose);
        if (is_deleg)
                mutex_unlock(&inode->i_mutex);
-        if (!error && !my_before)
+        if (!error && !my_fl)
                *flp = NULL;
        return error;
-out_unlink:
-        locks_unlink_lock(before);
-        goto out;
 }
-static int generic_delete_lease(struct file *filp)
+static int generic_delete_lease(struct file *filp, void *owner)
 {
        int error = -EAGAIN;
-        struct file_lock *fl, **before;
+        struct file_lock *fl, *victim = NULL;
        struct dentry *dentry = filp->f_path.dentry;
        struct inode *inode = dentry->d_inode;
+        struct file_lock_context *ctx = inode->i_flctx;
        LIST_HEAD(dispose);
-        spin_lock(&inode->i_lock);
+        if (!ctx) {
-        time_out_leases(inode, &dispose);
+                trace_generic_delete_lease(inode, NULL);
-        for (before = &inode->i_flock;
+                return error;
-                        ((fl = *before) != NULL) && IS_LEASE(fl);
+        }
-                        before = &fl->fl_next) {
-                if (fl->fl_file == filp)
+        spin_lock(&ctx->flc_lock);
+        list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
+                if (fl->fl_file == filp &&
+                    fl->fl_owner == owner) {
+                        victim = fl;
                        break;
+                }
        }
-        trace_generic_delete_lease(inode, fl);
+        trace_generic_delete_lease(inode, victim);
-        if (fl && IS_LEASE(fl))
+        if (victim)
-                error = fl->fl_lmops->lm_change(before, F_UNLCK, &dispose);
+                error = victim->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
-        spin_unlock(&inode->i_lock);
+        spin_unlock(&ctx->flc_lock);
        locks_dispose_list(&dispose);
        return error;
 }
@@ -1737,13 +1785,14 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
        switch (arg) {
        case F_UNLCK:
-                return generic_delete_lease(filp);
+                return generic_delete_lease(filp, *priv);
        case F_RDLCK:
        case F_WRLCK:
                if (!(*flp)->fl_lmops->lm_break) {
                        WARN_ON_ONCE(1);
                        return -ENOLCK;
                }
                return generic_add_lease(filp, arg, flp, priv);
        default:
                return -EINVAL;
@@ -1816,7 +1865,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 {
        if (arg == F_UNLCK)
-                return vfs_setlease(filp, F_UNLCK, NULL, NULL);
+                return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
        return do_fcntl_add_lease(fd, filp, arg);
 }
@@ -2171,7 +2220,7 @@ again:
         */
        /*
         * we need that spin_lock here - it prevents reordering between
-         * update of inode->i_flock and check for it done in close().
+         * update of i_flctx->flc_posix and check for it done in close().
         * rcu_read_lock() wouldn't do.
         */
        spin_lock(&current->files->file_lock);
@@ -2331,13 +2380,14 @@ out:
 void locks_remove_posix(struct file *filp, fl_owner_t owner)
 {
        struct file_lock lock;
+        struct file_lock_context *ctx = file_inode(filp)->i_flctx;
        /*
         * If there are no locks held on this file, we don't need to call
         * posix_lock_file().  Another process could be setting a lock on this
         * file at the same time, but we wouldn't remove that lock anyway.
         */
-        if (!file_inode(filp)->i_flock)
+        if (!ctx || list_empty(&ctx->flc_posix))
                return;
        lock.fl_type = F_UNLCK;
@@ -2358,67 +2408,67 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
 EXPORT_SYMBOL(locks_remove_posix);
+/*
+ * Release the flock lock held through @filp, if any.
+ * The caller must guarantee that the inode's i_flctx is non-NULL.
+ */
+static void
+locks_remove_flock(struct file *filp)
+{
+        struct file_lock_context *flctx = file_inode(filp)->i_flctx;
+        /* Unlock request covering offsets 0..OFFSET_MAX, owned by filp */
+        struct file_lock unlock = {
+                .fl_owner = filp,
+                .fl_pid = current->tgid,
+                .fl_file = filp,
+                .fl_flags = FL_FLOCK,
+                .fl_type = F_UNLCK,
+                .fl_end = OFFSET_MAX,
+        };
+        /* Nothing to do when no flock locks are recorded on the inode */
+        if (list_empty(&flctx->flc_flock))
+                return;
+        /* Use the filesystem's ->flock method when it has one */
+        if (!filp->f_op->flock)
+                flock_lock_file(filp, &unlock);
+        else
+                filp->f_op->flock(filp, F_SETLKW, &unlock);
+        /* Give the lock ops a chance to free whatever they attached */
+        if (unlock.fl_ops && unlock.fl_ops->fl_release_private)
+                unlock.fl_ops->fl_release_private(&unlock);
+}
+/*
+ * Drop the leases on the inode that were taken through @filp.
+ * The i_flctx must be valid when calling into here.
+ */
+static void
+locks_remove_lease(struct file *filp)
+{
+        struct file_lock_context *ctx = file_inode(filp)->i_flctx;
+        struct file_lock *fl, *tmp;
+        LIST_HEAD(dispose);
+        if (list_empty(&ctx->flc_lease))
+                return;
+        spin_lock(&ctx->flc_lock);
+        /*
+         * Only touch leases that belong to the file being closed; leases
+         * held through other open files of the same inode must survive.
+         * The _safe iterator is used because lease_modify() is handed the
+         * dispose list and presumably unlinks fl from flc_lease.
+         */
+        list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
+                if (filp == fl->fl_file)
+                        lease_modify(fl, F_UNLCK, &dispose);
+        spin_unlock(&ctx->flc_lock);
+        /* Free the collected entries outside the spinlock */
+        locks_dispose_list(&dispose);
+}
 /*
 * This function is called on the last close of an open file.
 */
 void locks_remove_file(struct file *filp)
 {
-        struct inode * inode = file_inode(filp);
+        if (!file_inode(filp)->i_flctx)
-        struct file_lock *fl;
-        struct file_lock **before;
-        LIST_HEAD(dispose);
-        if (!inode->i_flock)
                return;
+        /* remove any OFD locks */
        locks_remove_posix(filp, filp);
-        if (filp->f_op->flock) {
+        /* remove flock locks */
-                struct file_lock fl = {
+        locks_remove_flock(filp);
-                        .fl_owner = filp,
-                        .fl_pid = current->tgid,
-                        .fl_file = filp,
-                        .fl_flags = FL_FLOCK,
-                        .fl_type = F_UNLCK,
-                        .fl_end = OFFSET_MAX,
-                };
-                filp->f_op->flock(filp, F_SETLKW, &fl);
-                if (fl.fl_ops && fl.fl_ops->fl_release_private)
-                        fl.fl_ops->fl_release_private(&fl);
-        }
-        spin_lock(&inode->i_lock);
-        before = &inode->i_flock;
-        while ((fl = *before) != NULL) {
-                if (fl->fl_file == filp) {
-                        if (IS_LEASE(fl)) {
-                                lease_modify(before, F_UNLCK, &dispose);
-                                continue;
-                        }
-                        /*
-                         * There's a leftover lock on the list of a type that
-                         * we didn't expect to see. Most likely a classic
-                         * POSIX lock that ended up not getting released
-                         * properly, or that raced onto the list somehow. Log
-                         * some info about it and then just remove it from
-                         * the list.
-                         */
-                        WARN(!IS_FLOCK(fl),
-                                "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n",
-                                MAJOR(inode->i_sb->s_dev),
-                                MINOR(inode->i_sb->s_dev), inode->i_ino,
-                                fl->fl_type, fl->fl_flags,
-                                fl->fl_start, fl->fl_end);
-                        locks_delete_lock(before, &dispose);
+        /* remove any leases */
-                        continue;
+        locks_remove_lease(filp);
-                }
-                before = &fl->fl_next;
-        }
-        spin_unlock(&inode->i_lock);
-        locks_dispose_list(&dispose);
 }
 /**
@@ -2621,6 +2671,9 @@ static int __init filelock_init(void)
 {
        int i;
+        flctx_cache = kmem_cache_create("file_lock_ctx",
+                        sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);
        filelock_cache = kmem_cache_create("file_lock_cache",
                        sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index e31e589369a4..01a9e16e9782 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -267,7 +267,6 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
        if (inode) {
                atomic_set(&NCP_FINFO(inode)->opened, info->opened);
-                inode->i_mapping->backing_dev_info = sb->s_bdi;
                inode->i_ino = info->ino;
                ncp_set_attr(inode, info);
                if (S_ISREG(inode->i_mode)) {
@@ -560,7 +559,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
        server = NCP_SBP(sb);
        memset(server, 0, sizeof(*server));
-        error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY);
+        error = bdi_setup_and_register(&server->bdi, "ncpfs");
        if (error)
                goto out_fput;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 4464eb06b0b6..a1f0685b42ff 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -85,25 +85,30 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
 {
        struct inode *inode = state->inode;
        struct file_lock *fl;
+        struct file_lock_context *flctx = inode->i_flctx;
+        struct list_head *list;
        int status = 0;
-        if (inode->i_flock == NULL)
+        if (flctx == NULL)
                goto out;
-        /* Protect inode->i_flock using the i_lock */
+        list = &flctx->flc_posix;
-        spin_lock(&inode->i_lock);
+        spin_lock(&flctx->flc_lock);
-        for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
+restart:
-                if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
+        list_for_each_entry(fl, list, fl_list) {
-                        continue;
                if (nfs_file_open_context(fl->fl_file) != ctx)
                        continue;
-                spin_unlock(&inode->i_lock);
+                spin_unlock(&flctx->flc_lock);
                status = nfs4_lock_delegation_recall(fl, state, stateid);
                if (status < 0)
                        goto out;
-                spin_lock(&inode->i_lock);
+                spin_lock(&flctx->flc_lock);
        }
-        spin_unlock(&inode->i_lock);
+        if (list == &flctx->flc_posix) {
+                list = &flctx->flc_flock;
+                goto restart;
+        }
+        spin_unlock(&flctx->flc_lock);
 out:
        return status;
 }
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2ab6f00dba5b..94712fc781fa 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -646,7 +646,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
        .fault = filemap_fault,
        .map_pages = filemap_map_pages,
        .page_mkwrite = nfs_vm_page_mkwrite,
-        .remap_pages = generic_file_remap_pages,
 };
 static int nfs_need_sync_write(struct file *filp, struct inode *inode)
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 3c9769441f36..91e88a7ecef0 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -960,52 +960,19 @@ filelayout_mark_request_commit(struct nfs_page *req,
 {
        struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
        u32 i, j;
-        struct list_head *list;
-        struct pnfs_commit_bucket *buckets;
        if (fl->commit_through_mds) {
-                list = &cinfo->mds->list;
+                nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
-                spin_lock(cinfo->lock);
+        } else {
-                goto mds_commit;
+                /* Note that we are calling nfs4_fl_calc_j_index on each page
-        }
+                 * that ends up being committed to a data server.  An attractive
+                 * alternative is to add a field to nfs_write_data and nfs_page
-        /* Note that we are calling nfs4_fl_calc_j_index on each page
+                 * to store the value calculated in filelayout_write_pagelist
-         * that ends up being committed to a data server.  An attractive
+                 * and just use that here.
-         * alternative is to add a field to nfs_write_data and nfs_page
-         * to store the value calculated in filelayout_write_pagelist
-         * and just use that here.
-         */
-        j = nfs4_fl_calc_j_index(lseg, req_offset(req));
-        i = select_bucket_index(fl, j);
-        spin_lock(cinfo->lock);
-        buckets = cinfo->ds->buckets;
-        list = &buckets[i].written;
-        if (list_empty(list)) {
-                /* Non-empty buckets hold a reference on the lseg.  That ref
-                 * is normally transferred to the COMMIT call and released
-                 * there.  It could also be released if the last req is pulled
-                 * off due to a rewrite, in which case it will be done in
-                 * pnfs_generic_clear_request_commit
                 */
-                buckets[i].wlseg = pnfs_get_lseg(lseg);
+                j = nfs4_fl_calc_j_index(lseg, req_offset(req));
-        }
+                i = select_bucket_index(fl, j);
-        set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+                pnfs_layout_mark_request_commit(req, lseg, cinfo, i);
-        cinfo->ds->nwritten++;
-mds_commit:
-        /* nfs_request_add_commit_list(). We need to add req to list without
-         * dropping cinfo lock.
-         */
-        set_bit(PG_CLEAN, &(req)->wb_flags);
-        nfs_list_add_request(req, list);
-        cinfo->mds->ncommit++;
-        spin_unlock(cinfo->lock);
-        if (!cinfo->dreq) {
-                inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-                inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
-                             BDI_RECLAIMABLE);
-                __mark_inode_dirty(req->wb_context->dentry->d_inode,
-                                   I_DIRTY_DATASYNC);
        }
 }
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index f29fb7d7e8f8..315cc68945b9 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1332,47 +1332,6 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
        return PNFS_ATTEMPTED;
 }
-static void
-ff_layout_mark_request_commit(struct nfs_page *req,
-                              struct pnfs_layout_segment *lseg,
-                              struct nfs_commit_info *cinfo,
-                              u32 ds_commit_idx)
-{
-        struct list_head *list;
-        struct pnfs_commit_bucket *buckets;
-        spin_lock(cinfo->lock);
-        buckets = cinfo->ds->buckets;
-        list = &buckets[ds_commit_idx].written;
-        if (list_empty(list)) {
-                /* Non-empty buckets hold a reference on the lseg.  That ref
-                 * is normally transferred to the COMMIT call and released
-                 * there.  It could also be released if the last req is pulled
-                 * off due to a rewrite, in which case it will be done in
-                 * pnfs_common_clear_request_commit
-                 */
-                WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
-                buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
-        }
-        set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
-        cinfo->ds->nwritten++;
-        /* nfs_request_add_commit_list(). We need to add req to list without
-         * dropping cinfo lock.
-         */
-        set_bit(PG_CLEAN, &(req)->wb_flags);
-        nfs_list_add_request(req, list);
-        cinfo->mds->ncommit++;
-        spin_unlock(cinfo->lock);
-        if (!cinfo->dreq) {
-                inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-                inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
-                             BDI_RECLAIMABLE);
-                __mark_inode_dirty(req->wb_context->dentry->d_inode,
-                                   I_DIRTY_DATASYNC);
-        }
-}
 static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
 {
        return i;
@@ -1540,7 +1499,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
        .pg_write_ops           = &ff_layout_pg_write_ops,
        .get_ds_info            = ff_layout_get_ds_info,
        .free_deviceid_node     = ff_layout_free_deveiceid_node,
-        .mark_request_commit    = ff_layout_mark_request_commit,
+        .mark_request_commit    = pnfs_layout_mark_request_commit,
        .clear_request_commit   = pnfs_generic_clear_request_commit,
        .scan_commit_lists      = pnfs_generic_scan_commit_lists,
        .recover_commit_reqs    = pnfs_generic_recover_commit_reqs,
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e211f975a69a..83107be3dd01 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -388,7 +388,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
                if (S_ISREG(inode->i_mode)) {
                        inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
                        inode->i_data.a_ops = &nfs_file_aops;
-                        inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
                } else if (S_ISDIR(inode->i_mode)) {
                        inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
                        inode->i_fop = &nfs_dir_operations;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 21469e6e3834..b802fb3a2d99 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -430,7 +430,6 @@ int  nfs_show_options(struct seq_file *, struct dentry *);
 int  nfs_show_devname(struct seq_file *, struct dentry *);
 int  nfs_show_path(struct seq_file *, struct dentry *);
 int  nfs_show_stats(struct seq_file *, struct dentry *);
-void nfs_put_super(struct super_block *);
 int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
 /* write.c */
@@ -599,6 +598,19 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
 }
 /*
+ * Record the page as unstable and mark its inode as dirty.
+ */
+static inline
+void nfs_mark_page_unstable(struct page *page)
+{
+        struct address_space *mapping = page_file_mapping(page);
+        struct inode *host = mapping->host;
+        /* Bump the per-zone unstable counter and the bdi reclaimable counter */
+        inc_zone_page_state(page, NR_UNSTABLE_NFS);
+        inc_bdi_stat(inode_to_bdi(host), BDI_RECLAIMABLE);
+        /* Flag the owning inode as I_DIRTY_DATASYNC */
+        __mark_inode_dirty(host, I_DIRTY_DATASYNC);
+}
+/*
 * Determine the number of bytes of data the page contains
 */
 static inline
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 590f096fd011..5ad908e9ce9c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1373,49 +1373,55 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
        struct nfs_inode *nfsi = NFS_I(inode);
        struct file_lock *fl;
        int status = 0;
+        struct file_lock_context *flctx = inode->i_flctx;
+        struct list_head *list;
-        if (inode->i_flock == NULL)
+        if (flctx == NULL)
                return 0;
+        list = &flctx->flc_posix;
        /* Guard against delegation returns and new lock/unlock calls */
        down_write(&nfsi->rwsem);
-        /* Protect inode->i_flock using the BKL */
+        spin_lock(&flctx->flc_lock);
-        spin_lock(&inode->i_lock);
+restart:
-        for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
+        list_for_each_entry(fl, list, fl_list) {
-                if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
-                        continue;
                if (nfs_file_open_context(fl->fl_file)->state != state)
                        continue;
-                spin_unlock(&inode->i_lock);
+                spin_unlock(&flctx->flc_lock);
                status = ops->recover_lock(state, fl);
                switch (status) {
-                        case 0:
+                case 0:
-                                break;
+                        break;
-                        case -ESTALE:
+                case -ESTALE:
-                        case -NFS4ERR_ADMIN_REVOKED:
+                case -NFS4ERR_ADMIN_REVOKED:
-                        case -NFS4ERR_STALE_STATEID:
+                case -NFS4ERR_STALE_STATEID:
-                        case -NFS4ERR_BAD_STATEID:
+                case -NFS4ERR_BAD_STATEID:
-                        case -NFS4ERR_EXPIRED:
+                case -NFS4ERR_EXPIRED:
-                        case -NFS4ERR_NO_GRACE:
+                case -NFS4ERR_NO_GRACE:
-                        case -NFS4ERR_STALE_CLIENTID:
+                case -NFS4ERR_STALE_CLIENTID:
-                        case -NFS4ERR_BADSESSION:
+                case -NFS4ERR_BADSESSION:
-                        case -NFS4ERR_BADSLOT:
+                case -NFS4ERR_BADSLOT:
-                        case -NFS4ERR_BAD_HIGH_SLOT:
+                case -NFS4ERR_BAD_HIGH_SLOT:
-                        case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+                case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
-                                goto out;
+                        goto out;
-                        default:
+                default:
-                                printk(KERN_ERR "NFS: %s: unhandled error %d\n",
+                        pr_err("NFS: %s: unhandled error %d\n",
-                                         __func__, status);
+                                        __func__, status);
-                        case -ENOMEM:
+                case -ENOMEM:
-                        case -NFS4ERR_DENIED:
+                case -NFS4ERR_DENIED:
-                        case -NFS4ERR_RECLAIM_BAD:
+                case -NFS4ERR_RECLAIM_BAD:
-                        case -NFS4ERR_RECLAIM_CONFLICT:
+                case -NFS4ERR_RECLAIM_CONFLICT:
-                                /* kill_proc(fl->fl_pid, SIGLOST, 1); */
+                        /* kill_proc(fl->fl_pid, SIGLOST, 1); */
-                                status = 0;
+                        status = 0;
                }
-                spin_lock(&inode->i_lock);
+                spin_lock(&flctx->flc_lock);
        }
-        spin_unlock(&inode->i_lock);
+        if (list == &flctx->flc_posix) {
+                list = &flctx->flc_flock;
+                goto restart;
+        }
+        spin_unlock(&flctx->flc_lock);
 out:
        up_write(&nfsi->rwsem);
        return status;
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 48cea3c30e5d..75090feeafad 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -53,7 +53,6 @@ static const struct super_operations nfs4_sops = {
        .destroy_inode  = nfs_destroy_inode,
        .write_inode    = nfs4_write_inode,
        .drop_inode     = nfs_drop_inode,
-        .put_super      = nfs_put_super,
        .statfs         = nfs_statfs,
        .evict_inode    = nfs4_evict_inode,
        .umount_begin   = nfs_umount_begin,
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 960c99f75d3f..d57190a0d533 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -933,11 +933,15 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
                                      struct nfs_pageio_descriptor *pgio)
 {
        size_t size;
+        struct file_lock_context *flctx;
        if (prev) {
                if (!nfs_match_open_context(req->wb_context, prev->wb_context))
                        return false;
-                if (req->wb_context->dentry->d_inode->i_flock != NULL &&
+                flctx = req->wb_context->dentry->d_inode->i_flctx;
+                if (flctx != NULL &&
+                    !(list_empty_careful(&flctx->flc_posix) &&
+                      list_empty_careful(&flctx->flc_flock)) &&
                    !nfs_match_lock_context(req->wb_lock_context,
                                            prev->wb_lock_context))
                        return false;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 797cd6253adf..635f0865671c 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -344,6 +344,10 @@ void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
 struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net,
                                                 struct xdr_stream *xdr,
                                                 gfp_t gfp_flags);
+void pnfs_layout_mark_request_commit(struct nfs_page *req,
+                                     struct pnfs_layout_segment *lseg,
+                                     struct nfs_commit_info *cinfo,
+                                     u32 ds_commit_idx);
 static inline bool nfs_have_layout(struct inode *inode)
 {
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index fdc4f6562bb7..54e36b38fb5f 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -838,3 +838,33 @@ out_err:
        return NULL;
 }
 EXPORT_SYMBOL_GPL(nfs4_decode_mp_ds_addr);
+/*
+ * Queue @req on the "written" list of data-server commit bucket
+ * @ds_commit_idx and account for it in @cinfo.
+ *
+ * The bucket bookkeeping and the list insertion are done under a single
+ * hold of cinfo->lock.  If the lock were dropped in between, the bucket
+ * would have wlseg set while its list is still empty, so a concurrent
+ * caller would take a second lseg reference (and trip the WARN below).
+ */
+void
+pnfs_layout_mark_request_commit(struct nfs_page *req,
+                                struct pnfs_layout_segment *lseg,
+                                struct nfs_commit_info *cinfo,
+                                u32 ds_commit_idx)
+{
+        struct list_head *list;
+        struct pnfs_commit_bucket *buckets;
+        spin_lock(cinfo->lock);
+        buckets = cinfo->ds->buckets;
+        list = &buckets[ds_commit_idx].written;
+        if (list_empty(list)) {
+                /* Non-empty buckets hold a reference on the lseg.  That ref
+                 * is normally transferred to the COMMIT call and released
+                 * there.  It could also be released if the last req is pulled
+                 * off due to a rewrite, in which case it will be done in
+                 * pnfs_generic_clear_request_commit
+                 */
+                WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
+                buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
+        }
+        set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+        cinfo->ds->nwritten++;
+        /* Open-coded nfs_request_add_commit_list(): we need to add req to
+         * the list without dropping the cinfo lock.
+         */
+        set_bit(PG_CLEAN, &(req)->wb_flags);
+        nfs_list_add_request(req, list);
+        cinfo->mds->ncommit++;
+        spin_unlock(cinfo->lock);
+        /* Page accounting is skipped when cinfo->dreq is set */
+        if (!cinfo->dreq)
+                nfs_mark_page_unstable(req->wb_page);
+}
+EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 368d9395d2e7..322b2de02988 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -311,7 +311,6 @@ const struct super_operations nfs_sops = {
        .destroy_inode  = nfs_destroy_inode,
        .write_inode    = nfs_write_inode,
        .drop_inode     = nfs_drop_inode,
-        .put_super      = nfs_put_super,
        .statfs         = nfs_statfs,
        .evict_inode    = nfs_evict_inode,
        .umount_begin   = nfs_umount_begin,
@@ -2572,7 +2571,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
                error = nfs_bdi_register(server);
                if (error) {
                        mntroot = ERR_PTR(error);
-                        goto error_splat_bdi;
+                        goto error_splat_super;
                }
                server->super = s;
        }
@@ -2604,9 +2603,6 @@ error_splat_root:
        dput(mntroot);
        mntroot = ERR_PTR(error);
 error_splat_super:
-        if (server && !s->s_root)
-                bdi_unregister(&server->backing_dev_info);
-error_splat_bdi:
        deactivate_locked_super(s);
        goto out;
 }
@@ -2654,27 +2650,19 @@ out:
 EXPORT_SYMBOL_GPL(nfs_fs_mount);
 /*
- * Ensure that we unregister the bdi before kill_anon_super
- * releases the device name
- */
-void nfs_put_super(struct super_block *s)
-{
-        struct nfs_server *server = NFS_SB(s);
-        bdi_unregister(&server->backing_dev_info);
-}
-EXPORT_SYMBOL_GPL(nfs_put_super);
-/*
 * Destroy an NFS2/3 superblock
 */
 void nfs_kill_super(struct super_block *s)
 {
         struct nfs_server *server = NFS_SB(s);
+        /* Save the device number up front; it is released explicitly below */
+        dev_t dev = s->s_dev;
+        generic_shutdown_super(s);
-        kill_anon_super(s);
         nfs_fscache_release_super_cookie(s);
         nfs_free_server(server);
+        /*
+         * NOTE(review): the anonymous device number is given back only
+         * after the server has been freed - presumably so the device name
+         * stays reserved until the server's bdi is gone; confirm.
+         */
+        free_anon_bdev(dev);
 }
 EXPORT_SYMBOL_GPL(nfs_kill_super);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ceacfeeb28c2..595d81e354d1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -789,13 +789,8 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
        nfs_list_add_request(req, dst);
        cinfo->mds->ncommit++;
        spin_unlock(cinfo->lock);
-        if (!cinfo->dreq) {
+        if (!cinfo->dreq)
-                inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+                nfs_mark_page_unstable(req->wb_page);
-                inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
-                             BDI_RECLAIMABLE);
-                __mark_inode_dirty(req->wb_context->dentry->d_inode,
-                                   I_DIRTY_DATASYNC);
-        }
 }
 EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
@@ -858,7 +853,7 @@ static void
 nfs_clear_page_commit(struct page *page)
 {
        dec_zone_page_state(page, NR_UNSTABLE_NFS);
-        dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
+        dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
 }
 /* Called holding inode (/cinfo) lock */
@@ -1097,6 +1092,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
 {
        struct nfs_open_context *ctx = nfs_file_open_context(file);
        struct nfs_lock_context *l_ctx;
+        struct file_lock_context *flctx = file_inode(file)->i_flctx;
        struct nfs_page *req;
        int do_flush, status;
        /*
@@ -1115,7 +1111,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
                do_flush = req->wb_page != page || req->wb_context != ctx;
                /* for now, flush if more than 1 request in page_group */
                do_flush |= req->wb_this_page != req;
-                if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
+                if (l_ctx && flctx &&
+                    !(list_empty_careful(&flctx->flc_posix) &&
+                      list_empty_careful(&flctx->flc_flock))) {
                        do_flush |= l_ctx->lockowner.l_owner != current->files
                                || l_ctx->lockowner.l_pid != current->tgid;
                }
@@ -1176,6 +1174,13 @@ out:
        return PageUptodate(page) != 0;
 }
+/* True when @fl is a write lock spanning offsets 0 through OFFSET_MAX */
+static bool
+is_whole_file_wrlock(struct file_lock *fl)
+{
+        /* Anything other than a write lock never qualifies */
+        if (fl->fl_type != F_WRLCK)
+                return false;
+        return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX;
+}
 /* If we know the page is up to date, and we're not using byte range locks (or
 * if we have the whole file locked for writing), it may be more efficient to
 * extend the write to cover the entire page in order to avoid fragmentation
@@ -1186,17 +1191,36 @@ out:
 */
 static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
 {
+        int ret;
+        struct file_lock_context *flctx = inode->i_flctx;
+        struct file_lock *fl;
         if (file->f_flags & O_DSYNC)
                 return 0;
         if (!nfs_write_pageuptodate(page, inode))
                 return 0;
         if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
                 return 1;
+        /*
+         * No lock context, or no posix/flock locks recorded at all: byte
+         * range locks are not in use, so the write may be extended.
+         */
-        if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 &&
+        if (!flctx || (list_empty_careful(&flctx->flc_flock) &&
-                        inode->i_flock->fl_end == OFFSET_MAX &&
+                       list_empty_careful(&flctx->flc_posix)))
-                        inode->i_flock->fl_type != F_RDLCK))
+                return 1;
-                return 1;
-        return 0;
+        /* Check to see if there are whole file write locks */
+        ret = 0;
+        spin_lock(&flctx->flc_lock);
+        /* Re-test under the lock; only the first entry of a list is examined */
+        if (!list_empty(&flctx->flc_posix)) {
+                fl = list_first_entry(&flctx->flc_posix, struct file_lock,
+                                        fl_list);
+                if (is_whole_file_wrlock(fl))
+                        ret = 1;
+        } else if (!list_empty(&flctx->flc_flock)) {
+                fl = list_first_entry(&flctx->flc_flock, struct file_lock,
+                                        fl_list);
+                if (fl->fl_type == F_WRLCK)
+                        ret = 1;
+        }
+        spin_unlock(&flctx->flc_lock);
+        return ret;
 }
 /*
@@ -1576,11 +1600,8 @@ void nfs_retry_commit(struct list_head *page_list,
                req = nfs_list_entry(page_list->next);
                nfs_list_remove_request(req);
                nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx);
-                if (!cinfo->dreq) {
+                if (!cinfo->dreq)
-                        dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+                        nfs_clear_page_commit(req->wb_page);
-                        dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
-                                     BDI_RECLAIMABLE);
-                }
                nfs_unlock_and_release_request(req);
        }
 }
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 73395156bdb4..683bf718aead 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -82,6 +82,16 @@ config NFSD_V4
          If unsure, say N.
+config NFSD_PNFS
+        bool "NFSv4.1 server support for Parallel NFS (pNFS)"
+        depends on NFSD_V4
+        help
+          This option enables support for the parallel NFS features of the
+          minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS
+          server.
+          If unsure, say N.
 config NFSD_V4_SECURITY_LABEL
        bool "Provide Security Label support for NFSv4 server"
        depends on NFSD_V4 && SECURITY
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index af32ef06b4fe..9a6028e120c6 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -2,9 +2,14 @@
 # Makefile for the Linux nfs server
 #
+ccflags-y += -I$(src)                   # needed for trace events
 obj-$(CONFIG_NFSD)      += nfsd.o
-nfsd-y                  := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
+# this one should be compiled first, as the tracing macros can easily blow up
+nfsd-y                  += trace.o
+nfsd-y                  += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
                           export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
 nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
 nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
@@ -12,3 +17,4 @@ nfsd-$(CONFIG_NFSD_V3)	+= nfs3proc.o nfs3xdr.o
 nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
 nfsd-$(CONFIG_NFSD_V4)  += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
                           nfs4acl.o nfs4callback.o nfs4recover.o
+nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
new file mode 100644
index 000000000000..cdbc78c72542
--- /dev/null
+++ b/fs/nfsd/blocklayout.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/exportfs.h>
+#include <linux/genhd.h>
+#include <linux/slab.h>
+#include <linux/nfsd/debug.h>
+#include "blocklayoutxdr.h"
+#include "pnfs.h"
+#define NFSDDBG_FACILITY        NFSDDBG_PNFS
+/*
+ * Build a device address describing a single simple volume for @sb and
+ * attach it to @gdp->gd_device.  The signature and offset of the volume are
+ * filled in by the filesystem's ->get_uuid export operation, whose return
+ * value is handed back to the caller.  Returns -ENOMEM if the allocation
+ * fails.
+ */
+static int
+nfsd4_block_get_device_info_simple(struct super_block *sb,
+                struct nfsd4_getdeviceinfo *gdp)
+{
+        struct pnfs_block_deviceaddr *dev;
+        struct pnfs_block_volume *vol;
+        /* header plus room for exactly one trailing volume entry */
+        dev = kzalloc(sizeof(*dev) + sizeof(*vol), GFP_KERNEL);
+        if (dev == NULL)
+                return -ENOMEM;
+        gdp->gd_device = dev;
+        vol = &dev->volumes[0];
+        dev->nr_volumes = 1;
+        vol->type = PNFS_BLOCK_VOLUME_SIMPLE;
+        /* in: capacity of the signature buffer; ->get_uuid may update it */
+        vol->simple.sig_len = PNFS_BLOCK_UUID_LEN;
+        return sb->s_export_op->get_uuid(sb, vol->simple.sig,
+                        &vol->simple.sig_len, &vol->simple.offset);
+}
+/*
+ * GETDEVICEINFO handler for the block layout.  A superblock whose block
+ * device differs from that device's ->bd_contains is rejected with
+ * nfserr_inval; otherwise a simple volume description is built and any
+ * errno from that step is translated with nfserrno().
+ */
+static __be32
+nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
+                struct nfsd4_getdeviceinfo *gdp)
+{
+        int error;
+        if (sb->s_bdev->bd_contains != sb->s_bdev)
+                return nfserr_inval;
+        error = nfsd4_block_get_device_info_simple(sb, gdp);
+        return nfserrno(error);
+}
+/*
+ * LAYOUTGET handler for the block layout.
+ *
+ * Asks the filesystem's ->map_blocks export operation for the extent backing
+ * the requested segment and converts it into a single pnfs_block_extent,
+ * which is stored in args->lg_content.  On success the segment in
+ * args->lg_seg is rewritten to the range that was actually mapped and 0 is
+ * returned.  On every failure path the segment length is reset to 0:
+ * requests that cannot be served yield nfserr_layoutunavailable, all other
+ * errors are translated with nfserrno().
+ */
+static __be32
+nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
+                struct nfsd4_layoutget *args)
+{
+        struct nfsd4_layout_seg *seg = &args->lg_seg;
+        struct super_block *sb = inode->i_sb;
+        u32 block_size = (1 << inode->i_blkbits);
+        struct pnfs_block_extent *bex;
+        struct iomap iomap;
+        u32 device_generation = 0;
+        int error;
+        /*
+         * We do not attempt to support I/O smaller than the fs block size,
+         * or not aligned to it.
+         */
+        if (args->lg_minlength < block_size) {
+                dprintk("pnfsd: I/O too small\n");
+                goto out_layoutunavailable;
+        }
+        if (seg->offset & (block_size - 1)) {
+                dprintk("pnfsd: I/O misaligned\n");
+                goto out_layoutunavailable;
+        }
+        /*
+         * Some clients barf on non-zero block numbers for NONE or INVALID
+         * layouts, so make sure to zero the whole structure.
+         */
+        error = -ENOMEM;
+        bex = kzalloc(sizeof(*bex), GFP_KERNEL);
+        if (!bex)
+                goto out_error;
+        args->lg_content = bex;
+        /* any iomode other than IOMODE_READ is passed on as a write mapping */
+        error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
+                                            &iomap, seg->iomode != IOMODE_READ,
+                                            &device_generation);
+        if (error) {
+                /* -ENXIO is reported as "no layout available", not as an error */
+                if (error == -ENXIO)
+                        goto out_layoutunavailable;
+                goto out_error;
+        }
+        if (iomap.length < args->lg_minlength) {
+                dprintk("pnfsd: extent smaller than minlength\n");
+                goto out_layoutunavailable;
+        }
+        /* translate the filesystem's extent type into a pNFS extent state */
+        switch (iomap.type) {
+        case IOMAP_MAPPED:
+                if (seg->iomode == IOMODE_READ)
+                        bex->es = PNFS_BLOCK_READ_DATA;
+                else
+                        bex->es = PNFS_BLOCK_READWRITE_DATA;
+                /* NOTE(review): << 9 presumably converts 512-byte sectors to bytes */
+                bex->soff = (iomap.blkno << 9);
+                break;
+        case IOMAP_UNWRITTEN:
+                if (seg->iomode & IOMODE_RW) {
+                        /*
+                         * Crack monkey special case from section 2.3.1.
+                         */
+                        if (args->lg_minlength == 0) {
+                                dprintk("pnfsd: no soup for you!\n");
+                                goto out_layoutunavailable;
+                        }
+                        bex->es = PNFS_BLOCK_INVALID_DATA;
+                        bex->soff = (iomap.blkno << 9);
+                        break;
+                }
+                /*FALLTHRU*/
+        case IOMAP_HOLE:
+                if (seg->iomode == IOMODE_READ) {
+                        bex->es = PNFS_BLOCK_NONE_DATA;
+                        break;
+                }
+                /*FALLTHRU*/
+        case IOMAP_DELALLOC:
+        default:
+                WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
+                goto out_layoutunavailable;
+        }
+        error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
+        if (error)
+                goto out_error;
+        /* report the range the filesystem mapped, not the one requested */
+        bex->foff = iomap.offset;
+        bex->len = iomap.length;
+        seg->offset = iomap.offset;
+        seg->length = iomap.length;
+        dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es);
+        return 0;
+out_error:
+        seg->length = 0;
+        return nfserrno(error);
+out_layoutunavailable:
+        seg->length = 0;
+        return nfserr_layoutunavailable;
+}
+/*
+ * LAYOUTCOMMIT handler for the block layout.
+ *
+ * Decodes the client's layoutupdate into an array of iomaps and passes them,
+ * together with the new timestamps and (if the file grew) the new size, to
+ * the filesystem's ->commit_blocks export operation.  The iomap array is
+ * freed before returning.  Decode and commit errors are translated with
+ * nfserrno().
+ */
+static __be32
+nfsd4_block_proc_layoutcommit(struct inode *inode,
+                struct nfsd4_layoutcommit *lcp)
+{
+        /* lc_last_wr is the offset of the last byte written, hence the + 1 */
+        loff_t new_size = lcp->lc_last_wr + 1;
+        struct iattr iattr = { .ia_valid = 0 };
+        struct iomap *iomaps;
+        int nr_iomaps;
+        int error;
+        nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
+                        lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
+        if (nr_iomaps < 0)
+                return nfserrno(nr_iomaps);
+        /*
+         * Use the current fs time when the client asked for "now" or
+         * supplied an mtime older than the one already on the inode.
+         */
+        if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
+            timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
+                lcp->lc_mtime = current_fs_time(inode->i_sb);
+        iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
+        iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
+        /* only ever grow the file here, never shrink it */
+        if (new_size > i_size_read(inode)) {
+                iattr.ia_valid |= ATTR_SIZE;
+                iattr.ia_size = new_size;
+        }
+        error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps,
+                        nr_iomaps, &iattr);
+        kfree(iomaps);
+        return nfserrno(error);
+}
+/* Operations implementing the block volume layout type. */
+const struct nfsd4_layout_ops bl_layout_ops = {
+        .proc_getdeviceinfo     = nfsd4_block_proc_getdeviceinfo,
+        .encode_getdeviceinfo   = nfsd4_block_encode_getdeviceinfo,
+        .proc_layoutget         = nfsd4_block_proc_layoutget,
+        .encode_layoutget       = nfsd4_block_encode_layoutget,
+        .proc_layoutcommit      = nfsd4_block_proc_layoutcommit,
+};
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
new file mode 100644
index 000000000000..9da89fddab33
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/exportfs.h>
+#include <linux/nfs4.h>
+#include "nfsd.h"
+#include "blocklayoutxdr.h"
+#define NFSDDBG_FACILITY        NFSDDBG_PNFS
+/*
+ * Encode the single extent found in lgp->lg_content as the body of a block
+ * layout: a length word, the extent count (always 1), the device id, the
+ * file offset, length and storage offset, and finally the extent state.
+ * Returns nfserr_toosmall if the reply buffer has no room for it.
+ */
+__be32
+nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
+                struct nfsd4_layoutget *lgp)
+{
+        struct pnfs_block_extent *ext = lgp->lg_content;
+        /* extent count + (device id and three hypers) + extent state */
+        int body_len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
+        __be32 *p;
+        /* one extra word in front for the body length itself */
+        p = xdr_reserve_space(xdr, sizeof(__be32) + body_len);
+        if (p == NULL)
+                return nfserr_toosmall;
+        *p++ = cpu_to_be32(body_len);
+        *p++ = cpu_to_be32(1);          /* we always return a single extent */
+        p = xdr_encode_opaque_fixed(p, &ext->vol_id, sizeof(ext->vol_id));
+        p = xdr_encode_hyper(p, ext->foff);
+        p = xdr_encode_hyper(p, ext->len);
+        p = xdr_encode_hyper(p, ext->soff);
+        *p = cpu_to_be32(ext->es);
+        return 0;
+}
+/*
+ * Encode one volume description of a block device address into @xdr.
+ * Only PNFS_BLOCK_VOLUME_SIMPLE is supported.
+ *
+ * Returns the number of bytes added to the stream, -ETOOSMALL if the space
+ * could not be reserved, or -ENOTSUPP for any other volume type.
+ */
+static int
+nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
+{
+        __be32 *p;
+        int len;
+        switch (b->type) {
+        case PNFS_BLOCK_VOLUME_SIMPLE:
+                /*
+                 * type + signature count + offset + signature length,
+                 * followed by the signature itself.  xdr_encode_opaque()
+                 * pads the signature to a multiple of four bytes, so the
+                 * padding must be part of the length we report; otherwise
+                 * the total written by the caller is short whenever sig_len
+                 * is not a multiple of four.
+                 */
+                len = 4 + 4 + 8 + 4 + (XDR_QUADLEN(b->simple.sig_len) << 2);
+                p = xdr_reserve_space(xdr, len);
+                if (!p)
+                        return -ETOOSMALL;
+                *p++ = cpu_to_be32(b->type);
+                *p++ = cpu_to_be32(1);  /* single signature */
+                p = xdr_encode_hyper(p, b->simple.offset);
+                p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
+                break;
+        default:
+                return -ENOTSUPP;
+        }
+        return len;
+}
+/*
+ * Encode the block device address in gdp->gd_device: a length word, the
+ * number of volumes, then each volume in turn.  Returns nfserr_resource if
+ * the header cannot be reserved, or the translated error of the first volume
+ * that fails to encode.
+ */
+__be32
+nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
+                struct nfsd4_getdeviceinfo *gdp)
+{
+        struct pnfs_block_deviceaddr *dev = gdp->gd_device;
+        /* len starts out covering the volume count word */
+        int len = sizeof(__be32), ret, i;
+        __be32 *p;
+        /* reserve the two header words now; they are filled in at the end */
+        p = xdr_reserve_space(xdr, len + sizeof(__be32));
+        if (!p)
+                return nfserr_resource;
+        for (i = 0; i < dev->nr_volumes; i++) {
+                ret = nfsd4_block_encode_volume(xdr, &dev->volumes[i]);
+                if (ret < 0)
+                        return nfserrno(ret);
+                len += ret;
+        }
+        /*
+         * Fill in the overall length and number of volumes at the beginning
+         * of the layout.
+         */
+        *p++ = cpu_to_be32(len);
+        *p++ = cpu_to_be32(dev->nr_volumes);
+        return 0;
+}
+/*
+ * Decode the layoutupdate blob sent with LAYOUTCOMMIT.
+ *
+ * @p:          start of the XDR-encoded extent array
+ * @len:        length of the blob in bytes
+ * @iomapp:     on success, set to a kcalloc()ed array the caller must kfree()
+ * @block_size: filesystem block size; every offset and length in the blob
+ *              must be aligned to it
+ *
+ * Returns the number of extents decoded, -EINVAL if the blob is malformed
+ * (wrong size, misaligned values, or an extent state other than
+ * PNFS_BLOCK_READWRITE_DATA), or -ENOMEM if the array cannot be allocated.
+ * *iomapp is left untouched on failure.
+ */
+int
+nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+                u32 block_size)
+{
+        struct iomap *iomaps;
+        u32 nr_iomaps, i;
+        u64 expected;
+        if (len < sizeof(u32)) {
+                dprintk("%s: extent array too small: %u\n", __func__, len);
+                return -EINVAL;
+        }
+        nr_iomaps = be32_to_cpup(p++);
+        /*
+         * nr_iomaps comes straight off the wire.  Do the multiplication in
+         * 64 bits so a huge count cannot wrap around and slip past the
+         * length check below.
+         */
+        expected = sizeof(__be32) + (u64)nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
+        if (len != expected) {
+                dprintk("%s: extent array size mismatch: %u/%llu\n",
+                        __func__, len, expected);
+                return -EINVAL;
+        }
+        iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
+        if (!iomaps) {
+                dprintk("%s: failed to allocate extent array\n", __func__);
+                return -ENOMEM;
+        }
+        for (i = 0; i < nr_iomaps; i++) {
+                struct pnfs_block_extent bex;
+                memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
+                p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
+                p = xdr_decode_hyper(p, &bex.foff);
+                if (bex.foff & (block_size - 1)) {
+                        dprintk("%s: unaligned offset %lld\n",
+                                __func__, bex.foff);
+                        goto fail;
+                }
+                p = xdr_decode_hyper(p, &bex.len);
+                if (bex.len & (block_size - 1)) {
+                        /* report the offending length, not the offset */
+                        dprintk("%s: unaligned length %lld\n",
+                                __func__, bex.len);
+                        goto fail;
+                }
+                p = xdr_decode_hyper(p, &bex.soff);
+                if (bex.soff & (block_size - 1)) {
+                        dprintk("%s: unaligned disk offset %lld\n",
+                                __func__, bex.soff);
+                        goto fail;
+                }
+                bex.es = be32_to_cpup(p++);
+                if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
+                        dprintk("%s: incorrect extent state %d\n",
+                                __func__, bex.es);
+                        goto fail;
+                }
+                /* only the file range is handed on to the filesystem */
+                iomaps[i].offset = bex.foff;
+                iomaps[i].length = bex.len;
+        }
+        *iomapp = iomaps;
+        return nr_iomaps;
+fail:
+        kfree(iomaps);
+        return -EINVAL;
+}
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
new file mode 100644
index 000000000000..fdc79037c0e7
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -0,0 +1,62 @@
+#ifndef _NFSD_BLOCKLAYOUTXDR_H
+#define _NFSD_BLOCKLAYOUTXDR_H 1
+#include <linux/blkdev.h>
+#include "xdr4.h"
+struct iomap;
+struct xdr_stream;
+/*
+ * State of an extent as sent on the wire.  The LAYOUTGET handler hands out
+ * READWRITE_DATA and READ_DATA for mapped extents, INVALID_DATA for
+ * unwritten extents requested read/write, and NONE_DATA for holes or
+ * unwritten extents requested read-only.
+ */
+enum pnfs_block_extent_state {
+        PNFS_BLOCK_READWRITE_DATA       = 0,
+        PNFS_BLOCK_READ_DATA            = 1,
+        PNFS_BLOCK_INVALID_DATA         = 2,
+        PNFS_BLOCK_NONE_DATA            = 3,
+};
+/* In-memory form of one block layout extent. */
+struct pnfs_block_extent {
+        struct nfsd4_deviceid           vol_id; /* device the extent lives on */
+        u64                             foff;   /* offset within the file */
+        u64                             len;    /* length of the extent */
+        u64                             soff;   /* offset on the storage device */
+        enum pnfs_block_extent_state    es;     /* extent state */
+};
+#define NFS4_BLOCK_EXTENT_SIZE          44
+/*
+ * Volume types of a block device address.  The encoder in blocklayoutxdr.c
+ * only handles PNFS_BLOCK_VOLUME_SIMPLE.
+ */
+enum pnfs_block_volume_type {
+        PNFS_BLOCK_VOLUME_SIMPLE        = 0,
+        PNFS_BLOCK_VOLUME_SLICE         = 1,
+        PNFS_BLOCK_VOLUME_CONCAT        = 2,
+        PNFS_BLOCK_VOLUME_STRIPE        = 3,
+};
+/*
+ * Random upper cap for the uuid length to avoid unbounded allocation.
+ * Not actually limited by the protocol.
+ */
+#define PNFS_BLOCK_UUID_LEN     128
+/* One volume of a block device address; the union is selected by @type. */
+struct pnfs_block_volume {
+        enum pnfs_block_volume_type     type;
+        union {
+                /* PNFS_BLOCK_VOLUME_SIMPLE: a single signature at an offset */
+                struct {
+                        u64             offset;
+                        u32             sig_len;        /* bytes used in sig[] */
+                        u8              sig[PNFS_BLOCK_UUID_LEN];
+                } simple;
+        };
+};
+/* Device address returned by GETDEVICEINFO: a counted array of volumes. */
+struct pnfs_block_deviceaddr {
+        u32                             nr_volumes;     /* entries in volumes[] */
+        struct pnfs_block_volume        volumes[];
+};
+__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
+                struct nfsd4_getdeviceinfo *gdp);
+__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
+                struct nfsd4_layoutget *lgp);
+int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+                u32 block_size);
+#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 30a739d896ff..c3e3b6e55ae2 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -20,6 +20,7 @@
 #include "nfsd.h"
 #include "nfsfh.h"
 #include "netns.h"
+#include "pnfs.h"
 #define NFSDDBG_FACILITY        NFSDDBG_EXPORT
@@ -545,6 +546,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
        exp.ex_client = dom;
        exp.cd = cd;
+        exp.ex_devid_map = NULL;
        /* expiry */
        err = -EINVAL;
@@ -621,6 +623,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
                if (!gid_valid(exp.ex_anon_gid))
                        goto out4;
                err = 0;
+                nfsd4_setup_layout_type(&exp);
        }
        expp = svc_export_lookup(&exp);
@@ -703,6 +707,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
        new->ex_fslocs.locations = NULL;
        new->ex_fslocs.locations_count = 0;
        new->ex_fslocs.migrated = 0;
+        new->ex_layout_type = 0;
        new->ex_uuid = NULL;
        new->cd = item->cd;
 }
@@ -717,6 +722,8 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
        new->ex_anon_uid = item->ex_anon_uid;
        new->ex_anon_gid = item->ex_anon_gid;
        new->ex_fsid = item->ex_fsid;
+        new->ex_devid_map = item->ex_devid_map;
+        item->ex_devid_map = NULL;
        new->ex_uuid = item->ex_uuid;
        item->ex_uuid = NULL;
        new->ex_fslocs.locations = item->ex_fslocs.locations;
@@ -725,6 +732,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
        item->ex_fslocs.locations_count = 0;
        new->ex_fslocs.migrated = item->ex_fslocs.migrated;
        item->ex_fslocs.migrated = 0;
+        new->ex_layout_type = item->ex_layout_type;
        new->ex_nflavors = item->ex_nflavors;
        for (i = 0; i < MAX_SECINFO_LIST; i++) {
                new->ex_flavors[i] = item->ex_flavors[i];
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 04dc8c167b0c..1f52bfcc436f 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -56,6 +56,8 @@ struct svc_export {
        struct nfsd4_fs_locations ex_fslocs;
        uint32_t                ex_nflavors;
        struct exp_flavor_info  ex_flavors[MAX_SECINFO_LIST];
+        enum pnfs_layouttype    ex_layout_type;
+        struct nfsd4_deviceid_map *ex_devid_map;
        struct cache_detail     *cd;
 };
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7cbdf1b2e4ab..58277859a467 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -546,6 +546,102 @@ out:
        return status;
 }
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * CB_LAYOUTRECALL4args
+ *
+ *      struct layoutrecall_file4 {
+ *              nfs_fh4         lor_fh;
+ *              offset4         lor_offset;
+ *              length4         lor_length;
+ *              stateid4        lor_stateid;
+ *      };
+ *
+ *      union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) {
+ *      case LAYOUTRECALL4_FILE:
+ *              layoutrecall_file4 lor_layout;
+ *      case LAYOUTRECALL4_FSID:
+ *              fsid4              lor_fsid;
+ *      case LAYOUTRECALL4_ALL:
+ *              void;
+ *      };
+ *
+ *      struct CB_LAYOUTRECALL4args {
+ *              layouttype4             clora_type;
+ *              layoutiomode4           clora_iomode;
+ *              bool                    clora_changed;
+ *              layoutrecall4           clora_recall;
+ *      };
+ */
+static void encode_cb_layout4args(struct xdr_stream *xdr,
+                                  const struct nfs4_layout_stateid *ls,
+                                  struct nfs4_cb_compound_hdr *hdr)
+{
+        __be32 *p;
+        BUG_ON(hdr->minorversion == 0);
+        /* opcode followed by the first four words of CB_LAYOUTRECALL4args */
+        p = xdr_reserve_space(xdr, 5 * 4);
+        *p++ = cpu_to_be32(OP_CB_LAYOUTRECALL);
+        *p++ = cpu_to_be32(ls->ls_layout_type);         /* clora_type */
+        *p++ = cpu_to_be32(IOMODE_ANY);                 /* clora_iomode */
+        *p++ = cpu_to_be32(1);                          /* clora_changed */
+        *p = cpu_to_be32(RETURN_FILE);                  /* lor_recalltype */
+        encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle);
+        /* recall the whole file: offset 0, maximum length */
+        p = xdr_reserve_space(xdr, 2 * 8);
+        p = xdr_encode_hyper(p, 0);
+        xdr_encode_hyper(p, NFS4_MAX_UINT64);
+        encode_stateid4(xdr, &ls->ls_recall_sid);
+        /* account for this operation in the compound header */
+        hdr->nops++;
+}
+/*
+ * Encode a callback compound consisting of a sequence operation followed by
+ * CB_LAYOUTRECALL.  @cb must be the ls_recall member of a layout stateid.
+ */
+static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req,
+                                   struct xdr_stream *xdr,
+                                   const struct nfsd4_callback *cb)
+{
+        const struct nfs4_layout_stateid *ls =
+                container_of(cb, struct nfs4_layout_stateid, ls_recall);
+        struct nfs4_cb_compound_hdr hdr = {
+                .ident = 0,
+                .minorversion = cb->cb_minorversion,
+        };
+        encode_cb_compound4args(xdr, &hdr);
+        encode_cb_sequence4args(xdr, cb, &hdr);
+        encode_cb_layout4args(xdr, ls, &hdr);
+        /* write back the final operation count */
+        encode_cb_nops(&hdr);
+}
+/*
+ * Decode the reply to a CB_LAYOUTRECALL compound.  Returns 0 on success, the
+ * first decode error encountered, or the errno equivalent of a non-OK
+ * operation status.
+ */
+static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
+                                  struct xdr_stream *xdr,
+                                  struct nfsd4_callback *cb)
+{
+        struct nfs4_cb_compound_hdr hdr;
+        enum nfsstat4 nfserr;
+        int status;
+        status = decode_cb_compound4res(xdr, &hdr);
+        if (unlikely(status))
+                return status;
+        /* the sequence result is only decoded when a callback is supplied */
+        if (cb) {
+                status = decode_cb_sequence4res(xdr, cb);
+                if (unlikely(status))
+                        return status;
+        }
+        status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr);
+        if (unlikely(status))
+                return status;
+        if (unlikely(nfserr != NFS4_OK))
+                return nfs_cb_stat_to_errno(nfserr);
+        return 0;
+}
+#endif /* CONFIG_NFSD_PNFS */
 /*
 * RPC procedure tables
 */
@@ -563,6 +659,9 @@ out:
 static struct rpc_procinfo nfs4_cb_procedures[] = {
        PROC(CB_NULL,   NULL,           cb_null,        cb_null),
        PROC(CB_RECALL, COMPOUND,       cb_recall,      cb_recall),
+#ifdef CONFIG_NFSD_PNFS
+        PROC(CB_LAYOUT, COMPOUND,       cb_layout,      cb_layout),
+#endif
 };
 static struct rpc_version nfs_cb_version4 = {
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
new file mode 100644
index 000000000000..3c1bfa155571
--- /dev/null
+++ b/fs/nfsd/nfs4layouts.c
@@ -0,0 +1,721 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/kmod.h>
+#include <linux/file.h>
+#include <linux/jhash.h>
+#include <linux/sched.h>
+#include <linux/sunrpc/addr.h>
+#include "pnfs.h"
+#include "netns.h"
+#include "trace.h"
+#define NFSDDBG_FACILITY                NFSDDBG_PNFS
+/* One layout segment held under a layout stateid. */
+struct nfs4_layout {
+        struct list_head                lo_perstate;    /* entry in ls_layouts or a reap list */
+        struct nfs4_layout_stateid      *lo_state;      /* owning layout stateid */
+        struct nfsd4_layout_seg         lo_seg;         /* range and iomode covered */
+};
+static struct kmem_cache *nfs4_layout_cache;
+static struct kmem_cache *nfs4_layout_stateid_cache;
+static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
+static const struct lock_manager_operations nfsd4_layouts_lm_ops;
+const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] =  {
+        [LAYOUT_BLOCK_VOLUME]   = &bl_layout_ops,
+};
+/* pNFS device ID to export fsid mapping */
+#define DEVID_HASH_BITS 8
+#define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS)
+#define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1)
+static u64 nfsd_devid_seq = 1;
+static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE];
+static DEFINE_SPINLOCK(nfsd_devid_lock);
+/* Map a device id index onto a bucket of nfsd_devid_hash[]. */
+static inline u32 devid_hashfn(u64 idx)
+{
+        u32 low = idx;
+        u32 high = idx >> 32;
+        return jhash_2words(low, high, 0) & DEVID_HASH_MASK;
+}
+/*
+ * Make sure the export behind @fhp has a device id map.
+ *
+ * A candidate map is allocated before taking nfsd_devid_lock.  Under the
+ * lock the export is re-checked, then every hash bucket is searched for an
+ * existing map with the same fsid; only if none is found is the candidate
+ * given a fresh index and published.  An unused candidate is freed.  The
+ * function returns nothing: callers detect failure by ex_devid_map still
+ * being NULL.
+ */
+static void
+nfsd4_alloc_devid_map(const struct svc_fh *fhp)
+{
+        const struct knfsd_fh *fh = &fhp->fh_handle;
+        size_t fsid_len = key_len(fh->fh_fsid_type);
+        struct nfsd4_deviceid_map *map, *old;
+        int i;
+        map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL);
+        if (!map)
+                return;
+        map->fsid_type = fh->fh_fsid_type;
+        memcpy(&map->fsid, fh->fh_fsid, fsid_len);
+        spin_lock(&nfsd_devid_lock);
+        /* somebody else may have installed a map while we were allocating */
+        if (fhp->fh_export->ex_devid_map)
+                goto out_unlock;
+        /* maps are hashed by index, so a lookup by fsid has to scan them all */
+        for (i = 0; i < DEVID_HASH_SIZE; i++) {
+                list_for_each_entry(old, &nfsd_devid_hash[i], hash) {
+                        if (old->fsid_type != fh->fh_fsid_type)
+                                continue;
+                        if (memcmp(old->fsid, fh->fh_fsid,
+                                        key_len(old->fsid_type)))
+                                continue;
+                        fhp->fh_export->ex_devid_map = old;
+                        goto out_unlock;
+                }
+        }
+        map->idx = nfsd_devid_seq++;
+        list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]);
+        fhp->fh_export->ex_devid_map = map;
+        /* ownership passed to the hash table; keep kfree() below from freeing it */
+        map = NULL;
+out_unlock:
+        spin_unlock(&nfsd_devid_lock);
+        kfree(map);
+}
+/*
+ * Look up the device id map with index @idx, or return NULL if there is
+ * none.  The bucket is walked under rcu_read_lock(), but the pointer is
+ * returned after the read-side section has ended.
+ * NOTE(review): @idx is an int while indices are drawn from the u64
+ * nfsd_devid_seq counter -- confirm the intended width.
+ */
+struct nfsd4_deviceid_map *
+nfsd4_find_devid_map(int idx)
+{
+        struct nfsd4_deviceid_map *map, *ret = NULL;
+        rcu_read_lock();
+        list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash)
+                if (map->idx == idx)
+                        ret = map;
+        rcu_read_unlock();
+        return ret;
+}
+/*
+ * Fill in the device id @id for the export behind @fhp, creating the
+ * export's device id map on first use.  Returns 0 on success or -ENOMEM if
+ * no map could be set up.
+ */
+int
+nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
+                u32 device_generation)
+{
+        if (fhp->fh_export->ex_devid_map == NULL)
+                nfsd4_alloc_devid_map(fhp);
+        /* still unset: the allocation above must have failed */
+        if (fhp->fh_export->ex_devid_map == NULL)
+                return -ENOMEM;
+        id->pad = 0;
+        id->generation = device_generation;
+        id->fsid_idx = fhp->fh_export->ex_devid_map->idx;
+        return 0;
+}
+/*
+ * Pick the layout type offered for @exp.  The block volume layout is chosen
+ * when pNFS is not disabled by the NFSEXP_NOPNFS export flag and the
+ * filesystem provides the get_uuid, map_blocks and commit_blocks export
+ * operations; otherwise ex_layout_type is left as it was.
+ */
+void nfsd4_setup_layout_type(struct svc_export *exp)
+{
+        struct super_block *sb = exp->ex_path.mnt->mnt_sb;
+        if (exp->ex_flags & NFSEXP_NOPNFS)
+                return;
+        if (!sb->s_export_op->get_uuid)
+                return;
+        if (!sb->s_export_op->map_blocks)
+                return;
+        if (!sb->s_export_op->commit_blocks)
+                return;
+        exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
+}
+/*
+ * ->sc_free callback for layout stateids: unlink the stateid from its client
+ * and file lists, drop the lease and the file reference taken when it was
+ * allocated, undo the recall accounting, and free the object.
+ */
+static void
+nfsd4_free_layout_stateid(struct nfs4_stid *stid)
+{
+        struct nfs4_layout_stateid *ls = layoutstateid(stid);
+        struct nfs4_client *clp = ls->ls_stid.sc_client;
+        struct nfs4_file *fp = ls->ls_stid.sc_file;
+        trace_layoutstate_free(&ls->ls_stid.sc_stateid);
+        spin_lock(&clp->cl_lock);
+        list_del_init(&ls->ls_perclnt);
+        spin_unlock(&clp->cl_lock);
+        spin_lock(&fp->fi_lock);
+        list_del_init(&ls->ls_perfile);
+        spin_unlock(&fp->fi_lock);
+        /* remove the lease installed by nfsd4_layout_setlease() */
+        vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
+        fput(ls->ls_file);
+        /* balance the increment done in nfsd4_recall_file_layout() */
+        if (ls->ls_recalled)
+                atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
+        kmem_cache_free(nfs4_layout_stateid_cache, ls);
+}
+/*
+ * Install an FL_LAYOUT read lease, owned by @ls, on ls->ls_file.  Returns 0
+ * on success, -ENOMEM if no lock could be allocated, or the error from
+ * vfs_setlease().
+ */
+static int
+nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
+{
+        struct file_lock *fl;
+        int status;
+        fl = locks_alloc_lock();
+        if (!fl)
+                return -ENOMEM;
+        locks_init_lock(fl);
+        fl->fl_lmops = &nfsd4_layouts_lm_ops;
+        fl->fl_flags = FL_LAYOUT;
+        fl->fl_type = F_RDLCK;
+        fl->fl_end = OFFSET_MAX;
+        fl->fl_owner = ls;
+        fl->fl_pid = current->tgid;
+        fl->fl_file = ls->ls_file;
+        status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
+        if (status) {
+                locks_free_lock(fl);
+                return status;
+        }
+        /* on success vfs_setlease() is expected to have taken over the lock */
+        BUG_ON(fl != NULL);
+        return 0;
+}
+/*
+ * Allocate and initialise a layout stateid for the client in @cstate, using
+ * the file of the existing stateid @parent.
+ *
+ * The new stateid holds a reference on the nfs4_file and on a struct file
+ * (the delegation file when @parent is a delegation, any open file
+ * otherwise), has a layout lease installed on that file, and is linked onto
+ * the per-client and per-file lists.  Returns NULL if the allocation or the
+ * lease setup fails.
+ */
+static struct nfs4_layout_stateid *
+nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
+                struct nfs4_stid *parent, u32 layout_type)
+{
+        struct nfs4_client *clp = cstate->clp;
+        struct nfs4_file *fp = parent->sc_file;
+        struct nfs4_layout_stateid *ls;
+        struct nfs4_stid *stp;
+        stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache);
+        if (!stp)
+                return NULL;
+        stp->sc_free = nfsd4_free_layout_stateid;
+        get_nfs4_file(fp);
+        stp->sc_file = fp;
+        ls = layoutstateid(stp);
+        INIT_LIST_HEAD(&ls->ls_perclnt);
+        INIT_LIST_HEAD(&ls->ls_perfile);
+        spin_lock_init(&ls->ls_lock);
+        INIT_LIST_HEAD(&ls->ls_layouts);
+        ls->ls_layout_type = layout_type;
+        nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
+                        NFSPROC4_CLNT_CB_LAYOUT);
+        if (parent->sc_type == NFS4_DELEG_STID)
+                ls->ls_file = get_file(fp->fi_deleg_file);
+        else
+                ls->ls_file = find_any_file(fp);
+        BUG_ON(!ls->ls_file);
+        if (nfsd4_layout_setlease(ls)) {
+                /*
+                 * Drop the file reference taken just above; without this
+                 * the struct file is leaked whenever the lease cannot be
+                 * set.
+                 * NOTE(review): the stid is freed directly here rather than
+                 * through nfs4_put_stid(); confirm nfs4_alloc_stid() leaves
+                 * nothing else to undo.
+                 */
+                fput(ls->ls_file);
+                put_nfs4_file(fp);
+                kmem_cache_free(nfs4_layout_stateid_cache, ls);
+                return NULL;
+        }
+        spin_lock(&clp->cl_lock);
+        stp->sc_type = NFS4_LAYOUT_STID;
+        list_add(&ls->ls_perclnt, &clp->cl_lo_states);
+        spin_unlock(&clp->cl_lock);
+        spin_lock(&fp->fi_lock);
+        list_add(&ls->ls_perfile, &fp->fi_lo_states);
+        spin_unlock(&fp->fi_lock);
+        trace_layoutstate_alloc(&ls->ls_stid.sc_stateid);
+        return ls;
+}
+/*
+ * Resolve @stateid to a layout stateid for the current filehandle.
+ *
+ * With @create set, an open, lock or delegation stateid is accepted as well
+ * and a new layout stateid of @layout_type is allocated from it.  An
+ * existing layout stateid must not carry a generation newer than the
+ * server's and must match @layout_type.  On success *@lsp is set and 0 is
+ * returned; otherwise an NFS error is returned and *@lsp is left untouched.
+ */
+__be32
+nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *cstate, stateid_t *stateid,
+                bool create, u32 layout_type, struct nfs4_layout_stateid **lsp)
+{
+        struct nfs4_layout_stateid *ls;
+        struct nfs4_stid *stid;
+        unsigned char typemask = NFS4_LAYOUT_STID;
+        __be32 status;
+        if (create)
+                typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID);
+        status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid,
+                        net_generic(SVC_NET(rqstp), nfsd_net_id));
+        if (status)
+                goto out;
+        /* the stateid must belong to the file named by the current fh */
+        if (!fh_match(&cstate->current_fh.fh_handle,
+                      &stid->sc_file->fi_fhandle)) {
+                status = nfserr_bad_stateid;
+                goto out_put_stid;
+        }
+        if (stid->sc_type != NFS4_LAYOUT_STID) {
+                /* the parent stateid is only needed to create the new one */
+                ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type);
+                nfs4_put_stid(stid);
+                status = nfserr_jukebox;
+                if (!ls)
+                        goto out;
+        } else {
+                ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
+                status = nfserr_bad_stateid;
+                if (stateid->si_generation > stid->sc_stateid.si_generation)
+                        goto out_put_stid;
+                if (layout_type != ls->ls_layout_type)
+                        goto out_put_stid;
+        }
+        *lsp = ls;
+        return 0;
+out_put_stid:
+        nfs4_put_stid(stid);
+out:
+        return status;
+}
+/*
+ * Mark @ls as recalled and, if it currently holds any layouts, queue a
+ * layout recall callback for it.  Does nothing if the stateid has already
+ * been recalled.
+ */
+static void
+nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
+{
+        spin_lock(&ls->ls_lock);
+        if (ls->ls_recalled)
+                goto out_unlock;
+        ls->ls_recalled = true;
+        atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
+        /* nothing outstanding, so there is nothing to call back about */
+        if (list_empty(&ls->ls_layouts))
+                goto out_unlock;
+        trace_layout_recall(&ls->ls_stid.sc_stateid);
+        /* extra stateid reference taken before the callback is queued */
+        atomic_inc(&ls->ls_stid.sc_count);
+        update_stateid(&ls->ls_stid.sc_stateid);
+        /* snapshot the stateid that the callback encoder will send */
+        memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+        nfsd4_run_cb(&ls->ls_recall);
+out_unlock:
+        spin_unlock(&ls->ls_lock);
+}
+/*
+ * Return the exclusive end offset of @seg, saturating at NFS4_MAX_UINT64
+ * when offset + length wraps around.
+ */
+static inline u64
+layout_end(struct nfsd4_layout_seg *seg)
+{
+        u64 end = seg->offset + seg->length;
+        if (end < seg->offset)
+                return NFS4_MAX_UINT64;
+        return end;
+}
+/*
+ * Set the length of @lo so that it ends at @end.  An end of NFS4_MAX_UINT64
+ * is stored as a length of NFS4_MAX_UINT64 instead of being computed.
+ */
+static void
+layout_update_len(struct nfsd4_layout_seg *lo, u64 end)
+{
+        lo->length = (end == NFS4_MAX_UINT64) ?
+                        NFS4_MAX_UINT64 : end - lo->offset;
+}
+/*
+ * Return true if the held layout @lo intersects the range @s and its iomode
+ * is selected by @s (IOMODE_ANY in @s selects every iomode).
+ */
+static bool
+layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s)
+{
+        struct nfsd4_layout_seg *held = &lo->lo_seg;
+        if (s->iomode != IOMODE_ANY && s->iomode != held->iomode)
+                return false;
+        /* two ranges intersect when each one ends after the other starts */
+        return layout_end(held) > s->offset &&
+               layout_end(s) > held->offset;
+}
+/*
+ * Try to fold the segment @new into the existing segment @lo.
+ *
+ * The two can be merged when they share an iomode and either overlap or
+ * touch.  On success @lo is grown to the union of both ranges and true is
+ * returned; otherwise @lo is left untouched and false is returned.
+ */
+static bool
+layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new)
+{
+        u64 end;
+        if (lo->iomode != new->iomode)
+                return false;
+        if (layout_end(new) < lo->offset)
+                return false;
+        if (layout_end(lo) < new->offset)
+                return false;
+        /*
+         * Work out the merged end before touching lo->offset:
+         * layout_end(lo) is derived from offset + length, so lowering the
+         * offset first would make the old end look smaller than it is and
+         * chop off the tail of @lo.
+         */
+        end = max(layout_end(lo), layout_end(new));
+        lo->offset = min(lo->offset, new->offset);
+        layout_update_len(lo, end);
+        return true;
+}
+/*
+ * Recall the layouts of every other layout stateid on the same file as @ls.
+ * Must be called with fp->fi_lock held.  Returns nfserr_recallconflict if
+ * the file has any other layout stateid, nfs_ok otherwise.
+ */
+static __be32
+nfsd4_recall_conflict(struct nfs4_layout_stateid *ls)
+{
+        struct nfs4_file *fp = ls->ls_stid.sc_file;
+        struct nfs4_layout_stateid *l, *n;
+        __be32 nfserr = nfs_ok;
+        assert_spin_locked(&fp->fi_lock);
+        list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) {
+                if (l != ls) {
+                        nfsd4_recall_file_layout(l);
+                        nfserr = nfserr_recallconflict;
+                }
+        }
+        return nfserr;
+}
+/*
+ * Record the segment granted by @lgp under the layout stateid @ls.
+ *
+ * The segment is merged into an existing layout where possible; otherwise a
+ * new nfs4_layout is allocated.  The allocation happens with both locks
+ * dropped, so the conflict check and the merge attempt are repeated once the
+ * locks have been retaken.  On success the stateid is bumped and copied into
+ * lgp->lg_sid.  Returns nfserr_recallconflict if another layout stateid
+ * exists on the file, or nfserr_jukebox if the allocation fails.
+ */
+__be32
+nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
+{
+        struct nfsd4_layout_seg *seg = &lgp->lg_seg;
+        struct nfs4_file *fp = ls->ls_stid.sc_file;
+        struct nfs4_layout *lp, *new = NULL;
+        __be32 nfserr;
+        spin_lock(&fp->fi_lock);
+        nfserr = nfsd4_recall_conflict(ls);
+        if (nfserr)
+                goto out;
+        spin_lock(&ls->ls_lock);
+        list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
+                if (layouts_try_merge(&lp->lo_seg, seg))
+                        goto done;
+        }
+        /* no merge possible: drop the locks to allocate with GFP_KERNEL */
+        spin_unlock(&ls->ls_lock);
+        spin_unlock(&fp->fi_lock);
+        new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
+        if (!new)
+                return nfserr_jukebox;
+        memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg));
+        new->lo_state = ls;
+        /* things may have changed while unlocked, so check everything again */
+        spin_lock(&fp->fi_lock);
+        nfserr = nfsd4_recall_conflict(ls);
+        if (nfserr)
+                goto out;
+        spin_lock(&ls->ls_lock);
+        list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
+                if (layouts_try_merge(&lp->lo_seg, seg))
+                        goto done;
+        }
+        /* each layout on the list pins the stateid; see nfsd4_free_layouts() */
+        atomic_inc(&ls->ls_stid.sc_count);
+        list_add_tail(&new->lo_perstate, &ls->ls_layouts);
+        /* now owned by the list; keep the cleanup below from freeing it */
+        new = NULL;
+done:
+        update_stateid(&ls->ls_stid.sc_stateid);
+        memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+        spin_unlock(&ls->ls_lock);
+out:
+        spin_unlock(&fp->fi_lock);
+        if (new)
+                kmem_cache_free(nfs4_layout_cache, new);
+        return nfserr;
+}
+/*
+ * Dispose of every layout collected on @reaplist: unlink it, drop the
+ * count it holds on its layout stateid and return it to the slab cache.
+ * The list is empty on return.
+ */
+static void
+nfsd4_free_layouts(struct list_head *reaplist)
+{
+        struct nfs4_layout *victim;
+        while (!list_empty(reaplist)) {
+                victim = list_first_entry(reaplist, struct nfs4_layout,
+                                          lo_perstate);
+                list_del(&victim->lo_perstate);
+                nfs4_put_stid(&victim->lo_state->ls_stid);
+                kmem_cache_free(nfs4_layout_cache, victim);
+        }
+}
+/*
+ * Apply the returned range @seg to the single layout @lp.
+ *
+ * The caller visible in this file only passes layouts that
+ * layouts_overlapping() accepted.  Possible outcomes:
+ *  - @seg covers the whole layout: @lp is moved onto @reaplist so that the
+ *    caller can free it later;
+ *  - @seg covers the head of the layout: the layout now starts where @seg
+ *    ends;
+ *  - @seg covers the tail of the layout: the layout now ends where @seg
+ *    starts;
+ *  - @seg lies strictly inside the layout: splitting is not supported and
+ *    the layout is kept unchanged.
+ */
+static void
+nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg,
+                struct list_head *reaplist)
+{
+        struct nfsd4_layout_seg *lo = &lp->lo_seg;
+        u64 end = layout_end(lo);
+        if (seg->offset <= lo->offset) {
+                if (layout_end(seg) >= end) {
+                        list_move_tail(&lp->lo_perstate, reaplist);
+                        return;
+                }
+                /*
+                 * Head returned: keep [layout_end(seg), end).  Moving 'end'
+                 * here instead would place it at or before lo->offset.
+                 */
+                lo->offset = layout_end(seg);
+        } else {
+                /* retain the whole layout segment on a split. */
+                if (layout_end(seg) < end) {
+                        dprintk("%s: split not supported\n", __func__);
+                        return;
+                }
+                /* Tail returned: keep [lo->offset, seg->offset). */
+                end = seg->offset;
+        }
+        layout_update_len(lo, end);
+}
+/*
+ * Handle a LAYOUTRETURN of type RETURN_FILE.
+ *
+ * Looks up the layout stateid named by lrp->lr_sid and applies the returned
+ * range lrp->lr_seg to every overlapping layout held under it.  On return
+ * lrp->lrs_present says whether the stateid still has layouts; if so and
+ * at least one layout matched, the bumped stateid is copied to lrp->lr_sid.
+ *
+ * Returns the lookup error, or nfs_ok.
+ */
+__be32
+nfsd4_return_file_layouts(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *cstate,
+                struct nfsd4_layoutreturn *lrp)
+{
+        struct nfs4_layout_stateid *ls;
+        struct nfs4_layout *lp, *n;
+        LIST_HEAD(reaplist);
+        __be32 nfserr;
+        int found = 0;
+        nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid,
+                                                false, lrp->lr_layout_type,
+                                                &ls);
+        if (nfserr) {
+                trace_layout_return_lookup_fail(&lrp->lr_sid);
+                return nfserr;
+        }
+        spin_lock(&ls->ls_lock);
+        /* _safe iteration: the helper may move lp onto reaplist. */
+        list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) {
+                if (layouts_overlapping(lp, &lrp->lr_seg)) {
+                        nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist);
+                        found++;
+                }
+        }
+        if (!list_empty(&ls->ls_layouts)) {
+                /* Layouts remain: only bump the stateid if something matched. */
+                if (found) {
+                        update_stateid(&ls->ls_stid.sc_stateid);
+                        memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
+                                sizeof(stateid_t));
+                }
+                lrp->lrs_present = 1;
+        } else {
+                /* Last layout gone: retire the stateid. */
+                trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
+                nfs4_unhash_stid(&ls->ls_stid);
+                lrp->lrs_present = 0;
+        }
+        spin_unlock(&ls->ls_lock);
+        /* Presumably drops the reference obtained by the lookup above. */
+        nfs4_put_stid(&ls->ls_stid);
+        /* Reaped layouts are freed only after ls_lock has been released. */
+        nfsd4_free_layouts(&reaplist);
+        return nfs_ok;
+}
+/*
+ * Handle a LAYOUTRETURN of type RETURN_FSID or RETURN_ALL.
+ *
+ * Walks all layout stateids of the calling client and reaps every layout
+ * whose iomode matches lrp->lr_seg.iomode (IOMODE_ANY matches everything).
+ * For RETURN_FSID, stateids whose file handle does not pass
+ * fh_fsid_match() against the current file handle are skipped.
+ *
+ * Always sets lrp->lrs_present to 0 and returns 0.
+ */
+__be32
+nfsd4_return_client_layouts(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *cstate,
+                struct nfsd4_layoutreturn *lrp)
+{
+        struct nfs4_layout_stateid *ls, *n;
+        struct nfs4_client *clp = cstate->clp;
+        struct nfs4_layout *lp, *t;
+        LIST_HEAD(reaplist);
+        lrp->lrs_present = 0;
+        /* Lock order: cl_lock, then each stateid's ls_lock. */
+        spin_lock(&clp->cl_lock);
+        list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
+                if (lrp->lr_return_type == RETURN_FSID &&
+                    !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle,
+                                   &cstate->current_fh.fh_handle))
+                        continue;
+                spin_lock(&ls->ls_lock);
+                list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) {
+                        if (lrp->lr_seg.iomode == IOMODE_ANY ||
+                            lrp->lr_seg.iomode == lp->lo_seg.iomode)
+                                list_move_tail(&lp->lo_perstate, &reaplist);
+                }
+                spin_unlock(&ls->ls_lock);
+        }
+        spin_unlock(&clp->cl_lock);
+        /* Free the collected layouts with no spinlock held. */
+        nfsd4_free_layouts(&reaplist);
+        return 0;
+}
+/*
+ * Move every layout held under @ls onto @reaplist, leaving ls->ls_layouts
+ * empty.  The splice is done under ls->ls_lock; freeing the layouts is left
+ * to the caller.
+ */
+static void
+nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls,
+                struct list_head *reaplist)
+{
+        spin_lock(&ls->ls_lock);
+        list_splice_init(&ls->ls_layouts, reaplist);
+        spin_unlock(&ls->ls_lock);
+}
+/*
+ * Drop every layout the client @clp holds, across all of its layout
+ * stateids.  Layouts are gathered under clp->cl_lock and freed after the
+ * lock has been released.
+ */
+void
+nfsd4_return_all_client_layouts(struct nfs4_client *clp)
+{
+        struct nfs4_layout_stateid *cur, *next;
+        LIST_HEAD(reaplist);
+        spin_lock(&clp->cl_lock);
+        list_for_each_entry_safe(cur, next, &clp->cl_lo_states, ls_perclnt) {
+                nfsd4_return_all_layouts(cur, &reaplist);
+        }
+        spin_unlock(&clp->cl_lock);
+        nfsd4_free_layouts(&reaplist);
+}
+/*
+ * Drop every layout that client @clp holds on file @fp.  Layout stateids
+ * of other clients on the same file are left alone.  Layouts are gathered
+ * under fp->fi_lock and freed after the lock has been released.
+ */
+void
+nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp)
+{
+        struct nfs4_layout_stateid *cur, *next;
+        LIST_HEAD(reaplist);
+        spin_lock(&fp->fi_lock);
+        list_for_each_entry_safe(cur, next, &fp->fi_lo_states, ls_perfile) {
+                if (cur->ls_stid.sc_client != clp)
+                        continue;
+                nfsd4_return_all_layouts(cur, &reaplist);
+        }
+        spin_unlock(&fp->fi_lock);
+        nfsd4_free_layouts(&reaplist);
+}
+/*
+ * A layout recall to the client owning @ls failed: log a warning and run
+ * the user-space helper /sbin/nfsd-recall-failed to fence the client.
+ *
+ * The helper receives the client address and the s_id of the superblock
+ * backing ls->ls_file as arguments, and is waited for (UMH_WAIT_PROC).
+ * A non-zero result from the helper is logged; nothing is returned.
+ */
+static void
+nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
+{
+        struct nfs4_client *clp = ls->ls_stid.sc_client;
+        char addr_str[INET6_ADDRSTRLEN];
+        static char *envp[] = {
+                "HOME=/",
+                "TERM=linux",
+                "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+                NULL
+        };
+        char *argv[8];
+        int error;
+        rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
+        /*
+         * This function used to call itself unconditionally at this point,
+         * which recursed until the stack overflowed; the call is removed.
+         * NOTE(review): a layout-recall-failure tracepoint was presumably
+         * intended here -- add it if one exists.
+         */
+        printk(KERN_WARNING
+                "nfsd: client %s failed to respond to layout recall. "
+                "  Fencing..\n", addr_str);
+        argv[0] = "/sbin/nfsd-recall-failed";
+        argv[1] = addr_str;
+        argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id;
+        argv[3] = NULL;
+        error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+        if (error) {
+                printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n",
+                        addr_str, error);
+        }
+}
+/*
+ * Completion handler for the layout recall callback embedded in a layout
+ * stateid (ls_recall).
+ *
+ * Return value by task->tk_status:
+ *   0                            -> 1
+ *   -NFS4ERR_NOMATCHING_LAYOUT   -> status cleared to 0, returns 1
+ *   -NFS4ERR_DELAY               -> task delayed by HZ/100, returns 0
+ *   anything else                -> client is fenced, returns -1
+ * NOTE(review): the meaning of 1 / 0 / -1 is defined by the callback core
+ * that invokes .done -- presumably done / retry / failed; confirm there.
+ */
+static int
+nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
+{
+        struct nfs4_layout_stateid *ls =
+                container_of(cb, struct nfs4_layout_stateid, ls_recall);
+        switch (task->tk_status) {
+        case 0:
+                return 1;
+        case -NFS4ERR_NOMATCHING_LAYOUT:
+                trace_layout_recall_done(&ls->ls_stid.sc_stateid);
+                task->tk_status = 0;
+                return 1;
+        case -NFS4ERR_DELAY:
+                /* Poll the client until it's done with the layout */
+                /* FIXME: cap number of retries.
+                 * The pNFS standard states that we need to only expire
+                 * the client after at least "lease time", e.g. lease-time * 2,
+                 * when failing to communicate a recall
+                 */
+                rpc_delay(task, HZ/100); /* 10 milliseconds */
+                return 0;
+        default:
+                /*
+                 * Unknown error or non-responding client, we'll need to fence.
+                 */
+                nfsd4_cb_layout_fail(ls);
+                return -1;
+        }
+}
+/*
+ * Release handler for the layout recall callback: drops every layout still
+ * held under the recalled stateid, frees them, and puts one reference on
+ * the stateid.
+ * NOTE(review): the put presumably balances a reference taken when the
+ * recall was queued -- that code is not visible here.
+ */
+static void
+nfsd4_cb_layout_release(struct nfsd4_callback *cb)
+{
+        struct nfs4_layout_stateid *ls =
+                container_of(cb, struct nfs4_layout_stateid, ls_recall);
+        LIST_HEAD(reaplist);
+        trace_layout_recall_release(&ls->ls_stid.sc_stateid);
+        nfsd4_return_all_layouts(ls, &reaplist);
+        nfsd4_free_layouts(&reaplist);
+        nfs4_put_stid(&ls->ls_stid);
+}
+/* Callback operations used for layout recalls (see ls_recall users). */
+static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+        .done           = nfsd4_cb_layout_done,
+        .release        = nfsd4_cb_layout_release,
+};
+/*
+ * lm_break handler for layout leases: disables the lease break timeout and
+ * starts a recall of the layout stateid stored in fl->fl_owner.
+ * Always returns false.
+ */
+static bool
+nfsd4_layout_lm_break(struct file_lock *fl)
+{
+        /*
+         * We don't want the locks code to timeout the lease for us;
+         * we'll remove it ourself if a layout isn't returned
+         * in time:
+         */
+        fl->fl_break_time = 0;
+        nfsd4_recall_file_layout(fl->fl_owner);
+        return false;
+}
+/*
+ * lm_change handler for layout leases.  Only requests carrying F_UNLCK are
+ * expected; anything else trips the BUG_ON.  The change itself is
+ * delegated to lease_modify(), whose result is returned.
+ */
+static int
+nfsd4_layout_lm_change(struct file_lock *onlist, int arg,
+                struct list_head *dispose)
+{
+        BUG_ON(!(arg & F_UNLCK));
+        return lease_modify(onlist, arg, dispose);
+}
+/* Lock manager operations installed on leases that back layouts. */
+static const struct lock_manager_operations nfsd4_layouts_lm_ops = {
+        .lm_break       = nfsd4_layout_lm_break,
+        .lm_change      = nfsd4_layout_lm_change,
+};
+/*
+ * Set up pNFS server state: initialise the device ID hash buckets and
+ * create the slab caches for layouts and layout stateids.
+ *
+ * Returns 0 on success or -ENOMEM if either cache cannot be created; in
+ * the latter case no cache is left allocated.
+ */
+int
+nfsd4_init_pnfs(void)
+{
+        int bucket;
+        for (bucket = 0; bucket < DEVID_HASH_SIZE; bucket++)
+                INIT_LIST_HEAD(&nfsd_devid_hash[bucket]);
+        nfs4_layout_cache = kmem_cache_create("nfs4_layout",
+                        sizeof(struct nfs4_layout), 0, 0, NULL);
+        if (!nfs4_layout_cache)
+                goto out_nomem;
+        nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid",
+                        sizeof(struct nfs4_layout_stateid), 0, 0, NULL);
+        if (!nfs4_layout_stateid_cache)
+                goto out_destroy_layout_cache;
+        return 0;
+out_destroy_layout_cache:
+        kmem_cache_destroy(nfs4_layout_cache);
+out_nomem:
+        return -ENOMEM;
+}
+/*
+ * Tear down pNFS server state: destroy both slab caches and free every
+ * device ID map entry still linked into the hash table.
+ */
+void
+nfsd4_exit_pnfs(void)
+{
+        int i;
+        kmem_cache_destroy(nfs4_layout_cache);
+        kmem_cache_destroy(nfs4_layout_stateid_cache);
+        for (i = 0; i < DEVID_HASH_SIZE; i++) {
+                struct nfsd4_deviceid_map *map, *n;
+                /* Entries are freed without unlinking; the table is not reused here. */
+                list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash)
+                        kfree(map);
+        }
+}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index ac71d13c69ef..d30bea8d0277 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -43,6 +43,8 @@
 #include "current_stateid.h"
 #include "netns.h"
 #include "acl.h"
+#include "pnfs.h"
+#include "trace.h"
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 #include <linux/security.h>
@@ -1178,6 +1180,259 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        return status == nfserr_same ? nfs_ok : status;
 }
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * Return the layout operations for @layout_type if the export @exp has
+ * pNFS enabled with exactly that layout type, NULL otherwise.
+ */
+static const struct nfsd4_layout_ops *
+nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
+{
+        if (!exp->ex_layout_type) {
+                dprintk("%s: export does not support pNFS\n", __func__);
+        } else if (exp->ex_layout_type != layout_type) {
+                dprintk("%s: layout type %d not supported\n",
+                        __func__, layout_type);
+        } else {
+                return nfsd4_layout_ops[layout_type];
+        }
+        return NULL;
+}
+/*
+ * GETDEVICEINFO: map the device ID in @gdp back to an export and let the
+ * layout driver of that export fill in the device information.
+ *
+ * Returns nfserr_noent if the device ID has no mapping or the export cannot
+ * be found, nfserr_layoutunavailable if the export does not offer the
+ * requested layout type, and otherwise the driver's result (nfs_ok without
+ * calling the driver when gd_maxcount is 0).  gd_notify_types is masked
+ * down to what the driver supports.
+ */
+static __be32
+nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *cstate,
+                struct nfsd4_getdeviceinfo *gdp)
+{
+        const struct nfsd4_layout_ops *ops;
+        struct nfsd4_deviceid_map *map;
+        struct svc_export *exp;
+        __be32 nfserr;
+        dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n",
+               __func__,
+               gdp->gd_layout_type,
+               gdp->gd_devid.fsid_idx, gdp->gd_devid.generation,
+               gdp->gd_maxcount);
+        map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx);
+        if (!map) {
+                dprintk("%s: couldn't find device ID to export mapping!\n",
+                        __func__);
+                return nfserr_noent;
+        }
+        exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid);
+        if (IS_ERR(exp)) {
+                dprintk("%s: could not find device id\n", __func__);
+                return nfserr_noent;
+        }
+        nfserr = nfserr_layoutunavailable;
+        ops = nfsd4_layout_verify(exp, gdp->gd_layout_type);
+        if (!ops)
+                goto out;
+        nfserr = nfs_ok;
+        if (gdp->gd_maxcount != 0)
+                nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
+        gdp->gd_notify_types &= ops->notify_types;
+out:
+        /*
+         * From here on we own an export reference, so every exit must drop
+         * it -- including the "layout type not supported" path.
+         */
+        exp_put(exp);
+        return nfserr;
+}
+/*
+ * LAYOUTGET: validate the request, obtain a layout from the export's layout
+ * driver and record it on the layout stateid.
+ *
+ * Errors visible here: nfserr_badiomode for an iomode other than READ/RW,
+ * the fh_verify() result, nfserr_layoutunavailable if the export does not
+ * offer the layout type, nfserr_inval for bad length/offset combinations or
+ * a zero length, the stateid lookup error, nfserr_recallconflict while
+ * recalls are outstanding on the file, and whatever the driver or
+ * nfsd4_insert_layout() return.
+ */
+static __be32
+nfsd4_layoutget(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *cstate,
+                struct nfsd4_layoutget *lgp)
+{
+        struct svc_fh *current_fh = &cstate->current_fh;
+        const struct nfsd4_layout_ops *ops;
+        struct nfs4_layout_stateid *ls;
+        __be32 nfserr;
+        int accmode;
+        /* Translate the requested iomode into the access to check for. */
+        switch (lgp->lg_seg.iomode) {
+        case IOMODE_READ:
+                accmode = NFSD_MAY_READ;
+                break;
+        case IOMODE_RW:
+                accmode = NFSD_MAY_READ | NFSD_MAY_WRITE;
+                break;
+        default:
+                dprintk("%s: invalid iomode %d\n",
+                        __func__, lgp->lg_seg.iomode);
+                nfserr = nfserr_badiomode;
+                goto out;
+        }
+        nfserr = fh_verify(rqstp, current_fh, 0, accmode);
+        if (nfserr)
+                goto out;
+        nfserr = nfserr_layoutunavailable;
+        ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type);
+        if (!ops)
+                goto out;
+        /*
+         * Verify minlength and range as per RFC5661:
+         *  o  If loga_length is less than loga_minlength,
+         *     the metadata server MUST return NFS4ERR_INVAL.
+         *  o  If the sum of loga_offset and loga_minlength exceeds
+         *     NFS4_UINT64_MAX, and loga_minlength is not
+         *     NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result.
+         *  o  If the sum of loga_offset and loga_length exceeds
+         *     NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX,
+         *     the error NFS4ERR_INVAL MUST result.
+         */
+        nfserr = nfserr_inval;
+        if (lgp->lg_seg.length < lgp->lg_minlength ||
+            (lgp->lg_minlength != NFS4_MAX_UINT64 &&
+             lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) ||
+            (lgp->lg_seg.length != NFS4_MAX_UINT64 &&
+             lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset))
+                goto out;
+        /* An empty range is rejected with the same nfserr_inval. */
+        if (lgp->lg_seg.length == 0)
+                goto out;
+        nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid,
+                                                true, lgp->lg_layout_type, &ls);
+        if (nfserr) {
+                trace_layout_get_lookup_fail(&lgp->lg_sid);
+                goto out;
+        }
+        /* Refuse new layouts while recalls are pending on this file. */
+        nfserr = nfserr_recallconflict;
+        if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls))
+                goto out_put_stid;
+        nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode,
+                                     current_fh, lgp);
+        if (nfserr)
+                goto out_put_stid;
+        nfserr = nfsd4_insert_layout(lgp, ls);
+out_put_stid:
+        nfs4_put_stid(&ls->ls_stid);
+out:
+        return nfserr;
+}
+/*
+ * LAYOUTCOMMIT: validate the committed range against the layout segment
+ * and the file size, then let the layout driver commit it.
+ *
+ * lc_size_chg / lc_newsize in @lcp tell the reply encoder whether this
+ * commit grows the file.  They are computed from the inode size as it was
+ * *before* the driver runs: the driver may itself extend the file, after
+ * which a comparison against i_size could no longer detect the change.
+ *
+ * Errors visible here: the fh_verify() result, nfserr_layoutunavailable,
+ * nfserr_inval for a last-write offset outside the segment or beyond EOF
+ * without lc_newoffset, the stateid lookup error (nfserr_bad_stateid is
+ * mapped to nfserr_badlayout), or the driver's result.
+ */
+static __be32
+nfsd4_layoutcommit(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *cstate,
+                struct nfsd4_layoutcommit *lcp)
+{
+        const struct nfsd4_layout_seg *seg = &lcp->lc_seg;
+        struct svc_fh *current_fh = &cstate->current_fh;
+        const struct nfsd4_layout_ops *ops;
+        loff_t new_size = lcp->lc_last_wr + 1;
+        struct inode *inode;
+        struct nfs4_layout_stateid *ls;
+        __be32 nfserr;
+        nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE);
+        if (nfserr)
+                goto out;
+        nfserr = nfserr_layoutunavailable;
+        ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type);
+        if (!ops)
+                goto out;
+        inode = current_fh->fh_dentry->d_inode;
+        nfserr = nfserr_inval;
+        if (new_size <= seg->offset) {
+                dprintk("pnfsd: last write before layout segment\n");
+                goto out;
+        }
+        if (new_size > seg->offset + seg->length) {
+                dprintk("pnfsd: last write beyond layout segment\n");
+                goto out;
+        }
+        if (!lcp->lc_newoffset && new_size > i_size_read(inode)) {
+                dprintk("pnfsd: layoutcommit beyond EOF\n");
+                goto out;
+        }
+        nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid,
+                                                false, lcp->lc_layout_type,
+                                                &ls);
+        if (nfserr) {
+                trace_layout_commit_lookup_fail(&lcp->lc_sid);
+                /* fixup error code as per RFC5661 */
+                if (nfserr == nfserr_bad_stateid)
+                        nfserr = nfserr_badlayout;
+                goto out;
+        }
+        /* Sample the size change before the driver can update i_size. */
+        if (new_size > i_size_read(inode)) {
+                lcp->lc_size_chg = 1;
+                lcp->lc_newsize = new_size;
+        } else {
+                lcp->lc_size_chg = 0;
+        }
+        nfserr = ops->proc_layoutcommit(inode, lcp);
+        nfs4_put_stid(&ls->ls_stid);
+out:
+        return nfserr;
+}
+/*
+ * LAYOUTRETURN: validate the request and dispatch on the return type.
+ *
+ * RETURN_FILE is handled by nfsd4_return_file_layouts(); RETURN_FSID and
+ * RETURN_ALL by nfsd4_return_client_layouts().  An unknown iomode or
+ * return type yields nfserr_inval; an export without the requested layout
+ * type yields nfserr_layoutunavailable.
+ */
+static __be32
+nfsd4_layoutreturn(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *cstate,
+                struct nfsd4_layoutreturn *lrp)
+{
+        struct svc_fh *current_fh = &cstate->current_fh;
+        __be32 nfserr;
+        nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
+        if (nfserr)
+                goto out;
+        nfserr = nfserr_layoutunavailable;
+        if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type))
+                goto out;
+        /* Unlike LAYOUTGET, IOMODE_ANY is acceptable here. */
+        switch (lrp->lr_seg.iomode) {
+        case IOMODE_READ:
+        case IOMODE_RW:
+        case IOMODE_ANY:
+                break;
+        default:
+                dprintk("%s: invalid iomode %d\n", __func__,
+                        lrp->lr_seg.iomode);
+                nfserr = nfserr_inval;
+                goto out;
+        }
+        switch (lrp->lr_return_type) {
+        case RETURN_FILE:
+                nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp);
+                break;
+        case RETURN_FSID:
+        case RETURN_ALL:
+                nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp);
+                break;
+        default:
+                dprintk("%s: invalid return_type %d\n", __func__,
+                        lrp->lr_return_type);
+                nfserr = nfserr_inval;
+                break;
+        }
+out:
+        return nfserr;
+}
+#endif /* CONFIG_NFSD_PNFS */
 /*
 * NULL call.
 */
@@ -1679,6 +1934,36 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd
                op_encode_channel_attrs_maxsz) * sizeof(__be32);
 }
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * At this stage we don't really know what layout driver will handle the request,
+ * so we need to define an arbitrary upper bound here.
+ */
+#define MAX_LAYOUT_SIZE         128
+/*
+ * Reply size estimate for LAYOUTGET: the listed items are summed and the
+ * sum is multiplied by sizeof(__be32), so the result is in bytes.
+ * NOTE(review): MAX_LAYOUT_SIZE is added inside that sum and is therefore
+ * scaled too -- presumably it is meant as a count of 32-bit words.
+ */
+static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+        return (op_encode_hdr_size +
+                1 /* logr_return_on_close */ +
+                op_encode_stateid_maxsz +
+                1 /* nr of layouts */ +
+                MAX_LAYOUT_SIZE) * sizeof(__be32);
+}
+/*
+ * Reply size estimate for LAYOUTCOMMIT, in bytes: operation header plus
+ * the new-size flag and the 64-bit size, times sizeof(__be32).
+ */
+static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+        return (op_encode_hdr_size +
+                1 /* locr_newsize */ +
+                2 /* ns_size */) * sizeof(__be32);
+}
+/*
+ * Reply size estimate for LAYOUTRETURN, in bytes: operation header plus
+ * the stateid-present flag and a stateid, times sizeof(__be32).
+ */
+static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+        return (op_encode_hdr_size +
+                1 /* lrs_stateid */ +
+                op_encode_stateid_maxsz) * sizeof(__be32);
+}
+#endif /* CONFIG_NFSD_PNFS */
 static struct nfsd4_operation nfsd4_ops[] = {
        [OP_ACCESS] = {
                .op_func = (nfsd4op_func)nfsd4_access,
@@ -1966,6 +2251,31 @@ static struct nfsd4_operation nfsd4_ops[] = {
                .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
                .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
        },
+#ifdef CONFIG_NFSD_PNFS
+        [OP_GETDEVICEINFO] = {
+                .op_func = (nfsd4op_func)nfsd4_getdeviceinfo,
+                .op_flags = ALLOWED_WITHOUT_FH,
+                .op_name = "OP_GETDEVICEINFO",
+        },
+        [OP_LAYOUTGET] = {
+                .op_func = (nfsd4op_func)nfsd4_layoutget,
+                .op_flags = OP_MODIFIES_SOMETHING,
+                .op_name = "OP_LAYOUTGET",
+                .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutget_rsize,
+        },
+        [OP_LAYOUTCOMMIT] = {
+                .op_func = (nfsd4op_func)nfsd4_layoutcommit,
+                .op_flags = OP_MODIFIES_SOMETHING,
+                .op_name = "OP_LAYOUTCOMMIT",
+                .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutcommit_rsize,
+        },
+        [OP_LAYOUTRETURN] = {
+                .op_func = (nfsd4op_func)nfsd4_layoutreturn,
+                .op_flags = OP_MODIFIES_SOMETHING,
+                .op_name = "OP_LAYOUTRETURN",
+                .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutreturn_rsize,
+        },
+#endif /* CONFIG_NFSD_PNFS */
        /* NFSv4.2 operations */
        [OP_ALLOCATE] = {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c06a1ba80d73..f6b2a09f793f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -48,6 +48,7 @@
 #include "current_stateid.h"
 #include "netns.h"
+#include "pnfs.h"
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
@@ -150,16 +151,6 @@ renew_client_locked(struct nfs4_client *clp)
        clp->cl_time = get_seconds();
 }
-static inline void
-renew_client(struct nfs4_client *clp)
-{
-        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-        spin_lock(&nn->client_lock);
-        renew_client_locked(clp);
-        spin_unlock(&nn->client_lock);
-}
 static void put_client_renew_locked(struct nfs4_client *clp)
 {
        struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
@@ -282,7 +273,7 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
        kmem_cache_free(file_slab, fp);
 }
-static inline void
+void
 put_nfs4_file(struct nfs4_file *fi)
 {
        might_lock(&state_lock);
@@ -295,12 +286,6 @@ put_nfs4_file(struct nfs4_file *fi)
        }
 }
-static inline void
-get_nfs4_file(struct nfs4_file *fi)
-{
-        atomic_inc(&fi->fi_ref);
-}
 static struct file *
 __nfs4_get_fd(struct nfs4_file *f, int oflag)
 {
@@ -358,7 +343,7 @@ find_readable_file(struct nfs4_file *f)
        return ret;
 }
-static struct file *
+struct file *
 find_any_file(struct nfs4_file *f)
 {
        struct file *ret;
@@ -408,14 +393,6 @@ static unsigned int file_hashval(struct knfsd_fh *fh)
        return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1);
 }
-static bool nfsd_fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
-{
-        return fh1->fh_size == fh2->fh_size &&
-                !memcmp(fh1->fh_base.fh_pad,
-                                fh2->fh_base.fh_pad,
-                                fh1->fh_size);
-}
 static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
 static void
@@ -494,7 +471,7 @@ static void nfs4_file_put_access(struct nfs4_file *fp, u32 access)
                __nfs4_file_put_access(fp, O_RDONLY);
 }
-static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
+struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
                                         struct kmem_cache *slab)
 {
        struct nfs4_stid *stid;
@@ -688,17 +665,17 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
        struct file *filp = NULL;
        spin_lock(&fp->fi_lock);
-        if (fp->fi_deleg_file && atomic_dec_and_test(&fp->fi_delegees))
+        if (fp->fi_deleg_file && --fp->fi_delegees == 0)
                swap(filp, fp->fi_deleg_file);
        spin_unlock(&fp->fi_lock);
        if (filp) {
-                vfs_setlease(filp, F_UNLCK, NULL, NULL);
+                vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp);
                fput(filp);
        }
 }
-static void unhash_stid(struct nfs4_stid *s)
+void nfs4_unhash_stid(struct nfs4_stid *s)
 {
        s->sc_type = 0;
 }
@@ -1006,7 +983,7 @@ static void unhash_lock_stateid(struct nfs4_ol_stateid *stp)
        list_del_init(&stp->st_locks);
        unhash_ol_stateid(stp);
-        unhash_stid(&stp->st_stid);
+        nfs4_unhash_stid(&stp->st_stid);
 }
 static void release_lock_stateid(struct nfs4_ol_stateid *stp)
@@ -1518,7 +1495,12 @@ unhash_session(struct nfsd4_session *ses)
 static int
 STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
 {
-        if (clid->cl_boot == nn->boot_time)
+        /*
+         * We're assuming the clid was not given out from a boot
+         * precisely 2^32 (about 136 years) before this one.  That seems
+         * a safe assumption:
+         */
+        if (clid->cl_boot == (u32)nn->boot_time)
                return 0;
        dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
                clid->cl_boot, clid->cl_id, nn->boot_time);
@@ -1558,6 +1540,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
        INIT_LIST_HEAD(&clp->cl_lru);
        INIT_LIST_HEAD(&clp->cl_callbacks);
        INIT_LIST_HEAD(&clp->cl_revoked);
+#ifdef CONFIG_NFSD_PNFS
+        INIT_LIST_HEAD(&clp->cl_lo_states);
+#endif
        spin_lock_init(&clp->cl_lock);
        rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
        return clp;
@@ -1662,6 +1647,7 @@ __destroy_client(struct nfs4_client *clp)
                nfs4_get_stateowner(&oo->oo_owner);
                release_openowner(oo);
        }
+        nfsd4_return_all_client_layouts(clp);
        nfsd4_shutdown_callback(clp);
        if (clp->cl_cb_conn.cb_xprt)
                svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -2145,8 +2131,11 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
 static void
 nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
 {
-        /* pNFS is not supported */
+#ifdef CONFIG_NFSD_PNFS
+        new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS;
+#else
        new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
+#endif
        /* Referrals are supported, Migration is not. */
        new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
@@ -3074,6 +3063,10 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
        fp->fi_share_deny = 0;
        memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
        memset(fp->fi_access, 0, sizeof(fp->fi_access));
+#ifdef CONFIG_NFSD_PNFS
+        INIT_LIST_HEAD(&fp->fi_lo_states);
+        atomic_set(&fp->fi_lo_recalls, 0);
+#endif
        hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
 }
@@ -3300,7 +3293,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
        struct nfs4_file *fp;
        hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) {
-                if (nfsd_fh_match(&fp->fi_fhandle, fh)) {
+                if (fh_match(&fp->fi_fhandle, fh)) {
                        if (atomic_inc_not_zero(&fp->fi_ref))
                                return fp;
                }
@@ -3308,7 +3301,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
        return NULL;
 }
-static struct nfs4_file *
+struct nfs4_file *
 find_file(struct knfsd_fh *fh)
 {
        struct nfs4_file *fp;
@@ -3477,7 +3470,8 @@ nfsd_break_deleg_cb(struct file_lock *fl)
 }
 static int
-nfsd_change_deleg_cb(struct file_lock **onlist, int arg, struct list_head *dispose)
+nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
+                     struct list_head *dispose)
 {
        if (arg & F_UNLCK)
                return lease_modify(onlist, arg, dispose);
@@ -3855,12 +3849,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
        /* Race breaker */
        if (fp->fi_deleg_file) {
                status = 0;
-                atomic_inc(&fp->fi_delegees);
+                ++fp->fi_delegees;
                hash_delegation_locked(dp, fp);
                goto out_unlock;
        }
        fp->fi_deleg_file = filp;
-        atomic_set(&fp->fi_delegees, 1);
+        fp->fi_delegees = 1;
        hash_delegation_locked(dp, fp);
        spin_unlock(&fp->fi_lock);
        spin_unlock(&state_lock);
@@ -3901,7 +3895,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
                status = -EAGAIN;
                goto out_unlock;
        }
-        atomic_inc(&fp->fi_delegees);
+        ++fp->fi_delegees;
        hash_delegation_locked(dp, fp);
        status = 0;
 out_unlock:
@@ -4294,7 +4288,7 @@ laundromat_main(struct work_struct *laundry)
 static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
 {
-        if (!nfsd_fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
+        if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
                return nfserr_bad_stateid;
        return nfs_ok;
 }
@@ -4445,7 +4439,7 @@ out_unlock:
        return status;
 }
-static __be32
+__be32
 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
                     stateid_t *stateid, unsigned char typemask,
                     struct nfs4_stid **s, struct nfsd_net *nn)
@@ -4859,6 +4853,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        update_stateid(&stp->st_stid.sc_stateid);
        memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+        nfsd4_return_all_file_layouts(stp->st_stateowner->so_client,
+                                      stp->st_stid.sc_file);
        nfsd4_close_open_stateid(stp);
        /* put reference from nfs4_preprocess_seqid_op */
@@ -5556,10 +5553,11 @@ out_nfserr:
 static bool
 check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
 {
-        struct file_lock **flpp;
+        struct file_lock *fl;
        int status = false;
        struct file *filp = find_any_file(fp);
        struct inode *inode;
+        struct file_lock_context *flctx;
        if (!filp) {
                /* Any valid lock stateid should have some sort of access */
@@ -5568,15 +5566,18 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
        }
        inode = file_inode(filp);
+        flctx = inode->i_flctx;
-        spin_lock(&inode->i_lock);
+        if (flctx && !list_empty_careful(&flctx->flc_posix)) {
-        for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
+                spin_lock(&flctx->flc_lock);
-                if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
+                list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
-                        status = true;
+                        if (fl->fl_owner == (fl_owner_t)lowner) {
-                        break;
+                                status = true;
+                                break;
+                        }
                }
+                spin_unlock(&flctx->flc_lock);
        }
-        spin_unlock(&inode->i_lock);
        fput(filp);
        return status;
 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 15f7b73e0c0f..df5e66caf100 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -47,6 +47,7 @@
 #include "state.h"
 #include "cache.h"
 #include "netns.h"
+#include "pnfs.h"
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 #include <linux/security.h>
@@ -234,6 +235,26 @@ static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
        return ret;
 }
+/*
+ * Decode a 12-byte time value: a 64-bit seconds count followed by a
+ * 32-bit nanoseconds count.  The seconds value is stored into tv_sec
+ * exactly as decoded (no range check is applied to it here); a
+ * nanoseconds value of 1000000000 or more is rejected with nfserr_inval.
+ */
+static __be32
+nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec *tv)
+{
+        DECODE_HEAD;
+        u64 sec;
+        READ_BUF(12);
+        p = xdr_decode_hyper(p, &sec);
+        tv->tv_sec = sec;
+        tv->tv_nsec = be32_to_cpup(p++);
+        if (tv->tv_nsec >= (u32)1000000000)
+                return nfserr_inval;
+        DECODE_TAIL;
+}
 static __be32
 nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
 {
@@ -267,7 +288,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
 {
        int expected_len, len = 0;
        u32 dummy32;
-        u64 sec;
        char *buf;
        DECODE_HEAD;
@@ -358,15 +378,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
                dummy32 = be32_to_cpup(p++);
                switch (dummy32) {
                case NFS4_SET_TO_CLIENT_TIME:
-                        /* We require the high 32 bits of 'seconds' to be 0, and we ignore
-                           all 32 bits of 'nseconds'. */
-                        READ_BUF(12);
                        len += 12;
-                        p = xdr_decode_hyper(p, &sec);
+                        status = nfsd4_decode_time(argp, &iattr->ia_atime);
-                        iattr->ia_atime.tv_sec = (time_t)sec;
+                        if (status)
-                        iattr->ia_atime.tv_nsec = be32_to_cpup(p++);
+                                return status;
-                        if (iattr->ia_atime.tv_nsec >= (u32)1000000000)
-                                return nfserr_inval;
                        iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
                        break;
                case NFS4_SET_TO_SERVER_TIME:
@@ -382,15 +397,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
                dummy32 = be32_to_cpup(p++);
                switch (dummy32) {
                case NFS4_SET_TO_CLIENT_TIME:
-                        /* We require the high 32 bits of 'seconds' to be 0, and we ignore
-                           all 32 bits of 'nseconds'. */
-                        READ_BUF(12);
                        len += 12;
-                        p = xdr_decode_hyper(p, &sec);
+                        status = nfsd4_decode_time(argp, &iattr->ia_mtime);
-                        iattr->ia_mtime.tv_sec = sec;
+                        if (status)
-                        iattr->ia_mtime.tv_nsec = be32_to_cpup(p++);
+                                return status;
-                        if (iattr->ia_mtime.tv_nsec >= (u32)1000000000)
-                                return nfserr_inval;
                        iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET);
                        break;
                case NFS4_SET_TO_SERVER_TIME:
@@ -1513,6 +1523,127 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
        DECODE_TAIL;
 }
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * Decode the GETDEVICEINFO arguments: the device id (copied as raw
+ * bytes), the layout type, the maximum reply size and the notification
+ * bitmap.
+ *
+ * Only the first bitmap word is kept (in gd_notify_types); every further
+ * word must be zero, otherwise the request fails with nfserr_inval.
+ */
+static __be32
+nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
+                struct nfsd4_getdeviceinfo *gdev)
+{
+        DECODE_HEAD;
+        u32 num, i;
+        READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4);
+        COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid));
+        gdev->gd_layout_type = be32_to_cpup(p++);
+        gdev->gd_maxcount = be32_to_cpup(p++);
+        num = be32_to_cpup(p++);
+        if (num) {
+                /*
+                 * 'num' is taken from the request unchecked.  Refuse any
+                 * count for which the 32-bit product 4 * num would wrap:
+                 * a wrapped (small) length could satisfy READ_BUF while
+                 * the loop below still walks 'num' words.
+                 */
+                if (num > (u32)~0U / 4) {
+                        status = nfserr_inval;
+                        goto out;
+                }
+                READ_BUF(4 * num);
+                gdev->gd_notify_types = be32_to_cpup(p++);
+                for (i = 1; i < num; i++) {
+                        if (be32_to_cpup(p++)) {
+                                status = nfserr_inval;
+                                goto out;
+                        }
+                }
+        }
+        DECODE_TAIL;
+}
+/*
+ * Decode the LAYOUTGET arguments.  The first 36 bytes hold the signal
+ * flag, layout type, iomode, offset, length and minimum length; they are
+ * followed by the stateid and finally the maximum reply size.
+ */
+static __be32
+nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
+                struct nfsd4_layoutget *lgp)
+{
+        DECODE_HEAD;
+        READ_BUF(36);
+        lgp->lg_signal = be32_to_cpup(p++);
+        lgp->lg_layout_type = be32_to_cpup(p++);
+        lgp->lg_seg.iomode = be32_to_cpup(p++);
+        p = xdr_decode_hyper(p, &lgp->lg_seg.offset);
+        p = xdr_decode_hyper(p, &lgp->lg_seg.length);
+        p = xdr_decode_hyper(p, &lgp->lg_minlength);
+        /* a stateid that fails to decode must fail the whole operation */
+        status = nfsd4_decode_stateid(argp, &lgp->lg_sid);
+        if (status)
+                return status;
+        READ_BUF(4);
+        lgp->lg_maxcount = be32_to_cpup(p++);
+        DECODE_TAIL;
+}
+/*
+ * Decode the LAYOUTCOMMIT arguments: segment offset and length, the
+ * reclaim flag, the stateid, the optional last-write offset, the
+ * optional modification time, the layout type and the opaque layout
+ * update.
+ */
+static __be32
+nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
+                struct nfsd4_layoutcommit *lcp)
+{
+        DECODE_HEAD;
+        u32 timechange;
+        READ_BUF(20);
+        p = xdr_decode_hyper(p, &lcp->lc_seg.offset);
+        p = xdr_decode_hyper(p, &lcp->lc_seg.length);
+        lcp->lc_reclaim = be32_to_cpup(p++);
+        /* a stateid that fails to decode must fail the whole operation */
+        status = nfsd4_decode_stateid(argp, &lcp->lc_sid);
+        if (status)
+                return status;
+        READ_BUF(4);
+        lcp->lc_newoffset = be32_to_cpup(p++);
+        if (lcp->lc_newoffset) {
+                READ_BUF(8);
+                p = xdr_decode_hyper(p, &lcp->lc_last_wr);
+        } else
+                lcp->lc_last_wr = 0;
+        READ_BUF(4);
+        timechange = be32_to_cpup(p++);
+        if (timechange) {
+                status = nfsd4_decode_time(argp, &lcp->lc_mtime);
+                if (status)
+                        return status;
+        } else {
+                /* no time supplied: flag that through tv_nsec only */
+                lcp->lc_mtime.tv_nsec = UTIME_NOW;
+        }
+        READ_BUF(8);
+        lcp->lc_layout_type = be32_to_cpup(p++);
+        /*
+         * Save the layout update in XDR format and let the layout driver deal
+         * with it later.
+         */
+        lcp->lc_up_len = be32_to_cpup(p++);
+        if (lcp->lc_up_len > 0) {
+                READ_BUF(lcp->lc_up_len);
+                READMEM(lcp->lc_up_layout, lcp->lc_up_len);
+        }
+        DECODE_TAIL;
+}
+/*
+ * Decode the LAYOUTRETURN arguments.  The reclaim flag, layout type,
+ * iomode and return type are always present.  For RETURN_FILE they are
+ * followed by a byte range, a stateid and an opaque body; for any other
+ * return type the range is set to cover everything
+ * (offset 0, length NFS4_MAX_UINT64).
+ */
+static __be32
+nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
+                struct nfsd4_layoutreturn *lrp)
+{
+        DECODE_HEAD;
+        READ_BUF(16);
+        lrp->lr_reclaim = be32_to_cpup(p++);
+        lrp->lr_layout_type = be32_to_cpup(p++);
+        lrp->lr_seg.iomode = be32_to_cpup(p++);
+        lrp->lr_return_type = be32_to_cpup(p++);
+        if (lrp->lr_return_type == RETURN_FILE) {
+                READ_BUF(16);
+                p = xdr_decode_hyper(p, &lrp->lr_seg.offset);
+                p = xdr_decode_hyper(p, &lrp->lr_seg.length);
+                /* a stateid that fails to decode must fail the operation */
+                status = nfsd4_decode_stateid(argp, &lrp->lr_sid);
+                if (status)
+                        return status;
+                READ_BUF(4);
+                lrp->lrf_body_len = be32_to_cpup(p++);
+                if (lrp->lrf_body_len > 0) {
+                        READ_BUF(lrp->lrf_body_len);
+                        READMEM(lrp->lrf_body, lrp->lrf_body_len);
+                }
+        } else {
+                lrp->lr_seg.offset = 0;
+                lrp->lr_seg.length = NFS4_MAX_UINT64;
+        }
+        DECODE_TAIL;
+}
+#endif /* CONFIG_NFSD_PNFS */
 static __be32
 nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
                       struct nfsd4_fallocate *fallocate)
@@ -1607,11 +1738,19 @@ static nfsd4_dec nfsd4_dec_ops[] = {
        [OP_DESTROY_SESSION]    = (nfsd4_dec)nfsd4_decode_destroy_session,
        [OP_FREE_STATEID]       = (nfsd4_dec)nfsd4_decode_free_stateid,
        [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
+#ifdef CONFIG_NFSD_PNFS
+        [OP_GETDEVICEINFO]      = (nfsd4_dec)nfsd4_decode_getdeviceinfo,
+        [OP_GETDEVICELIST]      = (nfsd4_dec)nfsd4_decode_notsupp,
+        [OP_LAYOUTCOMMIT]       = (nfsd4_dec)nfsd4_decode_layoutcommit,
+        [OP_LAYOUTGET]          = (nfsd4_dec)nfsd4_decode_layoutget,
+        [OP_LAYOUTRETURN]       = (nfsd4_dec)nfsd4_decode_layoutreturn,
+#else
        [OP_GETDEVICEINFO]      = (nfsd4_dec)nfsd4_decode_notsupp,
        [OP_GETDEVICELIST]      = (nfsd4_dec)nfsd4_decode_notsupp,
        [OP_LAYOUTCOMMIT]       = (nfsd4_dec)nfsd4_decode_notsupp,
        [OP_LAYOUTGET]          = (nfsd4_dec)nfsd4_decode_notsupp,
        [OP_LAYOUTRETURN]       = (nfsd4_dec)nfsd4_decode_notsupp,
+#endif
        [OP_SECINFO_NO_NAME]    = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
        [OP_SEQUENCE]           = (nfsd4_dec)nfsd4_decode_sequence,
        [OP_SET_SSV]            = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2539,6 +2678,30 @@ out_acl:
                        get_parent_attributes(exp, &stat);
                p = xdr_encode_hyper(p, stat.ino);
        }
+#ifdef CONFIG_NFSD_PNFS
+        if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) ||
+            (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) {
+                if (exp->ex_layout_type) {
+                        p = xdr_reserve_space(xdr, 8);
+                        if (!p)
+                                goto out_resource;
+                        *p++ = cpu_to_be32(1);
+                        *p++ = cpu_to_be32(exp->ex_layout_type);
+                } else {
+                        p = xdr_reserve_space(xdr, 4);
+                        if (!p)
+                                goto out_resource;
+                        *p++ = cpu_to_be32(0);
+                }
+        }
+        if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
+                p = xdr_reserve_space(xdr, 4);
+                if (!p)
+                        goto out_resource;
+                *p++ = cpu_to_be32(stat.blksize);
+        }
+#endif /* CONFIG_NFSD_PNFS */
        if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
                status = nfsd4_encode_security_label(xdr, rqstp, context,
                                                                contextlen);
@@ -2768,16 +2931,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
        if (entry_bytes > cd->rd_maxcount)
                goto fail;
        cd->rd_maxcount -= entry_bytes;
-        if (!cd->rd_dircount)
-                goto fail;
        /*
         * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
         * let's always let through the first entry, at least:
         */
-        name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8;
+        if (!cd->rd_dircount)
+                goto fail;
+        name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8;
        if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
                goto fail;
        cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
        cd->cookie_offset = cookie_offset;
 skip_entry:
        cd->common.err = nfs_ok;
@@ -3814,6 +3978,156 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
        return nfserr;
 }
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * Encode the GETDEVICEINFO reply: the layout type, the layout-specific
+ * device description (skipped when gd_maxcount is 0) and the
+ * notification bitmap.
+ *
+ * If the layout driver's encoder fails and the reply would not have fit
+ * in gd_maxcount anyway, everything encoded so far is discarded and the
+ * required size is returned together with nfserr_toosmall.
+ *
+ * gdev->gd_device is freed on every path.
+ */
+static __be32
+nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
+                struct nfsd4_getdeviceinfo *gdev)
+{
+        struct xdr_stream *xdr = &resp->xdr;
+        const struct nfsd4_layout_ops *ops;
+        u32 starting_len = xdr->buf->len, needed_len;
+        __be32 *p;
+        dprintk("%s: err %d\n", __func__, be32_to_cpu(nfserr));
+        if (nfserr)
+                goto out;
+        nfserr = nfserr_resource;
+        p = xdr_reserve_space(xdr, 4);
+        if (!p)
+                goto out;
+        *p++ = cpu_to_be32(gdev->gd_layout_type);
+        /* If maxcount is 0 then just update notifications */
+        if (gdev->gd_maxcount != 0) {
+                /*
+                 * Index the ops table only here, after the nfserr check:
+                 * gd_layout_type is copied from the request by the
+                 * decoder, and nothing in this function shows it was
+                 * validated when the operation itself failed.
+                 */
+                ops = nfsd4_layout_ops[gdev->gd_layout_type];
+                nfserr = ops->encode_getdeviceinfo(xdr, gdev);
+                if (nfserr) {
+                        /*
+                         * We don't bother to burden the layout drivers with
+                         * enforcing gd_maxcount, just tell the client to
+                         * come back with a bigger buffer if it's not enough.
+                         */
+                        if (xdr->buf->len + 4 > gdev->gd_maxcount)
+                                goto toosmall;
+                        goto out;
+                }
+        }
+        nfserr = nfserr_resource;
+        if (gdev->gd_notify_types) {
+                p = xdr_reserve_space(xdr, 4 + 4);
+                if (!p)
+                        goto out;
+                *p++ = cpu_to_be32(1);                  /* bitmap length */
+                *p++ = cpu_to_be32(gdev->gd_notify_types);
+        } else {
+                p = xdr_reserve_space(xdr, 4);
+                if (!p)
+                        goto out;
+                *p++ = 0;                               /* empty bitmap */
+        }
+        nfserr = 0;
+out:
+        kfree(gdev->gd_device);
+        dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr));
+        return nfserr;
+toosmall:
+        dprintk("%s: maxcount too small\n", __func__);
+        needed_len = xdr->buf->len + 4 /* notifications */;
+        /* drop the partial reply and send only the size that is needed */
+        xdr_truncate_encode(xdr, starting_len);
+        p = xdr_reserve_space(xdr, 4);
+        if (!p) {
+                nfserr = nfserr_resource;
+        } else {
+                *p++ = cpu_to_be32(needed_len);
+                nfserr = nfserr_toosmall;
+        }
+        goto out;
+}
+/*
+ * Encode the LAYOUTGET reply: the return-on-close flag, the layout
+ * stateid, a count of one layout, the granted segment and layout type,
+ * followed by the layout-specific body produced by the layout driver.
+ *
+ * lgp->lg_content is freed on every path.
+ */
+static __be32
+nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
+                struct nfsd4_layoutget *lgp)
+{
+        struct xdr_stream *xdr = &resp->xdr;
+        const struct nfsd4_layout_ops *ops;
+        __be32 *p;
+        dprintk("%s: err %d\n", __func__, be32_to_cpu(nfserr));
+        if (nfserr)
+                goto out;
+        /*
+         * Index the ops table only after the nfserr check: lg_layout_type
+         * is copied from the request by the decoder, and nothing in this
+         * function shows it was validated when the operation failed.
+         */
+        ops = nfsd4_layout_ops[lgp->lg_layout_type];
+        nfserr = nfserr_resource;
+        p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t));
+        if (!p)
+                goto out;
+        *p++ = cpu_to_be32(1);  /* we always set return-on-close */
+        *p++ = cpu_to_be32(lgp->lg_sid.si_generation);
+        p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque,
+                                    sizeof(stateid_opaque_t));
+        *p++ = cpu_to_be32(1);  /* we always return a single layout */
+        p = xdr_encode_hyper(p, lgp->lg_seg.offset);
+        p = xdr_encode_hyper(p, lgp->lg_seg.length);
+        *p++ = cpu_to_be32(lgp->lg_seg.iomode);
+        *p++ = cpu_to_be32(lgp->lg_layout_type);
+        nfserr = ops->encode_layoutget(xdr, lgp);
+out:
+        kfree(lgp->lg_content);
+        return nfserr;
+}
+/*
+ * Encode the LAYOUTCOMMIT reply: a "size changed" flag and, only when
+ * that flag is set, the new 64-bit file size.
+ */
+static __be32
+nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
+                          struct nfsd4_layoutcommit *lcp)
+{
+        __be32 *wp;
+        if (nfserr)
+                return nfserr;
+        wp = xdr_reserve_space(&resp->xdr, 4);
+        if (wp == NULL)
+                return nfserr_resource;
+        *wp = cpu_to_be32(lcp->lc_size_chg);
+        /* nothing more to send unless the size changed */
+        if (!lcp->lc_size_chg)
+                return nfs_ok;
+        wp = xdr_reserve_space(&resp->xdr, 8);
+        if (wp == NULL)
+                return nfserr_resource;
+        xdr_encode_hyper(wp, lcp->lc_newsize);
+        return nfs_ok;
+}
+/*
+ * Encode the LAYOUTRETURN reply: a "stateid present" flag and, when it
+ * is set, the layout stateid.  A failure to encode the stateid is
+ * passed back to the caller instead of being reported as success.
+ */
+static __be32
+nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
+                struct nfsd4_layoutreturn *lrp)
+{
+        struct xdr_stream *xdr = &resp->xdr;
+        __be32 *p;
+        if (nfserr)
+                return nfserr;
+        p = xdr_reserve_space(xdr, 4);
+        if (!p)
+                return nfserr_resource;
+        *p++ = cpu_to_be32(lrp->lrs_present);
+        if (lrp->lrs_present)
+                return nfsd4_encode_stateid(xdr, &lrp->lr_sid);
+        return nfs_ok;
+}
+#endif /* CONFIG_NFSD_PNFS */
 static __be32
 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
                  struct nfsd4_seek *seek)
@@ -3890,11 +4204,19 @@ static nfsd4_enc nfsd4_enc_ops[] = {
        [OP_DESTROY_SESSION]    = (nfsd4_enc)nfsd4_encode_noop,
        [OP_FREE_STATEID]       = (nfsd4_enc)nfsd4_encode_noop,
        [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
+#ifdef CONFIG_NFSD_PNFS
+        [OP_GETDEVICEINFO]      = (nfsd4_enc)nfsd4_encode_getdeviceinfo,
+        [OP_GETDEVICELIST]      = (nfsd4_enc)nfsd4_encode_noop,
+        [OP_LAYOUTCOMMIT]       = (nfsd4_enc)nfsd4_encode_layoutcommit,
+        [OP_LAYOUTGET]          = (nfsd4_enc)nfsd4_encode_layoutget,
+        [OP_LAYOUTRETURN]       = (nfsd4_enc)nfsd4_encode_layoutreturn,
+#else
        [OP_GETDEVICEINFO]      = (nfsd4_enc)nfsd4_encode_noop,
        [OP_GETDEVICELIST]      = (nfsd4_enc)nfsd4_encode_noop,
        [OP_LAYOUTCOMMIT]       = (nfsd4_enc)nfsd4_encode_noop,
        [OP_LAYOUTGET]          = (nfsd4_enc)nfsd4_encode_noop,
        [OP_LAYOUTRETURN]       = (nfsd4_enc)nfsd4_encode_noop,
+#endif
        [OP_SECINFO_NO_NAME]    = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
        [OP_SEQUENCE]           = (nfsd4_enc)nfsd4_encode_sequence,
        [OP_SET_SSV]            = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 19ace74d35f6..aa47d75ddb26 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -21,6 +21,7 @@
 #include "cache.h"
 #include "state.h"
 #include "netns.h"
+#include "pnfs.h"
 /*
 *      We have a single directory with several nodes in it.
@@ -1258,9 +1259,12 @@ static int __init init_nfsd(void)
        retval = nfsd4_init_slabs();
        if (retval)
                goto out_unregister_pernet;
-        retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
+        retval = nfsd4_init_pnfs();
        if (retval)
                goto out_free_slabs;
+        retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
+        if (retval)
+                goto out_exit_pnfs;
        nfsd_stat_init();       /* Statistics */
        retval = nfsd_reply_cache_init();
        if (retval)
@@ -1282,6 +1286,8 @@ out_free_lockd:
 out_free_stat:
        nfsd_stat_shutdown();
        nfsd_fault_inject_cleanup();
+out_exit_pnfs:
+        nfsd4_exit_pnfs();
 out_free_slabs:
        nfsd4_free_slabs();
 out_unregister_pernet:
@@ -1299,6 +1305,7 @@ static void __exit exit_nfsd(void)
        nfsd_stat_shutdown();
        nfsd_lockd_shutdown();
        nfsd4_free_slabs();
+        nfsd4_exit_pnfs();
        nfsd_fault_inject_cleanup();
        unregister_filesystem(&nfsd_fs_type);
        unregister_pernet_subsys(&nfsd_net_ops);
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 33a46a8dfaf7..565c4da1a9eb 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -325,15 +325,27 @@ void		nfsd_lockd_shutdown(void);
 #define NFSD4_SUPPORTED_ATTRS_WORD2 0
+/* 4.1 */
+#ifdef CONFIG_NFSD_PNFS
+#define PNFSD_SUPPORTED_ATTRS_WORD1     FATTR4_WORD1_FS_LAYOUT_TYPES
+#define PNFSD_SUPPORTED_ATTRS_WORD2 \
+(FATTR4_WORD2_LAYOUT_BLKSIZE    | FATTR4_WORD2_LAYOUT_TYPES)
+#else
+#define PNFSD_SUPPORTED_ATTRS_WORD1     0
+#define PNFSD_SUPPORTED_ATTRS_WORD2     0
+#endif /* CONFIG_NFSD_PNFS */
 #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
        NFSD4_SUPPORTED_ATTRS_WORD0
 #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
-        NFSD4_SUPPORTED_ATTRS_WORD1
+        (NFSD4_SUPPORTED_ATTRS_WORD1    | PNFSD_SUPPORTED_ATTRS_WORD1)
 #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
-        (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
+        (NFSD4_SUPPORTED_ATTRS_WORD2    | PNFSD_SUPPORTED_ATTRS_WORD2 | \
+         FATTR4_WORD2_SUPPATTR_EXCLCREAT)
+/* 4.2 */
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 #define NFSD4_2_SECURITY_ATTRS          FATTR4_WORD2_SECURITY_LABEL
 #else
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 08236d70c667..84cae2079d21 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -187,6 +187,24 @@ fh_init(struct svc_fh *fhp, int maxsize)
        return fhp;
 }
+/*
+ * Two file handles match when they have the same size and their first
+ * fh_size bytes (starting at fh_base.fh_pad) are identical.
+ */
+static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+{
+        return fh1->fh_size == fh2->fh_size &&
+               memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad,
+                      fh1->fh_size) == 0;
+}
+/*
+ * Two file handles refer to the same fsid when their fsid types are
+ * equal and the fsid bytes, whose length is given by key_len() for that
+ * type, are identical.
+ */
+static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+{
+        if (fh1->fh_fsid_type != fh2->fh_fsid_type)
+                return false;
+        /* "!= 0" tests memcmp()'s result; it must stay outside the call */
+        if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0)
+                return false;
+        return true;
+}
 #ifdef CONFIG_NFSD_V3
 /*
 * The wcc data stored in current_fh should be cleared
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 314f5c8f8f1a..9277cc91c21b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -119,6 +119,7 @@ struct svc_program		nfsd_program = {
 static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = {
        [0] = 1,
        [1] = 1,
+        [2] = 1,
 };
 int nfsd_vers(int vers, enum vers_op change)
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
new file mode 100644
index 000000000000..fedb4d620a81
--- /dev/null
+++ b/fs/nfsd/pnfs.h
@@ -0,0 +1,81 @@
+#ifndef _FS_NFSD_PNFS_H
+#define _FS_NFSD_PNFS_H 1
+#include <linux/exportfs.h>
+#include <linux/nfsd/export.h>
+#include "state.h"
+#include "xdr4.h"
+struct xdr_stream;
+struct nfsd4_deviceid_map {     /* entry type returned by nfsd4_find_devid_map() */
+        struct list_head        hash;           /* list linkage; NOTE(review): presumably a hash chain - confirm */
+        u64                     idx;            /* presumably the value matched against a device id's fsid_idx - TODO confirm */
+        int                     fsid_type;
+        u32                     fsid[];         /* flexible array; its length is not stored in this struct */
+};
+struct nfsd4_layout_ops {       /* per-layout-type callbacks; found through nfsd4_layout_ops[layout type] */
+        u32             notify_types;
+        __be32 (*proc_getdeviceinfo)(struct super_block *sb,
+                        struct nfsd4_getdeviceinfo *gdevp);
+        __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
+                        struct nfsd4_getdeviceinfo *gdevp);     /* called by nfsd4_encode_getdeviceinfo() when gd_maxcount != 0 */
+        __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp,
+                        struct nfsd4_layoutget *lgp);
+        __be32 (*encode_layoutget)(struct xdr_stream *,
+                        struct nfsd4_layoutget *lgp);           /* called by nfsd4_encode_layoutget() after the common reply fields */
+        __be32 (*proc_layoutcommit)(struct inode *inode,
+                        struct nfsd4_layoutcommit *lcp);
+};
+extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
+extern const struct nfsd4_layout_ops bl_layout_ops;
+__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *cstate, stateid_t *stateid,
+                bool create, u32 layout_type, struct nfs4_layout_stateid **lsp);
+__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp,
+                struct nfs4_layout_stateid *ls);
+__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *cstate,
+                struct nfsd4_layoutreturn *lrp);
+__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *cstate,
+                struct nfsd4_layoutreturn *lrp);
+int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
+                u32 device_generation);
+struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx);
+#ifdef CONFIG_NFSD_PNFS
+void nfsd4_setup_layout_type(struct svc_export *exp);
+void nfsd4_return_all_client_layouts(struct nfs4_client *);
+void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
+                struct nfs4_file *fp);
+int nfsd4_init_pnfs(void);
+void nfsd4_exit_pnfs(void);
+#else
+static inline void nfsd4_setup_layout_type(struct svc_export *exp)
+{       /* !CONFIG_NFSD_PNFS stub: does nothing */
+}
+static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp)
+{       /* !CONFIG_NFSD_PNFS stub: does nothing */
+}
+static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
+                struct nfs4_file *fp)
+{       /* !CONFIG_NFSD_PNFS stub: does nothing */
+}
+static inline void nfsd4_exit_pnfs(void)
+{       /* !CONFIG_NFSD_PNFS stub: does nothing */
+}
+static inline int nfsd4_init_pnfs(void)
+{       /* !CONFIG_NFSD_PNFS stub: always reports success */
+        return 0;
+}
+#endif /* CONFIG_NFSD_PNFS */
+#endif /* _FS_NFSD_PNFS_H */
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 9d3be371240a..4f3bfeb11766 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -92,6 +92,7 @@ struct nfs4_stid {
 /* For a deleg stateid kept around only to process free_stateid's: */
 #define NFS4_REVOKED_DELEG_STID 16
 #define NFS4_CLOSED_DELEG_STID 32
+#define NFS4_LAYOUT_STID 64
        unsigned char sc_type;
        stateid_t sc_stateid;
        struct nfs4_client *sc_client;
@@ -297,6 +298,9 @@ struct nfs4_client {
        struct list_head        cl_delegations;
        struct list_head        cl_revoked;     /* unacknowledged, revoked 4.1 state */
        struct list_head        cl_lru;         /* tail queue */
+#ifdef CONFIG_NFSD_PNFS
+        struct list_head        cl_lo_states;   /* outstanding layout states */
+#endif
        struct xdr_netobj       cl_name;        /* id generated by client */
        nfs4_verifier           cl_verifier;    /* generated by client */
        time_t                  cl_time;        /* time of last lease renewal */
@@ -493,9 +497,13 @@ struct nfs4_file {
        atomic_t                fi_access[2];
        u32                     fi_share_deny;
        struct file             *fi_deleg_file;
-        atomic_t                fi_delegees;
+        int                     fi_delegees;
        struct knfsd_fh         fi_fhandle;
        bool                    fi_had_conflict;
+#ifdef CONFIG_NFSD_PNFS
+        struct list_head        fi_lo_states;
+        atomic_t                fi_lo_recalls;
+#endif
 };
 /*
@@ -528,6 +536,24 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
        return container_of(s, struct nfs4_ol_stateid, st_stid);
 }
+struct nfs4_layout_stateid {    /* layout state; wraps a generic nfs4_stid (see layoutstateid()) */
+        struct nfs4_stid                ls_stid;        /* embedded stid; container_of() anchor */
+        struct list_head                ls_perclnt;     /* NOTE(review): presumably linked on nfs4_client.cl_lo_states - confirm */
+        struct list_head                ls_perfile;     /* NOTE(review): presumably linked on nfs4_file.fi_lo_states - confirm */
+        spinlock_t                      ls_lock;
+        struct list_head                ls_layouts;
+        u32                             ls_layout_type;
+        struct file                     *ls_file;
+        struct nfsd4_callback           ls_recall;
+        stateid_t                       ls_recall_sid;
+        bool                            ls_recalled;
+};
+/* Map an embedded nfs4_stid back to the nfs4_layout_stateid containing it. */
+static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
+{
+        struct nfs4_layout_stateid *ls;
+        ls = container_of(s, struct nfs4_layout_stateid, ls_stid);
+        return ls;
+}
 /* flags for preprocess_seqid_op() */
 #define RD_STATE                0x00000010
 #define WR_STATE                0x00000020
@@ -535,6 +561,7 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
 enum nfsd4_cb_op {
        NFSPROC4_CLNT_CB_NULL = 0,
        NFSPROC4_CLNT_CB_RECALL,
+        NFSPROC4_CLNT_CB_LAYOUT,
        NFSPROC4_CLNT_CB_SEQUENCE,
 };
@@ -545,6 +572,12 @@ struct nfsd_net;
 extern __be32 nfs4_preprocess_stateid_op(struct net *net,
                struct nfsd4_compound_state *cstate,
                stateid_t *stateid, int flags, struct file **filp);
+__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
+                     stateid_t *stateid, unsigned char typemask,
+                     struct nfs4_stid **s, struct nfsd_net *nn);
+struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
+                struct kmem_cache *slab);
+void nfs4_unhash_stid(struct nfs4_stid *s);
 void nfs4_put_stid(struct nfs4_stid *s);
 void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
 extern void nfs4_release_reclaim(struct nfsd_net *);
@@ -567,6 +600,14 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
                                                        struct nfsd_net *nn);
 extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
+struct nfs4_file *find_file(struct knfsd_fh *fh);
+void put_nfs4_file(struct nfs4_file *fi);
+/* Take a reference on an nfs4_file by incrementing its fi_ref counter. */
+static inline void get_nfs4_file(struct nfs4_file *fi)
+{
+        atomic_t *refcount = &fi->fi_ref;
+        atomic_inc(refcount);
+}
+struct file *find_any_file(struct nfs4_file *f);
 /* grace period management */
 void nfsd4_end_grace(struct nfsd_net *nn);
diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
new file mode 100644
index 000000000000..82f89070594c
--- /dev/null
+++ b/fs/nfsd/trace.c
@@ -0,0 +1,5 @@
+#include "state.h"
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
new file mode 100644
index 000000000000..c668520c344b
--- /dev/null
+++ b/fs/nfsd/trace.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nfsd
+#if !defined(_NFSD_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _NFSD_TRACE_H
+#include <linux/tracepoint.h>
+DECLARE_EVENT_CLASS(nfsd_stateid_class,        /* event class that records the four words of one stateid */
+        TP_PROTO(stateid_t *stp),
+        TP_ARGS(stp),
+        TP_STRUCT__entry(
+                __field(u32, cl_boot)
+                __field(u32, cl_id)
+                __field(u32, si_id)
+                __field(u32, si_generation)
+        ),
+        TP_fast_assign(
+                __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
+                __entry->cl_id = stp->si_opaque.so_clid.cl_id;
+                __entry->si_id = stp->si_opaque.so_id;
+                __entry->si_generation = stp->si_generation;
+        ),
+        TP_printk("client %08x:%08x stateid %08x:%08x",  /* order: cl_boot:cl_id, si_id:si_generation */
+                __entry->cl_boot,
+                __entry->cl_id,
+                __entry->si_id,
+                __entry->si_generation)
+)
+#define DEFINE_STATEID_EVENT(name) \
+DEFINE_EVENT(nfsd_stateid_class, name, \
+        TP_PROTO(stateid_t *stp), \
+        TP_ARGS(stp))   /* one event named 'name' in nfsd_stateid_class, taking a stateid_t pointer */
+DEFINE_STATEID_EVENT(layoutstate_alloc);
+DEFINE_STATEID_EVENT(layoutstate_unhash);
+DEFINE_STATEID_EVENT(layoutstate_free);
+DEFINE_STATEID_EVENT(layout_get_lookup_fail);
+DEFINE_STATEID_EVENT(layout_commit_lookup_fail);
+DEFINE_STATEID_EVENT(layout_return_lookup_fail);
+DEFINE_STATEID_EVENT(layout_recall);
+DEFINE_STATEID_EVENT(layout_recall_done);
+DEFINE_STATEID_EVENT(layout_recall_fail);
+DEFINE_STATEID_EVENT(layout_recall_release);
+#endif /* _NFSD_TRACE_H */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 90a5925bd6ab..0bda93e58e1b 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -428,6 +428,61 @@ struct nfsd4_reclaim_complete {
        u32 rca_one_fs;
 };
+struct nfsd4_deviceid {         /* device id; the GETDEVICEINFO decoder copies it as raw bytes from the request */
+        u64                     fsid_idx;
+        u32                     generation;
+        u32                     pad;
+};
+struct nfsd4_layout_seg {       /* iomode plus byte range, shared by the layout operations */
+        u32                     iomode;
+        u64                     offset;
+        u64                     length;         /* NFS4_MAX_UINT64 is used by the LAYOUTRETURN decoder for non-file returns */
+};
+struct nfsd4_getdeviceinfo {    /* GETDEVICEINFO arguments and result */
+        struct nfsd4_deviceid   gd_devid;       /* request */
+        u32                     gd_layout_type; /* request */
+        u32                     gd_maxcount;    /* request; 0 means only notifications are encoded */
+        u32                     gd_notify_types;/* request - response; first bitmap word only */
+        void                    *gd_device;     /* response; kfree()d by nfsd4_encode_getdeviceinfo() */
+};
+struct nfsd4_layoutget {        /* LAYOUTGET arguments and result */
+        u64                     lg_minlength;   /* request */
+        u32                     lg_signal;      /* request */
+        u32                     lg_layout_type; /* request */
+        u32                     lg_maxcount;    /* request */
+        stateid_t               lg_sid;         /* request/response */
+        struct nfsd4_layout_seg lg_seg;         /* request/response */
+        void                    *lg_content;    /* response; kfree()d by nfsd4_encode_layoutget() */
+};
+struct nfsd4_layoutcommit {     /* LAYOUTCOMMIT arguments and result */
+        stateid_t               lc_sid;         /* request */
+        struct nfsd4_layout_seg lc_seg;         /* request */
+        u32                     lc_reclaim;     /* request */
+        u32                     lc_newoffset;   /* request; non-zero when lc_last_wr was sent */
+        u64                     lc_last_wr;     /* request; decoder sets 0 when not sent */
+        struct timespec         lc_mtime;       /* request; tv_nsec is UTIME_NOW when no time was sent */
+        u32                     lc_layout_type; /* request */
+        u32                     lc_up_len;      /* layout length */
+        void                    *lc_up_layout;  /* decoded by callback */
+        u32                     lc_size_chg;    /* boolean for response */
+        u64                     lc_newsize;     /* response; encoded only when lc_size_chg is set */
+};
+struct nfsd4_layoutreturn {     /* LAYOUTRETURN arguments and result */
+        u32                     lr_return_type; /* request */
+        u32                     lr_layout_type; /* request */
+        struct nfsd4_layout_seg lr_seg;         /* request; range is decoded only for RETURN_FILE */
+        u32                     lr_reclaim;     /* request */
+        u32                     lrf_body_len;   /* request; decoded only for RETURN_FILE */
+        void                    *lrf_body;      /* request; decoded only when lrf_body_len > 0 */
+        stateid_t               lr_sid;         /* request/response */
+        u32                     lrs_present;    /* response; non-zero when lr_sid is encoded */
+};
 struct nfsd4_fallocate {
        /* request */
        stateid_t       falloc_stateid;
@@ -491,6 +546,10 @@ struct nfsd4_op {
                struct nfsd4_reclaim_complete   reclaim_complete;
                struct nfsd4_test_stateid       test_stateid;
                struct nfsd4_free_stateid       free_stateid;
+                struct nfsd4_getdeviceinfo      getdeviceinfo;
+                struct nfsd4_layoutget          layoutget;
+                struct nfsd4_layoutcommit       layoutcommit;
+                struct nfsd4_layoutreturn       layoutreturn;
                /* NFSv4.2 */
                struct nfsd4_fallocate          allocate;
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index c5c55dfb91a9..c47f6fdb111a 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -21,3 +21,10 @@
 #define NFS4_dec_cb_recall_sz           (cb_compound_dec_hdr_sz  +      \
                                        cb_sequence_dec_sz +            \
                                        op_dec_sz)
+#define NFS4_enc_cb_layout_sz           (cb_compound_enc_hdr_sz +       \
+                                        cb_sequence_enc_sz +            \
+                                        1 + 3 +                         \
+                                        enc_nfs4_fh_sz + 4)
+#define NFS4_dec_cb_layout_sz           (cb_compound_dec_hdr_sz  +      \
+                                        cb_sequence_dec_sz +            \
+                                        op_dec_sz)
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 3a03e0aea1fb..a8c728acb7a8 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -128,7 +128,6 @@ static const struct vm_operations_struct nilfs_file_vm_ops = {
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = nilfs_page_mkwrite,
-        .remap_pages    = generic_file_remap_pages,
 };
 static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 57ceaf33d177..748ca238915a 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -172,7 +172,6 @@ int nilfs_init_gcinode(struct inode *inode)
        inode->i_mode = S_IFREG;
        mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
        inode->i_mapping->a_ops = &empty_aops;
-        inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
        ii->i_flags = 0;
        nilfs_bmap_init_gc(ii->i_bmap);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index c4dcd1db57ee..892cf5ffdb8e 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -429,7 +429,6 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
        inode->i_mode = S_IFREG;
        mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
-        inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
        inode->i_op = &def_mdt_iops;
        inode->i_fop = &def_mdt_fops;
@@ -457,13 +456,12 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
                               struct nilfs_shadow_map *shadow)
 {
        struct nilfs_mdt_info *mi = NILFS_MDT(inode);
-        struct backing_dev_info *bdi = inode->i_sb->s_bdi;
        INIT_LIST_HEAD(&shadow->frozen_buffers);
        address_space_init_once(&shadow->frozen_data);
-        nilfs_mapping_init(&shadow->frozen_data, inode, bdi);
+        nilfs_mapping_init(&shadow->frozen_data, inode);
        address_space_init_once(&shadow->frozen_btnodes);
-        nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi);
+        nilfs_mapping_init(&shadow->frozen_btnodes, inode);
        mi->mi_shadow = shadow;
        return 0;
 }
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 91093cd74f0d..385704027575 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -141,7 +141,6 @@ enum {
 * @ti_save: Backup of journal_info field of task_struct
 * @ti_flags: Flags
 * @ti_count: Nest level
- * @ti_garbage: List of inode to be put when releasing semaphore
 */
 struct nilfs_transaction_info {
        u32                     ti_magic;
@@ -150,7 +149,6 @@ struct nilfs_transaction_info {
                                   one of other filesystems has a bug. */
        unsigned short          ti_flags;
        unsigned short          ti_count;
-        struct list_head        ti_garbage;
 };
 /* ti_magic */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index da276640f776..700ecbcca55d 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -461,14 +461,12 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
        return nc;
 }
-void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
+void nilfs_mapping_init(struct address_space *mapping, struct inode *inode)
-                        struct backing_dev_info *bdi)
 {
        mapping->host = inode;
        mapping->flags = 0;
        mapping_set_gfp_mask(mapping, GFP_NOFS);
        mapping->private_data = NULL;
-        mapping->backing_dev_info = bdi;
        mapping->a_ops = &empty_aops;
 }
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index ef30c5c2426f..a43b8287d012 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -57,8 +57,7 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_page(struct page *, bool);
 void nilfs_clear_dirty_pages(struct address_space *, bool);
-void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
+void nilfs_mapping_init(struct address_space *mapping, struct inode *inode);
-                        struct backing_dev_info *bdi);
 unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
 unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
                                            sector_t start_blk,
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 7ef18fc656c2..469086b9f99b 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -305,7 +305,6 @@ static void nilfs_transaction_lock(struct super_block *sb,
        ti->ti_count = 0;
        ti->ti_save = cur_ti;
        ti->ti_magic = NILFS_TI_MAGIC;
-        INIT_LIST_HEAD(&ti->ti_garbage);
        current->journal_info = ti;
        for (;;) {
@@ -332,8 +331,6 @@ static void nilfs_transaction_unlock(struct super_block *sb)
        up_write(&nilfs->ns_segctor_sem);
        current->journal_info = ti->ti_save;
-        if (!list_empty(&ti->ti_garbage))
-                nilfs_dispose_list(nilfs, &ti->ti_garbage, 0);
 }
 static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -746,6 +743,15 @@ static void nilfs_dispose_list(struct the_nilfs *nilfs,
        }
 }
+static void nilfs_iput_work_func(struct work_struct *work)
+{
+        struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info,
+                                                 sc_iput_work);
+        struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+        nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0);
+}
 static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
                                     struct nilfs_root *root)
 {
@@ -1900,8 +1906,8 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
 static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
                                             struct the_nilfs *nilfs)
 {
-        struct nilfs_transaction_info *ti = current->journal_info;
        struct nilfs_inode_info *ii, *n;
+        int defer_iput = false;
        spin_lock(&nilfs->ns_inode_lock);
        list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
@@ -1912,9 +1918,24 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
                clear_bit(NILFS_I_BUSY, &ii->i_state);
                brelse(ii->i_bh);
                ii->i_bh = NULL;
-                list_move_tail(&ii->i_dirty, &ti->ti_garbage);
+                list_del_init(&ii->i_dirty);
+                if (!ii->vfs_inode.i_nlink) {
+                        /*
+                         * Defer calling iput() to avoid a deadlock
+                         * over I_SYNC flag for inodes with i_nlink == 0
+                         */
+                        list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
+                        defer_iput = true;
+                } else {
+                        spin_unlock(&nilfs->ns_inode_lock);
+                        iput(&ii->vfs_inode);
+                        spin_lock(&nilfs->ns_inode_lock);
+                }
        }
        spin_unlock(&nilfs->ns_inode_lock);
+        if (defer_iput)
+                schedule_work(&sci->sc_iput_work);
 }
 /*
@@ -2583,6 +2604,8 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
        INIT_LIST_HEAD(&sci->sc_segbufs);
        INIT_LIST_HEAD(&sci->sc_write_logs);
        INIT_LIST_HEAD(&sci->sc_gc_inodes);
+        INIT_LIST_HEAD(&sci->sc_iput_queue);
+        INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func);
        init_timer(&sci->sc_timer);
        sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
@@ -2609,6 +2632,8 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
                ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
                nilfs_transaction_unlock(sci->sc_super);
+                flush_work(&sci->sc_iput_work);
        } while (ret && retrycount-- > 0);
 }
@@ -2633,6 +2658,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
                || sci->sc_seq_request != sci->sc_seq_done);
        spin_unlock(&sci->sc_state_lock);
+        if (flush_work(&sci->sc_iput_work))
+                flag = true;
        if (flag || !nilfs_segctor_confirm(sci))
                nilfs_segctor_write_out(sci);
@@ -2642,6 +2670,12 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
                nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
        }
+        if (!list_empty(&sci->sc_iput_queue)) {
+                nilfs_warning(sci->sc_super, __func__,
+                              "iput queue is not empty\n");
+                nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
+        }
        WARN_ON(!list_empty(&sci->sc_segbufs));
        WARN_ON(!list_empty(&sci->sc_write_logs));
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 38a1d0013314..a48d6de1e02c 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -26,6 +26,7 @@
 #include <linux/types.h>
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
+#include <linux/workqueue.h>
 #include <linux/nilfs2_fs.h>
 #include "nilfs.h"
@@ -92,6 +93,8 @@ struct nilfs_segsum_pointer {
 * @sc_nblk_inc: Block count of current generation
 * @sc_dirty_files: List of files to be written
 * @sc_gc_inodes: List of GC inodes having blocks to be written
+ * @sc_iput_queue: list of inodes for which iput should be done
+ * @sc_iput_work: work struct to defer iput call
 * @sc_freesegs: array of segment numbers to be freed
 * @sc_nfreesegs: number of segments on @sc_freesegs
 * @sc_dsync_inode: inode whose data pages are written for a sync operation
@@ -135,6 +138,8 @@ struct nilfs_sc_info {
        struct list_head        sc_dirty_files;
        struct list_head        sc_gc_inodes;
+        struct list_head        sc_iput_queue;
+        struct work_struct      sc_iput_work;
        __u64                  *sc_freesegs;
        size_t                  sc_nfreesegs;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 2e5b3ec85b8f..5bc2a1cf73c3 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -166,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
        ii->i_state = 0;
        ii->i_cno = 0;
        ii->vfs_inode.i_version = 1;
-        nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi);
+        nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode);
        return &ii->vfs_inode;
 }
@@ -1057,7 +1057,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
 {
        struct the_nilfs *nilfs;
        struct nilfs_root *fsroot;
-        struct backing_dev_info *bdi;
        __u64 cno;
        int err;
@@ -1077,8 +1076,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_time_gran = 1;
        sb->s_max_links = NILFS_LINK_MAX;
-        bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+        sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
-        sb->s_bdi = bdi ? : &default_backing_dev_info;
        err = load_nilfs(nilfs, sb);
        if (err)
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 22c629eedd82..2a24249b30af 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,5 +1,6 @@
 config FSNOTIFY
        def_bool n
+        select SRCU
 source "fs/notify/dnotify/Kconfig"
 source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 30d3addfad75..51ceb8107284 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -140,7 +140,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
        }
        if (S_ISDIR(path->dentry->d_inode->i_mode) &&
-            (marks_ignored_mask & FS_ISDIR))
+            !(marks_mask & FS_ISDIR & ~marks_ignored_mask))
                return false;
        if (event_mask & marks_mask & ~marks_ignored_mask)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index bff8567aa42d..cf275500a665 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -487,20 +487,27 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
                                            unsigned int flags,
                                            int *destroy)
 {
-        __u32 oldmask;
+        __u32 oldmask = 0;
        spin_lock(&fsn_mark->lock);
        if (!(flags & FAN_MARK_IGNORED_MASK)) {
+                __u32 tmask = fsn_mark->mask & ~mask;
+                if (flags & FAN_MARK_ONDIR)
+                        tmask &= ~FAN_ONDIR;
                oldmask = fsn_mark->mask;
-                fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask));
+                fsnotify_set_mark_mask_locked(fsn_mark, tmask);
        } else {
-                oldmask = fsn_mark->ignored_mask;
+                __u32 tmask = fsn_mark->ignored_mask & ~mask;
-                fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask));
+                if (flags & FAN_MARK_ONDIR)
+                        tmask &= ~FAN_ONDIR;
+                fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
        }
+        *destroy = !(fsn_mark->mask | fsn_mark->ignored_mask);
        spin_unlock(&fsn_mark->lock);
-        *destroy = !(oldmask & ~mask);
        return mask & oldmask;
 }
@@ -569,20 +576,22 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
        spin_lock(&fsn_mark->lock);
        if (!(flags & FAN_MARK_IGNORED_MASK)) {
+                __u32 tmask = fsn_mark->mask | mask;
+                if (flags & FAN_MARK_ONDIR)
+                        tmask |= FAN_ONDIR;
                oldmask = fsn_mark->mask;
-                fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
+                fsnotify_set_mark_mask_locked(fsn_mark, tmask);
        } else {
                __u32 tmask = fsn_mark->ignored_mask | mask;
+                if (flags & FAN_MARK_ONDIR)
+                        tmask |= FAN_ONDIR;
                fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
                if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
                        fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
        }
-        if (!(flags & FAN_MARK_ONDIR)) {
-                __u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR;
-                fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
-        }
        spin_unlock(&fsn_mark->lock);
        return mask & ~oldmask;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 643faa44f22b..1da9b2d184dc 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -19,6 +19,7 @@
 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
+#include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/gfp.h>
 #include <linux/pagemap.h>
@@ -2091,7 +2092,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
        count = iov_length(iov, nr_segs);
        pos = *ppos;
        /* We can write back this queue in page reclaim. */
-        current->backing_dev_info = mapping->backing_dev_info;
+        current->backing_dev_info = inode_to_bdi(inode);
        written = 0;
        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err)
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 7e8282dcea2a..c58a1bcfda0f 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -245,16 +245,14 @@ int ocfs2_set_acl(handle_t *handle,
                        ret = posix_acl_equiv_mode(acl, &mode);
                        if (ret < 0)
                                return ret;
-                        else {
-                                if (ret == 0)
-                                        acl = NULL;
-                                ret = ocfs2_acl_set_mode(inode, di_bh,
+                        if (ret == 0)
-                                                         handle, mode);
+                                acl = NULL;
-                                if (ret)
-                                        return ret;
-                        }
+                        ret = ocfs2_acl_set_mode(inode, di_bh,
+                                                 handle, mode);
+                        if (ret)
+                                return ret;
                }
                break;
        case ACL_TYPE_DEFAULT:
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index fcae9ef1a328..044158bd22be 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6873,7 +6873,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                mlog_errno(ret);
-                goto out_unlock;
+                goto out;
        }
        ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
@@ -6931,7 +6931,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
                if (ret) {
                        mlog_errno(ret);
                        need_free = 1;
-                        goto out_commit;
+                        goto out_unlock;
                }
                page_end = PAGE_CACHE_SIZE;
@@ -6964,12 +6964,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
                if (ret) {
                        mlog_errno(ret);
                        need_free = 1;
-                        goto out_commit;
+                        goto out_unlock;
                }
                inode->i_blocks = ocfs2_inode_sector_count(inode);
        }
+out_unlock:
+        if (pages)
+                ocfs2_unlock_and_free_pages(pages, num_pages);
 out_commit:
        if (ret < 0 && did_quota)
                dquot_free_space_nodirty(inode,
@@ -6989,15 +6993,11 @@ out_commit:
        ocfs2_commit_trans(osb, handle);
-out_unlock:
+out:
        if (data_ac)
                ocfs2_free_alloc_context(data_ac);
+        if (pages)
-out:
-        if (pages) {
-                ocfs2_unlock_and_free_pages(pages, num_pages);
                kfree(pages);
-        }
        return ret;
 }
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2e355e0f8335..56c403a563bc 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1016,7 +1016,8 @@ void o2net_fill_node_map(unsigned long *map, unsigned bytes)
        memset(map, 0, bytes);
        for (node = 0; node < O2NM_MAX_NODES; ++node) {
-                o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret);
+                if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret))
+                        continue;
                if (!ret) {
                        set_bit(node, map);
                        sc_put(sc);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index dc024367110a..b95e7df5b76a 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -107,12 +107,12 @@ struct o2net_node {
        struct list_head                nn_status_list;
        /* connects are attempted from when heartbeat comes up until either hb
-         * goes down, the node is unconfigured, no connect attempts succeed
+         * goes down, the node is unconfigured, or a connect succeeds.
-         * before O2NET_CONN_IDLE_DELAY, or a connect succeeds.  connect_work
+         * connect_work is queued from set_nn_state both from hb up and from
-         * is queued from set_nn_state both from hb up and from itself if a
+         * itself if a connect attempt fails and so can be self-arming.
-         * connect attempt fails and so can be self-arming.  shutdown is
+         * shutdown is careful to first mark the nn such that no connects will
-         * careful to first mark the nn such that no connects will be attempted
+         * be attempted before canceling delayed connect work and flushing the
-         * before canceling delayed connect work and flushing the queue. */
+         * queue. */
        struct delayed_work             nn_connect_work;
        unsigned long                   nn_last_connect_attempt;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 319e786175af..b08050bd3f2e 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3456,10 +3456,8 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
        int blocksize = dir->i_sb->s_blocksize;
        status = ocfs2_read_dir_block(dir, 0, &bh, 0);
-        if (status) {
+        if (status)
-                mlog_errno(status);
                goto bail;
-        }
        rec_len = OCFS2_DIR_REC_LEN(namelen);
        offset = 0;
@@ -3480,10 +3478,9 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
                        status = ocfs2_read_dir_block(dir,
                                             offset >> sb->s_blocksize_bits,
                                             &bh, 0);
-                        if (status) {
+                        if (status)
-                                mlog_errno(status);
                                goto bail;
-                        }
                        /* move to next block */
                        de = (struct ocfs2_dir_entry *) bh->b_data;
                }
@@ -3513,7 +3510,6 @@ next:
                de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
        }
-        status = 0;
 bail:
        brelse(bh);
        if (status)
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index b46278f9ae44..fd6bbbbd7d78 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -385,8 +385,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
                head = &res->granted;
        list_for_each_entry(lock, head, list) {
-                if (lock->ml.cookie == cookie)
+                /* if lock is found but unlock is pending ignore the bast */
+                if (lock->ml.cookie == cookie) {
+                        if (lock->unlock_pending)
+                                break;
                        goto do_ast;
+                }
        }
        mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 149eb556b8c6..825136070d2c 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -406,7 +406,7 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
        }
        spin_unlock(&dlm->spinlock);
-        out += snprintf(buf + out, len - out, "Total on list: %ld\n", total);
+        out += snprintf(buf + out, len - out, "Total on list: %lu\n", total);
        return out;
 }
@@ -464,7 +464,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
        spin_unlock(&dlm->master_lock);
        out += snprintf(buf + out, len - out,
-                        "Total: %ld, Longest: %ld\n", total, longest);
+                        "Total: %lu, Longest: %lu\n", total, longest);
        return out;
 }
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 50a59d2337b2..7df88a6dd626 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -674,20 +674,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
        spin_unlock(&dlm->spinlock);
 }
-int dlm_joined(struct dlm_ctxt *dlm)
-{
-        int ret = 0;
-        spin_lock(&dlm_domain_lock);
-        if (dlm->dlm_state == DLM_CTXT_JOINED)
-                ret = 1;
-        spin_unlock(&dlm_domain_lock);
-        return ret;
-}
 int dlm_shutting_down(struct dlm_ctxt *dlm)
 {
        int ret = 0;
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
index 2f7f60bfeb3b..fd6122a38dbd 100644
--- a/fs/ocfs2/dlm/dlmdomain.h
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -28,7 +28,6 @@
 extern spinlock_t dlm_domain_lock;
 extern struct list_head dlm_domains;
-int dlm_joined(struct dlm_ctxt *dlm);
 int dlm_shutting_down(struct dlm_ctxt *dlm);
 void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
                                        int node_num);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index cecd875653e4..ce12e0b1a31f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1070,6 +1070,9 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
                                             dead_node, dlm->name);
                                        list_del_init(&lock->list);
                                        dlm_lock_put(lock);
+                                        /* Can't schedule DLM_UNLOCK_FREE_LOCK
+                                         * - do manually */
+                                        dlm_lock_put(lock);
                                        break;
                                }
                        }
@@ -2346,6 +2349,10 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
                                                     dead_node, dlm->name);
                                                list_del_init(&lock->list);
                                                dlm_lock_put(lock);
+                                                /* Can't schedule
+                                                 * DLM_UNLOCK_FREE_LOCK
+                                                 * - do manually */
+                                                dlm_lock_put(lock);
                                                break;
                                        }
                                }
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 57c40e34f56f..061ba6a91bf2 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -390,12 +390,6 @@ clear_fields:
        ip->ip_conn = NULL;
 }
-static struct backing_dev_info dlmfs_backing_dev_info = {
-        .name           = "ocfs2-dlmfs",
-        .ra_pages       = 0,    /* No readahead */
-        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
 static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 {
        struct inode *inode = new_inode(sb);
@@ -404,7 +398,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
        if (inode) {
                inode->i_ino = get_next_ino();
                inode_init_owner(inode, NULL, mode);
-                inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                inc_nlink(inode);
@@ -428,7 +421,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
        inode->i_ino = get_next_ino();
        inode_init_owner(inode, parent, mode);
-        inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
        ip = DLMFS_I(inode);
@@ -643,10 +635,6 @@ static int __init init_dlmfs_fs(void)
        int status;
        int cleanup_inode = 0, cleanup_worker = 0;
-        status = bdi_init(&dlmfs_backing_dev_info);
-        if (status)
-                return status;
        dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
                                sizeof(struct dlmfs_inode_private),
                                0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -673,7 +661,6 @@ bail:
                        kmem_cache_destroy(dlmfs_inode_cache);
                if (cleanup_worker)
                        destroy_workqueue(user_dlm_worker);
-                bdi_destroy(&dlmfs_backing_dev_info);
        } else
                printk("OCFS2 User DLM kernel interface loaded\n");
        return status;
@@ -693,7 +680,6 @@ static void __exit exit_dlmfs_fs(void)
        rcu_barrier();
        kmem_cache_destroy(dlmfs_inode_cache);
-        bdi_destroy(&dlmfs_backing_dev_info);
 }
 MODULE_AUTHOR("Oracle");
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1c423af04c69..11849a44dc5a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3750,6 +3750,9 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
                        break;
                spin_unlock(&dentry_attach_lock);
+                if (S_ISDIR(dl->dl_inode->i_mode))
+                        shrink_dcache_parent(dentry);
                mlog(0, "d_delete(%pd);\n", dentry);
                /*
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3950693dd0f6..e0f04d55fd05 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -569,7 +569,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
        handle_t *handle = NULL;
        struct ocfs2_alloc_context *data_ac = NULL;
        struct ocfs2_alloc_context *meta_ac = NULL;
-        enum ocfs2_alloc_restarted why;
+        enum ocfs2_alloc_restarted why = RESTART_NONE;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct ocfs2_extent_tree et;
        int did_quota = 0;
@@ -2363,7 +2363,7 @@ relock:
                        goto out_dio;
                }
        } else {
-                current->backing_dev_info = file->f_mapping->backing_dev_info;
+                current->backing_dev_info = inode_to_bdi(inode);
                written = generic_perform_write(file, from, *ppos);
                if (likely(written >= 0))
                        iocb->ki_pos = *ppos + written;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 4f502382180f..d10860fde165 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1447,7 +1447,6 @@ bail:
         * requires that we call do_exit().  And it isn't exported, but
         * complete_and_exit() seems to be a minimal wrapper around it. */
        complete_and_exit(NULL, status);
-        return status;
 }
 void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 10d66c75cecb..9581d190f6e1 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -173,7 +173,6 @@ out:
 static const struct vm_operations_struct ocfs2_file_vm_ops = {
        .fault          = ocfs2_fault,
        .page_mkwrite   = ocfs2_page_mkwrite,
-        .remap_pages    = generic_file_remap_pages,
 };
 int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7d6b7d090452..fdbcbfed529e 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -279,6 +279,8 @@ enum ocfs2_mount_options
                                                     writes */
        OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
        OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
+        OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15,  /* Journal Async Commit */
 };
 #define OCFS2_OSB_SOFT_RO       0x0001
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 1eae330193a6..b6d51333ad02 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -48,6 +48,7 @@ struct ocfs2_quota_recovery {
 /* In-memory structure with quota header information */
 struct ocfs2_mem_dqinfo {
        unsigned int dqi_type;          /* Quota type this structure describes */
+        unsigned int dqi_flags;         /* Flags OLQF_* */
        unsigned int dqi_chunks;        /* Number of chunks in local quota file */
        unsigned int dqi_blocks;        /* Number of blocks allocated for local quota file */
        unsigned int dqi_syncms;        /* How often should we sync with other nodes */
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 10b653930ee2..3d0b63d34225 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -73,12 +73,6 @@ static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
               ol_dqblk_block_off(sb, c, off);
 }
-/* Compute block number from given offset */
-static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
-{
-        return off >> sb->s_blocksize_bits;
-}
 static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
 {
        return off & ((1 << sb->s_blocksize_bits) - 1);
@@ -292,7 +286,7 @@ static void olq_update_info(struct buffer_head *bh, void *private)
        ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
                                                OCFS2_LOCAL_INFO_OFF);
        spin_lock(&dq_data_lock);
-        ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
+        ldinfo->dqi_flags = cpu_to_le32(oinfo->dqi_flags);
        ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks);
        ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks);
        spin_unlock(&dq_data_lock);
@@ -701,8 +695,8 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
        /* We don't need the lock and we have to acquire quota file locks
         * which will later depend on this lock */
        mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
-        info->dqi_maxblimit = 0x7fffffffffffffffLL;
+        info->dqi_max_spc_limit = 0x7fffffffffffffffLL;
-        info->dqi_maxilimit = 0x7fffffffffffffffLL;
+        info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
        oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
        if (!oinfo) {
                mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
@@ -737,13 +731,13 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
        }
        ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
                                                OCFS2_LOCAL_INFO_OFF);
-        info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
+        oinfo->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
        oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
        oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
        oinfo->dqi_libh = bh;
        /* We crashed when using local quota file? */
-        if (!(info->dqi_flags & OLQF_CLEAN)) {
+        if (!(oinfo->dqi_flags & OLQF_CLEAN)) {
                rec = OCFS2_SB(sb)->quota_rec;
                if (!rec) {
                        rec = ocfs2_alloc_quota_recovery();
@@ -772,7 +766,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
        }
        /* Now mark quota file as used */
-        info->dqi_flags &= ~OLQF_CLEAN;
+        oinfo->dqi_flags &= ~OLQF_CLEAN;
        status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
        if (status < 0) {
                mlog_errno(status);
@@ -857,7 +851,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
                goto out;
        /* Mark local file as clean */
-        info->dqi_flags |= OLQF_CLEAN;
+        oinfo->dqi_flags |= OLQF_CLEAN;
        status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
                                 oinfo->dqi_libh,
                                 olq_update_info,
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index d81f6e2a97f5..ee541f92dab4 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2428,8 +2428,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
                        get_bh(prev_bh);
                }
-                rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
                trace_ocfs2_calc_refcount_meta_credits_iterate(
                                recs_add, (unsigned long long)cpos, clusters,
                                (unsigned long long)le64_to_cpu(rec.r_cpos),
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 41ffd36c689c..6a348b0294ab 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -39,7 +39,7 @@
 #define OCFS2_CHECK_RESERVATIONS
 #endif
-DEFINE_SPINLOCK(resv_lock);
+static DEFINE_SPINLOCK(resv_lock);
 #define OCFS2_MIN_RESV_WINDOW_BITS      8
 #define OCFS2_MAX_RESV_WINDOW_BITS      1024
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 83723179e1ec..87a1f7679d9b 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -191,6 +191,7 @@ enum {
        Opt_coherency_full,
        Opt_resv_level,
        Opt_dir_resv_level,
+        Opt_journal_async_commit,
        Opt_err,
 };
@@ -222,6 +223,7 @@ static const match_table_t tokens = {
        {Opt_coherency_full, "coherency=full"},
        {Opt_resv_level, "resv_level=%u"},
        {Opt_dir_resv_level, "dir_resv_level=%u"},
+        {Opt_journal_async_commit, "journal_async_commit"},
        {Opt_err, NULL}
 };
@@ -1000,36 +1002,6 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
        }
 }
-/* Handle quota on quotactl */
-static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
-{
-        unsigned int feature[OCFS2_MAXQUOTAS] = {
-                                        OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
-                                        OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
-        if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
-                return -EINVAL;
-        return dquot_enable(sb_dqopt(sb)->files[type], type,
-                            format_id, DQUOT_LIMITS_ENABLED);
-}
-/* Handle quota off quotactl */
-static int ocfs2_quota_off(struct super_block *sb, int type)
-{
-        return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
-}
-static const struct quotactl_ops ocfs2_quotactl_ops = {
-        .quota_on_meta  = ocfs2_quota_on,
-        .quota_off      = ocfs2_quota_off,
-        .quota_sync     = dquot_quota_sync,
-        .get_info       = dquot_get_dqinfo,
-        .set_info       = dquot_set_dqinfo,
-        .get_dqblk      = dquot_get_dqblk,
-        .set_dqblk      = dquot_set_dqblk,
-};
 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 {
        struct dentry *root;
@@ -1500,6 +1472,9 @@ static int ocfs2_parse_options(struct super_block *sb,
                            option < OCFS2_MAX_RESV_LEVEL)
                                mopt->dir_resv_level = option;
                        break;
+                case Opt_journal_async_commit:
+                        mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT;
+                        break;
                default:
                        mlog(ML_ERROR,
                             "Unrecognized mount option \"%s\" "
@@ -1606,6 +1581,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
        if (osb->osb_dir_resv_level != osb->osb_resv_level)
                seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level);
+        if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
+                seq_printf(s, ",journal_async_commit");
        return 0;
 }
@@ -2079,7 +2057,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
        sb->s_op = &ocfs2_sops;
        sb->s_d_op = &ocfs2_dentry_ops;
        sb->s_export_op = &ocfs2_export_ops;
-        sb->s_qcop = &ocfs2_quotactl_ops;
+        sb->s_qcop = &dquot_quotactl_sysfile_ops;
        sb->dq_op = &ocfs2_quota_operations;
        sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
        sb->s_xattr = ocfs2_xattr_handlers;
@@ -2475,6 +2453,15 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
                goto finally;
        }
+        if (osb->s_mount_opt & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
+                jbd2_journal_set_features(osb->journal->j_journal,
+                                JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+                                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+        else
+                jbd2_journal_clear_features(osb->journal->j_journal,
+                                JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+                                JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
        if (dirty) {
                /* recover my local alloc if we didn't unmount cleanly. */
                status = ocfs2_begin_local_alloc_recovery(osb,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 662f8dee149f..85b190dc132f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -5334,16 +5334,6 @@ out:
        return ret;
 }
-static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
-                                        struct ocfs2_xattr_bucket *bucket,
-                                        int offs)
-{
-        int block_off = offs >> inode->i_sb->s_blocksize_bits;
-        offs = offs % inode->i_sb->s_blocksize;
-        return bucket_block(bucket, block_off) + offs;
-}
 /*
 * Truncate the specified xe_off entry in xattr bucket.
 * bucket is indicated by header_bh and len is the new length.
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 1e3187da1fed..7eee2d8b97d9 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -5,6 +5,7 @@
 #include <linux/ksm.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
+#include <linux/huge_mm.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
@@ -121,9 +122,18 @@ u64 stable_page_flags(struct page *page)
         * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
         * to make sure a given page is a thp, not a non-huge compound page.
         */
-        else if (PageTransCompound(page) && (PageLRU(compound_head(page)) ||
+        else if (PageTransCompound(page)) {
-                                             PageAnon(compound_head(page))))
+                struct page *head = compound_head(page);
-                u |= 1 << KPF_THP;
+                if (PageLRU(head) || PageAnon(head))
+                        u |= 1 << KPF_THP;
+                else if (is_huge_zero_page(head)) {
+                        u |= 1 << KPF_ZERO_PAGE;
+                        u |= 1 << KPF_THP;
+                }
+        } else if (is_zero_pfn(page_to_pfn(page)))
+                u |= 1 << KPF_ZERO_PAGE;
        /*
         * Caveats on high order pages: page->_count will only be set
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 246eae84b13b..0e36c1e49fe3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -21,7 +21,7 @@
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-        unsigned long data, text, lib, swap;
+        unsigned long data, text, lib, swap, ptes, pmds;
        unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
        /*
@@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
        text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
        lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
        swap = get_mm_counter(mm, MM_SWAPENTS);
+        ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
+        pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
        seq_printf(m,
                "VmPeak:\t%8lu kB\n"
                "VmSize:\t%8lu kB\n"
@@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
                "VmExe:\t%8lu kB\n"
                "VmLib:\t%8lu kB\n"
                "VmPTE:\t%8lu kB\n"
+                "VmPMD:\t%8lu kB\n"
                "VmSwap:\t%8lu kB\n",
                hiwater_vm << (PAGE_SHIFT-10),
                total_vm << (PAGE_SHIFT-10),
@@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
                total_rss << (PAGE_SHIFT-10),
                data << (PAGE_SHIFT-10),
                mm->stack_vm << (PAGE_SHIFT-10), text, lib,
-                (PTRS_PER_PTE * sizeof(pte_t) *
+                ptes >> 10,
-                 atomic_long_read(&mm->nr_ptes)) >> 10,
+                pmds >> 10,
                swap << (PAGE_SHIFT-10));
 }
@@ -433,7 +436,6 @@ const struct file_operations proc_tid_maps_operations = {
 #ifdef CONFIG_PROC_PAGE_MONITOR
 struct mem_size_stats {
-        struct vm_area_struct *vma;
        unsigned long resident;
        unsigned long shared_clean;
        unsigned long shared_dirty;
@@ -443,7 +445,6 @@ struct mem_size_stats {
        unsigned long anonymous;
        unsigned long anonymous_thp;
        unsigned long swap;
-        unsigned long nonlinear;
        u64 pss;
 };
@@ -483,8 +484,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
                struct mm_walk *walk)
 {
        struct mem_size_stats *mss = walk->private;
-        struct vm_area_struct *vma = mss->vma;
+        struct vm_area_struct *vma = walk->vma;
-        pgoff_t pgoff = linear_page_index(vma, addr);
        struct page *page = NULL;
        if (pte_present(*pte)) {
@@ -496,17 +496,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
                        mss->swap += PAGE_SIZE;
                else if (is_migration_entry(swpent))
                        page = migration_entry_to_page(swpent);
-        } else if (pte_file(*pte)) {
-                if (pte_to_pgoff(*pte) != pgoff)
-                        mss->nonlinear += PAGE_SIZE;
        }
        if (!page)
                return;
-        if (page->index != pgoff)
-                mss->nonlinear += PAGE_SIZE;
        smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
 }
@@ -515,7 +508,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
                struct mm_walk *walk)
 {
        struct mem_size_stats *mss = walk->private;
-        struct vm_area_struct *vma = mss->vma;
+        struct vm_area_struct *vma = walk->vma;
        struct page *page;
        /* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -536,8 +529,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                           struct mm_walk *walk)
 {
-        struct mem_size_stats *mss = walk->private;
+        struct vm_area_struct *vma = walk->vma;
-        struct vm_area_struct *vma = mss->vma;
        pte_t *pte;
        spinlock_t *ptl;
@@ -596,7 +588,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
                [ilog2(VM_ACCOUNT)]     = "ac",
                [ilog2(VM_NORESERVE)]   = "nr",
                [ilog2(VM_HUGETLB)]     = "ht",
-                [ilog2(VM_NONLINEAR)]   = "nl",
                [ilog2(VM_ARCH_1)]      = "ar",
                [ilog2(VM_DONTDUMP)]    = "dd",
 #ifdef CONFIG_MEM_SOFT_DIRTY
@@ -630,10 +621,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
        };
        memset(&mss, 0, sizeof mss);
-        mss.vma = vma;
        /* mmap_sem is held in m_start */
-        if (vma->vm_mm && !is_vm_hugetlb_page(vma))
+        walk_page_vma(vma, &smaps_walk);
-                walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
        show_map_vma(m, vma, is_pid);
@@ -668,10 +657,6 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
                   (vma->vm_flags & VM_LOCKED) ?
                        (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
-        if (vma->vm_flags & VM_NONLINEAR)
-                seq_printf(m, "Nonlinear:      %8lu kB\n",
-                                mss.nonlinear >> 10);
        show_smap_vma_flags(m, vma);
        m_cache_vma(m, vma);
        return 0;
@@ -751,14 +736,13 @@ enum clear_refs_types {
 };
 struct clear_refs_private {
-        struct vm_area_struct *vma;
        enum clear_refs_types type;
 };
+#ifdef CONFIG_MEM_SOFT_DIRTY
 static inline void clear_soft_dirty(struct vm_area_struct *vma,
                unsigned long addr, pte_t *pte)
 {
-#ifdef CONFIG_MEM_SOFT_DIRTY
        /*
         * The soft-dirty tracker uses #PF-s to catch writes
         * to pages, so write-protect the pte as well. See the
@@ -772,24 +756,63 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
                ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
        } else if (is_swap_pte(ptent)) {
                ptent = pte_swp_clear_soft_dirty(ptent);
-        } else if (pte_file(ptent)) {
-                ptent = pte_file_clear_soft_dirty(ptent);
        }
        set_pte_at(vma->vm_mm, addr, pte, ptent);
-#endif
 }
+static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
+                unsigned long addr, pmd_t *pmdp)
+{
+        pmd_t pmd = *pmdp;
+        pmd = pmd_wrprotect(pmd);
+        pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
+        if (vma->vm_flags & VM_SOFTDIRTY)
+                vma->vm_flags &= ~VM_SOFTDIRTY;
+        set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+}
+#else
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+                unsigned long addr, pte_t *pte)
+{
+}
+static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
+                unsigned long addr, pmd_t *pmdp)
+{
+}
+#endif
 static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)
 {
        struct clear_refs_private *cp = walk->private;
-        struct vm_area_struct *vma = cp->vma;
+        struct vm_area_struct *vma = walk->vma;
        pte_t *pte, ptent;
        spinlock_t *ptl;
        struct page *page;
-        split_huge_page_pmd(vma, addr, pmd);
+        if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+                if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+                        clear_soft_dirty_pmd(vma, addr, pmd);
+                        goto out;
+                }
+                page = pmd_page(*pmd);
+                /* Clear accessed and referenced bits. */
+                pmdp_test_and_clear_young(vma, addr, pmd);
+                ClearPageReferenced(page);
+out:
+                spin_unlock(ptl);
+                return 0;
+        }
        if (pmd_trans_unstable(pmd))
                return 0;
@@ -818,6 +841,28 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
        return 0;
 }
+static int clear_refs_test_walk(unsigned long start, unsigned long end,
+                                struct mm_walk *walk)
+{
+        struct clear_refs_private *cp = walk->private;
+        struct vm_area_struct *vma = walk->vma;
+        if (vma->vm_flags & VM_PFNMAP)
+                return 1;
+        /*
+         * Writing 1 to /proc/pid/clear_refs affects all pages.
+         * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
+         * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
+         * Writing 4 to /proc/pid/clear_refs affects all pages.
+         */
+        if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
+                return 1;
+        if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
+                return 1;
+        return 0;
+}
 static ssize_t clear_refs_write(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppos)
 {
@@ -858,6 +903,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
                };
                struct mm_walk clear_refs_walk = {
                        .pmd_entry = clear_refs_pte_range,
+                        .test_walk = clear_refs_test_walk,
                        .mm = mm,
                        .private = &cp,
                };
@@ -877,28 +923,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
                        }
                        mmu_notifier_invalidate_range_start(mm, 0, -1);
                }
-                for (vma = mm->mmap; vma; vma = vma->vm_next) {
+                walk_page_range(0, ~0UL, &clear_refs_walk);
-                        cp.vma = vma;
-                        if (is_vm_hugetlb_page(vma))
-                                continue;
-                        /*
-                         * Writing 1 to /proc/pid/clear_refs affects all pages.
-                         *
-                         * Writing 2 to /proc/pid/clear_refs only affects
-                         * Anonymous pages.
-                         *
-                         * Writing 3 to /proc/pid/clear_refs only affects file
-                         * mapped pages.
-                         *
-                         * Writing 4 to /proc/pid/clear_refs affects all pages.
-                         */
-                        if (type == CLEAR_REFS_ANON && vma->vm_file)
-                                continue;
-                        if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
-                                continue;
-                        walk_page_range(vma->vm_start, vma->vm_end,
-                                        &clear_refs_walk);
-                }
                if (type == CLEAR_REFS_SOFT_DIRTY)
                        mmu_notifier_invalidate_range_end(mm, 0, -1);
                flush_tlb_mm(mm);
@@ -1066,15 +1091,13 @@ static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemap
 static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                             struct mm_walk *walk)
 {
-        struct vm_area_struct *vma;
+        struct vm_area_struct *vma = walk->vma;
        struct pagemapread *pm = walk->private;
        spinlock_t *ptl;
-        pte_t *pte;
+        pte_t *pte, *orig_pte;
        int err = 0;
-        /* find the first VMA at or above 'addr' */
+        if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
-        vma = find_vma(walk->mm, addr);
-        if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
                int pmd_flags2;
                if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
@@ -1100,51 +1123,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        if (pmd_trans_unstable(pmd))
                return 0;
-        while (1) {
+        /*
-                /* End of address space hole, which we mark as non-present. */
+         * We can assume that @vma always points to a valid one and @end never
-                unsigned long hole_end;
+         * goes beyond vma->vm_end.
+         */
-                if (vma)
+        orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
-                        hole_end = min(end, vma->vm_start);
+        for (; addr < end; pte++, addr += PAGE_SIZE) {
-                else
+                pagemap_entry_t pme;
-                        hole_end = end;
-                for (; addr < hole_end; addr += PAGE_SIZE) {
-                        pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
-                        err = add_to_pagemap(addr, &pme, pm);
-                        if (err)
-                                return err;
-                }
-                if (!vma || vma->vm_start >= end)
-                        break;
-                /*
-                 * We can't possibly be in a hugetlb VMA. In general,
-                 * for a mm_walk with a pmd_entry and a hugetlb_entry,
-                 * the pmd_entry can only be called on addresses in a
-                 * hugetlb if the walk starts in a non-hugetlb VMA and
-                 * spans a hugepage VMA. Since pagemap_read walks are
-                 * PMD-sized and PMD-aligned, this will never be true.
-                 */
-                BUG_ON(is_vm_hugetlb_page(vma));
-                /* Addresses in the VMA. */
-                for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
-                        pagemap_entry_t pme;
-                        pte = pte_offset_map(pmd, addr);
-                        pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
-                        pte_unmap(pte);
-                        err = add_to_pagemap(addr, &pme, pm);
-                        if (err)
-                                return err;
-                }
-                if (addr == end)
+                pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
+                err = add_to_pagemap(addr, &pme, pm);
+                if (err)
                        break;
-                vma = find_vma(walk->mm, addr);
        }
+        pte_unmap_unlock(orig_pte, ptl);
        cond_resched();
@@ -1170,15 +1162,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
                                 struct mm_walk *walk)
 {
        struct pagemapread *pm = walk->private;
-        struct vm_area_struct *vma;
+        struct vm_area_struct *vma = walk->vma;
        int err = 0;
        int flags2;
        pagemap_entry_t pme;
-        vma = find_vma(walk->mm, addr);
+        if (vma->vm_flags & VM_SOFTDIRTY)
-        WARN_ON_ONCE(!vma);
-        if (vma && (vma->vm_flags & VM_SOFTDIRTY))
                flags2 = __PM_SOFT_DIRTY;
        else
                flags2 = 0;
@@ -1338,7 +1327,6 @@ const struct file_operations proc_pagemap_operations = {
 #ifdef CONFIG_NUMA
 struct numa_maps {
-        struct vm_area_struct *vma;
        unsigned long pages;
        unsigned long anon;
        unsigned long active;
@@ -1407,18 +1395,17 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
 static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
                unsigned long end, struct mm_walk *walk)
 {
-        struct numa_maps *md;
+        struct numa_maps *md = walk->private;
+        struct vm_area_struct *vma = walk->vma;
        spinlock_t *ptl;
        pte_t *orig_pte;
        pte_t *pte;
-        md = walk->private;
+        if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
-        if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
                pte_t huge_pte = *(pte_t *)pmd;
                struct page *page;
-                page = can_gather_numa_stats(huge_pte, md->vma, addr);
+                page = can_gather_numa_stats(huge_pte, vma, addr);
                if (page)
                        gather_stats(page, md, pte_dirty(huge_pte),
                                     HPAGE_PMD_SIZE/PAGE_SIZE);
@@ -1430,7 +1417,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
                return 0;
        orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
        do {
-                struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
+                struct page *page = can_gather_numa_stats(*pte, vma, addr);
                if (!page)
                        continue;
                gather_stats(page, md, pte_dirty(*pte), 1);
@@ -1440,7 +1427,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
        return 0;
 }
 #ifdef CONFIG_HUGETLB_PAGE
-static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
                unsigned long addr, unsigned long end, struct mm_walk *walk)
 {
        struct numa_maps *md;
@@ -1459,7 +1446,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
 }
 #else
-static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
                unsigned long addr, unsigned long end, struct mm_walk *walk)
 {
        return 0;
@@ -1477,7 +1464,12 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
        struct numa_maps *md = &numa_priv->md;
        struct file *file = vma->vm_file;
        struct mm_struct *mm = vma->vm_mm;
-        struct mm_walk walk = {};
+        struct mm_walk walk = {
+                .hugetlb_entry = gather_hugetlb_stats,
+                .pmd_entry = gather_pte_stats,
+                .private = md,
+                .mm = mm,
+        };
        struct mempolicy *pol;
        char buffer[64];
        int nid;
@@ -1488,13 +1480,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
        /* Ensure we start with an empty set of numa_maps statistics. */
        memset(md, 0, sizeof(*md));
-        md->vma = vma;
-        walk.hugetlb_entry = gather_hugetbl_stats;
-        walk.pmd_entry = gather_pte_stats;
-        walk.private = md;
-        walk.mm = mm;
        pol = __get_vma_policy(vma, vma->vm_start);
        if (pol) {
                mpol_to_str(buffer, sizeof(buffer), pol);
@@ -1528,7 +1513,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
        if (is_vm_hugetlb_page(vma))
                seq_puts(m, " huge");
-        walk_page_range(vma->vm_start, vma->vm_end, &walk);
+        /* mmap_sem is held by m_start */
+        walk_page_vma(vma, &walk);
        if (!md->pages)
                goto out;
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 983d9510becc..916b8e23d968 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -21,6 +21,16 @@ config PSTORE_CONSOLE
          When the option is enabled, pstore will log all kernel
          messages, even if no oops or panic happened.
+config PSTORE_PMSG
+        bool "Log user space messages"
+        depends on PSTORE
+        help
+          When the option is enabled, pstore will export a character
+          interface /dev/pmsg0 to log user space messages. On reboot
+          data can be retrieved from /sys/fs/pstore/pmsg-ramoops-[ID].
+          If unsure, say N.
 config PSTORE_FTRACE
        bool "Persistent function tracer"
        depends on PSTORE
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
index 4c9095c2781e..e647d8e81712 100644
--- a/fs/pstore/Makefile
+++ b/fs/pstore/Makefile
@@ -7,5 +7,7 @@ obj-y += pstore.o
 pstore-objs += inode.o platform.o
 obj-$(CONFIG_PSTORE_FTRACE)     += ftrace.o
+obj-$(CONFIG_PSTORE_PMSG)       += pmsg.o
 ramoops-objs += ram.o ram_core.o
 obj-$(CONFIG_PSTORE_RAM)        += ramoops.o
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 50416602774d..b32ce53d24ee 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -338,32 +338,38 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
        switch (type) {
        case PSTORE_TYPE_DMESG:
-                sprintf(name, "dmesg-%s-%lld%s", psname, id,
+                scnprintf(name, sizeof(name), "dmesg-%s-%lld%s",
-                                                compressed ? ".enc.z" : "");
+                          psname, id, compressed ? ".enc.z" : "");
                break;
        case PSTORE_TYPE_CONSOLE:
-                sprintf(name, "console-%s-%lld", psname, id);
+                scnprintf(name, sizeof(name), "console-%s-%lld", psname, id);
                break;
        case PSTORE_TYPE_FTRACE:
-                sprintf(name, "ftrace-%s-%lld", psname, id);
+                scnprintf(name, sizeof(name), "ftrace-%s-%lld", psname, id);
                break;
        case PSTORE_TYPE_MCE:
-                sprintf(name, "mce-%s-%lld", psname, id);
+                scnprintf(name, sizeof(name), "mce-%s-%lld", psname, id);
                break;
        case PSTORE_TYPE_PPC_RTAS:
-                sprintf(name, "rtas-%s-%lld", psname, id);
+                scnprintf(name, sizeof(name), "rtas-%s-%lld", psname, id);
                break;
        case PSTORE_TYPE_PPC_OF:
-                sprintf(name, "powerpc-ofw-%s-%lld", psname, id);
+                scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld",
+                          psname, id);
                break;
        case PSTORE_TYPE_PPC_COMMON:
-                sprintf(name, "powerpc-common-%s-%lld", psname, id);
+                scnprintf(name, sizeof(name), "powerpc-common-%s-%lld",
+                          psname, id);
+                break;
+        case PSTORE_TYPE_PMSG:
+                scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id);
                break;
        case PSTORE_TYPE_UNKNOWN:
-                sprintf(name, "unknown-%s-%lld", psname, id);
+                scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id);
                break;
        default:
-                sprintf(name, "type%d-%s-%lld", type, psname, id);
+                scnprintf(name, sizeof(name), "type%d-%s-%lld",
+                          type, psname, id);
                break;
        }
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 3b3d305277c4..c36ba2cd0b5d 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -45,6 +45,12 @@ extern void pstore_register_ftrace(void);
 static inline void pstore_register_ftrace(void) {}
 #endif
+#ifdef CONFIG_PSTORE_PMSG
+extern void pstore_register_pmsg(void);
+#else
+static inline void pstore_register_pmsg(void) {}
+#endif
 extern struct pstore_info *psinfo;
 extern void     pstore_set_kmsg_bytes(int);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 0a9b72cdfeca..c4c9a10c5760 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -301,7 +301,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
                if (big_oops_buf) {
                        dst = big_oops_buf;
-                        hsize = sprintf(dst, "%s#%d Part%d\n", why,
+                        hsize = sprintf(dst, "%s#%d Part%u\n", why,
                                                        oopscount, part);
                        size = big_oops_buf_sz - hsize;
@@ -321,7 +321,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
                        }
                } else {
                        dst = psinfo->buf;
-                        hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount,
+                        hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount,
                                                                        part);
                        size = psinfo->bufsize - hsize;
                        dst += hsize;
@@ -447,6 +447,7 @@ int pstore_register(struct pstore_info *psi)
        if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
                pstore_register_console();
                pstore_register_ftrace();
+                pstore_register_pmsg();
        }
        if (pstore_update_ms >= 0) {
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
new file mode 100644
index 000000000000..feb5dd2948b4
--- /dev/null
+++ b/fs/pstore/pmsg.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright 2014  Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include "internal.h"
+static DEFINE_MUTEX(pmsg_lock);
+#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE)
+/*
+ * Write handler for the pmsg character device: copies @count bytes from the
+ * user buffer @buf to the pstore backend as PSTORE_TYPE_PMSG data.
+ *
+ * The data is staged through a bounce buffer of at most
+ * PMSG_MAX_BOUNCE_BUFFER_SIZE bytes, so a larger write reaches the backend
+ * as several write_buf() calls. pmsg_lock is held across the whole loop.
+ *
+ * Returns @count on success, 0 for an empty write, -EFAULT if the user range
+ * fails access_ok() or cannot be copied, and -ENOMEM if the bounce buffer
+ * cannot be allocated. @file and @ppos are not used.
+ */
+static ssize_t write_pmsg(struct file *file, const char __user *buf,
+                          size_t count, loff_t *ppos)
+{
+        size_t i, buffer_size;
+        char *buffer;
+        if (!count)
+                return 0;
+        if (!access_ok(VERIFY_READ, buf, count))
+                return -EFAULT;
+        buffer_size = count;
+        if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE)
+                buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE;
+        buffer = vmalloc(buffer_size);
+        /* Allocation can fail; bail out before the copy loop writes to it. */
+        if (!buffer)
+                return -ENOMEM;
+        mutex_lock(&pmsg_lock);
+        for (i = 0; i < count; ) {
+                size_t c = min(count - i, buffer_size);
+                u64 id;
+                long ret;
+                ret = __copy_from_user(buffer, buf + i, c);
+                if (unlikely(ret != 0)) {
+                        mutex_unlock(&pmsg_lock);
+                        vfree(buffer);
+                        return -EFAULT;
+                }
+                /* NOTE: the write_buf() return value is not checked here. */
+                psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c,
+                                  psinfo);
+                i += c;
+        }
+        mutex_unlock(&pmsg_lock);
+        vfree(buffer);
+        return count;
+}
+/*
+ * File operations for the pmsg character device: write goes to write_pmsg()
+ * and llseek is wired to noop_llseek; no other operation is provided.
+ */
+static const struct file_operations pmsg_fops = {
+        .owner          = THIS_MODULE,
+        .llseek         = noop_llseek,
+        .write          = write_pmsg,
+};
+static struct class *pmsg_class;
+static int pmsg_major;
+#define PMSG_NAME "pmsg"
+#undef pr_fmt
+#define pr_fmt(fmt) PMSG_NAME ": " fmt
+/*
+ * devnode callback installed on the pmsg class: when the caller supplies
+ * @mode it is set to 0220. No node name is provided (always returns NULL).
+ */
+static char *pmsg_devnode(struct device *dev, umode_t *mode)
+{
+        if (!mode)
+                return NULL;
+        *mode = 0220;
+        return NULL;
+}
+/*
+ * Register the pmsg character device: obtain a major number through
+ * register_chrdev() (kept in pmsg_major), create the "pmsg" class with
+ * pmsg_devnode as its devnode callback, then create device "pmsg0" with
+ * minor 0.
+ *
+ * Nothing is returned to the caller; on failure an error is logged and the
+ * steps that already succeeded are undone in reverse order.
+ */
+void pstore_register_pmsg(void)
+{
+        struct device *pmsg_device;
+        pmsg_major = register_chrdev(0, PMSG_NAME, &pmsg_fops);
+        if (pmsg_major < 0) {
+                pr_err("register_chrdev failed\n");
+                goto err;
+        }
+        pmsg_class = class_create(THIS_MODULE, PMSG_NAME);
+        if (IS_ERR(pmsg_class)) {
+                pr_err("device class file already in use\n");
+                goto err_class;
+        }
+        pmsg_class->devnode = pmsg_devnode;
+        pmsg_device = device_create(pmsg_class, NULL, MKDEV(pmsg_major, 0),
+                                        NULL, "%s%d", PMSG_NAME, 0);
+        if (IS_ERR(pmsg_device)) {
+                pr_err("failed to create device\n");
+                goto err_device;
+        }
+        return;
+/* Unwind: each label undoes the step that succeeded before the failure. */
+err_device:
+        class_destroy(pmsg_class);
+err_class:
+        unregister_chrdev(pmsg_major, PMSG_NAME);
+err:
+        return;
+}
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 8613e5b35c22..39d1373128e9 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -51,6 +51,10 @@ static ulong ramoops_ftrace_size = MIN_MEM_SIZE;
 module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400);
 MODULE_PARM_DESC(ftrace_size, "size of ftrace log");
+static ulong ramoops_pmsg_size = MIN_MEM_SIZE;
+module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400);
+MODULE_PARM_DESC(pmsg_size, "size of user space message log");
 static ulong mem_address;
 module_param(mem_address, ulong, 0400);
 MODULE_PARM_DESC(mem_address,
@@ -82,12 +86,14 @@ struct ramoops_context {
        struct persistent_ram_zone **przs;
        struct persistent_ram_zone *cprz;
        struct persistent_ram_zone *fprz;
+        struct persistent_ram_zone *mprz;
        phys_addr_t phys_addr;
        unsigned long size;
        unsigned int memtype;
        size_t record_size;
        size_t console_size;
        size_t ftrace_size;
+        size_t pmsg_size;
        int dump_oops;
        struct persistent_ram_ecc_info ecc_info;
        unsigned int max_dump_cnt;
@@ -96,6 +102,7 @@ struct ramoops_context {
        unsigned int dump_read_cnt;
        unsigned int console_read_cnt;
        unsigned int ftrace_read_cnt;
+        unsigned int pmsg_read_cnt;
        struct pstore_info pstore;
 };
@@ -109,6 +116,7 @@ static int ramoops_pstore_open(struct pstore_info *psi)
        cxt->dump_read_cnt = 0;
        cxt->console_read_cnt = 0;
        cxt->ftrace_read_cnt = 0;
+        cxt->pmsg_read_cnt = 0;
        return 0;
 }
@@ -164,6 +172,12 @@ static int ramoops_read_kmsg_hdr(char *buffer, struct timespec *time,
        return header_length;
 }
+/*
+ * A zone is usable when it exists and the sum of persistent_ram_old_size()
+ * and persistent_ram_ecc_string(prz, NULL, 0) is non-zero.
+ */
+static bool prz_ok(struct persistent_ram_zone *prz)
+{
+        if (!prz)
+                return false;
+        return (persistent_ram_old_size(prz) +
+                persistent_ram_ecc_string(prz, NULL, 0)) != 0;
+}
 static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
                                   int *count, struct timespec *time,
                                   char **buf, bool *compressed,
@@ -178,13 +192,16 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
        prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt,
                                   cxt->max_dump_cnt, id, type,
                                   PSTORE_TYPE_DMESG, 1);
-        if (!prz)
+        if (!prz_ok(prz))
                prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
                                           1, id, type, PSTORE_TYPE_CONSOLE, 0);
-        if (!prz)
+        if (!prz_ok(prz))
                prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt,
                                           1, id, type, PSTORE_TYPE_FTRACE, 0);
-        if (!prz)
+        if (!prz_ok(prz))
+                prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
+                                           1, id, type, PSTORE_TYPE_PMSG, 0);
+        if (!prz_ok(prz))
                return 0;
        if (!persistent_ram_old(prz))
@@ -252,6 +269,11 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
                        return -ENOMEM;
                persistent_ram_write(cxt->fprz, buf, size);
                return 0;
+        } else if (type == PSTORE_TYPE_PMSG) {
+                if (!cxt->mprz)
+                        return -ENOMEM;
+                persistent_ram_write(cxt->mprz, buf, size);
+                return 0;
        }
        if (type != PSTORE_TYPE_DMESG)
@@ -309,6 +331,9 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
        case PSTORE_TYPE_FTRACE:
                prz = cxt->fprz;
                break;
+        case PSTORE_TYPE_PMSG:
+                prz = cxt->mprz;
+                break;
        default:
                return -EINVAL;
        }
@@ -435,7 +460,7 @@ static int ramoops_probe(struct platform_device *pdev)
                goto fail_out;
        if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size &&
-                        !pdata->ftrace_size)) {
+                        !pdata->ftrace_size && !pdata->pmsg_size)) {
                pr_err("The memory size and the record/console size must be "
                        "non-zero\n");
                goto fail_out;
@@ -447,6 +472,8 @@ static int ramoops_probe(struct platform_device *pdev)
                pdata->console_size = rounddown_pow_of_two(pdata->console_size);
        if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
                pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
+        if (pdata->pmsg_size && !is_power_of_2(pdata->pmsg_size))
+                pdata->pmsg_size = rounddown_pow_of_two(pdata->pmsg_size);
        cxt->size = pdata->mem_size;
        cxt->phys_addr = pdata->mem_address;
@@ -454,12 +481,14 @@ static int ramoops_probe(struct platform_device *pdev)
        cxt->record_size = pdata->record_size;
        cxt->console_size = pdata->console_size;
        cxt->ftrace_size = pdata->ftrace_size;
+        cxt->pmsg_size = pdata->pmsg_size;
        cxt->dump_oops = pdata->dump_oops;
        cxt->ecc_info = pdata->ecc_info;
        paddr = cxt->phys_addr;
-        dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size;
+        dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
+                        - cxt->pmsg_size;
        err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz);
        if (err)
                goto fail_out;
@@ -474,13 +503,9 @@ static int ramoops_probe(struct platform_device *pdev)
        if (err)
                goto fail_init_fprz;
-        if (!cxt->przs && !cxt->cprz && !cxt->fprz) {
+        err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0);
-                pr_err("memory size too small, minimum is %zu\n",
+        if (err)
-                        cxt->console_size + cxt->record_size +
+                goto fail_init_mprz;
-                        cxt->ftrace_size);
-                err = -EINVAL;
-                goto fail_cnt;
-        }
        cxt->pstore.data = cxt;
        /*
@@ -525,7 +550,8 @@ fail_buf:
        kfree(cxt->pstore.buf);
 fail_clear:
        cxt->pstore.bufsize = 0;
-fail_cnt:
+        kfree(cxt->mprz);
+fail_init_mprz:
        kfree(cxt->fprz);
 fail_init_fprz:
        kfree(cxt->cprz);
@@ -583,6 +609,7 @@ static void ramoops_register_dummy(void)
        dummy_data->record_size = record_size;
        dummy_data->console_size = ramoops_console_size;
        dummy_data->ftrace_size = ramoops_ftrace_size;
+        dummy_data->pmsg_size = ramoops_pmsg_size;
        dummy_data->dump_oops = dump_oops;
        /*
         * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index c51df1dd237e..4a09975aac90 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -5,6 +5,7 @@
 config QUOTA
        bool "Quota support"
        select QUOTACTL
+        select SRCU
        help
          If you say Y here, you will be able to set per user limits for disk
          usage (also called disk quotas). Currently, it works for the
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 8f0acef3d184..0ccd4ba3a246 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1248,7 +1248,7 @@ static int ignore_hardlimit(struct dquot *dquot)
        return capable(CAP_SYS_RESOURCE) &&
               (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
-                !(info->dqi_flags & V1_DQF_RSQUASH));
+                !(info->dqi_flags & DQF_ROOT_SQUASH));
 }
 /* needs dq_data_lock */
@@ -2385,41 +2385,106 @@ out:
 }
 EXPORT_SYMBOL(dquot_quota_on_mount);
-static inline qsize_t qbtos(qsize_t blocks)
+static int dquot_quota_enable(struct super_block *sb, unsigned int flags)
 {
-        return blocks << QIF_DQBLKSIZE_BITS;
+        int ret;
+        int type;
+        struct quota_info *dqopt = sb_dqopt(sb);
+        if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
+                return -ENOSYS;
+        /* Accounting cannot be turned on while fs is mounted */
+        flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT);
+        if (!flags)
+                return -EINVAL;
+        for (type = 0; type < MAXQUOTAS; type++) {
+                if (!(flags & qtype_enforce_flag(type)))
+                        continue;
+                /* Can't enforce without accounting */
+                if (!sb_has_quota_usage_enabled(sb, type))
+                        return -EINVAL;
+                ret = dquot_enable(dqopt->files[type], type,
+                                   dqopt->info[type].dqi_fmt_id,
+                                   DQUOT_LIMITS_ENABLED);
+                if (ret < 0)
+                        goto out_err;
+        }
+        return 0;
+out_err:
+        /* Backout enforcement enablement we already did */
+        for (type--; type >= 0; type--)  {
+                if (flags & qtype_enforce_flag(type))
+                        dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
+        }
+        /* Error code translation for better compatibility with XFS */
+        if (ret == -EBUSY)
+                ret = -EEXIST;
+        return ret;
 }
-static inline qsize_t stoqb(qsize_t space)
+static int dquot_quota_disable(struct super_block *sb, unsigned int flags)
 {
-        return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
+        int ret;
+        int type;
+        struct quota_info *dqopt = sb_dqopt(sb);
+        if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
+                return -ENOSYS;
+        /*
+         * We don't support turning off accounting via quotactl. In principle
+         * quota infrastructure can do this but filesystems don't expect
+         * userspace to be able to do it.
+         */
+        if (flags &
+                  (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT))
+                return -EOPNOTSUPP;
+        /* Filter out limits not enabled */
+        for (type = 0; type < MAXQUOTAS; type++)
+                if (!sb_has_quota_limits_enabled(sb, type))
+                        flags &= ~qtype_enforce_flag(type);
+        /* Nothing left? */
+        if (!flags)
+                return -EEXIST;
+        for (type = 0; type < MAXQUOTAS; type++) {
+                if (flags & qtype_enforce_flag(type)) {
+                        ret = dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
+                        if (ret < 0)
+                                goto out_err;
+                }
+        }
+        return 0;
+out_err:
+        /* Backout enforcement disabling we already did */
+        for (type--; type >= 0; type--)  {
+                if (flags & qtype_enforce_flag(type))
+                        dquot_enable(dqopt->files[type], type,
+                                     dqopt->info[type].dqi_fmt_id,
+                                     DQUOT_LIMITS_ENABLED);
+        }
+        return ret;
 }
 /* Generic routine for getting common part of quota structure */
-static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
+static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di)
 {
        struct mem_dqblk *dm = &dquot->dq_dqb;
        memset(di, 0, sizeof(*di));
-        di->d_version = FS_DQUOT_VERSION;
-        di->d_flags = dquot->dq_id.type == USRQUOTA ?
-                        FS_USER_QUOTA : FS_GROUP_QUOTA;
-        di->d_id = from_kqid_munged(current_user_ns(), dquot->dq_id);
        spin_lock(&dq_data_lock);
-        di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit);
+        di->d_spc_hardlimit = dm->dqb_bhardlimit;
-        di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit);
+        di->d_spc_softlimit = dm->dqb_bsoftlimit;
        di->d_ino_hardlimit = dm->dqb_ihardlimit;
        di->d_ino_softlimit = dm->dqb_isoftlimit;
-        di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace;
+        di->d_space = dm->dqb_curspace + dm->dqb_rsvspace;
-        di->d_icount = dm->dqb_curinodes;
+        di->d_ino_count = dm->dqb_curinodes;
-        di->d_btimer = dm->dqb_btime;
+        di->d_spc_timer = dm->dqb_btime;
-        di->d_itimer = dm->dqb_itime;
+        di->d_ino_timer = dm->dqb_itime;
        spin_unlock(&dq_data_lock);
 }
 int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
-                    struct fs_disk_quota *di)
+                    struct qc_dqblk *di)
 {
        struct dquot *dquot;
@@ -2433,70 +2498,70 @@ int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
 }
 EXPORT_SYMBOL(dquot_get_dqblk);
-#define VFS_FS_DQ_MASK \
+#define VFS_QC_MASK \
-        (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \
+        (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \
-         FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \
+         QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \
-         FS_DQ_BTIMER | FS_DQ_ITIMER)
+         QC_SPC_TIMER | QC_INO_TIMER)
 /* Generic routine for setting common part of quota structure */
-static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
+static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
 {
        struct mem_dqblk *dm = &dquot->dq_dqb;
        int check_blim = 0, check_ilim = 0;
        struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
-        if (di->d_fieldmask & ~VFS_FS_DQ_MASK)
+        if (di->d_fieldmask & ~VFS_QC_MASK)
                return -EINVAL;
-        if (((di->d_fieldmask & FS_DQ_BSOFT) &&
+        if (((di->d_fieldmask & QC_SPC_SOFT) &&
-             (di->d_blk_softlimit > dqi->dqi_maxblimit)) ||
+             di->d_spc_softlimit > dqi->dqi_max_spc_limit) ||
-            ((di->d_fieldmask & FS_DQ_BHARD) &&
+            ((di->d_fieldmask & QC_SPC_HARD) &&
-             (di->d_blk_hardlimit > dqi->dqi_maxblimit)) ||
+             di->d_spc_hardlimit > dqi->dqi_max_spc_limit) ||
-            ((di->d_fieldmask & FS_DQ_ISOFT) &&
+            ((di->d_fieldmask & QC_INO_SOFT) &&
-             (di->d_ino_softlimit > dqi->dqi_maxilimit)) ||
+             (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) ||
-            ((di->d_fieldmask & FS_DQ_IHARD) &&
+            ((di->d_fieldmask & QC_INO_HARD) &&
-             (di->d_ino_hardlimit > dqi->dqi_maxilimit)))
+             (di->d_ino_hardlimit > dqi->dqi_max_ino_limit)))
                return -ERANGE;
        spin_lock(&dq_data_lock);
-        if (di->d_fieldmask & FS_DQ_BCOUNT) {
+        if (di->d_fieldmask & QC_SPACE) {
-                dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace;
+                dm->dqb_curspace = di->d_space - dm->dqb_rsvspace;
                check_blim = 1;
                set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
        }
-        if (di->d_fieldmask & FS_DQ_BSOFT)
+        if (di->d_fieldmask & QC_SPC_SOFT)
-                dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit);
+                dm->dqb_bsoftlimit = di->d_spc_softlimit;
-        if (di->d_fieldmask & FS_DQ_BHARD)
+        if (di->d_fieldmask & QC_SPC_HARD)
-                dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit);
+                dm->dqb_bhardlimit = di->d_spc_hardlimit;
-        if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) {
+        if (di->d_fieldmask & (QC_SPC_SOFT | QC_SPC_HARD)) {
                check_blim = 1;
                set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
        }
-        if (di->d_fieldmask & FS_DQ_ICOUNT) {
+        if (di->d_fieldmask & QC_INO_COUNT) {
-                dm->dqb_curinodes = di->d_icount;
+                dm->dqb_curinodes = di->d_ino_count;
                check_ilim = 1;
                set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
        }
-        if (di->d_fieldmask & FS_DQ_ISOFT)
+        if (di->d_fieldmask & QC_INO_SOFT)
                dm->dqb_isoftlimit = di->d_ino_softlimit;
-        if (di->d_fieldmask & FS_DQ_IHARD)
+        if (di->d_fieldmask & QC_INO_HARD)
                dm->dqb_ihardlimit = di->d_ino_hardlimit;
-        if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) {
+        if (di->d_fieldmask & (QC_INO_SOFT | QC_INO_HARD)) {
                check_ilim = 1;
                set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
        }
-        if (di->d_fieldmask & FS_DQ_BTIMER) {
+        if (di->d_fieldmask & QC_SPC_TIMER) {
-                dm->dqb_btime = di->d_btimer;
+                dm->dqb_btime = di->d_spc_timer;
                check_blim = 1;
                set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
        }
-        if (di->d_fieldmask & FS_DQ_ITIMER) {
+        if (di->d_fieldmask & QC_INO_TIMER) {
-                dm->dqb_itime = di->d_itimer;
+                dm->dqb_itime = di->d_ino_timer;
                check_ilim = 1;
                set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
        }
@@ -2506,7 +2571,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
                    dm->dqb_curspace < dm->dqb_bsoftlimit) {
                        dm->dqb_btime = 0;
                        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
-                } else if (!(di->d_fieldmask & FS_DQ_BTIMER))
+                } else if (!(di->d_fieldmask & QC_SPC_TIMER))
                        /* Set grace only if user hasn't provided his own... */
                        dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
        }
@@ -2515,7 +2580,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
                    dm->dqb_curinodes < dm->dqb_isoftlimit) {
                        dm->dqb_itime = 0;
                        clear_bit(DQ_INODES_B, &dquot->dq_flags);
-                } else if (!(di->d_fieldmask & FS_DQ_ITIMER))
+                } else if (!(di->d_fieldmask & QC_INO_TIMER))
                        /* Set grace only if user hasn't provided his own... */
                        dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
        }
@@ -2531,7 +2596,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
 }
 int dquot_set_dqblk(struct super_block *sb, struct kqid qid,
-                  struct fs_disk_quota *di)
+                  struct qc_dqblk *di)
 {
        struct dquot *dquot;
        int rc;
@@ -2582,6 +2647,14 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
                goto out;
        }
        mi = sb_dqopt(sb)->info + type;
+        if (ii->dqi_valid & IIF_FLAGS) {
+                if (ii->dqi_flags & ~DQF_SETINFO_MASK ||
+                    (ii->dqi_flags & DQF_ROOT_SQUASH &&
+                     mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) {
+                        err = -EINVAL;
+                        goto out;
+                }
+        }
        spin_lock(&dq_data_lock);
        if (ii->dqi_valid & IIF_BGRACE)
                mi->dqi_bgrace = ii->dqi_bgrace;
@@ -2611,6 +2684,17 @@ const struct quotactl_ops dquot_quotactl_ops = {
 };
 EXPORT_SYMBOL(dquot_quotactl_ops);
+/*
+ * quotactl operations table that routes quota_enable/quota_disable to
+ * dquot_quota_enable()/dquot_quota_disable() (both return -ENOSYS unless
+ * DQUOT_QUOTA_SYS_FILE is set) and uses the generic dquot_* helpers for
+ * sync, info and dqblk access.
+ */
+const struct quotactl_ops dquot_quotactl_sysfile_ops = {
+        .quota_enable   = dquot_quota_enable,
+        .quota_disable  = dquot_quota_disable,
+        .quota_sync     = dquot_quota_sync,
+        .get_info       = dquot_get_dqinfo,
+        .set_info       = dquot_set_dqinfo,
+        .get_dqblk      = dquot_get_dqblk,
+        .set_dqblk      = dquot_set_dqblk
+};
+EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);
 static int do_proc_dqstats(struct ctl_table *table, int write,
                     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 2aa4151f99d2..d14a799c7785 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -66,18 +66,40 @@ static int quota_sync_all(int type)
        return ret;
 }
+/*
+ * Map a quota type (USRQUOTA, GRPQUOTA or PRJQUOTA) to the corresponding
+ * FS_QUOTA_*_ENFD enforcement flag. Any other type yields 0.
+ */
+unsigned int qtype_enforce_flag(int type)
+{
+        if (type == USRQUOTA)
+                return FS_QUOTA_UDQ_ENFD;
+        if (type == GRPQUOTA)
+                return FS_QUOTA_GDQ_ENFD;
+        if (type == PRJQUOTA)
+                return FS_QUOTA_PDQ_ENFD;
+        return 0;
+}
 static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
                         struct path *path)
 {
-        if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta)
+        if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable)
                return -ENOSYS;
-        if (sb->s_qcop->quota_on_meta)
+        if (sb->s_qcop->quota_enable)
-                return sb->s_qcop->quota_on_meta(sb, type, id);
+                return sb->s_qcop->quota_enable(sb, qtype_enforce_flag(type));
        if (IS_ERR(path))
                return PTR_ERR(path);
        return sb->s_qcop->quota_on(sb, type, id, path);
 }
+/*
+ * Turn quotas off for @type. The quota_disable hook, called with the type's
+ * enforcement flag, takes precedence over quota_off; -ENOSYS is returned
+ * when the filesystem provides neither.
+ */
+static int quota_quotaoff(struct super_block *sb, int type)
+{
+        if (sb->s_qcop->quota_disable)
+                return sb->s_qcop->quota_disable(sb, qtype_enforce_flag(type));
+        if (sb->s_qcop->quota_off)
+                return sb->s_qcop->quota_off(sb, type);
+        return -ENOSYS;
+}
 static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
 {
        __u32 fmt;
@@ -118,17 +140,27 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
        return sb->s_qcop->set_info(sb, type, &info);
 }
-static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src)
+/* Scale a quota-block count to space: shift left by QIF_DQBLKSIZE_BITS. */
+static inline qsize_t qbtos(qsize_t blocks)
+{
+        qsize_t space = blocks << QIF_DQBLKSIZE_BITS;
+        return space;
+}
+/*
+ * Convert space to quota blocks, rounding up to the next multiple of
+ * QIF_DQBLKSIZE before shifting right by QIF_DQBLKSIZE_BITS.
+ */
+static inline qsize_t stoqb(qsize_t space)
+{
+        qsize_t rounded = space + QIF_DQBLKSIZE - 1;
+        return rounded >> QIF_DQBLKSIZE_BITS;
+}
+static void copy_to_if_dqblk(struct if_dqblk *dst, struct qc_dqblk *src)
 {
        memset(dst, 0, sizeof(*dst));
-        dst->dqb_bhardlimit = src->d_blk_hardlimit;
+        dst->dqb_bhardlimit = stoqb(src->d_spc_hardlimit);
-        dst->dqb_bsoftlimit = src->d_blk_softlimit;
+        dst->dqb_bsoftlimit = stoqb(src->d_spc_softlimit);
-        dst->dqb_curspace = src->d_bcount;
+        dst->dqb_curspace = src->d_space;
        dst->dqb_ihardlimit = src->d_ino_hardlimit;
        dst->dqb_isoftlimit = src->d_ino_softlimit;
-        dst->dqb_curinodes = src->d_icount;
+        dst->dqb_curinodes = src->d_ino_count;
-        dst->dqb_btime = src->d_btimer;
+        dst->dqb_btime = src->d_spc_timer;
-        dst->dqb_itime = src->d_itimer;
+        dst->dqb_itime = src->d_ino_timer;
        dst->dqb_valid = QIF_ALL;
 }
@@ -136,7 +168,7 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
                          void __user *addr)
 {
        struct kqid qid;
-        struct fs_disk_quota fdq;
+        struct qc_dqblk fdq;
        struct if_dqblk idq;
        int ret;
@@ -154,36 +186,36 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
        return 0;
 }
-static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src)
+static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src)
 {
-        dst->d_blk_hardlimit = src->dqb_bhardlimit;
+        dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit);
-        dst->d_blk_softlimit  = src->dqb_bsoftlimit;
+        dst->d_spc_softlimit = qbtos(src->dqb_bsoftlimit);
-        dst->d_bcount = src->dqb_curspace;
+        dst->d_space = src->dqb_curspace;
        dst->d_ino_hardlimit = src->dqb_ihardlimit;
        dst->d_ino_softlimit = src->dqb_isoftlimit;
-        dst->d_icount = src->dqb_curinodes;
+        dst->d_ino_count = src->dqb_curinodes;
-        dst->d_btimer = src->dqb_btime;
+        dst->d_spc_timer = src->dqb_btime;
-        dst->d_itimer = src->dqb_itime;
+        dst->d_ino_timer = src->dqb_itime;
        dst->d_fieldmask = 0;
        if (src->dqb_valid & QIF_BLIMITS)
-                dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD;
+                dst->d_fieldmask |= QC_SPC_SOFT | QC_SPC_HARD;
        if (src->dqb_valid & QIF_SPACE)
-                dst->d_fieldmask |= FS_DQ_BCOUNT;
+                dst->d_fieldmask |= QC_SPACE;
        if (src->dqb_valid & QIF_ILIMITS)
-                dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD;
+                dst->d_fieldmask |= QC_INO_SOFT | QC_INO_HARD;
        if (src->dqb_valid & QIF_INODES)
-                dst->d_fieldmask |= FS_DQ_ICOUNT;
+                dst->d_fieldmask |= QC_INO_COUNT;
        if (src->dqb_valid & QIF_BTIME)
-                dst->d_fieldmask |= FS_DQ_BTIMER;
+                dst->d_fieldmask |= QC_SPC_TIMER;
        if (src->dqb_valid & QIF_ITIME)
-                dst->d_fieldmask |= FS_DQ_ITIMER;
+                dst->d_fieldmask |= QC_INO_TIMER;
 }
 static int quota_setquota(struct super_block *sb, int type, qid_t id,
                          void __user *addr)
 {
-        struct fs_disk_quota fdq;
+        struct qc_dqblk fdq;
        struct if_dqblk idq;
        struct kqid qid;
@@ -198,15 +230,26 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id,
        return sb->s_qcop->set_dqblk(sb, qid, &fdq);
 }
-static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
+/*
+ * Q_XQUOTAON handler: read a 32-bit flag word from userspace and pass it to
+ * the filesystem's ->quota_enable() hook.
+ *
+ * Returns -EFAULT if the flags cannot be copied in, -ENOSYS if the filesystem
+ * provides no ->quota_enable() hook, otherwise the hook's return value.
+ */
+static int quota_enable(struct super_block *sb, void __user *addr)
+{
+        __u32 flags;
+        if (copy_from_user(&flags, addr, sizeof(flags)))
+                return -EFAULT;
+        if (!sb->s_qcop->quota_enable)
+                return -ENOSYS;
+        return sb->s_qcop->quota_enable(sb, flags);
+}
+static int quota_disable(struct super_block *sb, void __user *addr)
 {
        __u32 flags;
        if (copy_from_user(&flags, addr, sizeof(flags)))
                return -EFAULT;
-        if (!sb->s_qcop->set_xstate)
+        if (!sb->s_qcop->quota_disable)
                return -ENOSYS;
-        return sb->s_qcop->set_xstate(sb, flags, cmd);
+        return sb->s_qcop->quota_disable(sb, flags);
 }
 static int quota_getxstate(struct super_block *sb, void __user *addr)
@@ -247,10 +290,78 @@ static int quota_getxstatev(struct super_block *sb, void __user *addr)
        return ret;
 }
+/*
+ * XFS defines the BBTOB and BTOBB macros inside fs/xfs/ and we cannot move
+ * them out of there because xfsprogs relies on the definitions being in that
+ * header file. So just define equivalent functions here for quota purposes.
+ */
+#define XFS_BB_SHIFT 9
+/* Convert a count of basic blocks (1 << XFS_BB_SHIFT bytes each) to bytes. */
+static inline u64 quota_bbtob(u64 blocks)
+{
+        return blocks << XFS_BB_SHIFT;
+}
+/*
+ * Convert a byte count to basic blocks (1 << XFS_BB_SHIFT bytes each),
+ * rounding up so that a partially used block is still counted.
+ *
+ * Whole blocks and the partial-block remainder are handled separately: the
+ * usual "add block size - 1, then shift" form wraps around for byte counts
+ * within one block of the u64 maximum and would report such values as 0.
+ */
+static inline u64 quota_btobb(u64 bytes)
+{
+        return (bytes >> XFS_BB_SHIFT) +
+                !!(bytes & ((1ULL << XFS_BB_SHIFT) - 1));
+}
+/*
+ * Translate an XFS-format quota record (struct fs_disk_quota) into the
+ * generic struct qc_dqblk handed to ->set_dqblk().
+ *
+ * Block limits and usage are converted from basic blocks to bytes with
+ * quota_bbtob(); inode counts, timers and warning counts are copied as-is.
+ * dst->d_fieldmask is rebuilt from scratch, mapping each FS_DQ_* bit set in
+ * src->d_fieldmask to its QC_* counterpart.
+ */
+static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src)
+{
+        /* Regular space and inode limits / usage */
+        dst->d_spc_hardlimit = quota_bbtob(src->d_blk_hardlimit);
+        dst->d_spc_softlimit = quota_bbtob(src->d_blk_softlimit);
+        dst->d_ino_hardlimit = src->d_ino_hardlimit;
+        dst->d_ino_softlimit = src->d_ino_softlimit;
+        dst->d_space = quota_bbtob(src->d_bcount);
+        dst->d_ino_count = src->d_icount;
+        dst->d_ino_timer = src->d_itimer;
+        dst->d_spc_timer = src->d_btimer;
+        dst->d_ino_warns = src->d_iwarns;
+        dst->d_spc_warns = src->d_bwarns;
+        /* Realtime (d_rtb*) space limits / usage */
+        dst->d_rt_spc_hardlimit = quota_bbtob(src->d_rtb_hardlimit);
+        dst->d_rt_spc_softlimit = quota_bbtob(src->d_rtb_softlimit);
+        dst->d_rt_space = quota_bbtob(src->d_rtbcount);
+        dst->d_rt_spc_timer = src->d_rtbtimer;
+        dst->d_rt_spc_warns = src->d_rtbwarns;
+        /* Translate the "which fields are valid" mask bit by bit */
+        dst->d_fieldmask = 0;
+        if (src->d_fieldmask & FS_DQ_ISOFT)
+                dst->d_fieldmask |= QC_INO_SOFT;
+        if (src->d_fieldmask & FS_DQ_IHARD)
+                dst->d_fieldmask |= QC_INO_HARD;
+        if (src->d_fieldmask & FS_DQ_BSOFT)
+                dst->d_fieldmask |= QC_SPC_SOFT;
+        if (src->d_fieldmask & FS_DQ_BHARD)
+                dst->d_fieldmask |= QC_SPC_HARD;
+        if (src->d_fieldmask & FS_DQ_RTBSOFT)
+                dst->d_fieldmask |= QC_RT_SPC_SOFT;
+        if (src->d_fieldmask & FS_DQ_RTBHARD)
+                dst->d_fieldmask |= QC_RT_SPC_HARD;
+        if (src->d_fieldmask & FS_DQ_BTIMER)
+                dst->d_fieldmask |= QC_SPC_TIMER;
+        if (src->d_fieldmask & FS_DQ_ITIMER)
+                dst->d_fieldmask |= QC_INO_TIMER;
+        if (src->d_fieldmask & FS_DQ_RTBTIMER)
+                dst->d_fieldmask |= QC_RT_SPC_TIMER;
+        if (src->d_fieldmask & FS_DQ_BWARNS)
+                dst->d_fieldmask |= QC_SPC_WARNS;
+        if (src->d_fieldmask & FS_DQ_IWARNS)
+                dst->d_fieldmask |= QC_INO_WARNS;
+        if (src->d_fieldmask & FS_DQ_RTBWARNS)
+                dst->d_fieldmask |= QC_RT_SPC_WARNS;
+        if (src->d_fieldmask & FS_DQ_BCOUNT)
+                dst->d_fieldmask |= QC_SPACE;
+        if (src->d_fieldmask & FS_DQ_ICOUNT)
+                dst->d_fieldmask |= QC_INO_COUNT;
+        if (src->d_fieldmask & FS_DQ_RTBCOUNT)
+                dst->d_fieldmask |= QC_RT_SPACE;
+}
 static int quota_setxquota(struct super_block *sb, int type, qid_t id,
                           void __user *addr)
 {
        struct fs_disk_quota fdq;
+        struct qc_dqblk qdq;
        struct kqid qid;
        if (copy_from_user(&fdq, addr, sizeof(fdq)))
@@ -260,13 +371,44 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
        qid = make_kqid(current_user_ns(), type, id);
        if (!qid_valid(qid))
                return -EINVAL;
-        return sb->s_qcop->set_dqblk(sb, qid, &fdq);
+        copy_from_xfs_dqblk(&qdq, &fdq);
+        return sb->s_qcop->set_dqblk(sb, qid, &qdq);
+}
+/*
+ * Fill an XFS-format quota record (struct fs_disk_quota) from the generic
+ * struct qc_dqblk returned by ->get_dqblk().
+ *
+ * @type and @id identify the quota being reported and are stored in
+ * d_flags and d_id. Byte values are converted to basic blocks with
+ * quota_btobb(), i.e. rounded up. Everything not assigned below, including
+ * d_fieldmask, stays zero from the initial memset().
+ */
+static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src,
+                              int type, qid_t id)
+{
+        memset(dst, 0, sizeof(*dst));
+        dst->d_version = FS_DQUOT_VERSION;
+        dst->d_id = id;
+        /* Any type other than user or project is reported as group quota */
+        if (type == USRQUOTA)
+                dst->d_flags = FS_USER_QUOTA;
+        else if (type == PRJQUOTA)
+                dst->d_flags = FS_PROJ_QUOTA;
+        else
+                dst->d_flags = FS_GROUP_QUOTA;
+        /* Regular space and inode limits / usage */
+        dst->d_blk_hardlimit = quota_btobb(src->d_spc_hardlimit);
+        dst->d_blk_softlimit = quota_btobb(src->d_spc_softlimit);
+        dst->d_ino_hardlimit = src->d_ino_hardlimit;
+        dst->d_ino_softlimit = src->d_ino_softlimit;
+        dst->d_bcount = quota_btobb(src->d_space);
+        dst->d_icount = src->d_ino_count;
+        dst->d_itimer = src->d_ino_timer;
+        dst->d_btimer = src->d_spc_timer;
+        dst->d_iwarns = src->d_ino_warns;
+        dst->d_bwarns = src->d_spc_warns;
+        /* Realtime (d_rtb*) space limits / usage */
+        dst->d_rtb_hardlimit = quota_btobb(src->d_rt_spc_hardlimit);
+        dst->d_rtb_softlimit = quota_btobb(src->d_rt_spc_softlimit);
+        dst->d_rtbcount = quota_btobb(src->d_rt_space);
+        dst->d_rtbtimer = src->d_rt_spc_timer;
+        dst->d_rtbwarns = src->d_rt_spc_warns;
 }
 static int quota_getxquota(struct super_block *sb, int type, qid_t id,
                           void __user *addr)
 {
        struct fs_disk_quota fdq;
+        struct qc_dqblk qdq;
        struct kqid qid;
        int ret;
@@ -275,8 +417,11 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
        qid = make_kqid(current_user_ns(), type, id);
        if (!qid_valid(qid))
                return -EINVAL;
-        ret = sb->s_qcop->get_dqblk(sb, qid, &fdq);
+        ret = sb->s_qcop->get_dqblk(sb, qid, &qdq);
-        if (!ret && copy_to_user(addr, &fdq, sizeof(fdq)))
+        if (ret)
+                return ret;
+        copy_to_xfs_dqblk(&fdq, &qdq, type, id);
+        if (copy_to_user(addr, &fdq, sizeof(fdq)))
                return -EFAULT;
        return ret;
 }
@@ -317,9 +462,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
        case Q_QUOTAON:
                return quota_quotaon(sb, type, cmd, id, path);
        case Q_QUOTAOFF:
-                if (!sb->s_qcop->quota_off)
+                return quota_quotaoff(sb, type);
-                        return -ENOSYS;
-                return sb->s_qcop->quota_off(sb, type);
        case Q_GETFMT:
                return quota_getfmt(sb, type, addr);
        case Q_GETINFO:
@@ -335,8 +478,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
                        return -ENOSYS;
                return sb->s_qcop->quota_sync(sb, type);
        case Q_XQUOTAON:
+                return quota_enable(sb, addr);
        case Q_XQUOTAOFF:
-                return quota_setxstate(sb, cmd, addr);
+                return quota_disable(sb, addr);
        case Q_XQUOTARM:
                return quota_rmxquota(sb, addr);
        case Q_XGETQSTAT:
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 469c6848b322..8fe79beced5c 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -169,8 +169,8 @@ static int v1_read_file_info(struct super_block *sb, int type)
        }
        ret = 0;
        /* limits are stored as unsigned 32-bit data */
-        dqopt->info[type].dqi_maxblimit = 0xffffffff;
+        dqopt->info[type].dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS;
-        dqopt->info[type].dqi_maxilimit = 0xffffffff;
+        dqopt->info[type].dqi_max_ino_limit = 0xffffffff;
        dqopt->info[type].dqi_igrace =
                        dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME;
        dqopt->info[type].dqi_bgrace =
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 02751ec695c5..9cb10d7197f7 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -117,16 +117,17 @@ static int v2_read_file_info(struct super_block *sb, int type)
        qinfo = info->dqi_priv;
        if (version == 0) {
                /* limits are stored as unsigned 32-bit data */
-                info->dqi_maxblimit = 0xffffffff;
+                info->dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS;
-                info->dqi_maxilimit = 0xffffffff;
+                info->dqi_max_ino_limit = 0xffffffff;
        } else {
-                /* used space is stored as unsigned 64-bit value */
+                /* used space is stored as unsigned 64-bit value in bytes */
-                info->dqi_maxblimit = 0xffffffffffffffffULL;    /* 2^64-1 */
+                info->dqi_max_spc_limit = 0xffffffffffffffffULL; /* 2^64-1 */
-                info->dqi_maxilimit = 0xffffffffffffffffULL;
+                info->dqi_max_ino_limit = 0xffffffffffffffffULL;
        }
        info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
        info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
-        info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
+        /* No flags currently supported */
+        info->dqi_flags = 0;
        qinfo->dqi_sb = sb;
        qinfo->dqi_type = type;
        qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
@@ -157,7 +158,8 @@ static int v2_write_file_info(struct super_block *sb, int type)
        info->dqi_flags &= ~DQF_INFO_DIRTY;
        dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
        dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
-        dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
+        /* No flags currently supported */
+        dinfo.dqi_flags = cpu_to_le32(0);
        spin_unlock(&dq_data_lock);
        dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
        dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk);
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index bbafbde3471a..f6ab41b39612 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -34,7 +34,14 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
                                                   unsigned long flags);
 static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
+/*
+ * ->mmap_capabilities() hook for ramfs files on no-MMU kernels: every file
+ * reports the same fixed set of NOMMU_MAP_* flags, regardless of @file.
+ */
+static unsigned ramfs_mmap_capabilities(struct file *file)
+{
+        const unsigned caps = NOMMU_MAP_DIRECT | NOMMU_MAP_COPY |
+                              NOMMU_MAP_READ | NOMMU_MAP_WRITE |
+                              NOMMU_MAP_EXEC;
+        return caps;
+}
 const struct file_operations ramfs_file_operations = {
+        .mmap_capabilities      = ramfs_mmap_capabilities,
        .mmap                   = ramfs_nommu_mmap,
        .get_unmapped_area      = ramfs_nommu_get_unmapped_area,
        .read                   = new_sync_read,
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index d365b1c4eb3c..889d558b4e05 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -50,14 +50,6 @@ static const struct address_space_operations ramfs_aops = {
        .set_page_dirty = __set_page_dirty_no_writeback,
 };
-static struct backing_dev_info ramfs_backing_dev_info = {
-        .name           = "ramfs",
-        .ra_pages       = 0,    /* No readahead */
-        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK |
-                          BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
-                          BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
-};
 struct inode *ramfs_get_inode(struct super_block *sb,
                                const struct inode *dir, umode_t mode, dev_t dev)
 {
@@ -67,7 +59,6 @@ struct inode *ramfs_get_inode(struct super_block *sb,
                inode->i_ino = get_next_ino();
                inode_init_owner(inode, dir, mode);
                inode->i_mapping->a_ops = &ramfs_aops;
-                inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
                mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
                mapping_set_unevictable(inode->i_mapping);
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -267,19 +258,9 @@ static struct file_system_type ramfs_fs_type = {
 int __init init_ramfs_fs(void)
 {
        static unsigned long once;
-        int err;
        if (test_and_set_bit(0, &once))
                return 0;
+        return register_filesystem(&ramfs_fs_type);
-        err = bdi_init(&ramfs_backing_dev_info);
-        if (err)
-                return err;
-        err = register_filesystem(&ramfs_fs_type);
-        if (err)
-                bdi_destroy(&ramfs_backing_dev_info);
-        return err;
 }
 fs_initcall(init_ramfs_fs);
diff --git a/fs/read_write.c b/fs/read_write.c
index c0805c93b6fa..4060691e78f7 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -358,7 +358,7 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
                        return retval;
        }
-        if (unlikely(inode->i_flock && mandatory_lock(inode))) {
+        if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
                retval = locks_mandatory_area(
                        read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
                        inode, file, pos, count);
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index ea06c7554860..7da9e2153953 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -70,6 +70,15 @@ static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
        return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
 }
+/*
+ * ->mmap_capabilities() hook for romfs files on no-MMU kernels.
+ *
+ * When the superblock has an MTD device attached, report whatever that
+ * device advertises via mtd_mmap_capabilities(); otherwise report
+ * NOMMU_MAP_COPY only.
+ */
+static unsigned romfs_mmap_capabilities(struct file *file)
+{
+        struct super_block *sb = file_inode(file)->i_sb;
+        return sb->s_mtd ? mtd_mmap_capabilities(sb->s_mtd) : NOMMU_MAP_COPY;
+}
 const struct file_operations romfs_ro_fops = {
        .llseek                 = generic_file_llseek,
        .read                   = new_sync_read,
@@ -77,4 +86,5 @@ const struct file_operations romfs_ro_fops = {
        .splice_read            = generic_file_splice_read,
        .mmap                   = romfs_mmap,
        .get_unmapped_area      = romfs_get_unmapped_area,
+        .mmap_capabilities      = romfs_mmap_capabilities,
 };
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index e98dd88197d5..268733cda397 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -355,9 +355,6 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
        case ROMFH_REG:
                i->i_fop = &romfs_ro_fops;
                i->i_data.a_ops = &romfs_aops;
-                if (i->i_sb->s_mtd)
-                        i->i_data.backing_dev_info =
-                                i->i_sb->s_mtd->backing_dev_info;
                if (nextfh & ROMFH_EXEC)
                        mode |= S_IXUGO;
                break;
diff --git a/fs/super.c b/fs/super.c
index eae088f6aaae..05a021638b11 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -36,8 +36,8 @@
 #include "internal.h"
-LIST_HEAD(super_blocks);
+static LIST_HEAD(super_blocks);
-DEFINE_SPINLOCK(sb_lock);
+static DEFINE_SPINLOCK(sb_lock);
 static char *sb_writers_name[SB_FREEZE_LEVELS] = {
        "sb_writers",
@@ -185,8 +185,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
        }
        init_waitqueue_head(&s->s_writers.wait);
        init_waitqueue_head(&s->s_writers.wait_unfrozen);
+        s->s_bdi = &noop_backing_dev_info;
        s->s_flags = flags;
-        s->s_bdi = &default_backing_dev_info;
        INIT_HLIST_NODE(&s->s_instances);
        INIT_HLIST_BL_HEAD(&s->s_anon);
        INIT_LIST_HEAD(&s->s_inodes);
@@ -863,10 +863,7 @@ EXPORT_SYMBOL(free_anon_bdev);
 int set_anon_super(struct super_block *s, void *data)
 {
-        int error = get_anon_bdev(&s->s_dev);
+        return get_anon_bdev(&s->s_dev);
-        if (!error)
-                s->s_bdi = &noop_backing_dev_info;
-        return error;
 }
 EXPORT_SYMBOL(set_anon_super);
@@ -1111,7 +1108,6 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
        sb = root->d_sb;
        BUG_ON(!sb);
        WARN_ON(!sb->s_bdi);
-        WARN_ON(sb->s_bdi == &default_backing_dev_info);
        sb->s_flags |= MS_BORN;
        error = security_sb_kern_mount(sb, flags, secdata);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ea41649e4ca5..c49b1981ac95 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -108,8 +108,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
        inode->i_mtime = inode->i_atime = inode->i_ctime =
                         ubifs_current_time(inode);
        inode->i_mapping->nrpages = 0;
-        /* Disable readahead */
-        inode->i_mapping->backing_dev_info = &c->bdi;
        switch (mode & S_IFMT) {
        case S_IFREG:
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 538519ee37d9..035e51011444 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1536,7 +1536,6 @@ static const struct vm_operations_struct ubifs_file_vm_ops = {
        .fault        = filemap_fault,
        .map_pages = filemap_map_pages,
        .page_mkwrite = ubifs_vm_page_mkwrite,
-        .remap_pages = generic_file_remap_pages,
 };
 static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 106bf20629ce..6197154f36ca 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -156,9 +156,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
        if (err)
                goto out_invalid;
-        /* Disable read-ahead */
-        inode->i_mapping->backing_dev_info = &c->bdi;
        switch (inode->i_mode & S_IFMT) {
        case S_IFREG:
                inode->i_mapping->a_ops = &ubifs_file_address_operations;
@@ -2017,7 +2014,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
         * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
         */
        c->bdi.name = "ubifs",
-        c->bdi.capabilities = BDI_CAP_MAP_COPY;
+        c->bdi.capabilities = 0;
        err  = bdi_init(&c->bdi);
        if (err)
                goto out_close;
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 0e0e99bd6bce..c6e17a744c3b 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -2,10 +2,12 @@ config UDF_FS
        tristate "UDF file system support"
        select CRC_ITU_T
        help
-          This is the new file system used on some CD-ROMs and DVDs. Say Y if
+          This is a file system used on some CD-ROMs and DVDs. Since the
-          you intend to mount DVD discs or CDRW's written in packet mode, or
+          file system is supported by multiple operating systems and is more
-          if written to by other UDF utilities, such as DirectCD.
+          compatible with standard unix file systems, it is also suitable for
-          Please read <file:Documentation/filesystems/udf.txt>.
+          removable USB disks. Say Y if you intend to mount DVD discs or CDRW's
+          written in packet mode, or if you want to use UDF for removable USB
+          disks. Please read <file:Documentation/filesystems/udf.txt>.
          To compile this file system support as a module, choose M here: the
          module will be called udf.
diff --git a/fs/udf/file.c b/fs/udf/file.c
index bb15771b92ae..08f3555fbeac 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -224,7 +224,7 @@ out:
 static int udf_release_file(struct inode *inode, struct file *filp)
 {
        if (filp->f_mode & FMODE_WRITE &&
-            atomic_read(&inode->i_writecount) > 1) {
+            atomic_read(&inode->i_writecount) == 1) {
                /*
                 * Grab i_mutex to avoid races with writes changing i_size
                 * while we are running.
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 5bc71d9a674a..a445d599098d 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -750,7 +750,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
        /* Are we beyond EOF? */
        if (etype == -1) {
                int ret;
-                isBeyondEOF = 1;
+                isBeyondEOF = true;
                if (count) {
                        if (c)
                                laarr[0] = laarr[1];
@@ -792,7 +792,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
                endnum = c + 1;
                lastblock = 1;
        } else {
-                isBeyondEOF = 0;
+                isBeyondEOF = false;
                endnum = startnum = ((count > 2) ? 2 : count);
                /* if the current extent is in position 0,
@@ -1288,6 +1288,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode)
        struct kernel_lb_addr *iloc = &iinfo->i_location;
        unsigned int link_count;
        unsigned int indirections = 0;
+        int bs = inode->i_sb->s_blocksize;
        int ret = -EIO;
 reread:
@@ -1374,38 +1375,35 @@ reread:
        if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) {
                iinfo->i_efe = 1;
                iinfo->i_use = 0;
-                ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+                ret = udf_alloc_i_data(inode, bs -
                                        sizeof(struct extendedFileEntry));
                if (ret)
                        goto out;
                memcpy(iinfo->i_ext.i_data,
                       bh->b_data + sizeof(struct extendedFileEntry),
-                       inode->i_sb->s_blocksize -
+                       bs - sizeof(struct extendedFileEntry));
-                                        sizeof(struct extendedFileEntry));
        } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) {
                iinfo->i_efe = 0;
                iinfo->i_use = 0;
-                ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+                ret = udf_alloc_i_data(inode, bs - sizeof(struct fileEntry));
-                                                sizeof(struct fileEntry));
                if (ret)
                        goto out;
                memcpy(iinfo->i_ext.i_data,
                       bh->b_data + sizeof(struct fileEntry),
-                       inode->i_sb->s_blocksize - sizeof(struct fileEntry));
+                       bs - sizeof(struct fileEntry));
        } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) {
                iinfo->i_efe = 0;
                iinfo->i_use = 1;
                iinfo->i_lenAlloc = le32_to_cpu(
                                ((struct unallocSpaceEntry *)bh->b_data)->
                                 lengthAllocDescs);
-                ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+                ret = udf_alloc_i_data(inode, bs -
                                        sizeof(struct unallocSpaceEntry));
                if (ret)
                        goto out;
                memcpy(iinfo->i_ext.i_data,
                       bh->b_data + sizeof(struct unallocSpaceEntry),
-                       inode->i_sb->s_blocksize -
+                       bs - sizeof(struct unallocSpaceEntry));
-                                        sizeof(struct unallocSpaceEntry));
                return 0;
        }
@@ -1489,6 +1487,15 @@ reread:
        }
        inode->i_generation = iinfo->i_unique;
+        /*
+         * Sanity check length of allocation descriptors and extended attrs to
+         * avoid integer overflows
+         */
+        if (iinfo->i_lenEAttr > bs || iinfo->i_lenAlloc > bs)
+                goto out;
+        /* Now do exact checks */
+        if (udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc > bs)
+                goto out;
        /* Sanity checks for files in ICB so that we don't get confused later */
        if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
                /*
@@ -1498,8 +1505,7 @@ reread:
                if (iinfo->i_lenAlloc != inode->i_size)
                        goto out;
                /* File in ICB has to fit in there... */
-                if (inode->i_size > inode->i_sb->s_blocksize -
+                if (inode->i_size > bs - udf_file_entry_alloc_offset(inode))
-                                        udf_file_entry_alloc_offset(inode))
                        goto out;
        }
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 3ccb2f11fc76..f169411c4ea0 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1599,7 +1599,7 @@ static noinline int udf_process_sequence(
        struct udf_vds_record *curr;
        struct generic_desc *gd;
        struct volDescPtr *vdp;
-        int done = 0;
+        bool done = false;
        uint32_t vdsn;
        uint16_t ident;
        long next_s = 0, next_e = 0;
@@ -1680,7 +1680,7 @@ static noinline int udf_process_sequence(
                                lastblock = next_e;
                                next_s = next_e = 0;
                        } else
-                                done = 1;
+                                done = true;
                        break;
                }
                brelse(bh);
@@ -2300,6 +2300,7 @@ static void udf_put_super(struct super_block *sb)
                udf_close_lvid(sb);
        brelse(sbi->s_lvid_bh);
        udf_sb_free_partitions(sb);
+        mutex_destroy(&sbi->s_alloc_mutex);
        kfree(sb->s_fs_info);
        sb->s_fs_info = NULL;
 }
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 53e95b2a1369..a7a3a63bb360 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -91,16 +91,6 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
        return ptr;
 }
-void
-kmem_free(const void *ptr)
-{
-        if (!is_vmalloc_addr(ptr)) {
-                kfree(ptr);
-        } else {
-                vfree(ptr);
-        }
-}
 void *
 kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
             xfs_km_flags_t flags)
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 64db0e53edea..cc6b768fc068 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -63,7 +63,10 @@ kmem_flags_convert(xfs_km_flags_t flags)
 extern void *kmem_alloc(size_t, xfs_km_flags_t);
 extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
 extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
-extern void  kmem_free(const void *);
+/*
+ * Free memory obtained from the kmem_* allocators by handing @ptr straight
+ * to kvfree().
+ * NOTE(review): kvfree() is expected to accept both kmalloc'ed and vmalloc'ed
+ * pointers -- confirm against its definition.
+ */
+static inline void  kmem_free(const void *ptr)
+{
+        kvfree(ptr);
+}
 extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 5d38e8b8a913..15105dbc9e28 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -403,7 +403,7 @@ xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
                if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
                        xfs_sb_version_addattr2(&mp->m_sb);
                        spin_unlock(&mp->m_sb_lock);
-                        xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+                        xfs_log_sb(tp);
                } else
                        spin_unlock(&mp->m_sb_lock);
        }
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index b5eb4743f75a..61ec015dca16 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -973,7 +973,11 @@ xfs_bmap_local_to_extents(
        *firstblock = args.fsbno;
        bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
-        /* initialise the block and copy the data */
+        /*
+         * Initialise the block and copy the data
+         *
+         * Note: init_fn must set the buffer log item type correctly!
+         */
        init_fn(tp, bp, ip, ifp);
        /* account for the change in fork size and log everything */
@@ -1221,22 +1225,20 @@ xfs_bmap_add_attrfork(
                goto bmap_cancel;
        if (!xfs_sb_version_hasattr(&mp->m_sb) ||
           (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
-                __int64_t sbfields = 0;
+                bool log_sb = false;
                spin_lock(&mp->m_sb_lock);
                if (!xfs_sb_version_hasattr(&mp->m_sb)) {
                        xfs_sb_version_addattr(&mp->m_sb);
-                        sbfields |= XFS_SB_VERSIONNUM;
+                        log_sb = true;
                }
                if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
                        xfs_sb_version_addattr2(&mp->m_sb);
-                        sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+                        log_sb = true;
                }
-                if (sbfields) {
+                spin_unlock(&mp->m_sb_lock);
-                        spin_unlock(&mp->m_sb_lock);
+                if (log_sb)
-                        xfs_mod_sb(tp, sbfields);
+                        xfs_log_sb(tp);
-                } else
-                        spin_unlock(&mp->m_sb_lock);
        }
        error = xfs_bmap_finish(&tp, &flist, &committed);
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 44db6db86402..b9d8a499d2c4 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -28,6 +28,37 @@ struct xfs_trans;
 extern kmem_zone_t      *xfs_bmap_free_item_zone;
 /*
+ * Argument structure for xfs_bmap_alloc.
+ */
+struct xfs_bmalloca {
+        xfs_fsblock_t           *firstblock; /* i/o first block allocated */
+        struct xfs_bmap_free    *flist; /* bmap freelist */
+        struct xfs_trans        *tp;    /* transaction pointer */
+        struct xfs_inode        *ip;    /* incore inode pointer */
+        struct xfs_bmbt_irec    prev;   /* extent before the new one */
+        struct xfs_bmbt_irec    got;    /* extent after, or delayed */
+        xfs_fileoff_t           offset; /* offset in file filling in */
+        xfs_extlen_t            length; /* i/o length asked/allocated */
+        xfs_fsblock_t           blkno;  /* starting block of new extent */
+        struct xfs_btree_cur    *cur;   /* btree cursor */
+        xfs_extnum_t            idx;    /* current extent index */
+        int                     nallocs;/* number of extents alloc'd */
+        int                     logflags;/* flags for transaction logging */
+        xfs_extlen_t            total;  /* total blocks needed for xaction */
+        xfs_extlen_t            minlen; /* minimum allocation size (blocks) */
+        xfs_extlen_t            minleft; /* amount must be left after alloc */
+        bool                    eof;    /* set if allocating past last extent */
+        bool                    wasdel; /* replacing a delayed allocation */
+        bool                    userdata;/* set if is user data */
+        bool                    aeof;   /* allocated space at eof */
+        bool                    conv;   /* overwriting unwritten extents */
+        /* NOTE(review): undocumented; presumably the caller's bmapi flags -- confirm */
+        int                     flags;
+};
+/*
 * List of extents to be free "later".
 * The list is kept sorted on xbf_startblock.
 */
@@ -149,6 +180,8 @@ void	xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
 void    xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
                struct xfs_bmap_free *flist, struct xfs_mount *mp);
 void    xfs_bmap_cancel(struct xfs_bmap_free *flist);
+int     xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
+                        int *committed);
 void    xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
 int     xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
                xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index fbd6da263571..8eb718979383 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -151,10 +151,13 @@ typedef struct xfs_sb {
        __uint32_t      sb_features2;   /* additional feature bits */
        /*
-         * bad features2 field as a result of failing to pad the sb
+         * bad features2 field as a result of failing to pad the sb structure to
-         * structure to 64 bits. Some machines will be using this field
+         * 64 bits. Some machines will be using this field for features2 bits.
-         * for features2 bits. Easiest just to mark it bad and not use
+         * Easiest just to mark it bad and not use it for anything else.
-         * it for anything else.
+         *
+         * This is not kept up to date in memory; it is always overwritten by
+         * the value in sb_features2 when formatting the incore superblock to
+         * the disk buffer.
         */
        __uint32_t      sb_bad_features2;
@@ -304,8 +307,8 @@ typedef enum {
 #define XFS_SB_ICOUNT           XFS_SB_MVAL(ICOUNT)
 #define XFS_SB_IFREE            XFS_SB_MVAL(IFREE)
 #define XFS_SB_FDBLOCKS         XFS_SB_MVAL(FDBLOCKS)
-#define XFS_SB_FEATURES2        XFS_SB_MVAL(FEATURES2)
+#define XFS_SB_FEATURES2        (XFS_SB_MVAL(FEATURES2) | \
-#define XFS_SB_BAD_FEATURES2    XFS_SB_MVAL(BAD_FEATURES2)
+                                 XFS_SB_MVAL(BAD_FEATURES2))
 #define XFS_SB_FEATURES_COMPAT  XFS_SB_MVAL(FEATURES_COMPAT)
 #define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
 #define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
@@ -319,9 +322,9 @@ typedef enum {
         XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
         XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
         XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
-         XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \
+         XFS_SB_FEATURES_COMPAT | XFS_SB_FEATURES_RO_COMPAT | \
-         XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \
+         XFS_SB_FEATURES_INCOMPAT | XFS_SB_FEATURES_LOG_INCOMPAT | \
-         XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO)
+         XFS_SB_PQUOTINO)
 /*
@@ -453,13 +456,11 @@ static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
 {
        sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
        sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
-        sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT;
 }
 static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
 {
        sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
-        sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
        if (!sbp->sb_features2)
                sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
 }
@@ -475,7 +476,6 @@ static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
 {
        sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
        sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
-        sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
 }
 /*
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 18dc721ca19f..18dc721ca19f 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 752915fa775a..b0a5fe95a3e2 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -40,69 +40,6 @@
 * Physical superblock buffer manipulations. Shared with libxfs in userspace.
 */
-static const struct {
-        short offset;
-        short type;     /* 0 = integer
-                         * 1 = binary / string (no translation)
-                         */
-} xfs_sb_info[] = {
-        { offsetof(xfs_sb_t, sb_magicnum),      0 },
-        { offsetof(xfs_sb_t, sb_blocksize),     0 },
-        { offsetof(xfs_sb_t, sb_dblocks),       0 },
-        { offsetof(xfs_sb_t, sb_rblocks),       0 },
-        { offsetof(xfs_sb_t, sb_rextents),      0 },
-        { offsetof(xfs_sb_t, sb_uuid),          1 },
-        { offsetof(xfs_sb_t, sb_logstart),      0 },
-        { offsetof(xfs_sb_t, sb_rootino),       0 },
-        { offsetof(xfs_sb_t, sb_rbmino),        0 },
-        { offsetof(xfs_sb_t, sb_rsumino),       0 },
-        { offsetof(xfs_sb_t, sb_rextsize),      0 },
-        { offsetof(xfs_sb_t, sb_agblocks),      0 },
-        { offsetof(xfs_sb_t, sb_agcount),       0 },
-        { offsetof(xfs_sb_t, sb_rbmblocks),     0 },
-        { offsetof(xfs_sb_t, sb_logblocks),     0 },
-        { offsetof(xfs_sb_t, sb_versionnum),    0 },
-        { offsetof(xfs_sb_t, sb_sectsize),      0 },
-        { offsetof(xfs_sb_t, sb_inodesize),     0 },
-        { offsetof(xfs_sb_t, sb_inopblock),     0 },
-        { offsetof(xfs_sb_t, sb_fname[0]),      1 },
-        { offsetof(xfs_sb_t, sb_blocklog),      0 },
-        { offsetof(xfs_sb_t, sb_sectlog),       0 },
-        { offsetof(xfs_sb_t, sb_inodelog),      0 },
-        { offsetof(xfs_sb_t, sb_inopblog),      0 },
-        { offsetof(xfs_sb_t, sb_agblklog),      0 },
-        { offsetof(xfs_sb_t, sb_rextslog),      0 },
-        { offsetof(xfs_sb_t, sb_inprogress),    0 },
-        { offsetof(xfs_sb_t, sb_imax_pct),      0 },
-        { offsetof(xfs_sb_t, sb_icount),        0 },
-        { offsetof(xfs_sb_t, sb_ifree),         0 },
-        { offsetof(xfs_sb_t, sb_fdblocks),      0 },
-        { offsetof(xfs_sb_t, sb_frextents),     0 },
-        { offsetof(xfs_sb_t, sb_uquotino),      0 },
-        { offsetof(xfs_sb_t, sb_gquotino),      0 },
-        { offsetof(xfs_sb_t, sb_qflags),        0 },
-        { offsetof(xfs_sb_t, sb_flags),         0 },
-        { offsetof(xfs_sb_t, sb_shared_vn),     0 },
-        { offsetof(xfs_sb_t, sb_inoalignmt),    0 },
-        { offsetof(xfs_sb_t, sb_unit),          0 },
-        { offsetof(xfs_sb_t, sb_width),         0 },
-        { offsetof(xfs_sb_t, sb_dirblklog),     0 },
-        { offsetof(xfs_sb_t, sb_logsectlog),    0 },
-        { offsetof(xfs_sb_t, sb_logsectsize),   0 },
-        { offsetof(xfs_sb_t, sb_logsunit),      0 },
-        { offsetof(xfs_sb_t, sb_features2),     0 },
-        { offsetof(xfs_sb_t, sb_bad_features2), 0 },
-        { offsetof(xfs_sb_t, sb_features_compat),       0 },
-        { offsetof(xfs_sb_t, sb_features_ro_compat),    0 },
-        { offsetof(xfs_sb_t, sb_features_incompat),     0 },
-        { offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
-        { offsetof(xfs_sb_t, sb_crc),           0 },
-        { offsetof(xfs_sb_t, sb_pad),           0 },
-        { offsetof(xfs_sb_t, sb_pquotino),      0 },
-        { offsetof(xfs_sb_t, sb_lsn),           0 },
-        { sizeof(xfs_sb_t),                     0 }
-};
 /*
 * Reference counting access wrappers to the perag structures.
 * Because we never free per-ag structures, the only thing we
@@ -461,58 +398,49 @@ xfs_sb_from_disk(
        __xfs_sb_from_disk(to, from, true);
 }
-static inline void
+static void
 xfs_sb_quota_to_disk(
-        xfs_dsb_t       *to,
+        struct xfs_dsb  *to,
-        xfs_sb_t        *from,
+        struct xfs_sb   *from)
-        __int64_t       *fields)
 {
        __uint16_t      qflags = from->sb_qflags;
+        to->sb_uquotino = cpu_to_be64(from->sb_uquotino);
+        if (xfs_sb_version_has_pquotino(from)) {
+                to->sb_qflags = cpu_to_be16(from->sb_qflags);
+                to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
+                to->sb_pquotino = cpu_to_be64(from->sb_pquotino);
+                return;
+        }
        /*
-         * We need to do these manipilations only if we are working
+         * The in-core version of sb_qflags does not have XFS_OQUOTA_*
-         * with an older version of on-disk superblock.
+         * flags, whereas the on-disk version does.  So, convert incore
+         * XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags.
         */
-        if (xfs_sb_version_has_pquotino(from))
+        qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
-                return;
+                        XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
-        if (*fields & XFS_SB_QFLAGS) {
+        if (from->sb_qflags &
-                /*
+                        (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
-                 * The in-core version of sb_qflags do not have
+                qflags |= XFS_OQUOTA_ENFD;
-                 * XFS_OQUOTA_* flags, whereas the on-disk version
+        if (from->sb_qflags &
-                 * does.  So, convert incore XFS_{PG}QUOTA_* flags
+                        (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
-                 * to on-disk XFS_OQUOTA_* flags.
+                qflags |= XFS_OQUOTA_CHKD;
-                 */
+        to->sb_qflags = cpu_to_be16(qflags);
-                qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
-                                XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
-                if (from->sb_qflags &
-                                (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
-                        qflags |= XFS_OQUOTA_ENFD;
-                if (from->sb_qflags &
-                                (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
-                        qflags |= XFS_OQUOTA_CHKD;
-                to->sb_qflags = cpu_to_be16(qflags);
-                *fields &= ~XFS_SB_QFLAGS;
-        }
        /*
-         * GQUOTINO and PQUOTINO cannot be used together in versions of
+         * GQUOTINO and PQUOTINO cannot be used together in versions
-         * superblock that do not have pquotino. from->sb_flags tells us which
+         * of the superblock that do not have pquotino. from->sb_qflags
-         * quota is active and should be copied to disk. If neither are active,
+         * tells us which quota is active and should be copied to
-         * make sure we write NULLFSINO to the sb_gquotino field as a quota
+         * disk. If neither are active, we should NULL the inode.
-         * inode value of "0" is invalid when the XFS_SB_VERSION_QUOTA feature
-         * bit is set.
         *
-         * Note that we don't need to handle the sb_uquotino or sb_pquotino here
+         * In all cases, the separate pquotino must remain 0 because it
-         * as they do not require any translation. Hence the main sb field loop
+         * is beyond the "end" of the valid non-pquotino superblock.
-         * will write them appropriately from the in-core superblock.
         */
-        if ((*fields & XFS_SB_GQUOTINO) &&
+        if (from->sb_qflags & XFS_GQUOTA_ACCT)
-                                (from->sb_qflags & XFS_GQUOTA_ACCT))
                to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
-        else if ((*fields & XFS_SB_PQUOTINO) &&
+        else if (from->sb_qflags & XFS_PQUOTA_ACCT)
-                                (from->sb_qflags & XFS_PQUOTA_ACCT))
                to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
        else {
                /*
@@ -526,63 +454,78 @@ xfs_sb_quota_to_disk(
                        to->sb_gquotino = cpu_to_be64(NULLFSINO);
        }
-        *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO);
+        to->sb_pquotino = 0;
 }
-/*
- * Copy in core superblock to ondisk one.
- *
- * The fields argument is mask of superblock fields to copy.
- */
 void
 xfs_sb_to_disk(
-        xfs_dsb_t       *to,
+        struct xfs_dsb  *to,
-        xfs_sb_t        *from,
+        struct xfs_sb   *from)
-        __int64_t       fields)
 {
-        xfs_caddr_t     to_ptr = (xfs_caddr_t)to;
+        xfs_sb_quota_to_disk(to, from);
-        xfs_caddr_t     from_ptr = (xfs_caddr_t)from;
-        xfs_sb_field_t  f;
-        int             first;
-        int             size;
-        ASSERT(fields);
-        if (!fields)
-                return;
-        /* We should never write the crc here, it's updated in the IO path */
+        to->sb_magicnum = cpu_to_be32(from->sb_magicnum);
-        fields &= ~XFS_SB_CRC;
+        to->sb_blocksize = cpu_to_be32(from->sb_blocksize);
+        to->sb_dblocks = cpu_to_be64(from->sb_dblocks);
-        xfs_sb_quota_to_disk(to, from, &fields);
+        to->sb_rblocks = cpu_to_be64(from->sb_rblocks);
-        while (fields) {
+        to->sb_rextents = cpu_to_be64(from->sb_rextents);
-                f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+        memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
-                first = xfs_sb_info[f].offset;
+        to->sb_logstart = cpu_to_be64(from->sb_logstart);
-                size = xfs_sb_info[f + 1].offset - first;
+        to->sb_rootino = cpu_to_be64(from->sb_rootino);
+        to->sb_rbmino = cpu_to_be64(from->sb_rbmino);
-                ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
+        to->sb_rsumino = cpu_to_be64(from->sb_rsumino);
+        to->sb_rextsize = cpu_to_be32(from->sb_rextsize);
-                if (size == 1 || xfs_sb_info[f].type == 1) {
+        to->sb_agblocks = cpu_to_be32(from->sb_agblocks);
-                        memcpy(to_ptr + first, from_ptr + first, size);
+        to->sb_agcount = cpu_to_be32(from->sb_agcount);
-                } else {
+        to->sb_rbmblocks = cpu_to_be32(from->sb_rbmblocks);
-                        switch (size) {
+        to->sb_logblocks = cpu_to_be32(from->sb_logblocks);
-                        case 2:
+        to->sb_versionnum = cpu_to_be16(from->sb_versionnum);
-                                *(__be16 *)(to_ptr + first) =
+        to->sb_sectsize = cpu_to_be16(from->sb_sectsize);
-                                      cpu_to_be16(*(__u16 *)(from_ptr + first));
+        to->sb_inodesize = cpu_to_be16(from->sb_inodesize);
-                                break;
+        to->sb_inopblock = cpu_to_be16(from->sb_inopblock);
-                        case 4:
+        memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
-                                *(__be32 *)(to_ptr + first) =
+        to->sb_blocklog = from->sb_blocklog;
-                                      cpu_to_be32(*(__u32 *)(from_ptr + first));
+        to->sb_sectlog = from->sb_sectlog;
-                                break;
+        to->sb_inodelog = from->sb_inodelog;
-                        case 8:
+        to->sb_inopblog = from->sb_inopblog;
-                                *(__be64 *)(to_ptr + first) =
+        to->sb_agblklog = from->sb_agblklog;
-                                      cpu_to_be64(*(__u64 *)(from_ptr + first));
+        to->sb_rextslog = from->sb_rextslog;
-                                break;
+        to->sb_inprogress = from->sb_inprogress;
-                        default:
+        to->sb_imax_pct = from->sb_imax_pct;
-                                ASSERT(0);
+        to->sb_icount = cpu_to_be64(from->sb_icount);
-                        }
+        to->sb_ifree = cpu_to_be64(from->sb_ifree);
-                }
+        to->sb_fdblocks = cpu_to_be64(from->sb_fdblocks);
+        to->sb_frextents = cpu_to_be64(from->sb_frextents);
-                fields &= ~(1LL << f);
+        to->sb_flags = from->sb_flags;
+        to->sb_shared_vn = from->sb_shared_vn;
+        to->sb_inoalignmt = cpu_to_be32(from->sb_inoalignmt);
+        to->sb_unit = cpu_to_be32(from->sb_unit);
+        to->sb_width = cpu_to_be32(from->sb_width);
+        to->sb_dirblklog = from->sb_dirblklog;
+        to->sb_logsectlog = from->sb_logsectlog;
+        to->sb_logsectsize = cpu_to_be16(from->sb_logsectsize);
+        to->sb_logsunit = cpu_to_be32(from->sb_logsunit);
+        /*
+         * We need to ensure that bad_features2 always matches features2.
+         * Hence we enforce that here rather than having to remember to do it
+         * everywhere else that updates features2.
+         */
+        from->sb_bad_features2 = from->sb_features2;
+        to->sb_features2 = cpu_to_be32(from->sb_features2);
+        to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2);
+        if (xfs_sb_version_hascrc(from)) {
+                to->sb_features_compat = cpu_to_be32(from->sb_features_compat);
+                to->sb_features_ro_compat =
+                                cpu_to_be32(from->sb_features_ro_compat);
+                to->sb_features_incompat =
+                                cpu_to_be32(from->sb_features_incompat);
+                to->sb_features_log_incompat =
+                                cpu_to_be32(from->sb_features_log_incompat);
+                to->sb_pad = 0;
+                to->sb_lsn = cpu_to_be64(from->sb_lsn);
        }
 }
@@ -816,42 +759,51 @@ xfs_initialize_perag_data(
 }
 /*
- * xfs_mod_sb() can be used to copy arbitrary changes to the
+ * xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock
- * in-core superblock into the superblock buffer to be logged.
+ * into the superblock buffer to be logged.  It does not provide the higher
- * It does not provide the higher level of locking that is
+ * level of locking that is needed to protect the in-core superblock from
- * needed to protect the in-core superblock from concurrent
+ * concurrent access.
- * access.
 */
 void
-xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
+xfs_log_sb(
+        struct xfs_trans        *tp)
 {
-        xfs_buf_t       *bp;
+        struct xfs_mount        *mp = tp->t_mountp;
-        int             first;
+        struct xfs_buf          *bp = xfs_trans_getsb(tp, mp, 0);
-        int             last;
-        xfs_mount_t     *mp;
-        xfs_sb_field_t  f;
-        ASSERT(fields);
-        if (!fields)
-                return;
-        mp = tp->t_mountp;
-        bp = xfs_trans_getsb(tp, mp, 0);
-        first = sizeof(xfs_sb_t);
-        last = 0;
-        /* translate/copy */
-        xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
+        xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+        xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
+        xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb));
+}
-        /* find modified range */
+/*
-        f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
+ * xfs_sync_sb
-        ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+ *
-        last = xfs_sb_info[f + 1].offset - 1;
+ * Sync the superblock to disk.
+ *
+ * Note that the caller is responsible for checking the frozen state of the
+ * filesystem. This procedure uses the non-blocking transaction allocator and
+ * thus will allow modifications to a frozen fs. This is required because this
+ * code can be called during the process of freezing where use of the high-level
+ * allocator would deadlock.
+ */
+int
+xfs_sync_sb(
+        struct xfs_mount        *mp,
+        bool                    wait)
+{
+        struct xfs_trans        *tp;
+        int                     error;
-        f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+        tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
-        ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
-        first = xfs_sb_info[f].offset;
+        if (error) {
+                xfs_trans_cancel(tp, 0);
+                return error;
+        }
-        xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
+        xfs_log_sb(tp);
-        xfs_trans_log_buf(tp, bp, first, last);
+        if (wait)
+                xfs_trans_set_sync(tp);
+        return xfs_trans_commit(tp, 0);
 }
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 8eb1c54bafbf..b25bb9a343f3 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -27,11 +27,12 @@ extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
 extern void     xfs_perag_put(struct xfs_perag *pag);
 extern int      xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
-extern void     xfs_sb_calc_crc(struct xfs_buf  *);
+extern void     xfs_sb_calc_crc(struct xfs_buf *bp);
-extern void     xfs_mod_sb(struct xfs_trans *, __int64_t);
+extern void     xfs_log_sb(struct xfs_trans *tp);
-extern void     xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *);
+extern int      xfs_sync_sb(struct xfs_mount *mp, bool wait);
-extern void     xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
+extern void     xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
-extern void     xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
+extern void     xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from);
+extern void     xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from);
 extern void     xfs_sb_quota_from_disk(struct xfs_sb *sbp);
 #endif  /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 82404da2ca67..8dda4b321343 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -82,7 +82,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 #define XFS_TRANS_ATTR_RM               23
 #define XFS_TRANS_ATTR_FLAG             24
 #define XFS_TRANS_CLEAR_AGI_BUCKET      25
-#define XFS_TRANS_QM_SBCHANGE           26
+#define XFS_TRANS_SB_CHANGE             26
 /*
 * Dummy entries since we use the transaction type to index into the
 * trans_type[] in xlog_recover_print_trans_head()
@@ -95,17 +95,15 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 #define XFS_TRANS_QM_DQCLUSTER          32
 #define XFS_TRANS_QM_QINOCREATE         33
 #define XFS_TRANS_QM_QUOTAOFF_END       34
-#define XFS_TRANS_SB_UNIT               35
+#define XFS_TRANS_FSYNC_TS              35
-#define XFS_TRANS_FSYNC_TS              36
+#define XFS_TRANS_GROWFSRT_ALLOC        36
-#define XFS_TRANS_GROWFSRT_ALLOC        37
+#define XFS_TRANS_GROWFSRT_ZERO         37
-#define XFS_TRANS_GROWFSRT_ZERO         38
+#define XFS_TRANS_GROWFSRT_FREE         38
-#define XFS_TRANS_GROWFSRT_FREE         39
+#define XFS_TRANS_SWAPEXT               39
-#define XFS_TRANS_SWAPEXT               40
+#define XFS_TRANS_CHECKPOINT            40
-#define XFS_TRANS_SB_COUNT              41
+#define XFS_TRANS_ICREATE               41
-#define XFS_TRANS_CHECKPOINT            42
+#define XFS_TRANS_CREATE_TMPFILE        42
-#define XFS_TRANS_ICREATE               43
+#define XFS_TRANS_TYPE_MAX              43
-#define XFS_TRANS_CREATE_TMPFILE        44
-#define XFS_TRANS_TYPE_MAX              44
 /* new transaction types need to be reflected in xfs_logprint(8) */
 #define XFS_TRANS_TYPES \
@@ -113,7 +111,6 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
        { XFS_TRANS_SETATTR_SIZE,       "SETATTR_SIZE" }, \
        { XFS_TRANS_INACTIVE,           "INACTIVE" }, \
        { XFS_TRANS_CREATE,             "CREATE" }, \
-        { XFS_TRANS_CREATE_TMPFILE,     "CREATE_TMPFILE" }, \
        { XFS_TRANS_CREATE_TRUNC,       "CREATE_TRUNC" }, \
        { XFS_TRANS_TRUNCATE_FILE,      "TRUNCATE_FILE" }, \
        { XFS_TRANS_REMOVE,             "REMOVE" }, \
@@ -134,23 +131,23 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
        { XFS_TRANS_ATTR_RM,            "ATTR_RM" }, \
        { XFS_TRANS_ATTR_FLAG,          "ATTR_FLAG" }, \
        { XFS_TRANS_CLEAR_AGI_BUCKET,   "CLEAR_AGI_BUCKET" }, \
-        { XFS_TRANS_QM_SBCHANGE,        "QM_SBCHANGE" }, \
+        { XFS_TRANS_SB_CHANGE,          "SBCHANGE" }, \
+        { XFS_TRANS_DUMMY1,             "DUMMY1" }, \
+        { XFS_TRANS_DUMMY2,             "DUMMY2" }, \
        { XFS_TRANS_QM_QUOTAOFF,        "QM_QUOTAOFF" }, \
        { XFS_TRANS_QM_DQALLOC,         "QM_DQALLOC" }, \
        { XFS_TRANS_QM_SETQLIM,         "QM_SETQLIM" }, \
        { XFS_TRANS_QM_DQCLUSTER,       "QM_DQCLUSTER" }, \
        { XFS_TRANS_QM_QINOCREATE,      "QM_QINOCREATE" }, \
        { XFS_TRANS_QM_QUOTAOFF_END,    "QM_QOFF_END" }, \
-        { XFS_TRANS_SB_UNIT,            "SB_UNIT" }, \
        { XFS_TRANS_FSYNC_TS,           "FSYNC_TS" }, \
        { XFS_TRANS_GROWFSRT_ALLOC,     "GROWFSRT_ALLOC" }, \
        { XFS_TRANS_GROWFSRT_ZERO,      "GROWFSRT_ZERO" }, \
        { XFS_TRANS_GROWFSRT_FREE,      "GROWFSRT_FREE" }, \
        { XFS_TRANS_SWAPEXT,            "SWAPEXT" }, \
-        { XFS_TRANS_SB_COUNT,           "SB_COUNT" }, \
        { XFS_TRANS_CHECKPOINT,         "CHECKPOINT" }, \
-        { XFS_TRANS_DUMMY1,             "DUMMY1" }, \
+        { XFS_TRANS_ICREATE,            "ICREATE" }, \
-        { XFS_TRANS_DUMMY2,             "DUMMY2" }, \
+        { XFS_TRANS_CREATE_TMPFILE,     "CREATE_TMPFILE" }, \
        { XLOG_UNMOUNT_REC_TYPE,        "UNMOUNT" }
 /*
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index c80c5236c3da..e7e26bd6468f 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -178,6 +178,8 @@ xfs_symlink_local_to_remote(
        struct xfs_mount        *mp = ip->i_mount;
        char                    *buf;
+        xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
        if (!xfs_sb_version_hascrc(&mp->m_sb)) {
                bp->b_ops = NULL;
                memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 6c1330f29050..68cb1e7bf2bb 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -716,17 +716,6 @@ xfs_calc_clear_agi_bucket_reservation(
 }
 /*
- * Clearing the quotaflags in the superblock.
- *      the super block for changing quota flags: sector size
- */
-STATIC uint
-xfs_calc_qm_sbchange_reservation(
-        struct xfs_mount        *mp)
-{
-        return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-/*
 * Adjusting quota limits.
 *    the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
 */
@@ -864,9 +853,6 @@ xfs_trans_resv_calc(
         * The following transactions are logged in logical format with
         * a default log count.
         */
-        resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp);
-        resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT;
        resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
        resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 1097d14cd583..2d5bdfce6d8f 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -56,7 +56,6 @@ struct xfs_trans_resv {
        struct xfs_trans_res    tr_growrtalloc; /* grow realtime allocations */
        struct xfs_trans_res    tr_growrtzero;  /* grow realtime zeroing */
        struct xfs_trans_res    tr_growrtfree;  /* grow realtime freeing */
-        struct xfs_trans_res    tr_qm_sbchange; /* change quota flags */
        struct xfs_trans_res    tr_qm_setqlim;  /* adjust quota limits */
        struct xfs_trans_res    tr_qm_dqalloc;  /* allocate quota on disk */
        struct xfs_trans_res    tr_qm_quotaoff; /* turn quota off */
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index b79dc66b2ecd..b79dc66b2ecd 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 18e2f3bbae5e..3a9b7a1b8704 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -135,30 +135,22 @@ xfs_setfilesize_trans_alloc(
 */
 STATIC int
 xfs_setfilesize(
-        struct xfs_ioend        *ioend)
+        struct xfs_inode        *ip,
+        struct xfs_trans        *tp,
+        xfs_off_t               offset,
+        size_t                  size)
 {
-        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
-        struct xfs_trans        *tp = ioend->io_append_trans;
        xfs_fsize_t             isize;
-        /*
-         * The transaction may have been allocated in the I/O submission thread,
-         * thus we need to mark ourselves as beeing in a transaction manually.
-         * Similarly for freeze protection.
-         */
-        current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
-        rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
-                           0, 1, _THIS_IP_);
        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
+        isize = xfs_new_eof(ip, offset + size);
        if (!isize) {
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                xfs_trans_cancel(tp, 0);
                return 0;
        }
-        trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
+        trace_xfs_setfilesize(ip, offset, size);
        ip->i_d.di_size = isize;
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
@@ -167,6 +159,25 @@ xfs_setfilesize(
        return xfs_trans_commit(tp, 0);
 }
+STATIC int
+xfs_setfilesize_ioend(
+        struct xfs_ioend        *ioend)
+{
+        struct xfs_inode        *ip = XFS_I(ioend->io_inode);
+        struct xfs_trans        *tp = ioend->io_append_trans;
+        /*
+         * The transaction may have been allocated in the I/O submission thread,
+         * thus we need to mark ourselves as being in a transaction manually.
+         * Similarly for freeze protection.
+         */
+        current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+        rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+                           0, 1, _THIS_IP_);
+        return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
+}
 /*
 * Schedule IO completion handling on the final put of an ioend.
 *
@@ -182,8 +193,7 @@ xfs_finish_ioend(
                if (ioend->io_type == XFS_IO_UNWRITTEN)
                        queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
-                else if (ioend->io_append_trans ||
+                else if (ioend->io_append_trans)
-                         (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
                        queue_work(mp->m_data_workqueue, &ioend->io_work);
                else
                        xfs_destroy_ioend(ioend);
@@ -215,22 +225,8 @@ xfs_end_io(
        if (ioend->io_type == XFS_IO_UNWRITTEN) {
                error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
                                                  ioend->io_size);
-        } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
-                /*
-                 * For direct I/O we do not know if we need to allocate blocks
-                 * or not so we can't preallocate an append transaction as that
-                 * results in nested reservations and log space deadlocks. Hence
-                 * allocate the transaction here. While this is sub-optimal and
-                 * can block IO completion for some time, we're stuck with doing
-                 * it this way until we can pass the ioend to the direct IO
-                 * allocation callbacks and avoid nesting that way.
-                 */
-                error = xfs_setfilesize_trans_alloc(ioend);
-                if (error)
-                        goto done;
-                error = xfs_setfilesize(ioend);
        } else if (ioend->io_append_trans) {
-                error = xfs_setfilesize(ioend);
+                error = xfs_setfilesize_ioend(ioend);
        } else {
                ASSERT(!xfs_ioend_is_append(ioend));
        }
@@ -242,17 +238,6 @@ done:
 }
 /*
- * Call IO completion handling in caller context on the final put of an ioend.
- */
-STATIC void
-xfs_finish_ioend_sync(
-        struct xfs_ioend        *ioend)
-{
-        if (atomic_dec_and_test(&ioend->io_remaining))
-                xfs_end_io(&ioend->io_work);
-}
-/*
 * Allocate and initialise an IO completion structure.
 * We need to track unwritten extent write completion here initially.
 * We'll need to extend this for updating the ondisk inode size later
@@ -273,7 +258,6 @@ xfs_alloc_ioend(
         * all the I/O from calling the completion routine too early.
         */
        atomic_set(&ioend->io_remaining, 1);
-        ioend->io_isdirect = 0;
        ioend->io_error = 0;
        ioend->io_list = NULL;
        ioend->io_type = type;
@@ -1459,11 +1443,7 @@ xfs_get_blocks_direct(
 *
 * If the private argument is non-NULL __xfs_get_blocks signals us that we
 * need to issue a transaction to convert the range from unwritten to written
- * extents.  In case this is regular synchronous I/O we just call xfs_end_io
+ * extents.
- * to do this and we are done.  But in case this was a successful AIO
- * request this handler is called from interrupt context, from which we
- * can't start transactions.  In that case offload the I/O completion to
- * the workqueues we also use for buffered I/O completion.
 */
 STATIC void
 xfs_end_io_direct_write(
@@ -1472,7 +1452,12 @@ xfs_end_io_direct_write(
        ssize_t                 size,
        void                    *private)
 {
-        struct xfs_ioend        *ioend = iocb->private;
+        struct inode            *inode = file_inode(iocb->ki_filp);
+        struct xfs_inode        *ip = XFS_I(inode);
+        struct xfs_mount        *mp = ip->i_mount;
+        if (XFS_FORCED_SHUTDOWN(mp))
+                return;
        /*
         * While the generic direct I/O code updates the inode size, it does
@@ -1480,22 +1465,33 @@ xfs_end_io_direct_write(
         * end_io handler thinks the on-disk size is outside the in-core
         * size.  To prevent this just update it a little bit earlier here.
         */
-        if (offset + size > i_size_read(ioend->io_inode))
+        if (offset + size > i_size_read(inode))
-                i_size_write(ioend->io_inode, offset + size);
+                i_size_write(inode, offset + size);
        /*
-         * blockdev_direct_IO can return an error even after the I/O
+         * For direct I/O we do not know if we need to allocate blocks or not,
-         * completion handler was called.  Thus we need to protect
+         * so we can't preallocate an append transaction, as that results in
-         * against double-freeing.
+         * nested reservations and log space deadlocks. Hence allocate the
+         * transaction here. While this is sub-optimal and can block IO
+         * completion for some time, we're stuck with doing it this way until
+         * we can pass the ioend to the direct IO allocation callbacks and
+         * avoid nesting that way.
         */
-        iocb->private = NULL;
+        if (private && size > 0) {
+                xfs_iomap_write_unwritten(ip, offset, size);
-        ioend->io_offset = offset;
+        } else if (offset + size > ip->i_d.di_size) {
-        ioend->io_size = size;
+                struct xfs_trans        *tp;
-        if (private && size > 0)
+                int                     error;
-                ioend->io_type = XFS_IO_UNWRITTEN;
+                tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+                error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
+                if (error) {
+                        xfs_trans_cancel(tp, 0);
+                        return;
+                }
-        xfs_finish_ioend_sync(ioend);
+                xfs_setfilesize(ip, tp, offset, size);
+        }
 }
 STATIC ssize_t
@@ -1507,39 +1503,16 @@ xfs_vm_direct_IO(
 {
        struct inode            *inode = iocb->ki_filp->f_mapping->host;
        struct block_device     *bdev = xfs_find_bdev_for_inode(inode);
-        struct xfs_ioend        *ioend = NULL;
-        ssize_t                 ret;
        if (rw & WRITE) {
-                size_t size = iov_iter_count(iter);
+                return __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
-                /*
-                 * We cannot preallocate a size update transaction here as we
-                 * don't know whether allocation is necessary or not. Hence we
-                 * can only tell IO completion that one is necessary if we are
-                 * not doing unwritten extent conversion.
-                 */
-                iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
-                if (offset + size > XFS_I(inode)->i_d.di_size)
-                        ioend->io_isdirect = 1;
-                ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
                                            offset, xfs_get_blocks_direct,
                                            xfs_end_io_direct_write, NULL,
                                            DIO_ASYNC_EXTEND);
-                if (ret != -EIOCBQUEUED && iocb->private)
-                        goto out_destroy_ioend;
-        } else {
-                ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
-                                            offset, xfs_get_blocks_direct,
-                                            NULL, NULL, 0);
        }
+        return __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
-        return ret;
+                                    offset, xfs_get_blocks_direct,
+                                    NULL, NULL, 0);
-out_destroy_ioend:
-        xfs_destroy_ioend(ioend);
-        return ret;
 }
 /*
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index f94dd459dff9..ac644e0137a4 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -24,14 +24,12 @@ extern mempool_t *xfs_ioend_pool;
 * Types of I/O for bmap clustering and I/O completion tracking.
 */
 enum {
-        XFS_IO_DIRECT = 0,      /* special case for direct I/O ioends */
        XFS_IO_DELALLOC,        /* covers delalloc region */
        XFS_IO_UNWRITTEN,       /* covers allocated but uninitialized data */
        XFS_IO_OVERWRITE,       /* covers already allocated extent */
 };
 #define XFS_IO_TYPES \
-        { 0,                    "" }, \
        { XFS_IO_DELALLOC,              "delalloc" }, \
        { XFS_IO_UNWRITTEN,             "unwritten" }, \
        { XFS_IO_OVERWRITE,             "overwrite" }
@@ -45,7 +43,6 @@ typedef struct xfs_ioend {
        unsigned int            io_type;        /* delalloc / unwritten */
        int                     io_error;       /* I/O error code */
        atomic_t                io_remaining;   /* hold count */
-        unsigned int            io_isdirect : 1;/* direct I/O */
        struct inode            *io_inode;      /* file being written to */
        struct buffer_head      *io_buffer_head;/* buffer linked list head */
        struct buffer_head      *io_buffer_tail;/* buffer linked list tail */
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 2fdb72d2c908..736429a72a12 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -26,43 +26,8 @@ struct xfs_ifork;
 struct xfs_inode;
 struct xfs_mount;
 struct xfs_trans;
+struct xfs_bmalloca;
-/*
- * Argument structure for xfs_bmap_alloc.
- */
-struct xfs_bmalloca {
-        xfs_fsblock_t           *firstblock; /* i/o first block allocated */
-        struct xfs_bmap_free    *flist; /* bmap freelist */
-        struct xfs_trans        *tp;    /* transaction pointer */
-        struct xfs_inode        *ip;    /* incore inode pointer */
-        struct xfs_bmbt_irec    prev;   /* extent before the new one */
-        struct xfs_bmbt_irec    got;    /* extent after, or delayed */
-        xfs_fileoff_t           offset; /* offset in file filling in */
-        xfs_extlen_t            length; /* i/o length asked/allocated */
-        xfs_fsblock_t           blkno;  /* starting block of new extent */
-        struct xfs_btree_cur    *cur;   /* btree cursor */
-        xfs_extnum_t            idx;    /* current extent index */
-        int                     nallocs;/* number of extents alloc'd */
-        int                     logflags;/* flags for transaction logging */
-        xfs_extlen_t            total;  /* total blocks needed for xaction */
-        xfs_extlen_t            minlen; /* minimum allocation size (blocks) */
-        xfs_extlen_t            minleft; /* amount must be left after alloc */
-        bool                    eof;    /* set if allocating past last extent */
-        bool                    wasdel; /* replacing a delayed allocation */
-        bool                    userdata;/* set if is user data */
-        bool                    aeof;   /* allocated space at eof */
-        bool                    conv;   /* overwriting unwritten extents */
-        int                     flags;
-        struct completion       *done;
-        struct work_struct      work;
-        int                     result;
-};
-int     xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
-                        int *committed);
 int     xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
 int     xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
                     int whichfork, int *eof);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 3f9bd58edec7..507d96a57ac7 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -319,6 +319,10 @@ xfs_buf_item_format(
        ASSERT(atomic_read(&bip->bli_refcount) > 0);
        ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
               (bip->bli_flags & XFS_BLI_STALE));
+        ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
+               (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
+                && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
        /*
         * If it is an inode buffer, transfer the in-memory state to the
@@ -535,7 +539,7 @@ xfs_buf_item_push(
        if ((bp->b_flags & XBF_WRITE_FAIL) &&
            ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) {
                xfs_warn(bp->b_target->bt_mount,
-"Detected failing async write on buffer block 0x%llx. Retrying async write.\n",
+"Detected failing async write on buffer block 0x%llx. Retrying async write.",
                         (long long)bp->b_bn);
        }
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index c24c67e22a2a..2f536f33cd26 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -86,7 +86,7 @@ static inline void xfs_dqflock(xfs_dquot_t *dqp)
        wait_for_completion(&dqp->q_flush);
 }
-static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp)
+static inline bool xfs_dqflock_nowait(xfs_dquot_t *dqp)
 {
        return try_wait_for_completion(&dqp->q_flush);
 }
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 13e974e6a889..1cdba95c78cb 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -127,6 +127,42 @@ xfs_iozero(
        return (-status);
 }
+int
+xfs_update_prealloc_flags(
+        struct xfs_inode        *ip,
+        enum xfs_prealloc_flags flags)
+{
+        struct xfs_trans        *tp;
+        int                     error;
+        tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
+        error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
+        if (error) {
+                xfs_trans_cancel(tp, 0);
+                return error;
+        }
+        xfs_ilock(ip, XFS_ILOCK_EXCL);
+        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+        if (!(flags & XFS_PREALLOC_INVISIBLE)) {
+                ip->i_d.di_mode &= ~S_ISUID;
+                if (ip->i_d.di_mode & S_IXGRP)
+                        ip->i_d.di_mode &= ~S_ISGID;
+                xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+        }
+        if (flags & XFS_PREALLOC_SET)
+                ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+        if (flags & XFS_PREALLOC_CLEAR)
+                ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
+        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+        if (flags & XFS_PREALLOC_SYNC)
+                xfs_trans_set_sync(tp);
+        return xfs_trans_commit(tp, 0);
+}
 /*
 * Fsync operations on directories are much simpler than on regular files,
 * as there is no file data to flush, and thus also no need for explicit
@@ -699,7 +735,7 @@ xfs_file_buffered_aio_write(
        iov_iter_truncate(from, count);
        /* We can write back this queue in page reclaim */
-        current->backing_dev_info = mapping->backing_dev_info;
+        current->backing_dev_info = inode_to_bdi(inode);
 write_retry:
        trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
@@ -784,8 +820,8 @@ xfs_file_fallocate(
 {
        struct inode            *inode = file_inode(file);
        struct xfs_inode        *ip = XFS_I(inode);
-        struct xfs_trans        *tp;
        long                    error;
+        enum xfs_prealloc_flags flags = 0;
        loff_t                  new_size = 0;
        if (!S_ISREG(inode->i_mode))
@@ -822,6 +858,8 @@ xfs_file_fallocate(
                if (error)
                        goto out_unlock;
        } else {
+                flags |= XFS_PREALLOC_SET;
                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
                    offset + len > i_size_read(inode)) {
                        new_size = offset + len;
@@ -839,28 +877,10 @@ xfs_file_fallocate(
                        goto out_unlock;
        }
-        tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
-        error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
-        if (error) {
-                xfs_trans_cancel(tp, 0);
-                goto out_unlock;
-        }
-        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-        ip->i_d.di_mode &= ~S_ISUID;
-        if (ip->i_d.di_mode & S_IXGRP)
-                ip->i_d.di_mode &= ~S_ISGID;
-        if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)))
-                ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
-        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        if (file->f_flags & O_DSYNC)
-                xfs_trans_set_sync(tp);
+                flags |= XFS_PREALLOC_SYNC;
-        error = xfs_trans_commit(tp, 0);
+        error = xfs_update_prealloc_flags(ip, flags);
        if (error)
                goto out_unlock;
@@ -1384,5 +1404,4 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = xfs_vm_page_mkwrite,
-        .remap_pages    = generic_file_remap_pages,
 };
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index fdc64220fcb0..fba6532efba4 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -488,6 +488,7 @@ xfs_growfs_data_private(
                xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
        if (dpct)
                xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
+        xfs_trans_set_sync(tp);
        error = xfs_trans_commit(tp, 0);
        if (error)
                return error;
@@ -541,7 +542,7 @@ xfs_growfs_data_private(
                        saved_error = error;
                        continue;
                }
-                xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
+                xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
                error = xfs_bwrite(bp);
                xfs_buf_relse(bp);
@@ -756,37 +757,6 @@ out:
        return 0;
 }
-/*
- * Dump a transaction into the log that contains no real change. This is needed
- * to be able to make the log dirty or stamp the current tail LSN into the log
- * during the covering operation.
- *
- * We cannot use an inode here for this - that will push dirty state back up
- * into the VFS and then periodic inode flushing will prevent log covering from
- * making progress. Hence we log a field in the superblock instead and use a
- * synchronous transaction to ensure the superblock is immediately unpinned
- * and can be written back.
- */
-int
-xfs_fs_log_dummy(
-        xfs_mount_t     *mp)
-{
-        xfs_trans_t     *tp;
-        int             error;
-        tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
-        if (error) {
-                xfs_trans_cancel(tp, 0);
-                return error;
-        }
-        /* log the UUID because it is an unchanging field */
-        xfs_mod_sb(tp, XFS_SB_UUID);
-        xfs_trans_set_sync(tp);
-        return xfs_trans_commit(tp, 0);
-}
 int
 xfs_fs_goingdown(
        xfs_mount_t     *mp,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 41f804e740d7..daafa1f6d260 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1995,6 +1995,7 @@ xfs_iunlink(
        agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
        offset = offsetof(xfs_agi_t, agi_unlinked) +
                (sizeof(xfs_agino_t) * bucket_index);
+        xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
        xfs_trans_log_buf(tp, agibp, offset,
                          (offset + sizeof(xfs_agino_t) - 1));
        return 0;
@@ -2086,6 +2087,7 @@ xfs_iunlink_remove(
                agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
                offset = offsetof(xfs_agi_t, agi_unlinked) +
                        (sizeof(xfs_agino_t) * bucket_index);
+                xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
                xfs_trans_log_buf(tp, agibp, offset,
                                  (offset + sizeof(xfs_agino_t) - 1));
        } else {
@@ -2656,6 +2658,124 @@ xfs_sort_for_rename(
 }
 /*
+ * xfs_cross_rename()
+ *
+ * responsible for handling RENAME_EXCHANGE flag in renameat2() system call
+ */
+STATIC int
+xfs_cross_rename(
+        struct xfs_trans        *tp,
+        struct xfs_inode        *dp1,
+        struct xfs_name         *name1,
+        struct xfs_inode        *ip1,
+        struct xfs_inode        *dp2,
+        struct xfs_name         *name2,
+        struct xfs_inode        *ip2,
+        struct xfs_bmap_free    *free_list,
+        xfs_fsblock_t           *first_block,
+        int                     spaceres)
+{
+        int             error = 0;
+        int             ip1_flags = 0;
+        int             ip2_flags = 0;
+        int             dp2_flags = 0;
+        /* Swap inode number for dirent in first parent */
+        error = xfs_dir_replace(tp, dp1, name1,
+                                ip2->i_ino,
+                                first_block, free_list, spaceres);
+        if (error)
+                goto out;
+        /* Swap inode number for dirent in second parent */
+        error = xfs_dir_replace(tp, dp2, name2,
+                                ip1->i_ino,
+                                first_block, free_list, spaceres);
+        if (error)
+                goto out;
+        /*
+         * If we're renaming one or more directories across different parents,
+         * update the respective ".." entries (and link counts) to match the new
+         * parents.
+         */
+        if (dp1 != dp2) {
+                dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+                if (S_ISDIR(ip2->i_d.di_mode)) {
+                        error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
+                                                dp1->i_ino, first_block,
+                                                free_list, spaceres);
+                        if (error)
+                                goto out;
+                        /* transfer ip2 ".." reference to dp1 */
+                        if (!S_ISDIR(ip1->i_d.di_mode)) {
+                                error = xfs_droplink(tp, dp2);
+                                if (error)
+                                        goto out;
+                                error = xfs_bumplink(tp, dp1);
+                                if (error)
+                                        goto out;
+                        }
+                        /*
+                         * Although ip1 isn't changed here, userspace needs
+                         * to be made aware of the exchange, so update its
+                         * change time to let applications relying on it
+                         * (like backup ones) properly detect the change
+                         */
+                        ip1_flags |= XFS_ICHGTIME_CHG;
+                        ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+                }
+                if (S_ISDIR(ip1->i_d.di_mode)) {
+                        error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
+                                                dp2->i_ino, first_block,
+                                                free_list, spaceres);
+                        if (error)
+                                goto out;
+                        /* transfer ip1 ".." reference to dp2 */
+                        if (!S_ISDIR(ip2->i_d.di_mode)) {
+                                error = xfs_droplink(tp, dp1);
+                                if (error)
+                                        goto out;
+                                error = xfs_bumplink(tp, dp2);
+                                if (error)
+                                        goto out;
+                        }
+                        /*
+                         * Although ip2 isn't changed here, userspace needs
+                         * to be made aware of the exchange, so update its
+                         * change time to let applications relying on it
+                         * (like backup ones) properly detect the change
+                         */
+                        ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+                        ip2_flags |= XFS_ICHGTIME_CHG;
+                }
+        }
+        if (ip1_flags) {
+                xfs_trans_ichgtime(tp, ip1, ip1_flags);
+                xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
+        }
+        if (ip2_flags) {
+                xfs_trans_ichgtime(tp, ip2, ip2_flags);
+                xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
+        }
+        if (dp2_flags) {
+                xfs_trans_ichgtime(tp, dp2, dp2_flags);
+                xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
+        }
+        xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+        xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
+out:
+        return error;
+}
+/*
 * xfs_rename
 */
 int
@@ -2665,7 +2785,8 @@ xfs_rename(
        xfs_inode_t     *src_ip,
        xfs_inode_t     *target_dp,
        struct xfs_name *target_name,
-        xfs_inode_t     *target_ip)
+        xfs_inode_t     *target_ip,
+        unsigned int    flags)
 {
        xfs_trans_t     *tp = NULL;
        xfs_mount_t     *mp = src_dp->i_mount;
@@ -2743,6 +2864,18 @@ xfs_rename(
        }
        /*
+         * Handle RENAME_EXCHANGE flags
+         */
+        if (flags & RENAME_EXCHANGE) {
+                error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
+                                         target_dp, target_name, target_ip,
+                                         &free_list, &first_block, spaceres);
+                if (error)
+                        goto abort_return;
+                goto finish_rename;
+        }
+        /*
         * Set up the target.
         */
        if (target_ip == NULL) {
@@ -2881,6 +3014,7 @@ xfs_rename(
        if (new_parent)
                xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
+finish_rename:
        /*
         * If this is a synchronous mount, make sure that the
         * rename transaction goes to disk before returning to
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 4ed2ba9342dc..86cd6b39bed7 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -338,7 +338,7 @@ int		xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 int             xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
                           struct xfs_inode *src_ip, struct xfs_inode *target_dp,
                           struct xfs_name *target_name,
-                           struct xfs_inode *target_ip);
+                           struct xfs_inode *target_ip, unsigned int flags);
 void            xfs_ilock(xfs_inode_t *, uint);
 int             xfs_ilock_nowait(xfs_inode_t *, uint);
@@ -377,6 +377,15 @@ int		xfs_droplink(struct xfs_trans *, struct xfs_inode *);
 int             xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
 /* from xfs_file.c */
+enum xfs_prealloc_flags {
+        XFS_PREALLOC_SET        = (1 << 1),
+        XFS_PREALLOC_CLEAR      = (1 << 2),
+        XFS_PREALLOC_SYNC       = (1 << 3),
+        XFS_PREALLOC_INVISIBLE  = (1 << 4),
+};
+int             xfs_update_prealloc_flags(struct xfs_inode *,
+                        enum xfs_prealloc_flags);
 int             xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
 int             xfs_iozero(struct xfs_inode *, loff_t, size_t);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index a1831980a68e..f7afb86c9148 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -606,11 +606,8 @@ xfs_ioc_space(
        unsigned int            cmd,
        xfs_flock64_t           *bf)
 {
-        struct xfs_mount        *mp = ip->i_mount;
-        struct xfs_trans        *tp;
        struct iattr            iattr;
-        bool                    setprealloc = false;
+        enum xfs_prealloc_flags flags = 0;
-        bool                    clrprealloc = false;
        int                     error;
        /*
@@ -630,6 +627,11 @@ xfs_ioc_space(
        if (!S_ISREG(inode->i_mode))
                return -EINVAL;
+        if (filp->f_flags & O_DSYNC)
+                flags |= XFS_PREALLOC_SYNC;
+        if (ioflags & XFS_IO_INVIS)     
+                flags |= XFS_PREALLOC_INVISIBLE;
        error = mnt_want_write_file(filp);
        if (error)
                return error;
@@ -673,25 +675,23 @@ xfs_ioc_space(
        }
        if (bf->l_start < 0 ||
-            bf->l_start > mp->m_super->s_maxbytes ||
+            bf->l_start > inode->i_sb->s_maxbytes ||
            bf->l_start + bf->l_len < 0 ||
-            bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) {
+            bf->l_start + bf->l_len >= inode->i_sb->s_maxbytes) {
                error = -EINVAL;
                goto out_unlock;
        }
        switch (cmd) {
        case XFS_IOC_ZERO_RANGE:
+                flags |= XFS_PREALLOC_SET;
                error = xfs_zero_file_space(ip, bf->l_start, bf->l_len);
-                if (!error)
-                        setprealloc = true;
                break;
        case XFS_IOC_RESVSP:
        case XFS_IOC_RESVSP64:
+                flags |= XFS_PREALLOC_SET;
                error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len,
                                                XFS_BMAPI_PREALLOC);
-                if (!error)
-                        setprealloc = true;
                break;
        case XFS_IOC_UNRESVSP:
        case XFS_IOC_UNRESVSP64:
@@ -701,6 +701,7 @@ xfs_ioc_space(
        case XFS_IOC_ALLOCSP64:
        case XFS_IOC_FREESP:
        case XFS_IOC_FREESP64:
+                flags |= XFS_PREALLOC_CLEAR;
                if (bf->l_start > XFS_ISIZE(ip)) {
                        error = xfs_alloc_file_space(ip, XFS_ISIZE(ip),
                                        bf->l_start - XFS_ISIZE(ip), 0);
@@ -712,8 +713,6 @@ xfs_ioc_space(
                iattr.ia_size = bf->l_start;
                error = xfs_setattr_size(ip, &iattr);
-                if (!error)
-                        clrprealloc = true;
                break;
        default:
                ASSERT(0);
@@ -723,32 +722,7 @@ xfs_ioc_space(
        if (error)
                goto out_unlock;
-        tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
+        error = xfs_update_prealloc_flags(ip, flags);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
-        if (error) {
-                xfs_trans_cancel(tp, 0);
-                goto out_unlock;
-        }
-        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-        if (!(ioflags & XFS_IO_INVIS)) {
-                ip->i_d.di_mode &= ~S_ISUID;
-                if (ip->i_d.di_mode & S_IXGRP)
-                        ip->i_d.di_mode &= ~S_ISGID;
-                xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-        }
-        if (setprealloc)
-                ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
-        else if (clrprealloc)
-                ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
-        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-        if (filp->f_flags & O_DSYNC)
-                xfs_trans_set_sync(tp);
-        error = xfs_trans_commit(tp, 0);
 out_unlock:
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -1013,20 +987,182 @@ xfs_diflags_to_linux(
                inode->i_flags &= ~S_NOATIME;
 }
-#define FSX_PROJID      1
+static int
-#define FSX_EXTSIZE     2
+xfs_ioctl_setattr_xflags(
-#define FSX_XFLAGS      4
+        struct xfs_trans        *tp,
-#define FSX_NONBLOCK    8
+        struct xfs_inode        *ip,
+        struct fsxattr          *fa)
+{
+        struct xfs_mount        *mp = ip->i_mount;
+        /* Can't change realtime flag if any extents are allocated. */
+        if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
+            XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME))
+                return -EINVAL;
+        /* If realtime flag is set then must have realtime device */
+        if (fa->fsx_xflags & XFS_XFLAG_REALTIME) {
+                if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
+                    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize))
+                        return -EINVAL;
+        }
+        /*
+         * Can't modify an immutable/append-only file unless
+         * we have appropriate permission.
+         */
+        if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) ||
+             (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
+            !capable(CAP_LINUX_IMMUTABLE))
+                return -EPERM;
+        xfs_set_diflags(ip, fa->fsx_xflags);
+        xfs_diflags_to_linux(ip);
+        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+        XFS_STATS_INC(xs_ig_attrchg);
+        return 0;
+}
+/*
+ * Set up the transaction structure for the setattr operation, checking that we
+ * have permission to do so. On success, return a clean transaction and the
+ * inode locked exclusively ready for further operation specific checks. On
+ * failure, return an error without modifying or locking the inode.
+ */
+static struct xfs_trans *
+xfs_ioctl_setattr_get_trans(
+        struct xfs_inode        *ip)
+{
+        struct xfs_mount        *mp = ip->i_mount;
+        struct xfs_trans        *tp;
+        int                     error;
+        if (mp->m_flags & XFS_MOUNT_RDONLY)
+                return ERR_PTR(-EROFS);
+        if (XFS_FORCED_SHUTDOWN(mp))
+                return ERR_PTR(-EIO);
+        tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+        if (error)
+                goto out_cancel;
+        xfs_ilock(ip, XFS_ILOCK_EXCL);
+        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+        /*
+         * CAP_FOWNER overrides the following restrictions:
+         *
+         * The user ID of the calling process must be equal to the file owner
+         * ID, except in cases where the CAP_FSETID capability is applicable.
+         */
+        if (!inode_owner_or_capable(VFS_I(ip))) {
+                error = -EPERM;
+                goto out_cancel;
+        }
+        if (mp->m_flags & XFS_MOUNT_WSYNC)
+                xfs_trans_set_sync(tp);
+        return tp;
+out_cancel:
+        xfs_trans_cancel(tp, 0);
+        return ERR_PTR(error);
+}
+/*
+ * extent size hint validation is somewhat cumbersome. Rules are:
+ *
+ * 1. extent size hint is only valid for directories and regular files
+ * 2. XFS_XFLAG_EXTSIZE is only valid for regular files
+ * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories.
+ * 4. can only be changed on regular files if no extents are allocated
+ * 5. can be changed on directories at any time
+ * 6. extsize hint of 0 turns off hints, clears inode flags.
+ * 7. Extent size must be a multiple of the appropriate block size.
+ * 8. for non-realtime files, the extent size hint must be limited
+ *    to half the AG size to avoid alignment extending the extent beyond the
+ *    limits of the AG.
+ */
+static int
+xfs_ioctl_setattr_check_extsize(
+        struct xfs_inode        *ip,
+        struct fsxattr          *fa)
+{
+        struct xfs_mount        *mp = ip->i_mount;
+        if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode))
+                return -EINVAL;
+        if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) &&
+            !S_ISDIR(ip->i_d.di_mode))
+                return -EINVAL;
+        if (S_ISREG(ip->i_d.di_mode) && ip->i_d.di_nextents &&
+            ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
+                return -EINVAL;
+        if (fa->fsx_extsize != 0) {
+                xfs_extlen_t    size;
+                xfs_fsblock_t   extsize_fsb;
+                extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
+                if (extsize_fsb > MAXEXTLEN)
+                        return -EINVAL;
+                if (XFS_IS_REALTIME_INODE(ip) ||
+                    (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
+                        size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
+                } else {
+                        size = mp->m_sb.sb_blocksize;
+                        if (extsize_fsb > mp->m_sb.sb_agblocks / 2)
+                                return -EINVAL;
+                }
+                if (fa->fsx_extsize % size)
+                        return -EINVAL;
+        } else
+                fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT);
+        return 0;
+}
+static int
+xfs_ioctl_setattr_check_projid(
+        struct xfs_inode        *ip,
+        struct fsxattr          *fa)
+{
+        /* Disallow 32bit project ids if projid32bit feature is not enabled. */
+        if (fa->fsx_projid > (__uint16_t)-1 &&
+            !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
+                return -EINVAL;
+        /*
+         * Project Quota ID state is only allowed to change from within the init
+         * namespace. Enforce that restriction only if we are trying to change
+         * the quota ID state. Everything else is allowed in user namespaces.
+         */
+        if (current_user_ns() == &init_user_ns)
+                return 0;
+        if (xfs_get_projid(ip) != fa->fsx_projid)
+                return -EINVAL;
+        if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) !=
+            (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
+                return -EINVAL;
+        return 0;
+}
 STATIC int
 xfs_ioctl_setattr(
        xfs_inode_t             *ip,
-        struct fsxattr          *fa,
+        struct fsxattr          *fa)
-        int                     mask)
 {
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp;
-        unsigned int            lock_flags = 0;
        struct xfs_dquot        *udqp = NULL;
        struct xfs_dquot        *pdqp = NULL;
        struct xfs_dquot        *olddquot = NULL;
@@ -1034,17 +1170,9 @@ xfs_ioctl_setattr(
        trace_xfs_ioctl_setattr(ip);
-        if (mp->m_flags & XFS_MOUNT_RDONLY)
+        code = xfs_ioctl_setattr_check_projid(ip, fa);
-                return -EROFS;
+        if (code)
-        if (XFS_FORCED_SHUTDOWN(mp))
+                return code;
-                return -EIO;
-        /*
-         * Disallow 32bit project ids when projid32bit feature is not enabled.
-         */
-        if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
-                        !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
-                return -EINVAL;
        /*
         * If disk quotas is on, we make sure that the dquots do exist on disk,
@@ -1054,7 +1182,7 @@ xfs_ioctl_setattr(
         * If the IDs do change before we take the ilock, we're covered
         * because the i_*dquot fields will get updated anyway.
         */
-        if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
+        if (XFS_IS_QUOTA_ON(mp)) {
                code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
                                         ip->i_d.di_gid, fa->fsx_projid,
                                         XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
@@ -1062,175 +1190,49 @@ xfs_ioctl_setattr(
                        return code;
        }
-        /*
+        tp = xfs_ioctl_setattr_get_trans(ip);
-         * For the other attributes, we acquire the inode lock and
+        if (IS_ERR(tp)) {
-         * first do an error checking pass.
+                code = PTR_ERR(tp);
-         */
+                goto error_free_dquots;
-        tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-        code = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
-        if (code)
-                goto error_return;
-        lock_flags = XFS_ILOCK_EXCL;
-        xfs_ilock(ip, lock_flags);
-        /*
-         * CAP_FOWNER overrides the following restrictions:
-         *
-         * The user ID of the calling process must be equal
-         * to the file owner ID, except in cases where the
-         * CAP_FSETID capability is applicable.
-         */
-        if (!inode_owner_or_capable(VFS_I(ip))) {
-                code = -EPERM;
-                goto error_return;
-        }
-        /*
-         * Do a quota reservation only if projid is actually going to change.
-         * Only allow changing of projid from init_user_ns since it is a
-         * non user namespace aware identifier.
-         */
-        if (mask & FSX_PROJID) {
-                if (current_user_ns() != &init_user_ns) {
-                        code = -EINVAL;
-                        goto error_return;
-                }
-                if (XFS_IS_QUOTA_RUNNING(mp) &&
-                    XFS_IS_PQUOTA_ON(mp) &&
-                    xfs_get_projid(ip) != fa->fsx_projid) {
-                        ASSERT(tp);
-                        code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL,
-                                                pdqp, capable(CAP_FOWNER) ?
-                                                XFS_QMOPT_FORCE_RES : 0);
-                        if (code)       /* out of quota */
-                                goto error_return;
-                }
        }
-        if (mask & FSX_EXTSIZE) {
-                /*
-                 * Can't change extent size if any extents are allocated.
-                 */
-                if (ip->i_d.di_nextents &&
-                    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
-                     fa->fsx_extsize)) {
-                        code = -EINVAL; /* EFBIG? */
-                        goto error_return;
-                }
-                /*
+        if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
-                 * Extent size must be a multiple of the appropriate block
+            xfs_get_projid(ip) != fa->fsx_projid) {
-                 * size, if set at all. It must also be smaller than the
+                code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp,
-                 * maximum extent size supported by the filesystem.
+                                capable(CAP_FOWNER) ?  XFS_QMOPT_FORCE_RES : 0);
-                 *
+                if (code)       /* out of quota */
-                 * Also, for non-realtime files, limit the extent size hint to
+                        goto error_trans_cancel;
-                 * half the size of the AGs in the filesystem so alignment
-                 * doesn't result in extents larger than an AG.
-                 */
-                if (fa->fsx_extsize != 0) {
-                        xfs_extlen_t    size;
-                        xfs_fsblock_t   extsize_fsb;
-                        extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
-                        if (extsize_fsb > MAXEXTLEN) {
-                                code = -EINVAL;
-                                goto error_return;
-                        }
-                        if (XFS_IS_REALTIME_INODE(ip) ||
-                            ((mask & FSX_XFLAGS) &&
-                            (fa->fsx_xflags & XFS_XFLAG_REALTIME))) {
-                                size = mp->m_sb.sb_rextsize <<
-                                       mp->m_sb.sb_blocklog;
-                        } else {
-                                size = mp->m_sb.sb_blocksize;
-                                if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
-                                        code = -EINVAL;
-                                        goto error_return;
-                                }
-                        }
-                        if (fa->fsx_extsize % size) {
-                                code = -EINVAL;
-                                goto error_return;
-                        }
-                }
        }
+        code = xfs_ioctl_setattr_check_extsize(ip, fa);
+        if (code)
+                goto error_trans_cancel;
-        if (mask & FSX_XFLAGS) {
+        code = xfs_ioctl_setattr_xflags(tp, ip, fa);
-                /*
+        if (code)
-                 * Can't change realtime flag if any extents are allocated.
+                goto error_trans_cancel;
-                 */
-                if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
-                    (XFS_IS_REALTIME_INODE(ip)) !=
-                    (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
-                        code = -EINVAL; /* EFBIG? */
-                        goto error_return;
-                }
-                /*
-                 * If realtime flag is set then must have realtime data.
-                 */
-                if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
-                        if ((mp->m_sb.sb_rblocks == 0) ||
-                            (mp->m_sb.sb_rextsize == 0) ||
-                            (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
-                                code = -EINVAL;
-                                goto error_return;
-                        }
-                }
-                /*
-                 * Can't modify an immutable/append-only file unless
-                 * we have appropriate permission.
-                 */
-                if ((ip->i_d.di_flags &
-                                (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
-                     (fa->fsx_xflags &
-                                (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
-                    !capable(CAP_LINUX_IMMUTABLE)) {
-                        code = -EPERM;
-                        goto error_return;
-                }
-        }
-        xfs_trans_ijoin(tp, ip, 0);
        /*
-         * Change file ownership.  Must be the owner or privileged.
+         * Change file ownership.  Must be the owner or privileged.  CAP_FSETID
+         * overrides the following restrictions:
+         *
+         * The set-user-ID and set-group-ID bits of a file will be cleared upon
+         * successful return from chown().
         */
-        if (mask & FSX_PROJID) {
-                /*
-                 * CAP_FSETID overrides the following restrictions:
-                 *
-                 * The set-user-ID and set-group-ID bits of a file will be
-                 * cleared upon successful return from chown()
-                 */
-                if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
-                    !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
-                        ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
-                /*
-                 * Change the ownerships and register quota modifications
-                 * in the transaction.
-                 */
-                if (xfs_get_projid(ip) != fa->fsx_projid) {
-                        if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
-                                olddquot = xfs_qm_vop_chown(tp, ip,
-                                                        &ip->i_pdquot, pdqp);
-                        }
-                        ASSERT(ip->i_d.di_version > 1);
-                        xfs_set_projid(ip, fa->fsx_projid);
-                }
-        }
+        if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+            !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
+                ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
-        if (mask & FSX_XFLAGS) {
+        /* Change the ownerships and register project quota modifications */
-                xfs_set_diflags(ip, fa->fsx_xflags);
+        if (xfs_get_projid(ip) != fa->fsx_projid) {
-                xfs_diflags_to_linux(ip);
+                if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
+                        olddquot = xfs_qm_vop_chown(tp, ip,
+                                                &ip->i_pdquot, pdqp);
+                }
+                ASSERT(ip->i_d.di_version > 1);
+                xfs_set_projid(ip, fa->fsx_projid);
        }
        /*
@@ -1238,34 +1240,12 @@ xfs_ioctl_setattr(
         * extent size hint should be set on the inode. If no extent size flags
         * are set on the inode then unconditionally clear the extent size hint.
         */
-        if (mask & FSX_EXTSIZE) {
+        if (ip->i_d.di_flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
-                int     extsize = 0;
+                ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
+        else
-                if (ip->i_d.di_flags &
+                ip->i_d.di_extsize = 0;
-                                (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
-                        extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
-                ip->i_d.di_extsize = extsize;
-        }
-        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-        XFS_STATS_INC(xs_ig_attrchg);
-        /*
-         * If this is a synchronous mount, make sure that the
-         * transaction goes to disk before returning to the user.
-         * This is slightly sub-optimal in that truncates require
-         * two sync transactions instead of one for wsync filesystems.
-         * One for the truncate and one for the timestamps since we
-         * don't want to change the timestamps unless we're sure the
-         * truncate worked.  Truncates are less than 1% of the laddis
-         * mix so this probably isn't worth the trouble to optimize.
-         */
-        if (mp->m_flags & XFS_MOUNT_WSYNC)
-                xfs_trans_set_sync(tp);
        code = xfs_trans_commit(tp, 0);
-        xfs_iunlock(ip, lock_flags);
        /*
         * Release any dquot(s) the inode had kept before chown.
@@ -1276,12 +1256,11 @@ xfs_ioctl_setattr(
        return code;
- error_return:
+error_trans_cancel:
+        xfs_trans_cancel(tp, 0);
+error_free_dquots:
        xfs_qm_dqrele(udqp);
        xfs_qm_dqrele(pdqp);
-        xfs_trans_cancel(tp, 0);
-        if (lock_flags)
-                xfs_iunlock(ip, lock_flags);
        return code;
 }
@@ -1292,20 +1271,15 @@ xfs_ioc_fssetxattr(
        void                    __user *arg)
 {
        struct fsxattr          fa;
-        unsigned int            mask;
        int error;
        if (copy_from_user(&fa, arg, sizeof(fa)))
                return -EFAULT;
-        mask = FSX_XFLAGS | FSX_EXTSIZE | FSX_PROJID;
-        if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-                mask |= FSX_NONBLOCK;
        error = mnt_want_write_file(filp);
        if (error)
                return error;
-        error = xfs_ioctl_setattr(ip, &fa, mask);
+        error = xfs_ioctl_setattr(ip, &fa);
        mnt_drop_write_file(filp);
        return error;
 }
@@ -1325,14 +1299,14 @@ xfs_ioc_getxflags(
 STATIC int
 xfs_ioc_setxflags(
-        xfs_inode_t             *ip,
+        struct xfs_inode        *ip,
        struct file             *filp,
        void                    __user *arg)
 {
+        struct xfs_trans        *tp;
        struct fsxattr          fa;
        unsigned int            flags;
-        unsigned int            mask;
+        int                     error;
-        int error;
        if (copy_from_user(&flags, arg, sizeof(flags)))
                return -EFAULT;
@@ -1342,15 +1316,26 @@ xfs_ioc_setxflags(
                      FS_SYNC_FL))
                return -EOPNOTSUPP;
-        mask = FSX_XFLAGS;
-        if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-                mask |= FSX_NONBLOCK;
        fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
        error = mnt_want_write_file(filp);
        if (error)
                return error;
-        error = xfs_ioctl_setattr(ip, &fa, mask);
+        tp = xfs_ioctl_setattr_get_trans(ip);
+        if (IS_ERR(tp)) {
+                error = PTR_ERR(tp);
+                goto out_drop_write;
+        }
+        error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
+        if (error) {
+                xfs_trans_cancel(tp, 0);
+                goto out_drop_write;
+        }
+        error = xfs_trans_commit(tp, 0);
+out_drop_write:
        mnt_drop_write_file(filp);
        return error;
 }
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index ec6772866f3d..bfc7c7c8a0c8 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -423,7 +423,7 @@ xfs_compat_attrmulti_by_handle(
        ops = memdup_user(compat_ptr(am_hreq.ops), size);
        if (IS_ERR(ops)) {
-                error = -PTR_ERR(ops);
+                error = PTR_ERR(ops);
                goto out_dput;
        }
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index c980e2a5086b..ccb1dd0d509e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -802,7 +802,7 @@ int
 xfs_iomap_write_unwritten(
        xfs_inode_t     *ip,
        xfs_off_t       offset,
-        size_t          count)
+        xfs_off_t       count)
 {
        xfs_mount_t     *mp = ip->i_mount;
        xfs_fileoff_t   offset_fsb;
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 411fbb8919ef..8688e663d744 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -27,6 +27,6 @@ int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
                        struct xfs_bmbt_irec *);
 int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
                        struct xfs_bmbt_irec *);
-int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
+int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
 #endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c50311cae1b1..ce80eeb8faa4 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -380,18 +380,27 @@ xfs_vn_rename(
        struct inode    *odir,
        struct dentry   *odentry,
        struct inode    *ndir,
-        struct dentry   *ndentry)
+        struct dentry   *ndentry,
+        unsigned int    flags)
 {
        struct inode    *new_inode = ndentry->d_inode;
+        int             omode = 0;
        struct xfs_name oname;
        struct xfs_name nname;
-        xfs_dentry_to_name(&oname, odentry, 0);
+        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+                return -EINVAL;
+        /* if we are exchanging files, we need to set i_mode of both files */
+        if (flags & RENAME_EXCHANGE)
+                omode = ndentry->d_inode->i_mode;
+        xfs_dentry_to_name(&oname, odentry, omode);
        xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode);
        return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
-                          XFS_I(ndir), &nname, new_inode ?
+                          XFS_I(ndir), &nname,
-                                                XFS_I(new_inode) : NULL);
+                          new_inode ? XFS_I(new_inode) : NULL, flags);
 }
 /*
@@ -1144,7 +1153,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
         */
        .rmdir                  = xfs_vn_unlink,
        .mknod                  = xfs_vn_mknod,
-        .rename                 = xfs_vn_rename,
+        .rename2                = xfs_vn_rename,
        .get_acl                = xfs_get_acl,
        .set_acl                = xfs_set_acl,
        .getattr                = xfs_vn_getattr,
@@ -1172,7 +1181,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
         */
        .rmdir                  = xfs_vn_unlink,
        .mknod                  = xfs_vn_mknod,
-        .rename                 = xfs_vn_rename,
+        .rename2                = xfs_vn_rename,
        .get_acl                = xfs_get_acl,
        .set_acl                = xfs_set_acl,
        .getattr                = xfs_vn_getattr,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e408bf5a3ff7..bcc7cfabb787 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -33,6 +33,7 @@
 #include "xfs_fsops.h"
 #include "xfs_cksum.h"
 #include "xfs_sysfs.h"
+#include "xfs_sb.h"
 kmem_zone_t     *xfs_log_ticket_zone;
@@ -1290,9 +1291,20 @@ xfs_log_worker(
        struct xfs_mount        *mp = log->l_mp;
        /* dgc: errors ignored - not fatal and nowhere to report them */
-        if (xfs_log_need_covered(mp))
+        if (xfs_log_need_covered(mp)) {
-                xfs_fs_log_dummy(mp);
+                /*
-        else
+                 * Dump a transaction into the log that contains no real change.
+                 * This is needed to stamp the current tail LSN into the log
+                 * during the covering operation.
+                 *
+                 * We cannot use an inode here for this - that will push dirty
+                 * state back up into the VFS and then periodic inode flushing
+                 * will prevent log covering from making progress. Hence we
+                 * synchronously log the superblock instead to ensure the
+                 * superblock is immediately unpinned and can be written back.
+                 */
+                xfs_sync_sb(mp, true);
+        } else
                xfs_log_force(mp, 0);
        /* start pushing all the metadata that is currently dirty */
@@ -1395,6 +1407,8 @@ xlog_alloc_log(
        ASSERT(xfs_buf_islocked(bp));
        xfs_buf_unlock(bp);
+        /* use high priority wq for log I/O completion */
+        bp->b_ioend_wq = mp->m_log_workqueue;
        bp->b_iodone = xlog_iodone;
        log->l_xbuf = bp;
@@ -1427,6 +1441,8 @@ xlog_alloc_log(
                ASSERT(xfs_buf_islocked(bp));
                xfs_buf_unlock(bp);
+                /* use high priority wq for log I/O completion */
+                bp->b_ioend_wq = mp->m_log_workqueue;
                bp->b_iodone = xlog_iodone;
                iclog->ic_bp = bp;
                iclog->ic_data = bp->b_addr;
@@ -1806,8 +1822,6 @@ xlog_sync(
        XFS_BUF_ZEROFLAGS(bp);
        XFS_BUF_ASYNC(bp);
        bp->b_flags |= XBF_SYNCIO;
-        /* use high priority completion wq */
-        bp->b_ioend_wq = log->l_mp->m_log_workqueue;
        if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
                bp->b_flags |= XBF_FUA;
@@ -1856,8 +1870,6 @@ xlog_sync(
                bp->b_flags |= XBF_SYNCIO;
                if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
                        bp->b_flags |= XBF_FUA;
-                /* use high priority completion wq */
-                bp->b_ioend_wq = log->l_mp->m_log_workqueue;
                ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
                ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -2027,7 +2039,7 @@ xlog_print_tic_res(
                "  total reg   = %u bytes (o/flow = %u bytes)\n"
                "  ophdrs      = %u (ophdr space = %u bytes)\n"
                "  ophdr + reg = %u bytes\n"
-                "  num regions = %u\n",
+                "  num regions = %u",
                ((ticket->t_trans_type <= 0 ||
                  ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
                  "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d3d38836f87f..4fa80e63eea2 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -408,11 +408,11 @@ xfs_update_alignment(xfs_mount_t *mp)
                if (xfs_sb_version_hasdalign(sbp)) {
                        if (sbp->sb_unit != mp->m_dalign) {
                                sbp->sb_unit = mp->m_dalign;
-                                mp->m_update_flags |= XFS_SB_UNIT;
+                                mp->m_update_sb = true;
                        }
                        if (sbp->sb_width != mp->m_swidth) {
                                sbp->sb_width = mp->m_swidth;
-                                mp->m_update_flags |= XFS_SB_WIDTH;
+                                mp->m_update_sb = true;
                        }
                } else {
                        xfs_warn(mp,
@@ -583,38 +583,19 @@ int
 xfs_mount_reset_sbqflags(
        struct xfs_mount        *mp)
 {
-        int                     error;
-        struct xfs_trans        *tp;
        mp->m_qflags = 0;
-        /*
+        /* It is OK to look at sb_qflags in the mount path without m_sb_lock. */
-         * It is OK to look at sb_qflags here in mount path,
-         * without m_sb_lock.
-         */
        if (mp->m_sb.sb_qflags == 0)
                return 0;
        spin_lock(&mp->m_sb_lock);
        mp->m_sb.sb_qflags = 0;
        spin_unlock(&mp->m_sb_lock);
-        /*
+        if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
-         * If the fs is readonly, let the incore superblock run
-         * with quotas off but don't flush the update out to disk
-         */
-        if (mp->m_flags & XFS_MOUNT_RDONLY)
                return 0;
-        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
+        return xfs_sync_sb(mp, false);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
-        if (error) {
-                xfs_trans_cancel(tp, 0);
-                xfs_alert(mp, "%s: Superblock update failed!", __func__);
-                return error;
-        }
-        xfs_mod_sb(tp, XFS_SB_QFLAGS);
-        return xfs_trans_commit(tp, 0);
 }
 __uint64_t
@@ -659,26 +640,25 @@ xfs_mountfs(
        xfs_sb_mount_common(mp, sbp);
        /*
-         * Check for a mismatched features2 values.  Older kernels
+         * Check for mismatched features2 values.  Older kernels read & wrote
-         * read & wrote into the wrong sb offset for sb_features2
+         * into the wrong sb offset for sb_features2 on some platforms due to
-         * on some platforms due to xfs_sb_t not being 64bit size aligned
+         * xfs_sb_t not being 64bit size aligned when sb_features2 was added,
-         * when sb_features2 was added, which made older superblock
+         * which made older superblock reading/writing routines swap it as a
-         * reading/writing routines swap it as a 64-bit value.
+         * 64-bit value.
         *
         * For backwards compatibility, we make both slots equal.
         *
-         * If we detect a mismatched field, we OR the set bits into the
+         * If we detect a mismatched field, we OR the set bits into the existing
-         * existing features2 field in case it has already been modified; we
+         * features2 field in case it has already been modified; we don't want
-         * don't want to lose any features.  We then update the bad location
+         * to lose any features.  We then update the bad location with the ORed
-         * with the ORed value so that older kernels will see any features2
+         * value so that older kernels will see any features2 flags. The
-         * flags, and mark the two fields as needing updates once the
+         * superblock writeback code ensures the new sb_features2 is copied to
-         * transaction subsystem is online.
+         * sb_bad_features2 before it is logged or written to disk.
         */
        if (xfs_sb_has_mismatched_features2(sbp)) {
                xfs_warn(mp, "correcting sb_features alignment problem");
                sbp->sb_features2 |= sbp->sb_bad_features2;
-                sbp->sb_bad_features2 = sbp->sb_features2;
+                mp->m_update_sb = true;
-                mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
                /*
                 * Re-check for ATTR2 in case it was found in bad_features2
@@ -692,17 +672,17 @@ xfs_mountfs(
        if (xfs_sb_version_hasattr2(&mp->m_sb) &&
           (mp->m_flags & XFS_MOUNT_NOATTR2)) {
                xfs_sb_version_removeattr2(&mp->m_sb);
-                mp->m_update_flags |= XFS_SB_FEATURES2;
+                mp->m_update_sb = true;
                /* update sb_versionnum for the clearing of the morebits */
                if (!sbp->sb_features2)
-                        mp->m_update_flags |= XFS_SB_VERSIONNUM;
+                        mp->m_update_sb = true;
        }
        /* always use v2 inodes by default now */
        if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
                mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
-                mp->m_update_flags |= XFS_SB_VERSIONNUM;
+                mp->m_update_sb = true;
        }
        /*
@@ -895,8 +875,8 @@ xfs_mountfs(
         * the next remount into writeable mode.  Otherwise we would never
         * perform the update e.g. for the root filesystem.
         */
-        if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+        if (mp->m_update_sb && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
-                error = xfs_mount_log_sb(mp, mp->m_update_flags);
+                error = xfs_sync_sb(mp, false);
                if (error) {
                        xfs_warn(mp, "failed to write sb changes");
                        goto out_rtunmount;
@@ -1103,9 +1083,6 @@ xfs_fs_writable(
 int
 xfs_log_sbcount(xfs_mount_t *mp)
 {
-        xfs_trans_t     *tp;
-        int             error;
        /* allow this to proceed during the freeze sequence... */
        if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
                return 0;
@@ -1119,17 +1096,7 @@ xfs_log_sbcount(xfs_mount_t *mp)
        if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
                return 0;
-        tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
+        return xfs_sync_sb(mp, true);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
-        if (error) {
-                xfs_trans_cancel(tp, 0);
-                return error;
-        }
-        xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS);
-        xfs_trans_set_sync(tp);
-        error = xfs_trans_commit(tp, 0);
-        return error;
 }
 /*
@@ -1423,34 +1390,6 @@ xfs_freesb(
 }
 /*
- * Used to log changes to the superblock unit and width fields which could
- * be altered by the mount options, as well as any potential sb_features2
- * fixup. Only the first superblock is updated.
- */
-int
-xfs_mount_log_sb(
-        xfs_mount_t     *mp,
-        __int64_t       fields)
-{
-        xfs_trans_t     *tp;
-        int             error;
-        ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID |
-                         XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 |
-                         XFS_SB_VERSIONNUM));
-        tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
-        if (error) {
-                xfs_trans_cancel(tp, 0);
-                return error;
-        }
-        xfs_mod_sb(tp, fields);
-        error = xfs_trans_commit(tp, 0);
-        return error;
-}
-/*
 * If the underlying (data/log/rt) device is readonly, there are some
 * operations that cannot proceed.
 */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 22ccf69d4d3c..a5b2ff822653 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -162,8 +162,7 @@ typedef struct xfs_mount {
        struct delayed_work     m_reclaim_work; /* background inode reclaim */
        struct delayed_work     m_eofblocks_work; /* background eof blocks
                                                     trimming */
-        __int64_t               m_update_flags; /* sb flags we need to update
+        bool                    m_update_sb;    /* sb needs update in mount */
-                                                   on the next remount,rw */
        int64_t                 m_low_space[XFS_LOWSP_MAX];
                                                /* low free space thresholds */
        struct xfs_kobj         m_kobj;
@@ -378,7 +377,7 @@ extern void	xfs_unmountfs(xfs_mount_t *);
 extern int      xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
 extern int      xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
                        uint, int);
-extern int      xfs_mount_log_sb(xfs_mount_t *, __int64_t);
+extern int      xfs_mount_log_sb(xfs_mount_t *);
 extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
 extern int      xfs_readsb(xfs_mount_t *, int);
 extern void     xfs_freesb(xfs_mount_t *);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 79fb19dd9c83..3e8186279541 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -714,7 +714,6 @@ STATIC int
 xfs_qm_qino_alloc(
        xfs_mount_t     *mp,
        xfs_inode_t     **ip,
-        __int64_t       sbfields,
        uint            flags)
 {
        xfs_trans_t     *tp;
@@ -777,11 +776,6 @@ xfs_qm_qino_alloc(
        spin_lock(&mp->m_sb_lock);
        if (flags & XFS_QMOPT_SBVERSION) {
                ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
-                ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
-                        XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | XFS_SB_QFLAGS)) ==
-                                (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
-                                 XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
-                                 XFS_SB_QFLAGS));
                xfs_sb_version_addquota(&mp->m_sb);
                mp->m_sb.sb_uquotino = NULLFSINO;
@@ -798,7 +792,7 @@ xfs_qm_qino_alloc(
        else
                mp->m_sb.sb_pquotino = (*ip)->i_ino;
        spin_unlock(&mp->m_sb_lock);
-        xfs_mod_sb(tp, sbfields);
+        xfs_log_sb(tp);
        if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
                xfs_alert(mp, "%s failed (error %d)!", __func__, error);
@@ -1451,7 +1445,7 @@ xfs_qm_mount_quotas(
        spin_unlock(&mp->m_sb_lock);
        if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
-                if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
+                if (xfs_sync_sb(mp, false)) {
                        /*
                         * We could only have been turning quotas off.
                         * We aren't in very good shape actually because
@@ -1482,7 +1476,6 @@ xfs_qm_init_quotainos(
        struct xfs_inode        *gip = NULL;
        struct xfs_inode        *pip = NULL;
        int                     error;
-        __int64_t               sbflags = 0;
        uint                    flags = 0;
        ASSERT(mp->m_quotainfo);
@@ -1517,9 +1510,6 @@ xfs_qm_init_quotainos(
                }
        } else {
                flags |= XFS_QMOPT_SBVERSION;
-                sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
-                            XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
-                            XFS_SB_QFLAGS);
        }
        /*
@@ -1530,7 +1520,6 @@ xfs_qm_init_quotainos(
         */
        if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
                error = xfs_qm_qino_alloc(mp, &uip,
-                                              sbflags | XFS_SB_UQUOTINO,
                                              flags | XFS_QMOPT_UQUOTA);
                if (error)
                        goto error_rele;
@@ -1539,7 +1528,6 @@ xfs_qm_init_quotainos(
        }
        if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) {
                error = xfs_qm_qino_alloc(mp, &gip,
-                                          sbflags | XFS_SB_GQUOTINO,
                                          flags | XFS_QMOPT_GQUOTA);
                if (error)
                        goto error_rele;
@@ -1548,7 +1536,6 @@ xfs_qm_init_quotainos(
        }
        if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) {
                error = xfs_qm_qino_alloc(mp, &pip,
-                                          sbflags | XFS_SB_PQUOTINO,
                                          flags | XFS_QMOPT_PQUOTA);
                if (error)
                        goto error_rele;
@@ -1587,32 +1574,6 @@ xfs_qm_dqfree_one(
        xfs_qm_dqdestroy(dqp);
 }
-/*
- * Start a transaction and write the incore superblock changes to
- * disk. flags parameter indicates which fields have changed.
- */
-int
-xfs_qm_write_sb_changes(
-        xfs_mount_t     *mp,
-        __int64_t       flags)
-{
-        xfs_trans_t     *tp;
-        int             error;
-        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
-        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
-        if (error) {
-                xfs_trans_cancel(tp, 0);
-                return error;
-        }
-        xfs_mod_sb(tp, flags);
-        error = xfs_trans_commit(tp, 0);
-        return error;
-}
 /* --------------- utility functions for vnodeops ---------------- */
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 3a07a937e232..0d4d3590cf85 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -157,7 +157,6 @@ struct xfs_dquot_acct {
 #define XFS_QM_RTBWARNLIMIT     5
 extern void             xfs_qm_destroy_quotainfo(struct xfs_mount *);
-extern int              xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
 /* dquot stuff */
 extern void             xfs_qm_dqpurge_all(struct xfs_mount *, uint);
@@ -166,9 +165,9 @@ extern void		xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
 /* quota ops */
 extern int              xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
 extern int              xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
-                                        uint, struct fs_disk_quota *);
+                                        uint, struct qc_dqblk *);
 extern int              xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
-                                        struct fs_disk_quota *);
+                                        struct qc_dqblk *);
 extern int              xfs_qm_scall_getqstat(struct xfs_mount *,
                                        struct fs_quota_stat *);
 extern int              xfs_qm_scall_getqstatv(struct xfs_mount *,
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 74fca68e43b6..9b965db45800 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -39,7 +39,6 @@ STATIC int	xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
 STATIC int      xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
                                        uint);
 STATIC uint     xfs_qm_export_flags(uint);
-STATIC uint     xfs_qm_export_qtype_flags(uint);
 /*
 * Turn off quota accounting and/or enforcement for all udquots and/or
@@ -92,8 +91,7 @@ xfs_qm_scall_quotaoff(
                mutex_unlock(&q->qi_quotaofflock);
                /* XXX what to do if error ? Revert back to old vals incore ? */
-                error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
+                return xfs_sync_sb(mp, false);
-                return error;
        }
        dqtype = 0;
@@ -314,7 +312,6 @@ xfs_qm_scall_quotaon(
 {
        int             error;
        uint            qf;
-        __int64_t       sbflags;
        flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
        /*
@@ -322,30 +319,22 @@ xfs_qm_scall_quotaon(
         */
        flags &= ~(XFS_ALL_QUOTA_ACCT);
-        sbflags = 0;
        if (flags == 0) {
                xfs_debug(mp, "%s: zero flags, m_qflags=%x",
                        __func__, mp->m_qflags);
                return -EINVAL;
        }
-        /* No fs can turn on quotas with a delayed effect */
-        ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0);
        /*
         * Can't enforce without accounting. We check the superblock
         * qflags here instead of m_qflags because rootfs can have
         * quota acct on ondisk without m_qflags' knowing.
         */
-        if (((flags & XFS_UQUOTA_ACCT) == 0 &&
+        if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
-             (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
             (flags & XFS_UQUOTA_ENFD)) ||
-            ((flags & XFS_GQUOTA_ACCT) == 0 &&
+            ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
-             (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
             (flags & XFS_GQUOTA_ENFD)) ||
-            ((flags & XFS_PQUOTA_ACCT) == 0 &&
+            ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
-             (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
             (flags & XFS_PQUOTA_ENFD))) {
                xfs_debug(mp,
                        "%s: Can't enforce without acct, flags=%x sbflags=%x",
@@ -370,11 +359,11 @@ xfs_qm_scall_quotaon(
        /*
         * There's nothing to change if it's the same.
         */
-        if ((qf & flags) == flags && sbflags == 0)
+        if ((qf & flags) == flags)
                return -EEXIST;
-        sbflags |= XFS_SB_QFLAGS;
-        if ((error = xfs_qm_write_sb_changes(mp, sbflags)))
+        error = xfs_sync_sb(mp, false);
+        if (error)
                return error;
        /*
         * If we aren't trying to switch on quota enforcement, we are done.
@@ -384,8 +373,7 @@ xfs_qm_scall_quotaon(
             ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) !=
             (mp->m_qflags & XFS_PQUOTA_ACCT)) ||
             ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
-             (mp->m_qflags & XFS_GQUOTA_ACCT)) ||
+             (mp->m_qflags & XFS_GQUOTA_ACCT)))
-            (flags & XFS_ALL_QUOTA_ENFD) == 0)
                return 0;
        if (! XFS_IS_QUOTA_RUNNING(mp))
@@ -422,20 +410,12 @@ xfs_qm_scall_getqstat(
        memset(out, 0, sizeof(fs_quota_stat_t));
        out->qs_version = FS_QSTAT_VERSION;
-        if (!xfs_sb_version_hasquota(&mp->m_sb)) {
-                out->qs_uquota.qfs_ino = NULLFSINO;
-                out->qs_gquota.qfs_ino = NULLFSINO;
-                return 0;
-        }
        out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
                                                        (XFS_ALL_QUOTA_ACCT|
                                                         XFS_ALL_QUOTA_ENFD));
-        if (q) {
+        uip = q->qi_uquotaip;
-                uip = q->qi_uquotaip;
+        gip = q->qi_gquotaip;
-                gip = q->qi_gquotaip;
+        pip = q->qi_pquotaip;
-                pip = q->qi_pquotaip;
-        }
        if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
                if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
                                        0, 0, &uip) == 0)
@@ -481,14 +461,13 @@ xfs_qm_scall_getqstat(
                if (temppqip)
                        IRELE(pip);
        }
-        if (q) {
+        out->qs_incoredqs = q->qi_dquots;
-                out->qs_incoredqs = q->qi_dquots;
+        out->qs_btimelimit = q->qi_btimelimit;
-                out->qs_btimelimit = q->qi_btimelimit;
+        out->qs_itimelimit = q->qi_itimelimit;
-                out->qs_itimelimit = q->qi_itimelimit;
+        out->qs_rtbtimelimit = q->qi_rtbtimelimit;
-                out->qs_rtbtimelimit = q->qi_rtbtimelimit;
+        out->qs_bwarnlimit = q->qi_bwarnlimit;
-                out->qs_bwarnlimit = q->qi_bwarnlimit;
+        out->qs_iwarnlimit = q->qi_iwarnlimit;
-                out->qs_iwarnlimit = q->qi_iwarnlimit;
-        }
        return 0;
 }
@@ -509,13 +488,6 @@ xfs_qm_scall_getqstatv(
        bool                    tempgqip = false;
        bool                    temppqip = false;
-        if (!xfs_sb_version_hasquota(&mp->m_sb)) {
-                out->qs_uquota.qfs_ino = NULLFSINO;
-                out->qs_gquota.qfs_ino = NULLFSINO;
-                out->qs_pquota.qfs_ino = NULLFSINO;
-                return 0;
-        }
        out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
                                                        (XFS_ALL_QUOTA_ACCT|
                                                         XFS_ALL_QUOTA_ENFD));
@@ -523,11 +495,9 @@ xfs_qm_scall_getqstatv(
        out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
        out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino;
-        if (q) {
+        uip = q->qi_uquotaip;
-                uip = q->qi_uquotaip;
+        gip = q->qi_gquotaip;
-                gip = q->qi_gquotaip;
+        pip = q->qi_pquotaip;
-                pip = q->qi_pquotaip;
-        }
        if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
                if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
                                        0, 0, &uip) == 0)
@@ -562,19 +532,18 @@ xfs_qm_scall_getqstatv(
                if (temppqip)
                        IRELE(pip);
        }
-        if (q) {
+        out->qs_incoredqs = q->qi_dquots;
-                out->qs_incoredqs = q->qi_dquots;
+        out->qs_btimelimit = q->qi_btimelimit;
-                out->qs_btimelimit = q->qi_btimelimit;
+        out->qs_itimelimit = q->qi_itimelimit;
-                out->qs_itimelimit = q->qi_itimelimit;
+        out->qs_rtbtimelimit = q->qi_rtbtimelimit;
-                out->qs_rtbtimelimit = q->qi_rtbtimelimit;
+        out->qs_bwarnlimit = q->qi_bwarnlimit;
-                out->qs_bwarnlimit = q->qi_bwarnlimit;
+        out->qs_iwarnlimit = q->qi_iwarnlimit;
-                out->qs_iwarnlimit = q->qi_iwarnlimit;
-        }
        return 0;
 }
-#define XFS_DQ_MASK \
+#define XFS_QC_MASK \
-        (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK)
+        (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK)
 /*
 * Adjust quota limits, and start/stop timers accordingly.
@@ -584,7 +553,7 @@ xfs_qm_scall_setqlim(
        struct xfs_mount        *mp,
        xfs_dqid_t              id,
        uint                    type,
-        fs_disk_quota_t         *newlim)
+        struct qc_dqblk         *newlim)
 {
        struct xfs_quotainfo    *q = mp->m_quotainfo;
        struct xfs_disk_dquot   *ddq;
@@ -593,9 +562,9 @@ xfs_qm_scall_setqlim(
        int                     error;
        xfs_qcnt_t              hard, soft;
-        if (newlim->d_fieldmask & ~XFS_DQ_MASK)
+        if (newlim->d_fieldmask & ~XFS_QC_MASK)
                return -EINVAL;
-        if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
+        if ((newlim->d_fieldmask & XFS_QC_MASK) == 0)
                return 0;
        /*
@@ -633,11 +602,11 @@ xfs_qm_scall_setqlim(
        /*
         * Make sure that hardlimits are >= soft limits before changing.
         */
-        hard = (newlim->d_fieldmask & FS_DQ_BHARD) ?
+        hard = (newlim->d_fieldmask & QC_SPC_HARD) ?
-                (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) :
+                (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) :
                        be64_to_cpu(ddq->d_blk_hardlimit);
-        soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ?
+        soft = (newlim->d_fieldmask & QC_SPC_SOFT) ?
-                (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) :
+                (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) :
                        be64_to_cpu(ddq->d_blk_softlimit);
        if (hard == 0 || hard >= soft) {
                ddq->d_blk_hardlimit = cpu_to_be64(hard);
@@ -650,11 +619,11 @@ xfs_qm_scall_setqlim(
        } else {
                xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft);
        }
-        hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
+        hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ?
-                (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
+                (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) :
                        be64_to_cpu(ddq->d_rtb_hardlimit);
-        soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ?
+        soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ?
-                (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) :
+                (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) :
                        be64_to_cpu(ddq->d_rtb_softlimit);
        if (hard == 0 || hard >= soft) {
                ddq->d_rtb_hardlimit = cpu_to_be64(hard);
@@ -667,10 +636,10 @@ xfs_qm_scall_setqlim(
                xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft);
        }
-        hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
+        hard = (newlim->d_fieldmask & QC_INO_HARD) ?
                (xfs_qcnt_t) newlim->d_ino_hardlimit :
                        be64_to_cpu(ddq->d_ino_hardlimit);
-        soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ?
+        soft = (newlim->d_fieldmask & QC_INO_SOFT) ?
                (xfs_qcnt_t) newlim->d_ino_softlimit :
                        be64_to_cpu(ddq->d_ino_softlimit);
        if (hard == 0 || hard >= soft) {
@@ -687,12 +656,12 @@ xfs_qm_scall_setqlim(
        /*
         * Update warnings counter(s) if requested
         */
-        if (newlim->d_fieldmask & FS_DQ_BWARNS)
+        if (newlim->d_fieldmask & QC_SPC_WARNS)
-                ddq->d_bwarns = cpu_to_be16(newlim->d_bwarns);
+                ddq->d_bwarns = cpu_to_be16(newlim->d_spc_warns);
-        if (newlim->d_fieldmask & FS_DQ_IWARNS)
+        if (newlim->d_fieldmask & QC_INO_WARNS)
-                ddq->d_iwarns = cpu_to_be16(newlim->d_iwarns);
+                ddq->d_iwarns = cpu_to_be16(newlim->d_ino_warns);
-        if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
+        if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
-                ddq->d_rtbwarns = cpu_to_be16(newlim->d_rtbwarns);
+                ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns);
        if (id == 0) {
                /*
@@ -702,24 +671,24 @@ xfs_qm_scall_setqlim(
                 * soft and hard limit values (already done, above), and
                 * for warnings.
                 */
-                if (newlim->d_fieldmask & FS_DQ_BTIMER) {
+                if (newlim->d_fieldmask & QC_SPC_TIMER) {
-                        q->qi_btimelimit = newlim->d_btimer;
+                        q->qi_btimelimit = newlim->d_spc_timer;
-                        ddq->d_btimer = cpu_to_be32(newlim->d_btimer);
+                        ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer);
                }
-                if (newlim->d_fieldmask & FS_DQ_ITIMER) {
+                if (newlim->d_fieldmask & QC_INO_TIMER) {
-                        q->qi_itimelimit = newlim->d_itimer;
+                        q->qi_itimelimit = newlim->d_ino_timer;
-                        ddq->d_itimer = cpu_to_be32(newlim->d_itimer);
+                        ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer);
                }
-                if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
+                if (newlim->d_fieldmask & QC_RT_SPC_TIMER) {
-                        q->qi_rtbtimelimit = newlim->d_rtbtimer;
+                        q->qi_rtbtimelimit = newlim->d_rt_spc_timer;
-                        ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer);
+                        ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer);
                }
-                if (newlim->d_fieldmask & FS_DQ_BWARNS)
+                if (newlim->d_fieldmask & QC_SPC_WARNS)
-                        q->qi_bwarnlimit = newlim->d_bwarns;
+                        q->qi_bwarnlimit = newlim->d_spc_warns;
-                if (newlim->d_fieldmask & FS_DQ_IWARNS)
+                if (newlim->d_fieldmask & QC_INO_WARNS)
-                        q->qi_iwarnlimit = newlim->d_iwarns;
+                        q->qi_iwarnlimit = newlim->d_ino_warns;
-                if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
+                if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
-                        q->qi_rtbwarnlimit = newlim->d_rtbwarns;
+                        q->qi_rtbwarnlimit = newlim->d_rt_spc_warns;
        } else {
                /*
                 * If the user is now over quota, start the timelimit.
@@ -801,7 +770,7 @@ xfs_qm_log_quotaoff(
        mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
        spin_unlock(&mp->m_sb_lock);
-        xfs_mod_sb(tp, XFS_SB_QFLAGS);
+        xfs_log_sb(tp);
        /*
         * We have to make sure that the transaction is secure on disk before we
@@ -824,7 +793,7 @@ xfs_qm_scall_getquota(
        struct xfs_mount        *mp,
        xfs_dqid_t              id,
        uint                    type,
-        struct fs_disk_quota    *dst)
+        struct qc_dqblk         *dst)
 {
        struct xfs_dquot        *dqp;
        int                     error;
@@ -848,28 +817,25 @@ xfs_qm_scall_getquota(
        }
        memset(dst, 0, sizeof(*dst));
-        dst->d_version = FS_DQUOT_VERSION;
+        dst->d_spc_hardlimit =
-        dst->d_flags = xfs_qm_export_qtype_flags(dqp->q_core.d_flags);
+                XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
-        dst->d_id = be32_to_cpu(dqp->q_core.d_id);
+        dst->d_spc_softlimit =
-        dst->d_blk_hardlimit =
+                XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
-                XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
-        dst->d_blk_softlimit =
-                XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
        dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
        dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
-        dst->d_bcount = XFS_FSB_TO_BB(mp, dqp->q_res_bcount);
+        dst->d_space = XFS_FSB_TO_B(mp, dqp->q_res_bcount);
-        dst->d_icount = dqp->q_res_icount;
+        dst->d_ino_count = dqp->q_res_icount;
-        dst->d_btimer = be32_to_cpu(dqp->q_core.d_btimer);
+        dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer);
-        dst->d_itimer = be32_to_cpu(dqp->q_core.d_itimer);
+        dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer);
-        dst->d_iwarns = be16_to_cpu(dqp->q_core.d_iwarns);
+        dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns);
-        dst->d_bwarns = be16_to_cpu(dqp->q_core.d_bwarns);
+        dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns);
-        dst->d_rtb_hardlimit =
+        dst->d_rt_spc_hardlimit =
-                XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit));
+                XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit));
-        dst->d_rtb_softlimit =
+        dst->d_rt_spc_softlimit =
-                XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit));
+                XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit));
-        dst->d_rtbcount = XFS_FSB_TO_BB(mp, dqp->q_res_rtbcount);
+        dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_res_rtbcount);
-        dst->d_rtbtimer = be32_to_cpu(dqp->q_core.d_rtbtimer);
+        dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
-        dst->d_rtbwarns = be16_to_cpu(dqp->q_core.d_rtbwarns);
+        dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
        /*
         * Internally, we don't reset all the timers when quota enforcement
@@ -882,23 +848,23 @@ xfs_qm_scall_getquota(
             dqp->q_core.d_flags == XFS_DQ_GROUP) ||
            (!XFS_IS_PQUOTA_ENFORCED(mp) &&
             dqp->q_core.d_flags == XFS_DQ_PROJ)) {
-                dst->d_btimer = 0;
+                dst->d_spc_timer = 0;
-                dst->d_itimer = 0;
+                dst->d_ino_timer = 0;
-                dst->d_rtbtimer = 0;
+                dst->d_rt_spc_timer = 0;
        }
 #ifdef DEBUG
-        if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
+        if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) ||
-             (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) ||
+             (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) ||
-             (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) &&
+             (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) &&
-            dst->d_id != 0) {
+            id != 0) {
-                if ((dst->d_bcount > dst->d_blk_softlimit) &&
+                if ((dst->d_space > dst->d_spc_softlimit) &&
-                    (dst->d_blk_softlimit > 0)) {
+                    (dst->d_spc_softlimit > 0)) {
-                        ASSERT(dst->d_btimer != 0);
+                        ASSERT(dst->d_spc_timer != 0);
                }
-                if ((dst->d_icount > dst->d_ino_softlimit) &&
+                if ((dst->d_ino_count > dst->d_ino_softlimit) &&
                    (dst->d_ino_softlimit > 0)) {
-                        ASSERT(dst->d_itimer != 0);
+                        ASSERT(dst->d_ino_timer != 0);
                }
        }
 #endif
@@ -908,26 +874,6 @@ out_put:
 }
 STATIC uint
-xfs_qm_export_qtype_flags(
-        uint flags)
-{
-        /*
-         * Can't be more than one, or none.
-         */
-        ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) !=
-                (FS_PROJ_QUOTA | FS_USER_QUOTA));
-        ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) !=
-                (FS_PROJ_QUOTA | FS_GROUP_QUOTA));
-        ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) !=
-                (FS_USER_QUOTA | FS_GROUP_QUOTA));
-        ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0);
-        return (flags & XFS_DQ_USER) ?
-                FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
-                        FS_PROJ_QUOTA : FS_GROUP_QUOTA;
-}
-STATIC uint
 xfs_qm_export_flags(
        uint flags)
 {
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 7542bbeca6a1..6923905ab33d 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -64,19 +64,10 @@ xfs_fs_get_xstatev(
        return xfs_qm_scall_getqstatv(mp, fqs);
 }
-STATIC int
+static unsigned int
-xfs_fs_set_xstate(
+xfs_quota_flags(unsigned int uflags)
-        struct super_block      *sb,
-        unsigned int            uflags,
-        int                     op)
 {
-        struct xfs_mount        *mp = XFS_M(sb);
+        unsigned int flags = 0;
-        unsigned int            flags = 0;
-        if (sb->s_flags & MS_RDONLY)
-                return -EROFS;
-        if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
-                return -ENOSYS;
        if (uflags & FS_QUOTA_UDQ_ACCT)
                flags |= XFS_UQUOTA_ACCT;
@@ -91,16 +82,39 @@ xfs_fs_set_xstate(
        if (uflags & FS_QUOTA_PDQ_ENFD)
                flags |= XFS_PQUOTA_ENFD;
-        switch (op) {
+        return flags;
-        case Q_XQUOTAON:
+}
-                return xfs_qm_scall_quotaon(mp, flags);
-        case Q_XQUOTAOFF:
+STATIC int
-                if (!XFS_IS_QUOTA_ON(mp))
+xfs_quota_enable(
-                        return -EINVAL;
+        struct super_block      *sb,
-                return xfs_qm_scall_quotaoff(mp, flags);
+        unsigned int            uflags)
-        }
+{
+        struct xfs_mount        *mp = XFS_M(sb);
+        if (sb->s_flags & MS_RDONLY)
+                return -EROFS;
+        if (!XFS_IS_QUOTA_RUNNING(mp))
+                return -ENOSYS;
+        return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags));
+}
+STATIC int
+xfs_quota_disable(
+        struct super_block      *sb,
+        unsigned int            uflags)
+{
+        struct xfs_mount        *mp = XFS_M(sb);
+        if (sb->s_flags & MS_RDONLY)
+                return -EROFS;
+        if (!XFS_IS_QUOTA_RUNNING(mp))
+                return -ENOSYS;
+        if (!XFS_IS_QUOTA_ON(mp))
+                return -EINVAL;
-        return -EINVAL;
+        return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags));
 }
 STATIC int
@@ -131,7 +145,7 @@ STATIC int
 xfs_fs_get_dqblk(
        struct super_block      *sb,
        struct kqid             qid,
-        struct fs_disk_quota    *fdq)
+        struct qc_dqblk         *qdq)
 {
        struct xfs_mount        *mp = XFS_M(sb);
@@ -141,14 +155,14 @@ xfs_fs_get_dqblk(
                return -ESRCH;
        return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
-                                      xfs_quota_type(qid.type), fdq);
+                                      xfs_quota_type(qid.type), qdq);
 }
 STATIC int
 xfs_fs_set_dqblk(
        struct super_block      *sb,
        struct kqid             qid,
-        struct fs_disk_quota    *fdq)
+        struct qc_dqblk         *qdq)
 {
        struct xfs_mount        *mp = XFS_M(sb);
@@ -160,13 +174,14 @@ xfs_fs_set_dqblk(
                return -ESRCH;
        return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
-                                     xfs_quota_type(qid.type), fdq);
+                                     xfs_quota_type(qid.type), qdq);
 }
 const struct quotactl_ops xfs_quotactl_operations = {
        .get_xstatev            = xfs_fs_get_xstatev,
        .get_xstate             = xfs_fs_get_xstate,
-        .set_xstate             = xfs_fs_set_xstate,
+        .quota_enable           = xfs_quota_enable,
+        .quota_disable          = xfs_quota_disable,
        .rm_xquota              = xfs_fs_rm_xquota,
        .get_dqblk              = xfs_fs_get_dqblk,
        .set_dqblk              = xfs_fs_set_dqblk,
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 19cbda196369..f2449fd86926 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -685,7 +685,7 @@ xfs_blkdev_get(
                                    mp);
        if (IS_ERR(*bdevp)) {
                error = PTR_ERR(*bdevp);
-                xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
+                xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
        }
        return error;
@@ -1111,6 +1111,11 @@ xfs_fs_statfs(
                                        statp->f_files,
                                        mp->m_maxicount);
+        /* If sb_icount overshot maxicount, report actual allocation */
+        statp->f_files = max_t(typeof(statp->f_files),
+                                        statp->f_files,
+                                        sbp->sb_icount);
        /* make sure statp->f_ffree does not underflow */
        ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
        statp->f_ffree = max_t(__int64_t, ffree, 0);
@@ -1257,13 +1262,13 @@ xfs_fs_remount(
                 * If this is the first remount to writeable state we
                 * might have some superblock changes to update.
                 */
-                if (mp->m_update_flags) {
+                if (mp->m_update_sb) {
-                        error = xfs_mount_log_sb(mp, mp->m_update_flags);
+                        error = xfs_sync_sb(mp, false);
                        if (error) {
                                xfs_warn(mp, "failed to write sb changes");
                                return error;
                        }
-                        mp->m_update_flags = 0;
+                        mp->m_update_sb = false;
                }
                /*
@@ -1293,8 +1298,9 @@ xfs_fs_remount(
 /*
 * Second stage of a freeze. The data is already frozen so we only
- * need to take care of the metadata. Once that's done write a dummy
+ * need to take care of the metadata. Once that's done sync the superblock
- * record to dirty the log in case of a crash while frozen.
+ * to the log to dirty it in case of a crash while frozen. This ensures that we
+ * will recover the unlinked inode lists on the next mount.
 */
 STATIC int
 xfs_fs_freeze(
@@ -1304,7 +1310,7 @@ xfs_fs_freeze(
        xfs_save_resvblks(mp);
        xfs_quiesce_attr(mp);
-        return xfs_fs_log_dummy(mp);
+        return xfs_sync_sb(mp, true);
 }
 STATIC int
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 1743b9f8e23d..a0c8067cea6f 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -149,24 +149,6 @@ static struct ctl_table xfs_table[] = {
                .extra2         = &xfs_params.inherit_noatim.max
        },
        {
-                .procname       = "xfsbufd_centisecs",
-                .data           = &xfs_params.xfs_buf_timer.val,
-                .maxlen         = sizeof(int),
-                .mode           = 0644,
-                .proc_handler   = proc_dointvec_minmax,
-                .extra1         = &xfs_params.xfs_buf_timer.min,
-                .extra2         = &xfs_params.xfs_buf_timer.max
-        },
-        {
-                .procname       = "age_buffer_centisecs",
-                .data           = &xfs_params.xfs_buf_age.val,
-                .maxlen         = sizeof(int),
-                .mode           = 0644,
-                .proc_handler   = proc_dointvec_minmax,
-                .extra1         = &xfs_params.xfs_buf_age.min,
-                .extra2         = &xfs_params.xfs_buf_age.max
-        },
-        {
                .procname       = "inherit_nosymlinks",
                .data           = &xfs_params.inherit_nosym.val,
                .maxlen         = sizeof(int),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fa3135b9bf04..eb90cd59a0ec 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -472,6 +472,7 @@ xfs_trans_apply_sb_deltas(
                whole = 1;
        }
+        xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
        if (whole)
                /*
                 * Log the whole thing, the fields are noncontiguous.
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 0a4d4ab6d9a9..75798412859a 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -327,9 +327,10 @@ xfs_trans_read_buf_map(
                return -EIO;
        }
-        if (tp)
+        if (tp) {
                _xfs_trans_bjoin(tp, bp, 1);
-        trace_xfs_trans_read_buf(bp->b_fspriv);
+                trace_xfs_trans_read_buf(bp->b_fspriv);
+        }
        *bpp = bp;
        return 0;
author	Trond Myklebust <trond.myklebust@primarydata.com>	2015-02-18 10:28:37 -0500
committer	Trond Myklebust <trond.myklebust@primarydata.com>	2015-02-18 10:28:37 -0500
commit	65d2918e716afb89359cfa59734d76c1ff8700cb (patch)
tree	4685404f96642243d62c3a1a823340913d087090 /fs
parent	bf40e5561fd288a505d5d8d8bf45eef96fe7253d (diff)
parent	338d00cfef07d74a072f96821c64b20f98517d72 (diff)