66 files changed, 489 insertions, 393 deletions
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 1feb68ecef95..842d00048a65 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -94,25 +94,21 @@ static struct dentry *get_next_positive_subdir(struct dentry *prev,
 {
        struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
        struct list_head *next;
-        struct dentry *p, *q;
+        struct dentry *q;
        spin_lock(&sbi->lookup_lock);
+        spin_lock(&root->d_lock);
-        if (prev == NULL) {
+        if (prev)
-                spin_lock(&root->d_lock);
+                next = prev->d_u.d_child.next;
+        else {
                prev = dget_dlock(root);
                next = prev->d_subdirs.next;
-                p = prev;
-                goto start;
        }
-        p = prev;
+cont:
-        spin_lock(&p->d_lock);
-again:
-        next = p->d_u.d_child.next;
-start:
        if (next == &root->d_subdirs) {
-                spin_unlock(&p->d_lock);
+                spin_unlock(&root->d_lock);
                spin_unlock(&sbi->lookup_lock);
                dput(prev);
                return NULL;
@@ -121,16 +117,15 @@ start:
        q = list_entry(next, struct dentry, d_u.d_child);
        spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
-        /* Negative dentry - try next */
+        /* Already gone or negative dentry (under construction) - try next */
-        if (!simple_positive(q)) {
+        if (q->d_count == 0 || !simple_positive(q)) {
-                spin_unlock(&p->d_lock);
+                spin_unlock(&q->d_lock);
-                lock_set_subclass(&q->d_lock.dep_map, 0, _RET_IP_);
+                next = q->d_u.d_child.next;
-                p = q;
+                goto cont;
-                goto again;
        }
        dget_dlock(q);
        spin_unlock(&q->d_lock);
-        spin_unlock(&p->d_lock);
+        spin_unlock(&root->d_lock);
        spin_unlock(&sbi->lookup_lock);
        dput(prev);
@@ -404,11 +399,6 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
                        DPRINTK("checking mountpoint %p %.*s",
                                dentry, (int)dentry->d_name.len, dentry->d_name.name);
-                        /* Path walk currently on this dentry? */
-                        ino_count = atomic_read(&ino->count) + 2;
-                        if (dentry->d_count > ino_count)
-                                goto next;
                        /* Can we umount this guy */
                        if (autofs4_mount_busy(mnt, dentry))
                                goto next;
diff --git a/fs/bio.c b/fs/bio.c
index 73922abba832..71072ab99128 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -73,7 +73,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 {
        unsigned int sz = sizeof(struct bio) + extra_size;
        struct kmem_cache *slab = NULL;
-        struct bio_slab *bslab;
+        struct bio_slab *bslab, *new_bio_slabs;
        unsigned int i, entry = -1;
        mutex_lock(&bio_slab_lock);
@@ -97,11 +97,12 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
        if (bio_slab_nr == bio_slab_max && entry == -1) {
                bio_slab_max <<= 1;
-                bio_slabs = krealloc(bio_slabs,
+                new_bio_slabs = krealloc(bio_slabs,
-                                     bio_slab_max * sizeof(struct bio_slab),
+                                         bio_slab_max * sizeof(struct bio_slab),
-                                     GFP_KERNEL);
+                                         GFP_KERNEL);
-                if (!bio_slabs)
+                if (!new_bio_slabs)
                        goto out_unlock;
+                bio_slabs = new_bio_slabs;
        }
        if (entry == -1)
                entry = bio_slab_nr++;
@@ -1312,7 +1313,7 @@ EXPORT_SYMBOL(bio_copy_kern);
 * Note that this code is very hard to test under normal circumstances because
 * direct-io pins the pages with get_user_pages().  This makes
 * is_page_cache_freeable return false, and the VM will not clean the pages.
- * But other code (eg, pdflush) could clean the pages if they are mapped
+ * But other code (eg, flusher threads) could clean the pages if they are mapped
 * pagecache.
 *
 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1e519195d45b..38e721b35d45 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1578,10 +1578,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
                         unsigned long nr_segs, loff_t pos)
 {
        struct file *file = iocb->ki_filp;
+        struct blk_plug plug;
        ssize_t ret;
        BUG_ON(iocb->ki_pos != pos);
+        blk_start_plug(&plug);
        ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
        if (ret > 0 || ret == -EIOCBQUEUED) {
                ssize_t err;
@@ -1590,6 +1592,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
                if (err < 0 && ret > 0)
                        ret = err;
        }
+        blk_finish_plug(&plug);
        return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_aio_write);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 83baec24946d..6e8f416773d4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -324,7 +324,8 @@ static noinline int add_async_extent(struct async_cow *cow,
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
- * are written in the same order that pdflush sent them down.
+ * are written in the same order that the flusher thread sent them
+ * down.
 */
 static noinline int compress_file_range(struct inode *inode,
                                        struct page *locked_page,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bc2f6ffff3cf..7bb755677a22 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -664,10 +664,6 @@ static noinline int btrfs_mksubvol(struct path *parent,
        struct dentry *dentry;
        int error;
-        error = mnt_want_write(parent->mnt);
-        if (error)
-                return error;
        mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
        dentry = lookup_one_len(name, parent->dentry, namelen);
@@ -703,7 +699,6 @@ out_dput:
        dput(dentry);
 out_unlock:
        mutex_unlock(&dir->i_mutex);
-        mnt_drop_write(parent->mnt);
        return error;
 }
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 643335a4fe3c..051c7fe551dd 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -596,7 +596,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
        /*
         * pages in the range can be dirty, clean or writeback.  We
         * start IO on any dirty ones so the wait doesn't stall waiting
-         * for pdflush to find them
+         * for the flusher thread to find them
         */
        if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
                filemap_fdatawrite_range(inode->i_mapping, start, end);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8c6e61d6eed5..f2eb24c477a3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -100,10 +100,6 @@ static void __save_error_info(struct btrfs_fs_info *fs_info)
        fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
 }
-/* NOTE:
- *      We move write_super stuff at umount in order to avoid deadlock
- *      for umount hold all lock.
- */
 static void save_error_info(struct btrfs_fs_info *fs_info)
 {
        __save_error_info(fs_info);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b8708f994e67..e86ae04abe6a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1744,10 +1744,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
        device->fs_devices = root->fs_info->fs_devices;
-        /*
-         * we don't want write_supers to jump in here with our device
-         * half setup
-         */
        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
        list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
        list_add(&device->dev_alloc_list,
diff --git a/fs/buffer.c b/fs/buffer.c
index 9f6d2e41281d..58e2e7b77372 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -914,7 +914,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
 /*
 * Initialise the state of a blockdev page's buffers.
 */ 
-static void
+static sector_t
 init_page_buffers(struct page *page, struct block_device *bdev,
                        sector_t block, int size)
 {
@@ -936,33 +936,41 @@ init_page_buffers(struct page *page, struct block_device *bdev,
                block++;
                bh = bh->b_this_page;
        } while (bh != head);
+        /*
+         * Caller needs to validate requested block against end of device.
+         */
+        return end_block;
 }
 /*
 * Create the page-cache page that contains the requested block.
 *
- * This is user purely for blockdev mappings.
+ * This is used purely for blockdev mappings.
 */
-static struct page *
+static int
 grow_dev_page(struct block_device *bdev, sector_t block,
-                pgoff_t index, int size)
+                pgoff_t index, int size, int sizebits)
 {
        struct inode *inode = bdev->bd_inode;
        struct page *page;
        struct buffer_head *bh;
+        sector_t end_block;
+        int ret = 0;            /* Will call free_more_memory() */
        page = find_or_create_page(inode->i_mapping, index,
                (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
        if (!page)
-                return NULL;
+                return ret;
        BUG_ON(!PageLocked(page));
        if (page_has_buffers(page)) {
                bh = page_buffers(page);
                if (bh->b_size == size) {
-                        init_page_buffers(page, bdev, block, size);
+                        end_block = init_page_buffers(page, bdev,
-                        return page;
+                                                index << sizebits, size);
+                        goto done;
                }
                if (!try_to_free_buffers(page))
                        goto failed;
@@ -982,14 +990,14 @@ grow_dev_page(struct block_device *bdev, sector_t block,
         */
        spin_lock(&inode->i_mapping->private_lock);
        link_dev_buffers(page, bh);
-        init_page_buffers(page, bdev, block, size);
+        end_block = init_page_buffers(page, bdev, index << sizebits, size);
        spin_unlock(&inode->i_mapping->private_lock);
-        return page;
+done:
+        ret = (block < end_block) ? 1 : -ENXIO;
 failed:
        unlock_page(page);
        page_cache_release(page);
-        return NULL;
+        return ret;
 }
 /*
@@ -999,7 +1007,6 @@ failed:
 static int
 grow_buffers(struct block_device *bdev, sector_t block, int size)
 {
-        struct page *page;
        pgoff_t index;
        int sizebits;
@@ -1023,22 +1030,14 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
                        bdevname(bdev, b));
                return -EIO;
        }
-        block = index << sizebits;
        /* Create a page with the proper size buffers.. */
-        page = grow_dev_page(bdev, block, index, size);
+        return grow_dev_page(bdev, block, index, size, sizebits);
-        if (!page)
-                return 0;
-        unlock_page(page);
-        page_cache_release(page);
-        return 1;
 }
 static struct buffer_head *
 __getblk_slow(struct block_device *bdev, sector_t block, int size)
 {
-        int ret;
-        struct buffer_head *bh;
        /* Size must be multiple of hard sectorsize */
        if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
                        (size < 512 || size > PAGE_SIZE))) {
@@ -1051,21 +1050,20 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
                return NULL;
        }
-retry:
+        for (;;) {
-        bh = __find_get_block(bdev, block, size);
+                struct buffer_head *bh;
-        if (bh)
+                int ret;
-                return bh;
-        ret = grow_buffers(bdev, block, size);
-        if (ret == 0) {
-                free_more_memory();
-                goto retry;
-        } else if (ret > 0) {
                bh = __find_get_block(bdev, block, size);
                if (bh)
                        return bh;
+                ret = grow_buffers(bdev, block, size);
+                if (ret < 0)
+                        return NULL;
+                if (ret == 0)
+                        free_more_memory();
        }
-        return NULL;
 }
 /*
@@ -1321,10 +1319,6 @@ EXPORT_SYMBOL(__find_get_block);
 * which corresponds to the passed block_device, block and size. The
 * returned buffer has its reference count incremented.
 *
- * __getblk() cannot fail - it just keeps trying.  If you pass it an
- * illegal block number, __getblk() will happily return a buffer_head
- * which represents the non-existent block.  Very weird.
- *
 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
 * attempt is failing.  FIXME, perhaps?
 */
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fb962efdacee..6d59006bfa27 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -201,6 +201,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
        int err = -ENOMEM;
        dout("ceph_fs_debugfs_init\n");
+        BUG_ON(!fsc->client->debugfs_dir);
        fsc->debugfs_congestion_kb =
                debugfs_create_file("writeback_congestion_kb",
                                    0600,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 9fff9f3b17e4..4b5762ef7c2b 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -992,11 +992,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
        if (rinfo->head->is_dentry) {
                struct inode *dir = req->r_locked_dir;
-                err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
+                if (dir) {
-                                 session, req->r_request_started, -1,
+                        err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
-                                 &req->r_caps_reservation);
+                                         session, req->r_request_started, -1,
-                if (err < 0)
+                                         &req->r_caps_reservation);
-                        return err;
+                        if (err < 0)
+                                return err;
+                } else {
+                        WARN_ON_ONCE(1);
+                }
        }
        /*
@@ -1004,6 +1008,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
         * will have trouble splicing in the virtual snapdir later
         */
        if (rinfo->head->is_dentry && !req->r_aborted &&
+            req->r_locked_dir &&
            (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
                                               fsc->mount_options->snapdir_name,
                                               req->r_dentry->d_name.len))) {
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8e3fb69fbe62..1396ceb46797 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -42,7 +42,8 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
        /* validate striping parameters */
        if ((l->object_size & ~PAGE_MASK) ||
            (l->stripe_unit & ~PAGE_MASK) ||
-            ((unsigned)l->object_size % (unsigned)l->stripe_unit))
+            (l->stripe_unit != 0 &&
+             ((unsigned)l->object_size % (unsigned)l->stripe_unit)))
                return -EINVAL;
        /* make sure it's a valid data pool */
diff --git a/fs/compat.c b/fs/compat.c
index 6161255fac45..1bdb350ea5d3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1155,11 +1155,14 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
        struct file *file;
        int fput_needed;
        ssize_t ret;
+        loff_t pos;
        file = fget_light(fd, &fput_needed);
        if (!file)
                return -EBADF;
-        ret = compat_readv(file, vec, vlen, &file->f_pos);
+        pos = file->f_pos;
+        ret = compat_readv(file, vec, vlen, &pos);
+        file->f_pos = pos;
        fput_light(file, fput_needed);
        return ret;
 }
@@ -1221,11 +1224,14 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
        struct file *file;
        int fput_needed;
        ssize_t ret;
+        loff_t pos;
        file = fget_light(fd, &fput_needed);
        if (!file)
                return -EBADF;
-        ret = compat_writev(file, vec, vlen, &file->f_pos);
+        pos = file->f_pos;
+        ret = compat_writev(file, vec, vlen, &pos);
+        file->f_pos = pos;
        fput_light(file, fput_needed);
        return ret;
 }
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1faf4cb56f39..f86c720dba0e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1062,6 +1062,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        unsigned long user_addr;
        size_t bytes;
        struct buffer_head map_bh = { 0, };
+        struct blk_plug plug;
        if (rw & WRITE)
                rw = WRITE_ODIRECT;
@@ -1177,6 +1178,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
                                PAGE_SIZE - user_addr / PAGE_SIZE);
        }
+        blk_start_plug(&plug);
        for (seg = 0; seg < nr_segs; seg++) {
                user_addr = (unsigned long)iov[seg].iov_base;
                sdio.size += bytes = iov[seg].iov_len;
@@ -1235,6 +1238,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        if (sdio.bio)
                dio_bio_submit(dio, &sdio);
+        blk_finish_plug(&plug);
        /*
         * It is possible that, we return short IO due to end of file.
         * In that case, we need to release all the pages we got hold on.
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1c8b55670804..eedec84c1809 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1654,8 +1654,8 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
                error = PTR_ERR(file);
                goto out_free_fd;
        }
-        fd_install(fd, file);
        ep->file = file;
+        fd_install(fd, file);
        return fd;
 out_free_fd:
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 5badb0c039de..1562c27a2fab 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,15 +37,12 @@
 #define EXOFS_DBGMSG2(M...) do {} while (0)
-enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), };
 unsigned exofs_max_io_pages(struct ore_layout *layout,
                            unsigned expected_pages)
 {
-        unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
+        unsigned pages = min_t(unsigned, expected_pages,
+                               layout->max_io_length / PAGE_SIZE);
-        /* TODO: easily support bio chaining */
-        pages =  min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE);
        return pages;
 }
@@ -101,7 +98,8 @@ static void _pcol_reset(struct page_collect *pcol)
         * it might not end here. don't be left with nothing
         */
        if (!pcol->expected_pages)
-                pcol->expected_pages = MAX_PAGES_KMALLOC;
+                pcol->expected_pages =
+                                exofs_max_io_pages(&pcol->sbi->layout, ~0);
 }
 static int pcol_try_alloc(struct page_collect *pcol)
@@ -389,6 +387,8 @@ static int readpage_strip(void *data, struct page *page)
        size_t len;
        int ret;
+        BUG_ON(!PageLocked(page));
        /* FIXME: Just for debugging, will be removed */
        if (PageUptodate(page))
                EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
@@ -572,8 +572,16 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
        if (!pcol->that_locked_page ||
            (pcol->that_locked_page->index != index)) {
-                struct page *page = find_get_page(pcol->inode->i_mapping, index);
+                struct page *page;
+                loff_t i_size = i_size_read(pcol->inode);
+                if (offset >= i_size) {
+                        *uptodate = true;
+                        EXOFS_DBGMSG("offset >= i_size index=0x%lx\n", index);
+                        return ZERO_PAGE(0);
+                }
+                page =  find_get_page(pcol->inode->i_mapping, index);
                if (!page) {
                        page = find_or_create_page(pcol->inode->i_mapping,
                                                   index, GFP_NOFS);
@@ -602,12 +610,13 @@ static void __r4w_put_page(void *priv, struct page *page)
 {
        struct page_collect *pcol = priv;
-        if (pcol->that_locked_page != page) {
+        if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
                EXOFS_DBGMSG("index=0x%lx\n", page->index);
                page_cache_release(page);
                return;
        }
-        EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index);
+        EXOFS_DBGMSG("that_locked_page index=0x%lx\n",
+                     ZERO_PAGE(0) == page ? -1 : page->index);
 }
 static const struct _ore_r4w_op _r4w_op = {
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 24a49d47e935..1585db1aa365 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -837,11 +837,11 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
                                bio->bi_rw |= REQ_WRITE;
                        }
-                        osd_req_write(or, _ios_obj(ios, dev), per_dev->offset,
+                        osd_req_write(or, _ios_obj(ios, cur_comp),
-                                      bio, per_dev->length);
+                                      per_dev->offset, bio, per_dev->length);
                        ORE_DBGMSG("write(0x%llx) offset=0x%llx "
                                      "length=0x%llx dev=%d\n",
-                                     _LLU(_ios_obj(ios, dev)->id),
+                                     _LLU(_ios_obj(ios, cur_comp)->id),
                                     _LLU(per_dev->offset),
                                     _LLU(per_dev->length), dev);
                } else if (ios->kern_buff) {
@@ -853,20 +853,20 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
                               (ios->si.unit_off + ios->length >
                                ios->layout->stripe_unit));
-                        ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev),
+                        ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp),
                                                 per_dev->offset,
                                                 ios->kern_buff, ios->length);
                        if (unlikely(ret))
                                goto out;
                        ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
                                      "length=0x%llx dev=%d\n",
-                                     _LLU(_ios_obj(ios, dev)->id),
+                                     _LLU(_ios_obj(ios, cur_comp)->id),
                                     _LLU(per_dev->offset),
                                     _LLU(ios->length), per_dev->dev);
                } else {
-                        osd_req_set_attributes(or, _ios_obj(ios, dev));
+                        osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
                        ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
-                                     _LLU(_ios_obj(ios, dev)->id),
+                                     _LLU(_ios_obj(ios, cur_comp)->id),
                                     ios->out_attr_len, dev);
                }
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 433783624d10..dde41a75c7c8 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -400,8 +400,6 @@ static int exofs_sync_fs(struct super_block *sb, int wait)
        ret = ore_write(ios);
        if (unlikely(ret))
                EXOFS_ERR("%s: ore_write failed.\n", __func__);
-        else
-                sb->s_dirt = 0;
        unlock_super(sb);
@@ -412,14 +410,6 @@ out:
        return ret;
 }
-static void exofs_write_super(struct super_block *sb)
-{
-        if (!(sb->s_flags & MS_RDONLY))
-                exofs_sync_fs(sb, 1);
-        else
-                sb->s_dirt = 0;
-}
 static void _exofs_print_device(const char *msg, const char *dev_path,
                                struct osd_dev *od, u64 pid)
 {
@@ -952,7 +942,6 @@ static const struct super_operations exofs_sops = {
        .write_inode    = exofs_write_inode,
        .evict_inode    = exofs_evict_inode,
        .put_super      = exofs_put_super,
-        .write_super    = exofs_write_super,
        .sync_fs        = exofs_sync_fs,
        .statfs         = exofs_statfs,
 };
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 9a4a5c48b1c9..a07597307fd1 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3459,14 +3459,6 @@ ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
 * inode out, but prune_icache isn't a user-visible syncing function.
 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
 * we start and wait on commits.
- *
- * Is this efficient/effective?  Well, we're being nice to the system
- * by cleaning up our inodes proactively so they can be reaped
- * without I/O.  But we are potentially leaving up to five seconds'
- * worth of inodes floating about which prune_icache wants us to
- * write out.  One way to fix that would be to get prune_icache()
- * to do a write_super() to free up some memory.  It has the desired
- * effect.
 */
 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
 {
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index ff9bcdc5b0d5..8c892e93d8e7 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -64,11 +64,6 @@ static int ext3_freeze(struct super_block *sb);
 /*
 * Wrappers for journal_start/end.
- *
- * The only special thing we need to do here is to make sure that all
- * journal_end calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
 */
 handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
 {
@@ -90,12 +85,6 @@ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
        return journal_start(journal, nblocks);
 }
-/*
- * The only special thing we need to do here is to make sure that all
- * journal_stop calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
- */
 int __ext3_journal_stop(const char *where, handle_t *handle)
 {
        struct super_block *sb;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d23b31ca9d7a..1b5089067d01 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -280,14 +280,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
        return desc;
 }
-static int ext4_valid_block_bitmap(struct super_block *sb,
+/*
-                                   struct ext4_group_desc *desc,
+ * Return the block number which was discovered to be invalid, or 0 if
-                                   unsigned int block_group,
+ * the block bitmap is valid.
-                                   struct buffer_head *bh)
+ */
+static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
+                                            struct ext4_group_desc *desc,
+                                            unsigned int block_group,
+                                            struct buffer_head *bh)
 {
        ext4_grpblk_t offset;
        ext4_grpblk_t next_zero_bit;
-        ext4_fsblk_t bitmap_blk;
+        ext4_fsblk_t blk;
        ext4_fsblk_t group_first_block;
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
@@ -297,37 +301,33 @@ static int ext4_valid_block_bitmap(struct super_block *sb,
                 * or it has to also read the block group where the bitmaps
                 * are located to verify they are set.
                 */
-                return 1;
+                return 0;
        }
        group_first_block = ext4_group_first_block_no(sb, block_group);
        /* check whether block bitmap block number is set */
-        bitmap_blk = ext4_block_bitmap(sb, desc);
+        blk = ext4_block_bitmap(sb, desc);
-        offset = bitmap_blk - group_first_block;
+        offset = blk - group_first_block;
        if (!ext4_test_bit(offset, bh->b_data))
                /* bad block bitmap */
-                goto err_out;
+                return blk;
        /* check whether the inode bitmap block number is set */
-        bitmap_blk = ext4_inode_bitmap(sb, desc);
+        blk = ext4_inode_bitmap(sb, desc);
-        offset = bitmap_blk - group_first_block;
+        offset = blk - group_first_block;
        if (!ext4_test_bit(offset, bh->b_data))
                /* bad block bitmap */
-                goto err_out;
+                return blk;
        /* check whether the inode table block number is set */
-        bitmap_blk = ext4_inode_table(sb, desc);
+        blk = ext4_inode_table(sb, desc);
-        offset = bitmap_blk - group_first_block;
+        offset = blk - group_first_block;
        next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
                                offset + EXT4_SB(sb)->s_itb_per_group,
                                offset);
-        if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group)
+        if (next_zero_bit < offset + EXT4_SB(sb)->s_itb_per_group)
-                /* good bitmap for inode tables */
+                /* bad bitmap for inode tables */
-                return 1;
+                return blk;
-err_out:
-        ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
-                        block_group, bitmap_blk);
        return 0;
 }
@@ -336,14 +336,26 @@ void ext4_validate_block_bitmap(struct super_block *sb,
                               unsigned int block_group,
                               struct buffer_head *bh)
 {
+        ext4_fsblk_t    blk;
        if (buffer_verified(bh))
                return;
        ext4_lock_group(sb, block_group);
-        if (ext4_valid_block_bitmap(sb, desc, block_group, bh) &&
+        blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
-            ext4_block_bitmap_csum_verify(sb, block_group, desc, bh,
+        if (unlikely(blk != 0)) {
-                                          EXT4_BLOCKS_PER_GROUP(sb) / 8))
+                ext4_unlock_group(sb, block_group);
-                set_buffer_verified(bh);
+                ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
+                           block_group, blk);
+                return;
+        }
+        if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
+                        desc, bh, EXT4_BLOCKS_PER_GROUP(sb) / 8))) {
+                ext4_unlock_group(sb, block_group);
+                ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
+                return;
+        }
+        set_buffer_verified(bh);
        ext4_unlock_group(sb, block_group);
 }
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index f8716eab9995..5c2d1813ebe9 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -79,7 +79,6 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
        if (provided == calculated)
                return 1;
-        ext4_error(sb, "Bad block bitmap checksum: block_group = %u", group);
        return 0;
 }
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index cd0c7ed06772..aabbb3f53683 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2662,6 +2662,7 @@ cont:
                }
                path[0].p_depth = depth;
                path[0].p_hdr = ext_inode_hdr(inode);
+                i = 0;
                if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
                        err = -EIO;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6324f74e0342..dff171c3a123 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1970,7 +1970,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 * This function can get called via...
 *   - ext4_da_writepages after taking page lock (have journal handle)
 *   - journal_submit_inode_data_buffers (no journal handle)
- *   - shrink_page_list via pdflush (no journal handle)
+ *   - shrink_page_list via the kswapd/direct reclaim (no journal handle)
 *   - grab_page_cache when doing write_begin (have journal handle)
 *
 * We don't do any block allocation in this function. If we have page with
@@ -4589,14 +4589,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
 * inode out, but prune_icache isn't a user-visible syncing function.
 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
 * we start and wait on commits.
- *
- * Is this efficient/effective?  Well, we're being nice to the system
- * by cleaning up our inodes proactively so they can be reaped
- * without I/O.  But we are potentially leaving up to five seconds'
- * worth of inodes floating about which prune_icache wants us to
- * write out.  One way to fix that would be to get prune_icache()
- * to do a write_super() to free up some memory.  It has the desired
- * effect.
 */
 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d76ec8277d3f..c6e0cb3d1f4a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -326,11 +326,6 @@ static void ext4_put_nojournal(handle_t *handle)
 /*
 * Wrappers for jbd2_journal_start/end.
- *
- * The only special thing we need to do here is to make sure that all
- * journal_end calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
 */
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 {
@@ -356,12 +351,6 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
        return jbd2_journal_start(journal, nblocks);
 }
-/*
- * The only special thing we need to do here is to make sure that all
- * jbd2_journal_stop calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
- */
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
 {
        struct super_block *sb;
@@ -959,6 +948,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        ei->i_reserved_meta_blocks = 0;
        ei->i_allocated_meta_blocks = 0;
        ei->i_da_metadata_calc_len = 0;
+        ei->i_da_metadata_calc_last_lblock = 0;
        spin_lock_init(&(ei->i_block_reservation_lock));
 #ifdef CONFIG_QUOTA
        ei->i_reserved_quota = 0;
@@ -3119,6 +3109,10 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
        ext4_group_t            i, ngroups = ext4_get_groups_count(sb);
        int                     s, j, count = 0;
+        if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+                return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
+                        sbi->s_itb_per_group + 2);
        first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
                (grp * EXT4_BLOCKS_PER_GROUP(sb));
        last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
@@ -4430,6 +4424,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
                ext4_commit_super(sb, 1);
                jbd2_journal_clear_err(journal);
+                jbd2_journal_update_sb_errno(journal);
        }
 }
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8964cf3999b2..324bc0850534 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -383,6 +383,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
        struct fuse_entry_out outentry;
        struct fuse_file *ff;
+        /* Userspace expects S_IFREG in create mode */
+        BUG_ON((mode & S_IFMT) != S_IFREG);
        forget = fuse_alloc_forget();
        err = -ENOMEM;
        if (!forget)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 93d8d6c9494d..aba15f1b7ad2 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -703,13 +703,16 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                                  unsigned long nr_segs, loff_t pos)
 {
        struct inode *inode = iocb->ki_filp->f_mapping->host;
+        struct fuse_conn *fc = get_fuse_conn(inode);
-        if (pos + iov_length(iov, nr_segs) > i_size_read(inode)) {
+        /*
+         * In auto invalidate mode, always update attributes on read.
+         * Otherwise, only update if we attempt to read past EOF (to ensure
+         * i_size is up to date).
+         */
+        if (fc->auto_inval_data ||
+            (pos + iov_length(iov, nr_segs) > i_size_read(inode))) {
                int err;
-                /*
-                 * If trying to read past EOF, make sure the i_size
-                 * attribute is up-to-date.
-                 */
                err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
                if (err)
                        return err;
@@ -1700,7 +1703,7 @@ static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
        size_t n;
        u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
-        for (n = 0; n < count; n++) {
+        for (n = 0; n < count; n++, iov++) {
                if (iov->iov_len > (size_t) max)
                        return -ENOMEM;
                max -= iov->iov_len;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 771fb6322c07..e24dd74e3068 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -484,6 +484,9 @@ struct fuse_conn {
        /** Is fallocate not implemented by fs? */
        unsigned no_fallocate:1;
+        /** Use enhanced/automatic page cache invalidation. */
+        unsigned auto_inval_data:1;
        /** The number of requests waiting for completion */
        atomic_t num_waiting;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1cd61652018c..ce0a2838ccd0 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -197,6 +197,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
        struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_inode *fi = get_fuse_inode(inode);
        loff_t oldsize;
+        struct timespec old_mtime;
        spin_lock(&fc->lock);
        if (attr_version != 0 && fi->attr_version > attr_version) {
@@ -204,15 +205,35 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
                return;
        }
+        old_mtime = inode->i_mtime;
        fuse_change_attributes_common(inode, attr, attr_valid);
        oldsize = inode->i_size;
        i_size_write(inode, attr->size);
        spin_unlock(&fc->lock);
-        if (S_ISREG(inode->i_mode) && oldsize != attr->size) {
+        if (S_ISREG(inode->i_mode)) {
-                truncate_pagecache(inode, oldsize, attr->size);
+                bool inval = false;
-                invalidate_inode_pages2(inode->i_mapping);
+                if (oldsize != attr->size) {
+                        truncate_pagecache(inode, oldsize, attr->size);
+                        inval = true;
+                } else if (fc->auto_inval_data) {
+                        struct timespec new_mtime = {
+                                .tv_sec = attr->mtime,
+                                .tv_nsec = attr->mtimensec,
+                        };
+                        /*
+                         * Auto inval mode also checks and invalidates if mtime
+                         * has changed.
+                         */
+                        if (!timespec_equal(&old_mtime, &new_mtime))
+                                inval = true;
+                }
+                if (inval)
+                        invalidate_inode_pages2(inode->i_mapping);
        }
 }
@@ -834,6 +855,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
                                fc->big_writes = 1;
                        if (arg->flags & FUSE_DONT_MASK)
                                fc->dont_mask = 1;
+                        if (arg->flags & FUSE_AUTO_INVAL_DATA)
+                                fc->auto_inval_data = 1;
                } else {
                        ra_pages = fc->max_read / PAGE_CACHE_SIZE;
                        fc->no_lock = 1;
@@ -859,7 +882,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
        arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
        arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
                FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
-                FUSE_FLOCK_LOCKS;
+                FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
+                FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA;
        req->in.h.opcode = FUSE_INIT;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(*arg);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 3a56c8d94de0..22255d96b27e 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -52,7 +52,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
                /*
                 * If it's a fully non-blocking write attempt and we cannot
                 * lock the buffer then redirty the page.  Note that this can
-                 * potentially cause a busy-wait loop from pdflush and kswapd
+                 * potentially cause a busy-wait loop from flusher thread and kswapd
                 * activity, but those code paths have their own higher-level
                 * throttling.
                 */
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 5fd51a5833ff..b7ec224910c5 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -236,10 +236,10 @@ out:
 * hfs_mdb_commit()
 *
 * Description:
- *   This updates the MDB on disk (look also at hfs_write_super()).
+ *   This updates the MDB on disk.
 *   It does not check, if the superblock has been modified, or
 *   if the filesystem has been mounted read-only. It is mainly
- *   called by hfs_write_super() and hfs_btree_extend().
+ *   called by hfs_sync_fs() and flush_mdb().
 * Input Variable(s):
 *   struct hfs_mdb *mdb: Pointer to the hfs MDB
 *   int backup;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 425c2f2cf170..a2862339323b 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -534,8 +534,8 @@ int journal_start_commit(journal_t *journal, tid_t *ptid)
                ret = 1;
        } else if (journal->j_committing_transaction) {
                /*
-                 * If ext3_write_super() recently started a commit, then we
+                 * If commit has been started, then we have to wait for
-                 * have to wait for completion of that transaction
+                 * completion of that transaction.
                 */
                if (ptid)
                        *ptid = journal->j_committing_transaction->t_tid;
@@ -1113,6 +1113,11 @@ static void mark_journal_empty(journal_t *journal)
        BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
        spin_lock(&journal->j_state_lock);
+        /* Is it already empty? */
+        if (sb->s_start == 0) {
+                spin_unlock(&journal->j_state_lock);
+                return;
+        }
        jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n",
                  journal->j_tail_sequence);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e9a3c4c85594..e149b99a7ffb 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -612,8 +612,8 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
                ret = 1;
        } else if (journal->j_committing_transaction) {
                /*
-                 * If ext3_write_super() recently started a commit, then we
+                 * If commit has been started, then we have to wait for
-                 * have to wait for completion of that transaction
+                 * completion of that transaction.
                 */
                if (ptid)
                        *ptid = journal->j_committing_transaction->t_tid;
@@ -1377,7 +1377,7 @@ static void jbd2_mark_journal_empty(journal_t *journal)
 * Update a journal's errno.  Write updated superblock to disk waiting for IO
 * to complete.
 */
-static void jbd2_journal_update_sb_errno(journal_t *journal)
+void jbd2_journal_update_sb_errno(journal_t *journal)
 {
        journal_superblock_t *sb = journal->j_superblock;
@@ -1390,6 +1390,7 @@ static void jbd2_journal_update_sb_errno(journal_t *journal)
        jbd2_write_superblock(journal, WRITE_SYNC);
 }
+EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
 /*
 * Read the superblock for a given journal, performing initial
diff --git a/fs/namei.c b/fs/namei.c
index 1b464390dde8..dd1ed1b8e98e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -352,6 +352,7 @@ int __inode_permission(struct inode *inode, int mask)
 /**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
+ * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
@@ -656,6 +657,7 @@ int sysctl_protected_hardlinks __read_mostly = 1;
 /**
 * may_follow_link - Check symlink following for unsafe situations
 * @link: The path of the symlink
+ * @nd: nameidata pathwalk data
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
@@ -2414,7 +2416,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
                goto out;
        }
-        mode = op->mode & S_IALLUGO;
+        mode = op->mode;
        if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
                mode &= ~current_umask();
@@ -2452,7 +2454,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
        }
        if (open_flag & O_CREAT) {
-                error = may_o_create(&nd->path, dentry, op->mode);
+                error = may_o_create(&nd->path, dentry, mode);
                if (error) {
                        create_error = error;
                        if (open_flag & O_EXCL)
@@ -2489,6 +2491,10 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
                        dput(dentry);
                        dentry = file->f_path.dentry;
                }
+                if (create_error && dentry->d_inode == NULL) {
+                        error = create_error;
+                        goto out;
+                }
                goto looked_up;
        }
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 8bf3a3f6925a..b7db60897f91 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -12,19 +12,19 @@ nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o
 nfs-$(CONFIG_SYSCTL)    += sysctl.o
 nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
-obj-$(CONFIG_NFS_V2) += nfs2.o
+obj-$(CONFIG_NFS_V2) += nfsv2.o
-nfs2-y := nfs2super.o proc.o nfs2xdr.o
+nfsv2-y := nfs2super.o proc.o nfs2xdr.o
-obj-$(CONFIG_NFS_V3) += nfs3.o
+obj-$(CONFIG_NFS_V3) += nfsv3.o
-nfs3-y := nfs3super.o nfs3client.o nfs3proc.o nfs3xdr.o
+nfsv3-y := nfs3super.o nfs3client.o nfs3proc.o nfs3xdr.o
-nfs3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
+nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
-obj-$(CONFIG_NFS_V4) += nfs4.o
+obj-$(CONFIG_NFS_V4) += nfsv4.o
-nfs4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
+nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \
          delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \
          nfs4namespace.o nfs4getroot.o nfs4client.o
-nfs4-$(CONFIG_SYSCTL)   += nfs4sysctl.o
+nfsv4-$(CONFIG_SYSCTL)  += nfs4sysctl.o
-nfs4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o
+nfsv4-$(CONFIG_NFS_V4_1)        += pnfs.o pnfs_dev.o
 obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
 nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 9fc0d9dfc91b..99694442b93f 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -105,7 +105,7 @@ struct nfs_subversion *get_nfs_version(unsigned int version)
        if (IS_ERR(nfs)) {
                mutex_lock(&nfs_version_mutex);
-                request_module("nfs%d", version);
+                request_module("nfsv%d", version);
                nfs = find_nfs_version(version);
                mutex_unlock(&nfs_version_mutex);
        }
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index b701358c39c3..a850079467d8 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -61,6 +61,12 @@ struct idmap {
        struct mutex            idmap_mutex;
 };
+struct idmap_legacy_upcalldata {
+        struct rpc_pipe_msg pipe_msg;
+        struct idmap_msg idmap_msg;
+        struct idmap *idmap;
+};
 /**
 * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
 * @fattr: fully initialised struct nfs_fattr
@@ -324,6 +330,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
                ret = nfs_idmap_request_key(&key_type_id_resolver_legacy,
                                            name, namelen, type, data,
                                            data_size, idmap);
+                idmap->idmap_key_cons = NULL;
                mutex_unlock(&idmap->idmap_mutex);
        }
        return ret;
@@ -380,11 +387,13 @@ static const match_table_t nfs_idmap_tokens = {
 static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *);
 static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
                                   size_t);
+static void idmap_release_pipe(struct inode *);
 static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
 static const struct rpc_pipe_ops idmap_upcall_ops = {
        .upcall         = rpc_pipe_generic_upcall,
        .downcall       = idmap_pipe_downcall,
+        .release_pipe   = idmap_release_pipe,
        .destroy_msg    = idmap_pipe_destroy_msg,
 };
@@ -616,7 +625,8 @@ void nfs_idmap_quit(void)
        nfs_idmap_quit_keyring();
 }
-static int nfs_idmap_prepare_message(char *desc, struct idmap_msg *im,
+static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
+                                     struct idmap_msg *im,
                                     struct rpc_pipe_msg *msg)
 {
        substring_t substr;
@@ -659,6 +669,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
                                   const char *op,
                                   void *aux)
 {
+        struct idmap_legacy_upcalldata *data;
        struct rpc_pipe_msg *msg;
        struct idmap_msg *im;
        struct idmap *idmap = (struct idmap *)aux;
@@ -666,15 +677,15 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
        int ret = -ENOMEM;
        /* msg and im are freed in idmap_pipe_destroy_msg */
-        msg = kmalloc(sizeof(*msg), GFP_KERNEL);
+        data = kmalloc(sizeof(*data), GFP_KERNEL);
-        if (!msg)
+        if (!data)
-                goto out0;
-        im = kmalloc(sizeof(*im), GFP_KERNEL);
-        if (!im)
                goto out1;
-        ret = nfs_idmap_prepare_message(key->description, im, msg);
+        msg = &data->pipe_msg;
+        im = &data->idmap_msg;
+        data->idmap = idmap;
+        ret = nfs_idmap_prepare_message(key->description, idmap, im, msg);
        if (ret < 0)
                goto out2;
@@ -683,15 +694,15 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
        ret = rpc_queue_upcall(idmap->idmap_pipe, msg);
        if (ret < 0)
-                goto out2;
+                goto out3;
        return ret;
+out3:
+        idmap->idmap_key_cons = NULL;
 out2:
-        kfree(im);
+        kfree(data);
 out1:
-        kfree(msg);
-out0:
        complete_request_key(cons, ret);
        return ret;
 }
@@ -749,9 +760,8 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
        }
        if (!(im.im_status & IDMAP_STATUS_SUCCESS)) {
-                ret = mlen;
+                ret = -ENOKEY;
-                complete_request_key(cons, -ENOKEY);
+                goto out;
-                goto out_incomplete;
        }
        namelen_in = strnlen(im.im_name, IDMAP_NAMESZ);
@@ -768,16 +778,32 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 out:
        complete_request_key(cons, ret);
-out_incomplete:
        return ret;
 }
 static void
 idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)
 {
+        struct idmap_legacy_upcalldata *data = container_of(msg,
+                        struct idmap_legacy_upcalldata,
+                        pipe_msg);
+        struct idmap *idmap = data->idmap;
+        struct key_construction *cons;
+        /*
+         * A non-zero msg->errno means the upcall was not answered through
+         * the downcall path, so the pending key construction (if one is
+         * still recorded) is completed here with that error.
+         */
+        if (msg->errno) {
+                cons = ACCESS_ONCE(idmap->idmap_key_cons);
+                idmap->idmap_key_cons = NULL;
+                /*
+                 * idmap_key_cons is reset to NULL on other paths (pipe
+                 * release, upcall completion), so there may be nothing
+                 * left to complete.
+                 */
+                if (cons)
+                        complete_request_key(cons, msg->errno);
+        }
         /* Free memory allocated in nfs_idmap_legacy_upcall() */
-        kfree(msg->data);
+        kfree(data);
-        kfree(msg);
+}
+/*
+ * ->release_pipe callback: forget any key construction recorded on the
+ * idmap that is stored as the rpc inode's private data.
+ */
+static void
+idmap_release_pipe(struct inode *inode)
+{
+        struct idmap *idmap = (struct idmap *)RPC_I(inode)->private;
+        idmap->idmap_key_cons = NULL;
 }
 int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 0952c791df36..d6b3b5f2d779 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -69,7 +69,7 @@ do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle,
        nfs_fattr_init(info->fattr);
        status = rpc_call_sync(client, &msg, 0);
        dprintk("%s: reply fsinfo: %d\n", __func__, status);
-        if (!(info->fattr->valid & NFS_ATTR_FATTR)) {
+        if (status == 0 && !(info->fattr->valid & NFS_ATTR_FATTR)) {
                msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
                msg.rpc_resp = info->fattr;
                status = rpc_call_sync(client, &msg, 0);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 3b950dd81e81..da0618aeeadb 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -205,6 +205,9 @@ extern const struct dentry_operations nfs4_dentry_operations;
 int nfs_atomic_open(struct inode *, struct dentry *, struct file *,
                    unsigned, umode_t, int *);
+/* super.c */
+extern struct file_system_type nfs4_fs_type;
 /* nfs4namespace.c */
 rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *);
 struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index cbcdfaf32505..24eb663f8ed5 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -74,7 +74,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
        return clp;
 error:
-        kfree(clp);
+        nfs_free_client(clp);
        return ERR_PTR(err);
 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a99a8d948721..635274140b18 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3737,9 +3737,10 @@ out:
 static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size_t pgbase, size_t acl_len)
 {
        struct nfs4_cached_acl *acl;
+        size_t buflen = sizeof(*acl) + acl_len;
-        if (pages && acl_len <= PAGE_SIZE) {
+        if (pages && buflen <= PAGE_SIZE) {
-                acl = kmalloc(sizeof(*acl) + acl_len, GFP_KERNEL);
+                acl = kmalloc(buflen, GFP_KERNEL);
                if (acl == NULL)
                        goto out;
                acl->cached = 1;
@@ -3819,7 +3820,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
        if (ret)
                goto out_free;
-        acl_len = res.acl_len - res.acl_data_offset;
+        acl_len = res.acl_len;
        if (acl_len > args.acl_len)
                nfs4_write_cached_acl(inode, NULL, 0, acl_len);
        else
@@ -6223,11 +6224,58 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
        dprintk("<-- %s\n", __func__);
 }
+/*
+ * Number of pages nfs_page_array_len() reports for a buffer of
+ * fc_attrs.max_resp_sz bytes (taken from the client's session) that
+ * starts at offset 0.
+ */
+static size_t max_response_pages(struct nfs_server *server)
+{
+        u32 resp_bytes;
+        resp_bytes = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+        return nfs_page_array_len(0, resp_bytes);
+}
+/*
+ * Free a page-pointer array and the pages it holds.
+ * @pages: array to release; NULL is accepted and ignored
+ * @size:  number of slots in @pages
+ *
+ * Pages are released in order and the walk stops at the first NULL slot,
+ * so a partially filled array is handled.  The array itself is then
+ * kfree()d.
+ */
+static void nfs4_free_pages(struct page **pages, size_t size)
+{
+        size_t i;       /* same type as @size: no signed/unsigned compare */
+        if (!pages)
+                return;
+        for (i = 0; i < size; i++) {
+                if (!pages[i])
+                        break;
+                __free_page(pages[i]);
+        }
+        kfree(pages);
+}
+/*
+ * Allocate an array of @size page pointers and one page per slot.
+ * @size:      number of pages wanted
+ * @gfp_flags: allocation flags used for both the array and the pages
+ *
+ * Returns the array, or NULL if any allocation fails.  The array comes
+ * from kcalloc(), so unfilled slots are NULL and nfs4_free_pages() can
+ * unwind a partial allocation.
+ */
+static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags)
+{
+        struct page **pages;
+        size_t i;       /* same type as @size: no signed/unsigned compare */
+        pages = kcalloc(size, sizeof(struct page *), gfp_flags);
+        if (!pages) {
+                dprintk("%s: can't alloc array of %zu pages\n", __func__, size);
+                return NULL;
+        }
+        for (i = 0; i < size; i++) {
+                pages[i] = alloc_page(gfp_flags);
+                if (!pages[i]) {
+                        dprintk("%s: failed to allocate page\n", __func__);
+                        nfs4_free_pages(pages, size);
+                        return NULL;
+                }
+        }
+        return pages;
+}
 static void nfs4_layoutget_release(void *calldata)
 {
        struct nfs4_layoutget *lgp = calldata;
+        struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+        size_t max_pages = max_response_pages(server);
        dprintk("--> %s\n", __func__);
+        nfs4_free_pages(lgp->args.layout.pages, max_pages);
        put_nfs_open_context(lgp->args.ctx);
        kfree(calldata);
        dprintk("<-- %s\n", __func__);
@@ -6239,9 +6287,10 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = {
        .rpc_release = nfs4_layoutget_release,
 };
-int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
+void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
 {
        struct nfs_server *server = NFS_SERVER(lgp->args.inode);
+        size_t max_pages = max_response_pages(server);
        struct rpc_task *task;
        struct rpc_message msg = {
                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
@@ -6259,12 +6308,19 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
        dprintk("--> %s\n", __func__);
+        lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags);
+        if (!lgp->args.layout.pages) {
+                nfs4_layoutget_release(lgp);
+                return;
+        }
+        lgp->args.layout.pglen = max_pages * PAGE_SIZE;
        lgp->res.layoutp = &lgp->args.layout;
        lgp->res.seq_res.sr_slot = NULL;
        nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0);
        task = rpc_run_task(&task_setup_data);
        if (IS_ERR(task))
-                return PTR_ERR(task);
+                return;
        status = nfs4_wait_for_completion_rpc_task(task);
        if (status == 0)
                status = task->tk_status;
@@ -6272,7 +6328,7 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
                status = pnfs_layout_process(lgp);
        rpc_put_task(task);
        dprintk("<-- %s status=%d\n", __func__, status);
-        return status;
+        return;
 }
 static void
@@ -6304,12 +6360,8 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
                return;
        }
        spin_lock(&lo->plh_inode->i_lock);
-        if (task->tk_status == 0) {
+        if (task->tk_status == 0 && lrp->res.lrs_present)
-                if (lrp->res.lrs_present) {
+                pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
-                        pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
-                } else
-                        BUG_ON(!list_empty(&lo->plh_segs));
-        }
        lo->plh_block_lgets--;
        spin_unlock(&lo->plh_inode->i_lock);
        dprintk("<-- %s\n", __func__);
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 12a31a9dbcdd..bd61221ad2c5 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -23,14 +23,6 @@ static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
 static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *raw_data);
-static struct file_system_type nfs4_fs_type = {
-        .owner          = THIS_MODULE,
-        .name           = "nfs4",
-        .mount          = nfs_fs_mount,
-        .kill_sb        = nfs_kill_super,
-        .fs_flags       = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
-};
 static struct file_system_type nfs4_remote_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "nfs4",
@@ -344,14 +336,8 @@ static int __init init_nfs_v4(void)
        if (err)
                goto out1;
-        err = register_filesystem(&nfs4_fs_type);
-        if (err < 0)
-                goto out2;
        register_nfs_version(&nfs_v4);
        return 0;
-out2:
-        nfs4_unregister_sysctl();
 out1:
        nfs_idmap_quit();
 out:
@@ -361,7 +347,6 @@ out:
 static void __exit exit_nfs_v4(void)
 {
        unregister_nfs_version(&nfs_v4);
-        unregister_filesystem(&nfs4_fs_type);
        nfs4_unregister_sysctl();
        nfs_idmap_quit();
 }
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index ca13483edd60..1bfbd67c556d 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5045,22 +5045,19 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
                         struct nfs_getaclres *res)
 {
        unsigned int savep;
-        __be32 *bm_p;
        uint32_t attrlen,
                 bitmap[3] = {0};
        int status;
-        size_t page_len = xdr->buf->page_len;
+        unsigned int pg_offset;
        res->acl_len = 0;
        if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
                goto out;
-        bm_p = xdr->p;
+        xdr_enter_page(xdr, xdr->buf->page_len);
-        res->acl_data_offset = be32_to_cpup(bm_p) + 2;
-        res->acl_data_offset <<= 2;
+        /* Calculate the offset of the page data */
-        /* Check if the acl data starts beyond the allocated buffer */
+        pg_offset = xdr->buf->head[0].iov_len;
-        if (res->acl_data_offset > page_len)
-                return -ERANGE;
        if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
                goto out;
@@ -5074,23 +5071,20 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
                /* The bitmap (xdr len + bitmaps) and the attr xdr len words
                 * are stored with the acl data to handle the problem of
                 * variable length bitmaps.*/
-                xdr->p = bm_p;
+                res->acl_data_offset = xdr_stream_pos(xdr) - pg_offset;
                /* We ignore &savep and don't do consistency checks on
                 * the attr length.  Let userspace figure it out.... */
-                attrlen += res->acl_data_offset;
+                res->acl_len = attrlen;
-                if (attrlen > page_len) {
+                if (attrlen > (xdr->nwords << 2)) {
                        if (res->acl_flags & NFS4_ACL_LEN_REQUEST) {
                                /* getxattr interface called with a NULL buf */
-                                res->acl_len = attrlen;
                                goto out;
                        }
-                        dprintk("NFS: acl reply: attrlen %u > page_len %zu\n",
+                        dprintk("NFS: acl reply: attrlen %u > page_len %u\n",
-                                        attrlen, page_len);
+                                        attrlen, xdr->nwords << 2);
                        return -EINVAL;
                }
-                xdr_read_pages(xdr, attrlen);
-                res->acl_len = attrlen;
        } else
                status = -EOPNOTSUPP;
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index f50d3e8d6f22..ea6d111b03e9 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -570,17 +570,66 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
                return false;
        return pgio->pg_count + req->wb_bytes <=
-                        OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
+                        (unsigned long)pgio->pg_layout_private;
+}
+/*
+ * pg_init hook for object-layout reads.
+ *
+ * Runs the generic pNFS read initialisation and then, when a layout segment
+ * is attached to @pgio, records that segment's layout.max_io_length in
+ * pgio->pg_layout_private.  objio_pg_test() reads the value back as the
+ * upper bound on pg_count + wb_bytes.  When pg_lseg is NULL the descriptor
+ * is left as the generic init set it up.
+ */
+void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+        pnfs_generic_pg_init_read(pgio, req);
+        if (unlikely(pgio->pg_lseg == NULL))
+                return; /* Not pNFS */
+        /* The byte limit is carried in the pointer-sized private field. */
+        pgio->pg_layout_private = (void *)
+                                OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
+}
+/*
+ * Report whether @offset lies on a stripe boundary of @layout.
+ *
+ * A layout whose raid_algorithm is PNFS_OSD_RAID_0 is always reported as
+ * aligned.  Otherwise the stripe size is taken as
+ * stripe_unit * (group_width - parity) and @offset is aligned when it is an
+ * exact multiple of that size.
+ *
+ * Returns true when aligned, in which case *@stripe_end is not written.
+ * Returns false otherwise and stores in *@stripe_end the number of bytes
+ * from @offset up to the end of the stripe that contains it.
+ */
+static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout,
+                                   unsigned long *stripe_end)
+{
+        u32 stripe_off;
+        unsigned stripe_size;
+        if (layout->raid_algorithm == PNFS_OSD_RAID_0)
+                return true;
+        stripe_size = layout->stripe_unit *
+                                (layout->group_width - layout->parity);
+        /* Only the remainder is of interest; the quotient is discarded. */
+        div_u64_rem(offset, stripe_size, &stripe_off);
+        if (!stripe_off)
+                return true;
+        *stripe_end = stripe_size - stripe_off;
+        return false;
+}
+/*
+ * pg_init hook for object-layout writes.
+ *
+ * Runs the generic pNFS write initialisation and then, when a layout segment
+ * is attached to @pgio, chooses the byte limit that objio_pg_test() enforces
+ * through pgio->pg_layout_private:
+ *  - if @req starts inside its page (wb_offset != 0) or the byte offset of
+ *    its page is not on a RAID stripe boundary, the limit is stripe_end;
+ *  - otherwise the limit is the segment's layout.max_io_length.
+ *
+ * NOTE(review): when wb_offset is non-zero the || short-circuits, so
+ * aligned_on_raid_stripe() is never called and stripe_end keeps its initial
+ * value of 0 - confirm that a zero limit is what is intended here.
+ * NOTE(review): wb_index * PAGE_SIZE is evaluated before the conversion to
+ * the u64 parameter; if wb_index is narrower than 64 bits the product may
+ * wrap for large offsets - verify against the declaration of wb_index.
+ */
+void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
+{
+        unsigned long stripe_end = 0;
+        pnfs_generic_pg_init_write(pgio, req);
+        if (unlikely(pgio->pg_lseg == NULL))
+                return; /* Not pNFS */
+        if (req->wb_offset ||
+            !aligned_on_raid_stripe(req->wb_index * PAGE_SIZE,
+                               &OBJIO_LSEG(pgio->pg_lseg)->layout,
+                               &stripe_end)) {
+                pgio->pg_layout_private = (void *)stripe_end;
+        } else {
+                pgio->pg_layout_private = (void *)
+                                OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length;
+        }
 }
 static const struct nfs_pageio_ops objio_pg_read_ops = {
-        .pg_init = pnfs_generic_pg_init_read,
+        .pg_init = objio_init_read,
        .pg_test = objio_pg_test,
        .pg_doio = pnfs_generic_pg_readpages,
 };
 static const struct nfs_pageio_ops objio_pg_write_ops = {
-        .pg_init = pnfs_generic_pg_init_write,
+        .pg_init = objio_init_write,
        .pg_test = objio_pg_test,
        .pg_doio = pnfs_generic_pg_writepages,
 };
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 1a6732ed04a4..311a79681e2b 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -49,6 +49,7 @@ void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
        hdr->io_start = req_offset(hdr->req);
        hdr->good_bytes = desc->pg_count;
        hdr->dreq = desc->pg_dreq;
+        hdr->layout_private = desc->pg_layout_private;
        hdr->release = release;
        hdr->completion_ops = desc->pg_completion_ops;
        if (hdr->completion_ops->init_hdr)
@@ -268,6 +269,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
        desc->pg_error = 0;
        desc->pg_lseg = NULL;
        desc->pg_dreq = NULL;
+        desc->pg_layout_private = NULL;
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_init);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 76875bfcf19c..2e00feacd4be 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -583,9 +583,6 @@ send_layoutget(struct pnfs_layout_hdr *lo,
        struct nfs_server *server = NFS_SERVER(ino);
        struct nfs4_layoutget *lgp;
        struct pnfs_layout_segment *lseg = NULL;
-        struct page **pages = NULL;
-        int i;
-        u32 max_resp_sz, max_pages;
        dprintk("--> %s\n", __func__);
@@ -594,20 +591,6 @@ send_layoutget(struct pnfs_layout_hdr *lo,
        if (lgp == NULL)
                return NULL;
-        /* allocate pages for xdr post processing */
-        max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
-        max_pages = nfs_page_array_len(0, max_resp_sz);
-        pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
-        if (!pages)
-                goto out_err_free;
-        for (i = 0; i < max_pages; i++) {
-                pages[i] = alloc_page(gfp_flags);
-                if (!pages[i])
-                        goto out_err_free;
-        }
        lgp->args.minlength = PAGE_CACHE_SIZE;
        if (lgp->args.minlength > range->length)
                lgp->args.minlength = range->length;
@@ -616,39 +599,19 @@ send_layoutget(struct pnfs_layout_hdr *lo,
        lgp->args.type = server->pnfs_curr_ld->id;
        lgp->args.inode = ino;
        lgp->args.ctx = get_nfs_open_context(ctx);
-        lgp->args.layout.pages = pages;
-        lgp->args.layout.pglen = max_pages * PAGE_SIZE;
        lgp->lsegpp = &lseg;
        lgp->gfp_flags = gfp_flags;
        /* Synchronously retrieve layout information from server and
         * store in lseg.
         */
-        nfs4_proc_layoutget(lgp);
+        nfs4_proc_layoutget(lgp, gfp_flags);
        if (!lseg) {
                /* remember that LAYOUTGET failed and suspend trying */
                set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
        }
-        /* free xdr pages */
-        for (i = 0; i < max_pages; i++)
-                __free_page(pages[i]);
-        kfree(pages);
        return lseg;
-out_err_free:
-        /* free any allocated xdr pages, lgp as it's not used */
-        if (pages) {
-                for (i = 0; i < max_pages; i++) {
-                        if (!pages[i])
-                                break;
-                        __free_page(pages[i]);
-                }
-                kfree(pages);
-        }
-        kfree(lgp);
-        return NULL;
 }
 /*
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 2c6c80503ba4..745aa1b39e7c 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -172,7 +172,7 @@ extern int nfs4_proc_getdevicelist(struct nfs_server *server,
                                   struct pnfs_devicelist *devlist);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
                                   struct pnfs_device *dev);
-extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
+extern void nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
 extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 /* pnfs.c */
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ac6a3c55dce4..239aff7338eb 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -319,6 +319,34 @@ EXPORT_SYMBOL_GPL(nfs_sops);
 static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *);
 static int nfs4_validate_mount_data(void *options,
        struct nfs_parsed_mount_data *args, const char *dev_name);
+/*
+ * Filesystem type registered under the name "nfs4".  Mounting is handled by
+ * nfs_fs_mount and superblock teardown by nfs_kill_super.  The symbol is
+ * exported GPL-only so that it can be referenced from outside this file.
+ */
+struct file_system_type nfs4_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = "nfs4",
+        .mount          = nfs_fs_mount,
+        .kill_sb        = nfs_kill_super,
+        .fs_flags       = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
+};
+EXPORT_SYMBOL_GPL(nfs4_fs_type);
+/*
+ * Register nfs4_fs_type with the VFS.
+ * Returns whatever register_filesystem() returns.
+ */
+static int __init register_nfs4_fs(void)
+{
+        return register_filesystem(&nfs4_fs_type);
+}
+/*
+ * Undo register_nfs4_fs().  Called from the error unwinding in
+ * register_nfs_fs() as well as from unregister_nfs_fs().
+ */
+static void unregister_nfs4_fs(void)
+{
+        unregister_filesystem(&nfs4_fs_type);
+}
+#else
+/*
+ * Stub variants for the #else branch: registration reports success without
+ * doing anything and unregistration is a no-op, so the callers need no
+ * conditional compilation of their own.
+ * NOTE(review): the matching #if lies outside this hunk - presumably this is
+ * the build without NFSv4 support; confirm.
+ */
+static int __init register_nfs4_fs(void)
+{
+        return 0;
+}
+static void unregister_nfs4_fs(void)
+{
+}
 #endif
 static struct shrinker acl_shrinker = {
@@ -337,12 +365,18 @@ int __init register_nfs_fs(void)
        if (ret < 0)
                goto error_0;
-        ret = nfs_register_sysctl();
+        ret = register_nfs4_fs();
        if (ret < 0)
                goto error_1;
+        ret = nfs_register_sysctl();
+        if (ret < 0)
+                goto error_2;
        register_shrinker(&acl_shrinker);
        return 0;
+error_2:
+        unregister_nfs4_fs();
 error_1:
        unregister_filesystem(&nfs_fs_type);
 error_0:
@@ -356,6 +390,7 @@ void __exit unregister_nfs_fs(void)
 {
        unregister_shrinker(&acl_shrinker);
        nfs_unregister_sysctl();
+        unregister_nfs4_fs();
        unregister_filesystem(&nfs_fs_type);
 }
@@ -2645,4 +2680,6 @@ MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
 module_param(send_implementation_id, ushort, 0644);
 MODULE_PARM_DESC(send_implementation_id,
                "Send implementation ID with NFSv4.1 exchange_id");
+MODULE_ALIAS("nfs4");
 #endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5829d0ce7cfb..e3b55372726c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1814,19 +1814,19 @@ int __init nfs_init_writepagecache(void)
        nfs_wdata_mempool = mempool_create_slab_pool(MIN_POOL_WRITE,
                                                     nfs_wdata_cachep);
        if (nfs_wdata_mempool == NULL)
-                return -ENOMEM;
+                goto out_destroy_write_cache;
        nfs_cdata_cachep = kmem_cache_create("nfs_commit_data",
                                             sizeof(struct nfs_commit_data),
                                             0, SLAB_HWCACHE_ALIGN,
                                             NULL);
        if (nfs_cdata_cachep == NULL)
-                return -ENOMEM;
+                goto out_destroy_write_mempool;
        nfs_commit_mempool = mempool_create_slab_pool(MIN_POOL_COMMIT,
                                                      nfs_wdata_cachep);
        if (nfs_commit_mempool == NULL)
-                return -ENOMEM;
+                goto out_destroy_commit_cache;
        /*
         * NFS congestion size, scale with available memory.
@@ -1849,11 +1849,20 @@ int __init nfs_init_writepagecache(void)
                nfs_congestion_kb = 256*1024;
        return 0;
+out_destroy_commit_cache:
+        kmem_cache_destroy(nfs_cdata_cachep);
+out_destroy_write_mempool:
+        mempool_destroy(nfs_wdata_mempool);
+out_destroy_write_cache:
+        kmem_cache_destroy(nfs_wdata_cachep);
+        return -ENOMEM;
 }
+/*
+ * Release the mempools and slab caches used for write and commit data:
+ * the commit mempool, the commit-data cache, the write mempool and finally
+ * the write-data cache.
+ * NOTE(review): nfs_init_writepagecache() as shown in this patch creates
+ * nfs_commit_mempool on top of nfs_wdata_cachep, not nfs_cdata_cachep -
+ * confirm which cache is meant to back that pool.
+ */
 void nfs_destroy_writepagecache(void)
 {
        mempool_destroy(nfs_commit_mempool);
+        kmem_cache_destroy(nfs_cdata_cachep);
        mempool_destroy(nfs_wdata_mempool);
        kmem_cache_destroy(nfs_wdata_cachep);
 }
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index cbaf4f8bb7b7..4c7bd35b1876 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -651,12 +651,12 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
        if (clp->cl_minorversion == 0) {
                if (!clp->cl_cred.cr_principal &&
-                                (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
+                                (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5))
                        return -EINVAL;
                args.client_name = clp->cl_cred.cr_principal;
                args.prognumber = conn->cb_prog,
                args.protocol = XPRT_TRANSPORT_TCP;
-                args.authflavor = clp->cl_flavor;
+                args.authflavor = clp->cl_cred.cr_flavor;
                clp->cl_cb_ident = conn->cb_ident;
        } else {
                if (!conn->cb_xprt)
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index e6173147f982..22bd0a66c356 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -231,7 +231,6 @@ struct nfs4_client {
        nfs4_verifier           cl_verifier;    /* generated by client */
        time_t                  cl_time;        /* time of last lease renewal */
        struct sockaddr_storage cl_addr;        /* client ipaddress */
-        u32                     cl_flavor;      /* setclientid pseudoflavor */
        struct svc_cred         cl_cred;        /* setclientid principal */
        clientid_t              cl_clientid;    /* generated by server */
        nfs4_verifier           cl_confirm;     /* generated by server */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 6522cac6057c..6a10812711c1 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -676,17 +676,13 @@ static const struct super_operations nilfs_sops = {
        .alloc_inode    = nilfs_alloc_inode,
        .destroy_inode  = nilfs_destroy_inode,
        .dirty_inode    = nilfs_dirty_inode,
-        /* .write_inode    = nilfs_write_inode, */
-        /* .drop_inode    = nilfs_drop_inode, */
        .evict_inode    = nilfs_evict_inode,
        .put_super      = nilfs_put_super,
-        /* .write_super    = nilfs_write_super, */
        .sync_fs        = nilfs_sync_fs,
        .freeze_fs      = nilfs_freeze,
        .unfreeze_fs    = nilfs_unfreeze,
        .statfs         = nilfs_statfs,
        .remount_fs     = nilfs_remount,
-        /* .umount_begin */
        .show_options = nilfs_show_options
 };
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 6eee4177807b..be1267a34cea 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -107,8 +107,6 @@ struct the_nilfs {
         * used for
         * - loading the latest checkpoint exclusively.
         * - allocating a new full segment.
-         * - protecting s_dirt in the super_block struct
-         *   (see nilfs_write_super) and the following fields.
         */
        struct buffer_head     *ns_sbh[2];
        struct nilfs_super_block *ns_sbp[2];
diff --git a/fs/open.c b/fs/open.c
index f3d96e7e7b19..e1f2cdb91a4d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -717,7 +717,7 @@ cleanup_all:
                         * here, so just reset the state.
                         */
                        file_reset_write(f);
-                        mnt_drop_write(f->f_path.mnt);
+                        __mnt_drop_write(f->f_path.mnt);
                }
        }
 cleanup_file:
@@ -852,9 +852,10 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
        int lookup_flags = 0;
        int acc_mode;
-        if (!(flags & O_CREAT))
+        if (flags & O_CREAT)
-                mode = 0;
+                op->mode = (mode & S_IALLUGO) | S_IFREG;
-        op->mode = mode;
+        else
+                op->mode = 0;
        /* Must never be set by userspace */
        flags &= ~FMODE_NONOTIFY;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 36a29b753c79..c495a3055e2a 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1589,10 +1589,10 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
                goto out;
        }
-        down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                warn[cnt].w_type = QUOTA_NL_NOWARN;
+        down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
        spin_lock(&dq_data_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (!dquots[cnt])
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 4c0c7d163d15..a98b7740a0fc 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -1334,9 +1334,7 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
        else if (bitmap == 0)
                block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
-        reiserfs_write_unlock(sb);
        bh = sb_bread(sb, block);
-        reiserfs_write_lock(sb);
        if (bh == NULL)
                reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
                                 "reading failed", __func__, block);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a6d4268fb6c1..855da58db145 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -76,10 +76,10 @@ void reiserfs_evict_inode(struct inode *inode)
                ;
        }
      out:
+        reiserfs_write_unlock_once(inode->i_sb, depth);
        clear_inode(inode);     /* note this must go after the journal_end to prevent deadlock */
        dquot_drop(inode);
        inode->i_blocks = 0;
-        reiserfs_write_unlock_once(inode->i_sb, depth);
        return;
 no_delete:
diff --git a/fs/super.c b/fs/super.c
index b05cf47463d0..0902cfa6a12e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -537,46 +537,6 @@ void drop_super(struct super_block *sb)
 EXPORT_SYMBOL(drop_super);
 /**
- * sync_supers - helper for periodic superblock writeback
- *
- * Call the write_super method if present on all dirty superblocks in
- * the system.  This is for the periodic writeback used by most older
- * filesystems.  For data integrity superblock writeback use
- * sync_filesystems() instead.
- *
- * Note: check the dirty flag before waiting, so we don't
- * hold up the sync while mounting a device. (The newly
- * mounted device won't need syncing.)
- */
-void sync_supers(void)
-{
-        struct super_block *sb, *p = NULL;
-        spin_lock(&sb_lock);
-        list_for_each_entry(sb, &super_blocks, s_list) {
-                if (hlist_unhashed(&sb->s_instances))
-                        continue;
-                if (sb->s_op->write_super && sb->s_dirt) {
-                        sb->s_count++;
-                        spin_unlock(&sb_lock);
-                        down_read(&sb->s_umount);
-                        if (sb->s_root && sb->s_dirt && (sb->s_flags & MS_BORN))
-                                sb->s_op->write_super(sb);
-                        up_read(&sb->s_umount);
-                        spin_lock(&sb_lock);
-                        if (p)
-                                __put_super(p);
-                        p = sb;
-                }
-        }
-        if (p)
-                __put_super(p);
-        spin_unlock(&sb_lock);
-}
-/**
 *      iterate_supers - call function for all active superblocks
 *      @f: function to call
 *      @arg: argument to pass to it
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 8b8cc4e945f4..760de723dadb 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -167,7 +167,7 @@ struct ubifs_global_debug_info {
 #define ubifs_dbg_msg(type, fmt, ...) \
        pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__)
-#define DBG_KEY_BUF_LEN 32
+#define DBG_KEY_BUF_LEN 48
 #define ubifs_dbg_msg_key(type, key, fmt, ...) do {                            \
        char __tmp_key_buf[DBG_KEY_BUF_LEN];                                   \
        pr_debug("UBIFS DBG " type ": " fmt "%s\n", ##__VA_ARGS__,             \
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 35389ca2d267..7bd6e72afd11 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -37,11 +37,11 @@
 *
 * A thing to keep in mind: inode @i_mutex is locked in most VFS operations we
 * implement. However, this is not true for 'ubifs_writepage()', which may be
- * called with @i_mutex unlocked. For example, when pdflush is doing background
+ * called with @i_mutex unlocked. For example, when flusher thread is doing
- * write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex. At "normal"
+ * background write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex.
- * work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g. in the
+ * At "normal" work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g.
- * "sys_write -> alloc_pages -> direct reclaim path". So, in 'ubifs_writepage()'
+ * in the "sys_write -> alloc_pages -> direct reclaim path". So, in
- * we are only guaranteed that the page is locked.
+ * 'ubifs_writepage()' we are only guaranteed that the page is locked.
 *
 * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the
 * read-ahead path does not lock it ("sys_read -> generic_file_aio_read ->
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index ce33b2beb151..8640920766ed 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1749,7 +1749,10 @@ int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr)
        return 0;
 out_err:
-        ubifs_lpt_free(c, 0);
+        if (wr)
+                ubifs_lpt_free(c, 1);
+        if (rd)
+                ubifs_lpt_free(c, 0);
        return err;
 }
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index c30d976b4be8..edeec499c048 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -788,7 +788,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
 corrupted_rescan:
        /* Re-scan the corrupted data with verbose messages */
-        ubifs_err("corruptio %d", ret);
+        ubifs_err("corruption %d", ret);
        ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
 corrupted:
        ubifs_scanned_corruption(c, lnum, offs, buf);
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index eba46d4a7619..94d78fc5d4e0 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -1026,7 +1026,6 @@ int ubifs_replay_journal(struct ubifs_info *c)
        c->replaying = 1;
        lnum = c->ltail_lnum = c->lhead_lnum;
-        lnum = UBIFS_LOG_LNUM;
        do {
                err = replay_log_leb(c, lnum, 0, c->sbuf);
                if (err == 1)
@@ -1035,7 +1034,7 @@ int ubifs_replay_journal(struct ubifs_info *c)
                if (err)
                        goto out;
                lnum = ubifs_next_log_lnum(c, lnum);
-        } while (lnum != UBIFS_LOG_LNUM);
+        } while (lnum != c->ltail_lnum);
        err = replay_buds(c);
        if (err)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1c766c39c038..71a197f0f93d 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -303,7 +303,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
        mutex_lock(&ui->ui_mutex);
        /*
         * Due to races between write-back forced by budgeting
-         * (see 'sync_some_inodes()') and pdflush write-back, the inode may
+         * (see 'sync_some_inodes()') and background write-back, the inode may
         * have already been synchronized, do not do this again. This might
         * also happen if it was synchronized in an VFS operation, e.g.
         * 'ubifs_link()'.
@@ -1157,9 +1157,6 @@ static int check_free_space(struct ubifs_info *c)
 *
 * This function mounts UBIFS file system. Returns zero in case of success and
 * a negative error code in case of failure.
- *
- * Note, the function does not de-allocate resources it it fails half way
- * through, and the caller has to do this instead.
 */
 static int mount_ubifs(struct ubifs_info *c)
 {
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index fafaad795cd6..aa233469b3c1 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1124,14 +1124,17 @@ int udf_setsize(struct inode *inode, loff_t newsize)
                                if (err)
                                        return err;
                                down_write(&iinfo->i_data_sem);
-                        } else
+                        } else {
                                iinfo->i_lenAlloc = newsize;
+                                goto set_size;
+                        }
                }
                err = udf_extend_file(inode, newsize);
                if (err) {
                        up_write(&iinfo->i_data_sem);
                        return err;
                }
+set_size:
                truncate_setsize(inode, newsize);
                up_write(&iinfo->i_data_sem);
        } else {
diff --git a/fs/udf/super.c b/fs/udf/super.c
index dcbf98722afc..18fc038a438d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1344,6 +1344,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
                udf_err(sb, "error loading logical volume descriptor: "
                        "Partition table too long (%u > %lu)\n", table_len,
                        sb->s_blocksize - sizeof(*lvd));
+                ret = 1;
                goto out_bh;
        }
@@ -1388,8 +1389,10 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
                                                UDF_ID_SPARABLE,
                                                strlen(UDF_ID_SPARABLE))) {
                                if (udf_load_sparable_map(sb, map,
-                                    (struct sparablePartitionMap *)gpm) < 0)
+                                    (struct sparablePartitionMap *)gpm) < 0) {
+                                        ret = 1;
                                        goto out_bh;
+                                }
                        } else if (!strncmp(upm2->partIdent.ident,
                                                UDF_ID_METADATA,
                                                strlen(UDF_ID_METADATA))) {
@@ -2000,6 +2003,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
                        if (!silent)
                                pr_notice("Rescanning with blocksize %d\n",
                                          UDF_DEFAULT_BLOCKSIZE);
+                        brelse(sbi->s_lvid_bh);
+                        sbi->s_lvid_bh = NULL;
                        uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
                        ret = udf_load_vrs(sb, &uopt, silent, &fileset);
                }