33 files changed, 457 insertions, 221 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 8855aad3929c..22c64fff1bd5 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -683,7 +683,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
                return PTR_ERR(fspath);
        if (fspath > fspath_min) {
-                ipath->fspath->val[i] = (u64)fspath;
+                ipath->fspath->val[i] = (u64)(unsigned long)fspath;
                ++ipath->fspath->elem_cnt;
                ipath->fspath->bytes_left = fspath - fspath_min;
        } else {
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0fe615e4ea38..dede441bdeee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct extent_buffer *buf)
 {
+        /* ensure we can see the force_cow */
+        smp_rmb();
+        /*
+         * We do not need to cow a block if
+         * 1) this block is not created or changed in this transaction;
+         * 2) this block does not belong to TREE_RELOC tree;
+         * 3) the root is not forced COW.
+         *
+         * What is forced COW:
+         *    when we create a snapshot while committing the transaction,
+         *    after we've finished copying the src root, we must COW the shared
+         *    block to ensure the metadata consistency.
+         */
        if (btrfs_header_generation(buf) == trans->transid &&
            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
            !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
-              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+            !root->force_cow)
                return 0;
        return 1;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b9ba59ff9292..04a5dfcee5a1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -848,7 +848,8 @@ struct btrfs_free_cluster {
 enum btrfs_caching_type {
        BTRFS_CACHE_NO          = 0,
        BTRFS_CACHE_STARTED     = 1,
-        BTRFS_CACHE_FINISHED    = 2,
+        BTRFS_CACHE_FAST        = 2,
+        BTRFS_CACHE_FINISHED    = 3,
 };
 enum btrfs_disk_cache_state {
@@ -1271,6 +1272,8 @@ struct btrfs_root {
         * for stat.  It may be used for more later
         */
        dev_t anon_dev;
+        int force_cow;
 };
 struct btrfs_ioctl_defrag_range_args {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 622654fe051f..b09175901521 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -620,7 +620,7 @@ out:
 static int btree_io_failed_hook(struct bio *failed_bio,
                         struct page *page, u64 start, u64 end,
-                         u64 mirror_num, struct extent_state *state)
+                         int mirror_num, struct extent_state *state)
 {
        struct extent_io_tree *tree;
        unsigned long len;
@@ -2569,22 +2569,10 @@ static int write_dev_supers(struct btrfs_device *device,
        int errors = 0;
        u32 crc;
        u64 bytenr;
-        int last_barrier = 0;
        if (max_mirrors == 0)
                max_mirrors = BTRFS_SUPER_MIRROR_MAX;
-        /* make sure only the last submit_bh does a barrier */
-        if (do_barriers) {
-                for (i = 0; i < max_mirrors; i++) {
-                        bytenr = btrfs_sb_offset(i);
-                        if (bytenr + BTRFS_SUPER_INFO_SIZE >=
-                            device->total_bytes)
-                                break;
-                        last_barrier = i;
-                }
-        }
        for (i = 0; i < max_mirrors; i++) {
                bytenr = btrfs_sb_offset(i);
                if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@@ -2630,17 +2618,136 @@ static int write_dev_supers(struct btrfs_device *device,
                        bh->b_end_io = btrfs_end_buffer_write_sync;
                }
-                if (i == last_barrier && do_barriers)
+                /*
-                        ret = submit_bh(WRITE_FLUSH_FUA, bh);
+                 * we fua the first super.  The others we allow
-                else
+                 * to go down lazy.
-                        ret = submit_bh(WRITE_SYNC, bh);
+                 */
+                ret = submit_bh(WRITE_FUA, bh);
                if (ret)
                        errors++;
        }
        return errors < i ? 0 : -1;
 }
+/*
+ * endio for the write_dev_flush, this will wake anyone waiting
+ * for the barrier when it is done
+ *
+ * On error BIO_UPTODATE is cleared, and BIO_EOPNOTSUPP is additionally set
+ * when the error is -EOPNOTSUPP, so the result can be read back from the
+ * bio flags.  bi_private, when set, is completed, and one reference on the
+ * bio is dropped.
+ */
+static void btrfs_end_empty_barrier(struct bio *bio, int err)
+{
+        if (err) {
+                if (err == -EOPNOTSUPP)
+                        set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+                clear_bit(BIO_UPTODATE, &bio->bi_flags);
+        }
+        if (bio->bi_private)
+                complete(bio->bi_private);
+        bio_put(bio);
+}
+/*
+ * trigger flushes for one of the devices.  If you pass wait == 0, the flush
+ * is sent down.  With wait == 1, it waits for the previous flush.
+ *
+ * any device where the flush fails with eopnotsupp is flagged as not-barrier
+ * capable
+ *
+ * Returns 0 on success or when there is nothing to do (barriers disabled on
+ * the device, or wait == 1 with no flush outstanding), -ENOMEM when the
+ * flush bio cannot be allocated, and -EIO when the waited-for bio is not
+ * BIO_UPTODATE.
+ */
+static int write_dev_flush(struct btrfs_device *device, int wait)
+{
+        struct bio *bio;
+        int ret = 0;
+        if (device->nobarriers)
+                return 0;
+        if (wait) {
+                bio = device->flush_bio;
+                if (!bio)
+                        return 0;
+                wait_for_completion(&device->flush_wait);
+                if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
+                        printk("btrfs: disabling barriers on dev %s\n",
+                               device->name);
+                        device->nobarriers = 1;
+                }
+                if (!bio_flagged(bio, BIO_UPTODATE))
+                        ret = -EIO;
+                /* drop the reference from the wait == 0 run */
+                bio_put(bio);
+                device->flush_bio = NULL;
+                return ret;
+        }
+        /*
+         * one reference for us, and we leave it for the
+         * caller
+         */
+        device->flush_bio = NULL;
+        bio = bio_alloc(GFP_NOFS, 0);
+        if (!bio)
+                return -ENOMEM;
+        bio->bi_end_io = btrfs_end_empty_barrier;
+        bio->bi_bdev = device->bdev;
+        init_completion(&device->flush_wait);
+        bio->bi_private = &device->flush_wait;
+        device->flush_bio = bio;
+        /* the end_io handler drops one reference; keep one for wait == 1 */
+        bio_get(bio);
+        submit_bio(WRITE_FLUSH, bio);
+        return 0;
+}
+/*
+ * send an empty flush down to each device in parallel,
+ * then wait for them
+ *
+ * Devices without a bdev are counted as errors in both passes; devices that
+ * are not in_fs_metadata or not writeable are skipped.  Returns -EIO if any
+ * error was counted, 0 otherwise.
+ */
+static int barrier_all_devices(struct btrfs_fs_info *info)
+{
+        struct list_head *head;
+        struct btrfs_device *dev;
+        int errors = 0;
+        int ret;
+        /* send down all the barriers */
+        head = &info->fs_devices->devices;
+        list_for_each_entry_rcu(dev, head, dev_list) {
+                if (!dev->bdev) {
+                        errors++;
+                        continue;
+                }
+                if (!dev->in_fs_metadata || !dev->writeable)
+                        continue;
+                ret = write_dev_flush(dev, 0);
+                if (ret)
+                        errors++;
+        }
+        /* wait for all the barriers */
+        list_for_each_entry_rcu(dev, head, dev_list) {
+                if (!dev->bdev) {
+                        errors++;
+                        continue;
+                }
+                if (!dev->in_fs_metadata || !dev->writeable)
+                        continue;
+                ret = write_dev_flush(dev, 1);
+                if (ret)
+                        errors++;
+        }
+        if (errors)
+                return -EIO;
+        return 0;
+}
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
        struct list_head *head;
@@ -2662,6 +2769,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
        head = &root->fs_info->fs_devices->devices;
+        if (do_barriers)
+                barrier_all_devices(root->fs_info);
        list_for_each_entry_rcu(dev, head, dev_list) {
                if (!dev->bdev) {
                        total_errors++;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b232150b5b6b..930ae8949737 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
                             struct btrfs_root *root,
                             int load_cache_only)
 {
+        DEFINE_WAIT(wait);
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_caching_control *caching_ctl;
        int ret = 0;
-        smp_mb();
+        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
-        if (cache->cached != BTRFS_CACHE_NO)
+        BUG_ON(!caching_ctl);
+        INIT_LIST_HEAD(&caching_ctl->list);
+        mutex_init(&caching_ctl->mutex);
+        init_waitqueue_head(&caching_ctl->wait);
+        caching_ctl->block_group = cache;
+        caching_ctl->progress = cache->key.objectid;
+        atomic_set(&caching_ctl->count, 1);
+        caching_ctl->work.func = caching_thread;
+        spin_lock(&cache->lock);
+        /*
+         * This should be a rare occasion, but this could happen I think in the
+         * case where one thread starts to load the space cache info, and then
+         * some other thread starts a transaction commit which tries to do an
+         * allocation while the other thread is still loading the space cache
+         * info.  The previous loop should have kept us from choosing this block
+         * group, but if we've moved to the state where we will wait on caching
+         * block groups we need to first check if we're doing a fast load here,
+         * so we can wait for it to finish, otherwise we could end up allocating
+         * from a block group whose cache gets evicted for one reason or
+         * another.
+         */
+        while (cache->cached == BTRFS_CACHE_FAST) {
+                struct btrfs_caching_control *ctl;
+                ctl = cache->caching_ctl;
+                atomic_inc(&ctl->count);
+                prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
+                spin_unlock(&cache->lock);
+                schedule();
+                finish_wait(&ctl->wait, &wait);
+                put_caching_control(ctl);
+                spin_lock(&cache->lock);
+        }
+        if (cache->cached != BTRFS_CACHE_NO) {
+                spin_unlock(&cache->lock);
+                kfree(caching_ctl);
                return 0;
+        }
+        WARN_ON(cache->caching_ctl);
+        cache->caching_ctl = caching_ctl;
+        cache->cached = BTRFS_CACHE_FAST;
+        spin_unlock(&cache->lock);
        /*
         * We can't do the read from on-disk cache during a commit since we need
@@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
        if (trans && (!trans->transaction->in_commit) &&
            (root && root != root->fs_info->tree_root) &&
            btrfs_test_opt(root, SPACE_CACHE)) {
-                spin_lock(&cache->lock);
-                if (cache->cached != BTRFS_CACHE_NO) {
-                        spin_unlock(&cache->lock);
-                        return 0;
-                }
-                cache->cached = BTRFS_CACHE_STARTED;
-                spin_unlock(&cache->lock);
                ret = load_free_space_cache(fs_info, cache);
                spin_lock(&cache->lock);
                if (ret == 1) {
+                        cache->caching_ctl = NULL;
                        cache->cached = BTRFS_CACHE_FINISHED;
                        cache->last_byte_to_unpin = (u64)-1;
                } else {
-                        cache->cached = BTRFS_CACHE_NO;
+                        if (load_cache_only) {
+                                cache->caching_ctl = NULL;
+                                cache->cached = BTRFS_CACHE_NO;
+                        } else {
+                                cache->cached = BTRFS_CACHE_STARTED;
+                        }
                }
                spin_unlock(&cache->lock);
+                wake_up(&caching_ctl->wait);
                if (ret == 1) {
+                        put_caching_control(caching_ctl);
                        free_excluded_extents(fs_info->extent_root, cache);
                        return 0;
                }
+        } else {
+                /*
+                 * We are not going to do the fast caching, set cached to the
+                 * appropriate value and wakeup any waiters.
+                 */
+                spin_lock(&cache->lock);
+                if (load_cache_only) {
+                        cache->caching_ctl = NULL;
+                        cache->cached = BTRFS_CACHE_NO;
+                } else {
+                        cache->cached = BTRFS_CACHE_STARTED;
+                }
+                spin_unlock(&cache->lock);
+                wake_up(&caching_ctl->wait);
        }
-        if (load_cache_only)
+        if (load_cache_only) {
-                return 0;
+                put_caching_control(caching_ctl);
-        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
-        BUG_ON(!caching_ctl);
-        INIT_LIST_HEAD(&caching_ctl->list);
-        mutex_init(&caching_ctl->mutex);
-        init_waitqueue_head(&caching_ctl->wait);
-        caching_ctl->block_group = cache;
-        caching_ctl->progress = cache->key.objectid;
-        /* one for caching kthread, one for caching block group list */
-        atomic_set(&caching_ctl->count, 2);
-        caching_ctl->work.func = caching_thread;
-        spin_lock(&cache->lock);
-        if (cache->cached != BTRFS_CACHE_NO) {
-                spin_unlock(&cache->lock);
-                kfree(caching_ctl);
                return 0;
        }
-        cache->caching_ctl = caching_ctl;
-        cache->cached = BTRFS_CACHE_STARTED;
-        spin_unlock(&cache->lock);
        down_write(&fs_info->extent_commit_sem);
+        atomic_inc(&caching_ctl->count);
        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
        up_write(&fs_info->extent_commit_sem);
@@ -5178,13 +5219,15 @@ search:
                }
 have_block_group:
-                if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+                cached = block_group_cache_done(block_group);
+                if (unlikely(!cached)) {
                        u64 free_percent;
+                        found_uncached_bg = true;
                        ret = cache_block_group(block_group, trans,
                                                orig_root, 1);
                        if (block_group->cached == BTRFS_CACHE_FINISHED)
-                                goto have_block_group;
+                                goto alloc;
                        free_percent = btrfs_block_group_used(&block_group->item);
                        free_percent *= 100;
@@ -5206,7 +5249,6 @@ have_block_group:
                                                        orig_root, 0);
                                BUG_ON(ret);
                        }
-                        found_uncached_bg = true;
                        /*
                         * If loop is set for cached only, try the next block
@@ -5216,10 +5258,7 @@ have_block_group:
                                goto loop;
                }
-                cached = block_group_cache_done(block_group);
+alloc:
-                if (unlikely(!cached))
-                        found_uncached_bg = true;
                if (unlikely(block_group->ro))
                        goto loop;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1f87c4d0e7a0..9472d3de5e52 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2285,8 +2285,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                                clean_io_failure(start, page);
                }
                if (!uptodate) {
-                        u64 failed_mirror;
+                        int failed_mirror;
-                        failed_mirror = (u64)bio->bi_bdev;
+                        failed_mirror = (int)(unsigned long)bio->bi_bdev;
                        if (tree->ops && tree->ops->readpage_io_failed_hook)
                                ret = tree->ops->readpage_io_failed_hook(
                                                bio, page, start, end,
@@ -3366,6 +3366,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                return -ENOMEM;
        path->leave_spinning = 1;
+        start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
+        len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
        /*
         * lookup the last file extent.  We're not using i_size here
         * because there might be preallocation past i_size
@@ -3413,7 +3416,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
                         &cached_state, GFP_NOFS);
-        em = get_extent_skip_holes(inode, off, last_for_get_extent,
+        em = get_extent_skip_holes(inode, start, last_for_get_extent,
                                   get_extent);
        if (!em)
                goto out;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index feb9be0e23bc..7604c3001322 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -70,7 +70,7 @@ struct extent_io_ops {
                              unsigned long bio_flags);
        int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
        int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
-                                       u64 start, u64 end, u64 failed_mirror,
+                                       u64 start, u64 end, int failed_mirror,
                                       struct extent_state *state);
        int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
                                        u64 start, u64 end,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 181760f9d2ab..6e5b7e463698 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -351,6 +351,11 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
                }
        }
+        for (i = 0; i < io_ctl->num_pages; i++) {
+                clear_page_dirty_for_io(io_ctl->pages[i]);
+                set_page_extent_mapped(io_ctl->pages[i]);
+        }
        return 0;
 }
@@ -1844,7 +1849,13 @@ again:
                info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
                                          1, 0);
                if (!info) {
-                        WARN_ON(1);
+                        /* the tree logging code might be calling us before we
+                         * have fully loaded the free space rbtree for this
+                         * block group.  So it is possible the entry won't
+                         * be in the rbtree yet at all.  The caching code
+                         * will make sure not to put it in the rbtree if
+                         * the logging code has pinned it.
+                         */
                        goto out_lock;
                }
        }
@@ -2451,16 +2462,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
 {
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *entry;
-        struct rb_node *node;
        int ret = -ENOSPC;
+        u64 bitmap_offset = offset_to_bitmap(ctl, offset);
        if (ctl->total_bitmaps == 0)
                return -ENOSPC;
        /*
-         * First check our cached list of bitmaps and see if there is an entry
+         * The bitmap that covers offset won't be in the list unless offset
-         * here that will work.
+         * is just its start offset.
         */
+        entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
+        if (entry->offset != bitmap_offset) {
+                entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
+                if (entry && list_empty(&entry->list))
+                        list_add(&entry->list, bitmaps);
+        }
        list_for_each_entry(entry, bitmaps, list) {
                if (entry->bytes < min_bytes)
                        continue;
@@ -2471,38 +2489,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
        }
        /*
-         * If we do have entries on our list and we are here then we didn't find
+         * The bitmaps list has all the bitmaps that record free space
-         * anything, so go ahead and get the next entry after the last entry in
+         * starting after offset, so no more search is required.
-         * this list and start the search from there.
         */
-        if (!list_empty(bitmaps)) {
+        return -ENOSPC;
-                entry = list_entry(bitmaps->prev, struct btrfs_free_space,
-                                   list);
-                node = rb_next(&entry->offset_index);
-                if (!node)
-                        return -ENOSPC;
-                entry = rb_entry(node, struct btrfs_free_space, offset_index);
-                goto search;
-        }
-        entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
-        if (!entry)
-                return -ENOSPC;
-search:
-        node = &entry->offset_index;
-        do {
-                entry = rb_entry(node, struct btrfs_free_space, offset_index);
-                node = rb_next(&entry->offset_index);
-                if (!entry->bitmap)
-                        continue;
-                if (entry->bytes < min_bytes)
-                        continue;
-                ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
-                                           bytes, min_bytes);
-        } while (ret && node);
-        return ret;
 }
 /*
@@ -2520,8 +2510,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
                             u64 offset, u64 bytes, u64 empty_size)
 {
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
-        struct list_head bitmaps;
        struct btrfs_free_space *entry, *tmp;
+        LIST_HEAD(bitmaps);
        u64 min_bytes;
        int ret;
@@ -2560,7 +2550,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
                goto out;
        }
-        INIT_LIST_HEAD(&bitmaps);
        ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
                                      bytes, min_bytes);
        if (ret)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 116ab67a06df..526dd51a1966 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6794,11 +6794,13 @@ static int btrfs_getattr(struct vfsmount *mnt,
                         struct dentry *dentry, struct kstat *stat)
 {
        struct inode *inode = dentry->d_inode;
+        u32 blocksize = inode->i_sb->s_blocksize;
        generic_fillattr(inode, stat);
        stat->dev = BTRFS_I(inode)->root->anon_dev;
        stat->blksize = PAGE_CACHE_SIZE;
-        stat->blocks = (inode_get_bytes(inode) +
+        stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
-                        BTRFS_I(inode)->delalloc_bytes) >> 9;
+                ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
        return 0;
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4a34c472f126..a90e749ed6d2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1216,12 +1216,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                *devstr = '\0';
                devstr = vol_args->name;
                devid = simple_strtoull(devstr, &end, 10);
-                printk(KERN_INFO "resizing devid %llu\n",
+                printk(KERN_INFO "btrfs: resizing devid %llu\n",
                       (unsigned long long)devid);
        }
        device = btrfs_find_device(root, devid, NULL, NULL);
        if (!device) {
-                printk(KERN_INFO "resizer unable to find device %llu\n",
+                printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
                       (unsigned long long)devid);
                ret = -EINVAL;
                goto out_unlock;
@@ -1267,7 +1267,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
        do_div(new_size, root->sectorsize);
        new_size *= root->sectorsize;
-        printk(KERN_INFO "new size for %s is %llu\n",
+        printk(KERN_INFO "btrfs: new size for %s is %llu\n",
                device->name, (unsigned long long)new_size);
        if (new_size > old_size) {
@@ -2930,11 +2930,13 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
                goto out;
        for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
-                rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val;
+                rel_ptr = ipath->fspath->val[i] -
+                          (u64)(unsigned long)ipath->fspath->val;
                ipath->fspath->val[i] = rel_ptr;
        }
-        ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size);
+        ret = copy_to_user((void *)(unsigned long)ipa->fspath,
+                           (void *)(unsigned long)ipath->fspath, size);
        if (ret) {
                ret = -EFAULT;
                goto out;
@@ -3017,7 +3019,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
        if (ret < 0)
                goto out;
-        ret = copy_to_user((void *)loi->inodes, (void *)inodes, size);
+        ret = copy_to_user((void *)(unsigned long)loi->inodes,
+                           (void *)(unsigned long)inodes, size);
        if (ret)
                ret = -EFAULT;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f4190f22edfb..fab420db5121 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -272,7 +272,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
                        swarn->logical, swarn->dev->name,
                        (unsigned long long)swarn->sector, root, inum, offset,
                        min(isize - offset, (u64)PAGE_SIZE), nlink,
-                        (char *)ipath->fspath->val[i]);
+                        (char *)(unsigned long)ipath->fspath->val[i]);
        free_ipath(ipath);
        return 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 6a0574e923bc..81376d94cd3c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -785,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
                        btrfs_save_ino_cache(root, trans);
+                        /* see comments in should_cow_block() */
+                        root->force_cow = 0;
+                        smp_wmb();
                        if (root->commit_root != root->node) {
                                mutex_lock(&root->fs_commit_mutex);
                                switch_commit_root(root);
@@ -947,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        btrfs_tree_unlock(old);
        free_extent_buffer(old);
+        /* see comments in should_cow_block() */
+        root->force_cow = 1;
+        smp_wmb();
        btrfs_set_root_node(new_root_item, tmp);
        /* record when the snapshot was created in key.offset */
        key.offset = trans->transid;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ab5b1c49f352..78f2d4d4f37f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -100,6 +100,12 @@ struct btrfs_device {
        struct reada_zone *reada_curr_zone;
        struct radix_tree_root reada_zones;
        struct radix_tree_root reada_extents;
+        /* for sending down flush barriers */
+        struct bio *flush_bio;
+        struct completion flush_wait;
+        int nobarriers;
 };
 struct btrfs_fs_devices {
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 2abd0dfad7f8..bca3948e9dbf 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1143,7 +1143,7 @@ static void ceph_d_prune(struct dentry *dentry)
 {
        struct ceph_dentry_info *di;
-        dout("d_release %p\n", dentry);
+        dout("ceph_d_prune %p\n", dentry);
        /* do we have a valid parent? */
        if (!dentry->d_parent || IS_ROOT(dentry))
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e392bfce84a3..116f36502f17 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1328,12 +1328,13 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
 */
 void ceph_queue_writeback(struct inode *inode)
 {
+        ihold(inode);
        if (queue_work(ceph_inode_to_client(inode)->wb_wq,
                       &ceph_inode(inode)->i_wb_work)) {
                dout("ceph_queue_writeback %p\n", inode);
-                ihold(inode);
        } else {
                dout("ceph_queue_writeback %p failed\n", inode);
+                iput(inode);
        }
 }
@@ -1353,12 +1354,13 @@ static void ceph_writeback_work(struct work_struct *work)
 */
 void ceph_queue_invalidate(struct inode *inode)
 {
+        ihold(inode);
        if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
                       &ceph_inode(inode)->i_pg_inv_work)) {
                dout("ceph_queue_invalidate %p\n", inode);
-                ihold(inode);
        } else {
                dout("ceph_queue_invalidate %p failed\n", inode);
+                iput(inode);
        }
 }
@@ -1434,13 +1436,14 @@ void ceph_queue_vmtruncate(struct inode *inode)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
+        ihold(inode);
        if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
                       &ci->i_vmtruncate_work)) {
                dout("ceph_queue_vmtruncate %p\n", inode);
-                ihold(inode);
        } else {
                dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
                     inode, ci->i_truncate_pending);
+                iput(inode);
        }
 }
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index a90846fac759..8dc73a594a90 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -638,10 +638,12 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
        if (err == 0) {
                dout("open_root_inode success\n");
                if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
-                    fsc->sb->s_root == NULL)
+                    fsc->sb->s_root == NULL) {
                        root = d_alloc_root(req->r_target_inode);
-                else
+                        ceph_init_dentry(root);
+                } else {
                        root = d_obtain_alias(req->r_target_inode);
+                }
                req->r_target_inode = NULL;
                dout("open_root_inode success, root dentry is %p\n", root);
        } else {
diff --git a/fs/dcache.c b/fs/dcache.c
index a901c6901bce..10ba92def3f6 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -36,6 +36,7 @@
 #include <linux/bit_spinlock.h>
 #include <linux/rculist_bl.h>
 #include <linux/prefetch.h>
+#include <linux/ratelimit.h>
 #include "internal.h"
 /*
@@ -2383,8 +2384,16 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
                                actual = __d_unalias(inode, dentry, alias);
                        }
                        write_sequnlock(&rename_lock);
-                        if (IS_ERR(actual))
+                        if (IS_ERR(actual)) {
+                                if (PTR_ERR(actual) == -ELOOP)
+                                        pr_warn_ratelimited(
+                                                "VFS: Lookup of '%s' in %s %s"
+                                                " would have caused loop\n",
+                                                dentry->d_name.name,
+                                                inode->i_sb->s_type->name,
+                                                inode->i_sb->s_id);
                                dput(alias);
+                        }
                        goto out_nolock;
                }
        }
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f6dba4505f1c..12ccacda44e0 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -565,7 +565,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
        brelse(bitmap_bh);
        printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu"
               ", computed = %llu, %llu\n",
-               EXT4_B2C(sbi, ext4_free_blocks_count(es)),
+               EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)),
               desc_count, bitmap_count);
        return bitmap_count;
 #else
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 240f6e2dc7ee..fffec40d5996 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2270,6 +2270,7 @@ retry:
                        ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
                               "%ld pages, ino %lu; err %d", __func__,
                                wbc->nr_to_write, inode->i_ino, ret);
+                        blk_finish_plug(&plug);
                        goto out_writepages;
                }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 877350ef0253..1c7bbd00e7e5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1683,7 +1683,9 @@ static int parse_options(char *options, struct super_block *sb,
                        data_opt = EXT4_MOUNT_WRITEBACK_DATA;
                datacheck:
                        if (is_remount) {
-                                if (test_opt(sb, DATA_FLAGS) != data_opt) {
+                                if (!sbi->s_journal)
+                                        ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
+                                else if (test_opt(sb, DATA_FLAGS) != data_opt) {
                                        ext4_msg(sb, KERN_ERR,
                                                "Cannot change data mode on remount");
                                        return 0;
@@ -3098,8 +3100,6 @@ static void ext4_destroy_lazyinit_thread(void)
 }
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
-                                __releases(kernel_lock)
-                                __acquires(kernel_lock)
 {
        char *orig_data = kstrdup(data, GFP_KERNEL);
        struct buffer_head *bh;
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 3f32bcb0d9bd..ef175cb8cfd8 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -16,38 +16,26 @@
 #include <linux/bitops.h>
 #include <linux/sched.h>
-static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 };
 static DEFINE_SPINLOCK(bitmap_lock);
-static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits)
+/*
+ * bitmap consists of blocks filled with 16bit words
+ * bit set == busy, bit clear == free
+ * endianness is a mess, but for counting zero bits it really doesn't matter...
+ */
+static __u32 count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits)
 {
-        unsigned i, j, sum = 0;
+        __u32 sum = 0;
-        struct buffer_head *bh;
+        unsigned blocks = DIV_ROUND_UP(numbits, blocksize * 8);
-  
-        for (i=0; i<numblocks-1; i++) {
-                if (!(bh=map[i])) 
-                        return(0);
-                for (j=0; j<bh->b_size; j++)
-                        sum += nibblemap[bh->b_data[j] & 0xf]
-                                + nibblemap[(bh->b_data[j]>>4) & 0xf];
-        }
-        if (numblocks==0 || !(bh=map[numblocks-1]))
+        while (blocks--) {
-                return(0);
+                unsigned words = blocksize / 2;
-        i = ((numbits - (numblocks-1) * bh->b_size * 8) / 16) * 2;
+                __u16 *p = (__u16 *)(*map++)->b_data;
-        for (j=0; j<i; j++) {
+                while (words--)
-                sum += nibblemap[bh->b_data[j] & 0xf]
+                        sum += 16 - hweight16(*p++);
-                        + nibblemap[(bh->b_data[j]>>4) & 0xf];
        }
-        i = numbits%16;
+        return sum;
-        if (i!=0) {
-                i = *(__u16 *)(&bh->b_data[j]) | ~((1<<i) - 1);
-                sum += nibblemap[i & 0xf] + nibblemap[(i>>4) & 0xf];
-                sum += nibblemap[(i>>8) & 0xf] + nibblemap[(i>>12) & 0xf];
-        }
-        return(sum);
 }
 void minix_free_block(struct inode *inode, unsigned long block)
@@ -105,10 +93,12 @@ int minix_new_block(struct inode * inode)
        return 0;
 }
-unsigned long minix_count_free_blocks(struct minix_sb_info *sbi)
+unsigned long minix_count_free_blocks(struct super_block *sb)
 {
-        return (count_free(sbi->s_zmap, sbi->s_zmap_blocks,
+        struct minix_sb_info *sbi = minix_sb(sb);
-                sbi->s_nzones - sbi->s_firstdatazone + 1)
+        u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1);
+        return (count_free(sbi->s_zmap, sb->s_blocksize, bits)
                << sbi->s_log_zone_size);
 }
@@ -273,7 +263,10 @@ struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
        return inode;
 }
-unsigned long minix_count_free_inodes(struct minix_sb_info *sbi)
+unsigned long minix_count_free_inodes(struct super_block *sb)
 {
-        return count_free(sbi->s_imap, sbi->s_imap_blocks, sbi->s_ninodes + 1);
+        struct minix_sb_info *sbi = minix_sb(sb);
+        u32 bits = sbi->s_ninodes + 1;
+        return count_free(sbi->s_imap, sb->s_blocksize, bits);
 }
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 64cdcd662ffc..1d9e33966db0 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -279,6 +279,27 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
        else if (sbi->s_mount_state & MINIX_ERROR_FS)
                printk("MINIX-fs: mounting file system with errors, "
                        "running fsck is recommended\n");
+        /* Apparently minix can create filesystems that allocate more blocks for
+         * the bitmaps than needed.  We simply ignore that, but verify it didn't
+         * create one with not enough blocks and bail out if so.
+         */
+        block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize);
+        if (sbi->s_imap_blocks < block) {
+                printk("MINIX-fs: file system does not have enough "
+                                "imap blocks allocated.  Refusing to mount\n");
+                goto out_iput;
+        }
+        block = minix_blocks_needed(
+                        (sbi->s_nzones - (sbi->s_firstdatazone + 1)),
+                        s->s_blocksize);
+        if (sbi->s_zmap_blocks < block) {
+                printk("MINIX-fs: file system does not have enough "
+                                "zmap blocks allocated.  Refusing to mount.\n");
+                goto out_iput;
+        }
        return 0;
 out_iput:
@@ -339,10 +360,10 @@ static int minix_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_type = sb->s_magic;
        buf->f_bsize = sb->s_blocksize;
        buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size;
-        buf->f_bfree = minix_count_free_blocks(sbi);
+        buf->f_bfree = minix_count_free_blocks(sb);
        buf->f_bavail = buf->f_bfree;
        buf->f_files = sbi->s_ninodes;
-        buf->f_ffree = minix_count_free_inodes(sbi);
+        buf->f_ffree = minix_count_free_inodes(sb);
        buf->f_namelen = sbi->s_namelen;
        buf->f_fsid.val[0] = (u32)id;
        buf->f_fsid.val[1] = (u32)(id >> 32);
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 341e2122879a..26bbd55e82ea 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -48,10 +48,10 @@ extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, stru
 extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
 extern struct inode * minix_new_inode(const struct inode *, int, int *);
 extern void minix_free_inode(struct inode * inode);
-extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi);
+extern unsigned long minix_count_free_inodes(struct super_block *sb);
 extern int minix_new_block(struct inode * inode);
 extern void minix_free_block(struct inode *inode, unsigned long block);
-extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi);
+extern unsigned long minix_count_free_blocks(struct super_block *sb);
 extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len);
@@ -88,6 +88,11 @@ static inline struct minix_inode_info *minix_i(struct inode *inode)
        return list_entry(inode, struct minix_inode_info, vfs_inode);
 }
+static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize)
+{
+        return DIV_ROUND_UP(bits, blocksize * 8);
+}
 #if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \
        defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
@@ -125,7 +130,7 @@ static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size)
        if (!size)
                return 0;
-        size = (size >> 4) + ((size & 15) > 0);
+        size >>= 4;
        while (*p++ == 0xffff) {
                if (--size == 0)
                        return (p - addr) << 4;
diff --git a/fs/namespace.c b/fs/namespace.c
index 50ee30345b4f..6d3a1963879b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2493,6 +2493,7 @@ EXPORT_SYMBOL(create_mnt_ns);
 struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
 {
        struct mnt_namespace *ns;
+        struct super_block *s;
        struct path path;
        int err;
@@ -2509,10 +2510,11 @@ struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
                return ERR_PTR(err);
        /* trade a vfsmount reference for active sb one */
-        atomic_inc(&path.mnt->mnt_sb->s_active);
+        s = path.mnt->mnt_sb;
+        atomic_inc(&s->s_active);
        mntput(path.mnt);
        /* lock the sucker */
-        down_write(&path.mnt->mnt_sb->s_umount);
+        down_write(&s->s_umount);
        /* ... and return the root of (sub)tree on it */
        return path.dentry;
 }
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b238d95ac48c..ac2899098147 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1468,12 +1468,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
                                res = NULL;
                                goto out;
                        /* This turned out not to be a regular file */
+                        case -EISDIR:
                        case -ENOTDIR:
                                goto no_open;
                        case -ELOOP:
                                if (!(nd->intent.open.flags & O_NOFOLLOW))
                                        goto no_open;
-                        /* case -EISDIR: */
                        /* case -EINVAL: */
                        default:
                                res = ERR_CAST(inode);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 0a1f8312b4dc..eca56d4b39c0 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -40,48 +40,8 @@
 #define NFSDBG_FACILITY         NFSDBG_FILE
-static int nfs_file_open(struct inode *, struct file *);
-static int nfs_file_release(struct inode *, struct file *);
-static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin);
-static int  nfs_file_mmap(struct file *, struct vm_area_struct *);
-static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos,
-                                        struct pipe_inode_info *pipe,
-                                        size_t count, unsigned int flags);
-static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
-                                unsigned long nr_segs, loff_t pos);
-static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
-                                        struct file *filp, loff_t *ppos,
-                                        size_t count, unsigned int flags);
-static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
-                                unsigned long nr_segs, loff_t pos);
-static int  nfs_file_flush(struct file *, fl_owner_t id);
-static int  nfs_file_fsync(struct file *, loff_t, loff_t, int datasync);
-static int nfs_check_flags(int flags);
-static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
-static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
-static int nfs_setlease(struct file *file, long arg, struct file_lock **fl);
 static const struct vm_operations_struct nfs_file_vm_ops;
-const struct file_operations nfs_file_operations = {
-        .llseek         = nfs_file_llseek,
-        .read           = do_sync_read,
-        .write          = do_sync_write,
-        .aio_read       = nfs_file_read,
-        .aio_write      = nfs_file_write,
-        .mmap           = nfs_file_mmap,
-        .open           = nfs_file_open,
-        .flush          = nfs_file_flush,
-        .release        = nfs_file_release,
-        .fsync          = nfs_file_fsync,
-        .lock           = nfs_lock,
-        .flock          = nfs_flock,
-        .splice_read    = nfs_file_splice_read,
-        .splice_write   = nfs_file_splice_write,
-        .check_flags    = nfs_check_flags,
-        .setlease       = nfs_setlease,
-};
 const struct inode_operations nfs_file_inode_operations = {
        .permission     = nfs_permission,
        .getattr        = nfs_getattr,
@@ -886,3 +846,54 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
                        file->f_path.dentry->d_name.name, arg);
        return -EINVAL;
 }
+const struct file_operations nfs_file_operations = {
+        .llseek         = nfs_file_llseek,
+        .read           = do_sync_read,
+        .write          = do_sync_write,
+        .aio_read       = nfs_file_read,
+        .aio_write      = nfs_file_write,
+        .mmap           = nfs_file_mmap,
+        .open           = nfs_file_open,
+        .flush          = nfs_file_flush,
+        .release        = nfs_file_release,
+        .fsync          = nfs_file_fsync,
+        .lock           = nfs_lock,
+        .flock          = nfs_flock,
+        .splice_read    = nfs_file_splice_read,
+        .splice_write   = nfs_file_splice_write,
+        .check_flags    = nfs_check_flags,
+        .setlease       = nfs_setlease,
+};
+#ifdef CONFIG_NFS_V4
+static int
+nfs4_file_open(struct inode *inode, struct file *filp)
+{
+        /*
+         * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to
+         * this point, then something is very wrong
+         */
+        dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp);
+        return -ENOTDIR;
+}
+const struct file_operations nfs4_file_operations = {
+        .llseek         = nfs_file_llseek,
+        .read           = do_sync_read,
+        .write          = do_sync_write,
+        .aio_read       = nfs_file_read,
+        .aio_write      = nfs_file_write,
+        .mmap           = nfs_file_mmap,
+        .open           = nfs4_file_open,
+        .flush          = nfs_file_flush,
+        .release        = nfs_file_release,
+        .fsync          = nfs_file_fsync,
+        .lock           = nfs_lock,
+        .flock          = nfs_flock,
+        .splice_read    = nfs_file_splice_read,
+        .splice_write   = nfs_file_splice_write,
+        .check_flags    = nfs_check_flags,
+        .setlease       = nfs_setlease,
+};
+#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c07a55aec838..50a15fa8cf98 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -291,7 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
                 */
                inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops;
                if (S_ISREG(inode->i_mode)) {
-                        inode->i_fop = &nfs_file_operations;
+                        inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
                        inode->i_data.a_ops = &nfs_file_aops;
                        inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
                } else if (S_ISDIR(inode->i_mode)) {
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index c1a1bd8ddf1c..3f4d95751d52 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -299,6 +299,8 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
                struct list_head *head);
+extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
+                struct inode *inode);
 extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
 extern void nfs_readdata_release(struct nfs_read_data *rdata);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 85f1690ca08c..d4bc9ed91748 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -853,6 +853,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
        .dentry_ops     = &nfs_dentry_operations,
        .dir_inode_ops  = &nfs3_dir_inode_operations,
        .file_inode_ops = &nfs3_file_inode_operations,
+        .file_ops       = &nfs_file_operations,
        .getroot        = nfs3_proc_get_root,
        .getattr        = nfs3_proc_getattr,
        .setattr        = nfs3_proc_setattr,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b60fddf606f7..be2bbac13817 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2464,8 +2464,7 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst
                case -NFS4ERR_BADNAME:
                        return -ENOENT;
                case -NFS4ERR_MOVED:
-                        err = nfs4_get_referral(dir, name, fattr, fhandle);
+                        return nfs4_get_referral(dir, name, fattr, fhandle);
-                        break;
                case -NFS4ERR_WRONGSEC:
                        nfs_fixup_secinfo_attributes(fattr, fhandle);
                }
@@ -6253,6 +6252,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
        .dentry_ops     = &nfs4_dentry_operations,
        .dir_inode_ops  = &nfs4_dir_inode_operations,
        .file_inode_ops = &nfs4_file_inode_operations,
+        .file_ops       = &nfs4_file_operations,
        .getroot        = nfs4_proc_get_root,
        .getattr        = nfs4_proc_getattr,
        .setattr        = nfs4_proc_setattr,
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index baf73536bc04..8e672a2b2d69 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1260,6 +1260,25 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
+static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
+{
+        struct nfs_pageio_descriptor pgio;
+        put_lseg(data->lseg);
+        data->lseg = NULL;
+        dprintk("pnfs read error = %d\n", data->pnfs_error);
+        nfs_pageio_init_read_mds(&pgio, data->inode);
+        while (!list_empty(&data->pages)) {
+                struct nfs_page *req = nfs_list_entry(data->pages.next);
+                nfs_list_remove_request(req);
+                nfs_pageio_add_request(&pgio, req);
+        }
+        nfs_pageio_complete(&pgio);
+}
 /*
 * Called by non rpc-based layout drivers
 */
@@ -1268,11 +1287,8 @@ void pnfs_ld_read_done(struct nfs_read_data *data)
        if (likely(!data->pnfs_error)) {
                __nfs4_read_done_cb(data);
                data->mds_ops->rpc_call_done(&data->task, data);
-        } else {
+        } else
-                put_lseg(data->lseg);
+                pnfs_ld_handle_read_error(data);
-                data->lseg = NULL;
-                dprintk("pnfs write error = %d\n", data->pnfs_error);
-        }
        data->mds_ops->rpc_release(data);
 }
 EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ac40b8535d7e..f48125da198a 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -710,6 +710,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
        .dentry_ops     = &nfs_dentry_operations,
        .dir_inode_ops  = &nfs_dir_inode_operations,
        .file_inode_ops = &nfs_file_inode_operations,
+        .file_ops       = &nfs_file_operations,
        .getroot        = nfs_proc_get_root,
        .getattr        = nfs_proc_getattr,
        .setattr        = nfs_proc_setattr,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 8b48ec63f722..cfa175c223dc 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -109,7 +109,7 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
        }
 }
-static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
+void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio,
                struct inode *inode)
 {
        nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops,
@@ -534,23 +534,13 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata)
 static void nfs_readpage_release_full(void *calldata)
 {
        struct nfs_read_data *data = calldata;
-        struct nfs_pageio_descriptor pgio;
-        if (data->pnfs_error) {
-                nfs_pageio_init_read_mds(&pgio, data->inode);
-                pgio.pg_recoalesce = 1;
-        }
        while (!list_empty(&data->pages)) {
                struct nfs_page *req = nfs_list_entry(data->pages.next);
                nfs_list_remove_request(req);
-                if (!data->pnfs_error)
+                nfs_readpage_release(req);
-                        nfs_readpage_release(req);
-                else
-                        nfs_pageio_add_request(&pgio, req);
        }
-        if (data->pnfs_error)
-                nfs_pageio_complete(&pgio);
        nfs_readdata_release(calldata);
 }